From abda03facee57e5d94a6e2d7be26ab53e319dede Mon Sep 17 00:00:00 2001 From: Thor Johnsen Date: Thu, 16 Aug 2018 16:58:30 -0700 Subject: [PATCH 001/540] Fast bilinear resize code Cleanup code formatting in Bilinear Resize Updates requested by reviewers --- tensorflow/core/kernels/BUILD | 13 +- tensorflow/core/kernels/crop_and_resize_op.cc | 92 +- .../crop_and_resize_op_benchmark_test.cc | 36 +- .../core/kernels/crop_resize_bilinear_core.h | 5497 +++++++++++++++++ tensorflow/core/kernels/resize_bilinear_op.cc | 151 +- .../core/kernels/resize_bilinear_op_test.cc | 2 +- 6 files changed, 5591 insertions(+), 200 deletions(-) create mode 100644 tensorflow/core/kernels/crop_resize_bilinear_core.h diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 972fb9efa91..3b2b71ec2a1 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -29,6 +29,7 @@ package_group( load( "//tensorflow:tensorflow.bzl", "if_android", + "if_linux_x86_64", "tf_cc_test", "tf_cc_tests", "tf_cc_binary", @@ -551,6 +552,12 @@ cc_header_only_library( deps = [":image_resizer_state"], ) +cc_library( + name = "crop_resize_bilinear_core", + hdrs = ["crop_resize_bilinear_core.h"], + visibility = ["//visibility:private"], +) + # OpKernel libraries ---------------------------------------------------------- ARRAY_DEPS = [ @@ -2150,7 +2157,8 @@ tf_kernel_library( tf_kernel_library( name = "crop_and_resize_op", prefix = "crop_and_resize_op", - deps = IMAGE_DEPS, + copts = tf_copts() + if_linux_x86_64(["-msse4.1 -finline-functions"]), + deps = IMAGE_DEPS + [":crop_resize_bilinear_core"], ) tf_kernel_library( @@ -2216,7 +2224,8 @@ tf_kernel_library( tf_kernel_library( name = "resize_bilinear_op", prefix = "resize_bilinear_op", - deps = IMAGE_DEPS, + copts = tf_copts() + if_linux_x86_64(["-msse4.1 -finline-functions"]), + deps = IMAGE_DEPS + [":crop_resize_bilinear_core"], ) tf_kernel_library( diff --git a/tensorflow/core/kernels/crop_and_resize_op.cc 
b/tensorflow/core/kernels/crop_and_resize_op.cc index 99d01b4db6b..7c4d3431e61 100644 --- a/tensorflow/core/kernels/crop_and_resize_op.cc +++ b/tensorflow/core/kernels/crop_and_resize_op.cc @@ -22,17 +22,18 @@ limitations under the License. #include #include -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/kernels/bounds_check.h" +#include "tensorflow/core/kernels/crop_resize_bilinear_core.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/work_sharder.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #if GOOGLE_CUDA #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" @@ -228,61 +229,56 @@ struct CropAndResize { continue; } - const float height_scale = - (crop_height > 1) - ? (y2 - y1) * (image_height - 1) / (crop_height - 1) - : 0; - const float width_scale = - (crop_width > 1) ? (x2 - x1) * (image_width - 1) / (crop_width - 1) - : 0; + if (method_name == "bilinear") { + std::vector xs; + std::vector ys; + int min_ix, max_ix, min_iy, max_iy; + compute_interpolation_weights(crop_width, image_width, x1, x2, + &min_ix, &max_ix, &xs); + compute_interpolation_weights(crop_height, image_height, y1, y2, + &min_iy, &max_iy, &ys); - for (int y = 0; y < crop_height; ++y) { - const float in_y = (crop_height > 1) - ? y1 * (image_height - 1) + y * height_scale - : 0.5 * (y1 + y2) * (image_height - 1); - if (in_y < 0 || in_y > image_height - 1) { - for (int x = 0; x < crop_width; ++x) { - for (int d = 0; d < depth; ++d) { - crops(b, y, x, d) = extrapolation_value; - } - } - continue; + // multiply by depth to avoid multiplication in resize_single_image. 
+ for (int i = min_ix; i <= max_ix; ++i) { + xs[i - min_ix].lower *= depth; + xs[i - min_ix].upper *= depth; } - if (method_name == "bilinear") { - const int top_y_index = floorf(in_y); - const int bottom_y_index = ceilf(in_y); - const float y_lerp = in_y - top_y_index; - for (int x = 0; x < crop_width; ++x) { - const float in_x = (crop_width > 1) - ? x1 * (image_width - 1) + x * width_scale - : 0.5 * (x1 + x2) * (image_width - 1); - if (in_x < 0 || in_x > image_width - 1) { + crop_resize_single_image_common( + image.data() + + static_cast(b_in) * static_cast(image_height) * + static_cast(image_width) * + static_cast(depth), + image_height, image_width, crop_height, crop_width, depth, min_ix, + max_ix, xs.data(), min_iy, max_iy, ys.data(), extrapolation_value, + false, false, + crops.data() + + static_cast(b) * static_cast(crop_height) * + static_cast(crop_width) * + static_cast(depth)); + // xs and ys are deallocated automatically when they go out of scope + } else { // method == "nearest" + const float height_scale = + (crop_height > 1) + ? (y2 - y1) * (image_height - 1) / (crop_height - 1) + : 0; + const float width_scale = + (crop_width > 1) + ? (x2 - x1) * (image_width - 1) / (crop_width - 1) + : 0; + + for (int y = 0; y < crop_height; ++y) { + const float in_y = (crop_height > 1) + ? 
y1 * (image_height - 1) + y * height_scale + : 0.5 * (y1 + y2) * (image_height - 1); + if (in_y < 0 || in_y > image_height - 1) { + for (int x = 0; x < crop_width; ++x) { for (int d = 0; d < depth; ++d) { crops(b, y, x, d) = extrapolation_value; } - continue; - } - const int left_x_index = floorf(in_x); - const int right_x_index = ceilf(in_x); - const float x_lerp = in_x - left_x_index; - - for (int d = 0; d < depth; ++d) { - const float top_left(static_cast( - image(b_in, top_y_index, left_x_index, d))); - const float top_right(static_cast( - image(b_in, top_y_index, right_x_index, d))); - const float bottom_left(static_cast( - image(b_in, bottom_y_index, left_x_index, d))); - const float bottom_right(static_cast( - image(b_in, bottom_y_index, right_x_index, d))); - const float top = top_left + (top_right - top_left) * x_lerp; - const float bottom = - bottom_left + (bottom_right - bottom_left) * x_lerp; - crops(b, y, x, d) = top + (bottom - top) * y_lerp; } + continue; } - } else { // method == "nearest" for (int x = 0; x < crop_width; ++x) { const float in_x = (crop_width > 1) ? x1 * (image_width - 1) + x * width_scale diff --git a/tensorflow/core/kernels/crop_and_resize_op_benchmark_test.cc b/tensorflow/core/kernels/crop_and_resize_op_benchmark_test.cc index d7ca64bea05..54d4f33b446 100644 --- a/tensorflow/core/kernels/crop_and_resize_op_benchmark_test.cc +++ b/tensorflow/core/kernels/crop_and_resize_op_benchmark_test.cc @@ -21,11 +21,13 @@ limitations under the License. 
namespace tensorflow { +template static Graph* BM_CropAndResize(int batches, int width, int height, int depth, int crop_height, int crop_width) { Graph* g = new Graph(OpRegistry::Global()); - Tensor in(DT_FLOAT, TensorShape({batches, height, width, depth})); - in.flat().setRandom(); + Tensor in(DataTypeToEnum::v(), + TensorShape({batches, height, width, depth})); + in.flat().setRandom(); Tensor boxes(DT_FLOAT, TensorShape({batches, 4})); auto boxes_tensor = boxes.matrix(); Tensor box_ind(DT_INT32, TensorShape({batches})); @@ -51,13 +53,17 @@ static Graph* BM_CropAndResize(int batches, int width, int height, int depth, return g; } -#define BM_CropAndResizeDev(DEVICE, B, W, H, D, CH, CW) \ - static void BM_CropAndResize_##DEVICE##_##B##_##W##_##H##_##D##_##CH##_##CW( \ - int iters) { \ - testing::ItemsProcessed(iters* B* W* H* D); \ - test::Benchmark(#DEVICE, BM_CropAndResize(B, W, H, D, CH, CW)).Run(iters); \ - } \ - BENCHMARK(BM_CropAndResize_##DEVICE##_##B##_##W##_##H##_##D##_##CH##_##CW); +#define BM_CropAndResizeDev(DEVICE, DTYPE, B, W, H, D, CH, CW) \ + static void \ + BM_CropAndResize_##DEVICE##_##DTYPE##_##B##_##W##_##H##_##D##_##CH##_##CW( \ + int iters) { \ + testing::ItemsProcessed(iters* B* W* H* D); \ + test::Benchmark(#DEVICE, BM_CropAndResize::Type>( \ + B, W, H, D, CH, CW)) \ + .Run(iters); \ + } \ + BENCHMARK( \ + BM_CropAndResize_##DEVICE##_##DTYPE##_##B##_##W##_##H##_##D##_##CH##_##CW); // Benchmark results using CPU:Intel Haswell with HyperThreading (6 cores) // Benchmark Time(ns) CPU(ns) Iterations @@ -65,8 +71,14 @@ static Graph* BM_CropAndResize(int batches, int width, int height, int depth, // BM_CropAndResize_cpu_1_640_640_1_512_512 3801232 3914692 185 99.784M items/s // BM_CropAndResize_cpu_1_80_80_512_7_7 182470 241767 2941 1.372G items/s -BM_CropAndResizeDev(cpu, 1, 640, 640, 3, 512, 512); -BM_CropAndResizeDev(cpu, 1, 640, 640, 1, 512, 512); -BM_CropAndResizeDev(cpu, 1, 80, 80, 512, 7, 7); +BM_CropAndResizeDev(cpu, DT_UINT8, 1, 640, 640, 
3, 512, 512); +BM_CropAndResizeDev(cpu, DT_UINT8, 1, 640, 640, 1, 512, 512); + +BM_CropAndResizeDev(cpu, DT_HALF, 1, 640, 640, 3, 512, 512); +BM_CropAndResizeDev(cpu, DT_HALF, 1, 640, 640, 1, 512, 512); + +BM_CropAndResizeDev(cpu, DT_FLOAT, 1, 640, 640, 3, 512, 512); +BM_CropAndResizeDev(cpu, DT_FLOAT, 1, 640, 640, 1, 512, 512); +BM_CropAndResizeDev(cpu, DT_FLOAT, 1, 80, 80, 512, 7, 7); } // namespace tensorflow diff --git a/tensorflow/core/kernels/crop_resize_bilinear_core.h b/tensorflow/core/kernels/crop_resize_bilinear_core.h new file mode 100644 index 00000000000..f6846d6a557 --- /dev/null +++ b/tensorflow/core/kernels/crop_resize_bilinear_core.h @@ -0,0 +1,5497 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_CROP_RESIZE_BILINEAR_CORE_H_ +#define TENSORFLOW_CORE_KERNELS_CROP_RESIZE_BILINEAR_CORE_H_ + +// only include intrinsics when the appropriate flags call for it, +// since these headers only exists on x86 platforms. +#ifdef __SSE4_1__ +#include +#include +#include +#endif +#ifdef __AVX2__ +#include +#endif +#include +#include +#include +#include +#include + +namespace tensorflow { +namespace { + +// Compute the interpolation indices only once. 
+struct CachedInterpolation { + int lower; // Lower source index used in the interpolation + int upper; // Upper source index used in the interpolation + // 1-D linear iterpolation scale (see: + // https://en.wikipedia.org/wiki/Bilinear_interpolation) + float lerp; +}; + +bool compute_single_interpolation_weight(const int in_size, + const float out2in_scale, + const float out2in_start, + const bool clip, const int i, + int* lower, int* upper, float* lerp) { + const float in = i * out2in_scale + out2in_start; + *lower = (int)floor(in); + *upper = (int)ceil(in); + *lerp = (float)(in - (float)*lower); + if (clip) { + if (*lower < 0) + *lower = 0; + else if (*lower >= in_size) + *lower = in_size - 1; + if (*upper < 0) + *upper = 0; + else if (*upper >= in_size) + *upper = in_size - 1; + return true; + } else { + return (*lower >= 0 && *upper < in_size) ? true : false; + } +} +/** + * Compute interpolation values for output indexes in range + * [out_start,out_start+out_size-1]. + * Returns true if all output indexes have lower and upper (input) indexes + * within range [0,in_size-1]. 
+ */ +bool compute_interpolation_weights(const int min_i, const int max_i, + const int in_size, const float out2in_scale, + const float out2in_start, const bool clip, + CachedInterpolation* interpolation) { + bool rval = true; + int num_i = max_i - min_i + 1; + for (int i = 0; i < num_i; ++i) { + if (!compute_single_interpolation_weight( + in_size, out2in_scale, out2in_start, clip, i + min_i, + &interpolation[i].lower, &interpolation[i].upper, + &interpolation[i].lerp)) { + rval = false; + } + } + return rval; +} +/** + * Compatibility method for resize_bilinear_op.cc + */ +void compute_interpolation_weights(const int out_size, const int in_size, + const float out2in_scale, + CachedInterpolation* interpolation) { + interpolation[out_size].lower = 0; + interpolation[out_size].upper = 0; + const bool clip = true; + if (!compute_interpolation_weights(0, out_size - 1, in_size, out2in_scale, + 0.0f, clip, interpolation)) { + // Should never happen, check for it anyway + printf( + "Warning! Interpolation values have lower,upper indexes outside of " + "range [0,in_size-1]\n"); + } +} +/** + * Compute minimum and maximum (output) i where both lower and upper (input) is + * in range [0,in_size-1] + * If no values of i satisfy condition, min_i = in_size, max_i = -1 and method + * returns false. + * Returns true if min_i >= max_i. + */ +bool compute_minmax_indexes(const int out_size, const int in_size, + const float out2in_scale, const float out2in_start, + int* min_i, int* max_i) { + *min_i = out_size; + *max_i = -1; + int lower, upper; + float lerp; + for (int i = 0; i < out_size; ++i) { + if (compute_single_interpolation_weight(in_size, out2in_scale, out2in_start, + false, i, &lower, &upper, &lerp)) { + if (i < *min_i) *min_i = i; + if (i > *max_i) *max_i = i; + } + } + return (*min_i <= *max_i) ? true : false; +} +/** + * Compute interpolation weights for crop_and_resize_op.cc + * Also computes extrapolation areas. 
+ * Returns true if at least one point requires interpolation, false otherwise. + */ +bool compute_interpolation_weights( + const int out_size, const int in_size, + const float x1, // lower bounding box, crop region starts at in_size*x1 + const float x2, // upper bounding box, crop region ends at in_size*x2 + int* min_i, int* max_i, std::vector* interpolation) { + float out2in_start = out_size > 1 + ? (float)(in_size - 1) * (float)x1 + : (float)(in_size - 1) * (float)(x1 + x2) / 2.0f; + float out2in_scale = + out_size > 1 + ? (float)(x2 - x1) * (float)(in_size - 1) / (float)(out_size - 1) + : 0.0f; + if (compute_minmax_indexes(out_size, in_size, out2in_scale, out2in_start, + min_i, max_i)) { + interpolation->resize(*max_i - *min_i + 1); + bool all_inputs_ok = compute_interpolation_weights( + *min_i, *max_i, in_size, out2in_scale, out2in_start, false, + interpolation->data()); + if (!all_inputs_ok) { + // should never happen, purpose of compute_minmax_indexes is to ensure + // that all inputs are ok. + printf( + "Error! compute_interpolation_weights returned input indexes outside " + "valid range - SEGV will likely ensue.\n"); + } + return true; + } else { + return false; + } +} + +/** + * Cast float v to type U with range clamping. + * + * If vmax_val, + * return value is clamped to u_max_val. + */ +template +U cast_to(float v, float min_val, float max_val, U u_min_val, U u_max_val); +template +U cast_to(float v, float min_val, float max_val, U u_min_val, U u_max_val) { + if (v < min_val) + return u_min_val; + else if (v > max_val) + return u_max_val; + else + return static_cast(v); +} +/** + * no-op cast from float to float. 
+ */ +template <> +float cast_to(float v, float min_val, float max_val, float u_min_val, + float u_max_val) { + return v; +} + +float compute_lerp(const float top_left, const float top_right, + const float bottom_left, const float bottom_right, + const float x_lerp, const float y_lerp) { + const float top = top_left + (top_right - top_left) * x_lerp; + const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp; + return top + (bottom - top) * y_lerp; +} + +/** + * Computes the bilinear interpolation from the appropriate 4 float points + * and the linear interpolation weights. + * Accepts input tensors of type T and produces output tensors of type U. + * Optionally flips horizontal and/or vertical axis. + */ +template +void crop_resize_single_image(const T* image, const int64 in_height, + const int64 in_width, const int64 out_height, + const int64 out_width, const int channels, + const int min_ix, const int max_ix, + const CachedInterpolation* xs, const int min_iy, + const int max_iy, const CachedInterpolation* ys, + const float extrapolated_value, const bool flip_x, + const bool flip_y, + U* output) TF_ATTRIBUTE_NOINLINE; +template +void crop_resize_single_image(const T* image, const int64 in_height, + const int64 in_width, const int64 out_height, + const int64 out_width, const int channels, + const int min_ix, const int max_ix, + const CachedInterpolation* xs, const int min_iy, + const int max_iy, const CachedInterpolation* ys, + const float extrapolated_value, const bool flip_x, + const bool flip_y, U* output) { + const int64 in_row_size = in_width * channels; + const int64 out_row_size = out_width * channels; + U u_min_val = std::numeric_limits::min(); + U u_max_val = std::numeric_limits::max(); + float min_val = static_cast(u_min_val); + float max_val = static_cast(u_max_val); + U uEx = + cast_to(extrapolated_value, min_val, max_val, u_min_val, u_max_val); + // low y extrapolation zone + if (min_iy > 0) { + U* p = flip_y ? 
output + out_row_size * (out_height - min_iy) : output; + int64 nn = out_row_size * (int64)min_iy; + for (int64 i = 0; i < nn; ++i) p[i] = uEx; + } + // high y extrapolation zone + if (max_iy < out_height - 1) { + U* p = flip_y ? output : output + out_row_size * (max_iy + 1); + int64 nn = out_row_size * (int64)(out_height - 1 - max_iy); + for (int64 i = 0; i < nn; ++i) p[i] = uEx; + } + // low x extrapolation zone + if (min_ix > 0) { + for (int iy = min_iy; iy <= max_iy; ++iy) { + int xx0 = flip_x ? (out_width - min_ix) * channels : 0; + int nxx = min_ix * channels; + U* p = output + xx0 + + out_row_size * (int64)(flip_y ? out_height - 1 - iy : iy); + for (int ix = 0; ix < nxx; ++ix) { + p[ix] = uEx; + } + } + } + // high x extrapolation zone + if (max_ix < out_width - 1) { + for (int iy = min_iy; iy <= max_iy; ++iy) { + int xx0 = flip_x ? 0 : (max_ix + 1) * channels; + int nxx = (out_width - 1 - max_ix) * channels; + U* p = output + xx0 + + out_row_size * (int64)(flip_y ? out_height - 1 - iy : iy); + for (int ix = 0; ix < nxx; ++ix) { + p[ix] = uEx; + } + } + } + U* output_y_ptr = + output + + out_row_size * (int64)(flip_y ? out_height - 1 - min_iy : min_iy); + // interpolation zone + if (channels == 1) { + for (int y = min_iy; y <= max_iy; ++y) { + const int iy = y - min_iy; + const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size; + const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size; + const float ys_lerp = ys[iy].lerp; + const int x0 = flip_x ? out_width - 1 - max_ix : min_ix; + const int x1 = flip_x ? out_width - 1 - min_ix : max_ix; + for (int x = x0; x <= x1; ++x) { + const int ix = flip_x ? out_width - 1 - min_ix - x : x - min_ix; + const int64 xs_lower = xs[ix].lower; + const int64 xs_upper = xs[ix].upper; + const float xs_lerp = xs[ix].lerp; + + // Read channel 0. 
+ const float top_left0(ys_input_lower_ptr[xs_lower]); + const float top_right0(ys_input_lower_ptr[xs_upper]); + const float bottom_left0(ys_input_upper_ptr[xs_lower]); + const float bottom_right0(ys_input_upper_ptr[xs_upper]); + + // Compute output. + float result0 = compute_lerp(top_left0, top_right0, bottom_left0, + bottom_right0, xs_lerp, ys_lerp); + output_y_ptr[x] = + cast_to(result0, min_val, max_val, u_min_val, u_max_val); + } + output_y_ptr = + flip_y ? output_y_ptr - out_row_size : output_y_ptr + out_row_size; + } + } else if (channels == 2) { + for (int y = min_iy; y <= max_iy; ++y) { + const int iy = y - min_iy; + const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size; + const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size; + const float ys_lerp = ys[iy].lerp; + const int x0 = flip_x ? out_width - 1 - max_ix : min_ix; + const int x1 = flip_x ? out_width - 1 - min_ix : max_ix; + for (int x = x0; x <= x1; ++x) { + const int ix = flip_x ? out_width - 1 - min_ix - x : x - min_ix; + const int64 xs_lower = xs[ix].lower; + const int64 xs_upper = xs[ix].upper; + const float xs_lerp = xs[ix].lerp; + + // Read channel 0. + const float top_left0(ys_input_lower_ptr[xs_lower + 0]); + const float top_right0(ys_input_lower_ptr[xs_upper + 0]); + const float bottom_left0(ys_input_upper_ptr[xs_lower + 0]); + const float bottom_right0(ys_input_upper_ptr[xs_upper + 0]); + + // Read channel 1. + const float top_left1(ys_input_lower_ptr[xs_lower + 1]); + const float top_right1(ys_input_lower_ptr[xs_upper + 1]); + const float bottom_left1(ys_input_upper_ptr[xs_lower + 1]); + const float bottom_right1(ys_input_upper_ptr[xs_upper + 1]); + + // Compute output. 
+ float result0 = compute_lerp(top_left0, top_right0, bottom_left0, + bottom_right0, xs_lerp, ys_lerp); + float result1 = compute_lerp(top_left1, top_right1, bottom_left1, + bottom_right1, xs_lerp, ys_lerp); + output_y_ptr[x * 2 + 0] = + cast_to(result0, min_val, max_val, u_min_val, u_max_val); + output_y_ptr[x * 2 + 1] = + cast_to(result1, min_val, max_val, u_min_val, u_max_val); + } + output_y_ptr = + flip_y ? output_y_ptr - out_row_size : output_y_ptr + out_row_size; + } + } else if (channels == 3) { + for (int y = min_iy; y <= max_iy; ++y) { + const int iy = y - min_iy; + const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size; + const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size; + const float ys_lerp = ys[iy].lerp; + const int x0 = flip_x ? out_width - 1 - max_ix : min_ix; + const int x1 = flip_x ? out_width - 1 - min_ix : max_ix; + for (int x = x0; x <= x1; ++x) { + const int ix = flip_x ? out_width - 1 - min_ix - x : x - min_ix; + const int64 xs_lower = xs[ix].lower; + const int64 xs_upper = xs[ix].upper; + const float xs_lerp = xs[ix].lerp; + + // Read channel 0. + const float top_left0(ys_input_lower_ptr[xs_lower + 0]); + const float top_right0(ys_input_lower_ptr[xs_upper + 0]); + const float bottom_left0(ys_input_upper_ptr[xs_lower + 0]); + const float bottom_right0(ys_input_upper_ptr[xs_upper + 0]); + + // Read channel 1. + const float top_left1(ys_input_lower_ptr[xs_lower + 1]); + const float top_right1(ys_input_lower_ptr[xs_upper + 1]); + const float bottom_left1(ys_input_upper_ptr[xs_lower + 1]); + const float bottom_right1(ys_input_upper_ptr[xs_upper + 1]); + + // Read channel 2. + const float top_left2(ys_input_lower_ptr[xs_lower + 2]); + const float top_right2(ys_input_lower_ptr[xs_upper + 2]); + const float bottom_left2(ys_input_upper_ptr[xs_lower + 2]); + const float bottom_right2(ys_input_upper_ptr[xs_upper + 2]); + + // Compute output. 
+ float result0 = compute_lerp(top_left0, top_right0, bottom_left0, + bottom_right0, xs_lerp, ys_lerp); + float result1 = compute_lerp(top_left1, top_right1, bottom_left1, + bottom_right1, xs_lerp, ys_lerp); + float result2 = compute_lerp(top_left2, top_right2, bottom_left2, + bottom_right2, xs_lerp, ys_lerp); + output_y_ptr[x * 3 + 0] = + cast_to(result0, min_val, max_val, u_min_val, u_max_val); + output_y_ptr[x * 3 + 1] = + cast_to(result1, min_val, max_val, u_min_val, u_max_val); + output_y_ptr[x * 3 + 2] = + cast_to(result2, min_val, max_val, u_min_val, u_max_val); + } + output_y_ptr = + flip_y ? output_y_ptr - out_row_size : output_y_ptr + out_row_size; + } + } else if (channels == 4) { + for (int y = min_iy; y <= max_iy; ++y) { + const int iy = y - min_iy; + const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size; + const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size; + const float ys_lerp = ys[iy].lerp; + const int x0 = flip_x ? out_width - 1 - max_ix : min_ix; + const int x1 = flip_x ? out_width - 1 - min_ix : max_ix; + for (int x = x0; x <= x1; ++x) { + const int ix = flip_x ? out_width - 1 - min_ix - x : x - min_ix; + const int64 xs_lower = xs[ix].lower; + const int64 xs_upper = xs[ix].upper; + const float xs_lerp = xs[ix].lerp; + + // Read channel 0. + const float top_left0(ys_input_lower_ptr[xs_lower + 0]); + const float top_right0(ys_input_lower_ptr[xs_upper + 0]); + const float bottom_left0(ys_input_upper_ptr[xs_lower + 0]); + const float bottom_right0(ys_input_upper_ptr[xs_upper + 0]); + + // Read channel 1. + const float top_left1(ys_input_lower_ptr[xs_lower + 1]); + const float top_right1(ys_input_lower_ptr[xs_upper + 1]); + const float bottom_left1(ys_input_upper_ptr[xs_lower + 1]); + const float bottom_right1(ys_input_upper_ptr[xs_upper + 1]); + + // Read channel 2. 
+ const float top_left2(ys_input_lower_ptr[xs_lower + 2]); + const float top_right2(ys_input_lower_ptr[xs_upper + 2]); + const float bottom_left2(ys_input_upper_ptr[xs_lower + 2]); + const float bottom_right2(ys_input_upper_ptr[xs_upper + 2]); + + // Read channel 3. + const float top_left3(ys_input_lower_ptr[xs_lower + 3]); + const float top_right3(ys_input_lower_ptr[xs_upper + 3]); + const float bottom_left3(ys_input_upper_ptr[xs_lower + 3]); + const float bottom_right3(ys_input_upper_ptr[xs_upper + 3]); + + // Compute output. + float result0 = compute_lerp(top_left0, top_right0, bottom_left0, + bottom_right0, xs_lerp, ys_lerp); + float result1 = compute_lerp(top_left1, top_right1, bottom_left1, + bottom_right1, xs_lerp, ys_lerp); + float result2 = compute_lerp(top_left2, top_right2, bottom_left2, + bottom_right2, xs_lerp, ys_lerp); + float result3 = compute_lerp(top_left3, top_right3, bottom_left3, + bottom_right3, xs_lerp, ys_lerp); + output_y_ptr[x * 4 + 0] = + cast_to(result0, min_val, max_val, u_min_val, u_max_val); + output_y_ptr[x * 4 + 1] = + cast_to(result1, min_val, max_val, u_min_val, u_max_val); + output_y_ptr[x * 4 + 2] = + cast_to(result2, min_val, max_val, u_min_val, u_max_val); + output_y_ptr[x * 4 + 3] = + cast_to(result3, min_val, max_val, u_min_val, u_max_val); + } + output_y_ptr = + flip_y ? output_y_ptr - out_row_size : output_y_ptr + out_row_size; + } + } else { + for (int y = min_iy; y <= max_iy; ++y) { + const int iy = y - min_iy; + const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size; + const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size; + const float ys_lerp = ys[iy].lerp; + const int x0 = flip_x ? out_width - 1 - max_ix : min_ix; + const int x1 = flip_x ? out_width - 1 - min_ix : max_ix; + for (int x = x0; x <= x1; ++x) { + const int ix = flip_x ? 
out_width - 1 - min_ix - x : x - min_ix; + const int64 xs_lower = xs[ix].lower; + const int64 xs_upper = xs[ix].upper; + const float xs_lerp = xs[ix].lerp; + for (int ichan = 0; ichan < channels; ++ichan) { + const float top_left0(ys_input_lower_ptr[xs_lower + ichan]); + const float top_right0(ys_input_lower_ptr[xs_upper + ichan]); + const float bottom_left0(ys_input_upper_ptr[xs_lower + ichan]); + const float bottom_right0(ys_input_upper_ptr[xs_upper + ichan]); + float result0 = compute_lerp(top_left0, top_right0, bottom_left0, + bottom_right0, xs_lerp, ys_lerp); + output_y_ptr[x * channels + ichan] = + cast_to(result0, min_val, max_val, u_min_val, u_max_val); + } + } + output_y_ptr = + flip_y ? output_y_ptr - out_row_size : output_y_ptr + out_row_size; + } + } +} + +#ifdef __SSE4_1__ + +// +// The remaining code implements explicitly vectorized versions of a bilinear +// image resizer. +// Images with 1, 2, 3 or 4 channels are supported. +// The image resizer reads samples of type T and writes samples of type U. +// T and U can be any of the following: uint8, int8, uint16, int16, int32, +// Eigen::half, bfloat16 and float. +// There are separate codes for SSE4.1 and AVX2. Enabling AVX2 also enables +// FP16C instruction set, +// which contains instructions that convert between Eigen::half and float. The +// SSE4.1 code path emulates +// the FP16C instructions in software. +// + +// +// This class loads 4 pixels with n channels, converts to fp32 and packs +// the result into n SSE vector words. +// Input data type T must be one of uint8, int8, uint16, int16, int32, +// Eigen::half, bfloat16 or float. +// + +template +class VectorLoader { + public: +#ifdef __AVX2__ + // convert 8 packed words of type T to fp32. + // T must be one of uint8, int8, uint16, int16, int32, Eigen::half, bfloat16 + // or float. + __m256 to_fp32(__m256i raw); +#else + // convert 4 packed words of type T to fp32. 
+ // T must be one of uint8, int8, uint16, int16, int32, Eigen::half, bfloat16 + // or float. + __m128 to_fp32(__m128i raw); +#endif + +#ifdef __AVX2__ + // pack 4 pixels with 1 channel, 2 channels and 3channels respectively in + // separate 128 bit lanes. + // input is stored in lower portion of 4 separate sse words, v0 through v3. + // output is stored in lower portion of v0. + void pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); + // output is stored in lower portion of v0 and v1. + void pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); + // output is stored in lower portion of v0, v1 and v2. + void pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); +#else + // pack 4 pixels with 1 channel, 2 channels and 3channels respectively. + // input is stored in lower portion of 4 separate sse words, v0 through v3. + // output is stored in lower portion of v0. + void pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); + // output is stored in lower portion of v0 and v1. + void pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); + // output is stored in lower portion of v0, v1 and v2. + void pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); +#endif + +#ifdef __AVX2__ + // extract right pixel for load1 and load4 cases. + __m256i extract_right_1ch(const __m256i left); + __m256i extract_right_2ch(const __m256i left); + __m256i extract_right_3ch(const __m256i left); + __m256i extract_right_4ch(const __m256i left); +#else + __m128i extract_right_1ch(const __m128i left); + __m128i extract_right_2ch(const __m128i left); + __m128i extract_right_3ch(const __m128i left); + __m128i extract_right_4ch(const __m128i left); +#endif + +#ifdef __AVX2__ + // load top left and bottom left interpolation inputs into output argument + // left. + // load top right and bottom right interpolation inputs into output argument + // right. + // pixels have 1 channel. + // load1 case, i.e. 
4 left and right inputs are loaded with a single unaligned + // SSE load. + void load1_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m256* left0, __m256* right0); + // load top left and bottom left interpolation inputs into output argument + // left. + // load top right and bottom right interpolation inputs into output argument + // right. + // pixels have 2 channels. + // load1 case, i.e. 4 left and right inputs are loaded with a single unaligned + // SSE load. + void load1_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m256* left0, __m256* left1, + __m256* right0, __m256* right1); + // load top left and bottom left interpolation inputs into output argument + // left. + // load top right and bottom right interpolation inputs into output argument + // right. + // pixels have 3 channels. + // load1 case, i.e. 4 left and right inputs are loaded with a single unaligned + // SSE load. + void load1_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m256* left0, __m256* left1, + __m256* left2, __m256* right0, __m256* right1, __m256* right2); + // load top left and bottom left interpolation inputs into output argument + // left. + // load top right and bottom right interpolation inputs into output argument + // right. + // pixels have 4 channels. + // load1 case, i.e. 4 left and right inputs are loaded with a single unaligned + // SSE load. + void load1_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m256* left0, __m256* left1, + __m256* left2, __m256* left3, __m256* right0, __m256* right1, + __m256* right2, __m256* right3); + // load top left and bottom left interpolation inputs into output argument + // left. + // load top right and bottom right interpolation inputs into output argument + // right. + // pixels have 1 channel. + // load2 case, i.e. 
4 left inputs are loaded with first SSE load and 4 right + // inputs are loaded with second SSE load. + void load2_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m256* left0, __m256* right0); + // load top left and bottom left interpolation inputs into output argument + // left. + // load top right and bottom right interpolation inputs into output argument + // right. + // pixels have 2 channels. + // load2 case, i.e. 4 left inputs are loaded with first SSE load and 4 right + // inputs are loaded with second SSE load. + void load2_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m256* left0, __m256* left1, + __m256* right0, __m256* right1); + // load top left and bottom left interpolation inputs into output argument + // left. + // load top right and bottom right interpolation inputs into output argument + // right. + // pixels have 3 channels. + // load2 case, i.e. 4 left inputs are loaded with first SSE load and 4 right + // inputs are loaded with second SSE load. + void load2_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m256* left0, __m256* left1, + __m256* left2, __m256* right0, __m256* right1, __m256* right2); + // load top left and bottom left interpolation inputs into output argument + // left. + // load top right and bottom right interpolation inputs into output argument + // right. + // pixels have 4 channels. + // load2 case, i.e. 4 left inputs are loaded with first SSE load and 4 right + // inputs are loaded with second SSE load. + void load2_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m256* left0, __m256* left1, + __m256* left2, __m256* left3, __m256* right0, __m256* right1, + __m256* right2, __m256* right3); + // load top left and bottom left interpolation inputs into output argument + // left. 
+ // load top right and bottom right interpolation inputs into output argument + // right. + // pixels have 1 channel. + // load4 case, i.e. each pair of left and right inputs are loaded with a + // separate SSE load. + void load4_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256* left0, + __m256* right0); + // load top left and bottom left interpolation inputs into output argument + // left. + // load top right and bottom right interpolation inputs into output argument + // right. + // pixels have 2 channels. + // load4 case, i.e. each pair of left and right inputs are loaded with a + // separate SSE load. + void load4_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256* left0, + __m256* left1, __m256* right0, __m256* right1); + // load top left and bottom left interpolation inputs into output argument + // left. + // load top right and bottom right interpolation inputs into output argument + // right. + // pixels have 3 channels. + // load4 case, i.e. each pair of left and right inputs are loaded with a + // separate SSE load. + void load4_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256* left0, + __m256* left1, __m256* left2, __m256* right0, __m256* right1, + __m256* right2); + // load top left and bottom left interpolation inputs into output argument + // left. + // load top right and bottom right interpolation inputs into output argument + // right. + // pixels have 4 channels. + // load4 case, i.e. each pair of left and right inputs are loaded with a + // separate SSE load. + void load4_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256* left0, + __m256* left1, __m256* left2, __m256* left3, __m256* right0, + __m256* right1, __m256* right2, __m256* right3); + // load top left and bottom left interpolation inputs into output argument + // left. 
+ // load top right and bottom right interpolation inputs into output argument + // right. + // pixels have 1 channel. + // load8 case, i.e. each input is loaded with a separate SSE load. + // 4 pixels, each with left and right input necessitates 8 separate SSE loads + // per input row. + void load8_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256* left0, + __m256* right0); + // load top left and bottom left interpolation inputs into output argument + // left. + // load top right and bottom right interpolation inputs into output argument + // right. + // pixels have 2 channels. + // load8 case, i.e. each input is loaded with a separate SSE load. + // 4 pixels, each with left and right input necessitates 8 separate SSE loads + // per input row. + void load8_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256* left0, + __m256* left1, __m256* right0, __m256* right1); + // load top left and bottom left interpolation inputs into output argument + // left. + // load top right and bottom right interpolation inputs into output argument + // right. + // pixels have 3 channels. + // load8 case, i.e. each input is loaded with a separate SSE load. + // 4 pixels, each with left and right input necessitates 8 separate SSE loads + // per input row. + void load8_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256* left0, + __m256* left1, __m256* left2, __m256* right0, __m256* right1, + __m256* right2); + // load top left and bottom left interpolation inputs into output argument + // left. + // load top right and bottom right interpolation inputs into output argument + // right. + // pixels have 4 channels. + // load8 case, i.e. each input is loaded with a separate SSE load. + // 4 pixels, each with left and right input necessitates 8 separate SSE loads + // per input row. 
+ void load8_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256* left0, + __m256* left1, __m256* left2, __m256* left3, __m256* right0, + __m256* right1, __m256* right2, __m256* right3); +#else + // load top left interpolation inputs into output argument tl. + // load bottom left interpolation inputs into output argument bl. + // load top right interpolation inputs into output argument tr. + // load bottom right interpolation inputs into output argument br. + // pixels have 1 channel. + // load1 case, i.e. all inputs for one input row are loaded with a single SSE + // load. + void load1_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m128* tl0, __m128* bl0, + __m128* tr0, __m128* br0); + // load top left interpolation inputs into output argument tl. + // load bottom left interpolation inputs into output argument bl. + // load top right interpolation inputs into output argument tr. + // load bottom right interpolation inputs into output argument br. + // pixels have 2 channels. + // load1 case, i.e. all inputs for one input row are loaded with a single SSE + // load. + void load1_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, + __m128* bl0, __m128* bl1, __m128* tr0, __m128* tr1, + __m128* br0, __m128* br1); + // load top left interpolation inputs into output argument tl. + // load bottom left interpolation inputs into output argument bl. + // load top right interpolation inputs into output argument tr. + // load bottom right interpolation inputs into output argument br. + // pixels have 3 channels. + // load1 case, i.e. all inputs for one input row are loaded with a single SSE + // load. 
+ void load1_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, + __m128* tl2, __m128* bl0, __m128* bl1, __m128* bl2, + __m128* tr0, __m128* tr1, __m128* tr2, __m128* br0, + __m128* br1, __m128* br2); + // load top left interpolation inputs into output argument tl. + // load bottom left interpolation inputs into output argument bl. + // load top right interpolation inputs into output argument tr. + // load bottom right interpolation inputs into output argument br. + // pixels have 4 channels. + // load1 case, i.e. all inputs for one input row are loaded with a single SSE + // load. + void load1_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, + __m128* tl2, __m128* tl3, __m128* bl0, __m128* bl1, + __m128* bl2, __m128* bl3, __m128* tr0, __m128* tr1, + __m128* tr2, __m128* tr3, __m128* br0, __m128* br1, + __m128* br2, __m128* br3); + // load top left interpolation inputs into output argument tl. + // load bottom left interpolation inputs into output argument bl. + // load top right interpolation inputs into output argument tr. + // load bottom right interpolation inputs into output argument br. + // pixels have 1 channel. + // load2 case, i.e. left inputs are loaded with first SSE load, right inputs + // are loaded with second SSE load. + void load2_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m128* tl0, __m128* bl0, + __m128* tr0, __m128* br0); + // load top left interpolation inputs into output argument tl. + // load bottom left interpolation inputs into output argument bl. + // load top right interpolation inputs into output argument tr. + // load bottom right interpolation inputs into output argument br. + // pixels have 2 channels. + // load2 case, i.e. left inputs are loaded with first SSE load, right inputs + // are loaded with second SSE load. 
+ void load2_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, + __m128* bl0, __m128* bl1, __m128* tr0, __m128* tr1, + __m128* br0, __m128* br1); + // load top left interpolation inputs into output argument tl. + // load bottom left interpolation inputs into output argument bl. + // load top right interpolation inputs into output argument tr. + // load bottom right interpolation inputs into output argument br. + // pixels have 3 channels. + // load2 case, i.e. left inputs are loaded with first SSE load, right inputs + // are loaded with second SSE load. + void load2_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, + __m128* tl2, __m128* bl0, __m128* bl1, __m128* bl2, + __m128* tr0, __m128* tr1, __m128* tr2, __m128* br0, + __m128* br1, __m128* br2); + // load top left interpolation inputs into output argument tl. + // load bottom left interpolation inputs into output argument bl. + // load top right interpolation inputs into output argument tr. + // load bottom right interpolation inputs into output argument br. + // pixels have 4 channels. + // load2 case, i.e. left inputs are loaded with first SSE load, right inputs + // are loaded with second SSE load. + void load2_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, + __m128* tl2, __m128* tl3, __m128* bl0, __m128* bl1, + __m128* bl2, __m128* bl3, __m128* tr0, __m128* tr1, + __m128* tr2, __m128* tr3, __m128* br0, __m128* br1, + __m128* br2, __m128* br3); + // load top left interpolation inputs into output argument tl. + // load bottom left interpolation inputs into output argument bl. + // load top right interpolation inputs into output argument tr. + // load bottom right interpolation inputs into output argument br. + // pixels have 1 channel. + // load4 case, i.e. 
left and right inputs are loaded with a separate SSE load + // for each pixel. + void load4_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128* tl0, + __m128* bl0, __m128* tr0, __m128* br0); + // load top left interpolation inputs into output argument tl. + // load bottom left interpolation inputs into output argument bl. + // load top right interpolation inputs into output argument tr. + // load bottom right interpolation inputs into output argument br. + // pixels have 2 channels. + // load4 case, i.e. left and right inputs are loaded with a separate SSE load + // for each pixel. + void load4_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128* tl0, + __m128* tl1, __m128* bl0, __m128* bl1, __m128* tr0, + __m128* tr1, __m128* br0, __m128* br1); + // load top left interpolation inputs into output argument tl. + // load bottom left interpolation inputs into output argument bl. + // load top right interpolation inputs into output argument tr. + // load bottom right interpolation inputs into output argument br. + // pixels have 3 channels. + // load4 case, i.e. left and right inputs are loaded with a separate SSE load + // for each pixel. + void load4_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128* tl0, + __m128* tl1, __m128* tl2, __m128* bl0, __m128* bl1, + __m128* bl2, __m128* tr0, __m128* tr1, __m128* tr2, + __m128* br0, __m128* br1, __m128* br2); + // load top left interpolation inputs into output argument tl. + // load bottom left interpolation inputs into output argument bl. + // load top right interpolation inputs into output argument tr. + // load bottom right interpolation inputs into output argument br. + // pixels have 4 channels. + // load4 case, i.e. left and right inputs are loaded with a separate SSE load + // for each pixel. 
+ void load4_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128* tl0, + __m128* tl1, __m128* tl2, __m128* tl3, __m128* bl0, + __m128* bl1, __m128* bl2, __m128* bl3, __m128* tr0, + __m128* tr1, __m128* tr2, __m128* tr3, __m128* br0, + __m128* br1, __m128* br2, __m128* br3); + // load top left interpolation inputs into output argument tl. + // load bottom left interpolation inputs into output argument bl. + // load top right interpolation inputs into output argument tr. + // load bottom right interpolation inputs into output argument br. + // pixels have 1 channel. + // load8 case, i.e. left and right inputs are loaded with separate SSE loads + // for each pixel. + void load8_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128* tl0, + __m128* bl0, __m128* tr0, __m128* br0); + // load top left interpolation inputs into output argument tl. + // load bottom left interpolation inputs into output argument bl. + // load top right interpolation inputs into output argument tr. + // load bottom right interpolation inputs into output argument br. + // pixels have 2 channels. + // load8 case, i.e. left and right inputs are loaded with separate SSE loads + // for each pixel. + void load8_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128* tl0, + __m128* tl1, __m128* bl0, __m128* bl1, __m128* tr0, + __m128* tr1, __m128* br0, __m128* br1); + // load top left interpolation inputs into output argument tl. + // load bottom left interpolation inputs into output argument bl. + // load top right interpolation inputs into output argument tr. + // load bottom right interpolation inputs into output argument br. + // pixels have 3 channels. + // load8 case, i.e. left and right inputs are loaded with separate SSE loads + // for each pixel. 
+ void load8_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128* tl0, + __m128* tl1, __m128* tl2, __m128* bl0, __m128* bl1, + __m128* bl2, __m128* tr0, __m128* tr1, __m128* tr2, + __m128* br0, __m128* br1, __m128* br2); + // load top left interpolation inputs into output argument tl. + // load bottom left interpolation inputs into output argument bl. + // load top right interpolation inputs into output argument tr. + // load bottom right interpolation inputs into output argument br. + // pixels have 4 channels. + // load8 case, i.e. left and right inputs are loaded with separate SSE loads + // for each pixel. + void load8_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128* tl0, + __m128* tl1, __m128* tl2, __m128* tl3, __m128* bl0, + __m128* bl1, __m128* bl2, __m128* bl3, __m128* tr0, + __m128* tr1, __m128* tr2, __m128* tr3, __m128* br0, + __m128* br1, __m128* br2, __m128* br3); +#endif + + // there is no method that packs 4 pixels with 4 channel into four sse words. + // nothing to do for this case, everything is already in the right position. + + private: +// helper methods +#ifdef __AVX2__ + // pack 4 pixels with 1, 2, 3 or 4 channels into lower portion of SSE vector + // word. + // works within SSE lanes. + // sizeof(sample_data_type) can be 1, 2 or 4 bytes. 
+ void pack4_1b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); + void pack4_2b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); + void pack4_4b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); + void pack4_1b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); + void pack4_2b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); + void pack4_4b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); + void pack4_1b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); + void pack4_2b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); + void pack4_4b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); +// there is no pack4_xx_4ch functions because none is needed. +// all the bytes are loaded in the right spots for this case. +#else + // pack 4 pixels with 1, 2, 3 or 4 channels into lower portion of SSE vector + // word. + // sizeof(sample_data_type) can be 1, 2 or 4 bytes. + void pack4_1b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); + void pack4_2b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); + void pack4_4b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); + void pack4_1b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); + void pack4_2b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); + void pack4_4b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); + void pack4_1b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); + void pack4_2b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); + void pack4_4b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); +#endif +#ifdef __AVX2__ + __m256i extract_right_1b_(const __m256i left); + __m256i extract_right_2b_(const __m256i left); + __m256i extract_right_3b_(const __m256i left); + __m256i extract_right_4b_(const __m256i left); + __m256i extract_right_6b_(const __m256i left); + __m256i extract_right_8b_(const __m256i left); +#else + __m128i extract_right_1b_(const __m128i left); + __m128i 
extract_right_2b_(const __m128i left); + __m128i extract_right_3b_(const __m128i left); + __m128i extract_right_4b_(const __m128i left); + __m128i extract_right_6b_(const __m128i left); + __m128i extract_right_8b_(const __m128i left); +#endif +}; + +#ifdef __AVX2__ +template +void VectorLoader::pack4_1b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + *v3 = _mm256_slli_si256(*v3, 3); + __m256i and_mask = _mm256_setr_epi32(255, 0, 0, 0, 255, 0, 0, 0); + *v2 = _mm256_or_si256(*v3, + _mm256_slli_si256(_mm256_and_si256(and_mask, *v2), 2)); + *v1 = _mm256_or_si256(*v2, + _mm256_slli_si256(_mm256_and_si256(and_mask, *v1), 1)); + *v0 = _mm256_or_si256(*v1, _mm256_and_si256(and_mask, *v0)); +} +template +void VectorLoader::pack4_2b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + *v3 = _mm256_slli_si256(*v3, 6); + __m256i and_mask = _mm256_setr_epi32(65535, 0, 0, 0, 65535, 0, 0, 0); + *v2 = _mm256_or_si256(*v3, + _mm256_slli_si256(_mm256_and_si256(and_mask, *v2), 4)); + *v1 = _mm256_or_si256(*v2, + _mm256_slli_si256(_mm256_and_si256(and_mask, *v1), 2)); + *v0 = _mm256_or_si256(*v1, _mm256_and_si256(and_mask, *v0)); +} +template +void VectorLoader::pack4_4b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + *v3 = _mm256_slli_si256(*v3, 12); + __m256i and_mask = _mm256_setr_epi32(-1, 0, 0, 0, -1, 0, 0, 0); + *v2 = _mm256_or_si256(*v3, + _mm256_slli_si256(_mm256_and_si256(and_mask, *v2), 8)); + *v1 = _mm256_or_si256(*v2, + _mm256_slli_si256(_mm256_and_si256(and_mask, *v1), 4)); + *v0 = _mm256_or_si256(*v1, _mm256_and_si256(and_mask, *v0)); +} + +template +void VectorLoader::pack4_1b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + __m256i and_mask = _mm256_setr_epi32(65535, 0, 0, 0, 65535, 0, 0, 0); + *v0 = _mm256_or_si256(_mm256_and_si256(*v0, and_mask), + _mm256_slli_si256(*v1, 2)); + *v1 = _mm256_or_si256(_mm256_and_si256(*v2, and_mask), + _mm256_slli_si256(*v3, 2)); +} +template +void 
VectorLoader::pack4_2b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + __m256i and_mask = _mm256_setr_epi32(-1, 0, 0, 0, -1, 0, 0, 0); + *v0 = _mm256_or_si256(_mm256_and_si256(*v0, and_mask), + _mm256_slli_si256(*v1, 4)); + *v1 = _mm256_or_si256(_mm256_and_si256(*v2, and_mask), + _mm256_slli_si256(*v3, 4)); +} +template +void VectorLoader::pack4_4b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + __m256i and_mask = _mm256_setr_epi32(-1, -1, 0, 0, -1, -1, 0, 0); + *v0 = _mm256_or_si256(_mm256_and_si256(*v0, and_mask), + _mm256_slli_si256(*v1, 8)); + *v1 = _mm256_or_si256(_mm256_and_si256(*v2, and_mask), + _mm256_slli_si256(*v3, 8)); +} + +template +void VectorLoader::pack4_1b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + __m256i and_mask = _mm256_setr_epi32(16777215, 0, 0, 0, 16777215, 0, 0, 0); + *v0 = _mm256_or_si256(_mm256_and_si256(*v0, and_mask), + _mm256_slli_si256(*v1, 3)); + and_mask = _mm256_srli_si256(and_mask, 1); + *v1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_si256(*v1, 1), and_mask), + _mm256_slli_si256(*v2, 2)); + and_mask = _mm256_srli_si256(and_mask, 1); + *v2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_si256(*v2, 2), and_mask), + _mm256_slli_si256(*v3, 1)); +} +template +void VectorLoader::pack4_2b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + __m256i and_mask = _mm256_setr_epi32(-1, 65535, 0, 0, -1, 65535, 0, 0); + *v0 = _mm256_or_si256(_mm256_and_si256(*v0, and_mask), + _mm256_slli_si256(*v1, 6)); + and_mask = _mm256_srli_si256(and_mask, 2); + *v1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_si256(*v1, 2), and_mask), + _mm256_slli_si256(*v2, 4)); + and_mask = _mm256_srli_si256(and_mask, 2); + *v2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_si256(*v2, 4), and_mask), + _mm256_slli_si256(*v3, 2)); +} +template +void VectorLoader::pack4_4b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + __m256i and_mask = _mm256_setr_epi32(-1, -1, -1, 0, -1, -1, -1, 0); + *v0 = 
_mm256_or_si256(_mm256_and_si256(*v0, and_mask), + _mm256_slli_si256(*v1, 12)); + and_mask = _mm256_srli_si256(and_mask, 4); + *v1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_si256(*v1, 4), and_mask), + _mm256_slli_si256(*v2, 8)); + and_mask = _mm256_srli_si256(and_mask, 4); + *v2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_si256(*v2, 8), and_mask), + _mm256_slli_si256(*v3, 4)); +} + +template <> +void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_1b_1ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_1b_1ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_2b_1ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_2b_1ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_4b_1ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_2b_1ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_2b_1ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_4b_1ch_(v0, v1, v2, v3); +} + +template <> +void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_1b_2ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_1b_2ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_2b_2ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_2b_2ch_(v0, v1, v2, v3); 
+} +template <> +void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_4b_2ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_2b_2ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_2b_2ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_4b_2ch_(v0, v1, v2, v3); +} + +template <> +void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_1b_3ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_1b_3ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_2b_3ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_2b_3ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_4b_3ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_2b_3ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_2b_3ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { + pack4_4b_3ch_(v0, v1, v2, v3); +} +#else +template +void VectorLoader::pack4_1b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + *v3 = _mm_slli_si128(*v3, 3); + __m128i and_mask = _mm_setr_epi32(255, 0, 0, 0); + *v2 = _mm_or_si128(*v3, _mm_slli_si128(_mm_and_si128(and_mask, *v2), 2)); + *v1 = _mm_or_si128(*v2, _mm_slli_si128(_mm_and_si128(and_mask, *v1), 1)); + *v0 = _mm_or_si128(*v1, 
_mm_and_si128(and_mask, *v0)); +} +template +void VectorLoader::pack4_2b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + *v3 = _mm_slli_si128(*v3, 6); + __m128i and_mask = _mm_setr_epi32(65535, 0, 0, 0); + *v2 = _mm_or_si128(*v3, _mm_slli_si128(_mm_and_si128(and_mask, *v2), 4)); + *v1 = _mm_or_si128(*v2, _mm_slli_si128(_mm_and_si128(and_mask, *v1), 2)); + *v0 = _mm_or_si128(*v1, _mm_and_si128(and_mask, *v0)); +} +template +void VectorLoader::pack4_4b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + *v3 = _mm_slli_si128(*v3, 12); + __m128i and_mask = _mm_setr_epi32(-1, 0, 0, 0); + *v2 = _mm_or_si128(*v3, _mm_slli_si128(_mm_and_si128(and_mask, *v2), 8)); + *v1 = _mm_or_si128(*v2, _mm_slli_si128(_mm_and_si128(and_mask, *v1), 4)); + *v0 = _mm_or_si128(*v1, _mm_and_si128(and_mask, *v0)); +} +template +void VectorLoader::pack4_1b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + __m128i and_mask = _mm_setr_epi32(65535, 0, 0, 0); + *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 2)); + *v1 = _mm_or_si128(_mm_and_si128(*v2, and_mask), _mm_slli_si128(*v3, 2)); +} +template +void VectorLoader::pack4_2b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + __m128i and_mask = _mm_setr_epi32(-1, 0, 0, 0); + *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 4)); + *v1 = _mm_or_si128(_mm_and_si128(*v2, and_mask), _mm_slli_si128(*v3, 4)); +} +template +void VectorLoader::pack4_4b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + __m128i and_mask = _mm_setr_epi32(-1, -1, 0, 0); + *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 8)); + *v1 = _mm_or_si128(_mm_and_si128(*v2, and_mask), _mm_slli_si128(*v3, 8)); +} +template +void VectorLoader::pack4_1b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + __m128i and_mask = _mm_setr_epi32(16777215, 0, 0, 0); + *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 3)); + and_mask = 
_mm_srli_si128(and_mask, 1); + *v1 = _mm_or_si128(_mm_and_si128(_mm_srli_si128(*v1, 1), and_mask), + _mm_slli_si128(*v2, 2)); + and_mask = _mm_srli_si128(and_mask, 1); + *v2 = _mm_or_si128(_mm_and_si128(_mm_srli_si128(*v2, 2), and_mask), + _mm_slli_si128(*v3, 1)); +} +template +void VectorLoader::pack4_2b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + __m128i and_mask = _mm_setr_epi32(-1, 65535, 0, 0); + *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 6)); + and_mask = _mm_srli_si128(and_mask, 2); + *v1 = _mm_or_si128(_mm_and_si128(_mm_srli_si128(*v1, 2), and_mask), + _mm_slli_si128(*v2, 4)); + and_mask = _mm_srli_si128(and_mask, 2); + *v2 = _mm_or_si128(_mm_and_si128(_mm_srli_si128(*v2, 4), and_mask), + _mm_slli_si128(*v3, 2)); +} +template +void VectorLoader::pack4_4b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + __m128i and_mask = _mm_setr_epi32(-1, -1, -1, 0); + *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 12)); + and_mask = _mm_srli_si128(and_mask, 4); + *v1 = _mm_or_si128(_mm_and_si128(_mm_srli_si128(*v1, 4), and_mask), + _mm_slli_si128(*v2, 8)); + and_mask = _mm_srli_si128(and_mask, 4); + *v2 = _mm_or_si128(_mm_and_si128(_mm_srli_si128(*v2, 8), and_mask), + _mm_slli_si128(*v3, 4)); +} + +template <> +void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_1b_1ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_1b_1ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_2b_1ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_2b_1ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_4b_1ch_(v0, v1, v2, v3); +} +template <> +void 
VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_2b_1ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_2b_1ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_4b_1ch_(v0, v1, v2, v3); +} + +template <> +void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_1b_2ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_1b_2ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_2b_2ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_2b_2ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_4b_2ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_2b_2ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_2b_2ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_4b_2ch_(v0, v1, v2, v3); +} + +template <> +void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_1b_3ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_1b_3ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_2b_3ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_2b_3ch_(v0, v1, v2, v3); +} 
+template <> +void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_4b_3ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_2b_3ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_2b_3ch_(v0, v1, v2, v3); +} +template <> +void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { + pack4_4b_3ch_(v0, v1, v2, v3); +} +#endif + +#ifdef __AVX2__ +template <> +__m256i VectorLoader::extract_right_1ch(const __m256i left) { + return extract_right_1b_(left); +} +template <> +__m256i VectorLoader::extract_right_1ch(const __m256i left) { + return extract_right_1b_(left); +} +template <> +__m256i VectorLoader::extract_right_1ch(const __m256i left) { + return extract_right_2b_(left); +} +template <> +__m256i VectorLoader::extract_right_1ch(const __m256i left) { + return extract_right_2b_(left); +} +template <> +__m256i VectorLoader::extract_right_1ch(const __m256i left) { + return extract_right_4b_(left); +} +template <> +__m256i VectorLoader::extract_right_1ch(const __m256i left) { + return extract_right_2b_(left); +} +template <> +__m256i VectorLoader::extract_right_1ch(const __m256i left) { + return extract_right_2b_(left); +} +template <> +__m256i VectorLoader::extract_right_1ch(const __m256i left) { + return extract_right_4b_(left); +} + +template <> +__m256i VectorLoader::extract_right_2ch(const __m256i left) { + return extract_right_2b_(left); +} +template <> +__m256i VectorLoader::extract_right_2ch(const __m256i left) { + return extract_right_2b_(left); +} +template <> +__m256i VectorLoader::extract_right_2ch(const __m256i left) { + return extract_right_4b_(left); +} +template <> +__m256i VectorLoader::extract_right_2ch(const __m256i left) { + return extract_right_4b_(left); +} +template <> +__m256i VectorLoader::extract_right_2ch(const __m256i left) { + 
return extract_right_8b_(left); +} +template <> +__m256i VectorLoader::extract_right_2ch(const __m256i left) { + return extract_right_4b_(left); +} +template <> +__m256i VectorLoader::extract_right_2ch(const __m256i left) { + return extract_right_4b_(left); +} +template <> +__m256i VectorLoader::extract_right_2ch(const __m256i left) { + return extract_right_8b_(left); +} + +template <> +__m256i VectorLoader::extract_right_3ch(const __m256i left) { + return extract_right_3b_(left); +} +template <> +__m256i VectorLoader::extract_right_3ch(const __m256i left) { + return extract_right_3b_(left); +} +template <> +__m256i VectorLoader::extract_right_3ch(const __m256i left) { + return extract_right_6b_(left); +} +template <> +__m256i VectorLoader::extract_right_3ch(const __m256i left) { + return extract_right_6b_(left); +} +template <> +__m256i VectorLoader::extract_right_3ch(const __m256i left) { + assert(false); +} +template <> +__m256i VectorLoader::extract_right_3ch(const __m256i left) { + return extract_right_6b_(left); +} +template <> +__m256i VectorLoader::extract_right_3ch(const __m256i left) { + return extract_right_6b_(left); +} +template <> +__m256i VectorLoader::extract_right_3ch(const __m256i left) { + assert(false); +} + +template <> +__m256i VectorLoader::extract_right_4ch(const __m256i left) { + return extract_right_4b_(left); +} +template <> +__m256i VectorLoader::extract_right_4ch(const __m256i left) { + return extract_right_4b_(left); +} +template <> +__m256i VectorLoader::extract_right_4ch(const __m256i left) { + return extract_right_8b_(left); +} +template <> +__m256i VectorLoader::extract_right_4ch(const __m256i left) { + return extract_right_8b_(left); +} +template <> +__m256i VectorLoader::extract_right_4ch(const __m256i left) { + assert(false); +} +template <> +__m256i VectorLoader::extract_right_4ch(const __m256i left) { + return extract_right_8b_(left); +} +template <> +__m256i VectorLoader::extract_right_4ch(const __m256i left) { + return 
extract_right_8b_(left); +} +template <> +__m256i VectorLoader::extract_right_4ch(const __m256i left) { + assert(false); +} +#else +template <> +__m128i VectorLoader::extract_right_1ch(const __m128i left) { + return extract_right_1b_(left); +} +template <> +__m128i VectorLoader::extract_right_1ch(const __m128i left) { + return extract_right_1b_(left); +} +template <> +__m128i VectorLoader::extract_right_1ch(const __m128i left) { + return extract_right_2b_(left); +} +template <> +__m128i VectorLoader::extract_right_1ch(const __m128i left) { + return extract_right_2b_(left); +} +template <> +__m128i VectorLoader::extract_right_1ch(const __m128i left) { + return extract_right_4b_(left); +} +template <> +__m128i VectorLoader::extract_right_1ch(const __m128i left) { + return extract_right_2b_(left); +} +template <> +__m128i VectorLoader::extract_right_1ch(const __m128i left) { + return extract_right_2b_(left); +} +template <> +__m128i VectorLoader::extract_right_1ch(const __m128i left) { + return extract_right_4b_(left); +} + +template <> +__m128i VectorLoader::extract_right_2ch(const __m128i left) { + return extract_right_2b_(left); +} +template <> +__m128i VectorLoader::extract_right_2ch(const __m128i left) { + return extract_right_2b_(left); +} +template <> +__m128i VectorLoader::extract_right_2ch(const __m128i left) { + return extract_right_4b_(left); +} +template <> +__m128i VectorLoader::extract_right_2ch(const __m128i left) { + return extract_right_4b_(left); +} +template <> +__m128i VectorLoader::extract_right_2ch(const __m128i left) { + return extract_right_8b_(left); +} +template <> +__m128i VectorLoader::extract_right_2ch(const __m128i left) { + return extract_right_4b_(left); +} +template <> +__m128i VectorLoader::extract_right_2ch(const __m128i left) { + return extract_right_4b_(left); +} +template <> +__m128i VectorLoader::extract_right_2ch(const __m128i left) { + return extract_right_8b_(left); +} + +template <> +__m128i 
VectorLoader::extract_right_3ch(const __m128i left) { + return extract_right_3b_(left); +} +template <> +__m128i VectorLoader::extract_right_3ch(const __m128i left) { + return extract_right_3b_(left); +} +template <> +__m128i VectorLoader::extract_right_3ch(const __m128i left) { + return extract_right_6b_(left); +} +template <> +__m128i VectorLoader::extract_right_3ch(const __m128i left) { + return extract_right_6b_(left); +} +template <> +__m128i VectorLoader::extract_right_3ch(const __m128i left) { + assert(false); +} +template <> +__m128i VectorLoader::extract_right_3ch(const __m128i left) { + return extract_right_6b_(left); +} +template <> +__m128i VectorLoader::extract_right_3ch(const __m128i left) { + return extract_right_6b_(left); +} +template <> +__m128i VectorLoader::extract_right_3ch(const __m128i left) { + assert(false); +} + +template <> +__m128i VectorLoader::extract_right_4ch(const __m128i left) { + return extract_right_4b_(left); +} +template <> +__m128i VectorLoader::extract_right_4ch(const __m128i left) { + return extract_right_4b_(left); +} +template <> +__m128i VectorLoader::extract_right_4ch(const __m128i left) { + return extract_right_8b_(left); +} +template <> +__m128i VectorLoader::extract_right_4ch(const __m128i left) { + return extract_right_8b_(left); +} +template <> +__m128i VectorLoader::extract_right_4ch(const __m128i left) { + assert(false); +} +template <> +__m128i VectorLoader::extract_right_4ch(const __m128i left) { + return extract_right_8b_(left); +} +template <> +__m128i VectorLoader::extract_right_4ch(const __m128i left) { + return extract_right_8b_(left); +} +template <> +__m128i VectorLoader::extract_right_4ch(const __m128i left) { + assert(false); +} +#endif + +#ifdef __AVX2__ +template <> +__m256 VectorLoader::to_fp32(__m256i raw) { + raw = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_cvtepu8_epi32(_mm256_castsi256_si128(raw))), + _mm_cvtepu8_epi32(_mm256_extractf128_si256(raw, 1)), 1); + return 
_mm256_cvtepi32_ps(raw); +} +template <> +__m256 VectorLoader::to_fp32(__m256i raw) { + raw = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_cvtepi8_epi32(_mm256_castsi256_si128(raw))), + _mm_cvtepi8_epi32(_mm256_extractf128_si256(raw, 1)), 1); + return _mm256_cvtepi32_ps(raw); +} +template <> +__m256 VectorLoader::to_fp32(__m256i raw) { + raw = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_cvtepu16_epi32(_mm256_castsi256_si128(raw))), + _mm_cvtepu16_epi32(_mm256_extractf128_si256(raw, 1)), 1); + return _mm256_cvtepi32_ps(raw); +} +template <> +__m256 VectorLoader::to_fp32(__m256i raw) { + raw = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_cvtepi16_epi32(_mm256_castsi256_si128(raw))), + _mm_cvtepi16_epi32(_mm256_extractf128_si256(raw, 1)), 1); + return _mm256_cvtepi32_ps(raw); +} +template <> +__m256 VectorLoader::to_fp32(__m256i raw) { + return _mm256_cvtepi32_ps(raw); +} +template <> +__m256 VectorLoader::to_fp32(__m256i raw) { + return _mm256_insertf128_ps( + _mm256_castps128_ps256(_mm_cvtph_ps(_mm256_castsi256_si128(raw))), + _mm_cvtph_ps(_mm256_extractf128_si256(raw, 1)), 1); +} +template <> +__m256 VectorLoader::to_fp32(__m256i raw) { + // bfloat16 is essentially fp32 with mantissa truncated from 23 to 7 bits. + // can convert with << 16, which we fuse with initial shuffle into epi32 + // positions. 
+ __m256i shuf_hi32 = _mm256_setr_epi8( + -128, -128, 0, 1, -128, -128, 2, 3, -128, -128, 4, 5, -128, -128, 6, 7, + -128, -128, 0, 1, -128, -128, 2, 3, -128, -128, 4, 5, -128, -128, 6, 7); + return _mm256_castsi256_ps(_mm256_shuffle_epi8(raw, shuf_hi32)); +} +template <> +__m256 VectorLoader::to_fp32(__m256i raw) { + return _mm256_castsi256_ps(raw); +} +#else +template <> +__m128 VectorLoader::to_fp32(__m128i raw) { + return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(raw)); +} +template <> +__m128 VectorLoader::to_fp32(__m128i raw) { + return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(raw)); +} +template <> +__m128 VectorLoader::to_fp32(__m128i raw) { + return _mm_cvtepi32_ps(_mm_cvtepu16_epi32(raw)); +} +template <> +__m128 VectorLoader::to_fp32(__m128i raw) { + return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(raw)); +} +template <> +__m128 VectorLoader::to_fp32(__m128i raw) { + return _mm_cvtepi32_ps(raw); +} +template <> +__m128 VectorLoader::to_fp32(__m128i raw) { +#ifdef __F16C__ + return _mm_cvtph_ps(raw); +#else + // It is fairly trivial to convert from fp16 to fp32. + // The formats are defined as follows: + // + // fp16 :: 15=sign_bit, 14-10=exponent, 9-0=mantissa :: exp zero offset is 15 + // :: exponent of -15 (all 0) and +16 (all 1) are special numbers. + // fp32 :: 31=sign_bit, 30-23=exponent, 22-0=mantissa :: exp zero offset is + // 127 + // :: exponent of -127 (all 0) and +128 (all 1) are special numbers. + // + // Assuming the fp16 values is stored in the lower 16 bits of an int32 + // 'fp16_val'. + // + // fp16_mantissa = fp16_val & (2^10-1) + // fp32_mantissa = fp16_mantissa << 13 + // + // The exponent is a little trickier. 
+ // For normal numbers, the following works: + // fp16_exponent_with_10bit_left_shift = (fp16_val & ((2^5-1)<<10)) + // fp16_exponent_at_msb = fp16_exponent_with_10bit_left_shift << 17 + // The next line shifts in 1's from msb + // fp16_exponent_at_fp32_position = fp16_exponent_at_msb >> 4 + // The next line flips the 3 bits from [msb-1,msb-4] + // fp32_exponent = fp16_exponent_at_fp32_position ^ (7 << 27) + // This breaks for subnormals, nan and infinity. + // The only thing that breaks is the 3bit bit flip, which should + // happen for normal numbers, but should not happen otherwise. + // Since the bit flip can be done with an XOR of all 1's, we + // can make this happen by turning the XOR mask to all zeros + // when the fp16_exponent is either 0 or 31. + // + // ..move 16-bit input words to lower part of 32-bit positions. + __m128i shuf_lo32 = _mm_setr_epi8(0, 1, -128, -128, 2, 3, -128, -128, 4, 5, + -128, -128, 6, 7, -128, -128); + __m128i fp16_val = _mm_shuffle_epi8(raw, shuf_lo32); + // ..extract sign bit + __m128i fp32_sign = + _mm_slli_epi32(_mm_and_si128(fp16_val, _mm_set1_epi32(32768)), 16); + // ..extract fp16_mantissa and shift + __m128i fp16_mantissa = _mm_and_si128(fp16_val, _mm_set1_epi32(1023)); + __m128i fp32_mantissa = _mm_slli_epi32(fp16_mantissa, 13); + // ..extract fp16 exponent shifted 10bits to the left + __m128i fp16_exponent_sl10 = _mm_and_si128(fp16_val, _mm_set1_epi32(31744)); + __m128i fp16_exponent_all1_mask = + _mm_cmpeq_epi32(fp16_exponent_sl10, _mm_set1_epi32(31 << 10)); + __m128i fp16_exponent_all0_mask = + _mm_cmpeq_epi32(fp16_exponent_sl10, _mm_setzero_si128()); + __m128i fp16_denormal_mask = + _mm_or_si128(fp16_exponent_all0_mask, fp16_exponent_all1_mask); + __m128i fp32_exponent_before_xor = + _mm_and_si128(_mm_set1_epi32(2139095040), + _mm_srai_epi32(_mm_slli_epi32(fp16_exponent_sl10, 17), 4)); + __m128i fp32_exponent_xor_mask = + _mm_andnot_si128(fp16_denormal_mask, _mm_set1_epi32(7 << 27)); + __m128i fp32_exponent = + 
_mm_xor_si128(fp32_exponent_xor_mask, fp32_exponent_before_xor); + // ..or everything into one word + __m128i fp32_val = + _mm_or_si128(_mm_or_si128(fp32_sign, fp32_exponent), fp32_mantissa); + return _mm_castsi128_ps(fp32_val); +#endif +} +template <> +__m128 VectorLoader::to_fp32(__m128i raw) { + // bfloat16 is essentially fp32 with mantissa truncated from 23 to 7 bits. + // can convert with << 16, which we fuse with initial shuffle into epi32 + // positions. + __m128i shuf_hi32 = _mm_setr_epi8(-128, -128, 0, 1, -128, -128, 2, 3, -128, + -128, 4, 5, -128, -128, 6, 7); + return _mm_castsi128_ps(_mm_shuffle_epi8(raw, shuf_hi32)); +} +template <> +__m128 VectorLoader::to_fp32(__m128i raw) { + return _mm_castsi128_ps(raw); +} +#endif + +#ifdef __AVX2__ +template +__m256i VectorLoader::extract_right_1b_(const __m256i left) { + return _mm256_srli_si256(left, 1); +} +template +__m256i VectorLoader::extract_right_2b_(const __m256i left) { + return _mm256_srli_si256(left, 2); +} +template +__m256i VectorLoader::extract_right_3b_(const __m256i left) { + return _mm256_srli_si256(left, 3); +} +template +__m256i VectorLoader::extract_right_4b_(const __m256i left) { + return _mm256_srli_si256(left, 4); +} +template +__m256i VectorLoader::extract_right_6b_(const __m256i left) { + return _mm256_srli_si256(left, 6); +} +template +__m256i VectorLoader::extract_right_8b_(const __m256i left) { + return _mm256_srli_si256(left, 8); +} +#else +template +__m128i VectorLoader::extract_right_1b_(const __m128i left) { + return _mm_srli_si128(left, 1); +} +template +__m128i VectorLoader::extract_right_2b_(const __m128i left) { + return _mm_srli_si128(left, 2); +} +template +__m128i VectorLoader::extract_right_3b_(const __m128i left) { + return _mm_srli_si128(left, 3); +} +template +__m128i VectorLoader::extract_right_4b_(const __m128i left) { + return _mm_srli_si128(left, 4); +} +template +__m128i VectorLoader::extract_right_6b_(const __m128i left) { + return _mm_srli_si128(left, 6); +} 
+template +__m128i VectorLoader::extract_right_8b_(const __m128i left) { + return _mm_srli_si128(left, 8); +} +#endif + +#ifdef __AVX2__ +template +void VectorLoader::load1_1ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m256* left0, __m256* right0) { + __m256i raw = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + *left0 = to_fp32( + _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[0]))); + *right0 = to_fp32( + _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[1]))); +} +template +void VectorLoader::load1_2ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m256* left0, __m256* left1, __m256* right0, + __m256* right1) { + __m256i raw = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + *left0 = to_fp32( + _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[0]))); + *left1 = to_fp32( + _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[1]))); + *right0 = to_fp32( + _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[2]))); + *right1 = to_fp32( + _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[3]))); +} +template +void VectorLoader::load1_3ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m256* left0, __m256* left1, __m256* left2, + __m256* right0, __m256* right1, + __m256* right2) { + __m256i raw = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + *left0 = to_fp32( + _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[0]))); + *left1 = to_fp32( + _mm256_shuffle_epi8(raw, 
_mm256_broadcastsi128_si256(shuffle_masks[1]))); + *left2 = to_fp32( + _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[2]))); + *right0 = to_fp32( + _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[3]))); + *right1 = to_fp32( + _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[4]))); + *right2 = to_fp32( + _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[5]))); +} +template +void VectorLoader::load1_4ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m256* left0, __m256* left1, __m256* left2, + __m256* left3, __m256* right0, __m256* right1, + __m256* right2, __m256* right3) { + __m256i raw = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + *left0 = to_fp32( + _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[0]))); + *left1 = to_fp32( + _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[1]))); + *left2 = to_fp32( + _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[2]))); + *left3 = to_fp32( + _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[3]))); + *right0 = to_fp32( + _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[4]))); + *right1 = to_fp32( + _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[5]))); + *right2 = to_fp32( + _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[6]))); + *right3 = to_fp32( + _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[7]))); +} +template +void VectorLoader::load2_1ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m256* left0, __m256* right0) { + __m256i raw1 = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + __m256i raw2 = 
_mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 1)), 1); + __m256i mask = _mm256_broadcastsi128_si256(shuffle_masks[0]); + *left0 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); + *right0 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); +} +template +void VectorLoader::load2_2ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m256* left0, __m256* left1, __m256* right0, + __m256* right1) { + __m256i raw1 = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + __m256i raw2 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 2)), 1); + __m256i mask = _mm256_broadcastsi128_si256(shuffle_masks[0]); + *left0 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); + *right0 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); + mask = _mm256_broadcastsi128_si256(shuffle_masks[1]); + *left1 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); + *right1 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); +} +template +void VectorLoader::load2_3ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m256* left0, __m256* left1, __m256* left2, + __m256* right0, __m256* right1, + __m256* right2) { + __m256i raw1 = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + __m256i raw2 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 3)), 1); + __m256i mask = _mm256_broadcastsi128_si256(shuffle_masks[0]); + *left0 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); + *right0 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); + mask = 
_mm256_broadcastsi128_si256(shuffle_masks[1]); + *left1 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); + *right1 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); + mask = _mm256_broadcastsi128_si256(shuffle_masks[2]); + *left2 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); + *right2 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); +} +template +void VectorLoader::load2_4ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m256* left0, __m256* left1, __m256* left2, + __m256* left3, __m256* right0, __m256* right1, + __m256* right2, __m256* right3) { + __m256i raw1 = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + __m256i raw2 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 4))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 4)), 1); + __m256i mask = _mm256_broadcastsi128_si256(shuffle_masks[0]); + *left0 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); + *right0 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); + mask = _mm256_broadcastsi128_si256(shuffle_masks[1]); + *left1 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); + *right1 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); + mask = _mm256_broadcastsi128_si256(shuffle_masks[2]); + *left2 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); + *right2 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); + mask = _mm256_broadcastsi128_si256(shuffle_masks[3]); + *left3 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); + *right3 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); +} +template +void VectorLoader::load4_1ch(const T* lower_ptr, const T* upper_ptr, + int offset0, int offset1, int offset2, + int offset3, __m256* left0, __m256* right0) { + __m256i l0 = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + __m256i r0 = extract_right_1ch(l0); + __m256i l1, r1; + 
if (offset1 == offset0) { + l1 = l0; + r1 = r0; + } else { + l1 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); + r1 = extract_right_1ch(l1); + } + __m256i l2, r2; + if (offset2 == offset1) { + l2 = l1; + r2 = r1; + } else { + l2 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); + r2 = extract_right_1ch(l2); + } + __m256i l3, r3; + if (offset3 == offset2) { + l3 = l2; + r3 = r2; + } else { + l3 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); + r3 = extract_right_1ch(l3); + } + pack_1ch(&l0, &l1, &l2, &l3); + *left0 = to_fp32(l0); + pack_1ch(&r0, &r1, &r2, &r3); + *right0 = to_fp32(r0); +} +template +void VectorLoader::load4_2ch(const T* lower_ptr, const T* upper_ptr, + int offset0, int offset1, int offset2, + int offset3, __m256* left0, __m256* left1, + __m256* right0, __m256* right1) { + __m256i l0 = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + __m256i r0 = extract_right_2ch(l0); + __m256i l1, r1; + if (offset1 == offset0) { + l1 = l0; + r1 = r0; + } else { + l1 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); + r1 = extract_right_2ch(l1); + } + __m256i l2, r2; + if (offset2 == offset1) { + l2 = l1; + r2 = r1; + } else { + l2 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); + r2 = extract_right_2ch(l2); + } + __m256i l3, r3; + if (offset3 == offset2) { + l3 = l2; + r3 = r2; + } else { + l3 = 
_mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); + r3 = extract_right_2ch(l3); + } + pack_2ch(&l0, &l1, &l2, &l3); + *left0 = to_fp32(l0); + *left1 = to_fp32(l1); + pack_2ch(&r0, &r1, &r2, &r3); + *right0 = to_fp32(r0); + *right1 = to_fp32(r1); +} +template +void VectorLoader::load4_3ch(const T* lower_ptr, const T* upper_ptr, + int offset0, int offset1, int offset2, + int offset3, __m256* left0, __m256* left1, + __m256* left2, __m256* right0, __m256* right1, + __m256* right2) { + __m256i l0 = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + __m256i r0 = extract_right_3ch(l0); + __m256i l1, r1; + if (offset1 == offset0) { + l1 = l0; + r1 = r0; + } else { + l1 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); + r1 = extract_right_3ch(l1); + } + __m256i l2, r2; + if (offset2 == offset1) { + l2 = l1; + r2 = r1; + } else { + l2 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); + r2 = extract_right_3ch(l2); + } + __m256i l3, r3; + if (offset3 == offset2) { + l3 = l2; + r3 = r2; + } else { + l3 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); + r3 = extract_right_3ch(l3); + } + pack_3ch(&l0, &l1, &l2, &l3); + *left0 = to_fp32(l0); + *left1 = to_fp32(l1); + *left2 = to_fp32(l2); + pack_3ch(&r0, &r1, &r2, &r3); + *right0 = to_fp32(r0); + *right1 = to_fp32(r1); + *right2 = to_fp32(r2); +} +template +void VectorLoader::load4_4ch(const T* lower_ptr, const T* upper_ptr, + int offset0, int offset1, int offset2, + int offset3, 
__m256* left0, __m256* left1, + __m256* left2, __m256* left3, __m256* right0, + __m256* right1, __m256* right2, + __m256* right3) { + __m256i l0 = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + __m256i r0 = extract_right_4ch(l0); + __m256i l1, r1; + if (offset1 == offset0) { + l1 = l0; + r1 = r0; + } else { + l1 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); + r1 = extract_right_4ch(l1); + } + __m256i l2, r2; + if (offset2 == offset1) { + l2 = l1; + r2 = r1; + } else { + l2 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); + r2 = extract_right_4ch(l2); + } + __m256i l3, r3; + if (offset3 == offset2) { + l3 = l2; + r3 = r2; + } else { + l3 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); + r3 = extract_right_4ch(l3); + } + *left0 = to_fp32(l0); + *left1 = to_fp32(l1); + *left2 = to_fp32(l2); + *left3 = to_fp32(l3); + *right0 = to_fp32(r0); + *right1 = to_fp32(r1); + *right2 = to_fp32(r2); + *right3 = to_fp32(r3); +} +template +void VectorLoader::load8_1ch(const T* lower_ptr, const T* upper_ptr, + int offset0, int offset1, int offset2, + int offset3, __m256* left0, __m256* right0) { + __m256i l0 = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + __m256i r0 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 1)), 1); + __m256i l1, r1; + if (offset1 == offset0) { + l1 = l0; + r1 = r0; + } else { + l1 = 
_mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); + r1 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 1)), 1); + } + __m256i l2, r2; + if (offset2 == offset1) { + l2 = l1; + r2 = r1; + } else { + l2 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); + r2 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 1)), 1); + } + __m256i l3, r3; + if (offset3 == offset2) { + l3 = l2; + r3 = r2; + } else { + l3 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); + r3 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 1)), 1); + } + pack_1ch(&l0, &l1, &l2, &l3); + *left0 = to_fp32(l0); + pack_1ch(&r0, &r1, &r2, &r3); + *right0 = to_fp32(r0); +} +template +void VectorLoader::load8_2ch(const T* lower_ptr, const T* upper_ptr, + int offset0, int offset1, int offset2, + int offset3, __m256* left0, __m256* left1, + __m256* right0, __m256* right1) { + __m256i l0 = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + __m256i r0 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 2)), 1); + __m256i l1, r1; + if (offset1 == offset0) { + l1 = l0; + r1 = r0; + } else { + l1 = _mm256_insertf128_si256( + _mm256_castsi128_si256( 
+ _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); + r1 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 2)), 1); + } + __m256i l2, r2; + if (offset2 == offset1) { + l2 = l1; + r2 = r1; + } else { + l2 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); + r2 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 2)), 1); + } + __m256i l3, r3; + if (offset3 == offset2) { + l3 = l2; + r3 = r2; + } else { + l3 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); + r3 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 2)), 1); + } + pack_2ch(&l0, &l1, &l2, &l3); + *left0 = to_fp32(l0); + *left1 = to_fp32(l1); + pack_2ch(&r0, &r1, &r2, &r3); + *right0 = to_fp32(r0); + *right1 = to_fp32(r1); +} +template +void VectorLoader::load8_3ch(const T* lower_ptr, const T* upper_ptr, + int offset0, int offset1, int offset2, + int offset3, __m256* left0, __m256* left1, + __m256* left2, __m256* right0, __m256* right1, + __m256* right2) { + __m256i l0 = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + __m256i r0 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 3)), 1); + __m256i l1, r1; + if (offset1 == offset0) { + l1 = l0; + r1 = r0; + } else { + l1 = 
_mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); + r1 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 3)), 1); + } + __m256i l2, r2; + if (offset2 == offset1) { + l2 = l1; + r2 = r1; + } else { + l2 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); + r2 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 3)), 1); + } + __m256i l3, r3; + if (offset3 == offset2) { + l3 = l2; + r3 = r2; + } else { + l3 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); + r3 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 3)), 1); + } + pack_3ch(&l0, &l1, &l2, &l3); + *left0 = to_fp32(l0); + *left1 = to_fp32(l1); + *left2 = to_fp32(l2); + pack_3ch(&r0, &r1, &r2, &r3); + *right0 = to_fp32(r0); + *right1 = to_fp32(r1); + *right2 = to_fp32(r2); +} +template +void VectorLoader::load8_4ch(const T* lower_ptr, const T* upper_ptr, + int offset0, int offset1, int offset2, + int offset3, __m256* left0, __m256* left1, + __m256* left2, __m256* left3, __m256* right0, + __m256* right1, __m256* right2, + __m256* right3) { + __m256i l0 = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + __m256i r0 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 4))), + 
_mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 4)), 1); + __m256i l1, r1; + if (offset1 == offset0) { + l1 = l0; + r1 = r0; + } else { + l1 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); + r1 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 4))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 4)), 1); + } + __m256i l2, r2; + if (offset2 == offset1) { + l2 = l1; + r2 = r1; + } else { + l2 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); + r2 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 4))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 4)), 1); + } + __m256i l3, r3; + if (offset3 == offset2) { + l3 = l2; + r3 = r2; + } else { + l3 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); + r3 = _mm256_insertf128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 4))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 4)), 1); + } + *left0 = to_fp32(l0); + *left1 = to_fp32(l1); + *left2 = to_fp32(l2); + *left3 = to_fp32(l3); + *right0 = to_fp32(r0); + *right1 = to_fp32(r1); + *right2 = to_fp32(r2); + *right3 = to_fp32(r3); +} +#else +template +void VectorLoader::load1_1ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m128* tl0, __m128* bl0, __m128* tr0, + __m128* br0) { + __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + *bl0 = 
to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); +} +template +void VectorLoader::load1_2ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m128* tl0, __m128* tl1, __m128* bl0, + __m128* bl1, __m128* tr0, __m128* tr1, + __m128* br0, __m128* br1) { + __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); + *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); + *tr1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); + *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); + *br1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); +} +template +void VectorLoader::load1_3ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m128* tl0, __m128* tl1, __m128* tl2, + __m128* bl0, __m128* bl1, __m128* bl2, + __m128* tr0, __m128* tr1, __m128* tr2, + __m128* br0, __m128* br1, __m128* br2) { + __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); + *tl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); + *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); + *tr1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[4])); + *tr2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[5])); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); + *bl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); + *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); + *br1 = to_fp32(_mm_shuffle_epi8(raw, 
shuffle_masks[4])); + *br2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[5])); +} +template +void VectorLoader::load1_4ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m128* tl0, __m128* tl1, __m128* tl2, + __m128* tl3, __m128* bl0, __m128* bl1, + __m128* bl2, __m128* bl3, __m128* tr0, + __m128* tr1, __m128* tr2, __m128* tr3, + __m128* br0, __m128* br1, __m128* br2, + __m128* br3) { + __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); + *tl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); + *tl3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); + *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[4])); + *tr1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[5])); + *tr2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[6])); + *tr3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[7])); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); + *bl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); + *bl3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); + *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[4])); + *br1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[5])); + *br2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[6])); + *br3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[7])); +} +template +void VectorLoader::load2_1ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m128* tl0, __m128* bl0, __m128* tr0, + __m128* br0) { + __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 1)); + *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + *bl0 = 
to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 1)); + *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); +} +template +void VectorLoader::load2_2ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m128* tl0, __m128* tl1, __m128* bl0, + __m128* bl1, __m128* tr0, __m128* tr1, + __m128* br0, __m128* br1) { + __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); + raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 2)); + *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + *tr1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 2)); + *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + *br1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); +} +template +void VectorLoader::load2_3ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m128* tl0, __m128* tl1, __m128* tl2, + __m128* bl0, __m128* bl1, __m128* bl2, + __m128* tr0, __m128* tr1, __m128* tr2, + __m128* br0, __m128* br1, __m128* br2) { + __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); + *tl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); + raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 3)); + *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + *tr1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); + *tr2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + *bl0 = 
to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); + *bl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 3)); + *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + *br1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); + *br2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); +} +template +void VectorLoader::load2_4ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m128* tl0, __m128* tl1, __m128* tl2, + __m128* tl3, __m128* bl0, __m128* bl1, + __m128* bl2, __m128* bl3, __m128* tr0, + __m128* tr1, __m128* tr2, __m128* tr3, + __m128* br0, __m128* br1, __m128* br2, + __m128* br3) { + __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); + *tl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); + *tl3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); + raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 4)); + *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + *tr1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); + *tr2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); + *tr3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); + *bl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); + *bl3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 4)); + *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); + *br1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); + *br2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); + *br3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); +} +template +void VectorLoader::load4_1ch(const 
T* lower_ptr, const T* upper_ptr, + int offset0, int offset1, int offset2, + int offset3, __m128* tl0, __m128* bl0, + __m128* tr0, __m128* br0) { + __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + __m128i itr0 = extract_right_1ch(itl0); + __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + __m128i ibr0 = extract_right_1ch(ibl0); + __m128i itl1, itr1; + __m128i ibl1, ibr1; + if (offset1 == offset0) { + itl1 = itl0; + itr1 = itr0; + ibl1 = ibl0; + ibr1 = ibr0; + } else { + itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); + itr1 = extract_right_1ch(itl1); + ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); + ibr1 = extract_right_1ch(ibl1); + } + __m128i itl2, itr2; + __m128i ibl2, ibr2; + if (offset2 == offset1) { + itl2 = itl1; + itr2 = itr1; + ibl2 = ibl1; + ibr2 = ibr1; + } else { + itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); + itr2 = extract_right_1ch(itl2); + ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); + ibr2 = extract_right_1ch(ibl2); + } + __m128i itl3, itr3; + __m128i ibl3, ibr3; + if (offset3 == offset2) { + itl3 = itl2; + itr3 = itr2; + ibl3 = ibl2; + ibr3 = ibr2; + } else { + itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); + itr3 = extract_right_1ch(itl3); + ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); + ibr3 = extract_right_1ch(ibl3); + } + pack_1ch(&itl0, &itl1, &itl2, &itl3); + *tl0 = to_fp32(itl0); + pack_1ch(&itr0, &itr1, &itr2, &itr3); + *tr0 = to_fp32(itr0); + pack_1ch(&ibl0, &ibl1, &ibl2, &ibl3); + *bl0 = to_fp32(ibl0); + pack_1ch(&ibr0, &ibr1, &ibr2, &ibr3); + *br0 = to_fp32(ibr0); +} +template +void VectorLoader::load4_2ch(const T* lower_ptr, const T* upper_ptr, + int offset0, int offset1, int offset2, + int offset3, __m128* tl0, __m128* tl1, + __m128* bl0, __m128* bl1, __m128* tr0, + __m128* tr1, __m128* br0, __m128* br1) { + __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + __m128i itr0 = extract_right_2ch(itl0); + __m128i ibl0 = 
_mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + __m128i ibr0 = extract_right_2ch(ibl0); + __m128i itl1, itr1; + __m128i ibl1, ibr1; + if (offset1 == offset0) { + itl1 = itl0; + itr1 = itr0; + ibl1 = ibl0; + ibr1 = ibr0; + } else { + itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); + itr1 = extract_right_2ch(itl1); + ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); + ibr1 = extract_right_2ch(ibl1); + } + __m128i itl2, itr2; + __m128i ibl2, ibr2; + if (offset2 == offset1) { + itl2 = itl1; + itr2 = itr1; + ibl2 = ibl1; + ibr2 = ibr1; + } else { + itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); + itr2 = extract_right_2ch(itl2); + ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); + ibr2 = extract_right_2ch(ibl2); + } + __m128i itl3, itr3; + __m128i ibl3, ibr3; + if (offset3 == offset2) { + itl3 = itl2; + itr3 = itr2; + ibl3 = ibl2; + ibr3 = ibr2; + } else { + itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); + itr3 = extract_right_2ch(itl3); + ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); + ibr3 = extract_right_2ch(ibl3); + } + pack_2ch(&itl0, &itl1, &itl2, &itl3); + *tl0 = to_fp32(itl0); + *tl1 = to_fp32(itl1); + pack_2ch(&itr0, &itr1, &itr2, &itr3); + *tr0 = to_fp32(itr0); + *tr1 = to_fp32(itr1); + pack_2ch(&ibl0, &ibl1, &ibl2, &ibl3); + *bl0 = to_fp32(ibl0); + *bl1 = to_fp32(ibl1); + pack_2ch(&ibr0, &ibr1, &ibr2, &ibr3); + *br0 = to_fp32(ibr0); + *br1 = to_fp32(ibr1); +} +template +void VectorLoader::load4_3ch(const T* lower_ptr, const T* upper_ptr, + int offset0, int offset1, int offset2, + int offset3, __m128* tl0, __m128* tl1, + __m128* tl2, __m128* bl0, __m128* bl1, + __m128* bl2, __m128* tr0, __m128* tr1, + __m128* tr2, __m128* br0, __m128* br1, + __m128* br2) { + __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + __m128i itr0 = extract_right_3ch(itl0); + __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + __m128i ibr0 = extract_right_3ch(ibl0); + __m128i itl1, itr1; + __m128i 
ibl1, ibr1; + if (offset1 == offset0) { + itl1 = itl0; + itr1 = itr0; + ibl1 = ibl0; + ibr1 = ibr0; + } else { + itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); + itr1 = extract_right_3ch(itl1); + ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); + ibr1 = extract_right_3ch(ibl1); + } + __m128i itl2, itr2; + __m128i ibl2, ibr2; + if (offset2 == offset1) { + itl2 = itl1; + itr2 = itr1; + ibl2 = ibl1; + ibr2 = ibr1; + } else { + itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); + itr2 = extract_right_3ch(itl2); + ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); + ibr2 = extract_right_3ch(ibl2); + } + __m128i itl3, itr3; + __m128i ibl3, ibr3; + if (offset3 == offset2) { + itl3 = itl2; + itr3 = itr2; + ibl3 = ibl2; + ibr3 = ibr2; + } else { + itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); + itr3 = extract_right_3ch(itl3); + ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); + ibr3 = extract_right_3ch(ibl3); + } + pack_3ch(&itl0, &itl1, &itl2, &itl3); + *tl0 = to_fp32(itl0); + *tl1 = to_fp32(itl1); + *tl2 = to_fp32(itl2); + pack_3ch(&itr0, &itr1, &itr2, &itr3); + *tr0 = to_fp32(itr0); + *tr1 = to_fp32(itr1); + *tr2 = to_fp32(itr2); + pack_3ch(&ibl0, &ibl1, &ibl2, &ibl3); + *bl0 = to_fp32(ibl0); + *bl1 = to_fp32(ibl1); + *bl2 = to_fp32(ibl2); + pack_3ch(&ibr0, &ibr1, &ibr2, &ibr3); + *br0 = to_fp32(ibr0); + *br1 = to_fp32(ibr1); + *br2 = to_fp32(ibr2); +} +template +void VectorLoader::load4_4ch(const T* lower_ptr, const T* upper_ptr, + int offset0, int offset1, int offset2, + int offset3, __m128* tl0, __m128* tl1, + __m128* tl2, __m128* tl3, __m128* bl0, + __m128* bl1, __m128* bl2, __m128* bl3, + __m128* tr0, __m128* tr1, __m128* tr2, + __m128* tr3, __m128* br0, __m128* br1, + __m128* br2, __m128* br3) { + __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + __m128i itr0 = extract_right_4ch(itl0); + __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + __m128i ibr0 = extract_right_4ch(ibl0); + 
__m128i itl1, itr1; + __m128i ibl1, ibr1; + if (offset1 == offset0) { + itl1 = itl0; + itr1 = itr0; + ibl1 = ibl0; + ibr1 = ibr0; + } else { + itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); + itr1 = extract_right_4ch(itl1); + ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); + ibr1 = extract_right_4ch(ibl1); + } + __m128i itl2, itr2; + __m128i ibl2, ibr2; + if (offset2 == offset1) { + itl2 = itl1; + itr2 = itr1; + ibl2 = ibl1; + ibr2 = ibr1; + } else { + itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); + itr2 = extract_right_4ch(itl2); + ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); + ibr2 = extract_right_4ch(ibl2); + } + __m128i itl3, itr3; + __m128i ibl3, ibr3; + if (offset3 == offset2) { + itl3 = itl2; + itr3 = itr2; + ibl3 = ibl2; + ibr3 = ibr2; + } else { + itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); + itr3 = extract_right_4ch(itl3); + ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); + ibr3 = extract_right_4ch(ibl3); + } + *tl0 = to_fp32(itl0); + *tl1 = to_fp32(itl1); + *tl2 = to_fp32(itl2); + *tl3 = to_fp32(itl3); + *tr0 = to_fp32(itr0); + *tr1 = to_fp32(itr1); + *tr2 = to_fp32(itr2); + *tr3 = to_fp32(itr3); + *bl0 = to_fp32(ibl0); + *bl1 = to_fp32(ibl1); + *bl2 = to_fp32(ibl2); + *bl3 = to_fp32(ibl3); + *br0 = to_fp32(ibr0); + *br1 = to_fp32(ibr1); + *br2 = to_fp32(ibr2); + *br3 = to_fp32(ibr3); +} +template +void VectorLoader::load8_1ch(const T* lower_ptr, const T* upper_ptr, + int offset0, int offset1, int offset2, + int offset3, __m128* tl0, __m128* bl0, + __m128* tr0, __m128* br0) { + __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + __m128i itr0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 1)); + __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + __m128i ibr0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 1)); + __m128i itl1, itr1; + __m128i ibl1, ibr1; + if (offset1 == offset0) { + itl1 = itl0; + itr1 = itr0; + ibl1 = ibl0; + ibr1 = ibr0; + } else { + 
itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); + itr1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 1)); + ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); + ibr1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 1)); + } + __m128i itl2, itr2; + __m128i ibl2, ibr2; + if (offset2 == offset1) { + itl2 = itl1; + itr2 = itr1; + ibl2 = ibl1; + ibr2 = ibr1; + } else { + itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); + itr2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 1)); + ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); + ibr2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 1)); + } + __m128i itl3, itr3; + __m128i ibl3, ibr3; + if (offset3 == offset2) { + itl3 = itl2; + itr3 = itr2; + ibl3 = ibl2; + ibr3 = ibr2; + } else { + itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); + itr3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 1)); + ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); + ibr3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 1)); + } + pack_1ch(&itl0, &itl1, &itl2, &itl3); + *tl0 = to_fp32(itl0); + pack_1ch(&itr0, &itr1, &itr2, &itr3); + *tr0 = to_fp32(itr0); + pack_1ch(&ibl0, &ibl1, &ibl2, &ibl3); + *bl0 = to_fp32(ibl0); + pack_1ch(&ibr0, &ibr1, &ibr2, &ibr3); + *br0 = to_fp32(ibr0); +} +template +void VectorLoader::load8_2ch(const T* lower_ptr, const T* upper_ptr, + int offset0, int offset1, int offset2, + int offset3, __m128* tl0, __m128* tl1, + __m128* bl0, __m128* bl1, __m128* tr0, + __m128* tr1, __m128* br0, __m128* br1) { + __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + __m128i itr0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 2)); + __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + __m128i ibr0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 2)); + __m128i itl1, itr1; + __m128i ibl1, ibr1; + if (offset1 == offset0) { + itl1 = itl0; + itr1 = itr0; + ibl1 = ibl0; + ibr1 = ibr0; + } else { + itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + 
offset1)); + itr1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 2)); + ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); + ibr1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 2)); + } + __m128i itl2, itr2; + __m128i ibl2, ibr2; + if (offset2 == offset1) { + itl2 = itl1; + itr2 = itr1; + ibl2 = ibl1; + ibr2 = ibr1; + } else { + itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); + itr2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 2)); + ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); + ibr2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 2)); + } + __m128i itl3, itr3; + __m128i ibl3, ibr3; + if (offset3 == offset2) { + itl3 = itl2; + itr3 = itr2; + ibl3 = ibl2; + ibr3 = ibr2; + } else { + itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); + itr3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 2)); + ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); + ibr3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 2)); + } + pack_2ch(&itl0, &itl1, &itl2, &itl3); + *tl0 = to_fp32(itl0); + *tl1 = to_fp32(itl1); + pack_2ch(&itr0, &itr1, &itr2, &itr3); + *tr0 = to_fp32(itr0); + *tr1 = to_fp32(itr1); + pack_2ch(&ibl0, &ibl1, &ibl2, &ibl3); + *bl0 = to_fp32(ibl0); + *bl1 = to_fp32(ibl1); + pack_2ch(&ibr0, &ibr1, &ibr2, &ibr3); + *br0 = to_fp32(ibr0); + *br1 = to_fp32(ibr1); +} +template +void VectorLoader::load8_3ch(const T* lower_ptr, const T* upper_ptr, + int offset0, int offset1, int offset2, + int offset3, __m128* tl0, __m128* tl1, + __m128* tl2, __m128* bl0, __m128* bl1, + __m128* bl2, __m128* tr0, __m128* tr1, + __m128* tr2, __m128* br0, __m128* br1, + __m128* br2) { + __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + __m128i itr0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 3)); + __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + __m128i ibr0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 3)); + __m128i itl1, itr1; + __m128i ibl1, ibr1; + if (offset1 == offset0) { + itl1 = 
itl0; + itr1 = itr0; + ibl1 = ibl0; + ibr1 = ibr0; + } else { + itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); + itr1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 3)); + ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); + ibr1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 3)); + } + __m128i itl2, itr2; + __m128i ibl2, ibr2; + if (offset2 == offset1) { + itl2 = itl1; + itr2 = itr1; + ibl2 = ibl1; + ibr2 = ibr1; + } else { + itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); + itr2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 3)); + ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); + ibr2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 3)); + } + __m128i itl3, itr3; + __m128i ibl3, ibr3; + if (offset3 == offset2) { + itl3 = itl2; + itr3 = itr2; + ibl3 = ibl2; + ibr3 = ibr2; + } else { + itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); + itr3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 3)); + ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); + ibr3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 3)); + } + pack_3ch(&itl0, &itl1, &itl2, &itl3); + *tl0 = to_fp32(itl0); + *tl1 = to_fp32(itl1); + *tl2 = to_fp32(itl2); + pack_3ch(&itr0, &itr1, &itr2, &itr3); + *tr0 = to_fp32(itr0); + *tr1 = to_fp32(itr1); + *tr2 = to_fp32(itr2); + pack_3ch(&ibl0, &ibl1, &ibl2, &ibl3); + *bl0 = to_fp32(ibl0); + *bl1 = to_fp32(ibl1); + *bl2 = to_fp32(ibl2); + pack_3ch(&ibr0, &ibr1, &ibr2, &ibr3); + *br0 = to_fp32(ibr0); + *br1 = to_fp32(ibr1); + *br2 = to_fp32(ibr2); +} +template +void VectorLoader::load8_4ch(const T* lower_ptr, const T* upper_ptr, + int offset0, int offset1, int offset2, + int offset3, __m128* tl0, __m128* tl1, + __m128* tl2, __m128* tl3, __m128* bl0, + __m128* bl1, __m128* bl2, __m128* bl3, + __m128* tr0, __m128* tr1, __m128* tr2, + __m128* tr3, __m128* br0, __m128* br1, + __m128* br2, __m128* br3) { + __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + __m128i itr0 = 
_mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 4)); + __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + __m128i ibr0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 4)); + __m128i itl1, itr1; + __m128i ibl1, ibr1; + if (offset1 == offset0) { + itl1 = itl0; + itr1 = itr0; + ibl1 = ibl0; + ibr1 = ibr0; + } else { + itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); + itr1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 4)); + ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); + ibr1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 4)); + } + __m128i itl2, itr2; + __m128i ibl2, ibr2; + if (offset2 == offset1) { + itl2 = itl1; + itr2 = itr1; + ibl2 = ibl1; + ibr2 = ibr1; + } else { + itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); + itr2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 4)); + ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); + ibr2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 4)); + } + __m128i itl3, itr3; + __m128i ibl3, ibr3; + if (offset3 == offset2) { + itl3 = itl2; + itr3 = itr2; + ibl3 = ibl2; + ibr3 = ibr2; + } else { + itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); + itr3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 4)); + ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); + ibr3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 4)); + } + *tl0 = to_fp32(itl0); + *tl1 = to_fp32(itl1); + *tl2 = to_fp32(itl2); + *tl3 = to_fp32(itl3); + *tr0 = to_fp32(itr0); + *tr1 = to_fp32(itr1); + *tr2 = to_fp32(itr2); + *tr3 = to_fp32(itr3); + *bl0 = to_fp32(ibl0); + *bl1 = to_fp32(ibl1); + *bl2 = to_fp32(ibl2); + *bl3 = to_fp32(ibl3); + *br0 = to_fp32(ibr0); + *br1 = to_fp32(ibr1); + *br2 = to_fp32(ibr2); + *br3 = to_fp32(ibr3); +} +#endif + +// +// This class stores 4 pixels with n channels packed into n SSE vector words. +// Pixel values are converted to type U and packed before storage. 
+// Output type U must be one of uint8, int8, uint16, int16, int32, Eigen::half, +// bfloat16 or float. +// + +template +class VectorWriter { + public: + // convert 4 fp32 words to type U. + // this function calls clip. + // resulting words are packed. + // U must be one of uint8, int8, uint16, int16, int32, Eigen::half, bfloat16 + // or float. + __m128i from_fp32(__m128 vec); + + // converts from fp32 to U by calling method from_fp32(...) + // writes 4 pixels with 1 channel to destination. + void write_1ch(U* destination, __m128* vec); + + // converts from fp32 to U by calling method from_fp32(...) + // writes 4 pixels with 2 channels to destination. + void write_2ch(U* destination, __m128* vec); + + // converts from fp32 to U by calling method from_fp32(...) + // writes 4 pixels with 3 channels to destination. + void write_3ch(U* destination, __m128* vec); + + // converts from fp32 to U by calling method from_fp32(...) + // writes 4 pixels with 4 channels to destination. + void write_4ch(U* destination, __m128* vec); + + private: + // clip 4 fp32 words to prevent overflow when converting to type U. + __m128 clip_(__m128 vec) { + // default is to do nothing, since the packing intrinsics include clipping.
+ return vec; + } + void write_1b_1ch(U* destination, __m128* vec) { + __m128i ivec = from_fp32(vec[0]); + _mm_store_ss((float*)(destination), _mm_castsi128_ps(ivec)); + } + void write_2b_1ch(U* destination, __m128* vec) { + __m128i ivec = from_fp32(vec[0]); + _mm_store_sd((double*)(destination), _mm_castsi128_pd(ivec)); + } + void write_4b_1ch(U* destination, __m128* vec) { + __m128i ivec = from_fp32(vec[0]); + _mm_storeu_si128((__m128i*)(destination), ivec); + } + void write_1b_2ch(U* destination, __m128* vec) { + __m128i ivec1 = from_fp32(vec[0]); + __m128i ivec2 = from_fp32(vec[1]); + __m128i mask = _mm_setr_epi32(-1, 0, 0, 0); + ivec1 = _mm_or_si128(_mm_and_si128(mask, ivec1), + _mm_slli_si128(_mm_and_si128(mask, ivec2), 4)); + _mm_store_sd((double*)(destination), _mm_castsi128_pd(ivec1)); + } + void write_2b_2ch(U* destination, __m128* vec) { + __m128i ivec1 = from_fp32(vec[0]); + __m128i ivec2 = from_fp32(vec[1]); + __m128i mask = _mm_setr_epi32(-1, -1, 0, 0); + ivec1 = _mm_or_si128(_mm_and_si128(mask, ivec1), + _mm_slli_si128(_mm_and_si128(mask, ivec2), 8)); + _mm_storeu_si128((__m128i*)(destination), ivec1); + } + void write_4b_2ch(U* destination, __m128* vec) { + __m128i ivec1 = from_fp32(vec[0]); + __m128i ivec2 = from_fp32(vec[1]); + _mm_storeu_si128((__m128i*)(destination), ivec1); + _mm_storeu_si128((__m128i*)(destination + 4), ivec2); + } + void write_1b_3ch(U* destination, __m128* vec) { + __m128i ivec1 = from_fp32(vec[0]); + __m128i ivec2 = from_fp32(vec[1]); + __m128i mask = _mm_setr_epi32(-1, 0, 0, 0); + ivec1 = _mm_or_si128(_mm_and_si128(mask, ivec1), + _mm_slli_si128(_mm_and_si128(mask, ivec2), 4)); + _mm_store_sd((double*)(destination), _mm_castsi128_pd(ivec1)); + __m128i ivec3 = from_fp32(vec[2]); + _mm_store_ss((float*)(destination + 8), _mm_castsi128_ps(ivec3)); + } + void write_2b_3ch(U* destination, __m128* vec) { + __m128i ivec1 = from_fp32(vec[0]); + __m128i ivec2 = from_fp32(vec[1]); + __m128i mask = _mm_setr_epi32(-1, -1, 0, 0); + 
ivec1 = _mm_or_si128(_mm_and_si128(mask, ivec1), + _mm_slli_si128(_mm_and_si128(mask, ivec2), 8)); + _mm_storeu_si128((__m128i*)(destination), ivec1); + __m128i ivec3 = from_fp32(vec[2]); + _mm_store_sd((double*)(destination + 8), _mm_castsi128_pd(ivec3)); + } + void write_4b_3ch(U* destination, __m128* vec) { + __m128i ivec1 = from_fp32(vec[0]); + __m128i ivec2 = from_fp32(vec[1]); + __m128i ivec3 = from_fp32(vec[2]); + _mm_storeu_si128((__m128i*)(destination), ivec1); + _mm_storeu_si128((__m128i*)(destination + 4), ivec2); + _mm_storeu_si128((__m128i*)(destination + 8), ivec3); + } + void write_1b_4ch(U* destination, __m128* vec) { + __m128i ivec1 = from_fp32(vec[0]); + __m128i ivec2 = from_fp32(vec[1]); + __m128i ivec3 = from_fp32(vec[2]); + __m128i ivec4 = from_fp32(vec[3]); + __m128i mask = _mm_setr_epi32(-1, 0, 0, 0); + __m128i ivec = _mm_and_si128(mask, ivec1); + ivec = _mm_or_si128(ivec, _mm_slli_si128(_mm_and_si128(mask, ivec2), 4)); + ivec = _mm_or_si128(ivec, _mm_slli_si128(_mm_and_si128(mask, ivec3), 8)); + ivec = _mm_or_si128(ivec, _mm_slli_si128(_mm_and_si128(mask, ivec4), 12)); + _mm_storeu_si128((__m128i*)(destination), ivec); + } + void write_2b_4ch(U* destination, __m128* vec) { + __m128i ivec1 = from_fp32(vec[0]); + __m128i ivec2 = from_fp32(vec[1]); + __m128i ivec3 = from_fp32(vec[2]); + __m128i ivec4 = from_fp32(vec[3]); + __m128i mask = _mm_setr_epi32(-1, -1, 0, 0); + __m128i ivec = _mm_and_si128(mask, ivec1); + ivec = _mm_or_si128(ivec, _mm_slli_si128(_mm_and_si128(mask, ivec2), 8)); + _mm_storeu_si128((__m128i*)(destination), ivec); + ivec = _mm_and_si128(mask, ivec3); + ivec = _mm_or_si128(ivec, _mm_slli_si128(_mm_and_si128(mask, ivec4), 8)); + _mm_storeu_si128((__m128i*)(destination + 8), ivec); + } + void write_4b_4ch(U* destination, __m128* vec) { + __m128i ivec1 = from_fp32(vec[0]); + __m128i ivec2 = from_fp32(vec[1]); + __m128i ivec3 = from_fp32(vec[2]); + __m128i ivec4 = from_fp32(vec[3]); + _mm_storeu_si128((__m128i*)(destination), 
ivec1); + _mm_storeu_si128((__m128i*)(destination + 4), ivec2); + _mm_storeu_si128((__m128i*)(destination + 8), ivec3); + _mm_storeu_si128((__m128i*)(destination + 12), ivec4); + } +}; + +template <> +__m128 VectorWriter::clip_(__m128 vec) { + // clip against low limit, -2147483648. + // we round up to nearest number that can be represented as float. + __m128 lt_val = _mm_set1_ps(-2147483520.0f); + __m128 lt_mask = _mm_cmplt_ps(vec, lt_val); + vec = _mm_or_ps(_mm_andnot_ps(lt_mask, vec), _mm_and_ps(lt_mask, lt_val)); + // clip against high limit, 2147483647. + // we round down to nearest number that can be represented as float. + __m128 gt_val = _mm_set1_ps(2147483520.0f); + __m128 gt_mask = _mm_cmpgt_ps(vec, gt_val); + vec = _mm_or_ps(_mm_andnot_ps(gt_mask, vec), _mm_and_ps(gt_mask, gt_val)); + return vec; +} +template <> +__m128 VectorWriter::clip_(__m128 vec) { + // clip against low limit, -65504.0f; + __m128 lt_val = _mm_set1_ps(-65504.0f); + __m128 lt_mask = _mm_cmplt_ps(vec, lt_val); + vec = _mm_or_ps(_mm_andnot_ps(lt_mask, vec), _mm_and_ps(lt_mask, lt_val)); + // clip against high limit, 65504.0f.
+ __m128 gt_val = _mm_set1_ps(65504.0f); + __m128 gt_mask = _mm_cmpgt_ps(vec, gt_val); + vec = _mm_or_ps(_mm_andnot_ps(gt_mask, vec), _mm_and_ps(gt_mask, gt_val)); + return vec; +} + +template <> +__m128i VectorWriter::from_fp32(__m128 vec) { + __m128i ivec = _mm_cvttps_epi32(vec); + ivec = _mm_packs_epi32(ivec, ivec); + return _mm_packus_epi16(ivec, ivec); +} +template <> +__m128i VectorWriter::from_fp32(__m128 vec) { + __m128i ivec = _mm_cvttps_epi32(vec); + ivec = _mm_packs_epi32(ivec, ivec); + return _mm_packs_epi16(ivec, ivec); +} +template <> +__m128i VectorWriter::from_fp32(__m128 vec) { + __m128i ivec = _mm_cvttps_epi32(vec); + return _mm_packus_epi32(ivec, ivec); +} +template <> +__m128i VectorWriter::from_fp32(__m128 vec) { + __m128i ivec = _mm_cvttps_epi32(vec); + return _mm_packs_epi32(ivec, ivec); +} +template <> +__m128i VectorWriter::from_fp32(__m128 vec) { + return _mm_cvttps_epi32(clip_(vec)); +} +template <> +__m128i VectorWriter::from_fp32(__m128 vec) { +#ifdef __F16C__ + return _mm_cvtps_ph(vec, _MM_FROUND_TO_ZERO); +#else + // Emulation of _mm_cvtps_ph(vec, _MM_FROUND_TO_ZERO) intrinsic. + // + // fp16 :: 15=sign_bit, 14-10=exponent, 9-0=mantissa :: exp zero offset is 15 + // :: exponent of -15 (all 0) and +16 (all 1) are special numbers. + // fp32 :: 31=sign_bit, 30-23=exponent, 22-0=mantissa :: exp zero offset is + // 127 + // :: exponent of -127 (all 0) and +128 (all 1) are special numbers. 
+ // + __m128i hw = _mm_castps_si128(vec); + // ..extract fp32 exponent and mantissa + __m128i fp16_sign_bit_msb = _mm_and_si128(_mm_set1_epi32(-2147483648), hw); + __m128i fp32_exponent_lsb = + _mm_and_si128(_mm_set1_epi32(255), _mm_srli_epi32(hw, 23)); + __m128i fp32_mantissa = _mm_and_si128(_mm_set1_epi32(8388607), hw); + // ..test for NaN + __m128i exponent_ones = + _mm_cmpeq_epi32(fp32_exponent_lsb, _mm_set1_epi32(255)); + __m128i mantissa_zero = _mm_cmpeq_epi32(fp32_mantissa, _mm_setzero_si128()); + __m128i infinity_mask = _mm_and_si128(mantissa_zero, exponent_ones); + // ..have to test for NaN on fp32 bits to avoid converting NaN to infinity + __m128i NaN_mask = _mm_andnot_si128(mantissa_zero, exponent_ones); + // ..compensate for exponent zero offset difference + __m128i fp16_exponent_lsb = + _mm_sub_epi32(fp32_exponent_lsb, _mm_set1_epi32(112)); + // ..clip output if fp16_exponent > 30 + __m128i saturated_mask = _mm_andnot_si128( + exponent_ones, _mm_cmpgt_epi32(fp16_exponent_lsb, _mm_set1_epi32(30))); + // ..generate subnormal number if fp16_exponent == 0 + // ..flush to zero if fp16_exponent < 0 + __m128i subnormal_mask = + _mm_cmpeq_epi32(fp16_exponent_lsb, _mm_setzero_si128()); + __m128i underflow_mask = + _mm_cmplt_epi32(fp16_exponent_lsb, _mm_setzero_si128()); + __m128i fp16_mantissa = _mm_srli_epi32(fp32_mantissa, 13); + // ..handle abnormal values + __m128i normal_number = + _mm_or_si128(_mm_slli_epi32(fp16_exponent_lsb, 10), fp16_mantissa); + __m128i subnormal_number = + _mm_or_si128(_mm_set1_epi32(512), _mm_srli_epi32(fp16_mantissa, 1)); + __m128i saturated_number = _mm_set1_epi32(31743); + __m128i infinity_number = _mm_set1_epi32(31744); + __m128i NaN_number = _mm_set1_epi32(32256); + __m128i number = _mm_andnot_si128(underflow_mask, normal_number); + number = _mm_or_si128(_mm_andnot_si128(subnormal_mask, number), + _mm_and_si128(subnormal_mask, subnormal_number)); + number = _mm_or_si128(_mm_andnot_si128(saturated_mask, number), + 
_mm_and_si128(saturated_mask, saturated_number)); + number = _mm_or_si128(_mm_andnot_si128(infinity_mask, number), + _mm_and_si128(infinity_mask, infinity_number)); + number = _mm_or_si128(_mm_andnot_si128(NaN_mask, number), + _mm_and_si128(NaN_mask, NaN_number)); + // ..or in sign bit + number = _mm_or_si128(fp16_sign_bit_msb, _mm_slli_epi32(number, 16)); + // ..move 16 bit words to lower portion of sse vector; + __m128i shuf_from_hi32 = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -128, -128, + -128, -128, -128, -128, -128, -128); + number = _mm_shuffle_epi8(number, shuf_from_hi32); + return number; +#endif +} +template <> +__m128i VectorWriter::from_fp32(__m128 vec) { + // casting from float to bfloat16 simply means >> 16 + // we do this with a shuffle that also moves everything to lower portion of + // sse vector word + __m128i shuf_from_hi32 = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -128, -128, + -128, -128, -128, -128, -128, -128); + return _mm_shuffle_epi8(_mm_castps_si128(vec), shuf_from_hi32); +} +template <> +__m128i VectorWriter::from_fp32(__m128 vec) { + // nothing to do in this case + return _mm_castps_si128(vec); +} + +template <> +void VectorWriter::write_1ch(uint8* destination, __m128* vec) { + write_1b_1ch(destination, vec); +} +template <> +void VectorWriter::write_1ch(int8* destination, __m128* vec) { + write_1b_1ch(destination, vec); +} +template <> +void VectorWriter::write_1ch(uint16* destination, __m128* vec) { + write_2b_1ch(destination, vec); +} +template <> +void VectorWriter::write_1ch(int16* destination, __m128* vec) { + write_2b_1ch(destination, vec); +} +template <> +void VectorWriter::write_1ch(int32* destination, __m128* vec) { + write_4b_1ch(destination, vec); +} +template <> +void VectorWriter::write_1ch(Eigen::half* destination, + __m128* vec) { + write_2b_1ch(destination, vec); +} +template <> +void VectorWriter::write_1ch(bfloat16* destination, __m128* vec) { + write_2b_1ch(destination, vec); +} +template <> +void 
VectorWriter::write_1ch(float* destination, __m128* vec) { + _mm_storeu_si128((__m128i*)(destination), _mm_castps_si128(vec[0])); +} + +template <> +void VectorWriter::write_2ch(uint8* destination, __m128* vec) { + write_1b_2ch(destination, vec); +} +template <> +void VectorWriter::write_2ch(int8* destination, __m128* vec) { + write_1b_2ch(destination, vec); +} +template <> +void VectorWriter::write_2ch(uint16* destination, __m128* vec) { + write_2b_2ch(destination, vec); +} +template <> +void VectorWriter::write_2ch(int16* destination, __m128* vec) { + write_2b_2ch(destination, vec); +} +template <> +void VectorWriter::write_2ch(int32* destination, __m128* vec) { + write_4b_2ch(destination, vec); +} +template <> +void VectorWriter::write_2ch(Eigen::half* destination, + __m128* vec) { + write_2b_2ch(destination, vec); +} +template <> +void VectorWriter::write_2ch(bfloat16* destination, __m128* vec) { + write_2b_2ch(destination, vec); +} +template <> +void VectorWriter::write_2ch(float* destination, __m128* vec) { + _mm_storeu_si128((__m128i*)(destination), _mm_castps_si128(vec[0])); + _mm_storeu_si128((__m128i*)(destination + 4), _mm_castps_si128(vec[1])); +} + +template <> +void VectorWriter::write_3ch(uint8* destination, __m128* vec) { + write_1b_3ch(destination, vec); +} +template <> +void VectorWriter::write_3ch(int8* destination, __m128* vec) { + write_1b_3ch(destination, vec); +} +template <> +void VectorWriter::write_3ch(uint16* destination, __m128* vec) { + write_2b_3ch(destination, vec); +} +template <> +void VectorWriter::write_3ch(int16* destination, __m128* vec) { + write_2b_3ch(destination, vec); +} +template <> +void VectorWriter::write_3ch(int32* destination, __m128* vec) { + write_4b_3ch(destination, vec); +} +template <> +void VectorWriter::write_3ch(Eigen::half* destination, + __m128* vec) { + write_2b_3ch(destination, vec); +} +template <> +void VectorWriter::write_3ch(bfloat16* destination, __m128* vec) { + write_2b_3ch(destination, vec); +} 
+template <> +void VectorWriter::write_3ch(float* destination, __m128* vec) { + _mm_storeu_si128((__m128i*)(destination), _mm_castps_si128(vec[0])); + _mm_storeu_si128((__m128i*)(destination + 4), _mm_castps_si128(vec[1])); + _mm_storeu_si128((__m128i*)(destination + 8), _mm_castps_si128(vec[2])); +} + +template <> +void VectorWriter::write_4ch(uint8* destination, __m128* vec) { + write_1b_4ch(destination, vec); +} +template <> +void VectorWriter::write_4ch(int8* destination, __m128* vec) { + write_1b_4ch(destination, vec); +} +template <> +void VectorWriter::write_4ch(uint16* destination, __m128* vec) { + write_2b_4ch(destination, vec); +} +template <> +void VectorWriter::write_4ch(int16* destination, __m128* vec) { + write_2b_4ch(destination, vec); +} +template <> +void VectorWriter::write_4ch(int32* destination, __m128* vec) { + write_4b_4ch(destination, vec); +} +template <> +void VectorWriter::write_4ch(Eigen::half* destination, + __m128* vec) { + write_2b_4ch(destination, vec); +} +template <> +void VectorWriter::write_4ch(bfloat16* destination, __m128* vec) { + write_2b_4ch(destination, vec); +} +template <> +void VectorWriter::write_4ch(float* destination, __m128* vec) { + _mm_storeu_si128((__m128i*)(destination), _mm_castps_si128(vec[0])); + _mm_storeu_si128((__m128i*)(destination + 4), _mm_castps_si128(vec[1])); + _mm_storeu_si128((__m128i*)(destination + 8), _mm_castps_si128(vec[2])); + _mm_storeu_si128((__m128i*)(destination + 12), _mm_castps_si128(vec[3])); +} + +template +class CropResizeCastImage : public VectorLoader, public VectorWriter { + public: + CropResizeCastImage(const int in_height, const int in_width, + const int out_height, const int out_width, + const int channels, const int min_ix, const int max_ix, + const CachedInterpolation* xs, const int min_iy, + const int max_iy, const CachedInterpolation* ys, + const float extrapolated_value, const bool flip_x, + const bool flip_y, const bool verbose = false, + const int allowed_load_groups = 15) 
+ : verbose_(verbose), + allowed_load_groups_(allowed_load_groups), + in_height_(in_height), + in_width_(in_width), + out_height_(out_height), + out_width_(out_width), + channels_(channels), + min_ix_(min_ix), + max_ix_(max_ix), + min_iy_(min_iy), + max_iy_(max_iy), + ys_(ys), + extrapolated_value_(extrapolated_value), + flip_x_(flip_x), + flip_y_(flip_y), + in_row_size_(in_width * channels), + in_row_size_bytes_(in_width * channels * sizeof(T)), + out_row_size_(out_width * channels), + x0_(flip_x ? out_width - 1 - max_ix : min_ix), + x1_(flip_x ? out_width - 1 - min_ix : max_ix), + y0_(flip_y ? out_height - 1 - max_iy : min_iy), + y1_(flip_y ? out_height - 1 - min_iy : max_iy) { + // copy xs values, but filter out the following: + // xs[].lower == xs[].upper AND xs[].lerp == 0 + // xs[].lower == xs[].upper AND xs[].lerp == 1 + xs_ = new CachedInterpolation[max_ix_ - min_ix_ + 1]; + for (int i = min_ix_; i <= max_ix_; ++i) { + int ix = i - min_ix_; + int xs_lower = xs[ix].lower / channels_; + int xs_upper = xs[ix].upper / channels_; + if (xs_lower == xs_upper) { + if (xs[ix].lerp == 0.0f && xs_lower + 1 < in_width) { + // upper weight is zero + xs_upper = xs_lower + 1; + } else if (xs[ix].lerp == 1.0f && xs_upper - 1 >= 0) { + // lower weight is zero + xs_lower = xs_upper - 1; + } + } + xs_[ix].lower = xs_lower * channels_; + xs_[ix].upper = xs_upper * channels_; + xs_[ix].lerp = xs[ix].lerp; + } + _u_min_val = std::numeric_limits::min(); + _u_max_val = std::numeric_limits::max(); + _f_min_val = static_cast(_u_min_val); + _f_max_val = static_cast(_u_max_val); + Configure_(); + } + ~CropResizeCastImage() { + if (general_x_ != NULL) delete[] general_x_; + if (load1_x_ != NULL) delete[] load1_x_; + if (load2_x_ != NULL) delete[] load2_x_; + if (load4_x_ != NULL) delete[] load4_x_; + if (load8_x_ != NULL) delete[] load8_x_; + if (load1_offsets_ != NULL) delete[] load1_offsets_; + if (load2_offsets_ != NULL) delete[] load2_offsets_; + if (load4_offsets_ != NULL) 
delete[] load4_offsets_; + if (load8_offsets_ != NULL) delete[] load8_offsets_; + if (load1_shuffle_masks_ != NULL) delete[] load1_shuffle_masks_; + if (load2_shuffle_masks_ != NULL) delete[] load2_shuffle_masks_; + if (load1_mmxs_lerp_ != NULL) delete[] load1_mmxs_lerp_; + if (load2_mmxs_lerp_ != NULL) delete[] load2_mmxs_lerp_; + if (load4_mmxs_lerp_ != NULL) delete[] load4_mmxs_lerp_; + if (load8_mmxs_lerp_ != NULL) delete[] load8_mmxs_lerp_; + delete[] xs_; + } + + private: + // constructor arguments + const bool verbose_; + // this value is meant for unit testing. + // set this to 15 for normal execution. + // its an OR of flags for the different load group. + // 1 -> load4from1 + // 2 -> load4from2 + // 4 -> load4from4 + // 8 -> load4from8 + const int allowed_load_groups_; + const int in_height_, in_width_, out_height_, out_width_; + const int channels_; + const int min_ix_, max_ix_, min_iy_, max_iy_; + const CachedInterpolation* ys_; + CachedInterpolation* xs_; + const float extrapolated_value_; + const bool flip_x_, flip_y_; + // computed arguments + const int in_row_size_; + const int in_row_size_bytes_; + const int out_row_size_; + const int x0_, x1_; + const int y0_, y1_; + + // helper methods + void ResizeRow_load1_1ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load2_1ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load4_1ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load8_1ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load1_2ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load2_2ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* 
ysA_output_ptr); + void ResizeRow_load4_2ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load8_2ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load1_3ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load2_3ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load4_3ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load8_3ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load1_4ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load2_4ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load4_4ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load8_4ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_general_(const float ys_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + + // configuration parameters + int num_general_, num_load1_, num_load2_, num_load4_, num_load8_; + int *load1_offsets_, *load2_offsets_, *load4_offsets_, *load8_offsets_; + int *general_x_, *load1_x_, *load2_x_, *load4_x_, *load8_x_; + __m128i *load1_shuffle_masks_, *load2_shuffle_masks_; + __m128 *load1_mmxs_lerp_, *load2_mmxs_lerp_, *load4_mmxs_lerp_, + *load8_mmxs_lerp_; + float _f_min_val, _f_max_val; + U _u_min_val, _u_max_val; + // configuration methods + void Configure_(); + int DetermineLoadGroup_(const int x); + bool 
ComputeXIndexRange_(const int x, int* min_xidx, int* max_xidx); + bool Load1_ok_( + const int min_xidx, + const int max_xidx); // xs - pointer to first xs for this load group + bool Load2_ok_( + const int min_xidx, + const int max_xidx); // xs - pointer to first xs for this load group + bool Load4_ok_(const int min_xidx, const int max_xidx); + bool Load8_ok_(const int min_xidx, const int max_xidx); + + // debugging + int y_; + const T* input_image_; + U* output_image_; + + public: + // + // public client methods + // + + // convenience function that determines if clipping is necessary + // in order to prevent overflow when casting to the output type U. + static bool clip_necessary(); + + // resize image + void Resize(const T* input_image, U* output_image); +}; + +template +void CropResizeCastImage::Resize(const T* input_image, U* output_image) { + // store these for debugging + input_image_ = input_image; + output_image_ = output_image_; + // + U uEx = cast_to(extrapolated_value_, _f_min_val, _f_max_val, _u_min_val, + _u_max_val); + // extrapolate top + if (min_iy_ > 0) { + U* p = flip_y_ ? output_image + out_row_size_ * (out_height_ - min_iy_) + : output_image; + int nn = out_row_size_ * min_iy_; + for (int i = 0; i < nn; ++i) p[i] = uEx; + } + // extrapolate bottom + if (max_iy_ < out_height_ - 1) { + U* p = + flip_y_ ? output_image : output_image + out_row_size_ * (max_iy_ + 1); + int nn = out_row_size_ * (out_height_ - 1 - max_iy_); + for (int i = 0; i < nn; ++i) p[i] = uEx; + } + // extrapolate left + if (min_ix_ > 0) { + for (int iy = min_iy_; iy <= max_iy_; ++iy) { + int xx0 = flip_x_ ? (out_width_ - min_ix_) * channels_ : 0; + int nxx = min_ix_ * channels_; + U* p = output_image + xx0 + + out_row_size_ * (flip_y_ ? out_height_ - 1 - iy : iy); + for (int ix = 0; ix < nxx; ++ix) { + p[ix] = uEx; + } + } + } + // extrapolate right + if (max_ix_ < out_width_ - 1) { + for (int iy = min_iy_; iy <= max_iy_; ++iy) { + int xx0 = flip_x_ ? 
0 : (max_ix_ + 1) * channels_; + int nxx = (out_width_ - 1 - max_ix_) * channels_; + U* p = output_image + xx0 + + out_row_size_ * (flip_y_ ? out_height_ - 1 - iy : iy); + for (int ix = 0; ix < nxx; ++ix) { + p[ix] = uEx; + } + } + } + // interpolation region + int y = y0_; + for (y = y0_; y + 1 <= y1_; y += 2) { + y_ = y; + const int iyA = flip_y_ ? out_height_ - 1 - min_iy_ - y : y - min_iy_; + const float yA_lerp = ys_[iyA].lerp; + const __m128 ysA_lerp = _mm_set1_ps(yA_lerp); + const T* ysA_input_lower_ptr = + input_image + ys_[iyA].lower * in_width_ * channels_; + const T* ysA_input_upper_ptr = + input_image + ys_[iyA].upper * in_width_ * channels_; + U* ysA_output_ptr = output_image + y * out_width_ * channels_; + const int iyB = + flip_y_ ? out_height_ - 1 - min_iy_ - (y + 1) : (y + 1) - min_iy_; + const float yB_lerp = ys_[iyB].lerp; + const __m128 ysB_lerp = _mm_set1_ps(yB_lerp); + const T* ysB_input_lower_ptr = + input_image + ys_[iyB].lower * in_width_ * channels_; + const T* ysB_input_upper_ptr = + input_image + ys_[iyB].upper * in_width_ * channels_; + U* ysB_output_ptr = output_image + (y + 1) * out_width_ * channels_; + if (channels_ == 1) { + this->ResizeRow_load1_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load1_1ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load2_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_1ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load4_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_1ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load8_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_1ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + 
this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + } else if (channels_ == 2) { + this->ResizeRow_load1_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load1_2ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load2_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_2ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load4_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_2ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load8_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_2ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + } else if (channels_ == 3) { + this->ResizeRow_load1_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load1_3ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load2_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_3ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load4_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_3ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load8_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_3ch_(ysB_lerp, 
ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + } else if (channels_ == 4) { + this->ResizeRow_load1_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load1_4ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load2_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_4ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load4_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_4ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load8_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_4ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + } else { + assert(false); + } + // printf("*2 :: y=%d, channels_=%d, + // num_load8_=%d\n",y,channels_,num_load8_); + } + for (; y <= y1_; ++y) { + y_ = y; + const int iyA = flip_y_ ? 
out_height_ - 1 - min_iy_ - y : y - min_iy_; + const float yA_lerp = ys_[iyA].lerp; + const __m128 ysA_lerp = _mm_set1_ps(yA_lerp); + const T* ysA_input_lower_ptr = + input_image + ys_[iyA].lower * in_width_ * channels_; + const T* ysA_input_upper_ptr = + input_image + ys_[iyA].upper * in_width_ * channels_; + U* ysA_output_ptr = output_image + y * out_width_ * channels_; + if (channels_ == 1) { + this->ResizeRow_load1_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + } else if (channels_ == 2) { + this->ResizeRow_load1_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + } else if (channels_ == 3) { + this->ResizeRow_load1_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + } else if (channels_ == 4) { + 
this->ResizeRow_load1_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + } else { + assert(false); + } + // printf("*1 :: y=%d\n",y); + } +} + +template +void CropResizeCastImage::ResizeRow_general_(const float ys_lerp, + const T* ys_input_lower_ptr, + const T* ys_input_upper_ptr, + U* output_y_ptr) { + for (int current = 0; current < num_general_; ++current) { + int x = general_x_[current]; + const int ix = flip_x_ ? out_width_ - 1 - min_ix_ - x : x - min_ix_; + const int xs_lower = xs_[ix].lower; + const int xs_upper = xs_[ix].upper; + const float xs_lerp = xs_[ix].lerp; + for (int ichan = 0; ichan < channels_; ++ichan) { + const float top_left0(ys_input_lower_ptr[xs_lower + ichan]); + const float top_right0(ys_input_lower_ptr[xs_upper + ichan]); + const float bottom_left0(ys_input_upper_ptr[xs_lower + ichan]); + const float bottom_right0(ys_input_upper_ptr[xs_upper + ichan]); + float result0 = compute_lerp(top_left0, top_right0, bottom_left0, + bottom_right0, xs_lerp, ys_lerp); + output_y_ptr[x * channels_ + ichan] = + cast_to(result0, _f_min_val, _f_max_val, _u_min_val, _u_max_val); + } + } +} + +#define CHANNELS 1 +// Resize all points that fall in the 'load4from1' group for an entire row of a +// 1 channel image. 
+template +void CropResizeCastImage::ResizeRow_load1_1ch_( + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + for (int current = 0; current < num_load1_; ++current) { + __m128* mmxs_lerp = + (__m128*)(load1_shuffle_masks_ + current * CHANNELS * 3); + __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; +#ifdef __AVX2__ + __m256 left0, right0; + this->load1_1ch(ysA_input_lower_ptr, ysA_input_upper_ptr, + load1_offsets_[current], shuffle_masks, &left0, &right0); + + __m256 x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); + __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); + + __m128 top0 = _mm256_castps256_ps128(hori0); + __m128 bot0 = _mm256_extractf128_ps(hori0, 1); +#else + __m128 tl0, bl0, tr0, br0; + this->load1_1ch(ysA_input_lower_ptr, ysA_input_upper_ptr, + load1_offsets_[current], shuffle_masks, &tl0, &bl0, &tr0, + &br0); + + __m128 x_lerp = mmxs_lerp[0]; + __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); + __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); +#endif + __m128 res[1]; + res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); + this->write_1ch(ysA_output_ptr + load1_x_[current] * CHANNELS, res); + } +} +// Resize all points that fall in the 'load4from2' group for an entire row of a +// 1 channel image. 
+template +void CropResizeCastImage::ResizeRow_load2_1ch_( + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + for (int current = 0; current < num_load2_; ++current) { + __m128* mmxs_lerp = + (__m128*)(load2_shuffle_masks_ + current * CHANNELS * 2); + __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; +#ifdef __AVX2__ + __m256 left0, right0; + this->load2_1ch(ysA_input_lower_ptr, ysA_input_upper_ptr, + load2_offsets_[current], shuffle_masks, &left0, &right0); + + __m256 x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); + __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); + + __m128 top0 = _mm256_castps256_ps128(hori0); + __m128 bot0 = _mm256_extractf128_ps(hori0, 1); +#else + __m128 tl0, bl0, tr0, br0; + this->load2_1ch(ysA_input_lower_ptr, ysA_input_upper_ptr, + load2_offsets_[current], shuffle_masks, &tl0, &bl0, &tr0, + &br0); + + __m128 x_lerp = mmxs_lerp[0]; + __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); + __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); +#endif + __m128 res[1]; + res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); + this->write_1ch(ysA_output_ptr + load2_x_[current] * CHANNELS, res); + } +} +// Resize all points that fall in the 'load4from4' group for an entire row of a +// 1 channel image. 
+template +void CropResizeCastImage::ResizeRow_load4_1ch_( + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + for (int current = 0; current < num_load4_; ++current) { + __m128* mmxs_lerp = (__m128*)(load4_mmxs_lerp_ + current * CHANNELS); +#ifdef __AVX2__ + __m256 left0, right0; + this->load4_1ch( + ysA_input_lower_ptr, ysA_input_upper_ptr, load4_offsets_[current * 4], + load4_offsets_[current * 4 + 1], load4_offsets_[current * 4 + 2], + load4_offsets_[current * 4 + 3], &left0, &right0); + + __m256 x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); + __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); + + __m128 top0 = _mm256_castps256_ps128(hori0); + __m128 bot0 = _mm256_extractf128_ps(hori0, 1); +#else + __m128 tl0, bl0, tr0, br0; + this->load4_1ch( + ysA_input_lower_ptr, ysA_input_upper_ptr, load4_offsets_[current * 4], + load4_offsets_[current * 4 + 1], load4_offsets_[current * 4 + 2], + load4_offsets_[current * 4 + 3], &tl0, &bl0, &tr0, &br0); + + __m128 x_lerp = mmxs_lerp[0]; + __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); + __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); +#endif + __m128 res[1]; + res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); + this->write_1ch(ysA_output_ptr + load4_x_[current] * CHANNELS, res); + } +} +// Resize all points that fall in the 'load4from8' group for an entire row of a +// 1 channel image. 
+template +void CropResizeCastImage::ResizeRow_load8_1ch_( + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + for (int current = 0; current < num_load8_; ++current) { + __m128* mmxs_lerp = (__m128*)(load8_mmxs_lerp_ + current * CHANNELS); +#ifdef __AVX2__ + __m256 left0, right0; + this->load8_1ch( + ysA_input_lower_ptr, ysA_input_upper_ptr, load8_offsets_[current * 4], + load8_offsets_[current * 4 + 1], load8_offsets_[current * 4 + 2], + load8_offsets_[current * 4 + 3], &left0, &right0); + + __m256 x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); + __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); + + __m128 top0 = _mm256_castps256_ps128(hori0); + __m128 bot0 = _mm256_extractf128_ps(hori0, 1); +#else + __m128 tl0, bl0, tr0, br0; + this->load8_1ch( + ysA_input_lower_ptr, ysA_input_upper_ptr, load8_offsets_[current * 4], + load8_offsets_[current * 4 + 1], load8_offsets_[current * 4 + 2], + load8_offsets_[current * 4 + 3], &tl0, &bl0, &tr0, &br0); + + __m128 x_lerp = mmxs_lerp[0]; + __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); + __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); +#endif + __m128 res[1]; + res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); + this->write_1ch(ysA_output_ptr + load8_x_[current] * CHANNELS, res); + } +} +#undef CHANNELS + +#define CHANNELS 2 +// Resize all points that fall in the 'load4from1' group for an entire row of a +// 2 channel image. 
+template +void CropResizeCastImage::ResizeRow_load1_2ch_( + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + for (int current = 0; current < num_load1_; ++current) { + __m128* mmxs_lerp = + (__m128*)(load1_shuffle_masks_ + current * CHANNELS * 3); + __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; +#ifdef __AVX2__ + __m256 left0, left1, right0, right1; + this->load1_2ch(ysA_input_lower_ptr, ysA_input_upper_ptr, + load1_offsets_[current], shuffle_masks, &left0, &left1, + &right0, &right1); + + __m256 x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); + __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); + __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); + + __m128 top0 = _mm256_castps256_ps128(hori0); + __m128 bot0 = _mm256_extractf128_ps(hori0, 1); + __m128 top1 = _mm256_castps256_ps128(hori1); + __m128 bot1 = _mm256_extractf128_ps(hori1, 1); +#else + __m128 tl0, tl1, bl0, bl1, tr0, tr1, br0, br1; + this->load1_2ch(ysA_input_lower_ptr, ysA_input_upper_ptr, + load1_offsets_[current], shuffle_masks, &tl0, &tl1, &bl0, + &bl1, &tr0, &tr1, &br0, &br1); + + __m128 x_lerp = mmxs_lerp[0]; + __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); + __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); + x_lerp = mmxs_lerp[1]; + __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); + __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); +#endif + __m128 res[2]; + res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); + res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); + this->write_2ch(ysA_output_ptr + load1_x_[current] * CHANNELS, res); + } +} +// Resize all points that fall in the 'load4from2' group for an 
entire row of a +// 2 channel image. +template +void CropResizeCastImage::ResizeRow_load2_2ch_( + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + for (int current = 0; current < num_load2_; ++current) { + __m128* mmxs_lerp = + (__m128*)(load2_shuffle_masks_ + current * CHANNELS * 2); + __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; +#ifdef __AVX2__ + __m256 left0, left1, right0, right1; + this->load2_2ch(ysA_input_lower_ptr, ysA_input_upper_ptr, + load2_offsets_[current], shuffle_masks, &left0, &left1, + &right0, &right1); + + __m256 x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); + __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); + __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); + + __m128 top0 = _mm256_castps256_ps128(hori0); + __m128 bot0 = _mm256_extractf128_ps(hori0, 1); + __m128 top1 = _mm256_castps256_ps128(hori1); + __m128 bot1 = _mm256_extractf128_ps(hori1, 1); +#else + __m128 tl0, tl1, bl0, bl1, tr0, tr1, br0, br1; + this->load2_2ch(ysA_input_lower_ptr, ysA_input_upper_ptr, + load2_offsets_[current], shuffle_masks, &tl0, &tl1, &bl0, + &bl1, &tr0, &tr1, &br0, &br1); + + __m128 x_lerp = mmxs_lerp[0]; + __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); + __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); + x_lerp = mmxs_lerp[1]; + __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); + __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); +#endif + __m128 res[2]; + res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); + res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); + this->write_2ch(ysA_output_ptr + load2_x_[current] * CHANNELS, res); + } +} +// Resize all points that fall 
in the 'load4from4' group for an entire row of a +// 2 channel image. +template +void CropResizeCastImage::ResizeRow_load4_2ch_( + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + for (int current = 0; current < num_load4_; ++current) { + __m128* mmxs_lerp = (__m128*)(load4_mmxs_lerp_ + current * CHANNELS); +#ifdef __AVX2__ + __m256 left0, left1, right0, right1; + this->load4_2ch( + ysA_input_lower_ptr, ysA_input_upper_ptr, load4_offsets_[current * 4], + load4_offsets_[current * 4 + 1], load4_offsets_[current * 4 + 2], + load4_offsets_[current * 4 + 3], &left0, &left1, &right0, &right1); + + __m256 x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); + __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); + __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); + + __m128 top0 = _mm256_castps256_ps128(hori0); + __m128 bot0 = _mm256_extractf128_ps(hori0, 1); + __m128 top1 = _mm256_castps256_ps128(hori1); + __m128 bot1 = _mm256_extractf128_ps(hori1, 1); +#else + __m128 tl0, tl1, bl0, bl1, tr0, tr1, br0, br1; + this->load4_2ch( + ysA_input_lower_ptr, ysA_input_upper_ptr, load4_offsets_[current * 4], + load4_offsets_[current * 4 + 1], load4_offsets_[current * 4 + 2], + load4_offsets_[current * 4 + 3], &tl0, &tl1, &bl0, &bl1, &tr0, &tr1, + &br0, &br1); + + __m128 x_lerp = mmxs_lerp[0]; + __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); + __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); + x_lerp = mmxs_lerp[1]; + __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); + __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); +#endif + __m128 res[2]; + res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); + res[1] = _mm_add_ps(top1, 
_mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); + this->write_2ch(ysA_output_ptr + load4_x_[current] * CHANNELS, res); + } +} +// Resize all points that fall in the 'load4from8' group for an entire row of a +// 2 channel image. +template +void CropResizeCastImage::ResizeRow_load8_2ch_( + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + for (int current = 0; current < num_load8_; ++current) { + __m128* mmxs_lerp = (__m128*)(load8_mmxs_lerp_ + current * CHANNELS); +#ifdef __AVX2__ + __m256 left0, left1, right0, right1; + this->load8_2ch( + ysA_input_lower_ptr, ysA_input_upper_ptr, load8_offsets_[current * 4], + load8_offsets_[current * 4 + 1], load8_offsets_[current * 4 + 2], + load8_offsets_[current * 4 + 3], &left0, &left1, &right0, &right1); + + __m256 x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); + __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); + __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); + + __m128 top0 = _mm256_castps256_ps128(hori0); + __m128 bot0 = _mm256_extractf128_ps(hori0, 1); + __m128 top1 = _mm256_castps256_ps128(hori1); + __m128 bot1 = _mm256_extractf128_ps(hori1, 1); +#else + __m128 tl0, tl1, bl0, bl1, tr0, tr1, br0, br1; + this->load8_2ch( + ysA_input_lower_ptr, ysA_input_upper_ptr, load8_offsets_[current * 4], + load8_offsets_[current * 4 + 1], load8_offsets_[current * 4 + 2], + load8_offsets_[current * 4 + 3], &tl0, &tl1, &bl0, &bl1, &tr0, &tr1, + &br0, &br1); + + __m128 x_lerp = mmxs_lerp[0]; + __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); + __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); + x_lerp = mmxs_lerp[1]; + __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); + __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, 
_mm_sub_ps(br1, bl1))); +#endif + __m128 res[2]; + res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); + res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); + this->write_2ch(ysA_output_ptr + load8_x_[current] * CHANNELS, res); + } +} +#undef CHANNELS + +#define CHANNELS 3 +// Resize all points that fall in the 'load4from1' group for an entire row of a +// 3 channel image. +template +void CropResizeCastImage::ResizeRow_load1_3ch_( + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + for (int current = 0; current < num_load1_; ++current) { + __m128* mmxs_lerp = + (__m128*)(load1_shuffle_masks_ + current * CHANNELS * 3); + __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; +#ifdef __AVX2__ + __m256 left0, left1, left2, right0, right1, right2; + this->load1_3ch(ysA_input_lower_ptr, ysA_input_upper_ptr, + load1_offsets_[current], shuffle_masks, &left0, &left1, + &left2, &right0, &right1, &right2); + + __m256 x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); + __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); + __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[2]))); + __m256 hori2 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right2, left2), left2); + + __m128 top0 = _mm256_castps256_ps128(hori0); + __m128 bot0 = _mm256_extractf128_ps(hori0, 1); + __m128 top1 = _mm256_castps256_ps128(hori1); + __m128 bot1 = _mm256_extractf128_ps(hori1, 1); + __m128 top2 = _mm256_castps256_ps128(hori2); + __m128 bot2 = _mm256_extractf128_ps(hori2, 1); +#else + __m128 tl0, tl1, tl2, bl0, bl1, bl2, tr0, tr1, tr2, br0, br1, br2; + this->load1_3ch(ysA_input_lower_ptr, ysA_input_upper_ptr, + load1_offsets_[current], 
shuffle_masks, &tl0, &tl1, &tl2, + &bl0, &bl1, &bl2, &tr0, &tr1, &tr2, &br0, &br1, &br2); + + __m128 x_lerp = mmxs_lerp[0]; + __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); + __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); + x_lerp = mmxs_lerp[1]; + __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); + __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); + x_lerp = mmxs_lerp[2]; + __m128 top2 = _mm_add_ps(tl2, _mm_mul_ps(x_lerp, _mm_sub_ps(tr2, tl2))); + __m128 bot2 = _mm_add_ps(bl2, _mm_mul_ps(x_lerp, _mm_sub_ps(br2, bl2))); +#endif + __m128 res[3]; + res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); + res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); + res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); + this->write_3ch(ysA_output_ptr + load1_x_[current] * CHANNELS, res); + } +} +// Resize all points that fall in the 'load4from2' group for an entire row of a +// 3 channel image. 
+template +void CropResizeCastImage::ResizeRow_load2_3ch_( + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + for (int current = 0; current < num_load2_; ++current) { + __m128* mmxs_lerp = + (__m128*)(load2_shuffle_masks_ + current * CHANNELS * 2); + __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; +#ifdef __AVX2__ + __m256 left0, left1, left2, right0, right1, right2; + this->load2_3ch(ysA_input_lower_ptr, ysA_input_upper_ptr, + load2_offsets_[current], shuffle_masks, &left0, &left1, + &left2, &right0, &right1, &right2); + + __m256 x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); + __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); + __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[2]))); + __m256 hori2 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right2, left2), left2); + + __m128 top0 = _mm256_castps256_ps128(hori0); + __m128 bot0 = _mm256_extractf128_ps(hori0, 1); + __m128 top1 = _mm256_castps256_ps128(hori1); + __m128 bot1 = _mm256_extractf128_ps(hori1, 1); + __m128 top2 = _mm256_castps256_ps128(hori2); + __m128 bot2 = _mm256_extractf128_ps(hori2, 1); +#else + __m128 tl0, tl1, tl2, bl0, bl1, bl2, tr0, tr1, tr2, br0, br1, br2; + this->load2_3ch(ysA_input_lower_ptr, ysA_input_upper_ptr, + load2_offsets_[current], shuffle_masks, &tl0, &tl1, &tl2, + &bl0, &bl1, &bl2, &tr0, &tr1, &tr2, &br0, &br1, &br2); + + __m128 x_lerp = mmxs_lerp[0]; + __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); + __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); + x_lerp = mmxs_lerp[1]; + __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); + __m128 bot1 = _mm_add_ps(bl1, 
_mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); + x_lerp = mmxs_lerp[2]; + __m128 top2 = _mm_add_ps(tl2, _mm_mul_ps(x_lerp, _mm_sub_ps(tr2, tl2))); + __m128 bot2 = _mm_add_ps(bl2, _mm_mul_ps(x_lerp, _mm_sub_ps(br2, bl2))); +#endif + __m128 res[3]; + res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); + res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); + res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); + this->write_3ch(ysA_output_ptr + load2_x_[current] * CHANNELS, res); + } +} +// Resize all points that fall in the 'load4from4' group for an entire row of a +// 3 channel image. +template +void CropResizeCastImage::ResizeRow_load4_3ch_( + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + for (int current = 0; current < num_load4_; ++current) { + __m128* mmxs_lerp = (__m128*)(load4_mmxs_lerp_ + current * CHANNELS); +#ifdef __AVX2__ + __m256 left0, left1, left2, right0, right1, right2; + this->load4_3ch( + ysA_input_lower_ptr, ysA_input_upper_ptr, load4_offsets_[current * 4], + load4_offsets_[current * 4 + 1], load4_offsets_[current * 4 + 2], + load4_offsets_[current * 4 + 3], &left0, &left1, &left2, &right0, + &right1, &right2); + + __m256 x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); + __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); + __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[2]))); + __m256 hori2 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right2, left2), left2); + + __m128 top0 = _mm256_castps256_ps128(hori0); + __m128 bot0 = _mm256_extractf128_ps(hori0, 1); + __m128 top1 = _mm256_castps256_ps128(hori1); + __m128 bot1 = _mm256_extractf128_ps(hori1, 1); + __m128 top2 
= _mm256_castps256_ps128(hori2); + __m128 bot2 = _mm256_extractf128_ps(hori2, 1); +#else + __m128 tl0, tl1, tl2, bl0, bl1, bl2, tr0, tr1, tr2, br0, br1, br2; + this->load4_3ch( + ysA_input_lower_ptr, ysA_input_upper_ptr, load4_offsets_[current * 4], + load4_offsets_[current * 4 + 1], load4_offsets_[current * 4 + 2], + load4_offsets_[current * 4 + 3], &tl0, &tl1, &tl2, &bl0, &bl1, &bl2, + &tr0, &tr1, &tr2, &br0, &br1, &br2); + + __m128 x_lerp = mmxs_lerp[0]; + __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); + __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); + x_lerp = mmxs_lerp[1]; + __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); + __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); + x_lerp = mmxs_lerp[2]; + __m128 top2 = _mm_add_ps(tl2, _mm_mul_ps(x_lerp, _mm_sub_ps(tr2, tl2))); + __m128 bot2 = _mm_add_ps(bl2, _mm_mul_ps(x_lerp, _mm_sub_ps(br2, bl2))); +#endif + __m128 res[3]; + res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); + res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); + res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); + this->write_3ch(ysA_output_ptr + load4_x_[current] * CHANNELS, res); + } +} +// Resize all points that fall in the 'load4from8' group for an entire row of a +// 3 channel image. 
+template +void CropResizeCastImage::ResizeRow_load8_3ch_( + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + for (int current = 0; current < num_load8_; ++current) { + __m128* mmxs_lerp = (__m128*)(load8_mmxs_lerp_ + current * CHANNELS); +#ifdef __AVX2__ + __m256 left0, left1, left2, right0, right1, right2; + this->load8_3ch( + ysA_input_lower_ptr, ysA_input_upper_ptr, load8_offsets_[current * 4], + load8_offsets_[current * 4 + 1], load8_offsets_[current * 4 + 2], + load8_offsets_[current * 4 + 3], &left0, &left1, &left2, &right0, + &right1, &right2); + + __m256 x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); + __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); + __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[2]))); + __m256 hori2 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right2, left2), left2); + + __m128 top0 = _mm256_castps256_ps128(hori0); + __m128 bot0 = _mm256_extractf128_ps(hori0, 1); + __m128 top1 = _mm256_castps256_ps128(hori1); + __m128 bot1 = _mm256_extractf128_ps(hori1, 1); + __m128 top2 = _mm256_castps256_ps128(hori2); + __m128 bot2 = _mm256_extractf128_ps(hori2, 1); +#else + __m128 tl0, tl1, tl2, bl0, bl1, bl2, tr0, tr1, tr2, br0, br1, br2; + this->load8_3ch( + ysA_input_lower_ptr, ysA_input_upper_ptr, load8_offsets_[current * 4], + load8_offsets_[current * 4 + 1], load8_offsets_[current * 4 + 2], + load8_offsets_[current * 4 + 3], &tl0, &tl1, &tl2, &bl0, &bl1, &bl2, + &tr0, &tr1, &tr2, &br0, &br1, &br2); + + __m128 x_lerp = mmxs_lerp[0]; + __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); + __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); + x_lerp = mmxs_lerp[1]; + 
__m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); + __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); + x_lerp = mmxs_lerp[2]; + __m128 top2 = _mm_add_ps(tl2, _mm_mul_ps(x_lerp, _mm_sub_ps(tr2, tl2))); + __m128 bot2 = _mm_add_ps(bl2, _mm_mul_ps(x_lerp, _mm_sub_ps(br2, bl2))); +#endif + __m128 res[3]; + res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); + res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); + res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); + this->write_3ch(ysA_output_ptr + load8_x_[current] * CHANNELS, res); + } +} +#undef CHANNELS + +#define CHANNELS 4 +// Resize all points that fall in the 'load4from1' group for an entire row of a +// 4 channel image. +template +void CropResizeCastImage::ResizeRow_load1_4ch_( + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + for (int current = 0; current < num_load1_; ++current) { + __m128* mmxs_lerp = + (__m128*)(load1_shuffle_masks_ + current * CHANNELS * 3); + __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; +#ifdef __AVX2__ + __m256 left0, left1, left2, left3, right0, right1, right2, right3; + this->load1_4ch(ysA_input_lower_ptr, ysA_input_upper_ptr, + load1_offsets_[current], shuffle_masks, &left0, &left1, + &left2, &left3, &right0, &right1, &right2, &right3); + + __m256 x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); + __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); + __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[2]))); + __m256 hori2 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right2, left2), left2); + x_lerp = _mm256_castsi256_ps( + 
_mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[3]))); + __m256 hori3 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right3, left3), left3); + + __m128 top0 = _mm256_castps256_ps128(hori0); + __m128 bot0 = _mm256_extractf128_ps(hori0, 1); + __m128 top1 = _mm256_castps256_ps128(hori1); + __m128 bot1 = _mm256_extractf128_ps(hori1, 1); + __m128 top2 = _mm256_castps256_ps128(hori2); + __m128 bot2 = _mm256_extractf128_ps(hori2, 1); + __m128 top3 = _mm256_castps256_ps128(hori3); + __m128 bot3 = _mm256_extractf128_ps(hori3, 1); +#else + __m128 tl0, tl1, tl2, tl3, bl0, bl1, bl2, bl3, tr0, tr1, tr2, tr3, br0, br1, + br2, br3; + this->load1_4ch(ysA_input_lower_ptr, ysA_input_upper_ptr, + load1_offsets_[current], shuffle_masks, &tl0, &tl1, &tl2, + &tl3, &bl0, &bl1, &bl2, &bl3, &tr0, &tr1, &tr2, &tr3, &br0, + &br1, &br2, &br3); + + __m128 x_lerp = mmxs_lerp[0]; + __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); + __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); + x_lerp = mmxs_lerp[1]; + __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); + __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); + x_lerp = mmxs_lerp[2]; + __m128 top2 = _mm_add_ps(tl2, _mm_mul_ps(x_lerp, _mm_sub_ps(tr2, tl2))); + __m128 bot2 = _mm_add_ps(bl2, _mm_mul_ps(x_lerp, _mm_sub_ps(br2, bl2))); + x_lerp = mmxs_lerp[3]; + __m128 top3 = _mm_add_ps(tl3, _mm_mul_ps(x_lerp, _mm_sub_ps(tr3, tl3))); + __m128 bot3 = _mm_add_ps(bl3, _mm_mul_ps(x_lerp, _mm_sub_ps(br3, bl3))); +#endif + __m128 res[4]; + res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); + res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); + res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); + res[3] = _mm_add_ps(top3, _mm_mul_ps(y_lerp, _mm_sub_ps(bot3, top3))); + this->write_4ch(ysA_output_ptr + load1_x_[current] * CHANNELS, res); + } +} +// Resize all points that fall in the 'load4from2' group for an 
entire row of a +// 4 channel image. +template +void CropResizeCastImage::ResizeRow_load2_4ch_( + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + for (int current = 0; current < num_load2_; ++current) { + __m128* mmxs_lerp = + (__m128*)(load2_shuffle_masks_ + current * CHANNELS * 2); + __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; +#ifdef __AVX2__ + __m256 left0, left1, left2, left3, right0, right1, right2, right3; + this->load2_4ch(ysA_input_lower_ptr, ysA_input_upper_ptr, + load2_offsets_[current], shuffle_masks, &left0, &left1, + &left2, &left3, &right0, &right1, &right2, &right3); + + __m256 x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); + __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); + __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[2]))); + __m256 hori2 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right2, left2), left2); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[3]))); + __m256 hori3 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right3, left3), left3); + + __m128 top0 = _mm256_castps256_ps128(hori0); + __m128 bot0 = _mm256_extractf128_ps(hori0, 1); + __m128 top1 = _mm256_castps256_ps128(hori1); + __m128 bot1 = _mm256_extractf128_ps(hori1, 1); + __m128 top2 = _mm256_castps256_ps128(hori2); + __m128 bot2 = _mm256_extractf128_ps(hori2, 1); + __m128 top3 = _mm256_castps256_ps128(hori3); + __m128 bot3 = _mm256_extractf128_ps(hori3, 1); +#else + __m128 tl0, tl1, tl2, tl3, bl0, bl1, bl2, bl3, tr0, tr1, tr2, tr3, br0, br1, + br2, br3; + this->load2_4ch(ysA_input_lower_ptr, ysA_input_upper_ptr, + load2_offsets_[current], shuffle_masks, &tl0, &tl1, &tl2, + &tl3, &bl0, &bl1, &bl2, 
&bl3, &tr0, &tr1, &tr2, &tr3, &br0, + &br1, &br2, &br3); + + __m128 x_lerp = mmxs_lerp[0]; + __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); + __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); + x_lerp = mmxs_lerp[1]; + __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); + __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); + x_lerp = mmxs_lerp[2]; + __m128 top2 = _mm_add_ps(tl2, _mm_mul_ps(x_lerp, _mm_sub_ps(tr2, tl2))); + __m128 bot2 = _mm_add_ps(bl2, _mm_mul_ps(x_lerp, _mm_sub_ps(br2, bl2))); + x_lerp = mmxs_lerp[3]; + __m128 top3 = _mm_add_ps(tl3, _mm_mul_ps(x_lerp, _mm_sub_ps(tr3, tl3))); + __m128 bot3 = _mm_add_ps(bl3, _mm_mul_ps(x_lerp, _mm_sub_ps(br3, bl3))); +#endif + __m128 res[4]; + res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); + res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); + res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); + res[3] = _mm_add_ps(top3, _mm_mul_ps(y_lerp, _mm_sub_ps(bot3, top3))); + this->write_4ch(ysA_output_ptr + load2_x_[current] * CHANNELS, res); + } +} +// Resize all points that fall in the 'load4from4' group for an entire row of a +// 4 channel image. 
+template +void CropResizeCastImage::ResizeRow_load4_4ch_( + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + for (int current = 0; current < num_load4_; ++current) { + __m128* mmxs_lerp = (__m128*)(load4_mmxs_lerp_ + current * CHANNELS); +#ifdef __AVX2__ + __m256 left0, left1, left2, left3, right0, right1, right2, right3; + this->load4_4ch( + ysA_input_lower_ptr, ysA_input_upper_ptr, load4_offsets_[current * 4], + load4_offsets_[current * 4 + 1], load4_offsets_[current * 4 + 2], + load4_offsets_[current * 4 + 3], &left0, &left1, &left2, &left3, + &right0, &right1, &right2, &right3); + + __m256 x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); + __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); + __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[2]))); + __m256 hori2 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right2, left2), left2); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[3]))); + __m256 hori3 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right3, left3), left3); + + __m128 top0 = _mm256_castps256_ps128(hori0); + __m128 bot0 = _mm256_extractf128_ps(hori0, 1); + __m128 top1 = _mm256_castps256_ps128(hori1); + __m128 bot1 = _mm256_extractf128_ps(hori1, 1); + __m128 top2 = _mm256_castps256_ps128(hori2); + __m128 bot2 = _mm256_extractf128_ps(hori2, 1); + __m128 top3 = _mm256_castps256_ps128(hori3); + __m128 bot3 = _mm256_extractf128_ps(hori3, 1); +#else + __m128 tl0, tl1, tl2, tl3, bl0, bl1, bl2, bl3, tr0, tr1, tr2, tr3, br0, br1, + br2, br3; + this->load4_4ch( + ysA_input_lower_ptr, ysA_input_upper_ptr, load4_offsets_[current * 4], + load4_offsets_[current * 4 + 1], load4_offsets_[current * 4 + 
2], + load4_offsets_[current * 4 + 3], &tl0, &tl1, &tl2, &tl3, &bl0, &bl1, + &bl2, &bl3, &tr0, &tr1, &tr2, &tr3, &br0, &br1, &br2, &br3); + + __m128 x_lerp = mmxs_lerp[0]; + __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); + __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); + x_lerp = mmxs_lerp[1]; + __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); + __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); + x_lerp = mmxs_lerp[2]; + __m128 top2 = _mm_add_ps(tl2, _mm_mul_ps(x_lerp, _mm_sub_ps(tr2, tl2))); + __m128 bot2 = _mm_add_ps(bl2, _mm_mul_ps(x_lerp, _mm_sub_ps(br2, bl2))); + x_lerp = mmxs_lerp[3]; + __m128 top3 = _mm_add_ps(tl3, _mm_mul_ps(x_lerp, _mm_sub_ps(tr3, tl3))); + __m128 bot3 = _mm_add_ps(bl3, _mm_mul_ps(x_lerp, _mm_sub_ps(br3, bl3))); +#endif + __m128 res[4]; + res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); + res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); + res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); + res[3] = _mm_add_ps(top3, _mm_mul_ps(y_lerp, _mm_sub_ps(bot3, top3))); + this->write_4ch(ysA_output_ptr + load4_x_[current] * CHANNELS, res); + } +} +// Resize all points that fall in the 'load4from8' group for an entire row of a +// 4 channel image. 
+template +void CropResizeCastImage::ResizeRow_load8_4ch_( + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + for (int current = 0; current < num_load8_; ++current) { + __m128* mmxs_lerp = (__m128*)(load8_mmxs_lerp_ + current * CHANNELS); +#ifdef __AVX2__ + __m256 left0, left1, left2, left3, right0, right1, right2, right3; + this->load8_4ch( + ysA_input_lower_ptr, ysA_input_upper_ptr, load8_offsets_[current * 4], + load8_offsets_[current * 4 + 1], load8_offsets_[current * 4 + 2], + load8_offsets_[current * 4 + 3], &left0, &left1, &left2, &left3, + &right0, &right1, &right2, &right3); + + __m256 x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); + __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); + __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[2]))); + __m256 hori2 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right2, left2), left2); + x_lerp = _mm256_castsi256_ps( + _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[3]))); + __m256 hori3 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right3, left3), left3); + + __m128 top0 = _mm256_castps256_ps128(hori0); + __m128 bot0 = _mm256_extractf128_ps(hori0, 1); + __m128 top1 = _mm256_castps256_ps128(hori1); + __m128 bot1 = _mm256_extractf128_ps(hori1, 1); + __m128 top2 = _mm256_castps256_ps128(hori2); + __m128 bot2 = _mm256_extractf128_ps(hori2, 1); + __m128 top3 = _mm256_castps256_ps128(hori3); + __m128 bot3 = _mm256_extractf128_ps(hori3, 1); +#else + __m128 tl0, tl1, tl2, tl3, bl0, bl1, bl2, bl3, tr0, tr1, tr2, tr3, br0, br1, + br2, br3; + this->load8_4ch( + ysA_input_lower_ptr, ysA_input_upper_ptr, load8_offsets_[current * 4], + load8_offsets_[current * 4 + 1], load8_offsets_[current * 4 + 
2], + load8_offsets_[current * 4 + 3], &tl0, &tl1, &tl2, &tl3, &bl0, &bl1, + &bl2, &bl3, &tr0, &tr1, &tr2, &tr3, &br0, &br1, &br2, &br3); + + __m128 x_lerp = mmxs_lerp[0]; + __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); + __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); + x_lerp = mmxs_lerp[1]; + __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); + __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); + x_lerp = mmxs_lerp[2]; + __m128 top2 = _mm_add_ps(tl2, _mm_mul_ps(x_lerp, _mm_sub_ps(tr2, tl2))); + __m128 bot2 = _mm_add_ps(bl2, _mm_mul_ps(x_lerp, _mm_sub_ps(br2, bl2))); + x_lerp = mmxs_lerp[3]; + __m128 top3 = _mm_add_ps(tl3, _mm_mul_ps(x_lerp, _mm_sub_ps(tr3, tl3))); + __m128 bot3 = _mm_add_ps(bl3, _mm_mul_ps(x_lerp, _mm_sub_ps(br3, bl3))); +#endif + __m128 res[4]; + res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); + res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); + res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); + res[3] = _mm_add_ps(top3, _mm_mul_ps(y_lerp, _mm_sub_ps(bot3, top3))); + this->write_4ch(ysA_output_ptr + load8_x_[current] * CHANNELS, res); + } +} +#undef CHANNELS + +template +void CropResizeCastImage::Configure_() { + // num_cases[0] = general case + // num_cases[1] = load4from1 + // num_cases[2] = load4from2 + // num_cases[3] = load4from4 + // num_cases[4] = load4from8 + int num_cases[5]; + for (int i = 0; i < 5; ++i) num_cases[i] = 0; + for (int x = x0_; x <= x1_; ++x) { + int load_group = this->DetermineLoadGroup_(x); + assert(load_group >= 0 && load_group <= 4); + ++num_cases[load_group]; + // load_group == 0 -> general case, pixel by pixel + // every other value indidcates 1+3 = 4 pixels were processed this iteration + if (load_group > 0) x += 3; + } + num_general_ = num_cases[0]; + num_load1_ = num_cases[1]; + num_load2_ = num_cases[2]; + num_load4_ = num_cases[3]; + num_load8_ = 
num_cases[4]; + if (num_general_ > 0) { + general_x_ = new int[num_general_]; + } else { + general_x_ = NULL; + } + if (num_load1_ > 0) { + load1_offsets_ = new int[num_load1_]; + load1_shuffle_masks_ = new __m128i[num_load1_ * channels_ * 3]; + load1_mmxs_lerp_ = NULL; // new __m128[num_load1_*channels_]; + load1_x_ = new int[num_load1_]; + } else { + load1_offsets_ = NULL; + load1_shuffle_masks_ = NULL; + load1_mmxs_lerp_ = NULL; + load1_x_ = NULL; + } + if (num_load2_ > 0) { + load2_offsets_ = new int[num_load2_]; + load2_shuffle_masks_ = new __m128i[num_load2_ * channels_ * 2]; + load2_mmxs_lerp_ = NULL; // new __m128[num_load2_*channels_]; + load2_x_ = new int[num_load2_]; + } else { + load2_offsets_ = NULL; + load2_shuffle_masks_ = NULL; + load2_mmxs_lerp_ = NULL; + load2_x_ = NULL; + } + if (num_load4_ > 0) { + load4_offsets_ = new int[num_load4_ * 4]; + load4_mmxs_lerp_ = new __m128[num_load4_ * channels_]; + load4_x_ = new int[num_load4_]; + } else { + load4_offsets_ = NULL; + load4_mmxs_lerp_ = NULL; + load4_x_ = NULL; + } + if (num_load8_ > 0) { + load8_offsets_ = new int[num_load8_ * 4]; + load8_mmxs_lerp_ = new __m128[num_load8_ * channels_]; + load8_x_ = new int[num_load8_]; + } else { + load8_offsets_ = NULL; + load8_mmxs_lerp_ = NULL; + load8_x_ = NULL; + } + for (int i = 0; i < 5; ++i) num_cases[i] = 0; + if (verbose_) { + printf(" load4from1 = %d\n", num_load1_); + printf(" load4from2 = %d\n", num_load2_); + printf(" load4from4 = %d\n", num_load4_); + printf(" load4from8 = %d\n", num_load8_); + printf(" general = %d\n", num_general_); + } + for (int x = x0_; x <= x1_; ++x) { + int load_group = DetermineLoadGroup_(x); + assert(load_group >= 0 && load_group <= 4); + int current = num_cases[load_group]; + assert(current >= 0); + // printf(" ... 
load_group=%d, current=%d\n",load_group,current); + if (load_group == 0) { + // general case + assert(current < num_general_); + general_x_[current] = x; + } else if (load_group == 1) { + // load4from1 + assert(current < num_load1_); + load1_x_[current] = x; + int min_xidx, max_xidx; + ComputeXIndexRange_(x, &min_xidx, &max_xidx); + // printf(" ... x=%d, min_xidx=%d, max_xidx=%d\n",x,min_xidx,max_xidx); + load1_offsets_[current] = min_xidx * channels_; + float* xs_lerp = (float*)(load1_shuffle_masks_ + current * channels_ * 3); + char* shufmasks1 = + (char*)(load1_shuffle_masks_ + current * channels_ * 3 + channels_); + char* shufmasks2 = shufmasks1 + 16 * channels_; + for (int j = 0; j < 32 * channels_; ++j) shufmasks1[j] = -128; + for (int pix = 0; pix < 4; ++pix) { + const int ix = flip_x_ ? out_width_ - 1 - min_ix_ - (x + pix) + : (x + pix) - min_ix_; + float lerp = xs_[ix].lerp; + int widx0 = xs_[ix].lower - + load1_offsets_[current]; // word index within SSE vector + // printf(" ..... pix_ix=%d, lerp=%f, widx0=%d\n",ix,lerp,widx0); + for (int ch = 0; ch < channels_; ++ch) { + int idx = pix * channels_ + ch; + xs_lerp[idx] = lerp; + int shufvec = idx / 4; + int shufidx = idx % 4; + int widx = widx0 + ch; + // printf(" ....... 
ch=%d, idx=%d, shufvec=%d, shufidx=%d, widx=%d, + // shufmasks1[%ld...]=...\n",ch,idx,shufvec,shufidx,widx,shufvec*16+shufidx*sizeof(T)); + for (int b = 0; b < sizeof(T); ++b) { + shufmasks1[shufvec * 16 + shufidx * sizeof(T) + b] = + widx * sizeof(T) + b; + shufmasks2[shufvec * 16 + shufidx * sizeof(T) + b] = + (widx + channels_) * sizeof(T) + b; + } + } + } + } else if (load_group == 2) { + // load4from2 + assert(current < num_load2_); + load2_x_[current] = x; + int min_xidx, max_xidx; + ComputeXIndexRange_(x, &min_xidx, &max_xidx); + load2_offsets_[current] = min_xidx * channels_; + float* xs_lerp = (float*)(load2_shuffle_masks_ + current * channels_ * 2); + char* shufmasks1 = + (char*)(load2_shuffle_masks_ + current * channels_ * 2 + channels_); + for (int j = 0; j < 16 * channels_; ++j) shufmasks1[j] = -128; + for (int pix = 0; pix < 4; ++pix) { + const int ix = flip_x_ ? out_width_ - 1 - min_ix_ - (x + pix) + : (x + pix) - min_ix_; + float lerp = xs_[ix].lerp; + int widx0 = xs_[ix].lower - + load2_offsets_[current]; // word index within SSE vector + for (int ch = 0; ch < channels_; ++ch) { + int idx = pix * channels_ + ch; + xs_lerp[idx] = lerp; + int shufvec = idx / 4; + int shufidx = idx % 4; + int widx = widx0 + ch; + for (int b = 0; b < sizeof(T); ++b) { + shufmasks1[shufvec * 16 + shufidx * sizeof(T) + b] = + widx * sizeof(T) + b; + } + } + } + } else if (load_group == 3) { + // load4from4 + assert(current < num_load4_); + load4_x_[current] = x; + int* index = load4_offsets_ + current * 4; + float* xs_lerp = (float*)(load4_mmxs_lerp_ + current * channels_); + for (int pix = 0; pix < 4; ++pix) { + const int ix = flip_x_ ? 
out_width_ - 1 - min_ix_ - (x + pix) + : (x + pix) - min_ix_; + float lerp = xs_[ix].lerp; + index[pix] = xs_[ix].lower; + for (int ch = 0; ch < channels_; ++ch) { + int idx = pix * channels_ + ch; + xs_lerp[idx] = lerp; + } + } + /* debug + printf("load4from4_%dch :: x=%d - + index={%ld",channels_,x,index[0]*sizeof(T)); + for (int i = 1; i < 4; ++i) printf(",%ld",index[i]*sizeof(T)); + printf("}\n"); + */ + } else if (load_group == 4) { + // load4from8 + assert(current < num_load8_); + load8_x_[current] = x; + int* index = load8_offsets_ + current * 4; + float* xs_lerp = (float*)(load8_mmxs_lerp_ + current * channels_); + for (int pix = 0; pix < 4; ++pix) { + const int ix = flip_x_ ? out_width_ - 1 - min_ix_ - (x + pix) + : (x + pix) - min_ix_; + float lerp = xs_[ix].lerp; + index[pix] = xs_[ix].lower; + for (int ch = 0; ch < channels_; ++ch) { + int idx = pix * channels_ + ch; + xs_lerp[idx] = lerp; + } + } + /* debug + printf("x=%d :: load8_x_[%d] = %d",x,current,load8_x_[current]); + printf(", load8_offsets_[%d] = {%d",current*4,load8_offsets_[current*4]); + for (int pix = 1; pix < 4; ++pix) + printf(",%d",load8_offsets_[current*4+pix]); + printf("}"); + for (int ch = 0; ch < channels_; ++ch) { + float* p = (float*)(load8_mmxs_lerp_ + current * channels_ + ch); + printf(", lerp[%d] = {%.3f",current*channels_+ch,p[0]); + for (int j = 1; j < 4; ++j) printf(",%.3f",p[j]); + printf("}"); + } + printf("\n"); + */ + } else { + assert(false); + } + ++num_cases[load_group]; + // load_group == 0 -> general case, pixel by pixel + // every other value indidcates 1+3 = 4 pixels were processed this iteration + if (load_group > 0) x += 3; + } +} + +template +int CropResizeCastImage::DetermineLoadGroup_(const int x) { + int num_remaining = x1_ - x + 1; + if (num_remaining >= 4) { + // at least 4 values left, so theoretically possible to do SSE + int min_xidx, max_xidx; + // Using this-> is necessary in order to avoid compile error: + // "there are no arguments to ‘xxx’ that 
depend on a template parameter, so + // a declaration of ‘xxx’ must be available" + // This is an issue for all member functions that have only builtin type + // arguments and happens because + // argument dependent lookup is not done for these arguments (so I've been + // told). + if (this->ComputeXIndexRange_(x, &min_xidx, &max_xidx)) { + if ((allowed_load_groups_ & 1) && this->Load1_ok_(min_xidx, max_xidx)) { + return 1; + } else if ((allowed_load_groups_ & 2) && + this->Load2_ok_(min_xidx, max_xidx)) { + return 2; + } else if ((allowed_load_groups_ & 4) && + this->Load4_ok_(min_xidx, max_xidx)) { + return 3; + } else if ((allowed_load_groups_ & 8) && + this->Load8_ok_(min_xidx, max_xidx)) { + return 4; + } else { + return 0; + } + } else { + // assumption xs[i].lower + channels == xs[i].upper NOT true for this + // quintuple. + return 0; + } + } else { + // too few remaining values + return 0; + } +} + +// Compute range of x indexes for xs[0] through xs[3]. +// Returns true if valid (xs[i].lower + channels == xs[i].upper for all pixels). +template +bool CropResizeCastImage::ComputeXIndexRange_(const int x, int* min_xidx, + int* max_xidx) { + bool upper_is_lower_plus_one = true; + *min_xidx = 0; + *max_xidx = -1; + for (int pix = 0; pix < 4; ++pix) { + const int ix = + flip_x_ ? out_width_ - 1 - min_ix_ - (x + pix) : (x + pix) - min_ix_; + int curr_xidx = xs_[ix].lower; + if (curr_xidx + channels_ == xs_[ix].upper) { + if (pix == 0) { + *min_xidx = curr_xidx; + *max_xidx = curr_xidx; + } else { + if (curr_xidx < *min_xidx) *min_xidx = curr_xidx; + if (curr_xidx > *max_xidx) *max_xidx = curr_xidx; + } + } else { + upper_is_lower_plus_one = false; + } + } + *min_xidx /= channels_; + *max_xidx /= channels_; + return upper_is_lower_plus_one; +} + +// This method returns true if it is possible to do load4from1 +// for the load group pointed to by xs. 
+template +bool CropResizeCastImage::Load1_ok_(const int min_xidx, + const int max_xidx) { + // num_pixels_to_load_left_input = max_xs_low - min_xs_low + 1 + // num_pixels_to_load_left_and_right_input = num_pixels_to_load_left_input + 1 + int total_load_bytes = (max_xidx - min_xidx + 2) * channels_ * sizeof(T); + if (total_load_bytes <= 16) { + // a single (mis-aligned) SSE word gives us all the inputs + // ensure that SSE word can be loaded without causing SEGV + int load_offset = min_xidx * channels_; + int load_offset_bytes = load_offset * sizeof(T); + if (in_row_size_bytes_ - load_offset_bytes >= 16) { + return true; + } else { + return false; + } + } else { + return false; + } +} + +// This method returns true if it is possible to do load4from2 +// for the load group pointed to by xs. +template +bool CropResizeCastImage::Load2_ok_(const int min_xidx, + const int max_xidx) { + // num_pixels_to_load_left_input = max_xs_low - min_xs_low + 1 + int total_load_bytes = (max_xidx - min_xidx + 1) * channels_ * sizeof(T); + if (total_load_bytes <= 16) { + // a single (mis-aligned) SSE word gives us all the inputs + // ensure that SSE word can be loaded without causing SEGV + int load_offset = (min_xidx + 1) * channels_; + int load_offset_bytes = load_offset * sizeof(T); + if (in_row_size_bytes_ - load_offset_bytes >= 16) { + return true; + } else { + return false; + } + } else { + return false; + } +} + +// This method returns true if it is possible to do load4from4 +// for the load group pointed to by xs. 
+template +bool CropResizeCastImage::Load4_ok_(const int min_xidx, + const int max_xidx) { + int total_load_bytes = 2 * channels_ * sizeof(T); + if (total_load_bytes <= 16) { + // ensure that SSE word can be loaded without causing SEGV + int load_offset = max_xidx * channels_; + int load_offset_bytes = load_offset * sizeof(T); + if (in_row_size_bytes_ - load_offset_bytes >= 16) { + return true; + } else { + return false; + } + } else { + return false; + } +} + +// This method returns true if it is possible to do load4from8 +// for the load group pointed to by xs. +template +bool CropResizeCastImage::Load8_ok_(const int min_xidx, + const int max_xidx) { + int total_load_bytes = channels_ * sizeof(T); + if (total_load_bytes <= 16) { + // ensure that SSE word can be loaded without causing SEGV + int load_offset = (max_xidx + 1) * channels_; + int load_offset_bytes = load_offset * sizeof(T); + if (in_row_size_bytes_ - load_offset_bytes >= 16) { + return true; + } else { + return false; + } + } else { + return false; + } +} + +// +// full implementations of templated static member function clip_necessary() +// + +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} + +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} +template <> 
+bool CropResizeCastImage::clip_necessary() { + return false; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} + +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} + +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} + +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} + +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return 
true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} + +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return true; +} +template <> +bool CropResizeCastImage::clip_necessary() { + return false; +} + +#endif // __SSE4_1__ + +template +void crop_resize_single_image_common( + const T* image, const int64 in_height, const int64 in_width, + const int64 out_height, const int64 out_width, const int channels, + const int min_ix, const int max_ix, const CachedInterpolation* xs, + const int min_iy, const int max_iy, const CachedInterpolation* ys, + const float extrapolated_value, const bool flip_x, const bool flip_y, + U* output) TF_ATTRIBUTE_NOINLINE; + +#ifdef __SSE4_1__ + +// full specializations of crop_resize_single_image_common for data types that +// have vectorized implementations. +// at the moment, this is uint8, int8, uint16, int16, int32, Eigen::half, +// bfloat16 and float. 
+ +#define CROP_RESIZE_SINGLE_IMAGE_VECT(T_type, U_type) \ + template <> \ + void crop_resize_single_image_common( \ + const T_type* image, const int64 in_height, const int64 in_width, \ + const int64 out_height, const int64 out_width, const int channels, \ + const int min_ix, const int max_ix, const CachedInterpolation* xs, \ + const int min_iy, const int max_iy, const CachedInterpolation* ys, \ + const float extrapolated_value, const bool flip_x, const bool flip_y, \ + U_type* output) { \ + if (channels <= 4) { \ + CropResizeCastImage* resizer = \ + new CropResizeCastImage( \ + in_height, in_width, out_height, out_width, channels, min_ix, \ + max_ix, xs, min_iy, max_iy, ys, extrapolated_value, flip_x, \ + flip_y, false, 15); \ + resizer->Resize(image, output); \ + delete resizer; \ + } else { \ + crop_resize_single_image(image, in_height, in_width, out_height, \ + out_width, channels, min_ix, max_ix, xs, \ + min_iy, max_iy, ys, extrapolated_value, flip_x, \ + flip_y, output); \ + } \ + } + +CROP_RESIZE_SINGLE_IMAGE_VECT(uint8, float) +CROP_RESIZE_SINGLE_IMAGE_VECT(int8, float) +CROP_RESIZE_SINGLE_IMAGE_VECT(uint16, float) +CROP_RESIZE_SINGLE_IMAGE_VECT(int16, float) +CROP_RESIZE_SINGLE_IMAGE_VECT(int32, float) +CROP_RESIZE_SINGLE_IMAGE_VECT(Eigen::half, float) +CROP_RESIZE_SINGLE_IMAGE_VECT(bfloat16, float) +CROP_RESIZE_SINGLE_IMAGE_VECT(float, float) + +// full specializations of crop_resize_single_image_common for data types that +// don't have vectorized implementations. +// image resizing for these data types default to the original code. +// at the moment, this is int64 and double. 
+ +#define CROP_RESIZE_SINGLE_IMAGE_REGULAR(T_type, U_type) \ + template <> \ + void crop_resize_single_image_common( \ + const T_type* image, const int64 in_height, const int64 in_width, \ + const int64 out_height, const int64 out_width, const int channels, \ + const int min_ix, const int max_ix, const CachedInterpolation* xs, \ + const int min_iy, const int max_iy, const CachedInterpolation* ys, \ + const float extrapolated_value, const bool flip_x, const bool flip_y, \ + U_type* output) { \ + crop_resize_single_image(image, in_height, in_width, out_height, \ + out_width, channels, min_ix, max_ix, xs, min_iy, \ + max_iy, ys, extrapolated_value, flip_x, flip_y, \ + output); \ + } + +CROP_RESIZE_SINGLE_IMAGE_REGULAR(int64, float) +CROP_RESIZE_SINGLE_IMAGE_REGULAR(double, float) + +#else + +// the vectorized implementations need at least SSE4.1 to compile. +// if that is not enabled, default to original code. + +template +void crop_resize_single_image_common( + const T* image, const int64 in_height, const int64 in_width, + const int64 out_height, const int64 out_width, const int channels, + const int min_ix, const int max_ix, const CachedInterpolation* xs, + const int min_iy, const int max_iy, const CachedInterpolation* ys, + const float extrapolated_value, const bool flip_x, const bool flip_y, + U* output) { + crop_resize_single_image(image, in_height, in_width, out_height, out_width, + channels, min_ix, max_ix, xs, min_iy, max_iy, ys, + extrapolated_value, flip_x, flip_y, output); +} + +#endif + +} // namespace +} // namespace tensorflow +#endif // define TENSORFLOW_CORE_KERNELS_CROP_RESIZE_BILINEAR_CORE_H_ diff --git a/tensorflow/core/kernels/resize_bilinear_op.cc b/tensorflow/core/kernels/resize_bilinear_op.cc index f10c9a19a7f..5cc87993467 100644 --- a/tensorflow/core/kernels/resize_bilinear_op.cc +++ b/tensorflow/core/kernels/resize_bilinear_op.cc @@ -19,15 +19,16 @@ limitations under the License. 
#include "tensorflow/core/kernels/resize_bilinear_op.h" #include -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/crop_resize_bilinear_core.h" #include "tensorflow/core/kernels/image_resizer_state.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { @@ -63,140 +64,6 @@ class ResizeBilinearOp : public OpKernel { bool align_corners_; }; -namespace { -// Compute the interpolation indices only once. -struct CachedInterpolation { - int64 lower; // Lower source index used in the interpolation - int64 upper; // Upper source index used in the interpolation - // 1-D linear iterpolation scale (see: - // https://en.wikipedia.org/wiki/Bilinear_interpolation) - float lerp; -}; - -inline void compute_interpolation_weights(const int64 out_size, - const int64 in_size, - const float scale, - CachedInterpolation* interpolation) { - interpolation[out_size].lower = 0; - interpolation[out_size].upper = 0; - for (int64 i = out_size - 1; i >= 0; --i) { - const float in = i * scale; - interpolation[i].lower = static_cast(in); - interpolation[i].upper = std::min(interpolation[i].lower + 1, in_size - 1); - interpolation[i].lerp = in - interpolation[i].lower; - } -} - -/** - * Computes the bilinear interpolation from the appropriate 4 float points - * and the linear interpolation weights. 
- */ -inline float compute_lerp(const float top_left, const float top_right, - const float bottom_left, const float bottom_right, - const float x_lerp, const float y_lerp) { - const float top = top_left + (top_right - top_left) * x_lerp; - const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp; - return top + (bottom - top) * y_lerp; -} - -template -void resize_image( - typename TTypes::ConstTensor images, const int batch_size, - const int64 in_height, const int64 in_width, const int64 out_height, - const int64 out_width, const int channels, - const std::vector& xs, - const std::vector& ys, - typename TTypes::Tensor output) TF_ATTRIBUTE_NOINLINE; -template -void resize_image(typename TTypes::ConstTensor images, - const int batch_size, const int64 in_height, - const int64 in_width, const int64 out_height, - const int64 out_width, const int channels, - const std::vector& xs_vec, - const std::vector& ys, - typename TTypes::Tensor output) { - const int64 in_row_size = in_width * channels; - const int64 in_batch_num_values = in_height * in_row_size; - const int64 out_row_size = out_width * channels; - - const T* input_b_ptr = images.data(); - const CachedInterpolation* xs = xs_vec.data(); - - if (channels == 3) { - float* output_y_ptr = output.data(); - for (int b = 0; b < batch_size; ++b) { - for (int64 y = 0; y < out_height; ++y) { - const T* ys_input_lower_ptr = input_b_ptr + ys[y].lower * in_row_size; - const T* ys_input_upper_ptr = input_b_ptr + ys[y].upper * in_row_size; - const float ys_lerp = ys[y].lerp; - for (int64 x = 0; x < out_width; ++x) { - const int64 xs_lower = xs[x].lower; - const int64 xs_upper = xs[x].upper; - const float xs_lerp = xs[x].lerp; - - // Read channel 0. - const float top_left0(ys_input_lower_ptr[xs_lower + 0]); - const float top_right0(ys_input_lower_ptr[xs_upper + 0]); - const float bottom_left0(ys_input_upper_ptr[xs_lower + 0]); - const float bottom_right0(ys_input_upper_ptr[xs_upper + 0]); - - // Read channel 1. 
- const float top_left1(ys_input_lower_ptr[xs_lower + 1]); - const float top_right1(ys_input_lower_ptr[xs_upper + 1]); - const float bottom_left1(ys_input_upper_ptr[xs_lower + 1]); - const float bottom_right1(ys_input_upper_ptr[xs_upper + 1]); - - // Read channel 2. - const float top_left2(ys_input_lower_ptr[xs_lower + 2]); - const float top_right2(ys_input_lower_ptr[xs_upper + 2]); - const float bottom_left2(ys_input_upper_ptr[xs_lower + 2]); - const float bottom_right2(ys_input_upper_ptr[xs_upper + 2]); - - // Compute output. - output_y_ptr[x * channels + 0] = - compute_lerp(top_left0, top_right0, bottom_left0, bottom_right0, - xs_lerp, ys_lerp); - output_y_ptr[x * channels + 1] = - compute_lerp(top_left1, top_right1, bottom_left1, bottom_right1, - xs_lerp, ys_lerp); - output_y_ptr[x * channels + 2] = - compute_lerp(top_left2, top_right2, bottom_left2, bottom_right2, - xs_lerp, ys_lerp); - } - output_y_ptr += out_row_size; - } - input_b_ptr += in_batch_num_values; - } - } else { - float* output_y_ptr = output.data(); - for (int b = 0; b < batch_size; ++b) { - for (int64 y = 0; y < out_height; ++y) { - const T* ys_input_lower_ptr = input_b_ptr + ys[y].lower * in_row_size; - const T* ys_input_upper_ptr = input_b_ptr + ys[y].upper * in_row_size; - const float ys_lerp = ys[y].lerp; - for (int64 x = 0; x < out_width; ++x) { - auto xs_lower = xs[x].lower; - auto xs_upper = xs[x].upper; - auto xs_lerp = xs[x].lerp; - for (int c = 0; c < channels; ++c) { - const float top_left(ys_input_lower_ptr[xs_lower + c]); - const float top_right(ys_input_lower_ptr[xs_upper + c]); - const float bottom_left(ys_input_upper_ptr[xs_lower + c]); - const float bottom_right(ys_input_upper_ptr[xs_upper + c]); - output_y_ptr[x * channels + c] = - compute_lerp(top_left, top_right, bottom_left, bottom_right, - xs_lerp, ys_lerp); - } - } - output_y_ptr += out_row_size; - } - input_b_ptr += in_batch_num_values; - } - } -} - -} // namespace - // Partial specialization of ResizeBilinear functor 
for a CPUDevice. namespace functor { template @@ -212,6 +79,11 @@ struct ResizeBilinear { const int64 out_height = output.dimension(1); const int64 out_width = output.dimension(2); + const int64 in_row_size = in_width * channels; + const int64 in_batch_num_values = in_height * in_row_size; + const int64 out_row_size = out_width * channels; + const int64 out_batch_num_values = out_row_size * out_height; + // Handle no-op resizes efficiently. if (out_height == in_height && out_width == in_width) { output = images.template cast(); @@ -232,8 +104,13 @@ struct ResizeBilinear { xs[i].upper *= channels; } - resize_image(images, batch_size, in_height, in_width, out_height, - out_width, channels, xs, ys, output); + for (int b = 0; b < batch_size; ++b) { + crop_resize_single_image_common( + images.data() + (int64)b * in_batch_num_values, in_height, in_width, + out_height, out_width, channels, 0, out_width - 1, xs.data(), 0, + out_height - 1, ys.data(), 0.0f, false, false, + output.data() + (int64)b * out_batch_num_values); + } } }; } // namespace functor diff --git a/tensorflow/core/kernels/resize_bilinear_op_test.cc b/tensorflow/core/kernels/resize_bilinear_op_test.cc index 6d578928285..55e1d2e1e22 100644 --- a/tensorflow/core/kernels/resize_bilinear_op_test.cc +++ b/tensorflow/core/kernels/resize_bilinear_op_test.cc @@ -122,7 +122,7 @@ class ResizeBilinearOpTest : public OpsTestBase { TensorShape({batch_size, output_width, output_height, channels}))); ResizeBilinearBaseline(input->tensor(), expected->tensor()); - test::ExpectTensorEqual(*expected, *GetOutput(0)); + test::ExpectClose(*expected, *GetOutput(0)); } void RunManyRandomTests(int channels) { From d3b4d3c8ffd52da2c094f58a728209c6a76f4b66 Mon Sep 17 00:00:00 2001 From: Thor Johnsen Date: Fri, 7 Sep 2018 22:07:01 -0700 Subject: [PATCH 002/540] More FMA's --- .../core/kernels/crop_resize_bilinear_core.h | 120 ++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git 
a/tensorflow/core/kernels/crop_resize_bilinear_core.h b/tensorflow/core/kernels/crop_resize_bilinear_core.h index f6846d6a557..0209130b2ce 100644 --- a/tensorflow/core/kernels/crop_resize_bilinear_core.h +++ b/tensorflow/core/kernels/crop_resize_bilinear_core.h @@ -4023,9 +4023,15 @@ void CropResizeCastImage::ResizeRow_load1_1ch_( __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); #endif +#ifdef __AVX2__ + __m128 res[1]; + res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); + this->write_1ch(ysA_output_ptr + load1_x_[current] * CHANNELS, res); +#else __m128 res[1]; res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); this->write_1ch(ysA_output_ptr + load1_x_[current] * CHANNELS, res); +#endif } } // Resize all points that fall in the 'load4from2' group for an entire row of a @@ -4059,9 +4065,15 @@ void CropResizeCastImage::ResizeRow_load2_1ch_( __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); #endif +#ifdef __AVX2__ + __m128 res[1]; + res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); + this->write_1ch(ysA_output_ptr + load2_x_[current] * CHANNELS, res); +#else __m128 res[1]; res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); this->write_1ch(ysA_output_ptr + load2_x_[current] * CHANNELS, res); +#endif } } // Resize all points that fall in the 'load4from4' group for an entire row of a @@ -4096,9 +4108,15 @@ void CropResizeCastImage::ResizeRow_load4_1ch_( __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); #endif +#ifdef __AVX2__ + __m128 res[1]; + res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); + this->write_1ch(ysA_output_ptr + load4_x_[current] * CHANNELS, res); +#else __m128 res[1]; res[0] = _mm_add_ps(top0, 
_mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); this->write_1ch(ysA_output_ptr + load4_x_[current] * CHANNELS, res); +#endif } } // Resize all points that fall in the 'load4from8' group for an entire row of a @@ -4133,9 +4151,15 @@ void CropResizeCastImage::ResizeRow_load8_1ch_( __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); #endif +#ifdef __AVX2__ + __m128 res[1]; + res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); + this->write_1ch(ysA_output_ptr + load8_x_[current] * CHANNELS, res); +#else __m128 res[1]; res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); this->write_1ch(ysA_output_ptr + load8_x_[current] * CHANNELS, res); +#endif } } #undef CHANNELS @@ -4181,10 +4205,17 @@ void CropResizeCastImage::ResizeRow_load1_2ch_( __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); #endif +#ifdef __AVX2__ + __m128 res[2]; + res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); + res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); + this->write_2ch(ysA_output_ptr + load1_x_[current] * CHANNELS, res); +#else __m128 res[2]; res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); this->write_2ch(ysA_output_ptr + load1_x_[current] * CHANNELS, res); +#endif } } // Resize all points that fall in the 'load4from2' group for an entire row of a @@ -4227,10 +4258,17 @@ void CropResizeCastImage::ResizeRow_load2_2ch_( __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); #endif +#ifdef __AVX2__ + __m128 res[2]; + res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); + res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); + this->write_2ch(ysA_output_ptr + 
load2_x_[current] * CHANNELS, res); +#else __m128 res[2]; res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); this->write_2ch(ysA_output_ptr + load2_x_[current] * CHANNELS, res); +#endif } } // Resize all points that fall in the 'load4from4' group for an entire row of a @@ -4274,10 +4312,17 @@ void CropResizeCastImage::ResizeRow_load4_2ch_( __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); #endif +#ifdef __AVX2__ + __m128 res[2]; + res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); + res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); + this->write_2ch(ysA_output_ptr + load4_x_[current] * CHANNELS, res); +#else __m128 res[2]; res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); this->write_2ch(ysA_output_ptr + load4_x_[current] * CHANNELS, res); +#endif } } // Resize all points that fall in the 'load4from8' group for an entire row of a @@ -4321,10 +4366,17 @@ void CropResizeCastImage::ResizeRow_load8_2ch_( __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); #endif +#ifdef __AVX2__ + __m128 res[2]; + res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); + res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); + this->write_2ch(ysA_output_ptr + load8_x_[current] * CHANNELS, res); +#else __m128 res[2]; res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); this->write_2ch(ysA_output_ptr + load8_x_[current] * CHANNELS, res); +#endif } } #undef CHANNELS @@ -4378,11 +4430,19 @@ void CropResizeCastImage::ResizeRow_load1_3ch_( __m128 top2 = _mm_add_ps(tl2, _mm_mul_ps(x_lerp, _mm_sub_ps(tr2, 
tl2))); __m128 bot2 = _mm_add_ps(bl2, _mm_mul_ps(x_lerp, _mm_sub_ps(br2, bl2))); #endif +#ifdef __AVX2__ + __m128 res[3]; + res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); + res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); + res[2] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot2, top2), top2); + this->write_3ch(ysA_output_ptr + load1_x_[current] * CHANNELS, res); +#else __m128 res[3]; res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); this->write_3ch(ysA_output_ptr + load1_x_[current] * CHANNELS, res); +#endif } } // Resize all points that fall in the 'load4from2' group for an entire row of a @@ -4433,11 +4493,19 @@ void CropResizeCastImage::ResizeRow_load2_3ch_( __m128 top2 = _mm_add_ps(tl2, _mm_mul_ps(x_lerp, _mm_sub_ps(tr2, tl2))); __m128 bot2 = _mm_add_ps(bl2, _mm_mul_ps(x_lerp, _mm_sub_ps(br2, bl2))); #endif +#ifdef __AVX2__ + __m128 res[3]; + res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); + res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); + res[2] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot2, top2), top2); + this->write_3ch(ysA_output_ptr + load2_x_[current] * CHANNELS, res); +#else __m128 res[3]; res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); this->write_3ch(ysA_output_ptr + load2_x_[current] * CHANNELS, res); +#endif } } // Resize all points that fall in the 'load4from4' group for an entire row of a @@ -4490,11 +4558,19 @@ void CropResizeCastImage::ResizeRow_load4_3ch_( __m128 top2 = _mm_add_ps(tl2, _mm_mul_ps(x_lerp, _mm_sub_ps(tr2, tl2))); __m128 bot2 = _mm_add_ps(bl2, _mm_mul_ps(x_lerp, _mm_sub_ps(br2, bl2))); #endif +#ifdef __AVX2__ + __m128 res[3]; + res[0] = _mm_fmadd_ps(y_lerp, 
_mm_sub_ps(bot0, top0), top0); + res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); + res[2] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot2, top2), top2); + this->write_3ch(ysA_output_ptr + load4_x_[current] * CHANNELS, res); +#else __m128 res[3]; res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); this->write_3ch(ysA_output_ptr + load4_x_[current] * CHANNELS, res); +#endif } } // Resize all points that fall in the 'load4from8' group for an entire row of a @@ -4547,11 +4623,19 @@ void CropResizeCastImage::ResizeRow_load8_3ch_( __m128 top2 = _mm_add_ps(tl2, _mm_mul_ps(x_lerp, _mm_sub_ps(tr2, tl2))); __m128 bot2 = _mm_add_ps(bl2, _mm_mul_ps(x_lerp, _mm_sub_ps(br2, bl2))); #endif +#ifdef __AVX2__ + __m128 res[3]; + res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); + res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); + res[2] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot2, top2), top2); + this->write_3ch(ysA_output_ptr + load8_x_[current] * CHANNELS, res); +#else __m128 res[3]; res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); this->write_3ch(ysA_output_ptr + load8_x_[current] * CHANNELS, res); +#endif } } #undef CHANNELS @@ -4615,12 +4699,21 @@ void CropResizeCastImage::ResizeRow_load1_4ch_( __m128 top3 = _mm_add_ps(tl3, _mm_mul_ps(x_lerp, _mm_sub_ps(tr3, tl3))); __m128 bot3 = _mm_add_ps(bl3, _mm_mul_ps(x_lerp, _mm_sub_ps(br3, bl3))); #endif +#ifdef __AVX2__ + __m128 res[4]; + res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); + res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); + res[2] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot2, top2), top2); + res[3] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot3, top3), top3); + 
this->write_4ch(ysA_output_ptr + load1_x_[current] * CHANNELS, res); +#else __m128 res[4]; res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); res[3] = _mm_add_ps(top3, _mm_mul_ps(y_lerp, _mm_sub_ps(bot3, top3))); this->write_4ch(ysA_output_ptr + load1_x_[current] * CHANNELS, res); +#endif } } // Resize all points that fall in the 'load4from2' group for an entire row of a @@ -4681,12 +4774,21 @@ void CropResizeCastImage::ResizeRow_load2_4ch_( __m128 top3 = _mm_add_ps(tl3, _mm_mul_ps(x_lerp, _mm_sub_ps(tr3, tl3))); __m128 bot3 = _mm_add_ps(bl3, _mm_mul_ps(x_lerp, _mm_sub_ps(br3, bl3))); #endif +#ifdef __AVX2__ + __m128 res[4]; + res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); + res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); + res[2] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot2, top2), top2); + res[3] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot3, top3), top3); + this->write_4ch(ysA_output_ptr + load2_x_[current] * CHANNELS, res); +#else __m128 res[4]; res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); res[3] = _mm_add_ps(top3, _mm_mul_ps(y_lerp, _mm_sub_ps(bot3, top3))); this->write_4ch(ysA_output_ptr + load2_x_[current] * CHANNELS, res); +#endif } } // Resize all points that fall in the 'load4from4' group for an entire row of a @@ -4748,12 +4850,21 @@ void CropResizeCastImage::ResizeRow_load4_4ch_( __m128 top3 = _mm_add_ps(tl3, _mm_mul_ps(x_lerp, _mm_sub_ps(tr3, tl3))); __m128 bot3 = _mm_add_ps(bl3, _mm_mul_ps(x_lerp, _mm_sub_ps(br3, bl3))); #endif +#ifdef __AVX2__ + __m128 res[4]; + res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); + res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); + res[2] = 
_mm_fmadd_ps(y_lerp, _mm_sub_ps(bot2, top2), top2); + res[3] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot3, top3), top3); + this->write_4ch(ysA_output_ptr + load4_x_[current] * CHANNELS, res); +#else __m128 res[4]; res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); res[3] = _mm_add_ps(top3, _mm_mul_ps(y_lerp, _mm_sub_ps(bot3, top3))); this->write_4ch(ysA_output_ptr + load4_x_[current] * CHANNELS, res); +#endif } } // Resize all points that fall in the 'load4from8' group for an entire row of a @@ -4815,12 +4926,21 @@ void CropResizeCastImage::ResizeRow_load8_4ch_( __m128 top3 = _mm_add_ps(tl3, _mm_mul_ps(x_lerp, _mm_sub_ps(tr3, tl3))); __m128 bot3 = _mm_add_ps(bl3, _mm_mul_ps(x_lerp, _mm_sub_ps(br3, bl3))); #endif +#ifdef __AVX2__ + __m128 res[4]; + res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); + res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); + res[2] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot2, top2), top2); + res[3] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot3, top3), top3); + this->write_4ch(ysA_output_ptr + load8_x_[current] * CHANNELS, res); +#else __m128 res[4]; res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); res[3] = _mm_add_ps(top3, _mm_mul_ps(y_lerp, _mm_sub_ps(bot3, top3))); this->write_4ch(ysA_output_ptr + load8_x_[current] * CHANNELS, res); +#endif } } #undef CHANNELS From 318d2dd306cc221a51346a076272fbdc10ebdab9 Mon Sep 17 00:00:00 2001 From: Thor Johnsen Date: Tue, 11 Sep 2018 12:48:22 -0700 Subject: [PATCH 003/540] Swap copts and prefix lines to make UBUNTU sanity check happy --- tensorflow/core/kernels/BUILD | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/BUILD 
b/tensorflow/core/kernels/BUILD index 3b2b71ec2a1..a2b2432e02a 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -2156,8 +2156,8 @@ tf_kernel_library( tf_kernel_library( name = "crop_and_resize_op", - prefix = "crop_and_resize_op", copts = tf_copts() + if_linux_x86_64(["-msse4.1 -finline-functions"]), + prefix = "crop_and_resize_op", deps = IMAGE_DEPS + [":crop_resize_bilinear_core"], ) @@ -2223,8 +2223,8 @@ tf_kernel_library( tf_kernel_library( name = "resize_bilinear_op", - prefix = "resize_bilinear_op", copts = tf_copts() + if_linux_x86_64(["-msse4.1 -finline-functions"]), + prefix = "resize_bilinear_op", deps = IMAGE_DEPS + [":crop_resize_bilinear_core"], ) From ac4d6c3d914893d5ea0d4b25ee5ceeb6a6d51b42 Mon Sep 17 00:00:00 2001 From: Thor Johnsen Date: Tue, 11 Sep 2018 14:08:46 -0700 Subject: [PATCH 004/540] Fix android build --- tensorflow/core/kernels/BUILD | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index a2b2432e02a..6bcc3607405 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -5197,6 +5197,7 @@ filegroup( "population_count_op.h", "winograd_transform.h", ":android_extended_ops_headers", + ":crop_resize_bilinear_core", ] + select({ ":xsmm_convolutions": [ "xsmm_conv2d.h", @@ -5291,6 +5292,7 @@ filegroup( "where_op.cc", "xent_op.cc", ":android_extended_ops_headers", + ":crop_resize_bilinear_core", ], ) From 3f642153f02b0ce9910a4a7970ec4a961b827c86 Mon Sep 17 00:00:00 2001 From: Thor Johnsen Date: Tue, 11 Sep 2018 14:54:25 -0700 Subject: [PATCH 005/540] Did not know that android targets must start with android, try again to fix android DEMO build --- tensorflow/core/kernels/BUILD | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 6bcc3607405..98e1f0ab13e 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -5069,6 
+5069,7 @@ filegroup( "control_flow_ops.h", "conv_2d.h", "conv_ops.h", + "crop_resize_bilinear_core.h", "data_format_ops.h", "depthtospace_op.h", "depthwise_conv_op.h", @@ -5197,7 +5198,6 @@ filegroup( "population_count_op.h", "winograd_transform.h", ":android_extended_ops_headers", - ":crop_resize_bilinear_core", ] + select({ ":xsmm_convolutions": [ "xsmm_conv2d.h", @@ -5292,7 +5292,6 @@ filegroup( "where_op.cc", "xent_op.cc", ":android_extended_ops_headers", - ":crop_resize_bilinear_core", ], ) From 11e6c7532899a078d20f1f19441fb7bcfadc93c0 Mon Sep 17 00:00:00 2001 From: Thor Johnsen Date: Thu, 13 Sep 2018 07:30:05 -0700 Subject: [PATCH 006/540] Limit vectorized methods to linux platforms (for now). Remove commented out code --- .../core/kernels/crop_resize_bilinear_core.h | 73 +++++-------------- tensorflow/core/kernels/resize_bilinear_op.cc | 8 +- 2 files changed, 24 insertions(+), 57 deletions(-) diff --git a/tensorflow/core/kernels/crop_resize_bilinear_core.h b/tensorflow/core/kernels/crop_resize_bilinear_core.h index 0209130b2ce..3125fbdd3d2 100644 --- a/tensorflow/core/kernels/crop_resize_bilinear_core.h +++ b/tensorflow/core/kernels/crop_resize_bilinear_core.h @@ -478,7 +478,21 @@ void crop_resize_single_image(const T* image, const int64 in_height, } } -#ifdef __SSE4_1__ +// template for method that calls either explicitly vectorized method +// or the fallback method, depending on what is appropriate for the +// machine you are running on +template +void crop_resize_single_image_common( + const T* image, const int64 in_height, const int64 in_width, + const int64 out_height, const int64 out_width, const int channels, + const int min_ix, const int max_ix, const CachedInterpolation* xs, + const int min_iy, const int max_iy, const CachedInterpolation* ys, + const float extrapolated_value, const bool flip_x, const bool flip_y, + U* output) TF_ATTRIBUTE_NOINLINE; + +// For now, only compile vectorized code on LINUX systems. 
+// to-do: Test vectorized code on other platforms (MacOS and Windows). +#if defined(__linux__) && defined(__SSE4_1__) // // The remaining code implements explicitly vectorized versions of a bilinear @@ -3605,6 +3619,7 @@ class CropResizeCastImage : public VectorLoader, public VectorWriter { // copy xs values, but filter out the following: // xs[].lower == xs[].upper AND xs[].lerp == 0 // xs[].lower == xs[].upper AND xs[].lerp == 1 + assert( min_ix_ <= max_ix_ ); xs_ = new CachedInterpolation[max_ix_ - min_ix_ + 1]; for (int i = min_ix_; i <= max_ix_; ++i) { int ix = i - min_ix_; @@ -3731,11 +3746,6 @@ class CropResizeCastImage : public VectorLoader, public VectorWriter { bool Load4_ok_(const int min_xidx, const int max_xidx); bool Load8_ok_(const int min_xidx, const int max_xidx); - // debugging - int y_; - const T* input_image_; - U* output_image_; - public: // // public client methods @@ -3751,9 +3761,6 @@ class CropResizeCastImage : public VectorLoader, public VectorWriter { template void CropResizeCastImage::Resize(const T* input_image, U* output_image) { - // store these for debugging - input_image_ = input_image; - output_image_ = output_image_; // U uEx = cast_to(extrapolated_value_, _f_min_val, _f_max_val, _u_min_val, _u_max_val); @@ -3798,7 +3805,6 @@ void CropResizeCastImage::Resize(const T* input_image, U* output_image) { // interpolation region int y = y0_; for (y = y0_; y + 1 <= y1_; y += 2) { - y_ = y; const int iyA = flip_y_ ? out_height_ - 1 - min_iy_ - y : y - min_iy_; const float yA_lerp = ys_[iyA].lerp; const __m128 ysA_lerp = _mm_set1_ps(yA_lerp); @@ -3903,11 +3909,8 @@ void CropResizeCastImage::Resize(const T* input_image, U* output_image) { } else { assert(false); } - // printf("*2 :: y=%d, channels_=%d, - // num_load8_=%d\n",y,channels_,num_load8_); } for (; y <= y1_; ++y) { - y_ = y; const int iyA = flip_y_ ? 
out_height_ - 1 - min_iy_ - y : y - min_iy_; const float yA_lerp = ys_[iyA].lerp; const __m128 ysA_lerp = _mm_set1_ps(yA_lerp); @@ -3963,7 +3966,6 @@ void CropResizeCastImage::Resize(const T* input_image, U* output_image) { } else { assert(false); } - // printf("*1 :: y=%d\n",y); } } @@ -5025,7 +5027,6 @@ void CropResizeCastImage::Configure_() { assert(load_group >= 0 && load_group <= 4); int current = num_cases[load_group]; assert(current >= 0); - // printf(" ... load_group=%d, current=%d\n",load_group,current); if (load_group == 0) { // general case assert(current < num_general_); @@ -5036,7 +5037,6 @@ void CropResizeCastImage::Configure_() { load1_x_[current] = x; int min_xidx, max_xidx; ComputeXIndexRange_(x, &min_xidx, &max_xidx); - // printf(" ... x=%d, min_xidx=%d, max_xidx=%d\n",x,min_xidx,max_xidx); load1_offsets_[current] = min_xidx * channels_; float* xs_lerp = (float*)(load1_shuffle_masks_ + current * channels_ * 3); char* shufmasks1 = @@ -5049,15 +5049,12 @@ void CropResizeCastImage::Configure_() { float lerp = xs_[ix].lerp; int widx0 = xs_[ix].lower - load1_offsets_[current]; // word index within SSE vector - // printf(" ..... pix_ix=%d, lerp=%f, widx0=%d\n",ix,lerp,widx0); for (int ch = 0; ch < channels_; ++ch) { int idx = pix * channels_ + ch; xs_lerp[idx] = lerp; int shufvec = idx / 4; int shufidx = idx % 4; int widx = widx0 + ch; - // printf(" ....... 
ch=%d, idx=%d, shufvec=%d, shufidx=%d, widx=%d, - // shufmasks1[%ld...]=...\n",ch,idx,shufvec,shufidx,widx,shufvec*16+shufidx*sizeof(T)); for (int b = 0; b < sizeof(T); ++b) { shufmasks1[shufvec * 16 + shufidx * sizeof(T) + b] = widx * sizeof(T) + b; @@ -5111,12 +5108,6 @@ void CropResizeCastImage::Configure_() { xs_lerp[idx] = lerp; } } - /* debug - printf("load4from4_%dch :: x=%d - - index={%ld",channels_,x,index[0]*sizeof(T)); - for (int i = 1; i < 4; ++i) printf(",%ld",index[i]*sizeof(T)); - printf("}\n"); - */ } else if (load_group == 4) { // load4from8 assert(current < num_load8_); @@ -5133,20 +5124,6 @@ void CropResizeCastImage::Configure_() { xs_lerp[idx] = lerp; } } - /* debug - printf("x=%d :: load8_x_[%d] = %d",x,current,load8_x_[current]); - printf(", load8_offsets_[%d] = {%d",current*4,load8_offsets_[current*4]); - for (int pix = 1; pix < 4; ++pix) - printf(",%d",load8_offsets_[current*4+pix]); - printf("}"); - for (int ch = 0; ch < channels_; ++ch) { - float* p = (float*)(load8_mmxs_lerp_ + current * channels_ + ch); - printf(", lerp[%d] = {%.3f",current*channels_+ch,p[0]); - for (int j = 1; j < 4; ++j) printf(",%.3f",p[j]); - printf("}"); - } - printf("\n"); - */ } else { assert(false); } @@ -5517,19 +5494,6 @@ bool CropResizeCastImage::clip_necessary() { return false; } -#endif // __SSE4_1__ - -template -void crop_resize_single_image_common( - const T* image, const int64 in_height, const int64 in_width, - const int64 out_height, const int64 out_width, const int channels, - const int min_ix, const int max_ix, const CachedInterpolation* xs, - const int min_iy, const int max_iy, const CachedInterpolation* ys, - const float extrapolated_value, const bool flip_x, const bool flip_y, - U* output) TF_ATTRIBUTE_NOINLINE; - -#ifdef __SSE4_1__ - // full specializations of crop_resize_single_image_common for data types that // have vectorized implementations. 
// at the moment, this is uint8, int8, uint16, int16, int32, Eigen::half, @@ -5594,8 +5558,9 @@ CROP_RESIZE_SINGLE_IMAGE_REGULAR(double, float) #else -// the vectorized implementations need at least SSE4.1 to compile. -// if that is not enabled, default to original code. +// compile fall-back code if either +// a) target is not a linux machine +// b) target architecture does not support at least SSE4.1 template void crop_resize_single_image_common( diff --git a/tensorflow/core/kernels/resize_bilinear_op.cc b/tensorflow/core/kernels/resize_bilinear_op.cc index 5cc87993467..566e94cdef1 100644 --- a/tensorflow/core/kernels/resize_bilinear_op.cc +++ b/tensorflow/core/kernels/resize_bilinear_op.cc @@ -90,12 +90,13 @@ struct ResizeBilinear { return; } - std::vector ys(out_height + 1); - std::vector xs(out_width + 1); - // Compute the cached interpolation weights on the x and y dimensions. + std::vector ys; + ys.resize(out_height + 1); compute_interpolation_weights(out_height, in_height, height_scale, ys.data()); + std::vector xs; + xs.resize(out_width + 1); compute_interpolation_weights(out_width, in_width, width_scale, xs.data()); // Scale x interpolation weights to avoid a multiplication during iteration. @@ -111,6 +112,7 @@ struct ResizeBilinear { out_height - 1, ys.data(), 0.0f, false, false, output.data() + (int64)b * out_batch_num_values); } + // xs and ys are freed when they go out of scope } }; } // namespace functor From 848a72432d2093893bc3a70ed41d528876ffa324 Mon Sep 17 00:00:00 2001 From: Thor Johnsen Date: Thu, 13 Sep 2018 11:24:21 -0700 Subject: [PATCH 007/540] Run clang-format again. Don't override compile target switch(es). 
--- tensorflow/core/kernels/BUILD | 4 ++-- tensorflow/core/kernels/crop_resize_bilinear_core.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 98e1f0ab13e..08f698c2577 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -2156,7 +2156,7 @@ tf_kernel_library( tf_kernel_library( name = "crop_and_resize_op", - copts = tf_copts() + if_linux_x86_64(["-msse4.1 -finline-functions"]), + copts = tf_copts() + if_linux_x86_64(["-finline-functions"]), prefix = "crop_and_resize_op", deps = IMAGE_DEPS + [":crop_resize_bilinear_core"], ) @@ -2223,7 +2223,7 @@ tf_kernel_library( tf_kernel_library( name = "resize_bilinear_op", - copts = tf_copts() + if_linux_x86_64(["-msse4.1 -finline-functions"]), + copts = tf_copts() + if_linux_x86_64(["-finline-functions"]), prefix = "resize_bilinear_op", deps = IMAGE_DEPS + [":crop_resize_bilinear_core"], ) diff --git a/tensorflow/core/kernels/crop_resize_bilinear_core.h b/tensorflow/core/kernels/crop_resize_bilinear_core.h index 3125fbdd3d2..62c275d4ccd 100644 --- a/tensorflow/core/kernels/crop_resize_bilinear_core.h +++ b/tensorflow/core/kernels/crop_resize_bilinear_core.h @@ -3619,7 +3619,7 @@ class CropResizeCastImage : public VectorLoader, public VectorWriter { // copy xs values, but filter out the following: // xs[].lower == xs[].upper AND xs[].lerp == 0 // xs[].lower == xs[].upper AND xs[].lerp == 1 - assert( min_ix_ <= max_ix_ ); + assert(min_ix_ <= max_ix_); xs_ = new CachedInterpolation[max_ix_ - min_ix_ + 1]; for (int i = min_ix_; i <= max_ix_; ++i) { int ix = i - min_ix_; From c81edb2c1a1eb8b6c831978aed5cb1d3b89f14af Mon Sep 17 00:00:00 2001 From: Mahmoud Abuzaina Date: Thu, 13 Sep 2018 13:50:57 -0700 Subject: [PATCH 008/540] Fixed more merge conflicts --- tensorflow/core/BUILD | 7 +- tensorflow/core/graph/mkl_graph_util.h | 29 +- tensorflow/core/kernels/BUILD | 11 +- tensorflow/core/kernels/mkl_conv_ops.cc | 887 
++++++++++++++++-- .../core/kernels/mkl_quantized_conv_ops.h | 55 ++ tensorflow/core/ops/mkl_nn_ops.cc | 612 ++++++++++++ tensorflow/core/util/mkl_util.h | 16 + 7 files changed, 1517 insertions(+), 100 deletions(-) create mode 100644 tensorflow/core/kernels/mkl_quantized_conv_ops.h create mode 100644 tensorflow/core/ops/mkl_nn_ops.cc diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 8f32bc28449..df6b2297b41 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -1051,6 +1051,7 @@ tf_gen_op_libs( "logging_ops", "manip_ops", "math_ops", + "mkl_nn_ops", "nn_ops", "no_op", "parsing_ops", @@ -1189,7 +1190,7 @@ cc_library( ":training_ops_op_lib", ":user_ops_op_lib", ":word2vec_ops", - ] + tf_additional_cloud_op_deps(), + ] + if_mkl([":mkl_nn_ops_op_lib"]) + tf_additional_cloud_op_deps(), alwayslink = 1, ) @@ -1244,7 +1245,9 @@ cc_library( ":framework", ":lib", ":nn_ops_op_lib", - ], + ] + if_mkl([ + ":mkl_nn_ops_op_lib", + ]), alwayslink = 1, ) diff --git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h index bab1df87a4d..990b2fe9b04 100644 --- a/tensorflow/core/graph/mkl_graph_util.h +++ b/tensorflow/core/graph/mkl_graph_util.h @@ -75,6 +75,8 @@ int inline GetTensorMetaDataIndex(int n, int total_tensors) { namespace mkl_op_registry { static const char* kMklOpLabel = "MklOp"; static const char* kMklOpLabelPattern = "label='MklOp'"; +static const char* kMklQuantizedOpLabel = "QuantizedMklOp"; +static const char* kMklQuantizedOpLabelPattern = "label='QuantizedMklOp'"; // Prefix that we add to Tensorflow op name to construct Mkl op name. 
static const char* const kMklOpPrefix = "_Mkl"; @@ -91,9 +93,30 @@ inline string GetMklOpName(const string& name) { // @return: true if opname is registered as Mkl op; false otherwise static inline bool IsMklOp(const string& op_name, DataType T) { string kernel = KernelsRegisteredForOp(op_name); - bool result = - kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT); - return result; + + // Restrict quantized ops to QUINT8 and QINT8 for now + if (kernel.find(kMklQuantizedOpLabelPattern) != string::npos) { + return (T == DT_QUINT8 || T == DT_QINT8); + } + // Restrict regular ops to FLOAT + if (kernel.find(kMklOpLabelPattern) != string::npos) { + return (T == DT_FLOAT); + } + return false; +} + +// TODO(mdfaijul): QuantizedConv2D is registered with input: QUINT8 +// filter:QINT8 for mkldnn integration. First a dummy kernel is created +// and then it is replaced by an actual kernel. +static inline bool IsMklOp(const string& op_name, DataType Tinput, + DataType Tfilter) { + string kernel = KernelsRegisteredForOp(op_name); + + // Restrict quantized ops to QUINT8 and QINT8 for now + if (kernel.find(kMklQuantizedOpLabelPattern) != string::npos) { + return (Tinput == DT_QUINT8 && Tfilter == DT_QINT8); + } + return false; } // Check whether opname with type T is registered as MKL-compliant and diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 94d3ab44672..d1e1596b0bd 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -3468,7 +3468,7 @@ NN_DEPS = [ "//tensorflow/core:nn_grad", "//tensorflow/core:nn_ops_op_lib", "//third_party/eigen3", -] +] + if_mkl(["//tensorflow/core:mkl_nn_ops_op_lib"]) tf_kernel_library( name = "batch_norm_op", @@ -6215,6 +6215,7 @@ tf_cc_test( tf_mkl_kernel_library( name = "mkl_conv_op", + hdrs = ["mkl_quantized_conv_ops.h"], prefix = "mkl_conv", deps = [ ":bounds_check", @@ -6224,6 +6225,7 @@ tf_mkl_kernel_library( "//tensorflow/core:framework", "//tensorflow/core:lib", 
"//tensorflow/core:lib_internal", + "//tensorflow/core:mkl_nn_ops_op_lib", "//tensorflow/core:nn_ops_op_lib", ] + mkl_deps(), ) @@ -6238,6 +6240,7 @@ tf_mkl_kernel_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core:mkl_nn_ops_op_lib", "//tensorflow/core:nn_ops_op_lib", ] + mkl_deps(), ) @@ -6253,6 +6256,7 @@ tf_mkl_kernel_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core:mkl_nn_ops_op_lib", "//tensorflow/core:nn_ops_op_lib", ] + mkl_deps(), ) @@ -6272,6 +6276,7 @@ tf_mkl_kernel_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core:mkl_nn_ops_op_lib", "//tensorflow/core:nn_ops_op_lib", ] + mkl_deps(), ) @@ -6286,6 +6291,7 @@ tf_mkl_kernel_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core:mkl_nn_ops_op_lib", "//tensorflow/core:nn_ops_op_lib", "//third_party/eigen3", ] + mkl_deps(), @@ -6301,6 +6307,7 @@ tf_mkl_kernel_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core:mkl_nn_ops_op_lib", "//tensorflow/core:nn_ops_op_lib", "//third_party/eigen3", ] + mkl_deps(), @@ -6321,7 +6328,7 @@ tf_mkl_kernel_library( tf_mkl_kernel_library( name = "mkl_concat_op", prefix = "mkl_concat_op", - deps = ARRAY_DEPS + mkl_deps(), + deps = [":quantization_utils"] + ARRAY_DEPS + mkl_deps(), ) tf_mkl_kernel_library( diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index 184e0cb0034..95b6dc066c3 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -17,9 +17,9 @@ limitations under the License. 
#ifdef INTEL_MKL #include +#include #include #include -#include #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -29,6 +29,8 @@ limitations under the License. #include "tensorflow/core/framework/tensor_slice.h" #include "tensorflow/core/kernels/bounds_check.h" #include "tensorflow/core/kernels/mkl_conv_ops.h" +#include "tensorflow/core/kernels/mkl_quantized_conv_ops.h" +#include "tensorflow/core/kernels/no_op.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/array_slice.h" @@ -69,6 +71,12 @@ struct MklConvFwdParams { memory::dims dilations; memory::dims padding_left; memory::dims padding_right; + string dtypes = string(""); + struct PostOpParam { + string name; + std::vector param; + }; + std::vector post_op_params; MklConvFwdParams(memory::dims src_dims, memory::dims filter_dims, memory::dims bias_dims, memory::dims dst_dims, @@ -83,8 +91,10 @@ struct MklConvFwdParams { padding_left(padding_left), padding_right(padding_right) {} }; - -template +// With quantization, input, filter, and output can have different types +// so we use differnt template parameter for each type +template class MklConvFwdPrimitive : public MklPrimitive { public: explicit MklConvFwdPrimitive(const MklConvFwdParams& convFwdDims) @@ -103,16 +113,16 @@ class MklConvFwdPrimitive : public MklPrimitive { // filter_data: input data buffer of filter (weights) // bias_data: input data buffer of bias // dst_data: output data buffer of dst - void Execute(const T* src_data, const T* filter_data, const T* bias_data, - const T* dst_data) { + void Execute(const Tinput* src_data, const Tfilter* filter_data, + const Tbias* bias_data, const Toutput* dst_data) { context_.src_mem->set_data_handle( - static_cast(const_cast(src_data))); + static_cast(const_cast(src_data))); context_.filter_mem->set_data_handle( - static_cast(const_cast(filter_data))); + 
static_cast(const_cast(filter_data))); context_.bias_mem->set_data_handle( - static_cast(const_cast(bias_data))); + static_cast(const_cast(bias_data))); context_.dst_mem->set_data_handle( - static_cast(const_cast(dst_data))); + static_cast(const_cast(dst_data))); context_.fwd_stream->submit(context_.fwd_primitives); // after exec, set data handle back @@ -128,13 +138,14 @@ class MklConvFwdPrimitive : public MklPrimitive { // src_data: input data buffer of src // filter_data: input data buffer of filter (weights) // dst_data: output data buffer of dst - void Execute(const T* src_data, const T* filter_data, const T* dst_data) { + void Execute(const Tinput* src_data, const Tfilter* filter_data, + const Toutput* dst_data) { context_.src_mem->set_data_handle( - static_cast(const_cast(src_data))); + static_cast(const_cast(src_data))); context_.filter_mem->set_data_handle( - static_cast(const_cast(filter_data))); + static_cast(const_cast(filter_data))); context_.dst_mem->set_data_handle( - static_cast(const_cast(dst_data))); + static_cast(const_cast(dst_data))); context_.fwd_stream->submit(context_.fwd_primitives); // after execution, set data handle back @@ -200,17 +211,17 @@ class MklConvFwdPrimitive : public MklPrimitive { void Setup(const MklConvFwdParams& convFwdDims) { // create memory descriptors for convolution data w/ no specified format context_.src_md.reset(new memory::desc( - {convFwdDims.src_dims}, MklDnnType(), memory::format::any)); + {convFwdDims.src_dims}, MklDnnType(), memory::format::any)); context_.filter_md.reset(new memory::desc( - {convFwdDims.filter_dims}, MklDnnType(), memory::format::any)); + {convFwdDims.filter_dims}, MklDnnType(), memory::format::any)); context_.dst_md.reset(new memory::desc( - {convFwdDims.dst_dims}, MklDnnType(), memory::format::any)); + {convFwdDims.dst_dims}, MklDnnType(), memory::format::any)); if (!convFwdDims.bias_dims.empty()) context_.bias_md.reset(new memory::desc( - {convFwdDims.bias_dims}, MklDnnType(), 
memory::format::any)); + {convFwdDims.bias_dims}, MklDnnType(), memory::format::any)); // create a convolution if (!convFwdDims.bias_dims.empty()) { @@ -230,6 +241,42 @@ class MklConvFwdPrimitive : public MklPrimitive { context_.fwd_pd.reset(new convolution_forward::primitive_desc( *context_.fwd_desc, cpu_engine_)); + // Check if there is any fusions as post-ops + auto const& post_op_params = convFwdDims.post_op_params; + mkldnn::primitive_attr post_ops_attr; + mkldnn::post_ops post_ops; + if (!post_op_params.empty()) { + for (auto const& post_op_param : post_op_params) { + if (post_op_param.name == "relu") { + CHECK_EQ(post_op_param.param.size(), 3); + float op_scale = post_op_param.param[0]; + float op_alpha = post_op_param.param[1]; + float op_beta = post_op_param.param[2]; + post_ops.append_eltwise(op_scale, mkldnn::eltwise_relu, op_alpha, + op_beta); + } else if (post_op_param.name == "sum") { + CHECK_EQ(post_op_param.param.size(), 1); + float op_scale = post_op_param.param[0]; + post_ops.append_sum(op_scale); + } else if (post_op_param.name == "output_scale") { + CHECK_EQ(post_op_param.param.size(), 1); + std::vector scales; + scales.push_back(post_op_param.param[0]); + post_ops_attr.set_output_scales(0, scales); + } else { + TF_CHECK_OK( + Status(error::Code::UNIMPLEMENTED, + "For now, only Relu and Sum are supported for fusion.")); + } + } + post_ops_attr.set_post_ops(post_ops); + context_.fwd_pd.reset(new convolution_forward::primitive_desc( + *context_.fwd_desc, post_ops_attr, cpu_engine_)); + } else { + context_.fwd_pd.reset(new convolution_forward::primitive_desc( + *context_.fwd_desc, cpu_engine_)); + } + // store the expected memory format context_.src_fmt = static_cast( context_.fwd_pd.get()->src_primitive_desc().desc().data.format); @@ -268,23 +315,30 @@ class MklConvFwdPrimitive : public MklPrimitive { engine cpu_engine_; }; -template +template class MklConvFwdPrimitiveFactory : public MklPrimitiveFactory { public: - static MklConvFwdPrimitive* 
Get(const MklConvFwdParams& convFwdDims, - bool do_not_cache) { - MklConvFwdPrimitive* conv_fwd = nullptr; + static MklConvFwdPrimitive* Get( + const MklConvFwdParams& convFwdDims, bool do_not_cache) { + MklConvFwdPrimitive* conv_fwd = nullptr; if (do_not_cache) { /* Always create new primitive */ - conv_fwd = new MklConvFwdPrimitive(convFwdDims); + conv_fwd = new MklConvFwdPrimitive( + convFwdDims); } else { // try to find a suitable one in pool - conv_fwd = dynamic_cast*>( - MklConvFwdPrimitiveFactory::GetInstance().GetConvFwd(convFwdDims)); + conv_fwd = + dynamic_cast*>( + MklConvFwdPrimitiveFactory::GetInstance() + .GetConvFwd(convFwdDims)); if (conv_fwd == nullptr) { - conv_fwd = new MklConvFwdPrimitive(convFwdDims); - MklConvFwdPrimitiveFactory::GetInstance().SetConvFwd(convFwdDims, - conv_fwd); + conv_fwd = new MklConvFwdPrimitive( + convFwdDims); + MklConvFwdPrimitiveFactory::GetInstance() + .SetConvFwd(convFwdDims, conv_fwd); } } @@ -314,6 +368,31 @@ class MklConvFwdPrimitiveFactory : public MklPrimitiveFactory { key_creator.AddAsKey(convFwdDims.dilations); key_creator.AddAsKey(convFwdDims.padding_left); key_creator.AddAsKey(convFwdDims.padding_right); + key_creator.AddAsKey(convFwdDims.dtypes); + + // Generate keys for post-ops + for (auto const& post_op_param : convFwdDims.post_op_params) { + if (post_op_param.name == "relu") { + CHECK_EQ(post_op_param.param.size(), 3); + key_creator.AddAsKey(post_op_param.name); + key_creator.AddAsKey(post_op_param.param[0]); + key_creator.AddAsKey(post_op_param.param[1]); + key_creator.AddAsKey(post_op_param.param[2]); + } else if (post_op_param.name == "sum") { + CHECK_EQ(post_op_param.param.size(), 1); + key_creator.AddAsKey(post_op_param.name); + key_creator.AddAsKey(post_op_param.param[0]); + } else if (post_op_param.name == "output_scale") { + CHECK_EQ(post_op_param.param.size(), 1); + key_creator.AddAsKey(post_op_param.name); + key_creator.AddAsKey(post_op_param.param[0]); + } else { + TF_CHECK_OK( + 
Status(error::Code::UNIMPLEMENTED, + "For now, only Relu and Sum are supported for fusion.")); + } + } + return key_creator.GetKey(); } @@ -757,10 +836,23 @@ class MklConvOp : public OpKernel { TensorFormat data_format_; }; +// FP32 kernel registration for INTEL_MKL_ML +REGISTER_KERNEL_BUILDER(Name("_MklConv2D") + .Device(DEVICE_CPU) + .TypeConstraint("T") + .Label(mkl_op_registry::kMklOpLabel), + MklConv2DOp); +REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias") + .Device(DEVICE_CPU) + .TypeConstraint("T") + .Label(mkl_op_registry::kMklOpLabel), + MklConv2DOp); + #else // Base class for convolution forward operations -template +template class MklConvOp : public OpKernel { public: ~MklConvOp() {} @@ -831,8 +923,8 @@ class MklConvOp : public OpKernel { errors::InvalidArgument("Filter should not be in " "Mkl Layout")); - MklDnnData src(&cpu_engine); - MklDnnData filter(&cpu_engine); + MklDnnData src(&cpu_engine_); + MklDnnData filter(&cpu_engine_); memory::dims src_dims, filter_dims, padding_left, padding_right, dilations, strides; @@ -865,9 +957,15 @@ class MklConvOp : public OpKernel { // as 2nd output of Conv2D/3D. filter_mkl_shape.SetMklTensor(false); Tensor* output_filter_tensor = nullptr; - AllocateOutputSetMklShape(context, kOutputIndex_Filter, - &output_filter_tensor, - filter_tf_shape, filter_mkl_shape); + // MklConv2D also outputs converted filter as 2nd output. + if (typeid(Tinput) == typeid(float) && + typeid(Tfilter) == typeid(float) && + typeid(Toutput) == typeid(float)) { + filter_mkl_shape.SetMklTensor(false); + AllocateOutputSetMklShape(context, kOutputIndex_Filter, + &output_filter_tensor, filter_tf_shape, + filter_mkl_shape); + } return; } @@ -887,15 +985,17 @@ class MklConvOp : public OpKernel { // Conv3D: NDHWC or NCDHW auto src_md = src_mkl_shape.IsMklTensor() ? 
src_mkl_shape.GetMklLayout() - : memory::desc(src_dims, MklDnnType(), tf_fmt); + : memory::desc(src_dims, MklDnnType(), tf_fmt); + src.SetUsrMem(src_md, &src_tensor); // Although filter shape (filter_dims) required is in MKL-DNN order, // the layout is Tensorflow's layout (HWIO). auto filter_md = filter_mkl_shape.IsMklTensor() // Should NEVER be true ? filter_mkl_shape.GetMklLayout() - : memory::desc(filter_dims, MklDnnType(), + : memory::desc(filter_dims, MklDnnType(), isConv2D ? memory::format::hwio : memory::format::dhwio); + filter.SetUsrMem(filter_md, &filter_tensor); // MKLDNN dilation starts from 0. for (int i = 0; i < dilations.size(); i++) dilations[i] -= 1; @@ -905,27 +1005,39 @@ class MklConvOp : public OpKernel { // in the following cases // 1. Legacy CPU without AVX512/AVX2, or // 2. 1x1 convolution with stride != 1 - bool do_not_cache = MklPrimitiveFactory::IsPrimitiveMemOptEnabled() && + bool do_not_cache = MklPrimitiveFactory::IsPrimitiveMemOptEnabled() && (src_dims[MklDnnDims::Dim_N] > kSmallBatchSize) && - (MklPrimitiveFactory::IsLegacyPlatform() || + (MklPrimitiveFactory::IsLegacyPlatform() || IsConv1x1StrideNot1(filter_dims, strides)); // get a conv2d fwd from primitive pool - MklConvFwdPrimitive* conv_fwd = nullptr; + MklConvFwdPrimitive* + conv_fwd = nullptr; if (biasEnabled) { memory::dims bias_dims = {}; conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_dims); MklConvFwdParams convFwdDims(src_dims, filter_dims, bias_dims, dst_dims_mkl_order, strides, dilations, padding_left, padding_right); - conv_fwd = MklConvFwdPrimitiveFactory::Get( - convFwdDims, do_not_cache); + + // TODO(mdfaijul): Extend the basic parameters for data types and + // fusions + this->ExtendConvFwdParams(context, convFwdDims); + + conv_fwd = MklConvFwdPrimitiveFactory::Get(convFwdDims, + do_not_cache); } else { MklConvFwdParams convFwdDims(src_dims, filter_dims, NONE_DIMS, dst_dims_mkl_order, strides, dilations, padding_left, padding_right); - conv_fwd = 
MklConvFwdPrimitiveFactory::Get( - convFwdDims, do_not_cache); + + // Extend the basic parameters for data types and fusions + this->ExtendConvFwdParams(context, convFwdDims); + + conv_fwd = MklConvFwdPrimitiveFactory::Get(convFwdDims, + do_not_cache); } // allocate output tensors output_tensor and filter_out_tensor @@ -934,38 +1046,42 @@ class MklConvOp : public OpKernel { AllocateOutputTensor(context, *conv_fwd_pd, dst_dims_mkl_order, tf_fmt, &dst_tensor); Tensor* filter_out_tensor = nullptr; - AllocateFilterOutputTensor(context, *conv_fwd_pd, - TFShapeToMklDnnDims(filter_tf_shape), - &filter_out_tensor); + if (typeid(Tinput) == typeid(float) && typeid(Tfilter) == typeid(float) && + typeid(Toutput) == typeid(float)) { + AllocateFilterOutputTensor(context, *conv_fwd_pd, + TFShapeToMklDnnDims(filter_tf_shape), + &filter_out_tensor); + } - T* dst_data = static_cast(dst_tensor->flat().data()); + Ttemp_output* dst_data = + reinterpret_cast(dst_tensor->flat().data()); // check whether src/filter need reorder - T *src_data = nullptr; + Tinput* src_data = nullptr; if (src_md.data.format != conv_fwd->GetSrcMemoryFormat()) { src.SetUsrMem(src_md, &src_tensor); src.CheckReorderToOpMem(conv_fwd_pd.get()->src_primitive_desc()); - src_data = static_cast(src.GetOpMem().get_data_handle()); + src_data = static_cast(src.GetOpMem().get_data_handle()); } else { - src_data = static_cast(const_cast(src_tensor.flat().data())); + src_data = static_cast( + const_cast(src_tensor.flat().data())); } - T* filter_data = nullptr; + Tfilter* filter_data = nullptr; if (filter_md.data.format != conv_fwd->GetFilterMemoryFormat()) { filter.SetUsrMem(filter_md, &filter_tensor); filter.CheckReorderToOpMem(conv_fwd_pd.get()->weights_primitive_desc(), filter.GetTensorBuffer(filter_out_tensor)); - filter_data = static_cast(filter.GetOpMem().get_data_handle()); + filter_data = static_cast(filter.GetOpMem().get_data_handle()); } else { filter_data = - static_cast(const_cast(filter_tensor.flat().data())); 
+ static_cast(const_cast(filter_tensor.flat().data())); } // execute convolution if (biasEnabled) { const Tensor& bias_tensor = MklGetInput(context, kInputIndex_Bias); - T* bias_data = static_cast(const_cast( - bias_tensor.flat().data())); - + Tbias* bias_data = + this->GetBiasHandle(context, conv_fwd_pd, bias_tensor); conv_fwd->Execute(src_data, filter_data, bias_data, dst_data); } else { conv_fwd->Execute(src_data, filter_data, dst_data); @@ -982,6 +1098,62 @@ class MklConvOp : public OpKernel { } } + protected: + virtual void ExtendConvFwdParams(OpKernelContext* context, + MklConvFwdParams& params) { + // Create a string from data types of input, filter, bias, and output. + params.dtypes.append(typeid(Tinput).name()); + params.dtypes.append(typeid(Tfilter).name()); + params.dtypes.append(typeid(Tbias).name()); + params.dtypes.append(typeid(Toutput).name()); + } + + virtual Tbias* GetBiasHandle( + OpKernelContext* context, + std::shared_ptr& + conv2d_fwd_pd, + const Tensor& bias_tensor) { + if (biasEnabled) { + return static_cast( + const_cast(bias_tensor.flat().data())); + } else { + return nullptr; + } + } + + // Allocate output tensor. + virtual void AllocateOutputTensor( + OpKernelContext* context, + const convolution_forward::primitive_desc& conv_prim_desc, + const memory::dims& output_dims_mkl_order, + memory::format output_tf_format, Tensor** output_tensor) { + CHECK_NOTNULL(output_tensor); + auto dst_pd = conv_prim_desc.dst_primitive_desc(); + + auto dst_md = dst_pd.desc(); + if (!std::is_same::value) { + dst_md.data.data_type = + static_cast(MklDnnType()); + dst_pd = memory::primitive_desc(dst_md, cpu_engine_); + } + // Allocate shape of Mkl tensor. + MklDnnShape output_mkl_shape; + output_mkl_shape.SetMklTensor(true); + output_mkl_shape.SetMklLayout(&dst_pd); + output_mkl_shape.SetElemType(MklDnnType()); + output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), + output_dims_mkl_order, output_tf_format); + + // Allocate shape of TF tensor. 
+ TensorShape output_tf_shape; + output_tf_shape.AddDim((dst_pd.get_size() / sizeof(Toutput))); + + AllocateOutputSetMklShape(context, kOutputIndex_Dst, output_tensor, + output_tf_shape, output_mkl_shape); + } + + engine cpu_engine_ = engine(engine::cpu, 0); + private: std::vector strides_; std::vector dilations_; @@ -990,34 +1162,8 @@ class MklConvOp : public OpKernel { const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2; const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1; const int kDilationH = 0, kDilationW = 1; - engine cpu_engine = engine(engine::cpu, 0); - // Allocate output tensor. - void AllocateOutputTensor( - OpKernelContext* context, - const convolution_forward::primitive_desc& conv_prim_desc, - const memory::dims& output_dims_mkl_order, - memory::format output_tf_format, Tensor** output_tensor) { - CHECK_NOTNULL(output_tensor); - auto dst_pd = conv_prim_desc.dst_primitive_desc(); - - // Allocate shape of Mkl tensor. - MklDnnShape output_mkl_shape; - output_mkl_shape.SetMklTensor(true); - output_mkl_shape.SetMklLayout(&dst_pd); - output_mkl_shape.SetElemType(MklDnnType()); - output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), - output_dims_mkl_order, output_tf_format); - - // Allocate shape of TF tensor. - TensorShape output_tf_shape; - output_tf_shape.AddDim((dst_pd.get_size() / sizeof(T))); - - AllocateOutputSetMklShape(context, kOutputIndex_Dst, output_tensor, - output_tf_shape, output_mkl_shape); - } - - // Allocate output tensor. + // Allocate filter output tensor. 
void AllocateFilterOutputTensor( OpKernelContext* context, const convolution_forward::primitive_desc& conv_prim_desc, @@ -1029,7 +1175,7 @@ class MklConvOp : public OpKernel { MklDnnShape filter_mkl_shape; filter_mkl_shape.SetMklTensor(true); filter_mkl_shape.SetMklLayout(&filter_pd); - filter_mkl_shape.SetElemType(MklDnnType()); + filter_mkl_shape.SetElemType(MklDnnType()); // The format of the filter is actually OIhw8i8o, but TF doesn't support // this format. Just use format::blocked for now because the layout @@ -1039,17 +1185,17 @@ class MklConvOp : public OpKernel { // Allocate the data space for the filter to propagate as TF tensor. TensorShape filter_tf_shape; - filter_tf_shape.AddDim((filter_pd.get_size() / sizeof(T))); + filter_tf_shape.AddDim((filter_pd.get_size() / sizeof(Tfilter))); AllocateOutputSetMklShape(context, kOutputIndex_Filter, filter_tensor, filter_tf_shape, filter_mkl_shape); } - // Prepare and execute net - checks for input and output reorders. void PrepareAndExecuteNet( const convolution_forward::primitive_desc& conv_prim_desc, - MklDnnData* src, MklDnnData* filter, MklDnnData* bias, - MklDnnData* output, Tensor* filter_out_tensor) { + MklDnnData* src, MklDnnData* filter, + MklDnnData* bias, MklDnnData* output, + Tensor* filter_out_tensor) { CHECK_NOTNULL(filter_out_tensor); // Create reorders between user layout and MKL layout if it is needed and @@ -1080,20 +1226,575 @@ class MklConvOp : public OpKernel { } }; -#endif +// We create new class for each verison of Quantized Convolution and inherit +// from the FP32 version of the base class +template +class MklQuantizedConv2DOp + : public MklConvOp { + public: + virtual ~MklQuantizedConv2DOp() { + if (this->input_bias_ != nullptr) { + delete this->input_bias_; + input_bias_ = nullptr; + } + + if (this->scaled_bias_ != nullptr) { + delete this->scaled_bias_; + scaled_bias_ = nullptr; + } + } + + explicit MklQuantizedConv2DOp(OpKernelConstruction* context) + : MklConvOp(context) {} + + void 
Compute(OpKernelContext* context) override { + // Compute int32 output tensor + MklConvOp::Compute(context); + + // Compute additional outputs: min/max scalars. + int bias_index_offset; + bias_index_offset = biasEnabled ? 1 : 0; + + const float min_input = + context->input(2 + bias_index_offset).flat()(0); + const float max_input = + context->input(3 + bias_index_offset).flat()(0); + const float min_filter = + context->input(4 + bias_index_offset).flat()(0); + const float max_filter = + context->input(5 + bias_index_offset).flat()(0); + + float min_output_value; + float max_output_value; + if (std::is_same::value || + std::is_same::value) { + // This is the case the convolution and requantization are fused. + // min_freezed_output and max_freezed_output are the actual range + // for the output + min_output_value = context->input(6 + bias_index_offset).flat()(0); + max_output_value = context->input(7 + bias_index_offset).flat()(0); + } else { + MklQuantizationRangeForMultiplication( + min_input, max_input, min_filter, max_filter, &min_output_value, + &max_output_value); + } + + Tensor* output_min = nullptr; + Tensor* output_max = nullptr; + MklDnnShape output_min_mkl_shape, output_max_mkl_shape; + output_min_mkl_shape.SetMklTensor(false); + output_max_mkl_shape.SetMklTensor(false); + AllocateOutputSetMklShape(context, 1, &output_min, {}, + output_min_mkl_shape); + AllocateOutputSetMklShape(context, 2, &output_max, {}, + output_max_mkl_shape); + output_min->flat()(0) = min_output_value; + output_max->flat()(0) = max_output_value; + } + + protected: + void ExtendConvFwdParams(OpKernelContext* context, + MklConvFwdParams& params) override { + MklConvOp::ExtendConvFwdParams(context, params); + + // When the output type is quint8, the output data id requantized + // into quint8. A post_op "output_scale" is added to do the conversion. + if (std::is_same::value || + std::is_same::value) { + int bias_index_offset; + bias_index_offset = biasEnabled ? 
1 : 0; + + const float min_input = + context->input(2 + bias_index_offset).flat()(0); + const float max_input = + context->input(3 + bias_index_offset).flat()(0); + const float min_filter = + context->input(4 + bias_index_offset).flat()(0); + const float max_filter = + context->input(5 + bias_index_offset).flat()(0); + const float min_freezed_output = + context->input(6 + bias_index_offset).flat()(0); + const float max_freezed_output = + context->input(7 + bias_index_offset).flat()(0); + + float min_output_value; + float max_output_value; + MklQuantizationRangeForMultiplication( + min_input, max_input, min_filter, max_filter, &min_output_value, + &max_output_value); + float scale_int32 = + std::max(std::abs(min_output_value), std::abs(max_output_value)); + float scale_eightbit = + std::max(std::abs(min_freezed_output), std::abs(max_freezed_output)); + float scale = 1.0; + if (std::is_same::value) + scale = scale_int32 / scale_eightbit / static_cast(1 << 23); + else + scale = scale_int32 / scale_eightbit / static_cast(1 << 24); + + std::vector output_scale; + output_scale.push_back(scale); + params.post_op_params.push_back({"output_scale", output_scale}); + } + } + + Tbias* GetBiasHandle( + OpKernelContext* context, + std::shared_ptr& conv_fwd_pd, + const Tensor& bias_tensor) override { + int bias_index_offset; + bias_index_offset = biasEnabled ? 1 : 0; + + const float min_input = + context->input(2 + bias_index_offset).flat()(0); + const float max_input = + context->input(3 + bias_index_offset).flat()(0); + const float min_filter = + context->input(4 + bias_index_offset).flat()(0); + const float max_filter = + context->input(5 + bias_index_offset).flat()(0); + + std::vector net; + if (biasEnabled) { + if (std::is_same::value) { + return static_cast( + const_cast(bias_tensor.flat().data())); + } + // If bias is enabled and requantization is not fused, scale the + // bias to be consistent with quantized-input and quantized-filter. 
+ float bias_scale = 255.0 * 127.0 / + (std::max(std::abs(max_input), std::abs(min_input)) * + std::max(std::abs(max_filter), std::abs(min_filter))); + std::vector scales; + scales.push_back(bias_scale); + mkldnn::primitive_attr bias_attr; + bias_attr.set_output_scales(0, scales); + + void* bias_buf = static_cast( + const_cast(bias_tensor.flat().data())); + input_bias_ = new memory(conv_fwd_pd->bias_primitive_desc(), bias_buf); + scaled_bias_ = new memory(conv_fwd_pd->bias_primitive_desc()); + auto reorder_desc = mkldnn::reorder::primitive_desc( + input_bias_->get_primitive_desc(), scaled_bias_->get_primitive_desc(), + bias_attr); + net.push_back(mkldnn::reorder(reorder_desc, *input_bias_, *scaled_bias_)); + stream(stream::kind::eager).submit(net).wait(); + return reinterpret_cast(scaled_bias_->get_data_handle()); + } else { + return nullptr; + } + } + + memory* input_bias_ = nullptr; + memory* scaled_bias_ = nullptr; +}; + +template +class MklQuantizedConv2DReluOp + : public MklQuantizedConv2DOp { + public: + virtual ~MklQuantizedConv2DReluOp() {} + + explicit MklQuantizedConv2DReluOp(OpKernelConstruction* context) + : MklQuantizedConv2DOp( + context) {} + + protected: + void ExtendConvFwdParams(OpKernelContext* context, + MklConvFwdParams& params) override { + MklQuantizedConv2DOp::ExtendConvFwdParams(context, params); + params.post_op_params.push_back({"relu", {1.0, 0.0, 0.0}}); + } +}; + +template +class MklQuantizedConv2DSumReluOp + : public MklQuantizedConv2DOp { + public: + virtual ~MklQuantizedConv2DSumReluOp() { + if (this->summand_ != nullptr) { + delete this->summand_; + summand_ = nullptr; + } + + if (this->dst_ != nullptr) { + delete this->dst_; + dst_ = nullptr; + } + } + + explicit MklQuantizedConv2DSumReluOp(OpKernelConstruction* context) + : MklQuantizedConv2DOp( + context) {} + + protected: + void ExtendConvFwdParams(OpKernelContext* context, + MklConvFwdParams& params) override { + MklQuantizedConv2DOp::ExtendConvFwdParams(context, params); + // 
Calculate the scale (beta in mkldnn api term) for sum + if (std::is_same::value) { + int summand_idx = context->num_inputs() / 2 - 1 - 2; + DataType summand_type = this->input_type(summand_idx); + bool summand_condition = + (summand_type == DT_QINT8) || (summand_type == DT_QUINT8); + CHECK((summand_condition)); + int bias_index_offset = biasEnabled ? 1 : 0; + const float min_freezed_output = + context->input(6 + bias_index_offset).flat()(0); + const float max_freezed_output = + context->input(7 + bias_index_offset).flat()(0); + const float min_freezed_summand = + context->input(9 + bias_index_offset).flat()(0); + const float max_freezed_summand = + context->input(10 + bias_index_offset).flat()(0); + + float scale_output = + std::max(std::abs(min_freezed_output), std::abs(max_freezed_output)); + float scale_summand = std::max(std::abs(min_freezed_summand), + std::abs(max_freezed_summand)); + if (summand_type == DT_QUINT8) + params.post_op_params.push_back( + {"sum", {scale_summand / scale_output}}); + else + params.post_op_params.push_back( + {"sum", {2.0 * scale_summand / scale_output}}); + } else { + params.post_op_params.push_back({"sum", {1.0}}); + } + params.post_op_params.push_back({"relu", {1.0, 0.0, 0.0}}); + } + + // Allocate output tensor. 
+ void AllocateOutputTensor( + OpKernelContext* context, + const convolution_forward::primitive_desc& conv_prim_desc, + const memory::dims& output_dims_mkl_order, + memory::format output_tf_format, Tensor** output_tensor) override { + int summand_idx = context->num_inputs() / 2 - 1; + float reorder_sum_scale = 1.0; + if (std::is_same::value) { + summand_idx -= 2; + DataType summand_type = this->input_type(summand_idx); + bool summand_condition = + (summand_type == DT_QINT8) || (summand_type == DT_QUINT8); + CHECK((summand_condition)); + Tensor& summand = const_cast(MklGetInput(context, summand_idx)); + MklDnnShape summand_mkl_shape; + GetMklShape(context, summand_idx, &summand_mkl_shape); + auto dst_md = summand_mkl_shape.GetMklLayout(); + if (summand_mkl_shape.IsMklTensor()) { + if (summand_type == DT_QINT8) { + summand.UnsafeCopyFromInternal(summand, DT_QUINT8, summand.shape()); + dst_md.data.data_type = + static_cast(MklDnnType()); + summand_mkl_shape.SetMklLayout(&dst_md); + summand_mkl_shape.SetElemType(MklDnnType()); + } + ForwardMklTensorInToOutWithMklShape(context, summand_idx, 0, + summand_mkl_shape); + *output_tensor = const_cast(&summand); + return; + } else { + TF_CHECK_OK(Status(error::Code::FAILED_PRECONDITION, + "Current fusion is not successful.")); + } + } + // TODO(mdfaijul): Add cleaner code for non-mkl tensor + MklConvOp::AllocateOutputTensor(context, conv_prim_desc, + output_dims_mkl_order, + output_tf_format, + output_tensor); + const Tensor& summand = MklGetInput(context, summand_idx); + if (summand.dtype() != DT_FLOAT) + TF_CHECK_OK(Status(error::Code::FAILED_PRECONDITION, + "Current fusion requires summand to be float")); + MklDnnShape summand_mkl_shape; + GetMklShape(context, summand_idx, &summand_mkl_shape); + // We need to compute scale for the summand + int bias_index_offset = biasEnabled ? 
1 : 0; + const float min_input = + context->input(2 + bias_index_offset).flat()(0); + const float max_input = + context->input(3 + bias_index_offset).flat()(0); + const float min_filter = + context->input(4 + bias_index_offset).flat()(0); + const float max_filter = + context->input(5 + bias_index_offset).flat()(0); + + reorder_sum_scale = + 255.0 * 127.0 / (std::max(std::abs(max_input), std::abs(min_input)) * + std::max(std::abs(max_filter), std::abs(min_filter))); + std::vector scales; + scales.push_back(reorder_sum_scale); + mkldnn::primitive_attr reorder_attr; + reorder_attr.set_output_scales(0, scales); + + auto summand_md = + summand_mkl_shape.IsMklTensor() + ? summand_mkl_shape.GetMklLayout() + : memory::desc(output_dims_mkl_order, MklDnnType(), + memory::format::nhwc); + auto summand_pd = memory::primitive_desc(summand_md, this->cpu_engine_); + void* summand_buf = + static_cast(const_cast(summand.flat().data())); + void* dst_buf = + static_cast((*output_tensor)->flat().data()); + summand_ = new memory(summand_pd, summand_buf); + dst_ = new memory(conv_prim_desc.dst_primitive_desc(), dst_buf); + auto reorder_desc = mkldnn::reorder::primitive_desc( + summand_pd, conv_prim_desc.dst_primitive_desc(), reorder_attr); + + std::vector net; + net.push_back(mkldnn::reorder(reorder_desc, *summand_, *dst_)); + stream(stream::kind::eager).submit(net).wait(); + } + + memory* summand_ = nullptr; + memory* dst_ = nullptr; +}; + +// INT8 kernel registration +// Register NoOp kernel for QunatizedConv2D for qint8 filter +REGISTER_KERNEL_BUILDER(Name("QuantizedConv2D") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("out_type"), + NoOp); + +REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DAndRequantize") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("out_type"), + NoOp); + +// Register a templatized implementation of MklQuntizedConv2D. 
+REGISTER_KERNEL_BUILDER( + Name("_MklQuantizedConv2D") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("out_type") + .Label(mkl_op_registry::kMklQuantizedOpLabel), + MklQuantizedConv2DOp); + +REGISTER_KERNEL_BUILDER( + Name("_MklQuantizedConv2DAndRequantize") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("out_type") + .Label(mkl_op_registry::kMklQuantizedOpLabel), + MklQuantizedConv2DOp); + +// Register NoOp kernel for QuantizedConv2DWithBias to get a python interface. +// This kernel will be replaced by an MKL kernel during graph +// optimization pass. +REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DWithBias") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("out_type"), + NoOp); + +REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DWithBiasAndRequantize") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("out_type"), + NoOp); + +// Register a templatized implementation MklQuantizedConv2DWithBias. 
+REGISTER_KERNEL_BUILDER( + Name("_MklQuantizedConv2DWithBias") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("out_type") + .Label(mkl_op_registry::kMklQuantizedOpLabel), + MklQuantizedConv2DOp); + +REGISTER_KERNEL_BUILDER( + Name("_MklQuantizedConv2DWithBiasAndRequantize") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("Tbias") + .TypeConstraint("out_type") + .Label(mkl_op_registry::kMklQuantizedOpLabel), + MklQuantizedConv2DOp); +REGISTER_KERNEL_BUILDER( + Name("_MklQuantizedConv2DWithBiasAndRequantize") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("Tbias") + .TypeConstraint("out_type") + .Label(mkl_op_registry::kMklQuantizedOpLabel), + MklQuantizedConv2DOp); + +// Register NoOp kernel for QuantizedConv2DAndRelu to get a python interface. +// This kernel will be replaced by an MKL kernel during graph-optimization pass. +REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DAndRelu") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("out_type"), + NoOp); + +REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DAndReluAndRequantize") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("out_type"), + NoOp); + +// Register a templatized implementation of MklQuantizedConv2DAndRelu. 
+REGISTER_KERNEL_BUILDER( + Name("_MklQuantizedConv2DAndRelu") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("out_type") + .Label(mkl_op_registry::kMklQuantizedOpLabel), + MklQuantizedConv2DReluOp); + +REGISTER_KERNEL_BUILDER( + Name("_MklQuantizedConv2DAndReluAndRequantize") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("out_type") + .Label(mkl_op_registry::kMklQuantizedOpLabel), + MklQuantizedConv2DReluOp); + +// Register NoOp kernel for QuantizedConv2DWithBiasAndRelu to get a python +// interface. +// This kernel will be replaced by an MKL kernel during graph-optimization pass. +REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DWithBiasAndRelu") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("out_type"), + NoOp); + +// Register NoOp kernel for QuantizedConv2DWithBiasAndReluAndRequantize +// to get a python interface. +// This kernel will be replaced by an MKL kernel during graph-optimization pass. +REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DWithBiasAndReluAndRequantize") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("out_type"), + NoOp); + +// Register a templatized implementation of MklQuantizedConv2DWithBiasAndRelu. +REGISTER_KERNEL_BUILDER( + Name("_MklQuantizedConv2DWithBiasAndRelu") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("out_type") + .Label(mkl_op_registry::kMklQuantizedOpLabel), + MklQuantizedConv2DReluOp); + +// Register a templatized implementation of +// MklQuantizedConv2DWithBiasAndReluAndRequantize. 
+REGISTER_KERNEL_BUILDER( + Name("_MklQuantizedConv2DWithBiasAndReluAndRequantize") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("Tbias") + .TypeConstraint("out_type") + .Label(mkl_op_registry::kMklQuantizedOpLabel), + MklQuantizedConv2DReluOp); +REGISTER_KERNEL_BUILDER( + Name("_MklQuantizedConv2DWithBiasAndReluAndRequantize") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("Tbias") + .TypeConstraint("out_type") + .Label(mkl_op_registry::kMklQuantizedOpLabel), + MklQuantizedConv2DReluOp); + +// Register NoOp kernel for QuantizedConv2DWithBiasSumAndRelu to get a python +// interface. +// This kernel will be replaced by an MKL kernel during graph-optimization pass. +REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DWithBiasSumAndRelu") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("out_type"), + NoOp); + +REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DWithBiasSumAndReluAndRequantize") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("out_type"), + NoOp); +REGISTER_KERNEL_BUILDER( + Name("QuantizedConv2DWithBiasSignedSumAndReluAndRequantize") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("out_type"), + NoOp); +// Register a templatized implementation of MklQuantizedConv2DWithBiasAndRelu. 
+REGISTER_KERNEL_BUILDER( + Name("_MklQuantizedConv2DWithBiasSumAndRelu") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("out_type") + .Label(mkl_op_registry::kMklQuantizedOpLabel), + MklQuantizedConv2DSumReluOp); + +REGISTER_KERNEL_BUILDER( + Name("_MklQuantizedConv2DWithBiasSumAndReluAndRequantize") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("out_type") + .Label(mkl_op_registry::kMklQuantizedOpLabel), + MklQuantizedConv2DSumReluOp); +REGISTER_KERNEL_BUILDER( + Name("_MklQuantizedConv2DWithBiasSignedSumAndReluAndRequantize") + .Device(DEVICE_CPU) + .TypeConstraint("Tinput") + .TypeConstraint("Tfilter") + .TypeConstraint("out_type") + .Label(mkl_op_registry::kMklQuantizedOpLabel), + MklQuantizedConv2DSumReluOp); +#endif // INTEL_MKL_ML // Register 2D operations #define REGISTER_MKL_CPU_2D(T) \ REGISTER_KERNEL_BUILDER(Name("_MklConv2D") \ .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ + .TypeConstraint("T") \ .Label(mkl_op_registry::kMklOpLabel), \ - MklConvOp); \ + MklConvOp); \ REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias") \ .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ + .TypeConstraint("T") \ .Label(mkl_op_registry::kMklOpLabel), \ - MklConvOp); \ + MklConvOp); \ REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ @@ -1108,7 +1809,7 @@ TF_CALL_float(REGISTER_MKL_CPU_2D); .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .Label(mkl_op_registry::kMklOpLabel), \ - MklConvOp); + MklConvOp); TF_CALL_float(REGISTER_MKL_CPU_3D); } // namespace tensorflow diff --git a/tensorflow/core/kernels/mkl_quantized_conv_ops.h b/tensorflow/core/kernels/mkl_quantized_conv_ops.h new file mode 100644 index 00000000000..98b14cda5cd --- /dev/null +++ b/tensorflow/core/kernels/mkl_quantized_conv_ops.h @@ -0,0 +1,55 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_MKL_QUANTIZED_CONV_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_MKL_QUANTIZED_CONV_OPS_H_ + +#include "tensorflow/core/framework/tensor.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +#ifdef INTEL_MKL + +namespace tensorflow { +template +float MklFloatForOneQuantizedLevel(float range_min, float range_max) { + const int64 highest = static_cast(Eigen::NumTraits::highest()); + const int64 lowest = static_cast(Eigen::NumTraits::lowest()); + const float float_for_one_quantized_level = + (range_max - range_min) / (highest - lowest); + return float_for_one_quantized_level; +} + +template +void MklQuantizationRangeForMultiplication(float min_a, float max_a, + float min_b, float max_b, + float* min_c, float* max_c) { + const float a_float_for_one_quant_level = + MklFloatForOneQuantizedLevel(min_a, max_a); + const float b_float_for_one_quant_level = + MklFloatForOneQuantizedLevel(min_b, max_b); + + const int64 c_highest = static_cast(Eigen::NumTraits::highest()); + const int64 c_lowest = static_cast(Eigen::NumTraits::lowest()); + const float c_float_for_one_quant_level = + a_float_for_one_quant_level * b_float_for_one_quant_level; + + *min_c = c_float_for_one_quant_level * c_lowest; + *max_c = c_float_for_one_quant_level * c_highest; +} +} // namespace tensorflow + +#endif // INTEL_MKL + +#endif // 
TENSORFLOW_CORE_KERNELS_MKL_QUANTIZED_CONV_OPS_H_ diff --git a/tensorflow/core/ops/mkl_nn_ops.cc b/tensorflow/core/ops/mkl_nn_ops.cc new file mode 100644 index 00000000000..9be3470820e --- /dev/null +++ b/tensorflow/core/ops/mkl_nn_ops.cc @@ -0,0 +1,612 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/util/mirror_pad_mode.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" + +// For now, this file only includes MKL quantized ops. In the +// future, we will move all other MKL ops from nn_ops.cc to this file. 
+ +#ifdef INTEL_MKL + +namespace tensorflow { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +REGISTER_OP("_MklQuantizedMaxPool") + .Input("input: T") + .Input("min_input: float") + .Input("max_input: float") + .Input("mkl_input: uint8") + .Input("mkl_min_input: uint8") + .Input("mkl_max_input: uint8") + .Output("output: T") + .Output("min_output: float") + .Output("max_output: float") + .Output("mkl_output: uint8") + .Output("mkl_min_output: uint8") + .Output("mkl_max_output: uint8") + .Attr("T: quantizedtype") + .Attr("ksize: list(int) >= 4") + .Attr("strides: list(int) >= 4") + .Attr(GetPaddingAttrString()) + .SetShapeFn(shape_inference::MaxPoolShape) + .Doc(R"doc( +MKL version of QuantizedMaxPool operator. Uses MKL DNN APIs to perform max pooling +on the quantized input. + +NOTE Do not invoke this operator directly in Python. Graph rewrite pass is +expected to invoke these operators. +)doc"); + +REGISTER_OP("_MklQuantizedAvgPool") + .Input("input: T") + .Input("min_input: float") + .Input("max_input: float") + .Input("mkl_input: uint8") + .Input("mkl_min_input: uint8") + .Input("mkl_max_input: uint8") + .Output("output: T") + .Output("min_output: float") + .Output("max_output: float") + .Output("mkl_output: uint8") + .Output("mkl_min_output: uint8") + .Output("mkl_max_output: uint8") + .Attr("T: quantizedtype") + .Attr("ksize: list(int) >= 4") + .Attr("strides: list(int) >= 4") + .Attr(GetPaddingAttrString()) + .SetShapeFn([](InferenceContext* c) { + TF_RETURN_IF_ERROR(shape_inference::AvgPoolShape(c)); + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + c->set_output(1, c->Scalar()); + c->set_output(2, c->Scalar()); + return Status::OK(); + }) + .Doc(R"doc( +MKL version of QuantizedAvgPool operator. Uses MKL DNN APIs to perform average pooling +on the quantized input. 
+ +NOTE Do not invoke this operator directly in Python. Graph rewrite pass is +expected to invoke these operators. +)doc"); + +REGISTER_OP("_MklQuantizedConv2D") + .Input("input: Tinput") + .Input("filter: Tfilter") + .Input("min_input: float") + .Input("max_input: float") + .Input("min_filter: float") + .Input("max_filter: float") + .Input("mkl_input: uint8") + .Input("mkl_filter: uint8") + .Input("mkl_min_input: uint8") + .Input("mkl_max_input: uint8") + .Input("mkl_min_filter: uint8") + .Input("mkl_max_filter: uint8") + .Output("output: out_type") + .Output("min_output: float") + .Output("max_output: float") + .Output("mkl_output: uint8") + .Output("mkl_min_output: uint8") + .Output("mkl_max_output: uint8") + .Attr("Tinput: quantizedtype") + .Attr("Tfilter: quantizedtype") + .Attr("T: quantizedtype") // Additional attribute "T" for enabling MklToTf + // conversion + .Attr("out_type: quantizedtype = DT_QINT32") + .Attr("data_format: string = 'NHWC'") + .Attr("strides: list(int)") + .Attr(GetPaddingAttrString()) + .Attr("dilations: list(int) = [1, 1, 1, 1]") + .SetShapeFn([](InferenceContext* c) { + TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c)); + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); + c->set_output(1, c->Scalar()); + c->set_output(2, c->Scalar()); + return Status::OK(); + }); + +REGISTER_OP("_MklQuantizedConv2DAndRequantize") + .Input("input: Tinput") + .Input("filter: Tfilter") + .Input("min_input: float") + .Input("max_input: float") + .Input("min_filter: float") + .Input("max_filter: float") + .Input("min_freezed_output: float") + .Input("max_freezed_output: float") + .Input("mkl_input: uint8") + .Input("mkl_filter: uint8") + .Input("mkl_min_input: uint8") + .Input("mkl_max_input: uint8") + .Input("mkl_min_filter: uint8") + 
.Input("mkl_max_filter: uint8") + .Input("mkl_min_freezed_output: uint8") + .Input("mkl_max_freezed_output: uint8") + .Output("output: out_type") + .Output("min_output: float") + .Output("max_output: float") + .Output("mkl_output: uint8") + .Output("mkl_min_output: uint8") + .Output("mkl_max_output: uint8") + .Attr("Tinput: quantizedtype") + .Attr("Tfilter: quantizedtype") + .Attr("T: quantizedtype") // Additional attribute "T" for enabling MklToTf + // conversion + .Attr("out_type: quantizedtype = DT_QINT8") + .Attr("data_format: string = 'NHWC'") + .Attr("strides: list(int)") + .Attr(GetPaddingAttrString()) + .Attr("dilations: list(int) = [1, 1, 1, 1]") + .SetShapeFn([](InferenceContext* c) { + TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c)); + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused)); + c->set_output(1, c->Scalar()); + c->set_output(2, c->Scalar()); + return Status::OK(); + }); + +REGISTER_OP("_MklQuantizedConv2DWithBias") + .Input("input: Tinput") + .Input("filter: Tfilter") + .Input("bias: float") + .Input("min_input: float") + .Input("max_input: float") + .Input("min_filter: float") + .Input("max_filter: float") + .Input("mkl_input: uint8") + .Input("mkl_filter: uint8") + .Input("mkl_bias: uint8") + .Input("mkl_min_input: uint8") + .Input("mkl_max_input: uint8") + .Input("mkl_min_filter: uint8") + .Input("mkl_max_filter: uint8") + .Output("output: out_type") + .Output("min_output: float") + .Output("max_output: float") + .Output("mkl_output: uint8") + .Output("mkl_min_output: uint8") + .Output("mkl_max_output: uint8") + .Attr("Tinput: quantizedtype") + .Attr("Tfilter: quantizedtype") + .Attr("T: quantizedtype") // 
Additional attribute "T" for + // enabling MklToTf conversion + .Attr("out_type: quantizedtype = DT_QINT32") + .Attr("data_format: string = 'NHWC'") + .Attr("strides: list(int)") + .Attr(GetPaddingAttrString()) + .Attr("dilations: list(int) = [1, 1, 1, 1]") + .SetShapeFn([](InferenceContext* c) { + TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c)); + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); + c->set_output(1, c->Scalar()); + c->set_output(2, c->Scalar()); + return Status::OK(); + }); + +REGISTER_OP("_MklQuantizedConv2DWithBiasAndRequantize") + .Input("input: Tinput") + .Input("filter: Tfilter") + .Input("bias: Tbias") + .Input("min_input: float") + .Input("max_input: float") + .Input("min_filter: float") + .Input("max_filter: float") + .Input("min_freezed_output: float") + .Input("max_freezed_output: float") + .Input("mkl_input: uint8") + .Input("mkl_filter: uint8") + .Input("mkl_bias: uint8") + .Input("mkl_min_input: uint8") + .Input("mkl_max_input: uint8") + .Input("mkl_min_filter: uint8") + .Input("mkl_max_filter: uint8") + .Input("mkl_min_freezed_output: uint8") + .Input("mkl_max_freezed_output: uint8") + .Output("output: out_type") + .Output("min_output: float") + .Output("max_output: float") + .Output("mkl_output: uint8") + .Output("mkl_min_output: uint8") + .Output("mkl_max_output: uint8") + .Attr("Tinput: quantizedtype") + .Attr("Tfilter: quantizedtype") + .Attr("Tbias: {float, qint32}") + .Attr("T: quantizedtype") // Additional attribute "T" for + // enabling MklToTf conversion + .Attr("out_type: quantizedtype = DT_QINT8") + .Attr("data_format: string = 'NHWC'") + .Attr("strides: list(int)") + .Attr(GetPaddingAttrString()) + .Attr("dilations: list(int) = [1, 1, 1, 1]") + 
.SetShapeFn([](InferenceContext* c) { + TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c)); + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); + c->set_output(1, c->Scalar()); + c->set_output(2, c->Scalar()); + return Status::OK(); + }); + +REGISTER_OP("_MklQuantizedConv2DAndRelu") + .Input("input: Tinput") + .Input("filter: Tfilter") + .Input("min_input: float") + .Input("max_input: float") + .Input("min_filter: float") + .Input("max_filter: float") + .Input("mkl_input: uint8") + .Input("mkl_filter: uint8") + .Input("mkl_min_input: uint8") + .Input("mkl_max_input: uint8") + .Input("mkl_min_filter: uint8") + .Input("mkl_max_filter: uint8") + .Output("output: out_type") + .Output("min_output: float") + .Output("max_output: float") + .Output("mkl_output: uint8") + .Output("mkl_min_output: uint8") + .Output("mkl_max_output: uint8") + .Attr("Tinput: quantizedtype") + .Attr("Tfilter: quantizedtype") + .Attr("T: quantizedtype") // Additional attribute "T" for enabling MklToTf + // conversion + .Attr("out_type: quantizedtype = DT_QINT32") + .Attr("data_format: string = 'NHWC'") + .Attr("strides: list(int)") + .Attr(GetPaddingAttrString()) + .Attr("dilations: list(int) = [1, 1, 1, 1]") + .SetShapeFn([](InferenceContext* c) { + TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c)); + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); + c->set_output(1, c->Scalar()); + c->set_output(2, c->Scalar()); + return Status::OK(); + }); + +REGISTER_OP("_MklQuantizedConv2DAndReluAndRequantize") + .Input("input: 
Tinput") + .Input("filter: Tfilter") + .Input("min_input: float") + .Input("max_input: float") + .Input("min_filter: float") + .Input("max_filter: float") + .Input("min_freezed_output: float") + .Input("max_freezed_output: float") + .Input("mkl_input: uint8") + .Input("mkl_filter: uint8") + .Input("mkl_min_input: uint8") + .Input("mkl_max_input: uint8") + .Input("mkl_min_filter: uint8") + .Input("mkl_max_filter: uint8") + .Input("mkl_min_freezed_output: uint8") + .Input("mkl_max_freezed_output: uint8") + .Output("output: out_type") + .Output("min_output: float") + .Output("max_output: float") + .Output("mkl_output: uint8") + .Output("mkl_min_output: uint8") + .Output("mkl_max_output: uint8") + .Attr("Tinput: quantizedtype") + .Attr("Tfilter: quantizedtype") + .Attr("T: quantizedtype") // Additional attribute "T" for enabling MklToTf + // conversion + .Attr("out_type: quantizedtype = DT_QUINT8") + .Attr("data_format: string = 'NHWC'") + .Attr("strides: list(int)") + .Attr(GetPaddingAttrString()) + .Attr("dilations: list(int) = [1, 1, 1, 1]") + .SetShapeFn([](InferenceContext* c) { + TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c)); + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused)); + c->set_output(1, c->Scalar()); + c->set_output(2, c->Scalar()); + return Status::OK(); + }); + +REGISTER_OP("_MklQuantizedConv2DWithBiasAndRelu") + .Input("input: Tinput") + .Input("filter: Tfilter") + .Input("bias: float") + .Input("min_input: float") + .Input("max_input: float") + .Input("min_filter: float") + .Input("max_filter: float") + .Input("mkl_input: uint8") + .Input("mkl_filter: uint8") + .Input("mkl_bias: uint8") + .Input("mkl_min_input: uint8") + 
.Input("mkl_max_input: uint8") + .Input("mkl_min_filter: uint8") + .Input("mkl_max_filter: uint8") + .Output("output: out_type") + .Output("min_output: float") + .Output("max_output: float") + .Output("mkl_output: uint8") + .Output("mkl_min_output: uint8") + .Output("mkl_max_output: uint8") + .Attr("Tinput: quantizedtype") + .Attr("Tfilter: quantizedtype") + .Attr("T: quantizedtype") // Additional attribute "T" for + // enabling MklToTf conversion + .Attr("out_type: quantizedtype = DT_QINT32") + .Attr("data_format: string = 'NHWC'") + .Attr("strides: list(int)") + .Attr(GetPaddingAttrString()) + .Attr("dilations: list(int) = [1, 1, 1, 1]") + .SetShapeFn([](InferenceContext* c) { + TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c)); + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); + c->set_output(1, c->Scalar()); + c->set_output(2, c->Scalar()); + return Status::OK(); + }); + +REGISTER_OP("_MklQuantizedConv2DWithBiasAndReluAndRequantize") + .Input("input: Tinput") + .Input("filter: Tfilter") + .Input("bias: Tbias") + .Input("min_input: float") + .Input("max_input: float") + .Input("min_filter: float") + .Input("max_filter: float") + .Input("min_freezed_output: float") + .Input("max_freezed_output: float") + .Input("mkl_input: uint8") + .Input("mkl_filter: uint8") + .Input("mkl_bias: uint8") + .Input("mkl_min_input: uint8") + .Input("mkl_max_input: uint8") + .Input("mkl_min_filter: uint8") + .Input("mkl_max_filter: uint8") + .Input("mkl_min_freezed_output: uint8") + .Input("mkl_max_freezed_output: uint8") + .Output("output: out_type") + .Output("min_output: float") + .Output("max_output: float") + .Output("mkl_output: uint8") + .Output("mkl_min_output: uint8") + .Output("mkl_max_output: uint8") + 
.Attr("Tinput: quantizedtype") + .Attr("Tfilter: quantizedtype") + .Attr("Tbias: {float, qint32}") + .Attr("T: quantizedtype") // Additional attribute "T" for + // enabling MklToTf conversion + .Attr("out_type: quantizedtype = DT_QUINT8") + .Attr("data_format: string = 'NHWC'") + .Attr("strides: list(int)") + .Attr(GetPaddingAttrString()) + .Attr("dilations: list(int) = [1, 1, 1, 1]") + .SetShapeFn([](InferenceContext* c) { + TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c)); + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused)); + c->set_output(1, c->Scalar()); + c->set_output(2, c->Scalar()); + return Status::OK(); + }); + +REGISTER_OP("_MklQuantizedConv2DWithBiasSumAndRelu") + .Input("input: Tinput") + .Input("filter: Tfilter") + .Input("bias: float") + .Input("min_input: float") + .Input("max_input: float") + .Input("min_filter: float") + .Input("max_filter: float") + .Input("summand: float") + .Input("mkl_input: uint8") + .Input("mkl_filter: uint8") + .Input("mkl_bias: uint8") + .Input("mkl_min_input: uint8") + .Input("mkl_max_input: uint8") + .Input("mkl_min_filter: uint8") + .Input("mkl_max_filter: uint8") + .Input("mkl_summand: uint8") + .Output("output: out_type") + .Output("min_output: float") + .Output("max_output: float") + .Output("mkl_output: uint8") + .Output("mkl_min_output: uint8") + .Output("mkl_max_output: uint8") + .Attr("Tinput: quantizedtype") + .Attr("Tfilter: quantizedtype") + .Attr("T: quantizedtype") // Additional attribute "T" for + // enabling MklToTf conversion + .Attr("out_type: quantizedtype = DT_QINT32") + .Attr("data_format: string = 'NHWC'") + 
.Attr("strides: list(int)") + .Attr(GetPaddingAttrString()) + .Attr("dilations: list(int) = [1, 1, 1, 1]") + .SetShapeFn([](InferenceContext* c) { + TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c)); + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); + c->set_output(1, c->Scalar()); + c->set_output(2, c->Scalar()); + return Status::OK(); + }); + +REGISTER_OP("_MklQuantizedConv2DWithBiasSumAndReluAndRequantize") + .Input("input: Tinput") + .Input("filter: Tfilter") + .Input("bias: Tbias") + .Input("min_input: float") + .Input("max_input: float") + .Input("min_filter: float") + .Input("max_filter: float") + .Input("min_freezed_output: float") + .Input("max_freezed_output: float") + .Input("summand: Tsummand") + .Input("min_summand: float") + .Input("max_summand: float") + .Input("mkl_input: uint8") + .Input("mkl_filter: uint8") + .Input("mkl_bias: uint8") + .Input("mkl_min_input: uint8") + .Input("mkl_max_input: uint8") + .Input("mkl_min_filter: uint8") + .Input("mkl_max_filter: uint8") + .Input("mkl_min_freezed_output: uint8") + .Input("mkl_max_freezed_output: uint8") + .Input("mkl_summand: uint8") + .Input("mkl_min_summand: uint8") + .Input("mkl_max_summand: uint8") + .Output("output: out_type") + .Output("min_output: float") + .Output("max_output: float") + .Output("mkl_output: uint8") + .Output("mkl_min_output: uint8") + .Output("mkl_max_output: uint8") + .Attr("Tinput: quantizedtype") + .Attr("Tfilter: quantizedtype") + .Attr("Tbias: {float, qint32}") + .Attr("Tsummand: quantizedtype") + .Attr("T: quantizedtype") // Additional attribute "T" for + // enabling MklToTf conversion + .Attr("out_type: quantizedtype = DT_QUINT8") + .Attr("data_format: string = 'NHWC'") + .Attr("strides: list(int)") + 
.Attr(GetPaddingAttrString()) + .Attr("dilations: list(int) = [1, 1, 1, 1]") + .SetShapeFn([](InferenceContext* c) { + TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c)); + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused)); + c->set_output(1, c->Scalar()); + c->set_output(2, c->Scalar()); + return Status::OK(); + }); + +REGISTER_OP("_MklQuantizedConv2DWithBiasSignedSumAndReluAndRequantize") + .Input("input: Tinput") + .Input("filter: Tfilter") + .Input("bias: Tbias") + .Input("min_input: float") + .Input("max_input: float") + .Input("min_filter: float") + .Input("max_filter: float") + .Input("min_freezed_output: float") + .Input("max_freezed_output: float") + .Input("summand: Tsummand") + .Input("min_summand: float") + .Input("max_summand: float") + .Input("mkl_input: uint8") + .Input("mkl_filter: uint8") + .Input("mkl_bias: uint8") + .Input("mkl_min_input: uint8") + .Input("mkl_max_input: uint8") + .Input("mkl_min_filter: uint8") + .Input("mkl_max_filter: uint8") + .Input("mkl_min_freezed_output: uint8") + .Input("mkl_max_freezed_output: uint8") + .Input("mkl_summand: uint8") + .Input("mkl_min_summand: uint8") + .Input("mkl_max_summand: uint8") + .Output("output: out_type") + .Output("min_output: float") + .Output("max_output: float") + .Output("mkl_output: uint8") + .Output("mkl_min_output: uint8") + .Output("mkl_max_output: uint8") + .Attr("Tinput: quantizedtype") + .Attr("Tfilter: quantizedtype") + .Attr("Tbias: {float, qint32}") + .Attr("Tsummand: quantizedtype") + .Attr("T: quantizedtype") // Additional attribute "T" for + // enabling MklToTf conversion + .Attr("out_type: 
quantizedtype = DT_QUINT8") + .Attr("data_format: string = 'NHWC'") + .Attr("strides: list(int)") + .Attr(GetPaddingAttrString()) + .Attr("dilations: list(int) = [1, 1, 1, 1]") + .SetShapeFn([](InferenceContext* c) { + TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c)); + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused)); + c->set_output(1, c->Scalar()); + c->set_output(2, c->Scalar()); + return Status::OK(); + }); + +} // namespace tensorflow + +#endif // INTEL_MKL diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 680211edffb..883fa612d5e 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -104,6 +104,10 @@ typedef enum { Dim3d_I = 1 } MklDnnDims3D; +typedef enum { + QUANTIZED_VERSION = 0, + FP_VERSION, +} MklQuantization; static const int kSmallBatchSize = 32; #ifdef INTEL_MKL_ML_ONLY @@ -1387,6 +1391,18 @@ template <> memory::data_type MklDnnType() { return memory::data_type::f32; } +template <> +memory::data_type MklDnnType() { + return memory::data_type::u8; +} +template <> +memory::data_type MklDnnType() { + return memory::data_type::s8; +} +template <> +memory::data_type MklDnnType() { + return memory::data_type::s32; +} /// Map TensorFlow's data format into MKL-DNN 3D data format /// @input: TensorFlow data format From e26263af36a8e3dcff7581022b40b06e1a1e33ed Mon Sep 17 00:00:00 2001 From: mdfaijul Date: Wed, 3 Oct 2018 11:11:37 -0700 Subject: [PATCH 009/540] Fixed style with clang-format --- tensorflow/core/kernels/mkl_conv_ops.cc | 200 ++++++++++++------------ 1 file changed, 100 insertions(+), 100 
deletions(-) diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index 95b6dc066c3..dfad990aace 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -323,21 +323,21 @@ class MklConvFwdPrimitiveFactory : public MklPrimitiveFactory { const MklConvFwdParams& convFwdDims, bool do_not_cache) { MklConvFwdPrimitive* conv_fwd = nullptr; - if (do_not_cache) { /* Always create new primitive */ + if (do_not_cache) {/* Always create new primitive */ conv_fwd = new MklConvFwdPrimitive( convFwdDims); } else { // try to find a suitable one in pool - conv_fwd = - dynamic_cast*>( - MklConvFwdPrimitiveFactory::GetInstance() - .GetConvFwd(convFwdDims)); + conv_fwd = dynamic_cast< + MklConvFwdPrimitive*>( + MklConvFwdPrimitiveFactory::GetInstance() + .GetConvFwd(convFwdDims)); if (conv_fwd == nullptr) { conv_fwd = new MklConvFwdPrimitive( convFwdDims); MklConvFwdPrimitiveFactory::GetInstance() + Toutput>::GetInstance() .SetConvFwd(convFwdDims, conv_fwd); } } @@ -425,15 +425,16 @@ class MklConvOp : public OpKernel { OP_REQUIRES(context, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); OP_REQUIRES(context, strides_.size() == 4, - errors::InvalidArgument("Sliding window strides field must " - "specify 4 dimensions")); + errors::InvalidArgument( + "Sliding window strides field must " + "specify 4 dimensions")); const int64 stride_n = GetTensorDim(strides_, data_format_, 'N'); const int64 stride_c = GetTensorDim(strides_, data_format_, 'C'); - OP_REQUIRES( - context, stride_n == 1 && stride_c == 1, - errors::InvalidArgument("Current implementation does not yet support " - "strides in the batch and depth dimensions.")); + OP_REQUIRES(context, stride_n == 1 && stride_c == 1, + errors::InvalidArgument( + "Current implementation does not yet support " + "strides in the batch and depth dimensions.")); OP_REQUIRES_OK(context, context->GetAttr("padding", 
&padding_)); } @@ -467,19 +468,18 @@ class MklConvOp : public OpKernel { filter.shape().DebugString())); for (int i = 0; i < 3; i++) { - OP_REQUIRES( - context, - FastBoundsCheck(filter.dim_size(i), std::numeric_limits::max()), - errors::InvalidArgument("filter too large")); + OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i), + std::numeric_limits::max()), + errors::InvalidArgument("filter too large")); } const int64 input_depth = input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'C') : GetTensorDim(input, data_format_, 'C'); - OP_REQUIRES(context, input_depth == filter.dim_size(2), - errors::InvalidArgument( - "input and filter must have the same depth: ", input_depth, - " vs ", filter.dim_size(2))); + OP_REQUIRES( + context, input_depth == filter.dim_size(2), + errors::InvalidArgument("input and filter must have the same depth: ", + input_depth, " vs ", filter.dim_size(2))); // The last dimension for filter is out_depth. const int out_depth = static_cast(filter.dim_size(3)); @@ -488,10 +488,9 @@ class MklConvOp : public OpKernel { const int64 input_rows_raw = input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'H') : GetTensorDim(input, data_format_, 'H'); - OP_REQUIRES( - context, - FastBoundsCheck(input_rows_raw, std::numeric_limits::max()), - errors::InvalidArgument("Input rows too large")); + OP_REQUIRES(context, FastBoundsCheck(input_rows_raw, + std::numeric_limits::max()), + errors::InvalidArgument("Input rows too large")); const int input_rows = static_cast(input_rows_raw); const int filter_rows = static_cast(filter.dim_size(0)); @@ -500,10 +499,9 @@ class MklConvOp : public OpKernel { const int64 input_cols_raw = input_in_mkl_format ? 
GetMklTensorDim(mkl_context.input_shape, 'W') : GetTensorDim(input, data_format_, 'W'); - OP_REQUIRES( - context, - FastBoundsCheck(input_cols_raw, std::numeric_limits::max()), - errors::InvalidArgument("Input cols too large")); + OP_REQUIRES(context, FastBoundsCheck(input_cols_raw, + std::numeric_limits::max()), + errors::InvalidArgument("Input cols too large")); const int input_cols = static_cast(input_cols_raw); const int filter_cols = static_cast(filter.dim_size(1)); @@ -511,10 +509,9 @@ class MklConvOp : public OpKernel { const int64 input_batch_raw = input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'N') : GetTensorDim(input, data_format_, 'N'); - OP_REQUIRES( - context, - FastBoundsCheck(input_batch_raw, std::numeric_limits::max()), - errors::InvalidArgument("batch is too large")); + OP_REQUIRES(context, FastBoundsCheck(input_batch_raw, + std::numeric_limits::max()), + errors::InvalidArgument("batch is too large")); const int batch = static_cast(input_batch_raw); // For now we take the stride from the second and third dimensions only (we @@ -732,7 +729,7 @@ class MklConvOp : public OpKernel { mkl_prim_convert_input; dnnLayout_t mkl_lt_internal_filter, mkl_lt_internal_bias, mkl_lt_internal_input; - void *mkl_buf_convert_input, *mkl_buf_convert_filter, + void* mkl_buf_convert_input, *mkl_buf_convert_filter, *mkl_buf_convert_bias; mkl_prim_convert_filter = nullptr; mkl_prim_convert_bias = nullptr; @@ -865,21 +862,23 @@ class MklConvOp : public OpKernel { OP_REQUIRES(context, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); OP_REQUIRES(context, (strides_.size() == 4 || strides_.size() == 5), - errors::InvalidArgument("Sliding window strides field must " - "specify 4 or 5 dimensions")); + errors::InvalidArgument( + "Sliding window strides field must " + "specify 4 or 5 dimensions")); const int64 stride_n = GetTensorDim(strides_, data_format_, 'N'); const int64 stride_c = GetTensorDim(strides_, 
data_format_, 'C'); - OP_REQUIRES( - context, stride_n == 1 && stride_c == 1, - errors::InvalidArgument("Current implementation does not yet support " - "strides in the batch and depth dimensions.")); + OP_REQUIRES(context, stride_n == 1 && stride_c == 1, + errors::InvalidArgument( + "Current implementation does not yet support " + "strides in the batch and depth dimensions.")); OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); if (strides_.size() == 4) { OP_REQUIRES(context, dilations_.size() == 4, - errors::InvalidArgument("Sliding window dilations field must " - "specify 4 dimensions")); + errors::InvalidArgument( + "Sliding window dilations field must " + "specify 4 dimensions")); const int64 dilation_n = GetTensorDim(dilations_, data_format_, 'N'); const int64 dilation_c = GetTensorDim(dilations_, data_format_, 'C'); const int64 dilation_h = GetTensorDim(dilations_, data_format_, 'H'); @@ -893,19 +892,18 @@ class MklConvOp : public OpKernel { errors::InvalidArgument("Dilated rates should be larger than 0.")); } else if (strides_.size() == 5) { OP_REQUIRES(context, dilations_.size() == 5, - errors::InvalidArgument("Dilation rates field must " - "specify 5 dimensions")); - OP_REQUIRES(context, - (GetTensorDim(dilations_, data_format_, 'N') == 1 && - GetTensorDim(dilations_, data_format_, 'C') == 1), + errors::InvalidArgument( + "Dilation rates field must " + "specify 5 dimensions")); + OP_REQUIRES(context, (GetTensorDim(dilations_, data_format_, 'N') == 1 && + GetTensorDim(dilations_, data_format_, 'C') == 1), errors::InvalidArgument( "Current implementation does not yet support " "dilations rates in the batch and depth dimensions.")); OP_REQUIRES( - context, - (GetTensorDim(dilations_, data_format_, '0') > 0 && - GetTensorDim(dilations_, data_format_, '1') > 0 && - GetTensorDim(dilations_, data_format_, '2') > 0), + context, (GetTensorDim(dilations_, data_format_, '0') > 0 && + GetTensorDim(dilations_, data_format_, '1') > 0 && + 
GetTensorDim(dilations_, data_format_, '2') > 0), errors::InvalidArgument("Dilated rates should be larger than 0.")); } } @@ -920,25 +918,26 @@ class MklConvOp : public OpKernel { GetMklShape(context, kInputIndex_Src, &src_mkl_shape); GetMklShape(context, kInputIndex_Filter, &filter_mkl_shape); OP_REQUIRES(context, filter_mkl_shape.IsMklTensor() == false, - errors::InvalidArgument("Filter should not be in " - "Mkl Layout")); + errors::InvalidArgument( + "Filter should not be in " + "Mkl Layout")); MklDnnData src(&cpu_engine_); MklDnnData filter(&cpu_engine_); memory::dims src_dims, filter_dims, padding_left, padding_right, - dilations, strides; + dilations, strides; memory::dims dst_dims_tf_order, dst_dims_mkl_order; // Get shapes of input tensors in MKL-DNN order MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_, - dilations_); + dilations_); auto src_tf_shape = GetTfShape(context, kInputIndex_Src); auto filter_tf_shape = GetTfShape(context, kInputIndex_Filter); conv_utl.GetConvFwdSizesInMklOrder( - src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, - &strides, &dilations, &dst_dims_tf_order, &dst_dims_mkl_order, - &padding_left, &padding_right); + src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides, + &dilations, &dst_dims_tf_order, &dst_dims_mkl_order, &padding_left, + &padding_right); if (!context->status().ok()) return; // Check for corner case - if there is nothing to compute, return. @@ -946,21 +945,19 @@ class MklConvOp : public OpKernel { // Corner cases: output with 0 elements and 0 batch size. 
Tensor* dst_tensor = nullptr; - if (dst_tf_shape.num_elements() == 0 || - dst_dims_tf_order[0] == 0) { + if (dst_tf_shape.num_elements() == 0 || dst_dims_tf_order[0] == 0) { MklDnnShape dst_mkl_shape; dst_mkl_shape.SetMklTensor(false); - AllocateOutputSetMklShape(context, kOutputIndex_Dst, - &dst_tensor, src_tf_shape, dst_mkl_shape); + AllocateOutputSetMklShape(context, kOutputIndex_Dst, &dst_tensor, + src_tf_shape, dst_mkl_shape); // MklConv2D/3D also outputs converted filter // as 2nd output of Conv2D/3D. filter_mkl_shape.SetMklTensor(false); Tensor* output_filter_tensor = nullptr; // MklConv2D also outputs converted filter as 2nd output. - if (typeid(Tinput) == typeid(float) && - typeid(Tfilter) == typeid(float) && - typeid(Toutput) == typeid(float)) { + if (typeid(Tinput) == typeid(float)&&typeid(Tfilter) == + typeid(float)&&typeid(Toutput) == typeid(float)) { filter_mkl_shape.SetMklTensor(false); AllocateOutputSetMklShape(context, kOutputIndex_Filter, &output_filter_tensor, filter_tf_shape, @@ -1005,10 +1002,11 @@ class MklConvOp : public OpKernel { // in the following cases // 1. Legacy CPU without AVX512/AVX2, or // 2. 
1x1 convolution with stride != 1 - bool do_not_cache = MklPrimitiveFactory::IsPrimitiveMemOptEnabled() && - (src_dims[MklDnnDims::Dim_N] > kSmallBatchSize) && - (MklPrimitiveFactory::IsLegacyPlatform() || - IsConv1x1StrideNot1(filter_dims, strides)); + bool do_not_cache = + MklPrimitiveFactory::IsPrimitiveMemOptEnabled() && + (src_dims[MklDnnDims::Dim_N] > kSmallBatchSize) && + (MklPrimitiveFactory::IsLegacyPlatform() || + IsConv1x1StrideNot1(filter_dims, strides)); // get a conv2d fwd from primitive pool MklConvFwdPrimitive* @@ -1043,11 +1041,11 @@ class MklConvOp : public OpKernel { // allocate output tensors output_tensor and filter_out_tensor std::shared_ptr conv_fwd_pd = conv_fwd->GetPrimitiveDesc(); - AllocateOutputTensor(context, *conv_fwd_pd, - dst_dims_mkl_order, tf_fmt, &dst_tensor); + AllocateOutputTensor(context, *conv_fwd_pd, dst_dims_mkl_order, tf_fmt, + &dst_tensor); Tensor* filter_out_tensor = nullptr; - if (typeid(Tinput) == typeid(float) && typeid(Tfilter) == typeid(float) && - typeid(Toutput) == typeid(float)) { + if (typeid(Tinput) == typeid(float)&&typeid(Tfilter) == + typeid(float)&&typeid(Toutput) == typeid(float)) { AllocateFilterOutputTensor(context, *conv_fwd_pd, TFShapeToMklDnnDims(filter_tf_shape), &filter_out_tensor); @@ -1071,10 +1069,11 @@ class MklConvOp : public OpKernel { filter.SetUsrMem(filter_md, &filter_tensor); filter.CheckReorderToOpMem(conv_fwd_pd.get()->weights_primitive_desc(), filter.GetTensorBuffer(filter_out_tensor)); - filter_data = static_cast(filter.GetOpMem().get_data_handle()); - } else { filter_data = - static_cast(const_cast(filter_tensor.flat().data())); + static_cast(filter.GetOpMem().get_data_handle()); + } else { + filter_data = static_cast( + const_cast(filter_tensor.flat().data())); } // execute convolution @@ -1089,12 +1088,14 @@ class MklConvOp : public OpKernel { // delete primitive since it is not cached. 
if (do_not_cache) delete conv_fwd; - } catch (mkldnn::error &e) { + } + catch (mkldnn::error& e) { string error_msg = tensorflow::strings::StrCat( "Status: ", e.status, ", message: ", string(e.message), ", in file ", __FILE__, ":", __LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", error_msg)); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } @@ -1784,32 +1785,31 @@ REGISTER_KERNEL_BUILDER( #endif // INTEL_MKL_ML // Register 2D operations -#define REGISTER_MKL_CPU_2D(T) \ - REGISTER_KERNEL_BUILDER(Name("_MklConv2D") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklOpLabel), \ - MklConvOp); \ - REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklOpLabel), \ - MklConvOp); \ - REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklOpLabel), \ +#define REGISTER_MKL_CPU_2D(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklConv2D").Device(DEVICE_CPU).TypeConstraint("T").Label( \ + mkl_op_registry::kMklOpLabel), \ + MklConvOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklConv2DWithBias") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklConvOp); \ + REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ MklDummyOp); TF_CALL_float(REGISTER_MKL_CPU_2D); // Register 3D operations -#define REGISTER_MKL_CPU_3D(T) \ - REGISTER_KERNEL_BUILDER(Name("_MklConv3D") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklOpLabel), \ - MklConvOp); +#define REGISTER_MKL_CPU_3D(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklConv3D").Device(DEVICE_CPU).TypeConstraint("T").Label( \ + mkl_op_registry::kMklOpLabel), \ + MklConvOp); 
TF_CALL_float(REGISTER_MKL_CPU_3D); } // namespace tensorflow From 10f5bbd27382c17defd2029c791a42a4e9e431fd Mon Sep 17 00:00:00 2001 From: mdfaijul Date: Thu, 4 Oct 2018 13:08:15 -0700 Subject: [PATCH 010/540] ran clang+llvm-3.9.0. --- tensorflow/core/kernels/mkl_conv_ops.cc | 100 ++++++++++++------------ 1 file changed, 49 insertions(+), 51 deletions(-) diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index dfad990aace..1ecc15d459b 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -323,7 +323,7 @@ class MklConvFwdPrimitiveFactory : public MklPrimitiveFactory { const MklConvFwdParams& convFwdDims, bool do_not_cache) { MklConvFwdPrimitive* conv_fwd = nullptr; - if (do_not_cache) {/* Always create new primitive */ + if (do_not_cache) { /* Always create new primitive */ conv_fwd = new MklConvFwdPrimitive( convFwdDims); } else { @@ -425,16 +425,15 @@ class MklConvOp : public OpKernel { OP_REQUIRES(context, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); OP_REQUIRES(context, strides_.size() == 4, - errors::InvalidArgument( - "Sliding window strides field must " - "specify 4 dimensions")); + errors::InvalidArgument("Sliding window strides field must " + "specify 4 dimensions")); const int64 stride_n = GetTensorDim(strides_, data_format_, 'N'); const int64 stride_c = GetTensorDim(strides_, data_format_, 'C'); - OP_REQUIRES(context, stride_n == 1 && stride_c == 1, - errors::InvalidArgument( - "Current implementation does not yet support " - "strides in the batch and depth dimensions.")); + OP_REQUIRES( + context, stride_n == 1 && stride_c == 1, + errors::InvalidArgument("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); } @@ -729,7 +728,7 @@ class MklConvOp : public OpKernel { mkl_prim_convert_input; dnnLayout_t 
mkl_lt_internal_filter, mkl_lt_internal_bias, mkl_lt_internal_input; - void* mkl_buf_convert_input, *mkl_buf_convert_filter, + void *mkl_buf_convert_input, *mkl_buf_convert_filter, *mkl_buf_convert_bias; mkl_prim_convert_filter = nullptr; mkl_prim_convert_bias = nullptr; @@ -862,23 +861,21 @@ class MklConvOp : public OpKernel { OP_REQUIRES(context, FormatFromString(data_format, &data_format_), errors::InvalidArgument("Invalid data format")); OP_REQUIRES(context, (strides_.size() == 4 || strides_.size() == 5), - errors::InvalidArgument( - "Sliding window strides field must " - "specify 4 or 5 dimensions")); + errors::InvalidArgument("Sliding window strides field must " + "specify 4 or 5 dimensions")); const int64 stride_n = GetTensorDim(strides_, data_format_, 'N'); const int64 stride_c = GetTensorDim(strides_, data_format_, 'C'); - OP_REQUIRES(context, stride_n == 1 && stride_c == 1, - errors::InvalidArgument( - "Current implementation does not yet support " - "strides in the batch and depth dimensions.")); + OP_REQUIRES( + context, stride_n == 1 && stride_c == 1, + errors::InvalidArgument("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); if (strides_.size() == 4) { OP_REQUIRES(context, dilations_.size() == 4, - errors::InvalidArgument( - "Sliding window dilations field must " - "specify 4 dimensions")); + errors::InvalidArgument("Sliding window dilations field must " + "specify 4 dimensions")); const int64 dilation_n = GetTensorDim(dilations_, data_format_, 'N'); const int64 dilation_c = GetTensorDim(dilations_, data_format_, 'C'); const int64 dilation_h = GetTensorDim(dilations_, data_format_, 'H'); @@ -892,9 +889,8 @@ class MklConvOp : public OpKernel { errors::InvalidArgument("Dilated rates should be larger than 0.")); } else if (strides_.size() == 5) { OP_REQUIRES(context, dilations_.size() == 5, - errors::InvalidArgument( - "Dilation rates field 
must " - "specify 5 dimensions")); + errors::InvalidArgument("Dilation rates field must " + "specify 5 dimensions")); OP_REQUIRES(context, (GetTensorDim(dilations_, data_format_, 'N') == 1 && GetTensorDim(dilations_, data_format_, 'C') == 1), errors::InvalidArgument( @@ -918,9 +914,8 @@ class MklConvOp : public OpKernel { GetMklShape(context, kInputIndex_Src, &src_mkl_shape); GetMklShape(context, kInputIndex_Filter, &filter_mkl_shape); OP_REQUIRES(context, filter_mkl_shape.IsMklTensor() == false, - errors::InvalidArgument( - "Filter should not be in " - "Mkl Layout")); + errors::InvalidArgument("Filter should not be in " + "Mkl Layout")); MklDnnData src(&cpu_engine_); MklDnnData filter(&cpu_engine_); @@ -956,8 +951,9 @@ class MklConvOp : public OpKernel { filter_mkl_shape.SetMklTensor(false); Tensor* output_filter_tensor = nullptr; // MklConv2D also outputs converted filter as 2nd output. - if (typeid(Tinput) == typeid(float)&&typeid(Tfilter) == - typeid(float)&&typeid(Toutput) == typeid(float)) { + if (typeid(Tinput) == typeid(float) && + typeid(Tfilter) == typeid(float) && + typeid(Toutput) == typeid(float)) { filter_mkl_shape.SetMklTensor(false); AllocateOutputSetMklShape(context, kOutputIndex_Filter, &output_filter_tensor, filter_tf_shape, @@ -1044,8 +1040,8 @@ class MklConvOp : public OpKernel { AllocateOutputTensor(context, *conv_fwd_pd, dst_dims_mkl_order, tf_fmt, &dst_tensor); Tensor* filter_out_tensor = nullptr; - if (typeid(Tinput) == typeid(float)&&typeid(Tfilter) == - typeid(float)&&typeid(Toutput) == typeid(float)) { + if (typeid(Tinput) == typeid(float) && typeid(Tfilter) == typeid(float) && + typeid(Toutput) == typeid(float)) { AllocateFilterOutputTensor(context, *conv_fwd_pd, TFShapeToMklDnnDims(filter_tf_shape), &filter_out_tensor); @@ -1088,8 +1084,7 @@ class MklConvOp : public OpKernel { // delete primitive since it is not cached. 
if (do_not_cache) delete conv_fwd; - } - catch (mkldnn::error& e) { + } catch (mkldnn::error& e) { string error_msg = tensorflow::strings::StrCat( "Status: ", e.status, ", message: ", string(e.message), ", in file ", __FILE__, ":", __LINE__); @@ -1785,31 +1780,34 @@ REGISTER_KERNEL_BUILDER( #endif // INTEL_MKL_ML // Register 2D operations -#define REGISTER_MKL_CPU_2D(T) \ - REGISTER_KERNEL_BUILDER( \ - Name("_MklConv2D").Device(DEVICE_CPU).TypeConstraint("T").Label( \ - mkl_op_registry::kMklOpLabel), \ - MklConvOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("_MklConv2DWithBias") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklOpLabel), \ - MklConvOp); \ - REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklOpLabel), \ +#define REGISTER_MKL_CPU_2D(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklConv2D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklConvOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklConv2DWithBias") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklConvOp); \ + REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ MklDummyOp); TF_CALL_float(REGISTER_MKL_CPU_2D); // Register 3D operations -#define REGISTER_MKL_CPU_3D(T) \ - REGISTER_KERNEL_BUILDER( \ - Name("_MklConv3D").Device(DEVICE_CPU).TypeConstraint("T").Label( \ - mkl_op_registry::kMklOpLabel), \ - MklConvOp); +#define REGISTER_MKL_CPU_3D(T) \ + REGISTER_KERNEL_BUILDER(Name("_MklConv3D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklConvOp); TF_CALL_float(REGISTER_MKL_CPU_3D); } // namespace tensorflow From 775188c321335b1053fd1fb174efd607e5173d59 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 9 Oct 2018 20:51:30 +0000 Subject: [PATCH 
011/540] Add validation to axis for tf.nn.softmax This fix tries to address the issue raised in 22793 where an invalid axis (outside of `[-dim, dim)`) still returns value. This behavior is different from most other ops in tf like `tf.argmax`/etc. This fix add the validation of axis so that an error will be returned in case of invalid axis. This fix fixes 22793. Signed-off-by: Yong Tang --- tensorflow/python/ops/nn_ops.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 04962da7f76..56f36260f98 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -30,6 +30,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import check_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import math_ops @@ -1679,22 +1680,23 @@ def _softmax(logits, compute_op, dim=-1, name=None): # If dim is not the last dimension, we have to do a transpose so that we can # still perform softmax on its last dimension. + is_valid_dim = control_flow_ops.Assert(math_ops.logical_and(math_ops.greater_equal(dim, -shape.ndims), math_ops.less(dim, shape.ndims)), [dim]) + with ops.control_dependencies([is_valid_dim]): + # Swap logits' dimension of dim and its last dimension. + input_rank = array_ops.rank(logits) + dim_axis = dim % shape.ndims + logits = _swap_axis(logits, dim_axis, math_ops.subtract(input_rank, 1)) - # Swap logits' dimension of dim and its last dimension. - input_rank = array_ops.rank(logits) - dim_axis = dim % shape.ndims - logits = _swap_axis(logits, dim_axis, math_ops.subtract(input_rank, 1)) + # Do the actual softmax on its last dimension. 
+ output = compute_op(logits) - # Do the actual softmax on its last dimension. - output = compute_op(logits) + output = _swap_axis( + output, dim_axis, math_ops.subtract(input_rank, 1), name=name) - output = _swap_axis( - output, dim_axis, math_ops.subtract(input_rank, 1), name=name) + # Make shape inference work since transpose may erase its static shape. + output.set_shape(shape) - # Make shape inference work since transpose may erase its static shape. - output.set_shape(shape) - - return output + return output @tf_export("nn.softmax", "math.softmax") From 7ec309774d2eeb4285a0eb6ba0585848fc50054b Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 9 Oct 2018 20:56:17 +0000 Subject: [PATCH 012/540] Pylint fix Signed-off-by: Yong Tang --- tensorflow/python/ops/nn_ops.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 56f36260f98..70601dfaba5 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -30,8 +30,8 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops -from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import check_ops +from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops @@ -1680,7 +1680,9 @@ def _softmax(logits, compute_op, dim=-1, name=None): # If dim is not the last dimension, we have to do a transpose so that we can # still perform softmax on its last dimension. 
- is_valid_dim = control_flow_ops.Assert(math_ops.logical_and(math_ops.greater_equal(dim, -shape.ndims), math_ops.less(dim, shape.ndims)), [dim]) + is_valid_dim = control_flow_ops.Assert(math_ops.logical_and( + math_ops.greater_equal(dim, -shape.ndims), + math_ops.less(dim, shape.ndims)), [dim]) with ops.control_dependencies([is_valid_dim]): # Swap logits' dimension of dim and its last dimension. input_rank = array_ops.rank(logits) From 980227aabdd20da19a8824d1f828e22fb8bf5c1e Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 9 Oct 2018 20:56:55 +0000 Subject: [PATCH 013/540] Add test case for axis validation with tf.nn.softmax Signed-off-by: Yong Tang --- tensorflow/python/kernel_tests/softmax_op_test.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py index 89f4697e5cb..55849304e8f 100644 --- a/tensorflow/python/kernel_tests/softmax_op_test.py +++ b/tensorflow/python/kernel_tests/softmax_op_test.py @@ -222,6 +222,13 @@ class SoftmaxTest(test.TestCase): with self.assertRaises(errors_impl.InvalidArgumentError): nn_ops.softmax([1., 2., 3., 4.], axis=dim).eval() + def testInvalidAxis(self): + # Test case for GitHub issue 22793. + with self.cached_session(): + ones = array_ops.ones(shape=[2, 3]) + with self.assertRaises(errors_impl.InvalidArgumentError): + nn_ops.softmax(ones, axis=2).eval() + def testLargeDims(self): # Make sure that we properly handle large inputs. 
See # https://github.com/tensorflow/tensorflow/issues/4425 for details From 278c0fbc7e4fad5644d2d79b4a48a4918d109dad Mon Sep 17 00:00:00 2001 From: mdfaijul Date: Thu, 11 Oct 2018 17:02:22 -0700 Subject: [PATCH 014/540] changed enum to enum class --- tensorflow/core/util/mkl_util.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 883fa612d5e..a225850d217 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -104,10 +104,10 @@ typedef enum { Dim3d_I = 1 } MklDnnDims3D; -typedef enum { - QUANTIZED_VERSION = 0, +enum class MklQuantization { + QUANTIZED_VERSION, FP_VERSION, -} MklQuantization; +}; static const int kSmallBatchSize = 32; #ifdef INTEL_MKL_ML_ONLY From 79b0e3a8229530aaeea489676af8ab170debcaf7 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 13 Oct 2018 16:23:27 +0000 Subject: [PATCH 015/540] Update softmax shape validation to not use tensor graph Signed-off-by: Yong Tang --- tensorflow/python/ops/nn_ops.py | 34 +++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 70601dfaba5..2064d77ae18 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -25,13 +25,13 @@ import numpy as np from tensorflow.python.compat import compat from tensorflow.python.eager import context from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors_impl from tensorflow.python.framework import graph_util from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops -from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import math_ops 
from tensorflow.python.ops import random_ops @@ -1678,27 +1678,29 @@ def _softmax(logits, compute_op, dim=-1, name=None): if is_last_dim: return compute_op(logits, name=name) + dim_val = tensor_util.constant_value(dim) if isinstance(dim, ops.Tensor) else dim + if dim_val is not None and (dim_val < -shape.ndims or dim_val >= shape.ndims): + raise errors_impl.InvalidArgumentError( + None, None, + "Dimension (%d) must be in the range [%d, %d) where %d is the number of dimensions in the input." % (dim_val, -shape.ndims, shape.ndims, shape.ndims)) # If dim is not the last dimension, we have to do a transpose so that we can # still perform softmax on its last dimension. - is_valid_dim = control_flow_ops.Assert(math_ops.logical_and( - math_ops.greater_equal(dim, -shape.ndims), - math_ops.less(dim, shape.ndims)), [dim]) - with ops.control_dependencies([is_valid_dim]): - # Swap logits' dimension of dim and its last dimension. - input_rank = array_ops.rank(logits) - dim_axis = dim % shape.ndims - logits = _swap_axis(logits, dim_axis, math_ops.subtract(input_rank, 1)) - # Do the actual softmax on its last dimension. - output = compute_op(logits) + # Swap logits' dimension of dim and its last dimension. + input_rank = array_ops.rank(logits) + dim_axis = dim % shape.ndims + logits = _swap_axis(logits, dim_axis, math_ops.subtract(input_rank, 1)) - output = _swap_axis( - output, dim_axis, math_ops.subtract(input_rank, 1), name=name) + # Do the actual softmax on its last dimension. + output = compute_op(logits) - # Make shape inference work since transpose may erase its static shape. - output.set_shape(shape) + output = _swap_axis( + output, dim_axis, math_ops.subtract(input_rank, 1), name=name) - return output + # Make shape inference work since transpose may erase its static shape. 
+ output.set_shape(shape) + + return output @tf_export("nn.softmax", "math.softmax") From d5ab586494a992733b8873531237dd2d200afeed Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 13 Oct 2018 16:30:00 +0000 Subject: [PATCH 016/540] Pylint fix for string too long Signed-off-by: Yong Tang --- tensorflow/python/ops/nn_ops.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 2064d77ae18..953aa42e029 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -1678,11 +1678,16 @@ def _softmax(logits, compute_op, dim=-1, name=None): if is_last_dim: return compute_op(logits, name=name) - dim_val = tensor_util.constant_value(dim) if isinstance(dim, ops.Tensor) else dim + dim_val = dim + if isinstance(dim, ops.Tensor): + dim_val = tensor_util.constant_value(dim) if dim_val is not None and (dim_val < -shape.ndims or dim_val >= shape.ndims): raise errors_impl.InvalidArgumentError( None, None, - "Dimension (%d) must be in the range [%d, %d) where %d is the number of dimensions in the input." % (dim_val, -shape.ndims, shape.ndims, shape.ndims)) + "Dimension (%d) must be in the range [%d, %d) where %d is the number of" + " dimensions in the input." % + (dim_val, -shape.ndims, shape.ndims, shape.ndims)) + # If dim is not the last dimension, we have to do a transpose so that we can # still perform softmax on its last dimension. 
From fd9d2b0cb61b61ef8cf7e5b8459f20bfdfea127c Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 13 Oct 2018 17:19:33 +0000 Subject: [PATCH 017/540] Fix tests Signed-off-by: Yong Tang --- tensorflow/python/ops/nn_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py index 2fabb2e966a..6499d346528 100644 --- a/tensorflow/python/ops/nn_test.py +++ b/tensorflow/python/ops/nn_test.py @@ -95,7 +95,7 @@ class SoftmaxTest(test_lib.TestCase, parameterized.TestCase): arr = np.linspace(0., 1, 12).reshape(3, 4) x_neg_axis = nn_ops.softmax(arr, axis=-2) y_pos_axis = nn_ops.softmax(arr, axis=0) - z_gt_axis = nn_ops.softmax(arr, axis=4) + z_gt_axis = nn_ops.softmax(arr, axis=0) x_neg_axis_tf = self.evaluate(x_neg_axis) y_pos_axis_tf = self.evaluate(y_pos_axis) z_gt_axis_tf = self.evaluate(z_gt_axis) @@ -180,7 +180,7 @@ class LogSoftmaxTest(test_lib.TestCase, parameterized.TestCase): arr = np.linspace(0., 1, 12).reshape(3, 4) x_neg_axis = nn_ops.log_softmax(arr, axis=-2) y_pos_axis = nn_ops.log_softmax(arr, axis=0) - z_gt_axis = nn_ops.log_softmax(arr, axis=4) + z_gt_axis = nn_ops.log_softmax(arr, axis=0) x_neg_axis_tf = self.evaluate(x_neg_axis) y_pos_axis_tf = self.evaluate(y_pos_axis) z_gt_axis_tf = self.evaluate(z_gt_axis) From 73b318f8e698cd61adb8e281ea3b95b886003617 Mon Sep 17 00:00:00 2001 From: Grzegorz Pawelczak Date: Mon, 15 Oct 2018 11:33:42 +0100 Subject: [PATCH 018/540] [XLA] Sink constants into the conditional computation in while loop --- .../service/while_loop_constant_sinking.cc | 45 +++++-- .../xla/service/while_loop_constant_sinking.h | 9 +- .../while_loop_constant_sinking_test.cc | 127 ++++++++++++++++++ tensorflow/compiler/xla/service/while_util.cc | 13 ++ tensorflow/compiler/xla/service/while_util.h | 7 + 5 files changed, 182 insertions(+), 19 deletions(-) diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc 
b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc index 067cfcc17d6..49c05e9cf75 100644 --- a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc +++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc @@ -46,8 +46,9 @@ static Status ReplaceUsesWhileKeepingLoopInvariance( return Status::OK(); } -StatusOr WhileLoopConstantSinking::TrySinkingConstantsIntoWhileBody( +StatusOr WhileLoopConstantSinking::TrySinkingConstantsIntoWhileLoop( HloInstruction* while_instr) { + HloComputation* while_cond = while_instr->while_condition(); HloComputation* while_body = while_instr->while_body(); const HloInstruction& init_value = *while_instr->operand(0); @@ -57,24 +58,44 @@ StatusOr WhileLoopConstantSinking::TrySinkingConstantsIntoWhileBody( bool changed = false; - for (HloInstruction* invariant_gte : - WhileUtil::GetInvariantGTEsForWhileBody(*while_body)) { - int64 index = invariant_gte->tuple_index(); + auto invariant_conditional_gte_index_to_inst = + WhileUtil::GetGTEsMapForWhileConditional(*while_cond); + auto invariant_body_gtes = + WhileUtil::GetInvariantGTEsForWhileBody(*while_body); + + for (HloInstruction* invariant_body_gte : invariant_body_gtes) { + int64 index = invariant_body_gte->tuple_index(); const HloInstruction& invariant_value = *init_value.operand(index); - // Should have at least one user that's not while_body_root. - if (invariant_gte->user_count() <= 1) { - continue; - } + // Original value should be a constant + if (invariant_value.opcode() != HloOpcode::kConstant) continue; - if (invariant_value.opcode() == HloOpcode::kConstant) { + // Sink into the while_body + // Should have at least one user that's not while_body_root. 
+ if (invariant_body_gte->user_count() > 1) { auto* constant_instr = while_body->AddInstruction(invariant_value.Clone(/*suffix=*/".sunk")); TF_RETURN_IF_ERROR(ReplaceUsesWhileKeepingLoopInvariance( - invariant_gte, constant_instr, while_body->root_instruction(), + invariant_body_gte, constant_instr, while_body->root_instruction(), index)); changed = true; } + + // Check if there is a corresponding GTE in while_conditional + auto it = invariant_conditional_gte_index_to_inst.find(index); + if (it == invariant_conditional_gte_index_to_inst.end()) { + continue; + } + + auto* invariant_cond_gte = it->second; + // Should have at least one user + if (invariant_cond_gte->user_count() > 0) { + auto* constant_instr = + while_cond->AddInstruction(invariant_value.Clone(/*suffix=*/".sunk")); + TF_RETURN_IF_ERROR( + invariant_cond_gte->ReplaceAllUsesWith(constant_instr)); + changed = true; + } } return changed; @@ -115,10 +136,8 @@ StatusOr WhileLoopConstantSinking::Run(HloModule* module) { } for (HloInstruction* while_instr : while_instrs) { - // We only sink into while loop bodies, but this can be extended to - // transform conditions as well. TF_ASSIGN_OR_RETURN(bool result, - TrySinkingConstantsIntoWhileBody(while_instr)); + TrySinkingConstantsIntoWhileLoop(while_instr)); changed |= result; } diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking.h b/tensorflow/compiler/xla/service/while_loop_constant_sinking.h index 577bad6c706..2f8edb12194 100644 --- a/tensorflow/compiler/xla/service/while_loop_constant_sinking.h +++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking.h @@ -23,8 +23,8 @@ limitations under the License. namespace xla { // Sinks while loop invariant values that happen to be constants into the while -// loop body. This is probably not a win in isolation but may unlock further -// optimizations like constant folding. +// loop body and conditional. 
This is probably not a win in isolation but may +// unlock further optimizations like constant folding. // // state = (..., const, ...) // while (pred(state)) { @@ -46,9 +46,6 @@ namespace xla { // tuple trivially loop invariant. WhileLoopSimplifier will later get rid of // `v`. // -// We only sink into while loop bodies, but this can be extended to transform -// conditions as well. -// // TODO(b/79121449): We should also sink broadcasts of constants. class WhileLoopConstantSinking : public HloModulePass { public: @@ -61,7 +58,7 @@ class WhileLoopConstantSinking : public HloModulePass { StatusOr Run(HloModule* module) override; private: - StatusOr TrySinkingConstantsIntoWhileBody(HloInstruction* while_instr); + StatusOr TrySinkingConstantsIntoWhileLoop(HloInstruction* while_instr); }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc index 0e7667de832..9a25be10222 100644 --- a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc +++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc @@ -242,5 +242,132 @@ ENTRY entry { } } } + +TEST_F(WhileLoopConstantSinkingTest, ConditionalSinkConstant) { + const char* const hlo_string = R"( +HloModule ModuleWithWhile + +body { + p_body = (f32[],f32[]) parameter(0) + p_body.0 = f32[] get-tuple-element((f32[],f32[]) p_body), index=0 + const = f32[] constant(1) + add = f32[] add(p_body.0, const) + p_body.1 = f32[] get-tuple-element((f32[],f32[]) p_body), index=1 + ROOT root = (f32[],f32[]) tuple(add, p_body.1) +} + +condition { + p_cond = (f32[],f32[]) parameter(0) + p_cond.0 = f32[] get-tuple-element((f32[],f32[]) p_cond), index=0 + p_cond.1 = f32[] get-tuple-element((f32[],f32[]) p_cond), index=1 + ROOT result = pred[] less-than(p_cond.0, p_cond.1) +} + +ENTRY entry { + const_0 = f32[] constant(0) + const_1 = f32[] constant(10) + while_init = (f32[],f32[]) tuple(const_0, const_1) 
+ ROOT while = (f32[],f32[]) while(while_init), condition=condition, body=body +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + TF_ASSERT_OK_AND_ASSIGN(bool changed, + WhileLoopConstantSinking{}.Run(module.get())); + ASSERT_TRUE(changed); + + auto* while_condition = module->GetComputationWithName("condition"); + EXPECT_THAT(while_condition->root_instruction(), op::Lt(_, op::Constant())); +} + +TEST_F(WhileLoopConstantSinkingTest, ConditionalTupleShapedConstants) { + const char* const hlo_string = R"( +HloModule ModuleWithWhile + +body { + p_b = (f32[],(f32[],f32[])) parameter(0) + p_b.0 = f32[] get-tuple-element((f32[],(f32[],f32[])) p_b), index=0 + p_b.1 = (f32[],f32[]) get-tuple-element((f32[],(f32[],f32[])) p_b), index=1 + p_b.1.0 = f32[] get-tuple-element((f32[],f32[]) p_b.1), index=0 + add = f32[] add(p_b.0, p_b.1.0) + ROOT root = (f32[],(f32[],f32[])) tuple(add, p_b.1) +} + +condition { + p_c = (f32[],(f32[],f32[])) parameter(0) + p_c.0 = f32[] get-tuple-element((f32[],f32[]) p_c), index=0 + p_c.1 = (f32[],f32[]) get-tuple-element((f32[],f32[]) p_c), index=1 + p_c.1.1 = f32[] get-tuple-element((f32[],f32[]) p_c.1), index=1 + ROOT result = pred[] less-than(p_c.0, p_c.1.1) +} + +ENTRY entry { + const_0 = f32[] constant(0) + const_1 = (f32[], f32[]) constant((f32[], f32[]) (1, 10)) + while_init = (f32[],(f32[],f32[])) tuple(const_0, const_1) + ROOT while = (f32[],(f32[],f32[])) while(while_init), condition=condition, body=body +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + TF_ASSERT_OK_AND_ASSIGN(bool changed, + WhileLoopConstantSinking{}.Run(module.get())); + ASSERT_TRUE(changed); + + auto* while_condition = module->GetComputationWithName("condition"); + EXPECT_THAT(while_condition->root_instruction(), + op::Lt(_, op::GetTupleElement(op::Constant()))); +} + +TEST_F(WhileLoopConstantSinkingTest, ConditionalDontCreateDeadConstant) { + const char* const hlo_string = R"( 
+HloModule ModuleWithWhile + +body { + p_body = (f32[],f32[],f32[]) parameter(0) + p_body.0 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=0 + const = f32[] constant(1) + add = f32[] add(p_body.0, const) + p_body.1 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=1 + p_body.2 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=2 + ROOT root = (f32[],f32[],f32[]) tuple(add, p_body.1, p_body.2) +} + +condition { + p_cond = (f32[],f32[],f32[]) parameter(0) + p_cond.0 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=0 + p_cond.1 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=1 + p_cond.2 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=2 + ROOT result = pred[] less-than(p_cond.0, p_cond.1) +} + +ENTRY entry { + const_0 = f32[] constant(0) + const_1 = f32[] constant(10) + const_2 = f32[] constant(12) + while_init = (f32[],f32[],f32[]) tuple(const_0, const_1, const_2) + ROOT while = (f32[],f32[],f32[]) while(while_init), condition=condition, body=body +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + TF_ASSERT_OK_AND_ASSIGN(bool changed, + WhileLoopConstantSinking{}.Run(module.get())); + ASSERT_TRUE(changed); + + auto* while_condition = module->GetComputationWithName("condition"); + EXPECT_THAT(while_condition->root_instruction(), op::Lt(_, op::Constant())); + for (const HloInstruction* inst : while_condition->instructions()) { + if (inst->opcode() == HloOpcode::kConstant) { + EXPECT_GT(inst->user_count(), 0); + } + } +} } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/while_util.cc b/tensorflow/compiler/xla/service/while_util.cc index f90ac91f9d0..153cd449d34 100644 --- a/tensorflow/compiler/xla/service/while_util.cc +++ b/tensorflow/compiler/xla/service/while_util.cc @@ -268,4 +268,17 @@ static Shape MakeLoopStateShape(const WhileUtil::LoopStateTy& init_values) { return result; } +/*static*/ std::map 
+WhileUtil::GetGTEsMapForWhileConditional( + const HloComputation& while_conditional) { + std::map result; + for (auto* inst : while_conditional.instructions()) { + if (inst->opcode() == HloOpcode::kGetTupleElement && + inst->operand(0) == while_conditional.parameter_instruction(0)) { + result[inst->tuple_index()] = inst; + } + } + return result; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/while_util.h b/tensorflow/compiler/xla/service/while_util.h index b1c4486887a..57ae0178b4d 100644 --- a/tensorflow/compiler/xla/service/while_util.h +++ b/tensorflow/compiler/xla/service/while_util.h @@ -84,6 +84,13 @@ class WhileUtil { // Assumes `while_body` is the body computation of the while loop in question. static std::vector GetInvariantGTEsForWhileBody( const HloComputation& while_body); + + // Returns a map of index to GetTupleElement instructions in + // `while_conditional` that access elements in the parameter tuple. Assumes + // `while_conditional` is the conditional computation of the while loop in + // question. 
+ static std::map GetGTEsMapForWhileConditional( + const HloComputation& while_conditional); }; } // namespace xla From e8bf4b49e372c49f51536731b7c9390fd541baf5 Mon Sep 17 00:00:00 2001 From: Grzegorz Pawelczak Date: Mon, 15 Oct 2018 13:14:28 +0100 Subject: [PATCH 019/540] Fix the pass name --- tensorflow/compiler/xla/service/while_loop_constant_sinking.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking.h b/tensorflow/compiler/xla/service/while_loop_constant_sinking.h index 2f8edb12194..a866bc1264b 100644 --- a/tensorflow/compiler/xla/service/while_loop_constant_sinking.h +++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking.h @@ -52,7 +52,7 @@ class WhileLoopConstantSinking : public HloModulePass { ~WhileLoopConstantSinking() override = default; absl::string_view name() const override { - return "while-loop-invariant-code-motion"; + return "while-loop-constant-sinking"; } StatusOr Run(HloModule* module) override; From 231ef238b5e9047ce85ba30e340e09b1a21a585a Mon Sep 17 00:00:00 2001 From: Anton Dmitriev Date: Tue, 16 Oct 2018 14:24:41 +0300 Subject: [PATCH 020/540] Add ability to start TensorFlow server from Java API. 
--- tensorflow/c/BUILD | 1 + tensorflow/c/c_api.cc | 39 +++++++++ tensorflow/c/c_api.h | 23 ++++++ tensorflow/c/c_api_internal.h | 7 ++ .../src/main/java/org/tensorflow/Server.java | 82 +++++++++++++++++++ tensorflow/java/src/main/native/BUILD | 1 + tensorflow/java/src/main/native/server_jni.cc | 73 +++++++++++++++++ tensorflow/java/src/main/native/server_jni.h | 66 +++++++++++++++ 8 files changed, 292 insertions(+) create mode 100644 tensorflow/java/src/main/java/org/tensorflow/Server.java create mode 100644 tensorflow/java/src/main/native/server_jni.cc create mode 100644 tensorflow/java/src/main/native/server_jni.h diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 17e2e292eb1..ed9c94688d7 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -94,6 +94,7 @@ tf_cuda_library( "//tensorflow/core:protos_all_cc", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core/distributed_runtime:server_lib", ], }) + select({ "//tensorflow:with_xla_support": [ diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 79811ceae57..6101c1b6af5 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -2803,4 +2803,43 @@ TF_Buffer* TF_GetRegisteredKernelsForOp(const char* name, TF_Status* status) { } return ret; } + +// TF_Server functions ---------------------------------------------- + +TF_Server::TF_Server(tensorflow::ServerInterface* server) : server(server) {} + +TF_Server* TF_NewServer(const void* proto, size_t proto_len, + TF_Status* status) { + tensorflow::ServerDef server_def; + if (!server_def.ParseFromArray(proto, static_cast(proto_len))) { + status->status = InvalidArgument("Unparseable ServerDef"); + return nullptr; + } + + auto out_server = new std::unique_ptr(); + status->status = tensorflow::NewServer(server_def, out_server); + if (!status->status.ok()) return nullptr; + + return new TF_Server(out_server->release()); +} + +void TF_StartServer(TF_Server* server, TF_Status* status) { + status->status = 
server->server->Start(); +} + +void TF_StopServer(TF_Server* server, TF_Status* status) { + status->status = server->server->Stop(); +} + +void TF_JoinServer(TF_Server* server, TF_Status* status) { + status->status = server->server->Join(); +} + +void TF_DeleteServer(TF_Server* server) { + if (server != nullptr) { + if (server->server != nullptr) delete server->server; + delete server; + } +} + } // end extern "C" diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h index 850f6ecd637..bb5741e73dd 100644 --- a/tensorflow/c/c_api.h +++ b/tensorflow/c/c_api.h @@ -1662,6 +1662,29 @@ TF_CAPI_EXPORT extern TF_Buffer* TF_GetAllRegisteredKernels(TF_Status* status); TF_CAPI_EXPORT extern TF_Buffer* TF_GetRegisteredKernelsForOp( const char* name, TF_Status* status); +// -------------------------------------------------------------------------- +// Server functionality. + +// Server. +typedef struct TF_Server TF_Server; + +// Creates new server. +TF_CAPI_EXPORT extern TF_Server* TF_NewServer(const void* proto, + size_t proto_len, + TF_Status* status); + +// Starts a server. +TF_CAPI_EXPORT extern void TF_StartServer(TF_Server* server, TF_Status* status); + +// Stops a server. +TF_CAPI_EXPORT extern void TF_StopServer(TF_Server* server, TF_Status* status); + +// Blocks until the server has shut down (currently blocks forever). +TF_CAPI_EXPORT extern void TF_JoinServer(TF_Server* server, TF_Status* status); + +// Destroy a server, frees memory. +TF_CAPI_EXPORT extern void TF_DeleteServer(TF_Server* server); + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h index 95652a11378..59c8a2b7c78 100644 --- a/tensorflow/c/c_api_internal.h +++ b/tensorflow/c/c_api_internal.h @@ -37,6 +37,7 @@ limitations under the License. 
#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/public/session.h" +#include "tensorflow/core/distributed_runtime/server_lib.h" namespace tensorflow { class Device; @@ -179,6 +180,12 @@ struct TF_ApiDefMap { tensorflow::mutex lock; }; +struct TF_Server { + TF_Server(tensorflow::ServerInterface* server); + + tensorflow::ServerInterface* server; +}; + namespace tensorflow { class TensorCApi { diff --git a/tensorflow/java/src/main/java/org/tensorflow/Server.java b/tensorflow/java/src/main/java/org/tensorflow/Server.java new file mode 100644 index 00000000000..18ee99e00a1 --- /dev/null +++ b/tensorflow/java/src/main/java/org/tensorflow/Server.java @@ -0,0 +1,82 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +package org.tensorflow; + +/** + * An in-process TensorFlow server, for use in distributed training. + * + * A {@code tf.train.Server} instance encapsulates a set of devices and a + * {@code tf.Session} target that can participate in distributed training. A + * server belongs to a cluster (specified by a {@code tf.train.ClusterSpec}), + * and corresponds to a particular task in a named job. The server can + * communicate with any other server in the same cluster. + * + *

WARNING:A {@code Server} owns resources that must be + * explicitly freed by invoking {@link #close()}. + * + *

Instances of a {@code Server} are thread-safe. + */ +public final class Server implements AutoCloseable { + + /** + * Constructs a new instance of server. + * + * @param config Server definition specified as a serialized + * ServerDef + * protocol buffer. + */ + public Server(byte[] serverDef) { + nativeHandle = allocate(serverDef); + } + + /** Starts this server. */ + public synchronized void start() { + start(nativeHandle); + } + + /** Stops this server. */ + public synchronized void stop() { + stop(nativeHandle); + } + + /** Blocks until the server has shut down (currently blocks forever). */ + public synchronized void join() { + join(nativeHandle); + } + + @Override + public void close() { + delete(nativeHandle); + + nativeHandle = 0; + } + + private static native long allocate(byte[] serverDef); + + private static native void start(long nativeHandle); + + private static native void stop(long nativeHandle); + + private static native void join(long nativeHandle); + + private static native void delete(long nativeHandle); + + private long nativeHandle; + + static { + TensorFlow.init(); + } +} \ No newline at end of file diff --git a/tensorflow/java/src/main/native/BUILD b/tensorflow/java/src/main/native/BUILD index 49348daa94e..530224aa944 100644 --- a/tensorflow/java/src/main/native/BUILD +++ b/tensorflow/java/src/main/native/BUILD @@ -43,6 +43,7 @@ tf_cuda_library( "//tensorflow/core:all_kernels", "//tensorflow/core:direct_session", "//tensorflow/core:ops", + "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", ], }), alwayslink = 1, diff --git a/tensorflow/java/src/main/native/server_jni.cc b/tensorflow/java/src/main/native/server_jni.cc new file mode 100644 index 00000000000..7eca920230a --- /dev/null +++ b/tensorflow/java/src/main/native/server_jni.cc @@ -0,0 +1,73 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/java/src/main/native/server_jni.h" +#include "tensorflow/c/c_api.h" +#include "tensorflow/java/src/main/native/exception_jni.h" +#include "tensorflow/java/src/main/native/utils_jni.h" + +JNIEXPORT jlong JNICALL Java_org_tensorflow_Server_allocate( + JNIEnv* env, jclass clazz, jbyteArray server_def) { + TF_Status* status = TF_NewStatus(); + + jbyte* server_def_ptr = env->GetByteArrayElements(server_def, nullptr); + + TF_Server* server = TF_NewServer( + server_def_ptr, static_cast(env->GetArrayLength(server_def)), + status); + + env->ReleaseByteArrayElements(server_def, server_def_ptr, JNI_ABORT); + throwExceptionIfNotOK(env, status); + + return reinterpret_cast(server); +} + +JNIEXPORT void JNICALL Java_org_tensorflow_Server_start(JNIEnv* env, + jclass clazz, + jlong handle) { + TF_Status* status = TF_NewStatus(); + TF_Server* server = reinterpret_cast(handle); + + TF_StartServer(server, status); + throwExceptionIfNotOK(env, status); +} + +JNIEXPORT void JNICALL Java_org_tensorflow_Server_stop(JNIEnv* env, + jclass clazz, + jlong handle) { + TF_Status* status = TF_NewStatus(); + TF_Server* server = reinterpret_cast(handle); + + TF_StopServer(server, status); + throwExceptionIfNotOK(env, status); +} + +JNIEXPORT void JNICALL Java_org_tensorflow_Server_join(JNIEnv* env, + jclass clazz, + jlong handle) { + TF_Status* status = TF_NewStatus(); 
+ TF_Server* server = reinterpret_cast(handle); + + TF_JoinServer(server, status); + throwExceptionIfNotOK(env, status); +} + +JNIEXPORT void JNICALL Java_org_tensorflow_Server_delete(JNIEnv* env, + jclass clazz, + jlong handle) { + TF_Server* server = reinterpret_cast(handle); + + TF_DeleteServer(server); +} diff --git a/tensorflow/java/src/main/native/server_jni.h b/tensorflow/java/src/main/native/server_jni.h new file mode 100644 index 00000000000..4bfe90b7a85 --- /dev/null +++ b/tensorflow/java/src/main/native/server_jni.h @@ -0,0 +1,66 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_JAVA_SRC_MAIN_NATIVE_SERVER_JNI_H_ +#define TENSORFLOW_JAVA_SRC_MAIN_NATIVE_SERVER_JNI_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Class: org_tensorflow_Server + * Method: allocate + * Signature: ([B)J + */ +JNIEXPORT jlong JNICALL +Java_org_tensorflow_Server_allocate(JNIEnv *, jclass, jbyteArray server_def); + +/* + * Class: org_tensorflow_Server + * Method: start + * Signature: (J)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_Server_start(JNIEnv *, jclass, + jlong); + +/* + * Class: org_tensorflow_Server + * Method: stop + * Signature: (J)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_Server_stop(JNIEnv *, jclass, jlong); + +/* + * Class: org_tensorflow_Session + * Method: join + * Signature: (J)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_Server_join(JNIEnv *, jclass, jlong); + +/* + * Class: org_tensorflow_Session + * Method: delete + * Signature: (J)V + */ +JNIEXPORT void JNICALL Java_org_tensorflow_Server_delete(JNIEnv *, jclass, + jlong); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // TENSORFLOW_JAVA_SRC_MAIN_NATIVE_SERVER_JNI_H_ From e16181f0f423df1861351fc725095468f6bf600b Mon Sep 17 00:00:00 2001 From: Anton Dmitriev Date: Wed, 17 Oct 2018 12:36:16 +0300 Subject: [PATCH 021/540] Server Java API updates after review. 
--- tensorflow/c/c_api.cc | 25 +++--- tensorflow/c/c_api.h | 41 +++++++-- tensorflow/c/c_api_internal.h | 6 +- tensorflow/java/BUILD | 1 + .../src/main/java/org/tensorflow/Server.java | 84 +++++++++++++++---- tensorflow/java/src/main/native/BUILD | 1 - tensorflow/java/src/main/native/server_jni.cc | 42 ++++++++-- 7 files changed, 154 insertions(+), 46 deletions(-) diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 6101c1b6af5..0d71aa3e942 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -2806,40 +2806,37 @@ TF_Buffer* TF_GetRegisteredKernelsForOp(const char* name, TF_Status* status) { // TF_Server functions ---------------------------------------------- -TF_Server::TF_Server(tensorflow::ServerInterface* server) : server(server) {} +TF_Server::TF_Server(std::unique_ptr server) + : server(std::move(server)) {} TF_Server* TF_NewServer(const void* proto, size_t proto_len, TF_Status* status) { tensorflow::ServerDef server_def; if (!server_def.ParseFromArray(proto, static_cast(proto_len))) { - status->status = InvalidArgument("Unparseable ServerDef"); + status->status = InvalidArgument( + "Could not parse provided bytes into a ServerDef protocol buffer"); return nullptr; } - auto out_server = new std::unique_ptr(); - status->status = tensorflow::NewServer(server_def, out_server); + std::unique_ptr out_server; + status->status = tensorflow::NewServer(server_def, &out_server); if (!status->status.ok()) return nullptr; - return new TF_Server(out_server->release()); + return new TF_Server(std::move(out_server)); } -void TF_StartServer(TF_Server* server, TF_Status* status) { +void TF_ServerStart(TF_Server* server, TF_Status* status) { status->status = server->server->Start(); } -void TF_StopServer(TF_Server* server, TF_Status* status) { +void TF_ServerStop(TF_Server* server, TF_Status* status) { status->status = server->server->Stop(); } -void TF_JoinServer(TF_Server* server, TF_Status* status) { +void TF_ServerJoin(TF_Server* server, TF_Status* 
status) { status->status = server->server->Join(); } -void TF_DeleteServer(TF_Server* server) { - if (server != nullptr) { - if (server->server != nullptr) delete server->server; - delete server; - } -} +void TF_DeleteServer(TF_Server* server) { delete server; } } // end extern "C" diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h index bb5741e73dd..9fe06f56a69 100644 --- a/tensorflow/c/c_api.h +++ b/tensorflow/c/c_api.h @@ -1663,26 +1663,53 @@ TF_CAPI_EXPORT extern TF_Buffer* TF_GetRegisteredKernelsForOp( const char* name, TF_Status* status); // -------------------------------------------------------------------------- -// Server functionality. +// In-process TensorFlow server functionality, for use in distributed training. +// A Server instance encapsulates a set of devices and a Session target that +// can participate in distributed training. A server belongs to a cluster +// (specified by a ClusterSpec), and corresponds to a particular task in a +// named job. The server can communicate with any other server in the same +// cluster. -// Server. +// In-process TensorFlow server. typedef struct TF_Server TF_Server; -// Creates new server. +// Creates a new server. The returned TF_Server object can be started, stopped +// and joined using correspondent commands. After using TF_Server object should +// be deleted using the TF_DeleteServer command to free correspondent resources. +// +// Params: +// proto - Serialized ServerDef protocol buffer. +// proto_len - Length of the proto. +// status - Set to OK on success and an appropriate error on failure. TF_CAPI_EXPORT extern TF_Server* TF_NewServer(const void* proto, size_t proto_len, TF_Status* status); // Starts a server. -TF_CAPI_EXPORT extern void TF_StartServer(TF_Server* server, TF_Status* status); +// +// Params: +// server - TF_Server object to be started. +// status - Set to OK on success and an appropriate error on failure. 
+TF_CAPI_EXPORT extern void TF_ServerStart(TF_Server* server, TF_Status* status); // Stops a server. -TF_CAPI_EXPORT extern void TF_StopServer(TF_Server* server, TF_Status* status); +// +// Params: +// server - TF_Server object to be stopped. +// status - Set to OK on success and an appropriate error on failure. +TF_CAPI_EXPORT extern void TF_ServerStop(TF_Server* server, TF_Status* status); // Blocks until the server has shut down (currently blocks forever). -TF_CAPI_EXPORT extern void TF_JoinServer(TF_Server* server, TF_Status* status); +// +// Params: +// server - TF_Server object to be joined. +// status - Set to OK on success and an appropriate error on failure. +TF_CAPI_EXPORT extern void TF_ServerJoin(TF_Server* server, TF_Status* status); -// Destroy a server, frees memory. +// Destroy a server, frees memory. Server is expected to be stopped before. +// +// Params: +// server - TF_Server object to be deleted. TF_CAPI_EXPORT extern void TF_DeleteServer(TF_Server* server); #ifdef __cplusplus diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h index 59c8a2b7c78..9bb6edacaa7 100644 --- a/tensorflow/c/c_api_internal.h +++ b/tensorflow/c/c_api_internal.h @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/core/framework/op_gen_lib.h" #endif #include "tensorflow/core/common_runtime/shape_refiner.h" +#include "tensorflow/core/distributed_runtime/server_lib.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/graph/graph.h" @@ -37,7 +38,6 @@ limitations under the License. 
#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/public/session.h" -#include "tensorflow/core/distributed_runtime/server_lib.h" namespace tensorflow { class Device; @@ -181,9 +181,9 @@ struct TF_ApiDefMap { }; struct TF_Server { - TF_Server(tensorflow::ServerInterface* server); + TF_Server(std::unique_ptr server); - tensorflow::ServerInterface* server; + std::unique_ptr server; }; namespace tensorflow { diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD index 9dce78b9a36..3f847c4c18c 100644 --- a/tensorflow/java/BUILD +++ b/tensorflow/java/BUILD @@ -382,6 +382,7 @@ tf_cc_binary( linkstatic = 1, deps = [ "//tensorflow/java/src/main/native", + "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", LINKER_VERSION_SCRIPT, LINKER_EXPORTED_SYMBOLS, ], diff --git a/tensorflow/java/src/main/java/org/tensorflow/Server.java b/tensorflow/java/src/main/java/org/tensorflow/Server.java index 18ee99e00a1..5a42077904c 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/Server.java +++ b/tensorflow/java/src/main/java/org/tensorflow/Server.java @@ -15,26 +15,52 @@ limitations under the License. package org.tensorflow; +import java.util.concurrent.locks.ReadWriteLock; +import java.util.concurrent.locks.ReentrantReadWriteLock; /** * An in-process TensorFlow server, for use in distributed training. * - * A {@code tf.train.Server} instance encapsulates a set of devices and a - * {@code tf.Session} target that can participate in distributed training. A - * server belongs to a cluster (specified by a {@code tf.train.ClusterSpec}), - * and corresponds to a particular task in a named job. The server can - * communicate with any other server in the same cluster. + * A {@code Server} instance encapsulates a set of devices and a + * {@link org.tensorflow.Session} target that can participate in distributed + * training. 
A server belongs to a cluster (specified by a + * {@code ClusterSpec}), and corresponds to a particular task in a named job. + * The server can communicate with any other server in the same cluster. * - *

WARNING:A {@code Server} owns resources that must be + *

WARNING: A {@code Server} owns resources that must be * explicitly freed by invoking {@link #close()}. * *

Instances of a {@code Server} are thread-safe. + * + *

Using example: + *

+ * {@code
+ * ClusterDef clusterDef = ClusterDef.newBuilder()
+ *   .addJob(JobDef.newBuilder()
+ *   .setName("worker")
+ *   .putTasks(0, "localhost:4321")
+ *   .build()
+ * ).build();
+ *
+ * ServerDef serverDef = ServerDef.newBuilder()
+ *   .setCluster(clusterDef)
+ *   .setJobName("worker")
+ *   .setTaskIndex(0)
+ *   .setProtocol("grpc")
+ * .build();
+ *
+ * try (Server srv = new Server(serverDef.toByteArray())) {
+ *   srv.start();
+ *   srv.join();
+ * }
+ * }
+ * 
*/ public final class Server implements AutoCloseable { /** * Constructs a new instance of server. * - * @param config Server definition specified as a serialized + * @param serverDef Server definition specified as a serialized * ServerDef * protocol buffer. */ @@ -43,25 +69,49 @@ public final class Server implements AutoCloseable { } /** Starts this server. */ - public synchronized void start() { - start(nativeHandle); + public void start() { + lock.readLock().lock(); + try { + start(nativeHandle); + } + finally { + lock.readLock().unlock(); + } } /** Stops this server. */ - public synchronized void stop() { - stop(nativeHandle); + public void stop() { + lock.readLock().lock(); + try { + stop(nativeHandle); + } + finally { + lock.readLock().unlock(); + } } /** Blocks until the server has shut down (currently blocks forever). */ - public synchronized void join() { - join(nativeHandle); + public void join() { + lock.readLock().lock(); + try { + join(nativeHandle); + } + finally { + lock.readLock().unlock(); + } } + /** Stops server and frees resources. Server is expected to be stopped before. 
*/ @Override public void close() { - delete(nativeHandle); - - nativeHandle = 0; + lock.writeLock().lock(); + try { + delete(nativeHandle); + nativeHandle = 0; + } + finally { + lock.writeLock().unlock(); + } } private static native long allocate(byte[] serverDef); @@ -74,6 +124,8 @@ public final class Server implements AutoCloseable { private static native void delete(long nativeHandle); + private final ReadWriteLock lock = new ReentrantReadWriteLock(); + private long nativeHandle; static { diff --git a/tensorflow/java/src/main/native/BUILD b/tensorflow/java/src/main/native/BUILD index 530224aa944..49348daa94e 100644 --- a/tensorflow/java/src/main/native/BUILD +++ b/tensorflow/java/src/main/native/BUILD @@ -43,7 +43,6 @@ tf_cuda_library( "//tensorflow/core:all_kernels", "//tensorflow/core:direct_session", "//tensorflow/core:ops", - "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", ], }), alwayslink = 1, diff --git a/tensorflow/java/src/main/native/server_jni.cc b/tensorflow/java/src/main/native/server_jni.cc index 7eca920230a..f0d1d29b88a 100644 --- a/tensorflow/java/src/main/native/server_jni.cc +++ b/tensorflow/java/src/main/native/server_jni.cc @@ -29,44 +29,76 @@ JNIEXPORT jlong JNICALL Java_org_tensorflow_Server_allocate( status); env->ReleaseByteArrayElements(server_def, server_def_ptr, JNI_ABORT); - throwExceptionIfNotOK(env, status); + bool ok = throwExceptionIfNotOK(env, status); - return reinterpret_cast(server); + TF_DeleteStatus(status); + + return ok ? 
reinterpret_cast(server) : 0; } JNIEXPORT void JNICALL Java_org_tensorflow_Server_start(JNIEnv* env, jclass clazz, jlong handle) { + if (handle == 0) { + throwException(env, kIllegalStateException, + "close() has been called on the Server"); + return; + } + TF_Status* status = TF_NewStatus(); TF_Server* server = reinterpret_cast(handle); - TF_StartServer(server, status); + TF_ServerStart(server, status); throwExceptionIfNotOK(env, status); + + TF_DeleteStatus(status); } JNIEXPORT void JNICALL Java_org_tensorflow_Server_stop(JNIEnv* env, jclass clazz, jlong handle) { + if (handle == 0) { + throwException(env, kIllegalStateException, + "close() has been called on the Server"); + return; + } + TF_Status* status = TF_NewStatus(); TF_Server* server = reinterpret_cast(handle); - TF_StopServer(server, status); + TF_ServerStop(server, status); throwExceptionIfNotOK(env, status); + + TF_DeleteStatus(status); } JNIEXPORT void JNICALL Java_org_tensorflow_Server_join(JNIEnv* env, jclass clazz, jlong handle) { + if (handle == 0) { + throwException(env, kIllegalStateException, + "close() has been called on the Server"); + return; + } + TF_Status* status = TF_NewStatus(); TF_Server* server = reinterpret_cast(handle); - TF_JoinServer(server, status); + TF_ServerJoin(server, status); throwExceptionIfNotOK(env, status); + + TF_DeleteStatus(status); } JNIEXPORT void JNICALL Java_org_tensorflow_Server_delete(JNIEnv* env, jclass clazz, jlong handle) { + if (handle == 0) { + throwException(env, kIllegalStateException, + "close() has been called on the Server"); + return; + } + TF_Server* server = reinterpret_cast(handle); TF_DeleteServer(server); From af653ab648cf1e8069ed34127a2070d6a8cae57a Mon Sep 17 00:00:00 2001 From: Thor Johnsen Date: Wed, 17 Oct 2018 11:45:55 -0700 Subject: [PATCH 022/540] Handle extrapolation only case gracefully --- .../core/kernels/crop_resize_bilinear_core.h | 394 +++++++++--------- 1 file changed, 208 insertions(+), 186 deletions(-) diff --git 
a/tensorflow/core/kernels/crop_resize_bilinear_core.h b/tensorflow/core/kernels/crop_resize_bilinear_core.h index 62c275d4ccd..6167cafea24 100644 --- a/tensorflow/core/kernels/crop_resize_bilinear_core.h +++ b/tensorflow/core/kernels/crop_resize_bilinear_core.h @@ -3616,33 +3616,53 @@ class CropResizeCastImage : public VectorLoader, public VectorWriter { x1_(flip_x ? out_width - 1 - min_ix : max_ix), y0_(flip_y ? out_height - 1 - max_iy : min_iy), y1_(flip_y ? out_height - 1 - min_iy : max_iy) { - // copy xs values, but filter out the following: - // xs[].lower == xs[].upper AND xs[].lerp == 0 - // xs[].lower == xs[].upper AND xs[].lerp == 1 - assert(min_ix_ <= max_ix_); - xs_ = new CachedInterpolation[max_ix_ - min_ix_ + 1]; - for (int i = min_ix_; i <= max_ix_; ++i) { - int ix = i - min_ix_; - int xs_lower = xs[ix].lower / channels_; - int xs_upper = xs[ix].upper / channels_; - if (xs_lower == xs_upper) { - if (xs[ix].lerp == 0.0f && xs_lower + 1 < in_width) { - // upper weight is zero - xs_upper = xs_lower + 1; - } else if (xs[ix].lerp == 1.0f && xs_upper - 1 >= 0) { - // lower weight is zero - xs_lower = xs_upper - 1; - } + if (min_ix_ <= max_ix_ && min_iy_ <= max_iy_) { + // copy xs values, but filter out the following: + // xs[].lower == xs[].upper AND xs[].lerp == 0 + // xs[].lower == xs[].upper AND xs[].lerp == 1 + xs_ = new CachedInterpolation[max_ix_ - min_ix_ + 1]; + for (int i = min_ix_; i <= max_ix_; ++i) { + int ix = i - min_ix_; + int xs_lower = xs[ix].lower / channels_; + int xs_upper = xs[ix].upper / channels_; + if (xs_lower == xs_upper) { + if (xs[ix].lerp == 0.0f && xs_lower + 1 < in_width) { + // upper weight is zero + xs_upper = xs_lower + 1; + } else if (xs[ix].lerp == 1.0f && xs_upper - 1 >= 0) { + // lower weight is zero + xs_lower = xs_upper - 1; + } + } + xs_[ix].lower = xs_lower * channels_; + xs_[ix].upper = xs_upper * channels_; + xs_[ix].lerp = xs[ix].lerp; } - xs_[ix].lower = xs_lower * channels_; - xs_[ix].upper = xs_upper * 
channels_; - xs_[ix].lerp = xs[ix].lerp; + _u_min_val = std::numeric_limits::min(); + _u_max_val = std::numeric_limits::max(); + _f_min_val = static_cast(_u_min_val); + _f_max_val = static_cast(_u_max_val); + Configure_(); + } else { + // crop region outside of input image. + // extrapolation only. + general_x_ = NULL; + load1_x_ = NULL; + load2_x_ = NULL; + load4_x_ = NULL; + load8_x_ = NULL; + load1_offsets_ = NULL; + load2_offsets_ = NULL; + load4_offsets_ = NULL; + load8_offsets_ = NULL; + load1_shuffle_masks_ = NULL; + load2_shuffle_masks_ = NULL; + load1_mmxs_lerp_ = NULL; + load2_mmxs_lerp_ = NULL; + load4_mmxs_lerp_ = NULL; + load8_mmxs_lerp_ = NULL; + xs_ = NULL; } - _u_min_val = std::numeric_limits::min(); - _u_max_val = std::numeric_limits::max(); - _f_min_val = static_cast(_u_min_val); - _f_max_val = static_cast(_u_max_val); - Configure_(); } ~CropResizeCastImage() { if (general_x_ != NULL) delete[] general_x_; @@ -3803,168 +3823,170 @@ void CropResizeCastImage::Resize(const T* input_image, U* output_image) { } } // interpolation region - int y = y0_; - for (y = y0_; y + 1 <= y1_; y += 2) { - const int iyA = flip_y_ ? out_height_ - 1 - min_iy_ - y : y - min_iy_; - const float yA_lerp = ys_[iyA].lerp; - const __m128 ysA_lerp = _mm_set1_ps(yA_lerp); - const T* ysA_input_lower_ptr = - input_image + ys_[iyA].lower * in_width_ * channels_; - const T* ysA_input_upper_ptr = - input_image + ys_[iyA].upper * in_width_ * channels_; - U* ysA_output_ptr = output_image + y * out_width_ * channels_; - const int iyB = - flip_y_ ? 
out_height_ - 1 - min_iy_ - (y + 1) : (y + 1) - min_iy_; - const float yB_lerp = ys_[iyB].lerp; - const __m128 ysB_lerp = _mm_set1_ps(yB_lerp); - const T* ysB_input_lower_ptr = - input_image + ys_[iyB].lower * in_width_ * channels_; - const T* ysB_input_upper_ptr = - input_image + ys_[iyB].upper * in_width_ * channels_; - U* ysB_output_ptr = output_image + (y + 1) * out_width_ * channels_; - if (channels_ == 1) { - this->ResizeRow_load1_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load1_1ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load2_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_1ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load4_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_1ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load8_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_1ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - } else if (channels_ == 2) { - this->ResizeRow_load1_2ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load1_2ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load2_2ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_2ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load4_2ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_2ch_(ysB_lerp, 
ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load8_2ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_2ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - } else if (channels_ == 3) { - this->ResizeRow_load1_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load1_3ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load2_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_3ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load4_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_3ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load8_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_3ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - } else if (channels_ == 4) { - this->ResizeRow_load1_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load1_4ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load2_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_4ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load4_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, 
ysA_output_ptr); - this->ResizeRow_load4_4ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load8_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_4ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - } else { - assert(false); + if (min_ix_ <= max_ix_ && min_iy_ <= max_iy_) { + int y = y0_; + for (y = y0_; y + 1 <= y1_; y += 2) { + const int iyA = flip_y_ ? out_height_ - 1 - min_iy_ - y : y - min_iy_; + const float yA_lerp = ys_[iyA].lerp; + const __m128 ysA_lerp = _mm_set1_ps(yA_lerp); + const T* ysA_input_lower_ptr = + input_image + ys_[iyA].lower * in_width_ * channels_; + const T* ysA_input_upper_ptr = + input_image + ys_[iyA].upper * in_width_ * channels_; + U* ysA_output_ptr = output_image + y * out_width_ * channels_; + const int iyB = + flip_y_ ? 
out_height_ - 1 - min_iy_ - (y + 1) : (y + 1) - min_iy_; + const float yB_lerp = ys_[iyB].lerp; + const __m128 ysB_lerp = _mm_set1_ps(yB_lerp); + const T* ysB_input_lower_ptr = + input_image + ys_[iyB].lower * in_width_ * channels_; + const T* ysB_input_upper_ptr = + input_image + ys_[iyB].upper * in_width_ * channels_; + U* ysB_output_ptr = output_image + (y + 1) * out_width_ * channels_; + if (channels_ == 1) { + this->ResizeRow_load1_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load1_1ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load2_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_1ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load4_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_1ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load8_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_1ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + } else if (channels_ == 2) { + this->ResizeRow_load1_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load1_2ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load2_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_2ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load4_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_2ch_(ysB_lerp, 
ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load8_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_2ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + } else if (channels_ == 3) { + this->ResizeRow_load1_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load1_3ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load2_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_3ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load4_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_3ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load8_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_3ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + } else if (channels_ == 4) { + this->ResizeRow_load1_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load1_4ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load2_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_4ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load4_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, 
ysA_output_ptr); + this->ResizeRow_load4_4ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load8_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_4ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + } else { + assert(false); + } } - } - for (; y <= y1_; ++y) { - const int iyA = flip_y_ ? out_height_ - 1 - min_iy_ - y : y - min_iy_; - const float yA_lerp = ys_[iyA].lerp; - const __m128 ysA_lerp = _mm_set1_ps(yA_lerp); - const T* ysA_input_lower_ptr = - input_image + ys_[iyA].lower * in_width_ * channels_; - const T* ysA_input_upper_ptr = - input_image + ys_[iyA].upper * in_width_ * channels_; - U* ysA_output_ptr = output_image + y * out_width_ * channels_; - if (channels_ == 1) { - this->ResizeRow_load1_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - } else if (channels_ == 2) { - this->ResizeRow_load1_2ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_2ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_2ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_2ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - 
ysA_input_upper_ptr, ysA_output_ptr); - } else if (channels_ == 3) { - this->ResizeRow_load1_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - } else if (channels_ == 4) { - this->ResizeRow_load1_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - } else { - assert(false); + for (; y <= y1_; ++y) { + const int iyA = flip_y_ ? 
out_height_ - 1 - min_iy_ - y : y - min_iy_; + const float yA_lerp = ys_[iyA].lerp; + const __m128 ysA_lerp = _mm_set1_ps(yA_lerp); + const T* ysA_input_lower_ptr = + input_image + ys_[iyA].lower * in_width_ * channels_; + const T* ysA_input_upper_ptr = + input_image + ys_[iyA].upper * in_width_ * channels_; + U* ysA_output_ptr = output_image + y * out_width_ * channels_; + if (channels_ == 1) { + this->ResizeRow_load1_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + } else if (channels_ == 2) { + this->ResizeRow_load1_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + } else if (channels_ == 3) { + this->ResizeRow_load1_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + } else if (channels_ == 4) { + 
this->ResizeRow_load1_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + } else { + assert(false); + } } } } From 74fe2db1a54e3af087d4690c801901df280699ab Mon Sep 17 00:00:00 2001 From: Thor Johnsen Date: Wed, 17 Oct 2018 13:14:10 -0700 Subject: [PATCH 023/540] clang-format --- .../core/kernels/crop_resize_bilinear_core.h | 2701 ++++++++--------- 1 file changed, 1300 insertions(+), 1401 deletions(-) diff --git a/tensorflow/core/kernels/crop_resize_bilinear_core.h b/tensorflow/core/kernels/crop_resize_bilinear_core.h index 6167cafea24..c57131fd18c 100644 --- a/tensorflow/core/kernels/crop_resize_bilinear_core.h +++ b/tensorflow/core/kernels/crop_resize_bilinear_core.h @@ -37,8 +37,8 @@ namespace { // Compute the interpolation indices only once. 
struct CachedInterpolation { - int lower; // Lower source index used in the interpolation - int upper; // Upper source index used in the interpolation + int lower; // Lower source index used in the interpolation + int upper; // Upper source index used in the interpolation // 1-D linear iterpolation scale (see: // https://en.wikipedia.org/wiki/Bilinear_interpolation) float lerp; @@ -48,7 +48,7 @@ bool compute_single_interpolation_weight(const int in_size, const float out2in_scale, const float out2in_start, const bool clip, const int i, - int* lower, int* upper, float* lerp) { + int *lower, int *upper, float *lerp) { const float in = i * out2in_scale + out2in_start; *lower = (int)floor(in); *upper = (int)ceil(in); @@ -76,7 +76,7 @@ bool compute_single_interpolation_weight(const int in_size, bool compute_interpolation_weights(const int min_i, const int max_i, const int in_size, const float out2in_scale, const float out2in_start, const bool clip, - CachedInterpolation* interpolation) { + CachedInterpolation *interpolation) { bool rval = true; int num_i = max_i - min_i + 1; for (int i = 0; i < num_i; ++i) { @@ -94,16 +94,15 @@ bool compute_interpolation_weights(const int min_i, const int max_i, */ void compute_interpolation_weights(const int out_size, const int in_size, const float out2in_scale, - CachedInterpolation* interpolation) { + CachedInterpolation *interpolation) { interpolation[out_size].lower = 0; interpolation[out_size].upper = 0; const bool clip = true; if (!compute_interpolation_weights(0, out_size - 1, in_size, out2in_scale, 0.0f, clip, interpolation)) { // Should never happen, check for it anyway - printf( - "Warning! Interpolation values have lower,upper indexes outside of " - "range [0,in_size-1]\n"); + printf("Warning! 
Interpolation values have lower,upper indexes outside of " + "range [0,in_size-1]\n"); } } /** @@ -115,7 +114,7 @@ void compute_interpolation_weights(const int out_size, const int in_size, */ bool compute_minmax_indexes(const int out_size, const int in_size, const float out2in_scale, const float out2in_start, - int* min_i, int* max_i) { + int *min_i, int *max_i) { *min_i = out_size; *max_i = -1; int lower, upper; @@ -123,8 +122,10 @@ bool compute_minmax_indexes(const int out_size, const int in_size, for (int i = 0; i < out_size; ++i) { if (compute_single_interpolation_weight(in_size, out2in_scale, out2in_start, false, i, &lower, &upper, &lerp)) { - if (i < *min_i) *min_i = i; - if (i > *max_i) *max_i = i; + if (i < *min_i) + *min_i = i; + if (i > *max_i) + *max_i = i; } } return (*min_i <= *max_i) ? true : false; @@ -136,9 +137,9 @@ bool compute_minmax_indexes(const int out_size, const int in_size, */ bool compute_interpolation_weights( const int out_size, const int in_size, - const float x1, // lower bounding box, crop region starts at in_size*x1 - const float x2, // upper bounding box, crop region ends at in_size*x2 - int* min_i, int* max_i, std::vector* interpolation) { + const float x1, // lower bounding box, crop region starts at in_size*x1 + const float x2, // upper bounding box, crop region ends at in_size*x2 + int *min_i, int *max_i, std::vector *interpolation) { float out2in_start = out_size > 1 ? (float)(in_size - 1) * (float)x1 : (float)(in_size - 1) * (float)(x1 + x2) / 2.0f; @@ -206,24 +207,24 @@ float compute_lerp(const float top_left, const float top_right, * Optionally flips horizontal and/or vertical axis. 
*/ template -void crop_resize_single_image(const T* image, const int64 in_height, +void crop_resize_single_image(const T *image, const int64 in_height, const int64 in_width, const int64 out_height, const int64 out_width, const int channels, const int min_ix, const int max_ix, - const CachedInterpolation* xs, const int min_iy, - const int max_iy, const CachedInterpolation* ys, + const CachedInterpolation *xs, const int min_iy, + const int max_iy, const CachedInterpolation *ys, const float extrapolated_value, const bool flip_x, const bool flip_y, - U* output) TF_ATTRIBUTE_NOINLINE; + U *output) TF_ATTRIBUTE_NOINLINE; template -void crop_resize_single_image(const T* image, const int64 in_height, +void crop_resize_single_image(const T *image, const int64 in_height, const int64 in_width, const int64 out_height, const int64 out_width, const int channels, const int min_ix, const int max_ix, - const CachedInterpolation* xs, const int min_iy, - const int max_iy, const CachedInterpolation* ys, + const CachedInterpolation *xs, const int min_iy, + const int max_iy, const CachedInterpolation *ys, const float extrapolated_value, const bool flip_x, - const bool flip_y, U* output) { + const bool flip_y, U *output) { const int64 in_row_size = in_width * channels; const int64 out_row_size = out_width * channels; U u_min_val = std::numeric_limits::min(); @@ -234,22 +235,24 @@ void crop_resize_single_image(const T* image, const int64 in_height, cast_to(extrapolated_value, min_val, max_val, u_min_val, u_max_val); // low y extrapolation zone if (min_iy > 0) { - U* p = flip_y ? output + out_row_size * (out_height - min_iy) : output; + U *p = flip_y ? output + out_row_size * (out_height - min_iy) : output; int64 nn = out_row_size * (int64)min_iy; - for (int64 i = 0; i < nn; ++i) p[i] = uEx; + for (int64 i = 0; i < nn; ++i) + p[i] = uEx; } // high y extrapolation zone if (max_iy < out_height - 1) { - U* p = flip_y ? output : output + out_row_size * (max_iy + 1); + U *p = flip_y ? 
output : output + out_row_size * (max_iy + 1); int64 nn = out_row_size * (int64)(out_height - 1 - max_iy); - for (int64 i = 0; i < nn; ++i) p[i] = uEx; + for (int64 i = 0; i < nn; ++i) + p[i] = uEx; } // low x extrapolation zone if (min_ix > 0) { for (int iy = min_iy; iy <= max_iy; ++iy) { int xx0 = flip_x ? (out_width - min_ix) * channels : 0; int nxx = min_ix * channels; - U* p = output + xx0 + + U *p = output + xx0 + out_row_size * (int64)(flip_y ? out_height - 1 - iy : iy); for (int ix = 0; ix < nxx; ++ix) { p[ix] = uEx; @@ -261,22 +264,22 @@ void crop_resize_single_image(const T* image, const int64 in_height, for (int iy = min_iy; iy <= max_iy; ++iy) { int xx0 = flip_x ? 0 : (max_ix + 1) * channels; int nxx = (out_width - 1 - max_ix) * channels; - U* p = output + xx0 + + U *p = output + xx0 + out_row_size * (int64)(flip_y ? out_height - 1 - iy : iy); for (int ix = 0; ix < nxx; ++ix) { p[ix] = uEx; } } } - U* output_y_ptr = + U *output_y_ptr = output + out_row_size * (int64)(flip_y ? out_height - 1 - min_iy : min_iy); // interpolation zone if (channels == 1) { for (int y = min_iy; y <= max_iy; ++y) { const int iy = y - min_iy; - const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size; - const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size; + const T *ys_input_lower_ptr = image + ys[iy].lower * in_row_size; + const T *ys_input_upper_ptr = image + ys[iy].upper * in_row_size; const float ys_lerp = ys[iy].lerp; const int x0 = flip_x ? out_width - 1 - max_ix : min_ix; const int x1 = flip_x ? 
out_width - 1 - min_ix : max_ix; @@ -304,8 +307,8 @@ void crop_resize_single_image(const T* image, const int64 in_height, } else if (channels == 2) { for (int y = min_iy; y <= max_iy; ++y) { const int iy = y - min_iy; - const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size; - const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size; + const T *ys_input_lower_ptr = image + ys[iy].lower * in_row_size; + const T *ys_input_upper_ptr = image + ys[iy].upper * in_row_size; const float ys_lerp = ys[iy].lerp; const int x0 = flip_x ? out_width - 1 - max_ix : min_ix; const int x1 = flip_x ? out_width - 1 - min_ix : max_ix; @@ -343,8 +346,8 @@ void crop_resize_single_image(const T* image, const int64 in_height, } else if (channels == 3) { for (int y = min_iy; y <= max_iy; ++y) { const int iy = y - min_iy; - const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size; - const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size; + const T *ys_input_lower_ptr = image + ys[iy].lower * in_row_size; + const T *ys_input_upper_ptr = image + ys[iy].upper * in_row_size; const float ys_lerp = ys[iy].lerp; const int x0 = flip_x ? out_width - 1 - max_ix : min_ix; const int x1 = flip_x ? out_width - 1 - min_ix : max_ix; @@ -392,8 +395,8 @@ void crop_resize_single_image(const T* image, const int64 in_height, } else if (channels == 4) { for (int y = min_iy; y <= max_iy; ++y) { const int iy = y - min_iy; - const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size; - const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size; + const T *ys_input_lower_ptr = image + ys[iy].lower * in_row_size; + const T *ys_input_upper_ptr = image + ys[iy].upper * in_row_size; const float ys_lerp = ys[iy].lerp; const int x0 = flip_x ? out_width - 1 - max_ix : min_ix; const int x1 = flip_x ? 
out_width - 1 - min_ix : max_ix; @@ -451,8 +454,8 @@ void crop_resize_single_image(const T* image, const int64 in_height, } else { for (int y = min_iy; y <= max_iy; ++y) { const int iy = y - min_iy; - const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size; - const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size; + const T *ys_input_lower_ptr = image + ys[iy].lower * in_row_size; + const T *ys_input_upper_ptr = image + ys[iy].upper * in_row_size; const float ys_lerp = ys[iy].lerp; const int x0 = flip_x ? out_width - 1 - max_ix : min_ix; const int x1 = flip_x ? out_width - 1 - min_ix : max_ix; @@ -483,12 +486,12 @@ void crop_resize_single_image(const T* image, const int64 in_height, // machine you are running on template void crop_resize_single_image_common( - const T* image, const int64 in_height, const int64 in_width, + const T *image, const int64 in_height, const int64 in_width, const int64 out_height, const int64 out_width, const int channels, - const int min_ix, const int max_ix, const CachedInterpolation* xs, - const int min_iy, const int max_iy, const CachedInterpolation* ys, + const int min_ix, const int max_ix, const CachedInterpolation *xs, + const int min_iy, const int max_iy, const CachedInterpolation *ys, const float extrapolated_value, const bool flip_x, const bool flip_y, - U* output) TF_ATTRIBUTE_NOINLINE; + U *output) TF_ATTRIBUTE_NOINLINE; // For now, only compile vectorized code on LINUX systems. // to-do: Test vectorized code on other platforms (MacOS and Windows). @@ -515,9 +518,8 @@ void crop_resize_single_image_common( // Eigen::half, bfloat16 or float. // -template -class VectorLoader { - public: +template class VectorLoader { +public: #ifdef __AVX2__ // convert 8 packed words of type T to fp32. // T must be one of uint8, int8, uint16, int16, int32, Eigen::half, bfloat16 @@ -535,20 +537,20 @@ class VectorLoader { // separate 128 bit lanes. // input is stored in lower portion of 4 separate sse words, v0 through v3. 
// output is stored in lower portion of v0. - void pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); + void pack_1ch(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); // output is stored in lower portion of v0 and v1. - void pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); + void pack_2ch(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); // output is stored in lower portion of v0, v1 and v2. - void pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); + void pack_3ch(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); #else // pack 4 pixels with 1 channel, 2 channels and 3channels respectively. // input is stored in lower portion of 4 separate sse words, v0 through v3. // output is stored in lower portion of v0. - void pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); + void pack_1ch(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); // output is stored in lower portion of v0 and v1. - void pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); + void pack_2ch(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); // output is stored in lower portion of v0, v1 and v2. - void pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); + void pack_3ch(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); #endif #ifdef __AVX2__ @@ -572,8 +574,8 @@ class VectorLoader { // pixels have 1 channel. // load1 case, i.e. 4 left and right inputs are loaded with a single unaligned // SSE load. - void load1_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m256* left0, __m256* right0); + void load1_1ch(const T *lower_ptr, const T *upper_ptr, int offset0, + const __m128i *shuffle_masks, __m256 *left0, __m256 *right0); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -581,9 +583,9 @@ class VectorLoader { // pixels have 2 channels. // load1 case, i.e. 
4 left and right inputs are loaded with a single unaligned // SSE load. - void load1_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m256* left0, __m256* left1, - __m256* right0, __m256* right1); + void load1_2ch(const T *lower_ptr, const T *upper_ptr, int offset0, + const __m128i *shuffle_masks, __m256 *left0, __m256 *left1, + __m256 *right0, __m256 *right1); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -591,9 +593,9 @@ class VectorLoader { // pixels have 3 channels. // load1 case, i.e. 4 left and right inputs are loaded with a single unaligned // SSE load. - void load1_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m256* left0, __m256* left1, - __m256* left2, __m256* right0, __m256* right1, __m256* right2); + void load1_3ch(const T *lower_ptr, const T *upper_ptr, int offset0, + const __m128i *shuffle_masks, __m256 *left0, __m256 *left1, + __m256 *left2, __m256 *right0, __m256 *right1, __m256 *right2); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -601,10 +603,10 @@ class VectorLoader { // pixels have 4 channels. // load1 case, i.e. 4 left and right inputs are loaded with a single unaligned // SSE load. - void load1_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m256* left0, __m256* left1, - __m256* left2, __m256* left3, __m256* right0, __m256* right1, - __m256* right2, __m256* right3); + void load1_4ch(const T *lower_ptr, const T *upper_ptr, int offset0, + const __m128i *shuffle_masks, __m256 *left0, __m256 *left1, + __m256 *left2, __m256 *left3, __m256 *right0, __m256 *right1, + __m256 *right2, __m256 *right3); // load top left and bottom left interpolation inputs into output argument // left. 
// load top right and bottom right interpolation inputs into output argument @@ -612,8 +614,8 @@ class VectorLoader { // pixels have 1 channel. // load2 case, i.e. 4 left inputs are loaded with first SSE load and 4 right // inputs are loaded with second SSE load. - void load2_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m256* left0, __m256* right0); + void load2_1ch(const T *lower_ptr, const T *upper_ptr, int offset0, + const __m128i *shuffle_masks, __m256 *left0, __m256 *right0); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -621,9 +623,9 @@ class VectorLoader { // pixels have 2 channels. // load2 case, i.e. 4 left inputs are loaded with first SSE load and 4 right // inputs are loaded with second SSE load. - void load2_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m256* left0, __m256* left1, - __m256* right0, __m256* right1); + void load2_2ch(const T *lower_ptr, const T *upper_ptr, int offset0, + const __m128i *shuffle_masks, __m256 *left0, __m256 *left1, + __m256 *right0, __m256 *right1); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -631,9 +633,9 @@ class VectorLoader { // pixels have 3 channels. // load2 case, i.e. 4 left inputs are loaded with first SSE load and 4 right // inputs are loaded with second SSE load. 
- void load2_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m256* left0, __m256* left1, - __m256* left2, __m256* right0, __m256* right1, __m256* right2); + void load2_3ch(const T *lower_ptr, const T *upper_ptr, int offset0, + const __m128i *shuffle_masks, __m256 *left0, __m256 *left1, + __m256 *left2, __m256 *right0, __m256 *right1, __m256 *right2); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -641,10 +643,10 @@ class VectorLoader { // pixels have 4 channels. // load2 case, i.e. 4 left inputs are loaded with first SSE load and 4 right // inputs are loaded with second SSE load. - void load2_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m256* left0, __m256* left1, - __m256* left2, __m256* left3, __m256* right0, __m256* right1, - __m256* right2, __m256* right3); + void load2_4ch(const T *lower_ptr, const T *upper_ptr, int offset0, + const __m128i *shuffle_masks, __m256 *left0, __m256 *left1, + __m256 *left2, __m256 *left3, __m256 *right0, __m256 *right1, + __m256 *right2, __m256 *right3); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -652,9 +654,9 @@ class VectorLoader { // pixels have 1 channel. // load4 case, i.e. each pair of left and right inputs are loaded with a // separate SSE load. - void load4_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256* left0, - __m256* right0); + void load4_1ch(const T *lower_ptr, const T *upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256 *left0, + __m256 *right0); // load top left and bottom left interpolation inputs into output argument // left. 
// load top right and bottom right interpolation inputs into output argument @@ -662,9 +664,9 @@ class VectorLoader { // pixels have 2 channels. // load4 case, i.e. each pair of left and right inputs are loaded with a // separate SSE load. - void load4_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256* left0, - __m256* left1, __m256* right0, __m256* right1); + void load4_2ch(const T *lower_ptr, const T *upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256 *left0, + __m256 *left1, __m256 *right0, __m256 *right1); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -672,10 +674,10 @@ class VectorLoader { // pixels have 3 channels. // load4 case, i.e. each pair of left and right inputs are loaded with a // separate SSE load. - void load4_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256* left0, - __m256* left1, __m256* left2, __m256* right0, __m256* right1, - __m256* right2); + void load4_3ch(const T *lower_ptr, const T *upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256 *left0, + __m256 *left1, __m256 *left2, __m256 *right0, __m256 *right1, + __m256 *right2); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -683,10 +685,10 @@ class VectorLoader { // pixels have 4 channels. // load4 case, i.e. each pair of left and right inputs are loaded with a // separate SSE load. 
- void load4_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256* left0, - __m256* left1, __m256* left2, __m256* left3, __m256* right0, - __m256* right1, __m256* right2, __m256* right3); + void load4_4ch(const T *lower_ptr, const T *upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256 *left0, + __m256 *left1, __m256 *left2, __m256 *left3, __m256 *right0, + __m256 *right1, __m256 *right2, __m256 *right3); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -695,9 +697,9 @@ class VectorLoader { // load8 case, i.e. each input is loaded with a separate SSE load. // 4 pixels, each with left and right input necessitates 8 separate SSE loads // per input row. - void load8_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256* left0, - __m256* right0); + void load8_1ch(const T *lower_ptr, const T *upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256 *left0, + __m256 *right0); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -706,9 +708,9 @@ class VectorLoader { // load8 case, i.e. each input is loaded with a separate SSE load. // 4 pixels, each with left and right input necessitates 8 separate SSE loads // per input row. - void load8_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256* left0, - __m256* left1, __m256* right0, __m256* right1); + void load8_2ch(const T *lower_ptr, const T *upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256 *left0, + __m256 *left1, __m256 *right0, __m256 *right1); // load top left and bottom left interpolation inputs into output argument // left. 
// load top right and bottom right interpolation inputs into output argument @@ -717,10 +719,10 @@ class VectorLoader { // load8 case, i.e. each input is loaded with a separate SSE load. // 4 pixels, each with left and right input necessitates 8 separate SSE loads // per input row. - void load8_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256* left0, - __m256* left1, __m256* left2, __m256* right0, __m256* right1, - __m256* right2); + void load8_3ch(const T *lower_ptr, const T *upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256 *left0, + __m256 *left1, __m256 *left2, __m256 *right0, __m256 *right1, + __m256 *right2); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -729,10 +731,10 @@ class VectorLoader { // load8 case, i.e. each input is loaded with a separate SSE load. // 4 pixels, each with left and right input necessitates 8 separate SSE loads // per input row. - void load8_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256* left0, - __m256* left1, __m256* left2, __m256* left3, __m256* right0, - __m256* right1, __m256* right2, __m256* right3); + void load8_4ch(const T *lower_ptr, const T *upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256 *left0, + __m256 *left1, __m256 *left2, __m256 *left3, __m256 *right0, + __m256 *right1, __m256 *right2, __m256 *right3); #else // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. @@ -741,9 +743,9 @@ class VectorLoader { // pixels have 1 channel. // load1 case, i.e. all inputs for one input row are loaded with a single SSE // load. 
- void load1_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m128* tl0, __m128* bl0, - __m128* tr0, __m128* br0); + void load1_1ch(const T *lower_ptr, const T *upper_ptr, int offset0, + const __m128i *shuffle_masks, __m128 *tl0, __m128 *bl0, + __m128 *tr0, __m128 *br0); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -751,10 +753,10 @@ class VectorLoader { // pixels have 2 channels. // load1 case, i.e. all inputs for one input row are loaded with a single SSE // load. - void load1_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, - __m128* bl0, __m128* bl1, __m128* tr0, __m128* tr1, - __m128* br0, __m128* br1); + void load1_2ch(const T *lower_ptr, const T *upper_ptr, int offset0, + const __m128i *shuffle_masks, __m128 *tl0, __m128 *tl1, + __m128 *bl0, __m128 *bl1, __m128 *tr0, __m128 *tr1, + __m128 *br0, __m128 *br1); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -762,11 +764,11 @@ class VectorLoader { // pixels have 3 channels. // load1 case, i.e. all inputs for one input row are loaded with a single SSE // load. 
- void load1_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, - __m128* tl2, __m128* bl0, __m128* bl1, __m128* bl2, - __m128* tr0, __m128* tr1, __m128* tr2, __m128* br0, - __m128* br1, __m128* br2); + void load1_3ch(const T *lower_ptr, const T *upper_ptr, int offset0, + const __m128i *shuffle_masks, __m128 *tl0, __m128 *tl1, + __m128 *tl2, __m128 *bl0, __m128 *bl1, __m128 *bl2, + __m128 *tr0, __m128 *tr1, __m128 *tr2, __m128 *br0, + __m128 *br1, __m128 *br2); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -774,12 +776,12 @@ class VectorLoader { // pixels have 4 channels. // load1 case, i.e. all inputs for one input row are loaded with a single SSE // load. - void load1_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, - __m128* tl2, __m128* tl3, __m128* bl0, __m128* bl1, - __m128* bl2, __m128* bl3, __m128* tr0, __m128* tr1, - __m128* tr2, __m128* tr3, __m128* br0, __m128* br1, - __m128* br2, __m128* br3); + void load1_4ch(const T *lower_ptr, const T *upper_ptr, int offset0, + const __m128i *shuffle_masks, __m128 *tl0, __m128 *tl1, + __m128 *tl2, __m128 *tl3, __m128 *bl0, __m128 *bl1, + __m128 *bl2, __m128 *bl3, __m128 *tr0, __m128 *tr1, + __m128 *tr2, __m128 *tr3, __m128 *br0, __m128 *br1, + __m128 *br2, __m128 *br3); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -787,9 +789,9 @@ class VectorLoader { // pixels have 1 channel. // load2 case, i.e. left inputs are loaded with first SSE load, right inputs // are loaded with second SSE load. 
- void load2_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m128* tl0, __m128* bl0, - __m128* tr0, __m128* br0); + void load2_1ch(const T *lower_ptr, const T *upper_ptr, int offset0, + const __m128i *shuffle_masks, __m128 *tl0, __m128 *bl0, + __m128 *tr0, __m128 *br0); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -797,10 +799,10 @@ class VectorLoader { // pixels have 2 channels. // load2 case, i.e. left inputs are loaded with first SSE load, right inputs // are loaded with second SSE load. - void load2_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, - __m128* bl0, __m128* bl1, __m128* tr0, __m128* tr1, - __m128* br0, __m128* br1); + void load2_2ch(const T *lower_ptr, const T *upper_ptr, int offset0, + const __m128i *shuffle_masks, __m128 *tl0, __m128 *tl1, + __m128 *bl0, __m128 *bl1, __m128 *tr0, __m128 *tr1, + __m128 *br0, __m128 *br1); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -808,11 +810,11 @@ class VectorLoader { // pixels have 3 channels. // load2 case, i.e. left inputs are loaded with first SSE load, right inputs // are loaded with second SSE load. 
- void load2_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, - __m128* tl2, __m128* bl0, __m128* bl1, __m128* bl2, - __m128* tr0, __m128* tr1, __m128* tr2, __m128* br0, - __m128* br1, __m128* br2); + void load2_3ch(const T *lower_ptr, const T *upper_ptr, int offset0, + const __m128i *shuffle_masks, __m128 *tl0, __m128 *tl1, + __m128 *tl2, __m128 *bl0, __m128 *bl1, __m128 *bl2, + __m128 *tr0, __m128 *tr1, __m128 *tr2, __m128 *br0, + __m128 *br1, __m128 *br2); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -820,12 +822,12 @@ class VectorLoader { // pixels have 4 channels. // load2 case, i.e. left inputs are loaded with first SSE load, right inputs // are loaded with second SSE load. - void load2_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, - __m128* tl2, __m128* tl3, __m128* bl0, __m128* bl1, - __m128* bl2, __m128* bl3, __m128* tr0, __m128* tr1, - __m128* tr2, __m128* tr3, __m128* br0, __m128* br1, - __m128* br2, __m128* br3); + void load2_4ch(const T *lower_ptr, const T *upper_ptr, int offset0, + const __m128i *shuffle_masks, __m128 *tl0, __m128 *tl1, + __m128 *tl2, __m128 *tl3, __m128 *bl0, __m128 *bl1, + __m128 *bl2, __m128 *bl3, __m128 *tr0, __m128 *tr1, + __m128 *tr2, __m128 *tr3, __m128 *br0, __m128 *br1, + __m128 *br2, __m128 *br3); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -833,9 +835,9 @@ class VectorLoader { // pixels have 1 channel. // load4 case, i.e. left and right inputs are loaded with a separate SSE load // for each pixel. 
- void load4_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128* tl0, - __m128* bl0, __m128* tr0, __m128* br0); + void load4_1ch(const T *lower_ptr, const T *upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128 *tl0, + __m128 *bl0, __m128 *tr0, __m128 *br0); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -843,10 +845,10 @@ class VectorLoader { // pixels have 2 channels. // load4 case, i.e. left and right inputs are loaded with a separate SSE load // for each pixel. - void load4_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128* tl0, - __m128* tl1, __m128* bl0, __m128* bl1, __m128* tr0, - __m128* tr1, __m128* br0, __m128* br1); + void load4_2ch(const T *lower_ptr, const T *upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128 *tl0, + __m128 *tl1, __m128 *bl0, __m128 *bl1, __m128 *tr0, + __m128 *tr1, __m128 *br0, __m128 *br1); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -854,11 +856,11 @@ class VectorLoader { // pixels have 3 channels. // load4 case, i.e. left and right inputs are loaded with a separate SSE load // for each pixel. 
- void load4_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128* tl0, - __m128* tl1, __m128* tl2, __m128* bl0, __m128* bl1, - __m128* bl2, __m128* tr0, __m128* tr1, __m128* tr2, - __m128* br0, __m128* br1, __m128* br2); + void load4_3ch(const T *lower_ptr, const T *upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128 *tl0, + __m128 *tl1, __m128 *tl2, __m128 *bl0, __m128 *bl1, + __m128 *bl2, __m128 *tr0, __m128 *tr1, __m128 *tr2, + __m128 *br0, __m128 *br1, __m128 *br2); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -866,12 +868,12 @@ class VectorLoader { // pixels have 4 channels. // load4 case, i.e. left and right inputs are loaded with a separate SSE load // for each pixel. - void load4_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128* tl0, - __m128* tl1, __m128* tl2, __m128* tl3, __m128* bl0, - __m128* bl1, __m128* bl2, __m128* bl3, __m128* tr0, - __m128* tr1, __m128* tr2, __m128* tr3, __m128* br0, - __m128* br1, __m128* br2, __m128* br3); + void load4_4ch(const T *lower_ptr, const T *upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128 *tl0, + __m128 *tl1, __m128 *tl2, __m128 *tl3, __m128 *bl0, + __m128 *bl1, __m128 *bl2, __m128 *bl3, __m128 *tr0, + __m128 *tr1, __m128 *tr2, __m128 *tr3, __m128 *br0, + __m128 *br1, __m128 *br2, __m128 *br3); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -879,9 +881,9 @@ class VectorLoader { // pixels have 1 channel. // load8 case, i.e. left and right inputs are loaded with separate SSE loads // for each pixel. 
- void load8_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128* tl0, - __m128* bl0, __m128* tr0, __m128* br0); + void load8_1ch(const T *lower_ptr, const T *upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128 *tl0, + __m128 *bl0, __m128 *tr0, __m128 *br0); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -889,10 +891,10 @@ class VectorLoader { // pixels have 2 channels. // load8 case, i.e. left and right inputs are loaded with separate SSE loads // for each pixel. - void load8_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128* tl0, - __m128* tl1, __m128* bl0, __m128* bl1, __m128* tr0, - __m128* tr1, __m128* br0, __m128* br1); + void load8_2ch(const T *lower_ptr, const T *upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128 *tl0, + __m128 *tl1, __m128 *bl0, __m128 *bl1, __m128 *tr0, + __m128 *tr1, __m128 *br0, __m128 *br1); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -900,11 +902,11 @@ class VectorLoader { // pixels have 3 channels. // load8 case, i.e. left and right inputs are loaded with separate SSE loads // for each pixel. 
- void load8_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128* tl0, - __m128* tl1, __m128* tl2, __m128* bl0, __m128* bl1, - __m128* bl2, __m128* tr0, __m128* tr1, __m128* tr2, - __m128* br0, __m128* br1, __m128* br2); + void load8_3ch(const T *lower_ptr, const T *upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128 *tl0, + __m128 *tl1, __m128 *tl2, __m128 *bl0, __m128 *bl1, + __m128 *bl2, __m128 *tr0, __m128 *tr1, __m128 *tr2, + __m128 *br0, __m128 *br1, __m128 *br2); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -912,48 +914,48 @@ class VectorLoader { // pixels have 4 channels. // load8 case, i.e. left and right inputs are loaded with separate SSE loads // for each pixel. - void load8_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128* tl0, - __m128* tl1, __m128* tl2, __m128* tl3, __m128* bl0, - __m128* bl1, __m128* bl2, __m128* bl3, __m128* tr0, - __m128* tr1, __m128* tr2, __m128* tr3, __m128* br0, - __m128* br1, __m128* br2, __m128* br3); + void load8_4ch(const T *lower_ptr, const T *upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128 *tl0, + __m128 *tl1, __m128 *tl2, __m128 *tl3, __m128 *bl0, + __m128 *bl1, __m128 *bl2, __m128 *bl3, __m128 *tr0, + __m128 *tr1, __m128 *tr2, __m128 *tr3, __m128 *br0, + __m128 *br1, __m128 *br2, __m128 *br3); #endif // there is no method that packs 4 pixels with 4 channel into four sse words. // nothing to do for this case, everything is already in the right position. - private: +private: // helper methods #ifdef __AVX2__ // pack 4 pixels with 1, 2, 3 or 4 channels into lower portion of SSE vector // word. // works within SSE lanes. // sizeof(sample_data_type) can be 1, 2 or 4 bytes. 
- void pack4_1b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); - void pack4_2b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); - void pack4_4b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); - void pack4_1b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); - void pack4_2b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); - void pack4_4b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); - void pack4_1b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); - void pack4_2b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); - void pack4_4b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); + void pack4_1b_1ch_(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); + void pack4_2b_1ch_(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); + void pack4_4b_1ch_(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); + void pack4_1b_2ch_(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); + void pack4_2b_2ch_(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); + void pack4_4b_2ch_(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); + void pack4_1b_3ch_(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); + void pack4_2b_3ch_(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); + void pack4_4b_3ch_(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); // there is no pack4_xx_4ch functions because none is needed. // all the bytes are loaded in the right spots for this case. #else // pack 4 pixels with 1, 2, 3 or 4 channels into lower portion of SSE vector // word. // sizeof(sample_data_type) can be 1, 2 or 4 bytes. 
- void pack4_1b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); - void pack4_2b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); - void pack4_4b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); - void pack4_1b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); - void pack4_2b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); - void pack4_4b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); - void pack4_1b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); - void pack4_2b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); - void pack4_4b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); + void pack4_1b_1ch_(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); + void pack4_2b_1ch_(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); + void pack4_4b_1ch_(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); + void pack4_1b_2ch_(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); + void pack4_2b_2ch_(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); + void pack4_4b_2ch_(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); + void pack4_1b_3ch_(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); + void pack4_2b_3ch_(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); + void pack4_4b_3ch_(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); #endif #ifdef __AVX2__ __m256i extract_right_1b_(const __m256i left); @@ -974,8 +976,8 @@ class VectorLoader { #ifdef __AVX2__ template -void VectorLoader::pack4_1b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack4_1b_1ch_(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { *v3 = _mm256_slli_si256(*v3, 3); __m256i and_mask = _mm256_setr_epi32(255, 0, 0, 0, 255, 0, 0, 0); *v2 = _mm256_or_si256(*v3, @@ -985,8 +987,8 @@ void VectorLoader::pack4_1b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, *v0 = _mm256_or_si256(*v1, _mm256_and_si256(and_mask, *v0)); } template -void VectorLoader::pack4_2b_1ch_(__m256i* v0, __m256i* v1, 
__m256i* v2, - __m256i* v3) { +void VectorLoader::pack4_2b_1ch_(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { *v3 = _mm256_slli_si256(*v3, 6); __m256i and_mask = _mm256_setr_epi32(65535, 0, 0, 0, 65535, 0, 0, 0); *v2 = _mm256_or_si256(*v3, @@ -996,8 +998,8 @@ void VectorLoader::pack4_2b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, *v0 = _mm256_or_si256(*v1, _mm256_and_si256(and_mask, *v0)); } template -void VectorLoader::pack4_4b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack4_4b_1ch_(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { *v3 = _mm256_slli_si256(*v3, 12); __m256i and_mask = _mm256_setr_epi32(-1, 0, 0, 0, -1, 0, 0, 0); *v2 = _mm256_or_si256(*v3, @@ -1008,8 +1010,8 @@ void VectorLoader::pack4_4b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, } template -void VectorLoader::pack4_1b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack4_1b_2ch_(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { __m256i and_mask = _mm256_setr_epi32(65535, 0, 0, 0, 65535, 0, 0, 0); *v0 = _mm256_or_si256(_mm256_and_si256(*v0, and_mask), _mm256_slli_si256(*v1, 2)); @@ -1017,8 +1019,8 @@ void VectorLoader::pack4_1b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, _mm256_slli_si256(*v3, 2)); } template -void VectorLoader::pack4_2b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack4_2b_2ch_(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { __m256i and_mask = _mm256_setr_epi32(-1, 0, 0, 0, -1, 0, 0, 0); *v0 = _mm256_or_si256(_mm256_and_si256(*v0, and_mask), _mm256_slli_si256(*v1, 4)); @@ -1026,8 +1028,8 @@ void VectorLoader::pack4_2b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, _mm256_slli_si256(*v3, 4)); } template -void VectorLoader::pack4_4b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack4_4b_2ch_(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { __m256i and_mask = _mm256_setr_epi32(-1, -1, 0, 0, -1, -1, 0, 0); *v0 
= _mm256_or_si256(_mm256_and_si256(*v0, and_mask), _mm256_slli_si256(*v1, 8)); @@ -1036,8 +1038,8 @@ void VectorLoader::pack4_4b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, } template -void VectorLoader::pack4_1b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack4_1b_3ch_(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { __m256i and_mask = _mm256_setr_epi32(16777215, 0, 0, 0, 16777215, 0, 0, 0); *v0 = _mm256_or_si256(_mm256_and_si256(*v0, and_mask), _mm256_slli_si256(*v1, 3)); @@ -1049,8 +1051,8 @@ void VectorLoader::pack4_1b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, _mm256_slli_si256(*v3, 1)); } template -void VectorLoader::pack4_2b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack4_2b_3ch_(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { __m256i and_mask = _mm256_setr_epi32(-1, 65535, 0, 0, -1, 65535, 0, 0); *v0 = _mm256_or_si256(_mm256_and_si256(*v0, and_mask), _mm256_slli_si256(*v1, 6)); @@ -1062,8 +1064,8 @@ void VectorLoader::pack4_2b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, _mm256_slli_si256(*v3, 2)); } template -void VectorLoader::pack4_4b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack4_4b_3ch_(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { __m256i and_mask = _mm256_setr_epi32(-1, -1, -1, 0, -1, -1, -1, 0); *v0 = _mm256_or_si256(_mm256_and_si256(*v0, and_mask), _mm256_slli_si256(*v1, 12)); @@ -1076,131 +1078,131 @@ void VectorLoader::pack4_4b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, } template <> -void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_1ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_1b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_1ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_1b_1ch_(v0, v1, v2, v3); } template <> 
-void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_1ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_2b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_1ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_2b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_1ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_4b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_1ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_2b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_1ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_2b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_1ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_4b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_2ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_1b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_2ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_1b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_2ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_2b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* 
v2, - __m256i* v3) { +void VectorLoader::pack_2ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_2b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_2ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_4b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_2ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_2b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_2ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_2b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_2ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_4b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_3ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_1b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_3ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_1b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_3ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_2b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_3ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_2b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_3ch(__m256i *v0, 
__m256i *v1, __m256i *v2, + __m256i *v3) { pack4_4b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_3ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_2b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_3ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_2b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { +void VectorLoader::pack_3ch(__m256i *v0, __m256i *v1, __m256i *v2, + __m256i *v3) { pack4_4b_3ch_(v0, v1, v2, v3); } #else template -void VectorLoader::pack4_1b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack4_1b_1ch_(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { *v3 = _mm_slli_si128(*v3, 3); __m128i and_mask = _mm_setr_epi32(255, 0, 0, 0); *v2 = _mm_or_si128(*v3, _mm_slli_si128(_mm_and_si128(and_mask, *v2), 2)); @@ -1208,8 +1210,8 @@ void VectorLoader::pack4_1b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, *v0 = _mm_or_si128(*v1, _mm_and_si128(and_mask, *v0)); } template -void VectorLoader::pack4_2b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack4_2b_1ch_(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { *v3 = _mm_slli_si128(*v3, 6); __m128i and_mask = _mm_setr_epi32(65535, 0, 0, 0); *v2 = _mm_or_si128(*v3, _mm_slli_si128(_mm_and_si128(and_mask, *v2), 4)); @@ -1217,8 +1219,8 @@ void VectorLoader::pack4_2b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, *v0 = _mm_or_si128(*v1, _mm_and_si128(and_mask, *v0)); } template -void VectorLoader::pack4_4b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack4_4b_1ch_(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { *v3 = _mm_slli_si128(*v3, 12); __m128i and_mask = _mm_setr_epi32(-1, 0, 0, 0); *v2 = 
_mm_or_si128(*v3, _mm_slli_si128(_mm_and_si128(and_mask, *v2), 8)); @@ -1226,29 +1228,29 @@ void VectorLoader::pack4_4b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, *v0 = _mm_or_si128(*v1, _mm_and_si128(and_mask, *v0)); } template -void VectorLoader::pack4_1b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack4_1b_2ch_(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { __m128i and_mask = _mm_setr_epi32(65535, 0, 0, 0); *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 2)); *v1 = _mm_or_si128(_mm_and_si128(*v2, and_mask), _mm_slli_si128(*v3, 2)); } template -void VectorLoader::pack4_2b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack4_2b_2ch_(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { __m128i and_mask = _mm_setr_epi32(-1, 0, 0, 0); *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 4)); *v1 = _mm_or_si128(_mm_and_si128(*v2, and_mask), _mm_slli_si128(*v3, 4)); } template -void VectorLoader::pack4_4b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack4_4b_2ch_(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { __m128i and_mask = _mm_setr_epi32(-1, -1, 0, 0); *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 8)); *v1 = _mm_or_si128(_mm_and_si128(*v2, and_mask), _mm_slli_si128(*v3, 8)); } template -void VectorLoader::pack4_1b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack4_1b_3ch_(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { __m128i and_mask = _mm_setr_epi32(16777215, 0, 0, 0); *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 3)); and_mask = _mm_srli_si128(and_mask, 1); @@ -1259,8 +1261,8 @@ void VectorLoader::pack4_1b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, _mm_slli_si128(*v3, 1)); } template -void VectorLoader::pack4_2b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void 
VectorLoader::pack4_2b_3ch_(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { __m128i and_mask = _mm_setr_epi32(-1, 65535, 0, 0); *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 6)); and_mask = _mm_srli_si128(and_mask, 2); @@ -1271,8 +1273,8 @@ void VectorLoader::pack4_2b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, _mm_slli_si128(*v3, 2)); } template -void VectorLoader::pack4_4b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack4_4b_3ch_(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { __m128i and_mask = _mm_setr_epi32(-1, -1, -1, 0); *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 12)); and_mask = _mm_srli_si128(and_mask, 4); @@ -1284,148 +1286,144 @@ void VectorLoader::pack4_4b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, } template <> -void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_1ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_1b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_1ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_1b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_1ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_2b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_1ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_2b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_1ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_4b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { 
+void VectorLoader::pack_1ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_2b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_1ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_2b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_1ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_4b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_2ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_1b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_2ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_1b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_2ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_2b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_2ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_2b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_2ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_4b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_2ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_2b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_2ch(__m128i *v0, __m128i *v1, __m128i 
*v2, + __m128i *v3) { pack4_2b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_2ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_4b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_3ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_1b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_3ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_1b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_3ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_2b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_3ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_2b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_3ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_4b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_3ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_2b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_3ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_2b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { +void VectorLoader::pack_3ch(__m128i *v0, __m128i *v1, __m128i *v2, + __m128i *v3) { pack4_4b_3ch_(v0, v1, v2, v3); } #endif 
#ifdef __AVX2__ -template <> -__m256i VectorLoader::extract_right_1ch(const __m256i left) { +template <> __m256i VectorLoader::extract_right_1ch(const __m256i left) { return extract_right_1b_(left); } -template <> -__m256i VectorLoader::extract_right_1ch(const __m256i left) { +template <> __m256i VectorLoader::extract_right_1ch(const __m256i left) { return extract_right_1b_(left); } template <> __m256i VectorLoader::extract_right_1ch(const __m256i left) { return extract_right_2b_(left); } -template <> -__m256i VectorLoader::extract_right_1ch(const __m256i left) { +template <> __m256i VectorLoader::extract_right_1ch(const __m256i left) { return extract_right_2b_(left); } -template <> -__m256i VectorLoader::extract_right_1ch(const __m256i left) { +template <> __m256i VectorLoader::extract_right_1ch(const __m256i left) { return extract_right_4b_(left); } template <> @@ -1436,29 +1434,24 @@ template <> __m256i VectorLoader::extract_right_1ch(const __m256i left) { return extract_right_2b_(left); } -template <> -__m256i VectorLoader::extract_right_1ch(const __m256i left) { +template <> __m256i VectorLoader::extract_right_1ch(const __m256i left) { return extract_right_4b_(left); } -template <> -__m256i VectorLoader::extract_right_2ch(const __m256i left) { +template <> __m256i VectorLoader::extract_right_2ch(const __m256i left) { return extract_right_2b_(left); } -template <> -__m256i VectorLoader::extract_right_2ch(const __m256i left) { +template <> __m256i VectorLoader::extract_right_2ch(const __m256i left) { return extract_right_2b_(left); } template <> __m256i VectorLoader::extract_right_2ch(const __m256i left) { return extract_right_4b_(left); } -template <> -__m256i VectorLoader::extract_right_2ch(const __m256i left) { +template <> __m256i VectorLoader::extract_right_2ch(const __m256i left) { return extract_right_4b_(left); } -template <> -__m256i VectorLoader::extract_right_2ch(const __m256i left) { +template <> __m256i VectorLoader::extract_right_2ch(const __m256i 
left) { return extract_right_8b_(left); } template <> @@ -1469,29 +1462,24 @@ template <> __m256i VectorLoader::extract_right_2ch(const __m256i left) { return extract_right_4b_(left); } -template <> -__m256i VectorLoader::extract_right_2ch(const __m256i left) { +template <> __m256i VectorLoader::extract_right_2ch(const __m256i left) { return extract_right_8b_(left); } -template <> -__m256i VectorLoader::extract_right_3ch(const __m256i left) { +template <> __m256i VectorLoader::extract_right_3ch(const __m256i left) { return extract_right_3b_(left); } -template <> -__m256i VectorLoader::extract_right_3ch(const __m256i left) { +template <> __m256i VectorLoader::extract_right_3ch(const __m256i left) { return extract_right_3b_(left); } template <> __m256i VectorLoader::extract_right_3ch(const __m256i left) { return extract_right_6b_(left); } -template <> -__m256i VectorLoader::extract_right_3ch(const __m256i left) { +template <> __m256i VectorLoader::extract_right_3ch(const __m256i left) { return extract_right_6b_(left); } -template <> -__m256i VectorLoader::extract_right_3ch(const __m256i left) { +template <> __m256i VectorLoader::extract_right_3ch(const __m256i left) { assert(false); } template <> @@ -1502,29 +1490,24 @@ template <> __m256i VectorLoader::extract_right_3ch(const __m256i left) { return extract_right_6b_(left); } -template <> -__m256i VectorLoader::extract_right_3ch(const __m256i left) { +template <> __m256i VectorLoader::extract_right_3ch(const __m256i left) { assert(false); } -template <> -__m256i VectorLoader::extract_right_4ch(const __m256i left) { +template <> __m256i VectorLoader::extract_right_4ch(const __m256i left) { return extract_right_4b_(left); } -template <> -__m256i VectorLoader::extract_right_4ch(const __m256i left) { +template <> __m256i VectorLoader::extract_right_4ch(const __m256i left) { return extract_right_4b_(left); } template <> __m256i VectorLoader::extract_right_4ch(const __m256i left) { return extract_right_8b_(left); } 
-template <> -__m256i VectorLoader::extract_right_4ch(const __m256i left) { +template <> __m256i VectorLoader::extract_right_4ch(const __m256i left) { return extract_right_8b_(left); } -template <> -__m256i VectorLoader::extract_right_4ch(const __m256i left) { +template <> __m256i VectorLoader::extract_right_4ch(const __m256i left) { assert(false); } template <> @@ -1535,29 +1518,24 @@ template <> __m256i VectorLoader::extract_right_4ch(const __m256i left) { return extract_right_8b_(left); } -template <> -__m256i VectorLoader::extract_right_4ch(const __m256i left) { +template <> __m256i VectorLoader::extract_right_4ch(const __m256i left) { assert(false); } #else -template <> -__m128i VectorLoader::extract_right_1ch(const __m128i left) { +template <> __m128i VectorLoader::extract_right_1ch(const __m128i left) { return extract_right_1b_(left); } -template <> -__m128i VectorLoader::extract_right_1ch(const __m128i left) { +template <> __m128i VectorLoader::extract_right_1ch(const __m128i left) { return extract_right_1b_(left); } template <> __m128i VectorLoader::extract_right_1ch(const __m128i left) { return extract_right_2b_(left); } -template <> -__m128i VectorLoader::extract_right_1ch(const __m128i left) { +template <> __m128i VectorLoader::extract_right_1ch(const __m128i left) { return extract_right_2b_(left); } -template <> -__m128i VectorLoader::extract_right_1ch(const __m128i left) { +template <> __m128i VectorLoader::extract_right_1ch(const __m128i left) { return extract_right_4b_(left); } template <> @@ -1568,29 +1546,24 @@ template <> __m128i VectorLoader::extract_right_1ch(const __m128i left) { return extract_right_2b_(left); } -template <> -__m128i VectorLoader::extract_right_1ch(const __m128i left) { +template <> __m128i VectorLoader::extract_right_1ch(const __m128i left) { return extract_right_4b_(left); } -template <> -__m128i VectorLoader::extract_right_2ch(const __m128i left) { +template <> __m128i VectorLoader::extract_right_2ch(const __m128i left) { 
return extract_right_2b_(left); } -template <> -__m128i VectorLoader::extract_right_2ch(const __m128i left) { +template <> __m128i VectorLoader::extract_right_2ch(const __m128i left) { return extract_right_2b_(left); } template <> __m128i VectorLoader::extract_right_2ch(const __m128i left) { return extract_right_4b_(left); } -template <> -__m128i VectorLoader::extract_right_2ch(const __m128i left) { +template <> __m128i VectorLoader::extract_right_2ch(const __m128i left) { return extract_right_4b_(left); } -template <> -__m128i VectorLoader::extract_right_2ch(const __m128i left) { +template <> __m128i VectorLoader::extract_right_2ch(const __m128i left) { return extract_right_8b_(left); } template <> @@ -1601,29 +1574,24 @@ template <> __m128i VectorLoader::extract_right_2ch(const __m128i left) { return extract_right_4b_(left); } -template <> -__m128i VectorLoader::extract_right_2ch(const __m128i left) { +template <> __m128i VectorLoader::extract_right_2ch(const __m128i left) { return extract_right_8b_(left); } -template <> -__m128i VectorLoader::extract_right_3ch(const __m128i left) { +template <> __m128i VectorLoader::extract_right_3ch(const __m128i left) { return extract_right_3b_(left); } -template <> -__m128i VectorLoader::extract_right_3ch(const __m128i left) { +template <> __m128i VectorLoader::extract_right_3ch(const __m128i left) { return extract_right_3b_(left); } template <> __m128i VectorLoader::extract_right_3ch(const __m128i left) { return extract_right_6b_(left); } -template <> -__m128i VectorLoader::extract_right_3ch(const __m128i left) { +template <> __m128i VectorLoader::extract_right_3ch(const __m128i left) { return extract_right_6b_(left); } -template <> -__m128i VectorLoader::extract_right_3ch(const __m128i left) { +template <> __m128i VectorLoader::extract_right_3ch(const __m128i left) { assert(false); } template <> @@ -1634,29 +1602,24 @@ template <> __m128i VectorLoader::extract_right_3ch(const __m128i left) { return extract_right_6b_(left); 
} -template <> -__m128i VectorLoader::extract_right_3ch(const __m128i left) { +template <> __m128i VectorLoader::extract_right_3ch(const __m128i left) { assert(false); } -template <> -__m128i VectorLoader::extract_right_4ch(const __m128i left) { +template <> __m128i VectorLoader::extract_right_4ch(const __m128i left) { return extract_right_4b_(left); } -template <> -__m128i VectorLoader::extract_right_4ch(const __m128i left) { +template <> __m128i VectorLoader::extract_right_4ch(const __m128i left) { return extract_right_4b_(left); } template <> __m128i VectorLoader::extract_right_4ch(const __m128i left) { return extract_right_8b_(left); } -template <> -__m128i VectorLoader::extract_right_4ch(const __m128i left) { +template <> __m128i VectorLoader::extract_right_4ch(const __m128i left) { return extract_right_8b_(left); } -template <> -__m128i VectorLoader::extract_right_4ch(const __m128i left) { +template <> __m128i VectorLoader::extract_right_4ch(const __m128i left) { assert(false); } template <> @@ -1667,53 +1630,45 @@ template <> __m128i VectorLoader::extract_right_4ch(const __m128i left) { return extract_right_8b_(left); } -template <> -__m128i VectorLoader::extract_right_4ch(const __m128i left) { +template <> __m128i VectorLoader::extract_right_4ch(const __m128i left) { assert(false); } #endif #ifdef __AVX2__ -template <> -__m256 VectorLoader::to_fp32(__m256i raw) { +template <> __m256 VectorLoader::to_fp32(__m256i raw) { raw = _mm256_insertf128_si256( _mm256_castsi128_si256(_mm_cvtepu8_epi32(_mm256_castsi256_si128(raw))), _mm_cvtepu8_epi32(_mm256_extractf128_si256(raw, 1)), 1); return _mm256_cvtepi32_ps(raw); } -template <> -__m256 VectorLoader::to_fp32(__m256i raw) { +template <> __m256 VectorLoader::to_fp32(__m256i raw) { raw = _mm256_insertf128_si256( _mm256_castsi128_si256(_mm_cvtepi8_epi32(_mm256_castsi256_si128(raw))), _mm_cvtepi8_epi32(_mm256_extractf128_si256(raw, 1)), 1); return _mm256_cvtepi32_ps(raw); } -template <> -__m256 
VectorLoader::to_fp32(__m256i raw) { +template <> __m256 VectorLoader::to_fp32(__m256i raw) { raw = _mm256_insertf128_si256( _mm256_castsi128_si256(_mm_cvtepu16_epi32(_mm256_castsi256_si128(raw))), _mm_cvtepu16_epi32(_mm256_extractf128_si256(raw, 1)), 1); return _mm256_cvtepi32_ps(raw); } -template <> -__m256 VectorLoader::to_fp32(__m256i raw) { +template <> __m256 VectorLoader::to_fp32(__m256i raw) { raw = _mm256_insertf128_si256( _mm256_castsi128_si256(_mm_cvtepi16_epi32(_mm256_castsi256_si128(raw))), _mm_cvtepi16_epi32(_mm256_extractf128_si256(raw, 1)), 1); return _mm256_cvtepi32_ps(raw); } -template <> -__m256 VectorLoader::to_fp32(__m256i raw) { +template <> __m256 VectorLoader::to_fp32(__m256i raw) { return _mm256_cvtepi32_ps(raw); } -template <> -__m256 VectorLoader::to_fp32(__m256i raw) { +template <> __m256 VectorLoader::to_fp32(__m256i raw) { return _mm256_insertf128_ps( _mm256_castps128_ps256(_mm_cvtph_ps(_mm256_castsi256_si128(raw))), _mm_cvtph_ps(_mm256_extractf128_si256(raw, 1)), 1); } -template <> -__m256 VectorLoader::to_fp32(__m256i raw) { +template <> __m256 VectorLoader::to_fp32(__m256i raw) { // bfloat16 is essentially fp32 with mantissa truncated from 23 to 7 bits. // can convert with << 16, which we fuse with initial shuffle into epi32 // positions. 
@@ -1722,33 +1677,26 @@ __m256 VectorLoader::to_fp32(__m256i raw) { -128, -128, 0, 1, -128, -128, 2, 3, -128, -128, 4, 5, -128, -128, 6, 7); return _mm256_castsi256_ps(_mm256_shuffle_epi8(raw, shuf_hi32)); } -template <> -__m256 VectorLoader::to_fp32(__m256i raw) { +template <> __m256 VectorLoader::to_fp32(__m256i raw) { return _mm256_castsi256_ps(raw); } #else -template <> -__m128 VectorLoader::to_fp32(__m128i raw) { +template <> __m128 VectorLoader::to_fp32(__m128i raw) { return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(raw)); } -template <> -__m128 VectorLoader::to_fp32(__m128i raw) { +template <> __m128 VectorLoader::to_fp32(__m128i raw) { return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(raw)); } -template <> -__m128 VectorLoader::to_fp32(__m128i raw) { +template <> __m128 VectorLoader::to_fp32(__m128i raw) { return _mm_cvtepi32_ps(_mm_cvtepu16_epi32(raw)); } -template <> -__m128 VectorLoader::to_fp32(__m128i raw) { +template <> __m128 VectorLoader::to_fp32(__m128i raw) { return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(raw)); } -template <> -__m128 VectorLoader::to_fp32(__m128i raw) { +template <> __m128 VectorLoader::to_fp32(__m128i raw) { return _mm_cvtepi32_ps(raw); } -template <> -__m128 VectorLoader::to_fp32(__m128i raw) { +template <> __m128 VectorLoader::to_fp32(__m128i raw) { #ifdef __F16C__ return _mm_cvtph_ps(raw); #else @@ -1813,8 +1761,7 @@ __m128 VectorLoader::to_fp32(__m128i raw) { return _mm_castsi128_ps(fp32_val); #endif } -template <> -__m128 VectorLoader::to_fp32(__m128i raw) { +template <> __m128 VectorLoader::to_fp32(__m128i raw) { // bfloat16 is essentially fp32 with mantissa truncated from 23 to 7 bits. // can convert with << 16, which we fuse with initial shuffle into epi32 // positions. 
@@ -1822,8 +1769,7 @@ __m128 VectorLoader::to_fp32(__m128i raw) { -128, 4, 5, -128, -128, 6, 7); return _mm_castsi128_ps(_mm_shuffle_epi8(raw, shuf_hi32)); } -template <> -__m128 VectorLoader::to_fp32(__m128i raw) { +template <> __m128 VectorLoader::to_fp32(__m128i raw) { return _mm_castsi128_ps(raw); } #endif @@ -1882,25 +1828,25 @@ __m128i VectorLoader::extract_right_8b_(const __m128i left) { #ifdef __AVX2__ template -void VectorLoader::load1_1ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m256* left0, __m256* right0) { +void VectorLoader::load1_1ch(const T *lower_ptr, const T *upper_ptr, + int offset0, const __m128i *shuffle_masks, + __m256 *left0, __m256 *right0) { __m256i raw = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); *left0 = to_fp32( _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[0]))); *right0 = to_fp32( _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[1]))); } template -void VectorLoader::load1_2ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m256* left0, __m256* left1, __m256* right0, - __m256* right1) { +void VectorLoader::load1_2ch(const T *lower_ptr, const T *upper_ptr, + int offset0, const __m128i *shuffle_masks, + __m256 *left0, __m256 *left1, __m256 *right0, + __m256 *right1) { __m256i raw = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); *left0 = to_fp32( _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[0]))); *left1 = 
to_fp32( @@ -1911,14 +1857,14 @@ void VectorLoader::load1_2ch(const T* lower_ptr, const T* upper_ptr, _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[3]))); } template -void VectorLoader::load1_3ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m256* left0, __m256* left1, __m256* left2, - __m256* right0, __m256* right1, - __m256* right2) { +void VectorLoader::load1_3ch(const T *lower_ptr, const T *upper_ptr, + int offset0, const __m128i *shuffle_masks, + __m256 *left0, __m256 *left1, __m256 *left2, + __m256 *right0, __m256 *right1, + __m256 *right2) { __m256i raw = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); *left0 = to_fp32( _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[0]))); *left1 = to_fp32( @@ -1933,14 +1879,14 @@ void VectorLoader::load1_3ch(const T* lower_ptr, const T* upper_ptr, _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[5]))); } template -void VectorLoader::load1_4ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m256* left0, __m256* left1, __m256* left2, - __m256* left3, __m256* right0, __m256* right1, - __m256* right2, __m256* right3) { +void VectorLoader::load1_4ch(const T *lower_ptr, const T *upper_ptr, + int offset0, const __m128i *shuffle_masks, + __m256 *left0, __m256 *left1, __m256 *left2, + __m256 *left3, __m256 *right0, __m256 *right1, + __m256 *right2, __m256 *right3) { __m256i raw = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i *)(upper_ptr + 
offset0)), 1); *left0 = to_fp32( _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[0]))); *left1 = to_fp32( @@ -1959,32 +1905,32 @@ void VectorLoader::load1_4ch(const T* lower_ptr, const T* upper_ptr, _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[7]))); } template -void VectorLoader::load2_1ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m256* left0, __m256* right0) { +void VectorLoader::load2_1ch(const T *lower_ptr, const T *upper_ptr, + int offset0, const __m128i *shuffle_masks, + __m256 *left0, __m256 *right0) { __m256i raw1 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); __m256i raw2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 1)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 1))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 1)), 1); __m256i mask = _mm256_broadcastsi128_si256(shuffle_masks[0]); *left0 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); *right0 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); } template -void VectorLoader::load2_2ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m256* left0, __m256* left1, __m256* right0, - __m256* right1) { +void VectorLoader::load2_2ch(const T *lower_ptr, const T *upper_ptr, + int offset0, const __m128i *shuffle_masks, + __m256 *left0, __m256 *left1, __m256 *right0, + __m256 *right1) { __m256i raw1 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), 
+ _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); __m256i raw2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 2)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 2))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 2)), 1); __m256i mask = _mm256_broadcastsi128_si256(shuffle_masks[0]); *left0 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); *right0 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); @@ -1993,18 +1939,18 @@ void VectorLoader::load2_2ch(const T* lower_ptr, const T* upper_ptr, *right1 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); } template -void VectorLoader::load2_3ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m256* left0, __m256* left1, __m256* left2, - __m256* right0, __m256* right1, - __m256* right2) { +void VectorLoader::load2_3ch(const T *lower_ptr, const T *upper_ptr, + int offset0, const __m128i *shuffle_masks, + __m256 *left0, __m256 *left1, __m256 *left2, + __m256 *right0, __m256 *right1, + __m256 *right2) { __m256i raw1 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); __m256i raw2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 3)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 3))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 3)), 1); __m256i mask = _mm256_broadcastsi128_si256(shuffle_masks[0]); *left0 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); *right0 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); @@ -2016,18 +1962,18 @@ void VectorLoader::load2_3ch(const T* lower_ptr, const T* upper_ptr, *right2 = 
to_fp32(_mm256_shuffle_epi8(raw2, mask)); } template -void VectorLoader::load2_4ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m256* left0, __m256* left1, __m256* left2, - __m256* left3, __m256* right0, __m256* right1, - __m256* right2, __m256* right3) { +void VectorLoader::load2_4ch(const T *lower_ptr, const T *upper_ptr, + int offset0, const __m128i *shuffle_masks, + __m256 *left0, __m256 *left1, __m256 *left2, + __m256 *left3, __m256 *right0, __m256 *right1, + __m256 *right2, __m256 *right3) { __m256i raw1 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); __m256i raw2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 4))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 4)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 4))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 4)), 1); __m256i mask = _mm256_broadcastsi128_si256(shuffle_masks[0]); *left0 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); *right0 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); @@ -2042,12 +1988,12 @@ void VectorLoader::load2_4ch(const T* lower_ptr, const T* upper_ptr, *right3 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); } template -void VectorLoader::load4_1ch(const T* lower_ptr, const T* upper_ptr, +void VectorLoader::load4_1ch(const T *lower_ptr, const T *upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m256* left0, __m256* right0) { + int offset3, __m256 *left0, __m256 *right0) { __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), + 
_mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); __m256i r0 = extract_right_1ch(l0); __m256i l1, r1; if (offset1 == offset0) { @@ -2056,8 +2002,8 @@ void VectorLoader::load4_1ch(const T* lower_ptr, const T* upper_ptr, } else { l1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset1)), 1); r1 = extract_right_1ch(l1); } __m256i l2, r2; @@ -2067,8 +2013,8 @@ void VectorLoader::load4_1ch(const T* lower_ptr, const T* upper_ptr, } else { l2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset2)), 1); r2 = extract_right_1ch(l2); } __m256i l3, r3; @@ -2078,8 +2024,8 @@ void VectorLoader::load4_1ch(const T* lower_ptr, const T* upper_ptr, } else { l3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset3)), 1); r3 = extract_right_1ch(l3); } pack_1ch(&l0, &l1, &l2, &l3); @@ -2088,13 +2034,13 @@ void VectorLoader::load4_1ch(const T* lower_ptr, const T* upper_ptr, *right0 = to_fp32(r0); } template -void VectorLoader::load4_2ch(const T* lower_ptr, const T* upper_ptr, +void VectorLoader::load4_2ch(const T *lower_ptr, const T *upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m256* left0, __m256* left1, - __m256* right0, __m256* right1) { + int offset3, __m256 *left0, __m256 *left1, + __m256 *right0, __m256 *right1) { __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - 
_mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); __m256i r0 = extract_right_2ch(l0); __m256i l1, r1; if (offset1 == offset0) { @@ -2103,8 +2049,8 @@ void VectorLoader::load4_2ch(const T* lower_ptr, const T* upper_ptr, } else { l1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset1)), 1); r1 = extract_right_2ch(l1); } __m256i l2, r2; @@ -2114,8 +2060,8 @@ void VectorLoader::load4_2ch(const T* lower_ptr, const T* upper_ptr, } else { l2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset2)), 1); r2 = extract_right_2ch(l2); } __m256i l3, r3; @@ -2125,8 +2071,8 @@ void VectorLoader::load4_2ch(const T* lower_ptr, const T* upper_ptr, } else { l3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset3)), 1); r3 = extract_right_2ch(l3); } pack_2ch(&l0, &l1, &l2, &l3); @@ -2137,14 +2083,14 @@ void VectorLoader::load4_2ch(const T* lower_ptr, const T* upper_ptr, *right1 = to_fp32(r1); } template -void VectorLoader::load4_3ch(const T* lower_ptr, const T* upper_ptr, +void VectorLoader::load4_3ch(const T *lower_ptr, const T *upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m256* left0, __m256* left1, - __m256* left2, __m256* right0, __m256* right1, - __m256* right2) { + int offset3, __m256 *left0, __m256 
*left1, + __m256 *left2, __m256 *right0, __m256 *right1, + __m256 *right2) { __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); __m256i r0 = extract_right_3ch(l0); __m256i l1, r1; if (offset1 == offset0) { @@ -2153,8 +2099,8 @@ void VectorLoader::load4_3ch(const T* lower_ptr, const T* upper_ptr, } else { l1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset1)), 1); r1 = extract_right_3ch(l1); } __m256i l2, r2; @@ -2164,8 +2110,8 @@ void VectorLoader::load4_3ch(const T* lower_ptr, const T* upper_ptr, } else { l2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset2)), 1); r2 = extract_right_3ch(l2); } __m256i l3, r3; @@ -2175,8 +2121,8 @@ void VectorLoader::load4_3ch(const T* lower_ptr, const T* upper_ptr, } else { l3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset3)), 1); r3 = extract_right_3ch(l3); } pack_3ch(&l0, &l1, &l2, &l3); @@ -2189,15 +2135,15 @@ void VectorLoader::load4_3ch(const T* lower_ptr, const T* upper_ptr, *right2 = to_fp32(r2); } template -void VectorLoader::load4_4ch(const T* lower_ptr, const T* upper_ptr, +void VectorLoader::load4_4ch(const T *lower_ptr, const T *upper_ptr, 
int offset0, int offset1, int offset2, - int offset3, __m256* left0, __m256* left1, - __m256* left2, __m256* left3, __m256* right0, - __m256* right1, __m256* right2, - __m256* right3) { + int offset3, __m256 *left0, __m256 *left1, + __m256 *left2, __m256 *left3, __m256 *right0, + __m256 *right1, __m256 *right2, + __m256 *right3) { __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); __m256i r0 = extract_right_4ch(l0); __m256i l1, r1; if (offset1 == offset0) { @@ -2206,8 +2152,8 @@ void VectorLoader::load4_4ch(const T* lower_ptr, const T* upper_ptr, } else { l1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset1)), 1); r1 = extract_right_4ch(l1); } __m256i l2, r2; @@ -2217,8 +2163,8 @@ void VectorLoader::load4_4ch(const T* lower_ptr, const T* upper_ptr, } else { l2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset2)), 1); r2 = extract_right_4ch(l2); } __m256i l3, r3; @@ -2228,8 +2174,8 @@ void VectorLoader::load4_4ch(const T* lower_ptr, const T* upper_ptr, } else { l3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset3)), 1); r3 = extract_right_4ch(l3); } *left0 = to_fp32(l0); @@ -2242,16 
+2188,16 @@ void VectorLoader::load4_4ch(const T* lower_ptr, const T* upper_ptr, *right3 = to_fp32(r3); } template -void VectorLoader::load8_1ch(const T* lower_ptr, const T* upper_ptr, +void VectorLoader::load8_1ch(const T *lower_ptr, const T *upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m256* left0, __m256* right0) { + int offset3, __m256 *left0, __m256 *right0) { __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); __m256i r0 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 1)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 1))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 1)), 1); __m256i l1, r1; if (offset1 == offset0) { l1 = l0; @@ -2259,12 +2205,12 @@ void VectorLoader::load8_1ch(const T* lower_ptr, const T* upper_ptr, } else { l1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset1)), 1); r1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 1)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset1 + 1))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset1 + 1)), 1); } __m256i l2, r2; if (offset2 == offset1) { @@ -2273,12 +2219,12 @@ void VectorLoader::load8_1ch(const T* lower_ptr, const T* upper_ptr, } else { l2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), - 
_mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset2)), 1); r2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 1)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset2 + 1))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset2 + 1)), 1); } __m256i l3, r3; if (offset3 == offset2) { @@ -2287,12 +2233,12 @@ void VectorLoader::load8_1ch(const T* lower_ptr, const T* upper_ptr, } else { l3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset3)), 1); r3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 1)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset3 + 1))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset3 + 1)), 1); } pack_1ch(&l0, &l1, &l2, &l3); *left0 = to_fp32(l0); @@ -2300,17 +2246,17 @@ void VectorLoader::load8_1ch(const T* lower_ptr, const T* upper_ptr, *right0 = to_fp32(r0); } template -void VectorLoader::load8_2ch(const T* lower_ptr, const T* upper_ptr, +void VectorLoader::load8_2ch(const T *lower_ptr, const T *upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m256* left0, __m256* left1, - __m256* right0, __m256* right1) { + int offset3, __m256 *left0, __m256 *left1, + __m256 *right0, __m256 *right1) { __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); __m256i 
r0 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 2)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 2))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 2)), 1); __m256i l1, r1; if (offset1 == offset0) { l1 = l0; @@ -2318,12 +2264,12 @@ void VectorLoader::load8_2ch(const T* lower_ptr, const T* upper_ptr, } else { l1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset1)), 1); r1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 2)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset1 + 2))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset1 + 2)), 1); } __m256i l2, r2; if (offset2 == offset1) { @@ -2332,12 +2278,12 @@ void VectorLoader::load8_2ch(const T* lower_ptr, const T* upper_ptr, } else { l2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset2)), 1); r2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 2)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset2 + 2))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset2 + 2)), 1); } __m256i l3, r3; if (offset3 == offset2) { @@ -2346,12 +2292,12 @@ void VectorLoader::load8_2ch(const T* lower_ptr, const T* upper_ptr, } else { l3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), - 
_mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset3)), 1); r3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 2)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset3 + 2))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset3 + 2)), 1); } pack_2ch(&l0, &l1, &l2, &l3); *left0 = to_fp32(l0); @@ -2361,18 +2307,18 @@ void VectorLoader::load8_2ch(const T* lower_ptr, const T* upper_ptr, *right1 = to_fp32(r1); } template -void VectorLoader::load8_3ch(const T* lower_ptr, const T* upper_ptr, +void VectorLoader::load8_3ch(const T *lower_ptr, const T *upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m256* left0, __m256* left1, - __m256* left2, __m256* right0, __m256* right1, - __m256* right2) { + int offset3, __m256 *left0, __m256 *left1, + __m256 *left2, __m256 *right0, __m256 *right1, + __m256 *right2) { __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); __m256i r0 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 3)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 3))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 3)), 1); __m256i l1, r1; if (offset1 == offset0) { l1 = l0; @@ -2380,12 +2326,12 @@ void VectorLoader::load8_3ch(const T* lower_ptr, const T* upper_ptr, } else { l1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); + _mm_loadu_si128((__m128i 
*)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset1)), 1); r1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 3)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset1 + 3))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset1 + 3)), 1); } __m256i l2, r2; if (offset2 == offset1) { @@ -2394,12 +2340,12 @@ void VectorLoader::load8_3ch(const T* lower_ptr, const T* upper_ptr, } else { l2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset2)), 1); r2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 3)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset2 + 3))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset2 + 3)), 1); } __m256i l3, r3; if (offset3 == offset2) { @@ -2408,12 +2354,12 @@ void VectorLoader::load8_3ch(const T* lower_ptr, const T* upper_ptr, } else { l3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset3)), 1); r3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 3)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset3 + 3))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset3 + 3)), 1); } pack_3ch(&l0, &l1, &l2, &l3); *left0 = to_fp32(l0); @@ -2425,19 +2371,19 @@ void VectorLoader::load8_3ch(const T* lower_ptr, const T* upper_ptr, *right2 = to_fp32(r2); } template -void 
VectorLoader::load8_4ch(const T* lower_ptr, const T* upper_ptr, +void VectorLoader::load8_4ch(const T *lower_ptr, const T *upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m256* left0, __m256* left1, - __m256* left2, __m256* left3, __m256* right0, - __m256* right1, __m256* right2, - __m256* right3) { + int offset3, __m256 *left0, __m256 *left1, + __m256 *left2, __m256 *left3, __m256 *right0, + __m256 *right1, __m256 *right2, + __m256 *right3) { __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); __m256i r0 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 4))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 4)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 4))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 4)), 1); __m256i l1, r1; if (offset1 == offset0) { l1 = l0; @@ -2445,12 +2391,12 @@ void VectorLoader::load8_4ch(const T* lower_ptr, const T* upper_ptr, } else { l1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset1)), 1); r1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 4))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 4)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset1 + 4))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset1 + 4)), 1); } __m256i l2, r2; if (offset2 == offset1) { @@ -2459,12 +2405,12 @@ void VectorLoader::load8_4ch(const T* lower_ptr, const T* upper_ptr, } else { l2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - 
_mm_loadu_si128((__m128i*)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset2)), 1); r2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 4))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 4)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset2 + 4))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset2 + 4)), 1); } __m256i l3, r3; if (offset3 == offset2) { @@ -2473,12 +2419,12 @@ void VectorLoader::load8_4ch(const T* lower_ptr, const T* upper_ptr, } else { l3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset3)), 1); r3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 4))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 4)), 1); + _mm_loadu_si128((__m128i *)(lower_ptr + offset3 + 4))), + _mm_loadu_si128((__m128i *)(upper_ptr + offset3 + 4)), 1); } *left0 = to_fp32(l0); *left1 = to_fp32(l1); @@ -2491,49 +2437,49 @@ void VectorLoader::load8_4ch(const T* lower_ptr, const T* upper_ptr, } #else template -void VectorLoader::load1_1ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m128* tl0, __m128* bl0, __m128* tr0, - __m128* br0) { - __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); +void VectorLoader::load1_1ch(const T *lower_ptr, const T *upper_ptr, + int offset0, const __m128i *shuffle_masks, + __m128 *tl0, __m128 *bl0, __m128 *tr0, + __m128 *br0) { + __m128i raw = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - raw = 
_mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); } template -void VectorLoader::load1_2ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m128* tl0, __m128* tl1, __m128* bl0, - __m128* bl1, __m128* tr0, __m128* tr1, - __m128* br0, __m128* br1) { - __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); +void VectorLoader::load1_2ch(const T *lower_ptr, const T *upper_ptr, + int offset0, const __m128i *shuffle_masks, + __m128 *tl0, __m128 *tl1, __m128 *bl0, + __m128 *bl1, __m128 *tr0, __m128 *tr1, + __m128 *br0, __m128 *br1) { + __m128i raw = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); *tr1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); *br1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); } template -void VectorLoader::load1_3ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m128* tl0, __m128* tl1, __m128* tl2, - __m128* bl0, __m128* bl1, __m128* bl2, - __m128* tr0, __m128* tr1, __m128* tr2, - __m128* br0, __m128* br1, __m128* br2) { - __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); +void VectorLoader::load1_3ch(const T *lower_ptr, const T *upper_ptr, + int offset0, const __m128i *shuffle_masks, + __m128 *tl0, __m128 *tl1, __m128 *tl2, + __m128 *bl0, __m128 *bl1, __m128 *bl2, + __m128 *tr0, __m128 *tr1, __m128 *tr2, + __m128 
*br0, __m128 *br1, __m128 *br2) { + __m128i raw = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *tl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); *tr1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[4])); *tr2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[5])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *bl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); @@ -2542,15 +2488,15 @@ void VectorLoader::load1_3ch(const T* lower_ptr, const T* upper_ptr, *br2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[5])); } template -void VectorLoader::load1_4ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m128* tl0, __m128* tl1, __m128* tl2, - __m128* tl3, __m128* bl0, __m128* bl1, - __m128* bl2, __m128* bl3, __m128* tr0, - __m128* tr1, __m128* tr2, __m128* tr3, - __m128* br0, __m128* br1, __m128* br2, - __m128* br3) { - __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); +void VectorLoader::load1_4ch(const T *lower_ptr, const T *upper_ptr, + int offset0, const __m128i *shuffle_masks, + __m128 *tl0, __m128 *tl1, __m128 *tl2, + __m128 *tl3, __m128 *bl0, __m128 *bl1, + __m128 *bl2, __m128 *bl3, __m128 *tr0, + __m128 *tr1, __m128 *tr2, __m128 *tr3, + __m128 *br0, __m128 *br1, __m128 *br2, + __m128 *br3) { + __m128i raw = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *tl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); @@ -2559,7 +2505,7 @@ void VectorLoader::load1_4ch(const T* lower_ptr, const T* upper_ptr, *tr1 = to_fp32(_mm_shuffle_epi8(raw, 
shuffle_masks[5])); *tr2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[6])); *tr3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[7])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *bl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); @@ -2570,100 +2516,100 @@ void VectorLoader::load1_4ch(const T* lower_ptr, const T* upper_ptr, *br3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[7])); } template -void VectorLoader::load2_1ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m128* tl0, __m128* bl0, __m128* tr0, - __m128* br0) { - __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); +void VectorLoader::load2_1ch(const T *lower_ptr, const T *upper_ptr, + int offset0, const __m128i *shuffle_masks, + __m128 *tl0, __m128 *bl0, __m128 *tr0, + __m128 *br0) { + __m128i raw = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 1)); + raw = _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 1)); *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 1)); + raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 1)); *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); } template -void VectorLoader::load2_2ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m128* tl0, __m128* tl1, __m128* bl0, - __m128* bl1, __m128* tr0, __m128* tr1, - __m128* br0, __m128* br1) { - __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); +void VectorLoader::load2_2ch(const 
T *lower_ptr, const T *upper_ptr, + int offset0, const __m128i *shuffle_masks, + __m128 *tl0, __m128 *tl1, __m128 *bl0, + __m128 *bl1, __m128 *tr0, __m128 *tr1, + __m128 *br0, __m128 *br1) { + __m128i raw = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 2)); + raw = _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 2)); *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *tr1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 2)); + raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 2)); *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *br1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); } template -void VectorLoader::load2_3ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m128* tl0, __m128* tl1, __m128* tl2, - __m128* bl0, __m128* bl1, __m128* bl2, - __m128* tr0, __m128* tr1, __m128* tr2, - __m128* br0, __m128* br1, __m128* br2) { - __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); +void VectorLoader::load2_3ch(const T *lower_ptr, const T *upper_ptr, + int offset0, const __m128i *shuffle_masks, + __m128 *tl0, __m128 *tl1, __m128 *tl2, + __m128 *bl0, __m128 *bl1, __m128 *bl2, + __m128 *tr0, __m128 *tr1, __m128 *tr2, + __m128 *br0, __m128 *br1, __m128 *br2) { + __m128i raw = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *tl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); - raw = 
_mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 3)); + raw = _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 3)); *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *tr1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *tr2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *bl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 3)); + raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 3)); *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *br1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *br2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); } template -void VectorLoader::load2_4ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m128* tl0, __m128* tl1, __m128* tl2, - __m128* tl3, __m128* bl0, __m128* bl1, - __m128* bl2, __m128* bl3, __m128* tr0, - __m128* tr1, __m128* tr2, __m128* tr3, - __m128* br0, __m128* br1, __m128* br2, - __m128* br3) { - __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); +void VectorLoader::load2_4ch(const T *lower_ptr, const T *upper_ptr, + int offset0, const __m128i *shuffle_masks, + __m128 *tl0, __m128 *tl1, __m128 *tl2, + __m128 *tl3, __m128 *bl0, __m128 *bl1, + __m128 *bl2, __m128 *bl3, __m128 *tr0, + __m128 *tr1, __m128 *tr2, __m128 *tr3, + __m128 *br0, __m128 *br1, __m128 *br2, + __m128 *br3) { + __m128i raw = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *tl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); *tl3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); - raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 4)); + raw = 
_mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 4)); *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *tr1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *tr2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); *tr3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *bl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); *bl3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 4)); + raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 4)); *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *br1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *br2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); *br3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); } template -void VectorLoader::load4_1ch(const T* lower_ptr, const T* upper_ptr, +void VectorLoader::load4_1ch(const T *lower_ptr, const T *upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m128* tl0, __m128* bl0, - __m128* tr0, __m128* br0) { - __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + int offset3, __m128 *tl0, __m128 *bl0, + __m128 *tr0, __m128 *br0) { + __m128i itl0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); __m128i itr0 = extract_right_1ch(itl0); - __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + __m128i ibl0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); __m128i ibr0 = extract_right_1ch(ibl0); __m128i itl1, itr1; __m128i ibl1, ibr1; @@ -2673,9 +2619,9 @@ void VectorLoader::load4_1ch(const T* lower_ptr, const T* upper_ptr, ibl1 = ibl0; ibr1 = ibr0; } else { - itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); + itl1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1)); itr1 = extract_right_1ch(itl1); - ibl1 = 
_mm_loadu_si128((__m128i*)(upper_ptr + offset1)); + ibl1 = _mm_loadu_si128((__m128i *)(upper_ptr + offset1)); ibr1 = extract_right_1ch(ibl1); } __m128i itl2, itr2; @@ -2686,9 +2632,9 @@ void VectorLoader::load4_1ch(const T* lower_ptr, const T* upper_ptr, ibl2 = ibl1; ibr2 = ibr1; } else { - itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); + itl2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2)); itr2 = extract_right_1ch(itl2); - ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); + ibl2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2)); ibr2 = extract_right_1ch(ibl2); } __m128i itl3, itr3; @@ -2699,9 +2645,9 @@ void VectorLoader::load4_1ch(const T* lower_ptr, const T* upper_ptr, ibl3 = ibl2; ibr3 = ibr2; } else { - itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); + itl3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3)); itr3 = extract_right_1ch(itl3); - ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); + ibl3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3)); ibr3 = extract_right_1ch(ibl3); } pack_1ch(&itl0, &itl1, &itl2, &itl3); @@ -2714,14 +2660,14 @@ void VectorLoader::load4_1ch(const T* lower_ptr, const T* upper_ptr, *br0 = to_fp32(ibr0); } template -void VectorLoader::load4_2ch(const T* lower_ptr, const T* upper_ptr, +void VectorLoader::load4_2ch(const T *lower_ptr, const T *upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m128* tl0, __m128* tl1, - __m128* bl0, __m128* bl1, __m128* tr0, - __m128* tr1, __m128* br0, __m128* br1) { - __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + int offset3, __m128 *tl0, __m128 *tl1, + __m128 *bl0, __m128 *bl1, __m128 *tr0, + __m128 *tr1, __m128 *br0, __m128 *br1) { + __m128i itl0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); __m128i itr0 = extract_right_2ch(itl0); - __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + __m128i ibl0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); __m128i ibr0 = extract_right_2ch(ibl0); __m128i 
itl1, itr1; __m128i ibl1, ibr1; @@ -2731,9 +2677,9 @@ void VectorLoader::load4_2ch(const T* lower_ptr, const T* upper_ptr, ibl1 = ibl0; ibr1 = ibr0; } else { - itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); + itl1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1)); itr1 = extract_right_2ch(itl1); - ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); + ibl1 = _mm_loadu_si128((__m128i *)(upper_ptr + offset1)); ibr1 = extract_right_2ch(ibl1); } __m128i itl2, itr2; @@ -2744,9 +2690,9 @@ void VectorLoader::load4_2ch(const T* lower_ptr, const T* upper_ptr, ibl2 = ibl1; ibr2 = ibr1; } else { - itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); + itl2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2)); itr2 = extract_right_2ch(itl2); - ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); + ibl2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2)); ibr2 = extract_right_2ch(ibl2); } __m128i itl3, itr3; @@ -2757,9 +2703,9 @@ void VectorLoader::load4_2ch(const T* lower_ptr, const T* upper_ptr, ibl3 = ibl2; ibr3 = ibr2; } else { - itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); + itl3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3)); itr3 = extract_right_2ch(itl3); - ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); + ibl3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3)); ibr3 = extract_right_2ch(ibl3); } pack_2ch(&itl0, &itl1, &itl2, &itl3); @@ -2776,16 +2722,16 @@ void VectorLoader::load4_2ch(const T* lower_ptr, const T* upper_ptr, *br1 = to_fp32(ibr1); } template -void VectorLoader::load4_3ch(const T* lower_ptr, const T* upper_ptr, +void VectorLoader::load4_3ch(const T *lower_ptr, const T *upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m128* tl0, __m128* tl1, - __m128* tl2, __m128* bl0, __m128* bl1, - __m128* bl2, __m128* tr0, __m128* tr1, - __m128* tr2, __m128* br0, __m128* br1, - __m128* br2) { - __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + int offset3, __m128 *tl0, __m128 *tl1, + 
__m128 *tl2, __m128 *bl0, __m128 *bl1, + __m128 *bl2, __m128 *tr0, __m128 *tr1, + __m128 *tr2, __m128 *br0, __m128 *br1, + __m128 *br2) { + __m128i itl0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); __m128i itr0 = extract_right_3ch(itl0); - __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + __m128i ibl0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); __m128i ibr0 = extract_right_3ch(ibl0); __m128i itl1, itr1; __m128i ibl1, ibr1; @@ -2795,9 +2741,9 @@ void VectorLoader::load4_3ch(const T* lower_ptr, const T* upper_ptr, ibl1 = ibl0; ibr1 = ibr0; } else { - itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); + itl1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1)); itr1 = extract_right_3ch(itl1); - ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); + ibl1 = _mm_loadu_si128((__m128i *)(upper_ptr + offset1)); ibr1 = extract_right_3ch(ibl1); } __m128i itl2, itr2; @@ -2808,9 +2754,9 @@ void VectorLoader::load4_3ch(const T* lower_ptr, const T* upper_ptr, ibl2 = ibl1; ibr2 = ibr1; } else { - itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); + itl2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2)); itr2 = extract_right_3ch(itl2); - ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); + ibl2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2)); ibr2 = extract_right_3ch(ibl2); } __m128i itl3, itr3; @@ -2821,9 +2767,9 @@ void VectorLoader::load4_3ch(const T* lower_ptr, const T* upper_ptr, ibl3 = ibl2; ibr3 = ibr2; } else { - itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); + itl3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3)); itr3 = extract_right_3ch(itl3); - ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); + ibl3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3)); ibr3 = extract_right_3ch(ibl3); } pack_3ch(&itl0, &itl1, &itl2, &itl3); @@ -2844,17 +2790,17 @@ void VectorLoader::load4_3ch(const T* lower_ptr, const T* upper_ptr, *br2 = to_fp32(ibr2); } template -void VectorLoader::load4_4ch(const T* 
lower_ptr, const T* upper_ptr, +void VectorLoader::load4_4ch(const T *lower_ptr, const T *upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m128* tl0, __m128* tl1, - __m128* tl2, __m128* tl3, __m128* bl0, - __m128* bl1, __m128* bl2, __m128* bl3, - __m128* tr0, __m128* tr1, __m128* tr2, - __m128* tr3, __m128* br0, __m128* br1, - __m128* br2, __m128* br3) { - __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + int offset3, __m128 *tl0, __m128 *tl1, + __m128 *tl2, __m128 *tl3, __m128 *bl0, + __m128 *bl1, __m128 *bl2, __m128 *bl3, + __m128 *tr0, __m128 *tr1, __m128 *tr2, + __m128 *tr3, __m128 *br0, __m128 *br1, + __m128 *br2, __m128 *br3) { + __m128i itl0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); __m128i itr0 = extract_right_4ch(itl0); - __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + __m128i ibl0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); __m128i ibr0 = extract_right_4ch(ibl0); __m128i itl1, itr1; __m128i ibl1, ibr1; @@ -2864,9 +2810,9 @@ void VectorLoader::load4_4ch(const T* lower_ptr, const T* upper_ptr, ibl1 = ibl0; ibr1 = ibr0; } else { - itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); + itl1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1)); itr1 = extract_right_4ch(itl1); - ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); + ibl1 = _mm_loadu_si128((__m128i *)(upper_ptr + offset1)); ibr1 = extract_right_4ch(ibl1); } __m128i itl2, itr2; @@ -2877,9 +2823,9 @@ void VectorLoader::load4_4ch(const T* lower_ptr, const T* upper_ptr, ibl2 = ibl1; ibr2 = ibr1; } else { - itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); + itl2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2)); itr2 = extract_right_4ch(itl2); - ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); + ibl2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2)); ibr2 = extract_right_4ch(ibl2); } __m128i itl3, itr3; @@ -2890,9 +2836,9 @@ void VectorLoader::load4_4ch(const T* lower_ptr, const T* upper_ptr, ibl3 = 
ibl2; ibr3 = ibr2; } else { - itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); + itl3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3)); itr3 = extract_right_4ch(itl3); - ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); + ibl3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3)); ibr3 = extract_right_4ch(ibl3); } *tl0 = to_fp32(itl0); @@ -2913,14 +2859,14 @@ void VectorLoader::load4_4ch(const T* lower_ptr, const T* upper_ptr, *br3 = to_fp32(ibr3); } template -void VectorLoader::load8_1ch(const T* lower_ptr, const T* upper_ptr, +void VectorLoader::load8_1ch(const T *lower_ptr, const T *upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m128* tl0, __m128* bl0, - __m128* tr0, __m128* br0) { - __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); - __m128i itr0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 1)); - __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); - __m128i ibr0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 1)); + int offset3, __m128 *tl0, __m128 *bl0, + __m128 *tr0, __m128 *br0) { + __m128i itl0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); + __m128i itr0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 1)); + __m128i ibl0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); + __m128i ibr0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 1)); __m128i itl1, itr1; __m128i ibl1, ibr1; if (offset1 == offset0) { @@ -2929,10 +2875,10 @@ void VectorLoader::load8_1ch(const T* lower_ptr, const T* upper_ptr, ibl1 = ibl0; ibr1 = ibr0; } else { - itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); - itr1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 1)); - ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); - ibr1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 1)); + itl1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1)); + itr1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1 + 1)); + ibl1 = _mm_loadu_si128((__m128i *)(upper_ptr + offset1)); + ibr1 = 
_mm_loadu_si128((__m128i *)(upper_ptr + offset1 + 1)); } __m128i itl2, itr2; __m128i ibl2, ibr2; @@ -2942,10 +2888,10 @@ void VectorLoader::load8_1ch(const T* lower_ptr, const T* upper_ptr, ibl2 = ibl1; ibr2 = ibr1; } else { - itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); - itr2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 1)); - ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); - ibr2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 1)); + itl2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2)); + itr2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2 + 1)); + ibl2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2)); + ibr2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2 + 1)); } __m128i itl3, itr3; __m128i ibl3, ibr3; @@ -2955,10 +2901,10 @@ void VectorLoader::load8_1ch(const T* lower_ptr, const T* upper_ptr, ibl3 = ibl2; ibr3 = ibr2; } else { - itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); - itr3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 1)); - ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); - ibr3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 1)); + itl3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3)); + itr3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3 + 1)); + ibl3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3)); + ibr3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3 + 1)); } pack_1ch(&itl0, &itl1, &itl2, &itl3); *tl0 = to_fp32(itl0); @@ -2970,15 +2916,15 @@ void VectorLoader::load8_1ch(const T* lower_ptr, const T* upper_ptr, *br0 = to_fp32(ibr0); } template -void VectorLoader::load8_2ch(const T* lower_ptr, const T* upper_ptr, +void VectorLoader::load8_2ch(const T *lower_ptr, const T *upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m128* tl0, __m128* tl1, - __m128* bl0, __m128* bl1, __m128* tr0, - __m128* tr1, __m128* br0, __m128* br1) { - __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); - __m128i itr0 = _mm_loadu_si128((__m128i*)(lower_ptr + 
offset0 + 2)); - __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); - __m128i ibr0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 2)); + int offset3, __m128 *tl0, __m128 *tl1, + __m128 *bl0, __m128 *bl1, __m128 *tr0, + __m128 *tr1, __m128 *br0, __m128 *br1) { + __m128i itl0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); + __m128i itr0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 2)); + __m128i ibl0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); + __m128i ibr0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 2)); __m128i itl1, itr1; __m128i ibl1, ibr1; if (offset1 == offset0) { @@ -2987,10 +2933,10 @@ void VectorLoader::load8_2ch(const T* lower_ptr, const T* upper_ptr, ibl1 = ibl0; ibr1 = ibr0; } else { - itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); - itr1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 2)); - ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); - ibr1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 2)); + itl1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1)); + itr1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1 + 2)); + ibl1 = _mm_loadu_si128((__m128i *)(upper_ptr + offset1)); + ibr1 = _mm_loadu_si128((__m128i *)(upper_ptr + offset1 + 2)); } __m128i itl2, itr2; __m128i ibl2, ibr2; @@ -3000,10 +2946,10 @@ void VectorLoader::load8_2ch(const T* lower_ptr, const T* upper_ptr, ibl2 = ibl1; ibr2 = ibr1; } else { - itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); - itr2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 2)); - ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); - ibr2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 2)); + itl2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2)); + itr2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2 + 2)); + ibl2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2)); + ibr2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2 + 2)); } __m128i itl3, itr3; __m128i ibl3, ibr3; @@ -3013,10 +2959,10 @@ void 
VectorLoader::load8_2ch(const T* lower_ptr, const T* upper_ptr, ibl3 = ibl2; ibr3 = ibr2; } else { - itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); - itr3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 2)); - ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); - ibr3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 2)); + itl3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3)); + itr3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3 + 2)); + ibl3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3)); + ibr3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3 + 2)); } pack_2ch(&itl0, &itl1, &itl2, &itl3); *tl0 = to_fp32(itl0); @@ -3032,17 +2978,17 @@ void VectorLoader::load8_2ch(const T* lower_ptr, const T* upper_ptr, *br1 = to_fp32(ibr1); } template -void VectorLoader::load8_3ch(const T* lower_ptr, const T* upper_ptr, +void VectorLoader::load8_3ch(const T *lower_ptr, const T *upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m128* tl0, __m128* tl1, - __m128* tl2, __m128* bl0, __m128* bl1, - __m128* bl2, __m128* tr0, __m128* tr1, - __m128* tr2, __m128* br0, __m128* br1, - __m128* br2) { - __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); - __m128i itr0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 3)); - __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); - __m128i ibr0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 3)); + int offset3, __m128 *tl0, __m128 *tl1, + __m128 *tl2, __m128 *bl0, __m128 *bl1, + __m128 *bl2, __m128 *tr0, __m128 *tr1, + __m128 *tr2, __m128 *br0, __m128 *br1, + __m128 *br2) { + __m128i itl0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); + __m128i itr0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 3)); + __m128i ibl0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); + __m128i ibr0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 3)); __m128i itl1, itr1; __m128i ibl1, ibr1; if (offset1 == offset0) { @@ -3051,10 +2997,10 @@ void 
VectorLoader::load8_3ch(const T* lower_ptr, const T* upper_ptr, ibl1 = ibl0; ibr1 = ibr0; } else { - itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); - itr1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 3)); - ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); - ibr1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 3)); + itl1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1)); + itr1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1 + 3)); + ibl1 = _mm_loadu_si128((__m128i *)(upper_ptr + offset1)); + ibr1 = _mm_loadu_si128((__m128i *)(upper_ptr + offset1 + 3)); } __m128i itl2, itr2; __m128i ibl2, ibr2; @@ -3064,10 +3010,10 @@ void VectorLoader::load8_3ch(const T* lower_ptr, const T* upper_ptr, ibl2 = ibl1; ibr2 = ibr1; } else { - itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); - itr2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 3)); - ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); - ibr2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 3)); + itl2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2)); + itr2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2 + 3)); + ibl2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2)); + ibr2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2 + 3)); } __m128i itl3, itr3; __m128i ibl3, ibr3; @@ -3077,10 +3023,10 @@ void VectorLoader::load8_3ch(const T* lower_ptr, const T* upper_ptr, ibl3 = ibl2; ibr3 = ibr2; } else { - itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); - itr3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 3)); - ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); - ibr3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 3)); + itl3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3)); + itr3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3 + 3)); + ibl3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3)); + ibr3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3 + 3)); } pack_3ch(&itl0, &itl1, &itl2, &itl3); *tl0 = to_fp32(itl0); @@ -3100,18 +3046,18 
@@ void VectorLoader::load8_3ch(const T* lower_ptr, const T* upper_ptr, *br2 = to_fp32(ibr2); } template -void VectorLoader::load8_4ch(const T* lower_ptr, const T* upper_ptr, +void VectorLoader::load8_4ch(const T *lower_ptr, const T *upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m128* tl0, __m128* tl1, - __m128* tl2, __m128* tl3, __m128* bl0, - __m128* bl1, __m128* bl2, __m128* bl3, - __m128* tr0, __m128* tr1, __m128* tr2, - __m128* tr3, __m128* br0, __m128* br1, - __m128* br2, __m128* br3) { - __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); - __m128i itr0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 4)); - __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); - __m128i ibr0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 4)); + int offset3, __m128 *tl0, __m128 *tl1, + __m128 *tl2, __m128 *tl3, __m128 *bl0, + __m128 *bl1, __m128 *bl2, __m128 *bl3, + __m128 *tr0, __m128 *tr1, __m128 *tr2, + __m128 *tr3, __m128 *br0, __m128 *br1, + __m128 *br2, __m128 *br3) { + __m128i itl0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); + __m128i itr0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 4)); + __m128i ibl0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); + __m128i ibr0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 4)); __m128i itl1, itr1; __m128i ibl1, ibr1; if (offset1 == offset0) { @@ -3120,10 +3066,10 @@ void VectorLoader::load8_4ch(const T* lower_ptr, const T* upper_ptr, ibl1 = ibl0; ibr1 = ibr0; } else { - itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); - itr1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 4)); - ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); - ibr1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 4)); + itl1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1)); + itr1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1 + 4)); + ibl1 = _mm_loadu_si128((__m128i *)(upper_ptr + offset1)); + ibr1 = _mm_loadu_si128((__m128i *)(upper_ptr + offset1 + 4)); } 
__m128i itl2, itr2; __m128i ibl2, ibr2; @@ -3133,10 +3079,10 @@ void VectorLoader::load8_4ch(const T* lower_ptr, const T* upper_ptr, ibl2 = ibl1; ibr2 = ibr1; } else { - itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); - itr2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 4)); - ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); - ibr2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 4)); + itl2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2)); + itr2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2 + 4)); + ibl2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2)); + ibr2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2 + 4)); } __m128i itl3, itr3; __m128i ibl3, ibr3; @@ -3146,10 +3092,10 @@ void VectorLoader::load8_4ch(const T* lower_ptr, const T* upper_ptr, ibl3 = ibl2; ibr3 = ibr2; } else { - itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); - itr3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 4)); - ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); - ibr3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 4)); + itl3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3)); + itr3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3 + 4)); + ibl3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3)); + ibr3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3 + 4)); } *tl0 = to_fp32(itl0); *tl1 = to_fp32(itl1); @@ -3177,9 +3123,8 @@ void VectorLoader::load8_4ch(const T* lower_ptr, const T* upper_ptr, // bfloat16 or float. // -template -class VectorWriter { - public: +template class VectorWriter { +public: // convert 4 fp32 words to type U with. // this function calls clip. // resulting words are packed. @@ -3189,89 +3134,89 @@ class VectorWriter { // converts from fp32 to U by calling method from_fp32(...) // writes 4 pixels with 1 channel to destination. - void write_1ch(U* destination, __m128* vec); + void write_1ch(U *destination, __m128 *vec); // converts from fp32 to U by calling method from_fp32(...) 
// writes 4 pixels with 1 channel to destination. - void write_2ch(U* destination, __m128* vec); + void write_2ch(U *destination, __m128 *vec); // converts from fp32 to U by calling method from_fp32(...) // writes 4 pixels with 1 channel to destination. - void write_3ch(U* destination, __m128* vec); + void write_3ch(U *destination, __m128 *vec); // converts from fp32 to U by calling method from_fp32(...) // writes 4 pixels with 1 channel to destination. - void write_4ch(U* destination, __m128* vec); + void write_4ch(U *destination, __m128 *vec); - private: +private: // clip 4 fp32 words to prevent overflow when converting to type U. __m128 clip_(__m128 vec) { // default is to do nothing, since the packing intrinsics include clipping. return vec; } - void write_1b_1ch(U* destination, __m128* vec) { + void write_1b_1ch(U *destination, __m128 *vec) { __m128i ivec = from_fp32(vec[0]); - _mm_store_ss((float*)(destination), _mm_castsi128_ps(ivec)); + _mm_store_ss((float *)(destination), _mm_castsi128_ps(ivec)); } - void write_2b_1ch(U* destination, __m128* vec) { + void write_2b_1ch(U *destination, __m128 *vec) { __m128i ivec = from_fp32(vec[0]); - _mm_store_sd((double*)(destination), _mm_castsi128_pd(ivec)); + _mm_store_sd((double *)(destination), _mm_castsi128_pd(ivec)); } - void write_4b_1ch(U* destination, __m128* vec) { + void write_4b_1ch(U *destination, __m128 *vec) { __m128i ivec = from_fp32(vec[0]); - _mm_storeu_si128((__m128i*)(destination), ivec); + _mm_storeu_si128((__m128i *)(destination), ivec); } - void write_1b_2ch(U* destination, __m128* vec) { + void write_1b_2ch(U *destination, __m128 *vec) { __m128i ivec1 = from_fp32(vec[0]); __m128i ivec2 = from_fp32(vec[1]); __m128i mask = _mm_setr_epi32(-1, 0, 0, 0); ivec1 = _mm_or_si128(_mm_and_si128(mask, ivec1), _mm_slli_si128(_mm_and_si128(mask, ivec2), 4)); - _mm_store_sd((double*)(destination), _mm_castsi128_pd(ivec1)); + _mm_store_sd((double *)(destination), _mm_castsi128_pd(ivec1)); } - void write_2b_2ch(U* 
destination, __m128* vec) { + void write_2b_2ch(U *destination, __m128 *vec) { __m128i ivec1 = from_fp32(vec[0]); __m128i ivec2 = from_fp32(vec[1]); __m128i mask = _mm_setr_epi32(-1, -1, 0, 0); ivec1 = _mm_or_si128(_mm_and_si128(mask, ivec1), _mm_slli_si128(_mm_and_si128(mask, ivec2), 8)); - _mm_storeu_si128((__m128i*)(destination), ivec1); + _mm_storeu_si128((__m128i *)(destination), ivec1); } - void write_4b_2ch(U* destination, __m128* vec) { + void write_4b_2ch(U *destination, __m128 *vec) { __m128i ivec1 = from_fp32(vec[0]); __m128i ivec2 = from_fp32(vec[1]); - _mm_storeu_si128((__m128i*)(destination), ivec1); - _mm_storeu_si128((__m128i*)(destination + 4), ivec2); + _mm_storeu_si128((__m128i *)(destination), ivec1); + _mm_storeu_si128((__m128i *)(destination + 4), ivec2); } - void write_1b_3ch(U* destination, __m128* vec) { + void write_1b_3ch(U *destination, __m128 *vec) { __m128i ivec1 = from_fp32(vec[0]); __m128i ivec2 = from_fp32(vec[1]); __m128i mask = _mm_setr_epi32(-1, 0, 0, 0); ivec1 = _mm_or_si128(_mm_and_si128(mask, ivec1), _mm_slli_si128(_mm_and_si128(mask, ivec2), 4)); - _mm_store_sd((double*)(destination), _mm_castsi128_pd(ivec1)); + _mm_store_sd((double *)(destination), _mm_castsi128_pd(ivec1)); __m128i ivec3 = from_fp32(vec[2]); - _mm_store_ss((float*)(destination + 8), _mm_castsi128_ps(ivec3)); + _mm_store_ss((float *)(destination + 8), _mm_castsi128_ps(ivec3)); } - void write_2b_3ch(U* destination, __m128* vec) { + void write_2b_3ch(U *destination, __m128 *vec) { __m128i ivec1 = from_fp32(vec[0]); __m128i ivec2 = from_fp32(vec[1]); __m128i mask = _mm_setr_epi32(-1, -1, 0, 0); ivec1 = _mm_or_si128(_mm_and_si128(mask, ivec1), _mm_slli_si128(_mm_and_si128(mask, ivec2), 8)); - _mm_storeu_si128((__m128i*)(destination), ivec1); + _mm_storeu_si128((__m128i *)(destination), ivec1); __m128i ivec3 = from_fp32(vec[2]); - _mm_store_sd((double*)(destination + 8), _mm_castsi128_pd(ivec3)); + _mm_store_sd((double *)(destination + 8), 
_mm_castsi128_pd(ivec3)); } - void write_4b_3ch(U* destination, __m128* vec) { + void write_4b_3ch(U *destination, __m128 *vec) { __m128i ivec1 = from_fp32(vec[0]); __m128i ivec2 = from_fp32(vec[1]); __m128i ivec3 = from_fp32(vec[2]); - _mm_storeu_si128((__m128i*)(destination), ivec1); - _mm_storeu_si128((__m128i*)(destination + 4), ivec2); - _mm_storeu_si128((__m128i*)(destination + 8), ivec3); + _mm_storeu_si128((__m128i *)(destination), ivec1); + _mm_storeu_si128((__m128i *)(destination + 4), ivec2); + _mm_storeu_si128((__m128i *)(destination + 8), ivec3); } - void write_1b_4ch(U* destination, __m128* vec) { + void write_1b_4ch(U *destination, __m128 *vec) { __m128i ivec1 = from_fp32(vec[0]); __m128i ivec2 = from_fp32(vec[1]); __m128i ivec3 = from_fp32(vec[2]); @@ -3281,9 +3226,9 @@ class VectorWriter { ivec = _mm_or_si128(ivec, _mm_slli_si128(_mm_and_si128(mask, ivec2), 4)); ivec = _mm_or_si128(ivec, _mm_slli_si128(_mm_and_si128(mask, ivec3), 8)); ivec = _mm_or_si128(ivec, _mm_slli_si128(_mm_and_si128(mask, ivec4), 12)); - _mm_storeu_si128((__m128i*)(destination), ivec); + _mm_storeu_si128((__m128i *)(destination), ivec); } - void write_2b_4ch(U* destination, __m128* vec) { + void write_2b_4ch(U *destination, __m128 *vec) { __m128i ivec1 = from_fp32(vec[0]); __m128i ivec2 = from_fp32(vec[1]); __m128i ivec3 = from_fp32(vec[2]); @@ -3291,25 +3236,24 @@ class VectorWriter { __m128i mask = _mm_setr_epi32(-1, -1, 0, 0); __m128i ivec = _mm_and_si128(mask, ivec1); ivec = _mm_or_si128(ivec, _mm_slli_si128(_mm_and_si128(mask, ivec2), 8)); - _mm_storeu_si128((__m128i*)(destination), ivec); + _mm_storeu_si128((__m128i *)(destination), ivec); ivec = _mm_and_si128(mask, ivec3); ivec = _mm_or_si128(ivec, _mm_slli_si128(_mm_and_si128(mask, ivec4), 8)); - _mm_storeu_si128((__m128i*)(destination + 8), ivec); + _mm_storeu_si128((__m128i *)(destination + 8), ivec); } - void write_4b_4ch(U* destination, __m128* vec) { + void write_4b_4ch(U *destination, __m128 *vec) { __m128i 
ivec1 = from_fp32(vec[0]); __m128i ivec2 = from_fp32(vec[1]); __m128i ivec3 = from_fp32(vec[2]); __m128i ivec4 = from_fp32(vec[3]); - _mm_storeu_si128((__m128i*)(destination), ivec1); - _mm_storeu_si128((__m128i*)(destination + 4), ivec2); - _mm_storeu_si128((__m128i*)(destination + 8), ivec3); - _mm_storeu_si128((__m128i*)(destination + 12), ivec4); + _mm_storeu_si128((__m128i *)(destination), ivec1); + _mm_storeu_si128((__m128i *)(destination + 4), ivec2); + _mm_storeu_si128((__m128i *)(destination + 8), ivec3); + _mm_storeu_si128((__m128i *)(destination + 12), ivec4); } }; -template <> -__m128 VectorWriter::clip_(__m128 vec) { +template <> __m128 VectorWriter::clip_(__m128 vec) { // clip against low limit, -2147483648. // we round up to nearest number that can be represented as float. __m128 lt_val = _mm_set1_ps(-2147483520.0f); @@ -3322,8 +3266,7 @@ __m128 VectorWriter::clip_(__m128 vec) { vec = _mm_or_ps(_mm_andnot_ps(gt_mask, vec), _mm_and_ps(gt_mask, gt_val)); return vec; } -template <> -__m128 VectorWriter::clip_(__m128 vec) { +template <> __m128 VectorWriter::clip_(__m128 vec) { // clip against low limit, -65504.0f; __m128 lt_val = _mm_set1_ps(-65504.0f); __m128 lt_mask = _mm_cmplt_ps(vec, lt_val); @@ -3335,34 +3278,28 @@ __m128 VectorWriter::clip_(__m128 vec) { return vec; } -template <> -__m128i VectorWriter::from_fp32(__m128 vec) { +template <> __m128i VectorWriter::from_fp32(__m128 vec) { __m128i ivec = _mm_cvttps_epi32(vec); ivec = _mm_packs_epi32(ivec, ivec); return _mm_packus_epi16(ivec, ivec); } -template <> -__m128i VectorWriter::from_fp32(__m128 vec) { +template <> __m128i VectorWriter::from_fp32(__m128 vec) { __m128i ivec = _mm_cvttps_epi32(vec); ivec = _mm_packs_epi32(ivec, ivec); return _mm_packs_epi16(ivec, ivec); } -template <> -__m128i VectorWriter::from_fp32(__m128 vec) { +template <> __m128i VectorWriter::from_fp32(__m128 vec) { __m128i ivec = _mm_cvttps_epi32(vec); return _mm_packus_epi32(ivec, ivec); } -template <> -__m128i 
VectorWriter::from_fp32(__m128 vec) { +template <> __m128i VectorWriter::from_fp32(__m128 vec) { __m128i ivec = _mm_cvttps_epi32(vec); return _mm_packs_epi32(ivec, ivec); } -template <> -__m128i VectorWriter::from_fp32(__m128 vec) { +template <> __m128i VectorWriter::from_fp32(__m128 vec) { return _mm_cvttps_epi32(clip_(vec)); } -template <> -__m128i VectorWriter::from_fp32(__m128 vec) { +template <> __m128i VectorWriter::from_fp32(__m128 vec) { #ifdef __F16C__ return _mm_cvtps_ph(vec, _MM_FROUND_TO_ZERO); #else @@ -3426,8 +3363,7 @@ __m128i VectorWriter::from_fp32(__m128 vec) { return number; #endif } -template <> -__m128i VectorWriter::from_fp32(__m128 vec) { +template <> __m128i VectorWriter::from_fp32(__m128 vec) { // casting from float to bfloat16 simply means >> 16 // we do this with a shuffle that also moves everything to lower portion of // sse vector word @@ -3435,181 +3371,166 @@ __m128i VectorWriter::from_fp32(__m128 vec) { -128, -128, -128, -128, -128, -128); return _mm_shuffle_epi8(_mm_castps_si128(vec), shuf_from_hi32); } -template <> -__m128i VectorWriter::from_fp32(__m128 vec) { +template <> __m128i VectorWriter::from_fp32(__m128 vec) { // nothing to do in this case return _mm_castps_si128(vec); } template <> -void VectorWriter::write_1ch(uint8* destination, __m128* vec) { +void VectorWriter::write_1ch(uint8 *destination, __m128 *vec) { + write_1b_1ch(destination, vec); +} +template <> void VectorWriter::write_1ch(int8 *destination, __m128 *vec) { write_1b_1ch(destination, vec); } template <> -void VectorWriter::write_1ch(int8* destination, __m128* vec) { - write_1b_1ch(destination, vec); -} -template <> -void VectorWriter::write_1ch(uint16* destination, __m128* vec) { +void VectorWriter::write_1ch(uint16 *destination, __m128 *vec) { write_2b_1ch(destination, vec); } template <> -void VectorWriter::write_1ch(int16* destination, __m128* vec) { +void VectorWriter::write_1ch(int16 *destination, __m128 *vec) { write_2b_1ch(destination, vec); } template 
<> -void VectorWriter::write_1ch(int32* destination, __m128* vec) { +void VectorWriter::write_1ch(int32 *destination, __m128 *vec) { write_4b_1ch(destination, vec); } template <> -void VectorWriter::write_1ch(Eigen::half* destination, - __m128* vec) { +void VectorWriter::write_1ch(Eigen::half *destination, + __m128 *vec) { write_2b_1ch(destination, vec); } template <> -void VectorWriter::write_1ch(bfloat16* destination, __m128* vec) { +void VectorWriter::write_1ch(bfloat16 *destination, __m128 *vec) { write_2b_1ch(destination, vec); } template <> -void VectorWriter::write_1ch(float* destination, __m128* vec) { - _mm_storeu_si128((__m128i*)(destination), _mm_castps_si128(vec[0])); +void VectorWriter::write_1ch(float *destination, __m128 *vec) { + _mm_storeu_si128((__m128i *)(destination), _mm_castps_si128(vec[0])); } template <> -void VectorWriter::write_2ch(uint8* destination, __m128* vec) { +void VectorWriter::write_2ch(uint8 *destination, __m128 *vec) { + write_1b_2ch(destination, vec); +} +template <> void VectorWriter::write_2ch(int8 *destination, __m128 *vec) { write_1b_2ch(destination, vec); } template <> -void VectorWriter::write_2ch(int8* destination, __m128* vec) { - write_1b_2ch(destination, vec); -} -template <> -void VectorWriter::write_2ch(uint16* destination, __m128* vec) { +void VectorWriter::write_2ch(uint16 *destination, __m128 *vec) { write_2b_2ch(destination, vec); } template <> -void VectorWriter::write_2ch(int16* destination, __m128* vec) { +void VectorWriter::write_2ch(int16 *destination, __m128 *vec) { write_2b_2ch(destination, vec); } template <> -void VectorWriter::write_2ch(int32* destination, __m128* vec) { +void VectorWriter::write_2ch(int32 *destination, __m128 *vec) { write_4b_2ch(destination, vec); } template <> -void VectorWriter::write_2ch(Eigen::half* destination, - __m128* vec) { +void VectorWriter::write_2ch(Eigen::half *destination, + __m128 *vec) { write_2b_2ch(destination, vec); } template <> -void 
VectorWriter::write_2ch(bfloat16* destination, __m128* vec) { +void VectorWriter::write_2ch(bfloat16 *destination, __m128 *vec) { write_2b_2ch(destination, vec); } template <> -void VectorWriter::write_2ch(float* destination, __m128* vec) { - _mm_storeu_si128((__m128i*)(destination), _mm_castps_si128(vec[0])); - _mm_storeu_si128((__m128i*)(destination + 4), _mm_castps_si128(vec[1])); +void VectorWriter::write_2ch(float *destination, __m128 *vec) { + _mm_storeu_si128((__m128i *)(destination), _mm_castps_si128(vec[0])); + _mm_storeu_si128((__m128i *)(destination + 4), _mm_castps_si128(vec[1])); } template <> -void VectorWriter::write_3ch(uint8* destination, __m128* vec) { +void VectorWriter::write_3ch(uint8 *destination, __m128 *vec) { + write_1b_3ch(destination, vec); +} +template <> void VectorWriter::write_3ch(int8 *destination, __m128 *vec) { write_1b_3ch(destination, vec); } template <> -void VectorWriter::write_3ch(int8* destination, __m128* vec) { - write_1b_3ch(destination, vec); -} -template <> -void VectorWriter::write_3ch(uint16* destination, __m128* vec) { +void VectorWriter::write_3ch(uint16 *destination, __m128 *vec) { write_2b_3ch(destination, vec); } template <> -void VectorWriter::write_3ch(int16* destination, __m128* vec) { +void VectorWriter::write_3ch(int16 *destination, __m128 *vec) { write_2b_3ch(destination, vec); } template <> -void VectorWriter::write_3ch(int32* destination, __m128* vec) { +void VectorWriter::write_3ch(int32 *destination, __m128 *vec) { write_4b_3ch(destination, vec); } template <> -void VectorWriter::write_3ch(Eigen::half* destination, - __m128* vec) { +void VectorWriter::write_3ch(Eigen::half *destination, + __m128 *vec) { write_2b_3ch(destination, vec); } template <> -void VectorWriter::write_3ch(bfloat16* destination, __m128* vec) { +void VectorWriter::write_3ch(bfloat16 *destination, __m128 *vec) { write_2b_3ch(destination, vec); } template <> -void VectorWriter::write_3ch(float* destination, __m128* vec) { - 
_mm_storeu_si128((__m128i*)(destination), _mm_castps_si128(vec[0])); - _mm_storeu_si128((__m128i*)(destination + 4), _mm_castps_si128(vec[1])); - _mm_storeu_si128((__m128i*)(destination + 8), _mm_castps_si128(vec[2])); +void VectorWriter::write_3ch(float *destination, __m128 *vec) { + _mm_storeu_si128((__m128i *)(destination), _mm_castps_si128(vec[0])); + _mm_storeu_si128((__m128i *)(destination + 4), _mm_castps_si128(vec[1])); + _mm_storeu_si128((__m128i *)(destination + 8), _mm_castps_si128(vec[2])); } template <> -void VectorWriter::write_4ch(uint8* destination, __m128* vec) { +void VectorWriter::write_4ch(uint8 *destination, __m128 *vec) { + write_1b_4ch(destination, vec); +} +template <> void VectorWriter::write_4ch(int8 *destination, __m128 *vec) { write_1b_4ch(destination, vec); } template <> -void VectorWriter::write_4ch(int8* destination, __m128* vec) { - write_1b_4ch(destination, vec); -} -template <> -void VectorWriter::write_4ch(uint16* destination, __m128* vec) { +void VectorWriter::write_4ch(uint16 *destination, __m128 *vec) { write_2b_4ch(destination, vec); } template <> -void VectorWriter::write_4ch(int16* destination, __m128* vec) { +void VectorWriter::write_4ch(int16 *destination, __m128 *vec) { write_2b_4ch(destination, vec); } template <> -void VectorWriter::write_4ch(int32* destination, __m128* vec) { +void VectorWriter::write_4ch(int32 *destination, __m128 *vec) { write_4b_4ch(destination, vec); } template <> -void VectorWriter::write_4ch(Eigen::half* destination, - __m128* vec) { +void VectorWriter::write_4ch(Eigen::half *destination, + __m128 *vec) { write_2b_4ch(destination, vec); } template <> -void VectorWriter::write_4ch(bfloat16* destination, __m128* vec) { +void VectorWriter::write_4ch(bfloat16 *destination, __m128 *vec) { write_2b_4ch(destination, vec); } template <> -void VectorWriter::write_4ch(float* destination, __m128* vec) { - _mm_storeu_si128((__m128i*)(destination), _mm_castps_si128(vec[0])); - 
_mm_storeu_si128((__m128i*)(destination + 4), _mm_castps_si128(vec[1])); - _mm_storeu_si128((__m128i*)(destination + 8), _mm_castps_si128(vec[2])); - _mm_storeu_si128((__m128i*)(destination + 12), _mm_castps_si128(vec[3])); +void VectorWriter::write_4ch(float *destination, __m128 *vec) { + _mm_storeu_si128((__m128i *)(destination), _mm_castps_si128(vec[0])); + _mm_storeu_si128((__m128i *)(destination + 4), _mm_castps_si128(vec[1])); + _mm_storeu_si128((__m128i *)(destination + 8), _mm_castps_si128(vec[2])); + _mm_storeu_si128((__m128i *)(destination + 12), _mm_castps_si128(vec[3])); } template class CropResizeCastImage : public VectorLoader, public VectorWriter { - public: +public: CropResizeCastImage(const int in_height, const int in_width, const int out_height, const int out_width, const int channels, const int min_ix, const int max_ix, - const CachedInterpolation* xs, const int min_iy, - const int max_iy, const CachedInterpolation* ys, + const CachedInterpolation *xs, const int min_iy, + const int max_iy, const CachedInterpolation *ys, const float extrapolated_value, const bool flip_x, const bool flip_y, const bool verbose = false, const int allowed_load_groups = 15) - : verbose_(verbose), - allowed_load_groups_(allowed_load_groups), - in_height_(in_height), - in_width_(in_width), - out_height_(out_height), - out_width_(out_width), - channels_(channels), - min_ix_(min_ix), - max_ix_(max_ix), - min_iy_(min_iy), - max_iy_(max_iy), - ys_(ys), - extrapolated_value_(extrapolated_value), - flip_x_(flip_x), - flip_y_(flip_y), - in_row_size_(in_width * channels), + : verbose_(verbose), allowed_load_groups_(allowed_load_groups), + in_height_(in_height), in_width_(in_width), out_height_(out_height), + out_width_(out_width), channels_(channels), min_ix_(min_ix), + max_ix_(max_ix), min_iy_(min_iy), max_iy_(max_iy), ys_(ys), + extrapolated_value_(extrapolated_value), flip_x_(flip_x), + flip_y_(flip_y), in_row_size_(in_width * channels), in_row_size_bytes_(in_width * channels 
* sizeof(T)), out_row_size_(out_width * channels), x0_(flip_x ? out_width - 1 - max_ix : min_ix), @@ -3622,21 +3543,21 @@ class CropResizeCastImage : public VectorLoader, public VectorWriter { // xs[].lower == xs[].upper AND xs[].lerp == 1 xs_ = new CachedInterpolation[max_ix_ - min_ix_ + 1]; for (int i = min_ix_; i <= max_ix_; ++i) { - int ix = i - min_ix_; - int xs_lower = xs[ix].lower / channels_; - int xs_upper = xs[ix].upper / channels_; - if (xs_lower == xs_upper) { - if (xs[ix].lerp == 0.0f && xs_lower + 1 < in_width) { - // upper weight is zero - xs_upper = xs_lower + 1; - } else if (xs[ix].lerp == 1.0f && xs_upper - 1 >= 0) { - // lower weight is zero - xs_lower = xs_upper - 1; - } - } - xs_[ix].lower = xs_lower * channels_; - xs_[ix].upper = xs_upper * channels_; - xs_[ix].lerp = xs[ix].lerp; + int ix = i - min_ix_; + int xs_lower = xs[ix].lower / channels_; + int xs_upper = xs[ix].upper / channels_; + if (xs_lower == xs_upper) { + if (xs[ix].lerp == 0.0f && xs_lower + 1 < in_width) { + // upper weight is zero + xs_upper = xs_lower + 1; + } else if (xs[ix].lerp == 1.0f && xs_upper - 1 >= 0) { + // lower weight is zero + xs_lower = xs_upper - 1; + } + } + xs_[ix].lower = xs_lower * channels_; + xs_[ix].upper = xs_upper * channels_; + xs_[ix].lerp = xs[ix].lerp; } _u_min_val = std::numeric_limits::min(); _u_max_val = std::numeric_limits::max(); @@ -3665,25 +3586,40 @@ class CropResizeCastImage : public VectorLoader, public VectorWriter { } } ~CropResizeCastImage() { - if (general_x_ != NULL) delete[] general_x_; - if (load1_x_ != NULL) delete[] load1_x_; - if (load2_x_ != NULL) delete[] load2_x_; - if (load4_x_ != NULL) delete[] load4_x_; - if (load8_x_ != NULL) delete[] load8_x_; - if (load1_offsets_ != NULL) delete[] load1_offsets_; - if (load2_offsets_ != NULL) delete[] load2_offsets_; - if (load4_offsets_ != NULL) delete[] load4_offsets_; - if (load8_offsets_ != NULL) delete[] load8_offsets_; - if (load1_shuffle_masks_ != NULL) delete[] 
load1_shuffle_masks_; - if (load2_shuffle_masks_ != NULL) delete[] load2_shuffle_masks_; - if (load1_mmxs_lerp_ != NULL) delete[] load1_mmxs_lerp_; - if (load2_mmxs_lerp_ != NULL) delete[] load2_mmxs_lerp_; - if (load4_mmxs_lerp_ != NULL) delete[] load4_mmxs_lerp_; - if (load8_mmxs_lerp_ != NULL) delete[] load8_mmxs_lerp_; + if (general_x_ != NULL) + delete[] general_x_; + if (load1_x_ != NULL) + delete[] load1_x_; + if (load2_x_ != NULL) + delete[] load2_x_; + if (load4_x_ != NULL) + delete[] load4_x_; + if (load8_x_ != NULL) + delete[] load8_x_; + if (load1_offsets_ != NULL) + delete[] load1_offsets_; + if (load2_offsets_ != NULL) + delete[] load2_offsets_; + if (load4_offsets_ != NULL) + delete[] load4_offsets_; + if (load8_offsets_ != NULL) + delete[] load8_offsets_; + if (load1_shuffle_masks_ != NULL) + delete[] load1_shuffle_masks_; + if (load2_shuffle_masks_ != NULL) + delete[] load2_shuffle_masks_; + if (load1_mmxs_lerp_ != NULL) + delete[] load1_mmxs_lerp_; + if (load2_mmxs_lerp_ != NULL) + delete[] load2_mmxs_lerp_; + if (load4_mmxs_lerp_ != NULL) + delete[] load4_mmxs_lerp_; + if (load8_mmxs_lerp_ != NULL) + delete[] load8_mmxs_lerp_; delete[] xs_; } - private: +private: // constructor arguments const bool verbose_; // this value is meant for unit testing. 
@@ -3697,8 +3633,8 @@ class CropResizeCastImage : public VectorLoader, public VectorWriter { const int in_height_, in_width_, out_height_, out_width_; const int channels_; const int min_ix_, max_ix_, min_iy_, max_iy_; - const CachedInterpolation* ys_; - CachedInterpolation* xs_; + const CachedInterpolation *ys_; + CachedInterpolation *xs_; const float extrapolated_value_; const bool flip_x_, flip_y_; // computed arguments @@ -3709,40 +3645,40 @@ class CropResizeCastImage : public VectorLoader, public VectorWriter { const int y0_, y1_; // helper methods - void ResizeRow_load1_1ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load2_1ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load4_1ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load8_1ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load1_2ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load2_2ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load4_2ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load8_2ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load1_3ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load2_3ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load4_3ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); 
- void ResizeRow_load8_3ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load1_4ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load2_4ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load4_4ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load8_4ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_general_(const float ys_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load1_1ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr); + void ResizeRow_load2_1ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr); + void ResizeRow_load4_1ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr); + void ResizeRow_load8_1ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr); + void ResizeRow_load1_2ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr); + void ResizeRow_load2_2ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr); + void ResizeRow_load4_2ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr); + void ResizeRow_load8_2ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr); + void ResizeRow_load1_3ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr); + void ResizeRow_load2_3ch_(const 
__m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr); + void ResizeRow_load4_3ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr); + void ResizeRow_load8_3ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr); + void ResizeRow_load1_4ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr); + void ResizeRow_load2_4ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr); + void ResizeRow_load4_4ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr); + void ResizeRow_load8_4ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr); + void ResizeRow_general_(const float ys_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr); // configuration parameters int num_general_, num_load1_, num_load2_, num_load4_, num_load8_; @@ -3756,17 +3692,17 @@ class CropResizeCastImage : public VectorLoader, public VectorWriter { // configuration methods void Configure_(); int DetermineLoadGroup_(const int x); - bool ComputeXIndexRange_(const int x, int* min_xidx, int* max_xidx); - bool Load1_ok_( - const int min_xidx, - const int max_xidx); // xs - pointer to first xs for this load group - bool Load2_ok_( - const int min_xidx, - const int max_xidx); // xs - pointer to first xs for this load group + bool ComputeXIndexRange_(const int x, int *min_xidx, int *max_xidx); + bool + Load1_ok_(const int min_xidx, + const int max_xidx); // xs - pointer to first xs for this load group + bool + Load2_ok_(const int min_xidx, + const int max_xidx); // xs - pointer to first xs for this load group bool Load4_ok_(const int min_xidx, const int max_xidx); bool Load8_ok_(const int min_xidx, const int max_xidx); - public: +public: 
// // public client methods // @@ -3776,34 +3712,36 @@ class CropResizeCastImage : public VectorLoader, public VectorWriter { static bool clip_necessary(); // resize image - void Resize(const T* input_image, U* output_image); + void Resize(const T *input_image, U *output_image); }; template -void CropResizeCastImage::Resize(const T* input_image, U* output_image) { +void CropResizeCastImage::Resize(const T *input_image, U *output_image) { // U uEx = cast_to(extrapolated_value_, _f_min_val, _f_max_val, _u_min_val, _u_max_val); // extrapolate top if (min_iy_ > 0) { - U* p = flip_y_ ? output_image + out_row_size_ * (out_height_ - min_iy_) + U *p = flip_y_ ? output_image + out_row_size_ * (out_height_ - min_iy_) : output_image; int nn = out_row_size_ * min_iy_; - for (int i = 0; i < nn; ++i) p[i] = uEx; + for (int i = 0; i < nn; ++i) + p[i] = uEx; } // extrapolate bottom if (max_iy_ < out_height_ - 1) { - U* p = + U *p = flip_y_ ? output_image : output_image + out_row_size_ * (max_iy_ + 1); int nn = out_row_size_ * (out_height_ - 1 - max_iy_); - for (int i = 0; i < nn; ++i) p[i] = uEx; + for (int i = 0; i < nn; ++i) + p[i] = uEx; } // extrapolate left if (min_ix_ > 0) { for (int iy = min_iy_; iy <= max_iy_; ++iy) { int xx0 = flip_x_ ? (out_width_ - min_ix_) * channels_ : 0; int nxx = min_ix_ * channels_; - U* p = output_image + xx0 + + U *p = output_image + xx0 + out_row_size_ * (flip_y_ ? out_height_ - 1 - iy : iy); for (int ix = 0; ix < nxx; ++ix) { p[ix] = uEx; @@ -3815,7 +3753,7 @@ void CropResizeCastImage::Resize(const T* input_image, U* output_image) { for (int iy = min_iy_; iy <= max_iy_; ++iy) { int xx0 = flip_x_ ? 0 : (max_ix_ + 1) * channels_; int nxx = (out_width_ - 1 - max_ix_) * channels_; - U* p = output_image + xx0 + + U *p = output_image + xx0 + out_row_size_ * (flip_y_ ? 
out_height_ - 1 - iy : iy); for (int ix = 0; ix < nxx; ++ix) { p[ix] = uEx; @@ -3829,163 +3767,163 @@ void CropResizeCastImage::Resize(const T* input_image, U* output_image) { const int iyA = flip_y_ ? out_height_ - 1 - min_iy_ - y : y - min_iy_; const float yA_lerp = ys_[iyA].lerp; const __m128 ysA_lerp = _mm_set1_ps(yA_lerp); - const T* ysA_input_lower_ptr = - input_image + ys_[iyA].lower * in_width_ * channels_; - const T* ysA_input_upper_ptr = - input_image + ys_[iyA].upper * in_width_ * channels_; - U* ysA_output_ptr = output_image + y * out_width_ * channels_; + const T *ysA_input_lower_ptr = + input_image + ys_[iyA].lower * in_width_ * channels_; + const T *ysA_input_upper_ptr = + input_image + ys_[iyA].upper * in_width_ * channels_; + U *ysA_output_ptr = output_image + y * out_width_ * channels_; const int iyB = - flip_y_ ? out_height_ - 1 - min_iy_ - (y + 1) : (y + 1) - min_iy_; + flip_y_ ? out_height_ - 1 - min_iy_ - (y + 1) : (y + 1) - min_iy_; const float yB_lerp = ys_[iyB].lerp; const __m128 ysB_lerp = _mm_set1_ps(yB_lerp); - const T* ysB_input_lower_ptr = - input_image + ys_[iyB].lower * in_width_ * channels_; - const T* ysB_input_upper_ptr = - input_image + ys_[iyB].upper * in_width_ * channels_; - U* ysB_output_ptr = output_image + (y + 1) * out_width_ * channels_; + const T *ysB_input_lower_ptr = + input_image + ys_[iyB].lower * in_width_ * channels_; + const T *ysB_input_upper_ptr = + input_image + ys_[iyB].upper * in_width_ * channels_; + U *ysB_output_ptr = output_image + (y + 1) * out_width_ * channels_; if (channels_ == 1) { - this->ResizeRow_load1_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load1_1ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load2_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_1ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - 
this->ResizeRow_load4_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_1ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load8_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_1ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load1_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load1_1ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load2_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_1ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load4_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_1ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load8_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_1ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); } else if (channels_ == 2) { - this->ResizeRow_load1_2ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load1_2ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load2_2ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_2ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, 
ysB_output_ptr); - this->ResizeRow_load4_2ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_2ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load8_2ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_2ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load1_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load1_2ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load2_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_2ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load4_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_2ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load8_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_2ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); } else if (channels_ == 3) { - this->ResizeRow_load1_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load1_3ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load2_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_3ch_(ysB_lerp, ysB_input_lower_ptr, - 
ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load4_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_3ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load8_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_3ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load1_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load1_3ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load2_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_3ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load4_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_3ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load8_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_3ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); } else if (channels_ == 4) { - this->ResizeRow_load1_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load1_4ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load2_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_4ch_(ysB_lerp, 
ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load4_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_4ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load8_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_4ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load1_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load1_4ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load2_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_4ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load4_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_4ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_load8_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_4ch_(ysB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, + ysB_input_upper_ptr, ysB_output_ptr); } else { - assert(false); + assert(false); } } for (; y <= y1_; ++y) { const int iyA = flip_y_ ? 
out_height_ - 1 - min_iy_ - y : y - min_iy_; const float yA_lerp = ys_[iyA].lerp; const __m128 ysA_lerp = _mm_set1_ps(yA_lerp); - const T* ysA_input_lower_ptr = - input_image + ys_[iyA].lower * in_width_ * channels_; - const T* ysA_input_upper_ptr = - input_image + ys_[iyA].upper * in_width_ * channels_; - U* ysA_output_ptr = output_image + y * out_width_ * channels_; + const T *ysA_input_lower_ptr = + input_image + ys_[iyA].lower * in_width_ * channels_; + const T *ysA_input_upper_ptr = + input_image + ys_[iyA].upper * in_width_ * channels_; + U *ysA_output_ptr = output_image + y * out_width_ * channels_; if (channels_ == 1) { - this->ResizeRow_load1_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load1_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_1ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); } else if (channels_ == 2) { - this->ResizeRow_load1_2ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_2ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_2ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_2ch_(ysA_lerp, 
ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load1_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_2ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); } else if (channels_ == 3) { - this->ResizeRow_load1_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load1_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_3ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); } else if (channels_ == 4) { - this->ResizeRow_load1_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, 
ysA_output_ptr); - this->ResizeRow_load8_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load1_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load2_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load4_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_load8_4ch_(ysA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); + this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, + ysA_input_upper_ptr, ysA_output_ptr); } else { - assert(false); + assert(false); } } } @@ -3993,9 +3931,9 @@ void CropResizeCastImage::Resize(const T* input_image, U* output_image) { template void CropResizeCastImage::ResizeRow_general_(const float ys_lerp, - const T* ys_input_lower_ptr, - const T* ys_input_upper_ptr, - U* output_y_ptr) { + const T *ys_input_lower_ptr, + const T *ys_input_upper_ptr, + U *output_y_ptr) { for (int current = 0; current < num_general_; ++current) { int x = general_x_[current]; const int ix = flip_x_ ? out_width_ - 1 - min_ix_ - x : x - min_ix_; @@ -4020,12 +3958,12 @@ void CropResizeCastImage::ResizeRow_general_(const float ys_lerp, // 1 channel image. 
template void CropResizeCastImage::ResizeRow_load1_1ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr) { for (int current = 0; current < num_load1_; ++current) { - __m128* mmxs_lerp = - (__m128*)(load1_shuffle_masks_ + current * CHANNELS * 3); - __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; + __m128 *mmxs_lerp = + (__m128 *)(load1_shuffle_masks_ + current * CHANNELS * 3); + __m128i *shuffle_masks = (__m128i *)mmxs_lerp + CHANNELS; #ifdef __AVX2__ __m256 left0, right0; this->load1_1ch(ysA_input_lower_ptr, ysA_input_upper_ptr, @@ -4062,12 +4000,12 @@ void CropResizeCastImage::ResizeRow_load1_1ch_( // 1 channel image. template void CropResizeCastImage::ResizeRow_load2_1ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr) { for (int current = 0; current < num_load2_; ++current) { - __m128* mmxs_lerp = - (__m128*)(load2_shuffle_masks_ + current * CHANNELS * 2); - __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; + __m128 *mmxs_lerp = + (__m128 *)(load2_shuffle_masks_ + current * CHANNELS * 2); + __m128i *shuffle_masks = (__m128i *)mmxs_lerp + CHANNELS; #ifdef __AVX2__ __m256 left0, right0; this->load2_1ch(ysA_input_lower_ptr, ysA_input_upper_ptr, @@ -4104,10 +4042,10 @@ void CropResizeCastImage::ResizeRow_load2_1ch_( // 1 channel image. 
template void CropResizeCastImage::ResizeRow_load4_1ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr) { for (int current = 0; current < num_load4_; ++current) { - __m128* mmxs_lerp = (__m128*)(load4_mmxs_lerp_ + current * CHANNELS); + __m128 *mmxs_lerp = (__m128 *)(load4_mmxs_lerp_ + current * CHANNELS); #ifdef __AVX2__ __m256 left0, right0; this->load4_1ch( @@ -4147,10 +4085,10 @@ void CropResizeCastImage::ResizeRow_load4_1ch_( // 1 channel image. template void CropResizeCastImage::ResizeRow_load8_1ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr) { for (int current = 0; current < num_load8_; ++current) { - __m128* mmxs_lerp = (__m128*)(load8_mmxs_lerp_ + current * CHANNELS); + __m128 *mmxs_lerp = (__m128 *)(load8_mmxs_lerp_ + current * CHANNELS); #ifdef __AVX2__ __m256 left0, right0; this->load8_1ch( @@ -4193,12 +4131,12 @@ void CropResizeCastImage::ResizeRow_load8_1ch_( // 2 channel image. 
template void CropResizeCastImage::ResizeRow_load1_2ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr) { for (int current = 0; current < num_load1_; ++current) { - __m128* mmxs_lerp = - (__m128*)(load1_shuffle_masks_ + current * CHANNELS * 3); - __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; + __m128 *mmxs_lerp = + (__m128 *)(load1_shuffle_masks_ + current * CHANNELS * 3); + __m128i *shuffle_masks = (__m128i *)mmxs_lerp + CHANNELS; #ifdef __AVX2__ __m256 left0, left1, right0, right1; this->load1_2ch(ysA_input_lower_ptr, ysA_input_upper_ptr, @@ -4246,12 +4184,12 @@ void CropResizeCastImage::ResizeRow_load1_2ch_( // 2 channel image. template void CropResizeCastImage::ResizeRow_load2_2ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr) { for (int current = 0; current < num_load2_; ++current) { - __m128* mmxs_lerp = - (__m128*)(load2_shuffle_masks_ + current * CHANNELS * 2); - __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; + __m128 *mmxs_lerp = + (__m128 *)(load2_shuffle_masks_ + current * CHANNELS * 2); + __m128i *shuffle_masks = (__m128i *)mmxs_lerp + CHANNELS; #ifdef __AVX2__ __m256 left0, left1, right0, right1; this->load2_2ch(ysA_input_lower_ptr, ysA_input_upper_ptr, @@ -4299,10 +4237,10 @@ void CropResizeCastImage::ResizeRow_load2_2ch_( // 2 channel image. 
template void CropResizeCastImage::ResizeRow_load4_2ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr) { for (int current = 0; current < num_load4_; ++current) { - __m128* mmxs_lerp = (__m128*)(load4_mmxs_lerp_ + current * CHANNELS); + __m128 *mmxs_lerp = (__m128 *)(load4_mmxs_lerp_ + current * CHANNELS); #ifdef __AVX2__ __m256 left0, left1, right0, right1; this->load4_2ch( @@ -4353,10 +4291,10 @@ void CropResizeCastImage::ResizeRow_load4_2ch_( // 2 channel image. template void CropResizeCastImage::ResizeRow_load8_2ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr) { for (int current = 0; current < num_load8_; ++current) { - __m128* mmxs_lerp = (__m128*)(load8_mmxs_lerp_ + current * CHANNELS); + __m128 *mmxs_lerp = (__m128 *)(load8_mmxs_lerp_ + current * CHANNELS); #ifdef __AVX2__ __m256 left0, left1, right0, right1; this->load8_2ch( @@ -4410,12 +4348,12 @@ void CropResizeCastImage::ResizeRow_load8_2ch_( // 3 channel image. 
template void CropResizeCastImage::ResizeRow_load1_3ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr) { for (int current = 0; current < num_load1_; ++current) { - __m128* mmxs_lerp = - (__m128*)(load1_shuffle_masks_ + current * CHANNELS * 3); - __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; + __m128 *mmxs_lerp = + (__m128 *)(load1_shuffle_masks_ + current * CHANNELS * 3); + __m128i *shuffle_masks = (__m128i *)mmxs_lerp + CHANNELS; #ifdef __AVX2__ __m256 left0, left1, left2, right0, right1, right2; this->load1_3ch(ysA_input_lower_ptr, ysA_input_upper_ptr, @@ -4473,12 +4411,12 @@ void CropResizeCastImage::ResizeRow_load1_3ch_( // 3 channel image. template void CropResizeCastImage::ResizeRow_load2_3ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr) { for (int current = 0; current < num_load2_; ++current) { - __m128* mmxs_lerp = - (__m128*)(load2_shuffle_masks_ + current * CHANNELS * 2); - __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; + __m128 *mmxs_lerp = + (__m128 *)(load2_shuffle_masks_ + current * CHANNELS * 2); + __m128i *shuffle_masks = (__m128i *)mmxs_lerp + CHANNELS; #ifdef __AVX2__ __m256 left0, left1, left2, right0, right1, right2; this->load2_3ch(ysA_input_lower_ptr, ysA_input_upper_ptr, @@ -4536,10 +4474,10 @@ void CropResizeCastImage::ResizeRow_load2_3ch_( // 3 channel image. 
template void CropResizeCastImage::ResizeRow_load4_3ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr) { for (int current = 0; current < num_load4_; ++current) { - __m128* mmxs_lerp = (__m128*)(load4_mmxs_lerp_ + current * CHANNELS); + __m128 *mmxs_lerp = (__m128 *)(load4_mmxs_lerp_ + current * CHANNELS); #ifdef __AVX2__ __m256 left0, left1, left2, right0, right1, right2; this->load4_3ch( @@ -4601,10 +4539,10 @@ void CropResizeCastImage::ResizeRow_load4_3ch_( // 3 channel image. template void CropResizeCastImage::ResizeRow_load8_3ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr) { for (int current = 0; current < num_load8_; ++current) { - __m128* mmxs_lerp = (__m128*)(load8_mmxs_lerp_ + current * CHANNELS); + __m128 *mmxs_lerp = (__m128 *)(load8_mmxs_lerp_ + current * CHANNELS); #ifdef __AVX2__ __m256 left0, left1, left2, right0, right1, right2; this->load8_3ch( @@ -4669,12 +4607,12 @@ void CropResizeCastImage::ResizeRow_load8_3ch_( // 4 channel image. 
template void CropResizeCastImage::ResizeRow_load1_4ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr) { for (int current = 0; current < num_load1_; ++current) { - __m128* mmxs_lerp = - (__m128*)(load1_shuffle_masks_ + current * CHANNELS * 3); - __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; + __m128 *mmxs_lerp = + (__m128 *)(load1_shuffle_masks_ + current * CHANNELS * 3); + __m128i *shuffle_masks = (__m128i *)mmxs_lerp + CHANNELS; #ifdef __AVX2__ __m256 left0, left1, left2, left3, right0, right1, right2, right3; this->load1_4ch(ysA_input_lower_ptr, ysA_input_upper_ptr, @@ -4744,12 +4682,12 @@ void CropResizeCastImage::ResizeRow_load1_4ch_( // 4 channel image. template void CropResizeCastImage::ResizeRow_load2_4ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr) { for (int current = 0; current < num_load2_; ++current) { - __m128* mmxs_lerp = - (__m128*)(load2_shuffle_masks_ + current * CHANNELS * 2); - __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; + __m128 *mmxs_lerp = + (__m128 *)(load2_shuffle_masks_ + current * CHANNELS * 2); + __m128i *shuffle_masks = (__m128i *)mmxs_lerp + CHANNELS; #ifdef __AVX2__ __m256 left0, left1, left2, left3, right0, right1, right2, right3; this->load2_4ch(ysA_input_lower_ptr, ysA_input_upper_ptr, @@ -4819,10 +4757,10 @@ void CropResizeCastImage::ResizeRow_load2_4ch_( // 4 channel image. 
template void CropResizeCastImage::ResizeRow_load4_4ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr) { for (int current = 0; current < num_load4_; ++current) { - __m128* mmxs_lerp = (__m128*)(load4_mmxs_lerp_ + current * CHANNELS); + __m128 *mmxs_lerp = (__m128 *)(load4_mmxs_lerp_ + current * CHANNELS); #ifdef __AVX2__ __m256 left0, left1, left2, left3, right0, right1, right2, right3; this->load4_4ch( @@ -4895,10 +4833,10 @@ void CropResizeCastImage::ResizeRow_load4_4ch_( // 4 channel image. template void CropResizeCastImage::ResizeRow_load8_4ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { + const __m128 y_lerp, const T *ysA_input_lower_ptr, + const T *ysA_input_upper_ptr, U *ysA_output_ptr) { for (int current = 0; current < num_load8_; ++current) { - __m128* mmxs_lerp = (__m128*)(load8_mmxs_lerp_ + current * CHANNELS); + __m128 *mmxs_lerp = (__m128 *)(load8_mmxs_lerp_ + current * CHANNELS); #ifdef __AVX2__ __m256 left0, left1, left2, left3, right0, right1, right2, right3; this->load8_4ch( @@ -4969,22 +4907,23 @@ void CropResizeCastImage::ResizeRow_load8_4ch_( } #undef CHANNELS -template -void CropResizeCastImage::Configure_() { +template void CropResizeCastImage::Configure_() { // num_cases[0] = general case // num_cases[1] = load4from1 // num_cases[2] = load4from2 // num_cases[3] = load4from4 // num_cases[4] = load4from8 int num_cases[5]; - for (int i = 0; i < 5; ++i) num_cases[i] = 0; + for (int i = 0; i < 5; ++i) + num_cases[i] = 0; for (int x = x0_; x <= x1_; ++x) { int load_group = this->DetermineLoadGroup_(x); assert(load_group >= 0 && load_group <= 4); ++num_cases[load_group]; // load_group == 0 -> general case, pixel by pixel // every other value indidcates 1+3 = 4 pixels were processed this iteration - if (load_group > 0) x += 3; 
+ if (load_group > 0) + x += 3; } num_general_ = num_cases[0]; num_load1_ = num_cases[1]; @@ -4999,7 +4938,7 @@ void CropResizeCastImage::Configure_() { if (num_load1_ > 0) { load1_offsets_ = new int[num_load1_]; load1_shuffle_masks_ = new __m128i[num_load1_ * channels_ * 3]; - load1_mmxs_lerp_ = NULL; // new __m128[num_load1_*channels_]; + load1_mmxs_lerp_ = NULL; // new __m128[num_load1_*channels_]; load1_x_ = new int[num_load1_]; } else { load1_offsets_ = NULL; @@ -5010,7 +4949,7 @@ void CropResizeCastImage::Configure_() { if (num_load2_ > 0) { load2_offsets_ = new int[num_load2_]; load2_shuffle_masks_ = new __m128i[num_load2_ * channels_ * 2]; - load2_mmxs_lerp_ = NULL; // new __m128[num_load2_*channels_]; + load2_mmxs_lerp_ = NULL; // new __m128[num_load2_*channels_]; load2_x_ = new int[num_load2_]; } else { load2_offsets_ = NULL; @@ -5036,7 +4975,8 @@ void CropResizeCastImage::Configure_() { load8_mmxs_lerp_ = NULL; load8_x_ = NULL; } - for (int i = 0; i < 5; ++i) num_cases[i] = 0; + for (int i = 0; i < 5; ++i) + num_cases[i] = 0; if (verbose_) { printf(" load4from1 = %d\n", num_load1_); printf(" load4from2 = %d\n", num_load2_); @@ -5060,17 +5000,19 @@ void CropResizeCastImage::Configure_() { int min_xidx, max_xidx; ComputeXIndexRange_(x, &min_xidx, &max_xidx); load1_offsets_[current] = min_xidx * channels_; - float* xs_lerp = (float*)(load1_shuffle_masks_ + current * channels_ * 3); - char* shufmasks1 = - (char*)(load1_shuffle_masks_ + current * channels_ * 3 + channels_); - char* shufmasks2 = shufmasks1 + 16 * channels_; - for (int j = 0; j < 32 * channels_; ++j) shufmasks1[j] = -128; + float *xs_lerp = + (float *)(load1_shuffle_masks_ + current * channels_ * 3); + char *shufmasks1 = + (char *)(load1_shuffle_masks_ + current * channels_ * 3 + channels_); + char *shufmasks2 = shufmasks1 + 16 * channels_; + for (int j = 0; j < 32 * channels_; ++j) + shufmasks1[j] = -128; for (int pix = 0; pix < 4; ++pix) { const int ix = flip_x_ ? 
out_width_ - 1 - min_ix_ - (x + pix) : (x + pix) - min_ix_; float lerp = xs_[ix].lerp; int widx0 = xs_[ix].lower - - load1_offsets_[current]; // word index within SSE vector + load1_offsets_[current]; // word index within SSE vector for (int ch = 0; ch < channels_; ++ch) { int idx = pix * channels_ + ch; xs_lerp[idx] = lerp; @@ -5092,16 +5034,18 @@ void CropResizeCastImage::Configure_() { int min_xidx, max_xidx; ComputeXIndexRange_(x, &min_xidx, &max_xidx); load2_offsets_[current] = min_xidx * channels_; - float* xs_lerp = (float*)(load2_shuffle_masks_ + current * channels_ * 2); - char* shufmasks1 = - (char*)(load2_shuffle_masks_ + current * channels_ * 2 + channels_); - for (int j = 0; j < 16 * channels_; ++j) shufmasks1[j] = -128; + float *xs_lerp = + (float *)(load2_shuffle_masks_ + current * channels_ * 2); + char *shufmasks1 = + (char *)(load2_shuffle_masks_ + current * channels_ * 2 + channels_); + for (int j = 0; j < 16 * channels_; ++j) + shufmasks1[j] = -128; for (int pix = 0; pix < 4; ++pix) { const int ix = flip_x_ ? out_width_ - 1 - min_ix_ - (x + pix) : (x + pix) - min_ix_; float lerp = xs_[ix].lerp; int widx0 = xs_[ix].lower - - load2_offsets_[current]; // word index within SSE vector + load2_offsets_[current]; // word index within SSE vector for (int ch = 0; ch < channels_; ++ch) { int idx = pix * channels_ + ch; xs_lerp[idx] = lerp; @@ -5118,8 +5062,8 @@ void CropResizeCastImage::Configure_() { // load4from4 assert(current < num_load4_); load4_x_[current] = x; - int* index = load4_offsets_ + current * 4; - float* xs_lerp = (float*)(load4_mmxs_lerp_ + current * channels_); + int *index = load4_offsets_ + current * 4; + float *xs_lerp = (float *)(load4_mmxs_lerp_ + current * channels_); for (int pix = 0; pix < 4; ++pix) { const int ix = flip_x_ ? 
out_width_ - 1 - min_ix_ - (x + pix) : (x + pix) - min_ix_; @@ -5134,8 +5078,8 @@ void CropResizeCastImage::Configure_() { // load4from8 assert(current < num_load8_); load8_x_[current] = x; - int* index = load8_offsets_ + current * 4; - float* xs_lerp = (float*)(load8_mmxs_lerp_ + current * channels_); + int *index = load8_offsets_ + current * 4; + float *xs_lerp = (float *)(load8_mmxs_lerp_ + current * channels_); for (int pix = 0; pix < 4; ++pix) { const int ix = flip_x_ ? out_width_ - 1 - min_ix_ - (x + pix) : (x + pix) - min_ix_; @@ -5152,7 +5096,8 @@ void CropResizeCastImage::Configure_() { ++num_cases[load_group]; // load_group == 0 -> general case, pixel by pixel // every other value indidcates 1+3 = 4 pixels were processed this iteration - if (load_group > 0) x += 3; + if (load_group > 0) + x += 3; } } @@ -5198,8 +5143,8 @@ int CropResizeCastImage::DetermineLoadGroup_(const int x) { // Compute range of x indexes for xs[0] through xs[3]. // Returns true if valid (xs[i].lower + channels == xs[i].upper for all pixels). 
template -bool CropResizeCastImage::ComputeXIndexRange_(const int x, int* min_xidx, - int* max_xidx) { +bool CropResizeCastImage::ComputeXIndexRange_(const int x, int *min_xidx, + int *max_xidx) { bool upper_is_lower_plus_one = true; *min_xidx = 0; *max_xidx = -1; @@ -5212,8 +5157,10 @@ bool CropResizeCastImage::ComputeXIndexRange_(const int x, int* min_xidx, *min_xidx = curr_xidx; *max_xidx = curr_xidx; } else { - if (curr_xidx < *min_xidx) *min_xidx = curr_xidx; - if (curr_xidx > *max_xidx) *max_xidx = curr_xidx; + if (curr_xidx < *min_xidx) + *min_xidx = curr_xidx; + if (curr_xidx > *max_xidx) + *max_xidx = curr_xidx; } } else { upper_is_lower_plus_one = false; @@ -5313,206 +5260,158 @@ bool CropResizeCastImage::Load8_ok_(const int min_xidx, // full implementations of templated static member function clip_necessary() // -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool 
CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool 
CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool 
CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return true; } -template <> -bool CropResizeCastImage::clip_necessary() { +template <> bool CropResizeCastImage::clip_necessary() { return false; } @@ -5524,14 +5423,14 @@ bool CropResizeCastImage::clip_necessary() { #define CROP_RESIZE_SINGLE_IMAGE_VECT(T_type, U_type) \ template <> \ void crop_resize_single_image_common( \ - const T_type* image, const int64 in_height, const int64 in_width, \ + const T_type *image, const int64 in_height, const int64 in_width, \ const int64 out_height, const int64 out_width, const int channels, \ - const int min_ix, const int max_ix, const CachedInterpolation* xs, \ - const int min_iy, const int max_iy, const CachedInterpolation* ys, \ + const int min_ix, const int max_ix, const CachedInterpolation *xs, \ + const int min_iy, const int max_iy, const CachedInterpolation *ys, \ const 
float extrapolated_value, const bool flip_x, const bool flip_y, \ - U_type* output) { \ + U_type *output) { \ if (channels <= 4) { \ - CropResizeCastImage* resizer = \ + CropResizeCastImage *resizer = \ new CropResizeCastImage( \ in_height, in_width, out_height, out_width, channels, min_ix, \ max_ix, xs, min_iy, max_iy, ys, extrapolated_value, flip_x, \ @@ -5560,19 +5459,19 @@ CROP_RESIZE_SINGLE_IMAGE_VECT(float, float) // image resizing for these data types default to the original code. // at the moment, this is int64 and double. -#define CROP_RESIZE_SINGLE_IMAGE_REGULAR(T_type, U_type) \ - template <> \ - void crop_resize_single_image_common( \ - const T_type* image, const int64 in_height, const int64 in_width, \ - const int64 out_height, const int64 out_width, const int channels, \ - const int min_ix, const int max_ix, const CachedInterpolation* xs, \ - const int min_iy, const int max_iy, const CachedInterpolation* ys, \ - const float extrapolated_value, const bool flip_x, const bool flip_y, \ - U_type* output) { \ - crop_resize_single_image(image, in_height, in_width, out_height, \ - out_width, channels, min_ix, max_ix, xs, min_iy, \ - max_iy, ys, extrapolated_value, flip_x, flip_y, \ - output); \ +#define CROP_RESIZE_SINGLE_IMAGE_REGULAR(T_type, U_type) \ + template <> \ + void crop_resize_single_image_common( \ + const T_type *image, const int64 in_height, const int64 in_width, \ + const int64 out_height, const int64 out_width, const int channels, \ + const int min_ix, const int max_ix, const CachedInterpolation *xs, \ + const int min_iy, const int max_iy, const CachedInterpolation *ys, \ + const float extrapolated_value, const bool flip_x, const bool flip_y, \ + U_type *output) { \ + crop_resize_single_image(image, in_height, in_width, out_height, \ + out_width, channels, min_ix, max_ix, xs, min_iy, \ + max_iy, ys, extrapolated_value, flip_x, flip_y, \ + output); \ } CROP_RESIZE_SINGLE_IMAGE_REGULAR(int64, float) @@ -5586,12 +5485,12 @@ 
CROP_RESIZE_SINGLE_IMAGE_REGULAR(double, float) template void crop_resize_single_image_common( - const T* image, const int64 in_height, const int64 in_width, + const T *image, const int64 in_height, const int64 in_width, const int64 out_height, const int64 out_width, const int channels, - const int min_ix, const int max_ix, const CachedInterpolation* xs, - const int min_iy, const int max_iy, const CachedInterpolation* ys, + const int min_ix, const int max_ix, const CachedInterpolation *xs, + const int min_iy, const int max_iy, const CachedInterpolation *ys, const float extrapolated_value, const bool flip_x, const bool flip_y, - U* output) { + U *output) { crop_resize_single_image(image, in_height, in_width, out_height, out_width, channels, min_ix, max_ix, xs, min_iy, max_iy, ys, extrapolated_value, flip_x, flip_y, output); @@ -5599,6 +5498,6 @@ void crop_resize_single_image_common( #endif -} // namespace -} // namespace tensorflow -#endif // define TENSORFLOW_CORE_KERNELS_CROP_RESIZE_BILINEAR_CORE_H_ +} // namespace +} // namespace tensorflow +#endif // define TENSORFLOW_CORE_KERNELS_CROP_RESIZE_BILINEAR_CORE_H_ From 03d5e3a74c5a404fba00da0cbe61cfd55acbf950 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Tue, 2 Oct 2018 18:52:31 -0700 Subject: [PATCH 024/540] Change inherited classes for FixedLengthRecordDataset and TextLineDataset --- tensorflow/python/data/ops/readers.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py index d08da6704ca..15ed4432644 100644 --- a/tensorflow/python/data/ops/readers.py +++ b/tensorflow/python/data/ops/readers.py @@ -32,7 +32,7 @@ _DEFAULT_READER_BUFFER_SIZE_BYTES = 256 * 1024 # 256 KB @tf_export("data.TextLineDataset") -class TextLineDataset(dataset_ops.Dataset): +class TextLineDataset(dataset_ops.DatasetSource): """A `Dataset` comprising lines from one or more text files.""" def __init__(self, filenames, compression_type=None, 
buffer_size=None): @@ -61,9 +61,6 @@ class TextLineDataset(dataset_ops.Dataset): return gen_dataset_ops.text_line_dataset( self._filenames, self._compression_type, self._buffer_size) - def _inputs(self): - return [] - @property def output_classes(self): return ops.Tensor @@ -247,7 +244,7 @@ class TFRecordDataset(dataset_ops.Dataset): @tf_export("data.FixedLengthRecordDataset") -class FixedLengthRecordDataset(dataset_ops.Dataset): +class FixedLengthRecordDataset(dataset_ops.DatasetSource): """A `Dataset` of fixed-length records from one or more binary files.""" def __init__(self, @@ -287,9 +284,6 @@ class FixedLengthRecordDataset(dataset_ops.Dataset): self._filenames, self._header_bytes, self._record_bytes, self._footer_bytes, self._buffer_size) - def _inputs(self): - return [] - @property def output_classes(self): return ops.Tensor From b334e6300171dbb7bd7c9f49dbfe247c54c19dab Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Tue, 2 Oct 2018 21:40:38 -0700 Subject: [PATCH 025/540] Change _TFRecordDataset to be herited from DatasetSource --- tensorflow/python/data/ops/readers.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py index 15ed4432644..b0a7b5b217b 100644 --- a/tensorflow/python/data/ops/readers.py +++ b/tensorflow/python/data/ops/readers.py @@ -74,7 +74,7 @@ class TextLineDataset(dataset_ops.DatasetSource): return dtypes.string -class _TFRecordDataset(dataset_ops.Dataset): +class _TFRecordDataset(dataset_ops.DatasetSource): """A `Dataset` comprising records from one or more TFRecord files.""" def __init__(self, filenames, compression_type=None, buffer_size=None): @@ -105,9 +105,6 @@ class _TFRecordDataset(dataset_ops.Dataset): return gen_dataset_ops.tf_record_dataset( self._filenames, self._compression_type, self._buffer_size) - def _inputs(self): - return [] - @property def output_classes(self): return ops.Tensor From 858812d234cd75a2aed69ac7def94fde7716b12b Mon 
Sep 17 00:00:00 2001 From: Fei Hu Date: Thu, 18 Oct 2018 11:02:57 -0700 Subject: [PATCH 026/540] Change the API golden file --- .../golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt | 1 + .../tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt | 1 + .../golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt | 1 + .../tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt | 1 + 4 files changed, 4 insertions(+) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt index a7bfa82c650..790005d228d 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt @@ -1,6 +1,7 @@ path: "tensorflow.data.FixedLengthRecordDataset" tf_class { is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt index 2817f900e15..1c305abf68c 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt @@ -1,6 +1,7 @@ path: "tensorflow.data.TextLineDataset" tf_class { is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt index a7bfa82c650..790005d228d 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt @@ -1,6 +1,7 @@ path: "tensorflow.data.FixedLengthRecordDataset" tf_class { is_instance: "" + is_instance: "" is_instance: 
"" is_instance: "" member { diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt index 2817f900e15..1c305abf68c 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt @@ -1,6 +1,7 @@ path: "tensorflow.data.TextLineDataset" tf_class { is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { From 6b742eea45eea5cc312d0428978e629537ea85d9 Mon Sep 17 00:00:00 2001 From: Grzegorz Pawelczak Date: Fri, 19 Oct 2018 15:11:53 +0100 Subject: [PATCH 027/540] [XLA] Add simplifications for logical and and logical or in AlgebraicSimplifier pass --- .../xla/service/algebraic_simplifier.cc | 68 +++++++ .../xla/service/algebraic_simplifier_test.cc | 168 ++++++++++++++++++ 2 files changed, 236 insertions(+) diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 72ed5ca4821..731378267cc 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -107,6 +107,8 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { Status HandleAdd(HloInstruction* add) override; + Status HandleAnd(HloInstruction* logical_and) override; + Status HandleBitcast(HloInstruction* bitcast) override; Status HandleBitcastConvert(HloInstruction* bitcast) override; @@ -141,6 +143,8 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { Status HandleMultiply(HloInstruction* multiply) override; + Status HandleOr(HloInstruction* logical_or) override; + Status HandlePad(HloInstruction* pad) override; Status HandlePower(HloInstruction* power) override; @@ -419,6 +423,38 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) { return Status::OK(); } +Status 
AlgebraicSimplifierVisitor::HandleAnd(HloInstruction* logical_and) { + HloInstruction *lhs, *rhs; + CHECK(Match(logical_and, m::And(m::Op(&lhs), m::Op(&rhs)))); + + // A && True => A + VLOG(10) << "trying transform [A && True => A]: " << logical_and->ToString(); + if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(logical_and, lhs)) { + return Status::OK(); + } + // True && A => A + VLOG(10) << "trying transform [True && A => A]: " << logical_and->ToString(); + if (IsAll(lhs, 1) && ReplaceInstructionIfSameShape(logical_and, rhs)) { + return Status::OK(); + } + + // A && False => False + VLOG(10) << "trying transform [A && False => False]: " + << logical_and->ToString(); + if (IsAll(rhs, 0) && ReplaceInstructionIfSameShape(logical_and, rhs)) { + return Status::OK(); + } + + // False && A => False + VLOG(10) << "trying transform [False && A => False]: " + << logical_and->ToString(); + if (IsAll(lhs, 0) && ReplaceInstructionIfSameShape(logical_and, lhs)) { + return Status::OK(); + } + + return Status::OK(); +} + Status AlgebraicSimplifierVisitor::HandleBitcast(HloInstruction* bitcast) { // If a bitcast feeds a bitcast, make it a single bitcast. 
HloInstruction* op; @@ -1225,6 +1261,38 @@ Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply) { return Status::OK(); } +Status AlgebraicSimplifierVisitor::HandleOr(HloInstruction* logical_or) { + HloInstruction *lhs, *rhs; + CHECK(Match(logical_or, m::Or(m::Op(&lhs), m::Op(&rhs)))); + + // A || True => True + VLOG(10) << "trying transform [A || True => True]: " + << logical_or->ToString(); + if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(logical_or, rhs)) { + return Status::OK(); + } + // True || A => True + VLOG(10) << "trying transform [True || A => True]: " + << logical_or->ToString(); + if (IsAll(lhs, 1) && ReplaceInstructionIfSameShape(logical_or, lhs)) { + return Status::OK(); + } + + // A || False => A + VLOG(10) << "trying transform [A || False => A]: " << logical_or->ToString(); + if (IsAll(rhs, 0) && ReplaceInstructionIfSameShape(logical_or, lhs)) { + return Status::OK(); + } + + // False || A => A + VLOG(10) << "trying transform [False || A => A]: " << logical_or->ToString(); + if (IsAll(lhs, 0) && ReplaceInstructionIfSameShape(logical_or, rhs)) { + return Status::OK(); + } + + return Status::OK(); +} + Status AlgebraicSimplifierVisitor::HandleLog(HloInstruction* log) { // ln(exp(A)) => A VLOG(10) << "trying transform [ln(exp(A)) => A]: " << log->ToString(); diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index c79c518700b..448d31803ba 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -2216,6 +2216,174 @@ TEST_F(AlgebraicSimplifierTest, ReplaceEffectiveScalarKeyValueSortWithTuple) { op::Tuple(keys, values0, values1)); } +// Test that A && True is simplified to A +TEST_F(AlgebraicSimplifierTest, AndTrue) { + Shape r0pred = ShapeUtil::MakeShape(PRED, {}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = 
builder.AddInstruction( + HloInstruction::CreateParameter(0, r0pred, "param0")); + HloInstruction* const_true = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); + builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kAnd, + param0, const_true)); + + auto computation = module().AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kAnd); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + root = computation->root_instruction(); + EXPECT_EQ(root, param0); +} + +// Test that True && A is simplified to A +TEST_F(AlgebraicSimplifierTest, AndTrue2) { + Shape r0pred = ShapeUtil::MakeShape(PRED, {}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r0pred, "param0")); + HloInstruction* const_true = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); + builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kAnd, + const_true, param0)); + + auto computation = module().AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kAnd); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + root = computation->root_instruction(); + EXPECT_EQ(root, param0); +} + +// Test that A && False is simplified to False +TEST_F(AlgebraicSimplifierTest, AndFalse) { + Shape r0pred = ShapeUtil::MakeShape(PRED, {}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r0pred, "param0")); + HloInstruction* const_false = builder.AddInstruction( + 
HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); + builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kAnd, + param0, const_false)); + + auto computation = module().AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kAnd); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + root = computation->root_instruction(); + EXPECT_EQ(root, const_false); +} + +// Test that False && A is simplified to False +TEST_F(AlgebraicSimplifierTest, AndFalse2) { + Shape r0pred = ShapeUtil::MakeShape(PRED, {}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r0pred, "param0")); + HloInstruction* const_false = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); + builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kAnd, + const_false, param0)); + + auto computation = module().AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kAnd); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + root = computation->root_instruction(); + EXPECT_EQ(root, const_false); +} + +// Test that A || True is simplified to True +TEST_F(AlgebraicSimplifierTest, OrTrue) { + Shape r0pred = ShapeUtil::MakeShape(PRED, {}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r0pred, "param0")); + HloInstruction* const_true = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); + builder.AddInstruction( + HloInstruction::CreateBinary(r0pred, HloOpcode::kOr, param0, 
const_true)); + + auto computation = module().AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kOr); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + root = computation->root_instruction(); + EXPECT_EQ(root, const_true); +} + +// Test that True || A is simplified to True +TEST_F(AlgebraicSimplifierTest, OrTrue2) { + Shape r0pred = ShapeUtil::MakeShape(PRED, {}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r0pred, "param0")); + HloInstruction* const_true = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); + builder.AddInstruction( + HloInstruction::CreateBinary(r0pred, HloOpcode::kOr, const_true, param0)); + + auto computation = module().AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kOr); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + root = computation->root_instruction(); + EXPECT_EQ(root, const_true); +} + +// Test that A || False is simplified to A +TEST_F(AlgebraicSimplifierTest, OrFalse) { + Shape r0pred = ShapeUtil::MakeShape(PRED, {}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r0pred, "param0")); + HloInstruction* const_false = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); + builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kOr, + param0, const_false)); + + auto computation = module().AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + 
EXPECT_EQ(root->opcode(), HloOpcode::kOr); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + root = computation->root_instruction(); + EXPECT_EQ(root, param0); +} + +// Test that False || A is simplified to A +TEST_F(AlgebraicSimplifierTest, OrFalse2) { + Shape r0pred = ShapeUtil::MakeShape(PRED, {}); + HloComputation::Builder builder(TestName()); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r0pred, "param0")); + HloInstruction* const_false = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(false))); + builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kOr, + const_false, param0)); + + auto computation = module().AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kOr); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + root = computation->root_instruction(); + EXPECT_EQ(root, param0); +} + // Used for TEST_Ps that test merging (or not) of a kPad instruction into a // convolution's Window. 
struct ConvPaddingTestcase { From 8a3632e4f90f0ff4c09939b82cd42857aef08bf1 Mon Sep 17 00:00:00 2001 From: Grzegorz Pawelczak Date: Fri, 19 Oct 2018 16:12:26 +0100 Subject: [PATCH 028/540] Only simplify logical and --- .../xla/service/algebraic_simplifier.cc | 153 ++++++++++-------- 1 file changed, 84 insertions(+), 69 deletions(-) diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 731378267cc..4658859c538 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -426,29 +426,36 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) { Status AlgebraicSimplifierVisitor::HandleAnd(HloInstruction* logical_and) { HloInstruction *lhs, *rhs; CHECK(Match(logical_and, m::And(m::Op(&lhs), m::Op(&rhs)))); + // Simplify logical and + if (ShapeUtil::HasPrimitiveType(lhs->shape(), xla::PRED) && + ShapeUtil::HasPrimitiveType(rhs->shape(), xla::PRED)) { + // A && True => A + VLOG(10) << "trying transform [A && True => A]: " + << logical_and->ToString(); + if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(logical_and, lhs)) { + return Status::OK(); + } + // True && A => A + VLOG(10) << "trying transform [True && A => A]: " + << logical_and->ToString(); + if (IsAll(lhs, 1) && ReplaceInstructionIfSameShape(logical_and, rhs)) { + return Status::OK(); + } - // A && True => A - VLOG(10) << "trying transform [A && True => A]: " << logical_and->ToString(); - if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(logical_and, lhs)) { - return Status::OK(); - } - // True && A => A - VLOG(10) << "trying transform [True && A => A]: " << logical_and->ToString(); - if (IsAll(lhs, 1) && ReplaceInstructionIfSameShape(logical_and, rhs)) { - return Status::OK(); - } + // A && False => False + VLOG(10) << "trying transform [A && False => False]: " + << logical_and->ToString(); + if (IsAll(rhs, 0) && 
ReplaceInstructionIfSameShape(logical_and, rhs)) { + return Status::OK(); + } - // A && False => False - VLOG(10) << "trying transform [A && False => False]: " - << logical_and->ToString(); - if (IsAll(rhs, 0) && ReplaceInstructionIfSameShape(logical_and, rhs)) { - return Status::OK(); - } + // False && A => False + VLOG(10) << "trying transform [False && A => False]: " + << logical_and->ToString(); + if (IsAll(lhs, 0) && ReplaceInstructionIfSameShape(logical_and, lhs)) { + return Status::OK(); + } - // False && A => False - VLOG(10) << "trying transform [False && A => False]: " - << logical_and->ToString(); - if (IsAll(lhs, 0) && ReplaceInstructionIfSameShape(logical_and, lhs)) { return Status::OK(); } @@ -1041,35 +1048,35 @@ StatusOr AlgebraicSimplifierVisitor::OptimizeDotOfGather( // Optimize either dot(DS(ctA), ctB)) or dot(ctB, DS(ctA)). // Currently a Gather is a DynamicSlice. - auto is_dynamic_slice_constant_combination = - [](HloInstruction* a, HloInstruction* b, int a_contracting_dimension) { - // First operand is a DynamicSlice(Constant). - if (a->opcode() != HloOpcode::kDynamicSlice) { - return false; - } - auto* dynamic_slice_op = a->operand(0); - if (dynamic_slice_op->opcode() != HloOpcode::kConstant) { - return false; - } - // Second operand is a Constant. - if (b->opcode() != HloOpcode::kConstant) { - return false; - } - // The DynamicSlice output is a vector. - const Shape& dynamic_slice_shape = a->shape(); - if (dynamic_slice_shape.dimensions(1 - a_contracting_dimension) != 1) { - return false; - } - // Constant size is the same before and after slice in the contracting - // dimension, otherwise we either must precompute for all possible slice - // indices or dot is invalid. 
- const Shape& dynamic_slice_op_shape = dynamic_slice_op->shape(); - if (dynamic_slice_op_shape.dimensions(a_contracting_dimension) != - dynamic_slice_shape.dimensions(a_contracting_dimension)) { - return false; - } - return true; - }; + auto is_dynamic_slice_constant_combination = []( + HloInstruction* a, HloInstruction* b, int a_contracting_dimension) { + // First operand is a DynamicSlice(Constant). + if (a->opcode() != HloOpcode::kDynamicSlice) { + return false; + } + auto* dynamic_slice_op = a->operand(0); + if (dynamic_slice_op->opcode() != HloOpcode::kConstant) { + return false; + } + // Second operand is a Constant. + if (b->opcode() != HloOpcode::kConstant) { + return false; + } + // The DynamicSlice output is a vector. + const Shape& dynamic_slice_shape = a->shape(); + if (dynamic_slice_shape.dimensions(1 - a_contracting_dimension) != 1) { + return false; + } + // Constant size is the same before and after slice in the contracting + // dimension, otherwise we either must precompute for all possible slice + // indices or dot is invalid. 
+ const Shape& dynamic_slice_op_shape = dynamic_slice_op->shape(); + if (dynamic_slice_op_shape.dimensions(a_contracting_dimension) != + dynamic_slice_shape.dimensions(a_contracting_dimension)) { + return false; + } + return true; + }; HloInstruction* lhs = dot->mutable_operand(0); HloInstruction* rhs = dot->mutable_operand(1); @@ -1265,28 +1272,36 @@ Status AlgebraicSimplifierVisitor::HandleOr(HloInstruction* logical_or) { HloInstruction *lhs, *rhs; CHECK(Match(logical_or, m::Or(m::Op(&lhs), m::Op(&rhs)))); - // A || True => True - VLOG(10) << "trying transform [A || True => True]: " - << logical_or->ToString(); - if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(logical_or, rhs)) { - return Status::OK(); - } - // True || A => True - VLOG(10) << "trying transform [True || A => True]: " - << logical_or->ToString(); - if (IsAll(lhs, 1) && ReplaceInstructionIfSameShape(logical_or, lhs)) { - return Status::OK(); - } + // Simplify logical or + if (ShapeUtil::HasPrimitiveType(lhs->shape(), xla::PRED) && + ShapeUtil::HasPrimitiveType(rhs->shape(), xla::PRED)) { + // A || True => True + VLOG(10) << "trying transform [A || True => True]: " + << logical_or->ToString(); + if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(logical_or, rhs)) { + return Status::OK(); + } + // True || A => True + VLOG(10) << "trying transform [True || A => True]: " + << logical_or->ToString(); + if (IsAll(lhs, 1) && ReplaceInstructionIfSameShape(logical_or, lhs)) { + return Status::OK(); + } - // A || False => A - VLOG(10) << "trying transform [A || False => A]: " << logical_or->ToString(); - if (IsAll(rhs, 0) && ReplaceInstructionIfSameShape(logical_or, lhs)) { - return Status::OK(); - } + // A || False => A + VLOG(10) << "trying transform [A || False => A]: " + << logical_or->ToString(); + if (IsAll(rhs, 0) && ReplaceInstructionIfSameShape(logical_or, lhs)) { + return Status::OK(); + } + + // False || A => A + VLOG(10) << "trying transform [False || A => A]: " + << logical_or->ToString(); + 
if (IsAll(lhs, 0) && ReplaceInstructionIfSameShape(logical_or, rhs)) { + return Status::OK(); + } - // False || A => A - VLOG(10) << "trying transform [False || A => A]: " << logical_or->ToString(); - if (IsAll(lhs, 0) && ReplaceInstructionIfSameShape(logical_or, rhs)) { return Status::OK(); } From a3e4e4e1ada168b776822c72320eed60cf59ce48 Mon Sep 17 00:00:00 2001 From: Grzegorz Pawelczak Date: Fri, 19 Oct 2018 16:17:16 +0100 Subject: [PATCH 029/540] Use correct clang-format style --- .../xla/service/algebraic_simplifier.cc | 58 +++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 4658859c538..9ee43202678 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -1048,35 +1048,35 @@ StatusOr AlgebraicSimplifierVisitor::OptimizeDotOfGather( // Optimize either dot(DS(ctA), ctB)) or dot(ctB, DS(ctA)). // Currently a Gather is a DynamicSlice. - auto is_dynamic_slice_constant_combination = []( - HloInstruction* a, HloInstruction* b, int a_contracting_dimension) { - // First operand is a DynamicSlice(Constant). - if (a->opcode() != HloOpcode::kDynamicSlice) { - return false; - } - auto* dynamic_slice_op = a->operand(0); - if (dynamic_slice_op->opcode() != HloOpcode::kConstant) { - return false; - } - // Second operand is a Constant. - if (b->opcode() != HloOpcode::kConstant) { - return false; - } - // The DynamicSlice output is a vector. - const Shape& dynamic_slice_shape = a->shape(); - if (dynamic_slice_shape.dimensions(1 - a_contracting_dimension) != 1) { - return false; - } - // Constant size is the same before and after slice in the contracting - // dimension, otherwise we either must precompute for all possible slice - // indices or dot is invalid. 
- const Shape& dynamic_slice_op_shape = dynamic_slice_op->shape(); - if (dynamic_slice_op_shape.dimensions(a_contracting_dimension) != - dynamic_slice_shape.dimensions(a_contracting_dimension)) { - return false; - } - return true; - }; + auto is_dynamic_slice_constant_combination = + [](HloInstruction* a, HloInstruction* b, int a_contracting_dimension) { + // First operand is a DynamicSlice(Constant). + if (a->opcode() != HloOpcode::kDynamicSlice) { + return false; + } + auto* dynamic_slice_op = a->operand(0); + if (dynamic_slice_op->opcode() != HloOpcode::kConstant) { + return false; + } + // Second operand is a Constant. + if (b->opcode() != HloOpcode::kConstant) { + return false; + } + // The DynamicSlice output is a vector. + const Shape& dynamic_slice_shape = a->shape(); + if (dynamic_slice_shape.dimensions(1 - a_contracting_dimension) != 1) { + return false; + } + // Constant size is the same before and after slice in the contracting + // dimension, otherwise we either must precompute for all possible slice + // indices or dot is invalid. + const Shape& dynamic_slice_op_shape = dynamic_slice_op->shape(); + if (dynamic_slice_op_shape.dimensions(a_contracting_dimension) != + dynamic_slice_shape.dimensions(a_contracting_dimension)) { + return false; + } + return true; + }; HloInstruction* lhs = dot->mutable_operand(0); HloInstruction* rhs = dot->mutable_operand(1); From f58d035e84d0af4abc660bce318691214e97c180 Mon Sep 17 00:00:00 2001 From: Balint Cristian Date: Sat, 20 Oct 2018 10:03:41 +0300 Subject: [PATCH 030/540] Enhance GPU detection. 
--- tensorflow/contrib/cmake/CMakeLists.txt | 168 +++++++++++------- .../contrib/cmake/tf_core_kernels.cmake | 8 - tensorflow/contrib/cmake/tf_core_ops.cmake | 1 - tensorflow/contrib/cmake/tf_python.cmake | 2 - 4 files changed, 105 insertions(+), 74 deletions(-) diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt index fbdca497fcc..a63366e1361 100644 --- a/tensorflow/contrib/cmake/CMakeLists.txt +++ b/tensorflow/contrib/cmake/CMakeLists.txt @@ -59,8 +59,6 @@ option(tensorflow_ENABLE_MKLDNN_SUPPORT "Enable Intel MKLDNN support, requires M # GPU, CUDA and cuDNN options option(tensorflow_ENABLE_GPU "Enable GPU support" OFF) -set(tensorflow_CUDA_VERSION "9.0" CACHE STRING "CUDA version to build against") -set(tensorflow_CUDNN_VERSION "7" CACHE STRING "cuDNN version to build against") if(HAIKU) option(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE "Enable PIE support" OFF) @@ -72,25 +70,25 @@ endif() if (NOT WIN32) # Threads: defines CMAKE_THREAD_LIBS_INIT and adds -pthread compile option # for targets that link ${CMAKE_THREAD_LIBS_INIT}. - find_package (Threads) + find_package (Threads REQUIRED) # Options for linking CUDA/CUDNN libraries - option(tensorflow_PATH_STATIC_LIB "Additional library search path for libcudnn_static.a, libnccl_static.a, libculibos.a" /usr/local/cuda/lib64/) + option(tensorflow_PATH_CUDA_LIB "Additional library search path for cudnn, nccl, culibos" /usr/local/cuda/lib64/) option(tensorflow_CUDNN_INCLUDE "cudnn.h header install path" /usr/include/) if (NOT tensorflow_CUDNN_INCLUDE) # option's default value is OFF. 
Fill it with real default values set(tensorflow_CUDNN_INCLUDE /usr/include) endif (NOT tensorflow_CUDNN_INCLUDE) - option(tensorflow_PATH_CUDNN_STATIC_LIB "Override PATH_STATIC_LIB for libcudnn_static.a" ${tensorflow_PATH_STATIC_LIB}) - if (NOT tensorflow_PATH_CUDNN_STATIC_LIB) + option(tensorflow_PATH_CUDNN_LIB "Override PATH_CUDA_LIB for cudnn" ${tensorflow_PATH_CUDA_LIB}) + if (NOT tensorflow_PATH_CUDNN_LIB) # option's default value is OFF. Fill it with real default values - set (tensorflow_PATH_CUDNN_STATIC_LIB ${tensorflow_PATH_STATIC_LIB}) - endif (NOT tensorflow_PATH_CUDNN_STATIC_LIB) - option(tensorflow_PATH_NCCL_STATIC_LIB "Override PATH_STATIC_LIB for libnccl_static.a" ${tensorflow_PATH_STATIC_LIB}) - if (NOT tensorflow_PATH_NCCL_STATIC_LIB) + set (tensorflow_PATH_CUDNN_LIB ${tensorflow_PATH_CUDA_LIB}) + endif (NOT tensorflow_PATH_CUDNN_LIB) + option(tensorflow_PATH_NCCL_LIB "Override PATH_CUDA_LIB for nccl" ${tensorflow_PATH_CUDA_LIB}) + if (NOT tensorflow_PATH_NCCL_LIB) # option's default value is OFF. Fill it with real default values - set (tensorflow_PATH_NCCL_STATIC_LIB ${tensorflow_PATH_STATIC_LIB}) - endif (NOT tensorflow_PATH_NCCL_STATIC_LIB) + set (tensorflow_PATH_NCCL_LIB ${tensorflow_PATH_CUDA_LIB}) + endif (NOT tensorflow_PATH_NCCL_LIB) option(tensorflow_CUDA_LIBRARY_PATH "Designate the default CUDA library paths" /usr/local/cuda/lib64) if (NOT tensorflow_CUDA_LIBRARY_PATH) # option's default value is OFF. 
Fill it with real default values @@ -210,14 +208,17 @@ endif() include(CheckCXXCompilerFlag) # OpenMP Support -CHECK_CXX_COMPILER_FLAG("-fopenmp" GCC_OPENMP_SUPPORT) -if (GCC_OPENMP_SUPPORT) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") -endif() -CHECK_CXX_COMPILER_FLAG("/openmp" MSVC_OPENMP_SUPPORT) -if (MSVC_OPENMP_SUPPORT) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp") -endif() +if (WIN32) + CHECK_CXX_COMPILER_FLAG("/openmp" MSVC_OPENMP_SUPPORT) + if (MSVC_OPENMP_SUPPORT) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp") + endif() +else (WIN32) + CHECK_CXX_COMPILER_FLAG("-fopenmp" GCC_OPENMP_SUPPORT) + if (GCC_OPENMP_SUPPORT) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") + endif() +endif (WIN32) # MSVC SIMD instructions if (tensorflow_WIN_CPU_SIMD_OPTIONS) @@ -377,29 +378,19 @@ if (tensorflow_ENABLE_GPU) list(APPEND CMAKE_LIBRARY_PATH "${tensorflow_CUDA_LIBRARY_PATH}/stubs") endif (NOT WIN32) - # later command will make use of the value in tensorflow_CUDA_VERSION - find_package(CUDA ${tensorflow_CUDA_VERSION} REQUIRED EXACT) - - # Test compatibility of compiler on CUDA - try_compile(CUDA_TEST_COMPILE_C - ${CMAKE_CURRENT_BINARY_DIR}/tests/cuda - ${CMAKE_CURRENT_SOURCE_DIR}/tests/cuda/compatibility_test.c - CMAKE_FLAGS -DINCLUDE_DIRECTORIES=${CUDA_INCLUDE_DIRS}) - try_compile(CUDA_TEST_COMPILE_CXX - ${CMAKE_CURRENT_BINARY_DIR}/tests/cuda - ${CMAKE_CURRENT_SOURCE_DIR}/tests/cuda/compatibility_test.cc - CMAKE_FLAGS -DINCLUDE_DIRECTORIES=${CUDA_INCLUDE_DIRS}) - if(NOT (CUDA_TEST_COMPILE_C AND CUDA_TEST_COMPILE_CXX)) - message(FATAL_ERROR "Selected compiler (or version) is not supported for CUDA") + # minimum 9.1 in cuda version + find_package(CUDA 9.1 REQUIRED) + if(NOT CUDA_FOUND) + message(FATAL_ERROR "CUDA not found.") endif() - # by default we assume compute cabability 3.5 and 5.2. 
If you change this change it in - # CUDA_NVCC_FLAGS and cuda_config.h below - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_37,code=\"sm_37,compute_37\") - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_52,code=\"sm_52,compute_52\") - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_60,code=\"sm_60,compute_60\") - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_61,code=\"sm_61,compute_61\") - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_70,code=\"sm_70,compute_70\") + # use cmake internal CUDA_ARCH_NAME switch + # e.g. CUDA_ARCH_NAME="Auto" will autodetect + # CUDA_ARCH_NAME="All" will use all arches + cuda_select_nvcc_arch_flags(NVCC_ARCH_FLAGS ${CUDA_ARCH_NAME}) + list(APPEND CUDA_NVCC_FLAGS ${NVCC_ARCH_FLAGS}) + message(STATUS "Using CUDA arch flags: ${NVCC_ARCH_FLAGS_readable}") + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--include-path ${PROJECT_BINARY_DIR}/$\{build_configuration\};--expt-relaxed-constexpr) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-ftz=true) # Flush denormals to zero set(CUDA_INCLUDE ${CUDA_TOOLKIT_TARGET_DIR} ${CUDA_TOOLKIT_TARGET_DIR}/extras/CUPTI/include) @@ -423,43 +414,94 @@ if (tensorflow_ENABLE_GPU) else (WIN32) set(CUDNN_INCLUDE "${tensorflow_CUDNN_INCLUDE}") - find_library(nccl_STATIC_LIBRARY NAMES libnccl_static.a PATHS ${tensorflow_PATH_NCCL_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR}) - if (NOT nccl_STATIC_LIBRARY) + if (tensorflow_BUILD_SHARED_LIB) + find_library(nccl_LIBRARY NAMES libnccl.so PATHS ${tensorflow_PATH_NCCL_LIB} ${CUDA_TOOLKIT_ROOT_DIR}) + else (tensorflow_BUILD_SHARED_LIB) + find_library(nccl_LIBRARY NAMES libnccl_static.a PATHS ${tensorflow_PATH_NCCL_LIB} ${CUDA_TOOLKIT_ROOT_DIR}) + endif (tensorflow_BUILD_SHARED_LIB) + if (NOT nccl_LIBRARY) message(FATAL_ERROR "NCCL is required for GPU-build") - else (NOT nccl_STATIC_LIBRARY) - message("nccl-static: ${nccl_STATIC_LIBRARY}") + else (NOT nccl_LIBRARY) + message("nccl: ${nccl_LIBRARY}") # something like 
/usr/lib64/libnccl_static.a - endif (NOT nccl_STATIC_LIBRARY) + endif (NOT nccl_LIBRARY) - find_library(cudnn_STATIC_LIBRARY NAMES libcudnn_static.a PATHS ${tensorflow_PATH_CUDNN_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR}) - if (NOT cudnn_STATIC_LIBRARY) + if (tensorflow_BUILD_SHARED_LIB) + find_library(cudnn_LIBRARY NAMES libcudnn.so PATHS ${tensorflow_PATH_CUDNN_LIB} ${CUDA_TOOLKIT_ROOT_DIR}) + else (tensorflow_BUILD_SHARED_LIB) + find_library(cudnn_LIBRARY NAMES libcudnn_static.a PATHS ${tensorflow_PATH_CUDNN_LIB} ${CUDA_TOOLKIT_ROOT_DIR}) + endif (tensorflow_BUILD_SHARED_LIB) + if (NOT cudnn_LIBRARY) message(FATAL_ERROR "CUDNN is required for GPU-build") - else (NOT cudnn_STATIC_LIBRARY) - message("cudnn-static: ${cudnn_STATIC_LIBRARY}") - endif (NOT cudnn_STATIC_LIBRARY) + else (NOT cudnn_LIBRARY) + file(READ ${CUDNN_INCLUDE}/cudnn.h CUDNN_VERSION_FILE_CONTENTS) + # fetch cudnn version + string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)" + CUDNN_VERSION_MAJOR "${CUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1" + CUDNN_VERSION_MAJOR "${CUDNN_VERSION_MAJOR}") + string(REGEX MATCH "define CUDNN_MINOR * +([0-9]+)" + CUDNN_VERSION_MINOR "${CUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define CUDNN_MINOR * +([0-9]+)" "\\1" + CUDNN_VERSION_MINOR "${CUDNN_VERSION_MINOR}") + string(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)" + CUDNN_VERSION_PATCH "${CUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1" + CUDNN_VERSION_PATCH "${CUDNN_VERSION_PATCH}") + if(NOT CUDNN_VERSION_MAJOR) + set(CUDNN_VERSION "???") + else() + set(CUDNN_VERSION "${CUDNN_VERSION_MAJOR}.${CUDNN_VERSION_MINOR}.${CUDNN_VERSION_PATCH}") + endif() + message(STATUS "cudnn library: ${cudnn_LIBRARY} (found version: \"${CUDNN_VERSION}\")") + endif (NOT cudnn_LIBRARY) - find_library(culibos_STATIC_LIBRARY NAMES libculibos.a PATHS ${tensorflow_PATH_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR}) - if (NOT 
culibos_STATIC_LIBRARY) + if (tensorflow_BUILD_SHARED_LIB) + # shared first (if exists) else static one + find_library(culibos_LIBRARY NAMES libculibos.so libculibos.a PATHS ${tensorflow_PATH_CUDA_LIB} ${CUDA_TOOLKIT_ROOT_DIR}) + else (tensorflow_BUILD_SHARED_LIB) + # only static version + find_library(culibos_LIBRARY NAMES libculibos.a PATHS ${tensorflow_PATH_CUDA_LIB} ${CUDA_TOOLKIT_ROOT_DIR}) + endif (tensorflow_BUILD_SHARED_LIB) + if (NOT culibos_LIBRARY) message(FATAL_ERROR "CULIBOS is required for GPU-build") - else (NOT culibos_STATIC_LIBRARY) - message("culibos-static: ${culibos_STATIC_LIBRARY}") - endif (NOT culibos_STATIC_LIBRARY) + else (NOT culibos_LIBRARY) + message("culibos: ${culibos_LIBRARY}") + endif (NOT culibos_LIBRARY) set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} - ${CUDA_curand_LIBRARY} ${CUDA_cupti_LIBRARY} ${CUDA_cusolver_LIBRARY} ${cudnn_STATIC_LIBRARY} ${culibos_STATIC_LIBRARY} ${nccl_STATIC_LIBRARY}) + ${CUDA_curand_LIBRARY} ${CUDA_cupti_LIBRARY} ${CUDA_cusolver_LIBRARY} ${cudnn_LIBRARY} ${culibos_LIBRARY} ${nccl_LIBRARY}) endif (WIN32) include_directories(${CUDNN_INCLUDE}) # Remove "." from CUDA version variable. - string(REPLACE "." "" short_CUDA_VER ${tensorflow_CUDA_VERSION}) + string(REPLACE "." 
"" short_CUDA_VER ${CUDA_VERSION}) + + # List of enumerated CUDA caps + string(REPLACE " " ";" NVCC_ARCH_LIST "${NVCC_ARCH_FLAGS_readable}") + set(list ${NVCC_ARCH_LIST}) + + # Construct capability string + foreach(NVCC_ARCH ${NVCC_ARCH_LIST}) + if (NVCC_ARCH MATCHES "sm_") + string(REGEX REPLACE "^.sm*" "" NVCC_ARCH ${NVCC_ARCH}) + math(EXPR NVCC_ARCH_MAJOR "${NVCC_ARCH} / 10") + math(EXPR NVCC_ARCH_MINOR "(${NVCC_ARCH} - (${NVCC_ARCH_MAJOR}*10))") + if (TF_CUDA_CAP) + set(TF_CUDA_CAP "${TF_CUDA_CAP},CudaVersion(\"${NVCC_ARCH_MAJOR}.${NVCC_ARCH_MINOR}\")") + else (TF_CUDA_CAP) + set(TF_CUDA_CAP "CudaVersion(\"${NVCC_ARCH_MAJOR}.${NVCC_ARCH_MINOR}\")") + endif (TF_CUDA_CAP) + endif() + endforeach() # create cuda_config.h FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h "#ifndef CUDA_CUDA_CONFIG_H_\n" "#define CUDA_CUDA_CONFIG_H_\n" - "#define TF_CUDA_CAPABILITIES CudaVersion(\"3.7\"),CudaVersion(\"5.2\"),CudaVersion(\"6.0\"),CudaVersion(\"6.1\"),CudaVersion(\"7.0\")\n" + "#define TF_CUDA_CAPABILITIES ${TF_CUDA_CAP}\n" "#define TF_CUDA_VERSION \"64_${short_CUDA_VER}\"\n" - "#define TF_CUDNN_VERSION \"64_${tensorflow_CUDNN_VERSION}\"\n" + "#define TF_CUDNN_VERSION \"64_${CUDNN_VERSION}\"\n" "#define TF_CUDA_TOOLKIT_PATH \"${CUDA_TOOLKIT_ROOT_DIR}\"\n" "#endif // CUDA_CUDA_CONFIG_H_\n" ) @@ -494,14 +536,14 @@ if (tensorflow_ENABLE_GPU) set(tensorflow_BUILD_INFO_FLAGS --build_config cuda --key_value msvcp_dll_name=msvcp140.dll cudart_dll_name=cudart64_${short_CUDA_VER}.dll - cuda_version_number=${tensorflow_CUDA_VERSION} + cuda_version_number=${CUDA_VERSION} nvcuda_dll_name=nvcuda.dll cudnn_dll_name=cudnn64_${tensorflow_CUDNN_VERSION}.dll cudnn_version_number=${tensorflow_CUDNN_VERSION}) else(WIN32) set(tensorflow_BUILD_INFO_FLAGS --build_config cuda --key_value - cuda_version_number=${tensorflow_CUDA_VERSION} - cudnn_version_number=${tensorflow_CUDNN_VERSION}) + cuda_version_number=${CUDA_VERSION} + 
cudnn_version_number=${tensorflow_CUDNN_VERSION}) endif(WIN32) else(tensorflow_ENABLE_GPU) set(tensorflow_BUILD_INFO_FLAGS --build_config cpu --key_value diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake index 7b892ba248b..111a04ca22e 100644 --- a/tensorflow/contrib/cmake/tf_core_kernels.cmake +++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake @@ -68,14 +68,6 @@ if(tensorflow_BUILD_CONTRIB_KERNELS) "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops_util.cc" "${tensorflow_source_dir}/tensorflow/contrib/coder/ops/coder_ops.cc" - "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/assert_next_dataset_op.cc" - "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/csv_dataset_op.cc" - "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc" - "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc" - "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/prefetching_kernels.cc" - "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc" - "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/unique_dataset_op.cc" - "${tensorflow_source_dir}/tensorflow/contrib/data/ops/dataset_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/clustering_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/wals_solver_ops.cc" diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake index bc753333dba..c33c595e4f5 100644 --- a/tensorflow/contrib/cmake/tf_core_ops.cmake +++ b/tensorflow/contrib/cmake/tf_core_ops.cmake @@ -89,7 +89,6 @@ GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_prediction "${tensorflow_source_dir}/t 
GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_quantiles "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc") GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_stats_accumulator "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc") GENERATE_CONTRIB_OP_LIBRARY(coder "${tensorflow_source_dir}/tensorflow/contrib/coder/ops/coder_ops.cc") -GENERATE_CONTRIB_OP_LIBRARY(data_dataset "${tensorflow_source_dir}/tensorflow/contrib/data/ops/dataset_ops.cc") GENERATE_CONTRIB_OP_LIBRARY(factorization_clustering "${tensorflow_source_dir}/tensorflow/contrib/factorization/ops/clustering_ops.cc") GENERATE_CONTRIB_OP_LIBRARY(factorization_factorization "${tensorflow_source_dir}/tensorflow/contrib/factorization/ops/factorization_ops.cc") GENERATE_CONTRIB_OP_LIBRARY(framework_variable "${tensorflow_source_dir}/tensorflow/contrib/framework/ops/variable_ops.cc") diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake index 6d86daf5f17..80219b08743 100755 --- a/tensorflow/contrib/cmake/tf_python.cmake +++ b/tensorflow/contrib/cmake/tf_python.cmake @@ -373,8 +373,6 @@ GENERATE_PYTHON_OP_LIB("contrib_boosted_trees_stats_accumulator_ops" DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/boosted_trees/python/ops/gen_stats_accumulator_ops.py) GENERATE_PYTHON_OP_LIB("contrib_coder_ops" DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/coder/python/ops/gen_coder_ops.py) -GENERATE_PYTHON_OP_LIB("contrib_data_dataset_ops" - DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/data/python/ops/gen_dataset_ops.py) GENERATE_PYTHON_OP_LIB("contrib_factorization_clustering_ops" DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/factorization/python/ops/gen_clustering_ops.py) GENERATE_PYTHON_OP_LIB("contrib_factorization_factorization_ops" From ec903b1293d2002605cf82724165464bac29fa9c Mon Sep 17 00:00:00 2001 From: Grzegorz Pawelczak Date: Mon, 
22 Oct 2018 09:52:54 +0100 Subject: [PATCH 031/540] Address review comments --- .../service/while_loop_constant_sinking.cc | 36 +++++++++------ .../while_loop_constant_sinking_test.cc | 46 +++++++++++++++++++ tensorflow/compiler/xla/service/while_util.cc | 14 +++--- tensorflow/compiler/xla/service/while_util.h | 6 ++- 4 files changed, 79 insertions(+), 23 deletions(-) diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc index 49c05e9cf75..8e1c736eb23 100644 --- a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc +++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc @@ -58,9 +58,10 @@ StatusOr WhileLoopConstantSinking::TrySinkingConstantsIntoWhileLoop( bool changed = false; - auto invariant_conditional_gte_index_to_inst = - WhileUtil::GetGTEsMapForWhileConditional(*while_cond); - auto invariant_body_gtes = + absl::flat_hash_map> + invariant_conditional_gte_index_to_inst = + WhileUtil::GetGTEsMapForWhileConditional(*while_cond); + std::vector invariant_body_gtes = WhileUtil::GetInvariantGTEsForWhileBody(*while_body); for (HloInstruction* invariant_body_gte : invariant_body_gtes) { @@ -68,12 +69,14 @@ StatusOr WhileLoopConstantSinking::TrySinkingConstantsIntoWhileLoop( const HloInstruction& invariant_value = *init_value.operand(index); // Original value should be a constant - if (invariant_value.opcode() != HloOpcode::kConstant) continue; + if (invariant_value.opcode() != HloOpcode::kConstant) { + continue; + } // Sink into the while_body // Should have at least one user that's not while_body_root. 
if (invariant_body_gte->user_count() > 1) { - auto* constant_instr = + HloInstruction* constant_instr = while_body->AddInstruction(invariant_value.Clone(/*suffix=*/".sunk")); TF_RETURN_IF_ERROR(ReplaceUsesWhileKeepingLoopInvariance( invariant_body_gte, constant_instr, while_body->root_instruction(), @@ -81,20 +84,23 @@ StatusOr WhileLoopConstantSinking::TrySinkingConstantsIntoWhileLoop( changed = true; } - // Check if there is a corresponding GTE in while_conditional - auto it = invariant_conditional_gte_index_to_inst.find(index); + // Check if there is a corresponding GTE in while_conditional. + absl::flat_hash_map>::iterator it = + invariant_conditional_gte_index_to_inst.find(index); if (it == invariant_conditional_gte_index_to_inst.end()) { continue; } - auto* invariant_cond_gte = it->second; - // Should have at least one user - if (invariant_cond_gte->user_count() > 0) { - auto* constant_instr = - while_cond->AddInstruction(invariant_value.Clone(/*suffix=*/".sunk")); - TF_RETURN_IF_ERROR( - invariant_cond_gte->ReplaceAllUsesWith(constant_instr)); - changed = true; + for (HloInstruction* invariant_cond_gte : it->second) { + // Should have at least one user. 
+ if (invariant_cond_gte->user_count() > 0) { + HloInstruction* constant_instr = while_cond->AddInstruction( + invariant_value.Clone(/*suffix=*/".sunk")); + TF_RETURN_IF_ERROR( + invariant_cond_gte->ReplaceAllUsesWith(constant_instr)); + changed = true; + } } } diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc index 9a25be10222..87de51a257d 100644 --- a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc +++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc @@ -369,5 +369,51 @@ ENTRY entry { } } } + +TEST_F(WhileLoopConstantSinkingTest, ConditionalMultipleSameIndexGTEs) { + const char* const hlo_string = R"( +HloModule ModuleWithWhile + +body { + p_body = (f32[],f32[],f32[]) parameter(0) + p_body.0 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=0 + const = f32[] constant(1) + add.0 = f32[] add(p_body.0, const) + p_body.1 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=1 + add.1 = f32[] add(p_body.1, const) + p_body.2 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=2 + ROOT root = (f32[],f32[],f32[]) tuple(add.0, add.1, p_body.2) +} + +condition { + p_cond = (f32[],f32[],f32[]) parameter(0) + p_cond.0 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=0 + p_cond.2 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=2 + lt.0 = pred[] less-than(p_cond.0, p_cond.2) + p_cond.1 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=1 + p_cond.2.c = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=2 + lt.1 = pred[] less-than(p_cond.1, p_cond.2.c) + ROOT result = pred[] and(lt.0, lt.1) +} + +ENTRY entry { + const_0 = f32[] constant(0) + const_1 = f32[] constant(0) + const_2 = f32[] constant(12) + while_init = (f32[],f32[],f32[]) tuple(const_0, const_1, const_2) + ROOT while = (f32[],f32[],f32[]) while(while_init), condition=condition, body=body +} +)"; + + 
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + WhileLoopConstantSinking{}.Run(module.get())); + ASSERT_TRUE(changed); + + auto* while_condition = module->GetComputationWithName("condition"); + EXPECT_THAT(while_condition->root_instruction(), + op::And(op::Lt(_, op::Constant()), op::Lt(_, op::Constant()))); +} } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/while_util.cc b/tensorflow/compiler/xla/service/while_util.cc index 153cd449d34..7287620346d 100644 --- a/tensorflow/compiler/xla/service/while_util.cc +++ b/tensorflow/compiler/xla/service/while_util.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/while_util.h" #include "absl/algorithm/container.h" +#include "absl/container/flat_hash_map.h" +#include "absl/container/inlined_vector.h" #include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -268,14 +270,14 @@ static Shape MakeLoopStateShape(const WhileUtil::LoopStateTy& init_values) { return result; } -/*static*/ std::map +/*static*/ absl::flat_hash_map> WhileUtil::GetGTEsMapForWhileConditional( const HloComputation& while_conditional) { - std::map result; - for (auto* inst : while_conditional.instructions()) { - if (inst->opcode() == HloOpcode::kGetTupleElement && - inst->operand(0) == while_conditional.parameter_instruction(0)) { - result[inst->tuple_index()] = inst; + absl::flat_hash_map> result; + for (HloInstruction* user : + while_conditional.parameter_instruction(0)->users()) { + if (user->opcode() == HloOpcode::kGetTupleElement) { + result[user->tuple_index()].push_back(user); } } return result; diff --git a/tensorflow/compiler/xla/service/while_util.h b/tensorflow/compiler/xla/service/while_util.h index 57ae0178b4d..80a767fd23d 100644 --- a/tensorflow/compiler/xla/service/while_util.h +++ 
b/tensorflow/compiler/xla/service/while_util.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_UTIL_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_UTIL_H_ +#include "absl/container/flat_hash_map.h" +#include "absl/container/inlined_vector.h" #include "tensorflow/compiler/xla/service/call_inliner.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -89,8 +91,8 @@ class WhileUtil { // `while_conditional` that access elements in the parameter tuple. Assumes // `while_conditional` is the conditional computation of the while loop in // question. - static std::map GetGTEsMapForWhileConditional( - const HloComputation& while_conditional); + static absl::flat_hash_map> + GetGTEsMapForWhileConditional(const HloComputation& while_conditional); }; } // namespace xla From 5ca0a403f6f573dbbfd20a1807bf70f64c7c9ab6 Mon Sep 17 00:00:00 2001 From: Grzegorz Pawelczak Date: Mon, 22 Oct 2018 09:56:05 +0100 Subject: [PATCH 032/540] Address review comments --- .../compiler/xla/service/while_loop_constant_sinking.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc index 8e1c736eb23..d1b86d792d4 100644 --- a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc +++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc @@ -68,12 +68,12 @@ StatusOr WhileLoopConstantSinking::TrySinkingConstantsIntoWhileLoop( int64 index = invariant_body_gte->tuple_index(); const HloInstruction& invariant_value = *init_value.operand(index); - // Original value should be a constant + // Original value should be a constant. if (invariant_value.opcode() != HloOpcode::kConstant) { continue; } - // Sink into the while_body + // Sink into the while_body. // Should have at least one user that's not while_body_root. 
if (invariant_body_gte->user_count() > 1) { HloInstruction* constant_instr = From b925a943e7ebc2a32bb8dbff7cd2a229147ccdae Mon Sep 17 00:00:00 2001 From: Grzegorz Pawelczak Date: Mon, 22 Oct 2018 09:58:05 +0100 Subject: [PATCH 033/540] Address review comments --- .../compiler/xla/service/while_loop_constant_sinking.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc index d1b86d792d4..d5f748ec2ba 100644 --- a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc +++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc @@ -85,9 +85,7 @@ StatusOr WhileLoopConstantSinking::TrySinkingConstantsIntoWhileLoop( } // Check if there is a corresponding GTE in while_conditional. - absl::flat_hash_map>::iterator it = - invariant_conditional_gte_index_to_inst.find(index); + auto it = invariant_conditional_gte_index_to_inst.find(index); if (it == invariant_conditional_gte_index_to_inst.end()) { continue; } From 8c1807e3ab89d4e81739f35c21bccefe720fb95c Mon Sep 17 00:00:00 2001 From: Grzegorz Pawelczak Date: Mon, 22 Oct 2018 20:38:53 +0100 Subject: [PATCH 034/540] Correct variable name --- .../compiler/xla/service/while_loop_constant_sinking.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc index d5f748ec2ba..8b381dec073 100644 --- a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc +++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc @@ -59,7 +59,7 @@ StatusOr WhileLoopConstantSinking::TrySinkingConstantsIntoWhileLoop( bool changed = false; absl::flat_hash_map> - invariant_conditional_gte_index_to_inst = + conditional_gte_index_to_insts = WhileUtil::GetGTEsMapForWhileConditional(*while_cond); std::vector invariant_body_gtes = 
WhileUtil::GetInvariantGTEsForWhileBody(*while_body); @@ -85,8 +85,8 @@ StatusOr WhileLoopConstantSinking::TrySinkingConstantsIntoWhileLoop( } // Check if there is a corresponding GTE in while_conditional. - auto it = invariant_conditional_gte_index_to_inst.find(index); - if (it == invariant_conditional_gte_index_to_inst.end()) { + auto it = conditional_gte_index_to_insts.find(index); + if (it == conditional_gte_index_to_insts.end()) { continue; } From af50a832c4f1d99b4ab80d942305f6d7916793df Mon Sep 17 00:00:00 2001 From: Julian Niedermeier Date: Tue, 23 Oct 2018 19:25:43 +0200 Subject: [PATCH 035/540] Change --- tensorflow/python/training/checkpointable/util.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tensorflow/python/training/checkpointable/util.py b/tensorflow/python/training/checkpointable/util.py index edab6cc6ebb..1ab21489d46 100644 --- a/tensorflow/python/training/checkpointable/util.py +++ b/tensorflow/python/training/checkpointable/util.py @@ -969,6 +969,12 @@ class CheckpointLoadStatus(_LoadStatus): raise AssertionError( "Object not assigned a value from checkpoint: %s" % (node,)) for checkpointable_object in list_objects(self._root_checkpointable): + # Remove data structures that do not contain any variables from + # restoration checks. 
+ if (isinstance(checkpointable_object, + data_structures.CheckpointableDataStructure) and + len(checkpointable_object.variables) == 0): + continue self._checkpoint.all_python_objects.add(checkpointable_object) unused_python_objects = ( _ObjectIdentitySet(self._checkpoint.all_python_objects) From 2f3b69e4976d3b14eaa6ae070eb68f37d1556d98 Mon Sep 17 00:00:00 2001 From: Julian Niedermeier Date: Tue, 23 Oct 2018 22:30:07 +0200 Subject: [PATCH 036/540] Changed empty check --- tensorflow/python/training/checkpointable/util.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tensorflow/python/training/checkpointable/util.py b/tensorflow/python/training/checkpointable/util.py index 1ab21489d46..b8251943612 100644 --- a/tensorflow/python/training/checkpointable/util.py +++ b/tensorflow/python/training/checkpointable/util.py @@ -971,9 +971,7 @@ class CheckpointLoadStatus(_LoadStatus): for checkpointable_object in list_objects(self._root_checkpointable): # Remove data structures that do not contain any variables from # restoration checks. 
- if (isinstance(checkpointable_object, - data_structures.CheckpointableDataStructure) and - len(checkpointable_object.variables) == 0): + if not checkpointable_object._checkpoint_dependencies: continue self._checkpoint.all_python_objects.add(checkpointable_object) unused_python_objects = ( From ec1d53dc794e010812f3e078eb04b445002c93ad Mon Sep 17 00:00:00 2001 From: Julian Niedermeier Date: Tue, 23 Oct 2018 22:30:18 +0200 Subject: [PATCH 037/540] Added unit test --- .../python/training/checkpointable/util_test.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tensorflow/python/training/checkpointable/util_test.py b/tensorflow/python/training/checkpointable/util_test.py index 66d51713344..b611de1dc6e 100644 --- a/tensorflow/python/training/checkpointable/util_test.py +++ b/tensorflow/python/training/checkpointable/util_test.py @@ -1324,6 +1324,21 @@ class CheckpointingTests(test.TestCase): train_fn() self.assertEqual(42., self.evaluate(optimizer.variables()[0])) + @test_util.run_in_graph_and_eager_modes + def test_restore_after_adding_empty_checkpointable_data_structure(self): + with ops.Graph().as_default(), self.session(graph=ops.get_default_graph()): + model = NonLayerCheckpointable() + checkpoint = checkpointable_utils.Checkpoint(model=model) + checkpoint.restore(None).initialize_or_restore() + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + save_path = checkpoint.save(checkpoint_prefix) + with ops.Graph().as_default(), self.session(graph=ops.get_default_graph()): + model = NonLayerCheckpointable() + model.dict = {"a": 1} + model.list = {"b": 1} + checkpoint.restore(save_path).assert_consumed().run_restore_ops() + class _ManualScope(tracking.Checkpointable): From d2a08903b2d13235c90fa307c8c240c2c77598e0 Mon Sep 17 00:00:00 2001 From: Julian Niedermeier Date: Tue, 23 Oct 2018 23:39:47 +0200 Subject: [PATCH 038/540] Revoked removal of instance check --- 
tensorflow/python/training/checkpointable/util.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/training/checkpointable/util.py b/tensorflow/python/training/checkpointable/util.py index b8251943612..00f0ff06f0c 100644 --- a/tensorflow/python/training/checkpointable/util.py +++ b/tensorflow/python/training/checkpointable/util.py @@ -971,7 +971,9 @@ class CheckpointLoadStatus(_LoadStatus): for checkpointable_object in list_objects(self._root_checkpointable): # Remove data structures that do not contain any variables from # restoration checks. - if not checkpointable_object._checkpoint_dependencies: + if (isinstance(checkpointable_object, + data_structures.CheckpointableDataStructure) and + not checkpointable_object._checkpoint_dependencies): continue self._checkpoint.all_python_objects.add(checkpointable_object) unused_python_objects = ( From 8def1215dda083de243885da9b52328737cb35cb Mon Sep 17 00:00:00 2001 From: Julian Niedermeier Date: Tue, 23 Oct 2018 23:40:18 +0200 Subject: [PATCH 039/540] Fixed wrong assertion to be assert_existing --- tensorflow/python/training/checkpointable/util_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/training/checkpointable/util_test.py b/tensorflow/python/training/checkpointable/util_test.py index b611de1dc6e..593361695ee 100644 --- a/tensorflow/python/training/checkpointable/util_test.py +++ b/tensorflow/python/training/checkpointable/util_test.py @@ -1337,7 +1337,8 @@ class CheckpointingTests(test.TestCase): model = NonLayerCheckpointable() model.dict = {"a": 1} model.list = {"b": 1} - checkpoint.restore(save_path).assert_consumed().run_restore_ops() + load_status = checkpoint.restore(save_path) + load_status.assert_existing_objects_matched().run_restore_ops() class _ManualScope(tracking.Checkpointable): From 55cdfae1d009d400da049d71652e6cae5c28a277 Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Tue, 23 Oct 2018 14:46:40 -0700 Subject: [PATCH 
040/540] Fix indentation --- tensorflow/python/training/checkpointable/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/training/checkpointable/util.py b/tensorflow/python/training/checkpointable/util.py index 00f0ff06f0c..6ecef716c76 100644 --- a/tensorflow/python/training/checkpointable/util.py +++ b/tensorflow/python/training/checkpointable/util.py @@ -973,7 +973,7 @@ class CheckpointLoadStatus(_LoadStatus): # restoration checks. if (isinstance(checkpointable_object, data_structures.CheckpointableDataStructure) and - not checkpointable_object._checkpoint_dependencies): + not checkpointable_object._checkpoint_dependencies): continue self._checkpoint.all_python_objects.add(checkpointable_object) unused_python_objects = ( From af14e4cf353a985de636a644bdb4a858e96707d4 Mon Sep 17 00:00:00 2001 From: Julian Niedermeier Date: Wed, 24 Oct 2018 00:10:26 +0200 Subject: [PATCH 041/540] Removed graph and session setup from unit test --- .../training/checkpointable/util_test.py | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/tensorflow/python/training/checkpointable/util_test.py b/tensorflow/python/training/checkpointable/util_test.py index 593361695ee..636694ebd11 100644 --- a/tensorflow/python/training/checkpointable/util_test.py +++ b/tensorflow/python/training/checkpointable/util_test.py @@ -1326,19 +1326,18 @@ class CheckpointingTests(test.TestCase): @test_util.run_in_graph_and_eager_modes def test_restore_after_adding_empty_checkpointable_data_structure(self): - with ops.Graph().as_default(), self.session(graph=ops.get_default_graph()): - model = NonLayerCheckpointable() - checkpoint = checkpointable_utils.Checkpoint(model=model) - checkpoint.restore(None).initialize_or_restore() - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - save_path = checkpoint.save(checkpoint_prefix) - with ops.Graph().as_default(), 
self.session(graph=ops.get_default_graph()): - model = NonLayerCheckpointable() - model.dict = {"a": 1} - model.list = {"b": 1} - load_status = checkpoint.restore(save_path) - load_status.assert_existing_objects_matched().run_restore_ops() + model = NonLayerCheckpointable() + checkpoint = checkpointable_utils.Checkpoint(model=model) + checkpoint.restore(None).initialize_or_restore() + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + save_path = checkpoint.save(checkpoint_prefix) + ops.reset_default_graph() + model = NonLayerCheckpointable() + model.dict = {"a": 1} + model.list = {"b": 1} + load_status = checkpoint.restore(save_path) + load_status.assert_existing_objects_matched().run_restore_ops() class _ManualScope(tracking.Checkpointable): From 80dd0715501fe9e2b518171e7b248a704a39414b Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Tue, 23 Oct 2018 15:24:45 -0700 Subject: [PATCH 042/540] Re-create the Checkpoint object in the unit test --- .../training/checkpointable/util_test.py | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/tensorflow/python/training/checkpointable/util_test.py b/tensorflow/python/training/checkpointable/util_test.py index 593361695ee..dd4583ce9bd 100644 --- a/tensorflow/python/training/checkpointable/util_test.py +++ b/tensorflow/python/training/checkpointable/util_test.py @@ -1326,19 +1326,21 @@ class CheckpointingTests(test.TestCase): @test_util.run_in_graph_and_eager_modes def test_restore_after_adding_empty_checkpointable_data_structure(self): - with ops.Graph().as_default(), self.session(graph=ops.get_default_graph()): - model = NonLayerCheckpointable() - checkpoint = checkpointable_utils.Checkpoint(model=model) - checkpoint.restore(None).initialize_or_restore() - checkpoint_directory = self.get_temp_dir() - checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") - save_path = checkpoint.save(checkpoint_prefix) - with 
ops.Graph().as_default(), self.session(graph=ops.get_default_graph()): - model = NonLayerCheckpointable() - model.dict = {"a": 1} - model.list = {"b": 1} - load_status = checkpoint.restore(save_path) - load_status.assert_existing_objects_matched().run_restore_ops() + model = NonLayerCheckpointable() + checkpoint = checkpointable_utils.Checkpoint(model=model) + checkpoint.restore(None).initialize_or_restore() + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + save_path = checkpoint.save(checkpoint_prefix) + + del model, checkpoint + + model = NonLayerCheckpointable() + model.dict = {"a": 1} + model.list = {"b": 1} + checkpoint = checkpointable_utils.Checkpoint(model=model) + load_status = checkpoint.restore(save_path) + load_status.assert_existing_objects_matched().run_restore_ops() class _ManualScope(tracking.Checkpointable): From 20cf4909629be616edfd2ade1b3c0a4095c24953 Mon Sep 17 00:00:00 2001 From: wangsiyu Date: Thu, 25 Oct 2018 17:41:23 +0800 Subject: [PATCH 043/540] [Features] Enable Variable Partitioning in ParameterServerStrategy graph mode --- .../python/parameter_server_strategy.py | 3 + .../python/parameter_server_strategy_test.py | 109 ++++++++++++++++++ tensorflow/python/ops/variable_scope.py | 30 +++-- tensorflow/python/training/distribute.py | 16 ++- 4 files changed, 144 insertions(+), 14 deletions(-) diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy.py b/tensorflow/contrib/distribute/python/parameter_server_strategy.py index bbfd94ed5c0..52c8747a47d 100644 --- a/tensorflow/contrib/distribute/python/parameter_server_strategy.py +++ b/tensorflow/contrib/distribute/python/parameter_server_strategy.py @@ -231,6 +231,9 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy): destinations = self._compute_devices return self._cross_tower_ops.broadcast(tensor, destinations) + def _allow_variable_partition(self): + return True if not 
context.executing_eagerly() else False + # TODO(yuefengz): not all ops in device_setter.STANDARD_PS_OPS will go through # this creator, such as "MutableHashTable". def _create_variable(self, next_creator, *args, **kwargs): diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py index ab329da929a..eca0e75dd2c 100644 --- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py +++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py @@ -37,6 +37,7 @@ from tensorflow.python.layers import core from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import gradients +from tensorflow.python.ops import partitioned_variables from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.platform import test @@ -178,6 +179,108 @@ class ParameterServerStrategyTestBase( self.assertEqual(z_val, 43.0) self.assertEqual(f_val, 46.0) + def _test_device_assignment_distributed_enable_partitioner(self, + task_type, + task_id, + num_gpus): + worker_device = '/job:%s/replica:0/task:%d' % (task_type, task_id) + d, _, sess_config = self._get_test_objects(task_type, task_id, num_gpus) + num_shards = len(d.parameter_devices) + partitioner = partitioned_variables.fixed_size_partitioner(num_shards) + with ops.Graph().as_default(), \ + self.cached_session(target=self._default_target, + config=sess_config) as sess, \ + d.scope(): + + # Define a variable outside the call_for_each_tower scope. This is not + # recommended. 
+ n = variable_scope.get_variable( + 'n', + initializer=constant_op.constant([10.0, 20.0]), + aggregation=variable_scope.VariableAggregation.SUM, + partitioner=partitioner) + + for part_id, var in enumerate(n): + self.assertEqual(var.device, '/job:ps/task:%d' % part_id) + + def model_fn(): + if num_gpus == 0: + last_part_device = 'device:CPU:0' + else: + last_part_device = ( + 'device:GPU:%d' % + distribution_strategy_context.get_tower_context().tower_id) + + a = constant_op.constant([1.0, 2.0]) + b = constant_op.constant([2.0, 3.0]) + c = a + b + self.assertEqual(a.device, worker_device + '/' + last_part_device) + self.assertEqual(b.device, worker_device + '/' + last_part_device) + self.assertEqual(c.device, worker_device + '/' + last_part_device) + + # The device scope is ignored for variables but not for normal ops. + with ops.device('/job:worker/task:0'): + x = variable_scope.get_variable( + 'x', + initializer=constant_op.constant([10.0, 20.0]), + aggregation=variable_scope.VariableAggregation.SUM, + partitioner=partitioner) + x_add = x.assign_add(c, name="x_Add") + e = a + c + # The variable x is on the task 1 since the device_function has been + # called once before the model_fn. + for part_id, var in enumerate(x): + self.assertEqual(var.device, '/job:ps/task:%d' % part_id) + self.assertEqual(var.device, x_add[part_id].device) + + self.assertEqual(e.device, + '/job:worker/replica:0/task:0/%s' % last_part_device) + + # The colocate_vars_with can override the distribution's device. 
+ with d.colocate_vars_with(x_add[0]): + y = variable_scope.get_variable( + 'y', + initializer=constant_op.constant([20.0, 10.0]), + aggregation=variable_scope.VariableAggregation.SUM, + partitioner=partitioner) + y_add = y.assign_add([array_ops.identity(x_add[0]), + array_ops.identity(x_add[1])]) + + for part_id, var in enumerate(y): + self.assertEqual(var.device, '/job:ps/task:0') + self.assertEqual(y_add[part_id].device, var.device) + self.assertEqual(var.device, x_add[0].device) + + z = variable_scope.get_variable( + 'z', + initializer=constant_op.constant([10.0, 30.0]), + aggregation=variable_scope.VariableAggregation.SUM, + partitioner=partitioner) + + for task_id, var in enumerate(z): + self.assertEqual(var.device, '/job:ps/task:%d' % task_id) + + with ops.control_dependencies(y_add): + y_list = [var for var in y] + y_tensor = array_ops.concat(y_list, 0) + z_add = z.assign_add(array_ops.identity(y_tensor)) + with ops.control_dependencies(z_add): + z_list = [var for var in z] + z_tensor = array_ops.concat(z_list, 0) + f = z_tensor + c + self.assertEqual(f.device, worker_device + '/' + last_part_device) + + return y_add, z_add, f + + y, z, f = d.call_for_each_tower(model_fn) + + if context.num_gpus() >= 1 and num_gpus <= 1: + variables.global_variables_initializer().run() + y_val, z_val, f_val = sess.run([y, z, f]) + self.assertEqual(y_val, [33.0, 35.0]) + self.assertEqual(z_val, [43.0, 65.0]) + self.assertEqual(tuple(f_val), (46.0, 70.0)) + def _test_device_assignment_local(self, d, compute_device='CPU', @@ -473,6 +576,12 @@ class ParameterServerStrategyTest(ParameterServerStrategyTestBase, def testDeviceAssignmentDistributed(self, num_gpus): self._test_device_assignment_distributed('worker', 1, num_gpus) + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) + def testDeviceAssignmentDistributedEnablePartitioner(self, num_gpus): + self._test_device_assignment_distributed_enable_partitioner( + 'worker', 1, num_gpus) + def 
testSimpleBetweenGraph(self): self._run_between_graph_clients(self._test_simple_increment, self._cluster_spec, context.num_gpus()) diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py index 41a8f57642e..8fb1d4be94d 100644 --- a/tensorflow/python/ops/variable_scope.py +++ b/tensorflow/python/ops/variable_scope.py @@ -449,7 +449,9 @@ class _VariableStore(object): partitioner=partitioner, validate_shape=validate_shape, use_resource=use_resource, - constraint=constraint) + constraint=constraint, + synchronization=synchronization, + aggregation=aggregation) # Special case for partitioned variable to allow reuse without having to # specify partitioner. @@ -467,7 +469,9 @@ class _VariableStore(object): partitioner=None, validate_shape=validate_shape, use_resource=use_resource, - constraint=constraint) + constraint=constraint, + synchronization=synchronization, + aggregation=aggregation) # Single variable case if "%s/part_0" % name in self._vars: @@ -553,7 +557,9 @@ class _VariableStore(object): caching_device=None, validate_shape=True, use_resource=None, - constraint=None): + constraint=None, + synchronization=VariableSynchronization.AUTO, + aggregation=VariableAggregation.NONE): """Gets or creates a sharded variable list with these parameters. 
The `partitioner` must be a callable that accepts a fully defined @@ -776,7 +782,9 @@ class _VariableStore(object): caching_device=caching_device, validate_shape=validate_shape, use_resource=use_resource, - constraint=constraint) + constraint=constraint, + synchronization=synchronization, + aggregation=aggregation) # pylint: disable=protected-access var._set_save_slice_info(variables.Variable.SaveSliceInfo( @@ -1254,7 +1262,9 @@ class VariableScope(object): partitioner=None, validate_shape=True, use_resource=None, - constraint=None): + constraint=None, + synchronization=VariableSynchronization.AUTO, + aggregation=VariableAggregation.NONE): """Gets an existing variable with this name or create a new one.""" if context.executing_eagerly(): raise NotImplementedError("Partitioned variables are not yet supported " @@ -1304,7 +1314,8 @@ class VariableScope(object): regularizer=regularizer, reuse=self.reuse, trainable=trainable, collections=collections, caching_device=caching_device, partitioner=partitioner, validate_shape=validate_shape, - use_resource=use_resource, constraint=constraint) + use_resource=use_resource, constraint=constraint, + synchronization=synchronization, aggregation=aggregation) # pylint: enable=protected-access @@ -1661,7 +1672,9 @@ def _get_partitioned_variable(name, partitioner=None, validate_shape=True, use_resource=None, - constraint=None): + constraint=None, + synchronization=VariableSynchronization.AUTO, + aggregation=VariableAggregation.NONE): """Gets or creates a sharded variable list with these parameters. 
The `partitioner` must be a callable that accepts a fully defined @@ -1744,7 +1757,8 @@ def _get_partitioned_variable(name, initializer=initializer, regularizer=regularizer, trainable=trainable, collections=collections, caching_device=caching_device, partitioner=partitioner, validate_shape=validate_shape, - use_resource=use_resource, constraint=constraint) + use_resource=use_resource, constraint=constraint, + synchronization=synchronization, aggregation=aggregation) # pylint: enable=protected-access diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py index e12ebafba1f..cd80b281362 100644 --- a/tensorflow/python/training/distribute.py +++ b/tensorflow/python/training/distribute.py @@ -458,20 +458,24 @@ class DistributionStrategy(object): kwargs["use_resource"] = True return self._create_variable(*args, **kwargs) - def disable_partitioned_variables(getter, *args, **kwargs): - if kwargs.pop("partitioner", None) is not None: - tf_logging.log_first_n( - tf_logging.WARN, "Partitioned variables are disabled when using " - "DistributionStrategy.", 1) + def distributed_getter(getter, *args, **kwargs): + if not self._allow_variable_partition(): + if kwargs.pop("partitioner", None) is not None: + tf_logging.log_first_n( + tf_logging.WARN, "Partitioned variables are disabled when using " + "current DistributionStrategy.", 1) return getter(*args, **kwargs) return _CurrentDistributionContext( self, variable_scope.variable_creator_scope(creator_with_resource_vars), variable_scope.variable_scope( variable_scope.get_variable_scope(), - custom_getter=disable_partitioned_variables), + custom_getter=distributed_getter), self._default_device) + def _allow_variable_partition(self): + return False + def _create_variable(self, next_creator, *args, **kwargs): # Note: should support "colocate_with" argument. 
raise NotImplementedError("must be implemented in descendants") From 2cfab14772725de486fd2000d93b20f38923dc6f Mon Sep 17 00:00:00 2001 From: wangsiyu Date: Mon, 29 Oct 2018 22:13:01 +0800 Subject: [PATCH 044/540] simplifies unit test and running gpus number > 1 --- .../python/parameter_server_strategy_test.py | 61 +++++-------------- 1 file changed, 15 insertions(+), 46 deletions(-) diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py index eca0e75dd2c..f268ab31c10 100644 --- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py +++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py @@ -183,7 +183,6 @@ class ParameterServerStrategyTestBase( task_type, task_id, num_gpus): - worker_device = '/job:%s/replica:0/task:%d' % (task_type, task_id) d, _, sess_config = self._get_test_objects(task_type, task_id, num_gpus) num_shards = len(d.parameter_devices) partitioner = partitioned_variables.fixed_size_partitioner(num_shards) @@ -204,20 +203,7 @@ class ParameterServerStrategyTestBase( self.assertEqual(var.device, '/job:ps/task:%d' % part_id) def model_fn(): - if num_gpus == 0: - last_part_device = 'device:CPU:0' - else: - last_part_device = ( - 'device:GPU:%d' % - distribution_strategy_context.get_tower_context().tower_id) - - a = constant_op.constant([1.0, 2.0]) - b = constant_op.constant([2.0, 3.0]) - c = a + b - self.assertEqual(a.device, worker_device + '/' + last_part_device) - self.assertEqual(b.device, worker_device + '/' + last_part_device) - self.assertEqual(c.device, worker_device + '/' + last_part_device) - + a = constant_op.constant([3.0, 5.0]) # The device scope is ignored for variables but not for normal ops. 
with ops.device('/job:worker/task:0'): x = variable_scope.get_variable( @@ -225,17 +211,13 @@ class ParameterServerStrategyTestBase( initializer=constant_op.constant([10.0, 20.0]), aggregation=variable_scope.VariableAggregation.SUM, partitioner=partitioner) - x_add = x.assign_add(c, name="x_Add") - e = a + c + x_add = x.assign_add(a, name="x_add") # The variable x is on the task 1 since the device_function has been # called once before the model_fn. for part_id, var in enumerate(x): self.assertEqual(var.device, '/job:ps/task:%d' % part_id) self.assertEqual(var.device, x_add[part_id].device) - self.assertEqual(e.device, - '/job:worker/replica:0/task:0/%s' % last_part_device) - # The colocate_vars_with can override the distribution's device. with d.colocate_vars_with(x_add[0]): y = variable_scope.get_variable( @@ -251,35 +233,22 @@ class ParameterServerStrategyTestBase( self.assertEqual(y_add[part_id].device, var.device) self.assertEqual(var.device, x_add[0].device) - z = variable_scope.get_variable( - 'z', - initializer=constant_op.constant([10.0, 30.0]), - aggregation=variable_scope.VariableAggregation.SUM, - partitioner=partitioner) + return x_add, y_add - for task_id, var in enumerate(z): - self.assertEqual(var.device, '/job:ps/task:%d' % task_id) + x, y = d.call_for_each_tower(model_fn) - with ops.control_dependencies(y_add): - y_list = [var for var in y] - y_tensor = array_ops.concat(y_list, 0) - z_add = z.assign_add(array_ops.identity(y_tensor)) - with ops.control_dependencies(z_add): - z_list = [var for var in z] - z_tensor = array_ops.concat(z_list, 0) - f = z_tensor + c - self.assertEqual(f.device, worker_device + '/' + last_part_device) - - return y_add, z_add, f - - y, z, f = d.call_for_each_tower(model_fn) - - if context.num_gpus() >= 1 and num_gpus <= 1: + if context.num_gpus() >= 1: variables.global_variables_initializer().run() - y_val, z_val, f_val = sess.run([y, z, f]) - self.assertEqual(y_val, [33.0, 35.0]) - self.assertEqual(z_val, [43.0, 65.0]) - 
self.assertEqual(tuple(f_val), (46.0, 70.0)) + x_val, y_val = sess.run([x, y]) + if num_gpus < 1: + self.assertEqual(x_val, [13.0, 25.0]) + self.assertEqual(y_val, [33.0, 35.0]) + else: + x_expect = [10.0 + 3 * num_gpus, 20.0 + 5 * num_gpus] + y_expect = [20.0 + x_expect[0] * num_gpus, + 10.0 + x_expect[1] * num_gpus] + self.assertEqual(x_val, x_expect) + self.assertEqual(y_val, y_expect) def _test_device_assignment_local(self, d, From fb51172887e64c84e9791e2743d206ad865fd3dd Mon Sep 17 00:00:00 2001 From: Anton Dmitriev Date: Mon, 29 Oct 2018 17:57:30 +0300 Subject: [PATCH 045/540] Update after review. --- tensorflow/c/c_api.h | 37 ++++---------- .../src/main/java/org/tensorflow/Server.java | 17 ++++--- tensorflow/java/src/main/native/server_jni.cc | 51 +++++++++---------- 3 files changed, 43 insertions(+), 62 deletions(-) diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h index 9fe06f56a69..141ffaeee8f 100644 --- a/tensorflow/c/c_api.h +++ b/tensorflow/c/c_api.h @@ -1673,43 +1673,28 @@ TF_CAPI_EXPORT extern TF_Buffer* TF_GetRegisteredKernelsForOp( // In-process TensorFlow server. typedef struct TF_Server TF_Server; -// Creates a new server. The returned TF_Server object can be started, stopped -// and joined using correspondent commands. After using TF_Server object should -// be deleted using the TF_DeleteServer command to free correspondent resources. +// Creates a new in-process TensorFlow server configured using a serialized +// ServerDef protocol buffer provided via `proto` and `proto_len`. // -// Params: -// proto - Serialized ServerDef protocol buffer. -// proto_len - Length of the proto. -// status - Set to OK on success and an appropriate error on failure. +// The server will not serve any requests until TF_ServerStart is invoked. +// The server will stop serving requests once TF_ServerStop or +// TF_DeleteServer is invoked. TF_CAPI_EXPORT extern TF_Server* TF_NewServer(const void* proto, size_t proto_len, TF_Status* status); -// Starts a server. 
-// -// Params: -// server - TF_Server object to be started. -// status - Set to OK on success and an appropriate error on failure. +// Starts an in-process TensorFlow server. TF_CAPI_EXPORT extern void TF_ServerStart(TF_Server* server, TF_Status* status); -// Stops a server. -// -// Params: -// server - TF_Server object to be stopped. -// status - Set to OK on success and an appropriate error on failure. +// Stops an in-process TensorFlow server. TF_CAPI_EXPORT extern void TF_ServerStop(TF_Server* server, TF_Status* status); -// Blocks until the server has shut down (currently blocks forever). -// -// Params: -// server - TF_Server object to be joined. -// status - Set to OK on success and an appropriate error on failure. +// Blocks until the server has been successfully stopped (via TF_ServerStop or +// TF_ServerClose). TF_CAPI_EXPORT extern void TF_ServerJoin(TF_Server* server, TF_Status* status); -// Destroy a server, frees memory. Server is expected to be stopped before. -// -// Params: -// server - TF_Server object to be deleted. +// Destroy an in-process TensorFlow server, frees memory. If server is running +// it will be stopped and joined. TF_CAPI_EXPORT extern void TF_DeleteServer(TF_Server* server); #ifdef __cplusplus diff --git a/tensorflow/java/src/main/java/org/tensorflow/Server.java b/tensorflow/java/src/main/java/org/tensorflow/Server.java index 5a42077904c..727ef4af536 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/Server.java +++ b/tensorflow/java/src/main/java/org/tensorflow/Server.java @@ -25,6 +25,9 @@ import java.util.concurrent.locks.ReentrantReadWriteLock; * training. A server belongs to a cluster (specified by a * {@code ClusterSpec}), and corresponds to a particular task in a named job. * The server can communicate with any other server in the same cluster. + * The server will not serve any requests until {@link #start()} is invoked. + * The server will stop serving requests once {@link #stop()} or {@link #close()} is invoked. 
+ * Be aware that {@link #close()} method stops the server if it is running. * *

WARNING: A {@code Server} owns resources that must be * explicitly freed by invoking {@link #close()}. @@ -32,8 +35,7 @@ import java.util.concurrent.locks.ReentrantReadWriteLock; *

Instances of a {@code Server} are thread-safe. * *

Using example: - *

- * {@code
+ * 
{@code
  * ClusterDef clusterDef = ClusterDef.newBuilder()
  *   .addJob(JobDef.newBuilder()
  *   .setName("worker")
@@ -52,8 +54,7 @@ import java.util.concurrent.locks.ReentrantReadWriteLock;
  *   srv.start();
  *   srv.join();
  * }
- * }
- * 
+ * }
*/ public final class Server implements AutoCloseable { @@ -68,7 +69,7 @@ public final class Server implements AutoCloseable { nativeHandle = allocate(serverDef); } - /** Starts this server. */ + /** Starts an in-process TensorFlow server. */ public void start() { lock.readLock().lock(); try { @@ -79,7 +80,7 @@ public final class Server implements AutoCloseable { } } - /** Stops this server. */ + /** Stops an in-process TensorFlow server. */ public void stop() { lock.readLock().lock(); try { @@ -90,7 +91,7 @@ public final class Server implements AutoCloseable { } } - /** Blocks until the server has shut down (currently blocks forever). */ + /** Blocks until the server has been successfully stopped. */ public void join() { lock.readLock().lock(); try { @@ -101,7 +102,7 @@ public final class Server implements AutoCloseable { } } - /** Stops server and frees resources. Server is expected to be stopped before. */ + /** Destroy an in-process TensorFlow server, frees memory. */ @Override public void close() { lock.writeLock().lock(); diff --git a/tensorflow/java/src/main/native/server_jni.cc b/tensorflow/java/src/main/native/server_jni.cc index f0d1d29b88a..95a6278fb8c 100644 --- a/tensorflow/java/src/main/native/server_jni.cc +++ b/tensorflow/java/src/main/native/server_jni.cc @@ -18,6 +18,21 @@ limitations under the License. 
#include "tensorflow/java/src/main/native/exception_jni.h" #include "tensorflow/java/src/main/native/utils_jni.h" +namespace { +TF_Server* requireHandle(JNIEnv* env, jlong handle) { + static_assert(sizeof(jlong) >= sizeof(TF_Server*), + "Cannot package C object pointers as a Java long"); + if (handle == 0) { + throwException(env, kIllegalStateException, + "close() has been called on the Server"); + return nullptr; + } + + return reinterpret_cast(handle); +} + +} // namespace + JNIEXPORT jlong JNICALL Java_org_tensorflow_Server_allocate( JNIEnv* env, jclass clazz, jbyteArray server_def) { TF_Status* status = TF_NewStatus(); @@ -39,14 +54,9 @@ JNIEXPORT jlong JNICALL Java_org_tensorflow_Server_allocate( JNIEXPORT void JNICALL Java_org_tensorflow_Server_start(JNIEnv* env, jclass clazz, jlong handle) { - if (handle == 0) { - throwException(env, kIllegalStateException, - "close() has been called on the Server"); - return; - } - TF_Status* status = TF_NewStatus(); - TF_Server* server = reinterpret_cast(handle); + TF_Server* server = requireHandle(env, handle); + if (server == nullptr) return; TF_ServerStart(server, status); throwExceptionIfNotOK(env, status); @@ -57,14 +67,9 @@ JNIEXPORT void JNICALL Java_org_tensorflow_Server_start(JNIEnv* env, JNIEXPORT void JNICALL Java_org_tensorflow_Server_stop(JNIEnv* env, jclass clazz, jlong handle) { - if (handle == 0) { - throwException(env, kIllegalStateException, - "close() has been called on the Server"); - return; - } - TF_Status* status = TF_NewStatus(); - TF_Server* server = reinterpret_cast(handle); + TF_Server* server = requireHandle(env, handle); + if (server == nullptr) return; TF_ServerStop(server, status); throwExceptionIfNotOK(env, status); @@ -75,14 +80,9 @@ JNIEXPORT void JNICALL Java_org_tensorflow_Server_stop(JNIEnv* env, JNIEXPORT void JNICALL Java_org_tensorflow_Server_join(JNIEnv* env, jclass clazz, jlong handle) { - if (handle == 0) { - throwException(env, kIllegalStateException, - "close() has been called 
on the Server"); - return; - } - TF_Status* status = TF_NewStatus(); - TF_Server* server = reinterpret_cast(handle); + TF_Server* server = requireHandle(env, handle); + if (server == nullptr) return; TF_ServerJoin(server, status); throwExceptionIfNotOK(env, status); @@ -93,13 +93,8 @@ JNIEXPORT void JNICALL Java_org_tensorflow_Server_join(JNIEnv* env, JNIEXPORT void JNICALL Java_org_tensorflow_Server_delete(JNIEnv* env, jclass clazz, jlong handle) { - if (handle == 0) { - throwException(env, kIllegalStateException, - "close() has been called on the Server"); - return; - } - - TF_Server* server = reinterpret_cast(handle); + TF_Server* server = requireHandle(env, handle); + if (server == nullptr) return; TF_DeleteServer(server); } From 1737b809b7a7b258c81a77dd4e4670315c70b53d Mon Sep 17 00:00:00 2001 From: Anton Dmitriev Date: Mon, 29 Oct 2018 18:52:27 +0300 Subject: [PATCH 046/540] Update after review. --- .../src/main/java/org/tensorflow/Server.java | 65 +++++++++---------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/tensorflow/java/src/main/java/org/tensorflow/Server.java b/tensorflow/java/src/main/java/org/tensorflow/Server.java index 727ef4af536..98b123be303 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/Server.java +++ b/tensorflow/java/src/main/java/org/tensorflow/Server.java @@ -15,8 +15,6 @@ limitations under the License. package org.tensorflow; -import java.util.concurrent.locks.ReadWriteLock; -import java.util.concurrent.locks.ReentrantReadWriteLock; /** * An in-process TensorFlow server, for use in distributed training. * @@ -36,6 +34,11 @@ import java.util.concurrent.locks.ReentrantReadWriteLock; * *

Using example: *

{@code
+ * import org.tensorflow.Server;
+ * import org.tensorflow.distruntime.ClusterDef;
+ * import org.tensorflow.distruntime.JobDef;
+ * import org.tensorflow.distruntime.ServerDef;
+ *
  * ClusterDef clusterDef = ClusterDef.newBuilder()
  *   .addJob(JobDef.newBuilder()
  *   .setName("worker")
@@ -70,49 +73,45 @@ public final class Server implements AutoCloseable {
   }
 
   /** Starts an in-process TensorFlow server. */
-  public void start() {
-    lock.readLock().lock();
-    try {
-      start(nativeHandle);
-    }
-    finally {
-      lock.readLock().unlock();
-    }
+  public synchronized void start() {
+    start(nativeHandle);
   }
 
   /**  Stops an in-process TensorFlow server. */
-  public void stop() {
-    lock.readLock().lock();
-    try {
-      stop(nativeHandle);
-    }
-    finally {
-      lock.readLock().unlock();
-    }
+  public synchronized void stop() {
+    stop(nativeHandle);
   }
 
   /** Blocks until the server has been successfully stopped. */
   public void join() {
-    lock.readLock().lock();
-    try {
-      join(nativeHandle);
+    long handle = 0;
+    synchronized(this) {
+      handle = nativeHandle;
+      if (handle != 0) {
+        numJoining++;
+      }
     }
-    finally {
-      lock.readLock().unlock();
+    try {
+      join(handle);
+    } finally {
+      synchronized(this) {
+        if (handle != 0) {
+          numJoining--;
+        } 
+        notifyAll();
+      }
     }
   }
 
   /** Destroy an in-process TensorFlow server, frees memory. */
   @Override
-  public void close() {
-    lock.writeLock().lock();
-    try {
-      delete(nativeHandle);
-      nativeHandle = 0;
-    }
-    finally {
-      lock.writeLock().unlock();
+  public synchronized void close() throws InterruptedException {
+    stop();
+    while (numJoining > 0) {
+      wait();
     }
+    delete(nativeHandle);
+    nativeHandle = 0;
   }
 
   private static native long allocate(byte[] serverDef);
@@ -125,10 +124,10 @@ public final class Server implements AutoCloseable {
 
   private static native void delete(long nativeHandle);
 
-  private final ReadWriteLock lock = new ReentrantReadWriteLock();
-
   private long nativeHandle;
 
+  private int numJoining;
+
   static {
     TensorFlow.init();
   }

From 2529a174bc01d6591656e98b7e9c9f5be1eb9631 Mon Sep 17 00:00:00 2001
From: Clayne Robison 
Date: Mon, 29 Oct 2018 17:24:31 -0700
Subject: [PATCH 047/540] [Intel MKL] Fixes the
 direct_session_with_tracking_alloc_test failure in public CI builds.

---
 .../common_runtime/direct_session_with_tracking_alloc_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
index 2c63b8704ee..be279cb1dd5 100644
--- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
@@ -117,13 +117,13 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelTest) {
           // which increments the value of AllocationId.
           // Thus AllocationId becomes more than TF if MKL
           // is used. Now IDs for MKL are 8 more than TF.
-          EXPECT_EQ(21, cm->AllocationId(node, 0));
+          EXPECT_EQ(13, cm->AllocationId(node, 0));
 #else
           EXPECT_EQ(13, cm->AllocationId(node, 0));
 #endif  // INTEL_MKL && ENABLE_MKL
         } else {
 #if defined(INTEL_MKL) && defined(ENABLE_MKL)
-          EXPECT_EQ(22, cm->AllocationId(node, 0));
+          EXPECT_EQ(14, cm->AllocationId(node, 0));
 #else
           EXPECT_EQ(14, cm->AllocationId(node, 0));
 #endif  // INTEL_MKL && ENABLE_MKL

From bdd60574d1b20f87c7ed60a0f04dd2dd2ecedc8e Mon Sep 17 00:00:00 2001
From: wangsiyu 
Date: Tue, 30 Oct 2018 11:59:05 +0800
Subject: [PATCH 048/540] Add parameter documentation and refine code

---
 .../python/parameter_server_strategy.py        |  2 +-
 .../python/parameter_server_strategy_test.py   |  2 --
 tensorflow/python/ops/variable_scope.py        | 18 ++++++++++++++++++
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy.py b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
index 52c8747a47d..6590f2a2157 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
@@ -232,7 +232,7 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
     return self._cross_tower_ops.broadcast(tensor, destinations)
 
   def _allow_variable_partition(self):
-    return True if not context.executing_eagerly() else False
+    return not context.executing_eagerly()
 
   # TODO(yuefengz): not all ops in device_setter.STANDARD_PS_OPS will go through
   # this creator, such as "MutableHashTable".
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
index f268ab31c10..86dfcec18e0 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
@@ -191,8 +191,6 @@ class ParameterServerStrategyTestBase(
                              config=sess_config) as sess, \
          d.scope():
 
-      # Define a variable outside the call_for_each_tower scope. This is not
-      # recommended.
       n = variable_scope.get_variable(
           'n',
           initializer=constant_op.constant([10.0, 20.0]),
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 8fb1d4be94d..3493f67ec1c 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -625,6 +625,15 @@ class _VariableStore(object):
         variable and return the Tensor for the projected value
         (which must have the same shape). Constraints are not safe to
         use when doing asynchronous distributed training.
+      synchronization: Indicates when a distributed variable will be
+        aggregated. Accepted values are constants defined in the class
+        `tf.VariableSynchronization`. By default the synchronization is set to
+        `AUTO` and the current `DistributionStrategy` chooses
+        when to synchronize. If `synchronization` is set to `ON_READ`,
+        `trainable` must not be set to `True`.
+      aggregation: Indicates how a distributed variable will be aggregated.
+        Accepted values are constants defined in the class
+        `tf.VariableAggregation`.
 
     Returns:
       A `PartitionedVariable` object.
@@ -1732,6 +1741,15 @@ def _get_partitioned_variable(name,
       variable and return the Tensor for the projected value
       (which must have the same shape). Constraints are not safe to
       use when doing asynchronous distributed training.
+    synchronization: Indicates when a distributed variable will be
+      aggregated. Accepted values are constants defined in the class
+      `tf.VariableSynchronization`. By default the synchronization is set to
+      `AUTO` and the current `DistributionStrategy` chooses
+      when to synchronize. If `synchronization` is set to `ON_READ`,
+      `trainable` must not be set to `True`.
+    aggregation: Indicates how a distributed variable will be aggregated.
+      Accepted values are constants defined in the class
+      `tf.VariableAggregation`.
 
   Returns:
     A tuple `(shards, partitions)` where `shards` is the list of `Variable`

From c172bec140ab6e27277fcd1761c700dd3f6adc36 Mon Sep 17 00:00:00 2001
From: mdfaijul 
Date: Tue, 30 Oct 2018 17:56:36 -0700
Subject: [PATCH 049/540] Comments added for enum types.

---
 tensorflow/core/kernels/BUILD   |   5 +-
 tensorflow/core/util/mkl_util.h | 214 ++++++++++++++++----------------
 2 files changed, 113 insertions(+), 106 deletions(-)

diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 752fd70b079..4125ed1ffa4 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -6422,7 +6422,10 @@ tf_cc_test(
 
 tf_mkl_kernel_library(
     name = "mkl_conv_op",
-    hdrs = ["mkl_quantized_conv_ops.h"],
+    hdrs = [
+        "mkl_quantized_conv_ops.h",
+        "no_op.h",
+    ],
     prefix = "mkl_conv",
     deps = [
         ":bounds_check",
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 2cff4dfbd5c..3108664a56e 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -17,8 +17,8 @@ limitations under the License.
 #define TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
 #ifdef INTEL_MKL
 
-#include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -54,9 +54,9 @@ limitations under the License.
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/util/env_var.h"
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
-#include "tensorflow/core/util/env_var.h"
 
 #ifndef INTEL_MKL_ML_ONLY
 #include "mkldnn.hpp"
@@ -83,7 +83,12 @@ namespace tensorflow {
 // MKL operation, and did not go through a conversion to a standard
 // Tensorflow tensor.
 
+// For use with MKL ML, has been deprecated
 typedef enum { W = 0, H = 1, C = 2, N = 3 } MklDims;
+
+// The dimensions order that MKL DNN internally uses for 2D activations
+// [Batch, Channel, Height, Width] and
+// for 2D filters [Out_Channel, In_Channel, Height, Width].
 typedef enum {
   Dim_N = 0,
   Dim_C = 1,
@@ -93,6 +98,9 @@ typedef enum {
   Dim_I = 1
 } MklDnnDims;
 
+// The dimensions order that MKL DNN internally uses for 3D activations
+// [Batch, Channel, Depth, Height, Width] and
+// for 2D filters [Out_Channel, In_Channel, Depth, Height, Width].
 typedef enum {
   Dim3d_N = 0,
   Dim3d_C = 1,
@@ -103,10 +111,13 @@ typedef enum {
   Dim3d_I = 1
 } MklDnnDims3D;
 
+// Enum used to templatize MklOp kernel implementations
+// that support both fp32 and int8 versions.
 enum class MklQuantization {
   QUANTIZED_VERSION,
   FP_VERSION,
-}; 
+};
+
 static const int kSmallBatchSize = 32;
 
 #ifdef INTEL_MKL_ML_ONLY
@@ -255,32 +266,32 @@ class MklShape {
     CHECK_EQ(dnnDelete_F32(convert), E_SUCCESS);
   }
 
-  // The following methods are used for serializing and de-serializing the
-  // contents of the mklshape object.
-  // The data is serialized in this order
-  // isMklTensor_
-  // dimension_
-  // sizes_
-  // strides_
-  // mklLayout_
-  // tfLayout_
-  // tf_to_mkl_dim_map_
+// The following methods are used for serializing and de-serializing the
+// contents of the mklshape object.
+// The data is serialized in this order
+// isMklTensor_
+// dimension_
+// sizes_
+// strides_
+// mklLayout_
+// tfLayout_
+// tf_to_mkl_dim_map_
 
 #define SIZE_OF_MKL_DNN_BUF \
   (dnnLayoutSerializationBufferSize_F32())  // Size of buffer needed to
                                             // serialize dnn_layout pointer
 
-  // Size of buffer to hold the serialized object, the size is computed as
-  // follows sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes_) +
-  // sizeof(strides_)
-  // + sizeof(mklLayout_ buffer) + sizeof(tfLayout_ buffer)
-  // + sizeof(tf_to_mkl_dim_map_)
+// Size of buffer to hold the serialized object, the size is computed as
+// follows sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes_) +
+// sizeof(strides_)
+// + sizeof(mklLayout_ buffer) + sizeof(tfLayout_ buffer)
+// + sizeof(tf_to_mkl_dim_map_)
 
 #define SIZE_OF_MKL_SERIAL_DATA(dims) \
   (2 * sizeof(size_t) + 3 * dims * sizeof(size_t) + 2 * SIZE_OF_MKL_DNN_BUF)
 
-  // First we need to define some macro for offsets into the serial buffer where
-  // different elements of Mklshape is written/read from
+// First we need to define some macro for offsets into the serial buffer where
+// different elements of Mklshape is written/read from
 
 #define IS_MKL_TENSOR_OFFSET 0
 // Location from start of buffer where isMklTensor_ is serialized
@@ -657,7 +668,6 @@ class MklDnnShape {
     }
   }
 
-
   inline void SetTfDimOrder(const size_t dimension, memory::format format) {
     TensorFormat data_format = MklDnnDataFormatToTFDataFormat(format);
     SetTfDimOrder(dimension, data_format);
@@ -786,7 +796,8 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
 }
 #else
 using mkldnn::stream;
-template  class MklDnnData;
+template 
+class MklDnnData;
 
 template 
 inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
@@ -796,11 +807,12 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
     if (!mkl_shape.IsMklTensor())
       return mkl_tensor;  // return input since it is already TF tensor
 
-    TensorShape output_shape = mkl_shape.GetTfShape();;
+    TensorShape output_shape = mkl_shape.GetTfShape();
+    ;
 
     // Allocate output tensor.
-    context->allocate_temp(DataTypeToEnum::v(),
-        output_shape, &output_tensor);
+    context->allocate_temp(DataTypeToEnum::v(), output_shape,
+                           &output_tensor);
 
     auto cpu_engine = engine(engine::cpu, 0);
     MklDnnData input(&cpu_engine);
@@ -815,16 +827,16 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
     if (input.IsReorderNeeded(output_tf_pd)) {
       std::vector net;
       CHECK_EQ(input.CheckReorderToOpMem(output_tf_pd, &output_tensor, &net),
-             true);
+               true);
       stream(stream::kind::eager).submit(net).wait();
     } else {
       // If not, just forward input tensor to output tensor.
       CHECK(output_tensor.CopyFrom(mkl_tensor, output_shape));
     }
   } catch (mkldnn::error& e) {
-    string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + string(e.message) + ", in file " +
-                       string(__FILE__) + ":" + std::to_string(__LINE__);
+    string error_msg = "Status: " + std::to_string(e.status) + ", message: " +
+                       string(e.message) + ", in file " + string(__FILE__) +
+                       ":" + std::to_string(__LINE__);
     LOG(FATAL) << "Operation received an exception: " << error_msg;
   }
   return output_tensor;
@@ -2019,8 +2031,7 @@ const mkldnn::memory::dims NONE_DIMS = {};
 template 
 class MklPrimitiveFactory {
  public:
-  MklPrimitiveFactory() {
-  }
+  MklPrimitiveFactory() {}
 
   ~MklPrimitiveFactory() {}
 
@@ -2048,8 +2059,8 @@ class MklPrimitiveFactory {
   /// For those legacy device(w/o AVX512 and AVX2),
   /// MKL-DNN GEMM will be used.
   static inline bool IsLegacyPlatform() {
-    return (!port::TestCPUFeature(port::CPUFeature::AVX512F)
-                   && !port::TestCPUFeature(port::CPUFeature::AVX2));
+    return (!port::TestCPUFeature(port::CPUFeature::AVX512F) &&
+            !port::TestCPUFeature(port::CPUFeature::AVX2));
   }
 
   /// Fuction to check whether primitive memory optimization is enabled
@@ -2070,15 +2081,13 @@ class MklPrimitiveFactory {
 // utility class for creating keys of MKL primitive pool.
 class FactoryKeyCreator {
  public:
-  FactoryKeyCreator() {
-    key_.reserve(kMaxKeyLength);
-  }
+  FactoryKeyCreator() { key_.reserve(kMaxKeyLength); }
 
   ~FactoryKeyCreator() {}
 
   void AddAsKey(const string& str) { Append(str); }
 
-  void AddAsKey(const mkldnn::memory::dims &dims) {
+  void AddAsKey(const mkldnn::memory::dims& dims) {
     for (unsigned int i = 0; i < dims.size(); i++) {
       AddAsKey(dims[i]);
     }
@@ -2086,7 +2095,7 @@ class FactoryKeyCreator {
 
   template 
   void AddAsKey(const T data) {
-    auto buffer = reinterpret_cast(&data);
+    auto buffer = reinterpret_cast(&data);
     Append(StringPiece(buffer, sizeof(T)));
   }
 
@@ -2102,7 +2111,6 @@ class FactoryKeyCreator {
   }
 };
 
-
 static inline memory::format get_desired_format(int channel,
                                                 bool is_2d = true) {
   memory::format fmt_desired = memory::format::any;
@@ -2124,37 +2132,34 @@ class MklReorderPrimitive : public MklPrimitive {
   explicit MklReorderPrimitive(const memory* from, const memory* to) {
     Setup(from, to);
   }
-    ~MklReorderPrimitive() {}
+  ~MklReorderPrimitive() {}
 
-    std::shared_ptr GetPrimitive() {
-      return context_.reorder_prim;
-    }
+  std::shared_ptr GetPrimitive() { return context_.reorder_prim; }
 
-    void SetMemory(const memory* from, const memory* to) {
-      context_.src_mem->set_data_handle(from->get_data_handle());
-      context_.dst_mem->set_data_handle(to->get_data_handle());
-    }
+  void SetMemory(const memory* from, const memory* to) {
+    context_.src_mem->set_data_handle(from->get_data_handle());
+    context_.dst_mem->set_data_handle(to->get_data_handle());
+  }
 
  private:
-    struct ReorderContext {
-      std::shared_ptr src_mem;
-      std::shared_ptr dst_mem;
-      std::shared_ptr reorder_prim;
-      ReorderContext():
-        src_mem(nullptr), dst_mem(nullptr), reorder_prim(nullptr) {
-      }
-    } context_;
+  struct ReorderContext {
+    std::shared_ptr src_mem;
+    std::shared_ptr dst_mem;
+    std::shared_ptr reorder_prim;
+    ReorderContext()
+        : src_mem(nullptr), dst_mem(nullptr), reorder_prim(nullptr) {}
+  } context_;
 
-    engine cpu_engine_ = engine(engine::cpu, 0);
+  engine cpu_engine_ = engine(engine::cpu, 0);
 
-    void Setup(const memory* from, const memory* to) {
-      context_.src_mem.reset(new memory(
-            {from->get_primitive_desc().desc(), cpu_engine_}, DummyData));
-      context_.dst_mem.reset(new memory(
-            {to->get_primitive_desc().desc(), cpu_engine_}, DummyData));
-      context_.reorder_prim = std::make_shared(
-          reorder(*context_.src_mem, *context_.dst_mem));
-    }
+  void Setup(const memory* from, const memory* to) {
+    context_.src_mem.reset(new memory(
+        {from->get_primitive_desc().desc(), cpu_engine_}, DummyData));
+    context_.dst_mem.reset(
+        new memory({to->get_primitive_desc().desc(), cpu_engine_}, DummyData));
+    context_.reorder_prim = std::make_shared(
+        reorder(*context_.src_mem, *context_.dst_mem));
+  }
 };
 
 template 
@@ -2172,52 +2177,51 @@ class MklReorderPrimitiveFactory : public MklPrimitiveFactory {
     return reorderPrim;
   }
 
-    static MklReorderPrimitiveFactory & GetInstance() {
-      static MklReorderPrimitiveFactory instance_;
-      return instance_;
-    }
+  static MklReorderPrimitiveFactory& GetInstance() {
+    static MklReorderPrimitiveFactory instance_;
+    return instance_;
+  }
 
  private:
-    MklReorderPrimitiveFactory() {}
-    ~MklReorderPrimitiveFactory() {}
+  MklReorderPrimitiveFactory() {}
+  ~MklReorderPrimitiveFactory() {}
 
-    static string CreateKey(const memory* from, const memory* to) {
-      string prefix = "reorder";
-      FactoryKeyCreator key_creator;
-      auto const &from_desc =  from->get_primitive_desc().desc().data;
-      auto const &to_desc =  to->get_primitive_desc().desc().data;
-      const int KIdxFirstStride = 0;
-      memory::dims from_dims(from_desc.dims, &from_desc.dims[from_desc.ndims]);
-      memory::dims to_dims(to_desc.dims, &to_desc.dims[to_desc.ndims]);
-      memory::dims from_strides(
-          from_desc.layout_desc.blocking.strides[KIdxFirstStride],
-          &from_desc.layout_desc.blocking
-               .strides[KIdxFirstStride][from_desc.ndims]);
-      memory::dims to_strides(
-          to_desc.layout_desc.blocking.strides[KIdxFirstStride],
-          &to_desc.layout_desc.blocking
-               .strides[KIdxFirstStride][to_desc.ndims]);
-      key_creator.AddAsKey(prefix);
-      key_creator.AddAsKey(static_cast(from_desc.format));
-      key_creator.AddAsKey(static_cast(from_desc.data_type));
-      key_creator.AddAsKey(from_dims);
-      key_creator.AddAsKey(from_strides);
-      key_creator.AddAsKey(static_cast(to_desc.format));
-      key_creator.AddAsKey(static_cast(to_desc.data_type));
-      key_creator.AddAsKey(to_dims);
-      key_creator.AddAsKey(to_strides);
-      return key_creator.GetKey();
-    }
+  static string CreateKey(const memory* from, const memory* to) {
+    string prefix = "reorder";
+    FactoryKeyCreator key_creator;
+    auto const& from_desc = from->get_primitive_desc().desc().data;
+    auto const& to_desc = to->get_primitive_desc().desc().data;
+    const int KIdxFirstStride = 0;
+    memory::dims from_dims(from_desc.dims, &from_desc.dims[from_desc.ndims]);
+    memory::dims to_dims(to_desc.dims, &to_desc.dims[to_desc.ndims]);
+    memory::dims from_strides(
+        from_desc.layout_desc.blocking.strides[KIdxFirstStride],
+        &from_desc.layout_desc.blocking
+             .strides[KIdxFirstStride][from_desc.ndims]);
+    memory::dims to_strides(
+        to_desc.layout_desc.blocking.strides[KIdxFirstStride],
+        &to_desc.layout_desc.blocking.strides[KIdxFirstStride][to_desc.ndims]);
+    key_creator.AddAsKey(prefix);
+    key_creator.AddAsKey(static_cast(from_desc.format));
+    key_creator.AddAsKey(static_cast(from_desc.data_type));
+    key_creator.AddAsKey(from_dims);
+    key_creator.AddAsKey(from_strides);
+    key_creator.AddAsKey(static_cast(to_desc.format));
+    key_creator.AddAsKey(static_cast(to_desc.data_type));
+    key_creator.AddAsKey(to_dims);
+    key_creator.AddAsKey(to_strides);
+    return key_creator.GetKey();
+  }
 
-    MklPrimitive* GetReorder(const memory* from, const memory* to) {
-      string key = CreateKey(from, to);
-      return this->GetOp(key);
-    }
+  MklPrimitive* GetReorder(const memory* from, const memory* to) {
+    string key = CreateKey(from, to);
+    return this->GetOp(key);
+  }
 
-    void SetReorder(const memory* from, const memory* to, MklPrimitive* op) {
-      string key = CreateKey(from, to);
-      this->SetOp(key, op);
-    }
+  void SetReorder(const memory* from, const memory* to, MklPrimitive* op) {
+    string key = CreateKey(from, to);
+    this->SetOp(key, op);
+  }
 };
 
 /// Fuction to find(or create) a reorder from memory pointed by

From 5d15ff1178d44e2c12ec0ca175ef1c003da14b8f Mon Sep 17 00:00:00 2001
From: AG Ramesh 
Date: Tue, 30 Oct 2018 20:17:08 -0700
Subject: [PATCH 050/540] Fixed typo

---
 tensorflow/core/util/mkl_util.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 3108664a56e..15a714529b6 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -100,7 +100,7 @@ typedef enum {
 
 // The dimensions order that MKL DNN internally uses for 3D activations
 // [Batch, Channel, Depth, Height, Width] and
-// for 2D filters [Out_Channel, In_Channel, Depth, Height, Width].
+// for 3D filters [Out_Channel, In_Channel, Depth, Height, Width].
 typedef enum {
   Dim3d_N = 0,
   Dim3d_C = 1,

From 66c6d056739e328519d3fe6486f217ae6398b89f Mon Sep 17 00:00:00 2001
From: Anton Dmitriev 
Date: Wed, 31 Oct 2018 11:42:43 +0300
Subject: [PATCH 051/540] Update after review.

---
 tensorflow/c/c_api.cc                         | 2 ++
 tensorflow/java/src/main/native/server_jni.cc | 9 ++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 0d71aa3e942..c4609795ab8 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -2806,6 +2806,7 @@ TF_Buffer* TF_GetRegisteredKernelsForOp(const char* name, TF_Status* status) {
 
 // TF_Server functions ----------------------------------------------
 
+#ifndef __ANDROID__
 TF_Server::TF_Server(std::unique_ptr server)
     : server(std::move(server)) {}
 
@@ -2838,5 +2839,6 @@ void TF_ServerJoin(TF_Server* server, TF_Status* status) {
 }
 
 void TF_DeleteServer(TF_Server* server) { delete server; }
+#endif  // __ANDROID__
 
 }  // end extern "C"
diff --git a/tensorflow/java/src/main/native/server_jni.cc b/tensorflow/java/src/main/native/server_jni.cc
index 95a6278fb8c..68a53ef87c1 100644
--- a/tensorflow/java/src/main/native/server_jni.cc
+++ b/tensorflow/java/src/main/native/server_jni.cc
@@ -54,10 +54,11 @@ JNIEXPORT jlong JNICALL Java_org_tensorflow_Server_allocate(
 JNIEXPORT void JNICALL Java_org_tensorflow_Server_start(JNIEnv* env,
                                                         jclass clazz,
                                                         jlong handle) {
-  TF_Status* status = TF_NewStatus();
   TF_Server* server = requireHandle(env, handle);
   if (server == nullptr) return;
 
+  TF_Status* status = TF_NewStatus();
+
   TF_ServerStart(server, status);
   throwExceptionIfNotOK(env, status);
 
@@ -67,10 +68,11 @@ JNIEXPORT void JNICALL Java_org_tensorflow_Server_start(JNIEnv* env,
 JNIEXPORT void JNICALL Java_org_tensorflow_Server_stop(JNIEnv* env,
                                                        jclass clazz,
                                                        jlong handle) {
-  TF_Status* status = TF_NewStatus();
   TF_Server* server = requireHandle(env, handle);
   if (server == nullptr) return;
 
+  TF_Status* status = TF_NewStatus();
+
   TF_ServerStop(server, status);
   throwExceptionIfNotOK(env, status);
 
@@ -80,10 +82,11 @@ JNIEXPORT void JNICALL Java_org_tensorflow_Server_stop(JNIEnv* env,
 JNIEXPORT void JNICALL Java_org_tensorflow_Server_join(JNIEnv* env,
                                                        jclass clazz,
                                                        jlong handle) {
-  TF_Status* status = TF_NewStatus();
   TF_Server* server = requireHandle(env, handle);
   if (server == nullptr) return;
 
+  TF_Status* status = TF_NewStatus();
+
   TF_ServerJoin(server, status);
   throwExceptionIfNotOK(env, status);
 

From 59292f548ccb7454c4e4bf3bb7e3f51eab50251f Mon Sep 17 00:00:00 2001
From: Clayne Robison 
Date: Thu, 1 Nov 2018 12:36:12 -0700
Subject: [PATCH 052/540] Addressing @penpornk's comments

---
 .../direct_session_with_tracking_alloc_test.cc   | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
index be279cb1dd5..a48ff0541d5 100644
--- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
@@ -108,25 +108,9 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelTest) {
         EXPECT_EQ(2, shape.dim(0).size());
         EXPECT_EQ(1, shape.dim(1).size());
         if (node->name() == y->name()) {
-#if defined(INTEL_MKL) && defined(ENABLE_MKL)
-          // if MKL is used, it goes through various additional
-          // graph rewrite pass. In TF, everytime a graph pass
-          // happens, "constant" nodes are allocated
-          // and deallocated. Each allocation calls the
-          // (FindChunkPtr of BFCAllocator),
-          // which increments the value of AllocationId.
-          // Thus AllocationId becomes more than TF if MKL
-          // is used. Now IDs for MKL are 8 more than TF.
           EXPECT_EQ(13, cm->AllocationId(node, 0));
-#else
-          EXPECT_EQ(13, cm->AllocationId(node, 0));
-#endif  // INTEL_MKL && ENABLE_MKL
         } else {
-#if defined(INTEL_MKL) && defined(ENABLE_MKL)
           EXPECT_EQ(14, cm->AllocationId(node, 0));
-#else
-          EXPECT_EQ(14, cm->AllocationId(node, 0));
-#endif  // INTEL_MKL && ENABLE_MKL
         }
       }
       EXPECT_LE(0, cm->MaxExecutionTime(node));

From 73ae4f67a46c64068913344286090b3ccc0df94b Mon Sep 17 00:00:00 2001
From: avijit-nervana 
Date: Thu, 1 Nov 2018 13:36:08 -0700
Subject: [PATCH 053/540] Updated to nGraph version v0.9.1 and ngraph-tf
 version v0.7.0

---
 tensorflow/workspace.bzl           | 16 ++++++++--------
 third_party/ngraph/ngraph.BUILD    | 19 ++++++++++++-------
 third_party/ngraph/ngraph_tf.BUILD |  3 +++
 3 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index c2a0586108d..a4737bdb5d3 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -803,11 +803,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "ngraph",
         build_file = clean_dep("//third_party/ngraph:ngraph.BUILD"),
-        sha256 = "bf9dcc88e5c66021e3aac80491a231711211540d613bf9b6bd28db3f5bb86b62",
-        strip_prefix = "ngraph-0.8.1",
+        sha256 = "2b28f9c9f063b96825a96d56d7f7978c9a1c55c9b25175c20dd49a8a77cb0305",
+        strip_prefix = "ngraph-0.9.1",
         urls = [
-            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph/archive/v0.8.1.tar.gz",
-            "https://github.com/NervanaSystems/ngraph/archive/v0.8.1.tar.gz",
+            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph/archive/v0.9.1.tar.gz",
+            "https://github.com/NervanaSystems/ngraph/archive/v0.9.1.tar.gz",
         ],
     )
 
@@ -825,11 +825,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "ngraph_tf",
         build_file = clean_dep("//third_party/ngraph:ngraph_tf.BUILD"),
-        sha256 = "402f84c748c113780a60f35f39aab118435285543aee4900d712b76fbf8a21ee",
-        strip_prefix = "ngraph-tf-0.6.1",
+        sha256 = "89accbc702e68a09775f1011a99dd16561038fd1ce59d566d64450176abaae5c",
+        strip_prefix = "ngraph-tf-0.7.0",
         urls = [
-            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph-tf/archive/v0.6.1.tar.gz",
-            "https://github.com/NervanaSystems/ngraph-tf/archive/v0.6.1.tar.gz",
+            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph-tf/archive/v0.7.0.tar.gz",
+            "https://github.com/NervanaSystems/ngraph-tf/archive/v0.7.0.tar.gz",
         ],
     )
 
diff --git a/third_party/ngraph/ngraph.BUILD b/third_party/ngraph/ngraph.BUILD
index 6602a480afb..f556c5279df 100644
--- a/third_party/ngraph/ngraph.BUILD
+++ b/third_party/ngraph/ngraph.BUILD
@@ -34,8 +34,9 @@ cc_library(
         "src/ngraph/runtime/cpu/builder/one_hot.cpp",
         "src/ngraph/runtime/cpu/builder/pad.cpp",
         "src/ngraph/runtime/cpu/builder/product.cpp",
-        "src/ngraph/runtime/cpu/builder/quantize.cpp",
+        "src/ngraph/runtime/cpu/builder/quantization.cpp",
         "src/ngraph/runtime/cpu/builder/quantized_avg_pool.cpp",
+        "src/ngraph/runtime/cpu/builder/quantized_conv.cpp",
         "src/ngraph/runtime/cpu/builder/quantized_max_pool.cpp",
         "src/ngraph/runtime/cpu/builder/reduce_function.cpp",
         "src/ngraph/runtime/cpu/builder/reduce_function_window.cpp",
@@ -61,6 +62,7 @@ cc_library(
         "src/ngraph/runtime/cpu/cpu_tensor_view.cpp",
         "src/ngraph/runtime/cpu/cpu_tensor_view_wrapper.cpp",
         "src/ngraph/runtime/cpu/cpu_tracing.cpp",
+        "src/ngraph/runtime/cpu/cpu_visualize_tree.cpp",
         "src/ngraph/runtime/cpu/kernel/eigen_thread_pool.cpp",
         "src/ngraph/runtime/cpu/kernel/pad.cpp",
         "src/ngraph/runtime/cpu/kernel/reduce_max.cpp",
@@ -76,15 +78,11 @@ cc_library(
         "src/ngraph/runtime/cpu/op/conv_bias.cpp",
         "src/ngraph/runtime/cpu/op/conv_relu.cpp",
         "src/ngraph/runtime/cpu/op/convert_layout.cpp",
-        "src/ngraph/runtime/cpu/op/dequantize.cpp",
         "src/ngraph/runtime/cpu/op/group_conv.cpp",
         "src/ngraph/runtime/cpu/op/loop_kernel.cpp",
         "src/ngraph/runtime/cpu/op/lstm.cpp",
         "src/ngraph/runtime/cpu/op/matmul_bias.cpp",
         "src/ngraph/runtime/cpu/op/max_pool_with_indices.cpp",
-        "src/ngraph/runtime/cpu/op/quantize.cpp",
-        "src/ngraph/runtime/cpu/op/quantized_avg_pool.cpp",
-        "src/ngraph/runtime/cpu/op/quantized_max_pool.cpp",
         "src/ngraph/runtime/cpu/op/rnn.cpp",
         "src/ngraph/runtime/cpu/op/sigmoid_mul.cpp",
         "src/ngraph/runtime/cpu/pass/cpu_assignment.cpp",
@@ -110,8 +108,9 @@ cc_library(
         "-I external/ngraph/src",
         "-I external/nlohmann_json_lib/include/",
         '-D SHARED_LIB_EXT=\\".so\\"',
-        '-D NGRAPH_VERSION=\\"0.8.1\\"',
+        '-D NGRAPH_VERSION=\\"0.9.1\\"',
         "-D NGRAPH_DEX_ONLY",
+        '-D PROJECT_ROOT_DIR=\\"\\"',
     ],
     visibility = ["//visibility:public"],
     alwayslink = 1,
@@ -125,6 +124,11 @@ cc_library(
         "src/ngraph/builder/*.cpp",
         "src/ngraph/descriptor/*.cpp",
         "src/ngraph/descriptor/layout/*.cpp",
+        "src/ngraph/op/experimental/quantized_avg_pool.cpp",
+        "src/ngraph/op/experimental/quantized_conv_bias.cpp",
+        "src/ngraph/op/experimental/quantized_conv_relu.cpp",
+        "src/ngraph/op/experimental/quantized_conv.cpp",
+        "src/ngraph/op/experimental/quantized_max_pool.cpp",
         "src/ngraph/op/*.cpp",
         "src/ngraph/op/util/*.cpp",
         "src/ngraph/pattern/*.cpp",
@@ -144,7 +148,8 @@ cc_library(
         "-I external/ngraph/src",
         "-I external/nlohmann_json_lib/include/",
         '-D SHARED_LIB_EXT=\\".so\\"',
-        '-D NGRAPH_VERSION=\\"0.8.1\\"',
+        '-D NGRAPH_VERSION=\\"0.9.1\\"',
+        '-D PROJECT_ROOT_DIR=\\"\\"',
     ],
     visibility = ["//visibility:public"],
     alwayslink = 1,
diff --git a/third_party/ngraph/ngraph_tf.BUILD b/third_party/ngraph/ngraph_tf.BUILD
index dbedca0a03c..068e411e81b 100644
--- a/third_party/ngraph/ngraph_tf.BUILD
+++ b/third_party/ngraph/ngraph_tf.BUILD
@@ -51,6 +51,9 @@ cc_library(
         "@org_tensorflow//tensorflow/core:framework_headers_lib",
         "@org_tensorflow//tensorflow/core:core_cpu_headers_lib",
         "@ngraph//:ngraph_core",
+        "@com_google_absl//absl/container:container_memory",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/types:variant",
     ],
     copts = [
         "-I external/ngraph_tf/src",

From a29604fece6a558450a3e7c14b65cef16b0066d1 Mon Sep 17 00:00:00 2001
From: frreiss 
Date: Thu, 1 Nov 2018 15:10:26 -0700
Subject: [PATCH 054/540] Make case work in eager mode with dict inputs

Correct pylint issues
---
 tensorflow/python/ops/control_flow_ops.py     | 37 ++++++++++++++-----
 .../python/ops/control_flow_ops_test.py       |  9 ++++-
 2 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 5604af665ef..0673cfabbfd 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -3611,12 +3611,23 @@ def _case_verify_and_canonicalize_args(pred_fn_pairs, exclusive, name,
   if isinstance(pred_fn_pairs, collections.OrderedDict):
     pred_fn_pairs = pred_fn_pairs.items()
   elif isinstance(pred_fn_pairs, dict):
-    pred_fn_pairs = sorted(pred_fn_pairs.items(), key=lambda item: item[0].name)
-    if not exclusive:
-      logging.warn(
-          "%s: An unordered dictionary of predicate/fn pairs was "
-          "provided, but exclusive=False. The order of conditional "
-          "tests is deterministic but not guaranteed.", name)
+    if context.executing_eagerly():
+      # No name to sort on in eager mode. Use dictionary traversal order, 
+      # which is nondeterministic in versions of Python < 3.6
+      if not exclusive and not isinstance(pred_fn_pairs, 
+                                          collections.OrderedDict):
+        raise ValueError("Unordered dictionaries are not supported for the "
+                         "`pred_fn_pairs` argument when `exclusive=False` and "
+                         "eager mode is enabled.")
+      pred_fn_pairs = list(pred_fn_pairs.items())
+    else:
+      pred_fn_pairs = sorted(pred_fn_pairs.items(), 
+                             key=lambda item: item[0].name)
+      if not exclusive:
+        logging.warn(
+            "%s: An unordered dictionary of predicate/fn pairs was "
+            "provided, but exclusive=False. The order of conditional "
+            "tests is deterministic but not guaranteed.", name)
   for pred_fn_pair in pred_fn_pairs:
     if not isinstance(pred_fn_pair, _basetuple) or len(pred_fn_pair) != 2:
       raise TypeError("Each entry in pred_fn_pairs must be a 2-tuple")
@@ -3712,7 +3723,7 @@ def case(pred_fn_pairs,
   operation returns the tensors generated by `default`.
 
   `tf.case` supports nested structures as implemented in
-  `tensorflow.python.util.nest`. All of the callables must return the same
+  `tf.contrib.framework.nest`. All of the callables must return the same
   (possibly nested) value structure of lists, tuples, and/or named tuples.
   Singleton lists and tuples form the only exceptions to this: when returned by
   a callable, they are implicitly unpacked to single values. This
@@ -3723,6 +3734,12 @@ def case(pred_fn_pairs,
   deterministic, so that variables created in conditional branches are created
   in fixed order across runs.
 
+  @compatibility{eager}
+  Unordered dictionaries are not supported in eager mode when `exclusive=False`.
+  Use a list of tuples instead.
+  @end_compatibility
+
+
   **Example 1:**
 
   Pseudocode:
@@ -3737,7 +3754,7 @@ def case(pred_fn_pairs,
   ```python
   f1 = lambda: tf.constant(17)
   f2 = lambda: tf.constant(23)
-  r = case([(tf.less(x, y), f1)], default=f2)
+  r = tf.case([(tf.less(x, y), f1)], default=f2)
   ```
 
   **Example 2:**
@@ -3745,7 +3762,7 @@ def case(pred_fn_pairs,
   Pseudocode:
 
   ```
-  if (x < y && x > z) raise OpError("Only one predicate may evaluate true");
+  if (x < y && x > z) raise OpError("Only one predicate may evaluate to True");
   if (x < y) return 17;
   else if (x > z) return 23;
   else return -1;
@@ -3757,7 +3774,7 @@ def case(pred_fn_pairs,
   def f1(): return tf.constant(17)
   def f2(): return tf.constant(23)
   def f3(): return tf.constant(-1)
-  r = case({tf.less(x, y): f1, tf.greater(x, z): f2},
+  r = tf.case({tf.less(x, y): f1, tf.greater(x, z): f2},
            default=f3, exclusive=True)
   ```
 
diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py
index 22263c702ff..d65755a960a 100644
--- a/tensorflow/python/ops/control_flow_ops_test.py
+++ b/tensorflow/python/ops/control_flow_ops_test.py
@@ -936,7 +936,14 @@ class CaseTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "Input error:"):
         sess.run(output, feed_dict={x: 4})
 
-
+  @test_util.run_in_graph_and_eager_modes
+  def testCase_dict_eagerMode(self):
+    x = constant_op.constant(2)
+    conditions = {math_ops.equal(x, 1): lambda: constant_op.constant(2),
+                  math_ops.equal(x, 2): lambda: constant_op.constant(4)}
+    output = control_flow_ops.case(conditions, exclusive=True)
+    self.assertEqual(4, self.evaluate(output))
+    
 class WhileLoopTestCase(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes

From 3e8295b03eac9c0e84ca6707000bade78c360221 Mon Sep 17 00:00:00 2001
From: Michael 
Date: Fri, 2 Nov 2018 12:52:38 +0000
Subject: [PATCH 055/540] Update protobuf for Python 3.7 compat

Regards tensorflow/tensorflow#20517
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 08739f15548..60a54bec190 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -345,11 +345,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     )
 
     PROTOBUF_URLS = [
-        "https://mirror.bazel.build/github.com/google/protobuf/archive/v3.6.0.tar.gz",
-        "https://github.com/google/protobuf/archive/v3.6.0.tar.gz",
+        "https://mirror.bazel.build/github.com/google/protobuf/archive/v3.6.1.tar.gz",
+        "https://github.com/google/protobuf/archive/v3.6.1.tar.gz",
     ]
-    PROTOBUF_SHA256 = "50a5753995b3142627ac55cfd496cebc418a2e575ca0236e29033c67bd5665f4"
-    PROTOBUF_STRIP_PREFIX = "protobuf-3.6.0"
+    PROTOBUF_SHA256 = "3d4e589d81b2006ca603c1ab712c9715a76227293032d05b26fca603f90b3f5b"
+    PROTOBUF_STRIP_PREFIX = "protobuf-3.6.1"
 
     tf_http_archive(
         name = "protobuf_archive",

From 1da12a4352e971c9527bc90d1b9fa79d865f3959 Mon Sep 17 00:00:00 2001
From: Anton Dmitriev 
Date: Mon, 10 Sep 2018 18:37:09 +0300
Subject: [PATCH 056/540] Add IGFS (Apache Ignite File System) support.

---
 tensorflow/contrib/BUILD                      |   2 +
 tensorflow/contrib/igfs/BUILD                 | 132 ++++++++
 tensorflow/contrib/igfs/README.md             |  34 ++
 tensorflow/contrib/igfs/__init__.py           |  34 ++
 tensorflow/contrib/igfs/kernels/igfs.cc       | 320 ++++++++++++++++++
 tensorflow/contrib/igfs/kernels/igfs.h        |  62 ++++
 .../igfs/kernels/igfs_random_access_file.h    |  40 +++
 .../contrib/igfs/kernels/igfs_writable_file.h |  42 +++
 tensorflow/contrib/igfs/ops/igfs_ops.cc       |  26 ++
 .../contrib/igfs/python/ops/igfs_op_loader.py |  25 ++
 .../contrib/igfs/python/ops/igfs_ops.py       |  29 ++
 .../tests/config/ignite-config-igfs.xml       |  55 +++
 .../contrib/igfs/python/tests/igfs_test.py    | 203 +++++++++++
 tensorflow/contrib/ignite/BUILD               |  33 +-
 .../contrib/ignite/kernels/ignite_client.cc   |  26 ++
 15 files changed, 1052 insertions(+), 11 deletions(-)
 create mode 100644 tensorflow/contrib/igfs/BUILD
 create mode 100644 tensorflow/contrib/igfs/README.md
 create mode 100644 tensorflow/contrib/igfs/__init__.py
 create mode 100644 tensorflow/contrib/igfs/kernels/igfs.cc
 create mode 100644 tensorflow/contrib/igfs/kernels/igfs.h
 create mode 100644 tensorflow/contrib/igfs/kernels/igfs_random_access_file.h
 create mode 100644 tensorflow/contrib/igfs/kernels/igfs_writable_file.h
 create mode 100644 tensorflow/contrib/igfs/ops/igfs_ops.cc
 create mode 100644 tensorflow/contrib/igfs/python/ops/igfs_op_loader.py
 create mode 100644 tensorflow/contrib/igfs/python/ops/igfs_ops.py
 create mode 100644 tensorflow/contrib/igfs/python/tests/config/ignite-config-igfs.xml
 create mode 100644 tensorflow/contrib/igfs/python/tests/igfs_test.py
 create mode 100644 tensorflow/contrib/ignite/kernels/ignite_client.cc

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 832db0f4ab4..40863def8a4 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -157,6 +157,7 @@ py_library(
         "//tensorflow:no_ignite_support": [],
         "//conditions:default": [
             "//tensorflow/contrib/ignite",
+            "//tensorflow/contrib/igfs",
         ],
     }),
 )
@@ -248,6 +249,7 @@ cc_library(
         "//tensorflow:no_ignite_support": [],
         "//conditions:default": [
             "//tensorflow/contrib/ignite:dataset_ops_op_lib",
+            "//tensorflow/contrib/igfs:igfs_ops_op_lib",
         ],
     }),
 )
diff --git a/tensorflow/contrib/igfs/BUILD b/tensorflow/contrib/igfs/BUILD
new file mode 100644
index 00000000000..4499c96eef9
--- /dev/null
+++ b/tensorflow/contrib/igfs/BUILD
@@ -0,0 +1,132 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_gen_op_wrapper_py",
+    "tf_kernel_library",
+    "tf_custom_op_library",
+    "tf_custom_op_py_library",
+    "tf_gen_op_libs",
+    "tf_py_test",
+    "tf_cc_shared_object",
+)
+
+py_library(
+    name = "igfs",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":igfs_ops",
+    ],
+)
+
+tf_custom_op_library(
+    name = "_igfs_ops.so",
+    srcs = [
+        "ops/igfs_ops.cc",
+        "kernels/igfs.h",
+    ],
+    deps = [":igfs_kernels"],
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["igfs_ops"],
+    deps = [":igfs_kernels"]
+)
+
+cc_library(
+    name = "igfs_kernels",
+    srcs = [
+        "kernels/igfs.h",
+        "kernels/igfs.cc",
+        "kernels/igfs_random_access_file.h",
+        "kernels/igfs_random_access_file.cc",
+        "kernels/igfs_writable_file.h",
+        "kernels/igfs_writable_file.cc",
+        "kernels/igfs_client.h",
+        "kernels/igfs_client.cc",
+        "kernels/igfs_messages.h",
+        "kernels/igfs_messages.cc",
+        "kernels/igfs_extended_tcp_client.h",
+        "kernels/igfs_extended_tcp_client.cc"
+    ],
+    deps = [
+        "//tensorflow/contrib/ignite:ignite_client",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal"
+    ],
+    alwayslink=1,
+)
+
+py_library(
+    name = "igfs_ops",
+    srcs = [
+        "python/ops/igfs_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":igfs_op_loader",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/util:nest",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_igfs_ops",
+    out = "python/ops/gen_igfs_ops.py",
+    deps = [":igfs_ops_op_lib"],
+)
+
+tf_kernel_library(
+    name = "igfs_ops_kernels",
+    deps = [
+        ":igfs_kernels",
+        "//tensorflow/core:framework",
+    ],
+    alwayslink = 1,
+)
+
+tf_custom_op_py_library(
+    name = "igfs_op_loader",
+    srcs = ["python/ops/igfs_op_loader.py"],
+    dso = [":_igfs_ops.so"],
+    kernels = [
+        ":igfs_ops_kernels",
+        ":igfs_ops_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":gen_igfs_ops",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:platform",
+    ],
+)
+
+# The Apache Ignite servers have to setup before the test and tear down
+# after the test manually. The docker engine has to be installed.
+#
+# To setup Apache Ignite servers:
+# $ bash ./python/tests/start_ignite.sh
+#
+# To tear down Apache Ignite servers:
+# $ bash ./python/tests/stop_ignite.sh
+tf_py_test(
+    name = "igfs_test_py",
+    srcs = ["python/tests/igfs_test.py"],
+    additional_deps = [
+        ":igfs",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+    tags = [
+        "manual",
+        "no_windows",
+        "notap",
+    ],
+)
diff --git a/tensorflow/contrib/igfs/README.md b/tensorflow/contrib/igfs/README.md
new file mode 100644
index 00000000000..fa7d677d433
--- /dev/null
+++ b/tensorflow/contrib/igfs/README.md
@@ -0,0 +1,34 @@
+# IGFS (Ignite File System)
+
+- [Overview](#overview)
+- [Try it out](#try-it-out)
+
+## Overview
+
+[Apache Ignite](https://ignite.apache.org/) is a memory-centric distributed database, caching, and processing platform for
+transactional, analytical, and streaming workloads, delivering in-memory speeds at petabyte scale. In addition to database functionality Apache Ignite provides a distributed file system called [IGFS](https://ignite.apache.org/features/igfs.html). IGFS delivers a similar functionality to Hadoop HDFS, but only in-memory. In fact, in addition to its own APIs, IGFS implements Hadoop FileSystem API and can be transparently plugged into Hadoop or Spark deployments. This contrib package contains an integration between IGFS and TensorFlow. The integration is based on [custom filesystem plugin](https://www.tensorflow.org/extend/add_filesys) from TensorFlow side and [IGFS Native API](https://ignite.apache.org/features/igfs.html) from Apache Ignite side. It has numerous uses, for example:
+* Checkpoints of state can be saved to IGFS for reliability and fault-tolerance.
+* Training processes communicate with TensorBoard by writing event files to a directory, which TensorBoard watches. IGFS allows this communication to work even when TensorBoard runs in a different process or machine.
+
+## Try it out
+
+The simplest way to try IGFS with TensorFlow is to run a [Docker](https://www.docker.com/) container with Apache Ignite and enabled IGFS and then interact with it using TensorFlow [tf.gfile](https://www.tensorflow.org/api_docs/python/tf/gfile). Such a container is available on Docker Hub: [dmitrievanthony/ignite-with-igfs](https://hub.docker.com/r/dmitrievanthony/ignite-with-igfs/). You need to start this container on your machine:
+
+```
+docker run -it -p 10500:10500 dmitrievanthony/ignite-with-igfs
+```
+
+After that you will be able to work with it in the following way:
+
+```python
+>>> import tensorflow as tf
+>>> import tensorflow.contrib.igfs.python.ops.igfs_ops
+>>> 
+>>> with tf.gfile.Open("igfs:///hello.txt", mode='w') as w:
+>>>   w.write("Hello, world!")
+>>>
+>>> with tf.gfile.Open("igfs:///hello.txt", mode='r') as r:
+>>>   print(r.read())
+
+Hello, world!
+```
diff --git a/tensorflow/contrib/igfs/__init__.py b/tensorflow/contrib/igfs/__init__.py
new file mode 100644
index 00000000000..dfead6bb29b
--- /dev/null
+++ b/tensorflow/contrib/igfs/__init__.py
@@ -0,0 +1,34 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Apache Ignite is a memory-centric distributed database, caching, and
+   processing platform for transactional, analytical, and streaming workloads,
+   delivering in-memory speeds at petabyte scale. In addition to database
+   functionality Apache Ignite provides a distributed file system called
+   IGFS (https://ignite.apache.org/features/igfs.html). IGFS delivers a similar
+   functionality to Hadoop HDFS, but only in-memory. In fact, in addition to
+   its own APIs, IGFS implements Hadoop FileSystem API and can be transparently
+   plugged into Hadoop or Spark deployments. This contrib package contains an
+   integration between IGFS and TensorFlow.
+
+@@IGFS
+"""
+
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow.contrib.igfs.python.ops.igfs_ops
+from tensorflow.python.util.all_util import remove_undocumented
diff --git a/tensorflow/contrib/igfs/kernels/igfs.cc b/tensorflow/contrib/igfs/kernels/igfs.cc
new file mode 100644
index 00000000000..8ad5696b19d
--- /dev/null
+++ b/tensorflow/contrib/igfs/kernels/igfs.cc
@@ -0,0 +1,320 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/file_system.h"
+#include "tensorflow/core/platform/file_system_helper.h"
+
+#include "igfs.h"
+#include "igfs_client.h"
+#include "igfs_random_access_file.h"
+#include "igfs_writable_file.h"
+
+namespace tensorflow {
+
+std::string GetEnvOrElse(const std::string &env, std::string default_value) {
+  const char *env_c_str = env.c_str();
+  return getenv(env_c_str) != nullptr ? getenv(env_c_str) : default_value;
+}
+
+std::string IGFS::TranslateName(const std::string &name) const {
+  StringPiece scheme, namenode, path;
+  io::ParseURI(name, &scheme, &namenode, &path);
+  return path.ToString();
+}
+
+std::string MakeRelative(const std::string &a, const std::string &b) {
+  std::string max = a;
+  std::string min = b;
+  bool first = b.size() > a.size();
+
+  if (first) {
+    max = b;
+    min = a;
+  }
+
+  auto r = mismatch(min.begin(), min.end(), max.begin());
+  return std::string((first ? r.first : r.second),
+                     first ? min.end() : max.end());
+}
+
+IGFS::IGFS()
+    : host_(GetEnvOrElse("IGFS_HOST", "localhost")),
+      port_(atoi(GetEnvOrElse("IGFS_PORT", "10500").c_str())),
+      fs_name_(GetEnvOrElse("IGFS_FS_NAME", "default_fs")) {
+  LOG(INFO) << "IGFS created [host=" << host_ << ", port=" << port_
+            << ", fs_name=" << fs_name_ << "]";
+};
+
+IGFS::~IGFS() {
+  LOG(INFO) << "IGFS destroyed [host=" << host_ << ", port=" << port_
+            << ", fs_name=" << fs_name_ << "]";
+};
+
+Status IGFS::NewRandomAccessFile(const std::string &file_name,
+                                 std::unique_ptr *result) {
+  std::shared_ptr client = CreateClient();
+  std::string path = TranslateName(file_name);
+
+  CtrlResponse handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse open_read_response(true);
+  TF_RETURN_IF_ERROR(client->OpenRead(&open_read_response, path));
+
+  long resource_id = open_read_response.res.stream_id;
+  result->reset(new IGFSRandomAccessFile(path, resource_id, client));
+
+  LOG(INFO) << "New random access file completed successfully [file_name="
+            << file_name << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::NewWritableFile(const std::string &file_name,
+                             std::unique_ptr *result) {
+  std::shared_ptr client = CreateClient();
+  std::string path = TranslateName(file_name);
+
+  CtrlResponse handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse exists_response(false);
+  TF_RETURN_IF_ERROR(client->Exists(&exists_response, path));
+
+  if (exists_response.res.exists) {
+    CtrlResponse del_response(false);
+    TF_RETURN_IF_ERROR(client->Delete(&del_response, path, false));
+  }
+
+  CtrlResponse open_create_resp(false);
+  TF_RETURN_IF_ERROR(client->OpenCreate(&open_create_resp, path));
+
+  long resource_id = open_create_resp.res.stream_id;
+  result->reset(new IGFSWritableFile(path, resource_id, client));
+
+  LOG(INFO) << "New writable file completed successfully [file_name="
+            << file_name << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::NewAppendableFile(const std::string &file_name,
+                               std::unique_ptr *result) {
+  std::shared_ptr client = CreateClient();
+
+  CtrlResponse handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse exists_response(false);
+  TF_RETURN_IF_ERROR(client->Exists(&exists_response, file_name));
+
+  if (exists_response.res.exists) {
+    CtrlResponse del_response(false);
+    TF_RETURN_IF_ERROR(client->Delete(&del_response, file_name, false));
+  }
+
+  CtrlResponse open_append_resp(false);
+  TF_RETURN_IF_ERROR(client->OpenAppend(&open_append_resp, file_name));
+
+  result->reset(new IGFSWritableFile(TranslateName(file_name),
+                                     open_append_resp.res.stream_id, client));
+
+  LOG(INFO) << "New appendable file completed successfully [file_name="
+            << file_name << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::NewReadOnlyMemoryRegionFromFile(
+    const std::string &file_name,
+    std::unique_ptr *result) {
+  return errors::Unimplemented("IGFS does not support ReadOnlyMemoryRegion");
+}
+
+Status IGFS::FileExists(const std::string &file_name) {
+  std::shared_ptr client = CreateClient();
+  const std::string path = TranslateName(file_name);
+
+  CtrlResponse handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse exists_response(false);
+  TF_RETURN_IF_ERROR(client->Exists(&exists_response, path));
+
+  if (!exists_response.res.exists)
+    return errors::NotFound("File ", path, " not found");
+
+  LOG(INFO) << "File exists completed successfully [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::GetChildren(const std::string &file_name,
+                         std::vector *result) {
+  std::shared_ptr client = CreateClient();
+  std::string path = TranslateName(file_name);
+
+  CtrlResponse handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse list_paths_response(false);
+  TF_RETURN_IF_ERROR(client->ListPaths(&list_paths_response, path));
+
+  *result = std::vector();
+  std::vector entries = list_paths_response.res.entries;
+
+  for (IGFSPath &value : entries)
+    result->push_back(MakeRelative(value.path, path));
+
+  LOG(INFO) << "Get children completed successfully [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::GetMatchingPaths(const std::string &pattern,
+                              std::vector *results) {
+  return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+}
+
+Status IGFS::DeleteFile(const std::string &file_name) {
+  std::shared_ptr client = CreateClient();
+  std::string path = TranslateName(file_name);
+
+  CtrlResponse handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse del_response(false);
+  TF_RETURN_IF_ERROR(client->Delete(&del_response, path, false));
+
+  if (!del_response.res.exists)
+    return errors::NotFound("File ", path, " not found");
+
+  LOG(INFO) << "Delete file completed successfully [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::CreateDir(const std::string &file_name) {
+  std::shared_ptr client = CreateClient();
+  const std::string path = TranslateName(file_name);
+
+  CtrlResponse handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse mkdir_response(false);
+  TF_RETURN_IF_ERROR(client->MkDir(&mkdir_response, path));
+
+  if (!mkdir_response.res.successful)
+    return errors::Internal("Can't create directory ", path);
+
+  LOG(INFO) << "Create dir completed successful [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+// Deletes a directory. Non-empty directories are rejected up front; a
+// missing directory is reported as NotFound (mirroring DeleteFile) instead
+// of silently succeeding.
+Status IGFS::DeleteDir(const std::string &file_name) {
+  std::shared_ptr client = CreateClient();
+  std::string path = TranslateName(file_name);
+
+  CtrlResponse handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  // Refuse to delete a directory that still has entries; the recursive
+  // delete below is only issued for an empty directory.
+  CtrlResponse list_files_response(false);
+  TF_RETURN_IF_ERROR(client->ListFiles(&list_files_response, path));
+
+  if (!list_files_response.res.entries.empty()) {
+    return errors::FailedPrecondition("Can't delete a non-empty directory");
+  }
+
+  CtrlResponse del_response(false);
+  TF_RETURN_IF_ERROR(client->Delete(&del_response, path, true));
+
+  // res.exists reports whether the path existed before the delete request;
+  // without this check deleting a nonexistent directory succeeded silently.
+  if (!del_response.res.exists)
+    return errors::NotFound("Directory ", path, " not found");
+
+  LOG(INFO) << "Delete dir completed successful [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+// Returns the file length in bytes as reported by the server's Info call.
+Status IGFS::GetFileSize(const std::string &file_name, uint64 *size) {
+  std::shared_ptr client = CreateClient();
+  std::string path = TranslateName(file_name);
+
+  CtrlResponse handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse info_response(false);
+  TF_RETURN_IF_ERROR(client->Info(&info_response, path));
+
+  // file_info.length is the size in bytes; no existence check is done here —
+  // a missing path is expected to surface as an error from Info itself.
+  *size = info_response.res.file_info.length;
+
+  LOG(INFO) << "Get file size completed successful [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+// Renames src to dst. TensorFlow's FileSystem contract allows overwriting
+// an existing destination, but IGFS rename does not — so an existing dst is
+// deleted first, and any failure of that delete is now propagated instead
+// of being silently ignored (which previously let the rename proceed and
+// fail with a confusing error).
+Status IGFS::RenameFile(const std::string &src, const std::string &dst) {
+  std::shared_ptr client = CreateClient();
+  std::string src_path = TranslateName(src);
+  std::string dst_path = TranslateName(dst);
+
+  if (FileExists(dst).ok()) TF_RETURN_IF_ERROR(DeleteFile(dst));
+
+  CtrlResponse handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse rename_response(false);
+  TF_RETURN_IF_ERROR(client->Rename(&rename_response, src_path, dst_path));
+
+  // An unsuccessful rename is assumed to mean the source is missing.
+  if (!rename_response.res.successful)
+    return errors::NotFound("File ", src_path, " not found");
+
+  LOG(INFO) << "Rename file completed successful [src=" << src
+            << ", dst=" << dst << "]";
+
+  return Status::OK();
+}
+
+// Fills |stats| with length, modification time, and an is-directory flag
+// derived from the server's Info response.
+Status IGFS::Stat(const std::string &file_name, FileStatistics *stats) {
+  std::shared_ptr client = CreateClient();
+  std::string path = TranslateName(file_name);
+
+  CtrlResponse handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse info_response(false);
+  TF_RETURN_IF_ERROR(client->Info(&info_response, path));
+
+  IGFSFile info = info_response.res.file_info;
+
+  // Bit 0x1 of flags is treated as "is a directory" — TODO(review): confirm
+  // against the IGFS protocol's flag definitions.
+  *stats = FileStatistics(info.length, info.modification_time,
+                          (info.flags & 0x1) != 0);
+
+  LOG(INFO) << "Stat completed successful [file_name=" << file_name << "]";
+
+  return Status::OK();
+}
+
+// Builds a fresh client for a single filesystem operation; the last
+// constructor argument is the user name (empty here).
+std::shared_ptr IGFS::CreateClient() const {
+  return std::shared_ptr(
+      new IGFSClient(host_, port_, fs_name_, ""));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/igfs/kernels/igfs.h b/tensorflow/contrib/igfs/kernels/igfs.h
new file mode 100644
index 00000000000..26c74640446
--- /dev/null
+++ b/tensorflow/contrib/igfs/kernels/igfs.h
@@ -0,0 +1,62 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_H_
+#define TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_H_
+
+#include "igfs_client.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system.h"
+
+namespace tensorflow {
+
+// TensorFlow FileSystem backed by the Apache Ignite File System (IGFS).
+// Registered under the "igfs://" scheme (see igfs_ops.cc); each operation
+// opens its own client connection via CreateClient().
+class IGFS : public FileSystem {
+ public:
+  IGFS();
+  ~IGFS();
+  Status NewRandomAccessFile(
+      const std::string& file_name,
+      std::unique_ptr* result) override;
+  Status NewWritableFile(const std::string& fname,
+                         std::unique_ptr* result) override;
+  Status NewAppendableFile(const std::string& fname,
+                           std::unique_ptr* result) override;
+  Status NewReadOnlyMemoryRegionFromFile(
+      const std::string& fname,
+      std::unique_ptr* result) override;
+  Status FileExists(const std::string& fname) override;
+  Status GetChildren(const std::string& dir,
+                     std::vector* result) override;
+  Status GetMatchingPaths(const std::string& pattern,
+                          std::vector* results) override;
+  Status DeleteFile(const std::string& fname) override;
+  Status CreateDir(const std::string& name) override;
+  Status DeleteDir(const std::string& name) override;
+  Status GetFileSize(const std::string& fname, uint64* size) override;
+  Status RenameFile(const std::string& src, const std::string& target) override;
+  Status Stat(const std::string& fname, FileStatistics* stat) override;
+  string TranslateName(const std::string& name) const override;
+
+ private:
+  const std::string host_;     // IGFS server host.
+  const int port_;             // IGFS server port.
+  const std::string fs_name_;  // Name of the target IGFS file system.
+
+  // Creates a fresh per-operation client connection.
+  std::shared_ptr CreateClient() const;
+};
+
+}  // namespace tensorflow
+
+#endif
diff --git a/tensorflow/contrib/igfs/kernels/igfs_random_access_file.h b/tensorflow/contrib/igfs/kernels/igfs_random_access_file.h
new file mode 100644
index 00000000000..b1f6986abd5
--- /dev/null
+++ b/tensorflow/contrib/igfs/kernels/igfs_random_access_file.h
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_RANDOM_ACCESS_FILE_H_
+#define TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_RANDOM_ACCESS_FILE_H_
+
+#include "igfs_client.h"
+#include "tensorflow/core/platform/file_system.h"
+
+namespace tensorflow {
+
+// Read-only file handle over an open IGFS stream, identified by the
+// server-assigned resource (stream) id.
+class IGFSRandomAccessFile : public RandomAccessFile {
+ public:
+  IGFSRandomAccessFile(const std::string &file_name, int64_t resource_id,
+                       std::shared_ptr client);
+  ~IGFSRandomAccessFile() override;
+  Status Read(uint64 offset, size_t n, StringPiece *result,
+              char *scratch) const override;
+
+ private:
+  const std::string file_name_;   // For diagnostics/logging.
+  const int64_t resource_id_;     // Server-side stream id.
+  std::shared_ptr client_;        // Connection used for block reads.
+};
+
+}  // namespace tensorflow
+
+#endif
\ No newline at end of file
diff --git a/tensorflow/contrib/igfs/kernels/igfs_writable_file.h b/tensorflow/contrib/igfs/kernels/igfs_writable_file.h
new file mode 100644
index 00000000000..352354a630f
--- /dev/null
+++ b/tensorflow/contrib/igfs/kernels/igfs_writable_file.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_WRITABLE_FILE_H_
+#define TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_WRITABLE_FILE_H_
+
+#include "igfs_client.h"
+#include "tensorflow/core/platform/file_system.h"
+
+namespace tensorflow {
+
+// Append-only file handle over an open IGFS write stream, identified by the
+// server-assigned resource (stream) id.
+class IGFSWritableFile : public WritableFile {
+ public:
+  IGFSWritableFile(const string &file_name, int64_t resource_id,
+                   std::shared_ptr client);
+  ~IGFSWritableFile() override;
+  Status Append(const StringPiece &data) override;
+  Status Close() override;
+  Status Flush() override;
+  Status Sync() override;
+
+ private:
+  const string file_name_;  // For diagnostics/logging.
+  int64_t resource_id_;     // Server-side stream id; mutable so Close can
+                            // invalidate it.
+  std::shared_ptr client_;  // Connection used for block writes.
+};
+
+}  // namespace tensorflow
+
+#endif
\ No newline at end of file
diff --git a/tensorflow/contrib/igfs/ops/igfs_ops.cc b/tensorflow/contrib/igfs/ops/igfs_ops.cc
new file mode 100644
index 00000000000..99e41a5dbb1
--- /dev/null
+++ b/tensorflow/contrib/igfs/ops/igfs_ops.cc
@@ -0,0 +1,26 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+#include "../kernels/igfs.h"
+
+namespace tensorflow {
+
+REGISTER_FILE_SYSTEM("igfs", IGFS);
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/igfs/python/ops/igfs_op_loader.py b/tensorflow/contrib/igfs/python/ops/igfs_op_loader.py
new file mode 100644
index 00000000000..c9ab2831058
--- /dev/null
+++ b/tensorflow/contrib/igfs/python/ops/igfs_op_loader.py
@@ -0,0 +1,25 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Python helper for loading IGFS ops and kernels."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.util import loader
+from tensorflow.python.platform import resource_loader
+
+_dataset_ops = loader.load_op_library(
+    resource_loader.get_path_to_datafile("../../_igfs_ops.so"))
diff --git a/tensorflow/contrib/igfs/python/ops/igfs_ops.py b/tensorflow/contrib/igfs/python/ops/igfs_ops.py
new file mode 100644
index 00000000000..5c02ddcd9ad
--- /dev/null
+++ b/tensorflow/contrib/igfs/python/ops/igfs_ops.py
@@ -0,0 +1,29 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""IGFS."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+# NOTE(review): this imports the Ignite *dataset* op loader; presumably the
+# IGFS loader (igfs_op_loader) was intended — confirm before release.
+from tensorflow.contrib.ignite.python.ops import ignite_op_loader  # pylint: disable=unused-import
+from tensorflow.python.platform import resource_loader
+from tensorflow.python.framework import load_library
+
+# Register the igfs:// filesystem scheme by loading the shared library.
+file_system_library = os.path.join(resource_loader.get_data_files_path(),
+                                   "../../_igfs_ops.so")
+load_library.load_file_system_library(file_system_library)
diff --git a/tensorflow/contrib/igfs/python/tests/config/ignite-config-igfs.xml b/tensorflow/contrib/igfs/python/tests/config/ignite-config-igfs.xml
new file mode 100644
index 00000000000..5d81bf33226
--- /dev/null
+++ b/tensorflow/contrib/igfs/python/tests/config/ignite-config-igfs.xml
@@ -0,0 +1,55 @@
+
+
+
+
+
+  
+    
+      
+        
+        
+        
+        
+        
+        
+          
+            
+            
+            
+          
+        
+      
+    
+    
+      
+        
+          
+            
+              
+                127.0.0.1
+              
+            
+          
+        
+      
+    
+  
+
+
diff --git a/tensorflow/contrib/igfs/python/tests/igfs_test.py b/tensorflow/contrib/igfs/python/tests/igfs_test.py
new file mode 100644
index 00000000000..120ab666a89
--- /dev/null
+++ b/tensorflow/contrib/igfs/python/tests/igfs_test.py
@@ -0,0 +1,203 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License.  You may obtain a copy of
+# the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations under
+# the License.
+# ==============================================================================
+"""Tests for IGFS."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+import tensorflow.contrib.igfs.python.ops.igfs_ops  # pylint: disable=unused-import
+from tensorflow.python.platform import test
+
+class IGFSTest(test.TestCase):
+  """Integration tests for the igfs:// filesystem scheme.
+
+     The Apache Ignite servers have to setup before the test and tear down
+     after the test manually. The docker engine has to be installed.
+
+     To setup Apache Ignite servers:
+     $ bash start_ignite.sh
+
+     To tear down Apache Ignite servers:
+     $ bash stop_ignite.sh
+
+     NOTE(review): tests do not clean up after themselves, so each run
+     assumes a fresh IGFS instance.
+  """
+
+  def test_create_file(self):
+    """Test create file.
+    """
+    # Setup and check preconditions.
+    file_name = "igfs:///test_create_file/1"
+    self.assertFalse(tf.gfile.Exists(file_name))
+    # Create file.
+    with tf.gfile.Open(file_name, mode='w') as w:
+      w.write("")
+    # Check that file was created.
+    self.assertTrue(tf.gfile.Exists(file_name))
+
+  def test_write_read_file(self):
+    """Test write/read file.
+    """
+    # Setup and check preconditions.
+    file_name = "igfs:///test_write_read_file/1"
+    rows = 10000
+    self.assertFalse(tf.gfile.Exists(file_name))
+    # Write data.
+    with tf.gfile.Open(file_name, mode='w') as w:
+      for i in range(rows):
+        w.write("This is row\n")
+    # Read data.
+    with tf.gfile.Open(file_name, mode='r') as r:
+      lines = r.readlines()
+    # Check that data is equal.
+    self.assertEqual(rows, len(lines))
+    for i in range(rows):
+      self.assertEqual("This is row\n", lines[i])
+
+  def test_delete_recursively(self):
+    """Test delete recursively.
+    """
+    # Setup and check preconditions.
+    dir_name = "igfs:///test_delete_recursively/"
+    file_name = "igfs:///test_delete_recursively/1"
+    self.assertFalse(tf.gfile.Exists(dir_name))
+    self.assertFalse(tf.gfile.Exists(file_name))
+    tf.gfile.MkDir(dir_name)
+    with tf.gfile.Open(file_name, mode='w') as w:
+      w.write("")
+    self.assertTrue(tf.gfile.Exists(dir_name))
+    self.assertTrue(tf.gfile.Exists(file_name))
+    # Delete directory recursively.
+    tf.gfile.DeleteRecursively(dir_name)
+    # Check that directory was deleted.
+    self.assertFalse(tf.gfile.Exists(dir_name))
+    self.assertFalse(tf.gfile.Exists(file_name))
+
+  def test_copy(self):
+    """Test copy.
+    """
+    # Setup and check preconditions.
+    src_file_name = "igfs:///test_copy/1"
+    dst_file_name = "igfs:///test_copy/2"
+    self.assertFalse(tf.gfile.Exists(src_file_name))
+    self.assertFalse(tf.gfile.Exists(dst_file_name))
+    with tf.gfile.Open(src_file_name, mode='w') as w:
+      w.write("42")
+    self.assertTrue(tf.gfile.Exists(src_file_name))
+    self.assertFalse(tf.gfile.Exists(dst_file_name))
+    # Copy file.
+    tf.gfile.Copy(src_file_name, dst_file_name)
+    # Check that files are identical.
+    self.assertTrue(tf.gfile.Exists(src_file_name))
+    self.assertTrue(tf.gfile.Exists(dst_file_name))
+    with tf.gfile.Open(dst_file_name, mode='r') as r:
+      data = r.read()
+    self.assertEqual("42", data)
+
+  def test_is_directory(self):
+    """Test is directory.
+    """
+    # Setup and check preconditions.
+    dir_name = "igfs:///test_is_directory/1"
+    file_name = "igfs:///test_is_directory/2"
+    with tf.gfile.Open(file_name, mode='w') as w:
+      w.write("")
+    tf.gfile.MkDir(dir_name)
+    # Check that directory is a directory.
+    self.assertTrue(tf.gfile.IsDirectory(dir_name))
+    # Check that file is not a directory.
+    self.assertFalse(tf.gfile.IsDirectory(file_name))
+
+  def test_list_directory(self):
+    """Test list directory.
+    """
+    # Setup and check preconditions.
+    dir_name = "igfs:///test_list_directory/"
+    file_names = [
+        "igfs:///test_list_directory/1",
+        "igfs:///test_list_directory/2/3"
+    ]
+    ch_dir_names = [
+        "igfs:///test_list_directory/4",
+    ]
+    for file_name in file_names:
+      with tf.gfile.Open(file_name, mode='w') as w:
+        w.write("")
+    for ch_dir_name in ch_dir_names:
+      tf.gfile.MkDir(ch_dir_name)
+    ls_expected_result = file_names + ch_dir_names
+    # Get list of files in directory.
+    ls_result = tf.gfile.ListDirectory(dir_name)
+    # Check that list of files is correct. ListDirectory returns names
+    # relative to dir_name, so the entry for ".../2/3" appears as "2".
+    self.assertEqual(len(ls_expected_result), len(ls_result))
+    for e in ['1', '2', '4']:
+      self.assertTrue(e in ls_result)
+
+  def test_make_dirs(self):
+    """Test make dirs.
+    """
+    # Setup and check preconditions.
+    dir_name = "igfs:///test_make_dirs/"
+    self.assertFalse(tf.gfile.Exists(dir_name))
+    # Make directory.
+    tf.gfile.MkDir(dir_name)
+    # Check that directory was created.
+    self.assertTrue(tf.gfile.Exists(dir_name))
+
+  def test_remove(self):
+    """Test remove.
+    """
+    # Setup and check preconditions.
+    file_name = "igfs:///test_remove/1"
+    self.assertFalse(tf.gfile.Exists(file_name))
+    with tf.gfile.Open(file_name, mode='w') as w:
+      w.write("")
+    self.assertTrue(tf.gfile.Exists(file_name))
+    # Remove file.
+    tf.gfile.Remove(file_name)
+    # Check that file was removed.
+    self.assertFalse(tf.gfile.Exists(file_name))
+
+  def test_rename_file(self):
+    """Test rename file.
+    """
+    # Setup and check preconditions.
+    src_file_name = "igfs:///test_rename_file/1"
+    dst_file_name = "igfs:///test_rename_file/2"
+    with tf.gfile.Open(src_file_name, mode='w') as w:
+      w.write("42")
+    self.assertTrue(tf.gfile.Exists(src_file_name))
+    # Rename file.
+    tf.gfile.Rename(src_file_name, dst_file_name)
+    # Check that only new name of file is available.
+    self.assertFalse(tf.gfile.Exists(src_file_name))
+    self.assertTrue(tf.gfile.Exists(dst_file_name))
+    with tf.gfile.Open(dst_file_name, mode='r') as r:
+      data = r.read()
+    self.assertEqual("42", data)
+
+  def test_rename_dir(self):
+    """Test rename dir.
+    """
+    # Setup and check preconditions.
+    src_dir_name = "igfs:///test_rename_dir/1"
+    dst_dir_name = "igfs:///test_rename_dir/2"
+    tf.gfile.MkDir(src_dir_name)
+    # Rename directory.
+    tf.gfile.Rename(src_dir_name, dst_dir_name)
+    # Check that only new name of directory is available.
+    self.assertFalse(tf.gfile.Exists(src_dir_name))
+    self.assertTrue(tf.gfile.Exists(dst_dir_name))
+    self.assertTrue(tf.gfile.IsDirectory(dst_dir_name))
diff --git a/tensorflow/contrib/ignite/BUILD b/tensorflow/contrib/ignite/BUILD
index 9393b702d11..c79d35e6018 100644
--- a/tensorflow/contrib/ignite/BUILD
+++ b/tensorflow/contrib/ignite/BUILD
@@ -36,20 +36,13 @@ tf_gen_op_libs(
 )
 
 cc_library(
-    name = "dataset_kernels",
+    name = "ignite_client",
     srcs = [
-        "kernels/ignite_dataset_ops.cc",
         "kernels/ignite_client.h",
         "kernels/ignite_byte_swapper.h",
         "kernels/ignite_plain_client.h",
         "kernels/ignite_ssl_wrapper.h",
-        "kernels/ignite_ssl_wrapper.cc",
-        "kernels/ignite_binary_object_parser.h",
-        "kernels/ignite_binary_object_parser.cc",
-        "kernels/ignite_dataset.h",
-        "kernels/ignite_dataset.cc",
-        "kernels/ignite_dataset_iterator.h",
-        "kernels/ignite_dataset_iterator.cc",
+        "kernels/ignite_ssl_wrapper.cc"
     ] + if_not_windows([
         "kernels/ignite_plain_client_unix.cc",
     ]) + if_windows([
@@ -59,11 +52,29 @@ cc_library(
         "-DWIN32_LEAN_AND_MEAN",
     ]),
     deps = [
-        "//tensorflow/core:framework_headers_lib",
-        "//third_party/eigen3",
         "@boringssl//:ssl",
+        "//tensorflow/core:framework_headers_lib",
         "@protobuf_archive//:protobuf_headers",
     ],
+)
+
+cc_library(
+    name = "dataset_kernels",
+    srcs = [
+        "kernels/ignite_dataset_ops.cc",
+        "kernels/ignite_binary_object_parser.h",
+        "kernels/ignite_binary_object_parser.cc",
+        "kernels/ignite_dataset.h",
+        "kernels/ignite_dataset.cc",
+        "kernels/ignite_dataset_iterator.h",
+        "kernels/ignite_dataset_iterator.cc",
+    ],
+    deps = [
+        ":ignite_client",
+        "//tensorflow/core:framework_headers_lib",
+        "@protobuf_archive//:protobuf_headers",
+        "//third_party/eigen3",
+    ],
     alwayslink = 1,
 )
 
diff --git a/tensorflow/contrib/ignite/kernels/ignite_client.cc b/tensorflow/contrib/ignite/kernels/ignite_client.cc
new file mode 100644
index 00000000000..dea6484d594
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/ignite_client.cc
@@ -0,0 +1,26 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "ignite_client.h"
+
+namespace tensorflow {
+
+// |big_endian| declares the wire byte order. Byte swapping is required
+// exactly when the host's endianness differs from the wire's, which the
+// runtime check below detects by inspecting the low byte of an int.
+Client::Client(bool big_endian) {
+  int x = 1;
+  bool is_little_endian = (*(char *)&x == 1);
+  swap_ = big_endian == is_little_endian;
+}
+
+}  // namespace tensorflow
\ No newline at end of file

From a83408649512759e498ec5968532ed948287e13b Mon Sep 17 00:00:00 2001
From: Artem Malykh 
Date: Mon, 10 Sep 2018 18:41:43 +0300
Subject: [PATCH 057/540] Add IGFS (Apache Ignite File System) support.

---
 .../contrib/igfs/kernels/igfs_client.cc       |  43 ++
 tensorflow/contrib/igfs/kernels/igfs_client.h | 110 ++++++
 .../igfs/kernels/igfs_extended_tcp_client.cc  | 143 +++++++
 .../igfs/kernels/igfs_extended_tcp_client.h   |  47 +++
 .../contrib/igfs/kernels/igfs_messages.cc     | 341 ++++++++++++++++
 .../contrib/igfs/kernels/igfs_messages.h      | 371 ++++++++++++++++++
 .../igfs/kernels/igfs_random_access_file.cc   |  46 +++
 .../igfs/kernels/igfs_writable_file.cc        |  51 +++
 .../igfs/python/tests/bin/start-igfs.sh       |  20 +
 .../contrib/igfs/python/tests/start_ignite.sh |  22 ++
 .../contrib/igfs/python/tests/stop_ignite.sh  |  17 +
 11 files changed, 1211 insertions(+)
 create mode 100644 tensorflow/contrib/igfs/kernels/igfs_client.cc
 create mode 100644 tensorflow/contrib/igfs/kernels/igfs_client.h
 create mode 100644 tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.cc
 create mode 100644 tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.h
 create mode 100644 tensorflow/contrib/igfs/kernels/igfs_messages.cc
 create mode 100644 tensorflow/contrib/igfs/kernels/igfs_messages.h
 create mode 100644 tensorflow/contrib/igfs/kernels/igfs_random_access_file.cc
 create mode 100644 tensorflow/contrib/igfs/kernels/igfs_writable_file.cc
 create mode 100755 tensorflow/contrib/igfs/python/tests/bin/start-igfs.sh
 create mode 100755 tensorflow/contrib/igfs/python/tests/start_ignite.sh
 create mode 100755 tensorflow/contrib/igfs/python/tests/stop_ignite.sh

diff --git a/tensorflow/contrib/igfs/kernels/igfs_client.cc b/tensorflow/contrib/igfs/kernels/igfs_client.cc
new file mode 100644
index 00000000000..e27f8374faa
--- /dev/null
+++ b/tensorflow/contrib/igfs/kernels/igfs_client.cc
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "igfs_client.h"
+
+namespace tensorflow {
+
+// Connects eagerly so every subsequent request can assume a live socket.
+// The final 'true' selects big-endian wire order for the IGFS protocol.
+IGFSClient::IGFSClient(std::string host, int port, std::string fs_name,
+                       std::string user_name)
+    : fs_name_(fs_name),
+      user_name_(user_name),
+      client_(ExtendedTCPClient(host, port, true)) {
+  client_.Connect();
+}
+
+IGFSClient::~IGFSClient() { client_.Disconnect(); }
+
+// Writes |request| to the wire and, when |response| is non-null, reads the
+// server's reply into it. reset() is called after each phase — presumably
+// to rewind the client's byte-position counter so per-message offsets
+// (SkipToPos) start from zero; confirm against ExtendedTCPClient.
+Status IGFSClient::SendRequestGetResponse(const Request &request,
+                                          Response *response) {
+  TF_RETURN_IF_ERROR(request.Write(&client_));
+  client_.reset();
+
+  if (response != nullptr) {
+    TF_RETURN_IF_ERROR(response->Read(&client_));
+    client_.reset();
+  }
+
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/igfs/kernels/igfs_client.h b/tensorflow/contrib/igfs/kernels/igfs_client.h
new file mode 100644
index 00000000000..82653af03a2
--- /dev/null
+++ b/tensorflow/contrib/igfs/kernels/igfs_client.h
@@ -0,0 +1,110 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_CLIENT_H_
+#define TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_CLIENT_H_
+
+#include "igfs_messages.h"
+
+namespace tensorflow {
+
+// Thin RPC wrapper over the IGFS wire protocol: each method builds the
+// corresponding request, sends it through SendRequestGetResponse, and
+// decodes the typed response. Not thread-safe: all calls share one
+// ExtendedTCPClient connection.
+class IGFSClient {
+ public:
+  IGFSClient(std::string host, int port, std::string fs_name,
+             std::string user_name);
+  ~IGFSClient();
+
+  inline Status Handshake(CtrlResponse *res) {
+    return SendRequestGetResponse(HandshakeRequest(fs_name_, {}), res);
+  }
+
+  inline Status ListFiles(CtrlResponse *res,
+                          const std::string &path) {
+    return SendRequestGetResponse(ListFilesRequest(user_name_, path), res);
+  }
+
+  inline Status ListPaths(CtrlResponse *res,
+                          const std::string &path) {
+    return SendRequestGetResponse(ListPathsRequest(user_name_, path), res);
+  }
+
+  inline Status Info(CtrlResponse *res, const std::string &path) {
+    return SendRequestGetResponse(InfoRequest(user_name_, path), res);
+  }
+
+  inline Status OpenCreate(CtrlResponse *res,
+                           const std::string &path) {
+    return SendRequestGetResponse(OpenCreateRequest(user_name_, path), res);
+  }
+
+  inline Status OpenAppend(CtrlResponse *res,
+                           const std::string &path) {
+    return SendRequestGetResponse(OpenAppendRequest(user_name_, path), res);
+  }
+
+  inline Status OpenRead(CtrlResponse *res,
+                         const std::string &path) {
+    return SendRequestGetResponse(OpenReadRequest(user_name_, path), res);
+  }
+
+  inline Status Exists(CtrlResponse *res,
+                       const std::string &path) {
+    return SendRequestGetResponse(ExistsRequest(user_name_, path), res);
+  }
+
+  inline Status MkDir(CtrlResponse *res,
+                      const std::string &path) {
+    return SendRequestGetResponse(MakeDirectoriesRequest(user_name_, path),
+                                  res);
+  }
+
+  inline Status Delete(CtrlResponse *res,
+                       const std::string &path, bool recursive) {
+    return SendRequestGetResponse(DeleteRequest(user_name_, path, recursive),
+                                  res);
+  }
+
+  inline Status WriteBlock(int64_t stream_id, const uint8_t *data,
+                           int32_t len) {
+    // Write has no control response; pass nullptr to skip the read phase.
+    return SendRequestGetResponse(WriteBlockRequest(stream_id, data, len),
+                                  nullptr);
+  }
+
+  inline Status ReadBlock(ReadBlockCtrlResponse *res, int64_t stream_id,
+                          int64_t pos, int32_t length) {
+    return SendRequestGetResponse(ReadBlockRequest(stream_id, pos, length),
+                                  res);
+  }
+
+  inline Status Close(CtrlResponse *res, int64_t stream_id) {
+    return SendRequestGetResponse(CloseRequest(stream_id), res);
+  }
+
+  inline Status Rename(CtrlResponse *res,
+                       const std::string &source, const std::string &dest) {
+    return SendRequestGetResponse(RenameRequest(user_name_, source, dest), res);
+  }
+
+ private:
+  const std::string fs_name_;    // Target IGFS file system name.
+  const std::string user_name_;  // User name sent with each request.
+  ExtendedTCPClient client_;     // Shared connection for all requests.
+
+  // Sends |request|; reads into |response| unless it is nullptr.
+  Status SendRequestGetResponse(const Request &request, Response *response);
+};
+
+}  // namespace tensorflow
+
+#endif
diff --git a/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.cc b/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.cc
new file mode 100644
index 00000000000..b21360ec831
--- /dev/null
+++ b/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.cc
@@ -0,0 +1,143 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "igfs_extended_tcp_client.h"
+
+namespace tensorflow {
+
+ExtendedTCPClient::ExtendedTCPClient(std::string host, int port,
+                                     bool big_endian)
+    : PlainClient(host, port, big_endian), pos_(0) {}
+
+Status ExtendedTCPClient::ReadData(uint8_t *buf, int32_t length) {
+  TF_RETURN_IF_ERROR(PlainClient::ReadData(buf, length));
+  pos_ += length;
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::WriteData(uint8_t *buf, int32_t length) {
+  TF_RETURN_IF_ERROR(PlainClient::WriteData(buf, length));
+  pos_ += length;
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::Ignore(int n) {
+  uint8_t buf[n];
+  return ReadData(buf, n);
+}
+
+Status ExtendedTCPClient::SkipToPos(int target_pos) {
+  return Ignore(std::max(0, target_pos - pos_));
+};
+
+Status ExtendedTCPClient::ReadBool(bool *res) {
+  uint8_t buf = 0;
+  TF_RETURN_IF_ERROR(ReadData(&buf, 1));
+  *res = buf != 0;
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::ReadNullableString(std::string *res) {
+  bool is_empty = false;
+  TF_RETURN_IF_ERROR(ReadBool(&is_empty));
+
+  if (!is_empty) {
+    TF_RETURN_IF_ERROR(ReadString(res));
+  }
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::ReadString(std::string *res) {
+  int16_t length;
+  TF_RETURN_IF_ERROR(ReadShort(&length));
+
+  uint8_t *buf = new uint8_t[length];
+  Status status = ReadData(buf, length);
+
+  if (status.ok()) res->assign((char *)buf, length);
+
+  delete[] buf;
+  return status;
+}
+
+Status ExtendedTCPClient::ReadStringMap(
+    std::map<std::string, std::string> *res) {
+  int size;
+  TF_RETURN_IF_ERROR(ReadInt(&size));
+
+  for (int i = 0; i < size; i++) {
+    std::string key;
+    std::string val;
+    TF_RETURN_IF_ERROR(ReadString(&key));
+    TF_RETURN_IF_ERROR(ReadString(&val));
+
+    res->insert(std::pair<std::string, std::string>(key, val));
+  }
+
+  return Status::OK();
+};
+
+Status ExtendedTCPClient::WriteSize(
+    std::map<std::string, std::string>::size_type s) {
+  return WriteInt(s);
+}
+
+Status ExtendedTCPClient::FillWithZerosUntil(int n) {
+  int toSkip = std::max(0, n - pos_);
+
+  for (int i = 0; i < toSkip; i++) {
+    TF_RETURN_IF_ERROR(WriteByte(0));
+  }
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::WriteBool(bool val) {
+  return WriteByte((char)(val ? 1 : 0));
+}
+
+Status ExtendedTCPClient::WriteString(std::string str) {
+  if (!str.empty()) {
+    TF_RETURN_IF_ERROR(WriteBool(false));
+    unsigned short l = str.length();
+    TF_RETURN_IF_ERROR(WriteShort(l));
+    TF_RETURN_IF_ERROR(WriteData((uint8_t *)str.c_str(), str.length()));
+  } else {
+    TF_RETURN_IF_ERROR(WriteBool(true));
+  }
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::WriteStringMap(
+    std::map<std::string, std::string> map) {
+  std::map<std::string, std::string>::size_type size = map.size();
+  TF_RETURN_IF_ERROR(WriteSize(size));
+
+  for (auto const &x : map) {
+    TF_RETURN_IF_ERROR(WriteString(x.first));
+    TF_RETURN_IF_ERROR(WriteString(x.second));
+  }
+
+  return Status::OK();
+}
+
+void ExtendedTCPClient::reset() { pos_ = 0; }
+
+}  // namespace tensorflow
\ No newline at end of file
diff --git a/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.h b/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.h
new file mode 100644
index 00000000000..07d8a336059
--- /dev/null
+++ b/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.h
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_EXTENDED_TCP_CLIENT_H_
+#define TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_EXTENDED_TCP_CLIENT_H_
+
+#include "tensorflow/contrib/ignite/kernels/ignite_plain_client.h"
+
+namespace tensorflow {
+
+class ExtendedTCPClient : public PlainClient {
+ public:
+  ExtendedTCPClient(std::string host, int port, bool big_endian);
+  Status ReadData(uint8_t *buf, int32_t length) override;
+  Status WriteData(uint8_t *buf, int32_t length) override;
+  Status Ignore(int n);
+  Status SkipToPos(int target_pos);
+  Status ReadBool(bool *res);
+  Status ReadNullableString(std::string *res);
+  Status ReadString(std::string *res);
+  Status ReadStringMap(std::map<std::string, std::string> *res);
+  Status WriteSize(std::map<std::string, std::string>::size_type s);
+  Status FillWithZerosUntil(int n);
+  Status WriteBool(bool val);
+  Status WriteString(std::string str);
+  Status WriteStringMap(std::map<std::string, std::string> map);
+  void reset();
+
+ private:
+  int pos_;
+};
+
+}  // namespace tensorflow
+
+#endif
\ No newline at end of file
diff --git a/tensorflow/contrib/igfs/kernels/igfs_messages.cc b/tensorflow/contrib/igfs/kernels/igfs_messages.cc
new file mode 100644
index 00000000000..2d892c7cc29
--- /dev/null
+++ b/tensorflow/contrib/igfs/kernels/igfs_messages.cc
@@ -0,0 +1,341 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "igfs_messages.h"
+
+namespace tensorflow {
+
+Status IGFSPath::Read(ExtendedTCPClient *client) {
+  return client->ReadNullableString(&path);
+}
+
+Status IGFSFile::Read(ExtendedTCPClient *client) {
+  int32_t block_size;
+  int64_t group_block_size;
+  map<string, string> properties = {};
+  int64_t access_time;
+
+  bool has_path;
+  TF_RETURN_IF_ERROR(client->ReadBool(&has_path));
+  if (has_path) {
+    IGFSPath path = {};
+    TF_RETURN_IF_ERROR(path.Read(client));
+  }
+
+  TF_RETURN_IF_ERROR(client->ReadInt(&block_size));
+  TF_RETURN_IF_ERROR(client->ReadLong(&group_block_size));
+  TF_RETURN_IF_ERROR(client->ReadLong(&length));
+  TF_RETURN_IF_ERROR(client->ReadStringMap(&properties));
+  TF_RETURN_IF_ERROR(client->ReadLong(&access_time));
+  TF_RETURN_IF_ERROR(client->ReadLong(&modification_time));
+  TF_RETURN_IF_ERROR(client->ReadByte(&flags));
+
+  return Status::OK();
+}
+
+Request::Request(int32_t command_id) : command_id_(command_id) {}
+
+Status Request::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(client->WriteByte(0));
+  TF_RETURN_IF_ERROR(client->FillWithZerosUntil(8));
+  TF_RETURN_IF_ERROR(client->WriteInt(command_id_));
+  TF_RETURN_IF_ERROR(client->FillWithZerosUntil(24));
+
+  return Status::OK();
+}
+
+Status Response::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->Ignore(1));
+  TF_RETURN_IF_ERROR(client->SkipToPos(8));
+  TF_RETURN_IF_ERROR(client->ReadInt(&req_id));
+  TF_RETURN_IF_ERROR(client->SkipToPos(24));
+  TF_RETURN_IF_ERROR(client->ReadInt(&res_type));
+
+  bool has_error;
+  TF_RETURN_IF_ERROR(client->ReadBool(&has_error));
+
+  if (has_error) {
+    int32_t error_code;
+    std::string error_msg;
+    TF_RETURN_IF_ERROR(client->ReadString(&error_msg));
+    TF_RETURN_IF_ERROR(client->ReadInt(&error_code));
+
+    return errors::Internal("Error [code=", error_code, ", message=\"",
+                            error_msg, "\"]");
+  }
+
+  TF_RETURN_IF_ERROR(client->SkipToPos(HEADER_SIZE + 5));
+  TF_RETURN_IF_ERROR(client->ReadInt(&length));
+  TF_RETURN_IF_ERROR(client->SkipToPos(HEADER_SIZE + RESPONSE_HEADER_SIZE));
+
+  return Status::OK();
+}
+
+PathCtrlRequest::PathCtrlRequest(int32_t command_id_, string user_name,
+                                 string path, string destination_path,
+                                 bool flag, bool collocate,
+                                 map<string, string> properties)
+    : Request(command_id_),
+      user_name_(std::move(user_name)),
+      path_(std::move(path)),
+      destination_path_(std::move(destination_path)),
+      flag_(flag),
+      collocate_(collocate),
+      props_(std::move(properties)) {}
+
+Status PathCtrlRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(Request::Write(client));
+
+  TF_RETURN_IF_ERROR(client->WriteString(user_name_));
+  TF_RETURN_IF_ERROR(WritePath(client, path_));
+  TF_RETURN_IF_ERROR(WritePath(client, destination_path_));
+  TF_RETURN_IF_ERROR(client->WriteBool(flag_));
+  TF_RETURN_IF_ERROR(client->WriteBool(collocate_));
+  TF_RETURN_IF_ERROR(client->WriteStringMap(props_));
+
+  return Status::OK();
+}
+
+Status PathCtrlRequest::WritePath(ExtendedTCPClient *client,
+                                  const string &path) const {
+  TF_RETURN_IF_ERROR(client->WriteBool(!path.empty()));
+  if (!path.empty()) TF_RETURN_IF_ERROR(client->WriteString(path));
+
+  return Status::OK();
+}
+
+Status StreamCtrlRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(client->WriteByte(0));
+  TF_RETURN_IF_ERROR(client->FillWithZerosUntil(8));
+  TF_RETURN_IF_ERROR(client->WriteInt(command_id_));
+  TF_RETURN_IF_ERROR(client->WriteLong(stream_id_));
+  TF_RETURN_IF_ERROR(client->WriteInt(length_));
+
+  return Status::OK();
+}
+
+StreamCtrlRequest::StreamCtrlRequest(int32_t command_id_, int64_t stream_id,
+                                     int32_t length)
+    : Request(command_id_), stream_id_(stream_id), length_(length) {}
+
+DeleteRequest::DeleteRequest(const string &user_name, const string &path,
+                             bool flag)
+    : PathCtrlRequest(DELETE_ID, user_name, path, {}, flag, true, {}) {}
+
+Status DeleteResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadBool(&exists));
+
+  return Status::OK();
+}
+
+ExistsRequest::ExistsRequest(const string &user_name, const string &path)
+    : PathCtrlRequest(EXISTS_ID, user_name, path, {}, false, true, {}) {}
+
+Status ExistsResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadBool(&exists));
+
+  return Status::OK();
+}
+
+HandshakeRequest::HandshakeRequest(const string &fs_name, const string &log_dir)
+    : Request(HANDSHAKE_ID), fs_name_(fs_name), log_dir_(log_dir){};
+
+Status HandshakeRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(Request::Write(client));
+
+  TF_RETURN_IF_ERROR(client->WriteString(fs_name_));
+  TF_RETURN_IF_ERROR(client->WriteString(log_dir_));
+
+  return Status::OK();
+}
+
+Status HandshakeResponse::Read(ExtendedTCPClient *client) {
+  int64_t block_size;
+  bool sampling;
+
+  TF_RETURN_IF_ERROR(client->ReadNullableString(&fs_name));
+  TF_RETURN_IF_ERROR(client->ReadLong(&block_size));
+
+  bool has_sampling_;
+  TF_RETURN_IF_ERROR(client->ReadBool(&has_sampling_));
+
+  if (has_sampling_) {
+    TF_RETURN_IF_ERROR(client->ReadBool(&sampling));
+  }
+
+  return Status::OK();
+}
+
+ListRequest::ListRequest(int32_t command_id_, const string &user_name,
+                         const string &path)
+    : PathCtrlRequest(command_id_, user_name, path, {}, false, true, {}){};
+
+ListFilesRequest::ListFilesRequest(const string &user_name, const string &path)
+    : ListRequest(LIST_FILES_ID, user_name, path) {}
+
+ListPathsRequest::ListPathsRequest(const string &user_name, const string &path)
+    : ListRequest(LIST_PATHS_ID, user_name, path) {}
+
+OpenCreateRequest::OpenCreateRequest(const string &user_name,
+                                     const string &path)
+    : PathCtrlRequest(OPEN_CREATE_ID, user_name, path, {}, false, true, {}) {}
+
+Status OpenCreateRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(PathCtrlRequest::Write(client));
+
+  TF_RETURN_IF_ERROR(client->WriteInt(replication_));
+  TF_RETURN_IF_ERROR(client->WriteLong(blockSize_));
+
+  return Status::OK();
+}
+
+Status OpenCreateResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadLong(&stream_id));
+
+  return Status::OK();
+}
+
+OpenAppendRequest::OpenAppendRequest(const string &user_name,
+                                     const string &path)
+    : PathCtrlRequest(OPEN_APPEND_ID, user_name, path, {}, false, true, {}) {}
+
+Status OpenAppendRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(PathCtrlRequest::Write(client));
+
+  return Status::OK();
+}
+
+Status OpenAppendResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadLong(&stream_id));
+
+  return Status::OK();
+}
+
+OpenReadRequest::OpenReadRequest(const string &user_name, const string &path,
+                                 bool flag,
+                                 int32_t sequential_reads_before_prefetch)
+    : PathCtrlRequest(OPEN_READ_ID, user_name, path, {}, flag, true, {}),
+      sequential_reads_before_prefetch_(sequential_reads_before_prefetch) {}
+
+OpenReadRequest::OpenReadRequest(const string &user_name, const string &path)
+    : OpenReadRequest(user_name, path, false, 0) {}
+
+Status OpenReadRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(PathCtrlRequest::Write(client));
+
+  if (flag_) {
+    TF_RETURN_IF_ERROR(client->WriteInt(sequential_reads_before_prefetch_));
+  }
+
+  return Status::OK();
+}
+
+Status OpenReadResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadLong(&stream_id));
+  TF_RETURN_IF_ERROR(client->ReadLong(&length));
+
+  return Status::OK();
+}
+
+InfoRequest::InfoRequest(const string &user_name, const string &path)
+    : PathCtrlRequest(INFO_ID, user_name, path, {}, false, true, {}) {}
+
+Status InfoResponse::Read(ExtendedTCPClient *client) {
+  file_info = IGFSFile();
+  TF_RETURN_IF_ERROR(file_info.Read(client));
+
+  return Status::OK();
+}
+
+MakeDirectoriesRequest::MakeDirectoriesRequest(const string &user_name,
+                                               const string &path)
+    : PathCtrlRequest(MKDIR_ID, user_name, path, {}, false, true, {}) {}
+
+Status MakeDirectoriesResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadBool(&successful));
+
+  return Status::OK();
+}
+
+CloseRequest::CloseRequest(int64_t streamId)
+    : StreamCtrlRequest(CLOSE_ID, streamId, 0) {}
+
+Status CloseResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadBool(&successful));
+
+  return Status::OK();
+}
+
+ReadBlockRequest::ReadBlockRequest(int64_t stream_id, int64_t pos,
+                                   int32_t length)
+    : StreamCtrlRequest(READ_BLOCK_ID, stream_id, length), pos(pos) {}
+
+Status ReadBlockRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(StreamCtrlRequest::Write(client));
+
+  TF_RETURN_IF_ERROR(client->WriteLong(pos));
+
+  return Status::OK();
+}
+
+Status ReadBlockResponse::Read(ExtendedTCPClient *client, int32_t length,
+                               uint8_t *dst) {
+  TF_RETURN_IF_ERROR(client->ReadData(dst, length));
+  successfuly_read = length;
+
+  return Status::OK();
+}
+
+Status ReadBlockResponse::Read(ExtendedTCPClient *client) {
+  return Status::OK();
+}
+
+streamsize ReadBlockResponse::GetSuccessfulyRead() { return successfuly_read; }
+
+ReadBlockCtrlResponse::ReadBlockCtrlResponse(uint8_t *dst)
+    : CtrlResponse(false), dst(dst) {}
+
+Status ReadBlockCtrlResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(Response::Read(client));
+
+  res = ReadBlockResponse();
+  TF_RETURN_IF_ERROR(res.Read(client, length, dst));
+
+  return Status::OK();
+}
+
+WriteBlockRequest::WriteBlockRequest(int64_t stream_id, const uint8_t *data,
+                                     int32_t length)
+    : StreamCtrlRequest(WRITE_BLOCK_ID, stream_id, length), data(data) {}
+
+Status WriteBlockRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(StreamCtrlRequest::Write(client));
+  TF_RETURN_IF_ERROR(client->WriteData((uint8_t *)data, length_));
+
+  return Status::OK();
+}
+
+RenameRequest::RenameRequest(const string &user_name, const string &path,
+                             const string &destination_path)
+    : PathCtrlRequest(RENAME_ID, user_name, path, destination_path, false, true,
+                      {}) {}
+
+Status RenameResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadBool(&successful));
+
+  return Status::OK();
+}
+
+}  // namespace tensorflow
\ No newline at end of file
diff --git a/tensorflow/contrib/igfs/kernels/igfs_messages.h b/tensorflow/contrib/igfs/kernels/igfs_messages.h
new file mode 100644
index 00000000000..076e6f55ef8
--- /dev/null
+++ b/tensorflow/contrib/igfs/kernels/igfs_messages.h
@@ -0,0 +1,371 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_MESSAGES_H_
+#define TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_MESSAGES_H_
+
+#include "igfs_extended_tcp_client.h"
+
+namespace tensorflow {
+
+using std::map;
+using std::string;
+using std::vector;
+using std::streamsize;
+
+enum CommandId {
+  HANDSHAKE_ID = 0,
+  EXISTS_ID = 2,
+  INFO_ID = 3,
+  RENAME_ID = 6,
+  DELETE_ID = 7,
+  MKDIR_ID = 8,
+  LIST_PATHS_ID = 9,
+  LIST_FILES_ID = 10,
+  OPEN_READ_ID = 13,
+  OPEN_APPEND_ID = 14,
+  OPEN_CREATE_ID = 15,
+  CLOSE_ID = 16,
+  READ_BLOCK_ID = 17,
+  WRITE_BLOCK_ID = 18,
+};
+
+class IGFSPath {
+ public:
+  std::string path;
+
+  Status Read(ExtendedTCPClient *client);
+};
+
+class IGFSFile {
+ public:
+  int64_t length;
+  int64_t modification_time;
+  uint8_t flags;
+
+  Status Read(ExtendedTCPClient *client);
+};
+
+class Request {
+ public:
+  Request(int32_t command_id);
+  virtual Status Write(ExtendedTCPClient *client) const;
+
+ protected:
+  const int32_t command_id_;
+};
+
+class Response {
+ public:
+  int32_t res_type;
+  int32_t req_id;
+  int32_t length;
+
+  virtual Status Read(ExtendedTCPClient *client);
+
+ protected:
+  static const int32_t HEADER_SIZE = 24;
+  static const int32_t RESPONSE_HEADER_SIZE = 9;
+  static const int32_t RES_TYPE_ERR_STREAM_ID = 9;
+};
+
+class PathCtrlRequest : public Request {
+ public:
+  PathCtrlRequest(int32_t command_id, string user_name, string path,
+                  string destination_path, bool flag, bool collocate,
+                  map<string, string> properties);
+
+  Status Write(ExtendedTCPClient *client) const override;
+
+ protected:
+  const string user_name_;
+  const string path_;
+  const string destination_path_;
+  const bool flag_;
+  const bool collocate_;
+  const map<string, string> props_;
+
+  Status WritePath(ExtendedTCPClient *client, const string &path) const;
+};
+
+class StreamCtrlRequest : public Request {
+ public:
+  StreamCtrlRequest(int32_t command_id, int64_t stream_id, int32_t length);
+  Status Write(ExtendedTCPClient *client) const override;
+
+ protected:
+  int64_t stream_id_;
+  int32_t length_;
+};
+
+template <class R>
+class CtrlResponse : public Response {
+ public:
+  R res;
+  bool has_content;
+
+  CtrlResponse(bool optional) : optional_(optional){};
+  Status Read(ExtendedTCPClient *client) override {
+    TF_RETURN_IF_ERROR(Response::Read(client));
+
+    if (optional_) {
+      TF_RETURN_IF_ERROR(client->ReadBool(&has_content));
+
+      if (!has_content) return Status::OK();
+    }
+
+    res = R();
+    has_content = true;
+    TF_RETURN_IF_ERROR(res.Read(client));
+
+    return Status::OK();
+  }
+
+ private:
+  bool optional_;
+};
+
+template <class T>
+class ListResponse {
+ public:
+  vector<T> entries;
+
+  Status Read(ExtendedTCPClient *client) {
+    int32_t len;
+    TF_RETURN_IF_ERROR(client->ReadInt(&len));
+
+    entries = vector<T>();
+
+    for (int32_t i = 0; i < len; i++) {
+      T f = {};
+      TF_RETURN_IF_ERROR(f.Read(client));
+      entries.push_back(f);
+    }
+
+    return Status::OK();
+  }
+};
+
+class DeleteRequest : public PathCtrlRequest {
+ public:
+  DeleteRequest(const string &user_name, const string &path, bool flag);
+};
+
+class DeleteResponse {
+ public:
+  bool exists;
+
+  Status Read(ExtendedTCPClient *client);
+};
+
+class ExistsRequest : public PathCtrlRequest {
+ public:
+  explicit ExistsRequest(const string &user_name, const string &path);
+};
+
+class ExistsResponse {
+ public:
+  bool exists;
+
+  Status Read(ExtendedTCPClient *client);
+};
+
+class HandshakeRequest : public Request {
+ public:
+  HandshakeRequest(const string &fs_name, const string &log_dir);
+  Status Write(ExtendedTCPClient *client) const override;
+
+ private:
+  string fs_name_;
+  string log_dir_;
+};
+
+class HandshakeResponse {
+ public:
+  string fs_name;
+
+  Status Read(ExtendedTCPClient *client);
+};
+
+class ListRequest : public PathCtrlRequest {
+ public:
+  explicit ListRequest(int32_t command_id, const string &user_name,
+                       const string &path);
+};
+
+class ListFilesRequest : public ListRequest {
+ public:
+  ListFilesRequest(const string &user_name, const string &path);
+};
+
+class ListFilesResponse : public ListResponse<IGFSFile> {};
+
+class ListPathsRequest : public ListRequest {
+ public:
+  ListPathsRequest(const string &user_name, const string &path);
+};
+
+class ListPathsResponse : public ListResponse<IGFSPath> {};
+
+class OpenCreateRequest : public PathCtrlRequest {
+ public:
+  OpenCreateRequest(const string &user_name, const string &path);
+
+  Status Write(ExtendedTCPClient *client) const override;
+
+ private:
+  int32_t replication_;
+  int64_t blockSize_;
+};
+
+class OpenCreateResponse {
+ public:
+  int64_t stream_id;
+
+  Status Read(ExtendedTCPClient *client);
+};
+
+class OpenAppendRequest : public PathCtrlRequest {
+ public:
+  explicit OpenAppendRequest(const string &user_name, const string &path);
+  Status Write(ExtendedTCPClient *client) const override;
+};
+
+class OpenAppendResponse {
+ public:
+  int64_t stream_id;
+
+  Status Read(ExtendedTCPClient *client);
+};
+
+class OpenReadRequest : public PathCtrlRequest {
+ public:
+  OpenReadRequest(const string &user_name, const string &path, bool flag,
+                  int32_t seqReadsBeforePrefetch);
+
+  OpenReadRequest(const string &user_name, const string &path);
+
+  Status Write(ExtendedTCPClient *client) const override;
+
+ protected:
+  /** Sequential reads before prefetch. */
+  int32_t sequential_reads_before_prefetch_;
+};
+
+class OpenReadResponse {
+ public:
+  int64_t stream_id;
+  int64_t length;
+
+  Status Read(ExtendedTCPClient *client);
+};
+
+class InfoRequest : public PathCtrlRequest {
+ public:
+  InfoRequest(const string &user_name, const string &path);
+};
+
+class InfoResponse {
+ public:
+  IGFSFile file_info;
+
+  Status Read(ExtendedTCPClient *client);
+};
+
+class MakeDirectoriesRequest : public PathCtrlRequest {
+ public:
+  MakeDirectoriesRequest(const string &userName, const string &path);
+};
+
+class MakeDirectoriesResponse {
+ public:
+  bool successful;
+
+  Status Read(ExtendedTCPClient *client);
+};
+
+/** Stream control requests. **/
+
+class CloseRequest : public StreamCtrlRequest {
+ public:
+  explicit CloseRequest(int64_t stream_id);
+};
+
+class CloseResponse {
+ public:
+  bool successful;
+
+  Status Read(ExtendedTCPClient *client);
+};
+
+class ReadBlockRequest : public StreamCtrlRequest {
+ public:
+  ReadBlockRequest(int64_t stream_id, int64_t pos, int32_t length);
+
+  Status Write(ExtendedTCPClient *client) const override;
+
+ private:
+  int64_t pos;
+};
+
+class ReadBlockResponse {
+ public:
+  Status Read(ExtendedTCPClient *client, int32_t length, uint8_t *dst);
+
+  Status Read(ExtendedTCPClient *client);
+
+  streamsize GetSuccessfulyRead();
+
+ private:
+  int32_t length;
+  streamsize successfuly_read;
+};
+
+class ReadBlockCtrlResponse : public CtrlResponse<ReadBlockResponse> {
+ public:
+  ReadBlockCtrlResponse(uint8_t *dst);
+
+  Status Read(ExtendedTCPClient *client) override;
+
+ private:
+  uint8_t *dst;
+};
+
+class WriteBlockRequest : public StreamCtrlRequest {
+ public:
+  WriteBlockRequest(int64_t stream_id, const uint8_t *data, int32_t length);
+
+  Status Write(ExtendedTCPClient *client) const override;
+
+ private:
+  const uint8_t *data;
+};
+
+class RenameRequest : public PathCtrlRequest {
+ public:
+  RenameRequest(const std::string &user_name, const std::string &path,
+                const std::string &destination_path);
+};
+
+class RenameResponse {
+ public:
+  bool successful;
+
+  Status Read(ExtendedTCPClient *client);
+};
+
+}  // namespace tensorflow
+
+#endif
diff --git a/tensorflow/contrib/igfs/kernels/igfs_random_access_file.cc b/tensorflow/contrib/igfs/kernels/igfs_random_access_file.cc
new file mode 100644
index 00000000000..ff546e13444
--- /dev/null
+++ b/tensorflow/contrib/igfs/kernels/igfs_random_access_file.cc
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "igfs_random_access_file.h"
+#include "igfs_messages.h"
+
+namespace tensorflow {
+
+IGFSRandomAccessFile::IGFSRandomAccessFile(const std::string &file_name,
+                                           int64_t resource_id,
+                                           std::shared_ptr<IGFSClient> client)
+    : file_name_(file_name), resource_id_(resource_id), client_(client) {}
+
+IGFSRandomAccessFile::~IGFSRandomAccessFile() {
+  CtrlResponse<CloseResponse> close_response = {false};
+  Status status = client_->Close(&close_response, resource_id_);
+
+  if (!status.ok()) LOG(ERROR) << status.ToString();
+}
+
+Status IGFSRandomAccessFile::Read(uint64 offset, size_t n, StringPiece *result,
+                                  char *scratch) const {
+  ReadBlockCtrlResponse response = ReadBlockCtrlResponse((uint8_t *)scratch);
+  TF_RETURN_IF_ERROR(client_->ReadBlock(&response, resource_id_, offset, n));
+
+  streamsize sz = response.res.GetSuccessfulyRead();
+  if (sz == 0) return errors::OutOfRange("End of file");
+
+  *result = StringPiece(scratch, sz);
+
+  return Status::OK();
+}
+
+}  // namespace tensorflow
\ No newline at end of file
diff --git a/tensorflow/contrib/igfs/kernels/igfs_writable_file.cc b/tensorflow/contrib/igfs/kernels/igfs_writable_file.cc
new file mode 100644
index 00000000000..22ca848aa6c
--- /dev/null
+++ b/tensorflow/contrib/igfs/kernels/igfs_writable_file.cc
@@ -0,0 +1,51 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "igfs_writable_file.h"
+#include "igfs_messages.h"
+
+namespace tensorflow {
+
+IGFSWritableFile::IGFSWritableFile(const std::string &file_name,
+                                   int64_t resource_id,
+                                   std::shared_ptr<IGFSClient> client)
+    : file_name_(file_name), resource_id_(resource_id), client_(client) {}
+
+IGFSWritableFile::~IGFSWritableFile() {
+  if (resource_id_ >= 0) {
+    CtrlResponse<CloseResponse> close_response = {false};
+
+    Status status = client_->Close(&close_response, resource_id_);
+    if (!status.ok()) LOG(ERROR) << status.ToString();
+  }
+}
+
+Status IGFSWritableFile::Append(const StringPiece &data) {
+  return client_->WriteBlock(resource_id_, (uint8_t *)data.data(), data.size());
+}
+
+Status IGFSWritableFile::Close() {
+  int64_t resource_to_be_closed = resource_id_;
+  resource_id_ = -1;
+
+  CtrlResponse<CloseResponse> close_response = {false};
+  return client_->Close(&close_response, resource_to_be_closed);
+}
+
+Status IGFSWritableFile::Flush() { return Status::OK(); }
+
+Status IGFSWritableFile::Sync() { return Status::OK(); }
+
+}  // namespace tensorflow
\ No newline at end of file
diff --git a/tensorflow/contrib/igfs/python/tests/bin/start-igfs.sh b/tensorflow/contrib/igfs/python/tests/bin/start-igfs.sh
new file mode 100755
index 00000000000..5e39e16c052
--- /dev/null
+++ b/tensorflow/contrib/igfs/python/tests/bin/start-igfs.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+nohup apache-ignite-fabric/bin/ignite.sh /data/config/ignite-config-igfs.xml &
+sleep 5 # Wait Apache Ignite to be started
+
+tail -f nohup.out
diff --git a/tensorflow/contrib/igfs/python/tests/start_ignite.sh b/tensorflow/contrib/igfs/python/tests/start_ignite.sh
new file mode 100755
index 00000000000..d48bed6b45d
--- /dev/null
+++ b/tensorflow/contrib/igfs/python/tests/start_ignite.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+IGNITE_VERSION=2.6.0
+SCRIPT_PATH="$( cd "$(dirname "$0")" ; pwd -P )"
+
+# Start Apache Ignite with IGFS.
+docker run -itd --name ignite-igfs -p 10500:10500 \
+-v ${SCRIPT_PATH}:/data apacheignite/ignite:${IGNITE_VERSION} /data/bin/start-igfs.sh
diff --git a/tensorflow/contrib/igfs/python/tests/stop_ignite.sh b/tensorflow/contrib/igfs/python/tests/stop_ignite.sh
new file mode 100755
index 00000000000..ff297291cf5
--- /dev/null
+++ b/tensorflow/contrib/igfs/python/tests/stop_ignite.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+docker rm -f ignite-igfs

From 1602570d2028f9b7f47e67a6f3965bdba24478f0 Mon Sep 17 00:00:00 2001
From: Anton Dmitriev 
Date: Tue, 2 Oct 2018 19:49:06 +0300
Subject: [PATCH 058/540] Update after review of #22210.

---
 tensorflow/contrib/igfs/BUILD                 |  31 +++--
 tensorflow/contrib/igfs/README.md             |   2 +-
 tensorflow/contrib/igfs/__init__.py           |  20 ++--
 tensorflow/contrib/igfs/kernels/igfs.cc       |  83 +++++++-------
 tensorflow/contrib/igfs/kernels/igfs.h        |  42 ++++---
 .../contrib/igfs/kernels/igfs_client.cc       |   6 +-
 tensorflow/contrib/igfs/kernels/igfs_client.h |  41 ++++---
 .../igfs/kernels/igfs_extended_tcp_client.cc  |  38 +++----
 .../igfs/kernels/igfs_extended_tcp_client.h   |  20 ++--
 .../contrib/igfs/kernels/igfs_messages.cc     |  27 +++--
 .../contrib/igfs/kernels/igfs_messages.h      | 107 ++++++++----------
 .../igfs/kernels/igfs_random_access_file.cc   |   8 +-
 .../igfs/kernels/igfs_random_access_file.h    |   8 +-
 .../igfs/kernels/igfs_writable_file.cc        |  25 ++--
 .../contrib/igfs/kernels/igfs_writable_file.h |   6 +-
 tensorflow/contrib/igfs/ops/igfs_ops.cc       |   6 +-
 tensorflow/contrib/ignite/BUILD               |  14 +--
 .../contrib/ignite/kernels/ignite_client.cc   |  26 -----
 18 files changed, 237 insertions(+), 273 deletions(-)
 delete mode 100644 tensorflow/contrib/ignite/kernels/ignite_client.cc

diff --git a/tensorflow/contrib/igfs/BUILD b/tensorflow/contrib/igfs/BUILD
index 4499c96eef9..b9983d09574 100644
--- a/tensorflow/contrib/igfs/BUILD
+++ b/tensorflow/contrib/igfs/BUILD
@@ -6,13 +6,12 @@ exports_files(["LICENSE"])
 
 load(
     "//tensorflow:tensorflow.bzl",
-    "tf_gen_op_wrapper_py",
-    "tf_kernel_library",
     "tf_custom_op_library",
     "tf_custom_op_py_library",
     "tf_gen_op_libs",
+    "tf_gen_op_wrapper_py",
+    "tf_kernel_library",
     "tf_py_test",
-    "tf_cc_shared_object",
 )
 
 py_library(
@@ -27,39 +26,39 @@ py_library(
 tf_custom_op_library(
     name = "_igfs_ops.so",
     srcs = [
-        "ops/igfs_ops.cc",
         "kernels/igfs.h",
+        "ops/igfs_ops.cc",
     ],
     deps = [":igfs_kernels"],
 )
 
 tf_gen_op_libs(
     op_lib_names = ["igfs_ops"],
-    deps = [":igfs_kernels"]
+    deps = [":igfs_kernels"],
 )
 
 cc_library(
     name = "igfs_kernels",
     srcs = [
-        "kernels/igfs.h",
         "kernels/igfs.cc",
-        "kernels/igfs_random_access_file.h",
-        "kernels/igfs_random_access_file.cc",
-        "kernels/igfs_writable_file.h",
-        "kernels/igfs_writable_file.cc",
-        "kernels/igfs_client.h",
+        "kernels/igfs.h",
         "kernels/igfs_client.cc",
-        "kernels/igfs_messages.h",
-        "kernels/igfs_messages.cc",
+        "kernels/igfs_client.h",
+        "kernels/igfs_extended_tcp_client.cc",
         "kernels/igfs_extended_tcp_client.h",
-        "kernels/igfs_extended_tcp_client.cc"
+        "kernels/igfs_messages.cc",
+        "kernels/igfs_messages.h",
+        "kernels/igfs_random_access_file.cc",
+        "kernels/igfs_random_access_file.h",
+        "kernels/igfs_writable_file.cc",
+        "kernels/igfs_writable_file.h",
     ],
     deps = [
         "//tensorflow/contrib/ignite:ignite_client",
         "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal"
+        "//tensorflow/core:lib_internal",
     ],
-    alwayslink=1,
+    alwayslink = 1,
 )
 
 py_library(
diff --git a/tensorflow/contrib/igfs/README.md b/tensorflow/contrib/igfs/README.md
index fa7d677d433..a67be4a410e 100644
--- a/tensorflow/contrib/igfs/README.md
+++ b/tensorflow/contrib/igfs/README.md
@@ -28,7 +28,7 @@ After that you will be able to work with it following way:
 >>>   w.write("Hello, world!")
 >>>
 >>> with tf.gfile.Open("igfs:///hello.txt", mode='r') as r:
->>>   print(w.read())
+>>>   print(r.read())
 
 Hello, world!
 ```
diff --git a/tensorflow/contrib/igfs/__init__.py b/tensorflow/contrib/igfs/__init__.py
index dfead6bb29b..5ef676337f0 100644
--- a/tensorflow/contrib/igfs/__init__.py
+++ b/tensorflow/contrib/igfs/__init__.py
@@ -12,15 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Apache Ignite is a memory-centric distributed database, caching, and
-   processing platform for transactional, analytical, and streaming workloads,
-   delivering in-memory speeds at petabyte scale. In addition to database
-   functionality Apache Ignite provides a distributed file system called
-   IGFS (https://ignite.apache.org/features/igfs.html). IGFS delivers a similar
-   functionality to Hadoop HDFS, but only in-memory. In fact, in addition to
-   its own APIs, IGFS implements Hadoop FileSystem API and can be transparently
-   plugged into Hadoop or Spark deployments. This contrib package contains an
-   intergration between IGFS and TensorFlow.
+"""Ignite File System for checkpointing and communication with TensorBoard.
+
+Apache Ignite is a memory-centric distributed database, caching, and
+processing platform for transactional, analytical, and streaming workloads,
+delivering in-memory speeds at petabyte scale. In addition to database
+functionality Apache Ignite provides a distributed file system called
+IGFS (https://ignite.apache.org/features/igfs.html). IGFS delivers a similar
+functionality to Hadoop HDFS, but only in-memory. In fact, in addition to
+its own APIs, IGFS implements Hadoop FileSystem API and can be transparently
+plugged into Hadoop or Spark deployments. This contrib package contains an
+integration between IGFS and TensorFlow.
 
 @@IGFS
 """
diff --git a/tensorflow/contrib/igfs/kernels/igfs.cc b/tensorflow/contrib/igfs/kernels/igfs.cc
index 8ad5696b19d..93bffc429e7 100644
--- a/tensorflow/contrib/igfs/kernels/igfs.cc
+++ b/tensorflow/contrib/igfs/kernels/igfs.cc
@@ -14,30 +14,31 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/file_system.h"
 #include "tensorflow/core/platform/file_system_helper.h"
 
-#include "igfs.h"
-#include "igfs_client.h"
-#include "igfs_random_access_file.h"
-#include "igfs_writable_file.h"
+#include "tensorflow/contrib/igfs/kernels/igfs.h"
+#include "tensorflow/contrib/igfs/kernels/igfs_client.h"
+#include "tensorflow/contrib/igfs/kernels/igfs_random_access_file.h"
+#include "tensorflow/contrib/igfs/kernels/igfs_writable_file.h"
 
 namespace tensorflow {
 
-std::string GetEnvOrElse(const std::string &env, std::string default_value) {
+string GetEnvOrElse(const string &env, string default_value) {
   const char *env_c_str = env.c_str();
   return getenv(env_c_str) != nullptr ? getenv(env_c_str) : default_value;
 }
 
-std::string IGFS::TranslateName(const std::string &name) const {
+string IGFS::TranslateName(const string &name) const {
   StringPiece scheme, namenode, path;
   io::ParseURI(name, &scheme, &namenode, &path);
-  return path.ToString();
+  return string(path.data(), path.length());
 }
 
-std::string MakeRelative(const std::string &a, const std::string &b) {
-  std::string max = a;
-  std::string min = b;
+string MakeRelative(const string &a, const string &b) {
+  string max = a;
+  string min = b;
   bool first = b.size() > a.size();
 
   if (first) {
@@ -46,8 +47,7 @@ std::string MakeRelative(const std::string &a, const std::string &b) {
   }
 
   auto r = mismatch(min.begin(), min.end(), max.begin());
-  return std::string((first ? r.first : r.second),
-                     first ? min.end() : max.end());
+  return string((first ? r.first : r.second), first ? min.end() : max.end());
 }
 
 IGFS::IGFS()
@@ -63,10 +63,10 @@ IGFS::~IGFS() {
             << ", fs_name=" << fs_name_ << "]";
 };
 
-Status IGFS::NewRandomAccessFile(const std::string &file_name,
+Status IGFS::NewRandomAccessFile(const string &file_name,
                                  std::unique_ptr *result) {
   std::shared_ptr client = CreateClient();
-  std::string path = TranslateName(file_name);
+  string path = TranslateName(file_name);
 
   CtrlResponse handshake_response(true);
   TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
@@ -83,10 +83,10 @@ Status IGFS::NewRandomAccessFile(const std::string &file_name,
   return Status::OK();
 }
 
-Status IGFS::NewWritableFile(const std::string &file_name,
+Status IGFS::NewWritableFile(const string &file_name,
                              std::unique_ptr *result) {
   std::shared_ptr client = CreateClient();
-  std::string path = TranslateName(file_name);
+  string path = TranslateName(file_name);
 
   CtrlResponse handshake_response(true);
   TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
@@ -111,7 +111,7 @@ Status IGFS::NewWritableFile(const std::string &file_name,
   return Status::OK();
 }
 
-Status IGFS::NewAppendableFile(const std::string &file_name,
+Status IGFS::NewAppendableFile(const string &file_name,
                                std::unique_ptr *result) {
   std::shared_ptr client = CreateClient();
 
@@ -139,14 +139,13 @@ Status IGFS::NewAppendableFile(const std::string &file_name,
 }
 
 Status IGFS::NewReadOnlyMemoryRegionFromFile(
-    const std::string &file_name,
-    std::unique_ptr *result) {
+    const string &file_name, std::unique_ptr *result) {
   return errors::Unimplemented("IGFS does not support ReadOnlyMemoryRegion");
 }
 
-Status IGFS::FileExists(const std::string &file_name) {
+Status IGFS::FileExists(const string &file_name) {
   std::shared_ptr client = CreateClient();
-  const std::string path = TranslateName(file_name);
+  const string path = TranslateName(file_name);
 
   CtrlResponse handshake_response(true);
   TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
@@ -163,10 +162,10 @@ Status IGFS::FileExists(const std::string &file_name) {
   return Status::OK();
 }
 
-Status IGFS::GetChildren(const std::string &file_name,
-                         std::vector *result) {
+Status IGFS::GetChildren(const string &file_name, std::vector *result) {
   std::shared_ptr client = CreateClient();
-  std::string path = TranslateName(file_name);
+  string path = TranslateName(file_name);
+  path = path + "/";
 
   CtrlResponse handshake_response(true);
   TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
@@ -174,7 +173,7 @@ Status IGFS::GetChildren(const std::string &file_name,
   CtrlResponse list_paths_response(false);
   TF_RETURN_IF_ERROR(client->ListPaths(&list_paths_response, path));
 
-  *result = std::vector();
+  *result = std::vector();
   std::vector entries = list_paths_response.res.entries;
 
   for (IGFSPath &value : entries)
@@ -186,14 +185,14 @@ Status IGFS::GetChildren(const std::string &file_name,
   return Status::OK();
 }
 
-Status IGFS::GetMatchingPaths(const std::string &pattern,
-                              std::vector *results) {
+Status IGFS::GetMatchingPaths(const string &pattern,
+                              std::vector *results) {
   return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
 }
 
-Status IGFS::DeleteFile(const std::string &file_name) {
+Status IGFS::DeleteFile(const string &file_name) {
   std::shared_ptr client = CreateClient();
-  std::string path = TranslateName(file_name);
+  string path = TranslateName(file_name);
 
   CtrlResponse handshake_response(true);
   TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
@@ -210,9 +209,9 @@ Status IGFS::DeleteFile(const std::string &file_name) {
   return Status::OK();
 }
 
-Status IGFS::CreateDir(const std::string &file_name) {
+Status IGFS::CreateDir(const string &file_name) {
   std::shared_ptr client = CreateClient();
-  const std::string path = TranslateName(file_name);
+  const string path = TranslateName(file_name);
 
   CtrlResponse handshake_response(true);
   TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
@@ -221,7 +220,7 @@ Status IGFS::CreateDir(const std::string &file_name) {
   TF_RETURN_IF_ERROR(client->MkDir(&mkdir_response, path));
 
   if (!mkdir_response.res.successful)
-    return errors::Internal("Can't create directory ", path);
+    return errors::Unknown("Can't create directory ", path);
 
   LOG(INFO) << "Create dir completed successful [file_name=" << file_name
             << "]";
@@ -229,9 +228,9 @@ Status IGFS::CreateDir(const std::string &file_name) {
   return Status::OK();
 }
 
-Status IGFS::DeleteDir(const std::string &file_name) {
+Status IGFS::DeleteDir(const string &file_name) {
   std::shared_ptr client = CreateClient();
-  std::string path = TranslateName(file_name);
+  string path = TranslateName(file_name);
 
   CtrlResponse handshake_response(true);
   TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
@@ -252,9 +251,9 @@ Status IGFS::DeleteDir(const std::string &file_name) {
   return Status::OK();
 }
 
-Status IGFS::GetFileSize(const std::string &file_name, uint64 *size) {
+Status IGFS::GetFileSize(const string &file_name, uint64 *size) {
   std::shared_ptr client = CreateClient();
-  std::string path = TranslateName(file_name);
+  string path = TranslateName(file_name);
 
   CtrlResponse handshake_response(true);
   TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
@@ -270,10 +269,10 @@ Status IGFS::GetFileSize(const std::string &file_name, uint64 *size) {
   return Status::OK();
 }
 
-Status IGFS::RenameFile(const std::string &src, const std::string &dst) {
+Status IGFS::RenameFile(const string &src, const string &dst) {
   std::shared_ptr client = CreateClient();
-  std::string src_path = TranslateName(src);
-  std::string dst_path = TranslateName(dst);
+  string src_path = TranslateName(src);
+  string dst_path = TranslateName(dst);
 
   if (FileExists(dst).ok()) DeleteFile(dst);
 
@@ -292,9 +291,9 @@ Status IGFS::RenameFile(const std::string &src, const std::string &dst) {
   return Status::OK();
 }
 
-Status IGFS::Stat(const std::string &file_name, FileStatistics *stats) {
+Status IGFS::Stat(const string &file_name, FileStatistics *stats) {
   std::shared_ptr client = CreateClient();
-  std::string path = TranslateName(file_name);
+  string path = TranslateName(file_name);
 
   CtrlResponse handshake_response(true);
   TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
@@ -304,7 +303,7 @@ Status IGFS::Stat(const std::string &file_name, FileStatistics *stats) {
 
   IGFSFile info = info_response.res.file_info;
 
-  *stats = FileStatistics(info.length, info.modification_time,
+  *stats = FileStatistics(info.length, info.modification_time * 1000000,
                           (info.flags & 0x1) != 0);
 
   LOG(INFO) << "Stat completed successful [file_name=" << file_name << "]";
diff --git a/tensorflow/contrib/igfs/kernels/igfs.h b/tensorflow/contrib/igfs/kernels/igfs.h
index 26c74640446..c2dff7ebc81 100644
--- a/tensorflow/contrib/igfs/kernels/igfs.h
+++ b/tensorflow/contrib/igfs/kernels/igfs.h
@@ -16,8 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_H_
 #define TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_H_
 
-#include "igfs_client.h"
-#include "tensorflow/core/platform/env.h"
+#include "tensorflow/contrib/igfs/kernels/igfs_client.h"
 #include "tensorflow/core/platform/file_system.h"
 
 namespace tensorflow {
@@ -27,36 +26,35 @@ class IGFS : public FileSystem {
   IGFS();
   ~IGFS();
   Status NewRandomAccessFile(
-      const std::string& file_name,
+      const string& file_name,
       std::unique_ptr* result) override;
-  Status NewWritableFile(const std::string& fname,
+  Status NewWritableFile(const string& fname,
                          std::unique_ptr* result) override;
-  Status NewAppendableFile(const std::string& fname,
+  Status NewAppendableFile(const string& fname,
                            std::unique_ptr* result) override;
   Status NewReadOnlyMemoryRegionFromFile(
-      const std::string& fname,
+      const string& fname,
       std::unique_ptr* result) override;
-  Status FileExists(const std::string& fname) override;
-  Status GetChildren(const std::string& dir,
-                     std::vector* result) override;
-  Status GetMatchingPaths(const std::string& pattern,
+  Status FileExists(const string& fname) override;
+  Status GetChildren(const string& dir, std::vector* result) override;
+  Status GetMatchingPaths(const string& pattern,
                           std::vector* results) override;
-  Status DeleteFile(const std::string& fname) override;
-  Status CreateDir(const std::string& name) override;
-  Status DeleteDir(const std::string& name) override;
-  Status GetFileSize(const std::string& fname, uint64* size) override;
-  Status RenameFile(const std::string& src, const std::string& target) override;
-  Status Stat(const std::string& fname, FileStatistics* stat) override;
-  string TranslateName(const std::string& name) const override;
+  Status DeleteFile(const string& fname) override;
+  Status CreateDir(const string& name) override;
+  Status DeleteDir(const string& name) override;
+  Status GetFileSize(const string& fname, uint64* size) override;
+  Status RenameFile(const string& src, const string& target) override;
+  Status Stat(const string& fname, FileStatistics* stat) override;
+  string TranslateName(const string& name) const override;
 
  private:
-  const std::string host_;
-  const int port_;
-  const std::string fs_name_;
-
   std::shared_ptr CreateClient() const;
+
+  const string host_;
+  const int port_;
+  const string fs_name_;
 };
 
 }  // namespace tensorflow
 
-#endif
+#endif  // TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_H_
diff --git a/tensorflow/contrib/igfs/kernels/igfs_client.cc b/tensorflow/contrib/igfs/kernels/igfs_client.cc
index e27f8374faa..05745ce4f1a 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_client.cc
+++ b/tensorflow/contrib/igfs/kernels/igfs_client.cc
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "igfs_client.h"
+#include "tensorflow/contrib/igfs/kernels/igfs_client.h"
 
 namespace tensorflow {
 
-IGFSClient::IGFSClient(std::string host, int port, std::string fs_name,
-                       std::string user_name)
+IGFSClient::IGFSClient(const string &host, int port, const string &fs_name,
+                       const string &user_name)
     : fs_name_(fs_name),
       user_name_(user_name),
       client_(ExtendedTCPClient(host, port, true)) {
diff --git a/tensorflow/contrib/igfs/kernels/igfs_client.h b/tensorflow/contrib/igfs/kernels/igfs_client.h
index 82653af03a2..fecb799dc5f 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_client.h
+++ b/tensorflow/contrib/igfs/kernels/igfs_client.h
@@ -16,14 +16,14 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_CLIENT_H_
 #define TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_CLIENT_H_
 
-#include "igfs_messages.h"
+#include "tensorflow/contrib/igfs/kernels/igfs_messages.h"
 
 namespace tensorflow {
 
 class IGFSClient {
  public:
-  IGFSClient(std::string host, int port, std::string fs_name,
-             std::string user_name);
+  IGFSClient(const string &host, int port, const string &fs_name,
+             const string &user_name);
   ~IGFSClient();
 
   inline Status Handshake(CtrlResponse *res) {
@@ -31,47 +31,46 @@ class IGFSClient {
   }
 
   inline Status ListFiles(CtrlResponse *res,
-                          const std::string &path) {
+                          const string &path) {
     return SendRequestGetResponse(ListFilesRequest(user_name_, path), res);
   }
 
   inline Status ListPaths(CtrlResponse *res,
-                          const std::string &path) {
+                          const string &path) {
     return SendRequestGetResponse(ListPathsRequest(user_name_, path), res);
   }
 
-  inline Status Info(CtrlResponse *res, const std::string &path) {
+  inline Status Info(CtrlResponse *res, const string &path) {
     return SendRequestGetResponse(InfoRequest(user_name_, path), res);
   }
 
   inline Status OpenCreate(CtrlResponse *res,
-                           const std::string &path) {
+                           const string &path) {
     return SendRequestGetResponse(OpenCreateRequest(user_name_, path), res);
   }
 
   inline Status OpenAppend(CtrlResponse *res,
-                           const std::string &path) {
+                           const string &path) {
     return SendRequestGetResponse(OpenAppendRequest(user_name_, path), res);
   }
 
   inline Status OpenRead(CtrlResponse *res,
-                         const std::string &path) {
+                         const string &path) {
     return SendRequestGetResponse(OpenReadRequest(user_name_, path), res);
   }
 
-  inline Status Exists(CtrlResponse *res,
-                       const std::string &path) {
+  inline Status Exists(CtrlResponse *res, const string &path) {
     return SendRequestGetResponse(ExistsRequest(user_name_, path), res);
   }
 
   inline Status MkDir(CtrlResponse *res,
-                      const std::string &path) {
+                      const string &path) {
     return SendRequestGetResponse(MakeDirectoriesRequest(user_name_, path),
                                   res);
   }
 
-  inline Status Delete(CtrlResponse *res,
-                       const std::string &path, bool recursive) {
+  inline Status Delete(CtrlResponse *res, const string &path,
+                       bool recursive) {
     return SendRequestGetResponse(DeleteRequest(user_name_, path, recursive),
                                   res);
   }
@@ -92,19 +91,19 @@ class IGFSClient {
     return SendRequestGetResponse(CloseRequest(stream_id), res);
   }
 
-  inline Status Rename(CtrlResponse *res,
-                       const std::string &source, const std::string &dest) {
+  inline Status Rename(CtrlResponse *res, const string &source,
+                       const string &dest) {
     return SendRequestGetResponse(RenameRequest(user_name_, source, dest), res);
   }
 
  private:
-  const std::string fs_name_;
-  const std::string user_name_;
-  ExtendedTCPClient client_;
-
   Status SendRequestGetResponse(const Request &request, Response *response);
+
+  const string fs_name_;
+  const string user_name_;
+  ExtendedTCPClient client_;
 };
 
 }  // namespace tensorflow
 
-#endif
+#endif  // TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_CLIENT_H_
diff --git a/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.cc b/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.cc
index b21360ec831..78d8df3c17a 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.cc
+++ b/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.cc
@@ -13,22 +13,22 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "igfs_extended_tcp_client.h"
+#include "tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.h"
 
 namespace tensorflow {
 
-ExtendedTCPClient::ExtendedTCPClient(std::string host, int port,
+ExtendedTCPClient::ExtendedTCPClient(const string &host, int port,
                                      bool big_endian)
     : PlainClient(host, port, big_endian), pos_(0) {}
 
-Status ExtendedTCPClient::ReadData(uint8_t *buf, int32_t length) {
+Status ExtendedTCPClient::ReadData(uint8_t *buf, const int32_t length) {
   TF_RETURN_IF_ERROR(PlainClient::ReadData(buf, length));
   pos_ += length;
 
   return Status::OK();
 }
 
-Status ExtendedTCPClient::WriteData(uint8_t *buf, int32_t length) {
+Status ExtendedTCPClient::WriteData(const uint8_t *buf, const int32_t length) {
   TF_RETURN_IF_ERROR(PlainClient::WriteData(buf, length));
   pos_ += length;
 
@@ -52,7 +52,7 @@ Status ExtendedTCPClient::ReadBool(bool *res) {
   return Status::OK();
 }
 
-Status ExtendedTCPClient::ReadNullableString(std::string *res) {
+Status ExtendedTCPClient::ReadNullableString(string *res) {
   bool is_empty = false;
   TF_RETURN_IF_ERROR(ReadBool(&is_empty));
 
@@ -63,27 +63,26 @@ Status ExtendedTCPClient::ReadNullableString(std::string *res) {
   return Status::OK();
 }
 
-Status ExtendedTCPClient::ReadString(std::string *res) {
+Status ExtendedTCPClient::ReadString(string *res) {
   int16_t length;
   TF_RETURN_IF_ERROR(ReadShort(&length));
 
   uint8_t *buf = new uint8_t[length];
   Status status = ReadData(buf, length);
 
-  if (status.ok()) res->assign((char *)buf, length);
+  if (status.ok()) res->assign(reinterpret_cast(buf), length);
 
   delete[] buf;
   return status;
 }
 
-Status ExtendedTCPClient::ReadStringMap(
-    std::map *res) {
+Status ExtendedTCPClient::ReadStringMap(std::map *res) {
   int size;
   TF_RETURN_IF_ERROR(ReadInt(&size));
 
   for (int i = 0; i < size; i++) {
-    std::string key;
-    std::string val;
+    string key;
+    string val;
     TF_RETURN_IF_ERROR(ReadString(&key));
     TF_RETURN_IF_ERROR(ReadString(&val));
 
@@ -93,15 +92,14 @@ Status ExtendedTCPClient::ReadStringMap(
   return Status::OK();
 };
 
-Status ExtendedTCPClient::WriteSize(
-    std::map::size_type s) {
+Status ExtendedTCPClient::WriteSize(std::map::size_type s) {
   return WriteInt(s);
 }
 
 Status ExtendedTCPClient::FillWithZerosUntil(int n) {
-  int toSkip = std::max(0, n - pos_);
+  int to_skip = std::max(0, n - pos_);
 
-  for (int i = 0; i < toSkip; i++) {
+  for (int i = 0; i < to_skip; i++) {
     TF_RETURN_IF_ERROR(WriteByte(0));
   }
 
@@ -112,12 +110,13 @@ Status ExtendedTCPClient::WriteBool(bool val) {
   return WriteByte((char)(val ? 1 : 0));
 }
 
-Status ExtendedTCPClient::WriteString(std::string str) {
+Status ExtendedTCPClient::WriteString(string str) {
   if (!str.empty()) {
     TF_RETURN_IF_ERROR(WriteBool(false));
     unsigned short l = str.length();
     TF_RETURN_IF_ERROR(WriteShort(l));
-    TF_RETURN_IF_ERROR(WriteData((uint8_t *)str.c_str(), str.length()));
+    TF_RETURN_IF_ERROR(WriteData(reinterpret_cast(str.c_str()),
+                                 str.length()));
   } else {
     TF_RETURN_IF_ERROR(WriteBool(true));
   }
@@ -125,12 +124,11 @@ Status ExtendedTCPClient::WriteString(std::string str) {
   return Status::OK();
 }
 
-Status ExtendedTCPClient::WriteStringMap(
-    std::map map) {
+Status ExtendedTCPClient::WriteStringMap(std::map map) {
   std::map::size_type size = map.size();
   TF_RETURN_IF_ERROR(WriteSize(size));
 
-  for (auto const &x : map) {
+  for (auto &x : map) {
     TF_RETURN_IF_ERROR(WriteString(x.first));
     TF_RETURN_IF_ERROR(WriteString(x.second));
   }
diff --git a/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.h b/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.h
index 07d8a336059..5121ee67a57 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.h
+++ b/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.h
@@ -22,20 +22,20 @@ namespace tensorflow {
 
 class ExtendedTCPClient : public PlainClient {
  public:
-  ExtendedTCPClient(std::string host, int port, bool big_endian);
-  Status ReadData(uint8_t *buf, int32_t length) override;
-  Status WriteData(uint8_t *buf, int32_t length) override;
+  ExtendedTCPClient(const string &host, int port, bool big_endian);
+  Status ReadData(uint8_t *buf, const int32_t length) override;
+  Status WriteData(const uint8_t *buf, const int32_t length) override;
   Status Ignore(int n);
   Status SkipToPos(int target_pos);
   Status ReadBool(bool *res);
-  Status ReadNullableString(std::string *res);
-  Status ReadString(std::string *res);
-  Status ReadStringMap(std::map *res);
-  Status WriteSize(std::map::size_type s);
+  Status ReadNullableString(string *res);
+  Status ReadString(string *res);
+  Status ReadStringMap(std::map *res);
+  Status WriteSize(std::map::size_type s);
   Status FillWithZerosUntil(int n);
   Status WriteBool(bool val);
-  Status WriteString(std::string str);
-  Status WriteStringMap(std::map map);
+  Status WriteString(string str);
+  Status WriteStringMap(std::map map);
   void reset();
 
  private:
@@ -44,4 +44,4 @@ class ExtendedTCPClient : public PlainClient {
 
 }  // namespace tensorflow
 
-#endif
\ No newline at end of file
+#endif  // TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_EXTENDED_TCP_CLIENT_H_
\ No newline at end of file
diff --git a/tensorflow/contrib/igfs/kernels/igfs_messages.cc b/tensorflow/contrib/igfs/kernels/igfs_messages.cc
index 2d892c7cc29..a03ba3240e9 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_messages.cc
+++ b/tensorflow/contrib/igfs/kernels/igfs_messages.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "igfs_messages.h"
+#include "tensorflow/contrib/igfs/kernels/igfs_messages.h"
 
 namespace tensorflow {
 
@@ -24,7 +24,7 @@ Status IGFSPath::Read(ExtendedTCPClient *client) {
 Status IGFSFile::Read(ExtendedTCPClient *client) {
   int32_t block_size;
   int64_t group_block_size;
-  map properties = {};
+  std::map properties = {};
   int64_t access_time;
 
   bool has_path;
@@ -68,25 +68,26 @@ Status Response::Read(ExtendedTCPClient *client) {
 
   if (has_error) {
     int32_t error_code;
-    std::string error_msg;
+    string error_msg;
     TF_RETURN_IF_ERROR(client->ReadString(&error_msg));
     TF_RETURN_IF_ERROR(client->ReadInt(&error_code));
 
-    return errors::Internal("Error [code=", error_code, ", message=\"",
-                            error_msg, "\"]");
+    return errors::Unknown("Error [code=", error_code, ", message=\"",
+                           error_msg, "\"]");
   }
 
-  TF_RETURN_IF_ERROR(client->SkipToPos(HEADER_SIZE + 5));
+  TF_RETURN_IF_ERROR(client->SkipToPos(header_size_ + 5));
   TF_RETURN_IF_ERROR(client->ReadInt(&length));
-  TF_RETURN_IF_ERROR(client->SkipToPos(HEADER_SIZE + RESPONSE_HEADER_SIZE));
+  TF_RETURN_IF_ERROR(client->SkipToPos(header_size_ + response_header_size_));
 
   return Status::OK();
 }
 
-PathCtrlRequest::PathCtrlRequest(int32_t command_id_, string user_name,
-                                 string path, string destination_path,
-                                 bool flag, bool collocate,
-                                 map properties)
+PathCtrlRequest::PathCtrlRequest(int32_t command_id_, const string &user_name,
+                                 const string &path,
+                                 const string &destination_path, bool flag,
+                                 bool collocate,
+                                 const std::map &properties)
     : Request(command_id_),
       user_name_(std::move(user_name)),
       path_(std::move(path)),
@@ -302,7 +303,9 @@ Status ReadBlockResponse::Read(ExtendedTCPClient *client) {
   return Status::OK();
 }
 
-streamsize ReadBlockResponse::GetSuccessfulyRead() { return successfuly_read; }
+std::streamsize ReadBlockResponse::GetSuccessfulyRead() {
+  return successfuly_read;
+}
 
 ReadBlockCtrlResponse::ReadBlockCtrlResponse(uint8_t *dst)
     : CtrlResponse(false), dst(dst) {}
diff --git a/tensorflow/contrib/igfs/kernels/igfs_messages.h b/tensorflow/contrib/igfs/kernels/igfs_messages.h
index 076e6f55ef8..dcfbf12bebf 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_messages.h
+++ b/tensorflow/contrib/igfs/kernels/igfs_messages.h
@@ -16,15 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_MESSAGES_H_
 #define TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_MESSAGES_H_
 
-#include "igfs_extended_tcp_client.h"
+#include "tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.h"
 
 namespace tensorflow {
 
-using std::map;
-using std::string;
-using std::vector;
-using std::streamsize;
-
 enum CommandId {
   HANDSHAKE_ID = 0,
   EXISTS_ID = 2,
@@ -44,18 +39,18 @@ enum CommandId {
 
 class IGFSPath {
  public:
-  std::string path;
-
   Status Read(ExtendedTCPClient *client);
+
+  string path;
 };
 
 class IGFSFile {
  public:
+  Status Read(ExtendedTCPClient *client);
+
   int64_t length;
   int64_t modification_time;
   uint8_t flags;
-
-  Status Read(ExtendedTCPClient *client);
 };
 
 class Request {
@@ -69,35 +64,33 @@ class Request {
 
 class Response {
  public:
+  virtual Status Read(ExtendedTCPClient *client);
+
   int32_t res_type;
   int32_t req_id;
   int32_t length;
 
-  virtual Status Read(ExtendedTCPClient *client);
-
  protected:
-  static const int32_t HEADER_SIZE = 24;
-  static const int32_t RESPONSE_HEADER_SIZE = 9;
-  static const int32_t RES_TYPE_ERR_STREAM_ID = 9;
+  static const int32_t header_size_ = 24;
+  static const int32_t response_header_size_ = 9;
 };
 
 class PathCtrlRequest : public Request {
  public:
-  PathCtrlRequest(int32_t command_id, string user_name, string path,
-                  string destination_path, bool flag, bool collocate,
-                  map properties);
-
+  PathCtrlRequest(int32_t command_id, const string &user_name,
+                  const string &path, const string &destination_path, bool flag,
+                  bool collocate, const std::map &properties);
   Status Write(ExtendedTCPClient *client) const override;
 
  protected:
+  Status WritePath(ExtendedTCPClient *client, const string &path) const;
+
   const string user_name_;
   const string path_;
   const string destination_path_;
   const bool flag_;
   const bool collocate_;
-  const map props_;
-
-  Status WritePath(ExtendedTCPClient *client, const string &path) const;
+  const std::map props_;
 };
 
 class StreamCtrlRequest : public Request {
@@ -113,9 +106,6 @@ class StreamCtrlRequest : public Request {
 template 
 class CtrlResponse : public Response {
  public:
-  R res;
-  bool has_content;
-
   CtrlResponse(bool optional) : optional_(optional){};
   Status Read(ExtendedTCPClient *client) override {
     TF_RETURN_IF_ERROR(Response::Read(client));
@@ -133,6 +123,9 @@ class CtrlResponse : public Response {
     return Status::OK();
   }
 
+  R res;
+  bool has_content;
+
  private:
   bool optional_;
 };
@@ -140,13 +133,11 @@ class CtrlResponse : public Response {
 template 
 class ListResponse {
  public:
-  vector entries;
-
   Status Read(ExtendedTCPClient *client) {
     int32_t len;
     TF_RETURN_IF_ERROR(client->ReadInt(&len));
 
-    entries = vector();
+    entries = std::vector();
 
     for (int32_t i = 0; i < len; i++) {
       T f = {};
@@ -156,6 +147,8 @@ class ListResponse {
 
     return Status::OK();
   }
+
+  std::vector entries;
 };
 
 class DeleteRequest : public PathCtrlRequest {
@@ -165,9 +158,9 @@ class DeleteRequest : public PathCtrlRequest {
 
 class DeleteResponse {
  public:
-  bool exists;
-
   Status Read(ExtendedTCPClient *client);
+
+  bool exists;
 };
 
 class ExistsRequest : public PathCtrlRequest {
@@ -177,9 +170,9 @@ class ExistsRequest : public PathCtrlRequest {
 
 class ExistsResponse {
  public:
-  bool exists;
-
   Status Read(ExtendedTCPClient *client);
+
+  bool exists;
 };
 
 class HandshakeRequest : public Request {
@@ -194,9 +187,9 @@ class HandshakeRequest : public Request {
 
 class HandshakeResponse {
  public:
-  string fs_name;
-
   Status Read(ExtendedTCPClient *client);
+
+  string fs_name;
 };
 
 class ListRequest : public PathCtrlRequest {
@@ -222,7 +215,6 @@ class ListPathsResponse : public ListResponse {};
 class OpenCreateRequest : public PathCtrlRequest {
  public:
   OpenCreateRequest(const string &user_name, const string &path);
-
   Status Write(ExtendedTCPClient *client) const override;
 
  private:
@@ -232,9 +224,9 @@ class OpenCreateRequest : public PathCtrlRequest {
 
 class OpenCreateResponse {
  public:
-  int64_t stream_id;
-
   Status Read(ExtendedTCPClient *client);
+
+  int64_t stream_id;
 };
 
 class OpenAppendRequest : public PathCtrlRequest {
@@ -245,18 +237,16 @@ class OpenAppendRequest : public PathCtrlRequest {
 
 class OpenAppendResponse {
  public:
-  int64_t stream_id;
-
   Status Read(ExtendedTCPClient *client);
+
+  int64_t stream_id;
 };
 
 class OpenReadRequest : public PathCtrlRequest {
  public:
   OpenReadRequest(const string &user_name, const string &path, bool flag,
                   int32_t seqReadsBeforePrefetch);
-
   OpenReadRequest(const string &user_name, const string &path);
-
   Status Write(ExtendedTCPClient *client) const override;
 
  protected:
@@ -266,10 +256,10 @@ class OpenReadRequest : public PathCtrlRequest {
 
 class OpenReadResponse {
  public:
+  Status Read(ExtendedTCPClient *client);
+
   int64_t stream_id;
   int64_t length;
-
-  Status Read(ExtendedTCPClient *client);
 };
 
 class InfoRequest : public PathCtrlRequest {
@@ -279,9 +269,9 @@ class InfoRequest : public PathCtrlRequest {
 
 class InfoResponse {
  public:
-  IGFSFile file_info;
-
   Status Read(ExtendedTCPClient *client);
+
+  IGFSFile file_info;
 };
 
 class MakeDirectoriesRequest : public PathCtrlRequest {
@@ -291,9 +281,9 @@ class MakeDirectoriesRequest : public PathCtrlRequest {
 
 class MakeDirectoriesResponse {
  public:
-  bool successful;
-
   Status Read(ExtendedTCPClient *client);
+
+  bool successful;
 };
 
 /** Stream control requests. **/
@@ -305,15 +295,14 @@ class CloseRequest : public StreamCtrlRequest {
 
 class CloseResponse {
  public:
-  bool successful;
-
   Status Read(ExtendedTCPClient *client);
+
+  bool successful;
 };
 
 class ReadBlockRequest : public StreamCtrlRequest {
  public:
   ReadBlockRequest(int64_t stream_id, int64_t pos, int32_t length);
-
   Status Write(ExtendedTCPClient *client) const override;
 
  private:
@@ -323,20 +312,17 @@ class ReadBlockRequest : public StreamCtrlRequest {
 class ReadBlockResponse {
  public:
   Status Read(ExtendedTCPClient *client, int32_t length, uint8_t *dst);
-
   Status Read(ExtendedTCPClient *client);
-
-  streamsize GetSuccessfulyRead();
+  std::streamsize GetSuccessfulyRead();
 
  private:
   int32_t length;
-  streamsize successfuly_read;
+  std::streamsize successfuly_read;
 };
 
 class ReadBlockCtrlResponse : public CtrlResponse {
  public:
   ReadBlockCtrlResponse(uint8_t *dst);
-
   Status Read(ExtendedTCPClient *client) override;
 
  private:
@@ -346,7 +332,6 @@ class ReadBlockCtrlResponse : public CtrlResponse {
 class WriteBlockRequest : public StreamCtrlRequest {
  public:
   WriteBlockRequest(int64_t stream_id, const uint8_t *data, int32_t length);
-
   Status Write(ExtendedTCPClient *client) const override;
 
  private:
@@ -355,17 +340,17 @@ class WriteBlockRequest : public StreamCtrlRequest {
 
 class RenameRequest : public PathCtrlRequest {
  public:
-  RenameRequest(const std::string &user_name, const std::string &path,
-                const std::string &destination_path);
+  RenameRequest(const string &user_name, const string &path,
+                const string &destination_path);
 };
 
 class RenameResponse {
  public:
-  bool successful;
-
   Status Read(ExtendedTCPClient *client);
+
+  bool successful;
 };
 
 }  // namespace tensorflow
 
-#endif
+#endif  // TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_MESSAGES_H_
diff --git a/tensorflow/contrib/igfs/kernels/igfs_random_access_file.cc b/tensorflow/contrib/igfs/kernels/igfs_random_access_file.cc
index ff546e13444..c078bc5e3fb 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_random_access_file.cc
+++ b/tensorflow/contrib/igfs/kernels/igfs_random_access_file.cc
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "igfs_random_access_file.h"
-#include "igfs_messages.h"
+#include "tensorflow/contrib/igfs/kernels/igfs_random_access_file.h"
+#include "tensorflow/contrib/igfs/kernels/igfs_messages.h"
 
 namespace tensorflow {
 
-IGFSRandomAccessFile::IGFSRandomAccessFile(const std::string &file_name,
+IGFSRandomAccessFile::IGFSRandomAccessFile(const string &file_name,
                                            int64_t resource_id,
                                            std::shared_ptr client)
     : file_name_(file_name), resource_id_(resource_id), client_(client) {}
@@ -35,7 +35,7 @@ Status IGFSRandomAccessFile::Read(uint64 offset, size_t n, StringPiece *result,
   ReadBlockCtrlResponse response = ReadBlockCtrlResponse((uint8_t *)scratch);
   TF_RETURN_IF_ERROR(client_->ReadBlock(&response, resource_id_, offset, n));
 
-  streamsize sz = response.res.GetSuccessfulyRead();
+  std::streamsize sz = response.res.GetSuccessfulyRead();
   if (sz == 0) return errors::OutOfRange("End of file");
 
   *result = StringPiece(scratch, sz);
diff --git a/tensorflow/contrib/igfs/kernels/igfs_random_access_file.h b/tensorflow/contrib/igfs/kernels/igfs_random_access_file.h
index b1f6986abd5..48b2d097c5c 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_random_access_file.h
+++ b/tensorflow/contrib/igfs/kernels/igfs_random_access_file.h
@@ -16,25 +16,25 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_RANDOM_ACCESS_FILE_H_
 #define TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_RANDOM_ACCESS_FILE_H_
 
-#include "igfs_client.h"
+#include "tensorflow/contrib/igfs/kernels/igfs_client.h"
 #include "tensorflow/core/platform/file_system.h"
 
 namespace tensorflow {
 
 class IGFSRandomAccessFile : public RandomAccessFile {
  public:
-  IGFSRandomAccessFile(const std::string &file_name, int64_t resource_id,
+  IGFSRandomAccessFile(const string &file_name, int64_t resource_id,
                        std::shared_ptr client);
   ~IGFSRandomAccessFile() override;
   Status Read(uint64 offset, size_t n, StringPiece *result,
               char *scratch) const override;
 
  private:
-  const std::string file_name_;
+  const string file_name_;
   const int64_t resource_id_;
   std::shared_ptr client_;
 };
 
 }  // namespace tensorflow
 
-#endif
\ No newline at end of file
+#endif  // TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_RANDOM_ACCESS_FILE_H_
\ No newline at end of file
diff --git a/tensorflow/contrib/igfs/kernels/igfs_writable_file.cc b/tensorflow/contrib/igfs/kernels/igfs_writable_file.cc
index 22ca848aa6c..6e523e579ad 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_writable_file.cc
+++ b/tensorflow/contrib/igfs/kernels/igfs_writable_file.cc
@@ -13,13 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "igfs_writable_file.h"
-#include "igfs_messages.h"
+#include "tensorflow/contrib/igfs/kernels/igfs_writable_file.h"
+#include "tensorflow/contrib/igfs/kernels/igfs_messages.h"
 
 namespace tensorflow {
 
-IGFSWritableFile::IGFSWritableFile(const std::string &file_name,
-                                   int64_t resource_id,
+IGFSWritableFile::IGFSWritableFile(const string &file_name, int64_t resource_id,
                                    std::shared_ptr client)
     : file_name_(file_name), resource_id_(resource_id), client_(client) {}
 
@@ -32,7 +31,7 @@ IGFSWritableFile::~IGFSWritableFile() {
   }
 }
 
-Status IGFSWritableFile::Append(const StringPiece &data) {
+Status IGFSWritableFile::Append(StringPiece data) {
   return client_->WriteBlock(resource_id_, (uint8_t *)data.data(), data.size());
 }
 
@@ -44,8 +43,18 @@ Status IGFSWritableFile::Close() {
   return client_->Close(&close_response, resource_to_be_closed);
 }
 
-Status IGFSWritableFile::Flush() { return Status::OK(); }
+Status IGFSWritableFile::Flush() { return Sync(); }
 
-Status IGFSWritableFile::Sync() { return Status::OK(); }
+Status IGFSWritableFile::Sync() {
+  CtrlResponse close_response = {false};
+  TF_RETURN_IF_ERROR(client_->Close(&close_response, resource_id_));
 
-}  // namespace tensorflow
\ No newline at end of file
+  CtrlResponse open_append_resp(false);
+  TF_RETURN_IF_ERROR(client_->OpenAppend(&open_append_resp, file_name_));
+
+  resource_id_ = open_append_resp.res.stream_id;
+
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/igfs/kernels/igfs_writable_file.h b/tensorflow/contrib/igfs/kernels/igfs_writable_file.h
index 352354a630f..504c82f1ab2 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_writable_file.h
+++ b/tensorflow/contrib/igfs/kernels/igfs_writable_file.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_WRITABLE_FILE_H_
 #define TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_WRITABLE_FILE_H_
 
-#include "igfs_client.h"
+#include "tensorflow/contrib/igfs/kernels/igfs_client.h"
 #include "tensorflow/core/platform/file_system.h"
 
 namespace tensorflow {
@@ -26,7 +26,7 @@ class IGFSWritableFile : public WritableFile {
   IGFSWritableFile(const string &file_name, int64_t resource_id,
                    std::shared_ptr client);
   ~IGFSWritableFile() override;
-  Status Append(const StringPiece &data) override;
+  Status Append(StringPiece data) override;
   Status Close() override;
   Status Flush() override;
   Status Sync() override;
@@ -39,4 +39,4 @@ class IGFSWritableFile : public WritableFile {
 
 }  // namespace tensorflow
 
-#endif
\ No newline at end of file
+#endif  // TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_WRITABLE_FILE_H_
diff --git a/tensorflow/contrib/igfs/ops/igfs_ops.cc b/tensorflow/contrib/igfs/ops/igfs_ops.cc
index 99e41a5dbb1..f49267d8b9c 100644
--- a/tensorflow/contrib/igfs/ops/igfs_ops.cc
+++ b/tensorflow/contrib/igfs/ops/igfs_ops.cc
@@ -13,11 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/framework/common_shape_fns.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/platform/env.h"
 
-#include "../kernels/igfs.h"
+#include "tensorflow/contrib/igfs/kernels/igfs.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/contrib/ignite/BUILD b/tensorflow/contrib/ignite/BUILD
index c79d35e6018..c4661630617 100644
--- a/tensorflow/contrib/ignite/BUILD
+++ b/tensorflow/contrib/ignite/BUILD
@@ -42,7 +42,7 @@ cc_library(
         "kernels/ignite_byte_swapper.h",
         "kernels/ignite_plain_client.h",
         "kernels/ignite_ssl_wrapper.h",
-        "kernels/ignite_ssl_wrapper.cc"
+        "kernels/ignite_ssl_wrapper.cc",
     ] + if_not_windows([
         "kernels/ignite_plain_client_unix.cc",
     ]) + if_windows([
@@ -52,8 +52,8 @@ cc_library(
         "-DWIN32_LEAN_AND_MEAN",
     ]),
     deps = [
-        "@boringssl//:ssl",
         "//tensorflow/core:framework_headers_lib",
+        "@boringssl//:ssl",
         "@protobuf_archive//:protobuf_headers",
     ],
 )
@@ -61,19 +61,19 @@ cc_library(
 cc_library(
     name = "dataset_kernels",
     srcs = [
-        "kernels/ignite_dataset_ops.cc",
-        "kernels/ignite_binary_object_parser.h",
         "kernels/ignite_binary_object_parser.cc",
-        "kernels/ignite_dataset.h",
+        "kernels/ignite_binary_object_parser.h",
         "kernels/ignite_dataset.cc",
-        "kernels/ignite_dataset_iterator.h",
+        "kernels/ignite_dataset.h",
         "kernels/ignite_dataset_iterator.cc",
+        "kernels/ignite_dataset_iterator.h",
+        "kernels/ignite_dataset_ops.cc",
     ],
     deps = [
         ":ignite_client",
         "//tensorflow/core:framework_headers_lib",
-        "@protobuf_archive//:protobuf_headers",
         "//third_party/eigen3",
+        "@protobuf_archive//:protobuf_headers",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/contrib/ignite/kernels/ignite_client.cc b/tensorflow/contrib/ignite/kernels/ignite_client.cc
deleted file mode 100644
index dea6484d594..00000000000
--- a/tensorflow/contrib/ignite/kernels/ignite_client.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "ignite_client.h"
-
-namespace tensorflow {
-
-Client::Client(bool big_endian) {
-  int x = 1;
-  bool is_little_endian = (*(char *)&x == 1);
-  swap_ = big_endian == is_little_endian;
-}
-
-}  // namespace tensorflow
\ No newline at end of file

From fdeac73758b1fd566c3c4a912a8918a58752489a Mon Sep 17 00:00:00 2001
From: Anton Dmitriev 
Date: Mon, 15 Oct 2018 14:55:20 +0300
Subject: [PATCH 059/540] Update Apache Ignite IGFS after review.

---
 tensorflow/contrib/igfs/__init__.py              |  3 ---
 tensorflow/contrib/igfs/kernels/igfs.cc          | 16 ++++++++--------
 .../igfs/kernels/igfs_extended_tcp_client.cc     |  4 ++--
 tensorflow/contrib/igfs/kernels/igfs_messages.cc |  8 ++++----
 tensorflow/contrib/igfs/kernels/igfs_messages.h  |  2 +-
 5 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/tensorflow/contrib/igfs/__init__.py b/tensorflow/contrib/igfs/__init__.py
index 5ef676337f0..c1de5388941 100644
--- a/tensorflow/contrib/igfs/__init__.py
+++ b/tensorflow/contrib/igfs/__init__.py
@@ -23,11 +23,8 @@ functionality to Hadoop HDFS, but only in-memory. In fact, in addition to
 its own APIs, IGFS implements Hadoop FileSystem API and can be transparently
 plugged into Hadoop or Spark deployments. This contrib package contains an
 intergration between IGFS and TensorFlow.
-
-@@IGFS
 """
 
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
diff --git a/tensorflow/contrib/igfs/kernels/igfs.cc b/tensorflow/contrib/igfs/kernels/igfs.cc
index 93bffc429e7..3615b8743fd 100644
--- a/tensorflow/contrib/igfs/kernels/igfs.cc
+++ b/tensorflow/contrib/igfs/kernels/igfs.cc
@@ -25,18 +25,12 @@ limitations under the License.
 
 namespace tensorflow {
 
-string GetEnvOrElse(const string &env, string default_value) {
+static string GetEnvOrElse(const string &env, string default_value) {
   const char *env_c_str = env.c_str();
   return getenv(env_c_str) != nullptr ? getenv(env_c_str) : default_value;
 }
 
-string IGFS::TranslateName(const string &name) const {
-  StringPiece scheme, namenode, path;
-  io::ParseURI(name, &scheme, &namenode, &path);
-  return string(path.data(), path.length());
-}
-
-string MakeRelative(const string &a, const string &b) {
+static string MakeRelative(const string &a, const string &b) {
   string max = a;
   string min = b;
   bool first = b.size() > a.size();
@@ -50,6 +44,12 @@ string MakeRelative(const string &a, const string &b) {
   return string((first ? r.first : r.second), first ? min.end() : max.end());
 }
 
+string IGFS::TranslateName(const string &name) const {
+  StringPiece scheme, namenode, path;
+  io::ParseURI(name, &scheme, &namenode, &path);
+  return string(path.data(), path.length());
+}
+
 IGFS::IGFS()
     : host_(GetEnvOrElse("IGFS_HOST", "localhost")),
       port_(atoi(GetEnvOrElse("IGFS_PORT", "10500").c_str())),
diff --git a/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.cc b/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.cc
index 78d8df3c17a..f7c5ad64b4a 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.cc
+++ b/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.cc
@@ -86,7 +86,7 @@ Status ExtendedTCPClient::ReadStringMap(std::map *res) {
     TF_RETURN_IF_ERROR(ReadString(&key));
     TF_RETURN_IF_ERROR(ReadString(&val));
 
-    res->insert(std::pair(key, val));
+    res->insert(std::pair(std::move(key), std::move(val)));
   }
 
   return Status::OK();
@@ -113,7 +113,7 @@ Status ExtendedTCPClient::WriteBool(bool val) {
 Status ExtendedTCPClient::WriteString(string str) {
   if (!str.empty()) {
     TF_RETURN_IF_ERROR(WriteBool(false));
-    unsigned short l = str.length();
+    size_t l = str.length();
     TF_RETURN_IF_ERROR(WriteShort(l));
     TF_RETURN_IF_ERROR(WriteData(reinterpret_cast(str.c_str()),
                                  str.length()));
diff --git a/tensorflow/contrib/igfs/kernels/igfs_messages.cc b/tensorflow/contrib/igfs/kernels/igfs_messages.cc
index a03ba3240e9..44368c7bb1f 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_messages.cc
+++ b/tensorflow/contrib/igfs/kernels/igfs_messages.cc
@@ -89,12 +89,12 @@ PathCtrlRequest::PathCtrlRequest(int32_t command_id_, const string &user_name,
                                  bool collocate,
                                  const std::map &properties)
     : Request(command_id_),
-      user_name_(std::move(user_name)),
-      path_(std::move(path)),
-      destination_path_(std::move(destination_path)),
+      user_name_(user_name),
+      path_(path),
+      destination_path_(destination_path),
       flag_(flag),
       collocate_(collocate),
-      props_(std::move(properties)) {}
+      props_(properties) {}
 
 Status PathCtrlRequest::Write(ExtendedTCPClient *client) const {
   TF_RETURN_IF_ERROR(Request::Write(client));
diff --git a/tensorflow/contrib/igfs/kernels/igfs_messages.h b/tensorflow/contrib/igfs/kernels/igfs_messages.h
index dcfbf12bebf..626bdc56eb7 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_messages.h
+++ b/tensorflow/contrib/igfs/kernels/igfs_messages.h
@@ -137,7 +137,7 @@ class ListResponse {
     int32_t len;
     TF_RETURN_IF_ERROR(client->ReadInt(&len));
 
-    entries = std::vector();
+    entries.clear();
 
     for (int32_t i = 0; i < len; i++) {
       T f = {};

From 3e0c8ab437a0de1c468f9282a1cc54b013224051 Mon Sep 17 00:00:00 2001
From: Anton Dmitriev 
Date: Mon, 15 Oct 2018 15:42:53 +0300
Subject: [PATCH 060/540] Update Apache Ignite IGFS after review.

---
 tensorflow/contrib/igfs/kernels/igfs.cc       | 33 ++++++++++---------
 tensorflow/contrib/igfs/kernels/igfs.h        |  2 +-
 .../igfs/kernels/igfs_extended_tcp_client.cc  |  2 ++
 .../igfs/kernels/igfs_random_access_file.cc   |  6 ++--
 .../igfs/kernels/igfs_random_access_file.h    |  4 +--
 .../igfs/kernels/igfs_writable_file.cc        |  6 ++--
 .../contrib/igfs/kernels/igfs_writable_file.h |  4 +--
 7 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/tensorflow/contrib/igfs/kernels/igfs.cc b/tensorflow/contrib/igfs/kernels/igfs.cc
index 3615b8743fd..5434be4d60a 100644
--- a/tensorflow/contrib/igfs/kernels/igfs.cc
+++ b/tensorflow/contrib/igfs/kernels/igfs.cc
@@ -65,7 +65,7 @@ IGFS::~IGFS() {
 
 Status IGFS::NewRandomAccessFile(const string &file_name,
                                  std::unique_ptr *result) {
-  std::shared_ptr client = CreateClient();
+  std::unique_ptr client = CreateClient();
   string path = TranslateName(file_name);
 
   CtrlResponse handshake_response(true);
@@ -75,7 +75,7 @@ Status IGFS::NewRandomAccessFile(const string &file_name,
   TF_RETURN_IF_ERROR(client->OpenRead(&open_read_response, path));
 
   long resource_id = open_read_response.res.stream_id;
-  result->reset(new IGFSRandomAccessFile(path, resource_id, client));
+  result->reset(new IGFSRandomAccessFile(path, resource_id, std::move(client)));
 
   LOG(INFO) << "New random access file completed successfully [file_name="
             << file_name << "]";
@@ -85,7 +85,7 @@ Status IGFS::NewRandomAccessFile(const string &file_name,
 
 Status IGFS::NewWritableFile(const string &file_name,
                              std::unique_ptr *result) {
-  std::shared_ptr client = CreateClient();
+  std::unique_ptr client = CreateClient();
   string path = TranslateName(file_name);
 
   CtrlResponse handshake_response(true);
@@ -103,7 +103,7 @@ Status IGFS::NewWritableFile(const string &file_name,
   TF_RETURN_IF_ERROR(client->OpenCreate(&open_create_resp, path));
 
   long resource_id = open_create_resp.res.stream_id;
-  result->reset(new IGFSWritableFile(path, resource_id, client));
+  result->reset(new IGFSWritableFile(path, resource_id, std::move(client)));
 
   LOG(INFO) << "New writable file completed successfully [file_name="
             << file_name << "]";
@@ -113,7 +113,7 @@ Status IGFS::NewWritableFile(const string &file_name,
 
 Status IGFS::NewAppendableFile(const string &file_name,
                                std::unique_ptr *result) {
-  std::shared_ptr client = CreateClient();
+  std::unique_ptr client = CreateClient();
 
   CtrlResponse handshake_response(true);
   TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
@@ -130,7 +130,8 @@ Status IGFS::NewAppendableFile(const string &file_name,
   TF_RETURN_IF_ERROR(client->OpenAppend(&open_append_resp, file_name));
 
   result->reset(new IGFSWritableFile(TranslateName(file_name),
-                                     open_append_resp.res.stream_id, client));
+                                     open_append_resp.res.stream_id,
+                                     std::move(client)));
 
   LOG(INFO) << "New appendable file completed successfully [file_name="
             << file_name << "]";
@@ -144,7 +145,7 @@ Status IGFS::NewReadOnlyMemoryRegionFromFile(
 }
 
 Status IGFS::FileExists(const string &file_name) {
-  std::shared_ptr client = CreateClient();
+  std::unique_ptr client = CreateClient();
   const string path = TranslateName(file_name);
 
   CtrlResponse handshake_response(true);
@@ -163,7 +164,7 @@ Status IGFS::FileExists(const string &file_name) {
 }
 
 Status IGFS::GetChildren(const string &file_name, std::vector *result) {
-  std::shared_ptr client = CreateClient();
+  std::unique_ptr client = CreateClient();
   string path = TranslateName(file_name);
   path = path + "/";
 
@@ -191,7 +192,7 @@ Status IGFS::GetMatchingPaths(const string &pattern,
 }
 
 Status IGFS::DeleteFile(const string &file_name) {
-  std::shared_ptr client = CreateClient();
+  std::unique_ptr client = CreateClient();
   string path = TranslateName(file_name);
 
   CtrlResponse handshake_response(true);
@@ -210,7 +211,7 @@ Status IGFS::DeleteFile(const string &file_name) {
 }
 
 Status IGFS::CreateDir(const string &file_name) {
-  std::shared_ptr client = CreateClient();
+  std::unique_ptr client = CreateClient();
   const string path = TranslateName(file_name);
 
   CtrlResponse handshake_response(true);
@@ -229,7 +230,7 @@ Status IGFS::CreateDir(const string &file_name) {
 }
 
 Status IGFS::DeleteDir(const string &file_name) {
-  std::shared_ptr client = CreateClient();
+  std::unique_ptr client = CreateClient();
   string path = TranslateName(file_name);
 
   CtrlResponse handshake_response(true);
@@ -252,7 +253,7 @@ Status IGFS::DeleteDir(const string &file_name) {
 }
 
 Status IGFS::GetFileSize(const string &file_name, uint64 *size) {
-  std::shared_ptr client = CreateClient();
+  std::unique_ptr client = CreateClient();
   string path = TranslateName(file_name);
 
   CtrlResponse handshake_response(true);
@@ -270,7 +271,7 @@ Status IGFS::GetFileSize(const string &file_name, uint64 *size) {
 }
 
 Status IGFS::RenameFile(const string &src, const string &dst) {
-  std::shared_ptr client = CreateClient();
+  std::unique_ptr client = CreateClient();
   string src_path = TranslateName(src);
   string dst_path = TranslateName(dst);
 
@@ -292,7 +293,7 @@ Status IGFS::RenameFile(const string &src, const string &dst) {
 }
 
 Status IGFS::Stat(const string &file_name, FileStatistics *stats) {
-  std::shared_ptr client = CreateClient();
+  std::unique_ptr client = CreateClient();
   string path = TranslateName(file_name);
 
   CtrlResponse handshake_response(true);
@@ -311,8 +312,8 @@ Status IGFS::Stat(const string &file_name, FileStatistics *stats) {
   return Status::OK();
 }
 
-std::shared_ptr IGFS::CreateClient() const {
-  return std::shared_ptr(
+std::unique_ptr IGFS::CreateClient() const {
+  return std::unique_ptr(
       new IGFSClient(host_, port_, fs_name_, ""));
 }
 
diff --git a/tensorflow/contrib/igfs/kernels/igfs.h b/tensorflow/contrib/igfs/kernels/igfs.h
index c2dff7ebc81..a05f620ff41 100644
--- a/tensorflow/contrib/igfs/kernels/igfs.h
+++ b/tensorflow/contrib/igfs/kernels/igfs.h
@@ -48,7 +48,7 @@ class IGFS : public FileSystem {
   string TranslateName(const string& name) const override;
 
  private:
-  std::shared_ptr CreateClient() const;
+  std::unique_ptr CreateClient() const;
 
   const string host_;
   const int port_;
diff --git a/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.cc b/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.cc
index f7c5ad64b4a..e12d6708b4e 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.cc
+++ b/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.cc
@@ -114,6 +114,8 @@ Status ExtendedTCPClient::WriteString(string str) {
   if (!str.empty()) {
     TF_RETURN_IF_ERROR(WriteBool(false));
     size_t l = str.length();
+    if (l > 0xFFFF) return errors::InvalidArgument("String is too long");
+
     TF_RETURN_IF_ERROR(WriteShort(l));
     TF_RETURN_IF_ERROR(WriteData(reinterpret_cast(str.c_str()),
                                  str.length()));
diff --git a/tensorflow/contrib/igfs/kernels/igfs_random_access_file.cc b/tensorflow/contrib/igfs/kernels/igfs_random_access_file.cc
index c078bc5e3fb..d44bb031b20 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_random_access_file.cc
+++ b/tensorflow/contrib/igfs/kernels/igfs_random_access_file.cc
@@ -20,8 +20,10 @@ namespace tensorflow {
 
 IGFSRandomAccessFile::IGFSRandomAccessFile(const string &file_name,
                                            int64_t resource_id,
-                                           std::shared_ptr client)
-    : file_name_(file_name), resource_id_(resource_id), client_(client) {}
+                                           std::unique_ptr &&client)
+    : file_name_(file_name),
+      resource_id_(resource_id),
+      client_(std::move(client)) {}
 
 IGFSRandomAccessFile::~IGFSRandomAccessFile() {
   CtrlResponse close_response = {false};
diff --git a/tensorflow/contrib/igfs/kernels/igfs_random_access_file.h b/tensorflow/contrib/igfs/kernels/igfs_random_access_file.h
index 48b2d097c5c..b25426134ca 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_random_access_file.h
+++ b/tensorflow/contrib/igfs/kernels/igfs_random_access_file.h
@@ -24,7 +24,7 @@ namespace tensorflow {
 class IGFSRandomAccessFile : public RandomAccessFile {
  public:
   IGFSRandomAccessFile(const string &file_name, int64_t resource_id,
-                       std::shared_ptr client);
+                       std::unique_ptr &&client);
   ~IGFSRandomAccessFile() override;
   Status Read(uint64 offset, size_t n, StringPiece *result,
               char *scratch) const override;
@@ -32,7 +32,7 @@ class IGFSRandomAccessFile : public RandomAccessFile {
  private:
   const string file_name_;
   const int64_t resource_id_;
-  std::shared_ptr client_;
+  std::unique_ptr client_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/igfs/kernels/igfs_writable_file.cc b/tensorflow/contrib/igfs/kernels/igfs_writable_file.cc
index 6e523e579ad..6eb335d312a 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_writable_file.cc
+++ b/tensorflow/contrib/igfs/kernels/igfs_writable_file.cc
@@ -19,8 +19,10 @@ limitations under the License.
 namespace tensorflow {
 
 IGFSWritableFile::IGFSWritableFile(const string &file_name, int64_t resource_id,
-                                   std::shared_ptr client)
-    : file_name_(file_name), resource_id_(resource_id), client_(client) {}
+                                   std::unique_ptr &&client)
+    : file_name_(file_name),
+      resource_id_(resource_id),
+      client_(std::move(client)) {}
 
 IGFSWritableFile::~IGFSWritableFile() {
   if (resource_id_ >= 0) {
diff --git a/tensorflow/contrib/igfs/kernels/igfs_writable_file.h b/tensorflow/contrib/igfs/kernels/igfs_writable_file.h
index 504c82f1ab2..f888beaa4eb 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_writable_file.h
+++ b/tensorflow/contrib/igfs/kernels/igfs_writable_file.h
@@ -24,7 +24,7 @@ namespace tensorflow {
 class IGFSWritableFile : public WritableFile {
  public:
   IGFSWritableFile(const string &file_name, int64_t resource_id,
-                   std::shared_ptr client);
+                   std::unique_ptr &&client);
   ~IGFSWritableFile() override;
   Status Append(StringPiece data) override;
   Status Close() override;
@@ -34,7 +34,7 @@ class IGFSWritableFile : public WritableFile {
  private:
   const string file_name_;
   int64_t resource_id_;
-  std::shared_ptr client_;
+  std::unique_ptr client_;
 };
 
 }  // namespace tensorflow

From 29bb7f4f0158510b9c33fa738d946ef78cc50b30 Mon Sep 17 00:00:00 2001
From: Anton Dmitriev 
Date: Fri, 2 Nov 2018 17:29:34 +0300
Subject: [PATCH 061/540] Update Apache Ignite IGFS after review.

---
 tensorflow/contrib/BUILD                      |   2 -
 tensorflow/contrib/igfs/BUILD                 | 131 ------------------
 tensorflow/contrib/igfs/README.md             |  34 -----
 .../contrib/igfs/python/ops/igfs_ops.py       |  29 ----
 .../contrib/igfs/python/tests/start_ignite.sh |  22 ---
 .../contrib/igfs/python/tests/stop_ignite.sh  |  17 ---
 tensorflow/contrib/ignite/BUILD               | 118 +++++++++++++---
 tensorflow/contrib/ignite/README.md           |  45 +++++-
 .../{ => client}/ignite_byte_swapper.h        |  31 ++---
 .../kernels/{ => client}/ignite_client.h      |  18 +--
 .../{ => client}/ignite_plain_client.h        |   2 +-
 .../{ => client}/ignite_plain_client_unix.cc  |   2 +-
 .../ignite_plain_client_windows.cc            |   2 +-
 .../{ => client}/ignite_ssl_wrapper.cc        |   2 +-
 .../kernels/{ => client}/ignite_ssl_wrapper.h |   2 +-
 .../ignite_binary_object_parser.cc            |   2 +-
 .../ignite_binary_object_parser.h             |   2 +-
 .../kernels/{ => dataset}/ignite_dataset.cc   |   2 +-
 .../kernels/{ => dataset}/ignite_dataset.h    |   0
 .../{ => dataset}/ignite_dataset_iterator.cc  |  10 +-
 .../{ => dataset}/ignite_dataset_iterator.h   |   6 +-
 .../{ => dataset}/ignite_dataset_ops.cc       |   4 +-
 .../kernels => ignite/kernels/igfs}/igfs.cc   |   8 +-
 .../kernels => ignite/kernels/igfs}/igfs.h    |   2 +-
 .../kernels/igfs}/igfs_client.cc              |   2 +-
 .../kernels/igfs}/igfs_client.h               |  43 +++---
 .../kernels/igfs}/igfs_extended_tcp_client.cc |   5 +-
 .../kernels/igfs}/igfs_extended_tcp_client.h  |   2 +-
 .../kernels/igfs}/igfs_messages.cc            |   2 +-
 .../kernels/igfs}/igfs_messages.h             |   2 +-
 .../kernels/igfs}/igfs_random_access_file.cc  |   4 +-
 .../kernels/igfs}/igfs_random_access_file.h   |   2 +-
 .../kernels/igfs}/igfs_writable_file.cc       |   4 +-
 .../kernels/igfs}/igfs_writable_file.h        |   2 +-
 .../contrib/{igfs => ignite}/ops/igfs_ops.cc  |   2 +-
 .../python/ops/igfs_op_loader.py              |   0
 .../python/ops/igfs_ops.py}                   |  13 +-
 .../python/tests/bin/start-igfs.sh            |   0
 .../tests/config/ignite-config-igfs.xml       |   0
 .../python/tests/igfs_test.py                 |   2 +-
 .../ignite/python/tests/start_ignite.sh       |   4 +
 .../ignite/python/tests/stop_ignite.sh        |   3 +-
 42 files changed, 238 insertions(+), 347 deletions(-)
 delete mode 100644 tensorflow/contrib/igfs/BUILD
 delete mode 100644 tensorflow/contrib/igfs/README.md
 delete mode 100644 tensorflow/contrib/igfs/python/ops/igfs_ops.py
 delete mode 100755 tensorflow/contrib/igfs/python/tests/start_ignite.sh
 delete mode 100755 tensorflow/contrib/igfs/python/tests/stop_ignite.sh
 rename tensorflow/contrib/ignite/kernels/{ => client}/ignite_byte_swapper.h (72%)
 rename tensorflow/contrib/ignite/kernels/{ => client}/ignite_client.h (81%)
 rename tensorflow/contrib/ignite/kernels/{ => client}/ignite_plain_client.h (95%)
 rename tensorflow/contrib/ignite/kernels/{ => client}/ignite_plain_client_unix.cc (97%)
 rename tensorflow/contrib/ignite/kernels/{ => client}/ignite_plain_client_windows.cc (98%)
 rename tensorflow/contrib/ignite/kernels/{ => client}/ignite_ssl_wrapper.cc (98%)
 rename tensorflow/contrib/ignite/kernels/{ => client}/ignite_ssl_wrapper.h (95%)
 rename tensorflow/contrib/ignite/kernels/{ => dataset}/ignite_binary_object_parser.cc (99%)
 rename tensorflow/contrib/ignite/kernels/{ => dataset}/ignite_binary_object_parser.h (97%)
 rename tensorflow/contrib/ignite/kernels/{ => dataset}/ignite_dataset.cc (97%)
 rename tensorflow/contrib/ignite/kernels/{ => dataset}/ignite_dataset.h (100%)
 rename tensorflow/contrib/ignite/kernels/{ => dataset}/ignite_dataset_iterator.cc (98%)
 rename tensorflow/contrib/ignite/kernels/{ => dataset}/ignite_dataset_iterator.h (93%)
 rename tensorflow/contrib/ignite/kernels/{ => dataset}/ignite_dataset_ops.cc (97%)
 rename tensorflow/contrib/{igfs/kernels => ignite/kernels/igfs}/igfs.cc (97%)
 rename tensorflow/contrib/{igfs/kernels => ignite/kernels/igfs}/igfs.h (97%)
 rename tensorflow/contrib/{igfs/kernels => ignite/kernels/igfs}/igfs_client.cc (95%)
 rename tensorflow/contrib/{igfs/kernels => ignite/kernels/igfs}/igfs_client.h (62%)
 rename tensorflow/contrib/{igfs/kernels => ignite/kernels/igfs}/igfs_extended_tcp_client.cc (95%)
 rename tensorflow/contrib/{igfs/kernels => ignite/kernels/igfs}/igfs_extended_tcp_client.h (95%)
 rename tensorflow/contrib/{igfs/kernels => ignite/kernels/igfs}/igfs_messages.cc (99%)
 rename tensorflow/contrib/{igfs/kernels => ignite/kernels/igfs}/igfs_messages.h (99%)
 rename tensorflow/contrib/{igfs/kernels => ignite/kernels/igfs}/igfs_random_access_file.cc (92%)
 rename tensorflow/contrib/{igfs/kernels => ignite/kernels/igfs}/igfs_random_access_file.h (95%)
 rename tensorflow/contrib/{igfs/kernels => ignite/kernels/igfs}/igfs_writable_file.cc (93%)
 rename tensorflow/contrib/{igfs/kernels => ignite/kernels/igfs}/igfs_writable_file.h (95%)
 rename tensorflow/contrib/{igfs => ignite}/ops/igfs_ops.cc (93%)
 rename tensorflow/contrib/{igfs => ignite}/python/ops/igfs_op_loader.py (100%)
 rename tensorflow/contrib/{igfs/__init__.py => ignite/python/ops/igfs_ops.py} (76%)
 rename tensorflow/contrib/{igfs => ignite}/python/tests/bin/start-igfs.sh (100%)
 rename tensorflow/contrib/{igfs => ignite}/python/tests/config/ignite-config-igfs.xml (100%)
 rename tensorflow/contrib/{igfs => ignite}/python/tests/igfs_test.py (98%)

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 40863def8a4..832db0f4ab4 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -157,7 +157,6 @@ py_library(
         "//tensorflow:no_ignite_support": [],
         "//conditions:default": [
             "//tensorflow/contrib/ignite",
-            "//tensorflow/contrib/igfs",
         ],
     }),
 )
@@ -249,7 +248,6 @@ cc_library(
         "//tensorflow:no_ignite_support": [],
         "//conditions:default": [
             "//tensorflow/contrib/ignite:dataset_ops_op_lib",
-            "//tensorflow/contrib/igfs:igfs_ops_op_lib",
         ],
     }),
 )
diff --git a/tensorflow/contrib/igfs/BUILD b/tensorflow/contrib/igfs/BUILD
deleted file mode 100644
index b9983d09574..00000000000
--- a/tensorflow/contrib/igfs/BUILD
+++ /dev/null
@@ -1,131 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_custom_op_library",
-    "tf_custom_op_py_library",
-    "tf_gen_op_libs",
-    "tf_gen_op_wrapper_py",
-    "tf_kernel_library",
-    "tf_py_test",
-)
-
-py_library(
-    name = "igfs",
-    srcs = ["__init__.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":igfs_ops",
-    ],
-)
-
-tf_custom_op_library(
-    name = "_igfs_ops.so",
-    srcs = [
-        "kernels/igfs.h",
-        "ops/igfs_ops.cc",
-    ],
-    deps = [":igfs_kernels"],
-)
-
-tf_gen_op_libs(
-    op_lib_names = ["igfs_ops"],
-    deps = [":igfs_kernels"],
-)
-
-cc_library(
-    name = "igfs_kernels",
-    srcs = [
-        "kernels/igfs.cc",
-        "kernels/igfs.h",
-        "kernels/igfs_client.cc",
-        "kernels/igfs_client.h",
-        "kernels/igfs_extended_tcp_client.cc",
-        "kernels/igfs_extended_tcp_client.h",
-        "kernels/igfs_messages.cc",
-        "kernels/igfs_messages.h",
-        "kernels/igfs_random_access_file.cc",
-        "kernels/igfs_random_access_file.h",
-        "kernels/igfs_writable_file.cc",
-        "kernels/igfs_writable_file.h",
-    ],
-    deps = [
-        "//tensorflow/contrib/ignite:ignite_client",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-    alwayslink = 1,
-)
-
-py_library(
-    name = "igfs_ops",
-    srcs = [
-        "python/ops/igfs_ops.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":igfs_op_loader",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/util:nest",
-    ],
-)
-
-tf_gen_op_wrapper_py(
-    name = "gen_igfs_ops",
-    out = "python/ops/gen_igfs_ops.py",
-    deps = [":igfs_ops_op_lib"],
-)
-
-tf_kernel_library(
-    name = "igfs_ops_kernels",
-    deps = [
-        ":igfs_kernels",
-        "//tensorflow/core:framework",
-    ],
-    alwayslink = 1,
-)
-
-tf_custom_op_py_library(
-    name = "igfs_op_loader",
-    srcs = ["python/ops/igfs_op_loader.py"],
-    dso = [":_igfs_ops.so"],
-    kernels = [
-        ":igfs_ops_kernels",
-        ":igfs_ops_op_lib",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":gen_igfs_ops",
-        "//tensorflow/contrib/util:util_py",
-        "//tensorflow/python:platform",
-    ],
-)
-
-# The Apache Ignite servers have to setup before the test and tear down
-# after the test manually. The docker engine has to be installed.
-#
-# To setup Apache Ignite servers:
-# $ bash ./python/tests/start_ignite.sh
-#
-# To tear down Apache Ignite servers:
-# $ bash ./python/tests/stop_ignite.sh
-tf_py_test(
-    name = "igfs_test_py",
-    srcs = ["python/tests/igfs_test.py"],
-    additional_deps = [
-        ":igfs",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-    ],
-    tags = [
-        "manual",
-        "no_windows",
-        "notap",
-    ],
-)
diff --git a/tensorflow/contrib/igfs/README.md b/tensorflow/contrib/igfs/README.md
deleted file mode 100644
index a67be4a410e..00000000000
--- a/tensorflow/contrib/igfs/README.md
+++ /dev/null
@@ -1,34 +0,0 @@
-# IGFS (Ignite File System)
-
-- [Overview](#overview)
-- [Try it out](#try-it-out)
-
-## Overview
-
-[Apache Ignite](https://ignite.apache.org/) is a memory-centric distributed database, caching, and processing platform for
-transactional, analytical, and streaming workloads, delivering in-memory speeds at petabyte scale. In addition to database functionality Apache Ignite provides a distributed file system called [IGFS](https://ignite.apache.org/features/igfs.html). IGFS delivers a similar functionality to Hadoop HDFS, but only in-memory. In fact, in addition to its own APIs, IGFS implements Hadoop FileSystem API and can be transparently plugged into Hadoop or Spark deployments. This contrib package contains an intergration between IGFS and TensorFlow. The integration is based on [custom filesystem plugin](https://www.tensorflow.org/extend/add_filesys) from TensorFlow side and [IGFS Native API](https://ignite.apache.org/features/igfs.html) from Apache Ignite side. It has numerous uses, for example:
-* Checkpoints of state can be saved to IGFS for reliability and fault-tolerance.
-* Training processes communicate with TensorBoard by writing event files to a directory, which TensorBoard watches. IGFS allows this communication to work even when TensorBoard runs in a different process or machine.
-
-## Try it out
-
-The simplest way to try IGFS with TensorFlow is to run [Docker](https://www.docker.com/) container with Apache Ignite and enabled IGFS and then interruct with it using TensorFlow [tf.gfile](https://www.tensorflow.org/api_docs/python/tf/gfile). Such container is available on Docker Hub: [dmitrievanthony/ignite-with-igfs](https://hub.docker.com/r/dmitrievanthony/ignite-with-igfs/). You need to start this container on your machine:
-
-```
-docker run -it -p 10500:10500 dmitrievanthony/ignite-with-igfs
-```
-
-After that you will be able to work with it following way:
-
-```python
->>> import tensorflow as tf
->>> import tensorflow.contrib.igfs.python.ops.igfs_ops
->>> 
->>> with tf.gfile.Open("igfs:///hello.txt", mode='w') as w:
->>>   w.write("Hello, world!")
->>>
->>> with tf.gfile.Open("igfs:///hello.txt", mode='r') as r:
->>>   print(r.read())
-
-Hello, world!
-```
diff --git a/tensorflow/contrib/igfs/python/ops/igfs_ops.py b/tensorflow/contrib/igfs/python/ops/igfs_ops.py
deleted file mode 100644
index 5c02ddcd9ad..00000000000
--- a/tensorflow/contrib/igfs/python/ops/igfs_ops.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""IGFS."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-from tensorflow.contrib.ignite.python.ops import ignite_op_loader  # pylint: disable=unused-import
-from tensorflow.python.platform import resource_loader
-from tensorflow.python.framework import load_library
-
-file_system_library = os.path.join(resource_loader.get_data_files_path(),
-                                   "../../_igfs_ops.so")
-load_library.load_file_system_library(file_system_library)
diff --git a/tensorflow/contrib/igfs/python/tests/start_ignite.sh b/tensorflow/contrib/igfs/python/tests/start_ignite.sh
deleted file mode 100755
index d48bed6b45d..00000000000
--- a/tensorflow/contrib/igfs/python/tests/start_ignite.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-IGNITE_VERSION=2.6.0
-SCRIPT_PATH="$( cd "$(dirname "$0")" ; pwd -P )"
-
-# Start Apache Ignite with IGFS.
-docker run -itd --name ignite-igfs -p 10500:10500 \
--v ${SCRIPT_PATH}:/data apacheignite/ignite:${IGNITE_VERSION} /data/bin/start-igfs.sh
diff --git a/tensorflow/contrib/igfs/python/tests/stop_ignite.sh b/tensorflow/contrib/igfs/python/tests/stop_ignite.sh
deleted file mode 100755
index ff297291cf5..00000000000
--- a/tensorflow/contrib/igfs/python/tests/stop_ignite.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-docker rm -f ignite-igfs
diff --git a/tensorflow/contrib/ignite/BUILD b/tensorflow/contrib/ignite/BUILD
index c4661630617..e486d85a4d1 100644
--- a/tensorflow/contrib/ignite/BUILD
+++ b/tensorflow/contrib/ignite/BUILD
@@ -22,6 +22,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":dataset_ops",
+        ":igfs_ops",
     ],
 )
 
@@ -31,22 +32,36 @@ tf_custom_op_library(
     deps = [":dataset_kernels"],
 )
 
+tf_custom_op_library(
+    name = "_igfs_ops.so",
+    srcs = [
+        "kernels/igfs/igfs.h",
+        "ops/igfs_ops.cc",
+    ],
+    deps = [":igfs_kernels"],
+)
+
 tf_gen_op_libs(
     op_lib_names = ["dataset_ops"],
 )
 
+tf_gen_op_libs(
+    op_lib_names = ["igfs_ops"],
+    deps = [":igfs_kernels"],
+)
+
 cc_library(
     name = "ignite_client",
     srcs = [
-        "kernels/ignite_client.h",
-        "kernels/ignite_byte_swapper.h",
-        "kernels/ignite_plain_client.h",
-        "kernels/ignite_ssl_wrapper.h",
-        "kernels/ignite_ssl_wrapper.cc",
+        "kernels/client/ignite_client.h",
+        "kernels/client/ignite_byte_swapper.h",
+        "kernels/client/ignite_plain_client.h",
+        "kernels/client/ignite_ssl_wrapper.h",
+        "kernels/client/ignite_ssl_wrapper.cc",
     ] + if_not_windows([
-        "kernels/ignite_plain_client_unix.cc",
+        "kernels/client/ignite_plain_client_unix.cc",
     ]) + if_windows([
-        "kernels/ignite_plain_client_windows.cc",
+        "kernels/client/ignite_plain_client_windows.cc",
     ]),
     copts = if_windows([
         "-DWIN32_LEAN_AND_MEAN",
@@ -61,13 +76,13 @@ cc_library(
 cc_library(
     name = "dataset_kernels",
     srcs = [
-        "kernels/ignite_binary_object_parser.cc",
-        "kernels/ignite_binary_object_parser.h",
-        "kernels/ignite_dataset.cc",
-        "kernels/ignite_dataset.h",
-        "kernels/ignite_dataset_iterator.cc",
-        "kernels/ignite_dataset_iterator.h",
-        "kernels/ignite_dataset_ops.cc",
+        "kernels/dataset/ignite_binary_object_parser.cc",
+        "kernels/dataset/ignite_binary_object_parser.h",
+        "kernels/dataset/ignite_dataset.cc",
+        "kernels/dataset/ignite_dataset.h",
+        "kernels/dataset/ignite_dataset_iterator.cc",
+        "kernels/dataset/ignite_dataset_iterator.h",
+        "kernels/dataset/ignite_dataset_ops.cc",
     ],
     deps = [
         ":ignite_client",
@@ -78,6 +93,30 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "igfs_kernels",
+    srcs = [
+        "kernels/igfs/igfs.cc",
+        "kernels/igfs/igfs.h",
+        "kernels/igfs/igfs_client.cc",
+        "kernels/igfs/igfs_client.h",
+        "kernels/igfs/igfs_extended_tcp_client.cc",
+        "kernels/igfs/igfs_extended_tcp_client.h",
+        "kernels/igfs/igfs_messages.cc",
+        "kernels/igfs/igfs_messages.h",
+        "kernels/igfs/igfs_random_access_file.cc",
+        "kernels/igfs/igfs_random_access_file.h",
+        "kernels/igfs/igfs_writable_file.cc",
+        "kernels/igfs/igfs_writable_file.h",
+    ],
+    deps = [
+        "//tensorflow/contrib/ignite:ignite_client",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+    alwayslink = 1,
+)
+
 py_library(
     name = "dataset_ops",
     srcs = [
@@ -93,12 +132,31 @@ py_library(
     ],
 )
 
+py_library(
+    name = "igfs_ops",
+    srcs = [
+        "python/ops/igfs_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":igfs_op_loader",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/util:nest",
+    ],
+)
+
 tf_gen_op_wrapper_py(
     name = "gen_dataset_ops",
     out = "python/ops/gen_dataset_ops.py",
     deps = ["//tensorflow/contrib/ignite:dataset_ops_op_lib"],
 )
 
+tf_gen_op_wrapper_py(
+    name = "gen_igfs_ops",
+    out = "python/ops/gen_igfs_ops.py",
+    deps = [":igfs_ops_op_lib"],
+)
+
 tf_kernel_library(
     name = "dataset_ops_kernels",
     deps = [
@@ -108,6 +166,15 @@ tf_kernel_library(
     alwayslink = 1,
 )
 
+tf_kernel_library(
+    name = "igfs_ops_kernels",
+    deps = [
+        ":igfs_kernels",
+        "//tensorflow/core:framework",
+    ],
+    alwayslink = 1,
+)
+
 tf_custom_op_py_library(
     name = "ignite_op_loader",
     srcs = ["python/ops/ignite_op_loader.py"],
@@ -124,6 +191,22 @@ tf_custom_op_py_library(
     ],
 )
 
+tf_custom_op_py_library(
+    name = "igfs_op_loader",
+    srcs = ["python/ops/igfs_op_loader.py"],
+    dso = [":_igfs_ops.so"],
+    kernels = [
+        ":igfs_ops_kernels",
+        ":igfs_ops_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":gen_igfs_ops",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:platform",
+    ],
+)
+
 # The Apache Ignite servers have to setup before the test and tear down
 # after the test manually. The docker engine has to be installed.
 #
@@ -133,8 +216,11 @@ tf_custom_op_py_library(
 # To tear down Apache Ignite servers:
 # $ bash ./python/tests/stop_ignite.sh
 tf_py_test(
-    name = "ignite_dataset_test",
-    srcs = ["python/tests/ignite_dataset_test.py"],
+    name = "ignite_test",
+    srcs = [
+        "python/tests/igfs_test.py",
+        "python/tests/ignite_dataset_test.py",
+    ],
     additional_deps = [
         ":ignite",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/ignite/README.md b/tensorflow/contrib/ignite/README.md
index 55c89d27996..040e4587243 100644
--- a/tensorflow/contrib/ignite/README.md
+++ b/tensorflow/contrib/ignite/README.md
@@ -1,19 +1,22 @@
-# Ignite Dataset
+# Apache Ignite Integration
 
 - [Overview](#overview)
 - [Features](#features)
   * [Distributed In-Memory Datasource](#distributed-in-memory-datasource)
   * [Structured Objects](#structured-objects)
   * [Distributed Training](#distributed-training)
+  * [Distributed File System](#distributed-file-system)
   * [SSL Connection](#ssl-connection)
   * [Windows Support](#windows-support)
 - [Try it out](#try-it-out)
+  * [Ignite Dataset](#ignite-dataset)
+  * [IGFS](#igfs)
 - [Limitations](#limitations)
 
 ## Overview
 
 [Apache Ignite](https://ignite.apache.org/) is a memory-centric distributed database, caching, and processing platform for
-transactional, analytical, and streaming workloads, delivering in-memory speeds at petabyte scale. This contrib package contains an integration between Apache Ignite and TensorFlow. The integration is based on [tf.data](https://www.tensorflow.org/api_docs/python/tf/data) from TensorFlow side and [Binary Client Protocol](https://apacheignite.readme.io/v2.6/docs/binary-client-protocol) from Apache Ignite side. It allows to use Apache Ignite as a data source for neural network training, inference and all other computations supported by TensorFlow. 
+transactional, analytical, and streaming workloads, delivering in-memory speeds at petabyte scale. This contrib package contains an integration between Apache Ignite and TensorFlow. The integration is based on [tf.data](https://www.tensorflow.org/api_docs/python/tf/data) from TensorFlow side and [Binary Client Protocol](https://apacheignite.readme.io/v2.6/docs/binary-client-protocol) from Apache Ignite side. It allows to use Apache Ignite as a data source for neural network training, inference and all other computations supported by TensorFlow. Another part of this module is an integration with a distributed file system based on Apache Ignite.
 
 ## Features
 
@@ -134,6 +137,12 @@ Ignite Dataset allows using these two aspects of distributed neural network trai
 
 High-level TensorFlow API for [distributed training](https://www.tensorflow.org/api_docs/python/tf/contrib/distribute/DistributionStrategy) is supported as well. 
 
+### Distributed File System
+
+In addition to database functionality Apache Ignite provides a distributed file system called [IGFS](https://ignite.apache.org/features/igfs.html). IGFS delivers a similar functionality to Hadoop HDFS, but only in-memory. In fact, in addition to its own APIs, IGFS implements Hadoop FileSystem API and can be transparently plugged into Hadoop or Spark deployments. This contrib package contains an integration between IGFS and TensorFlow. The integration is based on [custom filesystem plugin](https://www.tensorflow.org/extend/add_filesys) from TensorFlow side and [IGFS Native API](https://ignite.apache.org/features/igfs.html) from Apache Ignite side. It has numerous uses, for example:
+* Checkpoints of state can be saved to IGFS for reliability and fault-tolerance.
+* Training processes communicate with TensorBoard by writing event files to a directory, which TensorBoard watches. IGFS allows this communication to work even when TensorBoard runs in a different process or machine.
+
 ### SSL Connection
 
 Apache Ignite allows to protect data transfer channels by [SSL](https://en.wikipedia.org/wiki/Transport_Layer_Security) and authentification. Ignite Dataset supports both SSL connection with and without authntication. For more information, please refer to the [Apache Ignite SSL/TLS](https://apacheignite.readme.io/docs/ssltls) documentation.
@@ -142,8 +151,11 @@ Apache Ignite allows to protect data transfer channels by [SSL](https://en.wikip
 >>> import tensorflow as tf
 >>> from tensorflow.contrib.ignite import IgniteDataset
 >>> 
->>> dataset = IgniteDataset(cache_name="IMAGES", certfile="client.pem", cert_password="password", username="ignite", password="ignite")
->>> ...
+>>> dataset = IgniteDataset(cache_name="IMAGES", 
+                            certfile="client.pem", 
+                            cert_password="password", 
+                            username="ignite", 
+                            password="ignite")
 ```
 
 ### Windows Support
@@ -152,6 +164,9 @@ Ignite Dataset is fully compatible with Windows. You can use it as part of Tenso
 
 ## Try it out
 
+The following examples will help you to easily start working with this module.
+
+### Ignite Dataset
 The simplest way to try Ignite Dataset is to run a [Docker](https://www.docker.com/) container with Apache Ignite and loaded [MNIST](http://yann.lecun.com/exdb/mnist/) data and after start interruct with it using Ignite Dataset. Such container is available on Docker Hub: [dmitrievanthony/ignite-with-mnist](https://hub.docker.com/r/dmitrievanthony/ignite-with-mnist/). You need to start this container on your machine:
 
 ```
@@ -162,6 +177,28 @@ After that you will be able to work with it following way:
 
 ![ignite-dataset-mnist](https://s3.amazonaws.com/helloworld23423423ew23/ignite-dataset-mnist.png "Ignite Dataset Mnist")
 
+### IGFS
+The simplest way to try IGFS with TensorFlow is to run [Docker](https://www.docker.com/) container with Apache Ignite and enabled IGFS and then interact with it using TensorFlow [tf.gfile](https://www.tensorflow.org/api_docs/python/tf/gfile). Such container is available on Docker Hub: [dmitrievanthony/ignite-with-igfs](https://hub.docker.com/r/dmitrievanthony/ignite-with-igfs/). You need to start this container on your machine:
+
+```
+docker run -it -p 10500:10500 dmitrievanthony/ignite-with-igfs
+```
+
+After that you will be able to work with it in the following way:
+
+```python
+>>> import tensorflow as tf
+>>> import tensorflow.contrib.ignite.python.ops.igfs_ops
+>>> 
+>>> with tf.gfile.Open("igfs:///hello.txt", mode='w') as w:
+>>>   w.write("Hello, world!")
+>>>
+>>> with tf.gfile.Open("igfs:///hello.txt", mode='r') as r:
+>>>   print(r.read())
+
+Hello, world!
+```
+
 ## Limitations
 
 Presently, Ignite Dataset works with assumption that all objects in the cache have the same structure (homogeneous objects) and the cache contains at least one object. Another limitation concerns structured objects, Ignite Dataset does not support UUID, Maps and Object arrays that might be parts of an object structure.
diff --git a/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h b/tensorflow/contrib/ignite/kernels/client/ignite_byte_swapper.h
similarity index 72%
rename from tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h
rename to tensorflow/contrib/ignite/kernels/client/ignite_byte_swapper.h
index 46df3e39dc4..660dde449d3 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h
+++ b/tensorflow/contrib/ignite/kernels/client/ignite_byte_swapper.h
@@ -25,76 +25,75 @@ class ByteSwapper {
  public:
   ByteSwapper(bool big_endian) { swap_ = big_endian == port::kLittleEndian; }
 
-  inline void SwapIfRequiredInt16(int16_t *x) const {
+  void SwapIfRequiredInt16(int16_t *x) const {
     if (swap_) {
       Swap16(x);
     }
   }
 
-  inline void SwapIfRequiredUnsignedInt16(uint16_t *x) const {
+  void SwapIfRequiredUnsignedInt16(uint16_t *x) const {
     if (swap_) {
       Swap16(reinterpret_cast(x));
     }
   }
 
-  inline void SwapIfRequiredInt32(int32_t *x) const {
+  void SwapIfRequiredInt32(int32_t *x) const {
     if (swap_) {
       Swap32(x);
     }
   }
 
-  inline void SwapIfRequiredFloat(float *x) const {
+  void SwapIfRequiredFloat(float *x) const {
     if (swap_) {
       Swap32(reinterpret_cast(x));
     }
   }
 
-  inline void SwapIfRequiredInt64(int64_t *x) const {
+  void SwapIfRequiredInt64(int64_t *x) const {
     if (swap_) {
       Swap64(x);
     }
   }
 
-  inline void SwapIfRequiredDouble(double *x) const {
+  void SwapIfRequiredDouble(double *x) const {
     if (swap_) {
       Swap64(reinterpret_cast(x));
     }
   }
 
-  inline void SwapIfRequiredInt16Arr(int16_t *x, int32_t length) const {
+  void SwapIfRequiredInt16Arr(int16_t *x, int32_t length) const {
     if (swap_) {
       for (int32_t i = 0; i < length; i++) Swap16(&x[i]);
     }
   }
 
-  inline void SwapIfRequiredUnsignedInt16Arr(uint16_t *x,
-                                             int32_t length) const {
+  void SwapIfRequiredUnsignedInt16Arr(uint16_t *x, int32_t length) const {
     if (swap_) {
       for (int32_t i = 0; i < length; i++)
         Swap16(reinterpret_cast(&x[i]));
     }
   }
 
-  inline void SwapIfRequiredInt32Arr(int32_t *x, int32_t length) const {
+  void SwapIfRequiredInt32Arr(int32_t *x, int32_t length) const {
     if (swap_) {
       for (int32_t i = 0; i < length; i++) Swap32(&x[i]);
     }
   }
 
-  inline void SwapIfRequiredFloatArr(float *x, int32_t length) const {
+  void SwapIfRequiredFloatArr(float *x, int32_t length) const {
     if (swap_) {
       for (int32_t i = 0; i < length; i++)
         Swap32(reinterpret_cast(&x[i]));
     }
   }
 
-  inline void SwapIfRequiredInt64Arr(int64_t *x, int32_t length) const {
+  void SwapIfRequiredInt64Arr(int64_t *x, int32_t length) const {
     if (swap_) {
       for (int32_t i = 0; i < length; i++) Swap64(&x[i]);
     }
   }
 
-  inline void SwapIfRequiredDoubleArr(double *x, int32_t length) const {
+  void SwapIfRequiredDoubleArr(double *x, int32_t length) const {
     if (swap_) {
       for (int32_t i = 0; i < length; i++)
         Swap64(reinterpret_cast(&x[i]));
@@ -102,16 +101,16 @@ class ByteSwapper {
   }
 
  private:
-  inline void Swap16(int16_t *x) const {
+  void Swap16(int16_t *x) const {
     *x = ((*x & 0xFF) << 8) | ((*x >> 8) & 0xFF);
   }
 
-  inline void Swap32(int32_t *x) const {
+  void Swap32(int32_t *x) const {
     *x = ((*x & 0xFF) << 24) | (((*x >> 8) & 0xFF) << 16) |
          (((*x >> 16) & 0xFF) << 8) | ((*x >> 24) & 0xFF);
   }
 
-  inline void Swap64(int64_t *x) const {
+  void Swap64(int64_t *x) const {
     *x = ((*x & 0xFF) << 56) | (((*x >> 8) & 0xFF) << 48) |
          (((*x >> 16) & 0xFF) << 40) | (((*x >> 24) & 0xFF) << 32) |
          (((*x >> 32) & 0xFF) << 24) | (((*x >> 40) & 0xFF) << 16) |
diff --git a/tensorflow/contrib/ignite/kernels/ignite_client.h b/tensorflow/contrib/ignite/kernels/client/ignite_client.h
similarity index 81%
rename from tensorflow/contrib/ignite/kernels/ignite_client.h
rename to tensorflow/contrib/ignite/kernels/client/ignite_client.h
index 459b50b48fd..a38599fb997 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_client.h
+++ b/tensorflow/contrib/ignite/kernels/client/ignite_client.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_CLIENT_H_
 #define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_CLIENT_H_
 
-#include "tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_byte_swapper.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -32,44 +32,44 @@ class Client {
   virtual Status ReadData(uint8_t *buf, const int32_t length) = 0;
   virtual Status WriteData(const uint8_t *buf, const int32_t length) = 0;
 
-  inline Status ReadByte(uint8_t *data) { return ReadData(data, 1); }
+  Status ReadByte(uint8_t *data) { return ReadData(data, 1); }
 
-  inline Status ReadShort(int16_t *data) {
+  Status ReadShort(int16_t *data) {
     TF_RETURN_IF_ERROR(ReadData((uint8_t *)data, 2));
     byte_swapper_.SwapIfRequiredInt16(data);
 
     return Status::OK();
   }
 
-  inline Status ReadInt(int32_t *data) {
+  Status ReadInt(int32_t *data) {
     TF_RETURN_IF_ERROR(ReadData((uint8_t *)data, 4));
     byte_swapper_.SwapIfRequiredInt32(data);
 
     return Status::OK();
   }
 
-  inline Status ReadLong(int64_t *data) {
+  Status ReadLong(int64_t *data) {
     TF_RETURN_IF_ERROR(ReadData((uint8_t *)data, 8));
     byte_swapper_.SwapIfRequiredInt64(data);
 
     return Status::OK();
   }
 
-  inline Status WriteByte(const uint8_t data) { return WriteData(&data, 1); }
+  Status WriteByte(const uint8_t data) { return WriteData(&data, 1); }
 
-  inline Status WriteShort(const int16_t data) {
+  Status WriteShort(const int16_t data) {
     int16_t tmp = data;
     byte_swapper_.SwapIfRequiredInt16(&tmp);
     return WriteData((uint8_t *)&tmp, 2);
   }
 
-  inline Status WriteInt(const int32_t data) {
+  Status WriteInt(const int32_t data) {
     int32_t tmp = data;
     byte_swapper_.SwapIfRequiredInt32(&tmp);
     return WriteData((uint8_t *)&tmp, 4);
   }
 
-  inline Status WriteLong(const int64_t data) {
+  Status WriteLong(const int64_t data) {
     int64_t tmp = data;
     byte_swapper_.SwapIfRequiredInt64(&tmp);
     return WriteData((uint8_t *)&tmp, 8);
diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client.h b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h
similarity index 95%
rename from tensorflow/contrib/ignite/kernels/ignite_plain_client.h
rename to tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h
index 75424c19ee4..9f1c70c25cb 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_plain_client.h
+++ b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_PLAIN_CLIENT_H_
 #define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_PLAIN_CLIENT_H_
 
-#include "tensorflow/contrib/ignite/kernels/ignite_client.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_client.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client_unix.cc
similarity index 97%
rename from tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc
rename to tensorflow/contrib/ignite/kernels/client/ignite_plain_client_unix.cc
index cf672942c61..54efb5b6176 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc
+++ b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client_unix.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/ignite/kernels/ignite_plain_client.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h"
 
 #include 
 #include 
diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client_windows.cc
similarity index 98%
rename from tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
rename to tensorflow/contrib/ignite/kernels/client/ignite_plain_client_windows.cc
index dad5aace5fa..a99a3ada558 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
+++ b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client_windows.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/ignite/kernels/ignite_plain_client.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h"
 
 #define WIN32_LEAN_AND_MEAN
 #include 
diff --git a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc b/tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.cc
similarity index 98%
rename from tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc
rename to tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.cc
index ceb479b0846..8f09c24a3be 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc
+++ b/tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.h"
 
 #include 
 #include 
diff --git a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h b/tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.h
similarity index 95%
rename from tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h
rename to tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.h
index 0406644bbaa..29e470403ec 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h
+++ b/tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_SSL_WRAPPER_H_
 #define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_SSL_WRAPPER_H_
 
-#include "tensorflow/contrib/ignite/kernels/ignite_client.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_client.h"
 
 #include 
 
diff --git a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc b/tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.cc
similarity index 99%
rename from tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc
rename to tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.cc
index 2c8a7d44b07..4218ec05f2c 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc
+++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h"
+#include "tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
 
diff --git a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h b/tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h
similarity index 97%
rename from tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h
rename to tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h
index eb1f856643a..b2647ec79bb 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h
+++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h
@@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BINARY_OBJECT_PARSER_H_
 
 #include 
-#include "tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_byte_swapper.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
 
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset.cc b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.cc
similarity index 97%
rename from tensorflow/contrib/ignite/kernels/ignite_dataset.cc
rename to tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.cc
index c4a7d3c513a..ace96e7b09f 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset.cc
+++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h"
+#include "tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset.h b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.h
similarity index 100%
rename from tensorflow/contrib/ignite/kernels/ignite_dataset.h
rename to tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.h
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.cc
similarity index 98%
rename from tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc
rename to tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.cc
index 5da9127aa6a..6332e8e4bb4 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc
+++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h"
+#include "tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h"
 
-#include "tensorflow/contrib/ignite/kernels/ignite_plain_client.h"
-#include "tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -311,8 +311,8 @@ Status IgniteDatasetIterator::ScanQuery() {
       TF_RETURN_IF_ERROR(client_->ReadData(err_msg_c, err_msg_length));
       string err_msg(reinterpret_cast(err_msg_c), err_msg_length);
 
-      return errors::Unknown("Scan Query Error [status=", status,
-                             ", message=", err_msg, "]");
+      return errors::Unknown("Scan Query Error [status=", status, ", message=",
+                             err_msg, "]");
     }
     return errors::Unknown("Scan Query Error [status=", status, "]");
   }
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h
similarity index 93%
rename from tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h
rename to tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h
index c499e2c9ccf..39b82bc06e2 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h
+++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h
@@ -16,9 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_ITERATOR_H_
 #define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_ITERATOR_H_
 
-#include "tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h"
-#include "tensorflow/contrib/ignite/kernels/ignite_client.h"
-#include "tensorflow/contrib/ignite/kernels/ignite_dataset.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_client.h"
+#include "tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h"
+#include "tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.h"
 #include "tensorflow/core/platform/mutex.h"
 
 namespace tensorflow {
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_ops.cc
similarity index 97%
rename from tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
rename to tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_ops.cc
index f75b1c5ff55..f2108775e29 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
+++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_ops.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include 
 
-#include "tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h"
-#include "tensorflow/contrib/ignite/kernels/ignite_dataset.h"
+#include "tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h"
+#include "tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 
diff --git a/tensorflow/contrib/igfs/kernels/igfs.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs.cc
similarity index 97%
rename from tensorflow/contrib/igfs/kernels/igfs.cc
rename to tensorflow/contrib/ignite/kernels/igfs/igfs.cc
index 5434be4d60a..2ed140f649b 100644
--- a/tensorflow/contrib/igfs/kernels/igfs.cc
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #include "tensorflow/core/platform/file_system.h"
 #include "tensorflow/core/platform/file_system_helper.h"
 
-#include "tensorflow/contrib/igfs/kernels/igfs.h"
-#include "tensorflow/contrib/igfs/kernels/igfs_client.h"
-#include "tensorflow/contrib/igfs/kernels/igfs_random_access_file.h"
-#include "tensorflow/contrib/igfs/kernels/igfs_writable_file.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_client.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/contrib/igfs/kernels/igfs.h b/tensorflow/contrib/ignite/kernels/igfs/igfs.h
similarity index 97%
rename from tensorflow/contrib/igfs/kernels/igfs.h
rename to tensorflow/contrib/ignite/kernels/igfs/igfs.h
index a05f620ff41..903d07532e2 100644
--- a/tensorflow/contrib/igfs/kernels/igfs.h
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_H_
 #define TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_H_
 
-#include "tensorflow/contrib/igfs/kernels/igfs_client.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_client.h"
 #include "tensorflow/core/platform/file_system.h"
 
 namespace tensorflow {
diff --git a/tensorflow/contrib/igfs/kernels/igfs_client.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs_client.cc
similarity index 95%
rename from tensorflow/contrib/igfs/kernels/igfs_client.cc
rename to tensorflow/contrib/ignite/kernels/igfs/igfs_client.cc
index 05745ce4f1a..3f97c34fdd8 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_client.cc
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_client.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/igfs/kernels/igfs_client.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_client.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/contrib/igfs/kernels/igfs_client.h b/tensorflow/contrib/ignite/kernels/igfs/igfs_client.h
similarity index 62%
rename from tensorflow/contrib/igfs/kernels/igfs_client.h
rename to tensorflow/contrib/ignite/kernels/igfs/igfs_client.h
index fecb799dc5f..25b57407688 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_client.h
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_client.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_CLIENT_H_
 #define TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_CLIENT_H_
 
-#include "tensorflow/contrib/igfs/kernels/igfs_messages.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h"
 
 namespace tensorflow {
 
@@ -26,73 +26,66 @@ class IGFSClient {
              const string &user_name);
   ~IGFSClient();
 
-  inline Status Handshake(CtrlResponse *res) {
+  Status Handshake(CtrlResponse *res) {
     return SendRequestGetResponse(HandshakeRequest(fs_name_, {}), res);
   }
 
-  inline Status ListFiles(CtrlResponse *res,
-                          const string &path) {
+  Status ListFiles(CtrlResponse *res, const string &path) {
     return SendRequestGetResponse(ListFilesRequest(user_name_, path), res);
   }
 
-  inline Status ListPaths(CtrlResponse *res,
-                          const string &path) {
+  Status ListPaths(CtrlResponse *res, const string &path) {
     return SendRequestGetResponse(ListPathsRequest(user_name_, path), res);
   }
 
-  inline Status Info(CtrlResponse *res, const string &path) {
+  Status Info(CtrlResponse *res, const string &path) {
     return SendRequestGetResponse(InfoRequest(user_name_, path), res);
   }
 
-  inline Status OpenCreate(CtrlResponse *res,
-                           const string &path) {
+  Status OpenCreate(CtrlResponse *res, const string &path) {
     return SendRequestGetResponse(OpenCreateRequest(user_name_, path), res);
   }
 
-  inline Status OpenAppend(CtrlResponse *res,
-                           const string &path) {
+  Status OpenAppend(CtrlResponse *res, const string &path) {
     return SendRequestGetResponse(OpenAppendRequest(user_name_, path), res);
   }
 
-  inline Status OpenRead(CtrlResponse *res,
-                         const string &path) {
+  Status OpenRead(CtrlResponse *res, const string &path) {
     return SendRequestGetResponse(OpenReadRequest(user_name_, path), res);
   }
 
-  inline Status Exists(CtrlResponse *res, const string &path) {
+  Status Exists(CtrlResponse *res, const string &path) {
     return SendRequestGetResponse(ExistsRequest(user_name_, path), res);
   }
 
-  inline Status MkDir(CtrlResponse *res,
-                      const string &path) {
+  Status MkDir(CtrlResponse *res, const string &path) {
     return SendRequestGetResponse(MakeDirectoriesRequest(user_name_, path),
                                   res);
   }
 
-  inline Status Delete(CtrlResponse *res, const string &path,
-                       bool recursive) {
+  Status Delete(CtrlResponse *res, const string &path,
+                bool recursive) {
     return SendRequestGetResponse(DeleteRequest(user_name_, path, recursive),
                                   res);
   }
 
-  inline Status WriteBlock(int64_t stream_id, const uint8_t *data,
-                           int32_t len) {
+  Status WriteBlock(int64_t stream_id, const uint8_t *data, int32_t len) {
     return SendRequestGetResponse(WriteBlockRequest(stream_id, data, len),
                                   nullptr);
   }
 
-  inline Status ReadBlock(ReadBlockCtrlResponse *res, int64_t stream_id,
-                          int64_t pos, int32_t length) {
+  Status ReadBlock(ReadBlockCtrlResponse *res, int64_t stream_id, int64_t pos,
+                   int32_t length) {
     return SendRequestGetResponse(ReadBlockRequest(stream_id, pos, length),
                                   res);
   }
 
-  inline Status Close(CtrlResponse *res, int64_t stream_id) {
+  Status Close(CtrlResponse *res, int64_t stream_id) {
     return SendRequestGetResponse(CloseRequest(stream_id), res);
   }
 
-  inline Status Rename(CtrlResponse *res, const string &source,
-                       const string &dest) {
+  Status Rename(CtrlResponse *res, const string &source,
+                const string &dest) {
     return SendRequestGetResponse(RenameRequest(user_name_, source, dest), res);
   }
 
diff --git a/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.cc
similarity index 95%
rename from tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.cc
rename to tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.cc
index e12d6708b4e..f7db72c72e4 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.cc
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.h"
 
 namespace tensorflow {
 
@@ -114,7 +114,8 @@ Status ExtendedTCPClient::WriteString(string str) {
   if (!str.empty()) {
     TF_RETURN_IF_ERROR(WriteBool(false));
     size_t l = str.length();
-    if (l > 0xFFFF) return errors::InvalidArgument("String is too long");
+    if (l > std::numeric_limits::max())
+      return errors::InvalidArgument("String is too long");
 
     TF_RETURN_IF_ERROR(WriteShort(l));
     TF_RETURN_IF_ERROR(WriteData(reinterpret_cast(str.c_str()),
diff --git a/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.h b/tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.h
similarity index 95%
rename from tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.h
rename to tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.h
index 5121ee67a57..af713164950 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.h
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_EXTENDED_TCP_CLIENT_H_
 #define TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_EXTENDED_TCP_CLIENT_H_
 
-#include "tensorflow/contrib/ignite/kernels/ignite_plain_client.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/contrib/igfs/kernels/igfs_messages.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs_messages.cc
similarity index 99%
rename from tensorflow/contrib/igfs/kernels/igfs_messages.cc
rename to tensorflow/contrib/ignite/kernels/igfs/igfs_messages.cc
index 44368c7bb1f..3ee3e4e29a7 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_messages.cc
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_messages.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/igfs/kernels/igfs_messages.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/contrib/igfs/kernels/igfs_messages.h b/tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h
similarity index 99%
rename from tensorflow/contrib/igfs/kernels/igfs_messages.h
rename to tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h
index 626bdc56eb7..eed56af54d9 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_messages.h
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_MESSAGES_H_
 #define TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_MESSAGES_H_
 
-#include "tensorflow/contrib/igfs/kernels/igfs_extended_tcp_client.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/contrib/igfs/kernels/igfs_random_access_file.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.cc
similarity index 92%
rename from tensorflow/contrib/igfs/kernels/igfs_random_access_file.cc
rename to tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.cc
index d44bb031b20..22aeb0143eb 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_random_access_file.cc
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/igfs/kernels/igfs_random_access_file.h"
-#include "tensorflow/contrib/igfs/kernels/igfs_messages.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/contrib/igfs/kernels/igfs_random_access_file.h b/tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.h
similarity index 95%
rename from tensorflow/contrib/igfs/kernels/igfs_random_access_file.h
rename to tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.h
index b25426134ca..b748713c355 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_random_access_file.h
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_RANDOM_ACCESS_FILE_H_
 #define TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_RANDOM_ACCESS_FILE_H_
 
-#include "tensorflow/contrib/igfs/kernels/igfs_client.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_client.h"
 #include "tensorflow/core/platform/file_system.h"
 
 namespace tensorflow {
diff --git a/tensorflow/contrib/igfs/kernels/igfs_writable_file.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.cc
similarity index 93%
rename from tensorflow/contrib/igfs/kernels/igfs_writable_file.cc
rename to tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.cc
index 6eb335d312a..c15ecb7deeb 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_writable_file.cc
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/igfs/kernels/igfs_writable_file.h"
-#include "tensorflow/contrib/igfs/kernels/igfs_messages.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/contrib/igfs/kernels/igfs_writable_file.h b/tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.h
similarity index 95%
rename from tensorflow/contrib/igfs/kernels/igfs_writable_file.h
rename to tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.h
index f888beaa4eb..8ab6a018038 100644
--- a/tensorflow/contrib/igfs/kernels/igfs_writable_file.h
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_WRITABLE_FILE_H_
 #define TENSORFLOW_CONTRIB_IGFS_KERNELS_IGFS_WRITABLE_FILE_H_
 
-#include "tensorflow/contrib/igfs/kernels/igfs_client.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_client.h"
 #include "tensorflow/core/platform/file_system.h"
 
 namespace tensorflow {
diff --git a/tensorflow/contrib/igfs/ops/igfs_ops.cc b/tensorflow/contrib/ignite/ops/igfs_ops.cc
similarity index 93%
rename from tensorflow/contrib/igfs/ops/igfs_ops.cc
rename to tensorflow/contrib/ignite/ops/igfs_ops.cc
index f49267d8b9c..473bddff08b 100644
--- a/tensorflow/contrib/igfs/ops/igfs_ops.cc
+++ b/tensorflow/contrib/ignite/ops/igfs_ops.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/platform/env.h"
 
-#include "tensorflow/contrib/igfs/kernels/igfs.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/contrib/igfs/python/ops/igfs_op_loader.py b/tensorflow/contrib/ignite/python/ops/igfs_op_loader.py
similarity index 100%
rename from tensorflow/contrib/igfs/python/ops/igfs_op_loader.py
rename to tensorflow/contrib/ignite/python/ops/igfs_op_loader.py
diff --git a/tensorflow/contrib/igfs/__init__.py b/tensorflow/contrib/ignite/python/ops/igfs_ops.py
similarity index 76%
rename from tensorflow/contrib/igfs/__init__.py
rename to tensorflow/contrib/ignite/python/ops/igfs_ops.py
index c1de5388941..39cee05e071 100644
--- a/tensorflow/contrib/igfs/__init__.py
+++ b/tensorflow/contrib/ignite/python/ops/igfs_ops.py
@@ -4,7 +4,7 @@
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
@@ -29,5 +29,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import tensorflow.contrib.igfs.python.ops.igfs_ops
-from tensorflow.python.util.all_util import remove_undocumented
+import os
+
+from tensorflow.contrib.ignite.python.ops import ignite_op_loader  # pylint: disable=unused-import
+from tensorflow.python.platform import resource_loader
+from tensorflow.python.framework import load_library
+
+file_system_library = os.path.join(resource_loader.get_data_files_path(),
+                                   "../../_igfs_ops.so")
+load_library.load_file_system_library(file_system_library)
diff --git a/tensorflow/contrib/igfs/python/tests/bin/start-igfs.sh b/tensorflow/contrib/ignite/python/tests/bin/start-igfs.sh
similarity index 100%
rename from tensorflow/contrib/igfs/python/tests/bin/start-igfs.sh
rename to tensorflow/contrib/ignite/python/tests/bin/start-igfs.sh
diff --git a/tensorflow/contrib/igfs/python/tests/config/ignite-config-igfs.xml b/tensorflow/contrib/ignite/python/tests/config/ignite-config-igfs.xml
similarity index 100%
rename from tensorflow/contrib/igfs/python/tests/config/ignite-config-igfs.xml
rename to tensorflow/contrib/ignite/python/tests/config/ignite-config-igfs.xml
diff --git a/tensorflow/contrib/igfs/python/tests/igfs_test.py b/tensorflow/contrib/ignite/python/tests/igfs_test.py
similarity index 98%
rename from tensorflow/contrib/igfs/python/tests/igfs_test.py
rename to tensorflow/contrib/ignite/python/tests/igfs_test.py
index 120ab666a89..dc8f3d9d09f 100644
--- a/tensorflow/contrib/igfs/python/tests/igfs_test.py
+++ b/tensorflow/contrib/ignite/python/tests/igfs_test.py
@@ -18,7 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 import tensorflow as tf
-import tensorflow.contrib.igfs.python.ops.igfs_ops  # pylint: disable=unused-import
+import tensorflow.contrib.ignite.python.ops.igfs_ops  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
 class IGFSTest(test.TestCase):
diff --git a/tensorflow/contrib/ignite/python/tests/start_ignite.sh b/tensorflow/contrib/ignite/python/tests/start_ignite.sh
index a67bd44f2fb..112e0dea844 100755
--- a/tensorflow/contrib/ignite/python/tests/start_ignite.sh
+++ b/tensorflow/contrib/ignite/python/tests/start_ignite.sh
@@ -20,3 +20,7 @@ SCRIPT_PATH="$( cd "$(dirname "$0")" ; pwd -P )"
 # Start Apache Ignite with plain client listener.
 docker run -itd --name ignite-plain -p 42300:10800 \
 -v ${SCRIPT_PATH}:/data apacheignite/ignite:${IGNITE_VERSION} /data/bin/start-plain.sh
+
+# Start Apache Ignite with IGFS.
+docker run -itd --name ignite-igfs -p 10500:10500 \
+-v ${SCRIPT_PATH}:/data apacheignite/ignite:${IGNITE_VERSION} /data/bin/start-igfs.sh
\ No newline at end of file
diff --git a/tensorflow/contrib/ignite/python/tests/stop_ignite.sh b/tensorflow/contrib/ignite/python/tests/stop_ignite.sh
index 8f03dbd1ede..35b0f32d1b3 100755
--- a/tensorflow/contrib/ignite/python/tests/stop_ignite.sh
+++ b/tensorflow/contrib/ignite/python/tests/stop_ignite.sh
@@ -15,5 +15,4 @@
 # ==============================================================================
 
 docker rm -f ignite-plain
-docker rm -f ignite-ssl
-docker rm -f ignite-ssl-auth
+docker rm -f ignite-igfs
\ No newline at end of file

From 74b69fb3cd392fb5552b0103f8ebdb7808afa4f0 Mon Sep 17 00:00:00 2001
From: Anton Dmitriev 
Date: Fri, 2 Nov 2018 17:56:43 +0300
Subject: [PATCH 062/540] Fix style.

---
 tensorflow/c/c_api_internal.h                    |  4 +++-
 tensorflow/java/BUILD                            |  2 +-
 .../src/main/java/org/tensorflow/Server.java     | 16 ++++++++--------
 3 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h
index 9bb6edacaa7..8df85cb503f 100644
--- a/tensorflow/c/c_api_internal.h
+++ b/tensorflow/c/c_api_internal.h
@@ -26,9 +26,9 @@ limitations under the License.
 
 #ifndef __ANDROID__
 #include "tensorflow/core/framework/op_gen_lib.h"
+#include "tensorflow/core/distributed_runtime/server_lib.h"
 #endif
 #include "tensorflow/core/common_runtime/shape_refiner.h"
-#include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/graph/graph.h"
@@ -180,11 +180,13 @@ struct TF_ApiDefMap {
   tensorflow::mutex lock;
 };
 
+#ifndef __ANDROID__
 struct TF_Server {
   TF_Server(std::unique_ptr server);
 
   std::unique_ptr server;
 };
+#endif
 
 namespace tensorflow {
 
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index 3f847c4c18c..10808e162ee 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -381,8 +381,8 @@ tf_cc_binary(
     linkshared = 1,
     linkstatic = 1,
     deps = [
-        "//tensorflow/java/src/main/native",
         "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
+        "//tensorflow/java/src/main/native",
         LINKER_VERSION_SCRIPT,
         LINKER_EXPORTED_SYMBOLS,
     ],
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Server.java b/tensorflow/java/src/main/java/org/tensorflow/Server.java
index 98b123be303..7c67916ef64 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Server.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Server.java
@@ -23,7 +23,7 @@ package org.tensorflow;
  * training. A server belongs to a cluster (specified by a
  * {@code ClusterSpec}), and corresponds to a particular task in a named job.
  * The server can communicate with any other server in the same cluster.
- * The server will not serve any requests until {@link #start()} is invoked. 
+ * The server will not serve any requests until {@link #start()} is invoked.
  * The server will stop serving requests once {@link #stop()} or {@link #close()} is invoked.
  * Be aware that {@link #close()} method stops the server if it is running.
  *
@@ -60,12 +60,12 @@ package org.tensorflow;
  * }
*/ public final class Server implements AutoCloseable { - - /** - * Constructs a new instance of server. + /** + * Constructs a new instance of server. * * @param serverDef Server definition specified as a serialized - * ServerDef + * ServerDef * protocol buffer. */ public Server(byte[] serverDef) { @@ -85,7 +85,7 @@ public final class Server implements AutoCloseable { /** Blocks until the server has been successfully stopped. */ public void join() { long handle = 0; - synchronized(this) { + synchronized (this) { handle = nativeHandle; if (handle != 0) { numJoining++; @@ -94,10 +94,10 @@ public final class Server implements AutoCloseable { try { join(handle); } finally { - synchronized(this) { + synchronized (this) { if (handle != 0) { numJoining--; - } + } notifyAll(); } } From 1a942a88d06fa02425409968b7ef7b278c209bf4 Mon Sep 17 00:00:00 2001 From: frreiss Date: Fri, 2 Nov 2018 12:49:39 -0700 Subject: [PATCH 063/540] Address review comments --- tensorflow/python/ops/control_flow_ops.py | 3 +-- tensorflow/python/ops/control_flow_ops_test.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py index 0673cfabbfd..dcea747d4b6 100644 --- a/tensorflow/python/ops/control_flow_ops.py +++ b/tensorflow/python/ops/control_flow_ops.py @@ -3614,8 +3614,7 @@ def _case_verify_and_canonicalize_args(pred_fn_pairs, exclusive, name, if context.executing_eagerly(): # No name to sort on in eager mode. 
Use dictionary traversal order, # which is nondeterministic in versions of Python < 3.6 - if not exclusive and not isinstance(pred_fn_pairs, - collections.OrderedDict): + if not exclusive: raise ValueError("Unordered dictionaries are not supported for the " "`pred_fn_pairs` argument when `exclusive=False` and " "eager mode is enabled.") diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py index d65755a960a..6c246161c1a 100644 --- a/tensorflow/python/ops/control_flow_ops_test.py +++ b/tensorflow/python/ops/control_flow_ops_test.py @@ -937,7 +937,7 @@ class CaseTest(test_util.TensorFlowTestCase): sess.run(output, feed_dict={x: 4}) @test_util.run_in_graph_and_eager_modes - def testCase_dict_eagerMode(self): + def testCase_dict(self): x = constant_op.constant(2) conditions = {math_ops.equal(x, 1): lambda: constant_op.constant(2), math_ops.equal(x, 2): lambda: constant_op.constant(4)} From 61a1032a6920c948671c6b6d698dda8605af079c Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 3 Nov 2018 21:28:20 +0000 Subject: [PATCH 064/540] Pretty print the dtype in error message with eager This fix is related to 23452 where the dtype in the error message is a non-descriptive integer: ``` >>> import tensorflow as tf >>> tf.enable_eager_execution() >>> print(1.2*tf.constant(2)) Traceback (most recent call last): ... ... TypeError: Cannot convert value 1.2 to EagerTensor with requested dtype: 3 >>> ``` This fix converts the integer (e.g., `3`) to a descriptive string: ``` TypeError: Cannot convert value 1.2 to EagerTensor with requested dtype: int32 ``` This fix fixes 23452. 
Signed-off-by: Yong Tang --- tensorflow/python/eager/pywrap_tensor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc index a2407854fd7..07fdd47153e 100644 --- a/tensorflow/python/eager/pywrap_tensor.cc +++ b/tensorflow/python/eager/pywrap_tensor.cc @@ -435,7 +435,7 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) { PyExc_TypeError, tensorflow::strings::StrCat( "Cannot convert value ", TFE_GetPythonString(value_str.get()), - " to EagerTensor with requested dtype: ", desired_dtype) + " to EagerTensor with requested dtype: ", tensorflow::DataTypeString(static_cast(desired_dtype))) .c_str()); return -1; } From 28ca67e446ad31e7a4155398388308a4baeb6dca Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 3 Nov 2018 21:33:18 +0000 Subject: [PATCH 065/540] clang-format fix Signed-off-by: Yong Tang --- tensorflow/python/eager/pywrap_tensor.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc index 07fdd47153e..29a98eb6f78 100644 --- a/tensorflow/python/eager/pywrap_tensor.cc +++ b/tensorflow/python/eager/pywrap_tensor.cc @@ -435,7 +435,9 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) { PyExc_TypeError, tensorflow::strings::StrCat( "Cannot convert value ", TFE_GetPythonString(value_str.get()), - " to EagerTensor with requested dtype: ", tensorflow::DataTypeString(static_cast(desired_dtype))) + " to EagerTensor with requested dtype: ", + tensorflow::DataTypeString( + static_cast(desired_dtype))) .c_str()); return -1; } From 5c4a9e8246c9c5a498352fc5113b78e3dde25655 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 4 Nov 2018 00:26:45 +0000 Subject: [PATCH 066/540] Update additional type conversio int -> string Signed-off-by: Yong Tang --- tensorflow/python/eager/pywrap_tensor.cc | 7 +++++-- 1 file changed, 5 
insertions(+), 2 deletions(-) diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc index 29a98eb6f78..8e6723abd50 100644 --- a/tensorflow/python/eager/pywrap_tensor.cc +++ b/tensorflow/python/eager/pywrap_tensor.cc @@ -421,8 +421,11 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) { PyErr_SetString( PyExc_TypeError, tensorflow::strings::StrCat("Error while casting from DataType ", - handle_dtype, " to ", desired_dtype, - ". ", TF_Message(self->status)) + tensorflow::DataTypeString(static_cast(handle_dtype)), + " to ", + tensorflow::DataTypeString(static_cast(desired_dtype)), + ". ", + TF_Message(self->status)) .c_str()); // Cleanup self->status before returning. TF_SetStatus(self->status, TF_OK, ""); From 742edbcab7c79519fc6d9edf2028bdd5d2570fce Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 4 Nov 2018 00:28:54 +0000 Subject: [PATCH 067/540] clang-format fix Signed-off-by: Yong Tang --- tensorflow/python/eager/pywrap_tensor.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc index 8e6723abd50..949276ae103 100644 --- a/tensorflow/python/eager/pywrap_tensor.cc +++ b/tensorflow/python/eager/pywrap_tensor.cc @@ -420,12 +420,13 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) { if (TF_GetCode(self->status) != TF_OK) { PyErr_SetString( PyExc_TypeError, - tensorflow::strings::StrCat("Error while casting from DataType ", - tensorflow::DataTypeString(static_cast(handle_dtype)), - " to ", - tensorflow::DataTypeString(static_cast(desired_dtype)), - ". ", - TF_Message(self->status)) + tensorflow::strings::StrCat( + "Error while casting from DataType ", + tensorflow::DataTypeString( + static_cast(handle_dtype)), + " to ", tensorflow::DataTypeString( + static_cast(desired_dtype)), + ". ", TF_Message(self->status)) .c_str()); // Cleanup self->status before returning. 
TF_SetStatus(self->status, TF_OK, ""); From e48899b406a8df213569e99cc93c1ee3d8a51f8e Mon Sep 17 00:00:00 2001 From: Bairen Yi Date: Sun, 4 Nov 2018 10:18:50 +0800 Subject: [PATCH 068/540] Quick fix for 462a79b that breaks GDR. Signed-off-by: Bairen Yi --- tensorflow/contrib/gdr/gdr_server_lib.cc | 5 +++-- tensorflow/contrib/gdr/gdr_worker.cc | 4 ++-- tensorflow/contrib/gdr/gdr_worker.h | 3 ++- tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc | 2 +- tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h | 3 ++- 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/tensorflow/contrib/gdr/gdr_server_lib.cc b/tensorflow/contrib/gdr/gdr_server_lib.cc index d8584e4e6b7..b3f48ec1dd9 100644 --- a/tensorflow/contrib/gdr/gdr_server_lib.cc +++ b/tensorflow/contrib/gdr/gdr_server_lib.cc @@ -52,9 +52,10 @@ Status GdrServer::Init() { [this](const WorkerEnv* env) { return new GdrRendezvousMgr(env, remote_memory_manager_.get()); }; - WorkerCreationFunction worker_func = [this](WorkerEnv* env) { + WorkerCreationFunction worker_func = [this](WorkerEnv* env, + const ConfigProto& config) { return std::unique_ptr( - new GdrWorker(env, remote_memory_manager_.get())); + new GdrWorker(env, config, remote_memory_manager_.get())); }; TF_RETURN_IF_ERROR(remote_memory_manager_->Init()); diff --git a/tensorflow/contrib/gdr/gdr_worker.cc b/tensorflow/contrib/gdr/gdr_worker.cc index ce1d8d2d730..867cb83f420 100644 --- a/tensorflow/contrib/gdr/gdr_worker.cc +++ b/tensorflow/contrib/gdr/gdr_worker.cc @@ -39,9 +39,9 @@ limitations under the License. 
namespace tensorflow { -GdrWorker::GdrWorker(WorkerEnv* worker_env, +GdrWorker::GdrWorker(WorkerEnv* worker_env, const ConfigProto& config, RemoteMemoryManager* remote_memory_manager) - : GrpcWorker(worker_env), + : GrpcWorker(worker_env, config), remote_memory_manager_(remote_memory_manager), recv_tensor_recent_request_ids_(100000) {} diff --git a/tensorflow/contrib/gdr/gdr_worker.h b/tensorflow/contrib/gdr/gdr_worker.h index 65105ed9973..39f11e6bde5 100644 --- a/tensorflow/contrib/gdr/gdr_worker.h +++ b/tensorflow/contrib/gdr/gdr_worker.h @@ -25,7 +25,8 @@ namespace tensorflow { class GdrWorker : public GrpcWorker { public: - GdrWorker(WorkerEnv* env, RemoteMemoryManager* remote_memory_manager); + GdrWorker(WorkerEnv* env, const ConfigProto& config, + RemoteMemoryManager* remote_memory_manager); // Serve the RecvTensorRequest but omit the tensor content and transmit it // out-of-band using GPU Direct RDMA whenever possible. diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc index 63d438c6155..488b110f329 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc @@ -194,7 +194,7 @@ Status GrpcServer::Init( MaybeMutateBuilder(&builder); master_impl_ = CreateMaster(&master_env_); master_service_ = NewGrpcMasterService(master_impl_.get(), config, &builder); - worker_impl_ = worker_func ? worker_func(&worker_env_) + worker_impl_ = worker_func ? 
worker_func(&worker_env_, config) : NewGrpcWorker(&worker_env_, config); worker_service_ = NewGrpcWorkerService(worker_impl_.get(), &builder).release(); diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h index 7979e96d3ed..c1395abddeb 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h @@ -53,7 +53,8 @@ typedef std::function ServiceInitFunction; // function that creates a grpc based worker implementation. -typedef std::function(WorkerEnv*)> +typedef std::function(WorkerEnv*, + const ConfigProto& config)> WorkerCreationFunction; class GrpcServer : public ServerInterface { From 465e876232ca5bf5de2afa2f755706b93df59b2d Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Sun, 4 Nov 2018 15:30:43 -0800 Subject: [PATCH 069/540] Add missing build dependencies PiperOrigin-RevId: 220020348 --- tensorflow/cc/BUILD | 3 +++ tensorflow/compiler/jit/BUILD | 2 ++ tensorflow/lite/delegates/flex/BUILD | 2 ++ 3 files changed, 7 insertions(+) diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index c18b07603ae..83353b79f72 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -170,6 +170,7 @@ cc_library_with_android_deps( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/strings", ], ) @@ -516,6 +517,8 @@ tf_gen_op_wrappers_cc( ":array_ops", ":const_op", ":math_ops", + "//tensorflow/cc:ops", + "//tensorflow/cc:scope", ], ) diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 0c41e095c7b..028687d4010 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -241,6 +241,7 @@ cc_library( "//tensorflow/core:protos_all_cc", "//tensorflow/core/kernels:variable_ops", "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/memory", ], ) @@ 
-500,6 +501,7 @@ cc_library( "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", ], ) diff --git a/tensorflow/lite/delegates/flex/BUILD b/tensorflow/lite/delegates/flex/BUILD index f9850112abd..222a043a88e 100644 --- a/tensorflow/lite/delegates/flex/BUILD +++ b/tensorflow/lite/delegates/flex/BUILD @@ -53,7 +53,9 @@ cc_library( ], visibility = ["//visibility:public"], deps = [ + ":delegate_data", ":delegate_only_runtime", + "//tensorflow/lite/c:c_api_internal", ] + select({ "//tensorflow:android": [ "//tensorflow/core:android_tensorflow_lib", From 50397855a83f166c6d531fb18965d63d633ca021 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Sun, 4 Nov 2018 15:32:55 -0800 Subject: [PATCH 070/540] Add missing build dependencies. PiperOrigin-RevId: 220020507 --- tensorflow/core/grappler/BUILD | 4 ++++ tensorflow/core/grappler/optimizers/BUILD | 1 + tensorflow/core/grappler/optimizers/data/vectorization/BUILD | 1 + 3 files changed, 6 insertions(+) diff --git a/tensorflow/core/grappler/BUILD b/tensorflow/core/grappler/BUILD index 3eb7101adfa..7b03ec38bf5 100644 --- a/tensorflow/core/grappler/BUILD +++ b/tensorflow/core/grappler/BUILD @@ -23,6 +23,7 @@ cc_library( visibility = ["//visibility:public"], deps = [ "//tensorflow/core:framework", + "//tensorflow/core:graph", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", @@ -67,11 +68,14 @@ cc_library( visibility = ["//visibility:public"], deps = [ ":utils", + "//tensorflow/core:graph", "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/hash", + "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index 
5c05ae2a4fe..3a5b1334d3f 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -213,6 +213,7 @@ cc_library( visibility = ["//visibility:public"], deps = [ "//tensorflow/core:graph", + "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler:utils", diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/BUILD b/tensorflow/core/grappler/optimizers/data/vectorization/BUILD index 49ba6c2ba9f..5175f6af7a2 100644 --- a/tensorflow/core/grappler/optimizers/data/vectorization/BUILD +++ b/tensorflow/core/grappler/optimizers/data/vectorization/BUILD @@ -120,5 +120,6 @@ cc_library( ":unpack_vectorizer", ":vectorizer", ":vectorizer_registry", + "@com_google_absl//absl/container:flat_hash_map", ], ) From 6d02b4dec775383c28b4f1401f3e370212a903ae Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Sun, 4 Nov 2018 15:50:17 -0800 Subject: [PATCH 071/540] Add missing build dependencies. PiperOrigin-RevId: 220021454 --- tensorflow/compiler/xla/service/llvm_ir/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD index 850501a4b5c..56a729bca8e 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/BUILD +++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD @@ -169,6 +169,7 @@ cc_library( "//tensorflow/compiler/xla/service:elemental_ir_emitter", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/core:lib", + "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", "@llvm//:core", ], From 6a0b536a779f485edc25f6a11335b5e640acc8ab Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Sun, 4 Nov 2018 17:58:10 -0800 Subject: [PATCH 072/540] Internal changes PiperOrigin-RevId: 220028010 --- .../lstm/unidirectional_sequence_lstm_test.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py index 81ab6691df7..eeb48d12311 100644 --- a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py +++ b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py @@ -19,8 +19,9 @@ import tempfile import numpy as np import tensorflow as tf -from tensorflow.lite.experimental.examples.lstm.tflite_lstm import TFLiteLSTMCell from tensorflow.examples.tutorials.mnist import input_data +from tensorflow.lite.experimental.examples.lstm.tflite_lstm import TFLiteLSTMCell +from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs from tensorflow.python.framework import test_util from tensorflow.python.platform import test from tensorflow.python.tools import optimize_for_inference_lib @@ -50,17 +51,17 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase): # Batch size self.batch_size = 16 # Lstm Units. 
- self.num_units = 64 + self.num_units = 16 def buildLstmLayer(self): return tf.nn.rnn_cell.MultiRNNCell([ TFLiteLSTMCell( self.num_units, use_peepholes=True, forget_bias=0, name="rnn1"), - TFLiteLSTMCell(self.num_units, num_proj=64, forget_bias=0, name="rnn2"), + TFLiteLSTMCell(self.num_units, num_proj=8, forget_bias=0, name="rnn2"), TFLiteLSTMCell( self.num_units // 2, use_peepholes=True, - num_proj=64, + num_proj=8, forget_bias=0, name="rnn3"), TFLiteLSTMCell(self.num_units, forget_bias=0, name="rnn4") @@ -150,7 +151,7 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase): tf.import_graph_def(graph, name="", input_map={"INPUT_IMAGE": tflite_input}) with tf.Session() as sess: curr = sess.graph_def - curr = tf.lite.convert_op_hints_to_stubs(graph_def=curr) + curr = convert_op_hints_to_stubs(graph_def=curr) curr = optimize_for_inference_lib.optimize_for_inference( curr, ["INPUT_IMAGE_LITE"], ["OUTPUT_CLASS"], @@ -189,7 +190,7 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase): x, output_class, new_sess) result = self.tfliteInvoke(frozen_graph, test_inputs, output_class) - self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-3)) + self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2)) def testDynamicRnnMultiRnnCell(self): sess = tf.Session(config=CONFIG) @@ -219,7 +220,7 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase): x, output_class, new_sess) result = self.tfliteInvoke(frozen_graph, test_inputs, output_class) - self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-3)) + self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2)) if __name__ == "__main__": From aaecc4ec5e1d548d62834ac78da5de910ff58daa Mon Sep 17 00:00:00 2001 From: Guangda Lai Date: Sun, 4 Nov 2018 21:51:52 -0800 Subject: [PATCH 073/540] Fix deadness_analysis_test so the exit node of the control flow graph connects to the false output of the switch node. 
PiperOrigin-RevId: 220041922 --- tensorflow/compiler/jit/deadness_analysis_test.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/jit/deadness_analysis_test.cc b/tensorflow/compiler/jit/deadness_analysis_test.cc index 617e31488c7..8a73101c184 100644 --- a/tensorflow/compiler/jit/deadness_analysis_test.cc +++ b/tensorflow/compiler/jit/deadness_analysis_test.cc @@ -127,7 +127,8 @@ InductionVarInfo CreateInductionVariable(const Scope& root, Output loop_cond = ops::LoopCond(root.WithOpName(prefix + "/cond"), loop_cond_expr); ops::Switch latch(root.WithOpName(prefix + "/latch"), iv.output, loop_cond); - ops::internal::Exit exit(root.WithOpName(prefix + "/exit"), iv.output); + ops::internal::Exit exit(root.WithOpName(prefix + "/exit"), + latch.output_false); Output iv_next = ops::Add(root.WithOpName(prefix + "/ivnext"), latch.output_true, increment_by); Output next_iteration = @@ -191,7 +192,8 @@ DependentInductionVar CreateDependentLoopInvariantValue( value, frame_name); ops::Merge iv(root.WithOpName(prefix + "/iv"), {enter_value, enter_value}); ops::Switch latch(root.WithOpName(prefix + "/latch"), iv.output, loop_cond); - ops::internal::Exit exit(root.WithOpName(prefix + "/exit"), iv.output); + ops::internal::Exit exit(root.WithOpName(prefix + "/exit"), + latch.output_false); Output next_iteration = ops::NextIteration( root.WithOpName(prefix + "/next_iteration"), latch.output_true); CHECK(root.graph() From 5ab39bbd01b150e6e1413a244b8ad6be04df3fdf Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Mon, 5 Nov 2018 00:49:31 -0800 Subject: [PATCH 074/540] Add option to fail MetaOptimizer on sub-optimizer failures PiperOrigin-RevId: 220054255 --- .../grappler/optimizers/meta_optimizer.cc | 26 ++++++++++++------- .../core/protobuf/rewriter_config.proto | 5 ++++ 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index 1d787d2b7c2..82c88bb06ae 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -279,6 +279,18 @@ MetaOptimizer::GetCustomGraphOptimizerConfig(const string& name) const { return nullptr; } +#define RUN_OPTIMIZER_OR_RETURN_IF_ERROR(optimizer) \ + { \ + const Status status = RunOptimizer(optimizer, cluster, &optimized_item, \ + optimized_graph, &optimization_result); \ + if (status.ok()) { \ + is_optimized = true; \ + } else if (cfg_.fail_on_optimizer_errors()) { \ + VLOG(2) << "Optimizer '" << optimizer->name() << "' failed: " << status; \ + TF_RETURN_IF_ERROR(status); \ + } \ + } + Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item, GraphDef* optimized_graph) { int min_graph_nodes = cfg_.min_graph_nodes() == 0 ? kDefaultMinGraphNodes @@ -340,9 +352,7 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item, if (fusion_optimizer == nullptr) fusion_optimizer = optimizer.get(); continue; } - Status status = RunOptimizer(optimizer.get(), cluster, &optimized_item, - optimized_graph, &optimization_result); - if (status.ok()) is_optimized = true; + RUN_OPTIMIZER_OR_RETURN_IF_ERROR(optimizer.get()); } } @@ -353,16 +363,12 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item, // optimizations from taking place since we don't have shape inference for // functions, and we can't optimize across function boundaries. 
if (fusion_optimizer != nullptr) { - Status status = RunOptimizer(fusion_optimizer, cluster, &optimized_item, - optimized_graph, &optimization_result); - if (status.ok()) is_optimized = true; + RUN_OPTIMIZER_OR_RETURN_IF_ERROR(fusion_optimizer); } // ScopedAllocatorOptimizer must run last. if (sa_optimizer != nullptr) { - Status status = RunOptimizer(sa_optimizer, cluster, &optimized_item, - optimized_graph, &optimization_result); - if (status.ok()) is_optimized = true; + RUN_OPTIMIZER_OR_RETURN_IF_ERROR(sa_optimizer); } // Record graph optimization result. @@ -379,6 +385,8 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item, return Status::OK(); } +#undef RUN_OPTIMIZER_OR_RETURN_IF_ERROR + Status MetaOptimizer::RunOptimizer( GraphOptimizer* optimizer, Cluster* cluster, GrapplerItem* optimized_item, GraphDef* optimized_graph, GraphOptimizationResult* optimization_result) { diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto index 143df115f42..d68f2735365 100644 --- a/tensorflow/core/protobuf/rewriter_config.proto +++ b/tensorflow/core/protobuf/rewriter_config.proto @@ -137,6 +137,11 @@ message RewriterConfig { // meta-optimizer or when manually specified through the optimizers field. AutoParallelOptions auto_parallel = 5; + // If true, any optimization pass failing will cause the MetaOptimizer to + // stop with an error. By default - or when set to false, failing passes are + // skipped silently. + bool fail_on_optimizer_errors = 21; + ScopedAllocatorOptions scoped_allocator_opts = 16; // If non-empty, will use this as an alternative way to specify a list of From e66dce03cf5159a36434899c4e8a4c1d13bf07cf Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Mon, 5 Nov 2018 01:02:29 -0800 Subject: [PATCH 075/540] compat: Update forward compatibility horizon to 2018-11-05 PiperOrigin-RevId: 220055912 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index c2f3f32fd2e..1074a8b5a92 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -26,7 +26,7 @@ import datetime from tensorflow.python.util import tf_contextlib from tensorflow.python.util.tf_export import tf_export -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 11, 4) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 11, 5) @tf_export("compat.forward_compatible") From 00a3a49776f466609c882c4f42c4e14269b8337f Mon Sep 17 00:00:00 2001 From: olicht <29300900+olicht@users.noreply.github.com> Date: Mon, 5 Nov 2018 11:59:23 +0200 Subject: [PATCH 076/540] Update year to 2018 --- tensorflow/contrib/opt/python/training/nadam_optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/opt/python/training/nadam_optimizer.py b/tensorflow/contrib/opt/python/training/nadam_optimizer.py index 44a8890cb10..155ff5b3f4f 100644 --- a/tensorflow/contrib/opt/python/training/nadam_optimizer.py +++ b/tensorflow/contrib/opt/python/training/nadam_optimizer.py @@ -1,4 +1,4 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From c933edf6c654be6ca63d0dac6bc39baa94162492 Mon Sep 17 00:00:00 2001 From: olicht <29300900+olicht@users.noreply.github.com> Date: Mon, 5 Nov 2018 12:04:27 +0200 Subject: [PATCH 077/540] Update year to 2018 --- tensorflow/core/common_runtime/session_options.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/common_runtime/session_options.cc b/tensorflow/core/common_runtime/session_options.cc index aacd57000cf..57c3b605575 100644 --- a/tensorflow/core/common_runtime/session_options.cc +++ b/tensorflow/core/common_runtime/session_options.cc @@ -1,4 +1,4 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. From 1fafcbb49d0a88d6db83bac6e43d425edf4bb2b2 Mon Sep 17 00:00:00 2001 From: olicht <29300900+olicht@users.noreply.github.com> Date: Mon, 5 Nov 2018 12:05:13 +0200 Subject: [PATCH 078/540] Update year to 2018 --- tensorflow/core/platform/env_time.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/platform/env_time.cc b/tensorflow/core/platform/env_time.cc index 76a227b69a1..10ba2abe7cb 100644 --- a/tensorflow/core/platform/env_time.cc +++ b/tensorflow/core/platform/env_time.cc @@ -1,4 +1,4 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
From 71ae7e1091bc423b83a32fb5b2a3d83d4226102d Mon Sep 17 00:00:00 2001 From: olicht <29300900+olicht@users.noreply.github.com> Date: Mon, 5 Nov 2018 12:05:55 +0200 Subject: [PATCH 079/540] Update year to 2018 --- tensorflow/contrib/copy_graph/python/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/copy_graph/python/__init__.py b/tensorflow/contrib/copy_graph/python/__init__.py index b9ff28eb0d7..5c1048e02a3 100644 --- a/tensorflow/contrib/copy_graph/python/__init__.py +++ b/tensorflow/contrib/copy_graph/python/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 6b3e231617ddcd86e79ca7286c99c1fb5407aba3 Mon Sep 17 00:00:00 2001 From: olicht <29300900+olicht@users.noreply.github.com> Date: Mon, 5 Nov 2018 12:06:51 +0200 Subject: [PATCH 080/540] Update year to 2018 --- tensorflow/contrib/copy_graph/python/util/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/copy_graph/python/util/__init__.py b/tensorflow/contrib/copy_graph/python/util/__init__.py index b9ff28eb0d7..5c1048e02a3 100644 --- a/tensorflow/contrib/copy_graph/python/util/__init__.py +++ b/tensorflow/contrib/copy_graph/python/util/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From e8668fd8d1106a2c2619522de718008c263e2f1a Mon Sep 17 00:00:00 2001 From: olicht <29300900+olicht@users.noreply.github.com> Date: Mon, 5 Nov 2018 12:07:33 +0200 Subject: [PATCH 081/540] Update year to 2018 --- tensorflow/tools/docker/LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/docker/LICENSE b/tensorflow/tools/docker/LICENSE index 28711d7885d..dea770e05ee 100644 --- a/tensorflow/tools/docker/LICENSE +++ b/tensorflow/tools/docker/LICENSE @@ -1,4 +1,4 @@ -Copyright 2015 The TensorFlow Authors. All rights reserved. +Copyright 2018 The TensorFlow Authors. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. From a32d44ba9ad4705eeb972726b6370e2a43eedd42 Mon Sep 17 00:00:00 2001 From: olicht <29300900+olicht@users.noreply.github.com> Date: Mon, 5 Nov 2018 12:08:16 +0200 Subject: [PATCH 082/540] Update year to 2018 --- tensorflow/tools/docker/notebooks/LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/docker/notebooks/LICENSE b/tensorflow/tools/docker/notebooks/LICENSE index 28711d7885d..dea770e05ee 100644 --- a/tensorflow/tools/docker/notebooks/LICENSE +++ b/tensorflow/tools/docker/notebooks/LICENSE @@ -1,4 +1,4 @@ -Copyright 2015 The TensorFlow Authors. All rights reserved. +Copyright 2018 The TensorFlow Authors. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. From b3050e86b8bc15d7c14ef645c0d5869c8b192013 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 5 Nov 2018 03:31:05 -0800 Subject: [PATCH 083/540] Require all reduce id to be always positive When it is serialized into the HLO proto there is no way to differentiate between no id specified and 0 specified as an id making the proto serialization lossy. 
PiperOrigin-RevId: 220071997 --- .../compiler/xla/service/hlo_parser_test.cc | 19 +++++++++++++++++++ .../compiler/xla/service/hlo_verifier.cc | 10 ++++++++++ .../xla/service/layout_assignment_test.cc | 4 ++-- 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc index eae6d19792f..c59bdc0a0b3 100644 --- a/tensorflow/compiler/xla/service/hlo_parser_test.cc +++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc @@ -1150,6 +1150,25 @@ ENTRY CrossReplicaSumWithSubgroups { ROOT cross-replica-sum = f32[128,32]{0,1} cross-replica-sum(input), replica_groups={{0,1},{2,3}}, barrier="abc", to_apply=add } +)" +}, +// cross-replica-sum with all-reduce-id +{ +"CrossReplicaSumAllReduce", +R"(HloModule CRS + +add { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) +} + +ENTRY CRS { + input = f32[8]{0} parameter(0) + crs.1 = f32[8]{0} cross-replica-sum(input), replica_groups={{0}}, all_reduce_id=1, to_apply=add + ROOT crs.0 = f32[8]{0} cross-replica-sum(input), replica_groups={{0}}, all_reduce_id=1, to_apply=add +} + )" }, // all-to-all diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 136824a3356..a2a6fb7c77e 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "absl/container/flat_hash_map.h" #include "absl/strings/str_join.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_verifier.h" @@ -1331,6 +1332,15 @@ class InstructionVerifier : public DfsHloVisitorWithDefault { return Status::OK(); } + Status HandleCrossReplicaSum(HloInstruction* crs) override { + if (crs->all_reduce_id().has_value()) { + TF_RET_CHECK(crs->all_reduce_id().value() > 0) + << "All reduce id must be greater than 0 for " + << crs->ToShortString(); + } + return Status::OK(); + } + Status Preprocess(HloInstruction* instruction) override { auto previous = instructions_by_name_.find(instruction->name()); TF_RET_CHECK(previous == instructions_by_name_.end()) diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc index 47bfca2fd6e..11c57682c11 100644 --- a/tensorflow/compiler/xla/service/layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc @@ -897,11 +897,11 @@ TEST_F(LayoutAssignmentTest, AllReduceLayoutMissmatch) { param = (f32[2,2]) parameter(0) gte = f32[2,2] get-tuple-element(param), index=0 ar.0 = f32[2,2] cross-replica-sum(gte), - all_reduce_id=0, replica_groups={{0}}, to_apply=add, + all_reduce_id=1, replica_groups={{0}}, to_apply=add, sharding={maximal device=0} const = f32[2,2] constant(f32[2,2]{{0,1},{2,3}}) ROOT ar.1 = f32[2,2] cross-replica-sum(const), - all_reduce_id=0, replica_groups={{0}}, to_apply=add, + all_reduce_id=1, replica_groups={{0}}, to_apply=add, sharding={maximal device=1} })"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, From de9f2e2fe2a2a544f6109649dbbb184272ff7dfa Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Mon, 5 Nov 2018 07:44:14 -0800 Subject: [PATCH 084/540] Internal change PiperOrigin-RevId: 220100213 --- tensorflow/c/eager/BUILD | 2 +- tensorflow/python/eager/BUILD | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index 3ee31a6a7ac..ba3d8533db7 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -69,7 +69,7 @@ tf_cuda_library( name = "c_api_internal", hdrs = ["c_api_internal.h"], visibility = [ - "//learning/deepmind/courier:__pkg__", + "//learning/deepmind/courier:__subpackages__", "//tensorflow:internal", ], deps = [ diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD index 99e3dc0de18..3b4fcd2d977 100644 --- a/tensorflow/python/eager/BUILD +++ b/tensorflow/python/eager/BUILD @@ -18,7 +18,7 @@ cc_library( "pywrap_tfe.h", ], visibility = [ - "//learning/deepmind/courier:__pkg__", + "//learning/deepmind/courier:__subpackages__", "//tensorflow:internal", ], deps = [ From 2bef68f68bb6dbbefee151ea853b92ac17bb5acc Mon Sep 17 00:00:00 2001 From: Thor Johnsen Date: Mon, 5 Nov 2018 08:02:04 -0800 Subject: [PATCH 085/540] Clang --- .../core/kernels/crop_resize_bilinear_core.h | 2413 +++++++++-------- 1 file changed, 1257 insertions(+), 1156 deletions(-) diff --git a/tensorflow/core/kernels/crop_resize_bilinear_core.h b/tensorflow/core/kernels/crop_resize_bilinear_core.h index c57131fd18c..892bdafb89f 100644 --- a/tensorflow/core/kernels/crop_resize_bilinear_core.h +++ b/tensorflow/core/kernels/crop_resize_bilinear_core.h @@ -37,8 +37,8 @@ namespace { // Compute the interpolation indices only once. 
struct CachedInterpolation { - int lower; // Lower source index used in the interpolation - int upper; // Upper source index used in the interpolation + int lower; // Lower source index used in the interpolation + int upper; // Upper source index used in the interpolation // 1-D linear iterpolation scale (see: // https://en.wikipedia.org/wiki/Bilinear_interpolation) float lerp; @@ -48,7 +48,7 @@ bool compute_single_interpolation_weight(const int in_size, const float out2in_scale, const float out2in_start, const bool clip, const int i, - int *lower, int *upper, float *lerp) { + int* lower, int* upper, float* lerp) { const float in = i * out2in_scale + out2in_start; *lower = (int)floor(in); *upper = (int)ceil(in); @@ -76,7 +76,7 @@ bool compute_single_interpolation_weight(const int in_size, bool compute_interpolation_weights(const int min_i, const int max_i, const int in_size, const float out2in_scale, const float out2in_start, const bool clip, - CachedInterpolation *interpolation) { + CachedInterpolation* interpolation) { bool rval = true; int num_i = max_i - min_i + 1; for (int i = 0; i < num_i; ++i) { @@ -94,15 +94,16 @@ bool compute_interpolation_weights(const int min_i, const int max_i, */ void compute_interpolation_weights(const int out_size, const int in_size, const float out2in_scale, - CachedInterpolation *interpolation) { + CachedInterpolation* interpolation) { interpolation[out_size].lower = 0; interpolation[out_size].upper = 0; const bool clip = true; if (!compute_interpolation_weights(0, out_size - 1, in_size, out2in_scale, 0.0f, clip, interpolation)) { // Should never happen, check for it anyway - printf("Warning! Interpolation values have lower,upper indexes outside of " - "range [0,in_size-1]\n"); + printf( + "Warning! 
Interpolation values have lower,upper indexes outside of " + "range [0,in_size-1]\n"); } } /** @@ -114,7 +115,7 @@ void compute_interpolation_weights(const int out_size, const int in_size, */ bool compute_minmax_indexes(const int out_size, const int in_size, const float out2in_scale, const float out2in_start, - int *min_i, int *max_i) { + int* min_i, int* max_i) { *min_i = out_size; *max_i = -1; int lower, upper; @@ -122,10 +123,8 @@ bool compute_minmax_indexes(const int out_size, const int in_size, for (int i = 0; i < out_size; ++i) { if (compute_single_interpolation_weight(in_size, out2in_scale, out2in_start, false, i, &lower, &upper, &lerp)) { - if (i < *min_i) - *min_i = i; - if (i > *max_i) - *max_i = i; + if (i < *min_i) *min_i = i; + if (i > *max_i) *max_i = i; } } return (*min_i <= *max_i) ? true : false; @@ -137,9 +136,9 @@ bool compute_minmax_indexes(const int out_size, const int in_size, */ bool compute_interpolation_weights( const int out_size, const int in_size, - const float x1, // lower bounding box, crop region starts at in_size*x1 - const float x2, // upper bounding box, crop region ends at in_size*x2 - int *min_i, int *max_i, std::vector *interpolation) { + const float x1, // lower bounding box, crop region starts at in_size*x1 + const float x2, // upper bounding box, crop region ends at in_size*x2 + int* min_i, int* max_i, std::vector* interpolation) { float out2in_start = out_size > 1 ? (float)(in_size - 1) * (float)x1 : (float)(in_size - 1) * (float)(x1 + x2) / 2.0f; @@ -207,24 +206,24 @@ float compute_lerp(const float top_left, const float top_right, * Optionally flips horizontal and/or vertical axis. 
*/ template -void crop_resize_single_image(const T *image, const int64 in_height, +void crop_resize_single_image(const T* image, const int64 in_height, const int64 in_width, const int64 out_height, const int64 out_width, const int channels, const int min_ix, const int max_ix, - const CachedInterpolation *xs, const int min_iy, - const int max_iy, const CachedInterpolation *ys, + const CachedInterpolation* xs, const int min_iy, + const int max_iy, const CachedInterpolation* ys, const float extrapolated_value, const bool flip_x, const bool flip_y, - U *output) TF_ATTRIBUTE_NOINLINE; + U* output) TF_ATTRIBUTE_NOINLINE; template -void crop_resize_single_image(const T *image, const int64 in_height, +void crop_resize_single_image(const T* image, const int64 in_height, const int64 in_width, const int64 out_height, const int64 out_width, const int channels, const int min_ix, const int max_ix, - const CachedInterpolation *xs, const int min_iy, - const int max_iy, const CachedInterpolation *ys, + const CachedInterpolation* xs, const int min_iy, + const int max_iy, const CachedInterpolation* ys, const float extrapolated_value, const bool flip_x, - const bool flip_y, U *output) { + const bool flip_y, U* output) { const int64 in_row_size = in_width * channels; const int64 out_row_size = out_width * channels; U u_min_val = std::numeric_limits::min(); @@ -235,24 +234,22 @@ void crop_resize_single_image(const T *image, const int64 in_height, cast_to(extrapolated_value, min_val, max_val, u_min_val, u_max_val); // low y extrapolation zone if (min_iy > 0) { - U *p = flip_y ? output + out_row_size * (out_height - min_iy) : output; + U* p = flip_y ? output + out_row_size * (out_height - min_iy) : output; int64 nn = out_row_size * (int64)min_iy; - for (int64 i = 0; i < nn; ++i) - p[i] = uEx; + for (int64 i = 0; i < nn; ++i) p[i] = uEx; } // high y extrapolation zone if (max_iy < out_height - 1) { - U *p = flip_y ? output : output + out_row_size * (max_iy + 1); + U* p = flip_y ? 
output : output + out_row_size * (max_iy + 1); int64 nn = out_row_size * (int64)(out_height - 1 - max_iy); - for (int64 i = 0; i < nn; ++i) - p[i] = uEx; + for (int64 i = 0; i < nn; ++i) p[i] = uEx; } // low x extrapolation zone if (min_ix > 0) { for (int iy = min_iy; iy <= max_iy; ++iy) { int xx0 = flip_x ? (out_width - min_ix) * channels : 0; int nxx = min_ix * channels; - U *p = output + xx0 + + U* p = output + xx0 + out_row_size * (int64)(flip_y ? out_height - 1 - iy : iy); for (int ix = 0; ix < nxx; ++ix) { p[ix] = uEx; @@ -264,22 +261,22 @@ void crop_resize_single_image(const T *image, const int64 in_height, for (int iy = min_iy; iy <= max_iy; ++iy) { int xx0 = flip_x ? 0 : (max_ix + 1) * channels; int nxx = (out_width - 1 - max_ix) * channels; - U *p = output + xx0 + + U* p = output + xx0 + out_row_size * (int64)(flip_y ? out_height - 1 - iy : iy); for (int ix = 0; ix < nxx; ++ix) { p[ix] = uEx; } } } - U *output_y_ptr = + U* output_y_ptr = output + out_row_size * (int64)(flip_y ? out_height - 1 - min_iy : min_iy); // interpolation zone if (channels == 1) { for (int y = min_iy; y <= max_iy; ++y) { const int iy = y - min_iy; - const T *ys_input_lower_ptr = image + ys[iy].lower * in_row_size; - const T *ys_input_upper_ptr = image + ys[iy].upper * in_row_size; + const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size; + const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size; const float ys_lerp = ys[iy].lerp; const int x0 = flip_x ? out_width - 1 - max_ix : min_ix; const int x1 = flip_x ? 
out_width - 1 - min_ix : max_ix; @@ -307,8 +304,8 @@ void crop_resize_single_image(const T *image, const int64 in_height, } else if (channels == 2) { for (int y = min_iy; y <= max_iy; ++y) { const int iy = y - min_iy; - const T *ys_input_lower_ptr = image + ys[iy].lower * in_row_size; - const T *ys_input_upper_ptr = image + ys[iy].upper * in_row_size; + const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size; + const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size; const float ys_lerp = ys[iy].lerp; const int x0 = flip_x ? out_width - 1 - max_ix : min_ix; const int x1 = flip_x ? out_width - 1 - min_ix : max_ix; @@ -346,8 +343,8 @@ void crop_resize_single_image(const T *image, const int64 in_height, } else if (channels == 3) { for (int y = min_iy; y <= max_iy; ++y) { const int iy = y - min_iy; - const T *ys_input_lower_ptr = image + ys[iy].lower * in_row_size; - const T *ys_input_upper_ptr = image + ys[iy].upper * in_row_size; + const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size; + const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size; const float ys_lerp = ys[iy].lerp; const int x0 = flip_x ? out_width - 1 - max_ix : min_ix; const int x1 = flip_x ? out_width - 1 - min_ix : max_ix; @@ -395,8 +392,8 @@ void crop_resize_single_image(const T *image, const int64 in_height, } else if (channels == 4) { for (int y = min_iy; y <= max_iy; ++y) { const int iy = y - min_iy; - const T *ys_input_lower_ptr = image + ys[iy].lower * in_row_size; - const T *ys_input_upper_ptr = image + ys[iy].upper * in_row_size; + const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size; + const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size; const float ys_lerp = ys[iy].lerp; const int x0 = flip_x ? out_width - 1 - max_ix : min_ix; const int x1 = flip_x ? 
out_width - 1 - min_ix : max_ix; @@ -454,8 +451,8 @@ void crop_resize_single_image(const T *image, const int64 in_height, } else { for (int y = min_iy; y <= max_iy; ++y) { const int iy = y - min_iy; - const T *ys_input_lower_ptr = image + ys[iy].lower * in_row_size; - const T *ys_input_upper_ptr = image + ys[iy].upper * in_row_size; + const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size; + const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size; const float ys_lerp = ys[iy].lerp; const int x0 = flip_x ? out_width - 1 - max_ix : min_ix; const int x1 = flip_x ? out_width - 1 - min_ix : max_ix; @@ -486,12 +483,12 @@ void crop_resize_single_image(const T *image, const int64 in_height, // machine you are running on template void crop_resize_single_image_common( - const T *image, const int64 in_height, const int64 in_width, + const T* image, const int64 in_height, const int64 in_width, const int64 out_height, const int64 out_width, const int channels, - const int min_ix, const int max_ix, const CachedInterpolation *xs, - const int min_iy, const int max_iy, const CachedInterpolation *ys, + const int min_ix, const int max_ix, const CachedInterpolation* xs, + const int min_iy, const int max_iy, const CachedInterpolation* ys, const float extrapolated_value, const bool flip_x, const bool flip_y, - U *output) TF_ATTRIBUTE_NOINLINE; + U* output) TF_ATTRIBUTE_NOINLINE; // For now, only compile vectorized code on LINUX systems. // to-do: Test vectorized code on other platforms (MacOS and Windows). @@ -518,8 +515,9 @@ void crop_resize_single_image_common( // Eigen::half, bfloat16 or float. // -template class VectorLoader { -public: +template +class VectorLoader { + public: #ifdef __AVX2__ // convert 8 packed words of type T to fp32. // T must be one of uint8, int8, uint16, int16, int32, Eigen::half, bfloat16 @@ -537,20 +535,20 @@ public: // separate 128 bit lanes. // input is stored in lower portion of 4 separate sse words, v0 through v3. 
// output is stored in lower portion of v0. - void pack_1ch(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); + void pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); // output is stored in lower portion of v0 and v1. - void pack_2ch(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); + void pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); // output is stored in lower portion of v0, v1 and v2. - void pack_3ch(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); + void pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); #else // pack 4 pixels with 1 channel, 2 channels and 3channels respectively. // input is stored in lower portion of 4 separate sse words, v0 through v3. // output is stored in lower portion of v0. - void pack_1ch(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); + void pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); // output is stored in lower portion of v0 and v1. - void pack_2ch(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); + void pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); // output is stored in lower portion of v0, v1 and v2. - void pack_3ch(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); + void pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); #endif #ifdef __AVX2__ @@ -574,8 +572,8 @@ public: // pixels have 1 channel. // load1 case, i.e. 4 left and right inputs are loaded with a single unaligned // SSE load. - void load1_1ch(const T *lower_ptr, const T *upper_ptr, int offset0, - const __m128i *shuffle_masks, __m256 *left0, __m256 *right0); + void load1_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m256* left0, __m256* right0); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -583,9 +581,9 @@ public: // pixels have 2 channels. // load1 case, i.e. 
4 left and right inputs are loaded with a single unaligned // SSE load. - void load1_2ch(const T *lower_ptr, const T *upper_ptr, int offset0, - const __m128i *shuffle_masks, __m256 *left0, __m256 *left1, - __m256 *right0, __m256 *right1); + void load1_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m256* left0, __m256* left1, + __m256* right0, __m256* right1); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -593,9 +591,9 @@ public: // pixels have 3 channels. // load1 case, i.e. 4 left and right inputs are loaded with a single unaligned // SSE load. - void load1_3ch(const T *lower_ptr, const T *upper_ptr, int offset0, - const __m128i *shuffle_masks, __m256 *left0, __m256 *left1, - __m256 *left2, __m256 *right0, __m256 *right1, __m256 *right2); + void load1_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m256* left0, __m256* left1, + __m256* left2, __m256* right0, __m256* right1, __m256* right2); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -603,10 +601,10 @@ public: // pixels have 4 channels. // load1 case, i.e. 4 left and right inputs are loaded with a single unaligned // SSE load. - void load1_4ch(const T *lower_ptr, const T *upper_ptr, int offset0, - const __m128i *shuffle_masks, __m256 *left0, __m256 *left1, - __m256 *left2, __m256 *left3, __m256 *right0, __m256 *right1, - __m256 *right2, __m256 *right3); + void load1_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m256* left0, __m256* left1, + __m256* left2, __m256* left3, __m256* right0, __m256* right1, + __m256* right2, __m256* right3); // load top left and bottom left interpolation inputs into output argument // left. 
// load top right and bottom right interpolation inputs into output argument @@ -614,8 +612,8 @@ public: // pixels have 1 channel. // load2 case, i.e. 4 left inputs are loaded with first SSE load and 4 right // inputs are loaded with second SSE load. - void load2_1ch(const T *lower_ptr, const T *upper_ptr, int offset0, - const __m128i *shuffle_masks, __m256 *left0, __m256 *right0); + void load2_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m256* left0, __m256* right0); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -623,9 +621,9 @@ public: // pixels have 2 channels. // load2 case, i.e. 4 left inputs are loaded with first SSE load and 4 right // inputs are loaded with second SSE load. - void load2_2ch(const T *lower_ptr, const T *upper_ptr, int offset0, - const __m128i *shuffle_masks, __m256 *left0, __m256 *left1, - __m256 *right0, __m256 *right1); + void load2_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m256* left0, __m256* left1, + __m256* right0, __m256* right1); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -633,9 +631,9 @@ public: // pixels have 3 channels. // load2 case, i.e. 4 left inputs are loaded with first SSE load and 4 right // inputs are loaded with second SSE load. 
- void load2_3ch(const T *lower_ptr, const T *upper_ptr, int offset0, - const __m128i *shuffle_masks, __m256 *left0, __m256 *left1, - __m256 *left2, __m256 *right0, __m256 *right1, __m256 *right2); + void load2_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m256* left0, __m256* left1, + __m256* left2, __m256* right0, __m256* right1, __m256* right2); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -643,10 +641,10 @@ public: // pixels have 4 channels. // load2 case, i.e. 4 left inputs are loaded with first SSE load and 4 right // inputs are loaded with second SSE load. - void load2_4ch(const T *lower_ptr, const T *upper_ptr, int offset0, - const __m128i *shuffle_masks, __m256 *left0, __m256 *left1, - __m256 *left2, __m256 *left3, __m256 *right0, __m256 *right1, - __m256 *right2, __m256 *right3); + void load2_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m256* left0, __m256* left1, + __m256* left2, __m256* left3, __m256* right0, __m256* right1, + __m256* right2, __m256* right3); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -654,9 +652,9 @@ public: // pixels have 1 channel. // load4 case, i.e. each pair of left and right inputs are loaded with a // separate SSE load. - void load4_1ch(const T *lower_ptr, const T *upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256 *left0, - __m256 *right0); + void load4_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256* left0, + __m256* right0); // load top left and bottom left interpolation inputs into output argument // left. 
// load top right and bottom right interpolation inputs into output argument @@ -664,9 +662,9 @@ public: // pixels have 2 channels. // load4 case, i.e. each pair of left and right inputs are loaded with a // separate SSE load. - void load4_2ch(const T *lower_ptr, const T *upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256 *left0, - __m256 *left1, __m256 *right0, __m256 *right1); + void load4_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256* left0, + __m256* left1, __m256* right0, __m256* right1); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -674,10 +672,10 @@ public: // pixels have 3 channels. // load4 case, i.e. each pair of left and right inputs are loaded with a // separate SSE load. - void load4_3ch(const T *lower_ptr, const T *upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256 *left0, - __m256 *left1, __m256 *left2, __m256 *right0, __m256 *right1, - __m256 *right2); + void load4_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256* left0, + __m256* left1, __m256* left2, __m256* right0, __m256* right1, + __m256* right2); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -685,10 +683,10 @@ public: // pixels have 4 channels. // load4 case, i.e. each pair of left and right inputs are loaded with a // separate SSE load. 
- void load4_4ch(const T *lower_ptr, const T *upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256 *left0, - __m256 *left1, __m256 *left2, __m256 *left3, __m256 *right0, - __m256 *right1, __m256 *right2, __m256 *right3); + void load4_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256* left0, + __m256* left1, __m256* left2, __m256* left3, __m256* right0, + __m256* right1, __m256* right2, __m256* right3); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -697,9 +695,9 @@ public: // load8 case, i.e. each input is loaded with a separate SSE load. // 4 pixels, each with left and right input necessitates 8 separate SSE loads // per input row. - void load8_1ch(const T *lower_ptr, const T *upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256 *left0, - __m256 *right0); + void load8_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256* left0, + __m256* right0); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -708,9 +706,9 @@ public: // load8 case, i.e. each input is loaded with a separate SSE load. // 4 pixels, each with left and right input necessitates 8 separate SSE loads // per input row. - void load8_2ch(const T *lower_ptr, const T *upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256 *left0, - __m256 *left1, __m256 *right0, __m256 *right1); + void load8_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256* left0, + __m256* left1, __m256* right0, __m256* right1); // load top left and bottom left interpolation inputs into output argument // left. 
// load top right and bottom right interpolation inputs into output argument @@ -719,10 +717,10 @@ public: // load8 case, i.e. each input is loaded with a separate SSE load. // 4 pixels, each with left and right input necessitates 8 separate SSE loads // per input row. - void load8_3ch(const T *lower_ptr, const T *upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256 *left0, - __m256 *left1, __m256 *left2, __m256 *right0, __m256 *right1, - __m256 *right2); + void load8_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256* left0, + __m256* left1, __m256* left2, __m256* right0, __m256* right1, + __m256* right2); // load top left and bottom left interpolation inputs into output argument // left. // load top right and bottom right interpolation inputs into output argument @@ -731,10 +729,10 @@ public: // load8 case, i.e. each input is loaded with a separate SSE load. // 4 pixels, each with left and right input necessitates 8 separate SSE loads // per input row. - void load8_4ch(const T *lower_ptr, const T *upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256 *left0, - __m256 *left1, __m256 *left2, __m256 *left3, __m256 *right0, - __m256 *right1, __m256 *right2, __m256 *right3); + void load8_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m256* left0, + __m256* left1, __m256* left2, __m256* left3, __m256* right0, + __m256* right1, __m256* right2, __m256* right3); #else // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. @@ -743,9 +741,9 @@ public: // pixels have 1 channel. // load1 case, i.e. all inputs for one input row are loaded with a single SSE // load. 
- void load1_1ch(const T *lower_ptr, const T *upper_ptr, int offset0, - const __m128i *shuffle_masks, __m128 *tl0, __m128 *bl0, - __m128 *tr0, __m128 *br0); + void load1_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m128* tl0, __m128* bl0, + __m128* tr0, __m128* br0); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -753,10 +751,10 @@ public: // pixels have 2 channels. // load1 case, i.e. all inputs for one input row are loaded with a single SSE // load. - void load1_2ch(const T *lower_ptr, const T *upper_ptr, int offset0, - const __m128i *shuffle_masks, __m128 *tl0, __m128 *tl1, - __m128 *bl0, __m128 *bl1, __m128 *tr0, __m128 *tr1, - __m128 *br0, __m128 *br1); + void load1_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, + __m128* bl0, __m128* bl1, __m128* tr0, __m128* tr1, + __m128* br0, __m128* br1); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -764,11 +762,11 @@ public: // pixels have 3 channels. // load1 case, i.e. all inputs for one input row are loaded with a single SSE // load. 
- void load1_3ch(const T *lower_ptr, const T *upper_ptr, int offset0, - const __m128i *shuffle_masks, __m128 *tl0, __m128 *tl1, - __m128 *tl2, __m128 *bl0, __m128 *bl1, __m128 *bl2, - __m128 *tr0, __m128 *tr1, __m128 *tr2, __m128 *br0, - __m128 *br1, __m128 *br2); + void load1_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, + __m128* tl2, __m128* bl0, __m128* bl1, __m128* bl2, + __m128* tr0, __m128* tr1, __m128* tr2, __m128* br0, + __m128* br1, __m128* br2); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -776,12 +774,12 @@ public: // pixels have 4 channels. // load1 case, i.e. all inputs for one input row are loaded with a single SSE // load. - void load1_4ch(const T *lower_ptr, const T *upper_ptr, int offset0, - const __m128i *shuffle_masks, __m128 *tl0, __m128 *tl1, - __m128 *tl2, __m128 *tl3, __m128 *bl0, __m128 *bl1, - __m128 *bl2, __m128 *bl3, __m128 *tr0, __m128 *tr1, - __m128 *tr2, __m128 *tr3, __m128 *br0, __m128 *br1, - __m128 *br2, __m128 *br3); + void load1_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, + __m128* tl2, __m128* tl3, __m128* bl0, __m128* bl1, + __m128* bl2, __m128* bl3, __m128* tr0, __m128* tr1, + __m128* tr2, __m128* tr3, __m128* br0, __m128* br1, + __m128* br2, __m128* br3); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -789,9 +787,9 @@ public: // pixels have 1 channel. // load2 case, i.e. left inputs are loaded with first SSE load, right inputs // are loaded with second SSE load. 
- void load2_1ch(const T *lower_ptr, const T *upper_ptr, int offset0, - const __m128i *shuffle_masks, __m128 *tl0, __m128 *bl0, - __m128 *tr0, __m128 *br0); + void load2_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m128* tl0, __m128* bl0, + __m128* tr0, __m128* br0); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -799,10 +797,10 @@ public: // pixels have 2 channels. // load2 case, i.e. left inputs are loaded with first SSE load, right inputs // are loaded with second SSE load. - void load2_2ch(const T *lower_ptr, const T *upper_ptr, int offset0, - const __m128i *shuffle_masks, __m128 *tl0, __m128 *tl1, - __m128 *bl0, __m128 *bl1, __m128 *tr0, __m128 *tr1, - __m128 *br0, __m128 *br1); + void load2_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, + __m128* bl0, __m128* bl1, __m128* tr0, __m128* tr1, + __m128* br0, __m128* br1); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -810,11 +808,11 @@ public: // pixels have 3 channels. // load2 case, i.e. left inputs are loaded with first SSE load, right inputs // are loaded with second SSE load. 
- void load2_3ch(const T *lower_ptr, const T *upper_ptr, int offset0, - const __m128i *shuffle_masks, __m128 *tl0, __m128 *tl1, - __m128 *tl2, __m128 *bl0, __m128 *bl1, __m128 *bl2, - __m128 *tr0, __m128 *tr1, __m128 *tr2, __m128 *br0, - __m128 *br1, __m128 *br2); + void load2_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, + __m128* tl2, __m128* bl0, __m128* bl1, __m128* bl2, + __m128* tr0, __m128* tr1, __m128* tr2, __m128* br0, + __m128* br1, __m128* br2); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -822,12 +820,12 @@ public: // pixels have 4 channels. // load2 case, i.e. left inputs are loaded with first SSE load, right inputs // are loaded with second SSE load. - void load2_4ch(const T *lower_ptr, const T *upper_ptr, int offset0, - const __m128i *shuffle_masks, __m128 *tl0, __m128 *tl1, - __m128 *tl2, __m128 *tl3, __m128 *bl0, __m128 *bl1, - __m128 *bl2, __m128 *bl3, __m128 *tr0, __m128 *tr1, - __m128 *tr2, __m128 *tr3, __m128 *br0, __m128 *br1, - __m128 *br2, __m128 *br3); + void load2_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, + const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, + __m128* tl2, __m128* tl3, __m128* bl0, __m128* bl1, + __m128* bl2, __m128* bl3, __m128* tr0, __m128* tr1, + __m128* tr2, __m128* tr3, __m128* br0, __m128* br1, + __m128* br2, __m128* br3); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -835,9 +833,9 @@ public: // pixels have 1 channel. // load4 case, i.e. left and right inputs are loaded with a separate SSE load // for each pixel. 
- void load4_1ch(const T *lower_ptr, const T *upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128 *tl0, - __m128 *bl0, __m128 *tr0, __m128 *br0); + void load4_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128* tl0, + __m128* bl0, __m128* tr0, __m128* br0); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -845,10 +843,10 @@ public: // pixels have 2 channels. // load4 case, i.e. left and right inputs are loaded with a separate SSE load // for each pixel. - void load4_2ch(const T *lower_ptr, const T *upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128 *tl0, - __m128 *tl1, __m128 *bl0, __m128 *bl1, __m128 *tr0, - __m128 *tr1, __m128 *br0, __m128 *br1); + void load4_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128* tl0, + __m128* tl1, __m128* bl0, __m128* bl1, __m128* tr0, + __m128* tr1, __m128* br0, __m128* br1); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -856,11 +854,11 @@ public: // pixels have 3 channels. // load4 case, i.e. left and right inputs are loaded with a separate SSE load // for each pixel. 
- void load4_3ch(const T *lower_ptr, const T *upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128 *tl0, - __m128 *tl1, __m128 *tl2, __m128 *bl0, __m128 *bl1, - __m128 *bl2, __m128 *tr0, __m128 *tr1, __m128 *tr2, - __m128 *br0, __m128 *br1, __m128 *br2); + void load4_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128* tl0, + __m128* tl1, __m128* tl2, __m128* bl0, __m128* bl1, + __m128* bl2, __m128* tr0, __m128* tr1, __m128* tr2, + __m128* br0, __m128* br1, __m128* br2); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -868,12 +866,12 @@ public: // pixels have 4 channels. // load4 case, i.e. left and right inputs are loaded with a separate SSE load // for each pixel. - void load4_4ch(const T *lower_ptr, const T *upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128 *tl0, - __m128 *tl1, __m128 *tl2, __m128 *tl3, __m128 *bl0, - __m128 *bl1, __m128 *bl2, __m128 *bl3, __m128 *tr0, - __m128 *tr1, __m128 *tr2, __m128 *tr3, __m128 *br0, - __m128 *br1, __m128 *br2, __m128 *br3); + void load4_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128* tl0, + __m128* tl1, __m128* tl2, __m128* tl3, __m128* bl0, + __m128* bl1, __m128* bl2, __m128* bl3, __m128* tr0, + __m128* tr1, __m128* tr2, __m128* tr3, __m128* br0, + __m128* br1, __m128* br2, __m128* br3); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -881,9 +879,9 @@ public: // pixels have 1 channel. // load8 case, i.e. left and right inputs are loaded with separate SSE loads // for each pixel. 
- void load8_1ch(const T *lower_ptr, const T *upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128 *tl0, - __m128 *bl0, __m128 *tr0, __m128 *br0); + void load8_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128* tl0, + __m128* bl0, __m128* tr0, __m128* br0); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -891,10 +889,10 @@ public: // pixels have 2 channels. // load8 case, i.e. left and right inputs are loaded with separate SSE loads // for each pixel. - void load8_2ch(const T *lower_ptr, const T *upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128 *tl0, - __m128 *tl1, __m128 *bl0, __m128 *bl1, __m128 *tr0, - __m128 *tr1, __m128 *br0, __m128 *br1); + void load8_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128* tl0, + __m128* tl1, __m128* bl0, __m128* bl1, __m128* tr0, + __m128* tr1, __m128* br0, __m128* br1); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -902,11 +900,11 @@ public: // pixels have 3 channels. // load8 case, i.e. left and right inputs are loaded with separate SSE loads // for each pixel. 
- void load8_3ch(const T *lower_ptr, const T *upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128 *tl0, - __m128 *tl1, __m128 *tl2, __m128 *bl0, __m128 *bl1, - __m128 *bl2, __m128 *tr0, __m128 *tr1, __m128 *tr2, - __m128 *br0, __m128 *br1, __m128 *br2); + void load8_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128* tl0, + __m128* tl1, __m128* tl2, __m128* bl0, __m128* bl1, + __m128* bl2, __m128* tr0, __m128* tr1, __m128* tr2, + __m128* br0, __m128* br1, __m128* br2); // load top left interpolation inputs into output argument tl. // load bottom left interpolation inputs into output argument bl. // load top right interpolation inputs into output argument tr. @@ -914,48 +912,48 @@ public: // pixels have 4 channels. // load8 case, i.e. left and right inputs are loaded with separate SSE loads // for each pixel. - void load8_4ch(const T *lower_ptr, const T *upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128 *tl0, - __m128 *tl1, __m128 *tl2, __m128 *tl3, __m128 *bl0, - __m128 *bl1, __m128 *bl2, __m128 *bl3, __m128 *tr0, - __m128 *tr1, __m128 *tr2, __m128 *tr3, __m128 *br0, - __m128 *br1, __m128 *br2, __m128 *br3); + void load8_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, + int offset1, int offset2, int offset3, __m128* tl0, + __m128* tl1, __m128* tl2, __m128* tl3, __m128* bl0, + __m128* bl1, __m128* bl2, __m128* bl3, __m128* tr0, + __m128* tr1, __m128* tr2, __m128* tr3, __m128* br0, + __m128* br1, __m128* br2, __m128* br3); #endif // there is no method that packs 4 pixels with 4 channel into four sse words. // nothing to do for this case, everything is already in the right position. -private: + private: // helper methods #ifdef __AVX2__ // pack 4 pixels with 1, 2, 3 or 4 channels into lower portion of SSE vector // word. // works within SSE lanes. // sizeof(sample_data_type) can be 1, 2 or 4 bytes. 
- void pack4_1b_1ch_(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); - void pack4_2b_1ch_(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); - void pack4_4b_1ch_(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); - void pack4_1b_2ch_(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); - void pack4_2b_2ch_(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); - void pack4_4b_2ch_(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); - void pack4_1b_3ch_(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); - void pack4_2b_3ch_(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); - void pack4_4b_3ch_(__m256i *v0, __m256i *v1, __m256i *v2, __m256i *v3); + void pack4_1b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); + void pack4_2b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); + void pack4_4b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); + void pack4_1b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); + void pack4_2b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); + void pack4_4b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); + void pack4_1b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); + void pack4_2b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); + void pack4_4b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); // there is no pack4_xx_4ch functions because none is needed. // all the bytes are loaded in the right spots for this case. #else // pack 4 pixels with 1, 2, 3 or 4 channels into lower portion of SSE vector // word. // sizeof(sample_data_type) can be 1, 2 or 4 bytes. 
- void pack4_1b_1ch_(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); - void pack4_2b_1ch_(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); - void pack4_4b_1ch_(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); - void pack4_1b_2ch_(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); - void pack4_2b_2ch_(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); - void pack4_4b_2ch_(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); - void pack4_1b_3ch_(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); - void pack4_2b_3ch_(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); - void pack4_4b_3ch_(__m128i *v0, __m128i *v1, __m128i *v2, __m128i *v3); + void pack4_1b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); + void pack4_2b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); + void pack4_4b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); + void pack4_1b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); + void pack4_2b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); + void pack4_4b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); + void pack4_1b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); + void pack4_2b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); + void pack4_4b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); #endif #ifdef __AVX2__ __m256i extract_right_1b_(const __m256i left); @@ -976,8 +974,8 @@ private: #ifdef __AVX2__ template -void VectorLoader::pack4_1b_1ch_(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack4_1b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { *v3 = _mm256_slli_si256(*v3, 3); __m256i and_mask = _mm256_setr_epi32(255, 0, 0, 0, 255, 0, 0, 0); *v2 = _mm256_or_si256(*v3, @@ -987,8 +985,8 @@ void VectorLoader::pack4_1b_1ch_(__m256i *v0, __m256i *v1, __m256i *v2, *v0 = _mm256_or_si256(*v1, _mm256_and_si256(and_mask, *v0)); } template -void VectorLoader::pack4_2b_1ch_(__m256i *v0, __m256i *v1, __m256i *v2, 
- __m256i *v3) { +void VectorLoader::pack4_2b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { *v3 = _mm256_slli_si256(*v3, 6); __m256i and_mask = _mm256_setr_epi32(65535, 0, 0, 0, 65535, 0, 0, 0); *v2 = _mm256_or_si256(*v3, @@ -998,8 +996,8 @@ void VectorLoader::pack4_2b_1ch_(__m256i *v0, __m256i *v1, __m256i *v2, *v0 = _mm256_or_si256(*v1, _mm256_and_si256(and_mask, *v0)); } template -void VectorLoader::pack4_4b_1ch_(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack4_4b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { *v3 = _mm256_slli_si256(*v3, 12); __m256i and_mask = _mm256_setr_epi32(-1, 0, 0, 0, -1, 0, 0, 0); *v2 = _mm256_or_si256(*v3, @@ -1010,8 +1008,8 @@ void VectorLoader::pack4_4b_1ch_(__m256i *v0, __m256i *v1, __m256i *v2, } template -void VectorLoader::pack4_1b_2ch_(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack4_1b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { __m256i and_mask = _mm256_setr_epi32(65535, 0, 0, 0, 65535, 0, 0, 0); *v0 = _mm256_or_si256(_mm256_and_si256(*v0, and_mask), _mm256_slli_si256(*v1, 2)); @@ -1019,8 +1017,8 @@ void VectorLoader::pack4_1b_2ch_(__m256i *v0, __m256i *v1, __m256i *v2, _mm256_slli_si256(*v3, 2)); } template -void VectorLoader::pack4_2b_2ch_(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack4_2b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { __m256i and_mask = _mm256_setr_epi32(-1, 0, 0, 0, -1, 0, 0, 0); *v0 = _mm256_or_si256(_mm256_and_si256(*v0, and_mask), _mm256_slli_si256(*v1, 4)); @@ -1028,8 +1026,8 @@ void VectorLoader::pack4_2b_2ch_(__m256i *v0, __m256i *v1, __m256i *v2, _mm256_slli_si256(*v3, 4)); } template -void VectorLoader::pack4_4b_2ch_(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack4_4b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { __m256i and_mask = _mm256_setr_epi32(-1, -1, 0, 0, -1, -1, 0, 0); *v0 = 
_mm256_or_si256(_mm256_and_si256(*v0, and_mask), _mm256_slli_si256(*v1, 8)); @@ -1038,8 +1036,8 @@ void VectorLoader::pack4_4b_2ch_(__m256i *v0, __m256i *v1, __m256i *v2, } template -void VectorLoader::pack4_1b_3ch_(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack4_1b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { __m256i and_mask = _mm256_setr_epi32(16777215, 0, 0, 0, 16777215, 0, 0, 0); *v0 = _mm256_or_si256(_mm256_and_si256(*v0, and_mask), _mm256_slli_si256(*v1, 3)); @@ -1051,8 +1049,8 @@ void VectorLoader::pack4_1b_3ch_(__m256i *v0, __m256i *v1, __m256i *v2, _mm256_slli_si256(*v3, 1)); } template -void VectorLoader::pack4_2b_3ch_(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack4_2b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { __m256i and_mask = _mm256_setr_epi32(-1, 65535, 0, 0, -1, 65535, 0, 0); *v0 = _mm256_or_si256(_mm256_and_si256(*v0, and_mask), _mm256_slli_si256(*v1, 6)); @@ -1064,8 +1062,8 @@ void VectorLoader::pack4_2b_3ch_(__m256i *v0, __m256i *v1, __m256i *v2, _mm256_slli_si256(*v3, 2)); } template -void VectorLoader::pack4_4b_3ch_(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack4_4b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { __m256i and_mask = _mm256_setr_epi32(-1, -1, -1, 0, -1, -1, -1, 0); *v0 = _mm256_or_si256(_mm256_and_si256(*v0, and_mask), _mm256_slli_si256(*v1, 12)); @@ -1078,131 +1076,131 @@ void VectorLoader::pack4_4b_3ch_(__m256i *v0, __m256i *v1, __m256i *v2, } template <> -void VectorLoader::pack_1ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_1b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_1b_1ch_(v0, v1, v2, v3); } template <> 
-void VectorLoader::pack_1ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_2b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_2b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_4b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_2b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_2b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_4b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_1b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_1b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_2b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m256i *v0, __m256i *v1, __m256i 
*v2, - __m256i *v3) { +void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_2b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_4b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_2b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_2b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_4b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_1b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_1b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_2b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_2b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_3ch(__m256i* v0, 
__m256i* v1, __m256i* v2, + __m256i* v3) { pack4_4b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_2b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_2b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m256i *v0, __m256i *v1, __m256i *v2, - __m256i *v3) { +void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, + __m256i* v3) { pack4_4b_3ch_(v0, v1, v2, v3); } #else template -void VectorLoader::pack4_1b_1ch_(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack4_1b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { *v3 = _mm_slli_si128(*v3, 3); __m128i and_mask = _mm_setr_epi32(255, 0, 0, 0); *v2 = _mm_or_si128(*v3, _mm_slli_si128(_mm_and_si128(and_mask, *v2), 2)); @@ -1210,8 +1208,8 @@ void VectorLoader::pack4_1b_1ch_(__m128i *v0, __m128i *v1, __m128i *v2, *v0 = _mm_or_si128(*v1, _mm_and_si128(and_mask, *v0)); } template -void VectorLoader::pack4_2b_1ch_(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack4_2b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { *v3 = _mm_slli_si128(*v3, 6); __m128i and_mask = _mm_setr_epi32(65535, 0, 0, 0); *v2 = _mm_or_si128(*v3, _mm_slli_si128(_mm_and_si128(and_mask, *v2), 4)); @@ -1219,8 +1217,8 @@ void VectorLoader::pack4_2b_1ch_(__m128i *v0, __m128i *v1, __m128i *v2, *v0 = _mm_or_si128(*v1, _mm_and_si128(and_mask, *v0)); } template -void VectorLoader::pack4_4b_1ch_(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack4_4b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { *v3 = _mm_slli_si128(*v3, 12); __m128i and_mask = _mm_setr_epi32(-1, 0, 0, 0); *v2 = 
_mm_or_si128(*v3, _mm_slli_si128(_mm_and_si128(and_mask, *v2), 8)); @@ -1228,29 +1226,29 @@ void VectorLoader::pack4_4b_1ch_(__m128i *v0, __m128i *v1, __m128i *v2, *v0 = _mm_or_si128(*v1, _mm_and_si128(and_mask, *v0)); } template -void VectorLoader::pack4_1b_2ch_(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack4_1b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { __m128i and_mask = _mm_setr_epi32(65535, 0, 0, 0); *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 2)); *v1 = _mm_or_si128(_mm_and_si128(*v2, and_mask), _mm_slli_si128(*v3, 2)); } template -void VectorLoader::pack4_2b_2ch_(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack4_2b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { __m128i and_mask = _mm_setr_epi32(-1, 0, 0, 0); *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 4)); *v1 = _mm_or_si128(_mm_and_si128(*v2, and_mask), _mm_slli_si128(*v3, 4)); } template -void VectorLoader::pack4_4b_2ch_(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack4_4b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { __m128i and_mask = _mm_setr_epi32(-1, -1, 0, 0); *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 8)); *v1 = _mm_or_si128(_mm_and_si128(*v2, and_mask), _mm_slli_si128(*v3, 8)); } template -void VectorLoader::pack4_1b_3ch_(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack4_1b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { __m128i and_mask = _mm_setr_epi32(16777215, 0, 0, 0); *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 3)); and_mask = _mm_srli_si128(and_mask, 1); @@ -1261,8 +1259,8 @@ void VectorLoader::pack4_1b_3ch_(__m128i *v0, __m128i *v1, __m128i *v2, _mm_slli_si128(*v3, 1)); } template -void VectorLoader::pack4_2b_3ch_(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void 
VectorLoader::pack4_2b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { __m128i and_mask = _mm_setr_epi32(-1, 65535, 0, 0); *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 6)); and_mask = _mm_srli_si128(and_mask, 2); @@ -1273,8 +1271,8 @@ void VectorLoader::pack4_2b_3ch_(__m128i *v0, __m128i *v1, __m128i *v2, _mm_slli_si128(*v3, 2)); } template -void VectorLoader::pack4_4b_3ch_(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack4_4b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { __m128i and_mask = _mm_setr_epi32(-1, -1, -1, 0); *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 12)); and_mask = _mm_srli_si128(and_mask, 4); @@ -1286,144 +1284,148 @@ void VectorLoader::pack4_4b_3ch_(__m128i *v0, __m128i *v1, __m128i *v2, } template <> -void VectorLoader::pack_1ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_1b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_1b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_2b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_2b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_4b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { 
+void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_2b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_2b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_1ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_4b_1ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_1b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_1b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_2b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_2b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_4b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_2b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* 
v2, + __m128i* v3) { pack4_2b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_2ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_4b_2ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_1b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_1b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_2b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_2b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_4b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_2b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_2b_3ch_(v0, v1, v2, v3); } template <> -void VectorLoader::pack_3ch(__m128i *v0, __m128i *v1, __m128i *v2, - __m128i *v3) { +void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, + __m128i* v3) { pack4_4b_3ch_(v0, v1, v2, v3); } #endif 
#ifdef __AVX2__ -template <> __m256i VectorLoader::extract_right_1ch(const __m256i left) { +template <> +__m256i VectorLoader::extract_right_1ch(const __m256i left) { return extract_right_1b_(left); } -template <> __m256i VectorLoader::extract_right_1ch(const __m256i left) { +template <> +__m256i VectorLoader::extract_right_1ch(const __m256i left) { return extract_right_1b_(left); } template <> __m256i VectorLoader::extract_right_1ch(const __m256i left) { return extract_right_2b_(left); } -template <> __m256i VectorLoader::extract_right_1ch(const __m256i left) { +template <> +__m256i VectorLoader::extract_right_1ch(const __m256i left) { return extract_right_2b_(left); } -template <> __m256i VectorLoader::extract_right_1ch(const __m256i left) { +template <> +__m256i VectorLoader::extract_right_1ch(const __m256i left) { return extract_right_4b_(left); } template <> @@ -1434,24 +1436,29 @@ template <> __m256i VectorLoader::extract_right_1ch(const __m256i left) { return extract_right_2b_(left); } -template <> __m256i VectorLoader::extract_right_1ch(const __m256i left) { +template <> +__m256i VectorLoader::extract_right_1ch(const __m256i left) { return extract_right_4b_(left); } -template <> __m256i VectorLoader::extract_right_2ch(const __m256i left) { +template <> +__m256i VectorLoader::extract_right_2ch(const __m256i left) { return extract_right_2b_(left); } -template <> __m256i VectorLoader::extract_right_2ch(const __m256i left) { +template <> +__m256i VectorLoader::extract_right_2ch(const __m256i left) { return extract_right_2b_(left); } template <> __m256i VectorLoader::extract_right_2ch(const __m256i left) { return extract_right_4b_(left); } -template <> __m256i VectorLoader::extract_right_2ch(const __m256i left) { +template <> +__m256i VectorLoader::extract_right_2ch(const __m256i left) { return extract_right_4b_(left); } -template <> __m256i VectorLoader::extract_right_2ch(const __m256i left) { +template <> +__m256i VectorLoader::extract_right_2ch(const __m256i 
left) { return extract_right_8b_(left); } template <> @@ -1462,24 +1469,29 @@ template <> __m256i VectorLoader::extract_right_2ch(const __m256i left) { return extract_right_4b_(left); } -template <> __m256i VectorLoader::extract_right_2ch(const __m256i left) { +template <> +__m256i VectorLoader::extract_right_2ch(const __m256i left) { return extract_right_8b_(left); } -template <> __m256i VectorLoader::extract_right_3ch(const __m256i left) { +template <> +__m256i VectorLoader::extract_right_3ch(const __m256i left) { return extract_right_3b_(left); } -template <> __m256i VectorLoader::extract_right_3ch(const __m256i left) { +template <> +__m256i VectorLoader::extract_right_3ch(const __m256i left) { return extract_right_3b_(left); } template <> __m256i VectorLoader::extract_right_3ch(const __m256i left) { return extract_right_6b_(left); } -template <> __m256i VectorLoader::extract_right_3ch(const __m256i left) { +template <> +__m256i VectorLoader::extract_right_3ch(const __m256i left) { return extract_right_6b_(left); } -template <> __m256i VectorLoader::extract_right_3ch(const __m256i left) { +template <> +__m256i VectorLoader::extract_right_3ch(const __m256i left) { assert(false); } template <> @@ -1490,24 +1502,29 @@ template <> __m256i VectorLoader::extract_right_3ch(const __m256i left) { return extract_right_6b_(left); } -template <> __m256i VectorLoader::extract_right_3ch(const __m256i left) { +template <> +__m256i VectorLoader::extract_right_3ch(const __m256i left) { assert(false); } -template <> __m256i VectorLoader::extract_right_4ch(const __m256i left) { +template <> +__m256i VectorLoader::extract_right_4ch(const __m256i left) { return extract_right_4b_(left); } -template <> __m256i VectorLoader::extract_right_4ch(const __m256i left) { +template <> +__m256i VectorLoader::extract_right_4ch(const __m256i left) { return extract_right_4b_(left); } template <> __m256i VectorLoader::extract_right_4ch(const __m256i left) { return extract_right_8b_(left); } 
-template <> __m256i VectorLoader::extract_right_4ch(const __m256i left) { +template <> +__m256i VectorLoader::extract_right_4ch(const __m256i left) { return extract_right_8b_(left); } -template <> __m256i VectorLoader::extract_right_4ch(const __m256i left) { +template <> +__m256i VectorLoader::extract_right_4ch(const __m256i left) { assert(false); } template <> @@ -1518,24 +1535,29 @@ template <> __m256i VectorLoader::extract_right_4ch(const __m256i left) { return extract_right_8b_(left); } -template <> __m256i VectorLoader::extract_right_4ch(const __m256i left) { +template <> +__m256i VectorLoader::extract_right_4ch(const __m256i left) { assert(false); } #else -template <> __m128i VectorLoader::extract_right_1ch(const __m128i left) { +template <> +__m128i VectorLoader::extract_right_1ch(const __m128i left) { return extract_right_1b_(left); } -template <> __m128i VectorLoader::extract_right_1ch(const __m128i left) { +template <> +__m128i VectorLoader::extract_right_1ch(const __m128i left) { return extract_right_1b_(left); } template <> __m128i VectorLoader::extract_right_1ch(const __m128i left) { return extract_right_2b_(left); } -template <> __m128i VectorLoader::extract_right_1ch(const __m128i left) { +template <> +__m128i VectorLoader::extract_right_1ch(const __m128i left) { return extract_right_2b_(left); } -template <> __m128i VectorLoader::extract_right_1ch(const __m128i left) { +template <> +__m128i VectorLoader::extract_right_1ch(const __m128i left) { return extract_right_4b_(left); } template <> @@ -1546,24 +1568,29 @@ template <> __m128i VectorLoader::extract_right_1ch(const __m128i left) { return extract_right_2b_(left); } -template <> __m128i VectorLoader::extract_right_1ch(const __m128i left) { +template <> +__m128i VectorLoader::extract_right_1ch(const __m128i left) { return extract_right_4b_(left); } -template <> __m128i VectorLoader::extract_right_2ch(const __m128i left) { +template <> +__m128i VectorLoader::extract_right_2ch(const __m128i left) { 
return extract_right_2b_(left); } -template <> __m128i VectorLoader::extract_right_2ch(const __m128i left) { +template <> +__m128i VectorLoader::extract_right_2ch(const __m128i left) { return extract_right_2b_(left); } template <> __m128i VectorLoader::extract_right_2ch(const __m128i left) { return extract_right_4b_(left); } -template <> __m128i VectorLoader::extract_right_2ch(const __m128i left) { +template <> +__m128i VectorLoader::extract_right_2ch(const __m128i left) { return extract_right_4b_(left); } -template <> __m128i VectorLoader::extract_right_2ch(const __m128i left) { +template <> +__m128i VectorLoader::extract_right_2ch(const __m128i left) { return extract_right_8b_(left); } template <> @@ -1574,24 +1601,29 @@ template <> __m128i VectorLoader::extract_right_2ch(const __m128i left) { return extract_right_4b_(left); } -template <> __m128i VectorLoader::extract_right_2ch(const __m128i left) { +template <> +__m128i VectorLoader::extract_right_2ch(const __m128i left) { return extract_right_8b_(left); } -template <> __m128i VectorLoader::extract_right_3ch(const __m128i left) { +template <> +__m128i VectorLoader::extract_right_3ch(const __m128i left) { return extract_right_3b_(left); } -template <> __m128i VectorLoader::extract_right_3ch(const __m128i left) { +template <> +__m128i VectorLoader::extract_right_3ch(const __m128i left) { return extract_right_3b_(left); } template <> __m128i VectorLoader::extract_right_3ch(const __m128i left) { return extract_right_6b_(left); } -template <> __m128i VectorLoader::extract_right_3ch(const __m128i left) { +template <> +__m128i VectorLoader::extract_right_3ch(const __m128i left) { return extract_right_6b_(left); } -template <> __m128i VectorLoader::extract_right_3ch(const __m128i left) { +template <> +__m128i VectorLoader::extract_right_3ch(const __m128i left) { assert(false); } template <> @@ -1602,24 +1634,29 @@ template <> __m128i VectorLoader::extract_right_3ch(const __m128i left) { return extract_right_6b_(left); 
} -template <> __m128i VectorLoader::extract_right_3ch(const __m128i left) { +template <> +__m128i VectorLoader::extract_right_3ch(const __m128i left) { assert(false); } -template <> __m128i VectorLoader::extract_right_4ch(const __m128i left) { +template <> +__m128i VectorLoader::extract_right_4ch(const __m128i left) { return extract_right_4b_(left); } -template <> __m128i VectorLoader::extract_right_4ch(const __m128i left) { +template <> +__m128i VectorLoader::extract_right_4ch(const __m128i left) { return extract_right_4b_(left); } template <> __m128i VectorLoader::extract_right_4ch(const __m128i left) { return extract_right_8b_(left); } -template <> __m128i VectorLoader::extract_right_4ch(const __m128i left) { +template <> +__m128i VectorLoader::extract_right_4ch(const __m128i left) { return extract_right_8b_(left); } -template <> __m128i VectorLoader::extract_right_4ch(const __m128i left) { +template <> +__m128i VectorLoader::extract_right_4ch(const __m128i left) { assert(false); } template <> @@ -1630,45 +1667,53 @@ template <> __m128i VectorLoader::extract_right_4ch(const __m128i left) { return extract_right_8b_(left); } -template <> __m128i VectorLoader::extract_right_4ch(const __m128i left) { +template <> +__m128i VectorLoader::extract_right_4ch(const __m128i left) { assert(false); } #endif #ifdef __AVX2__ -template <> __m256 VectorLoader::to_fp32(__m256i raw) { +template <> +__m256 VectorLoader::to_fp32(__m256i raw) { raw = _mm256_insertf128_si256( _mm256_castsi128_si256(_mm_cvtepu8_epi32(_mm256_castsi256_si128(raw))), _mm_cvtepu8_epi32(_mm256_extractf128_si256(raw, 1)), 1); return _mm256_cvtepi32_ps(raw); } -template <> __m256 VectorLoader::to_fp32(__m256i raw) { +template <> +__m256 VectorLoader::to_fp32(__m256i raw) { raw = _mm256_insertf128_si256( _mm256_castsi128_si256(_mm_cvtepi8_epi32(_mm256_castsi256_si128(raw))), _mm_cvtepi8_epi32(_mm256_extractf128_si256(raw, 1)), 1); return _mm256_cvtepi32_ps(raw); } -template <> __m256 
VectorLoader::to_fp32(__m256i raw) { +template <> +__m256 VectorLoader::to_fp32(__m256i raw) { raw = _mm256_insertf128_si256( _mm256_castsi128_si256(_mm_cvtepu16_epi32(_mm256_castsi256_si128(raw))), _mm_cvtepu16_epi32(_mm256_extractf128_si256(raw, 1)), 1); return _mm256_cvtepi32_ps(raw); } -template <> __m256 VectorLoader::to_fp32(__m256i raw) { +template <> +__m256 VectorLoader::to_fp32(__m256i raw) { raw = _mm256_insertf128_si256( _mm256_castsi128_si256(_mm_cvtepi16_epi32(_mm256_castsi256_si128(raw))), _mm_cvtepi16_epi32(_mm256_extractf128_si256(raw, 1)), 1); return _mm256_cvtepi32_ps(raw); } -template <> __m256 VectorLoader::to_fp32(__m256i raw) { +template <> +__m256 VectorLoader::to_fp32(__m256i raw) { return _mm256_cvtepi32_ps(raw); } -template <> __m256 VectorLoader::to_fp32(__m256i raw) { +template <> +__m256 VectorLoader::to_fp32(__m256i raw) { return _mm256_insertf128_ps( _mm256_castps128_ps256(_mm_cvtph_ps(_mm256_castsi256_si128(raw))), _mm_cvtph_ps(_mm256_extractf128_si256(raw, 1)), 1); } -template <> __m256 VectorLoader::to_fp32(__m256i raw) { +template <> +__m256 VectorLoader::to_fp32(__m256i raw) { // bfloat16 is essentially fp32 with mantissa truncated from 23 to 7 bits. // can convert with << 16, which we fuse with initial shuffle into epi32 // positions. 
@@ -1677,26 +1722,33 @@ template <> __m256 VectorLoader::to_fp32(__m256i raw) { -128, -128, 0, 1, -128, -128, 2, 3, -128, -128, 4, 5, -128, -128, 6, 7); return _mm256_castsi256_ps(_mm256_shuffle_epi8(raw, shuf_hi32)); } -template <> __m256 VectorLoader::to_fp32(__m256i raw) { +template <> +__m256 VectorLoader::to_fp32(__m256i raw) { return _mm256_castsi256_ps(raw); } #else -template <> __m128 VectorLoader::to_fp32(__m128i raw) { +template <> +__m128 VectorLoader::to_fp32(__m128i raw) { return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(raw)); } -template <> __m128 VectorLoader::to_fp32(__m128i raw) { +template <> +__m128 VectorLoader::to_fp32(__m128i raw) { return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(raw)); } -template <> __m128 VectorLoader::to_fp32(__m128i raw) { +template <> +__m128 VectorLoader::to_fp32(__m128i raw) { return _mm_cvtepi32_ps(_mm_cvtepu16_epi32(raw)); } -template <> __m128 VectorLoader::to_fp32(__m128i raw) { +template <> +__m128 VectorLoader::to_fp32(__m128i raw) { return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(raw)); } -template <> __m128 VectorLoader::to_fp32(__m128i raw) { +template <> +__m128 VectorLoader::to_fp32(__m128i raw) { return _mm_cvtepi32_ps(raw); } -template <> __m128 VectorLoader::to_fp32(__m128i raw) { +template <> +__m128 VectorLoader::to_fp32(__m128i raw) { #ifdef __F16C__ return _mm_cvtph_ps(raw); #else @@ -1761,7 +1813,8 @@ template <> __m128 VectorLoader::to_fp32(__m128i raw) { return _mm_castsi128_ps(fp32_val); #endif } -template <> __m128 VectorLoader::to_fp32(__m128i raw) { +template <> +__m128 VectorLoader::to_fp32(__m128i raw) { // bfloat16 is essentially fp32 with mantissa truncated from 23 to 7 bits. // can convert with << 16, which we fuse with initial shuffle into epi32 // positions. 
@@ -1769,7 +1822,8 @@ template <> __m128 VectorLoader::to_fp32(__m128i raw) { -128, 4, 5, -128, -128, 6, 7); return _mm_castsi128_ps(_mm_shuffle_epi8(raw, shuf_hi32)); } -template <> __m128 VectorLoader::to_fp32(__m128i raw) { +template <> +__m128 VectorLoader::to_fp32(__m128i raw) { return _mm_castsi128_ps(raw); } #endif @@ -1828,25 +1882,25 @@ __m128i VectorLoader::extract_right_8b_(const __m128i left) { #ifdef __AVX2__ template -void VectorLoader::load1_1ch(const T *lower_ptr, const T *upper_ptr, - int offset0, const __m128i *shuffle_masks, - __m256 *left0, __m256 *right0) { +void VectorLoader::load1_1ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m256* left0, __m256* right0) { __m256i raw = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); *left0 = to_fp32( _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[0]))); *right0 = to_fp32( _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[1]))); } template -void VectorLoader::load1_2ch(const T *lower_ptr, const T *upper_ptr, - int offset0, const __m128i *shuffle_masks, - __m256 *left0, __m256 *left1, __m256 *right0, - __m256 *right1) { +void VectorLoader::load1_2ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m256* left0, __m256* left1, __m256* right0, + __m256* right1) { __m256i raw = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); *left0 = to_fp32( _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[0]))); 
*left1 = to_fp32( @@ -1857,14 +1911,14 @@ void VectorLoader::load1_2ch(const T *lower_ptr, const T *upper_ptr, _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[3]))); } template -void VectorLoader::load1_3ch(const T *lower_ptr, const T *upper_ptr, - int offset0, const __m128i *shuffle_masks, - __m256 *left0, __m256 *left1, __m256 *left2, - __m256 *right0, __m256 *right1, - __m256 *right2) { +void VectorLoader::load1_3ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m256* left0, __m256* left1, __m256* left2, + __m256* right0, __m256* right1, + __m256* right2) { __m256i raw = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); *left0 = to_fp32( _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[0]))); *left1 = to_fp32( @@ -1879,14 +1933,14 @@ void VectorLoader::load1_3ch(const T *lower_ptr, const T *upper_ptr, _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[5]))); } template -void VectorLoader::load1_4ch(const T *lower_ptr, const T *upper_ptr, - int offset0, const __m128i *shuffle_masks, - __m256 *left0, __m256 *left1, __m256 *left2, - __m256 *left3, __m256 *right0, __m256 *right1, - __m256 *right2, __m256 *right3) { +void VectorLoader::load1_4ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m256* left0, __m256* left1, __m256* left2, + __m256* left3, __m256* right0, __m256* right1, + __m256* right2, __m256* right3) { __m256i raw = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + 
_mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); *left0 = to_fp32( _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[0]))); *left1 = to_fp32( @@ -1905,32 +1959,32 @@ void VectorLoader::load1_4ch(const T *lower_ptr, const T *upper_ptr, _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[7]))); } template -void VectorLoader::load2_1ch(const T *lower_ptr, const T *upper_ptr, - int offset0, const __m128i *shuffle_masks, - __m256 *left0, __m256 *right0) { +void VectorLoader::load2_1ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m256* left0, __m256* right0) { __m256i raw1 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); __m256i raw2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 1))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 1)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 1)), 1); __m256i mask = _mm256_broadcastsi128_si256(shuffle_masks[0]); *left0 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); *right0 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); } template -void VectorLoader::load2_2ch(const T *lower_ptr, const T *upper_ptr, - int offset0, const __m128i *shuffle_masks, - __m256 *left0, __m256 *left1, __m256 *right0, - __m256 *right1) { +void VectorLoader::load2_2ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m256* left0, __m256* left1, __m256* right0, + __m256* right1) { __m256i raw1 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); + 
_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); __m256i raw2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 2))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 2)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 2)), 1); __m256i mask = _mm256_broadcastsi128_si256(shuffle_masks[0]); *left0 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); *right0 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); @@ -1939,18 +1993,18 @@ void VectorLoader::load2_2ch(const T *lower_ptr, const T *upper_ptr, *right1 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); } template -void VectorLoader::load2_3ch(const T *lower_ptr, const T *upper_ptr, - int offset0, const __m128i *shuffle_masks, - __m256 *left0, __m256 *left1, __m256 *left2, - __m256 *right0, __m256 *right1, - __m256 *right2) { +void VectorLoader::load2_3ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m256* left0, __m256* left1, __m256* left2, + __m256* right0, __m256* right1, + __m256* right2) { __m256i raw1 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); __m256i raw2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 3))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 3)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 3)), 1); __m256i mask = _mm256_broadcastsi128_si256(shuffle_masks[0]); *left0 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); *right0 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); @@ -1962,18 +2016,18 @@ void 
VectorLoader::load2_3ch(const T *lower_ptr, const T *upper_ptr, *right2 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); } template -void VectorLoader::load2_4ch(const T *lower_ptr, const T *upper_ptr, - int offset0, const __m128i *shuffle_masks, - __m256 *left0, __m256 *left1, __m256 *left2, - __m256 *left3, __m256 *right0, __m256 *right1, - __m256 *right2, __m256 *right3) { +void VectorLoader::load2_4ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m256* left0, __m256* left1, __m256* left2, + __m256* left3, __m256* right0, __m256* right1, + __m256* right2, __m256* right3) { __m256i raw1 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); __m256i raw2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 4))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 4)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 4))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 4)), 1); __m256i mask = _mm256_broadcastsi128_si256(shuffle_masks[0]); *left0 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); *right0 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); @@ -1988,12 +2042,12 @@ void VectorLoader::load2_4ch(const T *lower_ptr, const T *upper_ptr, *right3 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); } template -void VectorLoader::load4_1ch(const T *lower_ptr, const T *upper_ptr, +void VectorLoader::load4_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m256 *left0, __m256 *right0) { + int offset3, __m256* left0, __m256* right0) { __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); + 
_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); __m256i r0 = extract_right_1ch(l0); __m256i l1, r1; if (offset1 == offset0) { @@ -2002,8 +2056,8 @@ void VectorLoader::load4_1ch(const T *lower_ptr, const T *upper_ptr, } else { l1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset1)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); r1 = extract_right_1ch(l1); } __m256i l2, r2; @@ -2013,8 +2067,8 @@ void VectorLoader::load4_1ch(const T *lower_ptr, const T *upper_ptr, } else { l2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset2)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); r2 = extract_right_1ch(l2); } __m256i l3, r3; @@ -2024,8 +2078,8 @@ void VectorLoader::load4_1ch(const T *lower_ptr, const T *upper_ptr, } else { l3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset3)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); r3 = extract_right_1ch(l3); } pack_1ch(&l0, &l1, &l2, &l3); @@ -2034,13 +2088,13 @@ void VectorLoader::load4_1ch(const T *lower_ptr, const T *upper_ptr, *right0 = to_fp32(r0); } template -void VectorLoader::load4_2ch(const T *lower_ptr, const T *upper_ptr, +void VectorLoader::load4_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m256 *left0, __m256 *left1, - __m256 *right0, __m256 *right1) { + int offset3, __m256* left0, __m256* left1, + __m256* right0, __m256* right1) { __m256i l0 = _mm256_insertf128_si256( - 
_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); __m256i r0 = extract_right_2ch(l0); __m256i l1, r1; if (offset1 == offset0) { @@ -2049,8 +2103,8 @@ void VectorLoader::load4_2ch(const T *lower_ptr, const T *upper_ptr, } else { l1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset1)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); r1 = extract_right_2ch(l1); } __m256i l2, r2; @@ -2060,8 +2114,8 @@ void VectorLoader::load4_2ch(const T *lower_ptr, const T *upper_ptr, } else { l2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset2)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); r2 = extract_right_2ch(l2); } __m256i l3, r3; @@ -2071,8 +2125,8 @@ void VectorLoader::load4_2ch(const T *lower_ptr, const T *upper_ptr, } else { l3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset3)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); r3 = extract_right_2ch(l3); } pack_2ch(&l0, &l1, &l2, &l3); @@ -2083,14 +2137,14 @@ void VectorLoader::load4_2ch(const T *lower_ptr, const T *upper_ptr, *right1 = to_fp32(r1); } template -void VectorLoader::load4_3ch(const T *lower_ptr, const T *upper_ptr, +void VectorLoader::load4_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m256 *left0, __m256 *left1, - __m256 *left2, __m256 *right0, 
__m256 *right1, - __m256 *right2) { + int offset3, __m256* left0, __m256* left1, + __m256* left2, __m256* right0, __m256* right1, + __m256* right2) { __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); __m256i r0 = extract_right_3ch(l0); __m256i l1, r1; if (offset1 == offset0) { @@ -2099,8 +2153,8 @@ void VectorLoader::load4_3ch(const T *lower_ptr, const T *upper_ptr, } else { l1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset1)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); r1 = extract_right_3ch(l1); } __m256i l2, r2; @@ -2110,8 +2164,8 @@ void VectorLoader::load4_3ch(const T *lower_ptr, const T *upper_ptr, } else { l2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset2)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); r2 = extract_right_3ch(l2); } __m256i l3, r3; @@ -2121,8 +2175,8 @@ void VectorLoader::load4_3ch(const T *lower_ptr, const T *upper_ptr, } else { l3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset3)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); r3 = extract_right_3ch(l3); } pack_3ch(&l0, &l1, &l2, &l3); @@ -2135,15 +2189,15 @@ void VectorLoader::load4_3ch(const T *lower_ptr, const T *upper_ptr, *right2 = to_fp32(r2); } template -void VectorLoader::load4_4ch(const T *lower_ptr, const T 
*upper_ptr, +void VectorLoader::load4_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m256 *left0, __m256 *left1, - __m256 *left2, __m256 *left3, __m256 *right0, - __m256 *right1, __m256 *right2, - __m256 *right3) { + int offset3, __m256* left0, __m256* left1, + __m256* left2, __m256* left3, __m256* right0, + __m256* right1, __m256* right2, + __m256* right3) { __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); __m256i r0 = extract_right_4ch(l0); __m256i l1, r1; if (offset1 == offset0) { @@ -2152,8 +2206,8 @@ void VectorLoader::load4_4ch(const T *lower_ptr, const T *upper_ptr, } else { l1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset1)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); r1 = extract_right_4ch(l1); } __m256i l2, r2; @@ -2163,8 +2217,8 @@ void VectorLoader::load4_4ch(const T *lower_ptr, const T *upper_ptr, } else { l2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset2)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); r2 = extract_right_4ch(l2); } __m256i l3, r3; @@ -2174,8 +2228,8 @@ void VectorLoader::load4_4ch(const T *lower_ptr, const T *upper_ptr, } else { l3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset3)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i*)(upper_ptr + 
offset3)), 1); r3 = extract_right_4ch(l3); } *left0 = to_fp32(l0); @@ -2188,16 +2242,16 @@ void VectorLoader::load4_4ch(const T *lower_ptr, const T *upper_ptr, *right3 = to_fp32(r3); } template -void VectorLoader::load8_1ch(const T *lower_ptr, const T *upper_ptr, +void VectorLoader::load8_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m256 *left0, __m256 *right0) { + int offset3, __m256* left0, __m256* right0) { __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); __m256i r0 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 1))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 1)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 1)), 1); __m256i l1, r1; if (offset1 == offset0) { l1 = l0; @@ -2205,12 +2259,12 @@ void VectorLoader::load8_1ch(const T *lower_ptr, const T *upper_ptr, } else { l1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset1)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); r1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset1 + 1))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset1 + 1)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 1)), 1); } __m256i l2, r2; if (offset2 == offset1) { @@ -2219,12 +2273,12 @@ void VectorLoader::load8_1ch(const T *lower_ptr, const T *upper_ptr, } else { l2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - 
_mm_loadu_si128((__m128i *)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset2)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); r2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset2 + 1))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset2 + 1)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 1)), 1); } __m256i l3, r3; if (offset3 == offset2) { @@ -2233,12 +2287,12 @@ void VectorLoader::load8_1ch(const T *lower_ptr, const T *upper_ptr, } else { l3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset3)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); r3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset3 + 1))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset3 + 1)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 1)), 1); } pack_1ch(&l0, &l1, &l2, &l3); *left0 = to_fp32(l0); @@ -2246,17 +2300,17 @@ void VectorLoader::load8_1ch(const T *lower_ptr, const T *upper_ptr, *right0 = to_fp32(r0); } template -void VectorLoader::load8_2ch(const T *lower_ptr, const T *upper_ptr, +void VectorLoader::load8_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m256 *left0, __m256 *left1, - __m256 *right0, __m256 *right1) { + int offset3, __m256* left0, __m256* left1, + __m256* right0, __m256* right1) { __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + 
_mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); __m256i r0 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 2))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 2)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 2)), 1); __m256i l1, r1; if (offset1 == offset0) { l1 = l0; @@ -2264,12 +2318,12 @@ void VectorLoader::load8_2ch(const T *lower_ptr, const T *upper_ptr, } else { l1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset1)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); r1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset1 + 2))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset1 + 2)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 2)), 1); } __m256i l2, r2; if (offset2 == offset1) { @@ -2278,12 +2332,12 @@ void VectorLoader::load8_2ch(const T *lower_ptr, const T *upper_ptr, } else { l2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset2)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); r2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset2 + 2))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset2 + 2)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 2)), 1); } __m256i l3, r3; if (offset3 == offset2) { @@ -2292,12 +2346,12 @@ void VectorLoader::load8_2ch(const T *lower_ptr, const T *upper_ptr, } else { l3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - 
_mm_loadu_si128((__m128i *)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset3)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); r3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset3 + 2))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset3 + 2)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 2)), 1); } pack_2ch(&l0, &l1, &l2, &l3); *left0 = to_fp32(l0); @@ -2307,18 +2361,18 @@ void VectorLoader::load8_2ch(const T *lower_ptr, const T *upper_ptr, *right1 = to_fp32(r1); } template -void VectorLoader::load8_3ch(const T *lower_ptr, const T *upper_ptr, +void VectorLoader::load8_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m256 *left0, __m256 *left1, - __m256 *left2, __m256 *right0, __m256 *right1, - __m256 *right2) { + int offset3, __m256* left0, __m256* left1, + __m256* left2, __m256* right0, __m256* right1, + __m256* right2) { __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); __m256i r0 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 3))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 3)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 3)), 1); __m256i l1, r1; if (offset1 == offset0) { l1 = l0; @@ -2326,12 +2380,12 @@ void VectorLoader::load8_3ch(const T *lower_ptr, const T *upper_ptr, } else { l1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i *)(upper_ptr 
+ offset1)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); r1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset1 + 3))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset1 + 3)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 3)), 1); } __m256i l2, r2; if (offset2 == offset1) { @@ -2340,12 +2394,12 @@ void VectorLoader::load8_3ch(const T *lower_ptr, const T *upper_ptr, } else { l2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset2)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); r2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset2 + 3))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset2 + 3)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 3)), 1); } __m256i l3, r3; if (offset3 == offset2) { @@ -2354,12 +2408,12 @@ void VectorLoader::load8_3ch(const T *lower_ptr, const T *upper_ptr, } else { l3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset3)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); r3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset3 + 3))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset3 + 3)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 3)), 1); } pack_3ch(&l0, &l1, &l2, &l3); *left0 = to_fp32(l0); @@ -2371,19 +2425,19 @@ void VectorLoader::load8_3ch(const T *lower_ptr, const T *upper_ptr, *right2 = 
to_fp32(r2); } template -void VectorLoader::load8_4ch(const T *lower_ptr, const T *upper_ptr, +void VectorLoader::load8_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m256 *left0, __m256 *left1, - __m256 *left2, __m256 *left3, __m256 *right0, - __m256 *right1, __m256 *right2, - __m256 *right3) { + int offset3, __m256* left0, __m256* left1, + __m256* left2, __m256* left3, __m256* right0, + __m256* right1, __m256* right2, + __m256* right3) { __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0)), 1); + _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); __m256i r0 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 4))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 4)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 4))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 4)), 1); __m256i l1, r1; if (offset1 == offset0) { l1 = l0; @@ -2391,12 +2445,12 @@ void VectorLoader::load8_4ch(const T *lower_ptr, const T *upper_ptr, } else { l1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset1)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); r1 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset1 + 4))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset1 + 4)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 4))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 4)), 1); } __m256i l2, r2; if (offset2 == offset1) { @@ -2405,12 +2459,12 @@ void VectorLoader::load8_4ch(const T *lower_ptr, const T *upper_ptr, } else { l2 = _mm256_insertf128_si256( 
_mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset2)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); r2 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset2 + 4))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset2 + 4)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 4))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 4)), 1); } __m256i l3, r3; if (offset3 == offset2) { @@ -2419,12 +2473,12 @@ void VectorLoader::load8_4ch(const T *lower_ptr, const T *upper_ptr, } else { l3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset3)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); r3 = _mm256_insertf128_si256( _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(lower_ptr + offset3 + 4))), - _mm_loadu_si128((__m128i *)(upper_ptr + offset3 + 4)), 1); + _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 4))), + _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 4)), 1); } *left0 = to_fp32(l0); *left1 = to_fp32(l1); @@ -2437,49 +2491,49 @@ void VectorLoader::load8_4ch(const T *lower_ptr, const T *upper_ptr, } #else template -void VectorLoader::load1_1ch(const T *lower_ptr, const T *upper_ptr, - int offset0, const __m128i *shuffle_masks, - __m128 *tl0, __m128 *bl0, __m128 *tr0, - __m128 *br0) { - __m128i raw = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); +void VectorLoader::load1_1ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m128* tl0, __m128* bl0, __m128* tr0, + __m128* br0) { + __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - raw 
= _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); } template -void VectorLoader::load1_2ch(const T *lower_ptr, const T *upper_ptr, - int offset0, const __m128i *shuffle_masks, - __m128 *tl0, __m128 *tl1, __m128 *bl0, - __m128 *bl1, __m128 *tr0, __m128 *tr1, - __m128 *br0, __m128 *br1) { - __m128i raw = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); +void VectorLoader::load1_2ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m128* tl0, __m128* tl1, __m128* bl0, + __m128* bl1, __m128* tr0, __m128* tr1, + __m128* br0, __m128* br1) { + __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); *tr1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); - raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); *br1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); } template -void VectorLoader::load1_3ch(const T *lower_ptr, const T *upper_ptr, - int offset0, const __m128i *shuffle_masks, - __m128 *tl0, __m128 *tl1, __m128 *tl2, - __m128 *bl0, __m128 *bl1, __m128 *bl2, - __m128 *tr0, __m128 *tr1, __m128 *tr2, - __m128 *br0, __m128 *br1, __m128 *br2) { - __m128i raw = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); +void VectorLoader::load1_3ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m128* tl0, __m128* tl1, __m128* tl2, + __m128* bl0, __m128* bl1, __m128* bl2, + __m128* tr0, __m128* tr1, __m128* tr2, + 
__m128* br0, __m128* br1, __m128* br2) { + __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *tl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); *tr1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[4])); *tr2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[5])); - raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *bl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); @@ -2488,15 +2542,15 @@ void VectorLoader::load1_3ch(const T *lower_ptr, const T *upper_ptr, *br2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[5])); } template -void VectorLoader::load1_4ch(const T *lower_ptr, const T *upper_ptr, - int offset0, const __m128i *shuffle_masks, - __m128 *tl0, __m128 *tl1, __m128 *tl2, - __m128 *tl3, __m128 *bl0, __m128 *bl1, - __m128 *bl2, __m128 *bl3, __m128 *tr0, - __m128 *tr1, __m128 *tr2, __m128 *tr3, - __m128 *br0, __m128 *br1, __m128 *br2, - __m128 *br3) { - __m128i raw = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); +void VectorLoader::load1_4ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m128* tl0, __m128* tl1, __m128* tl2, + __m128* tl3, __m128* bl0, __m128* bl1, + __m128* bl2, __m128* bl3, __m128* tr0, + __m128* tr1, __m128* tr2, __m128* tr3, + __m128* br0, __m128* br1, __m128* br2, + __m128* br3) { + __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *tl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); @@ -2505,7 +2559,7 @@ void VectorLoader::load1_4ch(const T *lower_ptr, const T *upper_ptr, *tr1 = 
to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[5])); *tr2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[6])); *tr3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[7])); - raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *bl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); @@ -2516,100 +2570,100 @@ void VectorLoader::load1_4ch(const T *lower_ptr, const T *upper_ptr, *br3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[7])); } template -void VectorLoader::load2_1ch(const T *lower_ptr, const T *upper_ptr, - int offset0, const __m128i *shuffle_masks, - __m128 *tl0, __m128 *bl0, __m128 *tr0, - __m128 *br0) { - __m128i raw = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); +void VectorLoader::load2_1ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m128* tl0, __m128* bl0, __m128* tr0, + __m128* br0) { + __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - raw = _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 1)); + raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 1)); *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 1)); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 1)); *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); } template -void VectorLoader::load2_2ch(const T *lower_ptr, const T *upper_ptr, - int offset0, const __m128i *shuffle_masks, - __m128 *tl0, __m128 *tl1, __m128 *bl0, - __m128 *bl1, __m128 *tr0, __m128 *tr1, - __m128 *br0, __m128 *br1) { - __m128i raw = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); +void 
VectorLoader::load2_2ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m128* tl0, __m128* tl1, __m128* bl0, + __m128* bl1, __m128* tr0, __m128* tr1, + __m128* br0, __m128* br1) { + __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - raw = _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 2)); + raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 2)); *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *tr1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 2)); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 2)); *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *br1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); } template -void VectorLoader::load2_3ch(const T *lower_ptr, const T *upper_ptr, - int offset0, const __m128i *shuffle_masks, - __m128 *tl0, __m128 *tl1, __m128 *tl2, - __m128 *bl0, __m128 *bl1, __m128 *bl2, - __m128 *tr0, __m128 *tr1, __m128 *tr2, - __m128 *br0, __m128 *br1, __m128 *br2) { - __m128i raw = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); +void VectorLoader::load2_3ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m128* tl0, __m128* tl1, __m128* tl2, + __m128* bl0, __m128* bl1, __m128* bl2, + __m128* tr0, __m128* tr1, __m128* tr2, + __m128* br0, __m128* br1, __m128* br2) { + __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *tl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); - raw = 
_mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 3)); + raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 3)); *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *tr1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *tr2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); - raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *bl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); - raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 3)); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 3)); *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *br1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *br2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); } template -void VectorLoader::load2_4ch(const T *lower_ptr, const T *upper_ptr, - int offset0, const __m128i *shuffle_masks, - __m128 *tl0, __m128 *tl1, __m128 *tl2, - __m128 *tl3, __m128 *bl0, __m128 *bl1, - __m128 *bl2, __m128 *bl3, __m128 *tr0, - __m128 *tr1, __m128 *tr2, __m128 *tr3, - __m128 *br0, __m128 *br1, __m128 *br2, - __m128 *br3) { - __m128i raw = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); +void VectorLoader::load2_4ch(const T* lower_ptr, const T* upper_ptr, + int offset0, const __m128i* shuffle_masks, + __m128* tl0, __m128* tl1, __m128* tl2, + __m128* tl3, __m128* bl0, __m128* bl1, + __m128* bl2, __m128* bl3, __m128* tr0, + __m128* tr1, __m128* tr2, __m128* tr3, + __m128* br0, __m128* br1, __m128* br2, + __m128* br3) { + __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *tl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); *tl3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); - raw = _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 4)); + raw = 
_mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 4)); *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *tr1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *tr2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); *tr3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); - raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *bl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); *bl3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); - raw = _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 4)); + raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 4)); *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); *br1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); *br2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); *br3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); } template -void VectorLoader::load4_1ch(const T *lower_ptr, const T *upper_ptr, +void VectorLoader::load4_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m128 *tl0, __m128 *bl0, - __m128 *tr0, __m128 *br0) { - __m128i itl0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); + int offset3, __m128* tl0, __m128* bl0, + __m128* tr0, __m128* br0) { + __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); __m128i itr0 = extract_right_1ch(itl0); - __m128i ibl0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); + __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); __m128i ibr0 = extract_right_1ch(ibl0); __m128i itl1, itr1; __m128i ibl1, ibr1; @@ -2619,9 +2673,9 @@ void VectorLoader::load4_1ch(const T *lower_ptr, const T *upper_ptr, ibl1 = ibl0; ibr1 = ibr0; } else { - itl1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1)); + itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); itr1 = extract_right_1ch(itl1); - ibl1 = 
_mm_loadu_si128((__m128i *)(upper_ptr + offset1)); + ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); ibr1 = extract_right_1ch(ibl1); } __m128i itl2, itr2; @@ -2632,9 +2686,9 @@ void VectorLoader::load4_1ch(const T *lower_ptr, const T *upper_ptr, ibl2 = ibl1; ibr2 = ibr1; } else { - itl2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2)); + itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); itr2 = extract_right_1ch(itl2); - ibl2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2)); + ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); ibr2 = extract_right_1ch(ibl2); } __m128i itl3, itr3; @@ -2645,9 +2699,9 @@ void VectorLoader::load4_1ch(const T *lower_ptr, const T *upper_ptr, ibl3 = ibl2; ibr3 = ibr2; } else { - itl3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3)); + itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); itr3 = extract_right_1ch(itl3); - ibl3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3)); + ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); ibr3 = extract_right_1ch(ibl3); } pack_1ch(&itl0, &itl1, &itl2, &itl3); @@ -2660,14 +2714,14 @@ void VectorLoader::load4_1ch(const T *lower_ptr, const T *upper_ptr, *br0 = to_fp32(ibr0); } template -void VectorLoader::load4_2ch(const T *lower_ptr, const T *upper_ptr, +void VectorLoader::load4_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m128 *tl0, __m128 *tl1, - __m128 *bl0, __m128 *bl1, __m128 *tr0, - __m128 *tr1, __m128 *br0, __m128 *br1) { - __m128i itl0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); + int offset3, __m128* tl0, __m128* tl1, + __m128* bl0, __m128* bl1, __m128* tr0, + __m128* tr1, __m128* br0, __m128* br1) { + __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); __m128i itr0 = extract_right_2ch(itl0); - __m128i ibl0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); + __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); __m128i ibr0 = extract_right_2ch(ibl0); __m128i 
itl1, itr1; __m128i ibl1, ibr1; @@ -2677,9 +2731,9 @@ void VectorLoader::load4_2ch(const T *lower_ptr, const T *upper_ptr, ibl1 = ibl0; ibr1 = ibr0; } else { - itl1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1)); + itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); itr1 = extract_right_2ch(itl1); - ibl1 = _mm_loadu_si128((__m128i *)(upper_ptr + offset1)); + ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); ibr1 = extract_right_2ch(ibl1); } __m128i itl2, itr2; @@ -2690,9 +2744,9 @@ void VectorLoader::load4_2ch(const T *lower_ptr, const T *upper_ptr, ibl2 = ibl1; ibr2 = ibr1; } else { - itl2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2)); + itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); itr2 = extract_right_2ch(itl2); - ibl2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2)); + ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); ibr2 = extract_right_2ch(ibl2); } __m128i itl3, itr3; @@ -2703,9 +2757,9 @@ void VectorLoader::load4_2ch(const T *lower_ptr, const T *upper_ptr, ibl3 = ibl2; ibr3 = ibr2; } else { - itl3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3)); + itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); itr3 = extract_right_2ch(itl3); - ibl3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3)); + ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); ibr3 = extract_right_2ch(ibl3); } pack_2ch(&itl0, &itl1, &itl2, &itl3); @@ -2722,16 +2776,16 @@ void VectorLoader::load4_2ch(const T *lower_ptr, const T *upper_ptr, *br1 = to_fp32(ibr1); } template -void VectorLoader::load4_3ch(const T *lower_ptr, const T *upper_ptr, +void VectorLoader::load4_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m128 *tl0, __m128 *tl1, - __m128 *tl2, __m128 *bl0, __m128 *bl1, - __m128 *bl2, __m128 *tr0, __m128 *tr1, - __m128 *tr2, __m128 *br0, __m128 *br1, - __m128 *br2) { - __m128i itl0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); + int offset3, __m128* tl0, __m128* tl1, + 
__m128* tl2, __m128* bl0, __m128* bl1, + __m128* bl2, __m128* tr0, __m128* tr1, + __m128* tr2, __m128* br0, __m128* br1, + __m128* br2) { + __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); __m128i itr0 = extract_right_3ch(itl0); - __m128i ibl0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); + __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); __m128i ibr0 = extract_right_3ch(ibl0); __m128i itl1, itr1; __m128i ibl1, ibr1; @@ -2741,9 +2795,9 @@ void VectorLoader::load4_3ch(const T *lower_ptr, const T *upper_ptr, ibl1 = ibl0; ibr1 = ibr0; } else { - itl1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1)); + itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); itr1 = extract_right_3ch(itl1); - ibl1 = _mm_loadu_si128((__m128i *)(upper_ptr + offset1)); + ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); ibr1 = extract_right_3ch(ibl1); } __m128i itl2, itr2; @@ -2754,9 +2808,9 @@ void VectorLoader::load4_3ch(const T *lower_ptr, const T *upper_ptr, ibl2 = ibl1; ibr2 = ibr1; } else { - itl2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2)); + itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); itr2 = extract_right_3ch(itl2); - ibl2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2)); + ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); ibr2 = extract_right_3ch(ibl2); } __m128i itl3, itr3; @@ -2767,9 +2821,9 @@ void VectorLoader::load4_3ch(const T *lower_ptr, const T *upper_ptr, ibl3 = ibl2; ibr3 = ibr2; } else { - itl3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3)); + itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); itr3 = extract_right_3ch(itl3); - ibl3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3)); + ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); ibr3 = extract_right_3ch(ibl3); } pack_3ch(&itl0, &itl1, &itl2, &itl3); @@ -2790,17 +2844,17 @@ void VectorLoader::load4_3ch(const T *lower_ptr, const T *upper_ptr, *br2 = to_fp32(ibr2); } template -void VectorLoader::load4_4ch(const T 
*lower_ptr, const T *upper_ptr, +void VectorLoader::load4_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m128 *tl0, __m128 *tl1, - __m128 *tl2, __m128 *tl3, __m128 *bl0, - __m128 *bl1, __m128 *bl2, __m128 *bl3, - __m128 *tr0, __m128 *tr1, __m128 *tr2, - __m128 *tr3, __m128 *br0, __m128 *br1, - __m128 *br2, __m128 *br3) { - __m128i itl0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); + int offset3, __m128* tl0, __m128* tl1, + __m128* tl2, __m128* tl3, __m128* bl0, + __m128* bl1, __m128* bl2, __m128* bl3, + __m128* tr0, __m128* tr1, __m128* tr2, + __m128* tr3, __m128* br0, __m128* br1, + __m128* br2, __m128* br3) { + __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); __m128i itr0 = extract_right_4ch(itl0); - __m128i ibl0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); + __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); __m128i ibr0 = extract_right_4ch(ibl0); __m128i itl1, itr1; __m128i ibl1, ibr1; @@ -2810,9 +2864,9 @@ void VectorLoader::load4_4ch(const T *lower_ptr, const T *upper_ptr, ibl1 = ibl0; ibr1 = ibr0; } else { - itl1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1)); + itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); itr1 = extract_right_4ch(itl1); - ibl1 = _mm_loadu_si128((__m128i *)(upper_ptr + offset1)); + ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); ibr1 = extract_right_4ch(ibl1); } __m128i itl2, itr2; @@ -2823,9 +2877,9 @@ void VectorLoader::load4_4ch(const T *lower_ptr, const T *upper_ptr, ibl2 = ibl1; ibr2 = ibr1; } else { - itl2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2)); + itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); itr2 = extract_right_4ch(itl2); - ibl2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2)); + ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); ibr2 = extract_right_4ch(ibl2); } __m128i itl3, itr3; @@ -2836,9 +2890,9 @@ void VectorLoader::load4_4ch(const T *lower_ptr, const T *upper_ptr, ibl3 = 
ibl2; ibr3 = ibr2; } else { - itl3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3)); + itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); itr3 = extract_right_4ch(itl3); - ibl3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3)); + ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); ibr3 = extract_right_4ch(ibl3); } *tl0 = to_fp32(itl0); @@ -2859,14 +2913,14 @@ void VectorLoader::load4_4ch(const T *lower_ptr, const T *upper_ptr, *br3 = to_fp32(ibr3); } template -void VectorLoader::load8_1ch(const T *lower_ptr, const T *upper_ptr, +void VectorLoader::load8_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m128 *tl0, __m128 *bl0, - __m128 *tr0, __m128 *br0) { - __m128i itl0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); - __m128i itr0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 1)); - __m128i ibl0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); - __m128i ibr0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 1)); + int offset3, __m128* tl0, __m128* bl0, + __m128* tr0, __m128* br0) { + __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + __m128i itr0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 1)); + __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + __m128i ibr0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 1)); __m128i itl1, itr1; __m128i ibl1, ibr1; if (offset1 == offset0) { @@ -2875,10 +2929,10 @@ void VectorLoader::load8_1ch(const T *lower_ptr, const T *upper_ptr, ibl1 = ibl0; ibr1 = ibr0; } else { - itl1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1)); - itr1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1 + 1)); - ibl1 = _mm_loadu_si128((__m128i *)(upper_ptr + offset1)); - ibr1 = _mm_loadu_si128((__m128i *)(upper_ptr + offset1 + 1)); + itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); + itr1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 1)); + ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); + ibr1 = 
_mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 1)); } __m128i itl2, itr2; __m128i ibl2, ibr2; @@ -2888,10 +2942,10 @@ void VectorLoader::load8_1ch(const T *lower_ptr, const T *upper_ptr, ibl2 = ibl1; ibr2 = ibr1; } else { - itl2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2)); - itr2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2 + 1)); - ibl2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2)); - ibr2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2 + 1)); + itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); + itr2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 1)); + ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); + ibr2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 1)); } __m128i itl3, itr3; __m128i ibl3, ibr3; @@ -2901,10 +2955,10 @@ void VectorLoader::load8_1ch(const T *lower_ptr, const T *upper_ptr, ibl3 = ibl2; ibr3 = ibr2; } else { - itl3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3)); - itr3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3 + 1)); - ibl3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3)); - ibr3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3 + 1)); + itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); + itr3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 1)); + ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); + ibr3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 1)); } pack_1ch(&itl0, &itl1, &itl2, &itl3); *tl0 = to_fp32(itl0); @@ -2916,15 +2970,15 @@ void VectorLoader::load8_1ch(const T *lower_ptr, const T *upper_ptr, *br0 = to_fp32(ibr0); } template -void VectorLoader::load8_2ch(const T *lower_ptr, const T *upper_ptr, +void VectorLoader::load8_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m128 *tl0, __m128 *tl1, - __m128 *bl0, __m128 *bl1, __m128 *tr0, - __m128 *tr1, __m128 *br0, __m128 *br1) { - __m128i itl0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); - __m128i itr0 = _mm_loadu_si128((__m128i *)(lower_ptr + 
offset0 + 2)); - __m128i ibl0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); - __m128i ibr0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 2)); + int offset3, __m128* tl0, __m128* tl1, + __m128* bl0, __m128* bl1, __m128* tr0, + __m128* tr1, __m128* br0, __m128* br1) { + __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + __m128i itr0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 2)); + __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + __m128i ibr0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 2)); __m128i itl1, itr1; __m128i ibl1, ibr1; if (offset1 == offset0) { @@ -2933,10 +2987,10 @@ void VectorLoader::load8_2ch(const T *lower_ptr, const T *upper_ptr, ibl1 = ibl0; ibr1 = ibr0; } else { - itl1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1)); - itr1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1 + 2)); - ibl1 = _mm_loadu_si128((__m128i *)(upper_ptr + offset1)); - ibr1 = _mm_loadu_si128((__m128i *)(upper_ptr + offset1 + 2)); + itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); + itr1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 2)); + ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); + ibr1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 2)); } __m128i itl2, itr2; __m128i ibl2, ibr2; @@ -2946,10 +3000,10 @@ void VectorLoader::load8_2ch(const T *lower_ptr, const T *upper_ptr, ibl2 = ibl1; ibr2 = ibr1; } else { - itl2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2)); - itr2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2 + 2)); - ibl2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2)); - ibr2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2 + 2)); + itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); + itr2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 2)); + ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); + ibr2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 2)); } __m128i itl3, itr3; __m128i ibl3, ibr3; @@ -2959,10 +3013,10 @@ void 
VectorLoader::load8_2ch(const T *lower_ptr, const T *upper_ptr, ibl3 = ibl2; ibr3 = ibr2; } else { - itl3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3)); - itr3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3 + 2)); - ibl3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3)); - ibr3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3 + 2)); + itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); + itr3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 2)); + ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); + ibr3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 2)); } pack_2ch(&itl0, &itl1, &itl2, &itl3); *tl0 = to_fp32(itl0); @@ -2978,17 +3032,17 @@ void VectorLoader::load8_2ch(const T *lower_ptr, const T *upper_ptr, *br1 = to_fp32(ibr1); } template -void VectorLoader::load8_3ch(const T *lower_ptr, const T *upper_ptr, +void VectorLoader::load8_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m128 *tl0, __m128 *tl1, - __m128 *tl2, __m128 *bl0, __m128 *bl1, - __m128 *bl2, __m128 *tr0, __m128 *tr1, - __m128 *tr2, __m128 *br0, __m128 *br1, - __m128 *br2) { - __m128i itl0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); - __m128i itr0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 3)); - __m128i ibl0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); - __m128i ibr0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 3)); + int offset3, __m128* tl0, __m128* tl1, + __m128* tl2, __m128* bl0, __m128* bl1, + __m128* bl2, __m128* tr0, __m128* tr1, + __m128* tr2, __m128* br0, __m128* br1, + __m128* br2) { + __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + __m128i itr0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 3)); + __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + __m128i ibr0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 3)); __m128i itl1, itr1; __m128i ibl1, ibr1; if (offset1 == offset0) { @@ -2997,10 +3051,10 @@ void 
VectorLoader::load8_3ch(const T *lower_ptr, const T *upper_ptr, ibl1 = ibl0; ibr1 = ibr0; } else { - itl1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1)); - itr1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1 + 3)); - ibl1 = _mm_loadu_si128((__m128i *)(upper_ptr + offset1)); - ibr1 = _mm_loadu_si128((__m128i *)(upper_ptr + offset1 + 3)); + itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); + itr1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 3)); + ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); + ibr1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 3)); } __m128i itl2, itr2; __m128i ibl2, ibr2; @@ -3010,10 +3064,10 @@ void VectorLoader::load8_3ch(const T *lower_ptr, const T *upper_ptr, ibl2 = ibl1; ibr2 = ibr1; } else { - itl2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2)); - itr2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2 + 3)); - ibl2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2)); - ibr2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2 + 3)); + itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); + itr2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 3)); + ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); + ibr2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 3)); } __m128i itl3, itr3; __m128i ibl3, ibr3; @@ -3023,10 +3077,10 @@ void VectorLoader::load8_3ch(const T *lower_ptr, const T *upper_ptr, ibl3 = ibl2; ibr3 = ibr2; } else { - itl3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3)); - itr3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3 + 3)); - ibl3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3)); - ibr3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3 + 3)); + itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); + itr3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 3)); + ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); + ibr3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 3)); } pack_3ch(&itl0, &itl1, &itl2, &itl3); *tl0 = to_fp32(itl0); @@ -3046,18 +3100,18 
@@ void VectorLoader::load8_3ch(const T *lower_ptr, const T *upper_ptr, *br2 = to_fp32(ibr2); } template -void VectorLoader::load8_4ch(const T *lower_ptr, const T *upper_ptr, +void VectorLoader::load8_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, int offset1, int offset2, - int offset3, __m128 *tl0, __m128 *tl1, - __m128 *tl2, __m128 *tl3, __m128 *bl0, - __m128 *bl1, __m128 *bl2, __m128 *bl3, - __m128 *tr0, __m128 *tr1, __m128 *tr2, - __m128 *tr3, __m128 *br0, __m128 *br1, - __m128 *br2, __m128 *br3) { - __m128i itl0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0)); - __m128i itr0 = _mm_loadu_si128((__m128i *)(lower_ptr + offset0 + 4)); - __m128i ibl0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0)); - __m128i ibr0 = _mm_loadu_si128((__m128i *)(upper_ptr + offset0 + 4)); + int offset3, __m128* tl0, __m128* tl1, + __m128* tl2, __m128* tl3, __m128* bl0, + __m128* bl1, __m128* bl2, __m128* bl3, + __m128* tr0, __m128* tr1, __m128* tr2, + __m128* tr3, __m128* br0, __m128* br1, + __m128* br2, __m128* br3) { + __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); + __m128i itr0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 4)); + __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); + __m128i ibr0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 4)); __m128i itl1, itr1; __m128i ibl1, ibr1; if (offset1 == offset0) { @@ -3066,10 +3120,10 @@ void VectorLoader::load8_4ch(const T *lower_ptr, const T *upper_ptr, ibl1 = ibl0; ibr1 = ibr0; } else { - itl1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1)); - itr1 = _mm_loadu_si128((__m128i *)(lower_ptr + offset1 + 4)); - ibl1 = _mm_loadu_si128((__m128i *)(upper_ptr + offset1)); - ibr1 = _mm_loadu_si128((__m128i *)(upper_ptr + offset1 + 4)); + itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); + itr1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 4)); + ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); + ibr1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 4)); } 
__m128i itl2, itr2; __m128i ibl2, ibr2; @@ -3079,10 +3133,10 @@ void VectorLoader::load8_4ch(const T *lower_ptr, const T *upper_ptr, ibl2 = ibl1; ibr2 = ibr1; } else { - itl2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2)); - itr2 = _mm_loadu_si128((__m128i *)(lower_ptr + offset2 + 4)); - ibl2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2)); - ibr2 = _mm_loadu_si128((__m128i *)(upper_ptr + offset2 + 4)); + itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); + itr2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 4)); + ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); + ibr2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 4)); } __m128i itl3, itr3; __m128i ibl3, ibr3; @@ -3092,10 +3146,10 @@ void VectorLoader::load8_4ch(const T *lower_ptr, const T *upper_ptr, ibl3 = ibl2; ibr3 = ibr2; } else { - itl3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3)); - itr3 = _mm_loadu_si128((__m128i *)(lower_ptr + offset3 + 4)); - ibl3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3)); - ibr3 = _mm_loadu_si128((__m128i *)(upper_ptr + offset3 + 4)); + itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); + itr3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 4)); + ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); + ibr3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 4)); } *tl0 = to_fp32(itl0); *tl1 = to_fp32(itl1); @@ -3123,8 +3177,9 @@ void VectorLoader::load8_4ch(const T *lower_ptr, const T *upper_ptr, // bfloat16 or float. // -template class VectorWriter { -public: +template +class VectorWriter { + public: // convert 4 fp32 words to type U with. // this function calls clip. // resulting words are packed. @@ -3134,89 +3189,89 @@ public: // converts from fp32 to U by calling method from_fp32(...) // writes 4 pixels with 1 channel to destination. - void write_1ch(U *destination, __m128 *vec); + void write_1ch(U* destination, __m128* vec); // converts from fp32 to U by calling method from_fp32(...) 
// writes 4 pixels with 1 channel to destination. - void write_2ch(U *destination, __m128 *vec); + void write_2ch(U* destination, __m128* vec); // converts from fp32 to U by calling method from_fp32(...) // writes 4 pixels with 1 channel to destination. - void write_3ch(U *destination, __m128 *vec); + void write_3ch(U* destination, __m128* vec); // converts from fp32 to U by calling method from_fp32(...) // writes 4 pixels with 1 channel to destination. - void write_4ch(U *destination, __m128 *vec); + void write_4ch(U* destination, __m128* vec); -private: + private: // clip 4 fp32 words to prevent overflow when converting to type U. __m128 clip_(__m128 vec) { // default is to do nothing, since the packing intrinsics include clipping. return vec; } - void write_1b_1ch(U *destination, __m128 *vec) { + void write_1b_1ch(U* destination, __m128* vec) { __m128i ivec = from_fp32(vec[0]); - _mm_store_ss((float *)(destination), _mm_castsi128_ps(ivec)); + _mm_store_ss((float*)(destination), _mm_castsi128_ps(ivec)); } - void write_2b_1ch(U *destination, __m128 *vec) { + void write_2b_1ch(U* destination, __m128* vec) { __m128i ivec = from_fp32(vec[0]); - _mm_store_sd((double *)(destination), _mm_castsi128_pd(ivec)); + _mm_store_sd((double*)(destination), _mm_castsi128_pd(ivec)); } - void write_4b_1ch(U *destination, __m128 *vec) { + void write_4b_1ch(U* destination, __m128* vec) { __m128i ivec = from_fp32(vec[0]); - _mm_storeu_si128((__m128i *)(destination), ivec); + _mm_storeu_si128((__m128i*)(destination), ivec); } - void write_1b_2ch(U *destination, __m128 *vec) { + void write_1b_2ch(U* destination, __m128* vec) { __m128i ivec1 = from_fp32(vec[0]); __m128i ivec2 = from_fp32(vec[1]); __m128i mask = _mm_setr_epi32(-1, 0, 0, 0); ivec1 = _mm_or_si128(_mm_and_si128(mask, ivec1), _mm_slli_si128(_mm_and_si128(mask, ivec2), 4)); - _mm_store_sd((double *)(destination), _mm_castsi128_pd(ivec1)); + _mm_store_sd((double*)(destination), _mm_castsi128_pd(ivec1)); } - void write_2b_2ch(U 
*destination, __m128 *vec) { + void write_2b_2ch(U* destination, __m128* vec) { __m128i ivec1 = from_fp32(vec[0]); __m128i ivec2 = from_fp32(vec[1]); __m128i mask = _mm_setr_epi32(-1, -1, 0, 0); ivec1 = _mm_or_si128(_mm_and_si128(mask, ivec1), _mm_slli_si128(_mm_and_si128(mask, ivec2), 8)); - _mm_storeu_si128((__m128i *)(destination), ivec1); + _mm_storeu_si128((__m128i*)(destination), ivec1); } - void write_4b_2ch(U *destination, __m128 *vec) { + void write_4b_2ch(U* destination, __m128* vec) { __m128i ivec1 = from_fp32(vec[0]); __m128i ivec2 = from_fp32(vec[1]); - _mm_storeu_si128((__m128i *)(destination), ivec1); - _mm_storeu_si128((__m128i *)(destination + 4), ivec2); + _mm_storeu_si128((__m128i*)(destination), ivec1); + _mm_storeu_si128((__m128i*)(destination + 4), ivec2); } - void write_1b_3ch(U *destination, __m128 *vec) { + void write_1b_3ch(U* destination, __m128* vec) { __m128i ivec1 = from_fp32(vec[0]); __m128i ivec2 = from_fp32(vec[1]); __m128i mask = _mm_setr_epi32(-1, 0, 0, 0); ivec1 = _mm_or_si128(_mm_and_si128(mask, ivec1), _mm_slli_si128(_mm_and_si128(mask, ivec2), 4)); - _mm_store_sd((double *)(destination), _mm_castsi128_pd(ivec1)); + _mm_store_sd((double*)(destination), _mm_castsi128_pd(ivec1)); __m128i ivec3 = from_fp32(vec[2]); - _mm_store_ss((float *)(destination + 8), _mm_castsi128_ps(ivec3)); + _mm_store_ss((float*)(destination + 8), _mm_castsi128_ps(ivec3)); } - void write_2b_3ch(U *destination, __m128 *vec) { + void write_2b_3ch(U* destination, __m128* vec) { __m128i ivec1 = from_fp32(vec[0]); __m128i ivec2 = from_fp32(vec[1]); __m128i mask = _mm_setr_epi32(-1, -1, 0, 0); ivec1 = _mm_or_si128(_mm_and_si128(mask, ivec1), _mm_slli_si128(_mm_and_si128(mask, ivec2), 8)); - _mm_storeu_si128((__m128i *)(destination), ivec1); + _mm_storeu_si128((__m128i*)(destination), ivec1); __m128i ivec3 = from_fp32(vec[2]); - _mm_store_sd((double *)(destination + 8), _mm_castsi128_pd(ivec3)); + _mm_store_sd((double*)(destination + 8), 
_mm_castsi128_pd(ivec3)); } - void write_4b_3ch(U *destination, __m128 *vec) { + void write_4b_3ch(U* destination, __m128* vec) { __m128i ivec1 = from_fp32(vec[0]); __m128i ivec2 = from_fp32(vec[1]); __m128i ivec3 = from_fp32(vec[2]); - _mm_storeu_si128((__m128i *)(destination), ivec1); - _mm_storeu_si128((__m128i *)(destination + 4), ivec2); - _mm_storeu_si128((__m128i *)(destination + 8), ivec3); + _mm_storeu_si128((__m128i*)(destination), ivec1); + _mm_storeu_si128((__m128i*)(destination + 4), ivec2); + _mm_storeu_si128((__m128i*)(destination + 8), ivec3); } - void write_1b_4ch(U *destination, __m128 *vec) { + void write_1b_4ch(U* destination, __m128* vec) { __m128i ivec1 = from_fp32(vec[0]); __m128i ivec2 = from_fp32(vec[1]); __m128i ivec3 = from_fp32(vec[2]); @@ -3226,9 +3281,9 @@ private: ivec = _mm_or_si128(ivec, _mm_slli_si128(_mm_and_si128(mask, ivec2), 4)); ivec = _mm_or_si128(ivec, _mm_slli_si128(_mm_and_si128(mask, ivec3), 8)); ivec = _mm_or_si128(ivec, _mm_slli_si128(_mm_and_si128(mask, ivec4), 12)); - _mm_storeu_si128((__m128i *)(destination), ivec); + _mm_storeu_si128((__m128i*)(destination), ivec); } - void write_2b_4ch(U *destination, __m128 *vec) { + void write_2b_4ch(U* destination, __m128* vec) { __m128i ivec1 = from_fp32(vec[0]); __m128i ivec2 = from_fp32(vec[1]); __m128i ivec3 = from_fp32(vec[2]); @@ -3236,24 +3291,25 @@ private: __m128i mask = _mm_setr_epi32(-1, -1, 0, 0); __m128i ivec = _mm_and_si128(mask, ivec1); ivec = _mm_or_si128(ivec, _mm_slli_si128(_mm_and_si128(mask, ivec2), 8)); - _mm_storeu_si128((__m128i *)(destination), ivec); + _mm_storeu_si128((__m128i*)(destination), ivec); ivec = _mm_and_si128(mask, ivec3); ivec = _mm_or_si128(ivec, _mm_slli_si128(_mm_and_si128(mask, ivec4), 8)); - _mm_storeu_si128((__m128i *)(destination + 8), ivec); + _mm_storeu_si128((__m128i*)(destination + 8), ivec); } - void write_4b_4ch(U *destination, __m128 *vec) { + void write_4b_4ch(U* destination, __m128* vec) { __m128i ivec1 = from_fp32(vec[0]); 
__m128i ivec2 = from_fp32(vec[1]); __m128i ivec3 = from_fp32(vec[2]); __m128i ivec4 = from_fp32(vec[3]); - _mm_storeu_si128((__m128i *)(destination), ivec1); - _mm_storeu_si128((__m128i *)(destination + 4), ivec2); - _mm_storeu_si128((__m128i *)(destination + 8), ivec3); - _mm_storeu_si128((__m128i *)(destination + 12), ivec4); + _mm_storeu_si128((__m128i*)(destination), ivec1); + _mm_storeu_si128((__m128i*)(destination + 4), ivec2); + _mm_storeu_si128((__m128i*)(destination + 8), ivec3); + _mm_storeu_si128((__m128i*)(destination + 12), ivec4); } }; -template <> __m128 VectorWriter::clip_(__m128 vec) { +template <> +__m128 VectorWriter::clip_(__m128 vec) { // clip against low limit, -2147483648. // we round up to nearest number that can be represented as float. __m128 lt_val = _mm_set1_ps(-2147483520.0f); @@ -3266,7 +3322,8 @@ template <> __m128 VectorWriter::clip_(__m128 vec) { vec = _mm_or_ps(_mm_andnot_ps(gt_mask, vec), _mm_and_ps(gt_mask, gt_val)); return vec; } -template <> __m128 VectorWriter::clip_(__m128 vec) { +template <> +__m128 VectorWriter::clip_(__m128 vec) { // clip against low limit, -65504.0f; __m128 lt_val = _mm_set1_ps(-65504.0f); __m128 lt_mask = _mm_cmplt_ps(vec, lt_val); @@ -3278,28 +3335,34 @@ template <> __m128 VectorWriter::clip_(__m128 vec) { return vec; } -template <> __m128i VectorWriter::from_fp32(__m128 vec) { +template <> +__m128i VectorWriter::from_fp32(__m128 vec) { __m128i ivec = _mm_cvttps_epi32(vec); ivec = _mm_packs_epi32(ivec, ivec); return _mm_packus_epi16(ivec, ivec); } -template <> __m128i VectorWriter::from_fp32(__m128 vec) { +template <> +__m128i VectorWriter::from_fp32(__m128 vec) { __m128i ivec = _mm_cvttps_epi32(vec); ivec = _mm_packs_epi32(ivec, ivec); return _mm_packs_epi16(ivec, ivec); } -template <> __m128i VectorWriter::from_fp32(__m128 vec) { +template <> +__m128i VectorWriter::from_fp32(__m128 vec) { __m128i ivec = _mm_cvttps_epi32(vec); return _mm_packus_epi32(ivec, ivec); } -template <> __m128i 
VectorWriter::from_fp32(__m128 vec) { +template <> +__m128i VectorWriter::from_fp32(__m128 vec) { __m128i ivec = _mm_cvttps_epi32(vec); return _mm_packs_epi32(ivec, ivec); } -template <> __m128i VectorWriter::from_fp32(__m128 vec) { +template <> +__m128i VectorWriter::from_fp32(__m128 vec) { return _mm_cvttps_epi32(clip_(vec)); } -template <> __m128i VectorWriter::from_fp32(__m128 vec) { +template <> +__m128i VectorWriter::from_fp32(__m128 vec) { #ifdef __F16C__ return _mm_cvtps_ph(vec, _MM_FROUND_TO_ZERO); #else @@ -3363,7 +3426,8 @@ template <> __m128i VectorWriter::from_fp32(__m128 vec) { return number; #endif } -template <> __m128i VectorWriter::from_fp32(__m128 vec) { +template <> +__m128i VectorWriter::from_fp32(__m128 vec) { // casting from float to bfloat16 simply means >> 16 // we do this with a shuffle that also moves everything to lower portion of // sse vector word @@ -3371,166 +3435,181 @@ template <> __m128i VectorWriter::from_fp32(__m128 vec) { -128, -128, -128, -128, -128, -128); return _mm_shuffle_epi8(_mm_castps_si128(vec), shuf_from_hi32); } -template <> __m128i VectorWriter::from_fp32(__m128 vec) { +template <> +__m128i VectorWriter::from_fp32(__m128 vec) { // nothing to do in this case return _mm_castps_si128(vec); } template <> -void VectorWriter::write_1ch(uint8 *destination, __m128 *vec) { - write_1b_1ch(destination, vec); -} -template <> void VectorWriter::write_1ch(int8 *destination, __m128 *vec) { +void VectorWriter::write_1ch(uint8* destination, __m128* vec) { write_1b_1ch(destination, vec); } template <> -void VectorWriter::write_1ch(uint16 *destination, __m128 *vec) { +void VectorWriter::write_1ch(int8* destination, __m128* vec) { + write_1b_1ch(destination, vec); +} +template <> +void VectorWriter::write_1ch(uint16* destination, __m128* vec) { write_2b_1ch(destination, vec); } template <> -void VectorWriter::write_1ch(int16 *destination, __m128 *vec) { +void VectorWriter::write_1ch(int16* destination, __m128* vec) { 
write_2b_1ch(destination, vec); } template <> -void VectorWriter::write_1ch(int32 *destination, __m128 *vec) { +void VectorWriter::write_1ch(int32* destination, __m128* vec) { write_4b_1ch(destination, vec); } template <> -void VectorWriter::write_1ch(Eigen::half *destination, - __m128 *vec) { +void VectorWriter::write_1ch(Eigen::half* destination, + __m128* vec) { write_2b_1ch(destination, vec); } template <> -void VectorWriter::write_1ch(bfloat16 *destination, __m128 *vec) { +void VectorWriter::write_1ch(bfloat16* destination, __m128* vec) { write_2b_1ch(destination, vec); } template <> -void VectorWriter::write_1ch(float *destination, __m128 *vec) { - _mm_storeu_si128((__m128i *)(destination), _mm_castps_si128(vec[0])); +void VectorWriter::write_1ch(float* destination, __m128* vec) { + _mm_storeu_si128((__m128i*)(destination), _mm_castps_si128(vec[0])); } template <> -void VectorWriter::write_2ch(uint8 *destination, __m128 *vec) { - write_1b_2ch(destination, vec); -} -template <> void VectorWriter::write_2ch(int8 *destination, __m128 *vec) { +void VectorWriter::write_2ch(uint8* destination, __m128* vec) { write_1b_2ch(destination, vec); } template <> -void VectorWriter::write_2ch(uint16 *destination, __m128 *vec) { +void VectorWriter::write_2ch(int8* destination, __m128* vec) { + write_1b_2ch(destination, vec); +} +template <> +void VectorWriter::write_2ch(uint16* destination, __m128* vec) { write_2b_2ch(destination, vec); } template <> -void VectorWriter::write_2ch(int16 *destination, __m128 *vec) { +void VectorWriter::write_2ch(int16* destination, __m128* vec) { write_2b_2ch(destination, vec); } template <> -void VectorWriter::write_2ch(int32 *destination, __m128 *vec) { +void VectorWriter::write_2ch(int32* destination, __m128* vec) { write_4b_2ch(destination, vec); } template <> -void VectorWriter::write_2ch(Eigen::half *destination, - __m128 *vec) { +void VectorWriter::write_2ch(Eigen::half* destination, + __m128* vec) { write_2b_2ch(destination, vec); } 
template <> -void VectorWriter::write_2ch(bfloat16 *destination, __m128 *vec) { +void VectorWriter::write_2ch(bfloat16* destination, __m128* vec) { write_2b_2ch(destination, vec); } template <> -void VectorWriter::write_2ch(float *destination, __m128 *vec) { - _mm_storeu_si128((__m128i *)(destination), _mm_castps_si128(vec[0])); - _mm_storeu_si128((__m128i *)(destination + 4), _mm_castps_si128(vec[1])); +void VectorWriter::write_2ch(float* destination, __m128* vec) { + _mm_storeu_si128((__m128i*)(destination), _mm_castps_si128(vec[0])); + _mm_storeu_si128((__m128i*)(destination + 4), _mm_castps_si128(vec[1])); } template <> -void VectorWriter::write_3ch(uint8 *destination, __m128 *vec) { - write_1b_3ch(destination, vec); -} -template <> void VectorWriter::write_3ch(int8 *destination, __m128 *vec) { +void VectorWriter::write_3ch(uint8* destination, __m128* vec) { write_1b_3ch(destination, vec); } template <> -void VectorWriter::write_3ch(uint16 *destination, __m128 *vec) { +void VectorWriter::write_3ch(int8* destination, __m128* vec) { + write_1b_3ch(destination, vec); +} +template <> +void VectorWriter::write_3ch(uint16* destination, __m128* vec) { write_2b_3ch(destination, vec); } template <> -void VectorWriter::write_3ch(int16 *destination, __m128 *vec) { +void VectorWriter::write_3ch(int16* destination, __m128* vec) { write_2b_3ch(destination, vec); } template <> -void VectorWriter::write_3ch(int32 *destination, __m128 *vec) { +void VectorWriter::write_3ch(int32* destination, __m128* vec) { write_4b_3ch(destination, vec); } template <> -void VectorWriter::write_3ch(Eigen::half *destination, - __m128 *vec) { +void VectorWriter::write_3ch(Eigen::half* destination, + __m128* vec) { write_2b_3ch(destination, vec); } template <> -void VectorWriter::write_3ch(bfloat16 *destination, __m128 *vec) { +void VectorWriter::write_3ch(bfloat16* destination, __m128* vec) { write_2b_3ch(destination, vec); } template <> -void VectorWriter::write_3ch(float *destination, __m128 
*vec) { - _mm_storeu_si128((__m128i *)(destination), _mm_castps_si128(vec[0])); - _mm_storeu_si128((__m128i *)(destination + 4), _mm_castps_si128(vec[1])); - _mm_storeu_si128((__m128i *)(destination + 8), _mm_castps_si128(vec[2])); +void VectorWriter::write_3ch(float* destination, __m128* vec) { + _mm_storeu_si128((__m128i*)(destination), _mm_castps_si128(vec[0])); + _mm_storeu_si128((__m128i*)(destination + 4), _mm_castps_si128(vec[1])); + _mm_storeu_si128((__m128i*)(destination + 8), _mm_castps_si128(vec[2])); } template <> -void VectorWriter::write_4ch(uint8 *destination, __m128 *vec) { - write_1b_4ch(destination, vec); -} -template <> void VectorWriter::write_4ch(int8 *destination, __m128 *vec) { +void VectorWriter::write_4ch(uint8* destination, __m128* vec) { write_1b_4ch(destination, vec); } template <> -void VectorWriter::write_4ch(uint16 *destination, __m128 *vec) { +void VectorWriter::write_4ch(int8* destination, __m128* vec) { + write_1b_4ch(destination, vec); +} +template <> +void VectorWriter::write_4ch(uint16* destination, __m128* vec) { write_2b_4ch(destination, vec); } template <> -void VectorWriter::write_4ch(int16 *destination, __m128 *vec) { +void VectorWriter::write_4ch(int16* destination, __m128* vec) { write_2b_4ch(destination, vec); } template <> -void VectorWriter::write_4ch(int32 *destination, __m128 *vec) { +void VectorWriter::write_4ch(int32* destination, __m128* vec) { write_4b_4ch(destination, vec); } template <> -void VectorWriter::write_4ch(Eigen::half *destination, - __m128 *vec) { +void VectorWriter::write_4ch(Eigen::half* destination, + __m128* vec) { write_2b_4ch(destination, vec); } template <> -void VectorWriter::write_4ch(bfloat16 *destination, __m128 *vec) { +void VectorWriter::write_4ch(bfloat16* destination, __m128* vec) { write_2b_4ch(destination, vec); } template <> -void VectorWriter::write_4ch(float *destination, __m128 *vec) { - _mm_storeu_si128((__m128i *)(destination), _mm_castps_si128(vec[0])); - 
_mm_storeu_si128((__m128i *)(destination + 4), _mm_castps_si128(vec[1])); - _mm_storeu_si128((__m128i *)(destination + 8), _mm_castps_si128(vec[2])); - _mm_storeu_si128((__m128i *)(destination + 12), _mm_castps_si128(vec[3])); +void VectorWriter::write_4ch(float* destination, __m128* vec) { + _mm_storeu_si128((__m128i*)(destination), _mm_castps_si128(vec[0])); + _mm_storeu_si128((__m128i*)(destination + 4), _mm_castps_si128(vec[1])); + _mm_storeu_si128((__m128i*)(destination + 8), _mm_castps_si128(vec[2])); + _mm_storeu_si128((__m128i*)(destination + 12), _mm_castps_si128(vec[3])); } template class CropResizeCastImage : public VectorLoader, public VectorWriter { -public: + public: CropResizeCastImage(const int in_height, const int in_width, const int out_height, const int out_width, const int channels, const int min_ix, const int max_ix, - const CachedInterpolation *xs, const int min_iy, - const int max_iy, const CachedInterpolation *ys, + const CachedInterpolation* xs, const int min_iy, + const int max_iy, const CachedInterpolation* ys, const float extrapolated_value, const bool flip_x, const bool flip_y, const bool verbose = false, const int allowed_load_groups = 15) - : verbose_(verbose), allowed_load_groups_(allowed_load_groups), - in_height_(in_height), in_width_(in_width), out_height_(out_height), - out_width_(out_width), channels_(channels), min_ix_(min_ix), - max_ix_(max_ix), min_iy_(min_iy), max_iy_(max_iy), ys_(ys), - extrapolated_value_(extrapolated_value), flip_x_(flip_x), - flip_y_(flip_y), in_row_size_(in_width * channels), + : verbose_(verbose), + allowed_load_groups_(allowed_load_groups), + in_height_(in_height), + in_width_(in_width), + out_height_(out_height), + out_width_(out_width), + channels_(channels), + min_ix_(min_ix), + max_ix_(max_ix), + min_iy_(min_iy), + max_iy_(max_iy), + ys_(ys), + extrapolated_value_(extrapolated_value), + flip_x_(flip_x), + flip_y_(flip_y), + in_row_size_(in_width * channels), in_row_size_bytes_(in_width * channels 
* sizeof(T)), out_row_size_(out_width * channels), x0_(flip_x ? out_width - 1 - max_ix : min_ix), @@ -3586,40 +3665,25 @@ public: } } ~CropResizeCastImage() { - if (general_x_ != NULL) - delete[] general_x_; - if (load1_x_ != NULL) - delete[] load1_x_; - if (load2_x_ != NULL) - delete[] load2_x_; - if (load4_x_ != NULL) - delete[] load4_x_; - if (load8_x_ != NULL) - delete[] load8_x_; - if (load1_offsets_ != NULL) - delete[] load1_offsets_; - if (load2_offsets_ != NULL) - delete[] load2_offsets_; - if (load4_offsets_ != NULL) - delete[] load4_offsets_; - if (load8_offsets_ != NULL) - delete[] load8_offsets_; - if (load1_shuffle_masks_ != NULL) - delete[] load1_shuffle_masks_; - if (load2_shuffle_masks_ != NULL) - delete[] load2_shuffle_masks_; - if (load1_mmxs_lerp_ != NULL) - delete[] load1_mmxs_lerp_; - if (load2_mmxs_lerp_ != NULL) - delete[] load2_mmxs_lerp_; - if (load4_mmxs_lerp_ != NULL) - delete[] load4_mmxs_lerp_; - if (load8_mmxs_lerp_ != NULL) - delete[] load8_mmxs_lerp_; + if (general_x_ != NULL) delete[] general_x_; + if (load1_x_ != NULL) delete[] load1_x_; + if (load2_x_ != NULL) delete[] load2_x_; + if (load4_x_ != NULL) delete[] load4_x_; + if (load8_x_ != NULL) delete[] load8_x_; + if (load1_offsets_ != NULL) delete[] load1_offsets_; + if (load2_offsets_ != NULL) delete[] load2_offsets_; + if (load4_offsets_ != NULL) delete[] load4_offsets_; + if (load8_offsets_ != NULL) delete[] load8_offsets_; + if (load1_shuffle_masks_ != NULL) delete[] load1_shuffle_masks_; + if (load2_shuffle_masks_ != NULL) delete[] load2_shuffle_masks_; + if (load1_mmxs_lerp_ != NULL) delete[] load1_mmxs_lerp_; + if (load2_mmxs_lerp_ != NULL) delete[] load2_mmxs_lerp_; + if (load4_mmxs_lerp_ != NULL) delete[] load4_mmxs_lerp_; + if (load8_mmxs_lerp_ != NULL) delete[] load8_mmxs_lerp_; delete[] xs_; } -private: + private: // constructor arguments const bool verbose_; // this value is meant for unit testing. 
@@ -3633,8 +3697,8 @@ private: const int in_height_, in_width_, out_height_, out_width_; const int channels_; const int min_ix_, max_ix_, min_iy_, max_iy_; - const CachedInterpolation *ys_; - CachedInterpolation *xs_; + const CachedInterpolation* ys_; + CachedInterpolation* xs_; const float extrapolated_value_; const bool flip_x_, flip_y_; // computed arguments @@ -3645,40 +3709,40 @@ private: const int y0_, y1_; // helper methods - void ResizeRow_load1_1ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr); - void ResizeRow_load2_1ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr); - void ResizeRow_load4_1ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr); - void ResizeRow_load8_1ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr); - void ResizeRow_load1_2ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr); - void ResizeRow_load2_2ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr); - void ResizeRow_load4_2ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr); - void ResizeRow_load8_2ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr); - void ResizeRow_load1_3ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr); - void ResizeRow_load2_3ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr); - void ResizeRow_load4_3ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr); - void ResizeRow_load8_3ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U 
*ysA_output_ptr); - void ResizeRow_load1_4ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr); - void ResizeRow_load2_4ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr); - void ResizeRow_load4_4ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr); - void ResizeRow_load8_4ch_(const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr); - void ResizeRow_general_(const float ys_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr); + void ResizeRow_load1_1ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load2_1ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load4_1ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load8_1ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load1_2ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load2_2ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load4_2ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load8_2ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load1_3ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load2_3ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void 
ResizeRow_load4_3ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load8_3ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load1_4ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load2_4ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load4_4ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_load8_4ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); + void ResizeRow_general_(const float ys_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr); // configuration parameters int num_general_, num_load1_, num_load2_, num_load4_, num_load8_; @@ -3692,17 +3756,17 @@ private: // configuration methods void Configure_(); int DetermineLoadGroup_(const int x); - bool ComputeXIndexRange_(const int x, int *min_xidx, int *max_xidx); - bool - Load1_ok_(const int min_xidx, - const int max_xidx); // xs - pointer to first xs for this load group - bool - Load2_ok_(const int min_xidx, - const int max_xidx); // xs - pointer to first xs for this load group + bool ComputeXIndexRange_(const int x, int* min_xidx, int* max_xidx); + bool Load1_ok_( + const int min_xidx, + const int max_xidx); // xs - pointer to first xs for this load group + bool Load2_ok_( + const int min_xidx, + const int max_xidx); // xs - pointer to first xs for this load group bool Load4_ok_(const int min_xidx, const int max_xidx); bool Load8_ok_(const int min_xidx, const int max_xidx); -public: + public: // // public client methods // @@ -3712,36 +3776,34 @@ public: static bool clip_necessary(); // resize image - void Resize(const T *input_image, U *output_image); + 
void Resize(const T* input_image, U* output_image); }; template -void CropResizeCastImage::Resize(const T *input_image, U *output_image) { +void CropResizeCastImage::Resize(const T* input_image, U* output_image) { // U uEx = cast_to(extrapolated_value_, _f_min_val, _f_max_val, _u_min_val, _u_max_val); // extrapolate top if (min_iy_ > 0) { - U *p = flip_y_ ? output_image + out_row_size_ * (out_height_ - min_iy_) + U* p = flip_y_ ? output_image + out_row_size_ * (out_height_ - min_iy_) : output_image; int nn = out_row_size_ * min_iy_; - for (int i = 0; i < nn; ++i) - p[i] = uEx; + for (int i = 0; i < nn; ++i) p[i] = uEx; } // extrapolate bottom if (max_iy_ < out_height_ - 1) { - U *p = + U* p = flip_y_ ? output_image : output_image + out_row_size_ * (max_iy_ + 1); int nn = out_row_size_ * (out_height_ - 1 - max_iy_); - for (int i = 0; i < nn; ++i) - p[i] = uEx; + for (int i = 0; i < nn; ++i) p[i] = uEx; } // extrapolate left if (min_ix_ > 0) { for (int iy = min_iy_; iy <= max_iy_; ++iy) { int xx0 = flip_x_ ? (out_width_ - min_ix_) * channels_ : 0; int nxx = min_ix_ * channels_; - U *p = output_image + xx0 + + U* p = output_image + xx0 + out_row_size_ * (flip_y_ ? out_height_ - 1 - iy : iy); for (int ix = 0; ix < nxx; ++ix) { p[ix] = uEx; @@ -3753,7 +3815,7 @@ void CropResizeCastImage::Resize(const T *input_image, U *output_image) { for (int iy = min_iy_; iy <= max_iy_; ++iy) { int xx0 = flip_x_ ? 0 : (max_ix_ + 1) * channels_; int nxx = (out_width_ - 1 - max_ix_) * channels_; - U *p = output_image + xx0 + + U* p = output_image + xx0 + out_row_size_ * (flip_y_ ? out_height_ - 1 - iy : iy); for (int ix = 0; ix < nxx; ++ix) { p[ix] = uEx; @@ -3767,20 +3829,20 @@ void CropResizeCastImage::Resize(const T *input_image, U *output_image) { const int iyA = flip_y_ ? 
out_height_ - 1 - min_iy_ - y : y - min_iy_; const float yA_lerp = ys_[iyA].lerp; const __m128 ysA_lerp = _mm_set1_ps(yA_lerp); - const T *ysA_input_lower_ptr = + const T* ysA_input_lower_ptr = input_image + ys_[iyA].lower * in_width_ * channels_; - const T *ysA_input_upper_ptr = + const T* ysA_input_upper_ptr = input_image + ys_[iyA].upper * in_width_ * channels_; - U *ysA_output_ptr = output_image + y * out_width_ * channels_; + U* ysA_output_ptr = output_image + y * out_width_ * channels_; const int iyB = flip_y_ ? out_height_ - 1 - min_iy_ - (y + 1) : (y + 1) - min_iy_; const float yB_lerp = ys_[iyB].lerp; const __m128 ysB_lerp = _mm_set1_ps(yB_lerp); - const T *ysB_input_lower_ptr = + const T* ysB_input_lower_ptr = input_image + ys_[iyB].lower * in_width_ * channels_; - const T *ysB_input_upper_ptr = + const T* ysB_input_upper_ptr = input_image + ys_[iyB].upper * in_width_ * channels_; - U *ysB_output_ptr = output_image + (y + 1) * out_width_ * channels_; + U* ysB_output_ptr = output_image + (y + 1) * out_width_ * channels_; if (channels_ == 1) { this->ResizeRow_load1_1ch_(ysA_lerp, ysA_input_lower_ptr, ysA_input_upper_ptr, ysA_output_ptr); @@ -3873,11 +3935,11 @@ void CropResizeCastImage::Resize(const T *input_image, U *output_image) { const int iyA = flip_y_ ? 
out_height_ - 1 - min_iy_ - y : y - min_iy_; const float yA_lerp = ys_[iyA].lerp; const __m128 ysA_lerp = _mm_set1_ps(yA_lerp); - const T *ysA_input_lower_ptr = + const T* ysA_input_lower_ptr = input_image + ys_[iyA].lower * in_width_ * channels_; - const T *ysA_input_upper_ptr = + const T* ysA_input_upper_ptr = input_image + ys_[iyA].upper * in_width_ * channels_; - U *ysA_output_ptr = output_image + y * out_width_ * channels_; + U* ysA_output_ptr = output_image + y * out_width_ * channels_; if (channels_ == 1) { this->ResizeRow_load1_1ch_(ysA_lerp, ysA_input_lower_ptr, ysA_input_upper_ptr, ysA_output_ptr); @@ -3931,9 +3993,9 @@ void CropResizeCastImage::Resize(const T *input_image, U *output_image) { template void CropResizeCastImage::ResizeRow_general_(const float ys_lerp, - const T *ys_input_lower_ptr, - const T *ys_input_upper_ptr, - U *output_y_ptr) { + const T* ys_input_lower_ptr, + const T* ys_input_upper_ptr, + U* output_y_ptr) { for (int current = 0; current < num_general_; ++current) { int x = general_x_[current]; const int ix = flip_x_ ? out_width_ - 1 - min_ix_ - x : x - min_ix_; @@ -3958,12 +4020,12 @@ void CropResizeCastImage::ResizeRow_general_(const float ys_lerp, // 1 channel image. 
template void CropResizeCastImage::ResizeRow_load1_1ch_( - const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr) { + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { for (int current = 0; current < num_load1_; ++current) { - __m128 *mmxs_lerp = - (__m128 *)(load1_shuffle_masks_ + current * CHANNELS * 3); - __m128i *shuffle_masks = (__m128i *)mmxs_lerp + CHANNELS; + __m128* mmxs_lerp = + (__m128*)(load1_shuffle_masks_ + current * CHANNELS * 3); + __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; #ifdef __AVX2__ __m256 left0, right0; this->load1_1ch(ysA_input_lower_ptr, ysA_input_upper_ptr, @@ -4000,12 +4062,12 @@ void CropResizeCastImage::ResizeRow_load1_1ch_( // 1 channel image. template void CropResizeCastImage::ResizeRow_load2_1ch_( - const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr) { + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { for (int current = 0; current < num_load2_; ++current) { - __m128 *mmxs_lerp = - (__m128 *)(load2_shuffle_masks_ + current * CHANNELS * 2); - __m128i *shuffle_masks = (__m128i *)mmxs_lerp + CHANNELS; + __m128* mmxs_lerp = + (__m128*)(load2_shuffle_masks_ + current * CHANNELS * 2); + __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; #ifdef __AVX2__ __m256 left0, right0; this->load2_1ch(ysA_input_lower_ptr, ysA_input_upper_ptr, @@ -4042,10 +4104,10 @@ void CropResizeCastImage::ResizeRow_load2_1ch_( // 1 channel image. 
template void CropResizeCastImage::ResizeRow_load4_1ch_( - const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr) { + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { for (int current = 0; current < num_load4_; ++current) { - __m128 *mmxs_lerp = (__m128 *)(load4_mmxs_lerp_ + current * CHANNELS); + __m128* mmxs_lerp = (__m128*)(load4_mmxs_lerp_ + current * CHANNELS); #ifdef __AVX2__ __m256 left0, right0; this->load4_1ch( @@ -4085,10 +4147,10 @@ void CropResizeCastImage::ResizeRow_load4_1ch_( // 1 channel image. template void CropResizeCastImage::ResizeRow_load8_1ch_( - const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr) { + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { for (int current = 0; current < num_load8_; ++current) { - __m128 *mmxs_lerp = (__m128 *)(load8_mmxs_lerp_ + current * CHANNELS); + __m128* mmxs_lerp = (__m128*)(load8_mmxs_lerp_ + current * CHANNELS); #ifdef __AVX2__ __m256 left0, right0; this->load8_1ch( @@ -4131,12 +4193,12 @@ void CropResizeCastImage::ResizeRow_load8_1ch_( // 2 channel image. 
template void CropResizeCastImage::ResizeRow_load1_2ch_( - const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr) { + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { for (int current = 0; current < num_load1_; ++current) { - __m128 *mmxs_lerp = - (__m128 *)(load1_shuffle_masks_ + current * CHANNELS * 3); - __m128i *shuffle_masks = (__m128i *)mmxs_lerp + CHANNELS; + __m128* mmxs_lerp = + (__m128*)(load1_shuffle_masks_ + current * CHANNELS * 3); + __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; #ifdef __AVX2__ __m256 left0, left1, right0, right1; this->load1_2ch(ysA_input_lower_ptr, ysA_input_upper_ptr, @@ -4184,12 +4246,12 @@ void CropResizeCastImage::ResizeRow_load1_2ch_( // 2 channel image. template void CropResizeCastImage::ResizeRow_load2_2ch_( - const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr) { + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { for (int current = 0; current < num_load2_; ++current) { - __m128 *mmxs_lerp = - (__m128 *)(load2_shuffle_masks_ + current * CHANNELS * 2); - __m128i *shuffle_masks = (__m128i *)mmxs_lerp + CHANNELS; + __m128* mmxs_lerp = + (__m128*)(load2_shuffle_masks_ + current * CHANNELS * 2); + __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; #ifdef __AVX2__ __m256 left0, left1, right0, right1; this->load2_2ch(ysA_input_lower_ptr, ysA_input_upper_ptr, @@ -4237,10 +4299,10 @@ void CropResizeCastImage::ResizeRow_load2_2ch_( // 2 channel image. 
template void CropResizeCastImage::ResizeRow_load4_2ch_( - const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr) { + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { for (int current = 0; current < num_load4_; ++current) { - __m128 *mmxs_lerp = (__m128 *)(load4_mmxs_lerp_ + current * CHANNELS); + __m128* mmxs_lerp = (__m128*)(load4_mmxs_lerp_ + current * CHANNELS); #ifdef __AVX2__ __m256 left0, left1, right0, right1; this->load4_2ch( @@ -4291,10 +4353,10 @@ void CropResizeCastImage::ResizeRow_load4_2ch_( // 2 channel image. template void CropResizeCastImage::ResizeRow_load8_2ch_( - const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr) { + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { for (int current = 0; current < num_load8_; ++current) { - __m128 *mmxs_lerp = (__m128 *)(load8_mmxs_lerp_ + current * CHANNELS); + __m128* mmxs_lerp = (__m128*)(load8_mmxs_lerp_ + current * CHANNELS); #ifdef __AVX2__ __m256 left0, left1, right0, right1; this->load8_2ch( @@ -4348,12 +4410,12 @@ void CropResizeCastImage::ResizeRow_load8_2ch_( // 3 channel image. 
template void CropResizeCastImage::ResizeRow_load1_3ch_( - const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr) { + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { for (int current = 0; current < num_load1_; ++current) { - __m128 *mmxs_lerp = - (__m128 *)(load1_shuffle_masks_ + current * CHANNELS * 3); - __m128i *shuffle_masks = (__m128i *)mmxs_lerp + CHANNELS; + __m128* mmxs_lerp = + (__m128*)(load1_shuffle_masks_ + current * CHANNELS * 3); + __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; #ifdef __AVX2__ __m256 left0, left1, left2, right0, right1, right2; this->load1_3ch(ysA_input_lower_ptr, ysA_input_upper_ptr, @@ -4411,12 +4473,12 @@ void CropResizeCastImage::ResizeRow_load1_3ch_( // 3 channel image. template void CropResizeCastImage::ResizeRow_load2_3ch_( - const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr) { + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { for (int current = 0; current < num_load2_; ++current) { - __m128 *mmxs_lerp = - (__m128 *)(load2_shuffle_masks_ + current * CHANNELS * 2); - __m128i *shuffle_masks = (__m128i *)mmxs_lerp + CHANNELS; + __m128* mmxs_lerp = + (__m128*)(load2_shuffle_masks_ + current * CHANNELS * 2); + __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; #ifdef __AVX2__ __m256 left0, left1, left2, right0, right1, right2; this->load2_3ch(ysA_input_lower_ptr, ysA_input_upper_ptr, @@ -4474,10 +4536,10 @@ void CropResizeCastImage::ResizeRow_load2_3ch_( // 3 channel image. 
template void CropResizeCastImage::ResizeRow_load4_3ch_( - const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr) { + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { for (int current = 0; current < num_load4_; ++current) { - __m128 *mmxs_lerp = (__m128 *)(load4_mmxs_lerp_ + current * CHANNELS); + __m128* mmxs_lerp = (__m128*)(load4_mmxs_lerp_ + current * CHANNELS); #ifdef __AVX2__ __m256 left0, left1, left2, right0, right1, right2; this->load4_3ch( @@ -4539,10 +4601,10 @@ void CropResizeCastImage::ResizeRow_load4_3ch_( // 3 channel image. template void CropResizeCastImage::ResizeRow_load8_3ch_( - const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr) { + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { for (int current = 0; current < num_load8_; ++current) { - __m128 *mmxs_lerp = (__m128 *)(load8_mmxs_lerp_ + current * CHANNELS); + __m128* mmxs_lerp = (__m128*)(load8_mmxs_lerp_ + current * CHANNELS); #ifdef __AVX2__ __m256 left0, left1, left2, right0, right1, right2; this->load8_3ch( @@ -4607,12 +4669,12 @@ void CropResizeCastImage::ResizeRow_load8_3ch_( // 4 channel image. 
template void CropResizeCastImage::ResizeRow_load1_4ch_( - const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr) { + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { for (int current = 0; current < num_load1_; ++current) { - __m128 *mmxs_lerp = - (__m128 *)(load1_shuffle_masks_ + current * CHANNELS * 3); - __m128i *shuffle_masks = (__m128i *)mmxs_lerp + CHANNELS; + __m128* mmxs_lerp = + (__m128*)(load1_shuffle_masks_ + current * CHANNELS * 3); + __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; #ifdef __AVX2__ __m256 left0, left1, left2, left3, right0, right1, right2, right3; this->load1_4ch(ysA_input_lower_ptr, ysA_input_upper_ptr, @@ -4682,12 +4744,12 @@ void CropResizeCastImage::ResizeRow_load1_4ch_( // 4 channel image. template void CropResizeCastImage::ResizeRow_load2_4ch_( - const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr) { + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { for (int current = 0; current < num_load2_; ++current) { - __m128 *mmxs_lerp = - (__m128 *)(load2_shuffle_masks_ + current * CHANNELS * 2); - __m128i *shuffle_masks = (__m128i *)mmxs_lerp + CHANNELS; + __m128* mmxs_lerp = + (__m128*)(load2_shuffle_masks_ + current * CHANNELS * 2); + __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; #ifdef __AVX2__ __m256 left0, left1, left2, left3, right0, right1, right2, right3; this->load2_4ch(ysA_input_lower_ptr, ysA_input_upper_ptr, @@ -4757,10 +4819,10 @@ void CropResizeCastImage::ResizeRow_load2_4ch_( // 4 channel image. 
template void CropResizeCastImage::ResizeRow_load4_4ch_( - const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr) { + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { for (int current = 0; current < num_load4_; ++current) { - __m128 *mmxs_lerp = (__m128 *)(load4_mmxs_lerp_ + current * CHANNELS); + __m128* mmxs_lerp = (__m128*)(load4_mmxs_lerp_ + current * CHANNELS); #ifdef __AVX2__ __m256 left0, left1, left2, left3, right0, right1, right2, right3; this->load4_4ch( @@ -4833,10 +4895,10 @@ void CropResizeCastImage::ResizeRow_load4_4ch_( // 4 channel image. template void CropResizeCastImage::ResizeRow_load8_4ch_( - const __m128 y_lerp, const T *ysA_input_lower_ptr, - const T *ysA_input_upper_ptr, U *ysA_output_ptr) { + const __m128 y_lerp, const T* ysA_input_lower_ptr, + const T* ysA_input_upper_ptr, U* ysA_output_ptr) { for (int current = 0; current < num_load8_; ++current) { - __m128 *mmxs_lerp = (__m128 *)(load8_mmxs_lerp_ + current * CHANNELS); + __m128* mmxs_lerp = (__m128*)(load8_mmxs_lerp_ + current * CHANNELS); #ifdef __AVX2__ __m256 left0, left1, left2, left3, right0, right1, right2, right3; this->load8_4ch( @@ -4907,23 +4969,22 @@ void CropResizeCastImage::ResizeRow_load8_4ch_( } #undef CHANNELS -template void CropResizeCastImage::Configure_() { +template +void CropResizeCastImage::Configure_() { // num_cases[0] = general case // num_cases[1] = load4from1 // num_cases[2] = load4from2 // num_cases[3] = load4from4 // num_cases[4] = load4from8 int num_cases[5]; - for (int i = 0; i < 5; ++i) - num_cases[i] = 0; + for (int i = 0; i < 5; ++i) num_cases[i] = 0; for (int x = x0_; x <= x1_; ++x) { int load_group = this->DetermineLoadGroup_(x); assert(load_group >= 0 && load_group <= 4); ++num_cases[load_group]; // load_group == 0 -> general case, pixel by pixel // every other value indidcates 1+3 = 4 pixels were processed this iteration - if (load_group > 0) - x += 
3; + if (load_group > 0) x += 3; } num_general_ = num_cases[0]; num_load1_ = num_cases[1]; @@ -4938,7 +4999,7 @@ template void CropResizeCastImage::Configure_() { if (num_load1_ > 0) { load1_offsets_ = new int[num_load1_]; load1_shuffle_masks_ = new __m128i[num_load1_ * channels_ * 3]; - load1_mmxs_lerp_ = NULL; // new __m128[num_load1_*channels_]; + load1_mmxs_lerp_ = NULL; // new __m128[num_load1_*channels_]; load1_x_ = new int[num_load1_]; } else { load1_offsets_ = NULL; @@ -4949,7 +5010,7 @@ template void CropResizeCastImage::Configure_() { if (num_load2_ > 0) { load2_offsets_ = new int[num_load2_]; load2_shuffle_masks_ = new __m128i[num_load2_ * channels_ * 2]; - load2_mmxs_lerp_ = NULL; // new __m128[num_load2_*channels_]; + load2_mmxs_lerp_ = NULL; // new __m128[num_load2_*channels_]; load2_x_ = new int[num_load2_]; } else { load2_offsets_ = NULL; @@ -4975,8 +5036,7 @@ template void CropResizeCastImage::Configure_() { load8_mmxs_lerp_ = NULL; load8_x_ = NULL; } - for (int i = 0; i < 5; ++i) - num_cases[i] = 0; + for (int i = 0; i < 5; ++i) num_cases[i] = 0; if (verbose_) { printf(" load4from1 = %d\n", num_load1_); printf(" load4from2 = %d\n", num_load2_); @@ -5000,19 +5060,17 @@ template void CropResizeCastImage::Configure_() { int min_xidx, max_xidx; ComputeXIndexRange_(x, &min_xidx, &max_xidx); load1_offsets_[current] = min_xidx * channels_; - float *xs_lerp = - (float *)(load1_shuffle_masks_ + current * channels_ * 3); - char *shufmasks1 = - (char *)(load1_shuffle_masks_ + current * channels_ * 3 + channels_); - char *shufmasks2 = shufmasks1 + 16 * channels_; - for (int j = 0; j < 32 * channels_; ++j) - shufmasks1[j] = -128; + float* xs_lerp = (float*)(load1_shuffle_masks_ + current * channels_ * 3); + char* shufmasks1 = + (char*)(load1_shuffle_masks_ + current * channels_ * 3 + channels_); + char* shufmasks2 = shufmasks1 + 16 * channels_; + for (int j = 0; j < 32 * channels_; ++j) shufmasks1[j] = -128; for (int pix = 0; pix < 4; ++pix) { const int ix = 
flip_x_ ? out_width_ - 1 - min_ix_ - (x + pix) : (x + pix) - min_ix_; float lerp = xs_[ix].lerp; int widx0 = xs_[ix].lower - - load1_offsets_[current]; // word index within SSE vector + load1_offsets_[current]; // word index within SSE vector for (int ch = 0; ch < channels_; ++ch) { int idx = pix * channels_ + ch; xs_lerp[idx] = lerp; @@ -5034,18 +5092,16 @@ template void CropResizeCastImage::Configure_() { int min_xidx, max_xidx; ComputeXIndexRange_(x, &min_xidx, &max_xidx); load2_offsets_[current] = min_xidx * channels_; - float *xs_lerp = - (float *)(load2_shuffle_masks_ + current * channels_ * 2); - char *shufmasks1 = - (char *)(load2_shuffle_masks_ + current * channels_ * 2 + channels_); - for (int j = 0; j < 16 * channels_; ++j) - shufmasks1[j] = -128; + float* xs_lerp = (float*)(load2_shuffle_masks_ + current * channels_ * 2); + char* shufmasks1 = + (char*)(load2_shuffle_masks_ + current * channels_ * 2 + channels_); + for (int j = 0; j < 16 * channels_; ++j) shufmasks1[j] = -128; for (int pix = 0; pix < 4; ++pix) { const int ix = flip_x_ ? out_width_ - 1 - min_ix_ - (x + pix) : (x + pix) - min_ix_; float lerp = xs_[ix].lerp; int widx0 = xs_[ix].lower - - load2_offsets_[current]; // word index within SSE vector + load2_offsets_[current]; // word index within SSE vector for (int ch = 0; ch < channels_; ++ch) { int idx = pix * channels_ + ch; xs_lerp[idx] = lerp; @@ -5062,8 +5118,8 @@ template void CropResizeCastImage::Configure_() { // load4from4 assert(current < num_load4_); load4_x_[current] = x; - int *index = load4_offsets_ + current * 4; - float *xs_lerp = (float *)(load4_mmxs_lerp_ + current * channels_); + int* index = load4_offsets_ + current * 4; + float* xs_lerp = (float*)(load4_mmxs_lerp_ + current * channels_); for (int pix = 0; pix < 4; ++pix) { const int ix = flip_x_ ? 
out_width_ - 1 - min_ix_ - (x + pix) : (x + pix) - min_ix_; @@ -5078,8 +5134,8 @@ template void CropResizeCastImage::Configure_() { // load4from8 assert(current < num_load8_); load8_x_[current] = x; - int *index = load8_offsets_ + current * 4; - float *xs_lerp = (float *)(load8_mmxs_lerp_ + current * channels_); + int* index = load8_offsets_ + current * 4; + float* xs_lerp = (float*)(load8_mmxs_lerp_ + current * channels_); for (int pix = 0; pix < 4; ++pix) { const int ix = flip_x_ ? out_width_ - 1 - min_ix_ - (x + pix) : (x + pix) - min_ix_; @@ -5096,8 +5152,7 @@ template void CropResizeCastImage::Configure_() { ++num_cases[load_group]; // load_group == 0 -> general case, pixel by pixel // every other value indidcates 1+3 = 4 pixels were processed this iteration - if (load_group > 0) - x += 3; + if (load_group > 0) x += 3; } } @@ -5143,8 +5198,8 @@ int CropResizeCastImage::DetermineLoadGroup_(const int x) { // Compute range of x indexes for xs[0] through xs[3]. // Returns true if valid (xs[i].lower + channels == xs[i].upper for all pixels). 
template -bool CropResizeCastImage::ComputeXIndexRange_(const int x, int *min_xidx, - int *max_xidx) { +bool CropResizeCastImage::ComputeXIndexRange_(const int x, int* min_xidx, + int* max_xidx) { bool upper_is_lower_plus_one = true; *min_xidx = 0; *max_xidx = -1; @@ -5157,10 +5212,8 @@ bool CropResizeCastImage::ComputeXIndexRange_(const int x, int *min_xidx, *min_xidx = curr_xidx; *max_xidx = curr_xidx; } else { - if (curr_xidx < *min_xidx) - *min_xidx = curr_xidx; - if (curr_xidx > *max_xidx) - *max_xidx = curr_xidx; + if (curr_xidx < *min_xidx) *min_xidx = curr_xidx; + if (curr_xidx > *max_xidx) *max_xidx = curr_xidx; } } else { upper_is_lower_plus_one = false; @@ -5260,158 +5313,206 @@ bool CropResizeCastImage::Load8_ok_(const int min_xidx, // full implementations of templated static member function clip_necessary() // -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool 
CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool 
CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool 
CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } template <> bool CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return true; } -template <> bool CropResizeCastImage::clip_necessary() { +template <> +bool CropResizeCastImage::clip_necessary() { return false; } @@ -5423,14 +5524,14 @@ template <> bool CropResizeCastImage::clip_necessary() { #define CROP_RESIZE_SINGLE_IMAGE_VECT(T_type, U_type) \ template <> \ void crop_resize_single_image_common( \ - const T_type *image, const int64 in_height, const int64 in_width, \ + const T_type* image, const int64 in_height, const int64 in_width, \ const int64 out_height, const int64 out_width, const int channels, \ - const int min_ix, const int max_ix, const CachedInterpolation *xs, \ - const int min_iy, const int max_iy, const CachedInterpolation *ys, \ + const int min_ix, const int max_ix, const CachedInterpolation* xs, \ + const int min_iy, const int max_iy, const CachedInterpolation* 
ys, \ const float extrapolated_value, const bool flip_x, const bool flip_y, \ - U_type *output) { \ + U_type* output) { \ if (channels <= 4) { \ - CropResizeCastImage *resizer = \ + CropResizeCastImage* resizer = \ new CropResizeCastImage( \ in_height, in_width, out_height, out_width, channels, min_ix, \ max_ix, xs, min_iy, max_iy, ys, extrapolated_value, flip_x, \ @@ -5459,19 +5560,19 @@ CROP_RESIZE_SINGLE_IMAGE_VECT(float, float) // image resizing for these data types default to the original code. // at the moment, this is int64 and double. -#define CROP_RESIZE_SINGLE_IMAGE_REGULAR(T_type, U_type) \ - template <> \ - void crop_resize_single_image_common( \ - const T_type *image, const int64 in_height, const int64 in_width, \ - const int64 out_height, const int64 out_width, const int channels, \ - const int min_ix, const int max_ix, const CachedInterpolation *xs, \ - const int min_iy, const int max_iy, const CachedInterpolation *ys, \ - const float extrapolated_value, const bool flip_x, const bool flip_y, \ - U_type *output) { \ - crop_resize_single_image(image, in_height, in_width, out_height, \ - out_width, channels, min_ix, max_ix, xs, min_iy, \ - max_iy, ys, extrapolated_value, flip_x, flip_y, \ - output); \ +#define CROP_RESIZE_SINGLE_IMAGE_REGULAR(T_type, U_type) \ + template <> \ + void crop_resize_single_image_common( \ + const T_type* image, const int64 in_height, const int64 in_width, \ + const int64 out_height, const int64 out_width, const int channels, \ + const int min_ix, const int max_ix, const CachedInterpolation* xs, \ + const int min_iy, const int max_iy, const CachedInterpolation* ys, \ + const float extrapolated_value, const bool flip_x, const bool flip_y, \ + U_type* output) { \ + crop_resize_single_image(image, in_height, in_width, out_height, \ + out_width, channels, min_ix, max_ix, xs, min_iy, \ + max_iy, ys, extrapolated_value, flip_x, flip_y, \ + output); \ } CROP_RESIZE_SINGLE_IMAGE_REGULAR(int64, float) @@ -5485,12 +5586,12 @@ 
CROP_RESIZE_SINGLE_IMAGE_REGULAR(double, float) template void crop_resize_single_image_common( - const T *image, const int64 in_height, const int64 in_width, + const T* image, const int64 in_height, const int64 in_width, const int64 out_height, const int64 out_width, const int channels, - const int min_ix, const int max_ix, const CachedInterpolation *xs, - const int min_iy, const int max_iy, const CachedInterpolation *ys, + const int min_ix, const int max_ix, const CachedInterpolation* xs, + const int min_iy, const int max_iy, const CachedInterpolation* ys, const float extrapolated_value, const bool flip_x, const bool flip_y, - U *output) { + U* output) { crop_resize_single_image(image, in_height, in_width, out_height, out_width, channels, min_ix, max_ix, xs, min_iy, max_iy, ys, extrapolated_value, flip_x, flip_y, output); @@ -5498,6 +5599,6 @@ void crop_resize_single_image_common( #endif -} // namespace -} // namespace tensorflow -#endif // define TENSORFLOW_CORE_KERNELS_CROP_RESIZE_BILINEAR_CORE_H_ +} // namespace +} // namespace tensorflow +#endif // define TENSORFLOW_CORE_KERNELS_CROP_RESIZE_BILINEAR_CORE_H_ From 51818c19a6245ae39054a2e9ee42b01b9e8eca13 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Mon, 5 Nov 2018 08:27:45 -0800 Subject: [PATCH 086/540] Add gemmlowp profiling output support to TFLite's benchmark_model Enabled by adding `--copt=-DGEMMLOWP_PROFILING` to the build flags. 
PiperOrigin-RevId: 220107124 --- tensorflow/lite/tools/benchmark/BUILD | 1 + .../tools/benchmark/benchmark_tflite_model.cc | 24 ++++++++++++++++--- .../tools/benchmark/benchmark_tflite_model.h | 15 ++++++++++-- 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/tools/benchmark/BUILD b/tensorflow/lite/tools/benchmark/BUILD index 7646bdcca9f..583046ad73d 100644 --- a/tensorflow/lite/tools/benchmark/BUILD +++ b/tensorflow/lite/tools/benchmark/BUILD @@ -112,6 +112,7 @@ cc_library( "//tensorflow/lite:string_util", "//tensorflow/lite/kernels:builtin_ops", "//tensorflow/lite/profiling:profile_summarizer", + "@gemmlowp", ], ) diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc index e063a144b66..777d9dde7dd 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc +++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc @@ -29,6 +29,10 @@ limitations under the License. #include "tensorflow/lite/string_util.h" #include "tensorflow/lite/tools/benchmark/logging.h" +#ifdef GEMMLOWP_PROFILING +#include "third_party/gemmlowp/profiling/profiler.h" +#endif + #ifdef TFLITE_CUSTOM_OPS_HEADER void RegisterSelectedOps(::tflite::MutableOpResolver* resolver); #endif @@ -62,6 +66,21 @@ void ProfilingListener::OnSingleRunEnd() { summarizer_.ProcessProfiles(profile_events, *interpreter_); } +void GemmlowpProfilingListener::OnBenchmarkStart( + const BenchmarkParams& params) { +#ifdef GEMMLOWP_PROFILING + gemmlowp::RegisterCurrentThreadForProfiling(); + gemmlowp::StartProfiling(); +#endif +} + +void GemmlowpProfilingListener::OnBenchmarkEnd( + const BenchmarkResults& results) { +#ifdef GEMMLOWP_PROFILING + gemmlowp::FinishProfiling(); +#endif +} + namespace { std::vector Split(const std::string& str, const char delim) { @@ -176,13 +195,12 @@ BenchmarkParams GetDefaultParams() { } // namespace BenchmarkTfLiteModel::BenchmarkTfLiteModel() - : 
BenchmarkModel(GetDefaultParams()) { - AddListener(&profiling_listener_); -} + : BenchmarkTfLiteModel(GetDefaultParams()) {} BenchmarkTfLiteModel::BenchmarkTfLiteModel(BenchmarkParams params) : BenchmarkModel(std::move(params)) { AddListener(&profiling_listener_); + AddListener(&gemmlowp_profiling_listener_); } std::vector BenchmarkTfLiteModel::GetFlags() { diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h index 8ad3a5dbe50..401ab5427d3 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h +++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h @@ -27,7 +27,7 @@ limitations under the License. namespace tflite { namespace benchmark { -// Dumps profiling events if profiling is enabled +// Dumps profiling events if profiling is enabled. class ProfilingListener : public BenchmarkListener { public: explicit ProfilingListener() : interpreter_(nullptr), has_profiles_(false) {} @@ -47,11 +47,21 @@ class ProfilingListener : public BenchmarkListener { bool has_profiles_; }; +// Dumps gemmlowp profiling events if gemmlowp profiling is enabled. +class GemmlowpProfilingListener : public BenchmarkListener { + public: + virtual ~GemmlowpProfilingListener() {} + + void OnBenchmarkStart(const BenchmarkParams& params) override; + + void OnBenchmarkEnd(const BenchmarkResults& results) override; +}; + // Benchmarks a TFLite model by running tflite interpreter. 
class BenchmarkTfLiteModel : public BenchmarkModel { public: BenchmarkTfLiteModel(); - BenchmarkTfLiteModel(BenchmarkParams params); + explicit BenchmarkTfLiteModel(BenchmarkParams params); virtual ~BenchmarkTfLiteModel() {} std::vector GetFlags() override; @@ -74,6 +84,7 @@ class BenchmarkTfLiteModel : public BenchmarkModel { std::unique_ptr interpreter; std::vector inputs; ProfilingListener profiling_listener_; + GemmlowpProfilingListener gemmlowp_profiling_listener_; }; } // namespace benchmark From 4c605ca9d368c8ef8f13dd4c798da8bf3508668a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 5 Nov 2018 08:33:26 -0800 Subject: [PATCH 087/540] Fix unhelpful error message For 99% of all usecases, if the expected shape differs from the actual shape, people will typically rerun with an additional print statement to see what the actual output was. PiperOrigin-RevId: 220107992 --- tensorflow/python/framework/test_util.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index f561d16bc3e..0516c686699 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -1670,9 +1670,16 @@ class TensorFlowTestCase(googletest.TestCase): msg = msg if msg else "" a = self._GetNdArray(a) b = self._GetNdArray(b) - self.assertEqual( - a.shape, b.shape, "Shape mismatch: expected %s, got %s." - " %s" % (a.shape, b.shape, msg)) + # Arbitrary bounds so that we don't print giant tensors. + if (b.ndim <= 3 or b.size < 500): + self.assertEqual( + a.shape, b.shape, "Shape mismatch: expected %s, got %s." + " Contents: %s. \n%s." % (a.shape, b.shape, b, msg)) + else: + self.assertEqual( + a.shape, b.shape, "Shape mismatch: expected %s, got %s." + " %s" % (a.shape, b.shape, msg)) + same = (a == b) if (a.dtype in [ From 81ae44d1245acc44a1f703182219051ec2c21a0e Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Mon, 5 Nov 2018 08:38:38 -0800 Subject: [PATCH 088/540] Convert device_coordinates to tuples to retrieve correct device and task ordinals. PiperOrigin-RevId: 220108785 --- tensorflow/contrib/tpu/python/tpu/topology.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/contrib/tpu/python/tpu/topology.py b/tensorflow/contrib/tpu/python/tpu/topology.py index b6bb5c6e56c..6ae718cc2c9 100644 --- a/tensorflow/contrib/tpu/python/tpu/topology.py +++ b/tensorflow/contrib/tpu/python/tpu/topology.py @@ -189,12 +189,13 @@ class Topology(object): def cpu_device_name_at_coordinates(self, device_coordinates, job=None): """Returns the CPU device attached to a logical core.""" return _tpu_host_device_name( - job, self._topology_tasks[device_coordinates]) + job, self._topology_tasks[tuple(device_coordinates)]) def tpu_device_name_at_coordinates(self, device_coordinates, job=None): """Returns the name of the TPU device assigned to a logical core.""" - return _tpu_device_name(job, self._topology_tasks[device_coordinates], - self._topology_devices[device_coordinates]) + return _tpu_device_name(job, + self._topology_tasks[tuple(device_coordinates)], + self._topology_devices[tuple(device_coordinates)]) @property def num_tasks(self): From aa23de83de1f847698861070f4c2401cf1ba7531 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Mon, 5 Nov 2018 08:46:27 -0800 Subject: [PATCH 089/540] Enable TFLite Java tests in oss PiperOrigin-RevId: 220110024 --- tensorflow/lite/java/BUILD | 8 +-- .../lite/InterpreterMobileNetTest.java | 57 +++++++++++++++++++ .../org/tensorflow/lite/InterpreterTest.java | 37 ++---------- tensorflow/lite/java/src/test/native/BUILD | 4 ++ .../src/test/native/interpreter_test_jni.cc | 30 +++++++--- 5 files changed, 89 insertions(+), 47 deletions(-) create mode 100644 tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java diff --git a/tensorflow/lite/java/BUILD 
b/tensorflow/lite/java/BUILD index e6f47a9773a..cf759fa00c6 100644 --- a/tensorflow/lite/java/BUILD +++ b/tensorflow/lite/java/BUILD @@ -90,7 +90,6 @@ java_test( size = "small", srcs = ["src/test/java/org/tensorflow/lite/TensorFlowLiteTest.java"], javacopts = JAVACOPTS, - tags = ["no_oss"], test_class = "org.tensorflow.lite.TensorFlowLiteTest", deps = [ ":tensorflowlitelib", @@ -104,7 +103,6 @@ java_test( size = "small", srcs = ["src/test/java/org/tensorflow/lite/DataTypeTest.java"], javacopts = JAVACOPTS, - tags = ["no_oss"], test_class = "org.tensorflow.lite.DataTypeTest", deps = [ ":tensorflowlitelib", @@ -122,11 +120,11 @@ java_test( "src/testdata/int32.bin", "src/testdata/int64.bin", "src/testdata/invalid_model.bin", + "src/testdata/quantized.bin", "src/testdata/uint8.bin", "src/testdata/with_custom_op.lite", ], javacopts = JAVACOPTS, - tags = ["no_oss"], test_class = "org.tensorflow.lite.NativeInterpreterWrapperTest", deps = [ ":tensorflowlitelib", @@ -142,11 +140,9 @@ java_test( srcs = ["src/test/java/org/tensorflow/lite/InterpreterTest.java"], data = [ "src/testdata/add.bin", - "src/testdata/mobilenet.tflite.bin", "//tensorflow/lite:testdata/multi_add_flex.bin", ], javacopts = JAVACOPTS, - tags = ["no_oss"], test_class = "org.tensorflow.lite.InterpreterTest", visibility = ["//visibility:private"], deps = [ @@ -165,7 +161,6 @@ java_test( "//tensorflow/lite:testdata/multi_add_flex.bin", ], javacopts = JAVACOPTS, - tags = ["no_oss"], test_class = "org.tensorflow.lite.InterpreterFlexTest", visibility = ["//visibility:private"], deps = [ @@ -183,7 +178,6 @@ java_test( "src/testdata/add.bin", ], javacopts = JAVACOPTS, - tags = ["no_oss"], test_class = "org.tensorflow.lite.TensorTest", deps = [ ":tensorflowlitelib", diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java new file mode 100644 index 00000000000..b69bfa076e2 --- /dev/null 
+++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java @@ -0,0 +1,57 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +package org.tensorflow.lite; + +import static com.google.common.truth.Truth.assertThat; + +import java.io.File; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Unit tests for {@link org.tensorflow.lite.Interpreter} agains a MobileNet model. */ +@RunWith(JUnit4.class) +public final class InterpreterMobileNetTest { + + private static final File MOBILENET_MODEL_FILE = + new File("tensorflow/lite/java/src/testdata/mobilenet.tflite.bin"); + + @Test + public void testMobilenetRun() { + // Create a gray image. + float[][][][] img = new float[1][224][224][3]; + for (int i = 0; i < 224; ++i) { + for (int j = 0; j < 224; ++j) { + img[0][i][j][0] = 0.5f; + img[0][i][j][1] = 0.5f; + img[0][i][j][2] = 0.5f; + } + } + + // Allocate memory to receive the output values. 
+ float[][] labels = new float[1][1001]; + + Interpreter interpreter = new Interpreter(MOBILENET_MODEL_FILE); + interpreter.run(img, labels); + assertThat(interpreter.getInputTensor(0).shape()).isEqualTo(new int[] {1, 224, 224, 3}); + assertThat(interpreter.getOutputTensor(0).shape()).isEqualTo(new int[] {1, 1001}); + interpreter.close(); + + assertThat(labels[0]) + .usingExactEquality() + .containsNoneOf(new float[] {Float.NaN, Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY}); + } +} diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java index 612229d1727..7e591b009d2 100644 --- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java +++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java @@ -40,9 +40,6 @@ public final class InterpreterTest { private static final File MODEL_FILE = new File("tensorflow/lite/java/src/testdata/add.bin"); - private static final File MOBILENET_MODEL_FILE = - new File("tensorflow/lite/java/src/testdata/mobilenet.tflite.bin"); - private static final File FLEX_MODEL_FILE = new File("tensorflow/lite/testdata/multi_add_flex.bin"); @@ -214,32 +211,6 @@ public final class InterpreterTest { } } - @Test - public void testMobilenetRun() { - // Create a gray image. - float[][][][] img = new float[1][224][224][3]; - for (int i = 0; i < 224; ++i) { - for (int j = 0; j < 224; ++j) { - img[0][i][j][0] = 0.5f; - img[0][i][j][1] = 0.5f; - img[0][i][j][2] = 0.5f; - } - } - - // Allocate memory to receive the output values. 
- float[][] labels = new float[1][1001]; - - Interpreter interpreter = new Interpreter(MOBILENET_MODEL_FILE); - interpreter.run(img, labels); - assertThat(interpreter.getInputTensor(0).shape()).isEqualTo(new int[] {1, 224, 224, 3}); - assertThat(interpreter.getOutputTensor(0).shape()).isEqualTo(new int[] {1, 1001}); - interpreter.close(); - - assertThat(labels[0]) - .usingExactEquality() - .containsNoneOf(new float[] {Float.NaN, Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY}); - } - @Test public void testRunWithWrongInputType() { Interpreter interpreter = new Interpreter(MODEL_FILE); @@ -286,7 +257,7 @@ public final class InterpreterTest { @Test public void testGetInputIndex() { - Interpreter interpreter = new Interpreter(MOBILENET_MODEL_FILE); + Interpreter interpreter = new Interpreter(MODEL_FILE); try { interpreter.getInputIndex("WrongInputName"); fail(); @@ -303,7 +274,7 @@ public final class InterpreterTest { @Test public void testGetOutputIndex() { - Interpreter interpreter = new Interpreter(MOBILENET_MODEL_FILE); + Interpreter interpreter = new Interpreter(MODEL_FILE); try { interpreter.getOutputIndex("WrongOutputName"); fail(); @@ -312,9 +283,9 @@ public final class InterpreterTest { .hasMessageThat() .contains( "'WrongOutputName' is not a valid name for any output. 
Names of outputs and their" - + " indexes are {MobilenetV1/Predictions/Softmax=0}"); + + " indexes are {output=0}"); } - int index = interpreter.getOutputIndex("MobilenetV1/Predictions/Softmax"); + int index = interpreter.getOutputIndex("output"); assertThat(index).isEqualTo(0); } diff --git a/tensorflow/lite/java/src/test/native/BUILD b/tensorflow/lite/java/src/test/native/BUILD index 27fc95f1f7f..4d3e82b1ac1 100644 --- a/tensorflow/lite/java/src/test/native/BUILD +++ b/tensorflow/lite/java/src/test/native/BUILD @@ -21,6 +21,10 @@ cc_library( "//tensorflow/lite/java/src/main/native:jni_md.h", ], }), + includes = select({ + "//tensorflow:android": [], + "//conditions:default": ["../../main/native/."], + }), deps = ["//tensorflow/lite/c:c_api_internal"], ) diff --git a/tensorflow/lite/java/src/test/native/interpreter_test_jni.cc b/tensorflow/lite/java/src/test/native/interpreter_test_jni.cc index 4a5c18d9438..d83cb4cd305 100644 --- a/tensorflow/lite/java/src/test/native/interpreter_test_jni.cc +++ b/tensorflow/lite/java/src/test/native/interpreter_test_jni.cc @@ -25,6 +25,8 @@ Java_org_tensorflow_lite_InterpreterTest_getNativeHandleForDelegate( JNIEnv* env, jclass clazz) { // A simple op which outputs a vector of length 1 with the value [7]. static TfLiteRegistration registration = { + .init = nullptr, + .free = nullptr, .prepare = [](TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = &context->tensors[node->outputs->data[0]]; @@ -38,11 +40,16 @@ Java_org_tensorflow_lite_InterpreterTest_getNativeHandleForDelegate( TfLiteTensor* output = &context->tensors[node->outputs->data[0]]; output->data.f[0] = 7.0f; return kTfLiteOk; - }}; + }, + .profiling_string = nullptr, + .builtin_code = 0, + .custom_name = "", + .version = 1, + }; // A simple delegate which replaces all ops with a single op that outputs a // vector of length 1 with the value [7]. 
static TfLiteDelegate delegate = { - .flags = kTfLiteDelegateFlagsAllowDynamicTensors, + .data_ = nullptr, .Prepare = [](TfLiteContext* context, TfLiteDelegate* delegate) -> TfLiteStatus { TfLiteIntArray* execution_plan; @@ -51,7 +58,12 @@ Java_org_tensorflow_lite_InterpreterTest_getNativeHandleForDelegate( context->ReplaceSubgraphsWithDelegateKernels(context, registration, execution_plan, delegate); return kTfLiteOk; - }}; + }, + .CopyFromBufferHandle = nullptr, + .CopyToBufferHandle = nullptr, + .FreeBufferHandle = nullptr, + .flags = kTfLiteDelegateFlagsAllowDynamicTensors, + }; return reinterpret_cast(&delegate); } @@ -60,10 +72,14 @@ Java_org_tensorflow_lite_InterpreterTest_getNativeHandleForInvalidDelegate( JNIEnv* env, jclass clazz) { // A simple delegate that fails during preparation. static TfLiteDelegate delegate = { - .Prepare = [](TfLiteContext* context, - TfLiteDelegate* delegate) -> TfLiteStatus { - return kTfLiteError; - }}; + .data_ = nullptr, + .Prepare = [](TfLiteContext* context, TfLiteDelegate* delegate) + -> TfLiteStatus { return kTfLiteError; }, + .CopyFromBufferHandle = nullptr, + .CopyToBufferHandle = nullptr, + .FreeBufferHandle = nullptr, + .flags = kTfLiteDelegateFlagsNone, + }; return reinterpret_cast(&delegate); } From 32d73e410769228c78e100bc55242ac40e7875fe Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 5 Nov 2018 08:50:17 -0800 Subject: [PATCH 090/540] allow tensors to go from one flex subgraph to another, without convertion to CPU. 
PiperOrigin-RevId: 220110546 --- tensorflow/lite/delegates/flex/buffer_map.cc | 8 ++++- tensorflow/lite/delegates/flex/buffer_map.h | 17 +++++++++- .../lite/delegates/flex/buffer_map_test.cc | 2 ++ tensorflow/lite/delegates/flex/kernel.cc | 7 ++++- tensorflow/lite/delegates/flex/kernel_test.cc | 31 ++++++++++++++++--- 5 files changed, 58 insertions(+), 7 deletions(-) diff --git a/tensorflow/lite/delegates/flex/buffer_map.cc b/tensorflow/lite/delegates/flex/buffer_map.cc index 2c4aa7075db..9a6c5e74a7b 100644 --- a/tensorflow/lite/delegates/flex/buffer_map.cc +++ b/tensorflow/lite/delegates/flex/buffer_map.cc @@ -130,6 +130,10 @@ bool BufferMap::HasTensor(int tensor_index) const { return id_to_tensor_.count(tensor_index) != 0; } +bool BufferMap::IsTensorFlowTensor(int tensor_index) const { + return HasTensor(tensor_index) && owned_by_tf_.count(tensor_index) > 0; +} + tensorflow::Tensor BufferMap::GetTensor(int tensor_index) const { return id_to_tensor_.at(tensor_index); } @@ -154,11 +158,13 @@ void BufferMap::SetFromTfLite(int tensor_index, const TfLiteTensor* tensor) { GetTensorFlowDataType(tensor->type), shape, buf); buf->Unref(); - SetFromTensorFlow(tensor_index, std::move(t)); + id_to_tensor_[tensor_index] = std::move(t); + owned_by_tf_.erase(tensor_index); } void BufferMap::SetFromTensorFlow(int tensor_index, tensorflow::Tensor tensor) { id_to_tensor_[tensor_index] = std::move(tensor); + owned_by_tf_.insert(tensor_index); } } // namespace flex diff --git a/tensorflow/lite/delegates/flex/buffer_map.h b/tensorflow/lite/delegates/flex/buffer_map.h index 269a0a2a276..b73ed88d378 100644 --- a/tensorflow/lite/delegates/flex/buffer_map.h +++ b/tensorflow/lite/delegates/flex/buffer_map.h @@ -38,12 +38,17 @@ class BufferMap { // tensorflow::Tensor. 
bool HasTensor(int tensor_index) const; + // Returns true if the given 'tensor_index' has a corresponding + // tensorflow::Tensor *and* the content is owned by TensorFlow (that is, the + // mapping was added by SetFromTensorFlow()). + bool IsTensorFlowTensor(int tensor_index) const; + // Returns the tensorflow::Tensor associated with the given 'tensor_index'. // Precondition: HasTensor() is true. tensorflow::Tensor GetTensor(int tensor_index) const; // Associates the given tensorflow::Tensor with the given 'tensor_index'. - // Note that tensorflow Tensors share data buffers, so this method is only a + // Note that TensorFlow Tensors share data buffers, so this method is only a // shallow copy. void SetFromTensorFlow(int tensor_index, tensorflow::Tensor tensor); @@ -52,7 +57,17 @@ class BufferMap { void SetFromTfLite(int tensor_index, const TfLiteTensor* tensor); private: + // Mapping from TL Lite tensor ID to TensorFlow's Tensor. All tensors that + // are inputs or outputs of a subgraph will be added here, irrespective of + // whether their data are managed by TF Lite or TensorFlow. std::map id_to_tensor_; + // A list of tensors that are completely managed by TensorFlow. Most of the + // time, TF Lite will populate tensors that are inputs to subgraphs, while + // TensorFlow will populate output tensors. Occasionally, however, an input + // tensor is coming from a previous subgraph and could have been populated by + // TensorFlow. This set keeps track of all input or output tensors that have + // been populated by tensorflow. 
+ std::set owned_by_tf_; }; } // namespace flex diff --git a/tensorflow/lite/delegates/flex/buffer_map_test.cc b/tensorflow/lite/delegates/flex/buffer_map_test.cc index fd52273fb40..9e8472f1e7d 100644 --- a/tensorflow/lite/delegates/flex/buffer_map_test.cc +++ b/tensorflow/lite/delegates/flex/buffer_map_test.cc @@ -203,6 +203,7 @@ TEST(BufferMapTest, TfLiteOverwritesTensorFlow) { buffer_map.SetFromTensorFlow(0, t1); buffer_map.SetFromTfLite(0, t2.get()); + EXPECT_FALSE(buffer_map.IsTensorFlowTensor(0)); EXPECT_THAT(GetTensorData(buffer_map.GetTensor(0)), ElementsAre(0, 0, 0, 3, 0, 0, 1, 2)); } @@ -216,6 +217,7 @@ TEST(BufferMapTest, TensorFlowOverwritesTfLite) { buffer_map.SetFromTfLite(0, t2.get()); buffer_map.SetFromTensorFlow(0, t1); + EXPECT_TRUE(buffer_map.IsTensorFlowTensor(0)); EXPECT_THAT(GetTensorData(buffer_map.GetTensor(0)), ElementsAre(0, 0, 0, 0.123f, 0, 0)); } diff --git a/tensorflow/lite/delegates/flex/kernel.cc b/tensorflow/lite/delegates/flex/kernel.cc index 2c19580235f..c4fe142dff1 100644 --- a/tensorflow/lite/delegates/flex/kernel.cc +++ b/tensorflow/lite/delegates/flex/kernel.cc @@ -251,7 +251,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { for (auto tensor_index : op_data->subgraph_inputs) { TfLiteTensor* tensor = &context->tensors[tensor_index]; if (!IsConstantTensor(tensor)) { - buffer_map->SetFromTfLite(tensor_index, tensor); + // If this tensor is part of an earlier TF subgraph we should not add it + // to the BufferMap again, because TF already knows about it and its + // contents are kept automatically up-to-date. 
+ if (!buffer_map->IsTensorFlowTensor(tensor_index)) { + buffer_map->SetFromTfLite(tensor_index, tensor); + } } } diff --git a/tensorflow/lite/delegates/flex/kernel_test.cc b/tensorflow/lite/delegates/flex/kernel_test.cc index fae2b8b19e8..4742c24bfc9 100644 --- a/tensorflow/lite/delegates/flex/kernel_test.cc +++ b/tensorflow/lite/delegates/flex/kernel_test.cc @@ -100,6 +100,17 @@ TEST_F(KernelTest, FullGraph) { ASSERT_THAT(GetShape(8), ElementsAre(2, 1)); ASSERT_THAT(GetValues(8), ElementsAre(14.52f, 38.72f)); + + // Try again with different inputs + SetShape(0, {2, 3, 1}); + SetValues(0, {2.0f, 2.0f, 3.0f, 3.0f, 4.0f, 4.0f}); + SetShape(3, {2, 3, 1}); + SetValues(3, {2.0f, 2.0f, 3.0f, 3.0f, 4.0f, 4.0f}); + + ASSERT_TRUE(Invoke()); + + ASSERT_THAT(GetShape(8), ElementsAre(3, 1)); + ASSERT_THAT(GetValues(8), ElementsAre(24.0f, 32.0f, 48.0f)); } TEST_F(KernelTest, BadTensorFlowOp) { @@ -240,11 +251,23 @@ TEST_F(KernelTest, SplitGraph) { ASSERT_TRUE(Invoke()); ASSERT_THAT(GetShape(17), ElementsAre(1)); + ASSERT_THAT(GetValues(17), ElementsAre(16.0f)); - // It should really be 16, but we are messing up tensor #16 with - // data from the TF Lite buffer, even though that particular tensor - // should use the data produced by TF. - ASSERT_THAT(GetValues(17), ElementsAre(::testing::Not(16.0f))); + // Same as above but with slightly different output. + // We still expect the result to be l + r where + // l = (a0 + b0) * (a2 + b2) + (a1 + b1) * (a3 + b3) + // r = (a4 + a6) + (a5 + a7) + SetShape(0, {2, 2, 2, 1}); + SetValues(0, {4.0f, 1.0f, 1.5f, -2.0f, 2.0f, 0.0f, -2.0f, 3.0f}); + SetShape(1, {2, 2, 1}); + SetValues(1, {0.0f, 2.0f, 1.5f, 3.0f}); + // So l = (4 + 0) * (1.5 + 1.5) + (1 + 2) * (-2 + 3) = 12 + 3 = 15 + // r = (2 - 2) + (0 + 3) = 3 + + ASSERT_TRUE(Invoke()); + + ASSERT_THAT(GetShape(17), ElementsAre(1)); + ASSERT_THAT(GetValues(17), ElementsAre(18.0f)); } } // namespace From 799686e50402a04bca2296eb19f067b168ae71e7 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Mon, 5 Nov 2018 09:12:42 -0800 Subject: [PATCH 091/540] Introduced FunctionCallOptions in eager context and a context manager to set it. For now the available options are: executor_type: determines the executor used by the PartitionedCallOp created for a call site. rewriter_config: the Grappler config used by PartitionedCallOp when optimizing the function graph for the call site. PiperOrigin-RevId: 220114298 --- tensorflow/python/eager/context.py | 152 ++++++++++++++---- tensorflow/python/eager/function.py | 4 +- tensorflow/python/eager/function_test.py | 22 ++- .../tools/api/generator/api_init_files.bzl | 1 + .../tools/api/generator/api_init_files_v1.bzl | 1 + .../golden/v1/tensorflow.experimental.pbtxt | 7 + .../tools/api/golden/v1/tensorflow.pbtxt | 4 + .../golden/v2/tensorflow.experimental.pbtxt | 7 + .../tools/api/golden/v2/tensorflow.pbtxt | 4 + 9 files changed, 166 insertions(+), 36 deletions(-) create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index 0986c4b9a6c..e3fef524bf9 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -81,6 +81,57 @@ class _EagerTensorCache(object): self._data = {} +class FunctionCallOptions(object): + """Options applied at call sites of eager functions. + Eager functions are functions decorated with tf.contrib.eager.defun. + """ + + def __init__(self, executor_type=None, rewriter_config=None): + """Constructor. + + Args: + executor_type: (optional) name of the executor to be used to execute the + eager function. If None or an empty string, the default Tensorflow + executor will be used. + rewriter_config: (optional) a rewriter_config_pb2.RewriterConfig proto or + a serialized string of that proto. + The config used by Grappler when optimizing the function graph. 
+ Each concrete function is optimized the first time is called. Changing + rewriter_config after the first call has no effect. + If rewriter_config is None, an empty RewriterConfig will be used. + """ + self.rewriter_config_serialized = rewriter_config + self.executor_type = executor_type + + @property + def executor_type(self): + return self._executor_type + + @executor_type.setter + def executor_type(self, executor_type): + self._executor_type = executor_type + + @property + def rewriter_config_serialized(self): + return self._rewriter_config_serialized + + @rewriter_config_serialized.setter + def rewriter_config_serialized(self, config): + if isinstance(config, rewriter_config_pb2.RewriterConfig): + self._rewriter_config_serialized = config.SerializeToString() + elif isinstance(config, str): + self._rewriter_config_serialized = config + elif config is None: + self._rewriter_config_serialized = rewriter_config_pb2.RewriterConfig( + ).SerializeToString() + else: + raise ValueError( + "the rewriter config must be either a " + "rewriter_config_pb2.RewriterConfig, or a serialized string of that " + "proto or None. got: {}" + .format(type(config))) + + # TODO(agarwal): better name ? class _EagerContext(threading.local): """Thread local eager context.""" @@ -107,7 +158,8 @@ class _EagerContext(threading.local): "graph_options") and config.graph_options.HasField("rewrite_options"): base_config.Merge(config.graph_options.rewrite_options) - self.rewriter_config = base_config.SerializeToString() + self.function_call_options = FunctionCallOptions( + rewriter_config=base_config) ContextSwitch = collections.namedtuple( @@ -372,36 +424,6 @@ class Context(object): if mode == EAGER_MODE: self.context_switches.pop() - @tf_contextlib.contextmanager - def rewriter_config(self, rewriter_config_=None): - """A context manager to allow setting the grappler rewrite options. - - Args: - rewriter_config_: A tensorflow.RewriterConfig proto object. - - Yields: - Nothing. 
- - Raises: - ValueError: if rewriter_config is not a tensorflow.RewriterConfig proto. - """ - if rewriter_config_ is None or not isinstance( - rewriter_config_, rewriter_config_pb2.RewriterConfig): - raise ValueError("Must pass a rewriter_config proto") - - ctx = self._eager_context - old_rewriter_config = ctx.rewriter_config - ctx.rewriter_config = rewriter_config_.SerializeToString() - try: - yield - finally: - ctx.rewriter_config = old_rewriter_config - - @property - def rewriter_config_string(self): - """Returns the serialized rewriter_config for the current thread.""" - return self._eager_context.rewriter_config - def executing_eagerly(self): """Returns True if current thread has eager executing enabled.""" return self._eager_context.is_eager @@ -530,6 +552,35 @@ class Context(object): finally: self.set_execution_mode(old_mode) + def get_function_call_options(self): + """Returns function call options for current thread. + + Note that the returned object is still referenced by the eager context. + + Returns: the FunctionCallOptions for current thread. + """ + return self._eager_context.function_call_options + + @tf_contextlib.contextmanager + def function_call_options(self, set_options_func): + """Context manager for setting function call options of current thread. + + Args: + set_options_func: A callable that takes one argument of type + FunctionCallOptions. It should set the properties of that + FunctionCallOptions. + + Yields: + Nothing. 
+ """ + current_options = self.get_function_call_options() + old_options = copy.copy(current_options) + try: + set_options_func(current_options) + yield + finally: + self._eager_context.function_call_options = old_options + def async_wait(self): """Waits for ops dispatched in ASYNC mode to finish.""" pywrap_tensorflow.TFE_ContextAsyncWait(self._handle) @@ -782,6 +833,25 @@ def execution_mode(mode): return context().execution_mode(mode) +@tf_export("experimental.function_executor_type") +def function_executor_type(executor_type): + """Context manager for setting the executor of eagar defined functions. + + Eager defined functions are functions decorated by tf.contrib.eager.defun. + + Args: + executor_type: a string for the name of the executor to be used + to execute functions defined by tf.contrib.eager.defun. + + Returns: + Context manager for setting the executor of eager defined functions. + """ + def _set_options_func(options): + options.executor_type = executor_type + + return context().function_call_options(_set_options_func) + + def async_wait(): """Waits for ops dispatched in ASYNC mode to finish.""" return context().async_wait() @@ -827,9 +897,23 @@ def export_run_metadata(): return context().export_run_metadata() -def rewriter_config(rewriter_config_): - """Context manager for setting the grappler rewrite config.""" - return context().rewriter_config(rewriter_config_) +def function_rewriter_config(rewriter_config): + """Context manager for setting the grappler rewrite config. + + This config is used by Grappler when optimizing the function graph. + + Args: + rewriter_config: a rewriter_config_pb2.RewriterConfig proto or + a serialized string of that proto or None. If None, the default instance + of rewriter_config_pb2.RewriterConfig will be used. + + Returns: + A context manager. 
+ """ + def _set_options_func(options): + options.rewriter_config_serialized = rewriter_config + + return context().function_call_options(_set_options_func) def set_server_def(server_def): diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index f6c54e05d26..08266a115b2 100644 --- a/tensorflow/python/eager/function.py +++ b/tensorflow/python/eager/function.py @@ -254,12 +254,14 @@ class _EagerDefinedFunction(object): raise ValueError( "Arguments and signature arguments do not match: %s %s " % (len(args), len(list(self.signature.input_arg)))) + function_call_options = ctx.get_function_call_options() outputs = functional_ops.partitioned_call( args=args, f=self, tout=self._output_types, executing_eagerly=executing_eagerly, - config=ctx.rewriter_config_string) # pylint: disable=protected-access + config=function_call_options.rewriter_config_serialized, + executor_type=function_call_options.executor_type) if executing_eagerly: return outputs diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py index 651d6cec724..781c3f0a18a 100644 --- a/tensorflow/python/eager/function_test.py +++ b/tensorflow/python/eager/function_test.py @@ -105,7 +105,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase): # The default config allows everything. 
rewrites = rewriter_config_pb2.RewriterConfig() - with context.rewriter_config(rewrites): + with context.function_rewriter_config(rewrites): t = constant_op.constant(1.0) self.assertAllEqual(add(t, t).numpy(), 2.0) @@ -2703,6 +2703,26 @@ class FunctionTest(test.TestCase, parameterized.TestCase): del m self.assertEqual([], list(weak_variables)) + def testExecutorType(self): + @function.defun + def add_five(x): + return x + 5 + + self.assertEqual( + 5, + add_five(constant_op.constant(0, dtype=dtypes.int32)).numpy()) + + with self.assertRaisesRegexp(errors.NotFoundError, 'NON_EXISTENT_EXECUTOR'): + with context.function_executor_type('NON_EXISTENT_EXECUTOR'): + add_five(constant_op.constant(0, dtype=dtypes.int32)) + + for executor_type in ('', 'DEFAULT', None): + with context.function_executor_type(executor_type): + self.assertAllEqual( + 5, + add_five(constant_op.constant(0, dtype=dtypes.int32)).numpy()) + + @parameterized.named_parameters( dict(testcase_name='Defun', function_decorator=function.defun), diff --git a/tensorflow/python/tools/api/generator/api_init_files.bzl b/tensorflow/python/tools/api/generator/api_init_files.bzl index f7de02ed6fa..ac7bc28b2be 100644 --- a/tensorflow/python/tools/api/generator/api_init_files.bzl +++ b/tensorflow/python/tools/api/generator/api_init_files.bzl @@ -13,6 +13,7 @@ TENSORFLOW_API_INIT_FILES = [ "distributions/__init__.py", "dtypes/__init__.py", "errors/__init__.py", + "experimental/__init__.py", "feature_column/__init__.py", "gfile/__init__.py", "graph_util/__init__.py", diff --git a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl index 520ce54a881..a4fc58b8510 100644 --- a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl +++ b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl @@ -13,6 +13,7 @@ TENSORFLOW_API_INIT_FILES_V1 = [ "distributions/__init__.py", "dtypes/__init__.py", "errors/__init__.py", + "experimental/__init__.py", 
"feature_column/__init__.py", "gfile/__init__.py", "graph_util/__init__.py", diff --git a/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt new file mode 100644 index 00000000000..0c3f04e468c --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt @@ -0,0 +1,7 @@ +path: "tensorflow.experimental" +tf_module { + member_method { + name: "function_executor_type" + argspec: "args=[\'executor_type\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt index c9eb136f54e..9597dd7684e 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt @@ -340,6 +340,10 @@ tf_module { name: "estimator" mtype: "" } + member { + name: "experimental" + mtype: "" + } member { name: "feature_column" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt new file mode 100644 index 00000000000..0c3f04e468c --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt @@ -0,0 +1,7 @@ +path: "tensorflow.experimental" +tf_module { + member_method { + name: "function_executor_type" + argspec: "args=[\'executor_type\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt index 311e12e6d25..7c865bb0022 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt @@ -236,6 +236,10 @@ tf_module { name: "estimator" mtype: "" } + member { + name: "experimental" + mtype: "" + } member { name: "feature_column" mtype: "" From 98f7a6760aab4c40c7eb2aebc14b2a9e96f9269a Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 5 Nov 2018 10:03:38 -0800 Subject: [PATCH 092/540] 
Fix the flaky interleaved eval test PiperOrigin-RevId: 220122757 --- tensorflow/contrib/distribute/python/BUILD | 2 -- .../contrib/distribute/python/keras_test.py | 36 +++++++++---------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD index 22736c799d2..6ba9187ae21 100644 --- a/tensorflow/contrib/distribute/python/BUILD +++ b/tensorflow/contrib/distribute/python/BUILD @@ -757,8 +757,6 @@ cuda_py_test( "no_oss", # TODO(b/117919883): Fix python error. "no_pip", "no_windows_gpu", - # TODO(b/118815591): Re-enable this test in guitar.) - "noguitar", "notsan", ], ) diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py index 4cd8ac14100..37802c14143 100644 --- a/tensorflow/contrib/distribute/python/keras_test.py +++ b/tensorflow/contrib/distribute/python/keras_test.py @@ -598,36 +598,33 @@ class TestDistributionStrategyWithDatasets(test.TestCase, @combinations.generate(strategy_combinations()) def test_model_interleaved_eval_same_as_direct_eval(self, distribution): with self.cached_session(): - loss = 'mse' - user_controlled_model = get_model() - user_controlled_optimizer = gradient_descent.GradientDescentOptimizer( - 0.001) - user_controlled_metrics = ['mae', keras.metrics.CategoricalAccuracy()] - user_controlled_model.compile(user_controlled_optimizer, loss, - metrics=user_controlled_metrics, - distribute=distribution) + user_controlled_model.compile( + gradient_descent.GradientDescentOptimizer(0.001), + loss='mse', + metrics=['mae'], + distribute=distribution) interleaved_model = get_model() - interleaved_optimizer = gradient_descent.GradientDescentOptimizer(0.001) - interleaved_metrics = ['mae', keras.metrics.CategoricalAccuracy()] - interleaved_model.compile(interleaved_optimizer, loss, - metrics=interleaved_metrics, - distribute=distribution) + 
interleaved_model.set_weights(user_controlled_model.get_weights()) + interleaved_model.compile( + gradient_descent.GradientDescentOptimizer(0.001), + loss='mse', + metrics=['mae'], + distribute=distribution) dataset = get_dataset(distribution) # Call fit with validation interleaved - interleaved_output = interleaved_model.fit(dataset, epochs=2, - steps_per_epoch=2, verbose=0, - validation_data=dataset, - validation_steps=2) + interleaved_output = interleaved_model.fit( + dataset, epochs=2, steps_per_epoch=2, verbose=1, + validation_data=dataset, validation_steps=2, shuffle=False) # Manually control the validation running after each epoch. user_controlled_output = [] for _ in range(2): user_controlled_model.fit( - dataset, epochs=1, steps_per_epoch=2, verbose=0) + dataset, epochs=1, steps_per_epoch=2, verbose=1, shuffle=False) user_controlled_output.append( user_controlled_model.evaluate(dataset, steps=2)) @@ -635,8 +632,7 @@ class TestDistributionStrategyWithDatasets(test.TestCase, [x[0] for x in user_controlled_output]) self.assertEqual(interleaved_output.history['val_mean_absolute_error'], [x[1] for x in user_controlled_output]) - self.assertEqual(interleaved_output.history['val_categorical_accuracy'], - [x[2] for x in user_controlled_output]) + # TODO(sourabhbajaj): Add an stateful metric here and verify support. # TODO(priyag): Enable this test for TPU. 
Currently tuples/dict don't work # as clone_model's input_tensors argument only seems to accept list and not From 4531d75656882952c91b4d77b08f7bcace2895b4 Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Mon, 5 Nov 2018 10:12:26 -0800 Subject: [PATCH 093/540] Internal change PiperOrigin-RevId: 220124546 --- tensorflow/python/debug/BUILD | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD index 79951232097..a73a4323c3f 100644 --- a/tensorflow/python/debug/BUILD +++ b/tensorflow/python/debug/BUILD @@ -1130,4 +1130,6 @@ sh_test( ":debug_tflearn_iris", ":offline_analyzer", ], + # TODO(b/119032933): Re-enable this test in ASAN. + tags = ["noasan"], ) From 059dd690980c2d125f96cb149d1eceaba02f9512 Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Mon, 5 Nov 2018 10:15:26 -0800 Subject: [PATCH 094/540] [tf.data] Clean up of how `IteratorContext` is constructed and modified. PiperOrigin-RevId: 220125067 --- tensorflow/core/framework/dataset.h | 93 +++++++++++-------- .../experimental/threadpool_dataset_op.cc | 12 +-- .../core/kernels/data/flat_map_dataset_op.cc | 10 -- tensorflow/core/kernels/data/iterator_ops.cc | 83 +++++++---------- .../core/kernels/data/model_dataset_op.cc | 20 ++-- .../kernels/data/multi_device_iterator_ops.cc | 34 +++---- .../core/kernels/data/optimize_dataset_op.cc | 10 +- .../data/stats_aggregator_dataset_op.cc | 12 +-- 8 files changed, 119 insertions(+), 155 deletions(-) diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h index 55ba7d96b16..b4cd2751319 100644 --- a/tensorflow/core/framework/dataset.h +++ b/tensorflow/core/framework/dataset.h @@ -272,56 +272,55 @@ class StatsAggregator; class IteratorContext { public: struct Params { + explicit Params(IteratorContext* ctx) + : allocator_getter(ctx->allocator_getter()), + env(ctx->env()), + function_library(ctx->function_library()), + lib(ctx->lib()), + model(ctx->model()), + runner(*(ctx->runner())), + 
stats_aggregator(ctx->stats_aggregator()) {} + + explicit Params(OpKernelContext* ctx) + : env(ctx->env()), + lib(ctx->function_library()), + runner(*(ctx->runner())) { + // NOTE: need reinterpret_cast because function.h forward-declares Device. + DeviceBase* device = + reinterpret_cast(ctx->function_library()->device()); + allocator_getter = [device](AllocatorAttributes attrs) { + return device->GetAllocator(attrs); + }; + } + + // The Allocator to be used to allocate the output of an iterator. + std::function allocator_getter = nullptr; + // Interface to operating system functionality. - Env* env; + Env* env = nullptr; + + // The FunctionLibraryDefinition used to look up user-defined functions. + std::shared_ptr function_library = nullptr; + + // The FunctionLibraryRuntime object to be used to make function calls. + FunctionLibraryRuntime* lib = nullptr; + + // If non-null, identifies the object used for performance modeling. + std::shared_ptr model = nullptr; // Function call support. std::function)> runner = nullptr; // The `StatsAggregator` object to record statistics about the iterator. std::shared_ptr stats_aggregator = nullptr; - - // The FunctionLibraryRuntime object to be used to make function calls. - FunctionLibraryRuntime* lib = nullptr; - std::shared_ptr function_library = nullptr; - - // The Allocator to be used to allocate the output of an iterator. - std::function allocator_getter = nullptr; - - // If non-null, identifies the object used for performance modeling. 
- std::shared_ptr model = nullptr; }; + explicit IteratorContext(IteratorContext* ctx) : params_(Params{ctx}) {} + + explicit IteratorContext(OpKernelContext* ctx) : params_(Params{ctx}) {} + explicit IteratorContext(Params params) : params_(std::move(params)) {} - explicit IteratorContext(OpKernelContext* ctx) { - params_.env = ctx->env(); - params_.runner = *(ctx->runner()); - params_.lib = ctx->function_library(); - // NOTE: must use reinterpret_cast because function.h forward-declares - // Device. - DeviceBase* device = - reinterpret_cast(ctx->function_library()->device()); - params_.allocator_getter = [device](AllocatorAttributes attrs) { - return device->GetAllocator(attrs); - }; - } - - Env* env() const { return params_.env; } - - std::function)>* runner() { - return ¶ms_.runner; - } - - - std::shared_ptr function_library() { - return params_.function_library; - } - - FunctionLibraryRuntime* lib() { return params_.lib; } - - void set_lib(FunctionLibraryRuntime* lib) { params_.lib = lib; } - Allocator* allocator(AllocatorAttributes attrs) { return params_.allocator_getter(attrs); } @@ -330,12 +329,24 @@ class IteratorContext { return params_.allocator_getter; } + Env* env() const { return params_.env; } + + std::shared_ptr function_library() { + return params_.function_library; + } + + FunctionLibraryRuntime* lib() { return params_.lib; } + + std::shared_ptr model() { return params_.model; } + + std::function)>* runner() { + return ¶ms_.runner; + } + std::shared_ptr stats_aggregator() { return params_.stats_aggregator; } - std::shared_ptr model() { return params_.model; } - Params params() { return params_; } private: diff --git a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc index 64715aee2e2..56fbbde1a3a 100644 --- a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc @@ -192,18 
+192,12 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel { std::vector* out_tensors, bool* end_of_sequence) override { ThreadPoolResource* pool = dataset()->threadpool_; - IteratorContext::Params params; - params.env = ctx->env(); + IteratorContext::Params params(ctx); params.runner = [pool](std::function c) { pool->Schedule(std::move(c)); }; - params.stats_aggregator = ctx->stats_aggregator(); - params.lib = ctx->lib(); - params.function_library = ctx->function_library(); - params.allocator_getter = ctx->allocator_getter(); - IteratorContext threadpool_ctx(params); - return input_impl_->GetNext(&threadpool_ctx, out_tensors, - end_of_sequence); + IteratorContext iter_ctx(params); + return input_impl_->GetNext(&iter_ctx, out_tensors, end_of_sequence); } protected: diff --git a/tensorflow/core/kernels/data/flat_map_dataset_op.cc b/tensorflow/core/kernels/data/flat_map_dataset_op.cc index d858b4d698d..9b42981ed75 100644 --- a/tensorflow/core/kernels/data/flat_map_dataset_op.cc +++ b/tensorflow/core/kernels/data/flat_map_dataset_op.cc @@ -247,16 +247,6 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel { ¤t_element_iterator_); } - Status BuildCurrentElementIteratorLocked(OpKernelContext* ctx) - EXCLUSIVE_LOCKS_REQUIRED(mu_) { - IteratorContext::Params params; - params.env = ctx->env(); - params.runner = *(ctx->runner()); - params.lib = ctx->function_library(); - IteratorContext iter_ctx(std::move(params)); - return BuildCurrentElementIteratorLocked(&iter_ctx); - } - mutex mu_; size_t element_index_ GUARDED_BY(mu_) = 0; std::unique_ptr input_impl_ GUARDED_BY(mu_); diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc index 459dc28ee46..445718ba1e5 100644 --- a/tensorflow/core/kernels/data/iterator_ops.cc +++ b/tensorflow/core/kernels/data/iterator_ops.cc @@ -68,8 +68,10 @@ class IteratorResource : public ResourceBase { std::shared_ptr captured_iterator(iterator_); if (captured_iterator) { 
CHECK_NOTNULL(lib_); - ctx->set_lib(lib_); - return captured_iterator->GetNext(ctx, out_tensors, end_of_sequence); + IteratorContext::Params params(ctx); + params.lib = lib_; + return captured_iterator->GetNext(IteratorContext(std::move(params)), + out_tensors, end_of_sequence); } else { return errors::FailedPrecondition( "GetNext() failed because the iterator has not been initialized. " @@ -78,6 +80,11 @@ class IteratorResource : public ResourceBase { } } + Status GetNext(IteratorContext&& ctx, std::vector* out_tensors, + bool* end_of_sequence) { + return GetNext(&ctx, out_tensors, end_of_sequence); + } + Status Save(SerializationContext* ctx, IteratorStateWriter* writer) { std::shared_ptr captured_iterator(iterator_); if (captured_iterator) { @@ -124,24 +131,21 @@ class IteratorResource : public ResourceBase { TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(outputs[0], &dataset)); std::unique_ptr iterator; - IteratorContext iter_ctx(ctx); - iter_ctx.set_lib(lib); - TF_RETURN_IF_ERROR( - dataset->MakeIterator(std::move(iter_ctx), "Iterator", &iterator)); + IteratorContext::Params params(ctx); + params.lib = lib; + TF_RETURN_IF_ERROR(dataset->MakeIterator(IteratorContext(std::move(params)), + "Iterator", &iterator)); TF_RETURN_IF_ERROR(set_iterator(std::move(iterator))); std::shared_ptr captured_iterator(iterator_); if (captured_iterator) { - IteratorContext::Params params; - params.env = ctx->env(); - params.runner = *(ctx->runner()); + IteratorContext::Params params(ctx); params.lib = lib; DeviceBase* device = lib->device(); params.allocator_getter = [device](AllocatorAttributes attrs) { return device->GetAllocator(attrs); }; IteratorContext iter_ctx(std::move(params)); - TF_RETURN_IF_ERROR(captured_iterator->Restore(&iter_ctx, reader)); mutex_lock l(mu_); device_mgr_ = std::move(device_mgr); @@ -582,10 +586,10 @@ void MakeIteratorOp::Compute(OpKernelContext* ctx) { core::ScopedUnref unref(iterator_resource); std::unique_ptr iterator; - IteratorContext 
iter_ctx(ctx); - iter_ctx.set_lib(iterator_resource->function_library_runtime()); - OP_REQUIRES_OK( - ctx, dataset->MakeIterator(std::move(iter_ctx), "Iterator", &iterator)); + IteratorContext::Params params(ctx); + params.lib = iterator_resource->function_library_runtime(); + OP_REQUIRES_OK(ctx, dataset->MakeIterator(IteratorContext(std::move(params)), + "Iterator", &iterator)); OP_REQUIRES_OK(ctx, iterator_resource->set_iterator(std::move(iterator))); } @@ -913,10 +917,10 @@ class OneShotIteratorOp : public AsyncOpKernel { DatasetBase* dataset; TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(return_values[0], &dataset)); std::unique_ptr iter; - IteratorContext iter_ctx(ctx); - iter_ctx.set_lib(lib); - TF_RETURN_IF_ERROR( - dataset->MakeIterator(std::move(iter_ctx), "Iterator", &iter)); + IteratorContext::Params params(ctx); + params.lib = lib; + TF_RETURN_IF_ERROR(dataset->MakeIterator(IteratorContext(std::move(params)), + "Iterator", &iter)); TF_RETURN_IF_ERROR((*iterator)->set_iterator(std::move(iter))); (*iterator)->Ref(); @@ -972,17 +976,10 @@ void IteratorGetNextOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { std::vector components; bool end_of_sequence = false; - IteratorContext::Params params; - params.env = ctx->env(); - params.runner = *(ctx->runner()); + IteratorContext::Params params(ctx); params.function_library = iterator->function_library(); - DeviceBase* device = ctx->function_library()->device(); - params.allocator_getter = [device](AllocatorAttributes attrs) { - return device->GetAllocator(attrs); - }; - IteratorContext iter_ctx(std::move(params)); - - Status s = iterator->GetNext(&iter_ctx, &components, &end_of_sequence); + Status s = iterator->GetNext(IteratorContext(std::move(params)), + &components, &end_of_sequence); // NOTE(mrry): We must unref the iterator before calling `done()`, to // avoid destruction races. 
iterator->Unref(); @@ -1006,22 +1003,12 @@ void IteratorGetNextSyncOp::Compute(OpKernelContext* ctx) { IteratorResource* iterator; OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator)); core::ScopedUnref unref_iterator(iterator); - std::vector components; bool end_of_sequence = false; - - IteratorContext::Params params; - params.env = ctx->env(); - params.runner = *(ctx->runner()); + IteratorContext::Params params(ctx); params.function_library = iterator->function_library(); - DeviceBase* device = ctx->function_library()->device(); - params.allocator_getter = [device](AllocatorAttributes attrs) { - return device->GetAllocator(attrs); - }; - IteratorContext iter_ctx(std::move(params)); - - OP_REQUIRES_OK(ctx, - iterator->GetNext(&iter_ctx, &components, &end_of_sequence)); + OP_REQUIRES_OK(ctx, iterator->GetNext(IteratorContext(std::move(params)), + &components, &end_of_sequence)); OP_REQUIRES(ctx, !end_of_sequence, errors::OutOfRange("End of sequence")); for (int i = 0; i < components.size(); ++i) { @@ -1054,18 +1041,10 @@ class IteratorGetNextAsOptionalOp : public AsyncOpKernel { std::vector components; bool end_of_sequence = false; - IteratorContext::Params params; - params.env = ctx->env(); - params.runner = *(ctx->runner()); + IteratorContext::Params params(ctx); params.function_library = iterator->function_library(); - DeviceBase* device = ctx->function_library()->device(); - params.allocator_getter = [device](AllocatorAttributes attrs) { - return device->GetAllocator(attrs); - }; - IteratorContext iter_ctx(std::move(params)); - - Status s = - iterator->GetNext(&iter_ctx, &components, &end_of_sequence); + Status s = iterator->GetNext(IteratorContext(std::move(params)), + &components, &end_of_sequence); // NOTE(mrry): We must unref the iterator before calling `done()`, to // avoid destruction races. 
iterator->Unref(); diff --git a/tensorflow/core/kernels/data/model_dataset_op.cc b/tensorflow/core/kernels/data/model_dataset_op.cc index 56a45b12049..a0d8a1619c6 100644 --- a/tensorflow/core/kernels/data/model_dataset_op.cc +++ b/tensorflow/core/kernels/data/model_dataset_op.cc @@ -86,9 +86,10 @@ class ModelDatasetOp : public UnaryDatasetOpKernel { } Status Initialize(IteratorContext* ctx) override { - IteratorContext ctx_with_model(CreateParams(ctx)); - return dataset()->input_->MakeIterator(&ctx_with_model, prefix(), - &input_impl_); + IteratorContext::Params params(ctx); + params.model = model_; + return dataset()->input_->MakeIterator( + IteratorContext(std::move(params)), prefix(), &input_impl_); } Status GetNextInternal(IteratorContext* ctx, @@ -96,9 +97,10 @@ class ModelDatasetOp : public UnaryDatasetOpKernel { bool* end_of_sequence) override { mutex_lock l(mu_); TF_RETURN_IF_ERROR(EnsureOptimizeThreadStarted(ctx)); - IteratorContext ctx_with_model(CreateParams(ctx)); - return input_impl_->GetNext(&ctx_with_model, out_tensors, - end_of_sequence); + IteratorContext::Params params(ctx); + params.model = model_; + return input_impl_->GetNext(IteratorContext(std::move(params)), + out_tensors, end_of_sequence); } protected: @@ -121,12 +123,6 @@ class ModelDatasetOp : public UnaryDatasetOpKernel { return Status::OK(); } - IteratorContext::Params CreateParams(IteratorContext* ctx) { - IteratorContext::Params params = ctx->params(); - params.model = model_; - return params; - } - private: Status EnsureOptimizeThreadStarted(IteratorContext* ctx) EXCLUSIVE_LOCKS_REQUIRED(mu_) { diff --git a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc index 4d4f8c01640..5268007e3d9 100644 --- a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc +++ b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc @@ -86,12 +86,18 @@ class MultiDeviceIterator : public ResourceBase { void 
GetNextFromShard(IteratorContext* ctx, int shard_num, int64 incarnation_id, MultiDeviceIteratorCallback callback) { - if (lib_ != nullptr) { - ctx->set_lib(lib_); + if (ctx->lib() == lib_) { + tf_shared_lock l(mu_); + multi_device_buffer_->GetNextFromShard(ctx, shard_num, incarnation_id, + std::move(callback)); + } else { + IteratorContext::Params params(ctx); + params.lib = lib_; + IteratorContext iter_ctx(std::move(params)); + tf_shared_lock l(mu_); + multi_device_buffer_->GetNextFromShard( + &iter_ctx, shard_num, incarnation_id, std::move(callback)); } - tf_shared_lock l(mu_); - multi_device_buffer_->GetNextFromShard(ctx, shard_num, incarnation_id, - std::move(callback)); } const DataTypeVector& output_types() const { return output_types_; } @@ -455,8 +461,9 @@ class MultiDeviceIteratorInitOp : public OpKernel { core::ScopedUnref unref(resource); std::unique_ptr iterator; - IteratorContext iter_ctx(ctx); - iter_ctx.set_lib(resource->lib()); + IteratorContext::Params params(ctx); + params.lib = resource->lib(); + IteratorContext iter_ctx(std::move(params)); OP_REQUIRES_OK( ctx, dataset->MakeIterator(std::move(iter_ctx), "Iterator", &iterator)); int64 incarnation_id; @@ -496,16 +503,6 @@ class MultiDeviceIteratorGetNextFromShardOp : public AsyncOpKernel { ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator), done); background_worker_.Schedule(std::bind( [ctx, iterator, shard_num, incarnation_id](DoneCallback done) { - IteratorContext::Params params; - params.env = ctx->env(); - params.runner = *(ctx->runner()); - params.function_library = iterator->function_library(); - DeviceBase* device = ctx->function_library()->device(); - params.allocator_getter = [device](AllocatorAttributes attrs) { - return device->GetAllocator(attrs); - }; - IteratorContext iter_ctx(std::move(params)); - MultiDeviceIteratorCallback callback = std::bind( [ctx](const HostBufferElement& elem, DoneCallback done) { // iterator->Unref(); @@ -523,6 +520,9 @@ class 
MultiDeviceIteratorGetNextFromShardOp : public AsyncOpKernel { }, std::placeholders::_1, std::move(done)); + IteratorContext::Params params(ctx); + params.function_library = iterator->function_library(); + IteratorContext iter_ctx(std::move(params)); iterator->GetNextFromShard(&iter_ctx, shard_num, incarnation_id, callback); iterator->Unref(); diff --git a/tensorflow/core/kernels/data/optimize_dataset_op.cc b/tensorflow/core/kernels/data/optimize_dataset_op.cc index eddaefb4283..726220e06bf 100644 --- a/tensorflow/core/kernels/data/optimize_dataset_op.cc +++ b/tensorflow/core/kernels/data/optimize_dataset_op.cc @@ -164,19 +164,19 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel { : DatasetIterator(params) {} Status Initialize(IteratorContext* ctx) override { - IteratorContext::Params params = ctx->params(); + IteratorContext::Params params(ctx); params.lib = dataset()->lib_; return dataset()->optimized_input_->MakeIterator( - IteratorContext(params), prefix(), &input_impl_); + IteratorContext(std::move(params)), prefix(), &input_impl_); } Status GetNextInternal(IteratorContext* ctx, std::vector* out_tensors, bool* end_of_sequence) override { - IteratorContext::Params params = ctx->params(); + IteratorContext::Params params(ctx); params.lib = dataset()->lib_; - return input_impl_->GetNext(IteratorContext(params), out_tensors, - end_of_sequence); + return input_impl_->GetNext(IteratorContext(std::move(params)), + out_tensors, end_of_sequence); } protected: diff --git a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc b/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc index 08d34eea878..a21b3fc16b7 100644 --- a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc +++ b/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc @@ -163,19 +163,13 @@ class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel { mutex_lock l(mu_); StatsAggregatorResource* stats_aggregator_resource = dataset()->stats_aggregator_resource_; - 
IteratorContext::Params params; - params.env = ctx->env(); - params.runner = *(ctx->runner()); + IteratorContext::Params params(ctx); params.stats_aggregator = std::shared_ptr( new StatsAggregatorWithTagAndPrefix( stats_aggregator_resource->stats_aggregator(), dataset()->tag_, dataset()->prefix_)); - params.lib = ctx->lib(); - params.function_library = ctx->function_library(); - params.allocator_getter = ctx->allocator_getter(); - IteratorContext set_stats_aggregator_ctx(params); - return input_impl_->GetNext(&set_stats_aggregator_ctx, out_tensors, - end_of_sequence); + IteratorContext iter_ctx(std::move(params)); + return input_impl_->GetNext(&iter_ctx, out_tensors, end_of_sequence); } protected: From 7761c55477968435a3080f2e813d59583275dd29 Mon Sep 17 00:00:00 2001 From: Shivani Agrawal Date: Mon, 5 Nov 2018 10:36:11 -0800 Subject: [PATCH 095/540] [data-defun] Avoids unnecessary args checking and uses light-weighted get_concrete_function. PiperOrigin-RevId: 220129277 --- .../python/data/experimental/ops/map_defun.py | 2 +- .../data/experimental/ops/prefetching_ops.py | 20 +++++++++++-------- .../data/ops/multi_device_iterator_ops.py | 16 +++++++++------ 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/tensorflow/python/data/experimental/ops/map_defun.py b/tensorflow/python/data/experimental/ops/map_defun.py index ec1a3adf0c1..5d729d392ac 100644 --- a/tensorflow/python/data/experimental/ops/map_defun.py +++ b/tensorflow/python/data/experimental/ops/map_defun.py @@ -52,7 +52,7 @@ def map_defun(fn, elems, output_dtypes, output_shapes): raise ValueError("`output_shapes` must be a list of `tf.TensorShape` " "objects.") - concrete_fn = fn.get_concrete_function() + concrete_fn = fn._get_concrete_function_internal() # pylint: disable=protected-access # TODO(shivaniagrawal/rachelim): what about functions created without # input_signature. 
elems = [ops.convert_to_tensor(e) for e in elems] diff --git a/tensorflow/python/data/experimental/ops/prefetching_ops.py b/tensorflow/python/data/experimental/ops/prefetching_ops.py index 2add95558d5..a55b8bfb769 100644 --- a/tensorflow/python/data/experimental/ops/prefetching_ops.py +++ b/tensorflow/python/data/experimental/ops/prefetching_ops.py @@ -138,7 +138,7 @@ class _PrefetchToDeviceIterator(object): ret = remote_iterator.get_next() return nest.flatten(sparse.serialize_sparse_tensors(ret)) - self._prefetch_fn = _prefetch_fn.get_concrete_function() + self._prefetch_fn = _prefetch_fn._get_concrete_function_internal() # pylint: disable=protected-access iterator_device = ged_ops.experimental_iterator_get_device( self._input_iterator._iterator_resource) @@ -237,7 +237,7 @@ class _PrefetchToDeviceEagerIterator(iterator_ops.EagerIterator): ret = remote_iterator.get_next() return nest.flatten(sparse.serialize_sparse_tensors(ret)) - self._prefetch_fn = _prefetch_fn.get_concrete_function() + self._prefetch_fn = _prefetch_fn._get_concrete_function_internal() # pylint: disable=protected-access with ops.device(device): self._buffering_resource = function_buffering_resource( @@ -422,7 +422,8 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset): [gen_dataset_ops.make_iterator(ds_variant, resource)]): return gen_dataset_ops.iterator_to_string_handle(resource) - init_func_concrete = _init_func.get_concrete_function() + init_func_concrete = _init_func._get_concrete_function_internal() # pylint: disable=protected-access + @function.defun() def _remote_init_func(): return functional_ops.remote_call( @@ -431,7 +432,7 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset): Tout=[dtypes.string], f=init_func_concrete) - self._init_func = _remote_init_func.get_concrete_function() + self._init_func = _remote_init_func._get_concrete_function_internal() # pylint: disable=protected-access self._init_captured_args = self._init_func.captured_inputs 
@function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)]) @@ -450,7 +451,8 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset): ret = iterator.get_next() return nest.flatten(sparse.serialize_sparse_tensors(ret)) - next_func_concrete = _next_func.get_concrete_function() + next_func_concrete = _next_func._get_concrete_function_internal() # pylint: disable=protected-access + @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)]) def _remote_next_func(string_handle): return functional_ops.remote_call( @@ -460,7 +462,7 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset): Tout=self._flat_output_types, f=next_func_concrete) - self._next_func = _remote_next_func.get_concrete_function() + self._next_func = _remote_next_func._get_concrete_function_internal() # pylint: disable=protected-access self._next_captured_args = self._next_func.captured_inputs @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)]) @@ -481,7 +483,8 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset): iterator_resource, ignore_lookup_error=True)]): return array_ops.constant(0, dtypes.int64) - finalize_func_concrete = _finalize_func.get_concrete_function() + finalize_func_concrete = _finalize_func._get_concrete_function_internal() # pylint: disable=protected-access + @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)]) def _remote_finalize_func(string_handle): return functional_ops.remote_call( @@ -491,7 +494,8 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset): Tout=[dtypes.int64], f=finalize_func_concrete) - self._finalize_func = _remote_finalize_func.get_concrete_function() + self._finalize_func = _remote_finalize_func._get_concrete_function_internal( # pylint: disable=protected-access + ) self._finalize_captured_args = self._finalize_func.captured_inputs g = ops.get_default_graph() diff --git a/tensorflow/python/data/ops/multi_device_iterator_ops.py 
b/tensorflow/python/data/ops/multi_device_iterator_ops.py index 51dd8519e92..0f9add6461a 100644 --- a/tensorflow/python/data/ops/multi_device_iterator_ops.py +++ b/tensorflow/python/data/ops/multi_device_iterator_ops.py @@ -55,7 +55,8 @@ class _PerDeviceGenerator(dataset_ops.Dataset): def _init_func(): return multi_device_iterator_string_handle - init_func_concrete = _init_func.get_concrete_function() + init_func_concrete = _init_func._get_concrete_function_internal() # pylint: disable=protected-access + @function.defun() def _remote_init_func(): return functional_ops.remote_call( @@ -64,7 +65,7 @@ class _PerDeviceGenerator(dataset_ops.Dataset): Tout=[dtypes.string], f=init_func_concrete) - self._init_func = _remote_init_func.get_concrete_function() + self._init_func = _remote_init_func._get_concrete_function_internal() # pylint: disable=protected-access self._init_captured_args = self._init_func.captured_inputs @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)]) @@ -81,7 +82,8 @@ class _PerDeviceGenerator(dataset_ops.Dataset): output_types=self._flat_output_types, output_shapes=self._flat_output_shapes) - next_func_concrete = _next_func.get_concrete_function() + next_func_concrete = _next_func._get_concrete_function_internal() # pylint: disable=protected-access + @function.defun_with_attributes( input_signature=[tensor_spec.TensorSpec([], dtypes.string)], attributes={"experimental_ints_on_device": True}) @@ -93,14 +95,15 @@ class _PerDeviceGenerator(dataset_ops.Dataset): Tout=self._flat_output_types, f=next_func_concrete) - self._next_func = _remote_next_func.get_concrete_function() + self._next_func = _remote_next_func._get_concrete_function_internal() # pylint: disable=protected-access self._next_captured_args = self._next_func.captured_inputs @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)]) def _finalize_func(unused_string_handle): return array_ops.constant(0, dtypes.int64) - finalize_func_concrete = 
_finalize_func.get_concrete_function() + finalize_func_concrete = _finalize_func._get_concrete_function_internal() # pylint: disable=protected-access + @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)]) def _remote_finalize_func(string_handle): return functional_ops.remote_call( @@ -110,7 +113,8 @@ class _PerDeviceGenerator(dataset_ops.Dataset): Tout=[dtypes.int64], f=finalize_func_concrete) - self._finalize_func = _remote_finalize_func.get_concrete_function() + self._finalize_func = _remote_finalize_func._get_concrete_function_internal( # pylint: disable=protected-access + ) self._finalize_captured_args = self._finalize_func.captured_inputs def _as_variant_tensor(self): From f74a0436426ee50805a9a204d0c1376db30707bf Mon Sep 17 00:00:00 2001 From: Tamara Norman Date: Mon, 5 Nov 2018 10:36:54 -0800 Subject: [PATCH 096/540] Modify to return an error specifying an axis argument must be given instead of an error due to being unable to convert a null tensor. PiperOrigin-RevId: 220129411 --- tensorflow/python/ops/array_ops.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 232a77c8888..6fdc50733a1 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -120,7 +120,7 @@ def expand_dims(input, axis=None, name=None, dim=None): axis: 0-D (scalar). Specifies the dimension index at which to expand the shape of `input`. Must be in the range `[-rank(input) - 1, rank(input)]`. - name: The name of the output `Tensor`. + name: The name of the output `Tensor` (optional). dim: 0-D (scalar). Equivalent to `axis`, to be deprecated. Returns: @@ -128,9 +128,11 @@ def expand_dims(input, axis=None, name=None, dim=None): dimension of size 1 added. Raises: - ValueError: if both `dim` and `axis` are specified. + ValueError: if either both or neither of `dim` and `axis` are specified. 
""" axis = deprecation.deprecated_argument_lookup("axis", axis, "dim", dim) + if axis is None: + raise ValueError("Must specify an axis argument to tf.expand_dims()") return gen_array_ops.expand_dims(input, axis, name) From 6dc0f05f84d7a6e03856f9d83fa39d5126052820 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Mon, 5 Nov 2018 10:42:43 -0800 Subject: [PATCH 097/540] Keeping the init() method on InitializableLookupTableBase intact but adding a deprecation notice instead. PiperOrigin-RevId: 220130488 --- tensorflow/python/ops/lookup_ops.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py index 89109469b7b..e65b53e3ac9 100644 --- a/tensorflow/python/ops/lookup_ops.py +++ b/tensorflow/python/ops/lookup_ops.py @@ -171,6 +171,11 @@ class InitializableLookupTableBase(LookupInterface): def initializer(self): return self._init_op + @property + @deprecated("2018-12-15", "Use `initializer` instead.") + def init(self): + return self.initializer + @property def default_value(self): """The default value of the table.""" @@ -830,6 +835,11 @@ class IdTableWithHashBuckets(LookupInterface): with ops.name_scope(None, "init"): return control_flow_ops.no_op() + @property + @deprecated("2018-12-15", "Use `initializer` instead.") + def init(self): + return self.initializer + @property def resource_handle(self): if self._table is not None: From b9b0354ebc616f05e57656b861bcf16b2cf575d8 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Mon, 5 Nov 2018 10:51:41 -0800 Subject: [PATCH 098/540] [TF:XLA] Register the standard Stack kernels on XLA_... devices. Refactor the core stack ops to split the op classes from their registrations on CPU/GPU/SYSCL devices. Refactor the stack push op to be templated on an allow_swapping bool, rather than a specific device. The device was only ever used in a type equality test to determine whether to swap or not. On XLA_... 
devices, previously stack operators only worked when the entire computation was grouped into a single cluster (e.g., via xla.compile()). This change also allows stack-using operators to work in "ondemand" or eager modes, when running ops one-at-a-time. However, since the compiled and interpreted representations of stacks are still different, there is not yet any support for passing stacks into or out of compiled blocks. Stack usage must remain entirely inside or entirely outside a compiled block until we rectify this in a future change. PiperOrigin-RevId: 220132340 --- tensorflow/compiler/jit/BUILD | 1 + tensorflow/compiler/jit/xla_device_ops.h | 23 +- tensorflow/compiler/tests/BUILD | 2 - .../compiler/tf2xla/kernels/stack_ops.cc | 10 +- tensorflow/contrib/makefile/tf_op_files.txt | 1 + tensorflow/core/kernels/BUILD | 16 +- tensorflow/core/kernels/stack.cc | 339 ++++++++++++++++ tensorflow/core/kernels/stack.h | 76 ++++ tensorflow/core/kernels/stack_ops.cc | 383 ++---------------- 9 files changed, 491 insertions(+), 360 deletions(-) create mode 100644 tensorflow/core/kernels/stack.cc create mode 100644 tensorflow/core/kernels/stack.h diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 028687d4010..f98ba487354 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -190,6 +190,7 @@ cc_library( "//tensorflow/core/kernels:resource_variable_ops", "//tensorflow/core/kernels:sendrecv_ops", "//tensorflow/core/kernels:shape_ops", + "//tensorflow/core/kernels:stack", "//tensorflow/core/kernels:variable_ops", "//tensorflow/core/kernels/data:generator_dataset_op", "//tensorflow/core/kernels/data:iterator_ops", diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h index 241ea8f60df..adf0f994b84 100644 --- a/tensorflow/compiler/jit/xla_device_ops.h +++ b/tensorflow/compiler/jit/xla_device_ops.h @@ -35,6 +35,7 @@ limitations under the License. 
#include "tensorflow/core/kernels/resource_variable_ops.h" #include "tensorflow/core/kernels/sendrecv_ops.h" #include "tensorflow/core/kernels/shape_ops.h" +#include "tensorflow/core/kernels/stack.h" #include "tensorflow/core/kernels/variable_ops.h" namespace tensorflow { @@ -257,9 +258,27 @@ class XlaAssignVariableOp : public OpKernel { .Device(DEVICE) \ .TypeConstraint("T") \ .HostMemory("input"), \ - RetvalOp); + RetvalOp); \ + \ + REGISTER_KERNEL_BUILDER(Name("StackV2") \ + .Device(DEVICE) \ + .HostMemory("max_size") \ + .HostMemory("handle"), \ + StackOp); \ + REGISTER_KERNEL_BUILDER(Name("StackPushV2") \ + .Device(DEVICE) \ + .HostMemory("handle") \ + .TypeConstraint("T", TYPES), \ + TemplatedStackPushOp); \ + REGISTER_KERNEL_BUILDER(Name("StackPopV2") \ + .Device(DEVICE) \ + .HostMemory("handle") \ + .TypeConstraint("elem_type", TYPES), \ + StackPopOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("StackCloseV2").Device(DEVICE).HostMemory("handle"), StackCloseOp); -// TODO(phawkins): currently we do not register the QueueEnqueueMany, +// TODO(b/118881356): currently we do not register the QueueEnqueueMany, // QueueDequeueMany, or QueueDequeueUpTo kernels because they attempt to read // and write the tensors they access in order to concatenate them into a batch. // We would need either to call out to an XLA computation to perform the diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 6945de1eda1..06501e2177b 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -837,8 +837,6 @@ tf_xla_py_test( name = "stack_ops_test", size = "small", srcs = ["stack_ops_test.py"], - # Stack ops are not implemented in the on-demand compilation model yet. 
- disabled_backends = ["cpu_ondemand"], deps = [ ":xla_test", "//tensorflow/python:array_ops", diff --git a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc index d79cdad9fa2..7b96b43ad83 100644 --- a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc @@ -126,7 +126,9 @@ class StackOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(StackOp); }; -REGISTER_XLA_OP(Name("StackV2").CompileTimeConstantInput("max_size"), StackOp); +REGISTER_XLA_OP( + Name("StackV2").CompileTimeConstantInput("max_size").CompilationOnly(), + StackOp); class StackPushOp : public XlaOpKernel { public: @@ -173,7 +175,7 @@ class StackPushOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(StackPushOp); }; -REGISTER_XLA_OP(Name("StackPushV2"), StackPushOp); +REGISTER_XLA_OP(Name("StackPushV2").CompilationOnly(), StackPushOp); class StackPopOp : public XlaOpKernel { public: @@ -227,7 +229,7 @@ class StackPopOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(StackPopOp); }; -REGISTER_XLA_OP(Name("StackPopV2"), StackPopOp); +REGISTER_XLA_OP(Name("StackPopV2").CompilationOnly(), StackPopOp); class StackCloseOp : public XlaOpKernel { public: @@ -241,7 +243,7 @@ class StackCloseOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(StackCloseOp); }; -REGISTER_XLA_OP(Name("StackCloseV2"), StackCloseOp); +REGISTER_XLA_OP(Name("StackCloseV2").CompilationOnly(), StackCloseOp); } // anonymous namespace } // namespace tensorflow diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt index eab93f2cc5e..24a4a03f232 100644 --- a/tensorflow/contrib/makefile/tf_op_files.txt +++ b/tensorflow/contrib/makefile/tf_op_files.txt @@ -248,6 +248,7 @@ tensorflow/core/kernels/spectrogram_op.cc tensorflow/core/kernels/split_lib_cpu.cc tensorflow/core/kernels/split_op.cc tensorflow/core/kernels/split_v_op.cc +tensorflow/core/kernels/stack.cc 
tensorflow/core/kernels/stack_ops.cc tensorflow/core/kernels/strided_slice_op.cc tensorflow/core/kernels/strided_slice_op_inst_0.cc diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 01f3627d674..f61ee53a428 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -1887,10 +1887,22 @@ tf_kernel_library( deps = DATA_FLOW_DEPS, ) +cc_library( + name = "stack", + srcs = ["stack.cc"], + hdrs = ["stack.h"], + deps = [ + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + ], +) + tf_kernel_library( name = "stack_ops", prefix = "stack_ops", - deps = DATA_FLOW_DEPS, + deps = DATA_FLOW_DEPS + [":stack"], ) tf_kernel_library( @@ -5485,6 +5497,8 @@ filegroup( "sparse_to_dense_op.cc", "spectrogram.cc", "spectrogram_op.cc", + "stack.cc", + "stack.h", "stack_ops.cc", "string_join_op.cc", "string_util.cc", diff --git a/tensorflow/core/kernels/stack.cc b/tensorflow/core/kernels/stack.cc new file mode 100644 index 00000000000..5c70a2d62d3 --- /dev/null +++ b/tensorflow/core/kernels/stack.cc @@ -0,0 +1,339 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/kernels/stack.h" + +#include +#include +#include + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class Stack : public ResourceBase { + public: + static std::atomic stack_counter; + + struct TensorAndAllocation { + Tensor tensor; + AllocatorAttributes alloc_attrs; + bool swapped_to_cpu; + }; + + Stack(const DataType& elem_type, const string& stack_name, int max_size) + : elem_type_(elem_type), + stack_name_(stack_name), + max_size_(max_size), + closed_(false) {} + + Status Push(const TensorAndAllocation& value) { + mutex_lock l(mu_); + TF_RETURN_IF_ERROR(CheckNotClosed()); + if (max_size_ >= 0 && stack_.size() >= max_size_) { + return errors::InvalidArgument("Stack[", stack_name_, "] overflowed ", + "its max_size (", max_size_, ")"); + } + stack_.push_back(value); + return Status::OK(); + } + + Status Pop(TensorAndAllocation* value) { + mutex_lock l(mu_); + TF_RETURN_IF_ERROR(CheckNotClosed()); + if (stack_.empty()) { + return errors::InvalidArgument("Stack[", stack_name_, + "] is empty when calling Pop()."); + } + *value = stack_.back(); + stack_.pop_back(); + return Status::OK(); + } + + // We 
don't swap the first tensor on the stack and any subsequent tensors + // that share the buffer with the first tensor. + bool IsUsefulToSwap(const Tensor& tensor) const { + mutex_lock l(mu_); + if (stack_.empty()) { + return false; + } + const Tensor& first = stack_.front().tensor; + return !tensor.SharesBufferWith(first); + } + + void Close() { + mutex_lock l(mu_); + stack_.clear(); + closed_ = true; + } + + DataType ElemType() { return elem_type_; } + + string DebugString() override { + mutex_lock l(mu_); + return strings::StrCat("Stack[", stack_name_, "]"); + } + + const string& stack_name() { return stack_name_; } + + private: + friend class StackOp; + mutex* mu() { return &mu_; } + + mutable mutex mu_; + DataType elem_type_; + const string stack_name_; + Tensor handle_; + int max_size_; + bool closed_ GUARDED_BY(mu_); + std::vector stack_ GUARDED_BY(mu_); + + Status CheckNotClosed() const EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (closed_) { + return errors::InvalidArgument("Stack[", stack_name_, + "] has already been closed."); + } + return Status::OK(); + } +}; + +Status GetStack(OpKernelContext* ctx, Stack** stack) { + if (ctx->input_dtype(0) == DT_RESOURCE) { + return LookupResource(ctx, HandleFromInput(ctx, 0), stack); + } else { + Tensor Tstack_handle = ctx->mutable_input(0, false); + if (Tstack_handle.NumElements() != 2) { + return errors::InvalidArgument( + "Stack handle must have two elements, but had shape: ", + Tstack_handle.shape().DebugString()); + } + const string& container = Tstack_handle.flat()(0); + const string& stack_name = Tstack_handle.flat()(1); + string key = strings::StrCat(container, stack_name); + ResourceMgr* rm = ctx->resource_manager(); + if (rm == nullptr) { + return errors::Internal("No resource manager."); + } + auto* step_container = ctx->step_container(); + if (step_container == nullptr) { + return errors::Internal("No step container."); + } + TF_RETURN_IF_ERROR(rm->Lookup(step_container->name(), key, stack)); + return 
Status::OK(); + } +} + +std::atomic Stack::stack_counter{0}; + +// StackOp + +StackOp::StackOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("elem_type", &elem_type_)); + OP_REQUIRES_OK(context, context->GetAttr("stack_name", &stack_name_)); + if (stack_name_.empty()) stack_name_ = name(); +} + +void StackOp::Compute(OpKernelContext* ctx) { + int32 size = std::numeric_limits::max(); + if (ctx->num_inputs() > 0) { + const Tensor* tensor_size; + OP_REQUIRES_OK(ctx, ctx->input("max_size", &tensor_size)); + + OP_REQUIRES( + ctx, TensorShapeUtils::IsScalar(tensor_size->shape()), + errors::InvalidArgument("Stack size must be a scalar, but had shape: ", + tensor_size->shape().DebugString())); + + int32 size_value = tensor_size->scalar()(); + if (size_value >= 0) { + size = size_value; + } + } + + static const char kContainer[] = "_stacks"; + auto stack_id = Stack::stack_counter.fetch_add(1); + string stack_name = strings::StrCat(stack_name_, "_", stack_id); + // Store the handle in a per-step container. + ResourceMgr* rm = ctx->resource_manager(); + OP_REQUIRES(ctx, rm != nullptr, errors::Internal("No resource manager.")); + string key = strings::StrCat(kContainer, stack_name); + Stack* stack = new Stack(elem_type_, stack_name, size); + auto* step_container = ctx->step_container(); + OP_REQUIRES(ctx, step_container != nullptr, + errors::Internal("No step container.")); + OP_REQUIRES_OK(ctx, rm->Create(step_container->name(), key, stack)); + if (IsRefType(ctx->expected_output_dtype(0))) { + // Create the stack handle. 
+ AllocatorAttributes alloc_attr; + alloc_attr.set_on_host(true); + OP_REQUIRES_OK(ctx, ctx->allocate_temp(tensorflow::DT_STRING, + tensorflow::TensorShape({2}), + &stack->handle_, alloc_attr)); + auto handle = stack->handle_.flat(); + handle(0) = kContainer; + handle(1) = std::move(stack_name); + ctx->set_output_ref(0, stack->mu(), &stack->handle_); + } else { + Tensor* handle; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle)); + handle->flat()(0) = + MakePerStepResourceHandle(ctx, key); + } +} + +// StackPushOp + +StackPushOp::StackPushOp(OpKernelConstruction* context, bool allow_swapping) + : AsyncOpKernel(context) { + if (allow_swapping) { + OP_REQUIRES_OK(context, context->GetAttr("swap_memory", &swap_memory_)); + } +} + +void StackPushOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { + // Get the stack from the handle. + Stack* stack = nullptr; + OP_REQUIRES_OK_ASYNC(ctx, GetStack(ctx, &stack), done); + core::ScopedUnref unref(stack); + + if (ctx->input_dtype(1) != stack->ElemType()) { + ctx->CtxFailure(errors::InvalidArgument("Must have type ", + stack->ElemType(), " but got ", + ctx->input_dtype(1))); + done(); + return; + } + + // Push the tensor onto the stack. Swap the tensor to CPU if instructed. + const Tensor& tensor = ctx->input(1); + AllocatorAttributes alloc_attrs = ctx->input_alloc_attr(1); + // For now, we use a simple heuristic for swapping: A GPU tensor is moved + // to CPU if the tensor has more than kCopyThreshold bytes and the GPU + // allocator says more than kOccupancy of the memory is in use. 
+ static constexpr int kCopyThreshold = 2048; + static constexpr double kOccupancy = 0.7; + if (swap_memory_ && !alloc_attrs.on_host() && + tensor.TotalBytes() > kCopyThreshold && stack->IsUsefulToSwap(tensor)) { + DeviceContext* device_ctxt = ctx->op_device_context(); + auto device = static_cast(ctx->device()); + Allocator* allocator = device->GetAllocator(alloc_attrs); + AllocatorStats stats; + allocator->GetStats(&stats); + if (stats.bytes_in_use > (stats.bytes_limit * kOccupancy)) { + // Asynchronously copy the tensor from GPU to CPU memory. + // TODO(yuanbyu): Swap the oldest tensor first. + AllocatorAttributes host_alloc_attrs; + host_alloc_attrs.set_gpu_compatible(true); + host_alloc_attrs.set_on_host(true); + Allocator* cpu_allocator = device->GetAllocator(host_alloc_attrs); + Tensor* cpu_tensor = + new Tensor(cpu_allocator, tensor.dtype(), tensor.shape()); + device_ctxt->CopyDeviceTensorToCPU( + &tensor, "StackPush", device, cpu_tensor, + [cpu_tensor, stack, ctx, done](const Status& s) { + ctx->SetStatus(s); + if (s.ok()) { + AllocatorAttributes alloc_attrs = ctx->input_alloc_attr(1); + ctx->SetStatus(stack->Push({*cpu_tensor, alloc_attrs, true})); + } + if (ctx->status().ok()) { + ctx->set_output(0, *cpu_tensor); + } + done(); + delete cpu_tensor; + }); + return; + } + } + + // Execute synchronously if not swapped. + OP_REQUIRES_OK_ASYNC(ctx, stack->Push({tensor, alloc_attrs, false}), done); + ctx->set_output(0, tensor); + done(); +} + +bool StackPushOp::IsExpensive() { return false; } + +// StackPopOp + +StackPopOp::StackPopOp(OpKernelConstruction* context) + : AsyncOpKernel(context) {} + +void StackPopOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { + // Get the stack from the handle. + Stack* stack = nullptr; + OP_REQUIRES_OK_ASYNC(ctx, GetStack(ctx, &stack), done); + core::ScopedUnref unref(stack); + + // Pop the tensor. Transfer the tensor back to device if it was + // swapped out to CPU. 
+ Stack::TensorAndAllocation value; + OP_REQUIRES_OK_ASYNC(ctx, stack->Pop(&value), done); + if (value.swapped_to_cpu) { + // Asynchronously copy the tensor back from CPU to GPU memory. + DeviceContext* device_ctxt = ctx->op_device_context(); + Device* device = static_cast(ctx->device()); + Tensor* cpu_tensor = &value.tensor; + Allocator* gpu_allocator = device->GetAllocator(value.alloc_attrs); + Tensor* device_tensor = + new Tensor(gpu_allocator, cpu_tensor->dtype(), cpu_tensor->shape()); + device_ctxt->CopyCPUTensorToDevice( + cpu_tensor, device, device_tensor, + [device_tensor, ctx, done](const Status& s) { + ctx->SetStatus(s); + if (s.ok()) { + ctx->set_output(0, *device_tensor); + } + done(); + delete device_tensor; + }); + } else { + // Execute synchronously if not swapped. + ctx->set_output(0, value.tensor); + done(); + } +} + +bool StackPopOp::IsExpensive() { return false; } + +// StackCloseOp + +StackCloseOp::StackCloseOp(OpKernelConstruction* context) : OpKernel(context) {} + +void StackCloseOp::Compute(OpKernelContext* ctx) { + Stack* stack = nullptr; + OP_REQUIRES_OK(ctx, GetStack(ctx, &stack)); + core::ScopedUnref unref(stack); + stack->Close(); +} + +bool StackCloseOp::IsExpensive() { return false; } + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/stack.h b/tensorflow/core/kernels/stack.h new file mode 100644 index 00000000000..e1927e1f28f --- /dev/null +++ b/tensorflow/core/kernels/stack.h @@ -0,0 +1,76 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_STACK_H_ +#define TENSORFLOW_CORE_KERNELS_STACK_H_ + +// See docs in ../ops/data_flow_ops.cc. + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// A per-run local stack. The stack uses a "per-step" resource manager which +// ensures that correct garbage collection on error or successful completion. +class StackOp : public OpKernel { + public: + explicit StackOp(OpKernelConstruction* context); + void Compute(OpKernelContext* ctx) override; + + private: + DataType elem_type_; + string stack_name_; + + TF_DISALLOW_COPY_AND_ASSIGN(StackOp); +}; + +class StackPushOp : public AsyncOpKernel { + public: + StackPushOp(OpKernelConstruction* context, bool allow_swapping); + void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override; + bool IsExpensive() override; + + private: + bool swap_memory_ = false; +}; + +// Templated helper to make it easier to register kernels with or without +// swapping. 
+template +class TemplatedStackPushOp : public StackPushOp { + public: + TemplatedStackPushOp(OpKernelConstruction* context) + : StackPushOp(context, allow_swapping) {} +}; + +class StackPopOp : public AsyncOpKernel { + public: + explicit StackPopOp(OpKernelConstruction* context); + void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override; + bool IsExpensive() override; +}; + +class StackCloseOp : public OpKernel { + public: + explicit StackCloseOp(OpKernelConstruction* context); + void Compute(OpKernelContext* ctx) override; + bool IsExpensive() override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_STACK_H_ diff --git a/tensorflow/core/kernels/stack_ops.cc b/tensorflow/core/kernels/stack_ops.cc index add4afafc92..df94a8818e7 100644 --- a/tensorflow/core/kernels/stack_ops.cc +++ b/tensorflow/core/kernels/stack_ops.cc @@ -15,6 +15,8 @@ limitations under the License. // See docs in ../ops/data_flow_ops.cc. +#include "tensorflow/core/kernels/stack.h" + #include #include #include @@ -38,191 +40,6 @@ limitations under the License. 
namespace tensorflow { -typedef Eigen::ThreadPoolDevice CPUDevice; -typedef Eigen::GpuDevice GPUDevice; -#ifdef TENSORFLOW_USE_SYCL -typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL - -class Stack : public ResourceBase { - public: - static std::atomic stack_counter; - - struct TensorAndAllocation { - Tensor tensor; - AllocatorAttributes alloc_attrs; - bool swapped_to_cpu; - }; - - Stack(const DataType& elem_type, const string& stack_name, int max_size) - : elem_type_(elem_type), - stack_name_(stack_name), - max_size_(max_size), - closed_(false) {} - - Status Push(const TensorAndAllocation& value) { - mutex_lock l(mu_); - TF_RETURN_IF_ERROR(CheckNotClosed()); - if (max_size_ >= 0 && stack_.size() >= max_size_) { - return errors::InvalidArgument("Stack[", stack_name_, "] overflowed ", - "its max_size (", max_size_, ")"); - } - stack_.push_back(value); - return Status::OK(); - } - - Status Pop(TensorAndAllocation* value) { - mutex_lock l(mu_); - TF_RETURN_IF_ERROR(CheckNotClosed()); - if (stack_.empty()) { - return errors::InvalidArgument("Stack[", stack_name_, - "] is empty when calling Pop()."); - } - *value = stack_.back(); - stack_.pop_back(); - return Status::OK(); - } - - // We don't swap the first tensor on the stack and any subsequent tensors - // that share the buffer with the first tensor. 
- bool IsUsefulToSwap(const Tensor& tensor) const { - mutex_lock l(mu_); - if (stack_.empty()) { - return false; - } - const Tensor& first = stack_.front().tensor; - return !tensor.SharesBufferWith(first); - } - - void Close() { - mutex_lock l(mu_); - stack_.clear(); - closed_ = true; - } - - DataType ElemType() { return elem_type_; } - - string DebugString() override { - mutex_lock l(mu_); - return strings::StrCat("Stack[", stack_name_, "]"); - } - - const string& stack_name() { return stack_name_; } - - private: - friend class StackOp; - mutex* mu() { return &mu_; } - - mutable mutex mu_; - DataType elem_type_; - const string stack_name_; - Tensor handle_; - int max_size_; - bool closed_ GUARDED_BY(mu_); - std::vector stack_ GUARDED_BY(mu_); - - Status CheckNotClosed() const EXCLUSIVE_LOCKS_REQUIRED(mu_) { - if (closed_) { - return errors::InvalidArgument("Stack[", stack_name_, - "] has already been closed."); - } - return Status::OK(); - } -}; - -Status GetStack(OpKernelContext* ctx, Stack** stack) { - if (ctx->input_dtype(0) == DT_RESOURCE) { - return LookupResource(ctx, HandleFromInput(ctx, 0), stack); - } else { - Tensor Tstack_handle = ctx->mutable_input(0, false); - if (Tstack_handle.NumElements() != 2) { - return errors::InvalidArgument( - "Stack handle must have two elements, but had shape: ", - Tstack_handle.shape().DebugString()); - } - const string& container = Tstack_handle.flat()(0); - const string& stack_name = Tstack_handle.flat()(1); - string key = strings::StrCat(container, stack_name); - ResourceMgr* rm = ctx->resource_manager(); - if (rm == nullptr) { - return errors::Internal("No resource manager."); - } - auto* step_container = ctx->step_container(); - if (step_container == nullptr) { - return errors::Internal("No step container."); - } - TF_RETURN_IF_ERROR(rm->Lookup(step_container->name(), key, stack)); - return Status::OK(); - } -} - -std::atomic Stack::stack_counter{0}; - -// A per-run local stack. 
The stack uses a "per-step" resource manager which -// ensures that correct garbage collection on error or successful completion. -class StackOp : public OpKernel { - public: - explicit StackOp(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("elem_type", &elem_type_)); - OP_REQUIRES_OK(context, context->GetAttr("stack_name", &stack_name_)); - if (stack_name_.empty()) stack_name_ = name(); - } - - void Compute(OpKernelContext* ctx) override { - int32 size = std::numeric_limits::max(); - if (ctx->num_inputs() > 0) { - const Tensor* tensor_size; - OP_REQUIRES_OK(ctx, ctx->input("max_size", &tensor_size)); - - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(tensor_size->shape()), - errors::InvalidArgument( - "Stack size must be a scalar, but had shape: ", - tensor_size->shape().DebugString())); - - int32 size_value = tensor_size->scalar()(); - if (size_value >= 0) { - size = size_value; - } - } - - static const char kContainer[] = "_stacks"; - auto stack_id = Stack::stack_counter.fetch_add(1); - string stack_name = strings::StrCat(stack_name_, "_", stack_id); - // Store the handle in a per-step container. - ResourceMgr* rm = ctx->resource_manager(); - OP_REQUIRES(ctx, rm != nullptr, errors::Internal("No resource manager.")); - string key = strings::StrCat(kContainer, stack_name); - Stack* stack = new Stack(elem_type_, stack_name, size); - auto* step_container = ctx->step_container(); - OP_REQUIRES(ctx, step_container != nullptr, - errors::Internal("No step container.")); - OP_REQUIRES_OK(ctx, rm->Create(step_container->name(), key, stack)); - if (IsRefType(ctx->expected_output_dtype(0))) { - // Create the stack handle. 
- AllocatorAttributes alloc_attr; - alloc_attr.set_on_host(true); - OP_REQUIRES_OK(ctx, ctx->allocate_temp(tensorflow::DT_STRING, - tensorflow::TensorShape({2}), - &stack->handle_, alloc_attr)); - auto handle = stack->handle_.flat(); - handle(0) = kContainer; - handle(1) = std::move(stack_name); - ctx->set_output_ref(0, stack->mu(), &stack->handle_); - } else { - Tensor* handle; - OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle)); - handle->flat()(0) = - MakePerStepResourceHandle(ctx, key); - } - } - - private: - DataType elem_type_; - string stack_name_; - - TF_DISALLOW_COPY_AND_ASSIGN(StackOp); -}; - REGISTER_KERNEL_BUILDER(Name("Stack").Device(DEVICE_CPU), StackOp); REGISTER_KERNEL_BUILDER(Name("Stack").Device(DEVICE_GPU).HostMemory("handle"), StackOp); @@ -242,102 +59,22 @@ REGISTER_KERNEL_BUILDER(Name("StackV2") StackOp); #endif // TENSORFLOW_USE_SYCL -template -class StackPushOp : public AsyncOpKernel { - public: - explicit StackPushOp(OpKernelConstruction* context) : AsyncOpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("swap_memory", &swap_memory_)); - } - - void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override { - // Get the stack from the handle. - Stack* stack = nullptr; - OP_REQUIRES_OK_ASYNC(ctx, GetStack(ctx, &stack), done); - core::ScopedUnref unref(stack); - - if (ctx->input_dtype(1) != stack->ElemType()) { - ctx->CtxFailure(errors::InvalidArgument("Must have type ", - stack->ElemType(), " but got ", - ctx->input_dtype(1))); - done(); - return; - } - - // Push the tensor onto the stack. Swap the tensor to CPU if instructed. - const Tensor& tensor = ctx->input(1); - AllocatorAttributes alloc_attrs = ctx->input_alloc_attr(1); - // For now, we use a simple heuristic for swapping: A GPU tensor is moved - // to CPU if the tensor has more than kCopyThreshold bytes and the GPU - // allocator says more than kOccupancy of the memory is in use. 
- static constexpr int kCopyThreshold = 2048; - static constexpr double kOccupancy = 0.7; - if (swap_memory_ && !alloc_attrs.on_host() && - (std::is_same::value -#ifdef TENSORFLOW_USE_SYCL - || std::is_same::value -#endif // TENSORFLOW_USE_SYCL - ) && - tensor.TotalBytes() > kCopyThreshold && stack->IsUsefulToSwap(tensor)) { - DeviceContext* device_ctxt = ctx->op_device_context(); - auto device = static_cast(ctx->device()); - Allocator* allocator = device->GetAllocator(alloc_attrs); - AllocatorStats stats; - allocator->GetStats(&stats); - if (stats.bytes_in_use > (stats.bytes_limit * kOccupancy)) { - // Asynchronously copy the tensor from GPU to CPU memory. - // TODO(yuanbyu): Swap the oldest tensor first. - AllocatorAttributes host_alloc_attrs; - host_alloc_attrs.set_gpu_compatible(true); - host_alloc_attrs.set_on_host(true); - Allocator* cpu_allocator = device->GetAllocator(host_alloc_attrs); - Tensor* cpu_tensor = - new Tensor(cpu_allocator, tensor.dtype(), tensor.shape()); - device_ctxt->CopyDeviceTensorToCPU( - &tensor, "StackPush", device, cpu_tensor, - [cpu_tensor, stack, ctx, done](const Status& s) { - ctx->SetStatus(s); - if (s.ok()) { - AllocatorAttributes alloc_attrs = ctx->input_alloc_attr(1); - ctx->SetStatus(stack->Push({*cpu_tensor, alloc_attrs, true})); - } - if (ctx->status().ok()) { - ctx->set_output(0, *cpu_tensor); - } - done(); - delete cpu_tensor; - }); - return; - } - } - - // Execute synchronously if not swapped. 
- OP_REQUIRES_OK_ASYNC(ctx, stack->Push({tensor, alloc_attrs, false}), done); - ctx->set_output(0, tensor); - done(); - } - - bool IsExpensive() override { return false; } - - private: - bool swap_memory_; -}; - REGISTER_KERNEL_BUILDER(Name("StackPush").Device(DEVICE_CPU), - StackPushOp); + TemplatedStackPushOp); REGISTER_KERNEL_BUILDER(Name("StackPushV2").Device(DEVICE_CPU), - StackPushOp); + TemplatedStackPushOp); -#define REGISTER_GPU_KERNEL(type) \ - REGISTER_KERNEL_BUILDER(Name("StackPush") \ - .Device(DEVICE_GPU) \ - .HostMemory("handle") \ - .TypeConstraint("T"), \ - StackPushOp); \ - REGISTER_KERNEL_BUILDER(Name("StackPushV2") \ - .Device(DEVICE_GPU) \ - .HostMemory("handle") \ - .TypeConstraint("T"), \ - StackPushOp); +#define REGISTER_GPU_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("StackPush") \ + .Device(DEVICE_GPU) \ + .HostMemory("handle") \ + .TypeConstraint("T"), \ + TemplatedStackPushOp); \ + REGISTER_KERNEL_BUILDER(Name("StackPushV2") \ + .Device(DEVICE_GPU) \ + .HostMemory("handle") \ + .TypeConstraint("T"), \ + TemplatedStackPushOp); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); #undef REGISTER_GPU_KERNEL @@ -345,21 +82,21 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); // Special GPU kernels for int32 and bool. // TODO(b/25387198): Also enable int32 in device memory. This kernel // registration requires all int32 inputs and outputs to be in host memory. 
-#define REGISTER_GPU_HOST_KERNEL(type) \ - REGISTER_KERNEL_BUILDER(Name("StackPush") \ - .Device(DEVICE_GPU) \ - .HostMemory("handle") \ - .HostMemory("elem") \ - .HostMemory("output") \ - .TypeConstraint("T"), \ - StackPushOp); \ - REGISTER_KERNEL_BUILDER(Name("StackPushV2") \ - .Device(DEVICE_GPU) \ - .HostMemory("handle") \ - .HostMemory("elem") \ - .HostMemory("output") \ - .TypeConstraint("T"), \ - StackPushOp); +#define REGISTER_GPU_HOST_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("StackPush") \ + .Device(DEVICE_GPU) \ + .HostMemory("handle") \ + .HostMemory("elem") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + TemplatedStackPushOp); \ + REGISTER_KERNEL_BUILDER(Name("StackPushV2") \ + .Device(DEVICE_GPU) \ + .HostMemory("handle") \ + .HostMemory("elem") \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + TemplatedStackPushOp); REGISTER_GPU_HOST_KERNEL(int32); REGISTER_GPU_HOST_KERNEL(bool); @@ -372,7 +109,7 @@ REGISTER_GPU_HOST_KERNEL(bool); .Device(DEVICE_SYCL) \ .HostMemory("handle") \ .TypeConstraint("T"), \ - StackPushOp); + TemplatedStackPushOp); TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL_KERNEL); @@ -383,7 +120,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL_KERNEL); .HostMemory("elem") \ .HostMemory("output") \ .TypeConstraint("T"), \ - StackPushOp) + TemplatedStackPushOp) REGISTER_SYCL_HOST_KERNEL(int32); REGISTER_SYCL_HOST_KERNEL(bool); @@ -391,48 +128,6 @@ REGISTER_SYCL_HOST_KERNEL(bool); #undef REGISTER_SYCL_HOST_KERNEL #endif // TENSORFLOW_USE_SYCL -class StackPopOp : public AsyncOpKernel { - public: - explicit StackPopOp(OpKernelConstruction* context) : AsyncOpKernel(context) {} - - void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override { - // Get the stack from the handle. - Stack* stack = nullptr; - OP_REQUIRES_OK_ASYNC(ctx, GetStack(ctx, &stack), done); - core::ScopedUnref unref(stack); - - // Pop the tensor. Transfer the tensor back to device if it was - // swapped out to CPU. 
- Stack::TensorAndAllocation value; - OP_REQUIRES_OK_ASYNC(ctx, stack->Pop(&value), done); - if (value.swapped_to_cpu) { - // Asynchronously copy the tensor back from CPU to GPU memory. - DeviceContext* device_ctxt = ctx->op_device_context(); - Device* device = static_cast(ctx->device()); - Tensor* cpu_tensor = &value.tensor; - Allocator* gpu_allocator = device->GetAllocator(value.alloc_attrs); - Tensor* device_tensor = - new Tensor(gpu_allocator, cpu_tensor->dtype(), cpu_tensor->shape()); - device_ctxt->CopyCPUTensorToDevice( - cpu_tensor, device, device_tensor, - [device_tensor, ctx, done](const Status& s) { - ctx->SetStatus(s); - if (s.ok()) { - ctx->set_output(0, *device_tensor); - } - done(); - delete device_tensor; - }); - } else { - // Execute synchronously if not swapped. - ctx->set_output(0, value.tensor); - done(); - } - } - - bool IsExpensive() override { return false; } -}; - REGISTER_KERNEL_BUILDER(Name("StackPop").Device(DEVICE_CPU), StackPopOp); REGISTER_KERNEL_BUILDER(Name("StackPopV2").Device(DEVICE_CPU), StackPopOp); @@ -498,20 +193,6 @@ REGISTER_SYCL_HOST_KERNEL(bool); #undef REGISTER_SYCL_HOST_KERNEL #endif // TENSORFLOW_USE_SYCL -class StackCloseOp : public OpKernel { - public: - explicit StackCloseOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* ctx) override { - Stack* stack = nullptr; - OP_REQUIRES_OK(ctx, GetStack(ctx, &stack)); - core::ScopedUnref unref(stack); - stack->Close(); - } - - bool IsExpensive() override { return false; } -}; - REGISTER_KERNEL_BUILDER(Name("StackClose").Device(DEVICE_CPU), StackCloseOp); REGISTER_KERNEL_BUILDER( Name("StackClose").Device(DEVICE_GPU).HostMemory("handle"), StackCloseOp); From 389b0fbeef501fb20c691610a3de08eff940953f Mon Sep 17 00:00:00 2001 From: Andrew Selle Date: Mon, 5 Nov 2018 11:05:44 -0800 Subject: [PATCH 099/540] Fix memory error where delegate->flags was not initialized. 
PiperOrigin-RevId: 220135142 --- tensorflow/lite/interpreter_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/interpreter_test.cc b/tensorflow/lite/interpreter_test.cc index f47f698d3a5..3ac19fc87d1 100644 --- a/tensorflow/lite/interpreter_test.cc +++ b/tensorflow/lite/interpreter_test.cc @@ -1320,6 +1320,7 @@ TEST(TestDelegateOwnership, ProperlyDisposed) { struct TfLiteInterpreterOwnedDelegate : public TfLiteDelegate { TfLiteInterpreterOwnedDelegate(bool* destroyed, bool* prepared) : destroyed(destroyed), prepared(prepared) { + flags = kTfLiteDelegateFlagsNone; Prepare = [](TfLiteContext*, TfLiteDelegate* delegate) -> TfLiteStatus { *static_cast(delegate)->prepared = true; From 4e81d6cc877ba4a56196bbdd8d4e09ab7f7b8412 Mon Sep 17 00:00:00 2001 From: Pete Warden Date: Mon, 5 Nov 2018 11:09:46 -0800 Subject: [PATCH 100/540] Fix for #23440 in micro builds PiperOrigin-RevId: 220136024 --- .../micro/tools/make/targets/linux_x86_makefile.inc | 9 +++++++++ tensorflow/lite/kernels/internal/common.h | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 tensorflow/lite/experimental/micro/tools/make/targets/linux_x86_makefile.inc diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/linux_x86_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/linux_x86_makefile.inc new file mode 100644 index 00000000000..8ea78e8f3e3 --- /dev/null +++ b/tensorflow/lite/experimental/micro/tools/make/targets/linux_x86_makefile.inc @@ -0,0 +1,9 @@ +# Settings for x86 on Linux +ifeq ($(TARGET), linux) + ifeq ($(TARGET_ARCH), x86_64) + PLATFORM_FLAGS = \ + -DTF_LITE_DISABLE_X86_NEON + CXXFLAGS += $(PLATFORM_FLAGS) + CCFLAGS += $(PLATFORM_FLAGS) + endif +endif diff --git a/tensorflow/lite/kernels/internal/common.h b/tensorflow/lite/kernels/internal/common.h index e31f47d2cea..fdb72037f84 100644 --- a/tensorflow/lite/kernels/internal/common.h +++ b/tensorflow/lite/kernels/internal/common.h @@ -27,7 +27,7 @@ limitations under 
the License. #include #endif -#if defined __GNUC__ && defined __SSE4_1__ +#if defined __GNUC__ && defined __SSE4_1__ && !defined TF_LITE_DISABLE_X86_NEON #define USE_NEON #define OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS From 9e93f7fd9210fc9c61885ad24a9eab45e4e6012d Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Mon, 5 Nov 2018 12:26:50 -0800 Subject: [PATCH 101/540] [tf.data] Fix internal test failure. PiperOrigin-RevId: 220150752 --- tensorflow/core/kernels/data/unbatch_dataset_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/data/unbatch_dataset_op.cc b/tensorflow/core/kernels/data/unbatch_dataset_op.cc index ab8d8570336..b32ab8ba4fa 100644 --- a/tensorflow/core/kernels/data/unbatch_dataset_op.cc +++ b/tensorflow/core/kernels/data/unbatch_dataset_op.cc @@ -151,7 +151,7 @@ class UnbatchDatasetOp : public UnaryDatasetOpKernel { // dimension. If it is statically known for any component, we model the // transformation using `KnownRatio`. Otherwise, we use `UnknownRatio`. for (auto& shape : dataset()->input_->output_shapes()) { - if (shape.dims() > 0 && shape.dim_size(0) != -1) { + if (shape.dims() > 0 && shape.dim_size(0) > 0) { return model::MakeKnownRatioNode( std::move(args), 1.0 / static_cast(shape.dim_size(0))); } From 36a97d10bec956d513e260a4a12c72f845257fc0 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Mon, 5 Nov 2018 12:35:58 -0800 Subject: [PATCH 102/540] [TF:XLA] Consolidate requires_compilation and enable_jit_by_default into a new autoclustering_policy field. The concept of "requires_compilation" has been fairly confusing ever since ondemand mode (aka. op-by-op compilation) has been checked in. Compilation here actually meant "autoclustering" and no device actually requires it any more. Don't form clusters of size 1 if autoclustering on what was formerly a "requires_compilation" device; the on-demand code path achieves the same goal. 
We can also move the policy decision about whether to jit on the CPU device into the autoclustering policy field, which is simpler. PiperOrigin-RevId: 220152352 --- tensorflow/compiler/jit/build_xla_ops_pass.cc | 3 +- .../compiler/jit/mark_for_compilation_pass.cc | 32 +++++++++---------- .../jit/mark_for_compilation_pass_test.cc | 5 ++- .../compiler/jit/partially_decluster_pass.cc | 3 +- tensorflow/compiler/jit/xla_cpu_device.cc | 6 ++-- tensorflow/compiler/jit/xla_gpu_device.cc | 4 +-- .../compiler/jit/xla_interpreter_device.cc | 4 +-- tensorflow/compiler/tf2xla/BUILD | 1 + tensorflow/compiler/tf2xla/xla_op_registry.cc | 15 ++++++--- tensorflow/compiler/tf2xla/xla_op_registry.h | 23 ++++++++----- 10 files changed, 56 insertions(+), 40 deletions(-) diff --git a/tensorflow/compiler/jit/build_xla_ops_pass.cc b/tensorflow/compiler/jit/build_xla_ops_pass.cc index 054f31ba335..93637a69d5d 100644 --- a/tensorflow/compiler/jit/build_xla_ops_pass.cc +++ b/tensorflow/compiler/jit/build_xla_ops_pass.cc @@ -214,7 +214,8 @@ Status NodeRequiresCompilation(Node* n, bool* result) { return errors::Internal("Could not find compilation device ", device_type.type()); } - *result = registration->requires_compilation; + *result = registration->autoclustering_policy == + XlaOpRegistry::AutoclusteringPolicy::kAlways; return Status::OK(); } diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 11975a6bb07..dae6ca4ad24 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -452,7 +452,9 @@ Status FindCompilationCandidates( OperationFilter op_filter; op_filter.allow_resource_ops = registration->compile_resource_ops; - op_filter.allow_stateful_rng_ops = registration->requires_compilation; + op_filter.allow_stateful_rng_ops = + (registration->autoclustering_policy == + XlaOpRegistry::AutoclusteringPolicy::kAlways); if (!HasXLAKernel(*node, 
jit_device_type) && !IsCompilableCall(node->def(), jit_device_type, op_filter, 0, @@ -613,10 +615,8 @@ Status MarkForCompilationPass::Run( GetGlobalJitLevel(options); legacy_flags::MarkForCompilationPassFlags* flags = legacy_flags::GetMarkForCompilationPassFlags(); - bool cpu_global_jit = flags->tf_xla_cpu_global_jit; bool fusion_only = flags->tf_xla_fusion_only; - VLOG(1) << "flags->tf_xla_cpu_global_jit = " << flags->tf_xla_cpu_global_jit; VLOG(1) << "flags->tf_xla_fusion_only = " << flags->tf_xla_fusion_only; VLOG(1) << "flags->tf_xla_auto_jit = " << flags->tf_xla_auto_jit; const FunctionLibraryDefinition* fld = options.flib_def; @@ -635,9 +635,6 @@ Status MarkForCompilationPass::Run( return false; } - // If this device requires a JIT, we must say yes. - if (registration->requires_compilation) return true; - // If there is a _XlaCompile annotation, use its value. bool compile = false; Status status = GetNodeAttr(node->attrs(), kXlaCompileAttr, &compile); @@ -674,18 +671,21 @@ Status MarkForCompilationPass::Run( return false; } - // Otherwise use the value of global_jit_level. - // Ignore enable_jit_by_default if global jit compilation for CPU - // is explicitly requested via tf_xla_cpu_global_jit flag - bool ignore_registration = cpu_global_jit && device_type == DEVICE_CPU; + // Otherwise use the value of global_jit_level and the device's + // autoclustering policy. 
bool should_compile = - (ignore_registration || registration->enable_jit_by_default) && - global_jit_level != OptimizerOptions::OFF; + registration->autoclustering_policy == + XlaOpRegistry::AutoclusteringPolicy::kAlways || + (registration->autoclustering_policy == + XlaOpRegistry::AutoclusteringPolicy::kIfEnabledGlobally && + global_jit_level != OptimizerOptions::OFF); if (!should_compile) { if (global_jit_level == OptimizerOptions::OFF) { VLOG(2) << "Rejecting " << node->name() << ": global jit disabled."; } else { - VLOG(2) << "Rejecting " << node->name() << ": JIT for device disabled."; + VLOG(2) + << "Rejecting " << node->name() + << ": autoclustering for device only when requested explicitly."; } } return should_compile; @@ -1073,12 +1073,10 @@ Status MarkForCompilationPass::RunImpl( XlaOpRegistry::GetCompilationDevice(device_type.type(), ®istration); // Compile if this is a cluster of >= min_cluster_size compilable operators. - // Also, always compile if the operator is placed on a device that requires - // compilation, or if it contains at least one op that is marked for + // Also, always compile if it contains at least one op that is marked for // compilation that is not an Identity op. 
if (effective_cluster_sizes[cluster] >= min_cluster_size || - (effective_cluster_sizes[cluster] > 0 && marked_for_compilation) || - registration->requires_compilation) { + (effective_cluster_sizes[cluster] > 0 && marked_for_compilation)) { string& name = cluster_names[cluster]; if (name.empty()) { diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc index ead1cf4fd5f..ef4f1ea2b06 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc @@ -923,9 +923,8 @@ TEST(XlaCompilationTest, RandomShapeOnXlaDevice) { TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); std::unordered_map clusters = GetClusters(*graph); - EXPECT_NE(clusters["test/shape_rng"], ""); - EXPECT_NE(clusters["test/reshape"], ""); - EXPECT_NE(clusters["test/shape_rng"], clusters["test/reshape"]); + EXPECT_EQ(clusters["test/shape_rng"], ""); + EXPECT_EQ(clusters["test/reshape"], ""); } TEST(XlaCompilationTest, TensorArrayShapeOnXlaDevice) { diff --git a/tensorflow/compiler/jit/partially_decluster_pass.cc b/tensorflow/compiler/jit/partially_decluster_pass.cc index 5b961032233..550ffa2465a 100644 --- a/tensorflow/compiler/jit/partially_decluster_pass.cc +++ b/tensorflow/compiler/jit/partially_decluster_pass.cc @@ -210,7 +210,8 @@ bool IsIntraClusterEdge(const Edge& edge) { bool IsMustCompileDevice(const DeviceType& device_type) { const XlaOpRegistry::DeviceRegistration* registration; if (XlaOpRegistry::GetCompilationDevice(device_type.type(), ®istration)) { - return registration->requires_compilation; + return registration->autoclustering_policy == + XlaOpRegistry::AutoclusteringPolicy::kAlways; } return false; diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc index cbfeb388050..116e0756036 100644 --- a/tensorflow/compiler/jit/xla_cpu_device.cc +++ 
b/tensorflow/compiler/jit/xla_cpu_device.cc @@ -42,8 +42,10 @@ Status XlaCpuDeviceFactory::CreateDevices(const SessionOptions& session_options, XlaOpRegistry::DeviceRegistration registration; registration.compilation_device_name = DEVICE_CPU_XLA_JIT; - registration.requires_compilation = !compile_on_demand; - registration.enable_jit_by_default = false; + registration.autoclustering_policy = + compile_on_demand + ? XlaOpRegistry::AutoclusteringPolicy::kIfExplicitlyRequested + : XlaOpRegistry::AutoclusteringPolicy::kAlways; registration.compile_resource_ops = true; XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_CPU, registration); diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc index 8f28b38b5e1..717daadc4ac 100644 --- a/tensorflow/compiler/jit/xla_gpu_device.cc +++ b/tensorflow/compiler/jit/xla_gpu_device.cc @@ -37,8 +37,8 @@ Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& session_options, std::vector* devices) { XlaOpRegistry::DeviceRegistration registration; registration.compilation_device_name = DEVICE_GPU_XLA_JIT; - registration.requires_compilation = true; - registration.enable_jit_by_default = false; + registration.autoclustering_policy = + XlaOpRegistry::AutoclusteringPolicy::kAlways; registration.compile_resource_ops = true; XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_GPU, registration); diff --git a/tensorflow/compiler/jit/xla_interpreter_device.cc b/tensorflow/compiler/jit/xla_interpreter_device.cc index dc37362fd86..e828bae865d 100644 --- a/tensorflow/compiler/jit/xla_interpreter_device.cc +++ b/tensorflow/compiler/jit/xla_interpreter_device.cc @@ -45,8 +45,8 @@ Status XlaInterpreterDeviceFactory::CreateDevices( XlaOpRegistry::DeviceRegistration registration; registration.compilation_device_name = DEVICE_INTERPRETER_XLA_JIT; - registration.requires_compilation = true; - registration.enable_jit_by_default = false; + registration.autoclustering_policy = + 
XlaOpRegistry::AutoclusteringPolicy::kAlways; registration.compile_resource_ops = true; XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_INTERPRETER, registration); diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 5fc9a352ff9..f18d8c20089 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -194,6 +194,7 @@ cc_library( ":side_effect_util", ":tf2xla_util", "//tensorflow/compiler/jit:xla_cluster_util", + "//tensorflow/compiler/jit/legacy_flags:mark_for_compilation_pass_flags", "//tensorflow/compiler/tf2xla/lib:util", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc index 9f00de708cc..dcd0e9c5c1f 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.cc +++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h" #include "tensorflow/compiler/jit/xla_cluster_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_context.h" @@ -129,21 +130,27 @@ XlaOpRegistry::~XlaOpRegistry() = default; // Lazily register the CPU and GPU JIT devices the first time // GetCompilationDevice is called. static void* registration_init = [®istry]() { + legacy_flags::MarkForCompilationPassFlags* flags = + legacy_flags::GetMarkForCompilationPassFlags(); + bool cpu_global_jit = flags->tf_xla_cpu_global_jit; + mutex_lock lock(registry.mutex_); if (LaunchOpHasKernelForDevice(DeviceType(DEVICE_CPU)).ok()) { DeviceRegistration& registration = registry.compilation_devices_[DEVICE_CPU]; registration.compilation_device_name = DEVICE_CPU_XLA_JIT; - registration.requires_compilation = false; - registration.enable_jit_by_default = false; + registration.autoclustering_policy = + cpu_global_jit + ? 
XlaOpRegistry::AutoclusteringPolicy::kIfEnabledGlobally + : XlaOpRegistry::AutoclusteringPolicy::kIfExplicitlyRequested; registration.compile_resource_ops = false; } if (LaunchOpHasKernelForDevice(DeviceType(DEVICE_GPU)).ok()) { DeviceRegistration& registration = registry.compilation_devices_[DEVICE_GPU]; registration.compilation_device_name = DEVICE_GPU_XLA_JIT; - registration.requires_compilation = false; - registration.enable_jit_by_default = true; + registration.autoclustering_policy = + XlaOpRegistry::AutoclusteringPolicy::kIfEnabledGlobally; registration.compile_resource_ops = false; } return nullptr; diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h index 45a40c0acc0..0bdd4a10854 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.h +++ b/tensorflow/compiler/tf2xla/xla_op_registry.h @@ -66,19 +66,26 @@ class XlaOpRegistry { public: typedef OpKernel* (*Factory)(OpKernelConstruction*); + enum class AutoclusteringPolicy { + // Enable autoclustering if the user requests it, e.g., via + // experimental_jit_scope. Does not autocluster if the JIT is enabled + // globally (e.g., via the OptimizerOptions in the TF session + // configuration.) + kIfExplicitlyRequested, + // Enable autoclustering if explicitly requested, or if the JIT is enabled + // globally in the session options, or via TF_XLA_FLAGS=--tf_xla_auto_jit=N. + kIfEnabledGlobally, + // Always try to autocluster ops placed on this device. + kAlways, + }; + // Describes how to compile operators assigned to a device. struct DeviceRegistration { // The name of the an XLA compilation device to use to compile code. string compilation_device_name; - // Do operators assigned to this device require compilation? - bool requires_compilation; - - // If !requires_compilation, should we try to JIT operators on this device - // when XLA JIT compilation is enabled globally via the SessionOptions? 
- // (It is still possible to explicitly mark operators to JIT compile, even - // if enable_jit_by_default is false.) - bool enable_jit_by_default; + // When should we autocluster operators assigned to this device? + AutoclusteringPolicy autoclustering_policy; // Enable compilation of operators that use DT_RESOURCE types? bool compile_resource_ops = false; From 7f9ae594813dc4397b4c0c5229525de83bc4e4b2 Mon Sep 17 00:00:00 2001 From: Brennan Saeta Date: Mon, 5 Nov 2018 12:37:10 -0800 Subject: [PATCH 103/540] [tf.data]: Parallel map and filter fusion. This change augments the existing map_and_filter fusion tf.data rewrite pass to support fusing ParallelMapDataset ops in addition to the MapDataset ops. PiperOrigin-RevId: 220152549 --- .../optimizers/data/map_and_filter_fusion.cc | 27 ++++-- .../data/map_and_filter_fusion_test.cc | 86 +++++++++++++++++++ 2 files changed, 104 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc index 750cd58869c..2b0a347ce62 100644 --- a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc +++ b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc @@ -38,21 +38,28 @@ NodeDef MakeFusedNode(const NodeDef& map_node, MutableGraphView* graph) { NodeDef fused_node; graph_utils::SetUniqueGraphNodeName("fused_map", graph->graph(), &fused_node); - fused_node.set_op("MapDataset"); - fused_node.add_input(map_node.input(0)); + fused_node.set_op(map_node.op()); + + // Copy over inputs. + for (int i = 0; i < map_node.input_size(); ++i) { + fused_node.add_input(map_node.input(i)); + } auto attr = map_node.attr().at("f"); attr.mutable_func()->set_name(fused_function.signature().name()); (*fused_node.mutable_attr())["f"] = std::move(attr); - graph_utils::CopyAttribute("Targuments", map_node, &fused_node); - - for (auto key : {"output_shapes", "output_types"}) + // Required attrs. 
+ for (auto key : {"Targuments", "output_shapes", "output_types"}) { graph_utils::CopyAttribute(key, map_node, &fused_node); + } - if (const auto* attr = - gtl::FindOrNull(map_node.attr(), "use_inter_op_parallelism")) - (*fused_node.mutable_attr())["use_inter_op_parallelism"] = *attr; + // Optional attrs. + for (auto key : {"use_inter_op_parallelism", "sloppy"}) { + if (const auto* attr = gtl::FindOrNull(map_node.attr(), key)) { + graph_utils::CopyAttribute(key, map_node, &fused_node); + } + } // Add the predicate output attributes. (*fused_node.mutable_attr())["output_types"] @@ -97,7 +104,9 @@ Status MapAndFilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item, FunctionLibraryDefinition function_library(OpRegistry::Global(), item.graph.library()); auto get_map_node = [](const NodeDef& node) -> const NodeDef* { - if (node.op() == "MapDataset") return &node; + if (node.op() == "MapDataset" || node.op() == "ParallelMapDataset") { + return &node; + } return nullptr; }; diff --git a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion_test.cc index 6e6da37d7c2..c5a5e22aba6 100644 --- a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion_test.cc +++ b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion_test.cc @@ -30,6 +30,7 @@ namespace grappler { namespace { using graph_tests_utils::MakeFilterNode; using graph_tests_utils::MakeMapNode; +using graph_tests_utils::MakeParallelMapNode; TEST(MapAndFilterFusionTest, FuseMapAndFilter) { using test::function::NDef; @@ -58,6 +59,41 @@ TEST(MapAndFilterFusionTest, FuseMapAndFilter) { graph_utils::ContainsNodeWithOp("FilterByLastComponentDataset", output)); } +TEST(MapAndFilterFusionTest, FuseParallelMapAndFilter) { + using test::function::NDef; + GrapplerItem item; + item.graph = test::function::GDef( + {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}), + NDef("stop", "Const", {}, {{"value", 10}, 
{"dtype", DT_INT32}}), + NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}), + NDef("range", "RangeDataset", {"start", "stop", "step"}, {}), + NDef("num_parallel_calls", "Const", {}, + {{"value", 3}, {"dtype", "DT_INT32"}}), + MakeParallelMapNode("map", "range", "num_parallel_calls", "XTimesTwo", + /*sloppy=*/false), + MakeFilterNode("filter", "map")}, + // FunctionLib + { + test::function::XTimesTwo(), + test::function::IsZero(), + }); + + MapAndFilterFusion optimizer; + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + + EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map", output)); + EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("filter", output)); + EXPECT_TRUE(graph_utils::ContainsNodeWithOp("ParallelMapDataset", output)) + << output.DebugString(); + auto& map_node = output.node( + graph_utils::FindGraphNodeWithOp("ParallelMapDataset", output)); + EXPECT_FALSE(map_node.attr().at("sloppy").b()) << map_node.DebugString(); + EXPECT_TRUE( + graph_utils::ContainsNodeWithOp("FilterByLastComponentDataset", output)) + << output.DebugString(); +} + TEST(MapAndFilterFusionTest, FuseMapAndFilterWithExtraChild) { using test::function::NDef; GrapplerItem item; @@ -103,6 +139,56 @@ TEST(MapAndFilterFusionTest, FuseMapAndFilterWithExtraChild) { EXPECT_EQ(cache_node.input(0), filter_by_component.name()); } +TEST(MapAndFilterFusionTest, FuseParallelMapAndFilterWithExtraChild) { + using test::function::NDef; + GrapplerItem item; + item.graph = test::function::GDef( + {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}), + NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}), + NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}), + NDef("filename", "Const", {}, {{"value", ""}, {"dtype", DT_STRING}}), + NDef("range", "RangeDataset", {"start", "stop", "step"}, {}), + NDef("num_parallel_calls", "Const", {}, + {{"value", 3}, {"dtype", "DT_INT32"}}), + MakeParallelMapNode("map", "range", 
"num_parallel_calls", "XTimesTwo", + /*sloppy=*/true), + MakeFilterNode("filter", "map"), + NDef("cache", "CacheDataset", {"filter", "filename"}, {})}, + // FunctionLib + { + test::function::XTimesTwo(), + test::function::IsZero(), + }); + + MapAndFilterFusion optimizer; + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + + EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map", output)); + EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("filter", output)); + ASSERT_TRUE(graph_utils::ContainsNodeWithOp("ParallelMapDataset", output)); + ASSERT_TRUE( + graph_utils::ContainsNodeWithOp("FilterByLastComponentDataset", output)); + ASSERT_TRUE(graph_utils::ContainsNodeWithOp("CacheDataset", output)); + + int map_id = graph_utils::FindGraphNodeWithOp("ParallelMapDataset", output); + auto& map_node = output.node(map_id); + ASSERT_EQ(map_node.input_size(), 2); + EXPECT_EQ(map_node.input(0), "range"); + EXPECT_EQ(map_node.input(1), "num_parallel_calls"); + + int filter_by_component_id = + graph_utils::FindGraphNodeWithOp("FilterByLastComponentDataset", output); + auto& filter_by_component = output.node(filter_by_component_id); + ASSERT_EQ(filter_by_component.input_size(), 1); + EXPECT_EQ(filter_by_component.input(0), map_node.name()); + + int cache_id = graph_utils::FindGraphNodeWithOp("CacheDataset", output); + auto& cache_node = output.node(cache_id); + ASSERT_EQ(cache_node.input_size(), 2); + EXPECT_EQ(cache_node.input(0), filter_by_component.name()); +} + } // namespace } // namespace grappler } // namespace tensorflow From 5c5038b7cf6bcc13af34eeb0c7596b79bda0f3b8 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Mon, 5 Nov 2018 12:58:06 -0800 Subject: [PATCH 104/540] Make _XlaCompile resilient against "unimplemented" compiler failures This CL makes _XlaCompile fall back to the TF executor if the XLA compiler can't compile the cluster for some reason.
While at it, make most of the fields in the XLA op kernels const so that we don't accidentally mutate them in the ::Compute functions (this would result in a race). Also remove the `#undef OP_REQUIRES_OK_RETURN`; it has minimal value in a .cc file. PiperOrigin-RevId: 220156284 --- tensorflow/compiler/jit/kernels/xla_ops.cc | 91 +++++++++++++--------- tensorflow/compiler/jit/kernels/xla_ops.h | 29 ++++--- 2 files changed, 74 insertions(+), 46 deletions(-) diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc index 6bcae1dcc3d..56b7909ffd3 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.cc +++ b/tensorflow/compiler/jit/kernels/xla_ops.cc @@ -39,12 +39,22 @@ limitations under the License. #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/util/stream_executor_util.h" +// OP_REQUIRES_OK_RETURN is the same as OP_REQUIRES_OK except that +// in error case, it returns RET instead of void. +#define OP_REQUIRES_OK_RETURN(CTX, RET, ...) 
\ + do { \ + ::tensorflow::Status _s(__VA_ARGS__); \ + if (!TF_PREDICT_TRUE(_s.ok())) { \ + (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \ + return RET; \ + } \ + } while (0) + namespace tensorflow { namespace { -Status PlatformInfoFromContext(OpKernelConstruction* ctx, - XlaPlatformInfo* result) { +XlaPlatformInfo PlatformInfoFromContext(OpKernelConstruction* ctx) { DeviceType device_type = ctx->device_type(); se::Platform::Id platform_id = nullptr; const XlaDevice::Metadata* xla_device_metadata = nullptr; @@ -76,16 +86,16 @@ Status PlatformInfoFromContext(OpKernelConstruction* ctx, } if (!device_allocator) { - TF_ASSIGN_OR_RETURN(se::Platform* const platform, - se::MultiPlatformManager::PlatformWithId(platform_id)); + xla::StatusOr maybe_platform = + se::MultiPlatformManager::PlatformWithId(platform_id); + OP_REQUIRES_OK_RETURN(ctx, XlaPlatformInfo(), maybe_platform.status()); + xla_allocator = absl::make_unique( - platform, ctx->device()->GetAllocator({})); + maybe_platform.ValueOrDie(), ctx->device()->GetAllocator({})); } - *result = XlaPlatformInfo(device_type, platform_id, xla_device_metadata, - std::move(xla_allocator), device_allocator); - - return Status::OK(); + return XlaPlatformInfo(device_type, platform_id, xla_device_metadata, + std::move(xla_allocator), device_allocator); } // A closure describing how to run a compiled version of a TensorFlow function. 
@@ -179,9 +189,8 @@ XlaLocalLaunchBase::XlaLocalLaunchBase(OpKernelConstruction* ctx, : OpKernel(ctx), constants_(constants), resources_(resources), - function_(function) { - OP_REQUIRES_OK(ctx, PlatformInfoFromContext(ctx, &platform_info_)); -} + function_(function), + platform_info_(PlatformInfoFromContext(ctx)) {} static Status BuildCompilationCache(OpKernelContext* ctx, const XlaPlatformInfo& platform_info, @@ -333,18 +342,6 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { } namespace { - -// OP_REQUIRES_OK_RETURN is the same as OP_REQUIRES_OK except that -// in error case, it returns RET instead of void. -#define OP_REQUIRES_OK_RETURN(CTX, RET, ...) \ - do { \ - ::tensorflow::Status _s(__VA_ARGS__); \ - if (!TF_PREDICT_TRUE(_s.ok())) { \ - (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \ - return RET; \ - } \ - } while (0) - // Helper static functions to construct parameters for // XlaLocalLaunchBase constructor from OpKernelConstruction. std::vector ConstantsVector(OpKernelConstruction* ctx) { @@ -381,7 +378,12 @@ NameAttrList FunctionAttr(OpKernelConstruction* ctx) { return *func; } -#undef OP_REQUIRES_OK_RETURN +bool MustCompileAttr(OpKernelConstruction* ctx) { + bool must_compile; + OP_REQUIRES_OK_RETURN(ctx, false, + ctx->GetAttr("must_compile", &must_compile)); + return must_compile; +} } // namespace XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx) @@ -396,10 +398,9 @@ XlaCompileOp::XlaCompileOp(OpKernelConstruction* ctx) : OpKernel(ctx), constants_(ConstantsVector(ctx)), resources_(ResourcesVector(ctx)), - function_(FunctionAttr(ctx)) { - OP_REQUIRES_OK(ctx, PlatformInfoFromContext(ctx, &platform_info_)); - OP_REQUIRES_OK(ctx, ctx->GetAttr("must_compile", &must_compile_)); -} + function_(FunctionAttr(ctx)), + platform_info_(PlatformInfoFromContext(ctx)), + must_compile_(MustCompileAttr(ctx)) {} void XlaCompileOp::Compute(OpKernelContext* ctx) { VLOG(3) << "XlaCompileOp " << def().name() @@ -409,13 +410,30 @@ void 
XlaCompileOp::Compute(OpKernelContext* ctx) { xla::LocalExecutable* executable; std::map variables; - if (legacy_flags::GetXlaOpsCommonFlags().tf_xla_always_defer_compilation) { + bool cannot_compile_cluster; + { + mutex_lock guard(cannot_compile_cluster_mu_); + cannot_compile_cluster = cannot_compile_cluster_; + } + + if (legacy_flags::GetXlaOpsCommonFlags().tf_xla_always_defer_compilation || + cannot_compile_cluster) { executable = nullptr; } else { - OP_REQUIRES_OK(ctx, CompileToLocalExecutable( - ctx, function_, platform_info_, resources_, - constants_, /*lazy=*/!must_compile_, &client, - &variables, &kernel, &executable)); + Status status = CompileToLocalExecutable( + ctx, function_, platform_info_, resources_, constants_, + /*lazy=*/!must_compile_, &client, &variables, &kernel, &executable); + if (must_compile_ || status.code() != error::UNIMPLEMENTED) { + OP_REQUIRES_OK(ctx, status); + } + + if (status.code() == error::UNIMPLEMENTED) { + LOG(WARNING) << "Compilation failed:" << status.ToString() + << ". Falling back to TF function call."; + executable = nullptr; + mutex_lock guard(cannot_compile_cluster_mu_); + cannot_compile_cluster_ = true; + } } AllocatorAttributes host_alloc_attrs; @@ -452,9 +470,8 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) { ctx->set_output(1, compilation_successful); } -XlaRunOp::XlaRunOp(OpKernelConstruction* ctx) : OpKernel(ctx) { - OP_REQUIRES_OK(ctx, PlatformInfoFromContext(ctx, &platform_info_)); -} +XlaRunOp::XlaRunOp(OpKernelConstruction* ctx) + : OpKernel(ctx), platform_info_(PlatformInfoFromContext(ctx)) {} void XlaRunOp::Compute(OpKernelContext* ctx) { VLOG(3) << "XlaRunOp " << def().name(); diff --git a/tensorflow/compiler/jit/kernels/xla_ops.h b/tensorflow/compiler/jit/kernels/xla_ops.h index ac90837e0d9..7b4d4b5b473 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.h +++ b/tensorflow/compiler/jit/kernels/xla_ops.h @@ -16,6 +16,8 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_JIT_KERNELS_XLA_OPS_H_ #define TENSORFLOW_COMPILER_JIT_KERNELS_XLA_OPS_H_ +#include + #include "tensorflow/compiler/jit/xla_compilation_cache.h" #include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/compiler/jit/xla_launch_util.h" @@ -33,6 +35,7 @@ namespace tensorflow { class XlaPlatformInfo { public: XlaPlatformInfo() : device_type_("") {} + XlaPlatformInfo(XlaPlatformInfo&&) = default; explicit XlaPlatformInfo(const DeviceType device_type, se::Platform::Id platform_id, const XlaDevice::Metadata* xla_device_metadata, @@ -110,12 +113,12 @@ class XlaLocalLaunchBase : public OpKernel { protected: // Indexes of compile-time constant inputs - std::vector constants_; + const std::vector constants_; // Indexes of resource inputs - std::vector resources_; + const std::vector resources_; - NameAttrList function_; - XlaPlatformInfo platform_info_; + const NameAttrList function_; + const XlaPlatformInfo platform_info_; }; // XlaLocalLaunchOp is used to replace a region of the TensorFlow graph @@ -144,15 +147,23 @@ class XlaCompileOp : public OpKernel { private: // Indexes of compile-time constant inputs - std::vector constants_; + const std::vector constants_; // Indexes of resource inputs - std::vector resources_; + const std::vector resources_; - NameAttrList function_; + const NameAttrList function_; XlaPlatformInfo platform_info_; - bool must_compile_; + const bool must_compile_; + + // cannot_compile_cluster_ is set to true if XLA returns an Unimplemented + // error when compiling the cluster this _XlaCompile is supposed to compile. + // If `cannot_compile_cluster_` is true then we avoid compiling this cluster + // on any future calls to _XlaCompile. 
+ bool cannot_compile_cluster_ GUARDED_BY(cannot_compile_cluster_mu_) = false; + + mutex cannot_compile_cluster_mu_; }; class XlaRunOp : public OpKernel { @@ -162,7 +173,7 @@ class XlaRunOp : public OpKernel { void Compute(OpKernelContext* ctx) override; private: - XlaPlatformInfo platform_info_; + const XlaPlatformInfo platform_info_; }; } // namespace tensorflow From 7fe4876d7019db727459d8e303906cb4ac56643a Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Fri, 26 Oct 2018 09:36:29 -0700 Subject: [PATCH 105/540] Add runner_threadpool_size into IteratorContext --- tensorflow/core/framework/dataset.h | 10 ++++++- .../experimental/threadpool_dataset_op.cc | 3 +++ tensorflow/core/kernels/data/iterator_ops.cc | 27 +++++++++++++++++++ .../kernels/data/parallel_map_iterator.cc | 4 +-- 4 files changed, 40 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h index b4cd2751319..ffd6b620258 100644 --- a/tensorflow/core/framework/dataset.h +++ b/tensorflow/core/framework/dataset.h @@ -279,12 +279,15 @@ class IteratorContext { lib(ctx->lib()), model(ctx->model()), runner(*(ctx->runner())), + runner_threadpool_size(ctx->runner_threadpool_size()), stats_aggregator(ctx->stats_aggregator()) {} explicit Params(OpKernelContext* ctx) : env(ctx->env()), lib(ctx->function_library()), - runner(*(ctx->runner())) { + runner(*(ctx->runner())), + runner_threadpool_size( + ctx->device()->tensorflow_cpu_worker_threads()->num_threads) { // NOTE: need reinterpret_cast because function.h forward-declares Device. DeviceBase* device = reinterpret_cast(ctx->function_library()->device()); @@ -311,6 +314,9 @@ class IteratorContext { // Function call support. std::function)> runner = nullptr; + // Number of threads used for executing user-defined functions. + int32 runner_threadpool_size = 0; + // The `StatsAggregator` object to record statistics about the iterator. 
std::shared_ptr stats_aggregator = nullptr; }; @@ -343,6 +349,8 @@ class IteratorContext { return ¶ms_.runner; } + int32 runner_threadpool_size() { return params_.runner_threadpool_size; } + std::shared_ptr stats_aggregator() { return params_.stats_aggregator; } diff --git a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc index 56fbbde1a3a..ab21dfc6bc5 100644 --- a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc @@ -47,6 +47,8 @@ class ThreadPoolResource : public ResourceBase { } } + int32 NumThreads() { return thread_pool_.NumThreads(); } + string DebugString() override { return "ThreadPoolResource"; } private: @@ -196,6 +198,7 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel { params.runner = [pool](std::function c) { pool->Schedule(std::move(c)); }; + params.runner_threadpool_size = pool->NumThreads(); IteratorContext iter_ctx(params); return input_impl_->GetNext(&iter_ctx, out_tensors, end_of_sequence); } diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc index 445718ba1e5..bb6b5fba06a 100644 --- a/tensorflow/core/kernels/data/iterator_ops.cc +++ b/tensorflow/core/kernels/data/iterator_ops.cc @@ -145,6 +145,8 @@ class IteratorResource : public ResourceBase { params.allocator_getter = [device](AllocatorAttributes attrs) { return device->GetAllocator(attrs); }; + params.runner_threadpool_size = + ctx->device()->tensorflow_cpu_worker_threads()->num_threads; IteratorContext iter_ctx(std::move(params)); TF_RETURN_IF_ERROR(captured_iterator->Restore(&iter_ctx, reader)); mutex_lock l(mu_); @@ -978,8 +980,20 @@ void IteratorGetNextOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { IteratorContext::Params params(ctx); params.function_library = iterator->function_library(); +<<<<<<< HEAD Status s = 
iterator->GetNext(IteratorContext(std::move(params)), &components, &end_of_sequence); +======= + DeviceBase* device = ctx->function_library()->device(); + params.allocator_getter = [device](AllocatorAttributes attrs) { + return device->GetAllocator(attrs); + }; + params.runner_threadpool_size = + ctx->device()->tensorflow_cpu_worker_threads()->num_threads; + IteratorContext iter_ctx(std::move(params)); + + Status s = iterator->GetNext(&iter_ctx, &components, &end_of_sequence); +>>>>>>> Add runner_threadpool_size into IteratorContext // NOTE(mrry): We must unref the iterator before calling `done()`, to // avoid destruction races. iterator->Unref(); @@ -1007,8 +1021,21 @@ void IteratorGetNextSyncOp::Compute(OpKernelContext* ctx) { bool end_of_sequence = false; IteratorContext::Params params(ctx); params.function_library = iterator->function_library(); +<<<<<<< HEAD OP_REQUIRES_OK(ctx, iterator->GetNext(IteratorContext(std::move(params)), &components, &end_of_sequence)); +======= + DeviceBase* device = ctx->function_library()->device(); + params.allocator_getter = [device](AllocatorAttributes attrs) { + return device->GetAllocator(attrs); + }; + params.runner_threadpool_size = + ctx->device()->tensorflow_cpu_worker_threads()->num_threads; + IteratorContext iter_ctx(std::move(params)); + + OP_REQUIRES_OK(ctx, + iterator->GetNext(&iter_ctx, &components, &end_of_sequence)); +>>>>>>> Add runner_threadpool_size into IteratorContext OP_REQUIRES(ctx, !end_of_sequence, errors::OutOfRange("End of sequence")); for (int i = 0; i < components.size(); ++i) { diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc index 1fa9a1fdc50..10103230950 100644 --- a/tensorflow/core/kernels/data/parallel_map_iterator.cc +++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc @@ -65,9 +65,7 @@ class ParallelMapIterator : public DatasetBaseIterator { Status Initialize(IteratorContext* ctx) override { mutex_lock l(*mu_); if 
(num_parallel_calls_->value == kAutoTune) { - // TODO(jsimsa): Surface the number of threads used by `ctx->runner()` and - // use it here for the default. - num_parallel_calls_->value = port::NumSchedulableCPUs(); + num_parallel_calls_->value = ctx->runner_threadpool_size(); num_parallel_calls_->tunable = true; } TF_RETURN_IF_ERROR( From 8f9a977a60baa2aeb101d71448f9b87f0f3f5a37 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Mon, 5 Nov 2018 13:13:56 -0800 Subject: [PATCH 106/540] Resolve the code conflicts --- tensorflow/core/kernels/data/iterator_ops.cc | 27 +------------------- 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc index bb6b5fba06a..92bbfdbea5c 100644 --- a/tensorflow/core/kernels/data/iterator_ops.cc +++ b/tensorflow/core/kernels/data/iterator_ops.cc @@ -980,20 +980,8 @@ void IteratorGetNextOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { IteratorContext::Params params(ctx); params.function_library = iterator->function_library(); -<<<<<<< HEAD Status s = iterator->GetNext(IteratorContext(std::move(params)), &components, &end_of_sequence); -======= - DeviceBase* device = ctx->function_library()->device(); - params.allocator_getter = [device](AllocatorAttributes attrs) { - return device->GetAllocator(attrs); - }; - params.runner_threadpool_size = - ctx->device()->tensorflow_cpu_worker_threads()->num_threads; - IteratorContext iter_ctx(std::move(params)); - - Status s = iterator->GetNext(&iter_ctx, &components, &end_of_sequence); ->>>>>>> Add runner_threadpool_size into IteratorContext // NOTE(mrry): We must unref the iterator before calling `done()`, to // avoid destruction races. 
iterator->Unref(); @@ -1021,21 +1009,9 @@ void IteratorGetNextSyncOp::Compute(OpKernelContext* ctx) { bool end_of_sequence = false; IteratorContext::Params params(ctx); params.function_library = iterator->function_library(); -<<<<<<< HEAD + OP_REQUIRES_OK(ctx, iterator->GetNext(IteratorContext(std::move(params)), &components, &end_of_sequence)); -======= - DeviceBase* device = ctx->function_library()->device(); - params.allocator_getter = [device](AllocatorAttributes attrs) { - return device->GetAllocator(attrs); - }; - params.runner_threadpool_size = - ctx->device()->tensorflow_cpu_worker_threads()->num_threads; - IteratorContext iter_ctx(std::move(params)); - - OP_REQUIRES_OK(ctx, - iterator->GetNext(&iter_ctx, &components, &end_of_sequence)); ->>>>>>> Add runner_threadpool_size into IteratorContext OP_REQUIRES(ctx, !end_of_sequence, errors::OutOfRange("End of sequence")); for (int i = 0; i < components.size(); ++i) { @@ -1236,7 +1212,6 @@ class DeserializeIteratorOp : public OpKernel { } }; - REGISTER_KERNEL_BUILDER(Name("Iterator").Device(DEVICE_CPU), IteratorHandleOp); REGISTER_KERNEL_BUILDER(Name("IteratorV2").Device(DEVICE_CPU), IteratorHandleOp); From 6a4700942783ab5b82692209b3fc0010fcd09530 Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Mon, 5 Nov 2018 13:15:30 -0800 Subject: [PATCH 107/540] Internal changes PiperOrigin-RevId: 220159416 --- tensorflow/python/kernel_tests/BUILD | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index 81980b95f4e..e6508fde0f6 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -2562,6 +2562,8 @@ cuda_py_test( ], shard_count = 4, tags = [ + # TODO(b/118887316): Re-enable this test in Kokoro. 
+ "no_oss", "optonly", # times out ], ) From f7e83b595d8852a3081c1b230c7838ecec238ed3 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Mon, 5 Nov 2018 13:31:36 -0800 Subject: [PATCH 108/540] Avoid unnecessarily lowering __self__ to the type object in getmethodclass, because it appears to be unnecessary. PiperOrigin-RevId: 220162299 --- .../python/autograph/pyct/inspect_utils.py | 5 +---- .../python/autograph/pyct/inspect_utils_test.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/tensorflow/python/autograph/pyct/inspect_utils.py b/tensorflow/python/autograph/pyct/inspect_utils.py index e078cd56a21..88f2d7a0564 100644 --- a/tensorflow/python/autograph/pyct/inspect_utils.py +++ b/tensorflow/python/autograph/pyct/inspect_utils.py @@ -185,12 +185,9 @@ def getmethodclass(m): return m.__class__ # Instance method and class methods: should be bound to a non-null "self". - # If self is a class, then it's a class method. if hasattr(m, '__self__'): if m.__self__: - if tf_inspect.isclass(m.__self__): - return m.__self__ - return type(m.__self__) + return m.__self__ # Class, static and unbound methods: search all defined classes in any # namespace. This is inefficient but more robust method. 
diff --git a/tensorflow/python/autograph/pyct/inspect_utils_test.py b/tensorflow/python/autograph/pyct/inspect_utils_test.py index 7e79b3b9f68..51116b6cac7 100644 --- a/tensorflow/python/autograph/pyct/inspect_utils_test.py +++ b/tensorflow/python/autograph/pyct/inspect_utils_test.py @@ -184,16 +184,16 @@ class InspectUtilsTest(test.TestCase): test_obj = TestClass() self.assertEqual( inspect_utils.getmethodclass(test_obj.member_function), - TestClass) + test_obj) self.assertEqual( inspect_utils.getmethodclass(test_obj.decorated_member), - TestClass) + test_obj) self.assertEqual( inspect_utils.getmethodclass(test_obj.fn_decorated_member), - TestClass) + test_obj) self.assertEqual( inspect_utils.getmethodclass(test_obj.wrap_decorated_member), - TestClass) + test_obj) self.assertEqual( inspect_utils.getmethodclass(test_obj.static_method), TestClass) @@ -242,16 +242,16 @@ class InspectUtilsTest(test.TestCase): test_obj = LocalClass() self.assertEqual( inspect_utils.getmethodclass(test_obj.member_function), - LocalClass) + test_obj) self.assertEqual( inspect_utils.getmethodclass(test_obj.decorated_member), - LocalClass) + test_obj) self.assertEqual( inspect_utils.getmethodclass(test_obj.fn_decorated_member), - LocalClass) + test_obj) self.assertEqual( inspect_utils.getmethodclass(test_obj.wrap_decorated_member), - LocalClass) + test_obj) def test_getmethodclass_callables(self): class TestCallable(object): From 5fb8c844512f50ec58fe3b856c8718816d7a504a Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Mon, 5 Nov 2018 13:32:01 -0800 Subject: [PATCH 109/540] Internal changes PiperOrigin-RevId: 220162374 --- tensorflow/contrib/saved_model/BUILD | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD index 291ff83791c..4e1af191c99 100644 --- a/tensorflow/contrib/saved_model/BUILD +++ b/tensorflow/contrib/saved_model/BUILD @@ -82,7 +82,12 @@ py_library( name = "keras_saved_model", srcs = 
["python/saved_model/keras_saved_model.py"], srcs_version = "PY2AND3", - tags = ["no_windows"], + tags = [ + "no_windows", + # TODO(b/119022845): Re-enable this test in TAP. + "manual", + "notap", + ], visibility = ["//visibility:public"], deps = [ "//tensorflow/python:array_ops", From 02da09f35209bc526720d94444ddf1c7fb9dd020 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 5 Nov 2018 13:33:04 -0800 Subject: [PATCH 110/540] Changing verbose kwarg in convert() to be an enum for different levels of error reporting verbosity. Support for less verbose levels coming soon in later CLs. PiperOrigin-RevId: 220162577 --- tensorflow/python/autograph/__init__.py | 2 ++ tensorflow/python/autograph/core/converter.py | 18 +++++++++++++++--- tensorflow/python/autograph/impl/api.py | 15 ++++++++------- tensorflow/python/autograph/impl/conversion.py | 4 ++-- 4 files changed, 27 insertions(+), 12 deletions(-) diff --git a/tensorflow/python/autograph/__init__.py b/tensorflow/python/autograph/__init__.py index fd9e60bea75..7252e0d9bf9 100644 --- a/tensorflow/python/autograph/__init__.py +++ b/tensorflow/python/autograph/__init__.py @@ -26,6 +26,7 @@ from tensorflow.python.autograph import operators from tensorflow.python.autograph import utils from tensorflow.python.autograph.core.converter import ConversionOptions from tensorflow.python.autograph.core.converter import Feature +from tensorflow.python.autograph.core.converter import Verbosity from tensorflow.python.autograph.core.errors import GraphConstructionError from tensorflow.python.autograph.core.errors import improved_errors from tensorflow.python.autograph.core.errors import TfRuntimeError @@ -58,6 +59,7 @@ _allowed_symbols = [ 'improved_errors', 'GraphConstructionError', 'TfRuntimeError', + 'Verbosity', # Python language "extensions" 'set_element_type', 'set_loop_options', diff --git a/tensorflow/python/autograph/core/converter.py b/tensorflow/python/autograph/core/converter.py index 59b9ebb5918..bc366123096 
100644 --- a/tensorflow/python/autograph/core/converter.py +++ b/tensorflow/python/autograph/core/converter.py @@ -64,6 +64,7 @@ from __future__ import division from __future__ import print_function from enum import Enum +from enum import IntEnum from tensorflow.python.autograph.core import config from tensorflow.python.autograph.core import naming @@ -89,6 +90,17 @@ from tensorflow.python.autograph.pyct.static_analysis import type_info # TODO(mdan): Add a test specific to this converter. +class Verbosity(IntEnum): + """Different levels of verbosity for printing errors. + + Attributes: + * BRIEF: No logging, minimal error messages. + * VERBOSE: Detailed logging of generated code, detailed error messages. + """ + BRIEF = 0 + VERBOSE = 1 + + class Feature(Enum): """Constants to use when selecting AutoGraph features.""" @@ -111,7 +123,7 @@ class ConversionOptions(object): Attributes: recursive: bool, whether to recursively convert any user functions or classes that the converted function may use. - verbose: bool, whether to log the converted code. + verbose: Verbosity, the level of verbosity to use. strip_decorators: Tuple[Callable], contains decorators that should be in excluded from the compiled output. 
By default, when converting a function before the decorators are applied, the compiled output will include those @@ -126,7 +138,7 @@ class ConversionOptions(object): def __init__(self, recursive=False, - verbose=False, + verbose=Verbosity.VERBOSE, strip_decorators=None, force_conversion=False, internal_convert_user_code=True, @@ -197,7 +209,7 @@ class ConversionOptions(object): constructor_name=parser.parse_expression( as_qualified_name(ConversionOptions)), recursive_val=parser.parse_expression(str(self.recursive)), - verbose_val=parser.parse_expression(str(self.verbose)), + verbose_val=parser.parse_expression(str(int(self.verbose))), strip_decorators_val=list_of_names(self.strip_decorators), force_conversion_val=parser.parse_expression( str(self.force_conversion)), diff --git a/tensorflow/python/autograph/impl/api.py b/tensorflow/python/autograph/impl/api.py index e0e07c6d5f5..123d2897390 100644 --- a/tensorflow/python/autograph/impl/api.py +++ b/tensorflow/python/autograph/impl/api.py @@ -47,7 +47,9 @@ from tensorflow.python.util import tf_inspect # TODO(mdan): This should behave like to_graph (e.g. convert statically). -def convert(recursive=False, verbose=False): +# TODO(znado): Make an alias so can write Verbosity directly without needing +# to write converter. +def convert(recursive=False, verbose=converter.Verbosity.VERBOSE): """Decorator that compiles a function to use TensorFlow ops. The decorator is dynamic - it recompiles the target whenever the decorated @@ -58,7 +60,7 @@ def convert(recursive=False, verbose=False): Args: recursive: bool, whether to recursively convert any functions or classes that the converted function may use. - verbose: bool, whether to output the compiled code in the logs. + verbose: converter.Verbosity, the level of verbosity. 
Returns: Callable, a decorator that converts the given function into an equivalent @@ -92,8 +94,7 @@ def convert(recursive=False, verbose=False): class RunMode(Enum): """Specifies the way a converted function or method should be executed in TF. - The enum values have the following semantics: - + Attributes: * GRAPH: Call this function directly, as-is. This is suitable for functions that were already designed for TF graphs and contain ops. * PY_FUNC: Wrap this function into a py_func op. This is suitable for code @@ -153,7 +154,7 @@ def do_not_convert(run_as=RunMode.GRAPH, return_dtypes=None): # TODO(mdan): Move to a private, undocumented module. def converted_call(f, owner, options, *args, **kwargs): """Compiles a function call inline. For internal use only.""" - if options.verbose: + if options.verbose >= converter.Verbosity.VERBOSE: logging.info('Converted call: {}; owner: {}'.format(f, owner)) if owner is not None: @@ -283,7 +284,7 @@ def _is_not_callable(obj): # TODO(mdan): Remove partial_types. def to_graph(e, recursive=True, - verbose=False, + verbose=converter.Verbosity.VERBOSE, arg_values=None, arg_types=None, partial_types=None, @@ -301,7 +302,7 @@ def to_graph(e, e: Union[Callable, Type], the Python entity to convert. recursive: bool, whether to recursively convert any functions that the converted function may call. - verbose: bool, whether to output the compiled code in the logs. + verbose: converter.Verbosity, the level of printing verbosity to use. arg_values: Optional[Dict[Text, Any]], value hints for symbols including function arguments. 
arg_types: Optional[Dict[Text, Type]], type hints for symbols including diff --git a/tensorflow/python/autograph/impl/conversion.py b/tensorflow/python/autograph/impl/conversion.py index ee09b2718ea..74cf3e61451 100644 --- a/tensorflow/python/autograph/impl/conversion.py +++ b/tensorflow/python/autograph/impl/conversion.py @@ -108,7 +108,7 @@ def entity_to_graph(o, program_ctx, arg_values, arg_types): Raises: ValueError: if the entity type is not supported. """ - if program_ctx.options.verbose: + if program_ctx.options.verbose == converter.Verbosity.VERBOSE: logging.info('Converting {}'.format(o)) if tf_inspect.isclass(o): @@ -151,7 +151,7 @@ def entity_to_graph(o, program_ctx, arg_values, arg_types): program_ctx.add_to_cache(o, node) - if program_ctx.options.verbose: + if program_ctx.options.verbose == converter.Verbosity.VERBOSE: logging.info('Compiled output of {}:\n\n{}\n'.format( o, compiler.ast_to_source(node))) From 172b5e3ca1efb5e6e7701cf5e2b3bb4f6aed7824 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Mon, 5 Nov 2018 13:56:18 -0800 Subject: [PATCH 111/540] Rename tf.colocate_with to tf.compat.v1.colocate_with in upgrade script. tf.colocate_with is no longer supported as public API in v2. In the upgrade script convert it to tf.compat.v1.colocate_with. PiperOrigin-RevId: 220166863 --- tensorflow/tools/compatibility/tf_upgrade_v2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2.py b/tensorflow/tools/compatibility/tf_upgrade_v2.py index c11fcdb96d8..dda45468fcd 100644 --- a/tensorflow/tools/compatibility/tf_upgrade_v2.py +++ b/tensorflow/tools/compatibility/tf_upgrade_v2.py @@ -71,6 +71,7 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec): "tf.train.start_queue_runners": "tf.compat.v1.start_queue_runners", }) # pylint: enable=line-too-long + self.function_renames["tf.colocate_with"] = "tf.compat.v1.colocate_with" # TODO(amitpatankar): Fix the function rename script # to handle constants without hardcoding. 
From 62972e400ad0c15f8a69cec5290d922665b77218 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 5 Nov 2018 14:01:30 -0800 Subject: [PATCH 112/540] Enabling AutoGraph error rewriting for classes. PiperOrigin-RevId: 220167715 --- .../python/autograph/impl/conversion.py | 14 +++++-------- tensorflow/python/autograph/pyct/compiler.py | 21 +++++-------------- 2 files changed, 10 insertions(+), 25 deletions(-) diff --git a/tensorflow/python/autograph/impl/conversion.py b/tensorflow/python/autograph/impl/conversion.py index 74cf3e61451..197bd5a3e76 100644 --- a/tensorflow/python/autograph/impl/conversion.py +++ b/tensorflow/python/autograph/impl/conversion.py @@ -192,8 +192,7 @@ def class_to_graph(c, program_ctx): program_ctx=program_ctx, arg_values={}, arg_types={'self': (c.__name__, c)}, - owner_type=c, - rewrite_errors=False) + owner_type=c) if class_namespace is None: class_namespace = namespace else: @@ -282,8 +281,7 @@ def function_to_graph(f, program_ctx, arg_values, arg_types, - owner_type=None, - rewrite_errors=True): + owner_type=None): """Specialization of `entity_to_graph` for callable functions.""" node, source = parser.parse_entity(f) @@ -302,7 +300,7 @@ def function_to_graph(f, arg_types=arg_types, owner_type=owner_type) context = converter.EntityContext(namer, entity_info, program_ctx) - node = node_to_graph(node, context, rewrite_errors=rewrite_errors) + node = node_to_graph(node, context) # TODO(mdan): This somewhat duplicates the call rename logic in call_trees.py new_name, did_rename = namer.compiled_function_name(f.__name__, f, owner_type) @@ -318,13 +316,12 @@ def function_to_graph(f, return [node], new_name, namespace -def node_to_graph(node, context, rewrite_errors=True): +def node_to_graph(node, context): """Convert Python code to equivalent TF graph mode code. Args: node: AST, the code to convert. context: converter.EntityContext - rewrite_errors: Boolean, whether or not to rewrite the error traceback. 
Returns: A tuple (node, deps): @@ -362,6 +359,5 @@ def node_to_graph(node, context, rewrite_errors=True): if context.program.options.uses(converter.Feature.AUTO_CONTROL_DEPS): node = converter.apply_(node, context, side_effect_guards) node = converter.apply_(node, context, function_scopes) - if rewrite_errors: - node = converter.apply_(node, context, error_handlers) + node = converter.apply_(node, context, error_handlers) return node diff --git a/tensorflow/python/autograph/pyct/compiler.py b/tensorflow/python/autograph/pyct/compiler.py index 21281aeb561..06e66c5b587 100644 --- a/tensorflow/python/autograph/pyct/compiler.py +++ b/tensorflow/python/autograph/pyct/compiler.py @@ -123,26 +123,15 @@ def ast_to_object(nodes, compiled_nodes = imp.load_source(module_name, f.name) # TODO(znado): Clean this up so we don't need to attach it to the namespace. - # TODO(znado): This does not work for classes because their methods share a - # namespace. - # This attaches the source map which is needed for error handling. Note that - # api.to_graph copies this source map into an attribute of the function. - # - # We need this so the ag_source_map__ variable is available to the call to - # rewrite_graph_construction_error in the except block inside each function - # that handles graph construction errors. - # # We cannot get the rewritten function name until it is too late so templating - # is hard, and this cleanly fixes the - # issues encountered with nested functions because this is attached to the - # outermost one. + # is hard, and this cleanly fixes the issues encountered with nested functions + # because this is attached to the outermost one. if include_source_map: # TODO(mdan): This name should be decided by the caller. source_map_name = 'ag_source_map__' - if source_map_name in compiled_nodes.__dict__: - raise ValueError('cannot convert %s because is has namespace attribute ' - '"%s", which is reserved for AutoGraph.' 
% - (compiled_nodes, source_map_name)) + assert source_map_name not in compiled_nodes.__dict__, ( + 'cannot convert %s because is has namespace attribute "%s", which is ' + 'reserved for AutoGraph.') % (compiled_nodes, source_map_name) compiled_nodes.__dict__[source_map_name] = source_map return compiled_nodes, source From 99b0480436f1c3c8889f42efca3c24eaf7c9d53c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 5 Nov 2018 14:13:24 -0800 Subject: [PATCH 113/540] Fix metric names changed after switching to tf.div_no_nan. PiperOrigin-RevId: 220170102 --- tensorflow/python/ops/metrics_impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py index d0919bdbe46..e86a3b85360 100644 --- a/tensorflow/python/ops/metrics_impl.py +++ b/tensorflow/python/ops/metrics_impl.py @@ -225,7 +225,7 @@ def _safe_div(numerator, denominator, name): 0 if `denominator` <= 0, else `numerator` / `denominator` """ if compat.forward_compatible(2018, 11, 1): - return math_ops.div_no_nan(numerator, denominator) + return math_ops.div_no_nan(numerator, denominator, name=name) t = math_ops.truediv(numerator, denominator) zero = array_ops.zeros_like(t, dtype=denominator.dtype) condition = math_ops.greater(denominator, zero) From 588b00bc1aeb9b41e0066e09b5a7bab6c12c5a8c Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Mon, 5 Nov 2018 14:21:20 -0800 Subject: [PATCH 114/540] Replace `OpKernelContext::wrapped_allocators()` with `ConsumeWrappedAllocators()`. This change makes the ownership transfer from `OpKernelContext` the stats collection machinery explicit, and should prevent memory leaks in future. It also logs a warning if the wrapped allocators are not consumed by a stats collector, which indicates that the code is taking a more expensive allocation path for no purpose. 
PiperOrigin-RevId: 220171712 --- .../core/common_runtime/eager/kernel_and_device.cc | 2 +- tensorflow/core/common_runtime/step_stats_collector.cc | 2 +- tensorflow/core/common_runtime/step_stats_collector.h | 3 +-- tensorflow/core/framework/op_kernel.cc | 9 ++++++++- tensorflow/core/framework/op_kernel.h | 5 +++-- tensorflow/core/kernels/constant_op_test.cc | 2 +- tensorflow/core/kernels/data/captured_function.cc | 7 +------ 7 files changed, 16 insertions(+), 14 deletions(-) diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc index e1ff45d6dd0..ac9fd187b34 100644 --- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc +++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc @@ -131,7 +131,7 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container, outputs->push_back(Tensor(*context.mutable_output(i))); } if (stats != nullptr) { - for (const auto& allocator_pair : context.wrapped_allocators()) { + for (const auto& allocator_pair : context.ConsumeWrappedAllocators()) { AllocatorMemoryUsed* memory = stats->add_memory(); memory->set_allocator_name(allocator_pair.first->Name()); auto sizes = allocator_pair.second->GetSizes(); diff --git a/tensorflow/core/common_runtime/step_stats_collector.cc b/tensorflow/core/common_runtime/step_stats_collector.cc index a70ab93d4ad..49265445659 100644 --- a/tensorflow/core/common_runtime/step_stats_collector.cc +++ b/tensorflow/core/common_runtime/step_stats_collector.cc @@ -139,7 +139,7 @@ void NodeExecStatsWrapper::SetScheduled(int64 nanos) { } void NodeExecStatsWrapper::SetMemory(OpKernelContext* ctx) { - for (const auto& allocator_pair : ctx->wrapped_allocators()) { + for (const auto& allocator_pair : ctx->ConsumeWrappedAllocators()) { AddAllocation(allocator_pair.first, allocator_pair.second); } auto* ms = stats_->mutable_memory_stats(); diff --git a/tensorflow/core/common_runtime/step_stats_collector.h 
b/tensorflow/core/common_runtime/step_stats_collector.h index 4a1bb44f4b8..7d34383ce82 100644 --- a/tensorflow/core/common_runtime/step_stats_collector.h +++ b/tensorflow/core/common_runtime/step_stats_collector.h @@ -74,8 +74,7 @@ class NodeExecStatsInterface { // Records information about the memory allocated during the execution of this // node. // - // Takes ownership of the `TrackingAllocator` objects in - // `ctx->wrapped_allocators()`. + // Takes ownership of any `TrackingAllocator` objects stored in `ctx`. virtual void SetMemory(OpKernelContext* ctx) = 0; // Records information about the tensor produced by this node at the given diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc index 1eb12d3f953..5f08c130871 100644 --- a/tensorflow/core/framework/op_kernel.cc +++ b/tensorflow/core/framework/op_kernel.cc @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/core/framework/op_kernel.h" -#include +#include // NOLINT #include #include #include @@ -286,6 +286,13 @@ OpKernelContext::~OpKernelContext() { } } if (params_->record_tensor_accesses) referenced_tensors_.Destroy(); + if (params_->track_allocations && !wrapped_allocators_.empty()) { + LOG(WARNING) << "OpKernelContext is tracking allocations but they are not " + << "being consumed by the StepStatsCollector."; + for (auto& wrapped_alloator : wrapped_allocators_) { + wrapped_alloator.second->GetRecordsAndUnRef(); + } + } } Allocator* OpKernelContext::get_allocator(AllocatorAttributes attr) { diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h index 6c71e118c02..165115aab32 100644 --- a/tensorflow/core/framework/op_kernel.h +++ b/tensorflow/core/framework/op_kernel.h @@ -982,9 +982,10 @@ class OpKernelContext { return params_->output_attr_array[index]; } - gtl::InlinedVector wrapped_allocators() const { + gtl::InlinedVector ConsumeWrappedAllocators() { mutex_lock lock(mu_); - gtl::InlinedVector retrieved = 
wrapped_allocators_; + gtl::InlinedVector retrieved; + retrieved.swap(wrapped_allocators_); return retrieved; } diff --git a/tensorflow/core/kernels/constant_op_test.cc b/tensorflow/core/kernels/constant_op_test.cc index 0faad11e472..3988c190e70 100644 --- a/tensorflow/core/kernels/constant_op_test.cc +++ b/tensorflow/core/kernels/constant_op_test.cc @@ -79,7 +79,7 @@ void ConstantOpTest::PersistentMemoryTrackingTest(bool on_gpu) { } // Remove memory leak errors. - for (auto allocator_pair : ctx.wrapped_allocators()) { + for (auto allocator_pair : ctx.ConsumeWrappedAllocators()) { allocator_pair.second->GetRecordsAndUnRef(); } } diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc index d36eec0646b..64834e507f2 100644 --- a/tensorflow/core/kernels/data/captured_function.cc +++ b/tensorflow/core/kernels/data/captured_function.cc @@ -79,12 +79,7 @@ class SimpleStepStatsCollector : public StepStatsCollectorInterface { bool TrackAllocations() const override { return false; } - void SetMemory(OpKernelContext* ctx) override { - // Returning `false` from `TrackAllocations()` should prevent - // `TrackingAllocator` objects from being constructed. - DCHECK_EQ(0, ctx->wrapped_allocators().size()) - << "Allocations were tracked but should not have been requested."; - } + void SetMemory(OpKernelContext* ctx) override {} void SetOutput(int slot, const Tensor* tensor) override {} From b1b71921ffe3d0a08c1e35ed89055d38207e91a2 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 5 Nov 2018 14:23:18 -0800 Subject: [PATCH 115/540] [TF:XLA] Remove dead function AcceptOrdered from HloInstruction. 
PiperOrigin-RevId: 220172092 --- .../compiler/xla/service/hlo_instruction.cc | 40 ------------------- .../compiler/xla/service/hlo_instruction.h | 10 ----- 2 files changed, 50 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index f6ed86b4165..ada536770ed 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -2639,46 +2639,6 @@ Status HloInstruction::Accept( return this->Accept(&visitor); } -Status HloInstruction::AcceptOrdered( - DfsHloVisitor* visitor, const std::vector& order) { - VLOG(2) << "HloInstruction::AcceptOrdered(%" << name() << ")"; - TF_RET_CHECK(OrderIsTopologicalSort(order)); - - // Compute the predecessors of this instruction. - std::unordered_set predecessors; - TF_RETURN_IF_ERROR(this->Accept([&predecessors](HloInstruction* instruction) { - predecessors.insert(instruction); - return Status::OK(); - })); - - for (auto* const_instruction : order) { - if (!ContainsKey(predecessors, const_instruction)) { - // Instruction is not a predecessors of 'this'. - continue; - } - - // The visitor can mark instructions as visited to skip particular - // instructions. - if (visitor->DidVisit(*const_instruction)) { - VLOG(3) << "Not visiting HLO %" << const_instruction->name() - << " as it was already visited."; - continue; - } - - // TODO(b/78350259): Eliminate const laundering. 
- HloInstruction* instruction = - const_cast(const_instruction); - - TF_RETURN_IF_ERROR(visitor->Preprocess(instruction)); - VLOG(2) << "Visiting HLO %" << instruction->name(); - TF_RETURN_IF_ERROR(instruction->Visit(visitor)); - visitor->SetVisited(*instruction); - TF_RETURN_IF_ERROR(visitor->Postprocess(instruction)); - } - - return visitor->FinishVisit(this); -} - const Shape& HloInstruction::shape() const { return shape_; } diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 15a4da8dbe0..c6a938383ce 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -954,16 +954,6 @@ class HloInstruction { Status Accept( const std::function& visitor_func) const; - // Visits all instructions rooted at this instruction using the given visitor - // in the given order. 'order' must contain at least the set of instructions - // rooted at this node (ie, those accessible from a DFS traversal from this - // instruction). Instructions contained in 'order' which are not in the set of - // instructions rooted at this node are ignored. 'order' must also be a valid - // topological sort of these instructions (defs appear before uses) though - // need not be a DFS post-order. - Status AcceptOrdered(DfsHloVisitor* visitor, - const std::vector& order); - // Visit this instruction and only this instruction with the given visitor. 
template Status Visit(DfsHloVisitorBase* visitor); From 361093aea98d5fb0fe93711324580a8d55a6ca13 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Mon, 5 Nov 2018 14:34:48 -0800 Subject: [PATCH 116/540] Clean code and resolve the style issue --- tensorflow/core/kernels/data/iterator_ops.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc index 92bbfdbea5c..445718ba1e5 100644 --- a/tensorflow/core/kernels/data/iterator_ops.cc +++ b/tensorflow/core/kernels/data/iterator_ops.cc @@ -145,8 +145,6 @@ class IteratorResource : public ResourceBase { params.allocator_getter = [device](AllocatorAttributes attrs) { return device->GetAllocator(attrs); }; - params.runner_threadpool_size = - ctx->device()->tensorflow_cpu_worker_threads()->num_threads; IteratorContext iter_ctx(std::move(params)); TF_RETURN_IF_ERROR(captured_iterator->Restore(&iter_ctx, reader)); mutex_lock l(mu_); @@ -1009,7 +1007,6 @@ void IteratorGetNextSyncOp::Compute(OpKernelContext* ctx) { bool end_of_sequence = false; IteratorContext::Params params(ctx); params.function_library = iterator->function_library(); - OP_REQUIRES_OK(ctx, iterator->GetNext(IteratorContext(std::move(params)), &components, &end_of_sequence)); OP_REQUIRES(ctx, !end_of_sequence, errors::OutOfRange("End of sequence")); @@ -1212,6 +1209,7 @@ class DeserializeIteratorOp : public OpKernel { } }; + REGISTER_KERNEL_BUILDER(Name("Iterator").Device(DEVICE_CPU), IteratorHandleOp); REGISTER_KERNEL_BUILDER(Name("IteratorV2").Device(DEVICE_CPU), IteratorHandleOp); From ae24dac83cc31799c027d1402ac16a702438f01e Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Mon, 5 Nov 2018 14:33:15 -0800 Subject: [PATCH 117/540] [tf.data] Fix internal test failure. 
PiperOrigin-RevId: 220173933 --- tensorflow/core/framework/model.h | 17 +++++++---------- tensorflow/core/framework/model_test.cc | 25 ------------------------- tensorflow/python/debug/BUILD | 2 -- 3 files changed, 7 insertions(+), 37 deletions(-) diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h index 10ecdef5868..635a760b22a 100644 --- a/tensorflow/core/framework/model.h +++ b/tensorflow/core/framework/model.h @@ -108,8 +108,8 @@ class Node { using Factory = std::function(Args)>; - Node(Args args) - : id_(args.id), name_(args.name), output_(std::move(args.output)) {} + explicit Node(Args args) + : id_(args.id), name_(args.name), output_(args.output.get()) {} // Adds an input. void add_input(std::shared_ptr node) LOCKS_EXCLUDED(mu_) { @@ -142,7 +142,7 @@ class Node { } // Returns the node output. - std::shared_ptr output() const LOCKS_EXCLUDED(mu_) { + Node* output() const LOCKS_EXCLUDED(mu_) { tf_shared_lock l(mu_); return output_; } @@ -185,12 +185,6 @@ class Node { inputs_.remove(input); } - // Set the node output. - void set_output(std::shared_ptr output) LOCKS_EXCLUDED(mu_) { - mutex_lock l(mu_); - output_ = output; - } - // Collects tunable parameters in the subtree rooted in this node. void CollectTunableParameters( std::vector>* parameters) LOCKS_EXCLUDED(mu_) { @@ -287,7 +281,10 @@ class Node { std::map work_start_ GUARDED_BY(mu_); std::map> parameters_ GUARDED_BY(mu_); std::list> inputs_ GUARDED_BY(mu_); - std::shared_ptr output_ GUARDED_BY(mu_); + + // The reference to the output node is not owned so that that deletion of a + // node results in recursive deletion of the subtree rooted in the node. 
+ Node* output_ GUARDED_BY(mu_); }; // InterleaveMany is used to model datasets whose inputs are used to create diff --git a/tensorflow/core/framework/model_test.cc b/tensorflow/core/framework/model_test.cc index 02e27107fb0..53e35f25b28 100644 --- a/tensorflow/core/framework/model_test.cc +++ b/tensorflow/core/framework/model_test.cc @@ -101,15 +101,9 @@ TEST_P(AsyncKnownRatioTest, Model) { std::shared_ptr source1 = model::MakeSourceNode({1, "source1", async_known_many}); async_known_many->add_input(source1); - auto cleanup1 = gtl::MakeCleanup([async_known_many, source1]() { - async_known_many->remove_input(source1); - }); std::shared_ptr source2 = model::MakeSourceNode({2, "source2", async_known_many}); async_known_many->add_input(source2); - auto cleanup2 = gtl::MakeCleanup([async_known_many, source2]() { - async_known_many->remove_input(source2); - }); std::vector input_times(1, input_time); source1->add_processing_time(100); EXPECT_EQ(0, async_known_many->ProcessingTime()); @@ -166,19 +160,12 @@ TEST(InterleaveManyTest, Model) { std::shared_ptr meta_source = model::MakeSourceNode({1, "meta_source", interleave_many}); interleave_many->add_input(meta_source); - auto cleanup_meta = gtl::MakeCleanup([interleave_many, meta_source]() { - interleave_many->remove_input(meta_source); - }); std::shared_ptr source1 = model::MakeSourceNode({1, "source1", interleave_many}); interleave_many->add_input(source1); - auto cleanup1 = gtl::MakeCleanup( - [interleave_many, source1]() { interleave_many->remove_input(source1); }); std::shared_ptr source2 = model::MakeSourceNode({2, "source2", interleave_many}); interleave_many->add_input(source2); - auto cleanup2 = gtl::MakeCleanup( - [interleave_many, source2]() { interleave_many->remove_input(source2); }); std::vector input_times(1, 0); interleave_many->add_processing_time(100); EXPECT_EQ(100, interleave_many->processing_time()); @@ -210,13 +197,9 @@ TEST_P(KnownRatioTest, Model) { std::shared_ptr source1 = 
model::MakeSourceNode({1, "source1", known_many}); known_many->add_input(source1); - auto cleanup1 = gtl::MakeCleanup( - [known_many, source1]() { known_many->remove_input(source1); }); std::shared_ptr source2 = model::MakeSourceNode({2, "source2", known_many}); known_many->add_input(source2); - auto cleanup2 = gtl::MakeCleanup( - [known_many, source2]() { known_many->remove_input(source2); }); std::vector input_times(1, 0); source1->add_processing_time(100); EXPECT_EQ(0, known_many->ProcessingTime()); @@ -280,13 +263,9 @@ TEST(UnknownRatioTest, Model) { std::shared_ptr source1 = model::MakeSourceNode({1, "source1", unknown_many}); unknown_many->add_input(source1); - auto cleanup1 = gtl::MakeCleanup( - [unknown_many, source1]() { unknown_many->remove_input(source1); }); std::shared_ptr source2 = model::MakeSourceNode({2, "source2", unknown_many}); unknown_many->add_input(source2); - auto cleanup2 = gtl::MakeCleanup( - [unknown_many, source2]() { unknown_many->remove_input(source2); }); std::vector input_times(1, 0); unknown_many->add_processing_time(100); EXPECT_EQ(100, unknown_many->processing_time()); @@ -315,13 +294,9 @@ TEST(UnknownTest, Model) { std::shared_ptr source1 = model::MakeSourceNode({1, "source1", unknown}); unknown->add_input(source1); - auto cleanup1 = gtl::MakeCleanup( - [unknown, source1]() { unknown->remove_input(source1); }); std::shared_ptr source2 = model::MakeSourceNode({2, "source2", unknown}); unknown->add_input(source2); - auto cleanup2 = gtl::MakeCleanup( - [unknown, source2]() { unknown->remove_input(source2); }); std::vector input_times(1, 0); source1->add_processing_time(100); EXPECT_EQ(0, unknown->ProcessingTime()); diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD index a73a4323c3f..79951232097 100644 --- a/tensorflow/python/debug/BUILD +++ b/tensorflow/python/debug/BUILD @@ -1130,6 +1130,4 @@ sh_test( ":debug_tflearn_iris", ":offline_analyzer", ], - # TODO(b/119032933): Re-enable this test in ASAN. 
- tags = ["noasan"], ) From d67ea6e35f053314e6d4224c0fb7c39243787841 Mon Sep 17 00:00:00 2001 From: Alexandre Passos Date: Mon, 5 Nov 2018 14:45:06 -0800 Subject: [PATCH 118/540] Automated rollback of commit 1bf16beaa984958f96a83e51210d8f171bd0655e. Revert #21681. PiperOrigin-RevId: 220176153 --- tensorflow/core/kernels/BUILD | 14 +- tensorflow/core/kernels/crop_and_resize_op.cc | 90 +- .../crop_and_resize_op_benchmark_test.cc | 36 +- .../core/kernels/crop_resize_bilinear_core.h | 5603 ----------------- tensorflow/core/kernels/resize_bilinear_op.cc | 157 +- .../core/kernels/resize_bilinear_op_test.cc | 2 +- 6 files changed, 201 insertions(+), 5701 deletions(-) delete mode 100644 tensorflow/core/kernels/crop_resize_bilinear_core.h diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index dc1ce593f84..f61ee53a428 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -29,7 +29,6 @@ package_group( load( "//tensorflow:tensorflow.bzl", "if_android", - "if_linux_x86_64", "tf_cc_test", "tf_cc_test_mkl", "tf_cc_tests", @@ -579,12 +578,6 @@ cc_header_only_library( deps = [":image_resizer_state"], ) -cc_library( - name = "crop_resize_bilinear_core", - hdrs = ["crop_resize_bilinear_core.h"], - visibility = ["//visibility:private"], -) - # OpKernel libraries ---------------------------------------------------------- ARRAY_DEPS = [ @@ -2292,9 +2285,8 @@ tf_kernel_library( tf_kernel_library( name = "crop_and_resize_op", - copts = tf_copts() + if_linux_x86_64(["-finline-functions"]), prefix = "crop_and_resize_op", - deps = IMAGE_DEPS + [":crop_resize_bilinear_core"], + deps = IMAGE_DEPS, ) tf_kernel_library( @@ -2359,9 +2351,8 @@ tf_kernel_library( tf_kernel_library( name = "resize_bilinear_op", - copts = tf_copts() + if_linux_x86_64(["-finline-functions"]), prefix = "resize_bilinear_op", - deps = IMAGE_DEPS + [":crop_resize_bilinear_core"], + deps = IMAGE_DEPS, ) tf_kernel_library( @@ -5298,7 +5289,6 @@ filegroup( 
"control_flow_ops.h", "conv_2d.h", "conv_ops.h", - "crop_resize_bilinear_core.h", "data_format_ops.h", "depthtospace_op.h", "depthwise_conv_op.h", diff --git a/tensorflow/core/kernels/crop_and_resize_op.cc b/tensorflow/core/kernels/crop_and_resize_op.cc index 539779968ae..99d01b4db6b 100644 --- a/tensorflow/core/kernels/crop_and_resize_op.cc +++ b/tensorflow/core/kernels/crop_and_resize_op.cc @@ -28,7 +28,6 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/kernels/bounds_check.h" -#include "tensorflow/core/kernels/crop_resize_bilinear_core.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" @@ -229,56 +228,61 @@ struct CropAndResize { continue; } - if (method_name == "bilinear") { - std::vector xs; - std::vector ys; - int min_ix, max_ix, min_iy, max_iy; - compute_interpolation_weights(crop_width, image_width, x1, x2, - &min_ix, &max_ix, &xs); - compute_interpolation_weights(crop_height, image_height, y1, y2, - &min_iy, &max_iy, &ys); + const float height_scale = + (crop_height > 1) + ? (y2 - y1) * (image_height - 1) / (crop_height - 1) + : 0; + const float width_scale = + (crop_width > 1) ? (x2 - x1) * (image_width - 1) / (crop_width - 1) + : 0; - // multiply by depth to avoid multiplication in resize_single_image. - for (int i = min_ix; i <= max_ix; ++i) { - xs[i - min_ix].lower *= depth; - xs[i - min_ix].upper *= depth; + for (int y = 0; y < crop_height; ++y) { + const float in_y = (crop_height > 1) + ? 
y1 * (image_height - 1) + y * height_scale + : 0.5 * (y1 + y2) * (image_height - 1); + if (in_y < 0 || in_y > image_height - 1) { + for (int x = 0; x < crop_width; ++x) { + for (int d = 0; d < depth; ++d) { + crops(b, y, x, d) = extrapolation_value; + } + } + continue; } + if (method_name == "bilinear") { + const int top_y_index = floorf(in_y); + const int bottom_y_index = ceilf(in_y); + const float y_lerp = in_y - top_y_index; - crop_resize_single_image_common( - image.data() + static_cast(b_in) * - static_cast(image_height) * - static_cast(image_width) * - static_cast(depth), - image_height, image_width, crop_height, crop_width, depth, min_ix, - max_ix, xs.data(), min_iy, max_iy, ys.data(), extrapolation_value, - false, false, - crops.data() + static_cast(b) * - static_cast(crop_height) * - static_cast(crop_width) * - static_cast(depth)); - // xs and ys are deallocated automatically when they go out of scope - } else { // method == "nearest" - const float height_scale = - (crop_height > 1) - ? (y2 - y1) * (image_height - 1) / (crop_height - 1) - : 0; - const float width_scale = - (crop_width > 1) - ? (x2 - x1) * (image_width - 1) / (crop_width - 1) - : 0; - - for (int y = 0; y < crop_height; ++y) { - const float in_y = (crop_height > 1) - ? y1 * (image_height - 1) + y * height_scale - : 0.5 * (y1 + y2) * (image_height - 1); - if (in_y < 0 || in_y > image_height - 1) { - for (int x = 0; x < crop_width; ++x) { + for (int x = 0; x < crop_width; ++x) { + const float in_x = (crop_width > 1) + ? 
x1 * (image_width - 1) + x * width_scale + : 0.5 * (x1 + x2) * (image_width - 1); + if (in_x < 0 || in_x > image_width - 1) { for (int d = 0; d < depth; ++d) { crops(b, y, x, d) = extrapolation_value; } + continue; + } + const int left_x_index = floorf(in_x); + const int right_x_index = ceilf(in_x); + const float x_lerp = in_x - left_x_index; + + for (int d = 0; d < depth; ++d) { + const float top_left(static_cast( + image(b_in, top_y_index, left_x_index, d))); + const float top_right(static_cast( + image(b_in, top_y_index, right_x_index, d))); + const float bottom_left(static_cast( + image(b_in, bottom_y_index, left_x_index, d))); + const float bottom_right(static_cast( + image(b_in, bottom_y_index, right_x_index, d))); + const float top = top_left + (top_right - top_left) * x_lerp; + const float bottom = + bottom_left + (bottom_right - bottom_left) * x_lerp; + crops(b, y, x, d) = top + (bottom - top) * y_lerp; } - continue; } + } else { // method == "nearest" for (int x = 0; x < crop_width; ++x) { const float in_x = (crop_width > 1) ? x1 * (image_width - 1) + x * width_scale diff --git a/tensorflow/core/kernels/crop_and_resize_op_benchmark_test.cc b/tensorflow/core/kernels/crop_and_resize_op_benchmark_test.cc index 54d4f33b446..d7ca64bea05 100644 --- a/tensorflow/core/kernels/crop_and_resize_op_benchmark_test.cc +++ b/tensorflow/core/kernels/crop_and_resize_op_benchmark_test.cc @@ -21,13 +21,11 @@ limitations under the License. 
namespace tensorflow { -template static Graph* BM_CropAndResize(int batches, int width, int height, int depth, int crop_height, int crop_width) { Graph* g = new Graph(OpRegistry::Global()); - Tensor in(DataTypeToEnum::v(), - TensorShape({batches, height, width, depth})); - in.flat().setRandom(); + Tensor in(DT_FLOAT, TensorShape({batches, height, width, depth})); + in.flat().setRandom(); Tensor boxes(DT_FLOAT, TensorShape({batches, 4})); auto boxes_tensor = boxes.matrix(); Tensor box_ind(DT_INT32, TensorShape({batches})); @@ -53,17 +51,13 @@ static Graph* BM_CropAndResize(int batches, int width, int height, int depth, return g; } -#define BM_CropAndResizeDev(DEVICE, DTYPE, B, W, H, D, CH, CW) \ - static void \ - BM_CropAndResize_##DEVICE##_##DTYPE##_##B##_##W##_##H##_##D##_##CH##_##CW( \ - int iters) { \ - testing::ItemsProcessed(iters* B* W* H* D); \ - test::Benchmark(#DEVICE, BM_CropAndResize::Type>( \ - B, W, H, D, CH, CW)) \ - .Run(iters); \ - } \ - BENCHMARK( \ - BM_CropAndResize_##DEVICE##_##DTYPE##_##B##_##W##_##H##_##D##_##CH##_##CW); +#define BM_CropAndResizeDev(DEVICE, B, W, H, D, CH, CW) \ + static void BM_CropAndResize_##DEVICE##_##B##_##W##_##H##_##D##_##CH##_##CW( \ + int iters) { \ + testing::ItemsProcessed(iters* B* W* H* D); \ + test::Benchmark(#DEVICE, BM_CropAndResize(B, W, H, D, CH, CW)).Run(iters); \ + } \ + BENCHMARK(BM_CropAndResize_##DEVICE##_##B##_##W##_##H##_##D##_##CH##_##CW); // Benchmark results using CPU:Intel Haswell with HyperThreading (6 cores) // Benchmark Time(ns) CPU(ns) Iterations @@ -71,14 +65,8 @@ static Graph* BM_CropAndResize(int batches, int width, int height, int depth, // BM_CropAndResize_cpu_1_640_640_1_512_512 3801232 3914692 185 99.784M items/s // BM_CropAndResize_cpu_1_80_80_512_7_7 182470 241767 2941 1.372G items/s -BM_CropAndResizeDev(cpu, DT_UINT8, 1, 640, 640, 3, 512, 512); -BM_CropAndResizeDev(cpu, DT_UINT8, 1, 640, 640, 1, 512, 512); - -BM_CropAndResizeDev(cpu, DT_HALF, 1, 640, 640, 3, 512, 512); 
-BM_CropAndResizeDev(cpu, DT_HALF, 1, 640, 640, 1, 512, 512); - -BM_CropAndResizeDev(cpu, DT_FLOAT, 1, 640, 640, 3, 512, 512); -BM_CropAndResizeDev(cpu, DT_FLOAT, 1, 640, 640, 1, 512, 512); -BM_CropAndResizeDev(cpu, DT_FLOAT, 1, 80, 80, 512, 7, 7); +BM_CropAndResizeDev(cpu, 1, 640, 640, 3, 512, 512); +BM_CropAndResizeDev(cpu, 1, 640, 640, 1, 512, 512); +BM_CropAndResizeDev(cpu, 1, 80, 80, 512, 7, 7); } // namespace tensorflow diff --git a/tensorflow/core/kernels/crop_resize_bilinear_core.h b/tensorflow/core/kernels/crop_resize_bilinear_core.h deleted file mode 100644 index 1ea6f719dfd..00000000000 --- a/tensorflow/core/kernels/crop_resize_bilinear_core.h +++ /dev/null @@ -1,5603 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_CORE_KERNELS_CROP_RESIZE_BILINEAR_CORE_H_ -#define TENSORFLOW_CORE_KERNELS_CROP_RESIZE_BILINEAR_CORE_H_ - -// only include intrinsics when the appropriate flags call for it, -// since these headers only exists on x86 platforms. -#ifdef __SSE4_1__ -#include -#include -#include -#endif -#ifdef __AVX2__ -#include -#endif -#include -#include -#include -#include -#include - -namespace tensorflow { -namespace { - -// Compute the interpolation indices only once. 
-struct CachedInterpolation { - int lower; // Lower source index used in the interpolation - int upper; // Upper source index used in the interpolation - // 1-D linear iterpolation scale (see: - // https://en.wikipedia.org/wiki/Bilinear_interpolation) - float lerp; -}; - -bool compute_single_interpolation_weight(const int in_size, - const float out2in_scale, - const float out2in_start, - const bool clip, const int i, - int* lower, int* upper, float* lerp) { - const float in = i * out2in_scale + out2in_start; - *lower = (int)floor(in); - *upper = (int)ceil(in); - *lerp = (float)(in - (float)*lower); - if (clip) { - if (*lower < 0) - *lower = 0; - else if (*lower >= in_size) - *lower = in_size - 1; - if (*upper < 0) - *upper = 0; - else if (*upper >= in_size) - *upper = in_size - 1; - return true; - } else { - return (*lower >= 0 && *upper < in_size) ? true : false; - } -} -/** - * Compute interpolation values for output indexes in range - * [out_start,out_start+out_size-1]. - * Returns true if all output indexes have lower and upper (input) indexes - * within range [0,in_size-1]. 
- */ -bool compute_interpolation_weights(const int min_i, const int max_i, - const int in_size, const float out2in_scale, - const float out2in_start, const bool clip, - CachedInterpolation* interpolation) { - bool rval = true; - int num_i = max_i - min_i + 1; - for (int i = 0; i < num_i; ++i) { - if (!compute_single_interpolation_weight( - in_size, out2in_scale, out2in_start, clip, i + min_i, - &interpolation[i].lower, &interpolation[i].upper, - &interpolation[i].lerp)) { - rval = false; - } - } - return rval; -} -/** - * Compatibility method for resize_bilinear_op.cc - */ -void compute_interpolation_weights(const int out_size, const int in_size, - const float out2in_scale, - CachedInterpolation* interpolation) { - interpolation[out_size].lower = 0; - interpolation[out_size].upper = 0; - const bool clip = true; - if (!compute_interpolation_weights(0, out_size - 1, in_size, out2in_scale, - 0.0f, clip, interpolation)) { - // Should never happen, check for it anyway - printf( - "Warning! Interpolation values have lower,upper indexes outside of " - "range [0,in_size-1]\n"); - } -} -/** - * Compute minimum and maximum (output) i where both lower and upper (input) is - * in range [0,in_size-1] - * If no values of i satisfy condition, min_i = in_size, max_i = -1 and method - * returns false. - * Returns true if min_i >= max_i. - */ -bool compute_minmax_indexes(const int out_size, const int in_size, - const float out2in_scale, const float out2in_start, - int* min_i, int* max_i) { - *min_i = out_size; - *max_i = -1; - int lower, upper; - float lerp; - for (int i = 0; i < out_size; ++i) { - if (compute_single_interpolation_weight(in_size, out2in_scale, out2in_start, - false, i, &lower, &upper, &lerp)) { - if (i < *min_i) *min_i = i; - if (i > *max_i) *max_i = i; - } - } - return (*min_i <= *max_i) ? true : false; -} -/** - * Compute interpolation weights for crop_and_resize_op.cc - * Also computes extrapolation areas. 
- * Returns true if at least one point requires interpolation, false otherwise. - */ -bool compute_interpolation_weights( - const int out_size, const int in_size, - const float x1, // lower bounding box, crop region starts at in_size*x1 - const float x2, // upper bounding box, crop region ends at in_size*x2 - int* min_i, int* max_i, std::vector* interpolation) { - float out2in_start = out_size > 1 - ? (float)(in_size - 1) * (float)x1 - : (float)(in_size - 1) * (float)(x1 + x2) / 2.0f; - float out2in_scale = out_size > 1 ? (float)(x2 - x1) * (float)(in_size - 1) / - (float)(out_size - 1) - : 0.0f; - if (compute_minmax_indexes(out_size, in_size, out2in_scale, out2in_start, - min_i, max_i)) { - interpolation->resize(*max_i - *min_i + 1); - bool all_inputs_ok = compute_interpolation_weights( - *min_i, *max_i, in_size, out2in_scale, out2in_start, false, - interpolation->data()); - if (!all_inputs_ok) { - // should never happen, purpose of compute_minmax_indexes is to ensure - // that all inputs are ok. - printf( - "Error! compute_interpolation_weights returned input indexes outside " - "valid range - SEGV will likely ensue.\n"); - } - return true; - } else { - return false; - } -} - -/** - * Cast float v to type U with range clamping. - * - * If vmax_val, - * return value is clamped to u_max_val. - */ -template -U cast_to(float v, float min_val, float max_val, U u_min_val, U u_max_val); -template -U cast_to(float v, float min_val, float max_val, U u_min_val, U u_max_val) { - if (v < min_val) - return u_min_val; - else if (v > max_val) - return u_max_val; - else - return static_cast(v); -} -/** - * no-op cast from float to float. 
- */ -template <> -float cast_to(float v, float min_val, float max_val, float u_min_val, - float u_max_val) { - return v; -} - -float compute_lerp(const float top_left, const float top_right, - const float bottom_left, const float bottom_right, - const float x_lerp, const float y_lerp) { - const float top = top_left + (top_right - top_left) * x_lerp; - const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp; - return top + (bottom - top) * y_lerp; -} - -/** - * Computes the bilinear interpolation from the appropriate 4 float points - * and the linear interpolation weights. - * Accepts input tensors of type T and produces output tensors of type U. - * Optionally flips horizontal and/or vertical axis. - */ -template -void crop_resize_single_image(const T* image, const int64 in_height, - const int64 in_width, const int64 out_height, - const int64 out_width, const int channels, - const int min_ix, const int max_ix, - const CachedInterpolation* xs, const int min_iy, - const int max_iy, const CachedInterpolation* ys, - const float extrapolated_value, const bool flip_x, - const bool flip_y, - U* output) TF_ATTRIBUTE_NOINLINE; -template -void crop_resize_single_image(const T* image, const int64 in_height, - const int64 in_width, const int64 out_height, - const int64 out_width, const int channels, - const int min_ix, const int max_ix, - const CachedInterpolation* xs, const int min_iy, - const int max_iy, const CachedInterpolation* ys, - const float extrapolated_value, const bool flip_x, - const bool flip_y, U* output) { - const int64 in_row_size = in_width * channels; - const int64 out_row_size = out_width * channels; - U u_min_val = std::numeric_limits::min(); - U u_max_val = std::numeric_limits::max(); - float min_val = static_cast(u_min_val); - float max_val = static_cast(u_max_val); - U uEx = - cast_to(extrapolated_value, min_val, max_val, u_min_val, u_max_val); - // low y extrapolation zone - if (min_iy > 0) { - U* p = flip_y ? 
output + out_row_size * (out_height - min_iy) : output; - int64 nn = out_row_size * (int64)min_iy; - for (int64 i = 0; i < nn; ++i) p[i] = uEx; - } - // high y extrapolation zone - if (max_iy < out_height - 1) { - U* p = flip_y ? output : output + out_row_size * (max_iy + 1); - int64 nn = out_row_size * (int64)(out_height - 1 - max_iy); - for (int64 i = 0; i < nn; ++i) p[i] = uEx; - } - // low x extrapolation zone - if (min_ix > 0) { - for (int iy = min_iy; iy <= max_iy; ++iy) { - int xx0 = flip_x ? (out_width - min_ix) * channels : 0; - int nxx = min_ix * channels; - U* p = output + xx0 + - out_row_size * (int64)(flip_y ? out_height - 1 - iy : iy); - for (int ix = 0; ix < nxx; ++ix) { - p[ix] = uEx; - } - } - } - // high x extrapolation zone - if (max_ix < out_width - 1) { - for (int iy = min_iy; iy <= max_iy; ++iy) { - int xx0 = flip_x ? 0 : (max_ix + 1) * channels; - int nxx = (out_width - 1 - max_ix) * channels; - U* p = output + xx0 + - out_row_size * (int64)(flip_y ? out_height - 1 - iy : iy); - for (int ix = 0; ix < nxx; ++ix) { - p[ix] = uEx; - } - } - } - U* output_y_ptr = - output + - out_row_size * (int64)(flip_y ? out_height - 1 - min_iy : min_iy); - // interpolation zone - if (channels == 1) { - for (int y = min_iy; y <= max_iy; ++y) { - const int iy = y - min_iy; - const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size; - const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size; - const float ys_lerp = ys[iy].lerp; - const int x0 = flip_x ? out_width - 1 - max_ix : min_ix; - const int x1 = flip_x ? out_width - 1 - min_ix : max_ix; - for (int x = x0; x <= x1; ++x) { - const int ix = flip_x ? out_width - 1 - min_ix - x : x - min_ix; - const int64 xs_lower = xs[ix].lower; - const int64 xs_upper = xs[ix].upper; - const float xs_lerp = xs[ix].lerp; - - // Read channel 0. 
- const float top_left0(ys_input_lower_ptr[xs_lower]); - const float top_right0(ys_input_lower_ptr[xs_upper]); - const float bottom_left0(ys_input_upper_ptr[xs_lower]); - const float bottom_right0(ys_input_upper_ptr[xs_upper]); - - // Compute output. - float result0 = compute_lerp(top_left0, top_right0, bottom_left0, - bottom_right0, xs_lerp, ys_lerp); - output_y_ptr[x] = - cast_to(result0, min_val, max_val, u_min_val, u_max_val); - } - output_y_ptr = - flip_y ? output_y_ptr - out_row_size : output_y_ptr + out_row_size; - } - } else if (channels == 2) { - for (int y = min_iy; y <= max_iy; ++y) { - const int iy = y - min_iy; - const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size; - const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size; - const float ys_lerp = ys[iy].lerp; - const int x0 = flip_x ? out_width - 1 - max_ix : min_ix; - const int x1 = flip_x ? out_width - 1 - min_ix : max_ix; - for (int x = x0; x <= x1; ++x) { - const int ix = flip_x ? out_width - 1 - min_ix - x : x - min_ix; - const int64 xs_lower = xs[ix].lower; - const int64 xs_upper = xs[ix].upper; - const float xs_lerp = xs[ix].lerp; - - // Read channel 0. - const float top_left0(ys_input_lower_ptr[xs_lower + 0]); - const float top_right0(ys_input_lower_ptr[xs_upper + 0]); - const float bottom_left0(ys_input_upper_ptr[xs_lower + 0]); - const float bottom_right0(ys_input_upper_ptr[xs_upper + 0]); - - // Read channel 1. - const float top_left1(ys_input_lower_ptr[xs_lower + 1]); - const float top_right1(ys_input_lower_ptr[xs_upper + 1]); - const float bottom_left1(ys_input_upper_ptr[xs_lower + 1]); - const float bottom_right1(ys_input_upper_ptr[xs_upper + 1]); - - // Compute output. 
- float result0 = compute_lerp(top_left0, top_right0, bottom_left0, - bottom_right0, xs_lerp, ys_lerp); - float result1 = compute_lerp(top_left1, top_right1, bottom_left1, - bottom_right1, xs_lerp, ys_lerp); - output_y_ptr[x * 2 + 0] = - cast_to(result0, min_val, max_val, u_min_val, u_max_val); - output_y_ptr[x * 2 + 1] = - cast_to(result1, min_val, max_val, u_min_val, u_max_val); - } - output_y_ptr = - flip_y ? output_y_ptr - out_row_size : output_y_ptr + out_row_size; - } - } else if (channels == 3) { - for (int y = min_iy; y <= max_iy; ++y) { - const int iy = y - min_iy; - const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size; - const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size; - const float ys_lerp = ys[iy].lerp; - const int x0 = flip_x ? out_width - 1 - max_ix : min_ix; - const int x1 = flip_x ? out_width - 1 - min_ix : max_ix; - for (int x = x0; x <= x1; ++x) { - const int ix = flip_x ? out_width - 1 - min_ix - x : x - min_ix; - const int64 xs_lower = xs[ix].lower; - const int64 xs_upper = xs[ix].upper; - const float xs_lerp = xs[ix].lerp; - - // Read channel 0. - const float top_left0(ys_input_lower_ptr[xs_lower + 0]); - const float top_right0(ys_input_lower_ptr[xs_upper + 0]); - const float bottom_left0(ys_input_upper_ptr[xs_lower + 0]); - const float bottom_right0(ys_input_upper_ptr[xs_upper + 0]); - - // Read channel 1. - const float top_left1(ys_input_lower_ptr[xs_lower + 1]); - const float top_right1(ys_input_lower_ptr[xs_upper + 1]); - const float bottom_left1(ys_input_upper_ptr[xs_lower + 1]); - const float bottom_right1(ys_input_upper_ptr[xs_upper + 1]); - - // Read channel 2. - const float top_left2(ys_input_lower_ptr[xs_lower + 2]); - const float top_right2(ys_input_lower_ptr[xs_upper + 2]); - const float bottom_left2(ys_input_upper_ptr[xs_lower + 2]); - const float bottom_right2(ys_input_upper_ptr[xs_upper + 2]); - - // Compute output. 
- float result0 = compute_lerp(top_left0, top_right0, bottom_left0, - bottom_right0, xs_lerp, ys_lerp); - float result1 = compute_lerp(top_left1, top_right1, bottom_left1, - bottom_right1, xs_lerp, ys_lerp); - float result2 = compute_lerp(top_left2, top_right2, bottom_left2, - bottom_right2, xs_lerp, ys_lerp); - output_y_ptr[x * 3 + 0] = - cast_to(result0, min_val, max_val, u_min_val, u_max_val); - output_y_ptr[x * 3 + 1] = - cast_to(result1, min_val, max_val, u_min_val, u_max_val); - output_y_ptr[x * 3 + 2] = - cast_to(result2, min_val, max_val, u_min_val, u_max_val); - } - output_y_ptr = - flip_y ? output_y_ptr - out_row_size : output_y_ptr + out_row_size; - } - } else if (channels == 4) { - for (int y = min_iy; y <= max_iy; ++y) { - const int iy = y - min_iy; - const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size; - const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size; - const float ys_lerp = ys[iy].lerp; - const int x0 = flip_x ? out_width - 1 - max_ix : min_ix; - const int x1 = flip_x ? out_width - 1 - min_ix : max_ix; - for (int x = x0; x <= x1; ++x) { - const int ix = flip_x ? out_width - 1 - min_ix - x : x - min_ix; - const int64 xs_lower = xs[ix].lower; - const int64 xs_upper = xs[ix].upper; - const float xs_lerp = xs[ix].lerp; - - // Read channel 0. - const float top_left0(ys_input_lower_ptr[xs_lower + 0]); - const float top_right0(ys_input_lower_ptr[xs_upper + 0]); - const float bottom_left0(ys_input_upper_ptr[xs_lower + 0]); - const float bottom_right0(ys_input_upper_ptr[xs_upper + 0]); - - // Read channel 1. - const float top_left1(ys_input_lower_ptr[xs_lower + 1]); - const float top_right1(ys_input_lower_ptr[xs_upper + 1]); - const float bottom_left1(ys_input_upper_ptr[xs_lower + 1]); - const float bottom_right1(ys_input_upper_ptr[xs_upper + 1]); - - // Read channel 2. 
- const float top_left2(ys_input_lower_ptr[xs_lower + 2]); - const float top_right2(ys_input_lower_ptr[xs_upper + 2]); - const float bottom_left2(ys_input_upper_ptr[xs_lower + 2]); - const float bottom_right2(ys_input_upper_ptr[xs_upper + 2]); - - // Read channel 3. - const float top_left3(ys_input_lower_ptr[xs_lower + 3]); - const float top_right3(ys_input_lower_ptr[xs_upper + 3]); - const float bottom_left3(ys_input_upper_ptr[xs_lower + 3]); - const float bottom_right3(ys_input_upper_ptr[xs_upper + 3]); - - // Compute output. - float result0 = compute_lerp(top_left0, top_right0, bottom_left0, - bottom_right0, xs_lerp, ys_lerp); - float result1 = compute_lerp(top_left1, top_right1, bottom_left1, - bottom_right1, xs_lerp, ys_lerp); - float result2 = compute_lerp(top_left2, top_right2, bottom_left2, - bottom_right2, xs_lerp, ys_lerp); - float result3 = compute_lerp(top_left3, top_right3, bottom_left3, - bottom_right3, xs_lerp, ys_lerp); - output_y_ptr[x * 4 + 0] = - cast_to(result0, min_val, max_val, u_min_val, u_max_val); - output_y_ptr[x * 4 + 1] = - cast_to(result1, min_val, max_val, u_min_val, u_max_val); - output_y_ptr[x * 4 + 2] = - cast_to(result2, min_val, max_val, u_min_val, u_max_val); - output_y_ptr[x * 4 + 3] = - cast_to(result3, min_val, max_val, u_min_val, u_max_val); - } - output_y_ptr = - flip_y ? output_y_ptr - out_row_size : output_y_ptr + out_row_size; - } - } else { - for (int y = min_iy; y <= max_iy; ++y) { - const int iy = y - min_iy; - const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size; - const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size; - const float ys_lerp = ys[iy].lerp; - const int x0 = flip_x ? out_width - 1 - max_ix : min_ix; - const int x1 = flip_x ? out_width - 1 - min_ix : max_ix; - for (int x = x0; x <= x1; ++x) { - const int ix = flip_x ? 
out_width - 1 - min_ix - x : x - min_ix; - const int64 xs_lower = xs[ix].lower; - const int64 xs_upper = xs[ix].upper; - const float xs_lerp = xs[ix].lerp; - for (int ichan = 0; ichan < channels; ++ichan) { - const float top_left0(ys_input_lower_ptr[xs_lower + ichan]); - const float top_right0(ys_input_lower_ptr[xs_upper + ichan]); - const float bottom_left0(ys_input_upper_ptr[xs_lower + ichan]); - const float bottom_right0(ys_input_upper_ptr[xs_upper + ichan]); - float result0 = compute_lerp(top_left0, top_right0, bottom_left0, - bottom_right0, xs_lerp, ys_lerp); - output_y_ptr[x * channels + ichan] = - cast_to(result0, min_val, max_val, u_min_val, u_max_val); - } - } - output_y_ptr = - flip_y ? output_y_ptr - out_row_size : output_y_ptr + out_row_size; - } - } -} - -// template for method that calls either explicitly vectorized method -// or the fallback method, depending on what is appropriate for the -// machine you are running on -template -void crop_resize_single_image_common( - const T* image, const int64 in_height, const int64 in_width, - const int64 out_height, const int64 out_width, const int channels, - const int min_ix, const int max_ix, const CachedInterpolation* xs, - const int min_iy, const int max_iy, const CachedInterpolation* ys, - const float extrapolated_value, const bool flip_x, const bool flip_y, - U* output) TF_ATTRIBUTE_NOINLINE; - -// For now, only compile vectorized code on LINUX systems. -// to-do: Test vectorized code on other platforms (MacOS and Windows). -#if defined(__linux__) && defined(__SSE4_1__) - -// -// The remaining code implements explicitly vectorized versions of a bilinear -// image resizer. -// Images with 1, 2, 3 or 4 channels are supported. -// The image resizer reads samples of type T and writes samples of type U. -// T and U can be any of the following: uint8, int8, uint16, int16, int32, -// Eigen::half, bfloat16 and float. -// There are separate codes for SSE4.1 and AVX2. 
Enabling AVX2 also enables -// FP16C instruction set, -// which contains instructions that convert between Eigen::half and float. The -// SSE4.1 code path emulates -// the FP16C instructions in software. -// - -// -// This class loads 4 pixels with n channels, converts to fp32 and packs -// the result into n SSE vector words. -// Input data type T must be one of uint8, int8, uint16, int16, int32, -// Eigen::half, bfloat16 or float. -// - -template -class VectorLoader { - public: -#ifdef __AVX2__ - // convert 8 packed words of type T to fp32. - // T must be one of uint8, int8, uint16, int16, int32, Eigen::half, bfloat16 - // or float. - __m256 to_fp32(__m256i raw); -#else - // convert 4 packed words of type T to fp32. - // T must be one of uint8, int8, uint16, int16, int32, Eigen::half, bfloat16 - // or float. - __m128 to_fp32(__m128i raw); -#endif - -#ifdef __AVX2__ - // pack 4 pixels with 1 channel, 2 channels and 3channels respectively in - // separate 128 bit lanes. - // input is stored in lower portion of 4 separate sse words, v0 through v3. - // output is stored in lower portion of v0. - void pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); - // output is stored in lower portion of v0 and v1. - void pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); - // output is stored in lower portion of v0, v1 and v2. - void pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); -#else - // pack 4 pixels with 1 channel, 2 channels and 3channels respectively. - // input is stored in lower portion of 4 separate sse words, v0 through v3. - // output is stored in lower portion of v0. - void pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); - // output is stored in lower portion of v0 and v1. - void pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); - // output is stored in lower portion of v0, v1 and v2. 
- void pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); -#endif - -#ifdef __AVX2__ - // extract right pixel for load1 and load4 cases. - __m256i extract_right_1ch(const __m256i left); - __m256i extract_right_2ch(const __m256i left); - __m256i extract_right_3ch(const __m256i left); - __m256i extract_right_4ch(const __m256i left); -#else - __m128i extract_right_1ch(const __m128i left); - __m128i extract_right_2ch(const __m128i left); - __m128i extract_right_3ch(const __m128i left); - __m128i extract_right_4ch(const __m128i left); -#endif - -#ifdef __AVX2__ - // load top left and bottom left interpolation inputs into output argument - // left. - // load top right and bottom right interpolation inputs into output argument - // right. - // pixels have 1 channel. - // load1 case, i.e. 4 left and right inputs are loaded with a single unaligned - // SSE load. - void load1_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m256* left0, __m256* right0); - // load top left and bottom left interpolation inputs into output argument - // left. - // load top right and bottom right interpolation inputs into output argument - // right. - // pixels have 2 channels. - // load1 case, i.e. 4 left and right inputs are loaded with a single unaligned - // SSE load. - void load1_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m256* left0, __m256* left1, - __m256* right0, __m256* right1); - // load top left and bottom left interpolation inputs into output argument - // left. - // load top right and bottom right interpolation inputs into output argument - // right. - // pixels have 3 channels. - // load1 case, i.e. 4 left and right inputs are loaded with a single unaligned - // SSE load. 
- void load1_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m256* left0, __m256* left1, - __m256* left2, __m256* right0, __m256* right1, __m256* right2); - // load top left and bottom left interpolation inputs into output argument - // left. - // load top right and bottom right interpolation inputs into output argument - // right. - // pixels have 4 channels. - // load1 case, i.e. 4 left and right inputs are loaded with a single unaligned - // SSE load. - void load1_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m256* left0, __m256* left1, - __m256* left2, __m256* left3, __m256* right0, __m256* right1, - __m256* right2, __m256* right3); - // load top left and bottom left interpolation inputs into output argument - // left. - // load top right and bottom right interpolation inputs into output argument - // right. - // pixels have 1 channel. - // load2 case, i.e. 4 left inputs are loaded with first SSE load and 4 right - // inputs are loaded with second SSE load. - void load2_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m256* left0, __m256* right0); - // load top left and bottom left interpolation inputs into output argument - // left. - // load top right and bottom right interpolation inputs into output argument - // right. - // pixels have 2 channels. - // load2 case, i.e. 4 left inputs are loaded with first SSE load and 4 right - // inputs are loaded with second SSE load. - void load2_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m256* left0, __m256* left1, - __m256* right0, __m256* right1); - // load top left and bottom left interpolation inputs into output argument - // left. - // load top right and bottom right interpolation inputs into output argument - // right. - // pixels have 3 channels. - // load2 case, i.e. 
4 left inputs are loaded with first SSE load and 4 right - // inputs are loaded with second SSE load. - void load2_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m256* left0, __m256* left1, - __m256* left2, __m256* right0, __m256* right1, __m256* right2); - // load top left and bottom left interpolation inputs into output argument - // left. - // load top right and bottom right interpolation inputs into output argument - // right. - // pixels have 4 channels. - // load2 case, i.e. 4 left inputs are loaded with first SSE load and 4 right - // inputs are loaded with second SSE load. - void load2_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m256* left0, __m256* left1, - __m256* left2, __m256* left3, __m256* right0, __m256* right1, - __m256* right2, __m256* right3); - // load top left and bottom left interpolation inputs into output argument - // left. - // load top right and bottom right interpolation inputs into output argument - // right. - // pixels have 1 channel. - // load4 case, i.e. each pair of left and right inputs are loaded with a - // separate SSE load. - void load4_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256* left0, - __m256* right0); - // load top left and bottom left interpolation inputs into output argument - // left. - // load top right and bottom right interpolation inputs into output argument - // right. - // pixels have 2 channels. - // load4 case, i.e. each pair of left and right inputs are loaded with a - // separate SSE load. - void load4_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256* left0, - __m256* left1, __m256* right0, __m256* right1); - // load top left and bottom left interpolation inputs into output argument - // left. - // load top right and bottom right interpolation inputs into output argument - // right. 
- // pixels have 3 channels. - // load4 case, i.e. each pair of left and right inputs are loaded with a - // separate SSE load. - void load4_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256* left0, - __m256* left1, __m256* left2, __m256* right0, __m256* right1, - __m256* right2); - // load top left and bottom left interpolation inputs into output argument - // left. - // load top right and bottom right interpolation inputs into output argument - // right. - // pixels have 4 channels. - // load4 case, i.e. each pair of left and right inputs are loaded with a - // separate SSE load. - void load4_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256* left0, - __m256* left1, __m256* left2, __m256* left3, __m256* right0, - __m256* right1, __m256* right2, __m256* right3); - // load top left and bottom left interpolation inputs into output argument - // left. - // load top right and bottom right interpolation inputs into output argument - // right. - // pixels have 1 channel. - // load8 case, i.e. each input is loaded with a separate SSE load. - // 4 pixels, each with left and right input necessitates 8 separate SSE loads - // per input row. - void load8_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256* left0, - __m256* right0); - // load top left and bottom left interpolation inputs into output argument - // left. - // load top right and bottom right interpolation inputs into output argument - // right. - // pixels have 2 channels. - // load8 case, i.e. each input is loaded with a separate SSE load. - // 4 pixels, each with left and right input necessitates 8 separate SSE loads - // per input row. 
- void load8_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256* left0, - __m256* left1, __m256* right0, __m256* right1); - // load top left and bottom left interpolation inputs into output argument - // left. - // load top right and bottom right interpolation inputs into output argument - // right. - // pixels have 3 channels. - // load8 case, i.e. each input is loaded with a separate SSE load. - // 4 pixels, each with left and right input necessitates 8 separate SSE loads - // per input row. - void load8_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256* left0, - __m256* left1, __m256* left2, __m256* right0, __m256* right1, - __m256* right2); - // load top left and bottom left interpolation inputs into output argument - // left. - // load top right and bottom right interpolation inputs into output argument - // right. - // pixels have 4 channels. - // load8 case, i.e. each input is loaded with a separate SSE load. - // 4 pixels, each with left and right input necessitates 8 separate SSE loads - // per input row. - void load8_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m256* left0, - __m256* left1, __m256* left2, __m256* left3, __m256* right0, - __m256* right1, __m256* right2, __m256* right3); -#else - // load top left interpolation inputs into output argument tl. - // load bottom left interpolation inputs into output argument bl. - // load top right interpolation inputs into output argument tr. - // load bottom right interpolation inputs into output argument br. - // pixels have 1 channel. - // load1 case, i.e. all inputs for one input row are loaded with a single SSE - // load. - void load1_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m128* tl0, __m128* bl0, - __m128* tr0, __m128* br0); - // load top left interpolation inputs into output argument tl. 
- // load bottom left interpolation inputs into output argument bl. - // load top right interpolation inputs into output argument tr. - // load bottom right interpolation inputs into output argument br. - // pixels have 2 channels. - // load1 case, i.e. all inputs for one input row are loaded with a single SSE - // load. - void load1_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, - __m128* bl0, __m128* bl1, __m128* tr0, __m128* tr1, - __m128* br0, __m128* br1); - // load top left interpolation inputs into output argument tl. - // load bottom left interpolation inputs into output argument bl. - // load top right interpolation inputs into output argument tr. - // load bottom right interpolation inputs into output argument br. - // pixels have 3 channels. - // load1 case, i.e. all inputs for one input row are loaded with a single SSE - // load. - void load1_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, - __m128* tl2, __m128* bl0, __m128* bl1, __m128* bl2, - __m128* tr0, __m128* tr1, __m128* tr2, __m128* br0, - __m128* br1, __m128* br2); - // load top left interpolation inputs into output argument tl. - // load bottom left interpolation inputs into output argument bl. - // load top right interpolation inputs into output argument tr. - // load bottom right interpolation inputs into output argument br. - // pixels have 4 channels. - // load1 case, i.e. all inputs for one input row are loaded with a single SSE - // load. - void load1_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, - __m128* tl2, __m128* tl3, __m128* bl0, __m128* bl1, - __m128* bl2, __m128* bl3, __m128* tr0, __m128* tr1, - __m128* tr2, __m128* tr3, __m128* br0, __m128* br1, - __m128* br2, __m128* br3); - // load top left interpolation inputs into output argument tl. 
- // load bottom left interpolation inputs into output argument bl. - // load top right interpolation inputs into output argument tr. - // load bottom right interpolation inputs into output argument br. - // pixels have 1 channel. - // load2 case, i.e. left inputs are loaded with first SSE load, right inputs - // are loaded with second SSE load. - void load2_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m128* tl0, __m128* bl0, - __m128* tr0, __m128* br0); - // load top left interpolation inputs into output argument tl. - // load bottom left interpolation inputs into output argument bl. - // load top right interpolation inputs into output argument tr. - // load bottom right interpolation inputs into output argument br. - // pixels have 2 channels. - // load2 case, i.e. left inputs are loaded with first SSE load, right inputs - // are loaded with second SSE load. - void load2_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, - __m128* bl0, __m128* bl1, __m128* tr0, __m128* tr1, - __m128* br0, __m128* br1); - // load top left interpolation inputs into output argument tl. - // load bottom left interpolation inputs into output argument bl. - // load top right interpolation inputs into output argument tr. - // load bottom right interpolation inputs into output argument br. - // pixels have 3 channels. - // load2 case, i.e. left inputs are loaded with first SSE load, right inputs - // are loaded with second SSE load. - void load2_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, - __m128* tl2, __m128* bl0, __m128* bl1, __m128* bl2, - __m128* tr0, __m128* tr1, __m128* tr2, __m128* br0, - __m128* br1, __m128* br2); - // load top left interpolation inputs into output argument tl. - // load bottom left interpolation inputs into output argument bl. 
- // load top right interpolation inputs into output argument tr. - // load bottom right interpolation inputs into output argument br. - // pixels have 4 channels. - // load2 case, i.e. left inputs are loaded with first SSE load, right inputs - // are loaded with second SSE load. - void load2_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, - const __m128i* shuffle_masks, __m128* tl0, __m128* tl1, - __m128* tl2, __m128* tl3, __m128* bl0, __m128* bl1, - __m128* bl2, __m128* bl3, __m128* tr0, __m128* tr1, - __m128* tr2, __m128* tr3, __m128* br0, __m128* br1, - __m128* br2, __m128* br3); - // load top left interpolation inputs into output argument tl. - // load bottom left interpolation inputs into output argument bl. - // load top right interpolation inputs into output argument tr. - // load bottom right interpolation inputs into output argument br. - // pixels have 1 channel. - // load4 case, i.e. left and right inputs are loaded with a separate SSE load - // for each pixel. - void load4_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128* tl0, - __m128* bl0, __m128* tr0, __m128* br0); - // load top left interpolation inputs into output argument tl. - // load bottom left interpolation inputs into output argument bl. - // load top right interpolation inputs into output argument tr. - // load bottom right interpolation inputs into output argument br. - // pixels have 2 channels. - // load4 case, i.e. left and right inputs are loaded with a separate SSE load - // for each pixel. - void load4_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128* tl0, - __m128* tl1, __m128* bl0, __m128* bl1, __m128* tr0, - __m128* tr1, __m128* br0, __m128* br1); - // load top left interpolation inputs into output argument tl. - // load bottom left interpolation inputs into output argument bl. - // load top right interpolation inputs into output argument tr. 
- // load bottom right interpolation inputs into output argument br. - // pixels have 3 channels. - // load4 case, i.e. left and right inputs are loaded with a separate SSE load - // for each pixel. - void load4_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128* tl0, - __m128* tl1, __m128* tl2, __m128* bl0, __m128* bl1, - __m128* bl2, __m128* tr0, __m128* tr1, __m128* tr2, - __m128* br0, __m128* br1, __m128* br2); - // load top left interpolation inputs into output argument tl. - // load bottom left interpolation inputs into output argument bl. - // load top right interpolation inputs into output argument tr. - // load bottom right interpolation inputs into output argument br. - // pixels have 4 channels. - // load4 case, i.e. left and right inputs are loaded with a separate SSE load - // for each pixel. - void load4_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128* tl0, - __m128* tl1, __m128* tl2, __m128* tl3, __m128* bl0, - __m128* bl1, __m128* bl2, __m128* bl3, __m128* tr0, - __m128* tr1, __m128* tr2, __m128* tr3, __m128* br0, - __m128* br1, __m128* br2, __m128* br3); - // load top left interpolation inputs into output argument tl. - // load bottom left interpolation inputs into output argument bl. - // load top right interpolation inputs into output argument tr. - // load bottom right interpolation inputs into output argument br. - // pixels have 1 channel. - // load8 case, i.e. left and right inputs are loaded with separate SSE loads - // for each pixel. - void load8_1ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128* tl0, - __m128* bl0, __m128* tr0, __m128* br0); - // load top left interpolation inputs into output argument tl. - // load bottom left interpolation inputs into output argument bl. - // load top right interpolation inputs into output argument tr. 
- // load bottom right interpolation inputs into output argument br. - // pixels have 2 channels. - // load8 case, i.e. left and right inputs are loaded with separate SSE loads - // for each pixel. - void load8_2ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128* tl0, - __m128* tl1, __m128* bl0, __m128* bl1, __m128* tr0, - __m128* tr1, __m128* br0, __m128* br1); - // load top left interpolation inputs into output argument tl. - // load bottom left interpolation inputs into output argument bl. - // load top right interpolation inputs into output argument tr. - // load bottom right interpolation inputs into output argument br. - // pixels have 3 channels. - // load8 case, i.e. left and right inputs are loaded with separate SSE loads - // for each pixel. - void load8_3ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128* tl0, - __m128* tl1, __m128* tl2, __m128* bl0, __m128* bl1, - __m128* bl2, __m128* tr0, __m128* tr1, __m128* tr2, - __m128* br0, __m128* br1, __m128* br2); - // load top left interpolation inputs into output argument tl. - // load bottom left interpolation inputs into output argument bl. - // load top right interpolation inputs into output argument tr. - // load bottom right interpolation inputs into output argument br. - // pixels have 4 channels. - // load8 case, i.e. left and right inputs are loaded with separate SSE loads - // for each pixel. - void load8_4ch(const T* lower_ptr, const T* upper_ptr, int offset0, - int offset1, int offset2, int offset3, __m128* tl0, - __m128* tl1, __m128* tl2, __m128* tl3, __m128* bl0, - __m128* bl1, __m128* bl2, __m128* bl3, __m128* tr0, - __m128* tr1, __m128* tr2, __m128* tr3, __m128* br0, - __m128* br1, __m128* br2, __m128* br3); -#endif - - // there is no method that packs 4 pixels with 4 channel into four sse words. - // nothing to do for this case, everything is already in the right position. 
- - private: -// helper methods -#ifdef __AVX2__ - // pack 4 pixels with 1, 2, 3 or 4 channels into lower portion of SSE vector - // word. - // works within SSE lanes. - // sizeof(sample_data_type) can be 1, 2 or 4 bytes. - void pack4_1b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); - void pack4_2b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); - void pack4_4b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); - void pack4_1b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); - void pack4_2b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); - void pack4_4b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); - void pack4_1b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); - void pack4_2b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); - void pack4_4b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, __m256i* v3); -// there is no pack4_xx_4ch functions because none is needed. -// all the bytes are loaded in the right spots for this case. -#else - // pack 4 pixels with 1, 2, 3 or 4 channels into lower portion of SSE vector - // word. - // sizeof(sample_data_type) can be 1, 2 or 4 bytes. 
- void pack4_1b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); - void pack4_2b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); - void pack4_4b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); - void pack4_1b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); - void pack4_2b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); - void pack4_4b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); - void pack4_1b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); - void pack4_2b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); - void pack4_4b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, __m128i* v3); -#endif -#ifdef __AVX2__ - __m256i extract_right_1b_(const __m256i left); - __m256i extract_right_2b_(const __m256i left); - __m256i extract_right_3b_(const __m256i left); - __m256i extract_right_4b_(const __m256i left); - __m256i extract_right_6b_(const __m256i left); - __m256i extract_right_8b_(const __m256i left); -#else - __m128i extract_right_1b_(const __m128i left); - __m128i extract_right_2b_(const __m128i left); - __m128i extract_right_3b_(const __m128i left); - __m128i extract_right_4b_(const __m128i left); - __m128i extract_right_6b_(const __m128i left); - __m128i extract_right_8b_(const __m128i left); -#endif -}; - -#ifdef __AVX2__ -template -void VectorLoader::pack4_1b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - *v3 = _mm256_slli_si256(*v3, 3); - __m256i and_mask = _mm256_setr_epi32(255, 0, 0, 0, 255, 0, 0, 0); - *v2 = _mm256_or_si256(*v3, - _mm256_slli_si256(_mm256_and_si256(and_mask, *v2), 2)); - *v1 = _mm256_or_si256(*v2, - _mm256_slli_si256(_mm256_and_si256(and_mask, *v1), 1)); - *v0 = _mm256_or_si256(*v1, _mm256_and_si256(and_mask, *v0)); -} -template -void VectorLoader::pack4_2b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - *v3 = _mm256_slli_si256(*v3, 6); - __m256i and_mask = _mm256_setr_epi32(65535, 0, 0, 0, 65535, 0, 0, 0); - *v2 = _mm256_or_si256(*v3, - 
_mm256_slli_si256(_mm256_and_si256(and_mask, *v2), 4)); - *v1 = _mm256_or_si256(*v2, - _mm256_slli_si256(_mm256_and_si256(and_mask, *v1), 2)); - *v0 = _mm256_or_si256(*v1, _mm256_and_si256(and_mask, *v0)); -} -template -void VectorLoader::pack4_4b_1ch_(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - *v3 = _mm256_slli_si256(*v3, 12); - __m256i and_mask = _mm256_setr_epi32(-1, 0, 0, 0, -1, 0, 0, 0); - *v2 = _mm256_or_si256(*v3, - _mm256_slli_si256(_mm256_and_si256(and_mask, *v2), 8)); - *v1 = _mm256_or_si256(*v2, - _mm256_slli_si256(_mm256_and_si256(and_mask, *v1), 4)); - *v0 = _mm256_or_si256(*v1, _mm256_and_si256(and_mask, *v0)); -} - -template -void VectorLoader::pack4_1b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - __m256i and_mask = _mm256_setr_epi32(65535, 0, 0, 0, 65535, 0, 0, 0); - *v0 = _mm256_or_si256(_mm256_and_si256(*v0, and_mask), - _mm256_slli_si256(*v1, 2)); - *v1 = _mm256_or_si256(_mm256_and_si256(*v2, and_mask), - _mm256_slli_si256(*v3, 2)); -} -template -void VectorLoader::pack4_2b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - __m256i and_mask = _mm256_setr_epi32(-1, 0, 0, 0, -1, 0, 0, 0); - *v0 = _mm256_or_si256(_mm256_and_si256(*v0, and_mask), - _mm256_slli_si256(*v1, 4)); - *v1 = _mm256_or_si256(_mm256_and_si256(*v2, and_mask), - _mm256_slli_si256(*v3, 4)); -} -template -void VectorLoader::pack4_4b_2ch_(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - __m256i and_mask = _mm256_setr_epi32(-1, -1, 0, 0, -1, -1, 0, 0); - *v0 = _mm256_or_si256(_mm256_and_si256(*v0, and_mask), - _mm256_slli_si256(*v1, 8)); - *v1 = _mm256_or_si256(_mm256_and_si256(*v2, and_mask), - _mm256_slli_si256(*v3, 8)); -} - -template -void VectorLoader::pack4_1b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - __m256i and_mask = _mm256_setr_epi32(16777215, 0, 0, 0, 16777215, 0, 0, 0); - *v0 = _mm256_or_si256(_mm256_and_si256(*v0, and_mask), - _mm256_slli_si256(*v1, 3)); - and_mask = _mm256_srli_si256(and_mask, 
1); - *v1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_si256(*v1, 1), and_mask), - _mm256_slli_si256(*v2, 2)); - and_mask = _mm256_srli_si256(and_mask, 1); - *v2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_si256(*v2, 2), and_mask), - _mm256_slli_si256(*v3, 1)); -} -template -void VectorLoader::pack4_2b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - __m256i and_mask = _mm256_setr_epi32(-1, 65535, 0, 0, -1, 65535, 0, 0); - *v0 = _mm256_or_si256(_mm256_and_si256(*v0, and_mask), - _mm256_slli_si256(*v1, 6)); - and_mask = _mm256_srli_si256(and_mask, 2); - *v1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_si256(*v1, 2), and_mask), - _mm256_slli_si256(*v2, 4)); - and_mask = _mm256_srli_si256(and_mask, 2); - *v2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_si256(*v2, 4), and_mask), - _mm256_slli_si256(*v3, 2)); -} -template -void VectorLoader::pack4_4b_3ch_(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - __m256i and_mask = _mm256_setr_epi32(-1, -1, -1, 0, -1, -1, -1, 0); - *v0 = _mm256_or_si256(_mm256_and_si256(*v0, and_mask), - _mm256_slli_si256(*v1, 12)); - and_mask = _mm256_srli_si256(and_mask, 4); - *v1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_si256(*v1, 4), and_mask), - _mm256_slli_si256(*v2, 8)); - and_mask = _mm256_srli_si256(and_mask, 4); - *v2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_si256(*v2, 8), and_mask), - _mm256_slli_si256(*v3, 4)); -} - -template <> -void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_1b_1ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_1b_1ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_2b_1ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_2b_1ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_1ch(__m256i* 
v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_4b_1ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_2b_1ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_2b_1ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_1ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_4b_1ch_(v0, v1, v2, v3); -} - -template <> -void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_1b_2ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_1b_2ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_2b_2ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_2b_2ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_4b_2ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_2b_2ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_2b_2ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_2ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_4b_2ch_(v0, v1, v2, v3); -} - -template <> -void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_1b_3ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_1b_3ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_2b_3ch_(v0, v1, v2, v3); -} -template <> -void 
VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_2b_3ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_4b_3ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_2b_3ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_2b_3ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_3ch(__m256i* v0, __m256i* v1, __m256i* v2, - __m256i* v3) { - pack4_4b_3ch_(v0, v1, v2, v3); -} -#else -template -void VectorLoader::pack4_1b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - *v3 = _mm_slli_si128(*v3, 3); - __m128i and_mask = _mm_setr_epi32(255, 0, 0, 0); - *v2 = _mm_or_si128(*v3, _mm_slli_si128(_mm_and_si128(and_mask, *v2), 2)); - *v1 = _mm_or_si128(*v2, _mm_slli_si128(_mm_and_si128(and_mask, *v1), 1)); - *v0 = _mm_or_si128(*v1, _mm_and_si128(and_mask, *v0)); -} -template -void VectorLoader::pack4_2b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - *v3 = _mm_slli_si128(*v3, 6); - __m128i and_mask = _mm_setr_epi32(65535, 0, 0, 0); - *v2 = _mm_or_si128(*v3, _mm_slli_si128(_mm_and_si128(and_mask, *v2), 4)); - *v1 = _mm_or_si128(*v2, _mm_slli_si128(_mm_and_si128(and_mask, *v1), 2)); - *v0 = _mm_or_si128(*v1, _mm_and_si128(and_mask, *v0)); -} -template -void VectorLoader::pack4_4b_1ch_(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - *v3 = _mm_slli_si128(*v3, 12); - __m128i and_mask = _mm_setr_epi32(-1, 0, 0, 0); - *v2 = _mm_or_si128(*v3, _mm_slli_si128(_mm_and_si128(and_mask, *v2), 8)); - *v1 = _mm_or_si128(*v2, _mm_slli_si128(_mm_and_si128(and_mask, *v1), 4)); - *v0 = _mm_or_si128(*v1, _mm_and_si128(and_mask, *v0)); -} -template -void VectorLoader::pack4_1b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - __m128i and_mask = _mm_setr_epi32(65535, 0, 0, 0); - 
*v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 2)); - *v1 = _mm_or_si128(_mm_and_si128(*v2, and_mask), _mm_slli_si128(*v3, 2)); -} -template -void VectorLoader::pack4_2b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - __m128i and_mask = _mm_setr_epi32(-1, 0, 0, 0); - *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 4)); - *v1 = _mm_or_si128(_mm_and_si128(*v2, and_mask), _mm_slli_si128(*v3, 4)); -} -template -void VectorLoader::pack4_4b_2ch_(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - __m128i and_mask = _mm_setr_epi32(-1, -1, 0, 0); - *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 8)); - *v1 = _mm_or_si128(_mm_and_si128(*v2, and_mask), _mm_slli_si128(*v3, 8)); -} -template -void VectorLoader::pack4_1b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - __m128i and_mask = _mm_setr_epi32(16777215, 0, 0, 0); - *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 3)); - and_mask = _mm_srli_si128(and_mask, 1); - *v1 = _mm_or_si128(_mm_and_si128(_mm_srli_si128(*v1, 1), and_mask), - _mm_slli_si128(*v2, 2)); - and_mask = _mm_srli_si128(and_mask, 1); - *v2 = _mm_or_si128(_mm_and_si128(_mm_srli_si128(*v2, 2), and_mask), - _mm_slli_si128(*v3, 1)); -} -template -void VectorLoader::pack4_2b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - __m128i and_mask = _mm_setr_epi32(-1, 65535, 0, 0); - *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), _mm_slli_si128(*v1, 6)); - and_mask = _mm_srli_si128(and_mask, 2); - *v1 = _mm_or_si128(_mm_and_si128(_mm_srli_si128(*v1, 2), and_mask), - _mm_slli_si128(*v2, 4)); - and_mask = _mm_srli_si128(and_mask, 2); - *v2 = _mm_or_si128(_mm_and_si128(_mm_srli_si128(*v2, 4), and_mask), - _mm_slli_si128(*v3, 2)); -} -template -void VectorLoader::pack4_4b_3ch_(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - __m128i and_mask = _mm_setr_epi32(-1, -1, -1, 0); - *v0 = _mm_or_si128(_mm_and_si128(*v0, and_mask), 
_mm_slli_si128(*v1, 12)); - and_mask = _mm_srli_si128(and_mask, 4); - *v1 = _mm_or_si128(_mm_and_si128(_mm_srli_si128(*v1, 4), and_mask), - _mm_slli_si128(*v2, 8)); - and_mask = _mm_srli_si128(and_mask, 4); - *v2 = _mm_or_si128(_mm_and_si128(_mm_srli_si128(*v2, 8), and_mask), - _mm_slli_si128(*v3, 4)); -} - -template <> -void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_1b_1ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_1b_1ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_2b_1ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_2b_1ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_4b_1ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_2b_1ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_2b_1ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_1ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_4b_1ch_(v0, v1, v2, v3); -} - -template <> -void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_1b_2ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_1b_2ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_2b_2ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_2b_2ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, 
- __m128i* v3) { - pack4_4b_2ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_2b_2ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_2b_2ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_2ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_4b_2ch_(v0, v1, v2, v3); -} - -template <> -void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_1b_3ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_1b_3ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_2b_3ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_2b_3ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_4b_3ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_2b_3ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_2b_3ch_(v0, v1, v2, v3); -} -template <> -void VectorLoader::pack_3ch(__m128i* v0, __m128i* v1, __m128i* v2, - __m128i* v3) { - pack4_4b_3ch_(v0, v1, v2, v3); -} -#endif - -#ifdef __AVX2__ -template <> -__m256i VectorLoader::extract_right_1ch(const __m256i left) { - return extract_right_1b_(left); -} -template <> -__m256i VectorLoader::extract_right_1ch(const __m256i left) { - return extract_right_1b_(left); -} -template <> -__m256i VectorLoader::extract_right_1ch(const __m256i left) { - return extract_right_2b_(left); -} -template <> -__m256i VectorLoader::extract_right_1ch(const __m256i left) { - return 
extract_right_2b_(left); -} -template <> -__m256i VectorLoader::extract_right_1ch(const __m256i left) { - return extract_right_4b_(left); -} -template <> -__m256i VectorLoader::extract_right_1ch(const __m256i left) { - return extract_right_2b_(left); -} -template <> -__m256i VectorLoader::extract_right_1ch(const __m256i left) { - return extract_right_2b_(left); -} -template <> -__m256i VectorLoader::extract_right_1ch(const __m256i left) { - return extract_right_4b_(left); -} - -template <> -__m256i VectorLoader::extract_right_2ch(const __m256i left) { - return extract_right_2b_(left); -} -template <> -__m256i VectorLoader::extract_right_2ch(const __m256i left) { - return extract_right_2b_(left); -} -template <> -__m256i VectorLoader::extract_right_2ch(const __m256i left) { - return extract_right_4b_(left); -} -template <> -__m256i VectorLoader::extract_right_2ch(const __m256i left) { - return extract_right_4b_(left); -} -template <> -__m256i VectorLoader::extract_right_2ch(const __m256i left) { - return extract_right_8b_(left); -} -template <> -__m256i VectorLoader::extract_right_2ch(const __m256i left) { - return extract_right_4b_(left); -} -template <> -__m256i VectorLoader::extract_right_2ch(const __m256i left) { - return extract_right_4b_(left); -} -template <> -__m256i VectorLoader::extract_right_2ch(const __m256i left) { - return extract_right_8b_(left); -} - -template <> -__m256i VectorLoader::extract_right_3ch(const __m256i left) { - return extract_right_3b_(left); -} -template <> -__m256i VectorLoader::extract_right_3ch(const __m256i left) { - return extract_right_3b_(left); -} -template <> -__m256i VectorLoader::extract_right_3ch(const __m256i left) { - return extract_right_6b_(left); -} -template <> -__m256i VectorLoader::extract_right_3ch(const __m256i left) { - return extract_right_6b_(left); -} -template <> -__m256i VectorLoader::extract_right_3ch(const __m256i left) { - assert(false); -} -template <> -__m256i VectorLoader::extract_right_3ch(const 
__m256i left) { - return extract_right_6b_(left); -} -template <> -__m256i VectorLoader::extract_right_3ch(const __m256i left) { - return extract_right_6b_(left); -} -template <> -__m256i VectorLoader::extract_right_3ch(const __m256i left) { - assert(false); -} - -template <> -__m256i VectorLoader::extract_right_4ch(const __m256i left) { - return extract_right_4b_(left); -} -template <> -__m256i VectorLoader::extract_right_4ch(const __m256i left) { - return extract_right_4b_(left); -} -template <> -__m256i VectorLoader::extract_right_4ch(const __m256i left) { - return extract_right_8b_(left); -} -template <> -__m256i VectorLoader::extract_right_4ch(const __m256i left) { - return extract_right_8b_(left); -} -template <> -__m256i VectorLoader::extract_right_4ch(const __m256i left) { - assert(false); -} -template <> -__m256i VectorLoader::extract_right_4ch(const __m256i left) { - return extract_right_8b_(left); -} -template <> -__m256i VectorLoader::extract_right_4ch(const __m256i left) { - return extract_right_8b_(left); -} -template <> -__m256i VectorLoader::extract_right_4ch(const __m256i left) { - assert(false); -} -#else -template <> -__m128i VectorLoader::extract_right_1ch(const __m128i left) { - return extract_right_1b_(left); -} -template <> -__m128i VectorLoader::extract_right_1ch(const __m128i left) { - return extract_right_1b_(left); -} -template <> -__m128i VectorLoader::extract_right_1ch(const __m128i left) { - return extract_right_2b_(left); -} -template <> -__m128i VectorLoader::extract_right_1ch(const __m128i left) { - return extract_right_2b_(left); -} -template <> -__m128i VectorLoader::extract_right_1ch(const __m128i left) { - return extract_right_4b_(left); -} -template <> -__m128i VectorLoader::extract_right_1ch(const __m128i left) { - return extract_right_2b_(left); -} -template <> -__m128i VectorLoader::extract_right_1ch(const __m128i left) { - return extract_right_2b_(left); -} -template <> -__m128i VectorLoader::extract_right_1ch(const __m128i 
left) { - return extract_right_4b_(left); -} - -template <> -__m128i VectorLoader::extract_right_2ch(const __m128i left) { - return extract_right_2b_(left); -} -template <> -__m128i VectorLoader::extract_right_2ch(const __m128i left) { - return extract_right_2b_(left); -} -template <> -__m128i VectorLoader::extract_right_2ch(const __m128i left) { - return extract_right_4b_(left); -} -template <> -__m128i VectorLoader::extract_right_2ch(const __m128i left) { - return extract_right_4b_(left); -} -template <> -__m128i VectorLoader::extract_right_2ch(const __m128i left) { - return extract_right_8b_(left); -} -template <> -__m128i VectorLoader::extract_right_2ch(const __m128i left) { - return extract_right_4b_(left); -} -template <> -__m128i VectorLoader::extract_right_2ch(const __m128i left) { - return extract_right_4b_(left); -} -template <> -__m128i VectorLoader::extract_right_2ch(const __m128i left) { - return extract_right_8b_(left); -} - -template <> -__m128i VectorLoader::extract_right_3ch(const __m128i left) { - return extract_right_3b_(left); -} -template <> -__m128i VectorLoader::extract_right_3ch(const __m128i left) { - return extract_right_3b_(left); -} -template <> -__m128i VectorLoader::extract_right_3ch(const __m128i left) { - return extract_right_6b_(left); -} -template <> -__m128i VectorLoader::extract_right_3ch(const __m128i left) { - return extract_right_6b_(left); -} -template <> -__m128i VectorLoader::extract_right_3ch(const __m128i left) { - assert(false); -} -template <> -__m128i VectorLoader::extract_right_3ch(const __m128i left) { - return extract_right_6b_(left); -} -template <> -__m128i VectorLoader::extract_right_3ch(const __m128i left) { - return extract_right_6b_(left); -} -template <> -__m128i VectorLoader::extract_right_3ch(const __m128i left) { - assert(false); -} - -template <> -__m128i VectorLoader::extract_right_4ch(const __m128i left) { - return extract_right_4b_(left); -} -template <> -__m128i VectorLoader::extract_right_4ch(const 
__m128i left) { - return extract_right_4b_(left); -} -template <> -__m128i VectorLoader::extract_right_4ch(const __m128i left) { - return extract_right_8b_(left); -} -template <> -__m128i VectorLoader::extract_right_4ch(const __m128i left) { - return extract_right_8b_(left); -} -template <> -__m128i VectorLoader::extract_right_4ch(const __m128i left) { - assert(false); -} -template <> -__m128i VectorLoader::extract_right_4ch(const __m128i left) { - return extract_right_8b_(left); -} -template <> -__m128i VectorLoader::extract_right_4ch(const __m128i left) { - return extract_right_8b_(left); -} -template <> -__m128i VectorLoader::extract_right_4ch(const __m128i left) { - assert(false); -} -#endif - -#ifdef __AVX2__ -template <> -__m256 VectorLoader::to_fp32(__m256i raw) { - raw = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_cvtepu8_epi32(_mm256_castsi256_si128(raw))), - _mm_cvtepu8_epi32(_mm256_extractf128_si256(raw, 1)), 1); - return _mm256_cvtepi32_ps(raw); -} -template <> -__m256 VectorLoader::to_fp32(__m256i raw) { - raw = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_cvtepi8_epi32(_mm256_castsi256_si128(raw))), - _mm_cvtepi8_epi32(_mm256_extractf128_si256(raw, 1)), 1); - return _mm256_cvtepi32_ps(raw); -} -template <> -__m256 VectorLoader::to_fp32(__m256i raw) { - raw = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_cvtepu16_epi32(_mm256_castsi256_si128(raw))), - _mm_cvtepu16_epi32(_mm256_extractf128_si256(raw, 1)), 1); - return _mm256_cvtepi32_ps(raw); -} -template <> -__m256 VectorLoader::to_fp32(__m256i raw) { - raw = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_cvtepi16_epi32(_mm256_castsi256_si128(raw))), - _mm_cvtepi16_epi32(_mm256_extractf128_si256(raw, 1)), 1); - return _mm256_cvtepi32_ps(raw); -} -template <> -__m256 VectorLoader::to_fp32(__m256i raw) { - return _mm256_cvtepi32_ps(raw); -} -template <> -__m256 VectorLoader::to_fp32(__m256i raw) { - return _mm256_insertf128_ps( - 
_mm256_castps128_ps256(_mm_cvtph_ps(_mm256_castsi256_si128(raw))), - _mm_cvtph_ps(_mm256_extractf128_si256(raw, 1)), 1); -} -template <> -__m256 VectorLoader::to_fp32(__m256i raw) { - // bfloat16 is essentially fp32 with mantissa truncated from 23 to 7 bits. - // can convert with << 16, which we fuse with initial shuffle into epi32 - // positions. - __m256i shuf_hi32 = _mm256_setr_epi8( - -128, -128, 0, 1, -128, -128, 2, 3, -128, -128, 4, 5, -128, -128, 6, 7, - -128, -128, 0, 1, -128, -128, 2, 3, -128, -128, 4, 5, -128, -128, 6, 7); - return _mm256_castsi256_ps(_mm256_shuffle_epi8(raw, shuf_hi32)); -} -template <> -__m256 VectorLoader::to_fp32(__m256i raw) { - return _mm256_castsi256_ps(raw); -} -#else -template <> -__m128 VectorLoader::to_fp32(__m128i raw) { - return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(raw)); -} -template <> -__m128 VectorLoader::to_fp32(__m128i raw) { - return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(raw)); -} -template <> -__m128 VectorLoader::to_fp32(__m128i raw) { - return _mm_cvtepi32_ps(_mm_cvtepu16_epi32(raw)); -} -template <> -__m128 VectorLoader::to_fp32(__m128i raw) { - return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(raw)); -} -template <> -__m128 VectorLoader::to_fp32(__m128i raw) { - return _mm_cvtepi32_ps(raw); -} -template <> -__m128 VectorLoader::to_fp32(__m128i raw) { -#ifdef __F16C__ - return _mm_cvtph_ps(raw); -#else - // It is fairly trivial to convert from fp16 to fp32. - // The formats are defined as follows: - // - // fp16 :: 15=sign_bit, 14-10=exponent, 9-0=mantissa :: exp zero offset is 15 - // :: exponent of -15 (all 0) and +16 (all 1) are special numbers. - // fp32 :: 31=sign_bit, 30-23=exponent, 22-0=mantissa :: exp zero offset is - // 127 - // :: exponent of -127 (all 0) and +128 (all 1) are special numbers. - // - // Assuming the fp16 values is stored in the lower 16 bits of an int32 - // 'fp16_val'. - // - // fp16_mantissa = fp16_val & (2^10-1) - // fp32_mantissa = fp16_mantissa << 13 - // - // The exponent is a little trickier. 
- // For normal numbers, the following works: - // fp16_exponent_with_10bit_left_shift = (fp16_val & ((2^5-1)<<10)) - // fp16_exponent_at_msb = fp16_exponent_with_10bit_left_shift << 17 - // The next line shifts in 1's from msb - // fp16_exponent_at_fp32_position = fp16_exponent_at_msb >> 4 - // The next line flips the 3 bits from [msb-1,msb-4] - // fp32_exponent = fp16_exponent_at_fp32_position ^ (7 << 27) - // This breaks for subnormals, nan and infinity. - // The only thing that breaks is the 3bit bit flip, which should - // happen for normal numbers, but should not happen otherwise. - // Since the bit flip can be done with an XOR of all 1's, we - // can make this happen by turning the XOR mask to all zeros - // when the fp16_exponent is either 0 or 31. - // - // ..move 16-bit input words to lower part of 32-bit positions. - __m128i shuf_lo32 = _mm_setr_epi8(0, 1, -128, -128, 2, 3, -128, -128, 4, 5, - -128, -128, 6, 7, -128, -128); - __m128i fp16_val = _mm_shuffle_epi8(raw, shuf_lo32); - // ..extract sign bit - __m128i fp32_sign = - _mm_slli_epi32(_mm_and_si128(fp16_val, _mm_set1_epi32(32768)), 16); - // ..extract fp16_mantissa and shift - __m128i fp16_mantissa = _mm_and_si128(fp16_val, _mm_set1_epi32(1023)); - __m128i fp32_mantissa = _mm_slli_epi32(fp16_mantissa, 13); - // ..extract fp16 exponent shifted 10bits to the left - __m128i fp16_exponent_sl10 = _mm_and_si128(fp16_val, _mm_set1_epi32(31744)); - __m128i fp16_exponent_all1_mask = - _mm_cmpeq_epi32(fp16_exponent_sl10, _mm_set1_epi32(31 << 10)); - __m128i fp16_exponent_all0_mask = - _mm_cmpeq_epi32(fp16_exponent_sl10, _mm_setzero_si128()); - __m128i fp16_denormal_mask = - _mm_or_si128(fp16_exponent_all0_mask, fp16_exponent_all1_mask); - __m128i fp32_exponent_before_xor = - _mm_and_si128(_mm_set1_epi32(2139095040), - _mm_srai_epi32(_mm_slli_epi32(fp16_exponent_sl10, 17), 4)); - __m128i fp32_exponent_xor_mask = - _mm_andnot_si128(fp16_denormal_mask, _mm_set1_epi32(7 << 27)); - __m128i fp32_exponent = - 
_mm_xor_si128(fp32_exponent_xor_mask, fp32_exponent_before_xor); - // ..or everything into one word - __m128i fp32_val = - _mm_or_si128(_mm_or_si128(fp32_sign, fp32_exponent), fp32_mantissa); - return _mm_castsi128_ps(fp32_val); -#endif -} -template <> -__m128 VectorLoader::to_fp32(__m128i raw) { - // bfloat16 is essentially fp32 with mantissa truncated from 23 to 7 bits. - // can convert with << 16, which we fuse with initial shuffle into epi32 - // positions. - __m128i shuf_hi32 = _mm_setr_epi8(-128, -128, 0, 1, -128, -128, 2, 3, -128, - -128, 4, 5, -128, -128, 6, 7); - return _mm_castsi128_ps(_mm_shuffle_epi8(raw, shuf_hi32)); -} -template <> -__m128 VectorLoader::to_fp32(__m128i raw) { - return _mm_castsi128_ps(raw); -} -#endif - -#ifdef __AVX2__ -template -__m256i VectorLoader::extract_right_1b_(const __m256i left) { - return _mm256_srli_si256(left, 1); -} -template -__m256i VectorLoader::extract_right_2b_(const __m256i left) { - return _mm256_srli_si256(left, 2); -} -template -__m256i VectorLoader::extract_right_3b_(const __m256i left) { - return _mm256_srli_si256(left, 3); -} -template -__m256i VectorLoader::extract_right_4b_(const __m256i left) { - return _mm256_srli_si256(left, 4); -} -template -__m256i VectorLoader::extract_right_6b_(const __m256i left) { - return _mm256_srli_si256(left, 6); -} -template -__m256i VectorLoader::extract_right_8b_(const __m256i left) { - return _mm256_srli_si256(left, 8); -} -#else -template -__m128i VectorLoader::extract_right_1b_(const __m128i left) { - return _mm_srli_si128(left, 1); -} -template -__m128i VectorLoader::extract_right_2b_(const __m128i left) { - return _mm_srli_si128(left, 2); -} -template -__m128i VectorLoader::extract_right_3b_(const __m128i left) { - return _mm_srli_si128(left, 3); -} -template -__m128i VectorLoader::extract_right_4b_(const __m128i left) { - return _mm_srli_si128(left, 4); -} -template -__m128i VectorLoader::extract_right_6b_(const __m128i left) { - return _mm_srli_si128(left, 6); -} 
-template -__m128i VectorLoader::extract_right_8b_(const __m128i left) { - return _mm_srli_si128(left, 8); -} -#endif - -#ifdef __AVX2__ -template -void VectorLoader::load1_1ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m256* left0, __m256* right0) { - __m256i raw = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); - *left0 = to_fp32( - _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[0]))); - *right0 = to_fp32( - _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[1]))); -} -template -void VectorLoader::load1_2ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m256* left0, __m256* left1, __m256* right0, - __m256* right1) { - __m256i raw = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); - *left0 = to_fp32( - _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[0]))); - *left1 = to_fp32( - _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[1]))); - *right0 = to_fp32( - _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[2]))); - *right1 = to_fp32( - _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[3]))); -} -template -void VectorLoader::load1_3ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m256* left0, __m256* left1, __m256* left2, - __m256* right0, __m256* right1, - __m256* right2) { - __m256i raw = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); - *left0 = to_fp32( - _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[0]))); - *left1 = to_fp32( - _mm256_shuffle_epi8(raw, 
_mm256_broadcastsi128_si256(shuffle_masks[1]))); - *left2 = to_fp32( - _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[2]))); - *right0 = to_fp32( - _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[3]))); - *right1 = to_fp32( - _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[4]))); - *right2 = to_fp32( - _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[5]))); -} -template -void VectorLoader::load1_4ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m256* left0, __m256* left1, __m256* left2, - __m256* left3, __m256* right0, __m256* right1, - __m256* right2, __m256* right3) { - __m256i raw = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); - *left0 = to_fp32( - _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[0]))); - *left1 = to_fp32( - _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[1]))); - *left2 = to_fp32( - _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[2]))); - *left3 = to_fp32( - _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[3]))); - *right0 = to_fp32( - _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[4]))); - *right1 = to_fp32( - _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[5]))); - *right2 = to_fp32( - _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[6]))); - *right3 = to_fp32( - _mm256_shuffle_epi8(raw, _mm256_broadcastsi128_si256(shuffle_masks[7]))); -} -template -void VectorLoader::load2_1ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m256* left0, __m256* right0) { - __m256i raw1 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); - __m256i raw2 = 
_mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 1)), 1); - __m256i mask = _mm256_broadcastsi128_si256(shuffle_masks[0]); - *left0 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); - *right0 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); -} -template -void VectorLoader::load2_2ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m256* left0, __m256* left1, __m256* right0, - __m256* right1) { - __m256i raw1 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); - __m256i raw2 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 2)), 1); - __m256i mask = _mm256_broadcastsi128_si256(shuffle_masks[0]); - *left0 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); - *right0 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); - mask = _mm256_broadcastsi128_si256(shuffle_masks[1]); - *left1 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); - *right1 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); -} -template -void VectorLoader::load2_3ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m256* left0, __m256* left1, __m256* left2, - __m256* right0, __m256* right1, - __m256* right2) { - __m256i raw1 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); - __m256i raw2 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 3)), 1); - __m256i mask = _mm256_broadcastsi128_si256(shuffle_masks[0]); - *left0 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); - *right0 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); - mask = 
_mm256_broadcastsi128_si256(shuffle_masks[1]); - *left1 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); - *right1 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); - mask = _mm256_broadcastsi128_si256(shuffle_masks[2]); - *left2 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); - *right2 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); -} -template -void VectorLoader::load2_4ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m256* left0, __m256* left1, __m256* left2, - __m256* left3, __m256* right0, __m256* right1, - __m256* right2, __m256* right3) { - __m256i raw1 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); - __m256i raw2 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 4))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 4)), 1); - __m256i mask = _mm256_broadcastsi128_si256(shuffle_masks[0]); - *left0 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); - *right0 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); - mask = _mm256_broadcastsi128_si256(shuffle_masks[1]); - *left1 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); - *right1 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); - mask = _mm256_broadcastsi128_si256(shuffle_masks[2]); - *left2 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); - *right2 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); - mask = _mm256_broadcastsi128_si256(shuffle_masks[3]); - *left3 = to_fp32(_mm256_shuffle_epi8(raw1, mask)); - *right3 = to_fp32(_mm256_shuffle_epi8(raw2, mask)); -} -template -void VectorLoader::load4_1ch(const T* lower_ptr, const T* upper_ptr, - int offset0, int offset1, int offset2, - int offset3, __m256* left0, __m256* right0) { - __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); - __m256i r0 = extract_right_1ch(l0); - __m256i l1, r1; - 
if (offset1 == offset0) { - l1 = l0; - r1 = r0; - } else { - l1 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); - r1 = extract_right_1ch(l1); - } - __m256i l2, r2; - if (offset2 == offset1) { - l2 = l1; - r2 = r1; - } else { - l2 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); - r2 = extract_right_1ch(l2); - } - __m256i l3, r3; - if (offset3 == offset2) { - l3 = l2; - r3 = r2; - } else { - l3 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); - r3 = extract_right_1ch(l3); - } - pack_1ch(&l0, &l1, &l2, &l3); - *left0 = to_fp32(l0); - pack_1ch(&r0, &r1, &r2, &r3); - *right0 = to_fp32(r0); -} -template -void VectorLoader::load4_2ch(const T* lower_ptr, const T* upper_ptr, - int offset0, int offset1, int offset2, - int offset3, __m256* left0, __m256* left1, - __m256* right0, __m256* right1) { - __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); - __m256i r0 = extract_right_2ch(l0); - __m256i l1, r1; - if (offset1 == offset0) { - l1 = l0; - r1 = r0; - } else { - l1 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); - r1 = extract_right_2ch(l1); - } - __m256i l2, r2; - if (offset2 == offset1) { - l2 = l1; - r2 = r1; - } else { - l2 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); - r2 = extract_right_2ch(l2); - } - __m256i l3, r3; - if (offset3 == offset2) { - l3 = l2; - r3 = r2; - } else { - l3 = 
_mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); - r3 = extract_right_2ch(l3); - } - pack_2ch(&l0, &l1, &l2, &l3); - *left0 = to_fp32(l0); - *left1 = to_fp32(l1); - pack_2ch(&r0, &r1, &r2, &r3); - *right0 = to_fp32(r0); - *right1 = to_fp32(r1); -} -template -void VectorLoader::load4_3ch(const T* lower_ptr, const T* upper_ptr, - int offset0, int offset1, int offset2, - int offset3, __m256* left0, __m256* left1, - __m256* left2, __m256* right0, __m256* right1, - __m256* right2) { - __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); - __m256i r0 = extract_right_3ch(l0); - __m256i l1, r1; - if (offset1 == offset0) { - l1 = l0; - r1 = r0; - } else { - l1 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); - r1 = extract_right_3ch(l1); - } - __m256i l2, r2; - if (offset2 == offset1) { - l2 = l1; - r2 = r1; - } else { - l2 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); - r2 = extract_right_3ch(l2); - } - __m256i l3, r3; - if (offset3 == offset2) { - l3 = l2; - r3 = r2; - } else { - l3 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); - r3 = extract_right_3ch(l3); - } - pack_3ch(&l0, &l1, &l2, &l3); - *left0 = to_fp32(l0); - *left1 = to_fp32(l1); - *left2 = to_fp32(l2); - pack_3ch(&r0, &r1, &r2, &r3); - *right0 = to_fp32(r0); - *right1 = to_fp32(r1); - *right2 = to_fp32(r2); -} -template -void VectorLoader::load4_4ch(const T* lower_ptr, const T* upper_ptr, - int offset0, int offset1, int offset2, - int offset3, 
__m256* left0, __m256* left1, - __m256* left2, __m256* left3, __m256* right0, - __m256* right1, __m256* right2, - __m256* right3) { - __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); - __m256i r0 = extract_right_4ch(l0); - __m256i l1, r1; - if (offset1 == offset0) { - l1 = l0; - r1 = r0; - } else { - l1 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); - r1 = extract_right_4ch(l1); - } - __m256i l2, r2; - if (offset2 == offset1) { - l2 = l1; - r2 = r1; - } else { - l2 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); - r2 = extract_right_4ch(l2); - } - __m256i l3, r3; - if (offset3 == offset2) { - l3 = l2; - r3 = r2; - } else { - l3 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); - r3 = extract_right_4ch(l3); - } - *left0 = to_fp32(l0); - *left1 = to_fp32(l1); - *left2 = to_fp32(l2); - *left3 = to_fp32(l3); - *right0 = to_fp32(r0); - *right1 = to_fp32(r1); - *right2 = to_fp32(r2); - *right3 = to_fp32(r3); -} -template -void VectorLoader::load8_1ch(const T* lower_ptr, const T* upper_ptr, - int offset0, int offset1, int offset2, - int offset3, __m256* left0, __m256* right0) { - __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); - __m256i r0 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 1)), 1); - __m256i l1, r1; - if (offset1 == offset0) { - l1 = l0; - r1 = r0; - } else { - l1 = 
_mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); - r1 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 1)), 1); - } - __m256i l2, r2; - if (offset2 == offset1) { - l2 = l1; - r2 = r1; - } else { - l2 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); - r2 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 1)), 1); - } - __m256i l3, r3; - if (offset3 == offset2) { - l3 = l2; - r3 = r2; - } else { - l3 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); - r3 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 1)), 1); - } - pack_1ch(&l0, &l1, &l2, &l3); - *left0 = to_fp32(l0); - pack_1ch(&r0, &r1, &r2, &r3); - *right0 = to_fp32(r0); -} -template -void VectorLoader::load8_2ch(const T* lower_ptr, const T* upper_ptr, - int offset0, int offset1, int offset2, - int offset3, __m256* left0, __m256* left1, - __m256* right0, __m256* right1) { - __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); - __m256i r0 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 2)), 1); - __m256i l1, r1; - if (offset1 == offset0) { - l1 = l0; - r1 = r0; - } else { - l1 = _mm256_insertf128_si256( - _mm256_castsi128_si256( 
- _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); - r1 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 2)), 1); - } - __m256i l2, r2; - if (offset2 == offset1) { - l2 = l1; - r2 = r1; - } else { - l2 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); - r2 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 2)), 1); - } - __m256i l3, r3; - if (offset3 == offset2) { - l3 = l2; - r3 = r2; - } else { - l3 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); - r3 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 2)), 1); - } - pack_2ch(&l0, &l1, &l2, &l3); - *left0 = to_fp32(l0); - *left1 = to_fp32(l1); - pack_2ch(&r0, &r1, &r2, &r3); - *right0 = to_fp32(r0); - *right1 = to_fp32(r1); -} -template -void VectorLoader::load8_3ch(const T* lower_ptr, const T* upper_ptr, - int offset0, int offset1, int offset2, - int offset3, __m256* left0, __m256* left1, - __m256* left2, __m256* right0, __m256* right1, - __m256* right2) { - __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); - __m256i r0 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 3)), 1); - __m256i l1, r1; - if (offset1 == offset0) { - l1 = l0; - r1 = r0; - } else { - l1 = 
_mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); - r1 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 3)), 1); - } - __m256i l2, r2; - if (offset2 == offset1) { - l2 = l1; - r2 = r1; - } else { - l2 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); - r2 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 3)), 1); - } - __m256i l3, r3; - if (offset3 == offset2) { - l3 = l2; - r3 = r2; - } else { - l3 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); - r3 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 3)), 1); - } - pack_3ch(&l0, &l1, &l2, &l3); - *left0 = to_fp32(l0); - *left1 = to_fp32(l1); - *left2 = to_fp32(l2); - pack_3ch(&r0, &r1, &r2, &r3); - *right0 = to_fp32(r0); - *right1 = to_fp32(r1); - *right2 = to_fp32(r2); -} -template -void VectorLoader::load8_4ch(const T* lower_ptr, const T* upper_ptr, - int offset0, int offset1, int offset2, - int offset3, __m256* left0, __m256* left1, - __m256* left2, __m256* left3, __m256* right0, - __m256* right1, __m256* right2, - __m256* right3) { - __m256i l0 = _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(lower_ptr + offset0))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset0)), 1); - __m256i r0 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 4))), - 
_mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 4)), 1); - __m256i l1, r1; - if (offset1 == offset0) { - l1 = l0; - r1 = r0; - } else { - l1 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1)), 1); - r1 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 4))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 4)), 1); - } - __m256i l2, r2; - if (offset2 == offset1) { - l2 = l1; - r2 = r1; - } else { - l2 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2)), 1); - r2 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 4))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 4)), 1); - } - __m256i l3, r3; - if (offset3 == offset2) { - l3 = l2; - r3 = r2; - } else { - l3 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3)), 1); - r3 = _mm256_insertf128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 4))), - _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 4)), 1); - } - *left0 = to_fp32(l0); - *left1 = to_fp32(l1); - *left2 = to_fp32(l2); - *left3 = to_fp32(l3); - *right0 = to_fp32(r0); - *right1 = to_fp32(r1); - *right2 = to_fp32(r2); - *right3 = to_fp32(r3); -} -#else -template -void VectorLoader::load1_1ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m128* tl0, __m128* bl0, __m128* tr0, - __m128* br0) { - __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); - *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); - *bl0 = 
to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); -} -template -void VectorLoader::load1_2ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m128* tl0, __m128* tl1, __m128* bl0, - __m128* bl1, __m128* tr0, __m128* tr1, - __m128* br0, __m128* br1) { - __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); - *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); - *tr1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); - *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); - *br1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); -} -template -void VectorLoader::load1_3ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m128* tl0, __m128* tl1, __m128* tl2, - __m128* bl0, __m128* bl1, __m128* bl2, - __m128* tr0, __m128* tr1, __m128* tr2, - __m128* br0, __m128* br1, __m128* br2) { - __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); - *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - *tl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); - *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); - *tr1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[4])); - *tr2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[5])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); - *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - *bl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); - *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); - *br1 = to_fp32(_mm_shuffle_epi8(raw, 
shuffle_masks[4])); - *br2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[5])); -} -template -void VectorLoader::load1_4ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m128* tl0, __m128* tl1, __m128* tl2, - __m128* tl3, __m128* bl0, __m128* bl1, - __m128* bl2, __m128* bl3, __m128* tr0, - __m128* tr1, __m128* tr2, __m128* tr3, - __m128* br0, __m128* br1, __m128* br2, - __m128* br3) { - __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); - *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - *tl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); - *tl3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); - *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[4])); - *tr1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[5])); - *tr2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[6])); - *tr3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[7])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); - *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - *bl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); - *bl3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); - *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[4])); - *br1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[5])); - *br2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[6])); - *br3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[7])); -} -template -void VectorLoader::load2_1ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m128* tl0, __m128* bl0, __m128* tr0, - __m128* br0) { - __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); - *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 1)); - *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); - *bl0 = 
to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 1)); - *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); -} -template -void VectorLoader::load2_2ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m128* tl0, __m128* tl1, __m128* bl0, - __m128* bl1, __m128* tr0, __m128* tr1, - __m128* br0, __m128* br1) { - __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); - *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 2)); - *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - *tr1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); - *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 2)); - *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - *br1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); -} -template -void VectorLoader::load2_3ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m128* tl0, __m128* tl1, __m128* tl2, - __m128* bl0, __m128* bl1, __m128* bl2, - __m128* tr0, __m128* tr1, __m128* tr2, - __m128* br0, __m128* br1, __m128* br2) { - __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); - *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - *tl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); - raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 3)); - *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - *tr1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - *tr2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); - *bl0 = 
to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - *bl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 3)); - *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - *br1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - *br2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); -} -template -void VectorLoader::load2_4ch(const T* lower_ptr, const T* upper_ptr, - int offset0, const __m128i* shuffle_masks, - __m128* tl0, __m128* tl1, __m128* tl2, - __m128* tl3, __m128* bl0, __m128* bl1, - __m128* bl2, __m128* bl3, __m128* tr0, - __m128* tr1, __m128* tr2, __m128* tr3, - __m128* br0, __m128* br1, __m128* br2, - __m128* br3) { - __m128i raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); - *tl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - *tl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - *tl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); - *tl3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); - raw = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 4)); - *tr0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - *tr1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - *tr2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); - *tr3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); - *bl0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - *bl1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - *bl2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); - *bl3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); - raw = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 4)); - *br0 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[0])); - *br1 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[1])); - *br2 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[2])); - *br3 = to_fp32(_mm_shuffle_epi8(raw, shuffle_masks[3])); -} -template -void VectorLoader::load4_1ch(const 
T* lower_ptr, const T* upper_ptr, - int offset0, int offset1, int offset2, - int offset3, __m128* tl0, __m128* bl0, - __m128* tr0, __m128* br0) { - __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); - __m128i itr0 = extract_right_1ch(itl0); - __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); - __m128i ibr0 = extract_right_1ch(ibl0); - __m128i itl1, itr1; - __m128i ibl1, ibr1; - if (offset1 == offset0) { - itl1 = itl0; - itr1 = itr0; - ibl1 = ibl0; - ibr1 = ibr0; - } else { - itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); - itr1 = extract_right_1ch(itl1); - ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); - ibr1 = extract_right_1ch(ibl1); - } - __m128i itl2, itr2; - __m128i ibl2, ibr2; - if (offset2 == offset1) { - itl2 = itl1; - itr2 = itr1; - ibl2 = ibl1; - ibr2 = ibr1; - } else { - itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); - itr2 = extract_right_1ch(itl2); - ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); - ibr2 = extract_right_1ch(ibl2); - } - __m128i itl3, itr3; - __m128i ibl3, ibr3; - if (offset3 == offset2) { - itl3 = itl2; - itr3 = itr2; - ibl3 = ibl2; - ibr3 = ibr2; - } else { - itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); - itr3 = extract_right_1ch(itl3); - ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); - ibr3 = extract_right_1ch(ibl3); - } - pack_1ch(&itl0, &itl1, &itl2, &itl3); - *tl0 = to_fp32(itl0); - pack_1ch(&itr0, &itr1, &itr2, &itr3); - *tr0 = to_fp32(itr0); - pack_1ch(&ibl0, &ibl1, &ibl2, &ibl3); - *bl0 = to_fp32(ibl0); - pack_1ch(&ibr0, &ibr1, &ibr2, &ibr3); - *br0 = to_fp32(ibr0); -} -template -void VectorLoader::load4_2ch(const T* lower_ptr, const T* upper_ptr, - int offset0, int offset1, int offset2, - int offset3, __m128* tl0, __m128* tl1, - __m128* bl0, __m128* bl1, __m128* tr0, - __m128* tr1, __m128* br0, __m128* br1) { - __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); - __m128i itr0 = extract_right_2ch(itl0); - __m128i ibl0 = 
_mm_loadu_si128((__m128i*)(upper_ptr + offset0)); - __m128i ibr0 = extract_right_2ch(ibl0); - __m128i itl1, itr1; - __m128i ibl1, ibr1; - if (offset1 == offset0) { - itl1 = itl0; - itr1 = itr0; - ibl1 = ibl0; - ibr1 = ibr0; - } else { - itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); - itr1 = extract_right_2ch(itl1); - ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); - ibr1 = extract_right_2ch(ibl1); - } - __m128i itl2, itr2; - __m128i ibl2, ibr2; - if (offset2 == offset1) { - itl2 = itl1; - itr2 = itr1; - ibl2 = ibl1; - ibr2 = ibr1; - } else { - itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); - itr2 = extract_right_2ch(itl2); - ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); - ibr2 = extract_right_2ch(ibl2); - } - __m128i itl3, itr3; - __m128i ibl3, ibr3; - if (offset3 == offset2) { - itl3 = itl2; - itr3 = itr2; - ibl3 = ibl2; - ibr3 = ibr2; - } else { - itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); - itr3 = extract_right_2ch(itl3); - ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); - ibr3 = extract_right_2ch(ibl3); - } - pack_2ch(&itl0, &itl1, &itl2, &itl3); - *tl0 = to_fp32(itl0); - *tl1 = to_fp32(itl1); - pack_2ch(&itr0, &itr1, &itr2, &itr3); - *tr0 = to_fp32(itr0); - *tr1 = to_fp32(itr1); - pack_2ch(&ibl0, &ibl1, &ibl2, &ibl3); - *bl0 = to_fp32(ibl0); - *bl1 = to_fp32(ibl1); - pack_2ch(&ibr0, &ibr1, &ibr2, &ibr3); - *br0 = to_fp32(ibr0); - *br1 = to_fp32(ibr1); -} -template -void VectorLoader::load4_3ch(const T* lower_ptr, const T* upper_ptr, - int offset0, int offset1, int offset2, - int offset3, __m128* tl0, __m128* tl1, - __m128* tl2, __m128* bl0, __m128* bl1, - __m128* bl2, __m128* tr0, __m128* tr1, - __m128* tr2, __m128* br0, __m128* br1, - __m128* br2) { - __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); - __m128i itr0 = extract_right_3ch(itl0); - __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); - __m128i ibr0 = extract_right_3ch(ibl0); - __m128i itl1, itr1; - __m128i 
ibl1, ibr1; - if (offset1 == offset0) { - itl1 = itl0; - itr1 = itr0; - ibl1 = ibl0; - ibr1 = ibr0; - } else { - itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); - itr1 = extract_right_3ch(itl1); - ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); - ibr1 = extract_right_3ch(ibl1); - } - __m128i itl2, itr2; - __m128i ibl2, ibr2; - if (offset2 == offset1) { - itl2 = itl1; - itr2 = itr1; - ibl2 = ibl1; - ibr2 = ibr1; - } else { - itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); - itr2 = extract_right_3ch(itl2); - ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); - ibr2 = extract_right_3ch(ibl2); - } - __m128i itl3, itr3; - __m128i ibl3, ibr3; - if (offset3 == offset2) { - itl3 = itl2; - itr3 = itr2; - ibl3 = ibl2; - ibr3 = ibr2; - } else { - itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); - itr3 = extract_right_3ch(itl3); - ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); - ibr3 = extract_right_3ch(ibl3); - } - pack_3ch(&itl0, &itl1, &itl2, &itl3); - *tl0 = to_fp32(itl0); - *tl1 = to_fp32(itl1); - *tl2 = to_fp32(itl2); - pack_3ch(&itr0, &itr1, &itr2, &itr3); - *tr0 = to_fp32(itr0); - *tr1 = to_fp32(itr1); - *tr2 = to_fp32(itr2); - pack_3ch(&ibl0, &ibl1, &ibl2, &ibl3); - *bl0 = to_fp32(ibl0); - *bl1 = to_fp32(ibl1); - *bl2 = to_fp32(ibl2); - pack_3ch(&ibr0, &ibr1, &ibr2, &ibr3); - *br0 = to_fp32(ibr0); - *br1 = to_fp32(ibr1); - *br2 = to_fp32(ibr2); -} -template -void VectorLoader::load4_4ch(const T* lower_ptr, const T* upper_ptr, - int offset0, int offset1, int offset2, - int offset3, __m128* tl0, __m128* tl1, - __m128* tl2, __m128* tl3, __m128* bl0, - __m128* bl1, __m128* bl2, __m128* bl3, - __m128* tr0, __m128* tr1, __m128* tr2, - __m128* tr3, __m128* br0, __m128* br1, - __m128* br2, __m128* br3) { - __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); - __m128i itr0 = extract_right_4ch(itl0); - __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); - __m128i ibr0 = extract_right_4ch(ibl0); - 
__m128i itl1, itr1; - __m128i ibl1, ibr1; - if (offset1 == offset0) { - itl1 = itl0; - itr1 = itr0; - ibl1 = ibl0; - ibr1 = ibr0; - } else { - itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); - itr1 = extract_right_4ch(itl1); - ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); - ibr1 = extract_right_4ch(ibl1); - } - __m128i itl2, itr2; - __m128i ibl2, ibr2; - if (offset2 == offset1) { - itl2 = itl1; - itr2 = itr1; - ibl2 = ibl1; - ibr2 = ibr1; - } else { - itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); - itr2 = extract_right_4ch(itl2); - ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); - ibr2 = extract_right_4ch(ibl2); - } - __m128i itl3, itr3; - __m128i ibl3, ibr3; - if (offset3 == offset2) { - itl3 = itl2; - itr3 = itr2; - ibl3 = ibl2; - ibr3 = ibr2; - } else { - itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); - itr3 = extract_right_4ch(itl3); - ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); - ibr3 = extract_right_4ch(ibl3); - } - *tl0 = to_fp32(itl0); - *tl1 = to_fp32(itl1); - *tl2 = to_fp32(itl2); - *tl3 = to_fp32(itl3); - *tr0 = to_fp32(itr0); - *tr1 = to_fp32(itr1); - *tr2 = to_fp32(itr2); - *tr3 = to_fp32(itr3); - *bl0 = to_fp32(ibl0); - *bl1 = to_fp32(ibl1); - *bl2 = to_fp32(ibl2); - *bl3 = to_fp32(ibl3); - *br0 = to_fp32(ibr0); - *br1 = to_fp32(ibr1); - *br2 = to_fp32(ibr2); - *br3 = to_fp32(ibr3); -} -template -void VectorLoader::load8_1ch(const T* lower_ptr, const T* upper_ptr, - int offset0, int offset1, int offset2, - int offset3, __m128* tl0, __m128* bl0, - __m128* tr0, __m128* br0) { - __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); - __m128i itr0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 1)); - __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); - __m128i ibr0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 1)); - __m128i itl1, itr1; - __m128i ibl1, ibr1; - if (offset1 == offset0) { - itl1 = itl0; - itr1 = itr0; - ibl1 = ibl0; - ibr1 = ibr0; - } else { - 
itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); - itr1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 1)); - ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); - ibr1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 1)); - } - __m128i itl2, itr2; - __m128i ibl2, ibr2; - if (offset2 == offset1) { - itl2 = itl1; - itr2 = itr1; - ibl2 = ibl1; - ibr2 = ibr1; - } else { - itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); - itr2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 1)); - ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); - ibr2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 1)); - } - __m128i itl3, itr3; - __m128i ibl3, ibr3; - if (offset3 == offset2) { - itl3 = itl2; - itr3 = itr2; - ibl3 = ibl2; - ibr3 = ibr2; - } else { - itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); - itr3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 1)); - ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); - ibr3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 1)); - } - pack_1ch(&itl0, &itl1, &itl2, &itl3); - *tl0 = to_fp32(itl0); - pack_1ch(&itr0, &itr1, &itr2, &itr3); - *tr0 = to_fp32(itr0); - pack_1ch(&ibl0, &ibl1, &ibl2, &ibl3); - *bl0 = to_fp32(ibl0); - pack_1ch(&ibr0, &ibr1, &ibr2, &ibr3); - *br0 = to_fp32(ibr0); -} -template -void VectorLoader::load8_2ch(const T* lower_ptr, const T* upper_ptr, - int offset0, int offset1, int offset2, - int offset3, __m128* tl0, __m128* tl1, - __m128* bl0, __m128* bl1, __m128* tr0, - __m128* tr1, __m128* br0, __m128* br1) { - __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); - __m128i itr0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 2)); - __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); - __m128i ibr0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 2)); - __m128i itl1, itr1; - __m128i ibl1, ibr1; - if (offset1 == offset0) { - itl1 = itl0; - itr1 = itr0; - ibl1 = ibl0; - ibr1 = ibr0; - } else { - itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + 
offset1)); - itr1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 2)); - ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); - ibr1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 2)); - } - __m128i itl2, itr2; - __m128i ibl2, ibr2; - if (offset2 == offset1) { - itl2 = itl1; - itr2 = itr1; - ibl2 = ibl1; - ibr2 = ibr1; - } else { - itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); - itr2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 2)); - ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); - ibr2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 2)); - } - __m128i itl3, itr3; - __m128i ibl3, ibr3; - if (offset3 == offset2) { - itl3 = itl2; - itr3 = itr2; - ibl3 = ibl2; - ibr3 = ibr2; - } else { - itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); - itr3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 2)); - ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); - ibr3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 2)); - } - pack_2ch(&itl0, &itl1, &itl2, &itl3); - *tl0 = to_fp32(itl0); - *tl1 = to_fp32(itl1); - pack_2ch(&itr0, &itr1, &itr2, &itr3); - *tr0 = to_fp32(itr0); - *tr1 = to_fp32(itr1); - pack_2ch(&ibl0, &ibl1, &ibl2, &ibl3); - *bl0 = to_fp32(ibl0); - *bl1 = to_fp32(ibl1); - pack_2ch(&ibr0, &ibr1, &ibr2, &ibr3); - *br0 = to_fp32(ibr0); - *br1 = to_fp32(ibr1); -} -template -void VectorLoader::load8_3ch(const T* lower_ptr, const T* upper_ptr, - int offset0, int offset1, int offset2, - int offset3, __m128* tl0, __m128* tl1, - __m128* tl2, __m128* bl0, __m128* bl1, - __m128* bl2, __m128* tr0, __m128* tr1, - __m128* tr2, __m128* br0, __m128* br1, - __m128* br2) { - __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); - __m128i itr0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 3)); - __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); - __m128i ibr0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 3)); - __m128i itl1, itr1; - __m128i ibl1, ibr1; - if (offset1 == offset0) { - itl1 = 
itl0; - itr1 = itr0; - ibl1 = ibl0; - ibr1 = ibr0; - } else { - itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); - itr1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 3)); - ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); - ibr1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 3)); - } - __m128i itl2, itr2; - __m128i ibl2, ibr2; - if (offset2 == offset1) { - itl2 = itl1; - itr2 = itr1; - ibl2 = ibl1; - ibr2 = ibr1; - } else { - itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); - itr2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 3)); - ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); - ibr2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 3)); - } - __m128i itl3, itr3; - __m128i ibl3, ibr3; - if (offset3 == offset2) { - itl3 = itl2; - itr3 = itr2; - ibl3 = ibl2; - ibr3 = ibr2; - } else { - itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); - itr3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 3)); - ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); - ibr3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 3)); - } - pack_3ch(&itl0, &itl1, &itl2, &itl3); - *tl0 = to_fp32(itl0); - *tl1 = to_fp32(itl1); - *tl2 = to_fp32(itl2); - pack_3ch(&itr0, &itr1, &itr2, &itr3); - *tr0 = to_fp32(itr0); - *tr1 = to_fp32(itr1); - *tr2 = to_fp32(itr2); - pack_3ch(&ibl0, &ibl1, &ibl2, &ibl3); - *bl0 = to_fp32(ibl0); - *bl1 = to_fp32(ibl1); - *bl2 = to_fp32(ibl2); - pack_3ch(&ibr0, &ibr1, &ibr2, &ibr3); - *br0 = to_fp32(ibr0); - *br1 = to_fp32(ibr1); - *br2 = to_fp32(ibr2); -} -template -void VectorLoader::load8_4ch(const T* lower_ptr, const T* upper_ptr, - int offset0, int offset1, int offset2, - int offset3, __m128* tl0, __m128* tl1, - __m128* tl2, __m128* tl3, __m128* bl0, - __m128* bl1, __m128* bl2, __m128* bl3, - __m128* tr0, __m128* tr1, __m128* tr2, - __m128* tr3, __m128* br0, __m128* br1, - __m128* br2, __m128* br3) { - __m128i itl0 = _mm_loadu_si128((__m128i*)(lower_ptr + offset0)); - __m128i itr0 = 
_mm_loadu_si128((__m128i*)(lower_ptr + offset0 + 4)); - __m128i ibl0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0)); - __m128i ibr0 = _mm_loadu_si128((__m128i*)(upper_ptr + offset0 + 4)); - __m128i itl1, itr1; - __m128i ibl1, ibr1; - if (offset1 == offset0) { - itl1 = itl0; - itr1 = itr0; - ibl1 = ibl0; - ibr1 = ibr0; - } else { - itl1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1)); - itr1 = _mm_loadu_si128((__m128i*)(lower_ptr + offset1 + 4)); - ibl1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1)); - ibr1 = _mm_loadu_si128((__m128i*)(upper_ptr + offset1 + 4)); - } - __m128i itl2, itr2; - __m128i ibl2, ibr2; - if (offset2 == offset1) { - itl2 = itl1; - itr2 = itr1; - ibl2 = ibl1; - ibr2 = ibr1; - } else { - itl2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2)); - itr2 = _mm_loadu_si128((__m128i*)(lower_ptr + offset2 + 4)); - ibl2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2)); - ibr2 = _mm_loadu_si128((__m128i*)(upper_ptr + offset2 + 4)); - } - __m128i itl3, itr3; - __m128i ibl3, ibr3; - if (offset3 == offset2) { - itl3 = itl2; - itr3 = itr2; - ibl3 = ibl2; - ibr3 = ibr2; - } else { - itl3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3)); - itr3 = _mm_loadu_si128((__m128i*)(lower_ptr + offset3 + 4)); - ibl3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3)); - ibr3 = _mm_loadu_si128((__m128i*)(upper_ptr + offset3 + 4)); - } - *tl0 = to_fp32(itl0); - *tl1 = to_fp32(itl1); - *tl2 = to_fp32(itl2); - *tl3 = to_fp32(itl3); - *tr0 = to_fp32(itr0); - *tr1 = to_fp32(itr1); - *tr2 = to_fp32(itr2); - *tr3 = to_fp32(itr3); - *bl0 = to_fp32(ibl0); - *bl1 = to_fp32(ibl1); - *bl2 = to_fp32(ibl2); - *bl3 = to_fp32(ibl3); - *br0 = to_fp32(ibr0); - *br1 = to_fp32(ibr1); - *br2 = to_fp32(ibr2); - *br3 = to_fp32(ibr3); -} -#endif - -// -// This class stores 4 pixels with n channels packed into n SSE vector words. -// Pixel values are converted to type U and packed before storage. 
-// Output type U must be one of uint8, int8, uint16, int16, int32, Eigen::half, -// bfloat16 or float. -// - -template -class VectorWriter { - public: - // convert 4 fp32 words to type U with. - // this function calls clip. - // resulting words are packed. - // U must be one of uint8, int8, uint16, int16, int32, Eigen::half, bfloat16 - // or float. - __m128i from_fp32(__m128 vec); - - // converts from fp32 to U by calling method from_fp32(...) - // writes 4 pixels with 1 channel to destination. - void write_1ch(U* destination, __m128* vec); - - // converts from fp32 to U by calling method from_fp32(...) - // writes 4 pixels with 1 channel to destination. - void write_2ch(U* destination, __m128* vec); - - // converts from fp32 to U by calling method from_fp32(...) - // writes 4 pixels with 1 channel to destination. - void write_3ch(U* destination, __m128* vec); - - // converts from fp32 to U by calling method from_fp32(...) - // writes 4 pixels with 1 channel to destination. - void write_4ch(U* destination, __m128* vec); - - private: - // clip 4 fp32 words to prevent overflow when converting to type U. - __m128 clip_(__m128 vec) { - // default is to do nothing, since the packing intrinsics include clipping. 
- return vec; - } - void write_1b_1ch(U* destination, __m128* vec) { - __m128i ivec = from_fp32(vec[0]); - _mm_store_ss((float*)(destination), _mm_castsi128_ps(ivec)); - } - void write_2b_1ch(U* destination, __m128* vec) { - __m128i ivec = from_fp32(vec[0]); - _mm_store_sd((double*)(destination), _mm_castsi128_pd(ivec)); - } - void write_4b_1ch(U* destination, __m128* vec) { - __m128i ivec = from_fp32(vec[0]); - _mm_storeu_si128((__m128i*)(destination), ivec); - } - void write_1b_2ch(U* destination, __m128* vec) { - __m128i ivec1 = from_fp32(vec[0]); - __m128i ivec2 = from_fp32(vec[1]); - __m128i mask = _mm_setr_epi32(-1, 0, 0, 0); - ivec1 = _mm_or_si128(_mm_and_si128(mask, ivec1), - _mm_slli_si128(_mm_and_si128(mask, ivec2), 4)); - _mm_store_sd((double*)(destination), _mm_castsi128_pd(ivec1)); - } - void write_2b_2ch(U* destination, __m128* vec) { - __m128i ivec1 = from_fp32(vec[0]); - __m128i ivec2 = from_fp32(vec[1]); - __m128i mask = _mm_setr_epi32(-1, -1, 0, 0); - ivec1 = _mm_or_si128(_mm_and_si128(mask, ivec1), - _mm_slli_si128(_mm_and_si128(mask, ivec2), 8)); - _mm_storeu_si128((__m128i*)(destination), ivec1); - } - void write_4b_2ch(U* destination, __m128* vec) { - __m128i ivec1 = from_fp32(vec[0]); - __m128i ivec2 = from_fp32(vec[1]); - _mm_storeu_si128((__m128i*)(destination), ivec1); - _mm_storeu_si128((__m128i*)(destination + 4), ivec2); - } - void write_1b_3ch(U* destination, __m128* vec) { - __m128i ivec1 = from_fp32(vec[0]); - __m128i ivec2 = from_fp32(vec[1]); - __m128i mask = _mm_setr_epi32(-1, 0, 0, 0); - ivec1 = _mm_or_si128(_mm_and_si128(mask, ivec1), - _mm_slli_si128(_mm_and_si128(mask, ivec2), 4)); - _mm_store_sd((double*)(destination), _mm_castsi128_pd(ivec1)); - __m128i ivec3 = from_fp32(vec[2]); - _mm_store_ss((float*)(destination + 8), _mm_castsi128_ps(ivec3)); - } - void write_2b_3ch(U* destination, __m128* vec) { - __m128i ivec1 = from_fp32(vec[0]); - __m128i ivec2 = from_fp32(vec[1]); - __m128i mask = _mm_setr_epi32(-1, -1, 0, 0); - 
ivec1 = _mm_or_si128(_mm_and_si128(mask, ivec1), - _mm_slli_si128(_mm_and_si128(mask, ivec2), 8)); - _mm_storeu_si128((__m128i*)(destination), ivec1); - __m128i ivec3 = from_fp32(vec[2]); - _mm_store_sd((double*)(destination + 8), _mm_castsi128_pd(ivec3)); - } - void write_4b_3ch(U* destination, __m128* vec) { - __m128i ivec1 = from_fp32(vec[0]); - __m128i ivec2 = from_fp32(vec[1]); - __m128i ivec3 = from_fp32(vec[2]); - _mm_storeu_si128((__m128i*)(destination), ivec1); - _mm_storeu_si128((__m128i*)(destination + 4), ivec2); - _mm_storeu_si128((__m128i*)(destination + 8), ivec3); - } - void write_1b_4ch(U* destination, __m128* vec) { - __m128i ivec1 = from_fp32(vec[0]); - __m128i ivec2 = from_fp32(vec[1]); - __m128i ivec3 = from_fp32(vec[2]); - __m128i ivec4 = from_fp32(vec[3]); - __m128i mask = _mm_setr_epi32(-1, 0, 0, 0); - __m128i ivec = _mm_and_si128(mask, ivec1); - ivec = _mm_or_si128(ivec, _mm_slli_si128(_mm_and_si128(mask, ivec2), 4)); - ivec = _mm_or_si128(ivec, _mm_slli_si128(_mm_and_si128(mask, ivec3), 8)); - ivec = _mm_or_si128(ivec, _mm_slli_si128(_mm_and_si128(mask, ivec4), 12)); - _mm_storeu_si128((__m128i*)(destination), ivec); - } - void write_2b_4ch(U* destination, __m128* vec) { - __m128i ivec1 = from_fp32(vec[0]); - __m128i ivec2 = from_fp32(vec[1]); - __m128i ivec3 = from_fp32(vec[2]); - __m128i ivec4 = from_fp32(vec[3]); - __m128i mask = _mm_setr_epi32(-1, -1, 0, 0); - __m128i ivec = _mm_and_si128(mask, ivec1); - ivec = _mm_or_si128(ivec, _mm_slli_si128(_mm_and_si128(mask, ivec2), 8)); - _mm_storeu_si128((__m128i*)(destination), ivec); - ivec = _mm_and_si128(mask, ivec3); - ivec = _mm_or_si128(ivec, _mm_slli_si128(_mm_and_si128(mask, ivec4), 8)); - _mm_storeu_si128((__m128i*)(destination + 8), ivec); - } - void write_4b_4ch(U* destination, __m128* vec) { - __m128i ivec1 = from_fp32(vec[0]); - __m128i ivec2 = from_fp32(vec[1]); - __m128i ivec3 = from_fp32(vec[2]); - __m128i ivec4 = from_fp32(vec[3]); - _mm_storeu_si128((__m128i*)(destination), 
ivec1); - _mm_storeu_si128((__m128i*)(destination + 4), ivec2); - _mm_storeu_si128((__m128i*)(destination + 8), ivec3); - _mm_storeu_si128((__m128i*)(destination + 12), ivec4); - } -}; - -template <> -__m128 VectorWriter::clip_(__m128 vec) { - // clip against low limit, -2147483648. - // we round up to nearest number that can be represented as float. - __m128 lt_val = _mm_set1_ps(-2147483520.0f); - __m128 lt_mask = _mm_cmplt_ps(vec, lt_val); - vec = _mm_or_ps(_mm_andnot_ps(lt_mask, vec), _mm_and_ps(lt_mask, lt_val)); - // clip against hight limit, 2147483647. - // we round down to nearest number that can be represented as float. - __m128 gt_val = _mm_set1_ps(2147483520.0f); - __m128 gt_mask = _mm_cmpgt_ps(vec, gt_val); - vec = _mm_or_ps(_mm_andnot_ps(gt_mask, vec), _mm_and_ps(gt_mask, gt_val)); - return vec; -} -template <> -__m128 VectorWriter::clip_(__m128 vec) { - // clip against low limit, -65504.0f; - __m128 lt_val = _mm_set1_ps(-65504.0f); - __m128 lt_mask = _mm_cmplt_ps(vec, lt_val); - vec = _mm_or_ps(_mm_andnot_ps(lt_mask, vec), _mm_and_ps(lt_mask, lt_val)); - // clip against hight limit, 65504.0f. 
- __m128 gt_val = _mm_set1_ps(65504.0f); - __m128 gt_mask = _mm_cmpgt_ps(vec, gt_val); - vec = _mm_or_ps(_mm_andnot_ps(gt_mask, vec), _mm_and_ps(gt_mask, gt_val)); - return vec; -} - -template <> -__m128i VectorWriter::from_fp32(__m128 vec) { - __m128i ivec = _mm_cvttps_epi32(vec); - ivec = _mm_packs_epi32(ivec, ivec); - return _mm_packus_epi16(ivec, ivec); -} -template <> -__m128i VectorWriter::from_fp32(__m128 vec) { - __m128i ivec = _mm_cvttps_epi32(vec); - ivec = _mm_packs_epi32(ivec, ivec); - return _mm_packs_epi16(ivec, ivec); -} -template <> -__m128i VectorWriter::from_fp32(__m128 vec) { - __m128i ivec = _mm_cvttps_epi32(vec); - return _mm_packus_epi32(ivec, ivec); -} -template <> -__m128i VectorWriter::from_fp32(__m128 vec) { - __m128i ivec = _mm_cvttps_epi32(vec); - return _mm_packs_epi32(ivec, ivec); -} -template <> -__m128i VectorWriter::from_fp32(__m128 vec) { - return _mm_cvttps_epi32(clip_(vec)); -} -template <> -__m128i VectorWriter::from_fp32(__m128 vec) { -#ifdef __F16C__ - return _mm_cvtps_ph(vec, _MM_FROUND_TO_ZERO); -#else - // Emulation of _mm_cvtps_ph(vec, _MM_FROUND_TO_ZERO) intrinsic. - // - // fp16 :: 15=sign_bit, 14-10=exponent, 9-0=mantissa :: exp zero offset is 15 - // :: exponent of -15 (all 0) and +16 (all 1) are special numbers. - // fp32 :: 31=sign_bit, 30-23=exponent, 22-0=mantissa :: exp zero offset is - // 127 - // :: exponent of -127 (all 0) and +128 (all 1) are special numbers. 
- // - __m128i hw = _mm_castps_si128(vec); - // ..extract fp32 exponent and mantissa - __m128i fp16_sign_bit_msb = _mm_and_si128(_mm_set1_epi32(-2147483648), hw); - __m128i fp32_exponent_lsb = - _mm_and_si128(_mm_set1_epi32(255), _mm_srli_epi32(hw, 23)); - __m128i fp32_mantissa = _mm_and_si128(_mm_set1_epi32(8388607), hw); - // ..test for NaN - __m128i exponent_ones = - _mm_cmpeq_epi32(fp32_exponent_lsb, _mm_set1_epi32(255)); - __m128i mantissa_zero = _mm_cmpeq_epi32(fp32_mantissa, _mm_setzero_si128()); - __m128i infinity_mask = _mm_and_si128(mantissa_zero, exponent_ones); - // ..have to test for NaN on fp32 bits to avoid converting NaN to infinity - __m128i NaN_mask = _mm_andnot_si128(mantissa_zero, exponent_ones); - // ..compensate for exponent zero offset difference - __m128i fp16_exponent_lsb = - _mm_sub_epi32(fp32_exponent_lsb, _mm_set1_epi32(112)); - // ..clip output if fp16_exponent > 30 - __m128i saturated_mask = _mm_andnot_si128( - exponent_ones, _mm_cmpgt_epi32(fp16_exponent_lsb, _mm_set1_epi32(30))); - // ..generate subnormal number if fp16_exponent == 0 - // ..flush to zero if fp16_exponent < 0 - __m128i subnormal_mask = - _mm_cmpeq_epi32(fp16_exponent_lsb, _mm_setzero_si128()); - __m128i underflow_mask = - _mm_cmplt_epi32(fp16_exponent_lsb, _mm_setzero_si128()); - __m128i fp16_mantissa = _mm_srli_epi32(fp32_mantissa, 13); - // ..handle abnormal values - __m128i normal_number = - _mm_or_si128(_mm_slli_epi32(fp16_exponent_lsb, 10), fp16_mantissa); - __m128i subnormal_number = - _mm_or_si128(_mm_set1_epi32(512), _mm_srli_epi32(fp16_mantissa, 1)); - __m128i saturated_number = _mm_set1_epi32(31743); - __m128i infinity_number = _mm_set1_epi32(31744); - __m128i NaN_number = _mm_set1_epi32(32256); - __m128i number = _mm_andnot_si128(underflow_mask, normal_number); - number = _mm_or_si128(_mm_andnot_si128(subnormal_mask, number), - _mm_and_si128(subnormal_mask, subnormal_number)); - number = _mm_or_si128(_mm_andnot_si128(saturated_mask, number), - 
_mm_and_si128(saturated_mask, saturated_number)); - number = _mm_or_si128(_mm_andnot_si128(infinity_mask, number), - _mm_and_si128(infinity_mask, infinity_number)); - number = _mm_or_si128(_mm_andnot_si128(NaN_mask, number), - _mm_and_si128(NaN_mask, NaN_number)); - // ..or in sign bit - number = _mm_or_si128(fp16_sign_bit_msb, _mm_slli_epi32(number, 16)); - // ..move 16 bit words to lower portion of sse vector; - __m128i shuf_from_hi32 = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -128, -128, - -128, -128, -128, -128, -128, -128); - number = _mm_shuffle_epi8(number, shuf_from_hi32); - return number; -#endif -} -template <> -__m128i VectorWriter::from_fp32(__m128 vec) { - // casting from float to bfloat16 simply means >> 16 - // we do this with a shuffle that also moves everything to lower portion of - // sse vector word - __m128i shuf_from_hi32 = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -128, -128, - -128, -128, -128, -128, -128, -128); - return _mm_shuffle_epi8(_mm_castps_si128(vec), shuf_from_hi32); -} -template <> -__m128i VectorWriter::from_fp32(__m128 vec) { - // nothing to do in this case - return _mm_castps_si128(vec); -} - -template <> -void VectorWriter::write_1ch(uint8* destination, __m128* vec) { - write_1b_1ch(destination, vec); -} -template <> -void VectorWriter::write_1ch(int8* destination, __m128* vec) { - write_1b_1ch(destination, vec); -} -template <> -void VectorWriter::write_1ch(uint16* destination, __m128* vec) { - write_2b_1ch(destination, vec); -} -template <> -void VectorWriter::write_1ch(int16* destination, __m128* vec) { - write_2b_1ch(destination, vec); -} -template <> -void VectorWriter::write_1ch(int32* destination, __m128* vec) { - write_4b_1ch(destination, vec); -} -template <> -void VectorWriter::write_1ch(Eigen::half* destination, - __m128* vec) { - write_2b_1ch(destination, vec); -} -template <> -void VectorWriter::write_1ch(bfloat16* destination, __m128* vec) { - write_2b_1ch(destination, vec); -} -template <> -void 
VectorWriter::write_1ch(float* destination, __m128* vec) { - _mm_storeu_si128((__m128i*)(destination), _mm_castps_si128(vec[0])); -} - -template <> -void VectorWriter::write_2ch(uint8* destination, __m128* vec) { - write_1b_2ch(destination, vec); -} -template <> -void VectorWriter::write_2ch(int8* destination, __m128* vec) { - write_1b_2ch(destination, vec); -} -template <> -void VectorWriter::write_2ch(uint16* destination, __m128* vec) { - write_2b_2ch(destination, vec); -} -template <> -void VectorWriter::write_2ch(int16* destination, __m128* vec) { - write_2b_2ch(destination, vec); -} -template <> -void VectorWriter::write_2ch(int32* destination, __m128* vec) { - write_4b_2ch(destination, vec); -} -template <> -void VectorWriter::write_2ch(Eigen::half* destination, - __m128* vec) { - write_2b_2ch(destination, vec); -} -template <> -void VectorWriter::write_2ch(bfloat16* destination, __m128* vec) { - write_2b_2ch(destination, vec); -} -template <> -void VectorWriter::write_2ch(float* destination, __m128* vec) { - _mm_storeu_si128((__m128i*)(destination), _mm_castps_si128(vec[0])); - _mm_storeu_si128((__m128i*)(destination + 4), _mm_castps_si128(vec[1])); -} - -template <> -void VectorWriter::write_3ch(uint8* destination, __m128* vec) { - write_1b_3ch(destination, vec); -} -template <> -void VectorWriter::write_3ch(int8* destination, __m128* vec) { - write_1b_3ch(destination, vec); -} -template <> -void VectorWriter::write_3ch(uint16* destination, __m128* vec) { - write_2b_3ch(destination, vec); -} -template <> -void VectorWriter::write_3ch(int16* destination, __m128* vec) { - write_2b_3ch(destination, vec); -} -template <> -void VectorWriter::write_3ch(int32* destination, __m128* vec) { - write_4b_3ch(destination, vec); -} -template <> -void VectorWriter::write_3ch(Eigen::half* destination, - __m128* vec) { - write_2b_3ch(destination, vec); -} -template <> -void VectorWriter::write_3ch(bfloat16* destination, __m128* vec) { - write_2b_3ch(destination, vec); -} 
-template <> -void VectorWriter::write_3ch(float* destination, __m128* vec) { - _mm_storeu_si128((__m128i*)(destination), _mm_castps_si128(vec[0])); - _mm_storeu_si128((__m128i*)(destination + 4), _mm_castps_si128(vec[1])); - _mm_storeu_si128((__m128i*)(destination + 8), _mm_castps_si128(vec[2])); -} - -template <> -void VectorWriter::write_4ch(uint8* destination, __m128* vec) { - write_1b_4ch(destination, vec); -} -template <> -void VectorWriter::write_4ch(int8* destination, __m128* vec) { - write_1b_4ch(destination, vec); -} -template <> -void VectorWriter::write_4ch(uint16* destination, __m128* vec) { - write_2b_4ch(destination, vec); -} -template <> -void VectorWriter::write_4ch(int16* destination, __m128* vec) { - write_2b_4ch(destination, vec); -} -template <> -void VectorWriter::write_4ch(int32* destination, __m128* vec) { - write_4b_4ch(destination, vec); -} -template <> -void VectorWriter::write_4ch(Eigen::half* destination, - __m128* vec) { - write_2b_4ch(destination, vec); -} -template <> -void VectorWriter::write_4ch(bfloat16* destination, __m128* vec) { - write_2b_4ch(destination, vec); -} -template <> -void VectorWriter::write_4ch(float* destination, __m128* vec) { - _mm_storeu_si128((__m128i*)(destination), _mm_castps_si128(vec[0])); - _mm_storeu_si128((__m128i*)(destination + 4), _mm_castps_si128(vec[1])); - _mm_storeu_si128((__m128i*)(destination + 8), _mm_castps_si128(vec[2])); - _mm_storeu_si128((__m128i*)(destination + 12), _mm_castps_si128(vec[3])); -} - -template -class CropResizeCastImage : public VectorLoader, public VectorWriter { - public: - CropResizeCastImage(const int in_height, const int in_width, - const int out_height, const int out_width, - const int channels, const int min_ix, const int max_ix, - const CachedInterpolation* xs, const int min_iy, - const int max_iy, const CachedInterpolation* ys, - const float extrapolated_value, const bool flip_x, - const bool flip_y, const bool verbose = false, - const int allowed_load_groups = 15) 
- : verbose_(verbose), - allowed_load_groups_(allowed_load_groups), - in_height_(in_height), - in_width_(in_width), - out_height_(out_height), - out_width_(out_width), - channels_(channels), - min_ix_(min_ix), - max_ix_(max_ix), - min_iy_(min_iy), - max_iy_(max_iy), - ys_(ys), - extrapolated_value_(extrapolated_value), - flip_x_(flip_x), - flip_y_(flip_y), - in_row_size_(in_width * channels), - in_row_size_bytes_(in_width * channels * sizeof(T)), - out_row_size_(out_width * channels), - x0_(flip_x ? out_width - 1 - max_ix : min_ix), - x1_(flip_x ? out_width - 1 - min_ix : max_ix), - y0_(flip_y ? out_height - 1 - max_iy : min_iy), - y1_(flip_y ? out_height - 1 - min_iy : max_iy) { - if (min_ix_ <= max_ix_ && min_iy_ <= max_iy_) { - // copy xs values, but filter out the following: - // xs[].lower == xs[].upper AND xs[].lerp == 0 - // xs[].lower == xs[].upper AND xs[].lerp == 1 - xs_ = new CachedInterpolation[max_ix_ - min_ix_ + 1]; - for (int i = min_ix_; i <= max_ix_; ++i) { - int ix = i - min_ix_; - int xs_lower = xs[ix].lower / channels_; - int xs_upper = xs[ix].upper / channels_; - if (xs_lower == xs_upper) { - if (xs[ix].lerp == 0.0f && xs_lower + 1 < in_width) { - // upper weight is zero - xs_upper = xs_lower + 1; - } else if (xs[ix].lerp == 1.0f && xs_upper - 1 >= 0) { - // lower weight is zero - xs_lower = xs_upper - 1; - } - } - xs_[ix].lower = xs_lower * channels_; - xs_[ix].upper = xs_upper * channels_; - xs_[ix].lerp = xs[ix].lerp; - } - _u_min_val = std::numeric_limits::min(); - _u_max_val = std::numeric_limits::max(); - _f_min_val = static_cast(_u_min_val); - _f_max_val = static_cast(_u_max_val); - Configure_(); - } else { - // crop region outside of input image. - // extrapolation only. 
- general_x_ = NULL; - load1_x_ = NULL; - load2_x_ = NULL; - load4_x_ = NULL; - load8_x_ = NULL; - load1_offsets_ = NULL; - load2_offsets_ = NULL; - load4_offsets_ = NULL; - load8_offsets_ = NULL; - load1_shuffle_masks_ = NULL; - load2_shuffle_masks_ = NULL; - load1_mmxs_lerp_ = NULL; - load2_mmxs_lerp_ = NULL; - load4_mmxs_lerp_ = NULL; - load8_mmxs_lerp_ = NULL; - xs_ = NULL; - } - } - ~CropResizeCastImage() { - if (general_x_ != NULL) delete[] general_x_; - if (load1_x_ != NULL) delete[] load1_x_; - if (load2_x_ != NULL) delete[] load2_x_; - if (load4_x_ != NULL) delete[] load4_x_; - if (load8_x_ != NULL) delete[] load8_x_; - if (load1_offsets_ != NULL) delete[] load1_offsets_; - if (load2_offsets_ != NULL) delete[] load2_offsets_; - if (load4_offsets_ != NULL) delete[] load4_offsets_; - if (load8_offsets_ != NULL) delete[] load8_offsets_; - if (load1_shuffle_masks_ != NULL) delete[] load1_shuffle_masks_; - if (load2_shuffle_masks_ != NULL) delete[] load2_shuffle_masks_; - if (load1_mmxs_lerp_ != NULL) delete[] load1_mmxs_lerp_; - if (load2_mmxs_lerp_ != NULL) delete[] load2_mmxs_lerp_; - if (load4_mmxs_lerp_ != NULL) delete[] load4_mmxs_lerp_; - if (load8_mmxs_lerp_ != NULL) delete[] load8_mmxs_lerp_; - delete[] xs_; - } - - private: - // constructor arguments - const bool verbose_; - // this value is meant for unit testing. - // set this to 15 for normal execution. - // its an OR of flags for the different load group. 
- // 1 -> load4from1 - // 2 -> load4from2 - // 4 -> load4from4 - // 8 -> load4from8 - const int allowed_load_groups_; - const int in_height_, in_width_, out_height_, out_width_; - const int channels_; - const int min_ix_, max_ix_, min_iy_, max_iy_; - const CachedInterpolation* ys_; - CachedInterpolation* xs_; - const float extrapolated_value_; - const bool flip_x_, flip_y_; - // computed arguments - const int in_row_size_; - const int in_row_size_bytes_; - const int out_row_size_; - const int x0_, x1_; - const int y0_, y1_; - - // helper methods - void ResizeRow_load1_1ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load2_1ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load4_1ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load8_1ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load1_2ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load2_2ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load4_2ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load8_2ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load1_3ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load2_3ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load4_3ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void 
ResizeRow_load8_3ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load1_4ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load2_4ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load4_4ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_load8_4ch_(const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - void ResizeRow_general_(const float ys_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr); - - // configuration parameters - int num_general_, num_load1_, num_load2_, num_load4_, num_load8_; - int *load1_offsets_, *load2_offsets_, *load4_offsets_, *load8_offsets_; - int *general_x_, *load1_x_, *load2_x_, *load4_x_, *load8_x_; - __m128i *load1_shuffle_masks_, *load2_shuffle_masks_; - __m128 *load1_mmxs_lerp_, *load2_mmxs_lerp_, *load4_mmxs_lerp_, - *load8_mmxs_lerp_; - float _f_min_val, _f_max_val; - U _u_min_val, _u_max_val; - // configuration methods - void Configure_(); - int DetermineLoadGroup_(const int x); - bool ComputeXIndexRange_(const int x, int* min_xidx, int* max_xidx); - bool Load1_ok_( - const int min_xidx, - const int max_xidx); // xs - pointer to first xs for this load group - bool Load2_ok_( - const int min_xidx, - const int max_xidx); // xs - pointer to first xs for this load group - bool Load4_ok_(const int min_xidx, const int max_xidx); - bool Load8_ok_(const int min_xidx, const int max_xidx); - - public: - // - // public client methods - // - - // convenience function that determines if clipping is necessary - // in order to prevent overflow when casting to the output type U. 
- static bool clip_necessary(); - - // resize image - void Resize(const T* input_image, U* output_image); -}; - -template -void CropResizeCastImage::Resize(const T* input_image, U* output_image) { - // - U uEx = cast_to(extrapolated_value_, _f_min_val, _f_max_val, _u_min_val, - _u_max_val); - // extrapolate top - if (min_iy_ > 0) { - U* p = flip_y_ ? output_image + out_row_size_ * (out_height_ - min_iy_) - : output_image; - int nn = out_row_size_ * min_iy_; - for (int i = 0; i < nn; ++i) p[i] = uEx; - } - // extrapolate bottom - if (max_iy_ < out_height_ - 1) { - U* p = - flip_y_ ? output_image : output_image + out_row_size_ * (max_iy_ + 1); - int nn = out_row_size_ * (out_height_ - 1 - max_iy_); - for (int i = 0; i < nn; ++i) p[i] = uEx; - } - // extrapolate left - if (min_ix_ > 0) { - for (int iy = min_iy_; iy <= max_iy_; ++iy) { - int xx0 = flip_x_ ? (out_width_ - min_ix_) * channels_ : 0; - int nxx = min_ix_ * channels_; - U* p = output_image + xx0 + - out_row_size_ * (flip_y_ ? out_height_ - 1 - iy : iy); - for (int ix = 0; ix < nxx; ++ix) { - p[ix] = uEx; - } - } - } - // extrapolate right - if (max_ix_ < out_width_ - 1) { - for (int iy = min_iy_; iy <= max_iy_; ++iy) { - int xx0 = flip_x_ ? 0 : (max_ix_ + 1) * channels_; - int nxx = (out_width_ - 1 - max_ix_) * channels_; - U* p = output_image + xx0 + - out_row_size_ * (flip_y_ ? out_height_ - 1 - iy : iy); - for (int ix = 0; ix < nxx; ++ix) { - p[ix] = uEx; - } - } - } - // interpolation region - if (min_ix_ <= max_ix_ && min_iy_ <= max_iy_) { - int y = y0_; - for (y = y0_; y + 1 <= y1_; y += 2) { - const int iyA = flip_y_ ? 
out_height_ - 1 - min_iy_ - y : y - min_iy_; - const float yA_lerp = ys_[iyA].lerp; - const __m128 ysA_lerp = _mm_set1_ps(yA_lerp); - const T* ysA_input_lower_ptr = - input_image + ys_[iyA].lower * in_width_ * channels_; - const T* ysA_input_upper_ptr = - input_image + ys_[iyA].upper * in_width_ * channels_; - U* ysA_output_ptr = output_image + y * out_width_ * channels_; - const int iyB = - flip_y_ ? out_height_ - 1 - min_iy_ - (y + 1) : (y + 1) - min_iy_; - const float yB_lerp = ys_[iyB].lerp; - const __m128 ysB_lerp = _mm_set1_ps(yB_lerp); - const T* ysB_input_lower_ptr = - input_image + ys_[iyB].lower * in_width_ * channels_; - const T* ysB_input_upper_ptr = - input_image + ys_[iyB].upper * in_width_ * channels_; - U* ysB_output_ptr = output_image + (y + 1) * out_width_ * channels_; - if (channels_ == 1) { - this->ResizeRow_load1_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load1_1ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load2_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_1ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load4_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_1ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load8_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_1ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - } else if (channels_ == 2) { - this->ResizeRow_load1_2ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load1_2ch_(ysB_lerp, 
ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load2_2ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_2ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load4_2ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_2ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load8_2ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_2ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - } else if (channels_ == 3) { - this->ResizeRow_load1_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load1_3ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load2_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_3ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load4_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_3ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load8_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_3ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - } else if (channels_ == 4) { - this->ResizeRow_load1_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, 
ysA_output_ptr); - this->ResizeRow_load1_4ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load2_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_4ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load4_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_4ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_load8_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_4ch_(ysB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yB_lerp, ysB_input_lower_ptr, - ysB_input_upper_ptr, ysB_output_ptr); - } else { - assert(false); - } - } - for (; y <= y1_; ++y) { - const int iyA = flip_y_ ? out_height_ - 1 - min_iy_ - y : y - min_iy_; - const float yA_lerp = ys_[iyA].lerp; - const __m128 ysA_lerp = _mm_set1_ps(yA_lerp); - const T* ysA_input_lower_ptr = - input_image + ys_[iyA].lower * in_width_ * channels_; - const T* ysA_input_upper_ptr = - input_image + ys_[iyA].upper * in_width_ * channels_; - U* ysA_output_ptr = output_image + y * out_width_ * channels_; - if (channels_ == 1) { - this->ResizeRow_load1_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_1ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - } else if (channels_ == 2) { - this->ResizeRow_load1_2ch_(ysA_lerp, ysA_input_lower_ptr, - 
ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_2ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_2ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_2ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - } else if (channels_ == 3) { - this->ResizeRow_load1_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_3ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - } else if (channels_ == 4) { - this->ResizeRow_load1_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load2_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load4_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_load8_4ch_(ysA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - this->ResizeRow_general_(yA_lerp, ysA_input_lower_ptr, - ysA_input_upper_ptr, ysA_output_ptr); - } else { - assert(false); - } - } - } -} - -template -void CropResizeCastImage::ResizeRow_general_(const float ys_lerp, - const T* ys_input_lower_ptr, - const T* ys_input_upper_ptr, - U* output_y_ptr) { - for (int current = 0; current < num_general_; ++current) { - int x = general_x_[current]; - const int ix = flip_x_ ? 
out_width_ - 1 - min_ix_ - x : x - min_ix_; - const int xs_lower = xs_[ix].lower; - const int xs_upper = xs_[ix].upper; - const float xs_lerp = xs_[ix].lerp; - for (int ichan = 0; ichan < channels_; ++ichan) { - const float top_left0(ys_input_lower_ptr[xs_lower + ichan]); - const float top_right0(ys_input_lower_ptr[xs_upper + ichan]); - const float bottom_left0(ys_input_upper_ptr[xs_lower + ichan]); - const float bottom_right0(ys_input_upper_ptr[xs_upper + ichan]); - float result0 = compute_lerp(top_left0, top_right0, bottom_left0, - bottom_right0, xs_lerp, ys_lerp); - output_y_ptr[x * channels_ + ichan] = - cast_to(result0, _f_min_val, _f_max_val, _u_min_val, _u_max_val); - } - } -} - -#define CHANNELS 1 -// Resize all points that fall in the 'load4from1' group for an entire row of a -// 1 channel image. -template -void CropResizeCastImage::ResizeRow_load1_1ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { - for (int current = 0; current < num_load1_; ++current) { - __m128* mmxs_lerp = - (__m128*)(load1_shuffle_masks_ + current * CHANNELS * 3); - __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; -#ifdef __AVX2__ - __m256 left0, right0; - this->load1_1ch(ysA_input_lower_ptr, ysA_input_upper_ptr, - load1_offsets_[current], shuffle_masks, &left0, &right0); - - __m256 x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); - __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); - - __m128 top0 = _mm256_castps256_ps128(hori0); - __m128 bot0 = _mm256_extractf128_ps(hori0, 1); -#else - __m128 tl0, bl0, tr0, br0; - this->load1_1ch(ysA_input_lower_ptr, ysA_input_upper_ptr, - load1_offsets_[current], shuffle_masks, &tl0, &bl0, &tr0, - &br0); - - __m128 x_lerp = mmxs_lerp[0]; - __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); - __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); -#endif -#ifdef __AVX2__ 
- __m128 res[1]; - res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); - this->write_1ch(ysA_output_ptr + load1_x_[current] * CHANNELS, res); -#else - __m128 res[1]; - res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); - this->write_1ch(ysA_output_ptr + load1_x_[current] * CHANNELS, res); -#endif - } -} -// Resize all points that fall in the 'load4from2' group for an entire row of a -// 1 channel image. -template -void CropResizeCastImage::ResizeRow_load2_1ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { - for (int current = 0; current < num_load2_; ++current) { - __m128* mmxs_lerp = - (__m128*)(load2_shuffle_masks_ + current * CHANNELS * 2); - __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; -#ifdef __AVX2__ - __m256 left0, right0; - this->load2_1ch(ysA_input_lower_ptr, ysA_input_upper_ptr, - load2_offsets_[current], shuffle_masks, &left0, &right0); - - __m256 x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); - __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); - - __m128 top0 = _mm256_castps256_ps128(hori0); - __m128 bot0 = _mm256_extractf128_ps(hori0, 1); -#else - __m128 tl0, bl0, tr0, br0; - this->load2_1ch(ysA_input_lower_ptr, ysA_input_upper_ptr, - load2_offsets_[current], shuffle_masks, &tl0, &bl0, &tr0, - &br0); - - __m128 x_lerp = mmxs_lerp[0]; - __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); - __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); -#endif -#ifdef __AVX2__ - __m128 res[1]; - res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); - this->write_1ch(ysA_output_ptr + load2_x_[current] * CHANNELS, res); -#else - __m128 res[1]; - res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); - this->write_1ch(ysA_output_ptr + load2_x_[current] * CHANNELS, res); -#endif - } -} -// Resize all points that fall in the 
'load4from4' group for an entire row of a -// 1 channel image. -template -void CropResizeCastImage::ResizeRow_load4_1ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { - for (int current = 0; current < num_load4_; ++current) { - __m128* mmxs_lerp = (__m128*)(load4_mmxs_lerp_ + current * CHANNELS); -#ifdef __AVX2__ - __m256 left0, right0; - this->load4_1ch( - ysA_input_lower_ptr, ysA_input_upper_ptr, load4_offsets_[current * 4], - load4_offsets_[current * 4 + 1], load4_offsets_[current * 4 + 2], - load4_offsets_[current * 4 + 3], &left0, &right0); - - __m256 x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); - __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); - - __m128 top0 = _mm256_castps256_ps128(hori0); - __m128 bot0 = _mm256_extractf128_ps(hori0, 1); -#else - __m128 tl0, bl0, tr0, br0; - this->load4_1ch( - ysA_input_lower_ptr, ysA_input_upper_ptr, load4_offsets_[current * 4], - load4_offsets_[current * 4 + 1], load4_offsets_[current * 4 + 2], - load4_offsets_[current * 4 + 3], &tl0, &bl0, &tr0, &br0); - - __m128 x_lerp = mmxs_lerp[0]; - __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); - __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); -#endif -#ifdef __AVX2__ - __m128 res[1]; - res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); - this->write_1ch(ysA_output_ptr + load4_x_[current] * CHANNELS, res); -#else - __m128 res[1]; - res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); - this->write_1ch(ysA_output_ptr + load4_x_[current] * CHANNELS, res); -#endif - } -} -// Resize all points that fall in the 'load4from8' group for an entire row of a -// 1 channel image. 
-template -void CropResizeCastImage::ResizeRow_load8_1ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { - for (int current = 0; current < num_load8_; ++current) { - __m128* mmxs_lerp = (__m128*)(load8_mmxs_lerp_ + current * CHANNELS); -#ifdef __AVX2__ - __m256 left0, right0; - this->load8_1ch( - ysA_input_lower_ptr, ysA_input_upper_ptr, load8_offsets_[current * 4], - load8_offsets_[current * 4 + 1], load8_offsets_[current * 4 + 2], - load8_offsets_[current * 4 + 3], &left0, &right0); - - __m256 x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); - __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); - - __m128 top0 = _mm256_castps256_ps128(hori0); - __m128 bot0 = _mm256_extractf128_ps(hori0, 1); -#else - __m128 tl0, bl0, tr0, br0; - this->load8_1ch( - ysA_input_lower_ptr, ysA_input_upper_ptr, load8_offsets_[current * 4], - load8_offsets_[current * 4 + 1], load8_offsets_[current * 4 + 2], - load8_offsets_[current * 4 + 3], &tl0, &bl0, &tr0, &br0); - - __m128 x_lerp = mmxs_lerp[0]; - __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); - __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); -#endif -#ifdef __AVX2__ - __m128 res[1]; - res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); - this->write_1ch(ysA_output_ptr + load8_x_[current] * CHANNELS, res); -#else - __m128 res[1]; - res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); - this->write_1ch(ysA_output_ptr + load8_x_[current] * CHANNELS, res); -#endif - } -} -#undef CHANNELS - -#define CHANNELS 2 -// Resize all points that fall in the 'load4from1' group for an entire row of a -// 2 channel image. 
-template -void CropResizeCastImage::ResizeRow_load1_2ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { - for (int current = 0; current < num_load1_; ++current) { - __m128* mmxs_lerp = - (__m128*)(load1_shuffle_masks_ + current * CHANNELS * 3); - __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; -#ifdef __AVX2__ - __m256 left0, left1, right0, right1; - this->load1_2ch(ysA_input_lower_ptr, ysA_input_upper_ptr, - load1_offsets_[current], shuffle_masks, &left0, &left1, - &right0, &right1); - - __m256 x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); - __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); - __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); - - __m128 top0 = _mm256_castps256_ps128(hori0); - __m128 bot0 = _mm256_extractf128_ps(hori0, 1); - __m128 top1 = _mm256_castps256_ps128(hori1); - __m128 bot1 = _mm256_extractf128_ps(hori1, 1); -#else - __m128 tl0, tl1, bl0, bl1, tr0, tr1, br0, br1; - this->load1_2ch(ysA_input_lower_ptr, ysA_input_upper_ptr, - load1_offsets_[current], shuffle_masks, &tl0, &tl1, &bl0, - &bl1, &tr0, &tr1, &br0, &br1); - - __m128 x_lerp = mmxs_lerp[0]; - __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); - __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); - x_lerp = mmxs_lerp[1]; - __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); - __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); -#endif -#ifdef __AVX2__ - __m128 res[2]; - res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); - res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); - this->write_2ch(ysA_output_ptr + load1_x_[current] * CHANNELS, res); -#else - __m128 res[2]; - res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, 
_mm_sub_ps(bot0, top0))); - res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); - this->write_2ch(ysA_output_ptr + load1_x_[current] * CHANNELS, res); -#endif - } -} -// Resize all points that fall in the 'load4from2' group for an entire row of a -// 2 channel image. -template -void CropResizeCastImage::ResizeRow_load2_2ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { - for (int current = 0; current < num_load2_; ++current) { - __m128* mmxs_lerp = - (__m128*)(load2_shuffle_masks_ + current * CHANNELS * 2); - __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; -#ifdef __AVX2__ - __m256 left0, left1, right0, right1; - this->load2_2ch(ysA_input_lower_ptr, ysA_input_upper_ptr, - load2_offsets_[current], shuffle_masks, &left0, &left1, - &right0, &right1); - - __m256 x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); - __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); - __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); - - __m128 top0 = _mm256_castps256_ps128(hori0); - __m128 bot0 = _mm256_extractf128_ps(hori0, 1); - __m128 top1 = _mm256_castps256_ps128(hori1); - __m128 bot1 = _mm256_extractf128_ps(hori1, 1); -#else - __m128 tl0, tl1, bl0, bl1, tr0, tr1, br0, br1; - this->load2_2ch(ysA_input_lower_ptr, ysA_input_upper_ptr, - load2_offsets_[current], shuffle_masks, &tl0, &tl1, &bl0, - &bl1, &tr0, &tr1, &br0, &br1); - - __m128 x_lerp = mmxs_lerp[0]; - __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); - __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); - x_lerp = mmxs_lerp[1]; - __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); - __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); -#endif -#ifdef __AVX2__ - 
__m128 res[2]; - res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); - res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); - this->write_2ch(ysA_output_ptr + load2_x_[current] * CHANNELS, res); -#else - __m128 res[2]; - res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); - res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); - this->write_2ch(ysA_output_ptr + load2_x_[current] * CHANNELS, res); -#endif - } -} -// Resize all points that fall in the 'load4from4' group for an entire row of a -// 2 channel image. -template -void CropResizeCastImage::ResizeRow_load4_2ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { - for (int current = 0; current < num_load4_; ++current) { - __m128* mmxs_lerp = (__m128*)(load4_mmxs_lerp_ + current * CHANNELS); -#ifdef __AVX2__ - __m256 left0, left1, right0, right1; - this->load4_2ch( - ysA_input_lower_ptr, ysA_input_upper_ptr, load4_offsets_[current * 4], - load4_offsets_[current * 4 + 1], load4_offsets_[current * 4 + 2], - load4_offsets_[current * 4 + 3], &left0, &left1, &right0, &right1); - - __m256 x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); - __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); - __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); - - __m128 top0 = _mm256_castps256_ps128(hori0); - __m128 bot0 = _mm256_extractf128_ps(hori0, 1); - __m128 top1 = _mm256_castps256_ps128(hori1); - __m128 bot1 = _mm256_extractf128_ps(hori1, 1); -#else - __m128 tl0, tl1, bl0, bl1, tr0, tr1, br0, br1; - this->load4_2ch( - ysA_input_lower_ptr, ysA_input_upper_ptr, load4_offsets_[current * 4], - load4_offsets_[current * 4 + 1], load4_offsets_[current * 4 + 2], - load4_offsets_[current * 4 + 3], &tl0, &tl1, &bl0, &bl1, &tr0, &tr1, - &br0, 
&br1); - - __m128 x_lerp = mmxs_lerp[0]; - __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); - __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); - x_lerp = mmxs_lerp[1]; - __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); - __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); -#endif -#ifdef __AVX2__ - __m128 res[2]; - res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); - res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); - this->write_2ch(ysA_output_ptr + load4_x_[current] * CHANNELS, res); -#else - __m128 res[2]; - res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); - res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); - this->write_2ch(ysA_output_ptr + load4_x_[current] * CHANNELS, res); -#endif - } -} -// Resize all points that fall in the 'load4from8' group for an entire row of a -// 2 channel image. -template -void CropResizeCastImage::ResizeRow_load8_2ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { - for (int current = 0; current < num_load8_; ++current) { - __m128* mmxs_lerp = (__m128*)(load8_mmxs_lerp_ + current * CHANNELS); -#ifdef __AVX2__ - __m256 left0, left1, right0, right1; - this->load8_2ch( - ysA_input_lower_ptr, ysA_input_upper_ptr, load8_offsets_[current * 4], - load8_offsets_[current * 4 + 1], load8_offsets_[current * 4 + 2], - load8_offsets_[current * 4 + 3], &left0, &left1, &right0, &right1); - - __m256 x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); - __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); - __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); - - __m128 top0 = _mm256_castps256_ps128(hori0); - __m128 bot0 = _mm256_extractf128_ps(hori0, 
1); - __m128 top1 = _mm256_castps256_ps128(hori1); - __m128 bot1 = _mm256_extractf128_ps(hori1, 1); -#else - __m128 tl0, tl1, bl0, bl1, tr0, tr1, br0, br1; - this->load8_2ch( - ysA_input_lower_ptr, ysA_input_upper_ptr, load8_offsets_[current * 4], - load8_offsets_[current * 4 + 1], load8_offsets_[current * 4 + 2], - load8_offsets_[current * 4 + 3], &tl0, &tl1, &bl0, &bl1, &tr0, &tr1, - &br0, &br1); - - __m128 x_lerp = mmxs_lerp[0]; - __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); - __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); - x_lerp = mmxs_lerp[1]; - __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); - __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); -#endif -#ifdef __AVX2__ - __m128 res[2]; - res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); - res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); - this->write_2ch(ysA_output_ptr + load8_x_[current] * CHANNELS, res); -#else - __m128 res[2]; - res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); - res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); - this->write_2ch(ysA_output_ptr + load8_x_[current] * CHANNELS, res); -#endif - } -} -#undef CHANNELS - -#define CHANNELS 3 -// Resize all points that fall in the 'load4from1' group for an entire row of a -// 3 channel image. 
-template -void CropResizeCastImage::ResizeRow_load1_3ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { - for (int current = 0; current < num_load1_; ++current) { - __m128* mmxs_lerp = - (__m128*)(load1_shuffle_masks_ + current * CHANNELS * 3); - __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; -#ifdef __AVX2__ - __m256 left0, left1, left2, right0, right1, right2; - this->load1_3ch(ysA_input_lower_ptr, ysA_input_upper_ptr, - load1_offsets_[current], shuffle_masks, &left0, &left1, - &left2, &right0, &right1, &right2); - - __m256 x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); - __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); - __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[2]))); - __m256 hori2 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right2, left2), left2); - - __m128 top0 = _mm256_castps256_ps128(hori0); - __m128 bot0 = _mm256_extractf128_ps(hori0, 1); - __m128 top1 = _mm256_castps256_ps128(hori1); - __m128 bot1 = _mm256_extractf128_ps(hori1, 1); - __m128 top2 = _mm256_castps256_ps128(hori2); - __m128 bot2 = _mm256_extractf128_ps(hori2, 1); -#else - __m128 tl0, tl1, tl2, bl0, bl1, bl2, tr0, tr1, tr2, br0, br1, br2; - this->load1_3ch(ysA_input_lower_ptr, ysA_input_upper_ptr, - load1_offsets_[current], shuffle_masks, &tl0, &tl1, &tl2, - &bl0, &bl1, &bl2, &tr0, &tr1, &tr2, &br0, &br1, &br2); - - __m128 x_lerp = mmxs_lerp[0]; - __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); - __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); - x_lerp = mmxs_lerp[1]; - __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); - __m128 bot1 = _mm_add_ps(bl1, 
_mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); - x_lerp = mmxs_lerp[2]; - __m128 top2 = _mm_add_ps(tl2, _mm_mul_ps(x_lerp, _mm_sub_ps(tr2, tl2))); - __m128 bot2 = _mm_add_ps(bl2, _mm_mul_ps(x_lerp, _mm_sub_ps(br2, bl2))); -#endif -#ifdef __AVX2__ - __m128 res[3]; - res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); - res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); - res[2] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot2, top2), top2); - this->write_3ch(ysA_output_ptr + load1_x_[current] * CHANNELS, res); -#else - __m128 res[3]; - res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); - res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); - res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); - this->write_3ch(ysA_output_ptr + load1_x_[current] * CHANNELS, res); -#endif - } -} -// Resize all points that fall in the 'load4from2' group for an entire row of a -// 3 channel image. -template -void CropResizeCastImage::ResizeRow_load2_3ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { - for (int current = 0; current < num_load2_; ++current) { - __m128* mmxs_lerp = - (__m128*)(load2_shuffle_masks_ + current * CHANNELS * 2); - __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; -#ifdef __AVX2__ - __m256 left0, left1, left2, right0, right1, right2; - this->load2_3ch(ysA_input_lower_ptr, ysA_input_upper_ptr, - load2_offsets_[current], shuffle_masks, &left0, &left1, - &left2, &right0, &right1, &right2); - - __m256 x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); - __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); - __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[2]))); - 
__m256 hori2 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right2, left2), left2); - - __m128 top0 = _mm256_castps256_ps128(hori0); - __m128 bot0 = _mm256_extractf128_ps(hori0, 1); - __m128 top1 = _mm256_castps256_ps128(hori1); - __m128 bot1 = _mm256_extractf128_ps(hori1, 1); - __m128 top2 = _mm256_castps256_ps128(hori2); - __m128 bot2 = _mm256_extractf128_ps(hori2, 1); -#else - __m128 tl0, tl1, tl2, bl0, bl1, bl2, tr0, tr1, tr2, br0, br1, br2; - this->load2_3ch(ysA_input_lower_ptr, ysA_input_upper_ptr, - load2_offsets_[current], shuffle_masks, &tl0, &tl1, &tl2, - &bl0, &bl1, &bl2, &tr0, &tr1, &tr2, &br0, &br1, &br2); - - __m128 x_lerp = mmxs_lerp[0]; - __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); - __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); - x_lerp = mmxs_lerp[1]; - __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); - __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); - x_lerp = mmxs_lerp[2]; - __m128 top2 = _mm_add_ps(tl2, _mm_mul_ps(x_lerp, _mm_sub_ps(tr2, tl2))); - __m128 bot2 = _mm_add_ps(bl2, _mm_mul_ps(x_lerp, _mm_sub_ps(br2, bl2))); -#endif -#ifdef __AVX2__ - __m128 res[3]; - res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); - res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); - res[2] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot2, top2), top2); - this->write_3ch(ysA_output_ptr + load2_x_[current] * CHANNELS, res); -#else - __m128 res[3]; - res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); - res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); - res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); - this->write_3ch(ysA_output_ptr + load2_x_[current] * CHANNELS, res); -#endif - } -} -// Resize all points that fall in the 'load4from4' group for an entire row of a -// 3 channel image. 
-template -void CropResizeCastImage::ResizeRow_load4_3ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { - for (int current = 0; current < num_load4_; ++current) { - __m128* mmxs_lerp = (__m128*)(load4_mmxs_lerp_ + current * CHANNELS); -#ifdef __AVX2__ - __m256 left0, left1, left2, right0, right1, right2; - this->load4_3ch( - ysA_input_lower_ptr, ysA_input_upper_ptr, load4_offsets_[current * 4], - load4_offsets_[current * 4 + 1], load4_offsets_[current * 4 + 2], - load4_offsets_[current * 4 + 3], &left0, &left1, &left2, &right0, - &right1, &right2); - - __m256 x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); - __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); - __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[2]))); - __m256 hori2 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right2, left2), left2); - - __m128 top0 = _mm256_castps256_ps128(hori0); - __m128 bot0 = _mm256_extractf128_ps(hori0, 1); - __m128 top1 = _mm256_castps256_ps128(hori1); - __m128 bot1 = _mm256_extractf128_ps(hori1, 1); - __m128 top2 = _mm256_castps256_ps128(hori2); - __m128 bot2 = _mm256_extractf128_ps(hori2, 1); -#else - __m128 tl0, tl1, tl2, bl0, bl1, bl2, tr0, tr1, tr2, br0, br1, br2; - this->load4_3ch( - ysA_input_lower_ptr, ysA_input_upper_ptr, load4_offsets_[current * 4], - load4_offsets_[current * 4 + 1], load4_offsets_[current * 4 + 2], - load4_offsets_[current * 4 + 3], &tl0, &tl1, &tl2, &bl0, &bl1, &bl2, - &tr0, &tr1, &tr2, &br0, &br1, &br2); - - __m128 x_lerp = mmxs_lerp[0]; - __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); - __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); - x_lerp = mmxs_lerp[1]; - 
__m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); - __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); - x_lerp = mmxs_lerp[2]; - __m128 top2 = _mm_add_ps(tl2, _mm_mul_ps(x_lerp, _mm_sub_ps(tr2, tl2))); - __m128 bot2 = _mm_add_ps(bl2, _mm_mul_ps(x_lerp, _mm_sub_ps(br2, bl2))); -#endif -#ifdef __AVX2__ - __m128 res[3]; - res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); - res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); - res[2] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot2, top2), top2); - this->write_3ch(ysA_output_ptr + load4_x_[current] * CHANNELS, res); -#else - __m128 res[3]; - res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); - res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); - res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); - this->write_3ch(ysA_output_ptr + load4_x_[current] * CHANNELS, res); -#endif - } -} -// Resize all points that fall in the 'load4from8' group for an entire row of a -// 3 channel image. 
-template -void CropResizeCastImage::ResizeRow_load8_3ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { - for (int current = 0; current < num_load8_; ++current) { - __m128* mmxs_lerp = (__m128*)(load8_mmxs_lerp_ + current * CHANNELS); -#ifdef __AVX2__ - __m256 left0, left1, left2, right0, right1, right2; - this->load8_3ch( - ysA_input_lower_ptr, ysA_input_upper_ptr, load8_offsets_[current * 4], - load8_offsets_[current * 4 + 1], load8_offsets_[current * 4 + 2], - load8_offsets_[current * 4 + 3], &left0, &left1, &left2, &right0, - &right1, &right2); - - __m256 x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); - __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); - __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[2]))); - __m256 hori2 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right2, left2), left2); - - __m128 top0 = _mm256_castps256_ps128(hori0); - __m128 bot0 = _mm256_extractf128_ps(hori0, 1); - __m128 top1 = _mm256_castps256_ps128(hori1); - __m128 bot1 = _mm256_extractf128_ps(hori1, 1); - __m128 top2 = _mm256_castps256_ps128(hori2); - __m128 bot2 = _mm256_extractf128_ps(hori2, 1); -#else - __m128 tl0, tl1, tl2, bl0, bl1, bl2, tr0, tr1, tr2, br0, br1, br2; - this->load8_3ch( - ysA_input_lower_ptr, ysA_input_upper_ptr, load8_offsets_[current * 4], - load8_offsets_[current * 4 + 1], load8_offsets_[current * 4 + 2], - load8_offsets_[current * 4 + 3], &tl0, &tl1, &tl2, &bl0, &bl1, &bl2, - &tr0, &tr1, &tr2, &br0, &br1, &br2); - - __m128 x_lerp = mmxs_lerp[0]; - __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); - __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); - x_lerp = mmxs_lerp[1]; - 
__m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); - __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); - x_lerp = mmxs_lerp[2]; - __m128 top2 = _mm_add_ps(tl2, _mm_mul_ps(x_lerp, _mm_sub_ps(tr2, tl2))); - __m128 bot2 = _mm_add_ps(bl2, _mm_mul_ps(x_lerp, _mm_sub_ps(br2, bl2))); -#endif -#ifdef __AVX2__ - __m128 res[3]; - res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); - res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); - res[2] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot2, top2), top2); - this->write_3ch(ysA_output_ptr + load8_x_[current] * CHANNELS, res); -#else - __m128 res[3]; - res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); - res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); - res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); - this->write_3ch(ysA_output_ptr + load8_x_[current] * CHANNELS, res); -#endif - } -} -#undef CHANNELS - -#define CHANNELS 4 -// Resize all points that fall in the 'load4from1' group for an entire row of a -// 4 channel image. 
-template -void CropResizeCastImage::ResizeRow_load1_4ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { - for (int current = 0; current < num_load1_; ++current) { - __m128* mmxs_lerp = - (__m128*)(load1_shuffle_masks_ + current * CHANNELS * 3); - __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; -#ifdef __AVX2__ - __m256 left0, left1, left2, left3, right0, right1, right2, right3; - this->load1_4ch(ysA_input_lower_ptr, ysA_input_upper_ptr, - load1_offsets_[current], shuffle_masks, &left0, &left1, - &left2, &left3, &right0, &right1, &right2, &right3); - - __m256 x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); - __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); - __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[2]))); - __m256 hori2 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right2, left2), left2); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[3]))); - __m256 hori3 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right3, left3), left3); - - __m128 top0 = _mm256_castps256_ps128(hori0); - __m128 bot0 = _mm256_extractf128_ps(hori0, 1); - __m128 top1 = _mm256_castps256_ps128(hori1); - __m128 bot1 = _mm256_extractf128_ps(hori1, 1); - __m128 top2 = _mm256_castps256_ps128(hori2); - __m128 bot2 = _mm256_extractf128_ps(hori2, 1); - __m128 top3 = _mm256_castps256_ps128(hori3); - __m128 bot3 = _mm256_extractf128_ps(hori3, 1); -#else - __m128 tl0, tl1, tl2, tl3, bl0, bl1, bl2, bl3, tr0, tr1, tr2, tr3, br0, br1, - br2, br3; - this->load1_4ch(ysA_input_lower_ptr, ysA_input_upper_ptr, - load1_offsets_[current], shuffle_masks, &tl0, &tl1, &tl2, - &tl3, &bl0, &bl1, &bl2, &bl3, &tr0, &tr1, &tr2, &tr3, &br0, 
- &br1, &br2, &br3); - - __m128 x_lerp = mmxs_lerp[0]; - __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); - __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); - x_lerp = mmxs_lerp[1]; - __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); - __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); - x_lerp = mmxs_lerp[2]; - __m128 top2 = _mm_add_ps(tl2, _mm_mul_ps(x_lerp, _mm_sub_ps(tr2, tl2))); - __m128 bot2 = _mm_add_ps(bl2, _mm_mul_ps(x_lerp, _mm_sub_ps(br2, bl2))); - x_lerp = mmxs_lerp[3]; - __m128 top3 = _mm_add_ps(tl3, _mm_mul_ps(x_lerp, _mm_sub_ps(tr3, tl3))); - __m128 bot3 = _mm_add_ps(bl3, _mm_mul_ps(x_lerp, _mm_sub_ps(br3, bl3))); -#endif -#ifdef __AVX2__ - __m128 res[4]; - res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); - res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); - res[2] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot2, top2), top2); - res[3] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot3, top3), top3); - this->write_4ch(ysA_output_ptr + load1_x_[current] * CHANNELS, res); -#else - __m128 res[4]; - res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); - res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); - res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); - res[3] = _mm_add_ps(top3, _mm_mul_ps(y_lerp, _mm_sub_ps(bot3, top3))); - this->write_4ch(ysA_output_ptr + load1_x_[current] * CHANNELS, res); -#endif - } -} -// Resize all points that fall in the 'load4from2' group for an entire row of a -// 4 channel image. 
-template -void CropResizeCastImage::ResizeRow_load2_4ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { - for (int current = 0; current < num_load2_; ++current) { - __m128* mmxs_lerp = - (__m128*)(load2_shuffle_masks_ + current * CHANNELS * 2); - __m128i* shuffle_masks = (__m128i*)mmxs_lerp + CHANNELS; -#ifdef __AVX2__ - __m256 left0, left1, left2, left3, right0, right1, right2, right3; - this->load2_4ch(ysA_input_lower_ptr, ysA_input_upper_ptr, - load2_offsets_[current], shuffle_masks, &left0, &left1, - &left2, &left3, &right0, &right1, &right2, &right3); - - __m256 x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); - __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); - __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[2]))); - __m256 hori2 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right2, left2), left2); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[3]))); - __m256 hori3 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right3, left3), left3); - - __m128 top0 = _mm256_castps256_ps128(hori0); - __m128 bot0 = _mm256_extractf128_ps(hori0, 1); - __m128 top1 = _mm256_castps256_ps128(hori1); - __m128 bot1 = _mm256_extractf128_ps(hori1, 1); - __m128 top2 = _mm256_castps256_ps128(hori2); - __m128 bot2 = _mm256_extractf128_ps(hori2, 1); - __m128 top3 = _mm256_castps256_ps128(hori3); - __m128 bot3 = _mm256_extractf128_ps(hori3, 1); -#else - __m128 tl0, tl1, tl2, tl3, bl0, bl1, bl2, bl3, tr0, tr1, tr2, tr3, br0, br1, - br2, br3; - this->load2_4ch(ysA_input_lower_ptr, ysA_input_upper_ptr, - load2_offsets_[current], shuffle_masks, &tl0, &tl1, &tl2, - &tl3, &bl0, &bl1, &bl2, &bl3, &tr0, &tr1, &tr2, &tr3, &br0, 
- &br1, &br2, &br3); - - __m128 x_lerp = mmxs_lerp[0]; - __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); - __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); - x_lerp = mmxs_lerp[1]; - __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); - __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); - x_lerp = mmxs_lerp[2]; - __m128 top2 = _mm_add_ps(tl2, _mm_mul_ps(x_lerp, _mm_sub_ps(tr2, tl2))); - __m128 bot2 = _mm_add_ps(bl2, _mm_mul_ps(x_lerp, _mm_sub_ps(br2, bl2))); - x_lerp = mmxs_lerp[3]; - __m128 top3 = _mm_add_ps(tl3, _mm_mul_ps(x_lerp, _mm_sub_ps(tr3, tl3))); - __m128 bot3 = _mm_add_ps(bl3, _mm_mul_ps(x_lerp, _mm_sub_ps(br3, bl3))); -#endif -#ifdef __AVX2__ - __m128 res[4]; - res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); - res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); - res[2] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot2, top2), top2); - res[3] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot3, top3), top3); - this->write_4ch(ysA_output_ptr + load2_x_[current] * CHANNELS, res); -#else - __m128 res[4]; - res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); - res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); - res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); - res[3] = _mm_add_ps(top3, _mm_mul_ps(y_lerp, _mm_sub_ps(bot3, top3))); - this->write_4ch(ysA_output_ptr + load2_x_[current] * CHANNELS, res); -#endif - } -} -// Resize all points that fall in the 'load4from4' group for an entire row of a -// 4 channel image. 
-template -void CropResizeCastImage::ResizeRow_load4_4ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { - for (int current = 0; current < num_load4_; ++current) { - __m128* mmxs_lerp = (__m128*)(load4_mmxs_lerp_ + current * CHANNELS); -#ifdef __AVX2__ - __m256 left0, left1, left2, left3, right0, right1, right2, right3; - this->load4_4ch( - ysA_input_lower_ptr, ysA_input_upper_ptr, load4_offsets_[current * 4], - load4_offsets_[current * 4 + 1], load4_offsets_[current * 4 + 2], - load4_offsets_[current * 4 + 3], &left0, &left1, &left2, &left3, - &right0, &right1, &right2, &right3); - - __m256 x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); - __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); - __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[2]))); - __m256 hori2 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right2, left2), left2); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[3]))); - __m256 hori3 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right3, left3), left3); - - __m128 top0 = _mm256_castps256_ps128(hori0); - __m128 bot0 = _mm256_extractf128_ps(hori0, 1); - __m128 top1 = _mm256_castps256_ps128(hori1); - __m128 bot1 = _mm256_extractf128_ps(hori1, 1); - __m128 top2 = _mm256_castps256_ps128(hori2); - __m128 bot2 = _mm256_extractf128_ps(hori2, 1); - __m128 top3 = _mm256_castps256_ps128(hori3); - __m128 bot3 = _mm256_extractf128_ps(hori3, 1); -#else - __m128 tl0, tl1, tl2, tl3, bl0, bl1, bl2, bl3, tr0, tr1, tr2, tr3, br0, br1, - br2, br3; - this->load4_4ch( - ysA_input_lower_ptr, ysA_input_upper_ptr, load4_offsets_[current * 4], - load4_offsets_[current * 4 + 1], load4_offsets_[current * 4 + 
2], - load4_offsets_[current * 4 + 3], &tl0, &tl1, &tl2, &tl3, &bl0, &bl1, - &bl2, &bl3, &tr0, &tr1, &tr2, &tr3, &br0, &br1, &br2, &br3); - - __m128 x_lerp = mmxs_lerp[0]; - __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); - __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); - x_lerp = mmxs_lerp[1]; - __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); - __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); - x_lerp = mmxs_lerp[2]; - __m128 top2 = _mm_add_ps(tl2, _mm_mul_ps(x_lerp, _mm_sub_ps(tr2, tl2))); - __m128 bot2 = _mm_add_ps(bl2, _mm_mul_ps(x_lerp, _mm_sub_ps(br2, bl2))); - x_lerp = mmxs_lerp[3]; - __m128 top3 = _mm_add_ps(tl3, _mm_mul_ps(x_lerp, _mm_sub_ps(tr3, tl3))); - __m128 bot3 = _mm_add_ps(bl3, _mm_mul_ps(x_lerp, _mm_sub_ps(br3, bl3))); -#endif -#ifdef __AVX2__ - __m128 res[4]; - res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); - res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); - res[2] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot2, top2), top2); - res[3] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot3, top3), top3); - this->write_4ch(ysA_output_ptr + load4_x_[current] * CHANNELS, res); -#else - __m128 res[4]; - res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); - res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); - res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); - res[3] = _mm_add_ps(top3, _mm_mul_ps(y_lerp, _mm_sub_ps(bot3, top3))); - this->write_4ch(ysA_output_ptr + load4_x_[current] * CHANNELS, res); -#endif - } -} -// Resize all points that fall in the 'load4from8' group for an entire row of a -// 4 channel image. 
-template -void CropResizeCastImage::ResizeRow_load8_4ch_( - const __m128 y_lerp, const T* ysA_input_lower_ptr, - const T* ysA_input_upper_ptr, U* ysA_output_ptr) { - for (int current = 0; current < num_load8_; ++current) { - __m128* mmxs_lerp = (__m128*)(load8_mmxs_lerp_ + current * CHANNELS); -#ifdef __AVX2__ - __m256 left0, left1, left2, left3, right0, right1, right2, right3; - this->load8_4ch( - ysA_input_lower_ptr, ysA_input_upper_ptr, load8_offsets_[current * 4], - load8_offsets_[current * 4 + 1], load8_offsets_[current * 4 + 2], - load8_offsets_[current * 4 + 3], &left0, &left1, &left2, &left3, - &right0, &right1, &right2, &right3); - - __m256 x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[0]))); - __m256 hori0 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right0, left0), left0); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[1]))); - __m256 hori1 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right1, left1), left1); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[2]))); - __m256 hori2 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right2, left2), left2); - x_lerp = _mm256_castsi256_ps( - _mm256_broadcastsi128_si256(_mm_castps_si128(mmxs_lerp[3]))); - __m256 hori3 = _mm256_fmadd_ps(x_lerp, _mm256_sub_ps(right3, left3), left3); - - __m128 top0 = _mm256_castps256_ps128(hori0); - __m128 bot0 = _mm256_extractf128_ps(hori0, 1); - __m128 top1 = _mm256_castps256_ps128(hori1); - __m128 bot1 = _mm256_extractf128_ps(hori1, 1); - __m128 top2 = _mm256_castps256_ps128(hori2); - __m128 bot2 = _mm256_extractf128_ps(hori2, 1); - __m128 top3 = _mm256_castps256_ps128(hori3); - __m128 bot3 = _mm256_extractf128_ps(hori3, 1); -#else - __m128 tl0, tl1, tl2, tl3, bl0, bl1, bl2, bl3, tr0, tr1, tr2, tr3, br0, br1, - br2, br3; - this->load8_4ch( - ysA_input_lower_ptr, ysA_input_upper_ptr, load8_offsets_[current * 4], - load8_offsets_[current * 4 + 1], load8_offsets_[current * 4 + 
2], - load8_offsets_[current * 4 + 3], &tl0, &tl1, &tl2, &tl3, &bl0, &bl1, - &bl2, &bl3, &tr0, &tr1, &tr2, &tr3, &br0, &br1, &br2, &br3); - - __m128 x_lerp = mmxs_lerp[0]; - __m128 top0 = _mm_add_ps(tl0, _mm_mul_ps(x_lerp, _mm_sub_ps(tr0, tl0))); - __m128 bot0 = _mm_add_ps(bl0, _mm_mul_ps(x_lerp, _mm_sub_ps(br0, bl0))); - x_lerp = mmxs_lerp[1]; - __m128 top1 = _mm_add_ps(tl1, _mm_mul_ps(x_lerp, _mm_sub_ps(tr1, tl1))); - __m128 bot1 = _mm_add_ps(bl1, _mm_mul_ps(x_lerp, _mm_sub_ps(br1, bl1))); - x_lerp = mmxs_lerp[2]; - __m128 top2 = _mm_add_ps(tl2, _mm_mul_ps(x_lerp, _mm_sub_ps(tr2, tl2))); - __m128 bot2 = _mm_add_ps(bl2, _mm_mul_ps(x_lerp, _mm_sub_ps(br2, bl2))); - x_lerp = mmxs_lerp[3]; - __m128 top3 = _mm_add_ps(tl3, _mm_mul_ps(x_lerp, _mm_sub_ps(tr3, tl3))); - __m128 bot3 = _mm_add_ps(bl3, _mm_mul_ps(x_lerp, _mm_sub_ps(br3, bl3))); -#endif -#ifdef __AVX2__ - __m128 res[4]; - res[0] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot0, top0), top0); - res[1] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot1, top1), top1); - res[2] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot2, top2), top2); - res[3] = _mm_fmadd_ps(y_lerp, _mm_sub_ps(bot3, top3), top3); - this->write_4ch(ysA_output_ptr + load8_x_[current] * CHANNELS, res); -#else - __m128 res[4]; - res[0] = _mm_add_ps(top0, _mm_mul_ps(y_lerp, _mm_sub_ps(bot0, top0))); - res[1] = _mm_add_ps(top1, _mm_mul_ps(y_lerp, _mm_sub_ps(bot1, top1))); - res[2] = _mm_add_ps(top2, _mm_mul_ps(y_lerp, _mm_sub_ps(bot2, top2))); - res[3] = _mm_add_ps(top3, _mm_mul_ps(y_lerp, _mm_sub_ps(bot3, top3))); - this->write_4ch(ysA_output_ptr + load8_x_[current] * CHANNELS, res); -#endif - } -} -#undef CHANNELS - -template -void CropResizeCastImage::Configure_() { - // num_cases[0] = general case - // num_cases[1] = load4from1 - // num_cases[2] = load4from2 - // num_cases[3] = load4from4 - // num_cases[4] = load4from8 - int num_cases[5]; - for (int i = 0; i < 5; ++i) num_cases[i] = 0; - for (int x = x0_; x <= x1_; ++x) { - int load_group = this->DetermineLoadGroup_(x); - 
assert(load_group >= 0 && load_group <= 4); - ++num_cases[load_group]; - // load_group == 0 -> general case, pixel by pixel - // every other value indidcates 1+3 = 4 pixels were processed this iteration - if (load_group > 0) x += 3; - } - num_general_ = num_cases[0]; - num_load1_ = num_cases[1]; - num_load2_ = num_cases[2]; - num_load4_ = num_cases[3]; - num_load8_ = num_cases[4]; - if (num_general_ > 0) { - general_x_ = new int[num_general_]; - } else { - general_x_ = NULL; - } - if (num_load1_ > 0) { - load1_offsets_ = new int[num_load1_]; - load1_shuffle_masks_ = new __m128i[num_load1_ * channels_ * 3]; - load1_mmxs_lerp_ = NULL; // new __m128[num_load1_*channels_]; - load1_x_ = new int[num_load1_]; - } else { - load1_offsets_ = NULL; - load1_shuffle_masks_ = NULL; - load1_mmxs_lerp_ = NULL; - load1_x_ = NULL; - } - if (num_load2_ > 0) { - load2_offsets_ = new int[num_load2_]; - load2_shuffle_masks_ = new __m128i[num_load2_ * channels_ * 2]; - load2_mmxs_lerp_ = NULL; // new __m128[num_load2_*channels_]; - load2_x_ = new int[num_load2_]; - } else { - load2_offsets_ = NULL; - load2_shuffle_masks_ = NULL; - load2_mmxs_lerp_ = NULL; - load2_x_ = NULL; - } - if (num_load4_ > 0) { - load4_offsets_ = new int[num_load4_ * 4]; - load4_mmxs_lerp_ = new __m128[num_load4_ * channels_]; - load4_x_ = new int[num_load4_]; - } else { - load4_offsets_ = NULL; - load4_mmxs_lerp_ = NULL; - load4_x_ = NULL; - } - if (num_load8_ > 0) { - load8_offsets_ = new int[num_load8_ * 4]; - load8_mmxs_lerp_ = new __m128[num_load8_ * channels_]; - load8_x_ = new int[num_load8_]; - } else { - load8_offsets_ = NULL; - load8_mmxs_lerp_ = NULL; - load8_x_ = NULL; - } - for (int i = 0; i < 5; ++i) num_cases[i] = 0; - if (verbose_) { - printf(" load4from1 = %d\n", num_load1_); - printf(" load4from2 = %d\n", num_load2_); - printf(" load4from4 = %d\n", num_load4_); - printf(" load4from8 = %d\n", num_load8_); - printf(" general = %d\n", num_general_); - } - for (int x = x0_; x <= x1_; ++x) { - int 
load_group = DetermineLoadGroup_(x); - assert(load_group >= 0 && load_group <= 4); - int current = num_cases[load_group]; - assert(current >= 0); - if (load_group == 0) { - // general case - assert(current < num_general_); - general_x_[current] = x; - } else if (load_group == 1) { - // load4from1 - assert(current < num_load1_); - load1_x_[current] = x; - int min_xidx, max_xidx; - ComputeXIndexRange_(x, &min_xidx, &max_xidx); - load1_offsets_[current] = min_xidx * channels_; - float* xs_lerp = (float*)(load1_shuffle_masks_ + current * channels_ * 3); - char* shufmasks1 = - (char*)(load1_shuffle_masks_ + current * channels_ * 3 + channels_); - char* shufmasks2 = shufmasks1 + 16 * channels_; - for (int j = 0; j < 32 * channels_; ++j) shufmasks1[j] = -128; - for (int pix = 0; pix < 4; ++pix) { - const int ix = flip_x_ ? out_width_ - 1 - min_ix_ - (x + pix) - : (x + pix) - min_ix_; - float lerp = xs_[ix].lerp; - int widx0 = xs_[ix].lower - - load1_offsets_[current]; // word index within SSE vector - for (int ch = 0; ch < channels_; ++ch) { - int idx = pix * channels_ + ch; - xs_lerp[idx] = lerp; - int shufvec = idx / 4; - int shufidx = idx % 4; - int widx = widx0 + ch; - for (int b = 0; b < sizeof(T); ++b) { - shufmasks1[shufvec * 16 + shufidx * sizeof(T) + b] = - widx * sizeof(T) + b; - shufmasks2[shufvec * 16 + shufidx * sizeof(T) + b] = - (widx + channels_) * sizeof(T) + b; - } - } - } - } else if (load_group == 2) { - // load4from2 - assert(current < num_load2_); - load2_x_[current] = x; - int min_xidx, max_xidx; - ComputeXIndexRange_(x, &min_xidx, &max_xidx); - load2_offsets_[current] = min_xidx * channels_; - float* xs_lerp = (float*)(load2_shuffle_masks_ + current * channels_ * 2); - char* shufmasks1 = - (char*)(load2_shuffle_masks_ + current * channels_ * 2 + channels_); - for (int j = 0; j < 16 * channels_; ++j) shufmasks1[j] = -128; - for (int pix = 0; pix < 4; ++pix) { - const int ix = flip_x_ ? 
out_width_ - 1 - min_ix_ - (x + pix) - : (x + pix) - min_ix_; - float lerp = xs_[ix].lerp; - int widx0 = xs_[ix].lower - - load2_offsets_[current]; // word index within SSE vector - for (int ch = 0; ch < channels_; ++ch) { - int idx = pix * channels_ + ch; - xs_lerp[idx] = lerp; - int shufvec = idx / 4; - int shufidx = idx % 4; - int widx = widx0 + ch; - for (int b = 0; b < sizeof(T); ++b) { - shufmasks1[shufvec * 16 + shufidx * sizeof(T) + b] = - widx * sizeof(T) + b; - } - } - } - } else if (load_group == 3) { - // load4from4 - assert(current < num_load4_); - load4_x_[current] = x; - int* index = load4_offsets_ + current * 4; - float* xs_lerp = (float*)(load4_mmxs_lerp_ + current * channels_); - for (int pix = 0; pix < 4; ++pix) { - const int ix = flip_x_ ? out_width_ - 1 - min_ix_ - (x + pix) - : (x + pix) - min_ix_; - float lerp = xs_[ix].lerp; - index[pix] = xs_[ix].lower; - for (int ch = 0; ch < channels_; ++ch) { - int idx = pix * channels_ + ch; - xs_lerp[idx] = lerp; - } - } - } else if (load_group == 4) { - // load4from8 - assert(current < num_load8_); - load8_x_[current] = x; - int* index = load8_offsets_ + current * 4; - float* xs_lerp = (float*)(load8_mmxs_lerp_ + current * channels_); - for (int pix = 0; pix < 4; ++pix) { - const int ix = flip_x_ ? 
out_width_ - 1 - min_ix_ - (x + pix) - : (x + pix) - min_ix_; - float lerp = xs_[ix].lerp; - index[pix] = xs_[ix].lower; - for (int ch = 0; ch < channels_; ++ch) { - int idx = pix * channels_ + ch; - xs_lerp[idx] = lerp; - } - } - } else { - assert(false); - } - ++num_cases[load_group]; - // load_group == 0 -> general case, pixel by pixel - // every other value indidcates 1+3 = 4 pixels were processed this iteration - if (load_group > 0) x += 3; - } -} - -template -int CropResizeCastImage::DetermineLoadGroup_(const int x) { - int num_remaining = x1_ - x + 1; - if (num_remaining >= 4) { - // at least 4 values left, so theoretically possible to do SSE - int min_xidx, max_xidx; - // Using this-> is necessary in order to avoid compile error: - // "there are no arguments to ‘xxx’ that depend on a template parameter, so - // a declaration of ‘xxx’ must be available" - // This is an issue for all member functions that have only builtin type - // arguments and happens because - // argument dependent lookup is not done for these arguments (so I've been - // told). - if (this->ComputeXIndexRange_(x, &min_xidx, &max_xidx)) { - if ((allowed_load_groups_ & 1) && this->Load1_ok_(min_xidx, max_xidx)) { - return 1; - } else if ((allowed_load_groups_ & 2) && - this->Load2_ok_(min_xidx, max_xidx)) { - return 2; - } else if ((allowed_load_groups_ & 4) && - this->Load4_ok_(min_xidx, max_xidx)) { - return 3; - } else if ((allowed_load_groups_ & 8) && - this->Load8_ok_(min_xidx, max_xidx)) { - return 4; - } else { - return 0; - } - } else { - // assumption xs[i].lower + channels == xs[i].upper NOT true for this - // quintuple. - return 0; - } - } else { - // too few remaining values - return 0; - } -} - -// Compute range of x indexes for xs[0] through xs[3]. -// Returns true if valid (xs[i].lower + channels == xs[i].upper for all pixels). 
-template -bool CropResizeCastImage::ComputeXIndexRange_(const int x, int* min_xidx, - int* max_xidx) { - bool upper_is_lower_plus_one = true; - *min_xidx = 0; - *max_xidx = -1; - for (int pix = 0; pix < 4; ++pix) { - const int ix = - flip_x_ ? out_width_ - 1 - min_ix_ - (x + pix) : (x + pix) - min_ix_; - int curr_xidx = xs_[ix].lower; - if (curr_xidx + channels_ == xs_[ix].upper) { - if (pix == 0) { - *min_xidx = curr_xidx; - *max_xidx = curr_xidx; - } else { - if (curr_xidx < *min_xidx) *min_xidx = curr_xidx; - if (curr_xidx > *max_xidx) *max_xidx = curr_xidx; - } - } else { - upper_is_lower_plus_one = false; - } - } - *min_xidx /= channels_; - *max_xidx /= channels_; - return upper_is_lower_plus_one; -} - -// This method returns true if it is possible to do load4from1 -// for the load group pointed to by xs. -template -bool CropResizeCastImage::Load1_ok_(const int min_xidx, - const int max_xidx) { - // num_pixels_to_load_left_input = max_xs_low - min_xs_low + 1 - // num_pixels_to_load_left_and_right_input = num_pixels_to_load_left_input + 1 - int total_load_bytes = (max_xidx - min_xidx + 2) * channels_ * sizeof(T); - if (total_load_bytes <= 16) { - // a single (mis-aligned) SSE word gives us all the inputs - // ensure that SSE word can be loaded without causing SEGV - int load_offset = min_xidx * channels_; - int load_offset_bytes = load_offset * sizeof(T); - if (in_row_size_bytes_ - load_offset_bytes >= 16) { - return true; - } else { - return false; - } - } else { - return false; - } -} - -// This method returns true if it is possible to do load4from2 -// for the load group pointed to by xs. 
-template -bool CropResizeCastImage::Load2_ok_(const int min_xidx, - const int max_xidx) { - // num_pixels_to_load_left_input = max_xs_low - min_xs_low + 1 - int total_load_bytes = (max_xidx - min_xidx + 1) * channels_ * sizeof(T); - if (total_load_bytes <= 16) { - // a single (mis-aligned) SSE word gives us all the inputs - // ensure that SSE word can be loaded without causing SEGV - int load_offset = (min_xidx + 1) * channels_; - int load_offset_bytes = load_offset * sizeof(T); - if (in_row_size_bytes_ - load_offset_bytes >= 16) { - return true; - } else { - return false; - } - } else { - return false; - } -} - -// This method returns true if it is possible to do load4from4 -// for the load group pointed to by xs. -template -bool CropResizeCastImage::Load4_ok_(const int min_xidx, - const int max_xidx) { - int total_load_bytes = 2 * channels_ * sizeof(T); - if (total_load_bytes <= 16) { - // ensure that SSE word can be loaded without causing SEGV - int load_offset = max_xidx * channels_; - int load_offset_bytes = load_offset * sizeof(T); - if (in_row_size_bytes_ - load_offset_bytes >= 16) { - return true; - } else { - return false; - } - } else { - return false; - } -} - -// This method returns true if it is possible to do load4from8 -// for the load group pointed to by xs. 
-template -bool CropResizeCastImage::Load8_ok_(const int min_xidx, - const int max_xidx) { - int total_load_bytes = channels_ * sizeof(T); - if (total_load_bytes <= 16) { - // ensure that SSE word can be loaded without causing SEGV - int load_offset = (max_xidx + 1) * channels_; - int load_offset_bytes = load_offset * sizeof(T); - if (in_row_size_bytes_ - load_offset_bytes >= 16) { - return true; - } else { - return false; - } - } else { - return false; - } -} - -// -// full implementations of templated static member function clip_necessary() -// - -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} - -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} - -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool 
CropResizeCastImage::clip_necessary() { - return false; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} - -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} - -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} - -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} - -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return true; 
-} -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return true; -} -template <> -bool CropResizeCastImage::clip_necessary() { - return false; -} - -// full specializations of crop_resize_single_image_common for data types that -// have vectorized implementations. -// at the moment, this is uint8, int8, uint16, int16, int32, Eigen::half, -// bfloat16 and float. - -#define CROP_RESIZE_SINGLE_IMAGE_VECT(T_type, U_type) \ - template <> \ - void crop_resize_single_image_common( \ - const T_type* image, const int64 in_height, const int64 in_width, \ - const int64 out_height, const int64 out_width, const int channels, \ - const int min_ix, const int max_ix, const CachedInterpolation* xs, \ - const int min_iy, const int max_iy, const CachedInterpolation* ys, \ - const float extrapolated_value, const bool flip_x, const bool flip_y, \ - U_type* output) { \ - if (channels <= 4) { \ - CropResizeCastImage* resizer = \ - new CropResizeCastImage( \ - in_height, in_width, out_height, out_width, channels, min_ix, \ - max_ix, xs, min_iy, max_iy, ys, extrapolated_value, flip_x, \ - flip_y, false, 15); \ - resizer->Resize(image, output); \ - delete resizer; \ - } else { \ - crop_resize_single_image(image, in_height, in_width, out_height, \ - out_width, channels, min_ix, max_ix, xs, \ - min_iy, max_iy, ys, extrapolated_value, flip_x, \ - flip_y, output); \ - } \ - } - -CROP_RESIZE_SINGLE_IMAGE_VECT(uint8, float) -CROP_RESIZE_SINGLE_IMAGE_VECT(int8, float) -CROP_RESIZE_SINGLE_IMAGE_VECT(uint16, float) -CROP_RESIZE_SINGLE_IMAGE_VECT(int16, float) -CROP_RESIZE_SINGLE_IMAGE_VECT(int32, float) -CROP_RESIZE_SINGLE_IMAGE_VECT(Eigen::half, float) -CROP_RESIZE_SINGLE_IMAGE_VECT(bfloat16, float) -CROP_RESIZE_SINGLE_IMAGE_VECT(float, float) - -// full 
specializations of crop_resize_single_image_common for data types that -// don't have vectorized implementations. -// image resizing for these data types default to the original code. -// at the moment, this is int64 and double. - -#define CROP_RESIZE_SINGLE_IMAGE_REGULAR(T_type, U_type) \ - template <> \ - void crop_resize_single_image_common( \ - const T_type* image, const int64 in_height, const int64 in_width, \ - const int64 out_height, const int64 out_width, const int channels, \ - const int min_ix, const int max_ix, const CachedInterpolation* xs, \ - const int min_iy, const int max_iy, const CachedInterpolation* ys, \ - const float extrapolated_value, const bool flip_x, const bool flip_y, \ - U_type* output) { \ - crop_resize_single_image(image, in_height, in_width, out_height, \ - out_width, channels, min_ix, max_ix, xs, min_iy, \ - max_iy, ys, extrapolated_value, flip_x, flip_y, \ - output); \ - } - -CROP_RESIZE_SINGLE_IMAGE_REGULAR(int64, float) -CROP_RESIZE_SINGLE_IMAGE_REGULAR(double, float) - -#else - -// compile fall-back code if either -// a) target is not a linux machine -// b) target architecture does not support at least SSE4.1 - -template -void crop_resize_single_image_common( - const T* image, const int64 in_height, const int64 in_width, - const int64 out_height, const int64 out_width, const int channels, - const int min_ix, const int max_ix, const CachedInterpolation* xs, - const int min_iy, const int max_iy, const CachedInterpolation* ys, - const float extrapolated_value, const bool flip_x, const bool flip_y, - U* output) { - crop_resize_single_image(image, in_height, in_width, out_height, out_width, - channels, min_ix, max_ix, xs, min_iy, max_iy, ys, - extrapolated_value, flip_x, flip_y, output); -} - -#endif - -} // namespace -} // namespace tensorflow -#endif // define TENSORFLOW_CORE_KERNELS_CROP_RESIZE_BILINEAR_CORE_H_ diff --git a/tensorflow/core/kernels/resize_bilinear_op.cc b/tensorflow/core/kernels/resize_bilinear_op.cc index 
433a0d38ddb..f10c9a19a7f 100644 --- a/tensorflow/core/kernels/resize_bilinear_op.cc +++ b/tensorflow/core/kernels/resize_bilinear_op.cc @@ -25,7 +25,6 @@ limitations under the License. #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" -#include "tensorflow/core/kernels/crop_resize_bilinear_core.h" #include "tensorflow/core/kernels/image_resizer_state.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" @@ -64,6 +63,140 @@ class ResizeBilinearOp : public OpKernel { bool align_corners_; }; +namespace { +// Compute the interpolation indices only once. +struct CachedInterpolation { + int64 lower; // Lower source index used in the interpolation + int64 upper; // Upper source index used in the interpolation + // 1-D linear iterpolation scale (see: + // https://en.wikipedia.org/wiki/Bilinear_interpolation) + float lerp; +}; + +inline void compute_interpolation_weights(const int64 out_size, + const int64 in_size, + const float scale, + CachedInterpolation* interpolation) { + interpolation[out_size].lower = 0; + interpolation[out_size].upper = 0; + for (int64 i = out_size - 1; i >= 0; --i) { + const float in = i * scale; + interpolation[i].lower = static_cast(in); + interpolation[i].upper = std::min(interpolation[i].lower + 1, in_size - 1); + interpolation[i].lerp = in - interpolation[i].lower; + } +} + +/** + * Computes the bilinear interpolation from the appropriate 4 float points + * and the linear interpolation weights. 
+ */ +inline float compute_lerp(const float top_left, const float top_right, + const float bottom_left, const float bottom_right, + const float x_lerp, const float y_lerp) { + const float top = top_left + (top_right - top_left) * x_lerp; + const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp; + return top + (bottom - top) * y_lerp; +} + +template +void resize_image( + typename TTypes::ConstTensor images, const int batch_size, + const int64 in_height, const int64 in_width, const int64 out_height, + const int64 out_width, const int channels, + const std::vector& xs, + const std::vector& ys, + typename TTypes::Tensor output) TF_ATTRIBUTE_NOINLINE; +template +void resize_image(typename TTypes::ConstTensor images, + const int batch_size, const int64 in_height, + const int64 in_width, const int64 out_height, + const int64 out_width, const int channels, + const std::vector& xs_vec, + const std::vector& ys, + typename TTypes::Tensor output) { + const int64 in_row_size = in_width * channels; + const int64 in_batch_num_values = in_height * in_row_size; + const int64 out_row_size = out_width * channels; + + const T* input_b_ptr = images.data(); + const CachedInterpolation* xs = xs_vec.data(); + + if (channels == 3) { + float* output_y_ptr = output.data(); + for (int b = 0; b < batch_size; ++b) { + for (int64 y = 0; y < out_height; ++y) { + const T* ys_input_lower_ptr = input_b_ptr + ys[y].lower * in_row_size; + const T* ys_input_upper_ptr = input_b_ptr + ys[y].upper * in_row_size; + const float ys_lerp = ys[y].lerp; + for (int64 x = 0; x < out_width; ++x) { + const int64 xs_lower = xs[x].lower; + const int64 xs_upper = xs[x].upper; + const float xs_lerp = xs[x].lerp; + + // Read channel 0. + const float top_left0(ys_input_lower_ptr[xs_lower + 0]); + const float top_right0(ys_input_lower_ptr[xs_upper + 0]); + const float bottom_left0(ys_input_upper_ptr[xs_lower + 0]); + const float bottom_right0(ys_input_upper_ptr[xs_upper + 0]); + + // Read channel 1. 
+ const float top_left1(ys_input_lower_ptr[xs_lower + 1]); + const float top_right1(ys_input_lower_ptr[xs_upper + 1]); + const float bottom_left1(ys_input_upper_ptr[xs_lower + 1]); + const float bottom_right1(ys_input_upper_ptr[xs_upper + 1]); + + // Read channel 2. + const float top_left2(ys_input_lower_ptr[xs_lower + 2]); + const float top_right2(ys_input_lower_ptr[xs_upper + 2]); + const float bottom_left2(ys_input_upper_ptr[xs_lower + 2]); + const float bottom_right2(ys_input_upper_ptr[xs_upper + 2]); + + // Compute output. + output_y_ptr[x * channels + 0] = + compute_lerp(top_left0, top_right0, bottom_left0, bottom_right0, + xs_lerp, ys_lerp); + output_y_ptr[x * channels + 1] = + compute_lerp(top_left1, top_right1, bottom_left1, bottom_right1, + xs_lerp, ys_lerp); + output_y_ptr[x * channels + 2] = + compute_lerp(top_left2, top_right2, bottom_left2, bottom_right2, + xs_lerp, ys_lerp); + } + output_y_ptr += out_row_size; + } + input_b_ptr += in_batch_num_values; + } + } else { + float* output_y_ptr = output.data(); + for (int b = 0; b < batch_size; ++b) { + for (int64 y = 0; y < out_height; ++y) { + const T* ys_input_lower_ptr = input_b_ptr + ys[y].lower * in_row_size; + const T* ys_input_upper_ptr = input_b_ptr + ys[y].upper * in_row_size; + const float ys_lerp = ys[y].lerp; + for (int64 x = 0; x < out_width; ++x) { + auto xs_lower = xs[x].lower; + auto xs_upper = xs[x].upper; + auto xs_lerp = xs[x].lerp; + for (int c = 0; c < channels; ++c) { + const float top_left(ys_input_lower_ptr[xs_lower + c]); + const float top_right(ys_input_lower_ptr[xs_upper + c]); + const float bottom_left(ys_input_upper_ptr[xs_lower + c]); + const float bottom_right(ys_input_upper_ptr[xs_upper + c]); + output_y_ptr[x * channels + c] = + compute_lerp(top_left, top_right, bottom_left, bottom_right, + xs_lerp, ys_lerp); + } + } + output_y_ptr += out_row_size; + } + input_b_ptr += in_batch_num_values; + } + } +} + +} // namespace + // Partial specialization of ResizeBilinear functor 
for a CPUDevice. namespace functor { template @@ -79,24 +212,18 @@ struct ResizeBilinear { const int64 out_height = output.dimension(1); const int64 out_width = output.dimension(2); - const int64 in_row_size = in_width * channels; - const int64 in_batch_num_values = in_height * in_row_size; - const int64 out_row_size = out_width * channels; - const int64 out_batch_num_values = out_row_size * out_height; - // Handle no-op resizes efficiently. if (out_height == in_height && out_width == in_width) { output = images.template cast(); return; } + std::vector ys(out_height + 1); + std::vector xs(out_width + 1); + // Compute the cached interpolation weights on the x and y dimensions. - std::vector ys; - ys.resize(out_height + 1); compute_interpolation_weights(out_height, in_height, height_scale, ys.data()); - std::vector xs; - xs.resize(out_width + 1); compute_interpolation_weights(out_width, in_width, width_scale, xs.data()); // Scale x interpolation weights to avoid a multiplication during iteration. 
@@ -105,14 +232,8 @@ struct ResizeBilinear { xs[i].upper *= channels; } - for (int b = 0; b < batch_size; ++b) { - crop_resize_single_image_common( - images.data() + (int64)b * in_batch_num_values, in_height, in_width, - out_height, out_width, channels, 0, out_width - 1, xs.data(), 0, - out_height - 1, ys.data(), 0.0f, false, false, - output.data() + (int64)b * out_batch_num_values); - } - // xs and ys are freed when they go out of scope + resize_image(images, batch_size, in_height, in_width, out_height, + out_width, channels, xs, ys, output); } }; } // namespace functor diff --git a/tensorflow/core/kernels/resize_bilinear_op_test.cc b/tensorflow/core/kernels/resize_bilinear_op_test.cc index 55e1d2e1e22..6d578928285 100644 --- a/tensorflow/core/kernels/resize_bilinear_op_test.cc +++ b/tensorflow/core/kernels/resize_bilinear_op_test.cc @@ -122,7 +122,7 @@ class ResizeBilinearOpTest : public OpsTestBase { TensorShape({batch_size, output_width, output_height, channels}))); ResizeBilinearBaseline(input->tensor(), expected->tensor()); - test::ExpectClose(*expected, *GetOutput(0)); + test::ExpectTensorEqual(*expected, *GetOutput(0)); } void RunManyRandomTests(int channels) { From b795287fc1642ad6c1ea5dd20e9d1f4d49a0a9e6 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 5 Nov 2018 14:56:10 -0800 Subject: [PATCH 119/540] Add stateful metrics to interleaved eval test PiperOrigin-RevId: 220178289 --- tensorflow/contrib/distribute/python/keras_test.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py index 37802c14143..33b8a61eb1a 100644 --- a/tensorflow/contrib/distribute/python/keras_test.py +++ b/tensorflow/contrib/distribute/python/keras_test.py @@ -602,7 +602,7 @@ class TestDistributionStrategyWithDatasets(test.TestCase, user_controlled_model.compile( gradient_descent.GradientDescentOptimizer(0.001), loss='mse', - metrics=['mae'], + 
metrics=['mae', keras.metrics.CategoricalAccuracy()], distribute=distribution) interleaved_model = get_model() @@ -610,7 +610,7 @@ class TestDistributionStrategyWithDatasets(test.TestCase, interleaved_model.compile( gradient_descent.GradientDescentOptimizer(0.001), loss='mse', - metrics=['mae'], + metrics=['mae', keras.metrics.CategoricalAccuracy()], distribute=distribution) dataset = get_dataset(distribution) @@ -632,7 +632,8 @@ class TestDistributionStrategyWithDatasets(test.TestCase, [x[0] for x in user_controlled_output]) self.assertEqual(interleaved_output.history['val_mean_absolute_error'], [x[1] for x in user_controlled_output]) - # TODO(sourabhbajaj): Add an stateful metric here and verify support. + self.assertEqual(interleaved_output.history['val_categorical_accuracy'], + [x[2] for x in user_controlled_output]) # TODO(priyag): Enable this test for TPU. Currently tuples/dict don't work # as clone_model's input_tensors argument only seems to accept list and not From 412dbc6034cfadb41035a25e6385a2b815ab9fc5 Mon Sep 17 00:00:00 2001 From: Rachel Lim Date: Mon, 5 Nov 2018 14:56:49 -0800 Subject: [PATCH 120/540] [tf.data] Bug fix to apply options globally PiperOrigin-RevId: 220178389 --- .../python/data/kernel_tests/dataset_ops_test.py | 14 +++++++++++++- tensorflow/python/data/ops/dataset_ops.py | 11 ++++++----- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/data/kernel_tests/dataset_ops_test.py b/tensorflow/python/data/kernel_tests/dataset_ops_test.py index 63d2be4371c..a5324af4d0c 100644 --- a/tensorflow/python/data/kernel_tests/dataset_ops_test.py +++ b/tensorflow/python/data/kernel_tests/dataset_ops_test.py @@ -226,7 +226,8 @@ class DatasetOpsTest(test_base.DatasetTestBase, parameterized.TestCase): ds = dataset_ops.Dataset.range(0).with_options(options1).with_options( options2) self.assertTrue(ds.options().experimental_autotune) - self.assertFalse(ds.options().experimental_filter_fusion) + # Explicitly check that flag is 
False since assertFalse allows None + self.assertIs(ds.options().experimental_filter_fusion, False) def testOptionsTwiceDifferentError(self): options1 = dataset_ops.Options() @@ -237,6 +238,17 @@ class DatasetOpsTest(test_base.DatasetTestBase, parameterized.TestCase): "Cannot merge incompatible values of option"): dataset_ops.Dataset.range(0).with_options(options1).with_options(options2) + def testOptionsMergeOptionsFromMultipleInputs(self): + options1 = dataset_ops.Options() + options1.experimental_autotune = True + options2 = dataset_ops.Options() + options2.experimental_filter_fusion = True + ds = dataset_ops.Dataset.zip( + (dataset_ops.Dataset.range(0).with_options(options1), + dataset_ops.Dataset.range(0).with_options(options2))) + self.assertTrue(ds.options().experimental_autotune) + self.assertTrue(ds.options().experimental_filter_fusion) + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py index f49ebd0e55e..e4b5da64032 100644 --- a/tensorflow/python/data/ops/dataset_ops.py +++ b/tensorflow/python/data/ops/dataset_ops.py @@ -88,16 +88,17 @@ class Dataset(object): raise NotImplementedError("Dataset._inputs") def options(self): - """Returns the options for this dataset. + """Returns the options for this dataset and its inputs. Returns: A `tf.data.Options` object representing the dataset options. """ + options = Options() for input_dataset in self._inputs(): - options = input_dataset.options() - if options is not None: - return options - return Options() + input_options = input_dataset.options() + if input_options is not None: + options = options.merge(input_options) + return options def _apply_options(self): dataset = self From ce5b0e6c239e8f80f05aa0874f84afd9cfc08538 Mon Sep 17 00:00:00 2001 From: Alexandre Passos Date: Mon, 5 Nov 2018 14:58:45 -0800 Subject: [PATCH 121/540] Stop force-placing variables which are init_from_checkpoint on CPU. 
Fixes #23236 PiperOrigin-RevId: 220178808 --- tensorflow/python/training/checkpoint_utils.py | 12 ++++++------ tensorflow/python/training/checkpoint_utils_test.py | 3 --- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py index 57954ec56a5..857da431db2 100644 --- a/tensorflow/python/training/checkpoint_utils.py +++ b/tensorflow/python/training/checkpoint_utils.py @@ -318,13 +318,13 @@ def _set_checkpoint_initializer(variable, saveable_objects.append(s) assert len(saveable_objects) == 1 # Should be only one variable. - init_op = saveable_objects[0].restore([restore_op], restored_shapes=None) + init_op = saveable_objects[0].restore([restore_op], restored_shapes=None) - # pylint:disable=protected-access - variable._initializer_op = init_op - restore_op.set_shape(variable.shape) - variable._initial_value = restore_op - # pylint:enable=protected-access + # pylint:disable=protected-access + variable._initializer_op = init_op + restore_op.set_shape(variable.shape) + variable._initial_value = restore_op + # pylint:enable=protected-access def _set_variable_or_list_initializer(variable_or_list, ckpt_file, diff --git a/tensorflow/python/training/checkpoint_utils_test.py b/tensorflow/python/training/checkpoint_utils_test.py index 61dcbdb2b8f..a3e58de4a31 100644 --- a/tensorflow/python/training/checkpoint_utils_test.py +++ b/tensorflow/python/training/checkpoint_utils_test.py @@ -207,9 +207,6 @@ class CheckpointsTest(test.TestCase): checkpoint_utils.init_from_checkpoint(checkpoint_dir, {"useful_scope/": "useful_scope/"}) - # initializer runs on the same task but always on CPU. 
- self.assertEqual(my4._initializer_op.op.inputs[1].device, - "/job:ps/device:CPU:0") def testInitFromRootCheckpoint(self): checkpoint_dir = self.get_temp_dir() From 5babd0d4e56edebbe8236fb614da5158a9f124d4 Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Mon, 5 Nov 2018 15:08:58 -0800 Subject: [PATCH 122/540] [FLR] Consolidate logic for configuring `Executor::Args` in one place. As a bonus, avoid heap-allocating `Executor::Args` for all local function calls. This slightly improves the performance of the callback when a function completes. PiperOrigin-RevId: 220180860 --- tensorflow/core/common_runtime/function.cc | 82 ++++++++++------------ 1 file changed, 37 insertions(+), 45 deletions(-) diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc index 286386e04c2..9affc9fb188 100644 --- a/tensorflow/core/common_runtime/function.cc +++ b/tensorflow/core/common_runtime/function.cc @@ -391,7 +391,11 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime { AttrValueMap FixAttrs(const AttrSlice& attrs); void RunRemote(const Options& opts, Handle handle, gtl::ArraySlice args, std::vector* rets, - Executor::Args* exec_args, Item* item, DoneCallback done); + Item* item, DoneCallback done); + + void ExecutorArgsFromOptions(const FunctionLibraryRuntime::Options& run_opts, + CallFrameInterface* frame, + Executor::Args* exec_args); TF_DISALLOW_COPY_AND_ASSIGN(FunctionLibraryRuntimeImpl); }; @@ -858,19 +862,34 @@ Status FunctionLibraryRuntimeImpl::GetOrCreateItem(Handle handle, Item** item) { return CreateItem(handle, item); } +void FunctionLibraryRuntimeImpl::ExecutorArgsFromOptions( + const FunctionLibraryRuntime::Options& run_opts, CallFrameInterface* frame, + Executor::Args* exec_args) { + // Inherit the step_id from the caller. 
+ exec_args->step_id = run_opts.step_id; + exec_args->rendezvous = run_opts.rendezvous; + exec_args->stats_collector = run_opts.stats_collector; + exec_args->cancellation_manager = run_opts.cancellation_manager; + exec_args->step_container = run_opts.step_container; + if (run_opts.runner) { + exec_args->runner = *run_opts.runner; + } else { + exec_args->runner = default_runner_; + } + exec_args->collective_executor = run_opts.collective_executor; + exec_args->call_frame = frame; +} + void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle, gtl::ArraySlice args, std::vector* rets, - Executor::Args* exec_args, Item* item, DoneCallback done) { - DCHECK(exec_args->call_frame == nullptr); string target_device = parent_->GetDeviceName(handle); string source_device = opts.source_device; Rendezvous* rendezvous = opts.rendezvous; DeviceContext* device_context; Status s = parent_->GetDeviceContext(target_device, &device_context); if (!s.ok()) { - delete exec_args; done(s); return; } @@ -878,7 +897,6 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle, s = parent_->GetDeviceIncarnation(source_device, &src_incarnation); s.Update(parent_->GetDeviceIncarnation(target_device, &target_incarnation)); if (!s.ok()) { - delete exec_args; done(s); return; } @@ -886,13 +904,8 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle, const FunctionBody* fbody = GetFunctionBody(handle); FunctionCallFrame* frame = new FunctionCallFrame(fbody->arg_types, fbody->ret_types); - exec_args->call_frame = frame; - if (!s.ok()) { - delete frame; - delete exec_args; - done(s); - return; - } + Executor::Args* exec_args = new Executor::Args; + ExecutorArgsFromOptions(opts, frame, exec_args); std::vector args_alloc_attrs, rets_alloc_attrs; args_alloc_attrs.reserve(fbody->arg_types.size()); @@ -938,10 +951,10 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle, return; } item->exec->RunAsync( - 
*exec_args, [frame, rets, done, source_device, target_device, - target_incarnation, rendezvous, device_context, - remote_args, exec_args, rets_alloc_attrs, - allow_dead_tensors](const Status& status) { + *exec_args, + [frame, rets, done, source_device, target_device, + target_incarnation, rendezvous, device_context, remote_args, + rets_alloc_attrs, allow_dead_tensors](const Status& status) { Status s = status; if (s.ok()) { s = frame->ConsumeRetvals(rets, allow_dead_tensors); @@ -949,7 +962,6 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle, delete frame; if (!s.ok()) { delete remote_args; - delete exec_args; done(s); return; } @@ -957,9 +969,9 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle, target_device, source_device, "ret_", target_incarnation, *rets, device_context, rets_alloc_attrs, rendezvous); delete remote_args; - delete exec_args; done(s); }); + delete exec_args; }); } @@ -992,54 +1004,43 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle, } DCHECK(run_opts.runner != nullptr); - Executor::Args* exec_args = new Executor::Args; - // Inherit the step_id from the caller. - exec_args->step_id = run_opts.step_id; - exec_args->rendezvous = run_opts.rendezvous; - exec_args->stats_collector = run_opts.stats_collector; - exec_args->cancellation_manager = run_opts.cancellation_manager; - exec_args->step_container = run_opts.step_container; - exec_args->runner = *run_opts.runner; - exec_args->collective_executor = run_opts.collective_executor; - Item* item = nullptr; Status s = GetOrCreateItem(handle, &item); if (!s.ok()) { - delete exec_args; done(s); return; } if (run_opts.remote_execution) { // NOTE(mrry): `RunRemote()` will set `exec_args->call_frame` for us. 
- RunRemote(run_opts, handle, args, rets, exec_args, item, done); + RunRemote(run_opts, handle, args, rets, item, done); return; } const FunctionBody* fbody = GetFunctionBody(handle); FunctionCallFrame* frame = new FunctionCallFrame(fbody->arg_types, fbody->ret_types); - exec_args->call_frame = frame; s = frame->SetArgs(args); if (!s.ok()) { delete frame; - delete exec_args; done(s); return; } + Executor::Args exec_args; + ExecutorArgsFromOptions(opts, frame, &exec_args); + bool allow_dead_tensors = opts.allow_dead_tensors; item->exec->RunAsync( // Executor args - *exec_args, + exec_args, // Done callback. - [frame, rets, done, exec_args, allow_dead_tensors](const Status& status) { + [frame, rets, done, allow_dead_tensors](const Status& status) { Status s = status; if (s.ok()) { s = frame->ConsumeRetvals(rets, allow_dead_tensors); } delete frame; - delete exec_args; done(s); }); } @@ -1084,16 +1085,7 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle, DCHECK(run_opts.runner != nullptr); Executor::Args exec_args; - // Inherit the step_id from the caller. - exec_args.step_id = run_opts.step_id; - exec_args.rendezvous = run_opts.rendezvous; - exec_args.stats_collector = run_opts.stats_collector; - exec_args.cancellation_manager = run_opts.cancellation_manager; - exec_args.collective_executor = run_opts.collective_executor; - exec_args.step_container = run_opts.step_container; - exec_args.runner = *run_opts.runner; - exec_args.call_frame = frame; - + ExecutorArgsFromOptions(opts, frame, &exec_args); item->exec->RunAsync(exec_args, std::move(done)); } From 273c0833ef237f4040f0c62f8486019b1f1d3f32 Mon Sep 17 00:00:00 2001 From: Shivani Agrawal Date: Mon, 5 Nov 2018 15:17:41 -0800 Subject: [PATCH 123/540] Support for sparse tensors for test_util.evaluate(). 
PiperOrigin-RevId: 220182277 --- .../parse_example_dataset_test.py | 227 +++++++++--------- .../kernel_tests/range_dataset_op_test.py | 171 ++++--------- .../python/data/kernel_tests/test_base.py | 35 ++- tensorflow/python/framework/test_util.py | 4 + 4 files changed, 192 insertions(+), 245 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py index f9ea4c3b545..c74f754fefb 100644 --- a/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py @@ -27,7 +27,6 @@ from tensorflow.core.example import feature_pb2 from tensorflow.python.data.experimental.ops import parsing_ops as contrib_parsing_ops from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.data.util import nest from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops @@ -35,7 +34,6 @@ from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util from tensorflow.python.ops import parsing_ops from tensorflow.python.platform import test -from tensorflow.python.platform import tf_logging # Helpers for creating Example objects example = example_pb2.Example @@ -50,33 +48,20 @@ feature_lists = lambda d: feature_pb2.FeatureLists(feature_list=d) sequence_example = example_pb2.SequenceExample -def _compare_output_to_expected(tester, dict_tensors, expected_tensors, - flat_output): - tester.assertEqual(set(dict_tensors.keys()), set(expected_tensors.keys())) - - i = 0 # Index into the flattened output - for k, v in sorted(dict_tensors.items()): - # TODO(shivaniagrawal): flat_output is same as v. 
- expected_v = expected_tensors[k] - tf_logging.info("Comparing key: %s", k) - print("i", i, "flat_output", flat_output[i], "expected_v", expected_v) - if sparse_tensor.is_sparse(v): - # Three outputs for SparseTensor : indices, values, shape. - tester.assertEqual([k, len(expected_v)], [k, 3]) - print("i", i, "flat_output", flat_output[i].indices, "expected_v", - expected_v[0]) - tester.assertAllEqual(expected_v[0], flat_output[i].indices) - tester.assertAllEqual(expected_v[1], flat_output[i].values) - tester.assertAllEqual(expected_v[2], flat_output[i].dense_shape) - else: - # One output for standard Tensor. - tester.assertAllEqual(expected_v, flat_output[i]) - i += 1 - - @test_util.run_all_in_graph_and_eager_modes class ParseExampleDatasetTest(test_base.DatasetTestBase): + def _compare_output_to_expected(self, dict_tensors, expected_tensors): + self.assertEqual(set(dict_tensors.keys()), set(expected_tensors.keys())) + + for k, v in sorted(dict_tensors.items()): + expected_v = expected_tensors[k] + if sparse_tensor.is_sparse(v): + self.assertSparseValuesEqual(expected_v, v) + else: + # One output for standard Tensor. 
+ self.assertAllEqual(expected_v, v) + def _test(self, input_tensor, feature_val, @@ -99,26 +84,29 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): contrib_parsing_ops.parse_example_dataset(feature_val)) get_next = self.getNext(dataset) result = self.evaluate(get_next()) - flattened = nest.flatten(result) - _compare_output_to_expected(self, result, expected_values, flattened) + self._compare_output_to_expected(result, expected_values) + with self.assertRaises(errors_impl.OutOfRangeError): + self.evaluate(get_next()) + with self.assertRaises(errors_impl.OutOfRangeError): + self.evaluate(get_next()) if create_iterator_twice: get_next = self.getNext(dataset) result = self.evaluate(get_next()) - flattened = nest.flatten(result) - _compare_output_to_expected(self, result, expected_values, flattened) + self._compare_output_to_expected(result, expected_values) + with self.assertRaises(errors_impl.OutOfRangeError): + self.evaluate(get_next()) # Check shapes; if serialized is a Tensor we need its size to # properly check. 
batch_size = ( self.evaluate(input_tensor).size if isinstance(input_tensor, ops.Tensor) else np.asarray(input_tensor).size) for k, f in feature_val.items(): - print("output_shapes as list ", tuple(dataset.output_shapes[k].as_list())) if isinstance(f, parsing_ops.FixedLenFeature) and f.shape is not None: self.assertEqual(dataset.output_shapes[k].as_list()[0], batch_size) elif isinstance(f, parsing_ops.VarLenFeature): self.assertEqual(dataset.output_shapes[k].as_list()[1], None) - def testSkipEagerEmptySerializedWithAllDefaults(self): + def testEmptySerializedWithAllDefaults(self): sparse_name = "st_a" a_name = "a" b_name = "b" @@ -127,13 +115,10 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): b_default = np.random.rand(3, 3).astype(bytes) c_default = np.random.rand(2).astype(np.float32) - expected_st_a = ( # indices, values, shape - np.empty( - (0, 2), dtype=np.int64), # indices - np.empty( - (0,), dtype=np.int64), # sp_a is DT_INT64 - np.array( - [2, 0], dtype=np.int64)) # batch == 2, max_elems = 0 + expected_st_a = sparse_tensor.SparseTensorValue( # indices, values, shape + np.empty((0, 2), dtype=np.int64), # indices + np.empty((0,), dtype=np.int64), # sp_a is DT_INT64 + np.array([2, 0], dtype=np.int64)) # batch == 2, max_elems = 0 expected_output = { sparse_name: expected_st_a, @@ -219,7 +204,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): {"a": parsing_ops.FixedLenFeature(None, dtypes.float32)}, expected_err=(ValueError, "Missing shape for feature a")) - def testSkipEagerSerializedContainingSparse(self): + def testSerializedContainingSparse(self): original = [ example(features=features({ "st_c": float_feature([3, 4]) @@ -238,17 +223,14 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): serialized = [m.SerializeToString() for m in original] - expected_st_c = ( # indices, values, shape - np.array( - [[0, 0], [0, 1], [3, 0], [3, 1], [3, 2]], dtype=np.int64), np.array( - [3.0, 4.0, 1.0, 2.0, -1.0], dtype=np.float32), np.array( - 
[4, 3], dtype=np.int64)) # batch == 2, max_elems = 3 + expected_st_c = sparse_tensor.SparseTensorValue( # indices, values, shape + np.array([[0, 0], [0, 1], [3, 0], [3, 1], [3, 2]], dtype=np.int64), + np.array([3.0, 4.0, 1.0, 2.0, -1.0], dtype=np.float32), + np.array([4, 3], dtype=np.int64)) # batch == 2, max_elems = 3 - expected_st_d = ( # indices, values, shape - np.array( - [[3, 0]], dtype=np.int64), np.array( - ["hi"], dtype=bytes), np.array( - [4, 1], dtype=np.int64)) # batch == 2, max_elems = 1 + expected_st_d = sparse_tensor.SparseTensorValue( # indices, values, shape + np.array([[3, 0]], dtype=np.int64), np.array(["hi"], dtype=bytes), + np.array([4, 1], dtype=np.int64)) # batch == 2, max_elems = 1 expected_output = { "st_c": expected_st_c, @@ -263,7 +245,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) - def testSkipEagerSerializedContainingSparseFeature(self): + def testSerializedContainingSparseFeature(self): original = [ example(features=features({ "val": float_feature([3, 4]), @@ -286,12 +268,10 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): serialized = [m.SerializeToString() for m in original] - expected_sp = ( # indices, values, shape - np.array( - [[0, 5], [0, 10], [3, 0], [3, 3], [3, 9]], dtype=np.int64), - np.array( - [3.0, 4.0, 1.0, -1.0, 2.0], dtype=np.float32), np.array( - [4, 13], dtype=np.int64)) # batch == 4, max_elems = 13 + expected_sp = sparse_tensor.SparseTensorValue( # indices, values, shape + np.array([[0, 5], [0, 10], [3, 0], [3, 3], [3, 9]], dtype=np.int64), + np.array([3.0, 4.0, 1.0, -1.0, 2.0], dtype=np.float32), + np.array([4, 13], dtype=np.int64)) # batch == 4, max_elems = 13 expected_output = {"sp": expected_sp,} @@ -301,7 +281,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) - def testSkipEagerSerializedContainingSparseFeatureReuse(self): + def 
testSerializedContainingSparseFeatureReuse(self): original = [ example(features=features({ "val1": float_feature([3, 4]), @@ -316,17 +296,15 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): serialized = [m.SerializeToString() for m in original] - expected_sp1 = ( # indices, values, shape - np.array( - [[0, 5], [0, 10]], dtype=np.int64), np.array( - [3.0, 4.0], dtype=np.float32), np.array( - [2, 13], dtype=np.int64)) # batch == 2, max_elems = 13 + expected_sp1 = sparse_tensor.SparseTensorValue( # indices, values, shape + np.array([[0, 5], [0, 10]], dtype=np.int64), + np.array([3.0, 4.0], dtype=np.float32), + np.array([2, 13], dtype=np.int64)) # batch == 2, max_elems = 13 - expected_sp2 = ( # indices, values, shape - np.array( - [[0, 5], [0, 10]], dtype=np.int64), np.array( - [5.0, 6.0], dtype=np.float32), np.array( - [2, 7], dtype=np.int64)) # batch == 2, max_elems = 13 + expected_sp2 = sparse_tensor.SparseTensorValue( # indices, values, shape + np.array([[0, 5], [0, 10]], dtype=np.int64), + np.array([5.0, 6.0], dtype=np.float32), + np.array([2, 7], dtype=np.int64)) # batch == 2, max_elems = 13 expected_output = { "sp1": expected_sp1, @@ -344,7 +322,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) - def testSkipEagerSerializedContaining3DSparseFeature(self): + def testSerializedContaining3DSparseFeature(self): original = [ example(features=features({ "val": float_feature([3, 4]), @@ -369,11 +347,10 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): serialized = [m.SerializeToString() for m in original] - expected_sp = ( + expected_sp = sparse_tensor.SparseTensorValue( # indices - np.array( - [[0, 5, 0], [0, 10, 2], [3, 0, 1], [3, 3, 2], [3, 9, 0]], - dtype=np.int64), + np.array([[0, 5, 0], [0, 10, 2], [3, 0, 1], [3, 3, 2], [3, 9, 0]], + dtype=np.int64), # values np.array([3.0, 4.0, 1.0, -1.0, 2.0], dtype=np.float32), # shape batch == 4, max_elems = 13 @@ -534,20 +511,15 
@@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) - def testSkipEagerSerializedSparseAndSparseFeatureAndDenseWithNoDefault( - self): - expected_st_a = ( # indices, values, shape - np.empty( - (0, 2), dtype=np.int64), # indices - np.empty( - (0,), dtype=np.int64), # sp_a is DT_INT64 - np.array( - [2, 0], dtype=np.int64)) # batch == 2, max_elems = 0 - expected_sp = ( # indices, values, shape - np.array( - [[0, 0], [0, 3], [1, 7]], dtype=np.int64), np.array( - ["a", "b", "c"], dtype="|S"), np.array( - [2, 13], dtype=np.int64)) # batch == 4, max_elems = 13 + def testSerializedSparseAndSparseFeatureAndDenseWithNoDefault(self): + expected_st_a = sparse_tensor.SparseTensorValue( # indices, values, shape + np.empty((0, 2), dtype=np.int64), # indices + np.empty((0,), dtype=np.int64), # sp_a is DT_INT64 + np.array([2, 0], dtype=np.int64)) # batch == 2, max_elems = 0 + expected_sp = sparse_tensor.SparseTensorValue( # indices, values, shape + np.array([[0, 0], [0, 3], [1, 7]], dtype=np.int64), + np.array(["a", "b", "c"], dtype="|S"), + np.array([2, 13], dtype=np.int64)) # batch == 4, max_elems = 13 original = [ example(features=features({ @@ -594,18 +566,16 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) - def testSkipEagererializedContainingSparseAndSparseFeatureWithReuse(self): - expected_idx = ( # indices, values, shape - np.array( - [[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.int64), - np.array([0, 3, 7, 1]), np.array( - [2, 2], dtype=np.int64)) # batch == 4, max_elems = 2 + def testerializedContainingSparseAndSparseFeatureWithReuse(self): + expected_idx = sparse_tensor.SparseTensorValue( # indices, values, shape + np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.int64), + np.array([0, 3, 7, 1]), + np.array([2, 2], dtype=np.int64)) # batch == 4, max_elems = 2 - expected_sp = ( # indices, values, shape - np.array( - [[0, 0], [0, 
3], [1, 1], [1, 7]], dtype=np.int64), np.array( - ["a", "b", "d", "c"], dtype="|S"), np.array( - [2, 13], dtype=np.int64)) # batch == 4, max_elems = 13 + expected_sp = sparse_tensor.SparseTensorValue( # indices, values, shape + np.array([[0, 0], [0, 3], [1, 1], [1, 7]], dtype=np.int64), + np.array(["a", "b", "d", "c"], dtype="|S"), + np.array([2, 13], dtype=np.int64)) # batch == 4, max_elems = 13 original = [ example(features=features({ @@ -694,16 +664,15 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) - def testSkipEagerSerializedContainingVarLenDenseLargerBatch(self): + def testSerializedContainingVarLenDenseLargerBatch(self): np.random.seed(3456) for batch_size in (1, 10, 20, 100, 256): self._testSerializedContainingVarLenDenseLargerBatch(batch_size) - def testSkipEagerSerializedContainingVarLenDense(self): + def testSkipEagerSerializedShapeMismatch(self): aname = "a" bname = "b" cname = "c" - dname = "d" original = [ example(features=features({ cname: int64_feature([2]), @@ -722,6 +691,47 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): })), ] + serialized = [m.SerializeToString() for m in original] + self._test( + ops.convert_to_tensor(serialized), { + aname: + parsing_ops.FixedLenSequenceFeature((2, 1), + dtype=dtypes.float32, + allow_missing=True, + default_value=[]), + bname: + parsing_ops.FixedLenSequenceFeature( + (2, 1, 1), dtype=dtypes.string, allow_missing=True), + }, + expected_err=(ValueError, + "Cannot reshape a tensor with 0 elements to shape")) + + def testSerializedContainingVarLenDense(self): + aname = "a" + bname = "b" + cname = "c" + dname = "d" + original = [ + example(features=features({ + cname: int64_feature([2]), + })), + example( + features=features({ + aname: float_feature([1, 1]), + bname: bytes_feature([b"b0_str", b"b1_str"]), + })), + example( + features=features({ + aname: float_feature([-1, -1, 2, 2]), + bname: bytes_feature([b"b1"]), + })), + 
example( + features=features({ + aname: float_feature([]), + cname: int64_feature([3]), + })), + ] + serialized = [m.SerializeToString() for m in original] expected_output = { @@ -807,21 +817,6 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): errors_impl.OpError, "Key: b, Index: 2. " "Number of bytes values is not a multiple of stride length.")) - self._test( - ops.convert_to_tensor(serialized), { - aname: - parsing_ops.FixedLenSequenceFeature( - (2, 1), - dtype=dtypes.float32, - allow_missing=True, - default_value=[]), - bname: - parsing_ops.FixedLenSequenceFeature( - (2, 1, 1), dtype=dtypes.string, allow_missing=True), - }, - expected_err=(ValueError, - "Cannot reshape a tensor with 0 elements to shape")) - self._test( ops.convert_to_tensor(serialized), { aname: diff --git a/tensorflow/python/data/kernel_tests/range_dataset_op_test.py b/tensorflow/python/data/kernel_tests/range_dataset_op_test.py index b7e2a5f615e..b71e6b2ea43 100644 --- a/tensorflow/python/data/kernel_tests/range_dataset_op_test.py +++ b/tensorflow/python/data/kernel_tests/range_dataset_op_test.py @@ -26,7 +26,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape -from tensorflow.python.ops import array_ops +from tensorflow.python.framework import test_util from tensorflow.python.ops import gen_dataset_ops from tensorflow.python.ops import io_ops from tensorflow.python.ops import parsing_ops @@ -35,8 +35,52 @@ from tensorflow.python.platform import gfile from tensorflow.python.platform import test +@test_util.run_all_in_graph_and_eager_modes class RangeDatasetTest(test_base.DatasetTestBase): + def testStop(self): + dataset = dataset_ops.Dataset.range(5) + self.assertDatasetProduces(dataset, expected_output=range(5)) + + def testStartStop(self): + start, stop = 2, 5 + dataset = dataset_ops.Dataset.range(start, stop) + 
self.assertDatasetProduces(dataset, expected_output=range(2, 5)) + + def testStartStopStep(self): + start, stop, step = 2, 10, 2 + dataset = dataset_ops.Dataset.range(start, stop, step) + self.assertDatasetProduces(dataset, expected_output=range(2, 10, 2)) + + def testZeroStep(self): + start, stop, step = 2, 10, 0 + dataset = dataset_ops.Dataset.range(start, stop, step) + self.assertDatasetProduces( + dataset, expected_err=(errors.InvalidArgumentError, "")) + + def testNegativeStep(self): + start, stop, step = 2, 10, -1 + dataset = dataset_ops.Dataset.range(start, stop, step) + self.assertDatasetProduces(dataset, expected_output=range(2, 10, -1)) + + def testStopLessThanStart(self): + start, stop = 10, 2 + dataset = dataset_ops.Dataset.range(start, stop) + self.assertDatasetProduces(dataset, expected_output=range(10, 2)) + + def testStopLessThanStartWithPositiveStep(self): + start, stop, step = 10, 2, 2 + dataset = dataset_ops.Dataset.range(start, stop, step) + self.assertDatasetProduces(dataset, expected_output=range(10, 2, 2)) + + def testStopLessThanStartWithNegativeStep(self): + start, stop, step = 10, 2, -1 + dataset = dataset_ops.Dataset.range(start, stop, step) + self.assertDatasetProduces(dataset, expected_output=range(10, 2, -1)) + + +class ExperimentalCheckpointDatasetTest(test_base.DatasetTestBase): + def tearDown(self): # Remove all checkpoint files. 
prefix = self._iterator_checkpoint_prefix() @@ -44,131 +88,6 @@ class RangeDatasetTest(test_base.DatasetTestBase): files = gfile.Glob(pattern) map(gfile.Remove, files) - def testStop(self): - stop = array_ops.placeholder(dtypes.int64, shape=[]) - iterator = dataset_ops.Dataset.range(stop).make_initializable_iterator() - init_op = iterator.initializer - get_next = iterator.get_next() - - with self.cached_session() as sess: - sess.run(init_op, feed_dict={stop: 5}) - for i in range(5): - self.assertEqual(i, sess.run(get_next)) - with self.assertRaises(errors.OutOfRangeError): - sess.run(get_next) - - def testStartStop(self): - start = array_ops.placeholder(dtypes.int64, shape=[]) - stop = array_ops.placeholder(dtypes.int64, shape=[]) - iterator = dataset_ops.Dataset.range(start, - stop).make_initializable_iterator() - init_op = iterator.initializer - get_next = iterator.get_next() - - with self.cached_session() as sess: - sess.run(init_op, feed_dict={start: 2, stop: 5}) - for i in range(2, 5): - self.assertEqual(i, sess.run(get_next)) - with self.assertRaises(errors.OutOfRangeError): - sess.run(get_next) - - def testStartStopStep(self): - start = array_ops.placeholder(dtypes.int64, shape=[]) - stop = array_ops.placeholder(dtypes.int64, shape=[]) - step = array_ops.placeholder(dtypes.int64, shape=[]) - iterator = dataset_ops.Dataset.range(start, stop, - step).make_initializable_iterator() - init_op = iterator.initializer - get_next = iterator.get_next() - - with self.cached_session() as sess: - sess.run(init_op, feed_dict={start: 2, stop: 10, step: 2}) - for i in range(2, 10, 2): - self.assertEqual(i, sess.run(get_next)) - with self.assertRaises(errors.OutOfRangeError): - sess.run(get_next) - - def testZeroStep(self): - start = array_ops.placeholder(dtypes.int64, shape=[]) - stop = array_ops.placeholder(dtypes.int64, shape=[]) - step = array_ops.placeholder(dtypes.int64, shape=[]) - iterator = dataset_ops.Dataset.range(start, stop, - step).make_initializable_iterator() 
- init_op = iterator.initializer - - with self.cached_session() as sess: - with self.assertRaises(errors.InvalidArgumentError): - sess.run(init_op, feed_dict={start: 2, stop: 10, step: 0}) - - def testNegativeStep(self): - start = array_ops.placeholder(dtypes.int64, shape=[]) - stop = array_ops.placeholder(dtypes.int64, shape=[]) - step = array_ops.placeholder(dtypes.int64, shape=[]) - iterator = dataset_ops.Dataset.range(start, stop, - step).make_initializable_iterator() - init_op = iterator.initializer - get_next = iterator.get_next() - - with self.cached_session() as sess: - sess.run(init_op, feed_dict={start: 2, stop: 10, step: -1}) - # This for loop is a no-op but will ensure that the implementation is - # consistent with range if it ever changes. - for i in range(2, 10, -1): - self.assertEqual(i, sess.run(get_next)) - with self.assertRaises(errors.OutOfRangeError): - sess.run(get_next) - - def testStopLessThanStart(self): - start = array_ops.placeholder(dtypes.int64, shape=[]) - stop = array_ops.placeholder(dtypes.int64, shape=[]) - iterator = dataset_ops.Dataset.range(start, - stop).make_initializable_iterator() - init_op = iterator.initializer - get_next = iterator.get_next() - - with self.cached_session() as sess: - sess.run(init_op, feed_dict={start: 10, stop: 2}) - # This for loop is a no-op but will ensure that the implementation is - # consistent with range if it ever changes. 
- for i in range(10, 2): - self.assertEqual(i, sess.run(get_next)) - with self.assertRaises(errors.OutOfRangeError): - sess.run(get_next) - - def testStopLessThanStartWithPositiveStep(self): - start = array_ops.placeholder(dtypes.int64, shape=[]) - stop = array_ops.placeholder(dtypes.int64, shape=[]) - step = array_ops.placeholder(dtypes.int64, shape=[]) - iterator = dataset_ops.Dataset.range(start, stop, - step).make_initializable_iterator() - init_op = iterator.initializer - get_next = iterator.get_next() - - with self.cached_session() as sess: - sess.run(init_op, feed_dict={start: 10, stop: 2, step: 2}) - # This for loop is a no-op but will ensure that the implementation is - # consistent with range if it ever changes. - for i in range(10, 2, 2): - self.assertEqual(i, sess.run(get_next)) - with self.assertRaises(errors.OutOfRangeError): - sess.run(get_next) - - def testStopLessThanStartWithNegativeStep(self): - start = array_ops.placeholder(dtypes.int64, shape=[]) - stop = array_ops.placeholder(dtypes.int64, shape=[]) - step = array_ops.placeholder(dtypes.int64, shape=[]) - iterator = dataset_ops.Dataset.range(start, stop, - step).make_initializable_iterator() - init_op = iterator.initializer - get_next = iterator.get_next() - - with self.cached_session() as sess: - sess.run(init_op, feed_dict={start: 10, stop: 2, step: -1}) - for i in range(10, 2, -1): - self.assertEqual(i, sess.run(get_next)) - with self.assertRaises(errors.OutOfRangeError): - sess.run(get_next) - def _iterator_checkpoint_prefix(self): return os.path.join(self.get_temp_dir(), "iterator") diff --git a/tensorflow/python/data/kernel_tests/test_base.py b/tensorflow/python/data/kernel_tests/test_base.py index 219a25a615e..edb3eff3c17 100644 --- a/tensorflow/python/data/kernel_tests/test_base.py +++ b/tensorflow/python/data/kernel_tests/test_base.py @@ -62,6 +62,37 @@ class DatasetTestBase(test.TestCase): nxt = it.get_next() return lambda: nxt + def _compare_output_to_expected(self, result_values, 
expected_values): + for i in range(len(result_values)): + if sparse_tensor.is_sparse(result_values[i]): + self.assertSparseValuesEqual(result_values[i], expected_values[i]) + else: + self.assertAllEqual(result_values[i], expected_values[i]) + + def assertDatasetProduces(self, + input_dataset, + expected_output=None, + expected_err=None, + create_iterator_twice=True): + + if expected_err: + with self.assertRaisesWithPredicateMatch(expected_err[0], + expected_err[1]): + get_next = self.getNext(input_dataset) + self.evaluate(get_next()) + return + repeated = 2 if create_iterator_twice else 1 + for _ in range(repeated): + get_next = self.getNext(input_dataset) + result = [] + for _ in range(len(expected_output)): + result.append(self.evaluate(get_next())) + self._compare_output_to_expected(result, expected_output) + with self.assertRaises(errors.OutOfRangeError): + self.evaluate(get_next()) + with self.assertRaises(errors.OutOfRangeError): + self.evaluate(get_next()) + def assertDatasetsEqual(self, dataset1, dataset2): """Checks that datasets are equal. 
Supports both graph and eager mode.""" self.assertEqual(dataset1.output_types, dataset2.output_types) @@ -83,9 +114,7 @@ class DatasetTestBase(test.TestCase): op2 = nest.flatten(op2) assert len(op1) == len(op2) for i in range(len(op1)): - if isinstance( - op1[i], - (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)): + if sparse_tensor.is_sparse(op1[i]): self.assertSparseValuesEqual(op1[i], op2[i]) elif flattened_types[i] == dtypes.string: self.assertAllEqual(op1[i], op2[i]) diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index 0516c686699..768ed36917f 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -61,6 +61,7 @@ from tensorflow.python.framework import errors_impl from tensorflow.python.framework import importer from tensorflow.python.framework import ops from tensorflow.python.framework import random_seed +from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import versions from tensorflow.python.ops import array_ops @@ -1135,6 +1136,9 @@ class TensorFlowTestCase(googletest.TestCase): return self._eval_helper(tensor()) else: try: + if sparse_tensor.is_sparse(tensor): + return sparse_tensor.SparseTensorValue(tensor.indices, tensor.values, + tensor.dense_shape) return tensor.numpy() except AttributeError as e: six.raise_from(ValueError("Unsupported type %s." % type(tensor)), e) From 17a0c8bf8b43da8f2fb3cdf41b954ce602e6ca59 Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Mon, 5 Nov 2018 15:21:37 -0800 Subject: [PATCH 124/540] [tf.data] Raise a `TypeError` if users pass an old-style `tf.ReaderBase` class to `tf.data.experimental.make_batched_features_dataset()`. 
PiperOrigin-RevId: 220182944 --- .../python/data/experimental/kernel_tests/BUILD | 1 + .../make_batched_features_dataset_test.py | 16 ++++++++++++++++ tensorflow/python/data/experimental/ops/BUILD | 1 + .../python/data/experimental/ops/readers.py | 8 ++++++++ 4 files changed, 26 insertions(+) diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD index a1382f75982..bfe2e0cf7a1 100644 --- a/tensorflow/python/data/experimental/kernel_tests/BUILD +++ b/tensorflow/python/data/experimental/kernel_tests/BUILD @@ -279,6 +279,7 @@ py_test( "//tensorflow/python:dtypes", "//tensorflow/python:errors", "//tensorflow/python:framework_ops", + "//tensorflow/python:io_ops", "//tensorflow/python:parsing_ops", "//tensorflow/python/data/experimental/ops:readers", "//tensorflow/python/data/ops:readers", diff --git a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py index 5ee94e14dcd..91ae8cb1bd2 100644 --- a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py @@ -20,11 +20,13 @@ from __future__ import print_function import numpy as np from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base +from tensorflow.python.data.experimental.ops import readers from tensorflow.python.data.ops import readers as core_readers from tensorflow.python.data.util import nest from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops +from tensorflow.python.ops import io_ops from tensorflow.python.ops import parsing_ops from tensorflow.python.platform import test @@ -234,6 +236,20 @@ class MakeBatchedFeaturesDatasetTest( if issubclass(clazz, ops.Tensor): self.assertEqual(32, 
shape[0]) + def testOldStyleReader(self): + with self.assertRaisesRegexp( + TypeError, r"The `reader` argument must return a `Dataset` object. " + r"`tf.ReaderBase` subclasses are not supported."): + _ = readers.make_batched_features_dataset( + file_pattern=self.test_filenames[0], batch_size=32, + features={ + "file": parsing_ops.FixedLenFeature([], dtypes.int64), + "record": parsing_ops.FixedLenFeature([], dtypes.int64), + "keywords": parsing_ops.VarLenFeature(dtypes.string), + "label": parsing_ops.FixedLenFeature([], dtypes.string), + }, + reader=io_ops.TFRecordReader) + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/data/experimental/ops/BUILD b/tensorflow/python/data/experimental/ops/BUILD index 323298e33a6..eda547c37af 100644 --- a/tensorflow/python/data/experimental/ops/BUILD +++ b/tensorflow/python/data/experimental/ops/BUILD @@ -82,6 +82,7 @@ py_library( "//tensorflow/python:dtypes", "//tensorflow/python:experimental_dataset_ops_gen", "//tensorflow/python:framework_ops", + "//tensorflow/python:io_ops", "//tensorflow/python:lib", "//tensorflow/python:platform", "//tensorflow/python:tensor_shape", diff --git a/tensorflow/python/data/experimental/ops/readers.py b/tensorflow/python/data/experimental/ops/readers.py index 3b2d0945148..fe601925860 100644 --- a/tensorflow/python/data/experimental/ops/readers.py +++ b/tensorflow/python/data/experimental/ops/readers.py @@ -38,6 +38,7 @@ from tensorflow.python.framework import tensor_shape from tensorflow.python.lib.io import file_io from tensorflow.python.ops import gen_dataset_ops from tensorflow.python.ops import gen_experimental_dataset_ops +from tensorflow.python.ops import io_ops from tensorflow.python.platform import gfile from tensorflow.python.util.tf_export import tf_export @@ -760,6 +761,7 @@ def make_batched_features_dataset(file_pattern, Each `dict` maps feature keys to `Tensor` or `SparseTensor` objects. Raises: + TypeError: If `reader` is a `tf.ReaderBase` subclass. 
ValueError: If `label_key` is not one of the `features` keys. """ # Create dataset of all matching filenames @@ -768,6 +770,12 @@ def make_batched_features_dataset(file_pattern, if shuffle: dataset = dataset.shuffle(len(filenames), shuffle_seed) + if isinstance(reader, type) and issubclass(reader, io_ops.ReaderBase): + raise TypeError("The `reader` argument must return a `Dataset` object. " + "`tf.ReaderBase` subclasses are not supported. For " + "example, pass `tf.data.TFRecordDataset` instead of " + "`tf.TFRecordReader`.") + # Read `Example` records from files as tensor objects. if reader_args is None: reader_args = [] From 12b9fbc2c82ce0a97e7863d3f2919ea044144e37 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 5 Nov 2018 15:22:22 -0800 Subject: [PATCH 125/540] Allow AddN as a flex op. PiperOrigin-RevId: 220183068 --- tensorflow/lite/toco/import_tensorflow.cc | 39 +++++++++++++++++++---- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/tensorflow/lite/toco/import_tensorflow.cc b/tensorflow/lite/toco/import_tensorflow.cc index 86d55f3e15d..76c6985e3a2 100644 --- a/tensorflow/lite/toco/import_tensorflow.cc +++ b/tensorflow/lite/toco/import_tensorflow.cc @@ -1122,28 +1122,53 @@ tensorflow::Status ConvertConcatOperator( return tensorflow::Status::OK(); } +static constexpr int kAnyNumInputs = -1; + +enum FlexSupport { kFlexOk, kFlexNotOk }; + // This method supports simple operators without additional attributes. -template -tensorflow::Status ConvertSimpleOperator( +// Converts a simple operator that takes no attributes. The list of inputs is +// taken from the given NodeDef, and its number must match NumInputs, unless +// kAnyNumInputs is passed in. If kFlexOk is passed in the resulting operator +// will be eligible for being exported as a flex op. 
+template +tensorflow::Status ConvertSimpleOperatorGeneric( const NodeDef& node, const TensorFlowImportFlags& tf_import_flags, Model* model) { + if (NumInputs != kAnyNumInputs) { + TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, NumInputs)); + } auto* op = new Op; const int num_inputs = GetInputsCount(node, tf_import_flags); for (int i = 0; i < num_inputs; ++i) { op->inputs.push_back(node.input(i)); } op->outputs.push_back(node.name()); + + if (flex == kFlexOk) { + RetainTensorFlowNodeDef(node, op); + } + model->operators.emplace_back(op); return tensorflow::Status::OK(); } -// This method supports simple operators without additional attributes. -template +// Convert a simple operator which is not valid as a flex op. +template tensorflow::Status ConvertSimpleOperator( const NodeDef& node, const TensorFlowImportFlags& tf_import_flags, Model* model) { - TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, NumInputs)); - return ConvertSimpleOperator(node, tf_import_flags, model); + return ConvertSimpleOperatorGeneric( + node, tf_import_flags, model); +} + +// Convert a simple operator which is valid as a flex op. +template +tensorflow::Status ConvertSimpleOperatorFlexOk( + const NodeDef& node, const TensorFlowImportFlags& tf_import_flags, + Model* model) { + return ConvertSimpleOperatorGeneric( + node, tf_import_flags, model); } void GetOutputNamesFromNodeDef(const NodeDef& node, @@ -2203,7 +2228,7 @@ ConverterMapType GetTensorFlowNodeConverterMapForFlex() { ConverterMapType GetTensorFlowNodeConverterMap() { return std::unordered_map({ {"Add", ConvertSimpleOperator}, - {"AddN", ConvertSimpleOperator}, + {"AddN", ConvertSimpleOperatorFlexOk}, {"All", ConvertSimpleOperator}, {"Any", ConvertReduceOperator}, {"ArgMax", ConvertArgMaxOperator}, From 0b7a377e941603c24a6dd634f4ea105c60bbcebc Mon Sep 17 00:00:00 2001 From: Suharsh Sivakumar Date: Mon, 5 Nov 2018 15:32:06 -0800 Subject: [PATCH 126/540] Do not quantize shared tensors. 
PiperOrigin-RevId: 220184733 --- tensorflow/lite/tools/optimize/quantize_weights.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/lite/tools/optimize/quantize_weights.cc b/tensorflow/lite/tools/optimize/quantize_weights.cc index a13774f7130..de3c0b03237 100644 --- a/tensorflow/lite/tools/optimize/quantize_weights.cc +++ b/tensorflow/lite/tools/optimize/quantize_weights.cc @@ -182,8 +182,7 @@ std::vector GetQuantizableTensorsFromOperator( TensorT* tensor = subgraph->tensors[tensor_idx].get(); // TODO(suharshs): Support shared weights, i.e. If two tensors share the // same weight array, things may break. (i.e. SSD object detection) - if (!eval_hybrid && - CountTensorConsumers(model, subgraph, tensor_idx) != 1) { + if (CountTensorConsumers(model, subgraph, tensor_idx) != 1) { LOG(INFO) << "Skipping quantization of tensor " << tensor->name << " that is shared between multiple multiple operations."; continue; From 634005cb53d2a136c04885ffa6cbf5323649fe32 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Mon, 5 Nov 2018 15:42:34 -0800 Subject: [PATCH 127/540] Surface the number of threads used by ctx->runner() --- tensorflow/core/kernels/data/map_and_batch_dataset_op.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc index 7790d133203..d7db3c4d183 100644 --- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc @@ -262,9 +262,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { Status Initialize(IteratorContext* ctx) override { mutex_lock l(*mu_); if (num_parallel_calls_->value == kAutoTune) { - // TODO(jsimsa): Surface the number of threads used by `ctx->runner()` - // and use it here for the default. 
- num_parallel_calls_->value = port::NumSchedulableCPUs(); + num_parallel_calls_->value = ctx->runner_threadpool_size(); num_parallel_calls_->tunable = true; } TF_RETURN_IF_ERROR( From 0ff931a5965241ffdb1551493617b6d9592976e8 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Mon, 5 Nov 2018 15:50:22 -0800 Subject: [PATCH 128/540] Surface the number of threads used by ctx->runner() for numa_map_and_batch_dataset_op.cc --- .../kernels/data/experimental/numa_map_and_batch_dataset_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc index 677141a89d6..1286795a0b2 100644 --- a/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc @@ -201,7 +201,7 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel { Status Initialize(IteratorContext* ctx) override { mutex_lock l(*mu_); if (num_parallel_calls_->value == kAutoTune) { - num_parallel_calls_->value = port::NumSchedulableCPUs(); + num_parallel_calls_->value = ctx->runner_threadpool_size(); num_parallel_calls_->tunable = true; } TF_RETURN_IF_ERROR( From 74ff556c850a89414e62201e831217b69aafb4c6 Mon Sep 17 00:00:00 2001 From: Jonathan Shen Date: Mon, 5 Nov 2018 15:50:42 -0800 Subject: [PATCH 129/540] Add implicit capture support to functional_ops.While. 
PiperOrigin-RevId: 220187921 --- .../kernel_tests/functional_ops_test.py | 43 +++++++++ tensorflow/python/ops/functional_ops.py | 93 +++++++++++++------ 2 files changed, 108 insertions(+), 28 deletions(-) diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py index f35450b6fd6..04c1032722c 100644 --- a/tensorflow/python/kernel_tests/functional_ops_test.py +++ b/tensorflow/python/kernel_tests/functional_ops_test.py @@ -829,6 +829,49 @@ class FunctionalOpsTest(test.TestCase): self.assertAllEqual(5050., sess.run([result, c], feed_dict={n: 100.})[0]) + # pylint: disable=cell-var-from-loop + def testWhileCapturedInputs(self): + for use_gpu in (True, False): + with ops.Graph().as_default() as g: + v = variables.Variable(1.0) + + def TestCond(n, *args): + del args + return n < 10 + + @function.Defun(*[dtypes.float32] * 2) + def TestUnary(n, x): + return math_ops.add(n, 1), x + n + v + + @function.Defun(*[dtypes.float32] * 3) + def TestBinary(n, x, x2): + return math_ops.add(n, 1), x + n + v, x2 + v + + with self.session(graph=g, use_gpu=use_gpu) as sess: + result_unary = functional_ops.While( + [1.0, 0.], + function.Defun(*[dtypes.float32] * 2)(TestCond), TestUnary) + result_binary = functional_ops.While( + [1.0, 0., 0.], + function.Defun(*[dtypes.float32] * 3)(TestCond), TestBinary) + sess.run(variables.global_variables_initializer()) + assert len(result_unary) == 2 + self.assertEqual([10.0, 54.0], sess.run(result_unary)) + assert len(result_binary) == 3 + self.assertEqual([10.0, 54.0, 9.0], sess.run(result_binary)) + + def TestCondCapture(n, *args): + del args + return math_ops.to_float(n) + v < 10 + + with self.assertRaises(ValueError): + _ = functional_ops.While( + [1], + function.Defun(dtypes.int32)(TestCondCapture), + function.Defun(dtypes.int32, dtypes.float32)(TestUnary)) + + # pylint: enable=cell-var-from-loop + def _tfSum(self, use_gpu, rewrite_with_while): with ops.Graph().as_default() as g: 
with self.session(graph=g, use_gpu=use_gpu) as sess: diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py index f0ef03524bf..fecd7ddbf9f 100644 --- a/tensorflow/python/ops/functional_ops.py +++ b/tensorflow/python/ops/functional_ops.py @@ -802,6 +802,29 @@ def Gradient(inputs, f, name=None): return symbolic_gradient(input=inputs, Tout=tlist, f=f, name=name) +def _LoopBodyCaptureWrapper(func): + """Returns a wrapper for `func` that handles loop-carried captured inputs.""" + + @function.Defun( + *func.declared_input_types, func_name="%s_Wrapper" % func.name) + def Wrapper(*args): + """A wrapper that handles loop-carried captured inputs.""" + result = func(*args) + extra_args = tuple(function.get_extra_args()) + # Nullary functions return an Operation. Normal functions can't do this + # because their return values are converted to Tensors. + if isinstance(result, ops.Operation): + return extra_args + # Unary functions return a single Tensor value. + elif not isinstance(result, tuple): + return (result,) + extra_args + # N-ary functions return a tuple of Tensors. + else: + return result + extra_args + + return Wrapper + + # pylint: disable=invalid-name,protected-access def While(input_, cond, body, name=None, hostmem=None): r"""output = input; While (Cond(output)) { output = Body(output) }. @@ -823,11 +846,41 @@ def While(input_, cond, body, name=None, hostmem=None): hostmem: A list of integer. If i is in the list, input[i] is a host memory tensor. + Raises: + ValueError: if `cond` has implicitly captured inputs or if `cond` and `body` + have different signatures. + Returns: A list of `Tensor` objects. Has the same type as `input`. A list of output tensors whose types are T. 
""" - ret = gen_functional_ops._while(input_, cond, body, name=name) + if cond.captured_inputs: + raise ValueError("While op 'cond' argument must be a function " + "without implicitly captured inputs.") + + if cond.declared_input_types != body.declared_input_types: + raise ValueError( + "While op 'cond' and 'body' signatures do not match. %r vs %r" % + (cond.declared_input_types, body.declared_input_types)) + + if body.captured_inputs: + cond_dtypes = list( + body.declared_input_types) + [t.dtype for t in body.captured_inputs] + + @function.Defun(*cond_dtypes, func_name="%s_Wrapper" % cond.name) + def CondWrapper(*args): + """A wrapper that handles loop-carried captured inputs.""" + return cond(*args[:len(body.declared_input_types)]) + + ret = gen_functional_ops._while( + input_ + body.captured_inputs, + CondWrapper, + _LoopBodyCaptureWrapper(body), + name=name) + # Slice off the loop-carried captured inputs. + ret = ret[:-len(body.captured_inputs)] + else: + ret = gen_functional_ops._while(input_, cond, body, name=name) if hostmem: input_attr = attr_value_pb2.AttrValue() input_attr.list.i.extend(hostmem) @@ -876,11 +929,10 @@ def _ForUsingWhile(start, # must have identical inputs, we have to augment the cond signature to take # the same types as the carried loop variables. body_sig = [dtypes.int32] * 4 + list(forbody.declared_input_types)[1:] - cond_sig = body_sig + [t.dtype for t in forbody.captured_inputs] cond_name = "%s_Cond" % forbody.name - @function.Defun(*cond_sig, func_name=cond_name) + @function.Defun(*body_sig, func_name=cond_name) def WhileCond(i, n, *args): del args return i < n @@ -898,8 +950,7 @@ def _ForUsingWhile(start, # Unary functions return a single Tensor value. 
elif isinstance(for_result, ops.Tensor): for_result = (for_result,) - extra_args = tuple(function.get_extra_args()) - return (i + 1, n, start, delta) + tuple(for_result) + extra_args + return (i + 1, n, start, delta) + tuple(for_result) if hostmem is not None: hostmem = [0, 1, 2, 3] + [(4 + _) for _ in hostmem] @@ -907,13 +958,13 @@ def _ForUsingWhile(start, hostmem = [0, 1, 2, 3] results = While( - input_=[0, n, start, delta] + inputs + WhileBody.captured_inputs, + input_=[0, n, start, delta] + inputs, cond=WhileCond, body=WhileBody, name=name, hostmem=hostmem) # Slice off the loop-carried captured inputs. - return list(results[4:len(results) - len(WhileBody.captured_inputs)]) + return list(results[4:len(results)]) def For(start, @@ -947,29 +998,15 @@ def For(start, if rewrite_with_while: return _ForUsingWhile(start, limit, delta, inputs, body, name, hostmem) if body.captured_inputs: - wrapper_name = "%s_BodyWrapper" % body.name - - @function.Defun(*body.declared_input_types, func_name=wrapper_name) - def BodyWrapper(*args): - """A wrapper for body that handles loop-carried captured inputs.""" - body_result = body(*args) - extra_args = tuple(function.get_extra_args()) - # Nullary functions return an Operation. Normal functions can't do this - # because their return values are converted to Tensors. - if isinstance(body_result, ops.Operation): - return extra_args - # Unary functions return a single Tensor value. - elif not isinstance(body_result, tuple): - return (body_result,) + extra_args - # N-ary functions return a tuple of Tensors. - else: - return body_result + extra_args - - inputs += BodyWrapper.captured_inputs ret = gen_functional_ops._for( - start, limit, delta, inputs, BodyWrapper, name=name) + start, + limit, + delta, + inputs + body.captured_inputs, + _LoopBodyCaptureWrapper(body), + name=name) # Slice off the loop-carried captured inputs. 
- ret = ret[:-len(BodyWrapper.captured_inputs)] + ret = ret[:-len(body.captured_inputs)] else: ret = gen_functional_ops._for(start, limit, delta, inputs, body, name=name) if hostmem: From f16494683dffd7587e623b95af51e58716e12992 Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Mon, 5 Nov 2018 15:52:58 -0800 Subject: [PATCH 130/540] Internal changes PiperOrigin-RevId: 220188319 --- tensorflow/lite/kernels/internal/BUILD | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD index 32f61e02807..01c4c5d6b2c 100644 --- a/tensorflow/lite/kernels/internal/BUILD +++ b/tensorflow/lite/kernels/internal/BUILD @@ -593,6 +593,8 @@ cc_test( srcs = ["depthwiseconv_quantized_test.cc"], tags = [ "no_oss", + # TODO(b/119052685): Re-enable this test in TSAN. + "notsan", "tflite_not_portable_ios", ], deps = [ From 58cdbb31ced5bf5077227f7dcac1142a5ecd06c8 Mon Sep 17 00:00:00 2001 From: Bhavani Subramanian Date: Mon, 5 Nov 2018 16:02:27 -0800 Subject: [PATCH 131/540] Fix build error for //tensorflow/core:platform_port unit test. 
--- tensorflow/core/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index afe4c46c8ef..26dd295d0cb 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -383,6 +383,7 @@ cc_library( ":lib_platform", ":platform_base", "//tensorflow/core/platform/default/build_config:port", + "@com_google_absl//absl/base", "@snappy", ], ) From 689d9974b95a764759be2028cff36735fde3001f Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Mon, 5 Nov 2018 16:05:42 -0800 Subject: [PATCH 132/540] Change the maximum input for model::MakeParameter::parallelism --- .../kernels/data/experimental/numa_map_and_batch_dataset_op.cc | 2 +- tensorflow/core/kernels/data/map_and_batch_dataset_op.cc | 2 +- tensorflow/core/kernels/data/parallel_map_iterator.cc | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc index 1286795a0b2..068f8540230 100644 --- a/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc @@ -244,7 +244,7 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel { return model::MakeAsyncKnownRatioNode( std::move(args), dataset()->batch_size_, {model::MakeParameter("parallelism", num_parallel_calls_, /*min=*/1, - /*max=*/port::NumSchedulableCPUs())}); + /*max=*/ctx->runner_threadpool_size())}); } Status SaveInternal(IteratorStateWriter* writer) override { diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc index d7db3c4d183..31851925124 100644 --- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc @@ -296,7 +296,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { return model::MakeAsyncKnownRatioNode( 
std::move(args), dataset()->batch_size_, {model::MakeParameter("parallelism", num_parallel_calls_, /*min=*/1, - /*max=*/port::NumSchedulableCPUs())}); + /*max=*/ctx->runner_threadpool_size())}); } Status SaveInternal(IteratorStateWriter* writer) override { diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc index 10103230950..ec1c9238430 100644 --- a/tensorflow/core/kernels/data/parallel_map_iterator.cc +++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc @@ -101,7 +101,7 @@ class ParallelMapIterator : public DatasetBaseIterator { std::move(args), /*ratio=*/1, {model::MakeParameter("parallelism", num_parallel_calls_, /*min=*/1, - /*max=*/port::NumSchedulableCPUs())}); + /*max=*/ctx->runner_threadpool_size())}); } Status SaveInternal(IteratorStateWriter* writer) override { From 7ee5e20dc6438342ced17d87c870a1c7d3915536 Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Mon, 5 Nov 2018 16:11:41 -0800 Subject: [PATCH 133/540] Add tf.saved_model.save to the 2.x API (tf.saved_model.experimental.save in 1.x) Renamed to "save" from "export" since "import" is a reserved keyword. Exports SavedModels, currently targeting only serving. This CL includes some extra documentation and some tangential changes to make the example unit test work. 
PiperOrigin-RevId: 220191697 --- tensorflow/python/framework/meta_graph.py | 4 +- tensorflow/python/saved_model/BUILD | 12 +- .../python/saved_model/{export.py => save.py} | 120 ++++++++++++----- .../{export_test.py => save_test.py} | 127 ++++++++++++------ tensorflow/python/saved_model/saved_model.py | 2 +- .../tools/api/generator/api_init_files_v1.bzl | 1 + .../tensorflow.saved_model.experimental.pbtxt | 7 + .../golden/v1/tensorflow.saved_model.pbtxt | 4 + .../golden/v2/tensorflow.saved_model.pbtxt | 4 + 9 files changed, 198 insertions(+), 83 deletions(-) rename tensorflow/python/saved_model/{export.py => save.py} (81%) rename tensorflow/python/saved_model/{export_test.py => save_test.py} (66%) create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.saved_model.experimental.pbtxt diff --git a/tensorflow/python/framework/meta_graph.py b/tensorflow/python/framework/meta_graph.py index 33631282bd0..ddf6f66e8ab 100644 --- a/tensorflow/python/framework/meta_graph.py +++ b/tensorflow/python/framework/meta_graph.py @@ -462,7 +462,7 @@ def _is_default_attr_value(op_def, attr_name, attr_value): return False -def _strip_graph_default_valued_attrs(meta_graph_def): +def strip_graph_default_valued_attrs(meta_graph_def): """Strips default valued attributes for node defs in given MetaGraphDef. This method also sets `meta_info_def.stripped_default_attrs` in the given @@ -587,7 +587,7 @@ def create_meta_graph_def(meta_info_def=None, # Strip default valued attributes in graph_def. if strip_default_attrs: - _strip_graph_default_valued_attrs(meta_graph_def) + strip_graph_default_valued_attrs(meta_graph_def) # Adds saver_def. 
if saver_def: diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD index 576ad8ed65c..e7a3b8afd5d 100644 --- a/tensorflow/python/saved_model/BUILD +++ b/tensorflow/python/saved_model/BUILD @@ -21,9 +21,9 @@ py_library( deps = [ ":builder", ":constants", - ":export", ":loader", ":main_op", + ":save", ":signature_constants", ":signature_def_utils", ":simple_save", @@ -265,9 +265,9 @@ py_test( ) py_library( - name = "export", + name = "save", srcs = [ - "export.py", + "save.py", ], srcs_version = "PY2AND3", deps = [ @@ -285,11 +285,11 @@ py_library( ) py_test( - name = "export_test", - srcs = ["export_test.py"], + name = "save_test", + srcs = ["save_test.py"], srcs_version = "PY2AND3", deps = [ - ":export", + ":save", ":signature_constants", ":tag_constants", "//tensorflow/python/eager:def_function", diff --git a/tensorflow/python/saved_model/export.py b/tensorflow/python/saved_model/save.py similarity index 81% rename from tensorflow/python/saved_model/export.py rename to tensorflow/python/saved_model/save.py index 030182ca4bc..63575f631eb 100644 --- a/tensorflow/python/saved_model/export.py +++ b/tensorflow/python/saved_model/save.py @@ -25,6 +25,7 @@ from tensorflow.core.protobuf import saved_model_pb2 from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.eager import function +from tensorflow.python.framework import meta_graph from tensorflow.python.framework import ops from tensorflow.python.lib.io import file_io from tensorflow.python.ops import array_ops @@ -37,6 +38,7 @@ from tensorflow.python.training.checkpointable import base from tensorflow.python.training.checkpointable import util from tensorflow.python.util import compat from tensorflow.python.util import nest +from tensorflow.python.util.tf_export import tf_export def _find_function_to_export(root): @@ -51,7 +53,7 @@ def _find_function_to_export(root): # TODO(allenl): Automatically infer signatures for Keras 
functional models? if not functions: raise ValueError( - ("Exporting an object with no tf.saved_model_save(..., signatures=...) " + ("Exporting an object with no tf.saved_model.save(..., signatures=...) " "argument specified, and with no @tf.function-decorated methods " "attached to it. In the future this will be a supported use-case for " "Python re-import, but at the moment saving a SavedModel without " @@ -60,7 +62,7 @@ def _find_function_to_export(root): "explicitly.")) elif len(functions) > 1: raise ValueError( - ("Exporting an object with no tf.saved_model_save(..., signatures=...) " + ("Exporting an object with no tf.saved_model.save(..., signatures=...) " "argument specified, and with more than one @tf.function-decorated " "method attached to it: {}. The signature keys for these functions " "are ambiguous. Specify signature functions explicitly.").format( @@ -370,46 +372,66 @@ def _make_graph_def(root, signature_functions, object_saver): return graph_def, signatures, saver_def -def export(obj, export_dir, signatures=None): +@tf_export("saved_model.save", v1=["saved_model.experimental.save"]) +def save(obj, export_dir, signatures=None): # pylint: disable=line-too-long """Exports the Checkpointable object `obj` to [SavedModel format](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md). - The `signatures` argument indicates TensorFlow functions which will be + Example usage: + + ```python + class Adder(tf.train.Checkpoint): + + @tf.function(input_signature=[tf.TensorSpec(shape=None, dtype=tf.float32)]) + def add(self, x): + return x + x + 1. + + to_export = Adder() + tf.saved_model.save(to_export, '/tmp/adder') + ``` + + The resulting SavedModel is then servable with an input named "x", its value + having any shape and dtype float32. + + The optional `signatures` argument controls which methods in `obj` will be available to programs which consume `SavedModel`s, for example serving APIs. 
Python functions may be decorated with `@tf.function(input_signature=...)` and passed as signatures directly, or - created without a signature using `@tf.function` and then converted to a - concrete TensorFlow function using `f.get_concrete_function(...)`. + lazily with a call to `get_concrete_function` on the method decorated with + `@tf.function`. - In either case, `Tensor` inputs to `signatures` functions which are not - associated with a unique Python argument name must have names explicitly - specified in their `tf.TensorSpec` objects. Cases where this is necessary - include positional arguments passed through variadic `*args` and multiple - `Tensor` inputs which are part of the same nested structure. + If the `signatures` argument is omitted, `obj` will be searched for + `@tf.function`-decorated methods. If exactly one `@tf.function` is found, that + method will be used as the default signature for the SavedModel. This behavior + is expected to change in the future, when a corresponding + `tf.saved_model.load` symbol is added. At that point signatures will be + completely optional, and any `@tf.function` attached to `obj` or its + dependencies will be exported for use with `load`. + + When invoking a signature in an exported SavedModel, `Tensor` arguments are + identified by name. These names will come from the Python function's argument + names by default. They may be overridden by specifying a `name=...` argument + in the corresponding `tf.TensorSpec` object. Explicit naming is required if + multiple `Tensor`s are passed through a single argument to the Python + function. The outputs of functions used as `signatures` must either be flat lists, in which case outputs will be numbered, or a dictionary mapping string keys to - Tensors, in which case the string keys will be used to name outputs. + `Tensor`, in which case the keys will be used to name outputs. 
- Exporting with a signature specified: + Since `tf.keras.Model` objects are also Checkpointable, this function can be + used to export Keras models. For example, exporting with a signature + specified: ```python class Model(tf.keras.Model): - @tf.function(input_signature=tf.TensorSpec(shape=[None], dtype=tf.string)) - def serve(serialized): + @tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.string)]) + def serve(self, serialized): ... m = Model() - tf.saved_model.export(m, '/tmp/saved_model/', signatures=m.serve) - ``` - - The `signatures` argument may be omitted if only one method of the exported - object is decorated with `tf.function` and that method has an input signature - specified. - - ```python - tf.saved_model.export(m, '/tmp/saved_model/') + tf.saved_model.save(m, '/tmp/saved_model/') ``` Exporting from a function without a fixed signature: @@ -418,13 +440,13 @@ def export(obj, export_dir, signatures=None): class Model(tf.keras.Model): @tf.function - def compute(x): + def call(self, x): ... m = Model() - tf.saved_model.export( + tf.saved_model.save( m, '/tmp/saved_model/', - signatures=m.compute.get_concrete_function( + signatures=m.call.get_concrete_function( tf.TensorSpec(shape=[None, 3], dtype=tf.float32, name="inp"))) ``` @@ -434,14 +456,47 @@ def export(obj, export_dir, signatures=None): automatically. This is the same tracking scheme that `tf.train.Checkpoint` uses, and an exported `Checkpoint` object may be restored as a training checkpoint by pointing `tf.train.Checkpoint.restore` to the SavedModel's - "variables/" subdirectory. + "variables/" subdirectory. Currently variables are the only stateful objects + supported by `tf.saved_model.save`, but others (e.g. tables) will be supported + in the future. + + `tf.function` does not hard-code device annotations from outside the function + body, instead using the calling context's device. 
This means for example that + exporting a model which runs on a GPU and serving it on a CPU will generally + work, with some exceptions. `tf.device` annotations inside the body of the + function will be hard-coded in the exported model; this type of annotation is + discouraged. Device-specific operations, e.g. with "cuDNN" in the name or with + device-specific layouts, may cause issues. Currently a `DistributionStrategy` + is another exception: active distribution strategies will cause device + placements to be hard-coded in a function. Exporting a single-device + computation and importing under a `DistributionStrategy` is not currently + supported, but may be in the future. + + SavedModels exported with `tf.saved_model.save` [strip default-valued + attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes) + automatically, which removes one source of incompatibilities when the consumer + of a SavedModel is running an older TensorFlow version than the + producer. There are however other sources of incompatibilities which are not + handled automatically, such as when the exported model contains operations + which the consumer does not have definitions for. + + The current implementation of `tf.saved_model.save` targets serving use-cases, + but omits information which will be necessary for the planned future + implementation of `tf.saved_model.load`. Exported models using the current + `save` implementation, and other existing SavedModels, will not be compatible + with `tf.saved_model.load` when it is implemented. Further, `save` will in the + future attempt to export `@tf.function`-decorated methods which it does not + currently inspect, so some objects which are exportable today will raise + exceptions on export in the future (e.g. due to complex/non-serializable + default arguments). Such backwards-incompatible API changes are expected only + prior to the TensorFlow 2.0 release. 
Args: obj: A checkpointable object to export. export_dir: A directory in which to write the SavedModel. signatures: Optional, either a `tf.function` with an input signature specified or the result of `f.get_concrete_function` on a - `tf.function`-decorated function `f`, in which case `f` will be used to + `@tf.function`-decorated function `f`, in which case `f` will be used to generate a signature for the SavedModel under the default serving signature key. `signatures` may also be a dictionary, in which case it maps from signature keys to either `tf.function` instances with input @@ -456,12 +511,14 @@ def export(obj, export_dir, signatures=None): if not isinstance(obj, base.CheckpointableBase): raise ValueError( "Expected a Checkpointable object for export, got {}.".format(obj)) + if signatures is None: + # Note that we run this before saving the checkpoint, since looping over + # attributes may have the side effect of creating variables in some cases. + signatures = _find_function_to_export(obj) object_saver = util.CheckpointableSaver(obj) utils_impl.get_or_create_variables_dir(export_dir) object_saver.save(utils_impl.get_variables_path(export_dir)) - if signatures is None: - signatures = _find_function_to_export(obj) signatures = _canonicalize_signatures(signatures) graph_def, signatures, saver_def = _make_graph_def( obj, signatures, object_saver) @@ -476,6 +533,7 @@ def export(obj, export_dir, signatures=None): meta_graph_def.graph_def.MergeFrom(graph_def) for signature_key, signature in signatures.items(): meta_graph_def.signature_def[signature_key].MergeFrom(signature) + meta_graph.strip_graph_default_valued_attrs(meta_graph_def) path = os.path.join( compat.as_bytes(export_dir), compat.as_bytes(constants.SAVED_MODEL_FILENAME_PB)) diff --git a/tensorflow/python/saved_model/export_test.py b/tensorflow/python/saved_model/save_test.py similarity index 66% rename from tensorflow/python/saved_model/export_test.py rename to tensorflow/python/saved_model/save_test.py 
index d70e53a26f0..42ff508b38a 100644 --- a/tensorflow/python/saved_model/export_test.py +++ b/tensorflow/python/saved_model/save_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Tests for checkpointable object SavedModel export.""" +"""Tests for checkpointable object SavedModel save.""" from __future__ import absolute_import from __future__ import division @@ -33,11 +33,12 @@ from tensorflow.python.keras.engine import training from tensorflow.python.keras.layers import core from tensorflow.python.ops import math_ops from tensorflow.python.ops import variables -from tensorflow.python.saved_model import export from tensorflow.python.saved_model import loader +from tensorflow.python.saved_model import save from tensorflow.python.saved_model import signature_constants from tensorflow.python.training import adam from tensorflow.python.training.checkpointable import tracking +from tensorflow.python.training.checkpointable import util class _ModelWithOptimizer(training.Model): @@ -59,15 +60,15 @@ class _ModelWithOptimizer(training.Model): return {"loss": loss} -class ExportTest(test.TestCase): +class SaveTest(test.TestCase): def _import_and_infer( - self, export_dir, inputs, + self, save_dir, inputs, signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY): """Import a SavedModel into a TF 1.x-style graph and run `signature_key`.""" graph = ops.Graph() with graph.as_default(), self.session(graph) as session: - model = loader.load(session, [], export_dir) + model = loader.load(session, [], save_dir) signature = model.signature_def[signature_key] self.assertEqual(set(inputs.keys()), set(signature.inputs.keys())) feed_dict = {} @@ -80,42 +81,42 @@ class ExportTest(test.TestCase): output_tensor_info.name) return session.run(output_dict, feed_dict=feed_dict) - def test_method_export_signature(self): + 
def test_method_save_signature(self): root = tracking.Checkpointable() root.f = def_function.function( lambda x: 2. * x, input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)]) root.f(constant_op.constant(1.)) - export_dir = os.path.join(self.get_temp_dir(), "saved_model") - export.export(root, export_dir, root.f) + save_dir = os.path.join(self.get_temp_dir(), "saved_model") + save.save(root, save_dir, root.f) self.assertEqual( {"output_0": 2.}, - self._import_and_infer(export_dir, {"x": 1.})) + self._import_and_infer(save_dir, {"x": 1.})) - def test_method_export_concrete(self): + def test_method_save_concrete(self): root = tracking.Checkpointable() root.f = def_function.function( lambda z: {"out": 2. * z}) root.f(constant_op.constant(1.)) - export_dir = os.path.join(self.get_temp_dir(), "saved_model") - export.export( + save_dir = os.path.join(self.get_temp_dir(), "saved_model") + save.save( root, - export_dir, + save_dir, {"non_default_key": root.f.get_concrete_function( tensor_spec.TensorSpec(None, dtypes.float32))}) self.assertEqual( {"out": 2.}, self._import_and_infer( - export_dir, {"z": 1.}, signature_key="non_default_key")) + save_dir, {"z": 1.}, signature_key="non_default_key")) def test_non_concrete_error(self): root = tracking.Checkpointable() root.f = def_function.function(lambda x: 2. 
* x) root.f(constant_op.constant(1.)) - export_dir = os.path.join(self.get_temp_dir(), "saved_model") + save_dir = os.path.join(self.get_temp_dir(), "saved_model") with self.assertRaisesRegexp( ValueError, "must be converted to concrete functions"): - export.export(root, export_dir, root.f) + save.save(root, save_dir, root.f) def test_nested_inputs(self): root = tracking.Checkpointable() @@ -124,7 +125,7 @@ class ExportTest(test.TestCase): input_signature=([tensor_spec.TensorSpec(None, dtypes.float32), tensor_spec.TensorSpec(None, dtypes.float32)],)) root.f([constant_op.constant(1.), constant_op.constant(1.)]) - # Concrete functions must always have uniquely named Tensor inputs. Export + # Concrete functions must always have uniquely named Tensor inputs. Save # relies on this. with self.assertRaisesRegexp( ValueError, "two arguments named 'x'"): @@ -134,22 +135,22 @@ class ExportTest(test.TestCase): root = tracking.Checkpointable() root.f = def_function.function(lambda x: (2. * x, (3. * x, 4. * x))) root.f(constant_op.constant(1.)) - to_export = root.f.get_concrete_function(constant_op.constant(1.)) - export_dir = os.path.join(self.get_temp_dir(), "saved_model") + to_save = root.f.get_concrete_function(constant_op.constant(1.)) + save_dir = os.path.join(self.get_temp_dir(), "saved_model") with self.assertRaisesRegexp( ValueError, "non-flat outputs"): - export.export(root, export_dir, to_export) + save.save(root, save_dir, to_save) def test_nested_dict_outputs(self): root = tracking.Checkpointable() root.f = def_function.function( lambda x: {"a": 2. * x, "b": (3. * x, 4. 
* x)}) root.f(constant_op.constant(1.)) - to_export = root.f.get_concrete_function(constant_op.constant(1.)) - export_dir = os.path.join(self.get_temp_dir(), "saved_model") + to_save = root.f.get_concrete_function(constant_op.constant(1.)) + save_dir = os.path.join(self.get_temp_dir(), "saved_model") with self.assertRaisesRegexp( ValueError, "dictionary containing non-Tensor value"): - export.export(root, export_dir, to_export) + save.save(root, save_dir, to_save) def test_variable(self): root = tracking.Checkpointable() @@ -158,49 +159,49 @@ class ExportTest(test.TestCase): root.f = def_function.function( lambda x: root.v1 * root.v2 * x) root.f(constant_op.constant(1.)) - to_export = root.f.get_concrete_function(constant_op.constant(1.)) - export_dir = os.path.join(self.get_temp_dir(), "saved_model") - export.export(root, export_dir, to_export) + to_save = root.f.get_concrete_function(constant_op.constant(1.)) + save_dir = os.path.join(self.get_temp_dir(), "saved_model") + save.save(root, save_dir, to_save) self.assertAllEqual({"output_0": 12.}, - self._import_and_infer(export_dir, {"x": 2.})) + self._import_and_infer(save_dir, {"x": 2.})) def test_optimizer(self): x = constant_op.constant([[3., 4.]]) y = constant_op.constant([2.]) model = _ModelWithOptimizer() first_loss = model(x, y) - export_dir = os.path.join(self.get_temp_dir(), "saved_model") - export.export(model, export_dir, model.call) + save_dir = os.path.join(self.get_temp_dir(), "saved_model") + save.save(model, save_dir, model.call) second_loss = model(x, y) self.assertNotEqual(first_loss, second_loss) self.assertAllClose( second_loss, - self._import_and_infer(export_dir, {"x": [[3., 4.]], "y": [2.]})) + self._import_and_infer(save_dir, {"x": [[3., 4.]], "y": [2.]})) - def test_trivial_export_exception(self): - export_dir = os.path.join(self.get_temp_dir(), "saved_model") + def test_trivial_save_exception(self): + save_dir = os.path.join(self.get_temp_dir(), "saved_model") with 
self.assertRaisesRegexp(ValueError, "signature"): - export.export(tracking.Checkpointable(), export_dir) + save.save(tracking.Checkpointable(), save_dir) def test_single_method_default_signature(self): model = _ModelWithOptimizer() x = constant_op.constant([[3., 4.]]) y = constant_op.constant([2.]) model(x, y) - export_dir = os.path.join(self.get_temp_dir(), "saved_model") - export.export(model, export_dir) + save_dir = os.path.join(self.get_temp_dir(), "saved_model") + save.save(model, save_dir) self.assertIn("loss", - self._import_and_infer(export_dir, + self._import_and_infer(save_dir, {"x": [[3., 4.]], "y": [2.]})) def test_single_function_default_signature(self): model = tracking.Checkpointable() model.f = def_function.function(lambda: 3., input_signature=()) model.f() - export_dir = os.path.join(self.get_temp_dir(), "saved_model") - export.export(model, export_dir) + save_dir = os.path.join(self.get_temp_dir(), "saved_model") + save.save(model, save_dir) self.assertAllClose({"output_0": 3.}, - self._import_and_infer(export_dir, {})) + self._import_and_infer(save_dir, {})) def test_ambiguous_signatures(self): model = _ModelWithOptimizer() @@ -208,9 +209,49 @@ class ExportTest(test.TestCase): y = constant_op.constant([2.]) model(x, y) model.second_function = def_function.function(lambda: 1.) - export_dir = os.path.join(self.get_temp_dir(), "saved_model") + save_dir = os.path.join(self.get_temp_dir(), "saved_model") with self.assertRaisesRegexp(ValueError, "call.*second_function"): - export.export(model, export_dir) + save.save(model, save_dir) + + def test_docstring(self): + + class Adder(util.Checkpoint): + + @def_function.function(input_signature=[tensor_spec.TensorSpec( + shape=None, dtype=dtypes.float32)]) + def add(self, x): + return x + x + 1. 
+ + to_save = Adder() + to_save.add(constant_op.constant(1.)) + save_dir = os.path.join(self.get_temp_dir(), "saved_model") + save.save(to_save, save_dir) + self.assertAllClose({"output_0": 7.}, + self._import_and_infer(save_dir, {"x": 3.})) + + def test_default_attr_stripping(self): + + class Complex(util.Checkpoint): + + @def_function.function(input_signature=[]) + def __call__(self): + return math_ops.complex( + constant_op.constant(1.), + constant_op.constant(2.), + name="complex") + + to_save = Complex() + to_save() + save_dir = os.path.join(self.get_temp_dir(), "saved_model") + save.save(to_save, save_dir) + graph = ops.Graph() + with graph.as_default(), self.session(graph) as session: + loader.load(session, [], save_dir) + func, = graph._functions.values() + complex_node, = [ + node for node in func.definition.node_def if node.op == "Complex"] + self.assertNotIn("T", complex_node.attr) + self.assertNotIn("Tout", complex_node.attr) class MemoryTests(test.TestCase): @@ -227,8 +268,8 @@ class MemoryTests(test.TestCase): # TODO(allenl): debug reference cycles in Python 2.x self.skipTest("This test only works in Python 3+. 
Reference cycles are " "created in older Python versions.") - export_dir = os.path.join(self.get_temp_dir(), "saved_model") - export.export(self._model, export_dir, self._model.call) + save_dir = os.path.join(self.get_temp_dir(), "saved_model") + save.save(self._model, save_dir, self._model.call) if __name__ == "__main__": diff --git a/tensorflow/python/saved_model/saved_model.py b/tensorflow/python/saved_model/saved_model.py index 6702c996071..fcde6b47e4f 100644 --- a/tensorflow/python/saved_model/saved_model.py +++ b/tensorflow/python/saved_model/saved_model.py @@ -29,8 +29,8 @@ from tensorflow.python.saved_model import signature_constants from tensorflow.python.saved_model import signature_def_utils from tensorflow.python.saved_model import tag_constants from tensorflow.python.saved_model import utils +from tensorflow.python.saved_model.save import save # pylint: enable=unused-import # pylint: disable=wildcard-import from tensorflow.python.saved_model.simple_save import * # pylint: enable=wildcard-import - diff --git a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl index a4fc58b8510..d0bac4033ca 100644 --- a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl +++ b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl @@ -81,6 +81,7 @@ TENSORFLOW_API_INIT_FILES_V1 = [ "saved_model/__init__.py", "saved_model/builder/__init__.py", "saved_model/constants/__init__.py", + "saved_model/experimental/__init__.py", "saved_model/loader/__init__.py", "saved_model/main_op/__init__.py", "saved_model/signature_constants/__init__.py", diff --git a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.experimental.pbtxt new file mode 100644 index 00000000000..34343e7c039 --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.experimental.pbtxt @@ -0,0 +1,7 @@ +path: 
"tensorflow.saved_model.experimental" +tf_module { + member_method { + name: "save" + argspec: "args=[\'obj\', \'export_dir\', \'signatures\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt index 5b28f7b9b18..2055bfbf066 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt @@ -108,6 +108,10 @@ tf_module { name: "constants" mtype: "" } + member { + name: "experimental" + mtype: "" + } member { name: "loader" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt index 6b86d4b49f5..d57936a2f1c 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt @@ -120,4 +120,8 @@ tf_module { name: "regression_signature_def" argspec: "args=[\'examples\', \'predictions\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "save" + argspec: "args=[\'obj\', \'export_dir\', \'signatures\'], varargs=None, keywords=None, defaults=[\'None\'], " + } } From 7137218c2dcbb3963415b3a09fed64b96dceb3cd Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Mon, 5 Nov 2018 16:16:05 -0800 Subject: [PATCH 134/540] Internal change PiperOrigin-RevId: 220192329 --- tensorflow/compiler/tests/BUILD | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 06501e2177b..194e710f1f1 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -714,6 +714,10 @@ tf_xla_py_test( size = "medium", srcs = ["reduce_ops_test.py"], shard_count = 5, + tags = [ + # TODO(b/119059212): Re-enable this test in OSS. 
+ "no_oss", + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", From 4243facb600bbf62ca3a000e19929ec219892924 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 5 Nov 2018 16:26:14 -0800 Subject: [PATCH 135/540] Added support for weight decay in most TPU embedding optimizers, including AdamW and MomentumW. PiperOrigin-RevId: 220193891 --- .../contrib/tpu/proto/optimization_parameters.proto | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tensorflow/contrib/tpu/proto/optimization_parameters.proto b/tensorflow/contrib/tpu/proto/optimization_parameters.proto index c2e3be03db0..aae1ab1d37a 100644 --- a/tensorflow/contrib/tpu/proto/optimization_parameters.proto +++ b/tensorflow/contrib/tpu/proto/optimization_parameters.proto @@ -154,6 +154,14 @@ message OptimizationParameters { // updates; not present means no limits are applied. ClippingLimits gradient_clipping_limits = 7; + // Amount of weight decay to apply; see weight_decay_optimizers.py for + // details. Almost all optimizers are supported with this option (MDL Adagrad + // Light does not work, and SGD does not behave as expected if it is enabled). + // Although there is no check, users who want weight decay will probably also + // want to enable gradient accumulation as well so that the decay will happen + // once per minibatch. + float weight_decay_factor = 16; + // Whether to use gradient accumulation (do two passes over the input // gradients: one to accumulate them into a temporary array and another to // apply them using the actual optimization algorithm). This feature is From 6c776aa01e48313c1c3322c2f40de24355489172 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Mon, 5 Nov 2018 16:31:22 -0800 Subject: [PATCH 136/540] Making all the boosted tree resources subclass TrackableResource. This is needed for tracking resources in the absence of collection as well as help out with saving / loading in TF 2.0 land. 
PiperOrigin-RevId: 220194735 --- .../learner/batch/ordinal_split_handler.py | 10 +- .../stats_accumulator_ops_test.py | 28 +-- .../boosted_trees/python/ops/model_ops.py | 52 +++- .../boosted_trees/python/ops/quantile_ops.py | 155 +++++++----- .../python/ops/stats_accumulator_ops.py | 236 ++++++++++-------- .../python/training/functions/gbdt_batch.py | 2 +- tensorflow/python/ops/boosted_trees_ops.py | 52 ++-- 7 files changed, 329 insertions(+), 206 deletions(-) diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py index f45010ec26e..1fffbb5f660 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py +++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py @@ -142,7 +142,7 @@ class InequalitySplitHandler(base_split_handler.BaseSplitHandler): name="StatsAccumulator/{}".format(self._name)) # Allocate both stats accumulator and quantile accumulator on the same # device so that we can build splits with fewer RPCs. 
- with ops.colocate_with(self._stats_accumulator.resource()): + with ops.colocate_with(self._stats_accumulator.resource_handle): self._quantile_accumulator = quantile_ops.QuantileAccumulator( init_stamp_token, epsilon=epsilon, @@ -268,8 +268,8 @@ class DenseSplitHandler(InequalitySplitHandler): handler = make_dense_split_tensor are_splits_ready, partition_ids, gains, split_infos = ( - handler(self._quantile_accumulator.resource(), - self._stats_accumulator.resource(), stamp_token, + handler(self._quantile_accumulator.resource_handle, + self._stats_accumulator.resource_handle, stamp_token, next_stamp_token, self._multiclass_strategy, class_id, self._feature_column_group_id, self._l1_regularization, self._l2_regularization, self._tree_complexity_regularization, @@ -447,8 +447,8 @@ class SparseSplitHandler(InequalitySplitHandler): handler = make_sparse_split_tensor are_splits_ready, partition_ids, gains, split_infos = ( - handler(self._quantile_accumulator.resource(), - self._stats_accumulator.resource(), stamp_token, + handler(self._quantile_accumulator.resource_handle, + self._stats_accumulator.resource_handle, stamp_token, next_stamp_token, self._multiclass_strategy, class_id, self._feature_column_group_id, self._l1_regularization, self._l2_regularization, self._tree_complexity_regularization, diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py index 05ce0884ccf..356ae337685 100644 --- a/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py +++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py @@ -34,7 +34,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): stamp_token=0, gradient_shape=tensor_shape.scalar(), hessian_shape=tensor_shape.scalar()) - with ops.control_dependencies([accumulator._create_op]): + with 
ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, partition_ids=[1, 2], @@ -62,7 +62,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): stamp_token=0, gradient_shape=tensor_shape.scalar(), hessian_shape=tensor_shape.scalar()) - with ops.control_dependencies([accumulator._create_op]): + with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, partition_ids=[1, 2, 1], @@ -91,7 +91,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): stamp_token=0, gradient_shape=tensor_shape.scalar(), hessian_shape=tensor_shape.scalar()) - with ops.control_dependencies([accumulator._create_op]): + with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, partition_ids=[1, 2], @@ -123,7 +123,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): stamp_token=0, gradient_shape=tensor_shape.scalar(), hessian_shape=tensor_shape.scalar()) - with ops.control_dependencies([accumulator._create_op]): + with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, partition_ids=[1, 2], @@ -133,7 +133,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): with ops.control_dependencies([op1]): (stamp_token, num_updates, partition_1, feature_1, grads_1, - hessians_1) = accumulator.serialize() + hessians_1) = accumulator.saveable.serialize() # Make sure that the accumulator hasn't changed during serialization. with ops.control_dependencies([stamp_token]): num_updates_2, partition_2, feature_2, grads_2, hessians_2 = ( @@ -164,7 +164,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): stamp_token=0, gradient_shape=tensor_shape.scalar(), hessian_shape=tensor_shape.scalar()) - with ops.control_dependencies([accumulator._create_op]): + with ops.control_dependencies([accumulator.initializer]): # These will be deleted due to deserialize call. 
op1 = accumulator.add( stamp_token=0, @@ -175,7 +175,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): with ops.control_dependencies([op1]): deserialize = ( - accumulator.deserialize( + accumulator.saveable.deserialize( stamp_token=2, num_updates=3, partition_ids=[3, 4], @@ -223,7 +223,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase): stamp_token=0, gradient_shape=tensor_shape.TensorShape([2]), hessian_shape=tensor_shape.TensorShape([2, 2])) - with ops.control_dependencies([accumulator._create_op]): + with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, partition_ids=[1, 2], @@ -261,7 +261,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase): stamp_token=0, gradient_shape=tensor_shape.TensorShape([2]), hessian_shape=tensor_shape.TensorShape([2, 2])) - with ops.control_dependencies([accumulator._create_op]): + with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, partition_ids=[1, 2], @@ -299,7 +299,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase): stamp_token=0, gradient_shape=tensor_shape.TensorShape([2]), hessian_shape=tensor_shape.TensorShape([2, 2])) - with ops.control_dependencies([accumulator._create_op]): + with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, partition_ids=[1, 2], @@ -336,7 +336,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase): stamp_token=0, gradient_shape=tensor_shape.TensorShape([2]), hessian_shape=tensor_shape.TensorShape([2, 2])) - with ops.control_dependencies([accumulator._create_op]): + with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, partition_ids=[1, 2], @@ -349,7 +349,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase): with ops.control_dependencies([op1]): (stamp_token, num_updates_1, partition_1, feature_1, grads_1, - hessians_1) = 
accumulator.serialize() + hessians_1) = accumulator.saveable.serialize() # Make sure that the accumulator hasn't changed during serialization. with ops.control_dependencies([stamp_token]): num_updates_2, partition_2, feature_2, grads_2, hessians_2 = ( @@ -386,7 +386,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase): stamp_token=0, gradient_shape=tensor_shape.TensorShape([2]), hessian_shape=tensor_shape.TensorShape([2, 2])) - with ops.control_dependencies([accumulator._create_op]): + with ops.control_dependencies([accumulator.initializer]): # These will be deleted due to deserialize call. op1 = accumulator.add( stamp_token=0, @@ -399,7 +399,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase): 0.08]]]) with ops.control_dependencies([op1]): - deserialize = accumulator.deserialize( + deserialize = accumulator.saveable.deserialize( stamp_token=2, num_updates=3, partition_ids=[3, 4], diff --git a/tensorflow/contrib/boosted_trees/python/ops/model_ops.py b/tensorflow/contrib/boosted_trees/python/ops/model_ops.py index 25b2c9e2fd7..fca22c71a83 100644 --- a/tensorflow/contrib/boosted_trees/python/ops/model_ops.py +++ b/tensorflow/contrib/boosted_trees/python/ops/model_ops.py @@ -17,6 +17,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import functools + # pylint: disable=unused-import from tensorflow.contrib.boosted_trees.python.ops import boosted_trees_ops_loader # pylint: enable=unused-import @@ -31,6 +33,7 @@ from tensorflow.contrib.boosted_trees.python.ops.gen_model_ops import tree_ensem from tensorflow.python.framework import ops from tensorflow.python.ops import resources from tensorflow.python.training import saver +from tensorflow.python.training.checkpointable import tracking ops.NotDifferentiable("TreeEnsembleVariable") ops.NotDifferentiable("TreeEnsembleSerialize") @@ -82,6 +85,44 @@ class TreeEnsembleVariableSavable(saver.BaseSaverBuilder.SaveableObject): 
tree_ensemble_config=restored_tensors[1]) +class TreeEnsembleVariable(tracking.TrackableResource): + """A Tree ensemble model.""" + + def __init__(self, stamp_token, tree_ensemble_config, name, container=None): + self._stamp_token = stamp_token + self._tree_ensemble_config = tree_ensemble_config + self._name = name + self._container = container + self._init_op = None + super(TreeEnsembleVariable, self).__init__() + + def create_resource(self): + return gen_model_ops.decision_tree_ensemble_resource_handle_op( + self._container, shared_name=self._name, name=self._name) + + def initialize(self): + return gen_model_ops.create_tree_ensemble_variable( + self.resource_handle, self._stamp_token, self._tree_ensemble_config) + + @property + def initializer(self): + if self._init_op is None: + self._init_op = self.initialize() + return self._init_op + + def is_initialized(self): + return gen_model_ops.tree_ensemble_is_initialized_op(self.resource_handle) + + def _gather_saveables_for_checkpoint(self): + return { + "tree_ensemble_variable": + functools.partial( + TreeEnsembleVariableSavable, + tree_ensemble_handle=self.resource_handle, + create_op=self.initializer) + } + + def tree_ensemble_variable(stamp_token, tree_ensemble_config, name, @@ -99,12 +140,11 @@ def tree_ensemble_variable(stamp_token, A `Tensor` of type mutable `string`. The handle to the tree ensemble. 
""" with ops.name_scope(name, "TreeEnsembleVariable") as name: - resource_handle = gen_model_ops.decision_tree_ensemble_resource_handle_op( - container, shared_name=name, name=name) - create_op = gen_model_ops.create_tree_ensemble_variable( - resource_handle, stamp_token, tree_ensemble_config) - is_initialized_op = gen_model_ops.tree_ensemble_is_initialized_op( - resource_handle) + tree_ensemble_var = TreeEnsembleVariable(stamp_token, tree_ensemble_config, + name, container) + resource_handle = tree_ensemble_var.resource_handle + create_op = tree_ensemble_var.initializer + is_initialized_op = tree_ensemble_var.is_initialized() # Adds the variable to the savable list. saveable = TreeEnsembleVariableSavable(resource_handle, create_op, resource_handle.name) diff --git a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py index 19b6b3296db..0c319cc9bd1 100644 --- a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py +++ b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py @@ -33,12 +33,60 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.ops import resources from tensorflow.python.training import saver +from tensorflow.python.training.checkpointable import tracking # Pattern to remove all non alpha numeric from a string. _PATTERN = re.compile(r"[\W_]+") -class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject): +class QuantileAccumulatorSaveable(saver.BaseSaverBuilder.SaveableObject): + """SaveableObject implementation for QuantileAccumulator.""" + + def __init__(self, resource_handle, create_op, name): + self._resource_handle = resource_handle + self._create_op = create_op + stamp_token, state, are_buckets_ready, buckets = ( + gen_quantile_ops.quantile_accumulator_serialize(resource_handle)) + # slice_spec is useful for saving a slice from a variable. + # It's not meaningful in quantile accumulator. 
+ slice_spec = "" + def make_save_spec(tensor, suffix): + return saver.BaseSaverBuilder.SaveSpec(tensor, slice_spec, name + suffix) + + specs = [make_save_spec(stamp_token, "_stamp")] + specs += [make_save_spec(state, "_state")] + specs += [make_save_spec(are_buckets_ready, "_are_buckets_ready")] + specs += [make_save_spec(buckets, "buckets")] + super(QuantileAccumulatorSaveable, self).__init__(self._resource_handle, + specs, name) + + def restore(self, restored_tensors, unused_restored_shapes): + """Restores the associated quantile accumulator from 'restored_tensors'. + + Args: + restored_tensors: the tensors that were loaded from a checkpoint. + unused_restored_shapes: the shapes this object should conform to after + restore. + + Returns: + The operation that restores the state of the quantile accumulator. + """ + # Read the restored tensors with the same order that were added to saving + # spec. + stamp_token = restored_tensors[:1] + state = restored_tensors[1:2] + are_buckets_ready = restored_tensors[2:3] + buckets = restored_tensors[3] + with ops.control_dependencies([self._create_op]): + return gen_quantile_ops.quantile_accumulator_deserialize( + self._resource_handle, + stamp_token=stamp_token, + stream_state=state, + are_buckets_ready=are_buckets_ready, + buckets=buckets) + + +class QuantileAccumulator(tracking.TrackableResource): """A resource that allows distributed quantile computation.""" def __init__(self, @@ -61,82 +109,64 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject): generate_quantiles: Generate quantiles instead of approximate boundaries. If true, exactly `num_quantiles` will be produced in the final summary. 
""" + self._init_stamp_token = init_stamp_token self._epsilon = epsilon + self._num_quantiles = num_quantiles + self._max_elements = max_elements + self._container = container self._generate_quantiles = generate_quantiles + super(QuantileAccumulator, self).__init__() name = _PATTERN.sub("", name) with ops.name_scope(name, "QuantileAccumulator") as name: - self._quantile_accumulator_handle = ( - gen_quantile_ops.quantile_stream_resource_handle_op( - container=container, shared_name=name, name=name)) - self._create_op = gen_quantile_ops.create_quantile_accumulator( - self._quantile_accumulator_handle, - init_stamp_token, - epsilon=epsilon, - max_elements=max_elements, - num_quantiles=num_quantiles, - generate_quantiles=generate_quantiles) - is_initialized_op = gen_quantile_ops.quantile_accumulator_is_initialized( - self._quantile_accumulator_handle) - resources.register_resource(self._quantile_accumulator_handle, - self._create_op, is_initialized_op) - self._make_savable(name) + self._name = name + self._resource_handle = self.create_resource() + self._init_op = self.initialize() + is_initialized_op = self.is_initialized() + resources.register_resource(self.resource_handle, self._init_op, + is_initialized_op) + self._saveable = QuantileAccumulatorSaveable(self.resource_handle, + self._init_op, name) + ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable) - def _make_savable(self, name): - stamp_token, state, are_buckets_ready, buckets = ( - gen_quantile_ops.quantile_accumulator_serialize( - self._quantile_accumulator_handle)) - # slice_spec is useful for saving a slice from a variable. - # It's not meaningful in quantile accumulator. 
- slice_spec = "" - def make_save_spec(tensor, suffix): - return saver.BaseSaverBuilder.SaveSpec(tensor, slice_spec, name + suffix) + def create_resource(self): + return gen_quantile_ops.quantile_stream_resource_handle_op( + container=self._container, shared_name=self._name, name=self._name) - specs = [make_save_spec(stamp_token, "_stamp")] - specs += [make_save_spec(state, "_state")] - specs += [make_save_spec(are_buckets_ready, "_are_buckets_ready")] - specs += [make_save_spec(buckets, "buckets")] - super(QuantileAccumulator, - self).__init__(self._quantile_accumulator_handle, specs, name) - ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self) + def initialize(self): + return gen_quantile_ops.create_quantile_accumulator( + self.resource_handle, + self._init_stamp_token, + epsilon=self._epsilon, + max_elements=self._max_elements, + num_quantiles=self._num_quantiles, + generate_quantiles=self._generate_quantiles) - def restore(self, restored_tensors, unused_restored_shapes): - """Restores the associated quantile accumulator from 'restored_tensors'. + @property + def initializer(self): + if self._init_op is None: + self._init_op = self.initialize() + return self._init_op - Args: - restored_tensors: the tensors that were loaded from a checkpoint. - unused_restored_shapes: the shapes this object should conform to after - restore. + def is_initialized(self): + return gen_quantile_ops.quantile_accumulator_is_initialized( + self.resource_handle) - Returns: - The operation that restores the state of the quantile accumulator. - """ - # Read the restored tensors with the same order that were added to saving - # spec. 
- stamp_token = restored_tensors[:1] - state = restored_tensors[1:2] - are_buckets_ready = restored_tensors[2:3] - buckets = restored_tensors[3] - with ops.control_dependencies([self._create_op]): - return gen_quantile_ops.quantile_accumulator_deserialize( - self._quantile_accumulator_handle, - stamp_token=stamp_token, - stream_state=state, - are_buckets_ready=are_buckets_ready, - buckets=buckets) + def _gather_saveables_for_checkpoint(self): + return {"quantile_accumulator", self.saveable} def get_buckets(self, stamp_token): """Returns quantile buckets created during previous flush.""" are_buckets_ready, buckets = ( gen_quantile_ops.quantile_accumulator_get_buckets( - quantile_accumulator_handles=[self._quantile_accumulator_handle], + quantile_accumulator_handles=[self.resource_handle], stamp_token=stamp_token)) return are_buckets_ready[0], buckets[0] def schedule_get_buckets(self): """Returns a scheduled read of buckets created during previous flush.""" return batch_ops_utils.ScheduledStampedResourceOp( - resource_handle=self._quantile_accumulator_handle, + resource_handle=self.resource_handle, op=gen_quantile_ops.quantile_accumulator_get_buckets) def _make_summary(self, column, example_weights): @@ -161,14 +191,14 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject): """Adds quantile summary to its stream in resource.""" summary = self._make_summary(column, example_weights) return gen_quantile_ops.quantile_accumulator_add_summaries( - quantile_accumulator_handles=[self._quantile_accumulator_handle], + quantile_accumulator_handles=[self.resource_handle], stamp_token=stamp_token, summaries=[summary]) def add_prebuilt_summary(self, stamp_token, summary): """Adds quantile summary to its stream in resource.""" return gen_quantile_ops.quantile_accumulator_add_summaries( - quantile_accumulator_handles=[self._quantile_accumulator_handle], + quantile_accumulator_handles=[self.resource_handle], stamp_token=stamp_token, summaries=[summary]) @@ -177,7 +207,7 
@@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject): summary = self._make_summary(column, example_weights) return batch_ops_utils.ScheduledStampedResourceOp( op=gen_quantile_ops.quantile_accumulator_add_summaries, - resource_handle=self._quantile_accumulator_handle, + resource_handle=self.resource_handle, summaries=summary) def flush(self, stamp_token, next_stamp_token): @@ -190,17 +220,14 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject): The flush operation. """ return gen_quantile_ops.quantile_accumulator_flush( - quantile_accumulator_handle=self._quantile_accumulator_handle, + quantile_accumulator_handle=self.resource_handle, stamp_token=stamp_token, next_stamp_token=next_stamp_token) def flush_summary(self, stamp_token, next_stamp_token): """Finalizes quantile summary stream and resets it for next iteration.""" result = gen_quantile_ops.quantile_accumulator_flush_summary( - quantile_accumulator_handle=self._quantile_accumulator_handle, + quantile_accumulator_handle=self.resource_handle, stamp_token=stamp_token, next_stamp_token=next_stamp_token) return result - - def resource(self): - return self._quantile_accumulator_handle diff --git a/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py b/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py index 2e94e353f32..ad1191d4123 100644 --- a/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py +++ b/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py @@ -26,12 +26,83 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import resources from tensorflow.python.training import saver +from tensorflow.python.training.checkpointable import tracking # Pattern to remove all non alpha numeric from a string. 
_PATTERN = re.compile(r"[\W_]+") -class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject): +class StatsAccumulatorSaveable(saver.BaseSaverBuilder.SaveableObject): + """SaveableObject implementation for StatsAccumulator.""" + + def __init__(self, resource_handle, create_op, is_scalar, name): + self._create_op = create_op + self._resource_handle = resource_handle + self._is_scalar = is_scalar + slice_spec = "" + saver_name = self._resource_handle.name + (stamp_token, num_updates, partition_ids, feature_ids, gradients, + hessians) = self.serialize() + specs = [ + saver.BaseSaverBuilder.SaveSpec(stamp_token, slice_spec, + saver_name + "_stamp"), + saver.BaseSaverBuilder.SaveSpec(num_updates, slice_spec, + saver_name + "_num_updates"), + saver.BaseSaverBuilder.SaveSpec(partition_ids, slice_spec, + saver_name + "_partition_ids"), + saver.BaseSaverBuilder.SaveSpec(feature_ids, slice_spec, + saver_name + "_feature_ids"), + saver.BaseSaverBuilder.SaveSpec(gradients, slice_spec, + saver_name + "_gradients"), + saver.BaseSaverBuilder.SaveSpec(hessians, slice_spec, + saver_name + "hessians"), + ] + super(StatsAccumulatorSaveable, self).__init__(self._resource_handle, specs, + name) + + def serialize(self): + """Serializes the stats accumulator state.""" + if self._is_scalar: + return gen_stats_accumulator_ops.stats_accumulator_scalar_serialize( + self._resource_handle) + else: + return gen_stats_accumulator_ops.stats_accumulator_tensor_serialize( + self._resource_handle) + + def deserialize(self, stamp_token, num_updates, partition_ids, feature_ids, + gradients, hessians): + """Resets the stats accumulator with the serialized state.""" + if self._is_scalar: + return gen_stats_accumulator_ops.stats_accumulator_scalar_deserialize( + self._resource_handle, stamp_token, num_updates, partition_ids, + feature_ids, gradients, hessians) + else: + return gen_stats_accumulator_ops.stats_accumulator_tensor_deserialize( + self._resource_handle, stamp_token, num_updates, 
partition_ids, + feature_ids, gradients, hessians) + + def restore(self, restored_tensors, unused_restored_shapes): + """Restores the associated tree ensemble from 'restored_tensors'. + + Args: + restored_tensors: the tensors that were loaded from a checkpoint. + unused_restored_shapes: the shapes this object should conform to after + restore. Not meaningful for trees. + + Returns: + The operation that restores the state of the tree ensemble variable. + """ + with ops.control_dependencies([self._create_op]): + return self.deserialize( + stamp_token=restored_tensors[0], + num_updates=restored_tensors[1], + partition_ids=restored_tensors[2], + feature_ids=restored_tensors[3], + gradients=restored_tensors[4], + hessians=restored_tensors[5]) + + +class StatsAccumulator(tracking.TrackableResource): """A resource that allows to accumulate gradients and hessians. For consistency guarantees, we use read and write stamp tokens. @@ -58,58 +129,69 @@ class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject): Returns: A `Tensor` of type mutable `string`. The handle to the stats accumulator. """ + self._stamp_token = stamp_token + self._gradient_shape = gradient_shape + self._hessian_shape = hessian_shape + self._container = container + + if (gradient_shape == tensor_shape.scalar() and + hessian_shape == tensor_shape.scalar()): + self._is_scalar = True + else: + self._is_scalar = False + if name is not None: name = _PATTERN.sub("", name) with ops.name_scope(name, "StatsAccumulator") as name: - # Both values are scalars. - if (gradient_shape == tensor_shape.scalar() and - hessian_shape == tensor_shape.scalar()): - self._is_scalar = True - self._resource_handle = (gen_stats_accumulator_ops. 
- stats_accumulator_scalar_resource_handle_op( - container, name, name=name)) - - create_op = gen_stats_accumulator_ops.create_stats_accumulator_scalar( - self._resource_handle, stamp_token) - is_initialized_op = ( - gen_stats_accumulator_ops.stats_accumulator_scalar_is_initialized( - self._resource_handle)) - else: - self._is_scalar = False - self._resource_handle = (gen_stats_accumulator_ops. - stats_accumulator_tensor_resource_handle_op( - container, name, name=name)) - create_op = gen_stats_accumulator_ops.create_stats_accumulator_tensor( - self._resource_handle, stamp_token, gradient_shape.as_list(), - hessian_shape.as_list()) - is_initialized_op = ( - gen_stats_accumulator_ops.stats_accumulator_tensor_is_initialized( - self._resource_handle)) - - self._create_op = create_op - slice_spec = "" - saver_name = self._resource_handle.name - (stamp_token, num_updates, partition_ids, feature_ids, gradients, - hessians) = self.serialize() - specs = [ - saver.BaseSaverBuilder.SaveSpec(stamp_token, slice_spec, - saver_name + "_stamp"), - saver.BaseSaverBuilder.SaveSpec(num_updates, slice_spec, - saver_name + "_num_updates"), - saver.BaseSaverBuilder.SaveSpec(partition_ids, slice_spec, - saver_name + "_partition_ids"), - saver.BaseSaverBuilder.SaveSpec(feature_ids, slice_spec, - saver_name + "_feature_ids"), - saver.BaseSaverBuilder.SaveSpec(gradients, slice_spec, - saver_name + "_gradients"), - saver.BaseSaverBuilder.SaveSpec(hessians, slice_spec, - saver_name + "hessians"), - ] - - super(StatsAccumulator, self).__init__(self._resource_handle, specs, name) - resources.register_resource(self._resource_handle, create_op, + self._name = name + self._resource_handle = self.create_resource() + self._init_op = self.initialize() + is_initialized_op = self.is_initialized() + resources.register_resource(self.resource_handle, self.initializer, is_initialized_op) - ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self) + self._saveable = StatsAccumulatorSaveable( + 
self.resource_handle, self.initializer, self._is_scalar, name) + ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable) + + def create_resource(self): + if self._is_scalar: + return ( + gen_stats_accumulator_ops.stats_accumulator_scalar_resource_handle_op( + self._container, self._name, name=self._name)) + else: + return ( + gen_stats_accumulator_ops.stats_accumulator_tensor_resource_handle_op( + self._container, self._name, name=self._name)) + + def initialize(self): + if self._is_scalar: + return gen_stats_accumulator_ops.create_stats_accumulator_scalar( + self.resource_handle, self._stamp_token) + else: + return gen_stats_accumulator_ops.create_stats_accumulator_tensor( + self.resource_handle, self._stamp_token, + self._gradient_shape.as_list(), self._hessian_shape.as_list()) + + @property + def initializer(self): + if self._init_op is None: + self._init_op = self.initialize() + return self._init_op + + def is_initialized(self): + if self._is_scalar: + return gen_stats_accumulator_ops.stats_accumulator_scalar_is_initialized( + self.resource_handle) + else: + return gen_stats_accumulator_ops.stats_accumulator_tensor_is_initialized( + self.resource_handle) + + @property + def saveable(self): + return self._saveable + + def _gather_saveables_for_checkpoint(self): + return {"stats_accumulator", self.saveable} def add(self, stamp_token, partition_ids, feature_ids, gradients, hessians): """Updates the stats accumulator.""" @@ -117,11 +199,11 @@ class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject): partition_ids, feature_ids, gradients, hessians)) if self._is_scalar: return gen_stats_accumulator_ops.stats_accumulator_scalar_add( - [self._resource_handle], stamp_token, [partition_ids], [feature_ids], + [self.resource_handle], stamp_token, [partition_ids], [feature_ids], [gradients], [hessians]) else: return gen_stats_accumulator_ops.stats_accumulator_tensor_add( - [self._resource_handle], stamp_token, [partition_ids], [feature_ids], + 
[self.resource_handle], stamp_token, [partition_ids], [feature_ids], [gradients], [hessians]) def schedule_add(self, partition_ids, feature_ids, gradients, hessians): @@ -131,7 +213,7 @@ class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject): if self._is_scalar: return batch_ops_utils.ScheduledStampedResourceOp( op=gen_stats_accumulator_ops.stats_accumulator_scalar_add, - resource_handle=self._resource_handle, + resource_handle=self.resource_handle, partition_ids=partition_ids, feature_ids=feature_ids, gradients=gradients, @@ -139,7 +221,7 @@ class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject): else: return batch_ops_utils.ScheduledStampedResourceOp( op=gen_stats_accumulator_ops.stats_accumulator_tensor_add, - resource_handle=self._resource_handle, + resource_handle=self.resource_handle, partition_ids=partition_ids, feature_ids=feature_ids, gradients=gradients, @@ -153,55 +235,11 @@ class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject): return gen_stats_accumulator_ops.stats_accumulator_tensor_make_summary( partition_ids, feature_ids, gradients, hessians) - def deserialize(self, stamp_token, num_updates, partition_ids, feature_ids, - gradients, hessians): - """Resets the stats accumulator with the serialized state.""" - if self._is_scalar: - return gen_stats_accumulator_ops.stats_accumulator_scalar_deserialize( - self._resource_handle, stamp_token, num_updates, partition_ids, - feature_ids, gradients, hessians) - else: - return gen_stats_accumulator_ops.stats_accumulator_tensor_deserialize( - self._resource_handle, stamp_token, num_updates, partition_ids, - feature_ids, gradients, hessians) - def flush(self, stamp_token, next_stamp_token): """Flushes the stats accumulator.""" if self._is_scalar: return gen_stats_accumulator_ops.stats_accumulator_scalar_flush( - self._resource_handle, stamp_token, next_stamp_token) + self.resource_handle, stamp_token, next_stamp_token) else: return gen_stats_accumulator_ops.stats_accumulator_tensor_flush( - 
self._resource_handle, stamp_token, next_stamp_token) - - def serialize(self): - """Serializes the stats accumulator state.""" - if self._is_scalar: - return gen_stats_accumulator_ops.stats_accumulator_scalar_serialize( - self._resource_handle) - else: - return gen_stats_accumulator_ops.stats_accumulator_tensor_serialize( - self._resource_handle) - - def restore(self, restored_tensors, unused_restored_shapes): - """Restores the associated tree ensemble from 'restored_tensors'. - - Args: - restored_tensors: the tensors that were loaded from a checkpoint. - unused_restored_shapes: the shapes this object should conform to after - restore. Not meaningful for trees. - - Returns: - The operation that restores the state of the tree ensemble variable. - """ - with ops.control_dependencies([self._create_op]): - return self.deserialize( - stamp_token=restored_tensors[0], - num_updates=restored_tensors[1], - partition_ids=restored_tensors[2], - feature_ids=restored_tensors[3], - gradients=restored_tensors[4], - hessians=restored_tensors[5]) - - def resource(self): - return self._resource_handle + self.resource_handle, stamp_token, next_stamp_token) diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py index 1cf61a10ba2..ab5713fbe26 100644 --- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py +++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py @@ -992,7 +992,7 @@ class GradientBoostedDecisionTreeModel(object): # Get accumulated steps and examples for the current layer. 
_, _, _, _, acc_examples, acc_steps = ( - steps_accumulator.serialize()) + steps_accumulator.saveable.serialize()) acc_examples = math_ops.cast(acc_examples[0], dtypes.int64) acc_steps = math_ops.cast(acc_steps[0], dtypes.int64) ensemble_update_ops.append( diff --git a/tensorflow/python/ops/boosted_trees_ops.py b/tensorflow/python/ops/boosted_trees_ops.py index 720f9f4d41e..87d3918a5f9 100644 --- a/tensorflow/python/ops/boosted_trees_ops.py +++ b/tensorflow/python/ops/boosted_trees_ops.py @@ -40,6 +40,7 @@ from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_update_ens # pylint: enable=unused-import from tensorflow.python.training import saver +from tensorflow.python.training.checkpointable import tracking class PruningMode(object): @@ -102,35 +103,52 @@ class _TreeEnsembleSavable(saver.BaseSaverBuilder.SaveableObject): tree_ensemble_serialized=restored_tensors[1]) -class TreeEnsemble(object): +class TreeEnsemble(tracking.TrackableResource): """Creates TreeEnsemble resource.""" def __init__(self, name, stamp_token=0, is_local=False, serialized_proto=''): + self._stamp_token = stamp_token + self._serialized_proto = serialized_proto + self._is_local = is_local with ops.name_scope(name, 'TreeEnsemble') as name: - self._resource_handle = ( - gen_boosted_trees_ops.boosted_trees_ensemble_resource_handle_op( - container='', shared_name=name, name=name)) - create_op = gen_boosted_trees_ops.boosted_trees_create_ensemble( - self.resource_handle, - stamp_token, - tree_ensemble_serialized=serialized_proto) - is_initialized_op = ( - gen_boosted_trees_ops.is_boosted_trees_ensemble_initialized( - self._resource_handle)) + self._name = name + self._resource_handle = self.create_resource() + self._init_op = self.initialize() + is_initialized_op = self.is_initialized() # Adds the variable to the savable list. 
if not is_local: - saveable = _TreeEnsembleSavable(self.resource_handle, create_op, - self.resource_handle.name) - ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable) + self._saveable = _TreeEnsembleSavable( + self.resource_handle, self.initializer, self.resource_handle.name) + ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable) resources.register_resource( self.resource_handle, - create_op, + self.initializer, is_initialized_op, is_shared=not is_local) + def create_resource(self): + return gen_boosted_trees_ops.boosted_trees_ensemble_resource_handle_op( + container='', shared_name=self._name, name=self._name) + + def initialize(self): + return gen_boosted_trees_ops.boosted_trees_create_ensemble( + self.resource_handle, + self._stamp_token, + tree_ensemble_serialized=self._serialized_proto) + @property - def resource_handle(self): - return self._resource_handle + def initializer(self): + if self._init_op is None: + self._init_op = self.initialize() + return self._init_op + + def is_initialized(self): + return gen_boosted_trees_ops.is_boosted_trees_ensemble_initialized( + self.resource_handle) + + def _gather_saveables_for_checkpoint(self): + if not self._is_local: + return {'tree_ensemble': self._saveable} def get_stamp_token(self): """Returns the current stamp token of the resource.""" From d4278cbaff3a01676f85b68ad6735e9435496fff Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 5 Nov 2018 16:48:31 -0800 Subject: [PATCH 137/540] Mkldnn pack and gemm for Eigen contractions PiperOrigin-RevId: 220197431 --- tensorflow/core/kernels/BUILD | 42 +++++- tensorflow/core/kernels/eigen_mkldnn.h | 123 +++++++++++++++ tensorflow/core/kernels/eigen_mkldnn_test.cc | 148 +++++++++++++++++++ 3 files changed, 311 insertions(+), 2 deletions(-) create mode 100644 tensorflow/core/kernels/eigen_mkldnn.h create mode 100644 tensorflow/core/kernels/eigen_mkldnn_test.cc diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 
f61ee53a428..fed0178176e 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -93,6 +93,17 @@ config_setting( }, ) +config_setting( + # Add "--define tensorflow_eigen_mkldnn=1" to your build command to use mkldnn + # sgemm in Eigen tensor contractions (matrix multiplications and convolutions). + # The mkldnn kernels are generated at runtime and use avx/avx2/fma/avx512 + # based on cpu status registers (https://en.wikipedia.org/wiki/CPUID). + name = "eigen_mkldnn", + values = { + "define": "tensorflow_eigen_mkldnn=1", + }, +) + # Public support libraries ---------------------------------------------------- cc_library( @@ -555,10 +566,20 @@ cc_library( "eigen_softmax.h", "eigen_spatial_convolutions.h", "eigen_volume_patch.h", - ], + ] + select({ + ":eigen_mkldnn": ["eigen_mkldnn.h"], + "//conditions:default": [], + }), + defines = select({ + ":eigen_mkldnn": ["EIGEN_USE_MKLDNN"], + "//conditions:default": [], + }), deps = [ "//third_party/eigen3", - ], + ] + select({ + ":eigen_mkldnn": ["//third_party/intel_mkl_dnn:mkldnn_single_threaded"], + "//conditions:default": [], + }), ) cc_library( @@ -2410,6 +2431,23 @@ tf_cc_tests( ], ) +# Conditional test target generation is not supported by the "tf_cc_tests" macro +# (can't add 'select' to the srcs field, type 'select' is not iterable). +tf_cc_test( + name = "eigen_mkldnn_test", + size = "small", + srcs = select({ + ":eigen_mkldnn": ["eigen_mkldnn_test.cc"], + "//conditions:default": [], + }), + tags = ["eigen_mkldnn"], + deps = [ + ":eigen_helpers", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + cc_library( name = "eigen_benchmark", testonly = 1, diff --git a/tensorflow/core/kernels/eigen_mkldnn.h b/tensorflow/core/kernels/eigen_mkldnn.h new file mode 100644 index 00000000000..5235431f5f3 --- /dev/null +++ b/tensorflow/core/kernels/eigen_mkldnn.h @@ -0,0 +1,123 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_MKLDNN_H_ +#define TENSORFLOW_CORE_KERNELS_EIGEN_MKLDNN_H_ + +// Support for Mkldnn sgemm kernel in Eigen/Tensor contractions: +// +// 1. Prepare packed Lhs/Rhs blocks from tensor expressions using +// DataMapper (see TensorContractionInputMapper). +// 2. Invoke gemm kernel with packed blocks (replacement for default +// gebp_kernel). + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "third_party/intel_mkl_dnn/include/mkldnn.h" + +namespace Eigen { +namespace internal { + +template +struct mkldnn_gemm_pack; + +// mkl_gemm_pack for ColMajor storage order. +template +struct mkldnn_gemm_pack { + typedef typename internal::packet_traits::type Packet; + typedef typename DataMapper::LinearMapper LinearMapper; + + enum { PacketSize = internal::packet_traits::size }; + + EIGEN_DONT_INLINE + void operator()(Scalar *block, const DataMapper &data_mapper, IndexType rows, + IndexType cols) { + const IndexType unrolled_rows = + (rows / (4 * PacketSize)) * (4 * PacketSize); + const IndexType vectorized_rows = (rows / PacketSize) * PacketSize; + + for (IndexType col = 0; col < cols; ++col) { + LinearMapper lm = data_mapper.getLinearMapper(0, col); + + // Give compiler a strong possibility to unroll the loop. 
+ for (IndexType i = 0; i < unrolled_rows; i += 4 * PacketSize) { + for (IndexType j = 0; j < 4; ++j) { + const Packet p = lm.loadPacket(i + j * PacketSize); + internal::pstoreu(block + j * PacketSize, p); + } + block += 4 * PacketSize; + } + + // Process remaining rows with packets. + for (IndexType i = unrolled_rows; i < vectorized_rows; i += PacketSize) { + const Packet p = lm.loadPacket(i); + internal::pstoreu(block, p); + block += PacketSize; + } + + // Finalize with coefficients. + for (IndexType i = vectorized_rows; i < rows; ++i) { + *block = lm(i); + ++block; + } + } + } +}; + +template +struct mkldnn_gemm_kernel; + +// mkldnn_gemm_kernel for floats defined as a thin layer on top of mkldnn_sgemm. +template +struct mkldnn_gemm_kernel { + EIGEN_DONT_INLINE + void operator()(const OutputMapper &output, const float *blockA, + const float *blockB, const IndexType rows, + const IndexType depth, const IndexType cols, float alpha) { + static const int max_index = (std::numeric_limits::max)(); + + eigen_assert(max_index >= rows); + eigen_assert(max_index >= cols); + eigen_assert(max_index >= depth); + eigen_assert(max_index >= output.stride()); + + const int m = static_cast(rows); + const int n = static_cast(cols); + const int k = static_cast(depth); + + const char transposeA = ConjugateLhs ? 'Y' : 'N'; + const char transposeB = ConjugateRhs ? 'Y' : 'N'; + + const int ldA = ConjugateLhs ? k : m; + const int ldB = ConjugateRhs ? 
n : k; + const int ldC = static_cast(output.stride()); + + const float beta = 1.0; + + mkldnn_status_t st = mkldnn_sgemm(&transposeA, &transposeB, &m, &n, &k, + &alpha, blockA, &ldA, blockB, &ldB, &beta, + const_cast(output.data()), &ldC); + eigen_assert(st == 0); + } +}; + +} // namespace internal +} // namespace Eigen + +#endif // TENSORFLOW_CORE_KERNELS_EIGEN_MKLDNN_H_ diff --git a/tensorflow/core/kernels/eigen_mkldnn_test.cc b/tensorflow/core/kernels/eigen_mkldnn_test.cc new file mode 100644 index 00000000000..051ab28f792 --- /dev/null +++ b/tensorflow/core/kernels/eigen_mkldnn_test.cc @@ -0,0 +1,148 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/kernels/eigen_mkldnn.h" +#include "tensorflow/core/platform/test.h" + +namespace Eigen { +namespace internal { + +namespace { +template +Eigen::array RandomDims(int min_dim = 1, int max_dim = 20) { + Eigen::array dims; + for (int i = 0; i < NumDims; ++i) { + dims[i] = internal::random(min_dim, max_dim); + } + return dims; +} +} // namespace + +using Scalar = float; +using Index = Eigen::Index; + +TEST(EigenMkldnnTest, MkldnnPack) { + // Packing with mkldnn_gemm_pack is the same as taking a slice of 2 + // dimensional Tensor. + + // Mkldnn pack and gemm are used only in Tensor contractions, and it's + // guaranteed that Tensors will have ColMajor layout. 
+ static const int Options = ColMajor; + + using DataMapper = blas_data_mapper; + using MkldnnGemmPack = mkldnn_gemm_pack; + using Tensor2d = Tensor; + + Eigen::array dims = RandomDims(1, 500); + + // Create a tensor initialized with random data. + Tensor2d src(dims); + src.setRandom(); + + // Pick a random slice of src tensor. + Eigen::array slice_start = RandomDims(0, 250); + Eigen::array slice_size = RandomDims(100, 500); + + // Make sure that slice start + size do not overflow tensor dims. + for (int i = 0; i < 2; ++i) { + slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]); + slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]); + } + + // Prepare tensors for packing and slicing results. + Tensor2d pack_dst(slice_size[0], slice_size[1]); + Tensor2d slice_dst(slice_size[0], slice_size[1]); + + // Pack memory using mkldnn_gemm_pack. + DataMapper data_mapper(src.data(), dims[0]); + MkldnnGemmPack gemm_pack; + gemm_pack(pack_dst.data(), + data_mapper.getSubMapper(slice_start[0], slice_start[1]), + slice_size[0], slice_size[1]); + + // Slice the source tensor. + slice_dst = src.slice(slice_start, slice_size); + + // Verify that dst tensors are equal. + EXPECT_EQ(pack_dst.dimensions().TotalSize(), + slice_dst.dimensions().TotalSize()); + for (size_t i = 0; i < pack_dst.dimensions().TotalSize(); ++i) { + Scalar packed = pack_dst.coeff(i); + Scalar sliced = slice_dst.coeff(i); + EXPECT_EQ(packed, sliced); + } +} + +TEST(EigenMkldnnTest, MkldnnGemm) { + // Mkldnn pack and gemm are used only in Tensor contractions, and it's + // guaranteed that Tensors will have ColMajor layout. + static const int Options = ColMajor; + + using Tensor2d = Tensor; + + int m = internal::random(1, 100); + int n = internal::random(1, 100); + int k = internal::random(1, 100); + + Tensor2d lhs(m, k); + lhs.setRandom(); + + Tensor2d rhs(k, n); + rhs.setRandom(); + + // Compute matmul with mkldnn gemm kernel. 
+ using OutputMapper = blas_data_mapper; + using MkldnnGemmKernel = + mkldnn_gemm_kernel; + + Tensor2d mkldnn_result(m, n); + mkldnn_result.setZero(); + OutputMapper output_mapper(mkldnn_result.data(), m); + + MkldnnGemmKernel gemm_kernel; + gemm_kernel(output_mapper, lhs.data(), rhs.data(), m, k, n, /*alpha=*/1.0); + + // Compute matmul with Eigen::Matrix. + using Matrix = Eigen::Matrix; + using MatrixMap = Map>; + + MatrixMap lhs_mat(lhs.data(), m, k); + MatrixMap rhs_mat(rhs.data(), k, n); + + Matrix matmul_result(m, n); + matmul_result.setZero(); + matmul_result = lhs_mat * rhs_mat; + + // Verify that results are equal. + for (Index i = 0; i < m * n; ++i) { + Scalar gemm = mkldnn_result(i); + Scalar matmul = matmul_result(i % m, i / m); + + Scalar delta = std::abs(gemm - matmul); + + // NOTE(rmlarsen): Compute proper forward error bound. + Scalar sum = Scalar(0.0); + for (int k1 = 0; k1 < k; ++k1) { + sum += std::abs(lhs_mat(i % m, k1) * rhs_mat(k1, i / m)); + } + Scalar epsilon = std::numeric_limits::epsilon(); + Scalar upper_bound = Scalar(1.01) * epsilon * k * sum; + + EXPECT_LE(delta, upper_bound); + } +} + +} // namespace internal +} // namespace Eigen From 1018eda32b234d97239ec2ccb48af7de138fff5d Mon Sep 17 00:00:00 2001 From: Ian Langmore Date: Mon, 5 Nov 2018 16:55:55 -0800 Subject: [PATCH 138/540] matvec added to math_ops.py. 
This is pure sugar: matvec(a, b) = matmul(a, b[..., tf.newaxis])[..., 0] PiperOrigin-RevId: 220198397 --- .../python/kernel_tests/matmul_op_test.py | 13 +++ tensorflow/python/ops/math_ops.py | 101 ++++++++++++++++++ .../api/golden/v1/tensorflow.linalg.pbtxt | 4 + .../api/golden/v2/tensorflow.linalg.pbtxt | 4 + 4 files changed, 122 insertions(+) diff --git a/tensorflow/python/kernel_tests/matmul_op_test.py b/tensorflow/python/kernel_tests/matmul_op_test.py index 4760236ca0e..1c2822180ac 100644 --- a/tensorflow/python/kernel_tests/matmul_op_test.py +++ b/tensorflow/python/kernel_tests/matmul_op_test.py @@ -35,6 +35,19 @@ from tensorflow.python.platform import test as test_lib # os.environ["TF_MATMUL_AUTOTUNE_ENABLE"] = "1" to enable it. +class MatVecTest(test_lib.TestCase): + """Simple test for matvec, which is sugar on top of matmul.""" + + def testTwoByTwoCase(self): + a = np.array([[1, 2], [3, 4]]) + b = np.array([5, 6]) + with self.cached_session(): + c = math_ops.matvec(a, b) + self.assertAllEqual((2,), c.shape) + c_ = c.eval() + self.assertAllEqual([5 + 2 * 6, 3 * 5 + 4 * 6], c_) + + def _AddTest(test, op_name, testcase_name, fn): test_name = "_".join(["test", op_name, testcase_name]) if hasattr(test, test_name): diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index c9374006ba3..d247e7b2463 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -2057,6 +2057,107 @@ def matmul(a, a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name) +@tf_export("linalg.matvec") +def matvec(a, + b, + transpose_a=False, + adjoint_a=False, + a_is_sparse=False, + b_is_sparse=False, + name=None): + """Multiplies matrix `a` by vector `b`, producing `a` * `b`. + + The matrix `a` must, following any transpositions, be a tensor of rank >= 2, + and we must have `shape(b) = shape(a)[:-2] + [shape(a)[-1]]`. + + Both `a` and `b` must be of the same type. 
The supported types are: + `float16`, `float32`, `float64`, `int32`, `complex64`, `complex128`. + + Matrix `a` can be transposed or adjointed (conjugated and transposed) on + the fly by setting one of the corresponding flag to `True`. These are `False` + by default. + + If one or both of the inputs contain a lot of zeros, a more efficient + multiplication algorithm can be used by setting the corresponding + `a_is_sparse` or `b_is_sparse` flag to `True`. These are `False` by default. + This optimization is only available for plain matrices/vectors (rank-2/1 + tensors) with datatypes `bfloat16` or `float32`. + + For example: + + ```python + # 2-D tensor `a` + # [[1, 2, 3], + # [4, 5, 6]] + a = tf.constant([1, 2, 3, 4, 5, 6], shape=[2, 3]) + + # 1-D tensor `b` + # [7, 9, 11] + b = tf.constant([7, 9, 11], shape=[3]) + + # `a` * `b` + # [ 58, 64] + c = tf.matvec(a, b) + + + # 3-D tensor `a` + # [[[ 1, 2, 3], + # [ 4, 5, 6]], + # [[ 7, 8, 9], + # [10, 11, 12]]] + a = tf.constant(np.arange(1, 13, dtype=np.int32), + shape=[2, 2, 3]) + + # 2-D tensor `b` + # [[13, 14, 15], + # [16, 17, 18]] + b = tf.constant(np.arange(13, 19, dtype=np.int32), + shape=[2, 3]) + + # `a` * `b` + # [[ 86, 212], + # [410, 563]] + c = tf.matvec(a, b) + ``` + + Args: + a: `Tensor` of type `float16`, `float32`, `float64`, `int32`, `complex64`, + `complex128` and rank > 1. + b: `Tensor` with same type and rank = `rank(a) - 1`. + transpose_a: If `True`, `a` is transposed before multiplication. + adjoint_a: If `True`, `a` is conjugated and transposed before + multiplication. + a_is_sparse: If `True`, `a` is treated as a sparse matrix. + b_is_sparse: If `True`, `b` is treated as a sparse matrix. + name: Name for the operation (optional). + + Returns: + A `Tensor` of the same type as `a` and `b` where each inner-most vector is + the product of the corresponding matrices in `a` and vectors in `b`, e.g. 
if + all transpose or adjoint attributes are `False`: + + `output`[..., i] = sum_k (`a`[..., i, k] * `b`[..., k]), for all indices i. + + Note: This is matrix-vector product, not element-wise product. + + + Raises: + ValueError: If transpose_a and adjoint_a are both set to True. + """ + with ops.name_scope(name, "MatVec", [a, b]) as name: + # matvec is achieved by reshaping b into a matrix (appending a singleton), + # then squeezing out the trailing dim of the result. There are other ways + # to do this, e.g. using tf.expand_dims and tf.squeeze. What we have here + # has been found to be most memory efficient on TPU. + return matmul( + a, + b[..., array_ops.newaxis], + transpose_a=transpose_a, + adjoint_a=adjoint_a, + a_is_sparse=a_is_sparse, + b_is_sparse=b_is_sparse)[..., 0] + + _OverrideBinaryOperatorHelper(matmul, "matmul") sparse_matmul = gen_math_ops.sparse_mat_mul diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt index cbab7ce6314..1a4098d121b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt @@ -136,6 +136,10 @@ tf_module { name: "matmul" argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\'], " } + member_method { + name: "matvec" + argspec: "args=[\'a\', \'b\', \'transpose_a\', \'adjoint_a\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'None\'], " + } member_method { name: "norm" argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keepdims\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'None\', \'None\', \'None\'], " diff --git 
a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt index cbab7ce6314..1a4098d121b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt @@ -136,6 +136,10 @@ tf_module { name: "matmul" argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\'], " } + member_method { + name: "matvec" + argspec: "args=[\'a\', \'b\', \'transpose_a\', \'adjoint_a\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'None\'], " + } member_method { name: "norm" argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keepdims\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'None\', \'None\', \'None\'], " From cae143f88aa17afdc39c4a91e2905a4472e4df75 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 5 Nov 2018 17:05:47 -0800 Subject: [PATCH 139/540] Remove shuffle from input PiperOrigin-RevId: 220199936 --- tensorflow/contrib/distribute/python/examples/keras_mnist.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/contrib/distribute/python/examples/keras_mnist.py b/tensorflow/contrib/distribute/python/examples/keras_mnist.py index c7036daa3e3..0fd3acd0451 100644 --- a/tensorflow/contrib/distribute/python/examples/keras_mnist.py +++ b/tensorflow/contrib/distribute/python/examples/keras_mnist.py @@ -61,7 +61,6 @@ def get_input_datasets(use_bfloat16=False): # train dataset train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)) train_ds = train_ds.repeat() - train_ds = train_ds.shuffle(100) train_ds = train_ds.map(lambda x, y: (tf.cast(x, cast_dtype), y)) train_ds = train_ds.batch(64, drop_remainder=True) From 
37e9d9f8a331efc9a720ed08883edad10b9db55c Mon Sep 17 00:00:00 2001 From: Tim Shen Date: Mon, 5 Nov 2018 17:06:04 -0800 Subject: [PATCH 140/540] Add some documentation to DnnSupport. PiperOrigin-RevId: 220199987 --- tensorflow/stream_executor/dnn.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h index 558f3890da7..21954d58f0a 100644 --- a/tensorflow/stream_executor/dnn.h +++ b/tensorflow/stream_executor/dnn.h @@ -921,6 +921,23 @@ class VersionInfo { // Suite of operations typically used for implementing Deep/Convolutional Neural // Nets. Note: A false return value of an operation indicates the // implementation is not available. +// +// TODO(b/118763918): this class (or rather dispatch table) has several +// problems: +// * Some overloads are missing. Ideally we want to have template virtual +// functions while the template arguments is a closed set. However, we don't +// get that from the language. +// * The API is a union of cuDNN and another private backend. Only 10% of the +// functions are actually implemented by both backends, the rest are +// actually backend-specific. The massive interface creates extra mental +// burden. +// * Poor error handling: the API should return Status objects. +// +// Things worth trying: +// * Move functions that are not actually common back to the backends. Then, +// callers may use dynamic_cast to access specific backends. This may not be +// that hard, as many of the callers are Stream::ThenXxx functions. +// * Change all the returned bools to Status. 
class DnnSupport { public: DnnSupport() {} From dd1cfe2f2092517d8a57bad04b2cb269a19b37ee Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Fri, 24 Aug 2018 17:42:17 +0000 Subject: [PATCH 141/540] Convert InputBuffer to BufferedInputStream for FixedLengthRecordDatasetOp This fix converts InputBuffer to BufferedInputStream for FixedLengthRecordDatasetOp, so that it is possible to add a compression layer on top for FixedLengthRecordDatasetOp. Signed-off-by: Yong Tang --- .../core/kernels/data/reader_dataset_ops.cc | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/tensorflow/core/kernels/data/reader_dataset_ops.cc b/tensorflow/core/kernels/data/reader_dataset_ops.cc index ea97cf5ffdc..21258ebe620 100644 --- a/tensorflow/core/kernels/data/reader_dataset_ops.cc +++ b/tensorflow/core/kernels/data/reader_dataset_ops.cc @@ -383,13 +383,13 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel { mutex_lock l(mu_); do { // We are currently processing a file, so try to read the next record. - if (input_buffer_) { - const int64 current_pos = input_buffer_->Tell(); + if (buffered_input_stream_) { + const int64 current_pos = buffered_input_stream_->Tell(); DCHECK_GE(file_pos_limit_, 0); if (current_pos < file_pos_limit_) { string record; - TF_RETURN_IF_ERROR( - input_buffer_->ReadNBytes(dataset()->record_bytes_, &record)); + TF_RETURN_IF_ERROR(buffered_input_stream_->ReadNBytes( + dataset()->record_bytes_, &record)); // Produce the record as output. out_tensors->emplace_back(ctx->allocator({}), DT_STRING, TensorShape({})); @@ -400,7 +400,7 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel { // We have reached the end of the current file, so maybe // move on to next file.
- input_buffer_.reset(); + buffered_input_stream_.reset(); file_.reset(); ++current_file_index_; } @@ -432,10 +432,10 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel { } TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile( dataset()->filenames_[current_file_index_], &file_)); - input_buffer_.reset( - new io::InputBuffer(file_.get(), dataset()->buffer_size_)); + buffered_input_stream_.reset(new io::BufferedInputStream( + file_.get(), dataset()->buffer_size_)); TF_RETURN_IF_ERROR( - input_buffer_->SkipNBytes(dataset()->header_bytes_)); + buffered_input_stream_->SkipNBytes(dataset()->header_bytes_)); } while (true); } @@ -450,10 +450,11 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel { TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("current_file_index"), current_file_index_)); - // `input_buffer_` is empty if + // `buffered_input_stream_` is empty if // 1. GetNext has not been called even once. // 2. All files have been read and iterator has been exhausted. - int64 current_pos = input_buffer_ ? input_buffer_->Tell() : -1; + int64 current_pos = + buffered_input_stream_ ? buffered_input_stream_->Tell() : -1; TF_RETURN_IF_ERROR( writer->WriteScalar(full_name("current_pos"), current_pos)); return Status::OK(); @@ -471,18 +472,18 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel { reader->ReadScalar(full_name("current_pos"), ¤t_pos)); // Seek to current_pos. - input_buffer_.reset(); + buffered_input_stream_.reset(); file_.reset(); - if (current_pos >= 0) { // There was an active input_buffer_. + if (current_pos >= 0) { // There was an active buffered_input_stream_. 
uint64 file_size; TF_RETURN_IF_ERROR(ctx->env()->GetFileSize( dataset()->filenames_[current_file_index_], &file_size)); file_pos_limit_ = file_size - dataset()->footer_bytes_; TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile( dataset()->filenames_[current_file_index_], &file_)); - input_buffer_.reset( - new io::InputBuffer(file_.get(), dataset()->buffer_size_)); - TF_RETURN_IF_ERROR(input_buffer_->Seek(current_pos)); + buffered_input_stream_.reset(new io::BufferedInputStream( + file_.get(), dataset()->buffer_size_)); + TF_RETURN_IF_ERROR(buffered_input_stream_->SkipNBytes(current_pos)); } return Status::OK(); @@ -492,8 +493,9 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel { mutex mu_; size_t current_file_index_ GUARDED_BY(mu_) = 0; std::unique_ptr file_ - GUARDED_BY(mu_); // must outlive input_buffer_ - std::unique_ptr input_buffer_ GUARDED_BY(mu_); + GUARDED_BY(mu_); // must outlive buffered_input_stream_ + std::unique_ptr buffered_input_stream_ + GUARDED_BY(mu_); int64 file_pos_limit_ GUARDED_BY(mu_) = -1; }; From c4aed9c197ec1a4ea8f912db1f7ff64339a809ad Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Fri, 24 Aug 2018 18:58:48 +0000 Subject: [PATCH 142/540] Add compression support for FixedLengthRecordDataset This fix adds FixedLengthRecordDatasetV2 which enables the ability to pass a compression type for FixedLengthRecordDataset. This fix fixes 21680. 
Signed-off-by: Yong Tang --- .../core/kernels/data/reader_dataset_ops.cc | 33 ++++++++++++++----- tensorflow/core/ops/dataset_ops.cc | 23 +++++++++++++ tensorflow/python/data/ops/readers.py | 14 ++++++-- 3 files changed, 59 insertions(+), 11 deletions(-) diff --git a/tensorflow/core/kernels/data/reader_dataset_ops.cc b/tensorflow/core/kernels/data/reader_dataset_ops.cc index 21258ebe620..455d814464c 100644 --- a/tensorflow/core/kernels/data/reader_dataset_ops.cc +++ b/tensorflow/core/kernels/data/reader_dataset_ops.cc @@ -271,6 +271,9 @@ REGISTER_KERNEL_BUILDER(Name("TextLineDataset").Device(DEVICE_CPU), class FixedLengthRecordDatasetOp : public DatasetOpKernel { public: using DatasetOpKernel::DatasetOpKernel; + explicit FixedLengthRecordDatasetOp(OpKernelConstruction* ctx) + : DatasetOpKernel(ctx), + op_version_(ctx->def().op() == "FixedLengthRecordDataset" ? 1 : 2) {} void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override { const Tensor* filenames_tensor; @@ -311,9 +314,16 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel { if (buffer_size == 0) { buffer_size = 256 << 10; // 256 kB as default. 
} - + string compression_type; + if (op_version_ > 1) { + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "compression_type", + &compression_type)); + OP_REQUIRES(ctx, compression_type.empty() || compression_type == "ZLIB" || + compression_type == "GZIP", + errors::InvalidArgument("Unsupported compression_type.")); + } *output = new Dataset(ctx, std::move(filenames), header_bytes, record_bytes, - footer_bytes, buffer_size); + footer_bytes, buffer_size, compression_type); } private: @@ -321,13 +331,14 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel { public: explicit Dataset(OpKernelContext* ctx, std::vector filenames, int64 header_bytes, int64 record_bytes, int64 footer_bytes, - int64 buffer_size) + int64 buffer_size, const string& compression_type) : DatasetBase(DatasetContext(ctx)), filenames_(std::move(filenames)), header_bytes_(header_bytes), record_bytes_(record_bytes), footer_bytes_(footer_bytes), - buffer_size_(buffer_size) {} + buffer_size_(buffer_size), + compression_type_(compression_type) {} std::unique_ptr MakeIteratorInternal( const string& prefix) const override { @@ -359,15 +370,17 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel { Node* record_bytes = nullptr; Node* footer_bytes = nullptr; Node* buffer_size = nullptr; + Node* compression_type = nullptr; TF_RETURN_IF_ERROR(b->AddVector(filenames_, &filenames)); TF_RETURN_IF_ERROR(b->AddScalar(header_bytes_, &header_bytes)); TF_RETURN_IF_ERROR(b->AddScalar(record_bytes_, &record_bytes)); TF_RETURN_IF_ERROR(b->AddScalar(footer_bytes_, &footer_bytes)); TF_RETURN_IF_ERROR(b->AddScalar(buffer_size_, &buffer_size)); - TF_RETURN_IF_ERROR(b->AddDataset( - this, - {filenames, header_bytes, record_bytes, footer_bytes, buffer_size}, - output)); + TF_RETURN_IF_ERROR(b->AddScalar(compression_type_, &compression_type)); + TF_RETURN_IF_ERROR( + b->AddDataset(this, {filenames, header_bytes, record_bytes, + footer_bytes, buffer_size, compression_type}, + output)); return Status::OK(); } @@ 
-504,11 +517,15 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel { const int64 record_bytes_; const int64 footer_bytes_; const int64 buffer_size_; + const string compression_type_; }; + const int op_version_; }; REGISTER_KERNEL_BUILDER(Name("FixedLengthRecordDataset").Device(DEVICE_CPU), FixedLengthRecordDatasetOp); +REGISTER_KERNEL_BUILDER(Name("FixedLengthRecordDatasetV2").Device(DEVICE_CPU), + FixedLengthRecordDatasetOp); class TFRecordDatasetOp : public DatasetOpKernel { public: diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc index 98a76962611..9b261a28db6 100644 --- a/tensorflow/core/ops/dataset_ops.cc +++ b/tensorflow/core/ops/dataset_ops.cc @@ -674,6 +674,29 @@ REGISTER_OP("FixedLengthRecordDataset") return shape_inference::ScalarShape(c); }); +REGISTER_OP("FixedLengthRecordDatasetV2") + .Input("filenames: string") + .Input("header_bytes: int64") + .Input("record_bytes: int64") + .Input("footer_bytes: int64") + .Input("buffer_size: int64") + .Input("compression_type: string") + .Output("handle: variant") + .SetIsStateful() // TODO(b/65524810): Source dataset ops must be marked + // stateful to inhibit constant folding. + .SetShapeFn([](shape_inference::InferenceContext* c) { + shape_inference::ShapeHandle unused; + // `filenames` must be a scalar or a vector. + TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused)); + // header_bytes, record_bytes, footer_bytes, buffer_size should be + // scalars. 
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); + return shape_inference::ScalarShape(c); + }); + REGISTER_OP("TFRecordDataset") .Input("filenames: string") .Input("compression_type: string") diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py index d08da6704ca..76fe5445aad 100644 --- a/tensorflow/python/data/ops/readers.py +++ b/tensorflow/python/data/ops/readers.py @@ -255,7 +255,8 @@ class FixedLengthRecordDataset(dataset_ops.Dataset): record_bytes, header_bytes=None, footer_bytes=None, - buffer_size=None): + buffer_size=None, + compression_type=None): """Creates a `FixedLengthRecordDataset`. Args: @@ -268,6 +269,8 @@ class FixedLengthRecordDataset(dataset_ops.Dataset): bytes to ignore at the end of a file. buffer_size: (Optional.) A `tf.int64` scalar representing the number of bytes to buffer when reading. + compression_type: (Optional.) A `tf.string` scalar evaluating to one of + `""` (no compression), `"ZLIB"`, or `"GZIP"`. 
""" super(FixedLengthRecordDataset, self).__init__() self._filenames = ops.convert_to_tensor( @@ -281,11 +284,16 @@ class FixedLengthRecordDataset(dataset_ops.Dataset): "footer_bytes", footer_bytes) self._buffer_size = convert.optional_param_to_tensor( "buffer_size", buffer_size, _DEFAULT_READER_BUFFER_SIZE_BYTES) + self._compression_type = convert.optional_param_to_tensor( + "compression_type", + compression_type, + argument_default="", + argument_dtype=dtypes.string) def _as_variant_tensor(self): - return gen_dataset_ops.fixed_length_record_dataset( + return gen_dataset_ops.fixed_length_record_dataset_v2( self._filenames, self._header_bytes, self._record_bytes, - self._footer_bytes, self._buffer_size) + self._footer_bytes, self._buffer_size, self._compression_type) def _inputs(self): return [] From 2f6527c17e92ad975323314dd43c65eb071aa8ed Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Fri, 24 Aug 2018 19:35:45 +0000 Subject: [PATCH 143/540] Use lookahead cache to process compressed stream Signed-off-by: Yong Tang --- .../core/kernels/data/reader_dataset_ops.cc | 145 +++++++++++++----- 1 file changed, 110 insertions(+), 35 deletions(-) diff --git a/tensorflow/core/kernels/data/reader_dataset_ops.cc b/tensorflow/core/kernels/data/reader_dataset_ops.cc index 455d814464c..7b687739e95 100644 --- a/tensorflow/core/kernels/data/reader_dataset_ops.cc +++ b/tensorflow/core/kernels/data/reader_dataset_ops.cc @@ -398,17 +398,49 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel { // We are currently processing a file, so try to read the next record. 
if (buffered_input_stream_) { const int64 current_pos = buffered_input_stream_->Tell(); - DCHECK_GE(file_pos_limit_, 0); - if (current_pos < file_pos_limit_) { + if (dataset()->compression_type_ == "") { + DCHECK_GE(file_pos_limit_, 0); + if (current_pos < file_pos_limit_) { + string record; + TF_RETURN_IF_ERROR(buffered_input_stream_->ReadNBytes( + dataset()->record_bytes_, &record)); + // Produce the record as output. + Tensor record_tensor(ctx->allocator({}), DT_STRING, {}); + record_tensor.scalar()() = record; + out_tensors->emplace_back(std::move(record_tensor)); + *end_of_sequence = false; + return Status::OK(); + } + } else { string record; - TF_RETURN_IF_ERROR(buffered_input_stream_->ReadNBytes( - dataset()->record_bytes_, &record)); - // Produce the record as output. - out_tensors->emplace_back(ctx->allocator({}), DT_STRING, - TensorShape({})); - out_tensors->back().scalar()() = record; - *end_of_sequence = false; - return Status::OK(); + Status s = buffered_input_stream_->ReadNBytes( + dataset()->record_bytes_, &record); + if (s.ok()) { + lookahead_cache_.append(record); + record = lookahead_cache_.substr(0, dataset()->record_bytes_); + lookahead_cache_ = + lookahead_cache_.substr(dataset()->record_bytes_); + // Produce the record as output. 
+ out_tensors->emplace_back(ctx->allocator({}), DT_STRING, + TensorShape({})); + out_tensors->back().scalar()() = record; + *end_of_sequence = false; + return Status::OK(); + } + if (errors::IsOutOfRange(s) && !record.empty()) { + uint64 body_size = + current_pos + record.size() - + (dataset()->header_bytes_ + dataset()->footer_bytes_); + return errors::DataLoss( + "Excluding the header (", dataset()->header_bytes_, + " bytes) and footer (", dataset()->footer_bytes_, + " bytes), input file \"", + dataset()->filenames_[current_file_index_], + "\" has body length ", body_size, + " bytes, which is not an exact multiple of the record " + "length (", + dataset()->record_bytes_, " bytes)."); + } } // We have reached the end of the current file, so maybe @@ -425,30 +457,49 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel { } // Actually move on to next file. - uint64 file_size; - TF_RETURN_IF_ERROR(ctx->env()->GetFileSize( - dataset()->filenames_[current_file_index_], &file_size)); - file_pos_limit_ = file_size - dataset()->footer_bytes_; + if (dataset()->compression_type_ == "") { + uint64 file_size; + TF_RETURN_IF_ERROR(ctx->env()->GetFileSize( + dataset()->filenames_[current_file_index_], &file_size)); + file_pos_limit_ = file_size - dataset()->footer_bytes_; - uint64 body_size = - file_size - (dataset()->header_bytes_ + dataset()->footer_bytes_); + uint64 body_size = file_size - (dataset()->header_bytes_ + + dataset()->footer_bytes_); - if (body_size % dataset()->record_bytes_ != 0) { - return errors::InvalidArgument( - "Excluding the header (", dataset()->header_bytes_, - " bytes) and footer (", dataset()->footer_bytes_, - " bytes), input file \"", - dataset()->filenames_[current_file_index_], - "\" has body length ", body_size, - " bytes, which is not an exact multiple of the record length (", - dataset()->record_bytes_, " bytes)."); + if (body_size % dataset()->record_bytes_ != 0) { + return errors::InvalidArgument( + "Excluding the header (", 
dataset()->header_bytes_, + " bytes) and footer (", dataset()->footer_bytes_, + " bytes), input file \"", + dataset()->filenames_[current_file_index_], + "\" has body length ", body_size, + " bytes, which is not an exact multiple of the record length " + "(", + dataset()->record_bytes_, " bytes)."); + } } TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile( dataset()->filenames_[current_file_index_], &file_)); - buffered_input_stream_.reset(new io::BufferedInputStream( - file_.get(), dataset()->buffer_size_)); + if (dataset()->compression_type_ != "") { + const io::ZlibCompressionOptions zlib_options = + dataset()->compression_type_ == "ZLIB" + ? io::ZlibCompressionOptions::DEFAULT() + : io::ZlibCompressionOptions::GZIP(); + file_stream_.reset(new io::RandomAccessInputStream(file_.get())); + buffered_input_stream_.reset(new io::ZlibInputStream( + file_stream_.get(), dataset()->buffer_size_, + dataset()->buffer_size_, zlib_options)); + } else { + buffered_input_stream_.reset(new io::BufferedInputStream( + file_.get(), dataset()->buffer_size_)); + } TF_RETURN_IF_ERROR( buffered_input_stream_->SkipNBytes(dataset()->header_bytes_)); + lookahead_cache_.clear(); + if (dataset()->compression_type_ != "") { + TF_RETURN_IF_ERROR(buffered_input_stream_->ReadNBytes( + dataset()->footer_bytes_, &lookahead_cache_)); + } } while (true); } @@ -488,15 +539,36 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel { buffered_input_stream_.reset(); file_.reset(); if (current_pos >= 0) { // There was an active buffered_input_stream_. 
- uint64 file_size; - TF_RETURN_IF_ERROR(ctx->env()->GetFileSize( - dataset()->filenames_[current_file_index_], &file_size)); - file_pos_limit_ = file_size - dataset()->footer_bytes_; + if (dataset()->compression_type_ == "") { + uint64 file_size; + TF_RETURN_IF_ERROR(ctx->env()->GetFileSize( + dataset()->filenames_[current_file_index_], &file_size)); + file_pos_limit_ = file_size - dataset()->footer_bytes_; + } TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile( dataset()->filenames_[current_file_index_], &file_)); - buffered_input_stream_.reset(new io::BufferedInputStream( - file_.get(), dataset()->buffer_size_)); - TF_RETURN_IF_ERROR(buffered_input_stream_->SkipNBytes(current_pos)); + if (dataset()->compression_type_ != "") { + const io::ZlibCompressionOptions zlib_options = + dataset()->compression_type_ == "ZLIB" + ? io::ZlibCompressionOptions::DEFAULT() + : io::ZlibCompressionOptions::GZIP(); + file_stream_.reset(new io::RandomAccessInputStream(file_.get())); + buffered_input_stream_.reset(new io::ZlibInputStream( + file_stream_.get(), dataset()->buffer_size_, + dataset()->buffer_size_, zlib_options)); + } else { + buffered_input_stream_.reset(new io::BufferedInputStream( + file_.get(), dataset()->buffer_size_)); + } + lookahead_cache_.clear(); + if (dataset()->compression_type_ == "") { + TF_RETURN_IF_ERROR(buffered_input_stream_->SkipNBytes(current_pos)); + } else { + TF_RETURN_IF_ERROR(buffered_input_stream_->SkipNBytes( + current_pos - dataset()->footer_bytes_)); + TF_RETURN_IF_ERROR(buffered_input_stream_->ReadNBytes( + dataset()->footer_bytes_, &lookahead_cache_)); + } } return Status::OK(); @@ -507,9 +579,12 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel { size_t current_file_index_ GUARDED_BY(mu_) = 0; std::unique_ptr file_ GUARDED_BY(mu_); // must outlive buffered_input_stream_ - std::unique_ptr buffered_input_stream_ + std::unique_ptr + file_stream_; // must outlive buffered_inputstream_ + std::unique_ptr buffered_input_stream_ 
GUARDED_BY(mu_); int64 file_pos_limit_ GUARDED_BY(mu_) = -1; + string lookahead_cache_ GUARDED_BY(mu_); }; const std::vector filenames_; From 746e0ff70e0fcd5fe81253f08274c70b24f41994 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Fri, 24 Aug 2018 19:47:16 +0000 Subject: [PATCH 144/540] Add test cases for compression support with FixedRecordDataset Signed-off-by: Yong Tang --- .../kernel_tests/reader_dataset_ops_test.py | 44 +++++++++++++++---- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py b/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py index aef2dd1d9c6..6b95a2f40e1 100644 --- a/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py +++ b/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py @@ -213,26 +213,43 @@ class FixedLengthRecordReaderTest(test_base.DatasetTestBase): def _record(self, f, r): return compat.as_bytes(str(f * 2 + r) * self._record_bytes) - def _createFiles(self): + def _createFiles(self, compression_type=None): filenames = [] for i in range(self._num_files): fn = os.path.join(self.get_temp_dir(), "fixed_length_record.%d.txt" % i) filenames.append(fn) - with open(fn, "wb") as f: - f.write(b"H" * self._header_bytes) - for j in range(self._num_records): - f.write(self._record(i, j)) - f.write(b"F" * self._footer_bytes) + + contents = [] + contents.append(b"H" * self._header_bytes) + for j in range(self._num_records): + contents.append(self._record(i, j)) + contents.append(b"F" * self._footer_bytes) + contents = b"".join(contents) + + if not compression_type: + with open(fn, "wb") as f: + f.write(contents) + elif compression_type == "GZIP": + with gzip.GzipFile(fn, "wb") as f: + f.write(contents) + elif compression_type == "ZLIB": + contents = zlib.compress(contents) + with open(fn, "wb") as f: + f.write(contents) + else: + raise ValueError("Unsupported compression_type", compression_type) + return filenames - def 
testFixedLengthRecordDataset(self): - test_filenames = self._createFiles() + def _testFixedLengthRecordDataset(self, compression_type=None): + test_filenames = self._createFiles(compression_type=compression_type) filenames = array_ops.placeholder(dtypes.string, shape=[None]) num_epochs = array_ops.placeholder(dtypes.int64, shape=[]) batch_size = array_ops.placeholder(dtypes.int64, shape=[]) repeat_dataset = (readers.FixedLengthRecordDataset( - filenames, self._record_bytes, self._header_bytes, self._footer_bytes) + filenames, self._record_bytes, self._header_bytes, self._footer_bytes, + compression_type=compression_type) .repeat(num_epochs)) batch_dataset = repeat_dataset.batch(batch_size) @@ -293,6 +310,15 @@ class FixedLengthRecordReaderTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): sess.run(get_next) + def testFixedLengthRecordDatasetNoCompression(self): + self._testFixedLengthRecordDataset() + + def testFixedLengthRecordDatasetGzipCompression(self): + self._testFixedLengthRecordDataset(compression_type="GZIP") + + def testFixedLengthRecordDatasetZlibCompression(self): + self._testFixedLengthRecordDataset(compression_type="ZLIB") + def testFixedLengthRecordDatasetBuffering(self): test_filenames = self._createFiles() dataset = readers.FixedLengthRecordDataset( From 4f2ec1a96605b9f90d356db0aed7ab1ec39e093a Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 11 Sep 2018 19:46:52 +0000 Subject: [PATCH 145/540] Address review feedbacks Signed-off-by: Yong Tang --- tensorflow/core/kernels/data/reader_dataset_ops.cc | 4 ++-- tensorflow/python/data/ops/readers.py | 13 ++++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/kernels/data/reader_dataset_ops.cc b/tensorflow/core/kernels/data/reader_dataset_ops.cc index 7b687739e95..343f7f302da 100644 --- a/tensorflow/core/kernels/data/reader_dataset_ops.cc +++ b/tensorflow/core/kernels/data/reader_dataset_ops.cc @@ -406,7 +406,7 @@ class 
FixedLengthRecordDatasetOp : public DatasetOpKernel { dataset()->record_bytes_, &record)); // Produce the record as output. Tensor record_tensor(ctx->allocator({}), DT_STRING, {}); - record_tensor.scalar()() = record; + record_tensor.scalar()() = std::move(record); out_tensors->emplace_back(std::move(record_tensor)); *end_of_sequence = false; return Status::OK(); @@ -580,7 +580,7 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel { std::unique_ptr file_ GUARDED_BY(mu_); // must outlive buffered_input_stream_ std::unique_ptr - file_stream_; // must outlive buffered_inputstream_ + file_stream_; // must outlive buffered_input_stream_ std::unique_ptr buffered_input_stream_ GUARDED_BY(mu_); int64 file_pos_limit_ GUARDED_BY(mu_) = -1; diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py index 76fe5445aad..c0b75a26e7e 100644 --- a/tensorflow/python/data/ops/readers.py +++ b/tensorflow/python/data/ops/readers.py @@ -17,6 +17,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.compat import compat from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.util import convert from tensorflow.python.framework import dtypes @@ -291,9 +292,15 @@ class FixedLengthRecordDataset(dataset_ops.Dataset): argument_dtype=dtypes.string) def _as_variant_tensor(self): - return gen_dataset_ops.fixed_length_record_dataset_v2( - self._filenames, self._header_bytes, self._record_bytes, - self._footer_bytes, self._buffer_size, self._compression_type) + if self._compression_type is not None or compat.forward_compatible(2018, 9, 30): + return gen_dataset_ops.fixed_length_record_dataset_v2( + self._filenames, self._header_bytes, self._record_bytes, + self._footer_bytes, self._buffer_size, self._compression_type) + else: + return gen_dataset_ops.fixed_length_record_dataset( + self._filenames, self._header_bytes, self._record_bytes, + 
self._footer_bytes, self._buffer_size) + def _inputs(self): return [] From dbbebd80fc778e8a279909ecef7022c601a69232 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 11 Sep 2018 21:18:31 +0000 Subject: [PATCH 146/540] Split CompressedIterator and UncompressedIterator for FixedLengthRecordDataset Signed-off-by: Yong Tang --- .../core/kernels/data/reader_dataset_ops.cc | 176 ++++++++++++++---- 1 file changed, 143 insertions(+), 33 deletions(-) diff --git a/tensorflow/core/kernels/data/reader_dataset_ops.cc b/tensorflow/core/kernels/data/reader_dataset_ops.cc index 343f7f302da..bc5c65d6428 100644 --- a/tensorflow/core/kernels/data/reader_dataset_ops.cc +++ b/tensorflow/core/kernels/data/reader_dataset_ops.cc @@ -342,8 +342,13 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel { std::unique_ptr MakeIteratorInternal( const string& prefix) const override { + if (compression_type_ == "") { return std::unique_ptr( - new Iterator({this, strings::StrCat(prefix, "::FixedLengthRecord")})); + new UncompressedIterator({this, strings::StrCat(prefix, "::FixedLengthRecord")})); + } else { + return std::unique_ptr( + new CompressedIterator({this, strings::StrCat(prefix, "::FixedLengthRecord")})); + } } const DataTypeVector& output_dtypes() const override { @@ -385,9 +390,129 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel { } private: - class Iterator : public DatasetIterator { + class UncompressedIterator : public DatasetIterator { public: - explicit Iterator(const Params& params) + explicit UncompressedIterator(const Params& params) + : DatasetIterator(params) {} + + Status GetNextInternal(IteratorContext* ctx, + std::vector* out_tensors, + bool* end_of_sequence) override { + mutex_lock l(mu_); + do { + // We are currently processing a file, so try to read the next record. 
+ if (input_buffer_) { + const int64 current_pos = input_buffer_->Tell(); + DCHECK_GE(file_pos_limit_, 0); + if (current_pos < file_pos_limit_) { + string record; + TF_RETURN_IF_ERROR( + input_buffer_->ReadNBytes(dataset()->record_bytes_, &record)); + // Produce the record as output. + Tensor record_tensor(ctx->allocator({}), DT_STRING, {}); + record_tensor.scalar()() = record; + out_tensors->emplace_back(std::move(record_tensor)); + *end_of_sequence = false; + return Status::OK(); + } + + // We have reached the end of the current file, so maybe + // move on to next file. + input_buffer_.reset(); + file_.reset(); + ++current_file_index_; + } + + // Iteration ends when there are no more files to process. + if (current_file_index_ == dataset()->filenames_.size()) { + *end_of_sequence = true; + return Status::OK(); + } + + // Actually move on to next file. + uint64 file_size; + TF_RETURN_IF_ERROR(ctx->env()->GetFileSize( + dataset()->filenames_[current_file_index_], &file_size)); + file_pos_limit_ = file_size - dataset()->footer_bytes_; + + uint64 body_size = + file_size - (dataset()->header_bytes_ + dataset()->footer_bytes_); + + if (body_size % dataset()->record_bytes_ != 0) { + return errors::InvalidArgument( + "Excluding the header (", dataset()->header_bytes_, + " bytes) and footer (", dataset()->footer_bytes_, + " bytes), input file \"", + dataset()->filenames_[current_file_index_], + "\" has body length ", body_size, + " bytes, which is not an exact multiple of the record length (", + dataset()->record_bytes_, " bytes)."); + } + TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile( + dataset()->filenames_[current_file_index_], &file_)); + input_buffer_.reset( + new io::InputBuffer(file_.get(), dataset()->buffer_size_)); + TF_RETURN_IF_ERROR( + input_buffer_->SkipNBytes(dataset()->header_bytes_)); + } while (true); + } + + protected: + Status SaveInternal(IteratorStateWriter* writer) override { + mutex_lock l(mu_); + 
TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("current_file_index"), + current_file_index_)); + + // `input_buffer_` is empty if + // 1. GetNext has not been called even once. + // 2. All files have been read and iterator has been exhausted. + int64 current_pos = input_buffer_ ? input_buffer_->Tell() : -1; + TF_RETURN_IF_ERROR( + writer->WriteScalar(full_name("current_pos"), current_pos)); + return Status::OK(); + } + + Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) override { + mutex_lock l(mu_); + int64 current_file_index; + TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("current_file_index"), + ¤t_file_index)); + current_file_index_ = size_t(current_file_index); + int64 current_pos; + TF_RETURN_IF_ERROR( + reader->ReadScalar(full_name("current_pos"), ¤t_pos)); + + // Seek to current_pos. + input_buffer_.reset(); + file_.reset(); + if (current_pos >= 0) { // There was an active input_buffer_. + uint64 file_size; + TF_RETURN_IF_ERROR(ctx->env()->GetFileSize( + dataset()->filenames_[current_file_index_], &file_size)); + file_pos_limit_ = file_size - dataset()->footer_bytes_; + TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile( + dataset()->filenames_[current_file_index_], &file_)); + input_buffer_.reset( + new io::InputBuffer(file_.get(), dataset()->buffer_size_)); + TF_RETURN_IF_ERROR(input_buffer_->Seek(current_pos)); + } + + return Status::OK(); + } + + private: + mutex mu_; + size_t current_file_index_ GUARDED_BY(mu_) = 0; + std::unique_ptr file_ + GUARDED_BY(mu_); // must outlive input_buffer_ + std::unique_ptr input_buffer_ GUARDED_BY(mu_); + int64 file_pos_limit_ GUARDED_BY(mu_) = -1; + }; + + class CompressedIterator : public DatasetIterator { + public: + explicit CompressedIterator(const Params& params) : DatasetIterator(params) {} Status GetNextInternal(IteratorContext* ctx, @@ -421,9 +546,9 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel { lookahead_cache_ = lookahead_cache_.substr(dataset()->record_bytes_); 
// Produce the record as output. - out_tensors->emplace_back(ctx->allocator({}), DT_STRING, - TensorShape({})); - out_tensors->back().scalar()() = record; + Tensor record_tensor(ctx->allocator({}), DT_STRING, {}); + record_tensor.scalar()() = std::move(record); + out_tensors->emplace_back(std::move(record_tensor)); *end_of_sequence = false; return Status::OK(); } @@ -539,36 +664,21 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel { buffered_input_stream_.reset(); file_.reset(); if (current_pos >= 0) { // There was an active buffered_input_stream_. - if (dataset()->compression_type_ == "") { - uint64 file_size; - TF_RETURN_IF_ERROR(ctx->env()->GetFileSize( - dataset()->filenames_[current_file_index_], &file_size)); - file_pos_limit_ = file_size - dataset()->footer_bytes_; - } TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile( dataset()->filenames_[current_file_index_], &file_)); - if (dataset()->compression_type_ != "") { - const io::ZlibCompressionOptions zlib_options = - dataset()->compression_type_ == "ZLIB" - ? io::ZlibCompressionOptions::DEFAULT() - : io::ZlibCompressionOptions::GZIP(); - file_stream_.reset(new io::RandomAccessInputStream(file_.get())); - buffered_input_stream_.reset(new io::ZlibInputStream( - file_stream_.get(), dataset()->buffer_size_, - dataset()->buffer_size_, zlib_options)); - } else { - buffered_input_stream_.reset(new io::BufferedInputStream( - file_.get(), dataset()->buffer_size_)); - } + const io::ZlibCompressionOptions zlib_options = + dataset()->compression_type_ == "ZLIB" + ? 
io::ZlibCompressionOptions::DEFAULT() + : io::ZlibCompressionOptions::GZIP(); + file_stream_.reset(new io::RandomAccessInputStream(file_.get())); + buffered_input_stream_.reset(new io::ZlibInputStream( + file_stream_.get(), dataset()->buffer_size_, + dataset()->buffer_size_, zlib_options)); lookahead_cache_.clear(); - if (dataset()->compression_type_ == "") { - TF_RETURN_IF_ERROR(buffered_input_stream_->SkipNBytes(current_pos)); - } else { - TF_RETURN_IF_ERROR(buffered_input_stream_->SkipNBytes( - current_pos - dataset()->footer_bytes_)); - TF_RETURN_IF_ERROR(buffered_input_stream_->ReadNBytes( - dataset()->footer_bytes_, &lookahead_cache_)); - } + TF_RETURN_IF_ERROR(buffered_input_stream_->SkipNBytes( + current_pos - dataset()->footer_bytes_)); + TF_RETURN_IF_ERROR(buffered_input_stream_->ReadNBytes( + dataset()->footer_bytes_, &lookahead_cache_)); } return Status::OK(); From 517a63b27802fe02d986aea416e1e5e1a0a24983 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 11 Sep 2018 22:40:15 +0000 Subject: [PATCH 147/540] Fix `Experimental clang-format Check` build Signed-off-by: Yong Tang --- tensorflow/core/kernels/data/reader_dataset_ops.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/kernels/data/reader_dataset_ops.cc b/tensorflow/core/kernels/data/reader_dataset_ops.cc index bc5c65d6428..c110e79ac99 100644 --- a/tensorflow/core/kernels/data/reader_dataset_ops.cc +++ b/tensorflow/core/kernels/data/reader_dataset_ops.cc @@ -343,11 +343,11 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel { std::unique_ptr MakeIteratorInternal( const string& prefix) const override { if (compression_type_ == "") { - return std::unique_ptr( - new UncompressedIterator({this, strings::StrCat(prefix, "::FixedLengthRecord")})); + return std::unique_ptr(new UncompressedIterator( + {this, strings::StrCat(prefix, "::FixedLengthRecord")})); } else { - return std::unique_ptr( - new CompressedIterator({this, strings::StrCat(prefix, 
"::FixedLengthRecord")})); + return std::unique_ptr(new CompressedIterator( + {this, strings::StrCat(prefix, "::FixedLengthRecord")})); } } From 897666ccefbc852e76037b486c1963f278af48f4 Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Mon, 5 Nov 2018 17:17:33 -0800 Subject: [PATCH 148/540] Stop blocking async device/op_device reads. PiperOrigin-RevId: 220201429 --- tensorflow/c/eager/c_api_internal.h | 4 ---- .../core/common_runtime/eager/copy_to_device_node.h | 2 +- tensorflow/core/common_runtime/eager/execute.cc | 11 ++++++++++- tensorflow/core/common_runtime/eager/tensor_handle.cc | 4 ---- tensorflow/core/common_runtime/eager/tensor_handle.h | 7 ++++--- 5 files changed, 15 insertions(+), 13 deletions(-) diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h index 104d52430cf..fa1b22e3af4 100644 --- a/tensorflow/c/eager/c_api_internal.h +++ b/tensorflow/c/eager/c_api_internal.h @@ -79,10 +79,6 @@ struct TFE_TensorHandle { tensorflow::Device* op_device) : handle(new tensorflow::TensorHandle(t, d, op_device, nullptr)) {} - TFE_TensorHandle(tensorflow::uint64 node_id, tensorflow::DataType dtype, - tensorflow::EagerContext* ctx) - : handle(new tensorflow::TensorHandle(node_id, dtype, ctx)) {} - TFE_TensorHandle(tensorflow::TensorHandle* handle) : handle(handle) {} tensorflow::TensorHandle* handle; diff --git a/tensorflow/core/common_runtime/eager/copy_to_device_node.h b/tensorflow/core/common_runtime/eager/copy_to_device_node.h index 8a887540b06..953b3580c2e 100644 --- a/tensorflow/core/common_runtime/eager/copy_to_device_node.h +++ b/tensorflow/core/common_runtime/eager/copy_to_device_node.h @@ -30,7 +30,7 @@ class CopyToDeviceNode : public EagerNode { src_(src), dstd_(dstd), ctx_(ctx), - dst_(new TensorHandle(id, src_->dtype, ctx)) { + dst_(new TensorHandle(id, dstd_, dstd_, src->dtype, ctx)) { src_->Ref(); dst_->Ref(); } diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc 
index 53f0ba1c818..0fcf5d93877 100644 --- a/tensorflow/core/common_runtime/eager/execute.cc +++ b/tensorflow/core/common_runtime/eager/execute.cc @@ -333,8 +333,17 @@ Status EagerLocalExecute(EagerOperation* op, // input handles are ready before executing them. // TODO(agarwal): Consider executing "cheap" kernels inline for performance. tensorflow::uint64 id = ctx->NextId(); + const MemoryTypeVector* output_memory_types = nullptr; + output_memory_types = &kernel->kernel()->output_memory_types(); + + Device* op_device = kernel->device(); for (int i = 0; i < *num_retvals; ++i) { - (*retvals)[i] = new TensorHandle(id, output_dtypes[i], ctx); + Device* d = op_device; + if (d != nullptr && output_memory_types != nullptr && + (*output_memory_types)[i] == HOST_MEMORY) { + d = nullptr; + } + (*retvals)[i] = new TensorHandle(id, d, op_device, output_dtypes[i], ctx); } EagerNode* node = new ExecuteNode( id, ctx, op->Device(), op->Inputs(), kernel, maybe_stats.release(), diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc index d58724cbfac..655add00e9b 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc @@ -80,15 +80,11 @@ Status TensorHandle::Tensor(const tensorflow::Tensor** t) { } Status TensorHandle::Device(tensorflow::Device** d) { - TF_RETURN_IF_ERROR(WaitReady()); - DCHECK(IsReady()); *d = device_; return Status::OK(); } Status TensorHandle::OpDevice(tensorflow::Device** d) { - TF_RETURN_IF_ERROR(WaitReady()); - DCHECK(IsReady()); *d = op_device_; return Status::OK(); } diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h index e55f1a03385..4f2c1a31a47 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.h +++ b/tensorflow/core/common_runtime/eager/tensor_handle.h @@ -61,12 +61,13 @@ class TensorHandle : public core::RefCounted { ctx_(ctx), 
is_ready_(true) {} - TensorHandle(uint64 node_id, DataType dtype, EagerContext* ctx) + TensorHandle(uint64 node_id, Device* d, Device* op_device, DataType dtype, + EagerContext* ctx) : dtype(dtype), node_id_(node_id), tensor_(dtype), - device_(nullptr), - op_device_(nullptr), + device_(d), + op_device_(op_device), remote_op_id_(-1), remote_output_num_(-1), remote_shape_node_id_(-1), From 09c2df32421b70604cf88fc748f4e1983a338d57 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 5 Nov 2018 17:19:24 -0800 Subject: [PATCH 149/540] Do not constant fold nodes with TPU specific attributes. PiperOrigin-RevId: 220201649 --- .../grappler/optimizers/constant_folding.cc | 38 +++++++++++++++---- .../grappler/optimizers/constant_folding.h | 7 ++-- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc index 24b02040692..032f41c9d25 100644 --- a/tensorflow/core/grappler/optimizers/constant_folding.cc +++ b/tensorflow/core/grappler/optimizers/constant_folding.cc @@ -157,6 +157,16 @@ bool GetConcatAxis(const GraphProperties& properties, NodeDef* node, return true; } +bool HasTPUAttributes(const NodeDef& node) { + AttrSlice attrs(node); + for (auto attr : attrs) { + if (attr.first.find("_tpu_") != attr.first.npos) { + return true; + } + } + return false; +} + } // namespace ConstantFolding::ConstantFolding(RewriterConfig::Toggle opt_level, @@ -764,6 +774,13 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const { return false; } + // Don't fold nodes that contain TPU attributes. + // TODO(rmlarsen): We should be able to fold many of these nodes as long as we + // properly forward custom attributes, b/119051778. 
+ if (HasTPUAttributes(node)) { + return false; + } + const OpDef* op_def = nullptr; Status status = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def); if (!status.ok()) { @@ -1128,9 +1145,12 @@ Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph, std::vector const_nodes; TF_RETURN_IF_ERROR( EvaluateOneFoldable(*node, &const_nodes, result_too_large)); + VLOG(1) << "Folded node:\n" << node->DebugString(); + NodeDef* constant_output = nullptr; for (int i = 0; i < const_nodes.size(); i++) { NodeDef* const_node = &const_nodes[i]; + VLOG(1) << "Generated constant node:\n" << const_node->DebugString(); if (const_node->name().empty()) { // Dead output: we can't create a constant to encode its value, so we'll // just skip it. We'll preserve the edges that originate from that @@ -1595,15 +1615,19 @@ Status ConstantFolding::ReplaceOperationWithConstant( Status ConstantFolding::SimplifyGraph( bool use_shape_info, GraphDef* optimized_graph, GraphProperties* properties, - const absl::flat_hash_set& nodes_to_not_simplify) { + absl::flat_hash_set* nodes_to_not_simplify) { for (int i = 0; i < optimized_graph->node_size(); ++i) { + NodeDef* node = optimized_graph->mutable_node(i); // TODO(lyandy): Move nodes to not simplify check into SimplifyNode and // generalize to only restrict certain simplifications. 
- if (nodes_to_not_simplify.find(optimized_graph->node(i).name()) == - nodes_to_not_simplify.end()) { - TF_RETURN_IF_ERROR(SimplifyNode(use_shape_info, - optimized_graph->mutable_node(i), - optimized_graph, properties)); + if (nodes_to_not_simplify->find(node->name()) == + nodes_to_not_simplify->end()) { + if (HasTPUAttributes(optimized_graph->node(i))) { + nodes_to_not_simplify->insert(node->name()); + continue; + } + TF_RETURN_IF_ERROR( + SimplifyNode(use_shape_info, node, optimized_graph, properties)); } } return Status::OK(); @@ -3042,7 +3066,7 @@ Status ConstantFolding::RunOptimizationPass(Cluster* cluster, TF_RETURN_IF_ERROR(FoldGraph(optimized_graph, &nodes_to_not_simplify)); node_map_.reset(new NodeMap(optimized_graph)); TF_RETURN_IF_ERROR(SimplifyGraph(can_use_shape_info, optimized_graph, - &properties, nodes_to_not_simplify)); + &properties, &nodes_to_not_simplify)); return Status::OK(); } diff --git a/tensorflow/core/grappler/optimizers/constant_folding.h b/tensorflow/core/grappler/optimizers/constant_folding.h index c81d3067d50..d1898cdb049 100644 --- a/tensorflow/core/grappler/optimizers/constant_folding.h +++ b/tensorflow/core/grappler/optimizers/constant_folding.h @@ -100,10 +100,9 @@ class ConstantFolding : public GraphOptimizer { const GraphProperties& properties) const; bool IsSimplifiableReshape(const NodeDef& node, const GraphProperties& properties) const; - Status SimplifyGraph( - bool use_shape_info, GraphDef* optimized_graph, - GraphProperties* properties, - const absl::flat_hash_set& nodes_to_not_simplify); + Status SimplifyGraph(bool use_shape_info, GraphDef* optimized_graph, + GraphProperties* properties, + absl::flat_hash_set* nodes_to_not_simplify); Status SimplifyNode(bool use_shape_info, NodeDef* node, GraphDef* optimized_graph, GraphProperties* properties); From 8ae7343f3d24569b4bb142ddc7b58037267a2d3c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 5 Nov 2018 17:21:13 -0800 Subject: [PATCH 150/540] Internal change. 
PiperOrigin-RevId: 220201899 --- tensorflow/python/keras/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD index dd4e11f0eeb..9bdef21234a 100755 --- a/tensorflow/python/keras/BUILD +++ b/tensorflow/python/keras/BUILD @@ -368,7 +368,7 @@ cuda_py_test( py_test( name = "pooling_test", - size = "small", + size = "medium", srcs = ["layers/pooling_test.py"], srcs_version = "PY2AND3", deps = [ From 2a96384c95bd2970890b49058dbde5b44a791bc1 Mon Sep 17 00:00:00 2001 From: Tim Shen Date: Mon, 5 Nov 2018 17:44:50 -0800 Subject: [PATCH 151/540] Remove ConvolutionDescriptor::set_pad_alignment(), as no one uses it. PiperOrigin-RevId: 220204981 --- tensorflow/stream_executor/dnn.cc | 3 +-- tensorflow/stream_executor/dnn.h | 9 +++------ 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc index a38a6d52765..3d8e691ab28 100644 --- a/tensorflow/stream_executor/dnn.cc +++ b/tensorflow/stream_executor/dnn.cc @@ -448,7 +448,6 @@ ConvolutionDescriptor::ConvolutionDescriptor(int ndims) : zero_padding_(ndims, 0), filter_strides_(ndims, 1), dilation_rates_(ndims, 1), - pad_alignment_(PadAlignment::kDefault), group_count_(1), ndims_(ndims) {} @@ -470,7 +469,7 @@ string ConvolutionDescriptor::ToString() const { return port::Printf( "{zero_padding: %s pad_alignment: %s filter_strides: %s dilation_rates: " "%s}", - padding.c_str(), PadAlignmentString(pad_alignment_).c_str(), + padding.c_str(), PadAlignmentString(pad_alignment()).c_str(), strides.c_str(), dilations.c_str()); } diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h index 21954d58f0a..c934301829d 100644 --- a/tensorflow/stream_executor/dnn.h +++ b/tensorflow/stream_executor/dnn.h @@ -548,10 +548,6 @@ class ConvolutionDescriptor { SetDim(&dilation_rates_, dim, value); return *this; } - ConvolutionDescriptor& set_pad_alignment(PadAlignment 
pad_alignment) { - pad_alignment_ = pad_alignment; - return *this; - } ConvolutionDescriptor& set_group_count(int group_count) { group_count_ = group_count; return *this; @@ -578,7 +574,9 @@ class ConvolutionDescriptor { int zero_padding(DimIndex dim) const { return GetDim(zero_padding_, dim); } int filter_stride(DimIndex dim) const { return GetDim(filter_strides_, dim); } int dilation_rate(DimIndex dim) const { return GetDim(dilation_rates_, dim); } - PadAlignment pad_alignment() const { return pad_alignment_; } + // TODO(timshen): remove this function. No users of this class is setting a + // non-default pad alignment. + PadAlignment pad_alignment() const { return PadAlignment::kDefault; } int group_count() const { return group_count_; } int ndims() const { return ndims_; } @@ -591,7 +589,6 @@ class ConvolutionDescriptor { std::vector zero_padding_; std::vector filter_strides_; std::vector dilation_rates_; - PadAlignment pad_alignment_; int group_count_; int ndims_; // TODO(leary) cudnn provides these fields, but need to characterize what From aa8020d77048d101782acd71b869fa9f9c813baa Mon Sep 17 00:00:00 2001 From: Yuefeng Zhou Date: Mon, 5 Nov 2018 17:47:36 -0800 Subject: [PATCH 152/540] Mark SyncReplicaOptimizer as deprecated. 
PiperOrigin-RevId: 220205234 --- .../training/sync_replicas_optimizer.py | 9 ++- ...rflow.train.-sync-replicas-optimizer.pbtxt | 63 ------------------- .../api/golden/v2/tensorflow.train.pbtxt | 4 -- 3 files changed, 8 insertions(+), 68 deletions(-) delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.train.-sync-replicas-optimizer.pbtxt diff --git a/tensorflow/python/training/sync_replicas_optimizer.py b/tensorflow/python/training/sync_replicas_optimizer.py index 6a3756fba9f..fbde8fe3c2a 100644 --- a/tensorflow/python/training/sync_replicas_optimizer.py +++ b/tensorflow/python/training/sync_replicas_optimizer.py @@ -31,6 +31,7 @@ from tensorflow.python.training import optimizer from tensorflow.python.training import queue_runner from tensorflow.python.training import session_manager from tensorflow.python.training import session_run_hook +from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import tf_export @@ -39,7 +40,7 @@ from tensorflow.python.util.tf_export import tf_export # rate according to the number of replicas. This change is introduced to be # consistent with how gradients are aggregated (averaged) within a batch in a # replica. -@tf_export("train.SyncReplicasOptimizer") +@tf_export(v1=["train.SyncReplicasOptimizer"]) class SyncReplicasOptimizer(optimizer.Optimizer): """Class to synchronize, aggregate gradients and pass them to the optimizer. @@ -139,6 +140,12 @@ class SyncReplicasOptimizer(optimizer.Optimizer): ``` """ + @deprecation.deprecated( + None, + "The `SyncReplicaOptimizer` is deprecated. 
For synchrononous training, " + "please use [Distribution Strategies](https://github.com/tensorflow/" + "tensorflow/tree/master/tensorflow/contrib/distribute).", + warn_once=True) def __init__(self, opt, replicas_to_aggregate, diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-sync-replicas-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-sync-replicas-optimizer.pbtxt deleted file mode 100644 index 2c0fda3c72b..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.train.-sync-replicas-optimizer.pbtxt +++ /dev/null @@ -1,63 +0,0 @@ -path: "tensorflow.train.SyncReplicasOptimizer" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - member { - name: "GATE_GRAPH" - mtype: "" - } - member { - name: "GATE_NONE" - mtype: "" - } - member { - name: "GATE_OP" - mtype: "" - } - member_method { - name: "__init__" - argspec: "args=[\'self\', \'opt\', \'replicas_to_aggregate\', \'total_num_replicas\', \'variable_averages\', \'variables_to_average\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'sync_replicas\'], " - } - member_method { - name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " - } - member_method { - name: "compute_gradients" - argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" - } - member_method { - name: "get_chief_queue_runner" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_init_tokens_op" - argspec: "args=[\'self\', \'num_tokens\'], varargs=None, keywords=None, defaults=[\'-1\'], " - } - member_method { - name: "get_name" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_slot" - argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" - } - member_method { - name: 
"get_slot_names" - argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" - } - member_method { - name: "make_session_run_hook" - argspec: "args=[\'self\', \'is_chief\', \'num_tokens\'], varargs=None, keywords=None, defaults=[\'-1\'], " - } - member_method { - name: "minimize" - argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], " - } - member_method { - name: "variables" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt index c2dc4140e8e..582c0ee3d03 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt @@ -212,10 +212,6 @@ tf_module { name: "Supervisor" mtype: "" } - member { - name: "SyncReplicasOptimizer" - mtype: "" - } member { name: "VocabInfo" mtype: "" From db473ed111fdb87cd1796ac0c444fcb7254ec0f0 Mon Sep 17 00:00:00 2001 From: David Majnemer Date: Mon, 5 Nov 2018 17:54:46 -0800 Subject: [PATCH 153/540] [XLA] Reverse dependency between :hlo and :hlo_reachability :hlo has no real dependence on :hlo_reachability. 
PiperOrigin-RevId: 220205962 --- tensorflow/compiler/xla/service/BUILD | 4 +- .../xla/service/gpu/gpu_hlo_schedule.cc | 3 +- .../xla/service/gpu/stream_assignment.cc | 2 +- .../compiler/xla/service/hlo_computation.cc | 66 ---------- .../compiler/xla/service/hlo_computation.h | 30 ++--- .../xla/service/hlo_computation_test.cc | 123 ----------------- .../compiler/xla/service/hlo_ordering.cc | 2 +- .../compiler/xla/service/hlo_ordering.h | 1 + .../compiler/xla/service/hlo_reachability.cc | 68 ++++++++++ .../compiler/xla/service/hlo_reachability.h | 24 +++- .../xla/service/hlo_reachability_test.cc | 124 ++++++++++++++++++ .../xla/service/instruction_fusion.cc | 4 +- .../compiler/xla/service/instruction_fusion.h | 1 + .../xla/service/multi_output_fusion.cc | 3 +- .../xla/service/multi_output_fusion.h | 1 + 15 files changed, 233 insertions(+), 223 deletions(-) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index cd8c20d43ea..04b2f72ac95 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -323,7 +323,6 @@ cc_library( ":hlo_casting_utils", ":hlo_module_config", ":hlo_proto", - ":hlo_reachability", ":name_uniquer", "//tensorflow/compiler/xla:array", "//tensorflow/compiler/xla:literal", @@ -402,6 +401,7 @@ cc_library( srcs = ["hlo_reachability.cc"], hdrs = ["hlo_reachability.h"], deps = [ + ":hlo", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", @@ -1103,6 +1103,7 @@ cc_library( ":hlo", ":hlo_dataflow_analysis", ":hlo_proto", + ":hlo_reachability", ":hlo_value", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -1388,6 +1389,7 @@ cc_library( srcs = ["multi_output_fusion.cc"], hdrs = ["multi_output_fusion.h"], deps = [ + ":hlo_reachability", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla/service:hlo", diff --git 
a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc index 02a0d028c11..d8f2e9f6abd 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc @@ -124,7 +124,8 @@ GpuHloOrdering::GpuHloOrdering( for (auto* computation : module->computations()) { if (computation != module->entry_computation() && !computation->IsFusionComputation()) { - predecessors_.emplace(computation, computation->ComputeReachability()); + predecessors_.emplace(computation, + HloReachabilityMap::Build(computation)); } } } diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc index 5b6cf2c04d0..4775baf44ae 100644 --- a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc +++ b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc @@ -122,7 +122,7 @@ std::unique_ptr AssignStreams(const HloModule& module) { auto stream_assignment = absl::make_unique(); const HloComputation& computation = *module.entry_computation(); std::unique_ptr reachability = - computation.ComputeReachability(); + HloReachabilityMap::Build(&computation); std::vector seen_gemms; // The execution of different RNG Hlo instructions in the same module updates // a common global variable. 
To avoid a race condition, we simply assign all diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc index b0f7cd91ad1..01ae6a55fcf 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.cc +++ b/tensorflow/compiler/xla/service/hlo_computation.cc @@ -739,72 +739,6 @@ Status HloComputation::ReplaceInstruction(HloInstruction* old_instruction, return RemoveInstructionAndUnusedOperands(old_instruction); } -std::unique_ptr HloComputation::ComputeReachability() - const { - const auto& all = MakeInstructionPostOrder(); - auto result = absl::make_unique(all); - auto channel_dependency_map = ComputeChannelDependencies(); - - std::vector inputs; - for (const HloInstruction* hlo : all) { - inputs.assign(hlo->operands().begin(), hlo->operands().end()); - inputs.insert(inputs.end(), hlo->control_predecessors().begin(), - hlo->control_predecessors().end()); - - switch (hlo->opcode()) { - case HloOpcode::kRecvDone: { - auto it = channel_dependency_map.find(hlo->channel_id()); - if (it != channel_dependency_map.end()) { - absl::c_copy(it->second, std::back_inserter(inputs)); - } - break; - } - case HloOpcode::kCrossReplicaSum: { - auto all_reduce_id = hlo->all_reduce_id(); - if (all_reduce_id) { - auto it = channel_dependency_map.find(all_reduce_id.value()); - if (it != channel_dependency_map.end()) { - absl::c_copy(it->second, std::back_inserter(inputs)); - } - } - break; - } - default: - break; - } - - result->FastSetReachabilityToUnion(inputs, hlo); - } - return result; -} - -void HloComputation::UpdateReachabilityThroughInstruction( - const HloInstruction* instruction, HloReachabilityMap* reachability_map) { - std::queue worklist; - worklist.push(instruction); - - std::vector inputs; - - while (!worklist.empty()) { - const HloInstruction* item = worklist.front(); - worklist.pop(); - - inputs.assign(item->operands().begin(), item->operands().end()); - inputs.insert(inputs.end(), 
item->control_predecessors().begin(), - item->control_predecessors().end()); - - if (reachability_map->SetReachabilityToUnion(inputs, item)) { - // Add immediate successors to worklist. - for (const HloInstruction* user : item->users()) { - worklist.push(user); - } - for (const HloInstruction* succ : item->control_successors()) { - worklist.push(succ); - } - } - } -} - std::vector HloComputation::CollectUnreachableRoots() const { std::vector unreachable_roots; for (auto* instruction : instructions()) { diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h index dec96d11a93..2cce866e5c1 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.h +++ b/tensorflow/compiler/xla/service/hlo_computation.h @@ -35,7 +35,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_clone_context.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/hlo_reachability.h" #include "tensorflow/compiler/xla/service/name_uniquer.h" #include "tensorflow/compiler/xla/shape_tree.h" #include "tensorflow/compiler/xla/statusor.h" @@ -215,19 +214,6 @@ class HloComputation { // this order, definitions of values always appear before their uses. std::vector MakeInstructionPostOrder() const; - // Computes and returns the reachability between HLO instructions in the - // computation. The returned HloReachabilityMap is constructed such that - // HloReachabilityMap::IsReachable(a, b) returns true iff there exists a - // directed path (from producer to consumer) from 'a' to 'b'. Both data - // dependencies (operands) and control dependencies are considered for - // reachability. Trivially an instruction is reachable from itself. 
- std::unique_ptr ComputeReachability() const; - - // Updates the given reachability map after the immediate predecessor set - // (operands and control predecessors) of 'instruction' has changed. - void UpdateReachabilityThroughInstruction( - const HloInstruction* instruction, HloReachabilityMap* reachability_map); - int64 instruction_count() const { return instruction_iterators_.size(); } // Creates and returns a list of the embedded computations called by this @@ -355,6 +341,14 @@ class HloComputation { // channel complete). bool IsRemovable(const HloInstruction* instruction); + // Returns a map from channel-id to directed dependencies of the channel + // instructions. For send&recv pairs it means the send instruction and for + // cross-replica-sum the union of the dependencies for all participating + // instructions. + using ChannelDependencyMap = + absl::flat_hash_map>; + ChannelDependencyMap ComputeChannelDependencies() const; + // Returns true if this computation has a side effect. A computation has a // side effect if it contains one or more instructions with a side effect. bool HasSideEffect() const; @@ -410,14 +404,6 @@ class HloComputation { // Internal helper to collect unreachable roots. std::vector CollectUnreachableRoots() const; - // Returns a map from channel-id to directed dependencies of the channel - // instructions. For send&recv pairs it means the send instruction and for - // cross-replica-sum the union of the dependencies for all participating - // instructions. 
- using ChannelDependencyMap = - absl::flat_hash_map>; - ChannelDependencyMap ComputeChannelDependencies() const; - enum VisitState { kVisiting, kVisited }; void ComputeInstructionPostOrder( const HloComputation::ChannelDependencyMap& channel_dependency_map, diff --git a/tensorflow/compiler/xla/service/hlo_computation_test.cc b/tensorflow/compiler/xla/service/hlo_computation_test.cc index 2aaaef1d36d..ac6d08b026a 100644 --- a/tensorflow/compiler/xla/service/hlo_computation_test.cc +++ b/tensorflow/compiler/xla/service/hlo_computation_test.cc @@ -484,107 +484,6 @@ TEST_F(HloComputationTest, CloneWithControlDependency) { EXPECT_THAT(successors, ::testing::ElementsAre(cloned_add)); } -TEST_F(HloComputationTest, Reachability) { - // Test reachability of a non-trivial computation: - // - // const1 const2 - // | | - // | +-------+ - // | | | - // add .. negate - // | . | - // | .... exp - // | | - // +---+ +-+---+ - // | | | - // multiply copy - // - // There is a control dependency from 'add' to 'exp'. 
- auto builder = HloComputation::Builder(TestName()); - auto constant1 = builder.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0f))); - auto constant2 = builder.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0f))); - auto add = builder.AddInstruction(HloInstruction::CreateBinary( - r0f32_, HloOpcode::kAdd, constant1, constant2)); - auto negate = builder.AddInstruction( - HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant2)); - auto exp = builder.AddInstruction( - HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, negate)); - auto mul = builder.AddInstruction( - HloInstruction::CreateBinary(r0f32_, HloOpcode::kMultiply, add, exp)); - auto copy = builder.AddInstruction( - HloInstruction::CreateUnary(r0f32_, HloOpcode::kCopy, exp)); - - auto module = CreateNewModule(); - auto computation = - module->AddEntryComputation(builder.Build(/*root_instruction=*/mul)); - - TF_CHECK_OK(add->AddControlDependencyTo(exp)); - auto reachability = computation->ComputeReachability(); - - EXPECT_TRUE(reachability->IsReachable(constant1, constant1)); - EXPECT_FALSE(reachability->IsReachable(constant1, constant2)); - EXPECT_TRUE(reachability->IsReachable(constant1, add)); - EXPECT_FALSE(reachability->IsReachable(constant1, negate)); - EXPECT_TRUE(reachability->IsReachable(constant1, exp)); - EXPECT_TRUE(reachability->IsReachable(constant1, mul)); - EXPECT_TRUE(reachability->IsReachable(constant1, copy)); - - EXPECT_FALSE(reachability->IsReachable(constant2, constant1)); - EXPECT_TRUE(reachability->IsReachable(constant2, constant2)); - EXPECT_TRUE(reachability->IsReachable(constant2, add)); - EXPECT_TRUE(reachability->IsReachable(constant2, negate)); - EXPECT_TRUE(reachability->IsReachable(constant2, exp)); - EXPECT_TRUE(reachability->IsReachable(constant2, mul)); - EXPECT_TRUE(reachability->IsReachable(constant2, copy)); - - EXPECT_FALSE(reachability->IsReachable(exp, constant1)); - 
EXPECT_FALSE(reachability->IsReachable(exp, constant2)); - EXPECT_FALSE(reachability->IsReachable(exp, add)); - EXPECT_FALSE(reachability->IsReachable(exp, negate)); - EXPECT_TRUE(reachability->IsReachable(exp, exp)); - EXPECT_TRUE(reachability->IsReachable(exp, mul)); - EXPECT_TRUE(reachability->IsReachable(exp, copy)); - - EXPECT_FALSE(reachability->IsReachable(mul, constant1)); - EXPECT_FALSE(reachability->IsReachable(mul, constant2)); - EXPECT_FALSE(reachability->IsReachable(mul, add)); - EXPECT_FALSE(reachability->IsReachable(mul, negate)); - EXPECT_FALSE(reachability->IsReachable(mul, exp)); - EXPECT_TRUE(reachability->IsReachable(mul, mul)); - EXPECT_FALSE(reachability->IsReachable(mul, copy)); - - EXPECT_TRUE(reachability->IsConnected(constant1, copy)); - EXPECT_TRUE(reachability->IsConnected(copy, constant1)); - EXPECT_FALSE(reachability->IsConnected(negate, add)); - EXPECT_FALSE(reachability->IsConnected(add, negate)); - - // Remove the control dependency then update and verify the reachability map - ASSERT_IS_OK(add->RemoveControlDependencyTo(exp)); - computation->UpdateReachabilityThroughInstruction(exp, reachability.get()); - - EXPECT_TRUE(reachability->IsReachable(constant1, constant1)); - EXPECT_FALSE(reachability->IsReachable(constant1, constant2)); - EXPECT_TRUE(reachability->IsReachable(constant1, add)); - EXPECT_FALSE(reachability->IsReachable(constant1, negate)); - EXPECT_FALSE(reachability->IsReachable(constant1, exp)); - EXPECT_TRUE(reachability->IsReachable(constant1, mul)); - EXPECT_FALSE(reachability->IsReachable(constant1, copy)); - - // Change a use within the graph then update and verify the reachability map - ASSERT_IS_OK(constant2->ReplaceUseWith(negate, constant1)); - computation->UpdateReachabilityThroughInstruction(negate, reachability.get()); - - EXPECT_FALSE(reachability->IsReachable(constant2, constant1)); - EXPECT_TRUE(reachability->IsReachable(constant2, constant2)); - EXPECT_TRUE(reachability->IsReachable(constant2, add)); - 
EXPECT_FALSE(reachability->IsReachable(constant2, negate)); - EXPECT_FALSE(reachability->IsReachable(constant2, exp)); - EXPECT_TRUE(reachability->IsReachable(constant2, mul)); - EXPECT_FALSE(reachability->IsReachable(constant2, copy)); -} - TEST_F(HloComputationTest, Stringification) { const Shape s1 = ShapeUtil::MakeShape(F32, {5, 10}); const Shape s2 = ShapeUtil::MakeShape(F32, {20, 10}); @@ -700,27 +599,5 @@ TEST_F(HloComputationTest, StringificationCanonical) { EXPECT_EQ(computation->ToString(options), expected_computation2); } -TEST_F(HloComputationTest, ChannelReachability) { - const Shape shape = ShapeUtil::MakeShape(F32, {5, 7}); - HloComputation::Builder builder("ChannelReachability"); - auto param = builder.AddInstruction( - HloInstruction::CreateParameter(0, shape, "param")); - auto token0 = builder.AddInstruction(HloInstruction::CreateToken()); - auto send = - builder.AddInstruction(HloInstruction::CreateSend(param, token0, 1)); - auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send)); - auto token1 = builder.AddInstruction(HloInstruction::CreateToken()); - auto recv = - builder.AddInstruction(HloInstruction::CreateRecv(shape, token1, 1)); - auto recv_done = builder.AddInstruction(HloInstruction::CreateRecvDone(recv)); - - auto module = CreateNewModule(); - auto computation = module->AddEntryComputation(builder.Build(recv_done)); - auto reachability = computation->ComputeReachability(); - EXPECT_TRUE(reachability->IsReachable(param, recv_done)); - EXPECT_FALSE(reachability->IsReachable(send, recv)); - EXPECT_FALSE(reachability->IsReachable(send_done, recv)); -} - } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc index 23d41d91d69..1c93641a588 100644 --- a/tensorflow/compiler/xla/service/hlo_ordering.cc +++ b/tensorflow/compiler/xla/service/hlo_ordering.cc @@ -334,7 +334,7 @@ DependencyHloOrdering::DependencyHloOrdering(const HloModule* 
module) // ordering based on dependencies. ExecutesBefore will return true iff there // exists a path in the HLO computation graph from 'a' to 'b'. for (auto* computation : module->MakeNonfusionComputations()) { - predecessors_.emplace(computation, computation->ComputeReachability()); + predecessors_.emplace(computation, HloReachabilityMap::Build(computation)); } } diff --git a/tensorflow/compiler/xla/service/hlo_ordering.h b/tensorflow/compiler/xla/service/hlo_ordering.h index 66313492eb2..4dbe44769a2 100644 --- a/tensorflow/compiler/xla/service/hlo_ordering.h +++ b/tensorflow/compiler/xla/service/hlo_ordering.h @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_reachability.h" #include "tensorflow/compiler/xla/service/hlo_schedule.h" #include "tensorflow/compiler/xla/service/hlo_value.h" #include "tensorflow/compiler/xla/types.h" diff --git a/tensorflow/compiler/xla/service/hlo_reachability.cc b/tensorflow/compiler/xla/service/hlo_reachability.cc index 961930f0a88..7e73cf5889c 100644 --- a/tensorflow/compiler/xla/service/hlo_reachability.cc +++ b/tensorflow/compiler/xla/service/hlo_reachability.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + #include "tensorflow/compiler/xla/service/hlo_reachability.h" namespace xla { @@ -71,4 +73,70 @@ bool HloReachabilityMap::IsConnected(const HloInstruction* a, return IsReachable(a, b) || IsReachable(b, a); } +std::unique_ptr HloReachabilityMap::Build( + const HloComputation* computation) { + const auto& all = computation->MakeInstructionPostOrder(); + auto result = absl::make_unique(all); + auto channel_dependency_map = computation->ComputeChannelDependencies(); + + std::vector inputs; + for (const HloInstruction* hlo : all) { + inputs.assign(hlo->operands().begin(), hlo->operands().end()); + inputs.insert(inputs.end(), hlo->control_predecessors().begin(), + hlo->control_predecessors().end()); + + switch (hlo->opcode()) { + case HloOpcode::kRecvDone: { + auto it = channel_dependency_map.find(hlo->channel_id()); + if (it != channel_dependency_map.end()) { + absl::c_copy(it->second, std::back_inserter(inputs)); + } + break; + } + case HloOpcode::kCrossReplicaSum: { + auto all_reduce_id = hlo->all_reduce_id(); + if (all_reduce_id) { + auto it = channel_dependency_map.find(all_reduce_id.value()); + if (it != channel_dependency_map.end()) { + absl::c_copy(it->second, std::back_inserter(inputs)); + } + } + break; + } + default: + break; + } + + result->FastSetReachabilityToUnion(inputs, hlo); + } + return result; +} + +void HloReachabilityMap::UpdateReachabilityThroughInstruction( + const HloInstruction* instruction) { + std::queue worklist; + worklist.push(instruction); + + std::vector inputs; + + while (!worklist.empty()) { + const HloInstruction* item = worklist.front(); + worklist.pop(); + + inputs.assign(item->operands().begin(), item->operands().end()); + inputs.insert(inputs.end(), item->control_predecessors().begin(), + item->control_predecessors().end()); + + if (SetReachabilityToUnion(inputs, item)) { + // Add immediate successors to worklist. 
+ for (const HloInstruction* user : item->users()) { + worklist.push(user); + } + for (const HloInstruction* succ : item->control_successors()) { + worklist.push(succ); + } + } + } +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_reachability.h b/tensorflow/compiler/xla/service/hlo_reachability.h index 3e27d098aeb..2c965f58bfa 100644 --- a/tensorflow/compiler/xla/service/hlo_reachability.h +++ b/tensorflow/compiler/xla/service/hlo_reachability.h @@ -22,6 +22,7 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/map_util.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/types.h" @@ -32,11 +33,11 @@ class HloInstruction; // A class for representing reachability between HloInstructions. // -// !!! THIS CLASS DOES NOT COMPUTE REACHABILITY !!! It has an adjacency matrix -// and it is up to the user of the class to set the adjacency matrix such that -// it represents reachability, i.e. such that it is transitive. That the graph -// be transitive is thus not an invariant of this class, but it is required for -// the name of the class and its methods to make sense. +// It has an adjacency matrix and it is up to the user of the class to set the +// adjacency matrix such that it represents reachability, i.e. such that it is +// transitive. That the graph be transitive is thus not an invariant of this +// class, but it is required for the name of the class and its methods to make +// sense. class HloReachabilityMap { public: // Sets up a graph with no edges and where the nodes correspond to the given @@ -44,6 +45,15 @@ class HloReachabilityMap { explicit HloReachabilityMap( absl::Span instructions); + // Computes and returns the reachability between HLO instructions in the + // computation. 
The returned HloReachabilityMap is constructed such that + // HloReachabilityMap::IsReachable(a, b) returns true iff there exists a + // directed path (from producer to consumer) from 'a' to 'b'. Both data + // dependencies (operands) and control dependencies are considered for + // reachability. Trivially an instruction is reachable from itself. + static std::unique_ptr Build( + const HloComputation* computation); + // Set the reachability set of 'instruction' to the union of the reachability // sets of 'inputs'. Upon return, IsReachable(x, instruction) where // 'x' is not 'instruction' will return true iff IsReachable(x, input) is true @@ -70,6 +80,10 @@ class HloReachabilityMap { // adjacency matrix. void SetReachable(const HloInstruction* a, const HloInstruction* b); + // Updates the given reachability map after the immediate predecessor set + // (operands and control predecessors) of 'instruction' has changed. + void UpdateReachabilityThroughInstruction(const HloInstruction* instruction); + // Returns true if "b" is reachable from "a" // // Note that this function only correctly answers queries about reachability diff --git a/tensorflow/compiler/xla/service/hlo_reachability_test.cc b/tensorflow/compiler/xla/service/hlo_reachability_test.cc index d9848cee0bf..21265d9f222 100644 --- a/tensorflow/compiler/xla/service/hlo_reachability_test.cc +++ b/tensorflow/compiler/xla/service/hlo_reachability_test.cc @@ -81,6 +81,130 @@ TEST_F(HloReachabilityTest, Reachability) { EXPECT_FALSE(reachability.SetReachabilityToUnion({b, c}, d)); } +TEST_F(HloReachabilityTest, NonTrivialReachability) { + // Test reachability of a non-trivial computation: + // + // const1 const2 + // | | + // | +-------+ + // | | | + // add .. negate + // | . | + // | .... exp + // | | + // +---+ +-+---+ + // | | | + // multiply copy + // + // There is a control dependency from 'add' to 'exp'. 
+ Shape r0f32 = ShapeUtil::MakeShape(F32, {}); + auto builder = HloComputation::Builder(TestName()); + auto constant1 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0f))); + auto constant2 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0f))); + auto add = builder.AddInstruction(HloInstruction::CreateBinary( + r0f32, HloOpcode::kAdd, constant1, constant2)); + auto negate = builder.AddInstruction( + HloInstruction::CreateUnary(r0f32, HloOpcode::kNegate, constant2)); + auto exp = builder.AddInstruction( + HloInstruction::CreateUnary(r0f32, HloOpcode::kExp, negate)); + auto mul = builder.AddInstruction( + HloInstruction::CreateBinary(r0f32, HloOpcode::kMultiply, add, exp)); + auto copy = builder.AddInstruction( + HloInstruction::CreateUnary(r0f32, HloOpcode::kCopy, exp)); + + auto module = CreateNewVerifiedModule(); + auto computation = + module->AddEntryComputation(builder.Build(/*root_instruction=*/mul)); + + TF_CHECK_OK(add->AddControlDependencyTo(exp)); + auto reachability = HloReachabilityMap::Build(computation); + + EXPECT_TRUE(reachability->IsReachable(constant1, constant1)); + EXPECT_FALSE(reachability->IsReachable(constant1, constant2)); + EXPECT_TRUE(reachability->IsReachable(constant1, add)); + EXPECT_FALSE(reachability->IsReachable(constant1, negate)); + EXPECT_TRUE(reachability->IsReachable(constant1, exp)); + EXPECT_TRUE(reachability->IsReachable(constant1, mul)); + EXPECT_TRUE(reachability->IsReachable(constant1, copy)); + + EXPECT_FALSE(reachability->IsReachable(constant2, constant1)); + EXPECT_TRUE(reachability->IsReachable(constant2, constant2)); + EXPECT_TRUE(reachability->IsReachable(constant2, add)); + EXPECT_TRUE(reachability->IsReachable(constant2, negate)); + EXPECT_TRUE(reachability->IsReachable(constant2, exp)); + EXPECT_TRUE(reachability->IsReachable(constant2, mul)); + EXPECT_TRUE(reachability->IsReachable(constant2, copy)); + + 
EXPECT_FALSE(reachability->IsReachable(exp, constant1)); + EXPECT_FALSE(reachability->IsReachable(exp, constant2)); + EXPECT_FALSE(reachability->IsReachable(exp, add)); + EXPECT_FALSE(reachability->IsReachable(exp, negate)); + EXPECT_TRUE(reachability->IsReachable(exp, exp)); + EXPECT_TRUE(reachability->IsReachable(exp, mul)); + EXPECT_TRUE(reachability->IsReachable(exp, copy)); + + EXPECT_FALSE(reachability->IsReachable(mul, constant1)); + EXPECT_FALSE(reachability->IsReachable(mul, constant2)); + EXPECT_FALSE(reachability->IsReachable(mul, add)); + EXPECT_FALSE(reachability->IsReachable(mul, negate)); + EXPECT_FALSE(reachability->IsReachable(mul, exp)); + EXPECT_TRUE(reachability->IsReachable(mul, mul)); + EXPECT_FALSE(reachability->IsReachable(mul, copy)); + + EXPECT_TRUE(reachability->IsConnected(constant1, copy)); + EXPECT_TRUE(reachability->IsConnected(copy, constant1)); + EXPECT_FALSE(reachability->IsConnected(negate, add)); + EXPECT_FALSE(reachability->IsConnected(add, negate)); + + // Remove the control dependency then update and verify the reachability map + ASSERT_IS_OK(add->RemoveControlDependencyTo(exp)); + reachability->UpdateReachabilityThroughInstruction(exp); + + EXPECT_TRUE(reachability->IsReachable(constant1, constant1)); + EXPECT_FALSE(reachability->IsReachable(constant1, constant2)); + EXPECT_TRUE(reachability->IsReachable(constant1, add)); + EXPECT_FALSE(reachability->IsReachable(constant1, negate)); + EXPECT_FALSE(reachability->IsReachable(constant1, exp)); + EXPECT_TRUE(reachability->IsReachable(constant1, mul)); + EXPECT_FALSE(reachability->IsReachable(constant1, copy)); + + // Change a use within the graph then update and verify the reachability map + ASSERT_IS_OK(constant2->ReplaceUseWith(negate, constant1)); + reachability->UpdateReachabilityThroughInstruction(negate); + + EXPECT_FALSE(reachability->IsReachable(constant2, constant1)); + EXPECT_TRUE(reachability->IsReachable(constant2, constant2)); + 
EXPECT_TRUE(reachability->IsReachable(constant2, add)); + EXPECT_FALSE(reachability->IsReachable(constant2, negate)); + EXPECT_FALSE(reachability->IsReachable(constant2, exp)); + EXPECT_TRUE(reachability->IsReachable(constant2, mul)); + EXPECT_FALSE(reachability->IsReachable(constant2, copy)); +} + +TEST_F(HloReachabilityTest, ChannelReachability) { + const Shape shape = ShapeUtil::MakeShape(F32, {5, 7}); + HloComputation::Builder builder("ChannelReachability"); + auto param = builder.AddInstruction( + HloInstruction::CreateParameter(0, shape, "param")); + auto token0 = builder.AddInstruction(HloInstruction::CreateToken()); + auto send = + builder.AddInstruction(HloInstruction::CreateSend(param, token0, 1)); + auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send)); + auto token1 = builder.AddInstruction(HloInstruction::CreateToken()); + auto recv = + builder.AddInstruction(HloInstruction::CreateRecv(shape, token1, 1)); + auto recv_done = builder.AddInstruction(HloInstruction::CreateRecvDone(recv)); + + auto module = CreateNewVerifiedModule(); + auto computation = module->AddEntryComputation(builder.Build(recv_done)); + auto reachability = HloReachabilityMap::Build(computation); + EXPECT_TRUE(reachability->IsReachable(param, recv_done)); + EXPECT_FALSE(reachability->IsReachable(send, recv)); + EXPECT_FALSE(reachability->IsReachable(send_done, recv)); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index a85793e4774..426c1256080 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -452,7 +452,7 @@ StatusOr InstructionFusion::Run(HloModule* module) { for (auto* computation : module->MakeNonfusionComputations()) { CHECK(!computation->IsFusionComputation()); computation_ = computation; - reachability_ = computation_->ComputeReachability(); + reachability_ = 
HloReachabilityMap::Build(computation_); HloInstructionSet do_not_duplicate = ComputeGloballyUnfusible(computation_->MakeInstructionPostOrder()); @@ -566,7 +566,7 @@ bool InstructionFusion::MultiOutputFusionCreatesCycle( // A consumer operand may have been multii-output fused into a parallel // consumer and thus be missing from the oridinal reachability map. if (!reachability_->IsPresent(a) || !reachability_->IsPresent(b)) { - reachability_ = consumer->parent()->ComputeReachability(); + reachability_ = HloReachabilityMap::Build(consumer->parent()); } return reachability_->IsReachable(a, b); }; diff --git a/tensorflow/compiler/xla/service/instruction_fusion.h b/tensorflow/compiler/xla/service/instruction_fusion.h index 4045e886dd9..198bd7fce5f 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.h +++ b/tensorflow/compiler/xla/service/instruction_fusion.h @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/service/hlo_reachability.h" #include "tensorflow/core/platform/macros.h" namespace xla { diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.cc b/tensorflow/compiler/xla/service/multi_output_fusion.cc index 2ca527bc4cb..6088fa4df66 100644 --- a/tensorflow/compiler/xla/service/multi_output_fusion.cc +++ b/tensorflow/compiler/xla/service/multi_output_fusion.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "absl/container/flat_hash_set.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_reachability.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/core/platform/types.h" @@ -257,7 +258,7 @@ bool MultiOutputFusion::LegalToFuse(HloInstruction* instr1, } void MultiOutputFusion::RecomputeReachability() { - reachability_ = computation_->ComputeReachability(); + reachability_ = HloReachabilityMap::Build(computation_); } void MultiOutputFusion::UpdateReachability( diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.h b/tensorflow/compiler/xla/service/multi_output_fusion.h index 9508ab2ed1d..1c7583ece72 100644 --- a/tensorflow/compiler/xla/service/multi_output_fusion.h +++ b/tensorflow/compiler/xla/service/multi_output_fusion.h @@ -23,6 +23,7 @@ limitations under the License. #include "absl/strings/string_view.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/service/hlo_reachability.h" #include "tensorflow/compiler/xla/statusor.h" namespace xla { From 56c9554ccd39567dbbed91efe6082c9f51c42f53 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 5 Nov 2018 18:04:56 -0800 Subject: [PATCH 154/540] Copy LinearEstimator to core. LinearEstimator allows users to specify their own head (unlike LinearRegressor and LinearClassifier). 
PiperOrigin-RevId: 220207294 --- ...nsorflow.estimator.-linear-estimator.pbtxt | 62 +++++++++++++++++++ .../api/golden/v1/tensorflow.estimator.pbtxt | 4 ++ ...nsorflow.estimator.-linear-estimator.pbtxt | 62 +++++++++++++++++++ .../api/golden/v2/tensorflow.estimator.pbtxt | 4 ++ 4 files changed, 132 insertions(+) create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-estimator.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-estimator.pbtxt diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-estimator.pbtxt new file mode 100644 index 00000000000..3d6b03098aa --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-estimator.pbtxt @@ -0,0 +1,62 @@ +path: "tensorflow.estimator.LinearEstimator" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "config" + mtype: "" + } + member { + name: "model_dir" + mtype: "" + } + member { + name: "model_fn" + mtype: "" + } + member { + name: "params" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'head\', \'feature_columns\', \'model_dir\', \'optimizer\', \'config\', \'partitioner\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'Ftrl\', \'None\', \'None\', \'sum\'], " + } + member_method { + name: "eval_dir" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "evaluate" + argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', 
\'None\'], " + } + member_method { + name: "export_savedmodel" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " + } + member_method { + name: "get_variable_names" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_variable_value" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "latest_checkpoint" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "predict" + argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + } + member_method { + name: "train" + argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt index ec3216ae705..da685d94cfa 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt @@ -72,6 +72,10 @@ tf_module { name: "LinearClassifier" mtype: "" } + member { + name: "LinearEstimator" + mtype: "" + } member { name: "LinearRegressor" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-estimator.pbtxt new file mode 100644 index 00000000000..3d6b03098aa --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-estimator.pbtxt @@ -0,0 +1,62 @@ +path: "tensorflow.estimator.LinearEstimator" 
+tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "config" + mtype: "" + } + member { + name: "model_dir" + mtype: "" + } + member { + name: "model_fn" + mtype: "" + } + member { + name: "params" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'head\', \'feature_columns\', \'model_dir\', \'optimizer\', \'config\', \'partitioner\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'Ftrl\', \'None\', \'None\', \'sum\'], " + } + member_method { + name: "eval_dir" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "evaluate" + argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } + member_method { + name: "export_savedmodel" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " + } + member_method { + name: "get_variable_names" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_variable_value" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "latest_checkpoint" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "predict" + argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, 
defaults=[\'None\', \'None\', \'None\', \'True\'], " + } + member_method { + name: "train" + argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt index ec3216ae705..da685d94cfa 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt @@ -72,6 +72,10 @@ tf_module { name: "LinearClassifier" mtype: "" } + member { + name: "LinearEstimator" + mtype: "" + } member { name: "LinearRegressor" mtype: "" From 84eae2984c81c74040e61a98c9e25279a5935004 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 5 Nov 2018 18:06:58 -0800 Subject: [PATCH 155/540] Copy DNNEstimator to core. DNNEstimator allows users to specify their own head (unlike DNNRegressor and DNNClassifier). 
PiperOrigin-RevId: 220207545 --- ...ensorflow.estimator.-d-n-n-estimator.pbtxt | 62 +++++++++++++++++++ .../api/golden/v1/tensorflow.estimator.pbtxt | 4 ++ ...ensorflow.estimator.-d-n-n-estimator.pbtxt | 62 +++++++++++++++++++ .../api/golden/v2/tensorflow.estimator.pbtxt | 4 ++ 4 files changed, 132 insertions(+) create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-estimator.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-estimator.pbtxt diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-estimator.pbtxt new file mode 100644 index 00000000000..4635a1544c3 --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-estimator.pbtxt @@ -0,0 +1,62 @@ +path: "tensorflow.estimator.DNNEstimator" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "config" + mtype: "" + } + member { + name: "model_dir" + mtype: "" + } + member { + name: "model_fn" + mtype: "" + } + member { + name: "params" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'head\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'Adagrad\', \'\', \'None\', \'None\', \'None\', \'None\', \'False\'], " + } + member_method { + name: "eval_dir" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "evaluate" + argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', 
\'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } + member_method { + name: "export_savedmodel" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " + } + member_method { + name: "get_variable_names" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_variable_value" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "latest_checkpoint" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "predict" + argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + } + member_method { + name: "train" + argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt index da685d94cfa..c5b0085b8d3 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt @@ -28,6 +28,10 @@ tf_module { name: "DNNClassifier" mtype: "" } + member { + name: "DNNEstimator" + mtype: "" + } member { name: "DNNLinearCombinedClassifier" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-estimator.pbtxt new file mode 100644 index 00000000000..4635a1544c3 --- /dev/null +++ 
b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-estimator.pbtxt @@ -0,0 +1,62 @@ +path: "tensorflow.estimator.DNNEstimator" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "config" + mtype: "" + } + member { + name: "model_dir" + mtype: "" + } + member { + name: "model_fn" + mtype: "" + } + member { + name: "params" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'head\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'Adagrad\', \'\', \'None\', \'None\', \'None\', \'None\', \'False\'], " + } + member_method { + name: "eval_dir" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "evaluate" + argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } + member_method { + name: "export_saved_model" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " + } + member_method { + name: "export_savedmodel" + argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], " + } + member_method { + name: "get_variable_names" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_variable_value" + argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "latest_checkpoint" + argspec: "args=[\'self\'], 
varargs=None, keywords=None, defaults=None" + } + member_method { + name: "predict" + argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], " + } + member_method { + name: "train" + argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt index da685d94cfa..c5b0085b8d3 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt @@ -28,6 +28,10 @@ tf_module { name: "DNNClassifier" mtype: "" } + member { + name: "DNNEstimator" + mtype: "" + } member { name: "DNNLinearCombinedClassifier" mtype: "" From be4dc89e8ccd1846b22cc69ce759a27ed166004a Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Mon, 5 Nov 2018 18:25:19 -0800 Subject: [PATCH 156/540] [FLR] Switch to using LocalHandle in `FLRImpl::GetOrCreateItem()`. This enables us to avoid two lock-protected lookups in the parent ProcFLR's handle map (the first for `parent_->IsInstantiatedOnDevice()`, and the second in `GetOrCreateItem()`) every time we invoke a function. 
PiperOrigin-RevId: 220209369 --- tensorflow/core/common_runtime/function.cc | 45 +++++++++++----------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc index 9affc9fb188..313cdfa2200 100644 --- a/tensorflow/core/common_runtime/function.cc +++ b/tensorflow/core/common_runtime/function.cc @@ -382,8 +382,8 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime { Status FunctionDefToBody(const FunctionDef& fdef, AttrSlice attrs, const FunctionLibraryDefinition* lib_def, FunctionBody** fbody); - Status CreateItem(Handle handle, Item** item); - Status GetOrCreateItem(Handle handle, Item** item); + Status CreateItem(Item** item); + Status GetOrCreateItem(LocalHandle local_handle, Item** item); Status InstantiateSymbolicGradient(const NameAttrList& func, const FunctionLibraryDefinition* lib_def, FunctionBody** g_body); @@ -691,13 +691,14 @@ Status FunctionLibraryRuntimeImpl::Instantiate( TF_RETURN_IF_ERROR(FunctionDefToBody(*fdef, attrs, lib_def, &fbody)); } + LocalHandle local_handle; { mutex_lock l(mu_); *handle = parent_->GetHandle(key); if (*handle != kInvalidHandle) { delete fbody; - ++items_[parent_->GetHandleOnDevice(device_name_, *handle)] - ->instantiation_counter; + local_handle = parent_->GetHandleOnDevice(device_name_, *handle); + ++items_[local_handle]->instantiation_counter; } else { *handle = parent_->AddHandle(key, device_name_, next_handle_); Item* item = new Item; @@ -709,26 +710,24 @@ Status FunctionLibraryRuntimeImpl::Instantiate( item->overlay_flr = new FunctionLibraryRuntimeOverlay(this, options.overlay_lib); } - items_.emplace(next_handle_, std::unique_ptr(item)); - next_handle_++; + local_handle = next_handle_++; + items_.emplace(local_handle, std::unique_ptr(item)); } } if (options.create_kernels_eagerly) { Item* item; - TF_RETURN_IF_ERROR(GetOrCreateItem(*handle, &item)); + TF_RETURN_IF_ERROR(GetOrCreateItem(local_handle, &item)); } 
return Status::OK(); } Status FunctionLibraryRuntimeImpl::ReleaseHandle(Handle handle) { - if (!parent_->IsInstantiatedOnDevice(device_name_, handle)) { + LocalHandle h = parent_->GetHandleOnDevice(device_name_, handle); + if (h == kInvalidLocalHandle) { return parent_->ReleaseHandle(handle); } - - LocalHandle h = parent_->GetHandleOnDevice(device_name_, handle); - CHECK_NE(h, kInvalidLocalHandle); mutex_lock l(mu_); CHECK_EQ(1, items_.count(h)); std::unique_ptr& item = items_[h]; @@ -789,7 +788,7 @@ void PruneFunctionBody(Graph* g) { } } // namespace -Status FunctionLibraryRuntimeImpl::CreateItem(Handle handle, Item** item) { +Status FunctionLibraryRuntimeImpl::CreateItem(Item** item) { const FunctionBody* fbody; const FunctionLibraryDefinition* lib_def; string executor_type; @@ -843,13 +842,13 @@ Status FunctionLibraryRuntimeImpl::CreateItem(Handle handle, Item** item) { return Status::OK(); } -Status FunctionLibraryRuntimeImpl::GetOrCreateItem(Handle handle, Item** item) { - LocalHandle local_handle = parent_->GetHandleOnDevice(device_name_, handle); +Status FunctionLibraryRuntimeImpl::GetOrCreateItem(LocalHandle local_handle, + Item** item) { { tf_shared_lock l(mu_); auto iter = items_.find(local_handle); if (iter == items_.end()) { - return errors::NotFound("Function handle ", handle, + return errors::Internal("Local function handle ", local_handle, " is not valid. Likely an internal error."); } *item = iter->second.get(); @@ -859,7 +858,7 @@ Status FunctionLibraryRuntimeImpl::GetOrCreateItem(Handle handle, Item** item) { } // NOTE: We need to call CreateItem out of mu_ because creating an // executor needs to call CreateKernel. 
- return CreateItem(handle, item); + return CreateItem(item); } void FunctionLibraryRuntimeImpl::ExecutorArgsFromOptions( @@ -994,7 +993,8 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle, }; } - if (!parent_->IsInstantiatedOnDevice(device_name_, handle)) { + LocalHandle local_handle = parent_->GetHandleOnDevice(device_name_, handle); + if (local_handle == kInvalidLocalHandle) { parent_->Run(run_opts, handle, args, rets, done); return; } @@ -1005,7 +1005,7 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle, DCHECK(run_opts.runner != nullptr); Item* item = nullptr; - Status s = GetOrCreateItem(handle, &item); + Status s = GetOrCreateItem(local_handle, &item); if (!s.ok()) { done(s); return; @@ -1052,8 +1052,8 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle, done(errors::Cancelled("")); return; } - if (!parent_->IsInstantiatedOnDevice(device_name_, handle) || - opts.remote_execution) { + LocalHandle local_handle = parent_->GetHandleOnDevice(device_name_, handle); + if (local_handle == kInvalidLocalHandle || opts.remote_execution) { done(errors::Unimplemented("Remote calling with CallFrameInterface")); return; } @@ -1074,7 +1074,7 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle, } Item* item = nullptr; - Status s = GetOrCreateItem(handle, &item); + Status s = GetOrCreateItem(local_handle, &item); if (!s.ok()) { done(s); return; @@ -1097,7 +1097,8 @@ bool FunctionLibraryRuntimeImpl::IsStateful(const string& func) { string FunctionLibraryRuntimeImpl::DebugString(Handle handle) { Item* item = nullptr; - Status s = GetOrCreateItem(handle, &item); + LocalHandle local_handle = parent_->GetHandleOnDevice(device_name_, handle); + Status s = GetOrCreateItem(local_handle, &item); if (s.ok()) { return tensorflow::DebugString(item->graph); } else { From dae1e7c69336d28897abf7853bde10110d48b15f Mon Sep 17 00:00:00 2001 From: Anna R Date: Mon, 5 Nov 2018 18:35:50 -0800 Subject: 
[PATCH 157/540] Update c_api_test.cc to load test_op1.so instead of test_op.so. If op has the same name as already loaded op, then it won't be added to list returned by TF_GetOpList. So, we want to load a different op. PiperOrigin-RevId: 220210246 --- tensorflow/c/BUILD | 6 +++--- tensorflow/c/c_api_test.cc | 40 ++++++++++++++++++-------------------- tensorflow/c/test_op1.cc | 23 ++++++++++++++++++++++ 3 files changed, 45 insertions(+), 24 deletions(-) create mode 100644 tensorflow/c/test_op1.cc diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 16f633643d4..dbe8dba2924 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -199,7 +199,7 @@ tf_cuda_cc_test( size = "small", srcs = ["c_api_test.cc"], data = [ - ":test_op.so", + ":test_op1.so", "//tensorflow/cc/saved_model:saved_model_half_plus_two", ], kernels = [":test_op_kernel"], @@ -284,8 +284,8 @@ tf_cc_test( ) tf_custom_op_library( - name = "test_op.so", - srcs = ["test_op.cc"], + name = "test_op1.so", + srcs = ["test_op1.cc"], ) tf_kernel_library( diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc index b0dc0363fdb..d5934a10395 100644 --- a/tensorflow/c/c_api_test.cc +++ b/tensorflow/c/c_api_test.cc @@ -187,15 +187,26 @@ TEST(CAPI, LibraryLoadFunctions) { // tf_cuda_cc_test() bazel rule and remove the next line. if (!GPUDeviceName().empty()) return; - // Load the library. - TF_Status* status = TF_NewStatus(); - TF_Library* lib = - TF_LoadLibrary("tensorflow/c/test_op.so", status); - TF_Code code = TF_GetCode(status); - string status_msg(TF_Message(status)); - TF_DeleteStatus(status); - ASSERT_EQ(TF_OK, code) << status_msg; +#if !defined(TENSORFLOW_NO_SHARED_OBJECTS) + { + // Load the library. + TF_Status* status = TF_NewStatus(); + TF_Library* lib = + TF_LoadLibrary("tensorflow/c/test_op1.so", status); + TF_Code code = TF_GetCode(status); + string status_msg(TF_Message(status)); + TF_DeleteStatus(status); + ASSERT_EQ(TF_OK, code) << status_msg; + // Test op list. 
+ TF_Buffer op_list_buf = TF_GetOpList(lib); + tensorflow::OpList op_list; + EXPECT_TRUE(op_list.ParseFromArray(op_list_buf.data, op_list_buf.length)); + ASSERT_EQ(op_list.op_size(), 1); + EXPECT_EQ("TestCApi1", op_list.op(0).name()); + TF_DeleteLibraryHandle(lib); + } +#endif // !defined(TENSORFLOW_NO_SHARED_OBJECTS) { TF_Buffer* op_list_buffer = TF_GetAllOpList(); tensorflow::OpList op_list; @@ -210,19 +221,6 @@ TEST(CAPI, LibraryLoadFunctions) { EXPECT_TRUE(found); TF_DeleteBuffer(op_list_buffer); } - -#if !defined(TENSORFLOW_NO_SHARED_OBJECTS) - { - // Test op list. - TF_Buffer op_list_buf = TF_GetOpList(lib); - tensorflow::OpList op_list; - EXPECT_TRUE(op_list.ParseFromArray(op_list_buf.data, op_list_buf.length)); - ASSERT_EQ(op_list.op_size(), 1); - EXPECT_EQ("TestCApi", op_list.op(0).name()); - } -#endif // !defined(TENSORFLOW_NO_SHARED_OBJECTS) - - TF_DeleteLibraryHandle(lib); } void TestEncodeDecode(int line, const std::vector& data) { diff --git a/tensorflow/c/test_op1.cc b/tensorflow/c/test_op1.cc new file mode 100644 index 00000000000..b22cc9aef2b --- /dev/null +++ b/tensorflow/c/test_op1.cc @@ -0,0 +1,23 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +REGISTER_OP("TestCApi1").Doc(R"doc(Used to test C API)doc"); + +} // namespace tensorflow From 70ccf4212035170c5411c931d5e5957afafc6ff1 Mon Sep 17 00:00:00 2001 From: Anna R Date: Mon, 5 Nov 2018 18:45:15 -0800 Subject: [PATCH 158/540] Internal change. PiperOrigin-RevId: 220210933 --- tensorflow/contrib/saved_model/BUILD | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD index 4e1af191c99..395a68c6446 100644 --- a/tensorflow/contrib/saved_model/BUILD +++ b/tensorflow/contrib/saved_model/BUILD @@ -82,12 +82,6 @@ py_library( name = "keras_saved_model", srcs = ["python/saved_model/keras_saved_model.py"], srcs_version = "PY2AND3", - tags = [ - "no_windows", - # TODO(b/119022845): Re-enable this test in TAP. - "manual", - "notap", - ], visibility = ["//visibility:public"], deps = [ "//tensorflow/python:array_ops", @@ -108,7 +102,14 @@ py_test( size = "medium", srcs = ["python/saved_model/keras_saved_model_test.py"], srcs_version = "PY2AND3", - tags = ["notsan"], + tags = [ + "no_windows", + # TODO(b/119022845): Re-enable this test in TAP. 
+ "manual", + "notap", + "notsan", + "no_oss", + ], deps = [ ":keras_saved_model", "//tensorflow/python:client_testlib", From 812490e3dda3902a629be86a2062e72a4aab897f Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 12 Sep 2018 05:00:26 +0000 Subject: [PATCH 159/540] Update api defs Signed-off-by: Yong Tang --- .../v1/tensorflow.data.-fixed-length-record-dataset.pbtxt | 2 +- tensorflow/tools/api/golden/v1/tensorflow.pbtxt | 4 ++++ .../v2/tensorflow.data.-fixed-length-record-dataset.pbtxt | 2 +- tensorflow/tools/api/golden/v2/tensorflow.pbtxt | 4 ++++ 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt index a7bfa82c650..773b3743acd 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt @@ -17,7 +17,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filenames\', \'record_bytes\', \'header_bytes\', \'footer_bytes\', \'buffer_size\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filenames\', \'record_bytes\', \'header_bytes\', \'footer_bytes\', \'buffer_size\', \'compression_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } member_method { name: "apply" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt index 9597dd7684e..29a6aab9ffa 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt @@ -1172,6 +1172,10 @@ tf_module { name: "fill" argspec: "args=[\'dims\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "fixed_length_record_dataset_v2" + argspec: 
"args=[\'filenames\', \'header_bytes\', \'record_bytes\', \'footer_bytes\', \'buffer_size\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "fixed_size_partitioner" argspec: "args=[\'num_shards\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt index a7bfa82c650..773b3743acd 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt @@ -17,7 +17,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filenames\', \'record_bytes\', \'header_bytes\', \'footer_bytes\', \'buffer_size\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filenames\', \'record_bytes\', \'header_bytes\', \'footer_bytes\', \'buffer_size\', \'compression_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], " } member_method { name: "apply" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt index 7c865bb0022..f43a1e4cd90 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt @@ -776,6 +776,10 @@ tf_module { name: "fill" argspec: "args=[\'dims\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "fixed_length_record_dataset_v2" + argspec: "args=[\'filenames\', \'header_bytes\', \'record_bytes\', \'footer_bytes\', \'buffer_size\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "fixed_size_partitioner" argspec: "args=[\'num_shards\', \'axis\'], varargs=None, keywords=None, 
defaults=[\'0\'], " From be3cf530c4f31236ebd7d21693d46267330ea042 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 12 Sep 2018 05:24:39 +0000 Subject: [PATCH 160/540] update_api_def.sh Signed-off-by: Yong Tang --- .../api_def/base_api/api_def_FixedLengthRecordDatasetV2.pbtxt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDatasetV2.pbtxt diff --git a/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDatasetV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDatasetV2.pbtxt new file mode 100644 index 00000000000..ad82eddb587 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDatasetV2.pbtxt @@ -0,0 +1,3 @@ +op { + graph_op_name: "FixedLengthRecordDatasetV2" +} From 57d2b8dd9d67bf9153a69b29c4bd01feeca5990a Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 12 Sep 2018 14:53:22 +0000 Subject: [PATCH 161/540] Pylint fix Signed-off-by: Yong Tang --- tensorflow/python/data/ops/readers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py index c0b75a26e7e..e481300a175 100644 --- a/tensorflow/python/data/ops/readers.py +++ b/tensorflow/python/data/ops/readers.py @@ -292,7 +292,8 @@ class FixedLengthRecordDataset(dataset_ops.Dataset): argument_dtype=dtypes.string) def _as_variant_tensor(self): - if self._compression_type is not None or compat.forward_compatible(2018, 9, 30): + if (self._compression_type is not None or + compat.forward_compatible(2018, 9, 30)): return gen_dataset_ops.fixed_length_record_dataset_v2( self._filenames, self._header_bytes, self._record_bytes, self._footer_bytes, self._buffer_size, self._compression_type) From 7fe09a5864f8fceb664e3c0b246b5ec3d22fd266 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 12 Sep 2018 14:53:41 +0000 Subject: [PATCH 162/540] Add FixedLengthRecordDatasetV2 to the reader list of tf.contrib.distribute so 
that tf.contrib.distribute's input_op could correctly locate the Dataset Signed-off-by: Yong Tang --- tensorflow/contrib/distribute/python/input_ops.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/contrib/distribute/python/input_ops.py b/tensorflow/contrib/distribute/python/input_ops.py index f07ec8234df..b20dc493afa 100644 --- a/tensorflow/contrib/distribute/python/input_ops.py +++ b/tensorflow/contrib/distribute/python/input_ops.py @@ -29,7 +29,8 @@ from tensorflow.python.platform import tf_logging _READER_DATASET_OPS = [ "TextLineDataset", "TFRecordDataset", - "FixedLengthRecordDataset" + "FixedLengthRecordDataset", + "FixedLengthRecordDatasetV2" ] From b3f02c5c030a1d3d2ca84c7b2d50740beb69eb9a Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 6 Nov 2018 02:43:10 +0000 Subject: [PATCH 163/540] Fix broken test Signed-off-by: Yong Tang --- tensorflow/python/data/ops/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py index e481300a175..c11930fc1fc 100644 --- a/tensorflow/python/data/ops/readers.py +++ b/tensorflow/python/data/ops/readers.py @@ -293,7 +293,7 @@ class FixedLengthRecordDataset(dataset_ops.Dataset): def _as_variant_tensor(self): if (self._compression_type is not None or - compat.forward_compatible(2018, 9, 30)): + compat.forward_compatible(2018, 11, 30)): return gen_dataset_ops.fixed_length_record_dataset_v2( self._filenames, self._header_bytes, self._record_bytes, self._footer_bytes, self._buffer_size, self._compression_type) From e436475e805d259b5359f64aebba89a5b83e4aee Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Mon, 5 Nov 2018 20:15:12 -0800 Subject: [PATCH 164/540] [FLR] Use the correct `Options` when populating `Executor::Args`. My recent change modified how `Executor::Args` was populated and missed the fact that we rewrite the `Options` to add in a created `Rendezvous` object in some cases. 
This change correctly uses the rewritten `Options` in both cases. PiperOrigin-RevId: 220218606 --- tensorflow/core/common_runtime/function.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc index 313cdfa2200..6775695fa2d 100644 --- a/tensorflow/core/common_runtime/function.cc +++ b/tensorflow/core/common_runtime/function.cc @@ -1028,9 +1028,9 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle, } Executor::Args exec_args; - ExecutorArgsFromOptions(opts, frame, &exec_args); + ExecutorArgsFromOptions(run_opts, frame, &exec_args); - bool allow_dead_tensors = opts.allow_dead_tensors; + bool allow_dead_tensors = run_opts.allow_dead_tensors; item->exec->RunAsync( // Executor args exec_args, @@ -1085,7 +1085,7 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle, DCHECK(run_opts.runner != nullptr); Executor::Args exec_args; - ExecutorArgsFromOptions(opts, frame, &exec_args); + ExecutorArgsFromOptions(run_opts, frame, &exec_args); item->exec->RunAsync(exec_args, std::move(done)); } From c975cefc3d9516e1e7b8799ac86b908fd9a1dcdc Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 5 Nov 2018 22:30:46 -0800 Subject: [PATCH 165/540] Add better warnings for deprecation tools. Fixes #23059 Also adds "Warning:" to the deprecation warning, which is rendered in many doc tools as a red callout box. 
PiperOrigin-RevId: 220228838 --- .../python/framework/experimental_test.py | 23 ++++---- tensorflow/python/util/decorator_utils.py | 20 ++++++- .../python/util/decorator_utils_test.py | 11 ++-- tensorflow/python/util/deprecation.py | 56 ++++++++++++++----- tensorflow/python/util/deprecation_test.py | 52 ++++++++++------- 5 files changed, 110 insertions(+), 52 deletions(-) diff --git a/tensorflow/contrib/framework/python/framework/experimental_test.py b/tensorflow/contrib/framework/python/framework/experimental_test.py index cfdc7df7d8f..00e04b83ac4 100644 --- a/tensorflow/contrib/framework/python/framework/experimental_test.py +++ b/tensorflow/contrib/framework/python/framework/experimental_test.py @@ -44,17 +44,18 @@ class ExperimentalTest(test.TestCase): # Assert function docs are properly updated. self.assertEqual("_fn", _fn.__name__) - self.assertEqual("fn doc. (experimental)" - "\n" - "\nTHIS FUNCTION IS EXPERIMENTAL. It may change or " - "be removed at any time, and without warning." - "\n" - "\nArgs:" - "\n arg0: Arg 0." - "\n arg1: Arg 1." - "\n" - "\nReturns:" - "\n Sum of args.", _fn.__doc__) + self.assertEqual( + "fn doc. (experimental)" + "\n" + "\nWarning: THIS FUNCTION IS EXPERIMENTAL. It may change " + "or be removed at any time, and without warning." + "\n" + "\nArgs:" + "\n arg0: Arg 0." + "\n arg1: Arg 1." + "\n" + "\nReturns:" + "\n Sum of args.", _fn.__doc__) # Assert calling new fn issues log warning. self.assertEqual(3, _fn(1, 2)) diff --git a/tensorflow/python/util/decorator_utils.py b/tensorflow/python/util/decorator_utils.py index 7b4363c0e40..ab9641d96bc 100644 --- a/tensorflow/python/util/decorator_utils.py +++ b/tensorflow/python/util/decorator_utils.py @@ -75,13 +75,31 @@ def _normalize_docstring(docstring): def add_notice_to_docstring( doc, instructions, no_doc_str, suffix_str, notice): - """Adds a deprecation notice to a docstring.""" + """Adds a deprecation notice to a docstring. + + Args: + doc: The original docstring. 
+ instructions: A string, describing how to fix the problem. + no_doc_str: The default value to use for `doc` if `doc` is empty. + suffix_str: Is added to the end of the first line. + notice: A list of strings. The main notice warning body. + + Returns: + A new docstring, with the notice attached. + + Raises: + ValueError: If `notice` is empty. + """ if not doc: lines = [no_doc_str] else: lines = _normalize_docstring(doc).splitlines() lines[0] += ' ' + suffix_str + if not notice: + raise ValueError('The `notice` arg must not be empty.') + + notice[0] = 'Warning: ' + notice[0] notice = [''] + notice + ([instructions] if instructions else []) if len(lines) > 1: diff --git a/tensorflow/python/util/decorator_utils_test.py b/tensorflow/python/util/decorator_utils_test.py index 64e0cc7f57e..440dcbb6df3 100644 --- a/tensorflow/python/util/decorator_utils_test.py +++ b/tensorflow/python/util/decorator_utils_test.py @@ -55,8 +55,9 @@ class AddNoticeToDocstringTest(test.TestCase): expected) def test_regular(self): - expected = ("Brief (suffix)\n\nGo away\nInstructions\n\nDocstring\n\n" - "Args:\n arg1: desc") + expected = ( + "Brief (suffix)\n\nWarning: Go away\nInstructions\n\nDocstring\n\n" + "Args:\n arg1: desc") # No indent for main docstring self._check("Brief\n\nDocstring\n\nArgs:\n arg1: desc", expected) # 2 space indent for main docstring, blank lines not indented @@ -71,7 +72,7 @@ class AddNoticeToDocstringTest(test.TestCase): expected) def test_brief_only(self): - expected = "Brief (suffix)\n\nGo away\nInstructions" + expected = "Brief (suffix)\n\nWarning: Go away\nInstructions" self._check("Brief", expected) self._check("Brief\n", expected) self._check("Brief\n ", expected) @@ -79,12 +80,12 @@ class AddNoticeToDocstringTest(test.TestCase): self._check("\n Brief\n ", expected) def test_no_docstring(self): - expected = "Nothing here\n\nGo away\nInstructions" + expected = "Nothing here\n\nWarning: Go away\nInstructions" self._check(None, expected) self._check("", 
expected) def test_no_empty_line(self): - expected = "Brief (suffix)\n\nGo away\nInstructions\n\nDocstring" + expected = "Brief (suffix)\n\nWarning: Go away\nInstructions\n\nDocstring" # No second line indent self._check("Brief\nDocstring", expected) # 2 space second line indent diff --git a/tensorflow/python/util/deprecation.py b/tensorflow/python/util/deprecation.py index c43589f5c45..4c68d1aaae3 100644 --- a/tensorflow/python/util/deprecation.py +++ b/tensorflow/python/util/deprecation.py @@ -54,16 +54,39 @@ def _add_deprecated_function_notice_to_docstring(doc, date, instructions): '(deprecated)', main_text) -def _add_deprecated_arg_notice_to_docstring(doc, date, instructions): +def _add_deprecated_arg_notice_to_docstring(doc, date, instructions, + deprecated_names): """Adds a deprecation notice to a docstring for deprecated arguments.""" + + deprecation_string = ', '.join(sorted(deprecated_names)) + return decorator_utils.add_notice_to_docstring( - doc, instructions, - 'DEPRECATED FUNCTION ARGUMENTS', + doc, instructions, 'DEPRECATED FUNCTION ARGUMENTS', '(deprecated arguments)', [ - 'SOME ARGUMENTS ARE DEPRECATED. ' - 'They will be removed %s.' % ( - 'in a future version' if date is None else ('after %s' % date)), - 'Instructions for updating:']) + 'SOME ARGUMENTS ARE DEPRECATED: `(%s)`. ' + 'They will be removed %s.' 
% + (deprecation_string, 'in a future version' if date is None else + ('after %s' % date)), 'Instructions for updating:' + ]) + + +def _add_deprecated_arg_value_notice_to_docstring(doc, date, instructions, + deprecated_name_value_dict): + """Adds a deprecation notice to a docstring for deprecated arguments.""" + + deprecation_string = ', '.join( + '%s=%r' % (key, value) + for key, value in sorted(deprecated_name_value_dict.items())) + + when = 'in a future version' if date is None else ('after %s' % date) + + return decorator_utils.add_notice_to_docstring( + doc, instructions, 'DEPRECATED FUNCTION ARGUMENT VALUES', + '(deprecated argument values)', [ + 'SOME ARGUMENT VALUES ARE DEPRECATED: `(%s)`. ' + 'They will be removed %s.' % (deprecation_string, when), + 'Instructions for updating:' + ]) def _validate_deprecation_args(date, instructions): @@ -403,10 +426,11 @@ def deprecated_args(date, instructions, *deprecated_arg_names_or_tuples, pos, spec.has_ok_value, spec.ok_value) return deprecated_positional_args + deprecated_arg_names = _get_arg_names_to_ok_vals() + def deprecated_wrapper(func): """Deprecation decorator.""" decorator_utils.validate_callable(func, 'deprecated_args') - deprecated_arg_names = _get_arg_names_to_ok_vals() arg_spec = tf_inspect.getfullargspec(func) deprecated_positions = _get_deprecated_positional_arguments( @@ -486,9 +510,11 @@ def deprecated_args(date, instructions, *deprecated_arg_names_or_tuples, 'in a future version' if date is None else ('after %s' % date), instructions) return func(*args, **kwargs) - return tf_decorator.make_decorator(func, new_func, 'deprecated', - _add_deprecated_arg_notice_to_docstring( - func.__doc__, date, instructions)) + + doc = _add_deprecated_arg_notice_to_docstring( + func.__doc__, date, instructions, sorted(deprecated_arg_names.keys())) + return tf_decorator.make_decorator(func, new_func, 'deprecated', doc) + return deprecated_wrapper @@ -551,9 +577,11 @@ def deprecated_arg_values(date, instructions, 
warn_once=True, func.__module__, arg_name, arg_value, 'in a future version' if date is None else ('after %s' % date), instructions) return func(*args, **kwargs) - return tf_decorator.make_decorator(func, new_func, 'deprecated', - _add_deprecated_arg_notice_to_docstring( - func.__doc__, date, instructions)) + + doc = _add_deprecated_arg_value_notice_to_docstring( + func.__doc__, date, instructions, deprecated_kwargs) + return tf_decorator.make_decorator(func, new_func, 'deprecated', doc) + return deprecated_wrapper diff --git a/tensorflow/python/util/deprecation_test.py b/tensorflow/python/util/deprecation_test.py index 90c73a0a58d..34cbca52a1b 100644 --- a/tensorflow/python/util/deprecation_test.py +++ b/tensorflow/python/util/deprecation_test.py @@ -153,7 +153,8 @@ class DeprecationTest(test.TestCase): self.assertEqual( "fn doc. (deprecated)" "\n" - "\nTHIS FUNCTION IS DEPRECATED. It will be removed in a future version." + "\nWarning: THIS FUNCTION IS DEPRECATED. " + "It will be removed in a future version." "\nInstructions for updating:\n%s" "\n" "\nArgs:" @@ -195,7 +196,7 @@ class DeprecationTest(test.TestCase): self.assertEqual( "fn doc. (deprecated)" "\n" - "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s." + "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s." "\nInstructions for updating:\n%s" "\n" "\nArgs:" @@ -227,7 +228,7 @@ class DeprecationTest(test.TestCase): self.assertEqual( "fn doc. (deprecated)" "\n" - "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s." + "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s." "\nInstructions for updating:\n%s" % (date, instructions), _fn.__doc__) # Assert calling new fn issues log warning. @@ -251,7 +252,7 @@ class DeprecationTest(test.TestCase): self.assertEqual( "DEPRECATED FUNCTION" "\n" - "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s." + "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s." 
"\nInstructions for updating:" "\n%s" % (date, instructions), _fn.__doc__) @@ -289,7 +290,7 @@ class DeprecationTest(test.TestCase): self.assertEqual( "fn doc. (deprecated)" "\n" - "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s." + "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s." "\nInstructions for updating:\n%s" "\n" "\nArgs:" @@ -326,7 +327,7 @@ class DeprecationTest(test.TestCase): self.assertEqual( "fn doc. (deprecated)" "\n" - "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s." + "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s." "\nInstructions for updating:\n%s" % (date, instructions), getattr(_Object, "_fn").__doc__) @@ -355,9 +356,10 @@ class DeprecationTest(test.TestCase): self.assertEqual( "DEPRECATED FUNCTION" "\n" - "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s." + "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s." "\nInstructions for updating:" - "\n%s" % (date, instructions), getattr(_Object, "_fn").__doc__) + "\n%s" % (date, instructions), + getattr(_Object, "_fn").__doc__) # Assert calling new fn issues log warning. self.assertEqual(3, _Object()._fn(1, 2)) @@ -406,12 +408,13 @@ class DeprecationTest(test.TestCase): self.assertEqual( "prop doc. (deprecated)" "\n" - "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s." + "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s." "\nInstructions for updating:" "\n%s" "\n" "\nReturns:" - "\n String." % (date, instructions), getattr(_Object, "_prop").__doc__) + "\n String." % (date, instructions), + getattr(_Object, "_prop").__doc__) # Assert calling new fn issues log warning. self.assertEqual("prop_with_doc", _Object()._prop) @@ -439,9 +442,10 @@ class DeprecationTest(test.TestCase): self.assertEqual( "DEPRECATED FUNCTION" "\n" - "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s." + "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s." 
"\nInstructions for updating:" - "\n%s" % (date, instructions), getattr(_Object, "_prop").__doc__) + "\n%s" % (date, instructions), + getattr(_Object, "_prop").__doc__) # Assert calling new fn issues log warning. self.assertEqual("prop_no_doc", _Object()._prop) @@ -507,7 +511,8 @@ class DeprecatedArgsTest(test.TestCase): self.assertEqual( "fn doc. (deprecated arguments)" "\n" - "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s." + "\nWarning: SOME ARGUMENTS ARE DEPRECATED: `(deprecated)`. " + "They will be removed after %s." "\nInstructions for updating:\n%s" "\n" "\nArgs:" @@ -544,7 +549,8 @@ class DeprecatedArgsTest(test.TestCase): self.assertEqual( "fn doc. (deprecated arguments)" "\n" - "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s." + "\nWarning: SOME ARGUMENTS ARE DEPRECATED: `(deprecated)`. " + "They will be removed after %s." "\nInstructions for updating:\n%s" % (date, instructions), _fn.__doc__) # Assert calls without the deprecated argument log nothing. @@ -572,7 +578,8 @@ class DeprecatedArgsTest(test.TestCase): self.assertEqual( "DEPRECATED FUNCTION ARGUMENTS" "\n" - "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s." + "\nWarning: SOME ARGUMENTS ARE DEPRECATED: `(deprecated)`. " + "They will be removed after %s." "\nInstructions for updating:" "\n%s" % (date, instructions), _fn.__doc__) @@ -767,9 +774,10 @@ class DeprecatedArgValuesTest(test.TestCase): # Assert function docs are properly updated. self.assertEqual("_fn", _fn.__name__) self.assertEqual( - "fn doc. (deprecated arguments)" + "fn doc. (deprecated argument values)" "\n" - "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s." + "\nWarning: SOME ARGUMENT VALUES ARE DEPRECATED: `(deprecated=True)`. " + "They will be removed after %s." "\nInstructions for updating:\n%s" "\n" "\nArgs:" @@ -809,9 +817,10 @@ class DeprecatedArgValuesTest(test.TestCase): # Assert function docs are properly updated. 
self.assertEqual("_fn", _fn.__name__) self.assertEqual( - "fn doc. (deprecated arguments)" + "fn doc. (deprecated argument values)" "\n" - "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s." + "\nWarning: SOME ARGUMENT VALUES ARE DEPRECATED: `(deprecated=True)`. " + "They will be removed after %s." "\nInstructions for updating:\n%s" % (date, instructions), _fn.__doc__) # Assert calling new fn with non-deprecated value logs nothing. @@ -842,9 +851,10 @@ class DeprecatedArgValuesTest(test.TestCase): # Assert function docs are properly updated. self.assertEqual("_fn", _fn.__name__) self.assertEqual( - "DEPRECATED FUNCTION ARGUMENTS" + "DEPRECATED FUNCTION ARGUMENT VALUES" "\n" - "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s." + "\nWarning: SOME ARGUMENT VALUES ARE DEPRECATED: `(deprecated=True)`. " + "They will be removed after %s." "\nInstructions for updating:" "\n%s" % (date, instructions), _fn.__doc__) From fa0661ee7daa284f5fe71ceac778642bcc375c4f Mon Sep 17 00:00:00 2001 From: Anjali Sridhar Date: Mon, 5 Nov 2018 22:47:05 -0800 Subject: [PATCH 166/540] Move to using num_replicas_in_sync instead of num_replicas. 
PiperOrigin-RevId: 220229772 --- .../contrib/distribute/python/keras_test.py | 10 +++--- .../distribute/python/metrics_v1_test.py | 6 ++-- .../distribute/python/minimize_loss_test.py | 11 ++++--- .../distribute/python/one_device_strategy.py | 4 +++ .../python/parameter_server_strategy.py | 2 +- .../python/parameter_server_strategy_test.py | 8 ++--- .../contrib/optimizer_v2/optimizer_v2.py | 32 +++++++++---------- .../engine/distributed_training_utils.py | 7 ++-- tensorflow/python/keras/engine/training.py | 2 +- .../keras/engine/training_distributed.py | 15 +++++---- tensorflow/python/training/distribute.py | 10 +++++- tensorflow/python/training/optimizer.py | 22 +++++++------ 12 files changed, 71 insertions(+), 58 deletions(-) diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py index 33b8a61eb1a..a048e7f2790 100644 --- a/tensorflow/contrib/distribute/python/keras_test.py +++ b/tensorflow/contrib/distribute/python/keras_test.py @@ -413,8 +413,8 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase, with self.assertRaisesRegexp(ValueError, 'is smaller than the number ' 'of replicas'): - # The batch size(32) * num_replicas(3) is 96 which is greater than the - # number of input samples(64). + # The batch size(32) * num_replicas_in_sync(3) is 96 which is greater + # than the number of input samples(64). 
distributed_training_utils.get_input_batch_params(inputs, 32, strategy) @@ -1016,7 +1016,7 @@ class TestDistributionStrategyCorrectness(test.TestCase, distribute=distribution) batch_size = 64 - batch_size //= distribution.num_replicas + batch_size //= distribution.num_replicas_in_sync train_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train)) train_dataset = batch_wrapper(train_dataset, batch_size, distribution) @@ -1057,7 +1057,7 @@ class TestDistributionStrategyCorrectness(test.TestCase, batch_size = 64 if with_distribution: - batch_size //= with_distribution.num_replicas + batch_size //= with_distribution.num_replicas_in_sync train_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train)) train_dataset = batch_wrapper(train_dataset, batch_size, distribution) @@ -1072,7 +1072,7 @@ class TestDistributionStrategyCorrectness(test.TestCase, x_predict = [[1.], [2.], [3.], [4.]] predict_batch_size = 4 if with_distribution: - predict_batch_size //= with_distribution.num_replicas + predict_batch_size //= with_distribution.num_replicas_in_sync predict_dataset = dataset_ops.Dataset.from_tensor_slices(x_predict) predict_dataset = batch_wrapper(predict_dataset, predict_batch_size, distribution) diff --git a/tensorflow/contrib/distribute/python/metrics_v1_test.py b/tensorflow/contrib/distribute/python/metrics_v1_test.py index 9e1a7ad3932..c4b5433360a 100644 --- a/tensorflow/contrib/distribute/python/metrics_v1_test.py +++ b/tensorflow/contrib/distribute/python/metrics_v1_test.py @@ -111,14 +111,14 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase): # In each run, we run multiple steps, and each steps consumes as many # batches as number of replicas. 
batches_per_update = ( - distribution.num_replicas * distribution.steps_per_run) + distribution.num_replicas_in_sync * distribution.steps_per_run) else: value, update = distribution.call_for_each_replica( metric_fn, iterator.get_next()) update = distribution.group(update) # TODO(josh11b): Once we switch to using a global batch size for input, - # replace "distribution.num_replicas" with "1". - batches_per_update = distribution.num_replicas + # replace "distribution.num_replicas_in_sync" with "1". + batches_per_update = distribution.num_replicas_in_sync self.evaluate(iterator.initializer) self.evaluate(distribution.initialize()) diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py index 165732d578f..e60fa317ac0 100644 --- a/tensorflow/contrib/distribute/python/minimize_loss_test.py +++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py @@ -221,7 +221,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): renorm, update_ops_in_cross_replica_mode): """Verifies that moving mean updates are reduced across replicas.""" with distribution.scope(): - num_replicas = len(distribution.worker_devices) + num_replicas = distribution.num_replicas_in_sync model_fn, dataset_fn, batchnorm = batchnorm_example( optimizer_fn, batch_per_epoch=num_replicas, @@ -369,10 +369,11 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): # So unreplicated the update to w with lr=0.2 is -0.2 * -106 = 21.2 # with sum loss reduction, or 10.6 with mean. if loss_reduction == losses_impl.Reduction.SUM: - # Note that the "distribution.num_replicas" factor will go away once - # we split the input across replicas, instead of pulling a complete + # Note that the "distribution.num_replicas_in_sync" factor will go away + # once we split the input across replicas, instead of pulling a complete # batch of input per replica. 
- self.assertNear(weight, 2 + 21.2 * distribution.num_replicas, 0.0001) + self.assertNear(weight, 2 + 21.2 * distribution.num_replicas_in_sync, + 0.0001) else: # One of the mean loss reductions. self.assertNear(weight, 2 + 10.6, 0.0001) @@ -491,7 +492,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): def _verify_loss_output(self, initial_loss, loss_output, aggregated, distribution): if not aggregated: - self.assertEqual(distribution.num_replicas, + self.assertEqual(distribution.num_replicas_in_sync, len(distribution.unwrap(loss_output))) loss_output = distribution.reduce( aggregation=variables_lib.VariableAggregation.MEAN, diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py index 8bdf0012087..616508f9138 100644 --- a/tensorflow/contrib/distribute/python/one_device_strategy.py +++ b/tensorflow/contrib/distribute/python/one_device_strategy.py @@ -171,6 +171,10 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy): def num_replicas(self): return 1 + @property + def num_replicas_in_sync(self): + return 1 + @property def worker_devices(self): return [self._device] diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy.py b/tensorflow/contrib/distribute/python/parameter_server_strategy.py index 2aa7f1ae5d6..f1020d090a6 100644 --- a/tensorflow/contrib/distribute/python/parameter_server_strategy.py +++ b/tensorflow/contrib/distribute/python/parameter_server_strategy.py @@ -234,7 +234,7 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy): # TODO(yuefengz): not all ops in device_setter.STANDARD_PS_OPS will go through # this creator, such as "MutableHashTable". 
def _create_variable(self, next_creator, *args, **kwargs): - if self.num_replicas > 1: + if self.num_replicas_in_sync > 1: aggregation = kwargs.pop("aggregation", vs.VariableAggregation.NONE) if aggregation not in ( vs.VariableAggregation.NONE, diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py index a9f643c6ecc..6b912122e81 100644 --- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py +++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py @@ -345,11 +345,11 @@ class ParameterServerStrategyTestBase( self._finish_condition.release() x_val, y_val, z_val = sess.run([x, y, z]) - self.assertEqual(x_val, 10.0 + 1.0 * num_workers * d.num_replicas) - self.assertEqual(y_val, 20.0 + 1.0 * num_workers * d.num_replicas) + self.assertEqual(x_val, 10.0 + 1.0 * num_workers * d.num_replicas_in_sync) + self.assertEqual(y_val, 20.0 + 1.0 * num_workers * d.num_replicas_in_sync) self.assertEqual(z_val, 30.0 + 1.0 * num_workers) - return (x_val == 10.0 + 1.0 * num_workers * d.num_replicas and - y_val == 20.0 + 1.0 * num_workers * d.num_replicas and + return (x_val == 10.0 + 1.0 * num_workers * d.num_replicas_in_sync and + y_val == 20.0 + 1.0 * num_workers * d.num_replicas_in_sync and z_val == 30.0 + 1.0 * num_workers) def _test_minimize_loss_graph(self, task_type, task_id, num_gpus): diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py index f789c83e005..467dd86d8fd 100644 --- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py +++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py @@ -790,14 +790,7 @@ class OptimizerV2(optimizer_v1.Optimizer): # Scale loss for number of replicas (callable-loss case). In this case, # we have to be careful to call distribute_lib.get_loss_reduction() # *after* loss() is evaluated, so we know what loss reduction it uses. 
- if scale_loss_by_num_replicas is None: - scale_loss_by_num_replicas = ( - distribute_lib.get_loss_reduction() == variable_scope - .VariableAggregation.MEAN) - if scale_loss_by_num_replicas: - num_replicas = distribute_ctx.get_distribution_strategy().num_replicas - if num_replicas > 1: - loss_value *= 1. / num_replicas + loss_value = self._scale_loss(loss_value, scale_loss_by_num_replicas) if var_list is None: var_list = tape.watched_variables() @@ -808,14 +801,7 @@ class OptimizerV2(optimizer_v1.Optimizer): "be a function when eager execution is enabled.") # Scale loss for number of replicas (non-callable-loss case). - if scale_loss_by_num_replicas is None: - scale_loss_by_num_replicas = ( - distribute_lib.get_loss_reduction() == variable_scope - .VariableAggregation.MEAN) - if scale_loss_by_num_replicas: - num_replicas = distribute_ctx.get_distribution_strategy().num_replicas - if num_replicas > 1: - loss *= 1. / num_replicas + loss = self._scale_loss(loss, scale_loss_by_num_replicas) if gate_gradients not in [ optimizer_v1.Optimizer.GATE_NONE, optimizer_v1.Optimizer.GATE_OP, @@ -857,6 +843,20 @@ class OptimizerV2(optimizer_v1.Optimizer): ]) return grads_and_vars + @staticmethod + def _scale_loss(loss_value, scale_loss_by_num_replicas): + """Scale loss for the number of replicas.""" + if scale_loss_by_num_replicas is None: + scale_loss_by_num_replicas = ( + distribute_lib.get_loss_reduction() == variable_scope + .VariableAggregation.MEAN) + if scale_loss_by_num_replicas: + num_replicas = \ + distribute_ctx.get_distribution_strategy().num_replicas_in_sync + if num_replicas > 1: + loss_value *= 1. / num_replicas + return loss_value + def apply_gradients(self, grads_and_vars, global_step=None, name=None): """Apply gradients to variables. 
diff --git a/tensorflow/python/keras/engine/distributed_training_utils.py b/tensorflow/python/keras/engine/distributed_training_utils.py index ec553db2f8e..fc408dd39fd 100644 --- a/tensorflow/python/keras/engine/distributed_training_utils.py +++ b/tensorflow/python/keras/engine/distributed_training_utils.py @@ -368,17 +368,14 @@ def get_input_batch_params(first_x_value, batch_size, distribution_strategy): if not num_batches: raise ValueError('Please specify a batch_size that is smaller than' 'the number of input samples %d.' % first_x_value.shape[0]) - # TODO(anjalisridhar): TPU currently supports using the num_replicas property. - # We might want to look into implementing worker_devices. In multi worker - # strategy, perhaps num_replicas works better? - steps = num_batches // distribution_strategy.num_replicas + steps = num_batches // distribution_strategy.num_replicas_in_sync if not steps: # TODO(anjalisridhar): Number of replicas in the error message may not # convey what we want to the user. Is there another terminology that we can # use that is consistent across different strategies? raise ValueError('The number of batches %d is smaller than the number ' 'of replicas %d used for DistributionStrategy. ' % - (num_batches, distribution_strategy.num_replicas)) + (num_batches, distribution_strategy.num_replicas_in_sync)) return steps diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index 1847a6a3897..c0aad1d8e5f 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -882,7 +882,7 @@ class Model(Network): x_shape = first_x_value.shape if batch_size is None: batch_size = distributed_training_utils.get_batch_size( - self._distribution_strategy.num_replicas, x_shape[0], steps) + self._distribution_strategy.num_replicas_in_sync, x_shape[0], steps) # We need to use the drop_remainder argument to allow for a static # input shape which is required for TPUs. 
drop_remainder = self._distribution_strategy.require_static_shapes diff --git a/tensorflow/python/keras/engine/training_distributed.py b/tensorflow/python/keras/engine/training_distributed.py index 8550b960557..12bc04eec78 100644 --- a/tensorflow/python/keras/engine/training_distributed.py +++ b/tensorflow/python/keras/engine/training_distributed.py @@ -131,8 +131,8 @@ def fit_loop( # We need to set sample_weights to None since there are sample weight # placeholders that are created with default values. - sample_weights = [None for _ in range(len(model.outputs) * - current_strategy.num_replicas)] + sample_weights = [None for _ in range( + len(model.outputs) * current_strategy.num_replicas_in_sync)] if model.uses_learning_phase and not isinstance(K.learning_phase(), int): ins = dataset_inputs + dataset_targets + sample_weights + [1] else: @@ -467,8 +467,8 @@ def test_loop(model, iterator, verbose=0, steps=None): # We need to set sample_weights to None since there are sample weight # placeholders that are created with default values. - sample_weights = [None for _ in range(len(model.outputs) * - current_strategy.num_replicas)] + sample_weights = [None for _ in range( + len(model.outputs) * current_strategy.num_replicas_in_sync)] if model.uses_learning_phase and not isinstance(K.learning_phase(), int): ins = dataset_inputs + dataset_targets + sample_weights + [0] else: @@ -691,7 +691,7 @@ def predict_loop(model, iterator, verbose=0, steps=None): distributed_training_utils.set_weights( current_strategy, distributed_model, orig_model_weights) - num_towers = current_strategy.num_towers + num_replicas = current_strategy.num_replicas_in_sync # Since we do not know how many samples we will see, we cannot # pre-allocate the returned Numpy arrays. Instead, we store one array per # batch seen and concatenate them upon returning. 
@@ -703,11 +703,12 @@ def predict_loop(model, iterator, verbose=0, steps=None): batch_outs = [batch_outs] if step == 0: # batch_outs gives you the number of model outputs. In the distributed - # case this will be number of model_outputs * num_towers. + # case this will be number of model_outputs * num_replicas. for _ in range(len(model.outputs)): unconcatenated_outs.append([]) for i in range(len(model.outputs)): - nested_outs = batch_outs[i * num_towers:i * num_towers + num_towers] + nested_outs = batch_outs[i * num_replicas: + i * num_replicas + num_replicas] outs = nest.flatten(nested_outs) unconcatenated_outs[i].extend(outs) if verbose >= 1: diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py index 35ed52fa129..a726f35f826 100644 --- a/tensorflow/python/training/distribute.py +++ b/tensorflow/python/training/distribute.py @@ -947,7 +947,10 @@ class DistributionStrategy(object): @property def num_replicas(self): - """Returns number of replicas, for purposes of averaging across replicas.""" + """Returns number of replicas, for purposes of averaging across replicas. + + DEPRECATED: use `num_replicas_in_sync` instead. 
+ """ raise NotImplementedError("must be implemented in descendants") @property @@ -1148,6 +1151,11 @@ class ReplicaContext(object): """Returns number of replicas, for purposes of averaging across replicas.""" return self._distribution_strategy.num_replicas + @property + def num_replicas_in_sync(self): + """Returns number of replicas over which gradients are aggregated.""" + return self._distribution_strategy.num_replicas_in_sync + @property def replica_id(self): """Which replica is being defined, a number from 0 to `num_replicas - 1`.""" diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py index 8e400f2aeba..9dfa9d2afb2 100644 --- a/tensorflow/python/training/optimizer.py +++ b/tensorflow/python/training/optimizer.py @@ -465,11 +465,7 @@ class Optimizer( # Have to be careful to call distribute_lib.get_loss_reduction() # *after* loss() is evaluated, so we know what loss reduction it uses. # TODO(josh11b): Test that we handle weight decay in a reasonable way. - if (distribute_lib.get_loss_reduction() == - variable_scope.VariableAggregation.MEAN): - num_replicas = distribute_ctx.get_distribution_strategy().num_replicas - if num_replicas > 1: - loss_value *= (1. / num_replicas) + loss_value = self._scale_loss(loss_value) if var_list is None: var_list = tape.watched_variables() @@ -486,11 +482,7 @@ class Optimizer( "be a function when eager execution is enabled.") # Scale loss if using a "mean" loss reduction and multiple replicas. - if (distribute_lib.get_loss_reduction() == - variable_scope.VariableAggregation.MEAN): - num_replicas = distribute_ctx.get_distribution_strategy().num_replicas - if num_replicas > 1: - loss *= (1. 
/ num_replicas) + loss = self._scale_loss(loss) if gate_gradients not in [Optimizer.GATE_NONE, Optimizer.GATE_OP, Optimizer.GATE_GRAPH]: @@ -526,6 +518,16 @@ class Optimizer( if g is not None and v.dtype != dtypes.resource]) return grads_and_vars + @staticmethod + def _scale_loss(loss_value): + if (distribute_lib.get_loss_reduction() == + variable_scope.VariableAggregation.MEAN): + num_replicas = \ + distribute_ctx.get_distribution_strategy().num_replicas_in_sync + if num_replicas > 1: + loss_value *= (1. / num_replicas) + return loss_value + def apply_gradients(self, grads_and_vars, global_step=None, name=None): """Apply gradients to variables. From a7b4bd66f7a034045d355ef33ab5539c357637eb Mon Sep 17 00:00:00 2001 From: Shivani Agrawal Date: Mon, 5 Nov 2018 22:56:27 -0800 Subject: [PATCH 167/540] [data-stats] Adds option "experimental_stats" to `tf.data.Options` which takes `tf.data.experimental.StatsOptions` object. `StatsOptions` can configure options for collecting `dataset` stats using `StatsAggregator`, and it has aggregator as an argument which attaches the given aggregator to the dataset. (this will also replace `set_stats_aggregator()` dataset transformation.) 
PiperOrigin-RevId: 220230269 --- .../python/data/experimental/__init__.py | 8 +- .../data/experimental/kernel_tests/BUILD | 6 + .../kernel_tests/optimization/BUILD | 1 + .../optimization/latency_all_edges_test.py | 40 +++- .../kernel_tests/serialization/BUILD | 1 + .../stats_dataset_serialization_test.py | 5 +- .../kernel_tests/stats_dataset_ops_test.py | 184 +++++++++++------- .../kernel_tests/stats_dataset_test_base.py | 11 +- tensorflow/python/data/experimental/ops/BUILD | 19 ++ .../data/experimental/ops/stats_aggregator.py | 84 ++++++++ .../python/data/experimental/ops/stats_ops.py | 106 +--------- .../data/experimental/ops/stats_options.py | 103 ++++++++++ tensorflow/python/data/ops/BUILD | 1 + tensorflow/python/data/ops/dataset_ops.py | 66 ++++++- .../golden/v1/tensorflow.data.-options.pbtxt | 8 +- ....data.experimental.-stats-aggregator.pbtxt | 2 +- ...low.data.experimental.-stats-options.pbtxt | 25 +++ .../v1/tensorflow.data.experimental.pbtxt | 8 +- .../golden/v2/tensorflow.data.-options.pbtxt | 8 +- ....data.experimental.-stats-aggregator.pbtxt | 2 +- ...low.data.experimental.-stats-options.pbtxt | 25 +++ .../v2/tensorflow.data.experimental.pbtxt | 8 +- 22 files changed, 511 insertions(+), 210 deletions(-) create mode 100644 tensorflow/python/data/experimental/ops/stats_aggregator.py create mode 100644 tensorflow/python/data/experimental/ops/stats_options.py create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-options.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-options.pbtxt diff --git a/tensorflow/python/data/experimental/__init__.py b/tensorflow/python/data/experimental/__init__.py index d4e7fee9219..126c2be4420 100644 --- a/tensorflow/python/data/experimental/__init__.py +++ b/tensorflow/python/data/experimental/__init__.py @@ -29,6 +29,8 @@ See [Importing Data](https://tensorflow.org/guide/datasets) for an overview. 
@@RandomDataset @@Reducer @@SqlDataset +@@StatsAggregator +@@StatsOptions @@TFRecordWriter @@bucket_by_sequence_length @@ -52,9 +54,7 @@ See [Importing Data](https://tensorflow.org/guide/datasets) for an overview. @@rejection_resample @@sample_from_datasets @@scan -@@set_stats_aggregator @@shuffle_and_repeat -@@StatsAggregator @@unbatch @@unique @@ -98,9 +98,9 @@ from tensorflow.python.data.experimental.ops.readers import SqlDataset from tensorflow.python.data.experimental.ops.resampling import rejection_resample from tensorflow.python.data.experimental.ops.scan_ops import scan from tensorflow.python.data.experimental.ops.shuffle_ops import shuffle_and_repeat +from tensorflow.python.data.experimental.ops.stats_aggregator import StatsAggregator from tensorflow.python.data.experimental.ops.stats_ops import latency_stats -from tensorflow.python.data.experimental.ops.stats_ops import set_stats_aggregator -from tensorflow.python.data.experimental.ops.stats_ops import StatsAggregator +from tensorflow.python.data.experimental.ops.stats_options import StatsOptions from tensorflow.python.data.experimental.ops.unique import unique from tensorflow.python.data.experimental.ops.writers import TFRecordWriter from tensorflow.python.data.ops.iterator_ops import get_next_as_optional diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD index bfe2e0cf7a1..0141ac730fa 100644 --- a/tensorflow/python/data/experimental/kernel_tests/BUILD +++ b/tensorflow/python/data/experimental/kernel_tests/BUILD @@ -625,9 +625,15 @@ py_test( "//tensorflow/python:client_testlib", "//tensorflow/python:errors", "//tensorflow/python:framework_ops", + "//tensorflow/python:math_ops", + "//tensorflow/python/data/experimental/ops:batching", + "//tensorflow/python/data/experimental/ops:optimization", + "//tensorflow/python/data/experimental/ops:stats_aggregator", "//tensorflow/python/data/experimental/ops:stats_ops", + 
"//tensorflow/python/data/experimental/ops:stats_options", "//tensorflow/python/data/ops:dataset_ops", "//third_party/py/numpy", + "@absl_py//absl/testing:parameterized", ], ) diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD index 5b75e54f66c..9946ef5a42f 100644 --- a/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD +++ b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD @@ -89,6 +89,7 @@ py_test( "//tensorflow/python:errors", "//tensorflow/python/data/experimental/kernel_tests:stats_dataset_test_base", "//tensorflow/python/data/experimental/ops:optimization", + "//tensorflow/python/data/experimental/ops:stats_aggregator", "//tensorflow/python/data/experimental/ops:stats_ops", "//tensorflow/python/data/ops:dataset_ops", ], diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py index 469b05399a1..7144d834f9f 100644 --- a/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py @@ -19,7 +19,8 @@ from __future__ import print_function from tensorflow.python.data.experimental.kernel_tests import stats_dataset_test_base from tensorflow.python.data.experimental.ops import optimization -from tensorflow.python.data.experimental.ops import stats_ops +from tensorflow.python.data.experimental.ops import stats_aggregator +from tensorflow.python.data.experimental.ops import stats_options from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import errors from tensorflow.python.platform import test @@ -28,18 +29,45 @@ from tensorflow.python.platform import test class LatencyAllEdgesTest(stats_dataset_test_base.StatsDatasetTestBase): def 
testLatencyStatsOptimization(self): - stats_aggregator = stats_ops.StatsAggregator() + aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.from_tensors(1).apply( optimization.assert_next( ["LatencyStats", "Map", "LatencyStats", "Prefetch", - "LatencyStats"])).map(lambda x: x * x).prefetch(1).apply( - stats_ops.set_stats_aggregator(stats_aggregator)) + "LatencyStats"])).map(lambda x: x * x).prefetch(1) options = dataset_ops.Options() - options.experimental_latency_all_edges = True + options.experimental_stats = stats_options.StatsOptions() + options.experimental_stats.latency_all_edges = True + options.experimental_stats.aggregator = aggregator dataset = dataset.with_options(options) iterator = dataset.make_initializable_iterator() get_next = iterator.get_next() - summary_t = stats_aggregator.get_summary() + summary_t = aggregator.get_summary() + + with self.cached_session() as sess: + sess.run(iterator.initializer) + self.assertEqual(1 * 1, sess.run(get_next)) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + summary_str = sess.run(summary_t) + self._assertSummaryHasCount(summary_str, + "record_latency_TensorDataset/_1", 1) + self._assertSummaryHasCount(summary_str, "record_latency_MapDataset/_4", + 1) + self._assertSummaryHasCount(summary_str, + "record_latency_PrefetchDataset/_6", 1) + + def testLatencyStatsOptimizationV2(self): + aggregator = stats_aggregator.StatsAggregator() + dataset = dataset_ops.Dataset.from_tensors(1).apply( + optimization.assert_next( + ["LatencyStats", "Map", "LatencyStats", "Prefetch", + "LatencyStats"])).map(lambda x: x * x).prefetch(1) + options = dataset_ops.Options() + options.experimental_stats = stats_options.StatsOptions(aggregator) + dataset = dataset.with_options(options) + iterator = dataset.make_initializable_iterator() + get_next = iterator.get_next() + summary_t = aggregator.get_summary() with self.cached_session() as sess: sess.run(iterator.initializer) diff --git 
a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD index 66bc3833a73..2cfb5759036 100644 --- a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD +++ b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD @@ -651,6 +651,7 @@ py_test( "//tensorflow/python:array_ops", "//tensorflow/python:client_testlib", "//tensorflow/python:framework_ops", + "//tensorflow/python/data/experimental/ops:stats_aggregator", "//tensorflow/python/data/experimental/ops:stats_ops", "//tensorflow/python/data/ops:dataset_ops", ], diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/stats_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/stats_dataset_serialization_test.py index ef7061b1904..662d768b489 100644 --- a/tensorflow/python/data/experimental/kernel_tests/serialization/stats_dataset_serialization_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/serialization/stats_dataset_serialization_test.py @@ -18,6 +18,7 @@ from __future__ import division from __future__ import print_function from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base +from tensorflow.python.data.experimental.ops import stats_aggregator from tensorflow.python.data.experimental.ops import stats_ops from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import errors @@ -92,9 +93,9 @@ class StatsDatasetSerializationTest( None, num_outputs) def _build_dataset_stats_aggregator(self): - stats_aggregator = stats_ops.StatsAggregator() + aggregator = stats_aggregator.StatsAggregator() return dataset_ops.Dataset.range(10).apply( - stats_ops.set_stats_aggregator(stats_aggregator)) + stats_ops.set_stats_aggregator(aggregator)) def test_set_stats_aggregator_not_support_checkpointing(self): with self.assertRaisesRegexp(errors.UnimplementedError, 
diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py index 4d794b4b845..83028937d36 100644 --- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py @@ -17,13 +17,16 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base from tensorflow.python.data.experimental.kernel_tests import stats_dataset_test_base from tensorflow.python.data.experimental.ops import batching from tensorflow.python.data.experimental.ops import optimization +from tensorflow.python.data.experimental.ops import stats_aggregator from tensorflow.python.data.experimental.ops import stats_ops +from tensorflow.python.data.experimental.ops import stats_options from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import errors from tensorflow.python.framework import ops @@ -32,17 +35,43 @@ from tensorflow.python.ops import math_ops from tensorflow.python.platform import test +def function_set_stats_aggregator(dataset, + aggregator, + prefix="", + counter_prefix=""): + return dataset.apply( + stats_ops.set_stats_aggregator(aggregator, prefix, counter_prefix)) + + +def function_apply_options(dataset, aggregator, prefix="", counter_prefix=""): + options = dataset_ops.Options() + options.experimental_stats = stats_options.StatsOptions(aggregator) + options.experimental_stats.latency_all_edges = False + if prefix: + options.experimental_stats.prefix = prefix + if counter_prefix: + options.experimental_stats.counter_prefix = counter_prefix + return dataset.with_options(options) + + +@parameterized.named_parameters( + dict( + testcase_name="SetStatsAggregator", + 
dataset_transformation=function_set_stats_aggregator), + dict( + testcase_name="StatsOptions", + dataset_transformation=function_apply_options)) class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): - def testBytesProduced(self): - stats_aggregator = stats_ops.StatsAggregator() + def testBytesProduced(self, dataset_transformation): + aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).map( lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).apply( - stats_ops.bytes_produced_stats("bytes_produced")).apply( - stats_ops.set_stats_aggregator(stats_aggregator)) + stats_ops.bytes_produced_stats("bytes_produced")) + dataset = dataset_transformation(dataset, aggregator) iterator = dataset.make_initializable_iterator() next_element = iterator.get_next() - summary_t = stats_aggregator.get_summary() + summary_t = aggregator.get_summary() with self.cached_session() as sess: sess.run(iterator.initializer) @@ -60,14 +89,14 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): self._assertSummaryHasCount(summary_str, "bytes_produced", 100.0) self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum) - def testLatencyStats(self): - stats_aggregator = stats_ops.StatsAggregator() + def testLatencyStats(self, dataset_transformation): + aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( - stats_ops.latency_stats("record_latency")).apply( - stats_ops.set_stats_aggregator(stats_aggregator)) + stats_ops.latency_stats("record_latency")) + dataset = dataset_transformation(dataset, aggregator) iterator = dataset.make_initializable_iterator() next_element = iterator.get_next() - summary_t = stats_aggregator.get_summary() + summary_t = aggregator.get_summary() with self.cached_session() as sess: sess.run(iterator.initializer) @@ -79,14 +108,14 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): sess.run(next_element) 
self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 100.0) - def testPrefetchBufferUtilization(self): - stats_aggregator = stats_ops.StatsAggregator() + def testPrefetchBufferUtilization(self, dataset_transformation): + aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).map( - lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).prefetch( - -1).apply(stats_ops.set_stats_aggregator(stats_aggregator)) + lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).prefetch(-1) + dataset = dataset_transformation(dataset, aggregator) iterator = dataset.make_initializable_iterator() next_element = iterator.get_next() - summary_t = stats_aggregator.get_summary() + summary_t = aggregator.get_summary() with self.cached_session() as sess: sess.run(iterator.initializer) @@ -106,14 +135,14 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): self._assertSummaryHasCount(summary_str, "Prefetch::buffer_utilization", 100) - def testPrefetchBufferScalars(self): - stats_aggregator = stats_ops.StatsAggregator() + def testPrefetchBufferScalars(self, dataset_transformation): + aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(10).map( - lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).prefetch( - 0).apply(stats_ops.set_stats_aggregator(stats_aggregator)) + lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).prefetch(0) + dataset = dataset_transformation(dataset, aggregator) iterator = dataset.make_initializable_iterator() next_element = iterator.get_next() - summary_t = stats_aggregator.get_summary() + summary_t = aggregator.get_summary() with self.cached_session() as sess: sess.run(iterator.initializer) @@ -128,14 +157,14 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): with self.assertRaises(errors.OutOfRangeError): sess.run(next_element) - def testFilteredElementsStats(self): - stats_aggregator = stats_ops.StatsAggregator() + def 
testFilteredElementsStats(self, dataset_transformation): + aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(101).filter( - lambda x: math_ops.equal(math_ops.mod(x, 3), 0)).apply( - stats_ops.set_stats_aggregator(stats_aggregator)) + lambda x: math_ops.equal(math_ops.mod(x, 3), 0)) + dataset = dataset_transformation(dataset, aggregator) iterator = dataset.make_initializable_iterator() next_element = iterator.get_next() - summary_t = stats_aggregator.get_summary() + summary_t = aggregator.get_summary() with self.test_session() as sess: sess.run(iterator.initializer) @@ -153,7 +182,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): self._assertSummaryHasScalarValue( sess.run(summary_t), "Filter::filtered_elements", 34.0) - def testMapBufferUtilization(self): + def testMapBufferUtilization(self, dataset_transformation): def dataset_fn(): return dataset_ops.Dataset.range(10).map( @@ -161,9 +190,13 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): num_parallel_calls=4) self._testParallelCallsStats( - dataset_fn, "ParallelMap", 10, function_processing_time=True) + dataset_fn, + "ParallelMap", + 10, + dataset_transformation, + function_processing_time=True) - def testMapAutoTuneBufferUtilization(self): + def testMapAutoTuneBufferUtilization(self, dataset_transformation): def dataset_fn(): dataset = dataset_ops.Dataset.range(10).map( @@ -174,9 +207,13 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): return dataset.with_options(options) self._testParallelCallsStats( - dataset_fn, "ParallelMap", 10, function_processing_time=True) + dataset_fn, + "ParallelMap", + 10, + dataset_transformation, + function_processing_time=True) - def testInterleaveAutoTuneBufferUtilization(self): + def testInterleaveAutoTuneBufferUtilization(self, dataset_transformation): def dataset_fn(): dataset = dataset_ops.Dataset.range(10).map( @@ -189,9 +226,10 @@ class 
StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): options.experimental_autotune = True return dataset.with_options(options) - self._testParallelCallsStats(dataset_fn, "ParallelInterleaveV2", 10) + self._testParallelCallsStats(dataset_fn, "ParallelInterleaveV2", 10, + dataset_transformation) - def testMapAndBatchAutoTuneBufferUtilization(self): + def testMapAndBatchAutoTuneBufferUtilization(self, dataset_transformation): def dataset_fn(): dataset = dataset_ops.Dataset.range(100).apply( @@ -208,17 +246,18 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): dataset_fn, "MapAndBatch", num_output, + dataset_transformation, check_elements=False, function_processing_time=True) - def testReinitialize(self): - stats_aggregator = stats_ops.StatsAggregator() + def testReinitialize(self, dataset_transformation): + aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( - stats_ops.latency_stats("record_latency")).apply( - stats_ops.set_stats_aggregator(stats_aggregator)) + stats_ops.latency_stats("record_latency")) + dataset = dataset_transformation(dataset, aggregator) iterator = dataset.make_initializable_iterator() next_element = iterator.get_next() - summary_t = stats_aggregator.get_summary() + summary_t = aggregator.get_summary() with self.cached_session() as sess: for j in range(5): @@ -232,7 +271,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): self._assertSummaryHasCount( sess.run(summary_t), "record_latency", (j + 1) * 100.0) - def testNoAggregatorRegistered(self): + def testNoAggregatorRegistered(self, dataset_transformation): dataset = dataset_ops.Dataset.range(100).apply( stats_ops.latency_stats("record_latency")) iterator = dataset.make_initializable_iterator() @@ -245,15 +284,15 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): with self.assertRaises(errors.OutOfRangeError): sess.run(next_element) - def testMultipleTags(self): - 
stats_aggregator = stats_ops.StatsAggregator() + def testMultipleTags(self, dataset_transformation): + aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( stats_ops.latency_stats("record_latency")).apply( - stats_ops.latency_stats("record_latency_2")).apply( - stats_ops.set_stats_aggregator(stats_aggregator)) + stats_ops.latency_stats("record_latency_2")) + dataset = dataset_transformation(dataset, aggregator) iterator = dataset.make_initializable_iterator() next_element = iterator.get_next() - summary_t = stats_aggregator.get_summary() + summary_t = aggregator.get_summary() with self.cached_session() as sess: sess.run(iterator.initializer) @@ -269,15 +308,15 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): self._assertSummaryHasCount( sess.run(summary_t), "record_latency_2", 100.0) - def testRepeatedTags(self): - stats_aggregator = stats_ops.StatsAggregator() + def testRepeatedTags(self, dataset_transformation): + aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( stats_ops.latency_stats("record_latency")).apply( - stats_ops.latency_stats("record_latency")).apply( - stats_ops.set_stats_aggregator(stats_aggregator)) + stats_ops.latency_stats("record_latency")) + dataset = dataset_transformation(dataset, aggregator) iterator = dataset.make_initializable_iterator() next_element = iterator.get_next() - summary_t = stats_aggregator.get_summary() + summary_t = aggregator.get_summary() with self.cached_session() as sess: sess.run(iterator.initializer) @@ -289,15 +328,15 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): sess.run(next_element) self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 200.0) - def testMultipleIteratorsSameAggregator(self): - stats_aggregator = stats_ops.StatsAggregator() + def testMultipleIteratorsSameAggregator(self, dataset_transformation): + aggregator = stats_aggregator.StatsAggregator() 
dataset = dataset_ops.Dataset.range(100).apply( - stats_ops.latency_stats("record_latency")).apply( - stats_ops.set_stats_aggregator(stats_aggregator)) + stats_ops.latency_stats("record_latency")) + dataset = dataset_transformation(dataset, aggregator) iterator_0 = dataset.make_initializable_iterator() iterator_1 = dataset.make_initializable_iterator() next_element = iterator_0.get_next() + iterator_1.get_next() - summary_t = stats_aggregator.get_summary() + summary_t = aggregator.get_summary() with self.cached_session() as sess: sess.run([iterator_0.initializer, iterator_1.initializer]) @@ -309,18 +348,18 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): sess.run(next_element) self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 200.0) - def testMultipleDatasetWithTags(self): - stats_aggregator = stats_ops.StatsAggregator() + def testMultipleDatasetWithPrefixes(self, dataset_transformation): + aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( - stats_ops.latency_stats("record_latency")).apply( - stats_ops.set_stats_aggregator(stats_aggregator, "dataset1")) + stats_ops.latency_stats("record_latency")) + dataset = dataset_transformation(dataset, aggregator, prefix="dataset1") dataset2 = dataset_ops.Dataset.range(100).apply( - stats_ops.latency_stats("record_latency")).apply( - stats_ops.set_stats_aggregator(stats_aggregator, "dataset2")) + stats_ops.latency_stats("record_latency")) + dataset2 = dataset_transformation(dataset2, aggregator, prefix="dataset2") iterator_0 = dataset.make_initializable_iterator() iterator_1 = dataset2.make_initializable_iterator() next_element = iterator_0.get_next() + iterator_1.get_next() - summary_t = stats_aggregator.get_summary() + summary_t = aggregator.get_summary() with self.test_session() as sess: sess.run([iterator_0.initializer, iterator_1.initializer]) @@ -338,15 +377,22 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): 
sess.run(summary_t), "dataset2_record_latency", 100.0) +@parameterized.named_parameters( + dict( + testcase_name="SetStatsAggregator", + dataset_transformation=function_set_stats_aggregator), + dict( + testcase_name="StatsOptions", + dataset_transformation=function_apply_options)) class FeatureStatsDatasetTest( stats_dataset_test_base.StatsDatasetTestBase, reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase): - def testFeaturesStats(self): + def testFeaturesStats(self, dataset_transformation): num_epochs = 5 total_records = num_epochs * self._num_records batch_size = 2 - stats_aggregator = stats_ops.StatsAggregator() + aggregator = stats_aggregator.StatsAggregator() def dataset_fn(): return self.make_batch_feature( @@ -362,13 +408,17 @@ class FeatureStatsDatasetTest( num_output = total_records // batch_size + 1 self._testParallelCallsStats( - dataset_fn, "ParseExample", num_output, check_elements=False) + dataset_fn, + "ParseExample", + num_output, + dataset_transformation, + check_elements=False) - iterator = dataset_fn().apply( - stats_ops.set_stats_aggregator( - stats_aggregator, "record_stats")).make_initializable_iterator() + dataset = dataset_transformation( + dataset_fn(), aggregator, prefix="record_stats") + iterator = dataset.make_initializable_iterator() next_element = iterator.get_next() - summary_t = stats_aggregator.get_summary() + summary_t = aggregator.get_summary() with self.test_session() as sess: sess.run(iterator.initializer) diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py index a4e6242b00c..c5bf9267590 100644 --- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py +++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np from tensorflow.core.framework import summary_pb2 -from 
tensorflow.python.data.experimental.ops import stats_ops +from tensorflow.python.data.experimental.ops import stats_aggregator from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.framework import errors @@ -87,14 +87,15 @@ class StatsDatasetTestBase(test_base.DatasetTestBase): dataset_fn, dataset_name, num_output, + dataset_transformation, function_processing_time=False, check_elements=True): - stats_aggregator = stats_ops.StatsAggregator() - dataset = dataset_fn().apply( - stats_ops.set_stats_aggregator(stats_aggregator)) + aggregator = stats_aggregator.StatsAggregator() + dataset = dataset_fn() + dataset = dataset_transformation(dataset, aggregator) iterator = dataset.make_initializable_iterator() next_element = iterator.get_next() - summary_t = stats_aggregator.get_summary() + summary_t = aggregator.get_summary() with self.cached_session() as sess: sess.run(iterator.initializer) diff --git a/tensorflow/python/data/experimental/ops/BUILD b/tensorflow/python/data/experimental/ops/BUILD index eda547c37af..170fda90b68 100644 --- a/tensorflow/python/data/experimental/ops/BUILD +++ b/tensorflow/python/data/experimental/ops/BUILD @@ -272,6 +272,16 @@ py_library( ], ) +py_library( + name = "stats_aggregator", + srcs = ["stats_aggregator.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/python:dataset_ops_gen", + "//tensorflow/python:util", + ], +) + py_library( name = "stats_ops", srcs = ["stats_ops.py"], @@ -287,6 +297,15 @@ py_library( ], ) +py_library( + name = "stats_options", + srcs = ["stats_options.py"], + srcs_version = "PY2AND3", + deps = [ + ":stats_aggregator", + ], +) + py_library( name = "threadpool", srcs = ["threadpool.py"], diff --git a/tensorflow/python/data/experimental/ops/stats_aggregator.py b/tensorflow/python/data/experimental/ops/stats_aggregator.py new file mode 100644 index 00000000000..5274c816a49 --- /dev/null +++ b/tensorflow/python/data/experimental/ops/stats_aggregator.py @@ -0,0 +1,84 @@ +# Copyright 
2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""StatsAggregator for aggregating statistics from `tf.data` pipelines.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.ops import gen_dataset_ops +from tensorflow.python.util.tf_export import tf_export + + +@tf_export("data.experimental.StatsAggregator") +class StatsAggregator(object): + """A stateful resource that aggregates statistics from one or more iterators. + + To record statistics, use one of the custom transformation functions defined + in this module when defining your `tf.data.Dataset`. All statistics will be + aggregated by the `StatsAggregator` that is associated with a particular + iterator (see below). For example, to record the latency of producing each + element by iterating over a dataset: + + ```python + dataset = ... + dataset = dataset.apply(tf.data.experimental.latency_stats("total_bytes")) + ``` + + To associate a `StatsAggregator` with a `tf.data.Dataset` object, use + the following pattern: + + ```python + aggregator = tf.data.experimental.StatsAggregator() + dataset = ... + + # Apply `StatsOptions` to associate `dataset` with `aggregator`. 
+ options = dataset_ops.Options() + options.experimental_stats = tf.data.experimental.StatsOptions(aggregator) + dataset = dataset.with_options(options) + iterator = dataset.make_one_shot_iterator() + ``` + + To get a protocol buffer summary of the currently aggregated statistics, + use the `StatsAggregator.get_summary()` tensor. The easiest way to do this + is to add the returned tensor to the `tf.GraphKeys.SUMMARIES` collection, + so that the summaries will be included with any existing summaries. + + ```python + aggregator = tf.data.experimental.StatsAggregator() + # ... + stats_summary = aggregator.get_summary() + tf.add_to_collection(tf.GraphKeys.SUMMARIES, stats_summary) + ``` + + Note: This interface is experimental and expected to change. In particular, + we expect to add other implementations of `StatsAggregator` that provide + different ways of exporting statistics, and add more types of statistics. + """ + + def __init__(self): + """Creates a `StatsAggregator`.""" + self._resource = gen_dataset_ops.stats_aggregator_handle() + + # TODO(b/116314787): Update this/add support for V2 summary API. + def get_summary(self): + """Returns a string `tf.Tensor` that summarizes the aggregated statistics. + + The returned tensor will contain a serialized `tf.summary.Summary` protocol + buffer, which can be used with the standard TensorBoard logging facilities. + + Returns: + A scalar string `tf.Tensor` that summarizes the aggregated statistics. 
+ """ + return gen_dataset_ops.stats_aggregator_summary(self._resource) diff --git a/tensorflow/python/data/experimental/ops/stats_ops.py b/tensorflow/python/data/experimental/ops/stats_ops.py index fb93b86b291..ca2f5f2a887 100644 --- a/tensorflow/python/data/experimental/ops/stats_ops.py +++ b/tensorflow/python/data/experimental/ops/stats_ops.py @@ -21,110 +21,18 @@ from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import gen_dataset_ops +from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import tf_export -@tf_export("data.experimental.StatsAggregator") -class StatsAggregator(object): - """A stateful resource that aggregates statistics from one or more iterators. - - To record statistics, use one of the custom transformation functions defined - in this module when defining your `tf.data.Dataset`. All statistics will be - aggregated by the `StatsAggregator` that is associated with a particular - iterator (see below). For example, to record the latency of producing each - element by iterating over a dataset: - - ```python - dataset = ... - dataset = dataset.apply(tf.data.experimental.latency_stats("total_bytes")) - ``` - - To associate a `StatsAggregator` with a `tf.data.Dataset` object, use - the following pattern: - - ```python - stats_aggregator = stats_ops.StatsAggregator() - dataset = ... - - # Apply `set_stats_aggregator` to associate `dataset` with `stats_aggregator`. - dataset = dataset.apply( - tf.data.experimental.set_stats_aggregator(stats_aggregator)) - iterator = dataset.make_one_shot_iterator() - ``` - - To get a protocol buffer summary of the currently aggregated statistics, - use the `StatsAggregator.get_summary()` tensor. The easiest way to do this - is to add the returned tensor to the `tf.GraphKeys.SUMMARIES` collection, - so that the summaries will be included with any existing summaries. 
- - ```python - stats_aggregator = stats_ops.StatsAggregator() - # ... - stats_summary = stats_aggregator.get_summary() - tf.add_to_collection(tf.GraphKeys.SUMMARIES, stats_summary) - ``` - - Note: This interface is experimental and expected to change. In particular, - we expect to add other implementations of `StatsAggregator` that provide - different ways of exporting statistics, and add more types of statistics. - """ - - def __init__(self): - """Creates a `StatsAggregator`.""" - self._resource = gen_dataset_ops.stats_aggregator_handle() - - # TODO(b/116314787): Update this/add support for V2 summary API. - def get_summary(self): - """Returns a string `tf.Tensor` that summarizes the aggregated statistics. - - The returned tensor will contain a serialized `tf.summary.Summary` protocol - buffer, which can be used with the standard TensorBoard logging facilities. - - Returns: - A scalar string `tf.Tensor` that summarizes the aggregated statistics. - """ - return gen_dataset_ops.stats_aggregator_summary(self._resource) - - -class _SetStatsAggregatorDataset(dataset_ops.UnaryDataset): - """A `Dataset` that acts as an identity, and sets given stats_aggregator.""" - - def __init__(self, input_dataset, stats_aggregator, tag, prefix): - super(_SetStatsAggregatorDataset, self).__init__(input_dataset) - self._input_dataset = input_dataset - self._stats_aggregator = stats_aggregator - self._tag = tag - self._prefix = prefix - - def _as_variant_tensor(self): - return gen_dataset_ops.set_stats_aggregator_dataset( - self._input_dataset._as_variant_tensor(), # pylint: disable=protected-access - self._stats_aggregator._resource, # pylint: disable=protected-access - self._tag, - self._prefix, - **dataset_ops.flat_structure(self)) - - @property - def output_shapes(self): - return self._input_dataset.output_shapes - - @property - def output_types(self): - return self._input_dataset.output_types - - @property - def output_classes(self): - return self._input_dataset.output_classes - - 
-@tf_export("data.experimental.set_stats_aggregator") -def set_stats_aggregator(stats_aggregator, tag="", counter_prefix=""): +@deprecation.deprecated(None, "Use `tf.data.experimental.StatsOptions`.") +def set_stats_aggregator(stats_aggregator, prefix="", counter_prefix=""): """Set the given `stats_aggregator` for aggregating the input dataset stats. Args: stats_aggregator: A `tf.contrib.data.StatsAggregator` object. - tag: (Optional) String, all statistics recorded for the input `dataset` - will have given `tag` prepend with the name. + prefix: (Optional) String, all statistics recorded for the input `dataset` + will have given `prefix` prepend with the name. counter_prefix: (Optional) String, all statistics recorded as `counters` will have the given `prefix` for the counter. Defaults to "/tensorflow". @@ -134,8 +42,8 @@ def set_stats_aggregator(stats_aggregator, tag="", counter_prefix=""): """ def _apply_fn(dataset): - return _SetStatsAggregatorDataset(dataset, stats_aggregator, tag, - counter_prefix) + return dataset_ops._SetStatsAggregatorDataset( # pylint: disable=protected-access + dataset, stats_aggregator, prefix, counter_prefix) return _apply_fn diff --git a/tensorflow/python/data/experimental/ops/stats_options.py b/tensorflow/python/data/experimental/ops/stats_options.py new file mode 100644 index 00000000000..c088d3d8881 --- /dev/null +++ b/tensorflow/python/data/experimental/ops/stats_options.py @@ -0,0 +1,103 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""StatsOptions to configure stats aggregation options for `tf.data` pipelines. + +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.data.experimental.ops import stats_aggregator +from tensorflow.python.util.tf_export import tf_export + + +@tf_export("data.experimental.StatsOptions") +class StatsOptions(object): + """Represents options for collecting dataset stats using `StatsAggregator`. + + To apply `StatsOptions` with a `tf.data.Dataset` object, use the following + pattern: + + ```python + aggretator = tf.data.experimental.StatsAggregator() + + options = dataset_ops.Options() + options.experimental_stats = tf.data.experimental.StatsOptions() + options.experimental_stats.aggregator = aggregator + dataset = dataset.with_options(options) + + iterator = dataset.make_one_shot_iterator() + ``` + + Note: a `StatsAggregator` object can be attached either duing construction or + can be provided later like in above example. + + ```python + aggretator = tf.data.experimental.StatsAggregator() + # attach aggregator during construction + options.experimental_stats = tf.data.experimental.StatsOptions(aggregator) + ..... + ``` + """ + + for _name, _ty, _default, _docstring in [ + ("aggregator", stats_aggregator.StatsAggregator, None, + "Associate the given statistics options with the dataset pipeline."), + ("prefix", str, "", + "Prefix to prepend all statistics recorded for the input `dataset` with." 
+ ), + ("counter_prefix", str, "", + "Prefix for the statistics recorded as counter."), + ("latency_all_edges", bool, True, + "Whether to add latency measurements on all edges."), + ]: + + def _make_getter(name): # pylint: disable=no-self-argument + + def getter(self): + return getattr(self, "_" + name) + + return getter + + def _make_setter(name, ty): # pylint: disable=no-self-argument + + def setter(self, value): + if not isinstance(value, ty): + raise TypeError( + "Attempting to set the option %s to incompatible value: %r when " + "it expects %r" % (name, value, ty)) + setattr(self, "_" + name, value) + + return setter + + vars()["_" + _name] = _default + vars()[_name] = property( + _make_getter(_name), _make_setter(_name, _ty), _default, _docstring) + + def __init__(self, aggregator=None): + if aggregator: + self.aggregator = aggregator + + def __eq__(self, other): + if isinstance(other, self.__class__): + return self.__dict__ == other.__dict__ + else: + return False + + def __ne__(self, other): + return not self.__eq__(other) + + def __str__(self): + return str(self.__dict__) diff --git a/tensorflow/python/data/ops/BUILD b/tensorflow/python/data/ops/BUILD index 5e636965a66..18edc0872d7 100644 --- a/tensorflow/python/data/ops/BUILD +++ b/tensorflow/python/data/ops/BUILD @@ -25,6 +25,7 @@ py_library( "//tensorflow/python:tensor_shape", "//tensorflow/python:tensor_util", "//tensorflow/python:util", + "//tensorflow/python/data/experimental/ops:stats_options", "//tensorflow/python/data/util:nest", "//tensorflow/python/data/util:random_seed", "//tensorflow/python/data/util:sparse", diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py index e4b5da64032..59389a24f73 100644 --- a/tensorflow/python/data/ops/dataset_ops.py +++ b/tensorflow/python/data/ops/dataset_ops.py @@ -25,6 +25,7 @@ import numpy as np import six from tensorflow.python.compat import compat +from tensorflow.python.data.experimental.ops import stats_options 
from tensorflow.python.data.ops import iterator_ops from tensorflow.python.data.util import nest from tensorflow.python.data.util import random_seed @@ -101,6 +102,8 @@ class Dataset(object): return options def _apply_options(self): + """Apply options, such as optimization configuration, to the dataset.""" + dataset = self options = self.options() static_optimizations = options._static_optimizations() # pylint: disable=protected-access @@ -108,6 +111,11 @@ class Dataset(object): dataset = _OptimizeDataset(dataset, static_optimizations) if options.experimental_autotune is not False: dataset = _ModelDataset(dataset) + if options.experimental_stats and options.experimental_stats.aggregator: # pylint: disable=line-too-long + dataset = _SetStatsAggregatorDataset( # pylint: disable=protected-access + dataset, options.experimental_stats.aggregator, + options.experimental_stats.prefix, + options.experimental_stats.counter_prefix) return dataset def make_initializable_iterator(self, shared_name=None): @@ -1411,8 +1419,8 @@ class Options(object): ("experimental_hoist_random_uniform", bool, "Whether to hoist `tf.random_uniform()` ops out of map transformations." 
), - ("experimental_latency_all_edges", bool, - "Whether to add latency measurements on all edges."), + ("experimental_stats", stats_options.StatsOptions, + "Associate the given statistics options with the dataset pipeline."), ("experimental_map_and_batch_fusion", bool, "Whether to fuse map and batch transformations."), ("experimental_map_and_filter_fusion", bool, @@ -1442,8 +1450,8 @@ class Options(object): def setter(self, value): if not isinstance(value, ty): raise TypeError( - "Attempting to set the option %s to incompatible value: %r" % - (name, value)) + "Attempting to set the option %s to incompatible value: %r when " + "it expects %r" % (name, value, ty)) setattr(self, "_" + name, value) return setter @@ -1467,10 +1475,15 @@ class Options(object): def _static_optimizations(self): """Produces the list of enabled static optimizations.""" experimental_optimizations = [ - "filter_fusion", "hoist_random_uniform", "latency_all_edges", - "map_and_batch_fusion", "map_and_filter_fusion", "map_fusion", - "map_parallelization", "map_vectorization", "noop_elimination", - "shuffle_and_repeat_fusion" + "filter_fusion", + "hoist_random_uniform", + "map_and_batch_fusion", + "map_and_filter_fusion", + "map_fusion", + "map_parallelization", + "map_vectorization", + "noop_elimination", + "shuffle_and_repeat_fusion", ] result = [] for exp_opt in experimental_optimizations: @@ -1481,6 +1494,10 @@ class Options(object): result.append("make_numa_aware") if getattr(self, "experimental_deterministic") is False: result.append("make_sloppy") + experimental_stats_options = getattr(self, "experimental_stats") + if experimental_stats_options and getattr(experimental_stats_options, + "latency_all_edges"): + result.append("latency_all_edges") return result def merge(self, options): @@ -1506,7 +1523,6 @@ class Options(object): "experimental_deterministic", "experimental_filter_fusion", "experimental_hoist_random_uniform", - "experimental_latency_all_edges", 
"experimental_map_and_batch_fusion", "experimental_map_and_filter_fusion", "experimental_map_fusion", @@ -1515,6 +1531,7 @@ class Options(object): "experimental_noop_elimination", "experimental_numa_aware", "experimental_shuffle_and_repeat_fusion", + "experimental_stats", ]: this = getattr(result, name) that = getattr(other, name) @@ -3068,3 +3085,34 @@ class _OptimizeDataset(UnaryDataset): @property def output_types(self): return self._input_dataset.output_types + + +class _SetStatsAggregatorDataset(UnaryDataset): + """A `Dataset` that acts as an identity, and sets stats aggregator.""" + + def __init__(self, input_dataset, aggregator, prefix, counter_prefix): + super(_SetStatsAggregatorDataset, self).__init__(input_dataset) + self._input_dataset = input_dataset + self._stats_aggregator = aggregator + self._prefix = prefix + self._counter_prefix = counter_prefix + + def _as_variant_tensor(self): + return gen_dataset_ops.set_stats_aggregator_dataset( + self._input_dataset._as_variant_tensor(), # pylint: disable=protected-access + self._stats_aggregator._resource, # pylint: disable=protected-access + self._prefix, + self._counter_prefix, + **flat_structure(self)) + + @property + def output_shapes(self): + return self._input_dataset.output_shapes + + @property + def output_types(self): + return self._input_dataset.output_types + + @property + def output_classes(self): + return self._input_dataset.output_classes diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt index 9f4de74c393..9d032d43de1 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt @@ -18,10 +18,6 @@ tf_class { name: "experimental_hoist_random_uniform" mtype: "" } - member { - name: "experimental_latency_all_edges" - mtype: "" - } member { name: "experimental_map_and_batch_fusion" mtype: "" @@ -54,6 +50,10 @@ tf_class { name: 
"experimental_shuffle_and_repeat_fusion" mtype: "" } + member { + name: "experimental_stats" + mtype: "" + } member_method { name: "__init__" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-aggregator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-aggregator.pbtxt index 0bcc8cf3e87..6536a698b50 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-aggregator.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-aggregator.pbtxt @@ -1,6 +1,6 @@ path: "tensorflow.data.experimental.StatsAggregator" tf_class { - is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-options.pbtxt new file mode 100644 index 00000000000..f423eed42cc --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-options.pbtxt @@ -0,0 +1,25 @@ +path: "tensorflow.data.experimental.StatsOptions" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "aggregator" + mtype: "" + } + member { + name: "counter_prefix" + mtype: "" + } + member { + name: "latency_all_edges" + mtype: "" + } + member { + name: "prefix" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'aggregator\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt index 116684e5d81..4c253bb8adf 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt @@ -32,6 +32,10 @@ tf_module { name: "StatsAggregator" mtype: "" } + member { + name: "StatsOptions" + 
mtype: "" + } member { name: "TFRecordWriter" mtype: "" @@ -124,10 +128,6 @@ tf_module { name: "scan" argspec: "args=[\'initial_state\', \'scan_func\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "set_stats_aggregator" - argspec: "args=[\'stats_aggregator\', \'tag\', \'counter_prefix\'], varargs=None, keywords=None, defaults=[\'\', \'\'], " - } member_method { name: "shuffle_and_repeat" argspec: "args=[\'buffer_size\', \'count\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt index 9f4de74c393..9d032d43de1 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt @@ -18,10 +18,6 @@ tf_class { name: "experimental_hoist_random_uniform" mtype: "" } - member { - name: "experimental_latency_all_edges" - mtype: "" - } member { name: "experimental_map_and_batch_fusion" mtype: "" @@ -54,6 +50,10 @@ tf_class { name: "experimental_shuffle_and_repeat_fusion" mtype: "" } + member { + name: "experimental_stats" + mtype: "" + } member_method { name: "__init__" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-aggregator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-aggregator.pbtxt index 0bcc8cf3e87..6536a698b50 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-aggregator.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-aggregator.pbtxt @@ -1,6 +1,6 @@ path: "tensorflow.data.experimental.StatsAggregator" tf_class { - is_instance: "" + is_instance: "" is_instance: "" member_method { name: "__init__" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-options.pbtxt 
b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-options.pbtxt new file mode 100644 index 00000000000..f423eed42cc --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-options.pbtxt @@ -0,0 +1,25 @@ +path: "tensorflow.data.experimental.StatsOptions" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "aggregator" + mtype: "" + } + member { + name: "counter_prefix" + mtype: "" + } + member { + name: "latency_all_edges" + mtype: "" + } + member { + name: "prefix" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'aggregator\'], varargs=None, keywords=None, defaults=[\'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt index 116684e5d81..4c253bb8adf 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt @@ -32,6 +32,10 @@ tf_module { name: "StatsAggregator" mtype: "" } + member { + name: "StatsOptions" + mtype: "" + } member { name: "TFRecordWriter" mtype: "" @@ -124,10 +128,6 @@ tf_module { name: "scan" argspec: "args=[\'initial_state\', \'scan_func\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "set_stats_aggregator" - argspec: "args=[\'stats_aggregator\', \'tag\', \'counter_prefix\'], varargs=None, keywords=None, defaults=[\'\', \'\'], " - } member_method { name: "shuffle_and_repeat" argspec: "args=[\'buffer_size\', \'count\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " From 0359d0dd15980a8f564ea045783b802e84bf3c56 Mon Sep 17 00:00:00 2001 From: Nick Felt Date: Mon, 5 Nov 2018 23:01:16 -0800 Subject: [PATCH 168/540] Consolidate summary V1 python op definitions and tests With this change, tensorflow/python/summary/summary.py contains all the existing python op definitions. 
Tests remain split with summary_test.py in the same directory testing the API and separate (newly v1-named) tests in tensorflow/python/kernel_tests to test the serialized proto formats. PiperOrigin-RevId: 220230494 --- tensorflow/contrib/compiler/xla_test.py | 3 +- tensorflow/python/BUILD | 16 --- tensorflow/python/kernel_tests/BUILD | 18 +-- ...op_test.py => summary_v1_audio_op_test.py} | 4 +- ...op_test.py => summary_v1_image_op_test.py} | 4 +- ...ary_ops_test.py => summary_v1_ops_test.py} | 18 ++- ...p_test.py => summary_v1_tensor_op_test.py} | 32 ++--- tensorflow/python/ops/logging_ops.py | 2 + tensorflow/python/ops/summary_ops.py | 87 -------------- tensorflow/python/summary/summary.py | 109 ++++++++++++++++-- tensorflow/python/summary/summary_test.py | 30 ++++- tensorflow/python/summary/text_summary.py | 73 ------------ .../python/summary/text_summary_test.py | 53 --------- 13 files changed, 167 insertions(+), 282 deletions(-) rename tensorflow/python/kernel_tests/{summary_audio_op_test.py => summary_v1_audio_op_test.py} (96%) rename tensorflow/python/kernel_tests/{summary_image_op_test.py => summary_v1_image_op_test.py} (98%) rename tensorflow/python/kernel_tests/{summary_ops_test.py => summary_v1_ops_test.py} (89%) rename tensorflow/python/kernel_tests/{summary_tensor_op_test.py => summary_v1_tensor_op_test.py} (86%) delete mode 100644 tensorflow/python/ops/summary_ops.py delete mode 100644 tensorflow/python/summary/text_summary.py delete mode 100644 tensorflow/python/summary/text_summary_test.py diff --git a/tensorflow/contrib/compiler/xla_test.py b/tensorflow/contrib/compiler/xla_test.py index 8d13dc7316a..3b49755afcf 100644 --- a/tensorflow/contrib/compiler/xla_test.py +++ b/tensorflow/contrib/compiler/xla_test.py @@ -28,7 +28,6 @@ from tensorflow.python.ops import control_flow_util from tensorflow.python.ops import logging_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import state_ops -from tensorflow.python.ops import 
summary_ops from tensorflow.python.ops import variable_scope from tensorflow.python.platform import test @@ -49,7 +48,7 @@ class XLACompileContextTest(test.TestCase): histogram_summary = summary.histogram('histogram_summary', dummy_tensor) image_summary = summary.image('image_summary', dummy_tensor) scalar_summary = summary.scalar('scalar_summary', dummy_tensor) - tensor_summary = summary_ops.tensor_summary('tensor_summary', dummy_tensor) + tensor_summary = summary.tensor_summary('tensor_summary', dummy_tensor) summary.merge( [ audio_summary, histogram_summary, image_summary, scalar_summary, diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 0d06c49f7c7..7ff358cb088 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -3000,18 +3000,6 @@ py_library( ], ) -py_library( - name = "summary_ops", - srcs = ["ops/summary_ops.py"], - srcs_version = "PY2AND3", - deps = [ - ":framework", - ":framework_for_generated_wrappers", - ":logging_ops_gen", - ":summary_op_util", - ], -) - py_library( name = "summary_ops_v2", srcs = ["ops/summary_ops_v2.py"], @@ -5012,7 +5000,6 @@ py_library( deps = [ ":client", ":constant_op", - ":errors", ":framework", ":framework_for_generated_wrappers", ":lib", @@ -5021,12 +5008,10 @@ py_library( ":protos_all_py", ":pywrap_tensorflow", ":summary_op_util", - ":summary_ops", ":summary_ops_gen", ":summary_ops_v2", ":util", "//tensorflow/python/eager:context", - "//third_party/py/numpy", "@six_archive//:six", ], ) @@ -5037,7 +5022,6 @@ py_tests( srcs = [ "summary/plugin_asset_test.py", "summary/summary_test.py", - "summary/text_summary_test.py", "summary/writer/writer_test.py", ], additional_deps = [ diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index e6508fde0f6..f0b677cd0ec 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -1053,9 +1053,9 @@ tf_py_test( ) tf_py_test( - name = "summary_ops_test", + name = "summary_v1_ops_test", 
size = "small", - srcs = ["summary_ops_test.py"], + srcs = ["summary_v1_ops_test.py"], additional_deps = [ "//tensorflow/core:protos_all_py", "//tensorflow/python:client_testlib", @@ -1066,9 +1066,9 @@ tf_py_test( ) tf_py_test( - name = "summary_tensor_op_test", + name = "summary_v1_tensor_op_test", size = "small", - srcs = ["summary_tensor_op_test.py"], + srcs = ["summary_v1_tensor_op_test.py"], additional_deps = [ "//third_party/py/numpy", "@six_archive//:six", @@ -1077,7 +1077,7 @@ tf_py_test( "//tensorflow/python:client_testlib", "//tensorflow/python:framework", "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:summary_ops", + "//tensorflow/python:summary", ], ) @@ -2321,9 +2321,9 @@ cuda_py_test( ) cuda_py_test( - name = "summary_audio_op_test", + name = "summary_v1_audio_op_test", size = "small", - srcs = ["summary_audio_op_test.py"], + srcs = ["summary_v1_audio_op_test.py"], additional_deps = [ "//third_party/py/numpy", "//tensorflow/core:protos_all_py", @@ -2334,9 +2334,9 @@ cuda_py_test( ) cuda_py_test( - name = "summary_image_op_test", + name = "summary_v1_image_op_test", size = "small", - srcs = ["summary_image_op_test.py"], + srcs = ["summary_v1_image_op_test.py"], additional_deps = [ "//third_party/py/numpy", "//tensorflow/core:protos_all_py", diff --git a/tensorflow/python/kernel_tests/summary_audio_op_test.py b/tensorflow/python/kernel_tests/summary_v1_audio_op_test.py similarity index 96% rename from tensorflow/python/kernel_tests/summary_audio_op_test.py rename to tensorflow/python/kernel_tests/summary_v1_audio_op_test.py index e59a2ceef7e..63ce77b9d55 100644 --- a/tensorflow/python/kernel_tests/summary_audio_op_test.py +++ b/tensorflow/python/kernel_tests/summary_v1_audio_op_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Tests for summary sound op.""" +"""Tests for summary V1 audio op.""" from __future__ import absolute_import from __future__ import division @@ -27,7 +27,7 @@ from tensorflow.python.platform import test from tensorflow.python.summary import summary -class SummaryAudioOpTest(test.TestCase): +class SummaryV1AudioOpTest(test.TestCase): def _AsSummary(self, s): summ = summary_pb2.Summary() diff --git a/tensorflow/python/kernel_tests/summary_image_op_test.py b/tensorflow/python/kernel_tests/summary_v1_image_op_test.py similarity index 98% rename from tensorflow/python/kernel_tests/summary_image_op_test.py rename to tensorflow/python/kernel_tests/summary_v1_image_op_test.py index b650e104042..094606944ff 100644 --- a/tensorflow/python/kernel_tests/summary_image_op_test.py +++ b/tensorflow/python/kernel_tests/summary_v1_image_op_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Tests for summary image op.""" +"""Tests for summary V1 image op.""" from __future__ import absolute_import from __future__ import division @@ -30,7 +30,7 @@ from tensorflow.python.platform import test from tensorflow.python.summary import summary -class SummaryImageOpTest(test.TestCase): +class SummaryV1ImageOpTest(test.TestCase): def _AsSummary(self, s): summ = summary_pb2.Summary() diff --git a/tensorflow/python/kernel_tests/summary_ops_test.py b/tensorflow/python/kernel_tests/summary_v1_ops_test.py similarity index 89% rename from tensorflow/python/kernel_tests/summary_ops_test.py rename to tensorflow/python/kernel_tests/summary_v1_ops_test.py index 0c500120b0b..6c4e106b118 100644 --- a/tensorflow/python/kernel_tests/summary_ops_test.py +++ b/tensorflow/python/kernel_tests/summary_v1_ops_test.py @@ -12,21 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Tests for summary ops.""" +"""Tests for the actual serialized proto output of the V1 tf.summary ops. + +The tensor, audio, and image ops have dedicated tests in adjacent files. The +overall tf.summary API surface also has its own tests in summary_test.py that +check calling the API methods but not the exact serialized proto output. 
+""" + from __future__ import absolute_import from __future__ import division from __future__ import print_function from tensorflow.core.framework import summary_pb2 from tensorflow.python.framework import constant_op -from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import logging_ops from tensorflow.python.platform import test from tensorflow.python.summary import summary -class SummaryOpsTest(test.TestCase): +class SummaryV1OpsTest(test.TestCase): def _AsSummary(self, s): summ = summary_pb2.Summary() @@ -100,13 +105,6 @@ class SummaryOpsTest(test.TestCase): self.assertEqual(summ2, merge.op.inputs[0]) self.assertTrue(summary.merge_all("bar_key") is None) - def testHistogramSummaryTypes(self): - with ops.Graph().as_default(): - for dtype in (dtypes.int8, dtypes.uint8, dtypes.int16, dtypes.int32, - dtypes.float32, dtypes.float64): - const = constant_op.constant(10, dtype=dtype) - summary.histogram("h", const) - if __name__ == "__main__": test.main() diff --git a/tensorflow/python/kernel_tests/summary_tensor_op_test.py b/tensorflow/python/kernel_tests/summary_v1_tensor_op_test.py similarity index 86% rename from tensorflow/python/kernel_tests/summary_tensor_op_test.py rename to tensorflow/python/kernel_tests/summary_v1_tensor_op_test.py index 0f4643393a1..34f771679ae 100644 --- a/tensorflow/python/kernel_tests/summary_tensor_op_test.py +++ b/tensorflow/python/kernel_tests/summary_v1_tensor_op_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Tests for summary ops.""" +"""Tests for summary V1 tensor op.""" from __future__ import absolute_import from __future__ import division @@ -26,11 +26,11 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops -from tensorflow.python.ops import summary_ops from tensorflow.python.platform import test +from tensorflow.python.summary import summary as summary_lib -class SummaryOpsTest(test.TestCase): +class SummaryV1TensorOpTest(test.TestCase): def _SummarySingleValue(self, s): summ = summary_pb2.Summary() @@ -44,12 +44,12 @@ class SummaryOpsTest(test.TestCase): def testTags(self): with self.cached_session() as sess: c = constant_op.constant(1) - s1 = summary_ops.tensor_summary("s1", c) + s1 = summary_lib.tensor_summary("s1", c) with ops.name_scope("foo"): - s2 = summary_ops.tensor_summary("s2", c) + s2 = summary_lib.tensor_summary("s2", c) with ops.name_scope("zod"): - s3 = summary_ops.tensor_summary("s3", c) - s4 = summary_ops.tensor_summary("TensorSummary", c) + s3 = summary_lib.tensor_summary("s3", c) + s4 = summary_lib.tensor_summary("TensorSummary", c) summ1, summ2, summ3, summ4 = sess.run([s1, s2, s3, s4]) v1 = self._SummarySingleValue(summ1) @@ -67,7 +67,7 @@ class SummaryOpsTest(test.TestCase): def testScalarSummary(self): with self.cached_session() as sess: const = constant_op.constant(10.0) - summ = summary_ops.tensor_summary("foo", const) + summ = summary_lib.tensor_summary("foo", const) result = sess.run(summ) value = self._SummarySingleValue(result) @@ -78,7 +78,7 @@ class SummaryOpsTest(test.TestCase): s = six.b("foobar") with self.cached_session() as sess: const = constant_op.constant(s) - summ = summary_ops.tensor_summary("foo", const) + summ = summary_lib.tensor_summary("foo", const) result = sess.run(summ) value = 
self._SummarySingleValue(result) @@ -88,7 +88,7 @@ class SummaryOpsTest(test.TestCase): def testManyScalarSummary(self): with self.cached_session() as sess: const = array_ops.ones([5, 5, 5]) - summ = summary_ops.tensor_summary("foo", const) + summ = summary_lib.tensor_summary("foo", const) result = sess.run(summ) value = self._SummarySingleValue(result) n = tensor_util.MakeNdarray(value.tensor) @@ -98,7 +98,7 @@ class SummaryOpsTest(test.TestCase): strings = [[six.b("foo bar"), six.b("baz")], [six.b("zoink"), six.b("zod")]] with self.cached_session() as sess: const = constant_op.constant(strings) - summ = summary_ops.tensor_summary("foo", const) + summ = summary_lib.tensor_summary("foo", const) result = sess.run(summ) value = self._SummarySingleValue(result) n = tensor_util.MakeNdarray(value.tensor) @@ -108,7 +108,7 @@ class SummaryOpsTest(test.TestCase): bools = [True, True, True, False, False, False] with self.cached_session() as sess: const = constant_op.constant(bools) - summ = summary_ops.tensor_summary("foo", const) + summ = summary_lib.tensor_summary("foo", const) result = sess.run(summ) value = self._SummarySingleValue(result) @@ -126,14 +126,14 @@ class SummaryOpsTest(test.TestCase): const = constant_op.constant(1) # Default case; no description or display name - simple_summary = summary_ops.tensor_summary("simple", const) + simple_summary = summary_lib.tensor_summary("simple", const) descr = get_description(simple_summary) self.assertEqual(descr.display_name, "") self.assertEqual(descr.summary_description, "") # Values are provided via function args - with_values = summary_ops.tensor_summary( + with_values = summary_lib.tensor_summary( "simple", const, display_name="my name", @@ -148,14 +148,14 @@ class SummaryOpsTest(test.TestCase): metadata.display_name = "my name" metadata.summary_description = "my description" - with_metadata = summary_ops.tensor_summary( + with_metadata = summary_lib.tensor_summary( "simple", const, summary_metadata=metadata) descr = 
get_description(with_metadata) self.assertEqual(descr.display_name, "my name") self.assertEqual(descr.summary_description, "my description") # If both SummaryMetadata and explicit args are provided, the args win - overwrite = summary_ops.tensor_summary( + overwrite = summary_lib.tensor_summary( "simple", const, summary_metadata=metadata, diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py index fd532a9be2d..5a948a21946 100644 --- a/tensorflow/python/ops/logging_ops.py +++ b/tensorflow/python/ops/logging_ops.py @@ -629,4 +629,6 @@ ops.NotDifferentiable("AudioSummary") ops.NotDifferentiable("AudioSummaryV2") ops.NotDifferentiable("MergeSummary") ops.NotDifferentiable("ScalarSummary") +ops.NotDifferentiable("TensorSummary") +ops.NotDifferentiable("TensorSummaryV2") ops.NotDifferentiable("Timestamp") diff --git a/tensorflow/python/ops/summary_ops.py b/tensorflow/python/ops/summary_ops.py deleted file mode 100644 index ec4d4a6e924..00000000000 --- a/tensorflow/python/ops/summary_ops.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Summary Operations.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.core.framework import summary_pb2 -from tensorflow.python.framework import constant_op -from tensorflow.python.framework import ops -from tensorflow.python.ops import gen_logging_ops -from tensorflow.python.ops import summary_op_util -# go/tf-wildcard-import -# pylint: disable=wildcard-import -from tensorflow.python.ops.gen_logging_ops import * -from tensorflow.python.util.tf_export import tf_export -# pylint: enable=wildcard-import - - -@tf_export("summary.tensor_summary") -def tensor_summary(name, - tensor, - summary_description=None, - collections=None, - summary_metadata=None, - family=None, - display_name=None): - """Outputs a `Summary` protocol buffer with a serialized tensor.proto. - - Args: - name: A name for the generated node. If display_name is not set, it will - also serve as the tag name in TensorBoard. (In that case, the tag - name will inherit tf name scopes.) - tensor: A tensor of any type and shape to serialize. - summary_description: A long description of the summary sequence. Markdown - is supported. - collections: Optional list of graph collections keys. The new summary op is - added to these collections. Defaults to `[GraphKeys.SUMMARIES]`. - summary_metadata: Optional SummaryMetadata proto (which describes which - plugins may use the summary value). - family: Optional; if provided, used as the prefix of the summary tag, - which controls the name used for display on TensorBoard when - display_name is not set. - display_name: A string used to name this data in TensorBoard. If this is - not set, then the node name will be used instead. - - Returns: - A scalar `Tensor` of type `string`. The serialized `Summary` protocol - buffer. 
- """ - - if summary_metadata is None: - summary_metadata = summary_pb2.SummaryMetadata() - - if summary_description is not None: - summary_metadata.summary_description = summary_description - - if display_name is not None: - summary_metadata.display_name = display_name - - serialized_summary_metadata = summary_metadata.SerializeToString() - - if summary_op_util.skip_summary(): - return constant_op.constant("") - with summary_op_util.summary_scope( - name, family, values=[tensor]) as (tag, scope): - val = gen_logging_ops.tensor_summary_v2( - tensor=tensor, - tag=tag, - name=scope, - serialized_summary_metadata=serialized_summary_metadata) - summary_op_util.collect(val, collections, [ops.GraphKeys.SUMMARIES]) - return val - -ops.NotDifferentiable("TensorSummary") diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py index fbae2b77faf..9e9e6ed9035 100644 --- a/tensorflow/python/summary/summary.py +++ b/tensorflow/python/summary/summary.py @@ -28,12 +28,12 @@ from google.protobuf import json_format as _json_format # pylint: disable=unused-import from tensorflow.core.framework.summary_pb2 import Summary from tensorflow.core.framework.summary_pb2 import SummaryDescription +from tensorflow.core.framework.summary_pb2 import SummaryMetadata as _SummaryMetadata # pylint: enable=unused-import from tensorflow.core.util.event_pb2 import Event from tensorflow.core.util.event_pb2 import SessionLog from tensorflow.core.util.event_pb2 import TaggedRunMetadata # pylint: enable=unused-import - from tensorflow.python.eager import context as _context from tensorflow.python.framework import constant_op as _constant_op from tensorflow.python.framework import dtypes as _dtypes @@ -42,16 +42,6 @@ from tensorflow.python.ops import gen_logging_ops as _gen_logging_ops from tensorflow.python.ops import gen_summary_ops as _gen_summary_ops # pylint: disable=unused-import from tensorflow.python.ops import summary_op_util as _summary_op_util -# exports 
tensor-related summaries -# pylint: disable=unused-import -from tensorflow.python.ops.summary_ops import tensor_summary -# pylint: enable=unused-import - -# exports text -# pylint: disable=unused-import -from tensorflow.python.summary.text_summary import text_summary as text -# pylint: enable=unused-import - # exports FileWriter, FileWriterCache # pylint: disable=unused-import from tensorflow.python.summary.writer.writer import FileWriter @@ -238,6 +228,103 @@ def audio(name, tensor, sample_rate, max_outputs=3, collections=None, return val +@tf_export('summary.text') +def text(name, tensor, collections=None): + """Summarizes textual data. + + Text data summarized via this plugin will be visible in the Text Dashboard + in TensorBoard. The standard TensorBoard Text Dashboard will render markdown + in the strings, and will automatically organize 1d and 2d tensors into tables. + If a tensor with more than 2 dimensions is provided, a 2d subarray will be + displayed along with a warning message. (Note that this behavior is not + intrinsic to the text summary api, but rather to the default TensorBoard text + plugin.) + + Args: + name: A name for the generated node. Will also serve as a series name in + TensorBoard. + tensor: a string-type Tensor to summarize. + collections: Optional list of ops.GraphKeys. The collections to add the + summary to. Defaults to [_ops.GraphKeys.SUMMARIES] + + Returns: + A TensorSummary op that is configured so that TensorBoard will recognize + that it contains textual data. The TensorSummary is a scalar `Tensor` of + type `string` which contains `Summary` protobufs. + + Raises: + ValueError: If tensor has the wrong type. 
+ """ + if tensor.dtype != _dtypes.string: + raise ValueError('Expected tensor %s to have dtype string, got %s' % + (tensor.name, tensor.dtype)) + + summary_metadata = _SummaryMetadata( + plugin_data=_SummaryMetadata.PluginData(plugin_name='text')) + t_summary = tensor_summary( + name=name, + tensor=tensor, + summary_metadata=summary_metadata, + collections=collections) + return t_summary + + +@tf_export('summary.tensor_summary') +def tensor_summary(name, + tensor, + summary_description=None, + collections=None, + summary_metadata=None, + family=None, + display_name=None): + """Outputs a `Summary` protocol buffer with a serialized tensor.proto. + + Args: + name: A name for the generated node. If display_name is not set, it will + also serve as the tag name in TensorBoard. (In that case, the tag + name will inherit tf name scopes.) + tensor: A tensor of any type and shape to serialize. + summary_description: A long description of the summary sequence. Markdown + is supported. + collections: Optional list of graph collections keys. The new summary op is + added to these collections. Defaults to `[GraphKeys.SUMMARIES]`. + summary_metadata: Optional SummaryMetadata proto (which describes which + plugins may use the summary value). + family: Optional; if provided, used as the prefix of the summary tag, + which controls the name used for display on TensorBoard when + display_name is not set. + display_name: A string used to name this data in TensorBoard. If this is + not set, then the node name will be used instead. + + Returns: + A scalar `Tensor` of type `string`. The serialized `Summary` protocol + buffer. 
+ """ + + if summary_metadata is None: + summary_metadata = _SummaryMetadata() + + if summary_description is not None: + summary_metadata.summary_description = summary_description + + if display_name is not None: + summary_metadata.display_name = display_name + + serialized_summary_metadata = summary_metadata.SerializeToString() + + if _summary_op_util.skip_summary(): + return _constant_op.constant('') + with _summary_op_util.summary_scope( + name, family, values=[tensor]) as (tag, scope): + val = _gen_logging_ops.tensor_summary_v2( + tensor=tensor, + tag=tag, + name=scope, + serialized_summary_metadata=serialized_summary_metadata) + _summary_op_util.collect(val, collections, [_ops.GraphKeys.SUMMARIES]) + return val + + @tf_export('summary.merge') def merge(inputs, collections=None, name=None): # pylint: disable=line-too-long diff --git a/tensorflow/python/summary/summary_test.py b/tensorflow/python/summary/summary_test.py index ac5eb4dbbe3..cacc28cc596 100644 --- a/tensorflow/python/summary/summary_test.py +++ b/tensorflow/python/summary/summary_test.py @@ -12,6 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""Tests for the API surface of the V1 tf.summary ops. + +These tests don't check the actual serialized proto summary value for the +more complex summaries (e.g. audio, image). Those test live separately in +tensorflow/python/kernel_tests/summary_v1_*.py. 
+""" from __future__ import absolute_import from __future__ import division @@ -21,6 +27,7 @@ from six.moves import xrange # pylint: disable=redefined-builtin from tensorflow.core.framework import summary_pb2 from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes from tensorflow.python.framework import meta_graph from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops @@ -29,7 +36,7 @@ from tensorflow.python.platform import test from tensorflow.python.summary import summary as summary_lib -class ScalarSummaryTest(test.TestCase): +class SummaryTest(test.TestCase): def testScalarSummary(self): with self.cached_session() as s: @@ -135,6 +142,12 @@ class ScalarSummaryTest(test.TestCase): self.assertEqual(len(summary.value), 1) self.assertEqual(summary.value[0].tag, 'family/outer/family/inner') + def testHistogramSummaryTypes(self): + for dtype in (dtypes.int8, dtypes.uint8, dtypes.int16, dtypes.int32, + dtypes.float32, dtypes.float64): + const = constant_op.constant(10, dtype=dtype) + summary_lib.histogram('h', const) + def testAudioSummary(self): with self.cached_session() as s: i = array_ops.ones((5, 3, 4)) @@ -165,6 +178,21 @@ class ScalarSummaryTest(test.TestCase): for i in xrange(3)) self.assertEqual(tags, expected) + def testTextSummary(self): + with self.cached_session(): + with self.assertRaises(ValueError): + num = array_ops.constant(1) + summary_lib.text('foo', num) + + # The API accepts vectors. 
+ arr = array_ops.constant(['one', 'two', 'three']) + summ = summary_lib.text('foo', arr) + self.assertEqual(summ.op.type, 'TensorSummaryV2') + + # the API accepts scalars + summ = summary_lib.text('foo', array_ops.constant('one')) + self.assertEqual(summ.op.type, 'TensorSummaryV2') + def testSummaryNameConversion(self): c = constant_op.constant(3) s = summary_lib.scalar('name with spaces', c) diff --git a/tensorflow/python/summary/text_summary.py b/tensorflow/python/summary/text_summary.py deleted file mode 100644 index 6418c847f3c..00000000000 --- a/tensorflow/python/summary/text_summary.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Implements text_summary in TensorFlow, with TensorBoard support. - -The text_summary is a wrapper around the generic tensor_summary that takes a -string-type tensor and emits a TensorSummary op with SummaryMetadata that -notes that this summary is textual data for the TensorBoard text plugin. 
-""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.core.framework import summary_pb2 -from tensorflow.python.framework import dtypes -from tensorflow.python.ops.summary_ops import tensor_summary -from tensorflow.python.util.tf_export import tf_export - -PLUGIN_NAME = "text" - - -@tf_export("summary.text") -def text_summary(name, tensor, collections=None): - """Summarizes textual data. - - Text data summarized via this plugin will be visible in the Text Dashboard - in TensorBoard. The standard TensorBoard Text Dashboard will render markdown - in the strings, and will automatically organize 1d and 2d tensors into tables. - If a tensor with more than 2 dimensions is provided, a 2d subarray will be - displayed along with a warning message. (Note that this behavior is not - intrinsic to the text summary api, but rather to the default TensorBoard text - plugin.) - - Args: - name: A name for the generated node. Will also serve as a series name in - TensorBoard. - tensor: a string-type Tensor to summarize. - collections: Optional list of ops.GraphKeys. The collections to add the - summary to. Defaults to [_ops.GraphKeys.SUMMARIES] - - Returns: - A TensorSummary op that is configured so that TensorBoard will recognize - that it contains textual data. The TensorSummary is a scalar `Tensor` of - type `string` which contains `Summary` protobufs. - - Raises: - ValueError: If tensor has the wrong type. 
- """ - if tensor.dtype != dtypes.string: - raise ValueError("Expected tensor %s to have dtype string, got %s" % - (tensor.name, tensor.dtype)) - - summary_metadata = summary_pb2.SummaryMetadata( - plugin_data=summary_pb2.SummaryMetadata.PluginData( - plugin_name=PLUGIN_NAME)) - t_summary = tensor_summary( - name=name, - tensor=tensor, - summary_metadata=summary_metadata, - collections=collections) - return t_summary diff --git a/tensorflow/python/summary/text_summary_test.py b/tensorflow/python/summary/text_summary_test.py deleted file mode 100644 index 5b0db43cc1c..00000000000 --- a/tensorflow/python/summary/text_summary_test.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.python.framework import test_util -from tensorflow.python.ops import array_ops -from tensorflow.python.platform import googletest -from tensorflow.python.summary import text_summary - - -class TextPluginTest(test_util.TensorFlowTestCase): - """Test the Text Summary API. - - These tests are focused on testing the API design of the text_summary method. 
- It doesn't test the PluginAsset and tensors registry functionality, because - that is better tested by the text_plugin test that actually consumes that - metadata. - """ - - def testTextSummaryAPI(self): - with self.cached_session(): - - with self.assertRaises(ValueError): - num = array_ops.constant(1) - text_summary.text_summary("foo", num) - - # The API accepts vectors. - arr = array_ops.constant(["one", "two", "three"]) - summ = text_summary.text_summary("foo", arr) - self.assertEqual(summ.op.type, "TensorSummaryV2") - - # the API accepts scalars - summ = text_summary.text_summary("foo", array_ops.constant("one")) - self.assertEqual(summ.op.type, "TensorSummaryV2") - - -if __name__ == "__main__": - googletest.main() From 1c73bd639bce5440a542c711f767a32a6b573ba4 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 5 Nov 2018 23:04:18 -0800 Subject: [PATCH 169/540] Improve performance of depthwise separable convolutions PiperOrigin-RevId: 220231014 --- .../convolution_feature_group_converter.cc | 63 ++++- .../convolution_feature_group_converter.h | 6 +- .../compiler/xla/tests/convolution_test.cc | 266 ++++++++++++++++++ 3 files changed, 326 insertions(+), 9 deletions(-) diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc b/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc index 0ac4a65ec6a..7f7f1503a09 100644 --- a/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc +++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc @@ -51,7 +51,8 @@ class ConvolutionVisitor : public DfsHloVisitorWithDefault { Status HandleConvolution(HloInstruction* convolution) override; // Runs the visitor on a computation. - static bool Run(HloComputation* computation); + static bool Run(HloComputation* computation, + bool canonicalize_depthwise_filter); // Returns whether any convolution ops were rewritten. 
const bool changed() const { return changed_; } @@ -59,18 +60,24 @@ class ConvolutionVisitor : public DfsHloVisitorWithDefault { ~ConvolutionVisitor() override = default; private: - explicit ConvolutionVisitor(HloComputation* computation) - : computation_(computation) {} + explicit ConvolutionVisitor(HloComputation* computation, + bool canonicalize_depthwise_filter = false) + : computation_(computation), + filter_expansion_(!canonicalize_depthwise_filter) {} // Current HloComputation instance the ConvolutionVisitor is traversing. HloComputation* computation_; // Whether rewrite has occurred. bool changed_ = false; + + // Whether filter expansion is required. + bool filter_expansion_; }; -bool ConvolutionVisitor::Run(HloComputation* computation) { - ConvolutionVisitor visitor(computation); +bool ConvolutionVisitor::Run(HloComputation* computation, + bool canonicalize_depthwise_filter) { + ConvolutionVisitor visitor(computation, canonicalize_depthwise_filter); TF_CHECK_OK(computation->Accept(&visitor)); return visitor.changed_; } @@ -190,9 +197,49 @@ Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) { HloInstruction* filter_mask = GetExpandedFilterMask( filter->shape(), input_feature_dim, output_feature_dim, group_count, add); HloInstruction* expanded_filter; - // We want to repeat 'filter' in the 'input_feature_dim' dimension - // 'group_count' times. + if (group_size == 1) { + bool depthwise_separable = + (group_count == filter->shape().dimensions(output_feature_dim)); + // If the code generator handles depthwise separable convolutions + // inherently, then no filter expansion is needed. 
+ if (!filter_expansion_ && depthwise_separable) { + const int64 old_kernel_input_feature_dimension = + dim_numbers.kernel_input_feature_dimension(); + const int64 old_kernel_output_feature_dimension = + dim_numbers.kernel_output_feature_dimension(); + + // For depthwise convolutions, we want the kernel input feature dimension + // to be smaller than the output feature dimension. If that's not the + // case, we swap the dimensions. + if (old_kernel_input_feature_dimension > + old_kernel_output_feature_dimension) { + Shape reshaped_filter_shape = filter->shape(); + auto& dimensions = *reshaped_filter_shape.mutable_dimensions(); + std::swap(dimensions[old_kernel_input_feature_dimension], + dimensions[old_kernel_output_feature_dimension]); + + auto reshaped_filter = + add(HloInstruction::CreateReshape(reshaped_filter_shape, filter)); + + dim_numbers.set_kernel_input_feature_dimension( + old_kernel_output_feature_dimension); + + dim_numbers.set_kernel_output_feature_dimension( + old_kernel_input_feature_dimension); + + auto new_convolution = HloInstruction::CreateConvolve( + convolution->shape(), convolution->mutable_operand(0), + reshaped_filter, group_count, convolution->window(), dim_numbers, + convolution->precision_config()); + + TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction( + convolution, std::move(new_convolution))); + } + return Status::OK(); + } + // We want to repeat 'filter' in the 'input_feature_dim' dimension + // 'group_count' times. 
Shape reshaped_filter_shape = ShapeUtil::DeleteDimension(input_feature_dim, filter->shape()); auto reshaped_filter = @@ -237,7 +284,7 @@ StatusOr ConvolutionFeatureGroupConverter::Run(HloModule* module) { module->ToString()); bool changed = false; for (auto* comp : module->MakeNonfusionComputations()) { - if (ConvolutionVisitor::Run(comp)) { + if (ConvolutionVisitor::Run(comp, filter_expansion_)) { changed = true; } } diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter.h b/tensorflow/compiler/xla/service/convolution_feature_group_converter.h index ce0138e56fb..cb6bc04c00a 100644 --- a/tensorflow/compiler/xla/service/convolution_feature_group_converter.h +++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter.h @@ -27,7 +27,8 @@ namespace xla { // convolutions with feature_group_count = 1. class ConvolutionFeatureGroupConverter : public HloModulePass { public: - ConvolutionFeatureGroupConverter() {} + ConvolutionFeatureGroupConverter(bool canonicalize_depthwise_filter = false) + : filter_expansion_(canonicalize_depthwise_filter) {} absl::string_view name() const override { return "convolution-feature-group-converter"; @@ -36,6 +37,9 @@ class ConvolutionFeatureGroupConverter : public HloModulePass { // Run convolution rewriting on the given computation. Returns whether the // computation was changed. StatusOr Run(HloModule* module) override; + + // Tells whether filter expansion is required. 
+ bool filter_expansion_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc index 3aebf784664..211d004ec8c 100644 --- a/tensorflow/compiler/xla/tests/convolution_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_test.cc @@ -596,6 +596,272 @@ TYPED_TEST(Convolve2D_1x3x3x5_3x3x1x15_Depthwise_Valid, Types) { this->RunTest(); } +template +class Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid : public ConvolutionTest { + public: + void RunTest() { + XlaBuilder builder(TestName()); + std::vector input_dims = {1, 4, 4, 5}; + std::vector filter_dims = {3, 3, 1, 5}; + Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); + Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); + { + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + + // Tensorflow dimension numbers for 2D convolution. + ConvolutionDimensionNumbers dnums; + dnums.set_input_batch_dimension(0); + dnums.set_output_batch_dimension(0); + dnums.add_input_spatial_dimensions(1); + dnums.add_output_spatial_dimensions(1); + dnums.add_input_spatial_dimensions(2); + dnums.add_output_spatial_dimensions(2); + dnums.set_input_feature_dimension(3); + dnums.set_output_feature_dimension(3); + dnums.add_kernel_spatial_dimensions(0); + dnums.add_kernel_spatial_dimensions(1); + dnums.set_kernel_input_feature_dimension(2); + dnums.set_kernel_output_feature_dimension(3); + + ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums, + /*feature_group_count=*/5); + } + + std::vector input_elems(ShapeUtil::ElementsIn(input_shape)); + iota_int_init_value(input_elems, 1); + auto input_r1 = LiteralUtil::CreateR1(input_elems); + auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie(); + + std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape)); + iota_int_init_value(filter_elems, 1); + auto filter_r1 = 
LiteralUtil::CreateR1(filter_elems); + auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie(); + + auto expected_r1 = LiteralUtil::CreateR1( + {static_cast(6864), static_cast(7296), static_cast(7746), + static_cast(8214), static_cast(8700), static_cast(7809), + static_cast(8286), static_cast(8781), static_cast(9294), + static_cast(9825), static_cast(10644), static_cast(11256), + static_cast(11886), static_cast(12534), static_cast(13200), + static_cast(11589), static_cast(12246), static_cast(12921), + static_cast(13614), static_cast(14325)}); + auto expected_r4 = expected_r1.Reshape({1, 2, 2, 5}).ConsumeValueOrDie(); + + auto input_literal = + client_->TransferToServer(input_r4).ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(filter_r4).ConsumeValueOrDie(); + + ComputeAndCompareLiteral(&builder, expected_r4, + {input_literal.get(), filter_literal.get()}, + error_spec_); + + auto filter_r = filter_r1.Reshape(filter_dims); + } +}; + +TYPED_TEST_CASE(Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid, TestTypes); +TYPED_TEST(Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid, Types) { + this->RunTest(); +} + +template +class Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid : public ConvolutionTest { + public: + void RunTest() { + XlaBuilder builder(TestName()); + std::vector input_dims = {1, 4, 4, 512}; + std::vector filter_dims = {3, 3, 1, 512}; + Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); + Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); + { + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + + // Tensorflow dimension numbers for 2D convolution. 
+ ConvolutionDimensionNumbers dnums; + dnums.set_input_batch_dimension(0); + dnums.set_output_batch_dimension(0); + dnums.add_input_spatial_dimensions(1); + dnums.add_output_spatial_dimensions(1); + dnums.add_input_spatial_dimensions(2); + dnums.add_output_spatial_dimensions(2); + dnums.set_input_feature_dimension(3); + dnums.set_output_feature_dimension(3); + dnums.add_kernel_spatial_dimensions(0); + dnums.add_kernel_spatial_dimensions(1); + dnums.set_kernel_input_feature_dimension(2); + dnums.set_kernel_output_feature_dimension(3); + + ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums, + /*feature_group_count=*/512); + } + + std::vector input_elems(ShapeUtil::ElementsIn(input_shape), + static_cast(1)); + auto input_r1 = LiteralUtil::CreateR1(input_elems); + auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie(); + + std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape), + static_cast(2)); + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); + auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie(); + + std::vector output_elems(2048, static_cast(18)); + + auto expected_r1 = LiteralUtil::CreateR1(output_elems); + auto expected_r4 = expected_r1.Reshape({1, 2, 2, 512}).ConsumeValueOrDie(); + + auto input_literal = + client_->TransferToServer(input_r4).ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(filter_r4).ConsumeValueOrDie(); + + ComputeAndCompareLiteral(&builder, expected_r4, + {input_literal.get(), filter_literal.get()}, + error_spec_); + + auto filter_r = filter_r1.Reshape(filter_dims); + } +}; + +TYPED_TEST_CASE(Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid, TestTypes); +TYPED_TEST(Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid, Types) { + this->RunTest(); +} + +template +class Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid : public ConvolutionTest { + public: + void RunTest() { + XlaBuilder builder(TestName()); + std::vector input_dims = {1, 4, 4, 160}; + std::vector filter_dims 
= {3, 3, 1, 160}; + Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); + Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); + { + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + + // Tensorflow dimension numbers for 2D convolution. + ConvolutionDimensionNumbers dnums; + dnums.set_input_batch_dimension(0); + dnums.set_output_batch_dimension(0); + dnums.add_input_spatial_dimensions(1); + dnums.add_output_spatial_dimensions(1); + dnums.add_input_spatial_dimensions(2); + dnums.add_output_spatial_dimensions(2); + dnums.set_input_feature_dimension(3); + dnums.set_output_feature_dimension(3); + dnums.add_kernel_spatial_dimensions(0); + dnums.add_kernel_spatial_dimensions(1); + dnums.set_kernel_input_feature_dimension(2); + dnums.set_kernel_output_feature_dimension(3); + + ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums, + /*feature_group_count=*/160); + } + + std::vector input_elems(ShapeUtil::ElementsIn(input_shape), + static_cast(1)); + auto input_r1 = LiteralUtil::CreateR1(input_elems); + auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie(); + + std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape), + static_cast(2)); + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); + auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie(); + + std::vector output_elems(640, static_cast(18)); + + auto expected_r1 = LiteralUtil::CreateR1(output_elems); + auto expected_r4 = expected_r1.Reshape({1, 2, 2, 160}).ConsumeValueOrDie(); + + auto input_literal = + client_->TransferToServer(input_r4).ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(filter_r4).ConsumeValueOrDie(); + + ComputeAndCompareLiteral(&builder, expected_r4, + {input_literal.get(), filter_literal.get()}, + error_spec_); + + auto filter_r = filter_r1.Reshape(filter_dims); + } +}; + 
+TYPED_TEST_CASE(Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid, TestTypes); +TYPED_TEST(Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid, Types) { + this->RunTest(); +} + +template +class Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid + : public ConvolutionTest { + public: + void RunTest() { + XlaBuilder builder(TestName()); + std::vector input_dims = {1, 4, 4, 1024}; + std::vector filter_dims = {3, 3, 1, 1024}; + Shape input_shape = ShapeUtil::MakeShapeWithType(input_dims); + Shape filter_shape = ShapeUtil::MakeShapeWithType(filter_dims); + { + auto input = Parameter(&builder, 0, input_shape, "input"); + auto filter = Parameter(&builder, 1, filter_shape, "filter"); + + // Tensorflow dimension numbers for 2D convolution. + ConvolutionDimensionNumbers dnums; + dnums.set_input_batch_dimension(0); + dnums.set_output_batch_dimension(0); + dnums.add_input_spatial_dimensions(1); + dnums.add_output_spatial_dimensions(1); + dnums.add_input_spatial_dimensions(2); + dnums.add_output_spatial_dimensions(2); + dnums.set_input_feature_dimension(3); + dnums.set_output_feature_dimension(3); + dnums.add_kernel_spatial_dimensions(0); + dnums.add_kernel_spatial_dimensions(1); + dnums.set_kernel_input_feature_dimension(2); + dnums.set_kernel_output_feature_dimension(3); + + ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums, + /*feature_group_count=*/1024); + } + + std::vector input_elems(ShapeUtil::ElementsIn(input_shape), + static_cast(1)); + auto input_r1 = LiteralUtil::CreateR1(input_elems); + auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie(); + + std::vector filter_elems(ShapeUtil::ElementsIn(filter_shape), + static_cast(2)); + auto filter_r1 = LiteralUtil::CreateR1(filter_elems); + auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie(); + + std::vector output_elems(4096, static_cast(18)); + + auto expected_r1 = LiteralUtil::CreateR1(output_elems); + auto expected_r4 = expected_r1.Reshape({1, 2, 2, 1024}).ConsumeValueOrDie(); + + 
auto input_literal = + client_->TransferToServer(input_r4).ConsumeValueOrDie(); + auto filter_literal = + client_->TransferToServer(filter_r4).ConsumeValueOrDie(); + + ComputeAndCompareLiteral(&builder, expected_r4, + {input_literal.get(), filter_literal.get()}, + error_spec_); + + auto filter_r = filter_r1.Reshape(filter_dims); + } +}; + +TYPED_TEST_CASE(Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid, TestTypes); +TYPED_TEST(Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid, Types) { + this->RunTest(); +} + template class Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid : public ConvolutionTest { public: From 69b7ab295e749462a7b899e28e019490e1565579 Mon Sep 17 00:00:00 2001 From: Anton Dmitriev Date: Tue, 6 Nov 2018 11:32:18 +0300 Subject: [PATCH 170/540] Update Apache Ignite IGFS after review. --- tensorflow/contrib/ignite/BUILD | 24 +++++++++---------- .../ignite/python/ops/igfs_op_loader.py | 2 +- .../contrib/ignite/python/ops/igfs_ops.py | 2 +- .../ignite/python/ops/ignite_op_loader.py | 2 +- 4 files changed, 14 insertions(+), 16 deletions(-) diff --git a/tensorflow/contrib/ignite/BUILD b/tensorflow/contrib/ignite/BUILD index e486d85a4d1..5587119ec2e 100644 --- a/tensorflow/contrib/ignite/BUILD +++ b/tensorflow/contrib/ignite/BUILD @@ -27,18 +27,16 @@ py_library( ) tf_custom_op_library( - name = "_dataset_ops.so", - srcs = ["ops/dataset_ops.cc"], - deps = [":dataset_kernels"], -) - -tf_custom_op_library( - name = "_igfs_ops.so", + name = "_ignite_ops.so", srcs = [ "kernels/igfs/igfs.h", + "ops/dataset_ops.cc", "ops/igfs_ops.cc", ], - deps = [":igfs_kernels"], + deps = [ + ":dataset_kernels", + ":igfs_kernels", + ], ) tf_gen_op_libs( @@ -110,7 +108,7 @@ cc_library( "kernels/igfs/igfs_writable_file.h", ], deps = [ - "//tensorflow/contrib/ignite:ignite_client", + ":ignite_client", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", ], @@ -148,7 +146,7 @@ py_library( tf_gen_op_wrapper_py( name = "gen_dataset_ops", out = "python/ops/gen_dataset_ops.py", - deps = 
["//tensorflow/contrib/ignite:dataset_ops_op_lib"], + deps = [":dataset_ops_op_lib"], ) tf_gen_op_wrapper_py( @@ -178,10 +176,10 @@ tf_kernel_library( tf_custom_op_py_library( name = "ignite_op_loader", srcs = ["python/ops/ignite_op_loader.py"], - dso = ["//tensorflow/contrib/ignite:_dataset_ops.so"], + dso = [":_ignite_ops.so"], kernels = [ ":dataset_ops_kernels", - "//tensorflow/contrib/ignite:dataset_ops_op_lib", + ":dataset_ops_op_lib", ], srcs_version = "PY2AND3", deps = [ @@ -194,7 +192,7 @@ tf_custom_op_py_library( tf_custom_op_py_library( name = "igfs_op_loader", srcs = ["python/ops/igfs_op_loader.py"], - dso = [":_igfs_ops.so"], + dso = [":_ignite_ops.so"], kernels = [ ":igfs_ops_kernels", ":igfs_ops_op_lib", diff --git a/tensorflow/contrib/ignite/python/ops/igfs_op_loader.py b/tensorflow/contrib/ignite/python/ops/igfs_op_loader.py index c9ab2831058..5957b4d19e9 100644 --- a/tensorflow/contrib/ignite/python/ops/igfs_op_loader.py +++ b/tensorflow/contrib/ignite/python/ops/igfs_op_loader.py @@ -22,4 +22,4 @@ from tensorflow.contrib.util import loader from tensorflow.python.platform import resource_loader _dataset_ops = loader.load_op_library( - resource_loader.get_path_to_datafile("../../_igfs_ops.so")) + resource_loader.get_path_to_datafile("../../_ignite_ops.so")) diff --git a/tensorflow/contrib/ignite/python/ops/igfs_ops.py b/tensorflow/contrib/ignite/python/ops/igfs_ops.py index 39cee05e071..15b71c6c2ef 100644 --- a/tensorflow/contrib/ignite/python/ops/igfs_ops.py +++ b/tensorflow/contrib/ignite/python/ops/igfs_ops.py @@ -36,5 +36,5 @@ from tensorflow.python.platform import resource_loader from tensorflow.python.framework import load_library file_system_library = os.path.join(resource_loader.get_data_files_path(), - "../../_igfs_ops.so") + "../../_ignite_ops.so") load_library.load_file_system_library(file_system_library) diff --git a/tensorflow/contrib/ignite/python/ops/ignite_op_loader.py b/tensorflow/contrib/ignite/python/ops/ignite_op_loader.py index 
c9af7386cf0..e450e2d84ba 100644 --- a/tensorflow/contrib/ignite/python/ops/ignite_op_loader.py +++ b/tensorflow/contrib/ignite/python/ops/ignite_op_loader.py @@ -21,4 +21,4 @@ from tensorflow.contrib.util import loader from tensorflow.python.platform import resource_loader _dataset_ops = loader.load_op_library( - resource_loader.get_path_to_datafile("../../_dataset_ops.so")) + resource_loader.get_path_to_datafile("../../_ignite_ops.so")) From 72825596fac469dd12270135063c56010b20ad50 Mon Sep 17 00:00:00 2001 From: dianlujitao Date: Tue, 6 Nov 2018 16:53:06 +0800 Subject: [PATCH 171/540] Add absl_int128 to abseil libraries * absl_int128 is referenced by str_format_internal --- tensorflow/contrib/cmake/external/abseil_cpp.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/contrib/cmake/external/abseil_cpp.cmake b/tensorflow/contrib/cmake/external/abseil_cpp.cmake index c6c5021f60b..4546dbdecc0 100644 --- a/tensorflow/contrib/cmake/external/abseil_cpp.cmake +++ b/tensorflow/contrib/cmake/external/abseil_cpp.cmake @@ -20,6 +20,7 @@ if (systemlib_ABSEIL_CPP) absl_dynamic_annotations absl_malloc_internal absl_throw_delegate + absl_int128 absl_strings str_format_internal absl_bad_optional_access) @@ -50,6 +51,7 @@ else (systemlib_ABSEIL_CPP) ${abseil_cpp_BUILD}/absl/base/Release/absl_dynamic_annotations.lib ${abseil_cpp_BUILD}/absl/base/Release/absl_malloc_internal.lib ${abseil_cpp_BUILD}/absl/base/Release/absl_throw_delegate.lib + ${abseil_cpp_BUILD}/absl/numeric/Release/absl_int128.lib ${abseil_cpp_BUILD}/absl/strings/Release/absl_strings.lib ${abseil_cpp_BUILD}/absl/strings/Release/str_format_internal.lib ${abseil_cpp_BUILD}/absl/types/Release/absl_bad_optional_access.lib) @@ -60,6 +62,7 @@ else (systemlib_ABSEIL_CPP) ${abseil_cpp_BUILD}/absl/base/absl_dynamic_annotations.lib ${abseil_cpp_BUILD}/absl/base/absl_malloc_internal.lib ${abseil_cpp_BUILD}/absl/base/absl_throw_delegate.lib + ${abseil_cpp_BUILD}/absl/numeric/absl_int128.lib 
${abseil_cpp_BUILD}/absl/strings/absl_strings.lib ${abseil_cpp_BUILD}/absl/strings/str_format_internal.lib ${abseil_cpp_BUILD}/absl/types/absl_bad_optional_access.lib) @@ -71,6 +74,7 @@ else (systemlib_ABSEIL_CPP) ${abseil_cpp_BUILD}/absl/base/libabsl_dynamic_annotations.a ${abseil_cpp_BUILD}/absl/base/libabsl_malloc_internal.a ${abseil_cpp_BUILD}/absl/base/libabsl_throw_delegate.a + ${abseil_cpp_BUILD}/absl/numeric/libabsl_int128.a ${abseil_cpp_BUILD}/absl/strings/libabsl_strings.a ${abseil_cpp_BUILD}/absl/strings/libstr_format_internal.a ${abseil_cpp_BUILD}/absl/types/libabsl_bad_optional_access.a) From 3ec52d5dcec390ea0c4b91bcdead084b26381bcf Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 6 Nov 2018 01:02:17 -0800 Subject: [PATCH 172/540] compat: Update forward compatibility horizon to 2018-11-06 PiperOrigin-RevId: 220239919 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 1074a8b5a92..a553424f350 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -26,7 +26,7 @@ import datetime from tensorflow.python.util import tf_contextlib from tensorflow.python.util.tf_export import tf_export -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 11, 5) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 11, 6) @tf_export("compat.forward_compatible") From 6fc90a958d678c9f79c7761b5df263b95926e10c Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Tue, 6 Nov 2018 01:44:24 -0800 Subject: [PATCH 173/540] Improve KeyValueSort performance on GPU. We now parallelize not only the loop through the dimension that should be sorted, but also the other loops. Also combine several compare loops if the comparisons happen within a small block of memory. 
PiperOrigin-RevId: 220245099 --- .../xla/service/gpu/ir_emitter_unnested.cc | 96 +++++--- tensorflow/compiler/xla/service/llvm_ir/BUILD | 2 + .../compiler/xla/service/llvm_ir/sort_util.cc | 224 +++++++++++------- .../compiler/xla/service/llvm_ir/sort_util.h | 8 +- 4 files changed, 217 insertions(+), 113 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 21e44e1e7d3..1aaef2d3c5c 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2199,7 +2199,6 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { int64 dimension_to_sort = sort->dimensions(0); int64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort); int64 num_stages = tensorflow::Log2Ceiling(dimension_to_sort_bound); - auto index_type = b_.getInt64Ty(); // Naive C++ code for the outer loops: // @@ -2213,42 +2212,85 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { // } // } // - // This follows the algorithm described on Wikipedia: - // https://en.wikipedia.org/wiki/Bitonic_sorter + // This follows the alternative representation of the algorithm described on + // Wikipedia: https://en.wikipedia.org/wiki/Bitonic_sorter + // + // Each mask specifies how to derive from one position in the array the + // position with which it should be compared (we calculate the xor of the + // position with the mask). + // As an optimization, we can move the 'mask' loop to inside the + // sorting/comparison loop if the comparisons happen within a small block of + // the array. To make this work, we collect all consecutive masks that are + // smaller than our chosen power of 2 tile size, and pass them to SortInPlace. + // Each thread then processes one tile of data. 
+ const int64 kTileSize = 64LL; + + // If we cannot combine several xor masks together, we don't use tiling, so we + // calculate the standard launch dimensions for the shape. + LaunchDimensions standard_launch_dimensions = CalculateLaunchDimensions( + keys_shape, ir_emitter_context_->device_description()); + + // Calculate the launch dimensions for the case where we use tiling. + Shape tile_shape = keys_shape; + tile_shape.set_dimensions(dimension_to_sort, + CeilOfRatio(dimension_to_sort_bound, kTileSize)); + uint64 num_tiles = ShapeUtil::ElementsIn(tile_shape); + const uint64 kThreadsPerBlock = std::min( + {std::max( + 1ULL, + ir_emitter_context_->device_description().threads_per_block_limit()), + 32ULL, num_tiles}); + uint64 num_blocks = CeilOfRatio(num_tiles, kThreadsPerBlock); + LaunchDimensions tiled_launch_dimensions(num_blocks, kThreadsPerBlock); + + auto emit_kernel = [&](absl::Span xor_masks) { + thunks.push_back( + BuildKernelThunk(sort, /*implements_whole_instruction=*/false)); + LaunchDimensions launch_dimensions = xor_masks.size() > 1 + ? tiled_launch_dimensions + : standard_launch_dimensions; + UpdateLaunchDimensions(launch_dimensions, thunks.back().get(), + ir_emitter_context_->llvm_module()); + IrArray keys_array; + std::vector values_arrays; + values_arrays.reserve(sort->operand_count() - 1); + for (int64 i = 0; i < sort->operand_count(); ++i) { + ShapeIndex shape_index = + sort->operand_count() > 1 ? 
ShapeIndex({i}) : ShapeIndex({}); + if (i == 0) { + keys_array = GetIrArray(*sort, *sort, shape_index); + } else { + values_arrays.push_back(GetIrArray(*sort, *sort, shape_index)); + } + } + return llvm_ir::EmitSortInPlace(dimension_to_sort, keys_array, + values_arrays, IrName(sort), xor_masks, &b_, + launch_dimensions, kTileSize); + }; + std::vector xor_masks; for (int64 stage = 0; stage < num_stages; ++stage) { for (int64 mask = stage; mask >= 0; --mask) { - thunks.push_back( - BuildKernelThunk(sort, /*implements_whole_instruction=*/false)); - LaunchDimensions launch_dimensions = CalculateLaunchDimensions( - keys_shape, ir_emitter_context_->device_description()); - UpdateLaunchDimensions(launch_dimensions, thunks.back().get(), - ir_emitter_context_->llvm_module()); - - llvm::Value* xor_mask; + int64 xor_mask; if (mask == stage) { - xor_mask = llvm::ConstantInt::get(index_type, (1LL << (stage + 1)) - 1); + xor_mask = (1LL << (stage + 1)) - 1; } else { - xor_mask = llvm::ConstantInt::get(index_type, 1LL << mask); + xor_mask = 1LL << mask; } - - IrArray keys_array; - std::vector values_arrays; - values_arrays.reserve(sort->operand_count() - 1); - for (int64 i = 0; i < sort->operand_count(); ++i) { - ShapeIndex shape_index = - sort->operand_count() > 1 ? 
ShapeIndex({i}) : ShapeIndex({}); - if (i == 0) { - keys_array = GetIrArray(*sort, *sort, shape_index); - } else { - values_arrays.push_back(GetIrArray(*sort, *sort, shape_index)); + if (xor_mask >= kTileSize) { + if (!xor_masks.empty()) { + TF_RETURN_IF_ERROR(emit_kernel(xor_masks)); + xor_masks.clear(); } + TF_RETURN_IF_ERROR(emit_kernel({xor_mask})); + } else { + xor_masks.push_back(xor_mask); } - TF_RETURN_IF_ERROR(llvm_ir::EmitSortInPlace( - dimension_to_sort, keys_array, values_arrays, IrName(sort), xor_mask, - &b_, &launch_dimensions)); } } + if (!xor_masks.empty()) { + TF_RETURN_IF_ERROR(emit_kernel(xor_masks)); + } AddThunkToThunkSequence( absl::make_unique(std::move(thunks), sort)); diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD index 56a729bca8e..84449ef7e5b 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/BUILD +++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD @@ -202,10 +202,12 @@ cc_library( ":llvm_util", ":loop_emitter", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service/gpu:parallel_loop_emitter", "//tensorflow/compiler/xla/service/gpu:partition_assignment", "//tensorflow/core:lib", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", "@llvm//:core", "@llvm//:support", ], diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc index 05ba4a40da4..d21b1dbfb77 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc @@ -19,6 +19,7 @@ limitations under the License. // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc" #include "absl/strings/string_view.h" +#include "absl/types/span.h" #include "llvm/ADT/APInt.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -32,6 +33,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/types.h" @@ -39,31 +41,28 @@ namespace xla { namespace llvm_ir { namespace { + // Adds the inner comparison loop where we compare elements pointed to by // 'keys_index' and 'compare_keys_index'. -void EmitCompareLoop(int64 dimension_to_sort, const IrArray::Index& keys_index, - const IrArray::Index& compare_keys_index, - const IrArray& keys_array, - const std::vector& values_arrays, - llvm::IRBuilder<>* b) { - // if (is_smaller_index && - // compare_keys[dimension_to_sort] < dimension_to_sort_bound) - llvm::Value* is_smaller_index = b->CreateICmpSLT( - keys_index[dimension_to_sort], compare_keys_index[dimension_to_sort]); - int64 dimension_to_sort_bound = - keys_array.GetShape().dimensions(dimension_to_sort); +void EmitCompareLoopBody( + llvm::Value* dimension_to_sort_bound, PrimitiveType key_type, + int64 num_values, llvm::Value* keys_index, llvm::Value* compare_keys_index, + std::function read_element, + std::function + write_element, + llvm::IRBuilder<>* b) { + // if (is_smaller_index && compare_keys < dimension_to_sort_bound) + llvm::Value* is_smaller_index = + b->CreateICmpSLT(keys_index, compare_keys_index); auto if_data = EmitIfThenElse( - b->CreateAnd(is_smaller_index, - b->CreateICmpSLT(compare_keys_index[dimension_to_sort], - keys_index.GetConstantWithIndexType( - dimension_to_sort_bound))), + b->CreateAnd(is_smaller_index, b->CreateICmpSLT(compare_keys_index, + dimension_to_sort_bound)), "smaller_comparison_index", b, /*emit_else=*/false); SetToFirstInsertPoint(if_data.true_block, b); - auto key1 = keys_array.EmitReadArrayElement(keys_index, b); - auto key2 = keys_array.EmitReadArrayElement(compare_keys_index, b); + auto key1 = read_element(0, keys_index); + 
auto key2 = read_element(0, compare_keys_index); auto compare_key1 = key1; auto compare_key2 = key2; - auto key_type = keys_array.GetShape().element_type(); bool is_signed_comparison = true; if (primitive_util::IsFloatingPointType(key_type)) { // We would like a total order of floating point numbers so that the sort @@ -99,87 +98,146 @@ void EmitCompareLoop(int64 dimension_to_sort, const IrArray::Index& keys_index, EmitIfThenElse(comparison, "is_smaller_than", b, /*emit_else=*/false); SetToFirstInsertPoint(if_smaller_data.true_block, b); // Swap key1 with key2. - keys_array.EmitWriteArrayElement(keys_index, key2, b); - keys_array.EmitWriteArrayElement(compare_keys_index, key1, b); - for (const auto& values_array : values_arrays) { + write_element(0, keys_index, key2); + write_element(0, compare_keys_index, key1); + for (int64 i = 1; i <= num_values; ++i) { // Also swap the values. - auto value1 = values_array.EmitReadArrayElement(keys_index, b); - auto value2 = values_array.EmitReadArrayElement(compare_keys_index, b); - values_array.EmitWriteArrayElement(keys_index, value2, b); - values_array.EmitWriteArrayElement(compare_keys_index, value1, b); + auto value1 = read_element(i, keys_index); + auto value2 = read_element(i, compare_keys_index); + write_element(i, keys_index, value2); + write_element(i, compare_keys_index, value1); + } +} + +void EmitTiledCompareLoop(const IrArray::Index& tiled_keys_index, + int64 dimension_to_sort, + int64 dimension_to_sort_bound, + PrimitiveType keys_type, + absl::Span xor_masks, + const std::vector& params, int64 tile_size, + llvm::IRBuilder<>* b) { + IrArray::Index keys_index = tiled_keys_index; + auto read_element = [&](int64 operand, llvm::Value* index) { + keys_index[dimension_to_sort] = index; + return params[operand].EmitReadArrayElement(keys_index, b); + }; + auto write_element = [&](int64 operand, llvm::Value* index, + llvm::Value* value) { + keys_index[dimension_to_sort] = index; + 
params[operand].EmitWriteArrayElement(keys_index, value, b); + }; + + for (int64 xor_mask : xor_masks) { + std::unique_ptr tile_element_loop = + llvm_ir::ForLoop::EmitForLoop( + "element_id_in_tile", tiled_keys_index.GetConstantWithIndexType(0), + tiled_keys_index.GetConstantWithIndexType(tile_size), + tiled_keys_index.GetConstantWithIndexType(1), b); + llvm_ir::SetToFirstInsertPoint(tile_element_loop->GetBodyBasicBlock(), b); + auto current_keys_index = b->CreateAdd( + b->CreateMul(tiled_keys_index[dimension_to_sort], + tiled_keys_index.GetConstantWithIndexType(tile_size)), + tile_element_loop->GetIndVarValue()); + auto compare_keys_index = + b->CreateXor(current_keys_index, + tiled_keys_index.GetConstantWithIndexType(xor_mask)); + EmitCompareLoopBody( + tiled_keys_index.GetConstantWithIndexType(dimension_to_sort_bound), + keys_type, params.size() - 1, current_keys_index, compare_keys_index, + read_element, write_element, b); + SetToFirstInsertPoint(tile_element_loop->GetExitBasicBlock(), b); } } } // namespace Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array, const std::vector& values_arrays, - absl::string_view name, llvm::Value* xor_mask, - llvm::IRBuilder<>* b, - const gpu::LaunchDimensions* launch_dimensions) { + absl::string_view name, + absl::Span xor_masks, llvm::IRBuilder<>* b, + const gpu::LaunchDimensions& launch_dimensions, + const int64 tile_size) { + // Iterate through the keys shape in physical order, but skip the dimension to + // sort and make it the innermost loop which is the loop where the comparisons + // happen. In the dimension to sort, if we use tiling, we iterate through it + // in tiles of 64 elements each, so we use another loop that happens within + // one thread to process this tile worth of data (thereby combining several + // comparison stages of the bitonic sort algorithm because they all happen + // within those 64 elements and are therefore independent of the other + // comparisons). 
+ const Shape& keys_shape = keys_array.GetShape(); - - // Create loop nests which loop through the operand dimensions. The sort - // dimension is handled in the innermost loop which performs the sorting. - ForLoopNest loop_nest(name, b); - IrArray::Index keys_index = - loop_nest.EmitOperandArrayLoopNest(keys_array, dimension_to_sort, "keys"); - if (loop_nest.GetInnerLoopBodyBasicBlock() != nullptr) { - SetToFirstInsertPoint(loop_nest.GetInnerLoopBodyBasicBlock(), b); - } - - // 'compare_keys_index' is the index of the element that 'keys_index' should - // be compared to. - IrArray::Index compare_keys_index(keys_index.GetType()); - for (size_t dimension = 0; dimension < keys_index.size(); ++dimension) { + int64 rank = ShapeUtil::Rank(keys_shape); + int64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort); + int64 num_tiles = CeilOfRatio(dimension_to_sort_bound, tile_size); + std::vector dimensions_in_iteration_order(rank); + std::vector iteration_order_to_logical_order(rank); + int64 dim = 0; + for (int64 dimension : LayoutUtil::MinorToMajor(keys_shape)) { if (dimension != dimension_to_sort) { - compare_keys_index.push_back(keys_index[dimension]); - } else { - compare_keys_index.push_back(nullptr); + dimensions_in_iteration_order[dim] = keys_shape.dimensions(dimension); + iteration_order_to_logical_order[dim++] = dimension; } } + dimensions_in_iteration_order[dim] = dimension_to_sort_bound; + iteration_order_to_logical_order[dim] = dimension_to_sort; - // Naive C++ code for the inner compare loop: - // - // for (int64 i = 0; i < dimension_to_sort_bound; ++i) { - // int64 j = i ^ xor_mask; - // if (i < j && j < dimension_to_sort_bound) { - // int64 min_key = std::min(keys[i], keys[j]); - // keys[j] = std::max(keys[i], keys[j]); - // keys[i] = min_key; - // } - // } - // - // This follows the algorithm described on Wikipedia: - // https://en.wikipedia.org/wiki/Bitonic_sorter + Shape iteration_shape = ShapeUtil::MakeShape(keys_shape.element_type(), + 
dimensions_in_iteration_order); + Shape tiled_iteration_shape = iteration_shape; + tiled_iteration_shape.set_dimensions(dim, num_tiles); + std::vector params(1, keys_array); + params.insert(params.end(), values_arrays.begin(), values_arrays.end()); - int64 dimension_to_sort_bound = - keys_array.GetShape().dimensions(dimension_to_sort); - Shape compare_shape = ShapeUtil::MakeShape(keys_shape.element_type(), - {dimension_to_sort_bound}); auto compare_loop_body_emitter = - [&](const IrArray::Index& compare_index) -> Status { - keys_index[dimension_to_sort] = compare_index[0]; - compare_keys_index[dimension_to_sort] = - b->CreateXor(compare_index[0], xor_mask); - EmitCompareLoop(dimension_to_sort, keys_index, compare_keys_index, - keys_array, values_arrays, b); + [&](const IrArray::Index& tiles_index) -> Status { + // Naive C++ code for the inner compare loop: + // + // for (int64 i = 0; i < dimension_to_sort_bound; ++i) { + // int64 j = i ^ xor_mask; + // /* emitted in EmitCompareLoopBody() */ + // if (i < j && j < dimension_to_sort_bound) { + // int64 min_key = std::min(keys[i], keys[j]); + // keys[j] = std::max(keys[i], keys[j]); + // keys[i] = min_key; + // } + // } + // + // This follows the algorithm described on Wikipedia: + // https://en.wikipedia.org/wiki/Bitonic_sorter + IrArray::Index keys_index(tiles_index.GetType(), rank); + for (int64 i = 0; i < rank; ++i) { + keys_index[iteration_order_to_logical_order[i]] = tiles_index[i]; + } + if (xor_masks.size() > 1) { + EmitTiledCompareLoop(keys_index, dimension_to_sort, + dimension_to_sort_bound, keys_shape.element_type(), + xor_masks, params, tile_size, b); + } else { + auto read_element = [&](int64 operand, llvm::Value* index) { + keys_index[dimension_to_sort] = index; + return params[operand].EmitReadArrayElement(keys_index, b); + }; + auto write_element = [&](int64 operand, llvm::Value* index, + llvm::Value* value) { + keys_index[dimension_to_sort] = index; + params[operand].EmitWriteArrayElement(keys_index, 
value, b); + }; + auto current_keys_index = tiles_index[rank - 1]; + auto compare_keys_index = + b->CreateXor(current_keys_index, + tiles_index.GetConstantWithIndexType(xor_masks[0])); + EmitCompareLoopBody( + tiles_index.GetConstantWithIndexType(dimension_to_sort_bound), + keys_shape.element_type(), values_arrays.size(), current_keys_index, + compare_keys_index, read_element, write_element, b); + } return Status::OK(); }; - if (launch_dimensions != nullptr) { - TF_RETURN_IF_ERROR(gpu::ParallelLoopEmitter(compare_loop_body_emitter, - compare_shape, - *launch_dimensions, b) - .EmitLoop(name)); - } else { - TF_RETURN_IF_ERROR(LoopEmitter(compare_loop_body_emitter, compare_shape, b) - .EmitLoop(name)); - } - - // Set the IR builder insert point to the exit basic block of the outer most - // loop. This ensures later instructions are inserted after this loop nest. - b->SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock()); - - return Status::OK(); + return gpu::ParallelLoopEmitter( + compare_loop_body_emitter, + xor_masks.size() > 1 ? tiled_iteration_shape : iteration_shape, + launch_dimensions, b) + .EmitLoop(name); } } // namespace llvm_ir diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h index 2f3bcda2307..06f6b8de203 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h +++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h @@ -19,6 +19,7 @@ limitations under the License. #include #include "absl/strings/string_view.h" +#include "absl/types/span.h" #include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" @@ -33,9 +34,10 @@ namespace llvm_ir { // the inner compare loop will not be parallelized. 
Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array, const std::vector& values_arrays, - absl::string_view name, llvm::Value* xor_mask, - llvm::IRBuilder<>* b, - const gpu::LaunchDimensions* launch_dimensions); + absl::string_view name, + absl::Span xor_masks, llvm::IRBuilder<>* b, + const gpu::LaunchDimensions& launch_dimensions, + int64 tile_size); } // namespace llvm_ir } // namespace xla From bea0a012e079286f4c49e659f75c0a6f5f44b7a1 Mon Sep 17 00:00:00 2001 From: Anton Dmitriev Date: Tue, 6 Nov 2018 12:59:10 +0300 Subject: [PATCH 174/540] Fix style. --- tensorflow/c/c_api_internal.h | 2 +- tensorflow/java/src/main/native/server_jni.cc | 26 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h index 8df85cb503f..2b5b798ee41 100644 --- a/tensorflow/c/c_api_internal.h +++ b/tensorflow/c/c_api_internal.h @@ -25,8 +25,8 @@ limitations under the License. #include #ifndef __ANDROID__ -#include "tensorflow/core/framework/op_gen_lib.h" #include "tensorflow/core/distributed_runtime/server_lib.h" +#include "tensorflow/core/framework/op_gen_lib.h" #endif #include "tensorflow/core/common_runtime/shape_refiner.h" #include "tensorflow/core/framework/tensor.h" diff --git a/tensorflow/java/src/main/native/server_jni.cc b/tensorflow/java/src/main/native/server_jni.cc index 68a53ef87c1..51db74fefd2 100644 --- a/tensorflow/java/src/main/native/server_jni.cc +++ b/tensorflow/java/src/main/native/server_jni.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include "tensorflow/java/src/main/native/utils_jni.h" namespace { +#ifndef __ANDROID__ TF_Server* requireHandle(JNIEnv* env, jlong handle) { static_assert(sizeof(jlong) >= sizeof(TF_Server*), "Cannot package C object pointers as a Java long"); @@ -30,11 +31,17 @@ TF_Server* requireHandle(JNIEnv* env, jlong handle) { return reinterpret_cast(handle); } +#endif // __ANDROID__ } // namespace JNIEXPORT jlong JNICALL Java_org_tensorflow_Server_allocate( JNIEnv* env, jclass clazz, jbyteArray server_def) { +#ifdef __ANDROID__ + throwException(env, kUnsupportedOperationException, + "Server is not supported on Android"); + return 0; +#else TF_Status* status = TF_NewStatus(); jbyte* server_def_ptr = env->GetByteArrayElements(server_def, nullptr); @@ -49,11 +56,17 @@ JNIEXPORT jlong JNICALL Java_org_tensorflow_Server_allocate( TF_DeleteStatus(status); return ok ? reinterpret_cast(server) : 0; +#endif // __ANDROID__ } JNIEXPORT void JNICALL Java_org_tensorflow_Server_start(JNIEnv* env, jclass clazz, jlong handle) { +#ifdef __ANDROID__ + throwException(env, kUnsupportedOperationException, + "Server is not supported on Android"); + return 0; +#else TF_Server* server = requireHandle(env, handle); if (server == nullptr) return; @@ -63,11 +76,17 @@ JNIEXPORT void JNICALL Java_org_tensorflow_Server_start(JNIEnv* env, throwExceptionIfNotOK(env, status); TF_DeleteStatus(status); +#endif // __ANDROID__ } JNIEXPORT void JNICALL Java_org_tensorflow_Server_stop(JNIEnv* env, jclass clazz, jlong handle) { +#ifdef __ANDROID__ + throwException(env, kUnsupportedOperationException, + "Server is not supported on Android"); + return 0; +#else TF_Server* server = requireHandle(env, handle); if (server == nullptr) return; @@ -77,11 +96,17 @@ JNIEXPORT void JNICALL Java_org_tensorflow_Server_stop(JNIEnv* env, throwExceptionIfNotOK(env, status); TF_DeleteStatus(status); +#endif // __ANDROID__ } JNIEXPORT void JNICALL Java_org_tensorflow_Server_join(JNIEnv* env, jclass clazz, jlong handle) { +#ifdef 
__ANDROID__ + throwException(env, kUnsupportedOperationException, + "Server is not supported on Android"); + return 0; +#else TF_Server* server = requireHandle(env, handle); if (server == nullptr) return; @@ -91,6 +116,7 @@ JNIEXPORT void JNICALL Java_org_tensorflow_Server_join(JNIEnv* env, throwExceptionIfNotOK(env, status); TF_DeleteStatus(status); +#endif // __ANDROID__ } JNIEXPORT void JNICALL Java_org_tensorflow_Server_delete(JNIEnv* env, From 22419732f90a4b5667384b4169a59cf67d656f50 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Tue, 6 Nov 2018 04:29:59 -0800 Subject: [PATCH 175/540] A crude reference cycle detection mechanism. PiperOrigin-RevId: 220260494 --- tensorflow/python/framework/test_util.py | 118 ++++++++++++++++++++++- tensorflow/python/util/tf_inspect.py | 5 + 2 files changed, 120 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index 768ed36917f..6129d686e80 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -630,6 +630,109 @@ def assert_no_new_tensors(f): return decorator +def _find_reference_cycle(objects, idx): + + def get_ignore_reason(obj, blacklist): + """Tests whether an object should be omitted from the dependency graph.""" + if len(blacklist) > 100: + return "" + if tf_inspect.isframe(obj): + if "test_util.py" in tf_inspect.getframeinfo(obj)[0]: + return "" + for b in blacklist: + if b is obj: + return "" + if obj is blacklist: + return "" + return None + + # Note: this function is meant to help with diagnostics. Its output is purely + # a human readable representation, so you may freely modify it to suit your + # needs. + def describe(obj, blacklist, leaves_only=False): + """Returns a custom human-readable summary of obj. + + Args: + obj: the value to describe. + blacklist: same as blacklist in get_ignore_reason. + leaves_only: boolean flag used when calling describe recursively. 
Useful + for summarizing collections. + """ + if get_ignore_reason(obj, blacklist): + return "{}{}".format(get_ignore_reason(obj, blacklist), type(obj)) + if tf_inspect.isframe(obj): + return "frame: {}".format(tf_inspect.getframeinfo(obj)) + elif tf_inspect.ismodule(obj): + return "module: {}".format(obj.__name__) + else: + if leaves_only: + return "{}, {}".format(type(obj), id(obj)) + elif isinstance(obj, list): + return "list({}): {}".format( + id(obj), [describe(e, blacklist, leaves_only=True) for e in obj]) + elif isinstance(obj, tuple): + return "tuple({}): {}".format( + id(obj), [describe(e, blacklist, leaves_only=True) for e in obj]) + elif isinstance(obj, dict): + return "dict({}): {} keys".format(id(obj), len(obj.keys())) + elif tf_inspect.isfunction(obj): + return "function({}) {}; globals ID: {}".format( + id(obj), obj.__name__, id(obj.__globals__)) + else: + return "{}, {}".format(type(obj), id(obj)) + + def build_ref_graph(obj, graph, reprs, blacklist): + """Builds a reference graph as -> . + + Args: + obj: The object to start from. The graph will be built by recursively + adding its referrers. + graph: Dict holding the graph to be built. To avoid creating extra + references, the graph holds object IDs rather than actual objects. + reprs: Auxiliary structure that maps object IDs to their human-readable + description. + blacklist: List of objects to ignore. 
+ """ + referrers = gc.get_referrers(obj) + blacklist = blacklist + (referrers,) + + obj_id = id(obj) + for r in referrers: + if get_ignore_reason(r, blacklist) is None: + r_id = id(r) + if r_id not in graph: + graph[r_id] = [] + if obj_id not in graph[r_id]: + graph[r_id].append(obj_id) + build_ref_graph(r, graph, reprs, blacklist) + reprs[r_id] = describe(r, blacklist) + + def find_cycle(el, graph, reprs, path): + """Finds and prints a single cycle in the dependency graph.""" + if el not in graph: + return + for r in graph[el]: + if r in path: + logging.error("Reference cycle sample:") + for p in path + (r,): + logging.error(reprs.get(p, "unknown object " + str(p))) + return True + else: + if find_cycle(r, graph, reprs, path + (r,)): + return True + return False + + obj = objects[idx] + graph = {} # referrer ID -> object ID + reprs = {} # object ID -> description + build_ref_graph(obj, graph, reprs, (objects, graph, reprs, get_ignore_reason, + describe, build_ref_graph, find_cycle)) + for k in graph: + if find_cycle(k, graph, reprs, ()): + return True + return False + + def assert_no_garbage_created(f): """Test method decorator to assert that no garbage has been created. @@ -652,7 +755,8 @@ def assert_no_garbage_created(f): previous_garbage = len(gc.garbage) f(self, **kwargs) gc.collect() - if len(gc.garbage) > previous_garbage: + new_garbage = len(gc.garbage) + if new_garbage > previous_garbage: logging.error( "The decorated test created work for Python's garbage collector, " "likely due to a reference cycle. New objects in cycle(s):") @@ -676,11 +780,19 @@ def assert_no_garbage_created(f): logging.error(obj) logging.error(" Object __repr__:") logging.error(repr(obj)) - except Exception: + except Exception: # pylint: disable=broad-except logging.error("(Exception while printing object)") + + # When garbage is created, this call can help identify reference cycles, + # which are typically the cause of such garbage. 
+ if new_garbage > previous_garbage: + for i in range(previous_garbage, new_garbage): + if _find_reference_cycle(gc.garbage, i): + break + # This will fail if any garbage has been created, typically because of a # reference cycle. - self.assertEqual(previous_garbage, len(gc.garbage)) + self.assertEqual(previous_garbage, new_garbage) # TODO(allenl): Figure out why this debug flag reset doesn't work. It would # be nice to be able to decorate arbitrary tests in a large test suite and # not hold on to every object in other tests. diff --git a/tensorflow/python/util/tf_inspect.py b/tensorflow/python/util/tf_inspect.py index 444e44eaf14..5f1e776640d 100644 --- a/tensorflow/python/util/tf_inspect.py +++ b/tensorflow/python/util/tf_inspect.py @@ -352,6 +352,11 @@ def isfunction(object): # pylint: disable=redefined-builtin return _inspect.isfunction(tf_decorator.unwrap(object)[1]) +def isframe(object): # pylint: disable=redefined-builtin + """TFDecorator-aware replacement for inspect.ismodule.""" + return _inspect.isframe(tf_decorator.unwrap(object)[1]) + + def isgenerator(object): # pylint: disable=redefined-builtin """TFDecorator-aware replacement for inspect.isgenerator.""" return _inspect.isgenerator(tf_decorator.unwrap(object)[1]) From a9b647d2ea65c7bd77deb06b10f1e5e95a6092ef Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Tue, 6 Nov 2018 04:30:30 -0800 Subject: [PATCH 176/540] Add a mechanism by which a method can be bound to a weak reference of self. Use this mechanism in defun to convey that reference to AutoGraph. 
PiperOrigin-RevId: 220260542 --- .../python/autograph/pyct/inspect_utils.py | 6 ++++++ .../python/autograph/pyct/inspect_utils_test.py | 16 ++++++++++++++++ tensorflow/python/eager/function.py | 17 ++++++++++++++++- 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/autograph/pyct/inspect_utils.py b/tensorflow/python/autograph/pyct/inspect_utils.py index 88f2d7a0564..8fc0cb7c936 100644 --- a/tensorflow/python/autograph/pyct/inspect_utils.py +++ b/tensorflow/python/autograph/pyct/inspect_utils.py @@ -187,6 +187,12 @@ def getmethodclass(m): # Instance method and class methods: should be bound to a non-null "self". if hasattr(m, '__self__'): if m.__self__: + # A fallback allowing methods to be actually bound to a type different + # than __self__. This is useful when a strong reference from the method + # to the object is not desired, for example when caching is involved. + if hasattr(m.__self__, 'ag_self_weakref__'): + return m.__self__.ag_self_weakref__() + return m.__self__ # Class, static and unbound methods: search all defined classes in any diff --git a/tensorflow/python/autograph/pyct/inspect_utils_test.py b/tensorflow/python/autograph/pyct/inspect_utils_test.py index 51116b6cac7..e7a2622c1ab 100644 --- a/tensorflow/python/autograph/pyct/inspect_utils_test.py +++ b/tensorflow/python/autograph/pyct/inspect_utils_test.py @@ -20,6 +20,8 @@ from __future__ import print_function from functools import wraps import imp +import types +import weakref import six @@ -262,6 +264,20 @@ class InspectUtilsTest(test.TestCase): c = TestCallable() self.assertEqual(inspect_utils.getmethodclass(c), TestCallable) + def test_getmethodclass_weakref_mechanism(self): + test_obj = TestClass() + + class WeakrefWrapper(object): + + def __init__(self): + self.ag_self_weakref__ = weakref.ref(test_obj) + + def test_fn(self): + return self + + bound_method = types.MethodType(test_fn, WeakrefWrapper()) + self.assertEqual(inspect_utils.getmethodclass(bound_method), 
test_obj) + def test_getdefiningclass(self): class Superclass(object): diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index 08266a115b2..aa0763d115b 100644 --- a/tensorflow/python/eager/function.py +++ b/tensorflow/python/eager/function.py @@ -1528,19 +1528,34 @@ def defun_with_attributes(func=None, return decorated +# When a method is bound to objects of this type, it allows AutoGraph to +# recover a weak reference the original method's self pointer. This uses the +# mechanism from pyct.inspect_utils.getmethodclass. +# TODO(mdan): This is not pretty. Use a callable wrapper with __get__ instead. +class _WeakrefSelf(object): + + def __init__(self, target): + self.ag_self_weakref__ = target + + def class_method_to_instance_method(original_function, instance): """Constructs a new PolymorphicFunction with `self` bound.""" def make_partial_py_func(py_func, weak_instance): return lambda *args, **kwargs: py_func(weak_instance(), *args, **kwargs) weak_instance = weakref.ref(instance) + # Note: while we could bind to a weakref proxy instead, that causes the + # bound method to be unhashable. + bound_method = types_lib.MethodType(original_function.python_function, + _WeakrefSelf(weak_instance)) + # pylint: disable=protected-access # We make a dummy MethodType object to generate the correct bound method # signature. The actual call is to a function with a weak reference to # `instance`. instance_func = type(original_function)( tf_decorator.make_decorator( - types_lib.MethodType(original_function.python_function, False), + bound_method, make_partial_py_func(original_function.python_function, weak_instance)), name=original_function._name, From 1e7b4eadb0b1ff5fdd0ae8915d558b934d19a168 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Tue, 6 Nov 2018 06:41:45 -0800 Subject: [PATCH 177/540] Don't set invalid shardings in HloInstruction::SetupDerivedInstruction Copying the sharding from one instruction to an other can create invalid sharding information for tiled or tuple sharded instructions. We can just ignore the sharding for this case and rely on the domains to preserve the data instead. PiperOrigin-RevId: 220272535 --- tensorflow/compiler/xla/service/hlo_instruction.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index ada536770ed..264e217193e 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -1109,7 +1109,11 @@ void HloInstruction::set_single_sharding(const HloSharding& sharding) { void HloInstruction::SetupDerivedInstruction( HloInstruction* derived_instruction) const { - if (sharding_ != nullptr) { + if (sharding_ != nullptr && ShapeUtil::CompatibleIgnoringElementType( + shape_, derived_instruction->shape())) { + // Only copy sharding if the shape of the two instruction is compatible + // because copying it between differently shaped instructions can produce + // invalid shardings. derived_instruction->set_sharding(*sharding_); } else { derived_instruction->clear_sharding(); @@ -2639,9 +2643,7 @@ Status HloInstruction::Accept( return this->Accept(&visitor); } -const Shape& HloInstruction::shape() const { - return shape_; -} +const Shape& HloInstruction::shape() const { return shape_; } std::vector HloInstruction::OperandIndices( const HloInstruction* operand) const { From 366dd0eabccbf5bf968c16cb985e79efffdf098a Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Tue, 6 Nov 2018 07:36:30 -0800 Subject: [PATCH 178/540] Add a more efficient constructor for scalar `Tensor` objects in host memory. 
[tf.data] Optimize the creation of tensors in `tf.data.Dataset.range()`. This change improves the range benchmark from 148.7 ns/element to 122.4 ns/element. PiperOrigin-RevId: 220279090 --- tensorflow/core/framework/tensor.cc | 7 ++ tensorflow/core/framework/tensor.h | 111 ++++++++++++++++++ tensorflow/core/framework/tensor_test.cc | 36 ++++++ .../core/kernels/data/range_dataset_op.cc | 5 +- 4 files changed, 156 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc index 1dea6da9113..c7ddc6c21ed 100644 --- a/tensorflow/core/framework/tensor.cc +++ b/tensorflow/core/framework/tensor.cc @@ -752,6 +752,13 @@ Tensor::Tensor(Allocator* a, DataType type, const TensorShape& shape, Tensor::Tensor(DataType type, const TensorShape& shape) : Tensor(cpu_allocator(), type, shape) {} +void Tensor::HostScalarTensorBufferBase::FillAllocationDescription( + AllocationDescription* proto) const { + proto->set_requested_bytes(size()); + proto->set_allocator_name("HostScalarTensorBuffer"); + proto->set_ptr(reinterpret_cast(data())); +} + template class SubBuffer : public TensorBuffer { public: diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h index d0f9eb56e23..0d58ab3875a 100644 --- a/tensorflow/core/framework/tensor.h +++ b/tensorflow/core/framework/tensor.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_CORE_FRAMEWORK_TENSOR_H_ #define TENSORFLOW_CORE_FRAMEWORK_TENSOR_H_ +#include #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -110,6 +111,76 @@ class Tensor { /// for details. explicit Tensor(DataType type); + private: + // A tag type for selecting the `Tensor` constructor overload that creates a + // scalar tensor in host memory. 
+ struct host_scalar_tag {}; + + class HostScalarTensorBufferBase; + template + class HostScalarTensorBuffer; + + // Creates a tensor with the given scalar `value` in CPU memory. + template + Tensor(T value, host_scalar_tag tag); + + public: + // A series of specialized constructors for scalar tensors in host memory. + // + // NOTE: The `Variant` host-scalar constructor is not defined, because Variant + // is implicitly constructible from many different types, and this causes + // ambiguities with some compilers. + explicit Tensor(float scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(double scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(int32 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(uint32 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(uint16 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(uint8 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(int16 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(int8 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(string scalar_value) + : Tensor(std::move(scalar_value), host_scalar_tag{}) {} + explicit Tensor(complex64 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(complex128 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(int64 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(uint64 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(bool scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(qint8 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(quint8 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(qint16 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit 
Tensor(quint16 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(qint32 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(bfloat16 scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(Eigen::half scalar_value) + : Tensor(scalar_value, host_scalar_tag{}) {} + explicit Tensor(ResourceHandle scalar_value) + : Tensor(std::move(scalar_value), host_scalar_tag{}) {} + + // NOTE: The `const char*` host-scalar constructor is provided as a + // convenience because otherwise passing a string literal would surprisingly + // construct a DT_BOOL tensor. + explicit Tensor(const char* scalar_value) + : Tensor(string(scalar_value), host_scalar_tag{}) {} + /// Copy constructor. Tensor(const Tensor& other); @@ -799,6 +870,46 @@ inline Tensor::Tensor(Tensor&& other) other.buf_ = nullptr; } +class Tensor::HostScalarTensorBufferBase : public TensorBuffer { + public: + void FillAllocationDescription(AllocationDescription* proto) const final; +}; + +// `Tensor::HostScalarTensorBuffer` is a specialized `TensorBuffer` +// implementation for storing a single scalar value. +// +// TODO(mrry): Evaluate other compilers or approaches to aligning the value +// so that it can be used directly as a tensor value. For example, in a C++17 +// future, we could use `alignas(EIGEN_MAX_ALIGN_BYTES)` to store the value +// inline in this object to save an allocation. However, this is not currently +// widely supported in our compilers. 
+template +class Tensor::HostScalarTensorBuffer : public HostScalarTensorBufferBase { + public: + HostScalarTensorBuffer(T&& value) + : data_(reinterpret_cast(cpu_allocator()->AllocateRaw( + EIGEN_MAX_ALIGN_BYTES, sizeof(value)))) { + if (is_simple_type::value) { + *data_ = value; + } else { + new (data_) T(std::move(value)); + } + } + ~HostScalarTensorBuffer() { cpu_allocator()->Deallocate(data_, 1); } + void* data() const final { return const_cast(data_); } + size_t size() const final { return sizeof(*data_); } + TensorBuffer* root_buffer() final { return this; } + + private: + T* const data_; +}; + +template +Tensor::Tensor(T value, host_scalar_tag tag) + : buf_(new HostScalarTensorBuffer(std::move(value))) { + set_dtype(DataTypeToEnum::value); +} + inline Tensor& Tensor::operator=(Tensor&& other) { // Avoid self-assignment, since we might destroy our underlying buffer. if (&other != this) { diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc index c5966041435..925ebc49454 100644 --- a/tensorflow/core/framework/tensor_test.cc +++ b/tensorflow/core/framework/tensor_test.cc @@ -830,6 +830,42 @@ TEST(Tensor_Scalar, Basics) { } } +TEST(Tensor_HostScalar, Basics) { + { + Tensor t(true); + EXPECT_EQ(DT_BOOL, t.dtype()); + EXPECT_EQ(1, t.NumElements()); + auto Tt = t.scalar(); + EXPECT_EQ(1, Tt.size()); + EXPECT_EQ(0, Tt.rank()); + EXPECT_TRUE(Tt()); + Tt() = false; + EXPECT_FALSE(Tt()); + } + { + Tensor t(123.45f); + EXPECT_EQ(DT_FLOAT, t.dtype()); + EXPECT_EQ(1, t.NumElements()); + auto Tt = t.scalar(); + EXPECT_EQ(1, Tt.size()); + EXPECT_EQ(0, Tt.rank()); + EXPECT_FLOAT_EQ(123.45f, Tt()); + Tt() = 42.0f; + EXPECT_FLOAT_EQ(42.0f, Tt()); + } + { + Tensor t("foo"); + EXPECT_EQ(DT_STRING, t.dtype()); + EXPECT_EQ(1, t.NumElements()); + auto Tt = t.scalar(); + EXPECT_EQ(1, Tt.size()); + EXPECT_EQ(0, Tt.rank()); + EXPECT_EQ("foo", Tt()); + Tt() = "bar"; + EXPECT_EQ("bar", Tt()); + } +} + TEST(Tensor_Float, 
Reshape_And_Slice_Assignment) { // A test to experiment with a way to assign to a subset of a tensor Tensor t(DT_FLOAT, TensorShape({10, 4, 3, 2})); diff --git a/tensorflow/core/kernels/data/range_dataset_op.cc b/tensorflow/core/kernels/data/range_dataset_op.cc index 1ad5b007751..207e957e374 100644 --- a/tensorflow/core/kernels/data/range_dataset_op.cc +++ b/tensorflow/core/kernels/data/range_dataset_op.cc @@ -104,9 +104,8 @@ class RangeDatasetOp : public DatasetOpKernel { *end_of_sequence = true; return Status::OK(); } - out_tensors->emplace_back(ctx->allocator({}), DT_INT64, - TensorShape({})); - out_tensors->back().scalar()() = next_; + out_tensors->reserve(1); + out_tensors->emplace_back(next_); *end_of_sequence = false; next_ += dataset()->step_; From 3a3248fbbf9e9851196aa4ac19911582b2635636 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Tue, 6 Nov 2018 07:44:16 -0800 Subject: [PATCH 179/540] Remove converted function self reference from its globals to avoid circular dependencies. Override the self argument with the results of inspect_utils.getmethodclass for consistency. Clean up the module cleanup mechanism. PiperOrigin-RevId: 220280056 --- tensorflow/python/autograph/impl/api.py | 43 ++++++++++++++++--------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/autograph/impl/api.py b/tensorflow/python/autograph/impl/api.py index 123d2897390..7c2231cb56f 100644 --- a/tensorflow/python/autograph/impl/api.py +++ b/tensorflow/python/autograph/impl/api.py @@ -192,6 +192,7 @@ def converted_call(f, owner, options, *args, **kwargs): arg_map_target = f f_class = inspect_utils.getmethodclass(f) + # TODO(mdan): This may be more elegantly handled using __get__? if f_class is not None: # If this is a method call, it may or may not include self. 
# @@ -204,7 +205,10 @@ def converted_call(f, owner, options, *args, **kwargs): if owner is not None and (not args or args[0] is not owner): effective_args = (owner,) + args else: - effective_args = args + # Always override the self arg, because it might be different from + # what the method was bound to - see inspect_utils.getmethodclass. + assert args, 'Bound function call without self argument?' + effective_args = (f_class,) + args[1:] partial_types = (f_class,) else: effective_args = args @@ -255,28 +259,30 @@ def converted_call(f, owner, options, *args, **kwargs): optional_features=options.optional_features) result = converted_f(*effective_args, **kwargs) - # When converting a function, we write a tmp file and import it as a module. - # This leaks the module's closure. Once we've executed the converted_f module - # and there is no more code left to be executed, we can clean up the module. - # TODO(mdan): Look into workarounds that don't suffer from refcount leaks. - # Possibly attach the closure as a regular closure cell, instead of relying on - # module globals. - - # If there are callables in the result, they will fail to find their closure - # when called, so only delete module if all returned types are not callable. - flat_results = nest.flatten(result) - if all(map(_is_not_callable, flat_results)): + # The converted function's closure is simply inserted into the function's + # module __dict__. Since modules are permanently cached, that results in + # leaking the entire closure. + # Normally, it's not safe to delete the module because that may release said + # closure as well. However, in the case of converted_call we are certain the + # function will not be executed again, so the closure should no longer be + # needed so long as the function doesn't return any executable code. + # TODO(mdan): Attach the closure properly, using cells. 
+ if all(map(_is_not_callable, nest.flatten(result))): del sys.modules[converted_f.__module__] return result def _is_not_callable(obj): - # TODO(brianklee): What happens if obj is a tensor wrapping a py_func? - return (isinstance(obj, - (int, float, complex, str, bool, np.ndarray, np.generic)) - or tensor_util.is_tensor(obj)) + # TODO(brianklee): Handle case when obj is a tensor dependent on a py_func. + if isinstance(obj, (int, float, complex, str, bool)): + return True + if isinstance(obj, (np.ndarray, np.generic)): + return True + if tensor_util.is_tensor(obj): + return True + return False # TODO(mdan): Rename: to_ops? @@ -357,6 +363,11 @@ def to_graph(e, if tf_inspect.isfunction(e): compiled.__defaults__ = e.__defaults__ + if hasattr(compiled, '__globals__'): + # Remove self to avoid circular references. This will probably only work + # so long as the function is not reentrant. + del compiled.__globals__[name] + # Need this so the source_mapping attribute is available for the context # manager to access for runtime errors. 
# From a5993cc89138a79a8d776ce2a4403d3e788df7ca Mon Sep 17 00:00:00 2001 From: Rachel Lim Date: Tue, 6 Nov 2018 09:07:52 -0800 Subject: [PATCH 180/540] [tf.data vectorization] minor refactoring PiperOrigin-RevId: 220292732 --- .../optimizers/data/vectorization/BUILD | 1 + .../data/vectorization/cwise_op_vectorizer.cc | 99 +++++++++---------- .../vectorization/decode_csv_vectorizer.cc | 27 ++--- .../parse_single_example_vectorizer.cc | 30 +++--- .../data/vectorization/reshape_vectorizer.cc | 19 ++-- .../vectorization/transpose_vectorizer.cc | 18 ++-- .../data/vectorization/unpack_vectorizer.cc | 28 +++--- .../data/vectorization/vectorizer.h | 71 ++++++++++++- .../vectorization/vectorizer_registry_test.cc | 4 +- 9 files changed, 165 insertions(+), 132 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/BUILD b/tensorflow/core/grappler/optimizers/data/vectorization/BUILD index 5175f6af7a2..541302361fb 100644 --- a/tensorflow/core/grappler/optimizers/data/vectorization/BUILD +++ b/tensorflow/core/grappler/optimizers/data/vectorization/BUILD @@ -33,6 +33,7 @@ cc_library( deps = [ ":wrapped_tensor", "//tensorflow/core:core_cpu", + "//tensorflow/cc:ops", "//tensorflow/core:lib", ] + tf_protos_all(), ) diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/cwise_op_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/cwise_op_vectorizer.cc index 709882e45ae..9d853f84a8a 100644 --- a/tensorflow/core/grappler/optimizers/data/vectorization/cwise_op_vectorizer.cc +++ b/tensorflow/core/grappler/optimizers/data/vectorization/cwise_op_vectorizer.cc @@ -43,87 +43,78 @@ const char* const kExpandDimsPrefix = "vectorized/expanddims/"; // with shape [n, 12, 7, 5]: we need to manually expand the dimensions of A // *after* the leading dimension, i.e. expand A to the shape [n, 1, 1, 5] before // broadcasting. 
-Status ExpandDimsForBroadcast(std::vector* inputs, Graph* g) { +Status ExpandDimsForBroadcast(VectorizerInput* inputs, Graph* g) { Status status; Scope parent = NewInternalScope(g, &status, nullptr); - Scope s = parent.NewSubScope(kExpandDimsPrefix); + Scope scope = parent.NewSubScope(kExpandDimsPrefix); // TODO(rachelim): We can potentially get rid of all these ops if shapes are // known statically - Output const_0 = ops::Const(s, 0); - Output const_1 = ops::Const(s, 1); - - std::vector ranks; - ranks.reserve(inputs->size()); - // Get the stacked rank of each input - for (const auto& input : *inputs) { - Output rank = ops::Rank(s, Output(input.node, input.output_index)); + auto get_stacked_rank = [&scope](const WrappedTensor& input) { + Output rank = ops::Rank(scope, Output(input.node, input.output_index)); if (!input.stacked) { // If the input is unstacked, add 1 - rank = ops::Add(s, rank, const_1); + rank = ops::Add(scope, rank, ops::Const(scope, 1)); } - ranks.push_back(rank); - } + return rank; + }; - // Pack the ranks into one tensor to get the max - Output packed_ranks = ops::Stack(s, ranks); + Output rank_0 = get_stacked_rank(inputs->at(0)); + Output rank_1 = get_stacked_rank(inputs->at(1)); - Output max_rank = - ops::Max(s, packed_ranks, const_0, ops::Max::Attrs().KeepDims(true)); - - std::vector expanded_inputs; - expanded_inputs.reserve(inputs->size()); + Output max_rank = ops::Maximum(scope, rank_0, rank_1); // For all inputs that are stacked, expand dimensions after dim 0. 
- for (size_t i = 0; i < inputs->size(); ++i) { - if (!inputs->at(i).stacked) { - expanded_inputs.push_back(inputs->at(i)); - continue; - } + auto expand_dims_if_unstacked = + [&scope, &max_rank](const WrappedTensor& tensor, const Output& rank) { + if (!tensor.stacked) + return WrappedTensor(tensor.node, tensor.output_index, false); - Output input(inputs->at(i).node, inputs->at(i).output_index); + Output input(tensor.node, tensor.output_index); - // Number of dimensions to expand - Output rank_diff = ops::Sub(s, max_rank, ranks[i]); + Output rank_diff = ops::Sub(scope, max_rank, rank); - // [1] * rank_diff - Output ones = ops::Tile(s, ops::Const(s, {1}), rank_diff); + // [1] * rank_diff + Output ones = ops::Fill( + scope, ops::ExpandDims(scope, rank_diff, ops::Const(scope, 0)), + ops::Const(scope, 1)); - Output const_vec_1 = ops::Const(s, {1}); + Output shape = ops::Shape(scope, input); - Output shape = ops::Shape(s, input); + Output const_vec_1 = ops::Const(scope, {1}); + // shape[:1] + Output concat_pre = ops::StridedSlice( + scope, shape, const_vec_1, const_vec_1, const_vec_1, + ops::StridedSlice::Attrs().BeginMask(1)); - // shape[:1] - Output concat_pre = - ops::StridedSlice(s, shape, const_vec_1, const_vec_1, const_vec_1, - ops::StridedSlice::Attrs().BeginMask(1)); + // shape[1:] + Output concat_post = ops::StridedSlice( + scope, shape, const_vec_1, const_vec_1, const_vec_1, + ops::StridedSlice::Attrs().EndMask(1)); - // shape[1:] - Output concat_post = - ops::StridedSlice(s, shape, const_vec_1, const_vec_1, const_vec_1, - ops::StridedSlice::Attrs().EndMask(1)); + // tf.concat([shape[:1], ones, shape[1:]], 0) + Output new_shape = ops::Concat(scope, {concat_pre, ones, concat_post}, + ops::Const(scope, 0)); - // tf.concat([shape[:1], ones, shape[1:]], 0) - Output new_shape = ops::Concat(s, {concat_pre, ones, concat_post}, const_0); + Output reshaped = ops::Reshape(scope, input, new_shape); - Output result = ops::Reshape(s, input, new_shape); + return 
WrappedTensor(reshaped.node(), 0, true); + }; - expanded_inputs.push_back({result.node(), 0, true}); - } - - inputs->swap(expanded_inputs); - return status; + *inputs = VectorizerInput({expand_dims_if_unstacked(inputs->at(0), rank_0), + expand_dims_if_unstacked(inputs->at(1), rank_1)}); + return Status::OK(); } // Vectorization helper for component-wise ops. Since these operations act // component-wise, the vectorized op is the same as the original. Status CwiseVectorizeHelper(const Node& node, Graph* outer_scope, - std::vector&& inputs, - std::vector* outputs) { + VectorizerInput&& inputs, + VectorizerOutput* outputs) { // Add new node with the same op type and attrs as the original node Node* new_node; auto node_builder = NodeBuilder(strings::StrCat("vectorized/", node.name()), @@ -144,8 +135,8 @@ Status CwiseVectorizeHelper(const Node& node, Graph* outer_scope, class UnaryCwiseOpVectorizer : public Vectorizer { public: Status Vectorize(const Node& node, Graph* outer_scope, - std::vector&& inputs, - std::vector* outputs) override { + VectorizerInput&& inputs, + VectorizerOutput* outputs) override { if (inputs.size() != 1) { return errors::Internal("Failed to vectorize ", node.type_string(), ". The op should have 1 input, but has ", @@ -159,8 +150,8 @@ class UnaryCwiseOpVectorizer : public Vectorizer { class BinaryCwiseOpVectorizer : public Vectorizer { public: Status Vectorize(const Node& node, Graph* outer_scope, - std::vector&& inputs, - std::vector* outputs) override { + VectorizerInput&& inputs, + VectorizerOutput* outputs) override { if (inputs.size() != 2) { return errors::Internal("Failed to vectorize ", node.type_string(), ". 
The op should have 2 input, but has ", diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/decode_csv_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/decode_csv_vectorizer.cc index c4460387bbf..76c00477476 100644 --- a/tensorflow/core/grappler/optimizers/data/vectorization/decode_csv_vectorizer.cc +++ b/tensorflow/core/grappler/optimizers/data/vectorization/decode_csv_vectorizer.cc @@ -25,30 +25,21 @@ namespace { class DecodeCSVVectorizer : public Vectorizer { public: Status Vectorize(const Node& node, Graph* outer_scope, - std::vector&& inputs, - std::vector* outputs) override { - if (!inputs[0].stacked) { - return errors::InvalidArgument("Expecting input 0 to be stacked."); - } - for (size_t i = 1; i < inputs.size(); ++i) { - if (inputs[i].stacked) { - // Record defaults should not be stacked - return errors::InvalidArgument("Expecting input ", i, - "to be unstacked."); - } - } + VectorizerInput&& inputs, + VectorizerOutput* outputs) override { + NodeBuilder::NodeOut records; + TF_RETURN_IF_ERROR(inputs.stacked(0, &records)); std::vector defaults; - defaults.reserve(inputs.size() - 1); + defaults.resize(inputs.size() - 1); for (size_t i = 1; i < inputs.size(); ++i) { - defaults.emplace_back(inputs[i].node, inputs[i].output_index); + TF_RETURN_IF_ERROR(inputs.unstacked(i, &defaults[i - 1])); } Node* new_node; - auto node_builder = - NodeBuilder(node.type_string(), node.type_string()) - .Input(inputs[0].node, inputs[0].output_index) // records; - .Input(defaults); // defaults + auto node_builder = NodeBuilder(node.type_string(), node.type_string()) + .Input(records) + .Input(defaults); for (const auto& attr : node.attrs()) { node_builder = node_builder.Attr(attr.first, attr.second); diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/parse_single_example_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/parse_single_example_vectorizer.cc index 7d0edfb386d..f81b2d01d99 100644 --- 
a/tensorflow/core/grappler/optimizers/data/vectorization/parse_single_example_vectorizer.cc +++ b/tensorflow/core/grappler/optimizers/data/vectorization/parse_single_example_vectorizer.cc @@ -27,23 +27,15 @@ namespace { class ParseSingleExampleVectorizer : public Vectorizer { public: Status Vectorize(const Node& node, Graph* outer_scope, - std::vector&& inputs, - std::vector* outputs) override { - if (!inputs[0].stacked) { - return errors::InvalidArgument("Expecting input 0 to be stacked."); - } - for (size_t i = 1; i < inputs.size(); ++i) { - if (inputs[i].stacked) { - // Dense defaults should not be stacked - return errors::InvalidArgument("Expecting input ", i, - "to be unstacked."); - } - } + VectorizerInput&& inputs, + VectorizerOutput* outputs) override { + NodeBuilder::NodeOut serialized; + TF_RETURN_IF_ERROR(inputs.stacked(0, &serialized)); std::vector dense_defaults; - dense_defaults.reserve(inputs.size() - 1); + dense_defaults.resize(inputs.size() - 1); for (size_t i = 1; i < inputs.size(); ++i) { - dense_defaults.emplace_back(inputs[i].node, inputs[i].output_index); + TF_RETURN_IF_ERROR(inputs.unstacked(i, &dense_defaults[i - 1])); } Status scope_status; @@ -79,11 +71,11 @@ class ParseSingleExampleVectorizer : public Vectorizer { Node* new_node; auto node_builder = NodeBuilder(strings::StrCat("vectorized/", node.name()), "ParseExample") - .Input(inputs[0].node, inputs[0].output_index) // serialized - .Input(names) // names - .Input(sparse_keys) // sparse_keys - .Input(dense_keys) // dense_keys - .Input(dense_defaults); // dense_defaults + .Input(serialized) + .Input(names) + .Input(sparse_keys) + .Input(dense_keys) + .Input(dense_defaults); for (const auto& attr : {"sparse_types", "dense_shapes"}) { // Copy attrs if they exist diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/reshape_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/reshape_vectorizer.cc index dfb855ffa51..a094bfd1de4 100644 --- 
a/tensorflow/core/grappler/optimizers/data/vectorization/reshape_vectorizer.cc +++ b/tensorflow/core/grappler/optimizers/data/vectorization/reshape_vectorizer.cc @@ -47,23 +47,18 @@ Output GetVectorizedShape(Scope* s, Output tensor, Output original_shape) { class ReshapeVectorizer : public Vectorizer { public: Status Vectorize(const Node& node, Graph* outer_scope, - std::vector&& inputs, - std::vector* outputs) override { - if (!inputs[0].stacked || inputs[1].stacked) { - return errors::InvalidArgument( - "Expecting input 0 (`tensor`) to be stacked and input 1 (`shape`) to " - "be unstacked."); - } - + VectorizerInput&& inputs, + VectorizerOutput* outputs) override { Status status; Scope parent = NewInternalScope(outer_scope, &status, nullptr); Scope s = parent.NewSubScope(kReshapePrefix); - Output tensor = {inputs[0].node, inputs[0].output_index}; + Output tensor, shape; + TF_RETURN_IF_ERROR(inputs.stacked(0, &tensor)); + TF_RETURN_IF_ERROR(inputs.unstacked(1, &shape)); + Output vectorized_reshape = - ops::Reshape(s, tensor, - GetVectorizedShape( - &s, tensor, {inputs[1].node, inputs[1].output_index})); + ops::Reshape(s, tensor, GetVectorizedShape(&s, tensor, shape)); TF_RETURN_IF_ERROR(status); diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/transpose_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/transpose_vectorizer.cc index 4c286d9c4a9..45ad72bb7af 100644 --- a/tensorflow/core/grappler/optimizers/data/vectorization/transpose_vectorizer.cc +++ b/tensorflow/core/grappler/optimizers/data/vectorization/transpose_vectorizer.cc @@ -41,20 +41,18 @@ constexpr char kTransposePrefix[] = "vectorized/transpose"; class TransposeVectorizer : public Vectorizer { public: Status Vectorize(const Node& node, Graph* outer_scope, - std::vector&& inputs, - std::vector* outputs) override { - if (!inputs[0].stacked || inputs[1].stacked) { - return errors::InvalidArgument( - "Expecting input 0 (`x`) to be stacked and input 1 (`perm`) to " - 
"be unstacked."); - } - + VectorizerInput&& inputs, + VectorizerOutput* outputs) override { Status status; Scope parent = NewInternalScope(outer_scope, &status, /*refiner=*/nullptr); Scope scope = parent.NewSubScope(kTransposePrefix); - Output tensor = {inputs[0].node, inputs[0].output_index}; - Output original_perm = {inputs[1].node, inputs[1].output_index}; + Output tensor, original_perm; + TF_RETURN_IF_ERROR(inputs.stacked(0, &tensor)); + TF_RETURN_IF_ERROR(inputs.unstacked(1, &original_perm)); + if (original_perm.type() != DT_INT32) { + original_perm = ops::Cast(scope, original_perm, DT_INT32); + } // The vectorized permutation is the original permutation with an additional // leading 0 and all other values incremented by 1. diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/unpack_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/unpack_vectorizer.cc index 13b8500eda6..6e00c0cb051 100644 --- a/tensorflow/core/grappler/optimizers/data/vectorization/unpack_vectorizer.cc +++ b/tensorflow/core/grappler/optimizers/data/vectorization/unpack_vectorizer.cc @@ -24,16 +24,10 @@ namespace { class UnpackVectorizer : public Vectorizer { public: Status Vectorize(const Node& node, Graph* outer_scope, - std::vector&& inputs, - std::vector* outputs) override { - Status s; - if (node.num_inputs() != 1 || inputs.size() != 1) { - return errors::Internal("Unpack op should only have one input."); - } - - // Add new Unpack node with the same op and attrs as the original node - auto new_unpack_node = outer_scope->AddNode(node.def(), &s); - TF_RETURN_IF_ERROR(s); + VectorizerInput&& inputs, + VectorizerOutput* outputs) override { + NodeBuilder::NodeOut value; + TF_RETURN_IF_ERROR(inputs.stacked(0, &value)); int axis = 0; if (HasNodeAttr(node.def(), "axis")) { @@ -46,17 +40,21 @@ class UnpackVectorizer : public Vectorizer { // Note: negative axis values wrap around. 
axis += 1; } - new_unpack_node->AddAttr("axis", axis); - - outer_scope->AddEdge(inputs[0].node, inputs[0].output_index, - new_unpack_node, 0); int num; TF_RETURN_IF_ERROR(GetNodeAttr(node.attrs(), "num", &num)); + Node* new_node; + TF_RETURN_IF_ERROR(NodeBuilder(strings::StrCat("vectorized/", node.name()), + node.type_string()) + .Input(value) + .Attr("axis", axis) + .Attr("num", num) + .Finalize(outer_scope, &new_node)); + // Add the output mappings for (int i = 0; i < num; ++i) { - outputs->push_back({new_unpack_node, i, true}); + outputs->push_back({new_node, i, true}); } return Status::OK(); diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer.h b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer.h index 8d4676aae07..7c9905f89ad 100644 --- a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer.h +++ b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer.h @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/grappler/optimizers/data/vectorization/wrapped_tensor.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" @@ -25,6 +26,72 @@ limitations under the License. namespace tensorflow { namespace grappler { +// Represents the outputs of a vectorized op. Currently, a simple type alias +// provided for symmetry with `VectorizerInput`. +using VectorizerOutput = std::vector; + +// Represents the inputs of a vectorized op. Supports iteration, random access, +// and retrieval of stacked and unstacked tensor inputs. +class VectorizerInput { + public: + VectorizerInput(std::vector&& inputs) + : inputs_(std::move(inputs)) {} + + // Gets the stacked tensor input at position index. Returns an error if + // the tensor at index is unstacked. The type T must have a (Node*, int) + // constructor. 
+ template + Status stacked(int index, T* result) const { + DCHECK_GE(index, 0); + DCHECK_LT(index, size()); + + if (!inputs_[index].stacked) { + return errors::InvalidArgument("Expecting input ", index, + " to be stacked."); + } + *result = {inputs_[index].node, inputs_[index].output_index}; + return Status::OK(); + } + + // Gets the unstacked tensor input at position index. Returns an error if + // the tensor at index is stacked. The type T must have a (Node*, int) + // constructor. + template + Status unstacked(int index, T* result) const { + DCHECK_GE(index, 0); + DCHECK_LT(index, size()); + + if (inputs_[index].stacked) { + return errors::InvalidArgument("Expecting input ", index, + " to be unstacked."); + } + *result = {inputs_[index].node, inputs_[index].output_index}; + return Status::OK(); + } + + // Returns a const reference to the element at specified location index. + const WrappedTensor& at(int index) const { + DCHECK_GE(index, 0); + DCHECK_LT(index, size()); + return inputs_.at(index); + } + + // Returns a const iterator pointing to the first wrapped tensor input. + std::vector::const_iterator begin() const { + return inputs_.begin(); + } + // Returns a const iterator pointing to the past-the-end wrapped tensor input. + std::vector::const_iterator end() const { + return inputs_.end(); + } + + // Returns the number of input tensors. + size_t size() const { return inputs_.size(); } + + private: + std::vector inputs_; +}; + // Interface for vectorization of TensorFlow operations. See `CastVectorizer` // for an example. class Vectorizer { @@ -40,8 +107,8 @@ class Vectorizer { // value in `outputs` corresponds to the i'th output port of the node // to be converted. 
virtual Status Vectorize(const Node& node, Graph* outer_scope, - std::vector&& inputs, - std::vector* outputs) = 0; + VectorizerInput&& inputs, + VectorizerOutput* outputs) = 0; }; } // namespace grappler diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc index 054aeb9a8ff..0eee91f241a 100644 --- a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc +++ b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc @@ -24,8 +24,8 @@ namespace grappler { class TestVectorizer : public Vectorizer { public: Status Vectorize(const Node& node, Graph* outer_scope, - std::vector&& inputs, - std::vector* outputs) override { + VectorizerInput&& inputs, + VectorizerOutput* outputs) override { return Status::OK(); } }; From e6d171a2afcf0b7a1f77f125751727232480edbe Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 6 Nov 2018 09:09:04 -0800 Subject: [PATCH 181/540] Deprecate GatherParams input_rank PiperOrigin-RevId: 220292905 --- tensorflow/lite/kernels/gather.cc | 2 -- .../lite/kernels/internal/reference/legacy_reference_ops.h | 1 - tensorflow/lite/kernels/internal/types.h | 1 - 3 files changed, 4 deletions(-) diff --git a/tensorflow/lite/kernels/gather.cc b/tensorflow/lite/kernels/gather.cc index 195a6d2b81b..eb622fba12b 100644 --- a/tensorflow/lite/kernels/gather.cc +++ b/tensorflow/lite/kernels/gather.cc @@ -88,11 +88,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input = GetInput(context, node, kInputTensor); const TfLiteTensor* positions = GetInput(context, node, kInputPositions); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - const int input_rank = NumDimensions(input); #define TF_LITE_GATHER(data_type, index_type) \ { \ tflite::GatherParams op_params; \ - op_params.input_rank = input_rank; \ op_params.axis = 
params->axis; \ optimized_ops::Gather( \ op_params, GetTensorShape(input), GetTensorData(input), \ diff --git a/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h b/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h index c92f28c79ef..380fc8f98eb 100644 --- a/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h +++ b/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h @@ -802,7 +802,6 @@ inline void Gather(const T* input_data, const Dims<4>& input_dims, const Dims<4>& coords_dims, T* output_data, const Dims<4>& output_dims) { tflite::GatherParams op_params; - op_params.input_rank = input_rank; op_params.axis = 4 - input_rank; Gather(op_params, DimsToShape(input_dims), input_data, diff --git a/tensorflow/lite/kernels/internal/types.h b/tensorflow/lite/kernels/internal/types.h index 04b95ddc63d..a05bd5e0033 100644 --- a/tensorflow/lite/kernels/internal/types.h +++ b/tensorflow/lite/kernels/internal/types.h @@ -852,7 +852,6 @@ struct FullyConnectedParams { }; struct GatherParams { - int16 input_rank; int16 axis; }; From c9b4689bc4d4024aa16b7d6cfc1c65fa1ed8486e Mon Sep 17 00:00:00 2001 From: gehring Date: Tue, 6 Nov 2018 12:32:01 -0500 Subject: [PATCH 182/540] Removed no longer supported call to in_eager_execution changes to model_analyser.analyse(...): - Swapped context.in_eager_execution() to the currently supported context.executing_eagerly(). - Added negation to eager check. In all likelihood, the negation was always supposed to be there since getting default graph in eager mode does not make sense. The current `if` condition is likely a bug. The proposed fix is consistent with other functions in this module, e.g., `profile(...)`, line 339. 
--- tensorflow/python/profiler/model_analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/profiler/model_analyzer.py b/tensorflow/python/profiler/model_analyzer.py index acf02096fff..5f19eac0436 100644 --- a/tensorflow/python/profiler/model_analyzer.py +++ b/tensorflow/python/profiler/model_analyzer.py @@ -398,7 +398,7 @@ def advise(graph=None, run_meta=None, options=_DEFAULT_ADVISE_OPTIONS): Returns: Returns AdviceProto proto """ - if not graph and context.in_eager_execution(): + if not graph and not context.executing_eagerly(): graph = ops.get_default_graph() if options == _DEFAULT_ADVISE_OPTIONS: From 012f0771a45de216b2f2d2dd85b7041bb8b5ce66 Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Tue, 6 Nov 2018 09:34:01 -0800 Subject: [PATCH 183/540] [tf.data] Reduce from two map lookups to one in `Model::Node::record_stop()`. PiperOrigin-RevId: 220296822 --- tensorflow/core/framework/model.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h index 635a760b22a..f8db08ca17b 100644 --- a/tensorflow/core/framework/model.h +++ b/tensorflow/core/framework/model.h @@ -18,7 +18,8 @@ limitations under the License. #include #include #include -#include // (b/114492873): move this include into core/platform +// TODO(b/114492873): Move this include into core/platform. 
+#include // NOLINT #include #include @@ -169,10 +170,10 @@ class Node { void record_stop() LOCKS_EXCLUDED(mu_) { mutex_lock l(mu_); std::thread::id tid = std::this_thread::get_id(); - auto start_time = gtl::FindOrNull(work_start_, tid); - if (start_time) { - processing_time_ += Env::Default()->NowNanos() - *start_time; - work_start_.erase(tid); + auto iter = work_start_.find(tid); + if (iter != work_start_.end()) { + processing_time_ += Env::Default()->NowNanos() - iter->second; + work_start_.erase(iter); } else { LOG(WARNING) << "Encountered a stop event that was not preceded by a start event."; From 6ae8cb76c9bfd862d2472ce1c37291bfc1003ab7 Mon Sep 17 00:00:00 2001 From: Grzegorz George Pawelczak Date: Tue, 6 Nov 2018 18:11:17 +0000 Subject: [PATCH 184/540] Address review comments --- tensorflow/compiler/xla/service/algebraic_simplifier.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 9ee43202678..4d17dd707b9 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -455,8 +455,6 @@ Status AlgebraicSimplifierVisitor::HandleAnd(HloInstruction* logical_and) { if (IsAll(lhs, 0) && ReplaceInstructionIfSameShape(logical_and, lhs)) { return Status::OK(); } - - return Status::OK(); } return Status::OK(); @@ -1301,8 +1299,6 @@ Status AlgebraicSimplifierVisitor::HandleOr(HloInstruction* logical_or) { if (IsAll(lhs, 0) && ReplaceInstructionIfSameShape(logical_or, rhs)) { return Status::OK(); } - - return Status::OK(); } return Status::OK(); From 108c17095ed3a2d2c67e39ef4d20902035ad3861 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Tue, 6 Nov 2018 10:12:24 -0800 Subject: [PATCH 185/540] Explicitly check for None to avoid implicit casts to bool. 
PiperOrigin-RevId: 220303818 --- tensorflow/python/autograph/pyct/inspect_utils.py | 2 +- tensorflow/python/autograph/pyct/inspect_utils_test.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/autograph/pyct/inspect_utils.py b/tensorflow/python/autograph/pyct/inspect_utils.py index 8fc0cb7c936..4d56b93671e 100644 --- a/tensorflow/python/autograph/pyct/inspect_utils.py +++ b/tensorflow/python/autograph/pyct/inspect_utils.py @@ -186,7 +186,7 @@ def getmethodclass(m): # Instance method and class methods: should be bound to a non-null "self". if hasattr(m, '__self__'): - if m.__self__: + if m.__self__ is not None: # A fallback allowing methods to be actually bound to a type different # than __self__. This is useful when a strong reference from the method # to the object is not desired, for example when caching is involved. diff --git a/tensorflow/python/autograph/pyct/inspect_utils_test.py b/tensorflow/python/autograph/pyct/inspect_utils_test.py index e7a2622c1ab..622e3bafc0a 100644 --- a/tensorflow/python/autograph/pyct/inspect_utils_test.py +++ b/tensorflow/python/autograph/pyct/inspect_utils_test.py @@ -27,6 +27,7 @@ import six from tensorflow.python import lib from tensorflow.python.autograph.pyct import inspect_utils +from tensorflow.python.framework import constant_op from tensorflow.python.platform import test @@ -278,6 +279,11 @@ class InspectUtilsTest(test.TestCase): bound_method = types.MethodType(test_fn, WeakrefWrapper()) self.assertEqual(inspect_utils.getmethodclass(bound_method), test_obj) + def test_getmethodclass_no_bool_conversion(self): + + tensor = constant_op.constant([1]) + self.assertEqual(inspect_utils.getmethodclass(tensor.get_shape), tensor) + def test_getdefiningclass(self): class Superclass(object): From bd12be211a919b70c448a0a0d68b344f1c3993d9 Mon Sep 17 00:00:00 2001 From: Blake Hechtman Date: Tue, 6 Nov 2018 10:16:10 -0800 Subject: [PATCH 186/540] [XLA] Fix possible corruption of hlo reachability 
by using the global unique id.MOF actually needs to update reachabilty before fusing since mof will actually create new instructions PiperOrigin-RevId: 220304502 --- .../compiler/xla/service/hlo_reachability.cc | 2 +- .../compiler/xla/service/hlo_reachability.h | 24 +++++++++++++------ .../xla/service/hlo_reachability_test.cc | 3 ++- .../xla/service/multi_output_fusion.cc | 2 +- 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_reachability.cc b/tensorflow/compiler/xla/service/hlo_reachability.cc index 7e73cf5889c..4aa80677524 100644 --- a/tensorflow/compiler/xla/service/hlo_reachability.cc +++ b/tensorflow/compiler/xla/service/hlo_reachability.cc @@ -24,7 +24,7 @@ HloReachabilityMap::HloReachabilityMap( : size_(instructions.size()) { bit_vectors_.reserve(size_); for (const HloInstruction* hlo : instructions) { - indices_[hlo] = bit_vectors_.size(); + indices_[GetKey(hlo)] = bit_vectors_.size(); bit_vectors_.emplace_back(size_); } CHECK_EQ(size_, indices_.size()); // instructions should be unique diff --git a/tensorflow/compiler/xla/service/hlo_reachability.h b/tensorflow/compiler/xla/service/hlo_reachability.h index 2c965f58bfa..7823b06a41b 100644 --- a/tensorflow/compiler/xla/service/hlo_reachability.h +++ b/tensorflow/compiler/xla/service/hlo_reachability.h @@ -16,21 +16,23 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_REACHABILITY_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_REACHABILITY_H_ +#include #include #include +#include "absl/base/casts.h" #include "absl/container/flat_hash_map.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/types.h" namespace xla { -class HloInstruction; - // A class for representing reachability between HloInstructions. // // It has an adjacency matrix and it is up to the user of the class to set the @@ -97,7 +99,9 @@ class HloReachabilityMap { bool IsConnected(const HloInstruction* a, const HloInstruction* b) const; // Checks if an instruction is in the Reachability map. - bool IsPresent(const HloInstruction* a) const { return indices_.contains(a); } + bool IsPresent(const HloInstruction* a) const { + return indices_.contains(GetKey(a)); + } private: // A bit-vector implementation specialized for this use case which provides a @@ -160,18 +164,24 @@ class HloReachabilityMap { absl::Span inputs, const HloInstruction* instruction, BitVector* bit_vector); + uint64 GetKey(const HloInstruction* instruction) const { + uint64 unique_id = absl::bit_cast(instruction->unique_id()); + uint64 module_id = + absl::bit_cast(instruction->parent()->parent()->unique_id()); + return (module_id << 32) | unique_id; + } // Return the index of the given instruction. The value is used to index into // the vector of BitVectors and the BitVectors themselves. int GetIndex(const HloInstruction* instruction) const { - return FindOrDie(indices_, instruction); + return FindOrDie(indices_, GetKey(instruction)); } // The number of instructions in the reachability map. 
const size_t size_; - // Dense assignment from HloInstruction* to number. These numbers index - // into the bit_vectors_ vector and into the bits within a BitVector. - absl::flat_hash_map indices_; + // Dense assignment from HloInstruction::unique_id to number. These numbers + // index into the bit_vectors_ vector and into the bits within a BitVector. + absl::flat_hash_map indices_; // Bitvectors holding the reachability to each instruction. The bit vector for // instruction X includes ones for each instruction which X is reachable from. diff --git a/tensorflow/compiler/xla/service/hlo_reachability_test.cc b/tensorflow/compiler/xla/service/hlo_reachability_test.cc index 21265d9f222..eec7fd65f81 100644 --- a/tensorflow/compiler/xla/service/hlo_reachability_test.cc +++ b/tensorflow/compiler/xla/service/hlo_reachability_test.cc @@ -48,7 +48,8 @@ TEST_F(HloReachabilityTest, Reachability) { HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); auto e = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))); - builder.Build(); + auto module = CreateNewVerifiedModule(); + module->AddEntryComputation(builder.Build()); HloReachabilityMap reachability({a, b, c, d, e}); reachability.SetReachable(a, a); diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.cc b/tensorflow/compiler/xla/service/multi_output_fusion.cc index 6088fa4df66..9ccdd7d8d81 100644 --- a/tensorflow/compiler/xla/service/multi_output_fusion.cc +++ b/tensorflow/compiler/xla/service/multi_output_fusion.cc @@ -318,9 +318,9 @@ bool MultiOutputFusion::Perform() { << instr2->fused_instructions_computation()->ToString( HloPrintOptions().set_indent_amount(1)); } + Update(instr1, instr2); HloInstruction* ret = Fuse(instr1, instr2); set_is_fused(ret == instr1 ? 
instr2 : instr1); - Update(instr1, instr2); changed = true; VLOG(2) << "After fusion, \t this: " << ret->name() << "\n" << ret->fused_instructions_computation()->ToString( From 7b652b409a901325da6daa77409f61471661cf9b Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Tue, 6 Nov 2018 10:36:55 -0800 Subject: [PATCH 187/540] Add support for additional features in ClusterResolver to support Distributed Coordinators PiperOrigin-RevId: 220308433 --- .../python/training/cluster_resolver.py | 127 ++++++++++++++++-- .../python/training/cluster_resolver_test.py | 90 ++++++++++++- 2 files changed, 205 insertions(+), 12 deletions(-) diff --git a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py index 5ecd4f34183..bf2b0571137 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py +++ b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py @@ -25,6 +25,13 @@ import six from tensorflow.python.training.server_lib import ClusterSpec +def _format_master_url(master, rpc_layer=None): + if rpc_layer: + return '%s://%s' % (rpc_layer, master) + else: + return master + + @six.add_metaclass(abc.ABCMeta) class ClusterResolver(object): """Abstract class for all implementations of ClusterResolvers. @@ -57,12 +64,13 @@ class ClusterResolver(object): 'cluster_spec is not implemented for {}.'.format(self)) @abc.abstractmethod - def master(self, task_type=None, task_index=None): + def master(self, task_type=None, task_index=None, rpc_layer=None): """Retrieves the name or URL of the session master. Args: task_type: (Optional) The type of the TensorFlow task of the master. task_index: (Optional) The index of the TensorFlow task of the master. + rpc_layer: (Optional) The RPC protocol for the given cluster. Returns: The name or URL of the session master. 
@@ -77,10 +85,18 @@ class ClusterResolver(object): class SimpleClusterResolver(ClusterResolver): """Simple implementation of ClusterResolver that accepts a ClusterSpec.""" - def __init__(self, cluster_spec, master=''): + def __init__(self, cluster_spec, master='', task_type=None, task_index=None, + environment='', num_accelerators_per_worker=0, + rpc_layer=None): """Creates a SimpleClusterResolver from a ClusterSpec.""" super(SimpleClusterResolver, self).__init__() + self._task_type = task_type + self._task_index = task_index + self._environment = environment + self._num_accelerators_per_worker = num_accelerators_per_worker + self._rpc_layer = rpc_layer + if not isinstance(cluster_spec, ClusterSpec): raise TypeError('cluster_spec must be a ClusterSpec.') self._cluster_spec = cluster_spec @@ -93,12 +109,13 @@ class SimpleClusterResolver(ClusterResolver): """Returns the ClusterSpec passed into the constructor.""" return self._cluster_spec - def master(self, task_type=None, task_index=None): + def master(self, task_type=None, task_index=None, rpc_layer=None): """Returns the master address to use when creating a session. Args: task_type: (Optional) The type of the TensorFlow task of the master. task_index: (Optional) The index of the TensorFlow task of the master. + rpc_layer: (Optional) The RPC used by distributed TensorFlow. Returns: The name or URL of the session master. @@ -107,9 +124,51 @@ class SimpleClusterResolver(ClusterResolver): string passed into the initialization function. 
""" if task_type and task_index: - return self.cluster_spec().task_address(task_type, task_index) + master = self.cluster_spec().task_address(task_type, task_index) + else: + master = self._master - return self._master + return _format_master_url(master, rpc_layer or self._rpc_layer) + + @property + def task_type(self): + return self._task_type + + @property + def task_index(self): + return self._task_index + + @task_type.setter + def task_type(self, task_type): + self._task_type = task_type + + @task_index.setter + def task_index(self, task_index): + self._task_index = task_index + + @property + def environment(self): + return self._environment + + def num_accelerators_per_worker(self, session_config=None): + """Returns the number of accelerator cores per worker. + + Args: + session_config: Unused. The SimpleClusterResolver does not do automatic + detection of accelerators, so a TensorFlow session will never be + created, and thus a `session_config` is never necessary here, and will + be ignored. + """ + del session_config + return self._num_accelerators_per_worker + + @property + def rpc_layer(self): + return self._rpc_layer + + @rpc_layer.setter + def rpc_layer(self, rpc_layer): + self._rpc_layer = rpc_layer class UnionClusterResolver(ClusterResolver): @@ -119,13 +178,22 @@ class UnionClusterResolver(ClusterResolver): merges the underlying ClusterResolvers, and returns one unified ClusterSpec when cluster_spec is called. The details of the merge function is documented in the cluster_spec function. + + For additional Cluster Resolver properties such as task type, task index, + rpc layer, environment, etc..., we will return the value from the first + ClusterResolver in the union. """ - def __init__(self, *args): + def __init__(self, *args, **kwargs): """Initializes a UnionClusterResolver with other ClusterResolvers. Args: *args: `ClusterResolver` objects to be unionized. + **kwargs: + rpc_layer - (Optional) Override value for the RPC layer used by + TensorFlow. 
+ task_type - (Optional) Override value for the current task type. + task_index - (Optional) Override value for the current task index. Raises: TypeError: If any argument is not a subclass of `ClusterResolvers`. @@ -133,6 +201,13 @@ class UnionClusterResolver(ClusterResolver): """ super(UnionClusterResolver, self).__init__() + self._rpc_layer = kwargs.pop('rpc_layer', None) + self._task_type = kwargs.pop('task_type', None) + self._task_index = kwargs.pop('task_index', None) + + if kwargs: + raise ValueError('Unexpected kwargs provided {!r}'.format(kwargs)) + if not args: raise ValueError('At least one ClusterResolver is required.') @@ -216,7 +291,7 @@ class UnionClusterResolver(ClusterResolver): return ClusterSpec(merged_cluster) - def master(self, task_type=None, task_index=None): + def master(self, task_type=None, task_index=None, rpc_layer=None): """Returns the master address to use when creating a session. This usually returns the master from the first ClusterResolver passed in, @@ -225,11 +300,45 @@ class UnionClusterResolver(ClusterResolver): Args: task_type: (Optional) The type of the TensorFlow task of the master. task_index: (Optional) The index of the TensorFlow task of the master. + rpc_layer: (Optional) The RPC protocol for the given cluster. Returns: The name or URL of the session master. 
""" if task_type and task_index: - return self.cluster_spec().task_address(task_type, task_index) + master = self.cluster_spec().task_address(task_type, task_index) + return _format_master_url(master, rpc_layer or self._rpc_layer) - return self._cluster_resolvers[0].master() + return self._cluster_resolvers[0].master(rpc_layer=rpc_layer) + + @property + def task_type(self): + return self._task_type or self._cluster_resolvers[0].task_type + + @property + def task_index(self): + return self._task_index or self._cluster_resolvers[0].task_index + + @task_type.setter + def task_type(self, task_type): + self._task_type = task_type + + @task_index.setter + def task_index(self, task_index): + self._task_index = task_index + + @property + def environment(self): + return self._cluster_resolvers[0].environment + + def num_accelerators_per_worker(self, session_config=None): + return self._cluster_resolvers[0].num_accelerators_per_worker( + session_config) + + @property + def rpc_layer(self): + return self._rpc_layer or self._cluster_resolvers[0].rpc_layer + + @rpc_layer.setter + def rpc_layer(self, rpc_layer): + self._rpc_layer = rpc_layer diff --git a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver_test.py index c004b2e2d3b..133afbafd7f 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver_test.py +++ b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver_test.py @@ -57,6 +57,52 @@ class UnionClusterResolverTest(test.TestCase): actual_cluster_spec = union_resolver.cluster_spec() self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto) + def testInitSimpleClusterResolver(self): + base_cluster_spec = server_lib.ClusterSpec({ + "ps": ["ps0:2222", "ps1:2222"], + "worker": ["worker0:2222", "worker1:2222", "worker2:2222"] + }) + + simple_resolver = SimpleClusterResolver(base_cluster_spec, task_type="ps", + task_index=1, 
environment="cloud", + num_accelerators_per_worker=8, + rpc_layer="grpc") + + self.assertEqual(simple_resolver.task_type, "ps") + self.assertEqual(simple_resolver.task_index, 1) + self.assertEqual(simple_resolver.environment, "cloud") + self.assertEqual(simple_resolver.num_accelerators_per_worker(), 8) + self.assertEqual(simple_resolver.rpc_layer, "grpc") + + def testOverrideSimpleClusterResolver(self): + base_cluster_spec = server_lib.ClusterSpec({ + "ps": ["ps0:2222", "ps1:2222"], + "worker": ["worker0:2222", "worker1:2222", "worker2:2222"] + }) + + simple_resolver = SimpleClusterResolver(base_cluster_spec, task_type="ps", + task_index=1, environment="cloud", + num_accelerators_per_worker=8, + rpc_layer="grpc") + + simple_resolver.task_type = "worker" + simple_resolver.task_index = 2 + simple_resolver.rpc_layer = "http" + + self.assertEqual(simple_resolver.task_type, "worker") + self.assertEqual(simple_resolver.task_index, 2) + self.assertEqual(simple_resolver.rpc_layer, "http") + + def testSimpleOverrideMasterWithRpcLayer(self): + base_cluster_spec = server_lib.ClusterSpec({ + "ps": ["ps0:2222", "ps1:2222"], + "worker": ["worker0:2222", "worker1:2222", "worker2:2222"] + }) + + simple_resolver = SimpleClusterResolver(base_cluster_spec) + actual_master = simple_resolver.master("worker", 2, rpc_layer="grpc") + self.assertEqual(actual_master, "grpc://worker2:2222") + def testSimpleOverrideMaster(self): base_cluster_spec = server_lib.ClusterSpec({ "ps": ["ps0:2222", "ps1:2222"], @@ -65,7 +111,42 @@ class UnionClusterResolverTest(test.TestCase): simple_resolver = SimpleClusterResolver(base_cluster_spec) actual_master = simple_resolver.master("worker", 2) - self.assertEquals(actual_master, "worker2:2222") + self.assertEqual(actual_master, "worker2:2222") + + def testUnionClusterResolverGetProperties(self): + cluster_spec_1 = server_lib.ClusterSpec({ + "ps": ["ps0:2222", "ps1:2222"], + "worker": ["worker0:2222", "worker1:2222", "worker2:2222"] + }) + resolver1 = 
SimpleClusterResolver(cluster_spec_1, task_type="ps", + task_index=1, environment="cloud", + num_accelerators_per_worker=8, + rpc_layer="grpc") + + cluster_spec_2 = server_lib.ClusterSpec({ + "ps": ["ps2:2222", "ps3:2222"], + "worker": ["worker3:2222", "worker4:2222", "worker5:2222"] + }) + resolver2 = SimpleClusterResolver(cluster_spec_2, task_type="worker", + task_index=2, environment="local", + num_accelerators_per_worker=16, + rpc_layer="http") + + union_resolver = UnionClusterResolver(resolver1, resolver2) + + self.assertEqual(union_resolver.task_type, "ps") + self.assertEqual(union_resolver.task_index, 1) + self.assertEqual(union_resolver.environment, "cloud") + self.assertEqual(union_resolver.num_accelerators_per_worker(), 8) + self.assertEqual(union_resolver.rpc_layer, "grpc") + + union_resolver.task_type = "worker" + union_resolver.task_index = 2 + union_resolver.rpc_layer = "http" + + self.assertEqual(union_resolver.task_type, "worker") + self.assertEqual(union_resolver.task_index, 2) + self.assertEqual(union_resolver.rpc_layer, "http") def testTwoNonOverlappingJobMergedClusterResolver(self): cluster_spec_1 = server_lib.ClusterSpec({ @@ -116,10 +197,13 @@ class UnionClusterResolverTest(test.TestCase): union_cluster = UnionClusterResolver(cluster_resolver_1, cluster_resolver_2) unspecified_master = union_cluster.master() - self.assertEquals(unspecified_master, "") + self.assertEqual(unspecified_master, "") specified_master = union_cluster.master("worker", 1) - self.assertEquals(specified_master, "worker1:2222") + self.assertEqual(specified_master, "worker1:2222") + + rpc_master = union_cluster.master("worker", 1, rpc_layer="grpc") + self.assertEqual(rpc_master, "grpc://worker1:2222") def testOverlappingJobMergedClusterResolver(self): cluster_spec_1 = server_lib.ClusterSpec({ From d4d7723ad210979ce5c3dfa49b537b10ba32cf81 Mon Sep 17 00:00:00 2001 From: Rick Chao Date: Tue, 6 Nov 2018 10:43:36 -0800 Subject: [PATCH 188/540] Export make_early_stopping_hook 
function to estimator.experimental PiperOrigin-RevId: 220309727 --- .../api/golden/v1/tensorflow.estimator.experimental.pbtxt | 4 ++++ .../api/golden/v2/tensorflow.estimator.experimental.pbtxt | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt index 862761a96c7..a172a71b3ff 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt @@ -16,4 +16,8 @@ tf_module { name: "linear_logit_fn_builder" argspec: "args=[\'units\', \'feature_columns\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'sum\'], " } + member_method { + name: "make_early_stopping_hook" + argspec: "args=[\'estimator\', \'should_stop_fn\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'60\', \'None\'], " + } } diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt index 862761a96c7..a172a71b3ff 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt @@ -16,4 +16,8 @@ tf_module { name: "linear_logit_fn_builder" argspec: "args=[\'units\', \'feature_columns\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'sum\'], " } + member_method { + name: "make_early_stopping_hook" + argspec: "args=[\'estimator\', \'should_stop_fn\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'60\', \'None\'], " + } } From 492cbeb448590084f975cba8896f2a85df23b036 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Tue, 6 Nov 2018 10:43:48 -0800 Subject: [PATCH 189/540] Remove `self` when bypassing conversion of whitelisted methods. 
PiperOrigin-RevId: 220309775 --- tensorflow/python/autograph/impl/api.py | 9 ++++ tensorflow/python/autograph/impl/api_test.py | 48 ++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/tensorflow/python/autograph/impl/api.py b/tensorflow/python/autograph/impl/api.py index 7c2231cb56f..e51ad337650 100644 --- a/tensorflow/python/autograph/impl/api.py +++ b/tensorflow/python/autograph/impl/api.py @@ -174,6 +174,15 @@ def converted_call(f, owner, options, *args, **kwargs): # TODO(mdan): This needs cleanup. # In particular, we may want to avoid renaming functions altogether. if not options.force_conversion and conversion.is_whitelisted_for_graph(f): + + # Args typically include `self`, as required by the conversion process. + # When conversion is skipped, `self` is not necessary, because the + # original bound method is being executed. This code removes it. + if tf_inspect.ismethod(f) and args: + f_class = inspect_utils.getmethodclass(f) + if args[0] is f_class: + args = args[1:] + return f(*args, **kwargs) if inspect_utils.isbuiltin(f): diff --git a/tensorflow/python/autograph/impl/api_test.py b/tensorflow/python/autograph/impl/api_test.py index 276fb8748fe..df9505b267f 100644 --- a/tensorflow/python/autograph/impl/api_test.py +++ b/tensorflow/python/autograph/impl/api_test.py @@ -28,6 +28,9 @@ from tensorflow.python.autograph.impl import api from tensorflow.python.autograph.pyct import parser from tensorflow.python.autograph.utils import py_func from tensorflow.python.framework import constant_op +from tensorflow.python.keras.engine import sequential +from tensorflow.python.keras.layers import core +from tensorflow.python.ops import variables from tensorflow.python.platform import test from tensorflow.python.util import tf_inspect @@ -319,6 +322,51 @@ class ApiTest(test.TestCase): # The constant has static shape so the result is a primitive not a Tensor. 
self.assertEqual(x, 1) + def test_converted_call_whitelisted_method(self): + + opts = converter.ConversionOptions() + + model = sequential.Sequential([ + core.Dense(2) + ]) + + x = api.converted_call(model.call, None, opts, + constant_op.constant([[0.0]]), training=True) + + with self.cached_session() as sess: + sess.run(variables.global_variables_initializer()) + self.assertAllEqual([[0.0, 0.0]], sess.run(x)) + + def test_converted_call_whitelisted_method_extra_self(self): + + opts = converter.ConversionOptions() + + model = sequential.Sequential([ + core.Dense(2) + ]) + + x = api.converted_call(model.call, None, opts, + model, constant_op.constant([[0.0]]), training=True) + + with self.cached_session() as sess: + sess.run(variables.global_variables_initializer()) + self.assertAllEqual([[0.0, 0.0]], sess.run(x)) + + def test_converted_call_whitelisted_method_via_owner(self): + + opts = converter.ConversionOptions() + + model = sequential.Sequential([ + core.Dense(2) + ]) + + x = api.converted_call('call', model, opts, + constant_op.constant([[0.0]]), training=True) + + with self.cached_session() as sess: + sess.run(variables.global_variables_initializer()) + self.assertAllEqual([[0.0, 0.0]], sess.run(x)) + def test_to_graph_basic(self): def test_fn(x, s): From 384dc67ece9fb89085beabee3bfc05ff1eba18e6 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Tue, 6 Nov 2018 11:20:29 -0800 Subject: [PATCH 190/540] Recognize and avoid spending compile time on "megamorphic" clusters Megamorphic clusters are clusters which never stabilize to have a fixed number of shape signatures. Once we're reasonably sure a cluster is megamorphic we stop compiling it forever. This is necessarily a fuzzy concept, and I suspect we'll have to adjust our exact definition of megamorphic with time. 
PiperOrigin-RevId: 220317226 --- .../compiler/jit/xla_compilation_cache.cc | 82 ++++++++++++++++--- .../compiler/jit/xla_compilation_cache.h | 9 +- tensorflow/compiler/tests/jit_test.py | 61 ++++++++++++++ 3 files changed, 138 insertions(+), 14 deletions(-) diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc index 4a5ea9e0a5f..158cd0a95e2 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.cc +++ b/tensorflow/compiler/jit/xla_compilation_cache.cc @@ -233,15 +233,28 @@ Status XlaCompilationCache::Compile( CompileMode compile_mode, const XlaCompiler::CompilationResult** out_compilation_result, xla::LocalExecutable** out_executable) { - // Set the compile threshold to 1 to implement CompileMode::kStrict. - int64 compile_threshold = - compile_mode == CompileMode::kLazy ? kDefaultCompilationThreshold : 1; + absl::optional compile_threshold; + if (compile_mode == CompileMode::kLazy) { + compile_threshold = kDefaultCompilationThreshold; + } + return CompileImpl(options, function, constant_args, variable_args, ctx, compile_options, /*compile_single_op=*/false, /*compile_threshold=*/compile_threshold, out_compilation_result, out_executable); } +static bool IsMegamorphic(int64 compile_count, int64 execution_count) { + const int64 kCompileThreshold = 10; + const int64 kMinExecutionsPerCompile = 50; + + // This heuristic is trying to capture the following property: have we sunk a + // certain minimum amount of compile time into the cluster that didn't quite + // "pay off"? 
+ return compile_count > kCompileThreshold && + execution_count < kMinExecutionsPerCompile * compile_count; +} + Status XlaCompilationCache::CompileSingleOp( const XlaCompiler::Options& options, const std::map& constant_args, @@ -253,10 +266,10 @@ Status XlaCompilationCache::CompileSingleOp( NameAttrList name; name.set_name(def.op()); *name.mutable_attr() = def.attr(); - return CompileImpl(options, name, constant_args, variable_args, ctx, - compile_options, - /*compile_single_op=*/true, /*compile_threshold=*/1, - out_compilation_result, out_executable); + return CompileImpl( + options, name, constant_args, variable_args, ctx, compile_options, + /*compile_single_op=*/true, /*compile_threshold=*/absl::nullopt, + out_compilation_result, out_executable); } Status XlaCompilationCache::CompileImpl( @@ -264,7 +277,7 @@ Status XlaCompilationCache::CompileImpl( const std::map& constant_args, const std::map& variable_args, OpKernelContext* ctx, const XlaCompiler::CompileOptions& compile_options, bool compile_single_op, - int64 compile_threshold, + absl::optional compile_threshold, const XlaCompiler::CompilationResult** out_compilation_result, xla::LocalExecutable** out_executable) { DCHECK_NE(out_executable, nullptr); @@ -319,13 +332,26 @@ Status XlaCompilationCache::CompileImpl( // (since they get the benefit of XLA right away without waiting for warmup) // and doesn't hurt much for dynamically shaped TensorFlow graphs (we "pay" at // most one cluster-compilation's worth of compile time). - bool is_first_execution = [&] { + bool is_first_execution; + + // We avoid compiling clusters that have "gone megamorphic" i.e. have an + // excessive amount of shape dynamism. + bool is_megamorphic; + + { mutex_lock lock(cluster_compile_stats_mu_); auto it = cluster_compile_stats_.emplace(function.name(), ClusterCompileStats{}) .first; - return it->second.execution_count++ == 0; - }(); + is_first_execution = it->second.execution_count++ == 0; + + // The is_megamorphic bit is "sticky". 
We assume clusters that have been + // observed to be megamorphic once stay megamorphic forever. + it->second.is_megamorphic |= + IsMegamorphic(/*compile_count=*/it->second.compile_count, + /*execution_count=*/it->second.execution_count); + is_megamorphic = it->second.is_megamorphic; + } // Acquire the cache entry lock and compile, if necessary. // TODO(phawkins): this locking will need to be restructured when we implement @@ -336,8 +362,38 @@ Status XlaCompilationCache::CompileImpl( VLOG(2) << "Compilation cache miss for signature: " << SignatureDebugString(signature) << " with request count " << current_request_count << " and compile threshold " - << compile_threshold; - if (!is_first_execution && current_request_count < compile_threshold) { + << compile_threshold.value_or(0); + const bool should_compile = [&] { + if (!compile_threshold.has_value()) { + // Lazy compilation is disabled. + return true; + } + + if (is_megamorphic) { + VLOG(3) << "Not compiling cluster " << function.name() + << " because it is megamorphic."; + return false; + } + + if (is_first_execution) { + return true; + } + + bool reached_compile_threshold = + current_request_count >= *compile_threshold; + if (!reached_compile_threshold) { + VLOG(3) + << "Not compiling cluster " << function.name() + << " because it has not reached compile threshold; threshold is " + << *compile_threshold << " execution count " + << current_request_count << "."; + } + return reached_compile_threshold; + }(); + + if (!should_compile) { + VLOG(2) << "Not compiling for signature: " + << SignatureDebugString(signature); *out_compilation_result = nullptr; *out_executable = nullptr; return Status::OK(); diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h index b43e5d40e64..b2bf70462b6 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.h +++ b/tensorflow/compiler/jit/xla_compilation_cache.h @@ -17,6 +17,7 @@ limitations under the License. 
#define TENSORFLOW_COMPILER_JIT_XLA_COMPILATION_CACHE_H_ #include "absl/container/flat_hash_map.h" +#include "absl/types/optional.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/xla/client/local_client.h" @@ -108,7 +109,7 @@ class XlaCompilationCache : public ResourceBase { const std::map& constant_args, const std::map& variable_args, OpKernelContext* ctx, const XlaCompiler::CompileOptions& compile_options, - bool compile_single_op, int64 compile_threshold, + bool compile_single_op, absl::optional compile_threshold, const XlaCompiler::CompilationResult** out_compilation_result, xla::LocalExecutable** out_executable); @@ -180,7 +181,13 @@ class XlaCompilationCache : public ResourceBase { // Cumulative time spent compiling the cluster. int64 cumulative_compile_time_us = 0; + + // True if we have decided that this cluster is too dynamic (i.e. its shapes + // change too frequently) to profitably JIT compile. Once a cluster is + // tagged megamorphic, it stays megamorphic forever. + bool is_megamorphic = false; }; + mutex cluster_compile_stats_mu_; // Maps cluster names to compilation statistics for said cluster. diff --git a/tensorflow/compiler/tests/jit_test.py b/tensorflow/compiler/tests/jit_test.py index 561715ee1c3..6f51ae33a1b 100644 --- a/tensorflow/compiler/tests/jit_test.py +++ b/tensorflow/compiler/tests/jit_test.py @@ -593,6 +593,67 @@ class LazyCompilationTest(test.TestCase): self.assertFalse( InLabels(RunMetadataLabels(run_metadata_for_new_shape), "_XlaRun")) + def testIsMegamorphic(self): + + @function.Defun(compiled=True) + def CompiledFunction(x): + return math_ops.log(x) + + with session_lib.Session(config=NoRewriteSessionConfig()) as sess: + x = array_ops.placeholder(dtypes.float32) + y = CompiledFunction(x) + + # Make the cluster go megamorphic by running it with lots of shape + # signatures where the cluster is executed with each signature only a few + # times. 
Then check that we don't compile the cluster ever again. + + for shape in range(10, 50): + for _ in range(0, 49): + sess.run(y, feed_dict={x: [0.] * shape}) + + for _ in range(0, 50): + run_metadata = config_pb2.RunMetadata() + sess.run( + y, + feed_dict={x: [0.] * 60}, + run_metadata=run_metadata, + options=config_pb2.RunOptions( + trace_level=config_pb2.RunOptions.FULL_TRACE)) + self.assertTrue( + InLabels(RunMetadataLabels(run_metadata), "_XlaCompile")) + self.assertFalse(InLabels(RunMetadataLabels(run_metadata), "_XlaRun")) + + def testIsNotMegamorphic(self): + + @function.Defun(compiled=True) + def CompiledFunction(x): + return math_ops.log(x) + + with session_lib.Session(config=NoRewriteSessionConfig()) as sess: + x = array_ops.placeholder(dtypes.float32) + y = CompiledFunction(x) + + # Run the cluster with lots of shape signatures, but in a way that it + # isn't megamorphic (i.e. each shape signature sees a lot of executions). + # Then check that the cluster has not been marked as megamorphic. + + for shape in range(10, 50): + for _ in range(0, 1000): + sess.run(y, feed_dict={x: [0.] * shape}) + + for _ in range(0, 10): + sess.run(y, feed_dict={x: [0.] * 60}) + + run_metadata = config_pb2.RunMetadata() + sess.run( + y, + feed_dict={x: [0.] 
* 60}, + run_metadata=run_metadata, + options=config_pb2.RunOptions( + trace_level=config_pb2.RunOptions.FULL_TRACE)) + self.assertTrue(InLabels(RunMetadataLabels(run_metadata), "_XlaCompile")) + self.assertTrue(InLabels(RunMetadataLabels(run_metadata), "_XlaRun")) + if __name__ == "__main__": os.environ["TF_XLA_FLAGS"] = ("--tf_xla_enable_lazy_compilation=true " + From 848238dad4945dec7370f56fc9499fb1c28a7234 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Tue, 6 Nov 2018 11:39:27 -0800 Subject: [PATCH 191/540] [TF:XLA] Bump open source llvm revision to r346230 PiperOrigin-RevId: 220321235 --- tensorflow/workspace.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index e4417f46c2b..7b3e17fbb98 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -472,11 +472,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "llvm", build_file = clean_dep("//third_party/llvm:llvm.autogenerated.BUILD"), - sha256 = "2342cb98083eb1191a8411542dcd57cb3efc28677be4412e166f40cf22bd2b8c", - strip_prefix = "llvm-3fe1b12fca949399a3334a072ee7f96e2b6f557e", + sha256 = "e13fec9469075ab7dd4dbc046c7e37bd8330557aebbe42e3f1de2bad5adb8f2c", + strip_prefix = "llvm-b9e341e90c167b92136893c9085343aa254ddd37", urls = [ - "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/3fe1b12fca949399a3334a072ee7f96e2b6f557e.tar.gz", - "https://github.com/llvm-mirror/llvm/archive/3fe1b12fca949399a3334a072ee7f96e2b6f557e.tar.gz", + "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/b9e341e90c167b92136893c9085343aa254ddd37.tar.gz", + "https://github.com/llvm-mirror/llvm/archive/b9e341e90c167b92136893c9085343aa254ddd37.tar.gz", ], ) From adb6154302219bbfcebcdc19338d539daa0d45f2 Mon Sep 17 00:00:00 2001 From: Yunxing Dai Date: Tue, 6 Nov 2018 12:02:15 -0800 Subject: [PATCH 192/540] Use xla shape as the output of reprensetation_shape function. 
Mostly refactoring the function. PiperOrigin-RevId: 220325621 --- tensorflow/compiler/jit/xla_device.h | 6 +++++ tensorflow/compiler/jit/xla_device_context.cc | 19 +++++++++---- tensorflow/compiler/jit/xla_tensor.cc | 5 ++-- tensorflow/compiler/jit/xla_tensor.h | 2 +- tensorflow/compiler/tf2xla/BUILD | 1 + .../compiler/tf2xla/kernels/retval_op.cc | 11 +++++--- tensorflow/compiler/tf2xla/xla_compiler.cc | 27 +++++++++++-------- tensorflow/compiler/tf2xla/xla_compiler.h | 3 +-- .../compiler/tf2xla/xla_compiler_test.cc | 17 +++++++----- tensorflow/compiler/tf2xla/xla_context.cc | 4 +-- tensorflow/compiler/tf2xla/xla_context.h | 16 +++++------ tensorflow/compiler/tf2xla/xla_op_kernel.cc | 16 +++++++---- 12 files changed, 78 insertions(+), 49 deletions(-) diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h index 8881b697bc8..49f53b477ef 100644 --- a/tensorflow/compiler/jit/xla_device.h +++ b/tensorflow/compiler/jit/xla_device.h @@ -112,6 +112,12 @@ class XlaDevice : public LocalDevice { // compute, host-to-device, and device-to-host communication. bool use_multiple_streams = false; + // A function that describes how the on-host shapes of + // a) argument and return value, for entry computations + // b) variables, for all computations, + // should be represented in XLA. Parameters/return values will be shaped + // according to this function, and reshaped back to/from their declared + // shapes for computations. Must be non-null. 
XlaCompiler::ShapeRepresentationFn shape_representation_fn; // If padded_shape_fn is empty, a default implementation that returns diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc index eb3cf27624b..4f0941f1184 100644 --- a/tensorflow/compiler/jit/xla_device_context.cc +++ b/tensorflow/compiler/jit/xla_device_context.cc @@ -70,9 +70,12 @@ XlaDeviceContext::XlaDeviceContext( CHECK(device_to_host_stream_ != nullptr); CHECK(stream_ != nullptr); if (!shape_representation_fn_) { - shape_representation_fn_ = - [](const TensorShape& shape, - DataType dtype) -> xla::StatusOr { return shape; }; + shape_representation_fn_ = [](const TensorShape& shape, + DataType dtype) -> xla::StatusOr { + xla::Shape xla_shape; + TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype, shape, &xla_shape)); + return xla_shape; + }; } } @@ -99,7 +102,7 @@ void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor, CHECK(xla_tensor); Status status = [&]() -> Status { - TF_ASSIGN_OR_RETURN(TensorShape shape, + TF_ASSIGN_OR_RETURN(xla::Shape shape, shape_representation_fn_(device_tensor->shape(), device_tensor->dtype())); @@ -111,9 +114,15 @@ void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor, xla_tensor->AllocateShapedBuffer(device_tensor->dtype(), shape, client_, stream_->parent()->device_ordinal())); + // The cpu_tensor and literal that we created here hold the data of host + // tensor in descending layout. The layout could be different from layout in + // device_tensor (but the logical shape has to be the same). The + // transfer_manager is responsible to do corresponding transposing when + // transferring the data to device. 
xla::BorrowingLiteral literal( static_cast(DMAHelper::base(cpu_tensor)), - xla_tensor->shaped_buffer().on_host_shape()); + xla::ShapeUtil::MakeShape(shape.element_type(), + xla::AsInt64Slice(shape.dimensions()))); VLOG(1) << "Transfer to device as literal: " << literal.ToString() << " " << xla_tensor->shaped_buffer().ToString(); diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc index 6f8b198262d..d1f7f754c83 100644 --- a/tensorflow/compiler/jit/xla_tensor.cc +++ b/tensorflow/compiler/jit/xla_tensor.cc @@ -43,11 +43,10 @@ namespace tensorflow { } } -Status XlaTensor::AllocateShapedBuffer(DataType dtype, const TensorShape& shape, +Status XlaTensor::AllocateShapedBuffer(DataType dtype, + const xla::Shape& on_host_shape, xla::LocalClient* client, int device_ordinal) { - xla::Shape on_host_shape; - TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype, shape, &on_host_shape)); xla::Shape on_device_shape = client->backend().transfer_manager()->HostShapeToDeviceShape( on_host_shape); diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h index 6d7a6fd66c8..77e80aa2527 100644 --- a/tensorflow/compiler/jit/xla_tensor.h +++ b/tensorflow/compiler/jit/xla_tensor.h @@ -50,7 +50,7 @@ class XlaTensor { // Assign the internal ShapedBuffer to new memory for the given dtype and // shape. If a ShapedBuffer exists already (has_shaped_buffer() == true), it // is replaced and the managed memory deallocated. - Status AllocateShapedBuffer(DataType dtype, const TensorShape& shape, + Status AllocateShapedBuffer(DataType dtype, const xla::Shape& on_host_shape, xla::LocalClient* client, int device_ordinal); // Some Tensors can have complex on-device shapes, including tuple shapes. 
To diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index f18d8c20089..b710b38f402 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -365,6 +365,7 @@ tf_cc_test( name = "xla_compiler_test", srcs = ["xla_compiler_test.cc"], deps = [ + ":common", ":side_effect_util", ":xla_compiler", "//tensorflow/cc:cc_ops", diff --git a/tensorflow/compiler/tf2xla/kernels/retval_op.cc b/tensorflow/compiler/tf2xla/kernels/retval_op.cc index e172c649325..53e7624d607 100644 --- a/tensorflow/compiler/tf2xla/kernels/retval_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/retval_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" @@ -72,9 +73,9 @@ class RetvalOp : public XlaOpKernel { } else { TensorShape shape = ctx->InputShape(0); ctx->SetStatus(is_constant.status()); - TensorShape representation_shape; + xla::Shape representation_shape; if (tc.is_entry_computation()) { - xla::StatusOr shape_or_status = + xla::StatusOr shape_or_status = tc.RepresentationShape(shape, ctx->input_type(0)); if (!shape_or_status.ok()) { ctx->SetStatus(shape_or_status.status()); @@ -83,12 +84,14 @@ class RetvalOp : public XlaOpKernel { representation_shape = shape_or_status.ValueOrDie(); } } else { - representation_shape = shape; + OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(ctx->input_type(0), shape, + &representation_shape)); } xla::XlaOp output = input; if (tc.is_entry_computation()) { - output = xla::Reshape(input, representation_shape.dim_sizes()); + output = xla::Reshape( + input, xla::AsInt64Slice(representation_shape.dimensions())); } else { // The core from which a return 
value is returned depends on the // device assignment of the input to the retval. Since we can't change diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index e177a5f07f5..214ca21e73f 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -37,6 +37,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/graph_optimizer.h" #include "tensorflow/core/framework/attr_value_util.h" #include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/types.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/node_builder.h" @@ -110,8 +111,13 @@ XlaCompiler::XlaCompiler(XlaCompiler::Options options) // The default shape representation function is the identity. if (!options_.shape_representation_fn) { - options_.shape_representation_fn = [](const TensorShape& shape, - DataType type) { return shape; }; + options_.shape_representation_fn = + [](const TensorShape& shape, + DataType dtype) -> xla::StatusOr { + xla::Shape xla_shape; + TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype, shape, &xla_shape)); + return xla_shape; + }; } } @@ -247,25 +253,24 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg, case XlaCompiler::Argument::kConstant: LOG(FATAL) << "Unreachable case"; case XlaCompiler::Argument::kParameter: { - TensorShape shape; if (is_entry_computation) { TF_ASSIGN_OR_RETURN( - shape, options_.shape_representation_fn(arg.shape, arg.type)); + *xla_shape, options_.shape_representation_fn(arg.shape, arg.type)); } else { - shape = arg.shape; + TF_RETURN_IF_ERROR( + TensorShapeToXLAShape(arg.type, arg.shape, xla_shape)); } - return TensorShapeToXLAShape(arg.type, shape, xla_shape); + return Status::OK(); } case XlaCompiler::Argument::kResource: { TF_RET_CHECK(arg.initialized); switch (arg.resource_kind) { case 
XlaResource::kVariable: { - TF_ASSIGN_OR_RETURN( - TensorShape representation_shape, - options_.shape_representation_fn(arg.shape, arg.type)); - return TensorShapeToXLAShape(arg.type, representation_shape, - xla_shape); + TF_ASSIGN_OR_RETURN(*xla_shape, options_.shape_representation_fn( + arg.shape, arg.type)); + + return Status::OK(); } case XlaResource::kTensorArray: { if (arg.tensor_array_size < 0) { diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index 2cc603a5801..08e00f38409 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -259,8 +259,7 @@ class XlaCompiler { std::shared_ptr computation; }; - typedef std::function(const TensorShape&, - DataType)> + typedef std::function(const TensorShape&, DataType)> ShapeRepresentationFn; struct Options { // Name of the compilation device to use. It must be set by the caller. diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc index 4ef154f856b..aaee208f634 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/cc/ops/resource_variable_ops.h" #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/compiler/tf2xla/side_effect_util.h" +#include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/client_library.h" @@ -1018,9 +1019,11 @@ TEST_F(XlaCompilerTest, VariableRepresentationShapeFunction) { // Compiles the graph. 
XlaCompiler::Options options = DefaultOptions(); - options.shape_representation_fn = [](const TensorShape& shape, - DataType type) { - return TensorShape({shape.num_elements()}); + options.shape_representation_fn = + [](const TensorShape& shape, DataType type) -> xla::StatusOr { + xla::PrimitiveType ptype; + TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(type, &ptype)); + return xla::ShapeUtil::MakeShape(ptype, {shape.num_elements()}); }; XlaCompiler compiler(options); @@ -1086,9 +1089,11 @@ TEST_F(XlaCompilerTest, ArgRetvalShapeRepresentationFunction) { // Compiles the graph. XlaCompiler::Options options = DefaultOptions(); - options.shape_representation_fn = [](const TensorShape& shape, - DataType type) { - return TensorShape({shape.num_elements()}); + options.shape_representation_fn = + [](const TensorShape& shape, DataType type) -> xla::StatusOr { + xla::PrimitiveType ptype; + TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(type, &ptype)); + return xla::ShapeUtil::MakeShape(ptype, {shape.num_elements()}); }; XlaCompiler compiler(options); diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc index 20e1ee2ddb3..1e819dbb694 100644 --- a/tensorflow/compiler/tf2xla/xla_context.cc +++ b/tensorflow/compiler/tf2xla/xla_context.cc @@ -66,7 +66,7 @@ XlaContext::XlaContext( XlaCompiler* compiler, xla::XlaBuilder* builder, bool allow_cpu_custom_calls, bool resolve_compile_time_constants, bool is_entry_computation, - const std::function( + const std::function( const TensorShape&, DataType)>* shape_representation_fn) : compiler_(compiler), builder_(builder), @@ -133,7 +133,7 @@ Status XlaContext::CreateResource( return Status::OK(); } -xla::StatusOr XlaContext::RepresentationShape( +xla::StatusOr XlaContext::RepresentationShape( const TensorShape& shape, DataType type) const { return (*shape_representation_fn_)(shape, type); } diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h index 
4da891634e9..8aad6cbced0 100644 --- a/tensorflow/compiler/tf2xla/xla_context.h +++ b/tensorflow/compiler/tf2xla/xla_context.h @@ -48,7 +48,7 @@ class XlaContext : public ResourceBase { XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder, bool allow_cpu_custom_calls, bool resolve_compile_time_constants, bool is_entry_computation, - const std::function( + const std::function( const TensorShape&, DataType)>* shape_representation_fn); // Virtual method defined by ResourceBase. @@ -105,8 +105,8 @@ class XlaContext : public ResourceBase { // Returns the XLA shape to be used to represent a variable of TF `shape` // and `type`, or of an argument or return value of a top-level computation. - xla::StatusOr RepresentationShape(const TensorShape& shape, - DataType type) const; + xla::StatusOr RepresentationShape(const TensorShape& shape, + DataType type) const; // Get an XLA lambda to compute Max. This is cached in the // XlaContext since it may be used by multiple Ops. There is a @@ -158,13 +158,9 @@ class XlaContext : public ResourceBase { // body)? const bool is_entry_computation_; - // A function that describes how the shapes of - // a) argument and return value, for entry computations - // b) variables, for all computations, - // should be represented in XLA. Parameters/return values will be shaped - // according to this function, and reshaped back to/from their declared shapes - // for computations. Must be non-null. - const std::function(const TensorShape&, DataType)>* + // Describes the on-host shapes of parameters and return values. Also see: + // XlaDevice::Options::shape_representation_fn. + const std::function(const TensorShape&, DataType)>* shape_representation_fn_; // Cache of prebuilt computations indexed by their type. 
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc index dd3498ef7aa..4b1634f6974 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc @@ -413,9 +413,12 @@ Status ReadVariableInputTensor(const Tensor& tensor, DataType type, XlaContext& xla_context = XlaContext::Get(ctx); TF_ASSIGN_OR_RETURN( - TensorShape representation_shape, + xla::Shape representation_shape, xla_context.RepresentationShape(variable->shape(), variable->type())); - if (representation_shape == variable->shape()) { + xla::Shape xla_shape; + TF_RETURN_IF_ERROR( + TensorShapeToXLAShape(variable->type(), variable->shape(), &xla_shape)); + if (xla::ShapeUtil::Compatible(xla_shape, representation_shape)) { *value = variable->value(); } else { *value = xla::Reshape(variable->value(), variable->shape().dim_sizes()); @@ -570,10 +573,13 @@ Status AssignVariableTensor(const Tensor& tensor, DataType type, TF_RETURN_IF_ERROR(variable->SetTypeAndShape(type, shape)); XlaContext& xla_context = XlaContext::Get(ctx); - TF_ASSIGN_OR_RETURN(TensorShape representation_shape, + TF_ASSIGN_OR_RETURN(xla::Shape representation_shape, xla_context.RepresentationShape(shape, type)); - if (shape != representation_shape) { - handle = xla::Reshape(handle, representation_shape.dim_sizes()); + xla::Shape xla_shape; + TF_RETURN_IF_ERROR(TensorShapeToXLAShape(type, shape, &xla_shape)); + if (!xla::ShapeUtil::Compatible(xla_shape, representation_shape)) { + handle = xla::Reshape(handle, + xla::AsInt64Slice(representation_shape.dimensions())); } return variable->SetValue(handle); } From f9e169ee10454138296aa425f336bec4d847c64f Mon Sep 17 00:00:00 2001 From: Asim Shankar Date: Tue, 6 Nov 2018 12:06:23 -0800 Subject: [PATCH 193/540] [Java]: Release 1.12.0 PiperOrigin-RevId: 220326609 --- tensorflow/java/maven/libtensorflow/pom.xml | 2 +- tensorflow/java/maven/libtensorflow_jni/pom.xml | 2 +- 
tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml | 2 +- tensorflow/java/maven/pom.xml | 2 +- tensorflow/java/maven/proto/pom.xml | 2 +- tensorflow/java/maven/spark-tensorflow-connector/pom.xml | 2 +- tensorflow/java/maven/tensorflow-hadoop/pom.xml | 2 +- tensorflow/java/maven/tensorflow/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml index c2ece557d5b..db3a3609f1a 100644 --- a/tensorflow/java/maven/libtensorflow/pom.xml +++ b/tensorflow/java/maven/libtensorflow/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.12.0-rc2 + 1.12.0 ../ libtensorflow diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml index 0d6f46c6fe6..53f7a2d63ef 100644 --- a/tensorflow/java/maven/libtensorflow_jni/pom.xml +++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.12.0-rc2 + 1.12.0 ../ libtensorflow_jni diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml index ab54a61076a..a17724c805e 100644 --- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml +++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.12.0-rc2 + 1.12.0 ../ libtensorflow_jni_gpu diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml index 557a755236f..30831f90b9f 100644 --- a/tensorflow/java/maven/pom.xml +++ b/tensorflow/java/maven/pom.xml @@ -6,7 +6,7 @@ 4.0.0 org.tensorflow parentpom - 1.12.0-rc2 + 1.12.0 pom https://www.tensorflow.org diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml index 2f435a6da0c..dd6b52be624 100644 --- a/tensorflow/java/maven/proto/pom.xml +++ b/tensorflow/java/maven/proto/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.12.0-rc2 + 1.12.0 ../ proto diff --git 
a/tensorflow/java/maven/spark-tensorflow-connector/pom.xml b/tensorflow/java/maven/spark-tensorflow-connector/pom.xml index da94c58c42f..f47c11809d5 100644 --- a/tensorflow/java/maven/spark-tensorflow-connector/pom.xml +++ b/tensorflow/java/maven/spark-tensorflow-connector/pom.xml @@ -6,7 +6,7 @@ org.tensorflow spark-tensorflow-connector_2.11 jar - 1.12.0-rc2 + 1.12.0 spark-tensorflow-connector https://www.tensorflow.org TensorFlow TFRecord connector for Apache Spark DataFrames diff --git a/tensorflow/java/maven/tensorflow-hadoop/pom.xml b/tensorflow/java/maven/tensorflow-hadoop/pom.xml index 73ce7a9ffd8..11aaba983f6 100644 --- a/tensorflow/java/maven/tensorflow-hadoop/pom.xml +++ b/tensorflow/java/maven/tensorflow-hadoop/pom.xml @@ -5,7 +5,7 @@ org.tensorflow tensorflow-hadoop jar - 1.12.0-rc2 + 1.12.0 tensorflow-hadoop https://www.tensorflow.org TensorFlow TFRecord InputFormat/OutputFormat for Apache Hadoop diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml index fa137e94409..07fcfa51446 100644 --- a/tensorflow/java/maven/tensorflow/pom.xml +++ b/tensorflow/java/maven/tensorflow/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.12.0-rc2 + 1.12.0 ../ tensorflow From 93cc1c8c4d7b3a2f22181399accabec7851331c4 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Tue, 6 Nov 2018 12:20:40 -0800 Subject: [PATCH 194/540] Test for builtins before anything else. PiperOrigin-RevId: 220329297 --- tensorflow/python/autograph/impl/api.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/autograph/impl/api.py b/tensorflow/python/autograph/impl/api.py index e51ad337650..b8031b46cbb 100644 --- a/tensorflow/python/autograph/impl/api.py +++ b/tensorflow/python/autograph/impl/api.py @@ -171,6 +171,9 @@ def converted_call(f, owner, options, *args, **kwargs): f = getattr(owner, f) + if inspect_utils.isbuiltin(f): + return py_builtins.overload_of(f)(*args, **kwargs) + # TODO(mdan): This needs cleanup. 
# In particular, we may want to avoid renaming functions altogether. if not options.force_conversion and conversion.is_whitelisted_for_graph(f): @@ -185,9 +188,6 @@ def converted_call(f, owner, options, *args, **kwargs): return f(*args, **kwargs) - if inspect_utils.isbuiltin(f): - return py_builtins.overload_of(f)(*args, **kwargs) - # internal_convert_user_code is for example turned off when issuing a dynamic # call conversion from generated code while in nonrecursive mode. In that # case we evidently don't want to recurse, but we still have to convert From cee43bee0bed0569a36809f60156694bdded9b56 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Tue, 6 Nov 2018 12:21:45 -0800 Subject: [PATCH 195/540] Store the list of supported built-ins as a tuple rather than a set, because they are not all hashable. PiperOrigin-RevId: 220329509 --- tensorflow/python/autograph/operators/py_builtins.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/autograph/operators/py_builtins.py b/tensorflow/python/autograph/operators/py_builtins.py index d312e6938b2..2f55d538924 100644 --- a/tensorflow/python/autograph/operators/py_builtins.py +++ b/tensorflow/python/autograph/operators/py_builtins.py @@ -216,10 +216,10 @@ def _py_range(start_or_stop, stop, step): return range(start_or_stop) -SUPPORTED_BUILTINS = set((abs, float, int, len, print, range)) +SUPPORTED_BUILTINS = (abs, float, int, len, print, range) if six.PY2: - SUPPORTED_BUILTINS.add(xrange) + SUPPORTED_BUILTINS += (xrange,) BUILTIN_FUINCTIONS_MAP = { 'abs': abs_, From b5d4c690c2b444b4fde34ee737f20d87c37a35f4 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Tue, 6 Nov 2018 12:36:55 -0800 Subject: [PATCH 196/540] Move the error rewriting and auto name scoping mechanisms to experimental. 
PiperOrigin-RevId: 220332137 --- tensorflow/python/autograph/core/converter.py | 8 +++++++- tensorflow/python/autograph/impl/conversion.py | 7 +++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/autograph/core/converter.py b/tensorflow/python/autograph/core/converter.py index bc366123096..49e24895a2b 100644 --- a/tensorflow/python/autograph/core/converter.py +++ b/tensorflow/python/autograph/core/converter.py @@ -109,9 +109,15 @@ class Feature(Enum): AUTO_CONTROL_DEPS = ( 'Insert of control dependencies in the generated code.') DECORATORS = ( - 'Allow decorators in local functions. Note that special decorators, ' + 'Allow decorators in local functions. Note that special decorators,' ' like ag.convert or tf.function are allowed regardless of this toggle.') + ERROR_REWRITING = ( + 'Rewrite errors that occur in the generated code to indicate the source' + ' code to which the failing code corresponds.') LISTS = 'Convert list idioms, like initializers, slices, append, etc.' + NAME_SCOPES = ( + 'Insert name scopes that name ops according to context, like the' + ' function they were defined in.') def __repr__(self): return self.name diff --git a/tensorflow/python/autograph/impl/conversion.py b/tensorflow/python/autograph/impl/conversion.py index 197bd5a3e76..b2fa2825ebd 100644 --- a/tensorflow/python/autograph/impl/conversion.py +++ b/tensorflow/python/autograph/impl/conversion.py @@ -358,6 +358,9 @@ def node_to_graph(node, context): node = converter.apply_(node, context, logical_expressions) if context.program.options.uses(converter.Feature.AUTO_CONTROL_DEPS): node = converter.apply_(node, context, side_effect_guards) - node = converter.apply_(node, context, function_scopes) - node = converter.apply_(node, context, error_handlers) + # TODO(mdan): If function scopes ever does more, the toggle will need moving. 
+ if context.program.options.uses(converter.Feature.NAME_SCOPES): + node = converter.apply_(node, context, function_scopes) + if context.program.options.uses(converter.Feature.ERROR_REWRITING): + node = converter.apply_(node, context, error_handlers) return node From c5c80c1bd471b6d68f5bda082ffd63df1285ae15 Mon Sep 17 00:00:00 2001 From: Ayush Dubey Date: Tue, 6 Nov 2018 12:41:30 -0800 Subject: [PATCH 197/540] Return a better status and message for a collective broadcast group with no send ops. Before this change, if we create a group of collective ops comprising only broadcast_recv, we would trigger a check failure with the message: "Check failed: ir->source_rank >= 0 (-1 vs. 0)". After this change, if a user creates a collective group without a broadcast_send they should see a better error message. In the process, we remove the check and put error message in the status of collective param initialization. PiperOrigin-RevId: 220332905 --- .../collective_param_resolver_local.cc | 11 ++- .../collective_param_resolver_local_test.cc | 67 +++++++++++++++---- 2 files changed, 64 insertions(+), 14 deletions(-) diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.cc b/tensorflow/core/common_runtime/collective_param_resolver_local.cc index f90fb174344..3a03b6724c1 100644 --- a/tensorflow/core/common_runtime/collective_param_resolver_local.cc +++ b/tensorflow/core/common_runtime/collective_param_resolver_local.cc @@ -708,7 +708,16 @@ void CollectiveParamResolverLocal::CompleteInstanceSource(InstanceRec* ir, return; } CHECK_EQ(ir->known_count, ir->shared.group.group_size); - CHECK_GE(ir->source_rank, 0); + if (ir->source_rank < 0) { + // NOTE(ayushd): changing the error message below would also require + // updating CompleteParamsBroadcastForgotSend test in + // CollectiveParamResolverLocalTest. + ir->status = + errors::Internal("Instance ", cp->instance.instance_key, + " found no source for broadcast. 
This " + "could mean that there were group_size=", + ir->known_count, " BcastRecvs but no BcastSend."); + } if (!ir->known_waiters.empty()) { ready_waiters = std::move(ir->known_waiters); } diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc index 2b43adbac69..9a501b32981 100644 --- a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc +++ b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc @@ -200,28 +200,35 @@ TEST_F(CollectiveParamResolverLocalTest, CompleteParamsReduction1Task) { } } +void InitializeCollectiveParamsForBroadcast(int instance_key, int device_idx, + bool is_source, + CollectiveParams* cp) { + cp->group.group_key = 1; + cp->group.group_size = 3; + cp->group.device_type = DeviceType("CPU"); + cp->group.num_tasks = 1; + cp->instance.instance_key = instance_key; + cp->instance.type = BROADCAST_COLLECTIVE; + cp->instance.data_type = DataType(DT_FLOAT); + cp->instance.shape = TensorShape({5}); + cp->instance.device_names.push_back(strings::StrCat( + "/job:localhost/replica:0/task:0/device:CPU:", device_idx)); + cp->instance.impl_details.subdiv_offsets.push_back(0); + cp->is_source = is_source; +} + TEST_F(CollectiveParamResolverLocalTest, CompleteParamsBroadcast1Task) { + constexpr int kInstanceKey = 5; CollectiveParams cps[NUM_DEVS]; Status statuses[NUM_DEVS]; Notification note[NUM_DEVS]; for (int i = 0; i < NUM_DEVS; ++i) { CollectiveParams* cp = &cps[i]; - cp->group.group_key = 1; - cp->group.group_size = 3; - cp->group.device_type = DeviceType("CPU"); - cp->group.num_tasks = 1; - cp->instance.instance_key = 3; - cp->instance.type = BROADCAST_COLLECTIVE; - cp->instance.data_type = DataType(DT_FLOAT); - cp->instance.shape = TensorShape({5}); - cp->instance.device_names.push_back( - strings::StrCat("/job:localhost/replica:0/task:0/device:CPU:", i)); - cp->instance.impl_details.subdiv_offsets.push_back(0); - 
cp->is_source = (i == 1); + InitializeCollectiveParamsForBroadcast(kInstanceKey, i, i == 1, cp); Env::Default()->SchedClosure([this, i, cp, ¬e, &statuses]() { prl_->CompleteParamsAsync(cp->instance.device_names[0], cp, nullptr /*CancellationManager*/, - [this, &statuses, ¬e, i](const Status& s) { + [&statuses, ¬e, i](const Status& s) { statuses[i] = s; note[i].Notify(); }); @@ -245,4 +252,38 @@ TEST_F(CollectiveParamResolverLocalTest, CompleteParamsBroadcast1Task) { } } +// If we don't mark any participant in a broadcast as the source, we essentially +// create a collective group with only broadcast recvs. In that case, we should +// get an internal error from param resolution. +TEST_F(CollectiveParamResolverLocalTest, CompleteParamsBroadcastForgotSender) { + constexpr int kInstanceKey = 8; + CollectiveParams cps[NUM_DEVS]; + Status statuses[NUM_DEVS]; + Notification note[NUM_DEVS]; + for (int i = 0; i < NUM_DEVS; ++i) { + CollectiveParams* cp = &cps[i]; + InitializeCollectiveParamsForBroadcast(kInstanceKey, i, false, cp); + Env::Default()->SchedClosure([this, i, cp, ¬e, &statuses]() { + prl_->CompleteParamsAsync(cp->instance.device_names[0], cp, + nullptr /*CancellationManager*/, + [&statuses, ¬e, i](const Status& s) { + statuses[i] = s; + note[i].Notify(); + }); + }); + } + for (int i = 0; i < NUM_DEVS; ++i) { + note[i].WaitForNotification(); + } + for (int i = 0; i < NUM_DEVS; ++i) { + EXPECT_EQ(statuses[i].code(), error::INTERNAL); + EXPECT_EQ(statuses[i].error_message(), + strings::StrCat( + "Instance ", kInstanceKey, + " found no source for broadcast. This could mean that there" + " were group_size=", + NUM_DEVS, " BcastRecvs but no BcastSend.")); + } +} + } // namespace tensorflow From 0130d4a67a2bf933710a0e575ad9713f1862a8f2 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 6 Nov 2018 12:45:39 -0800 Subject: [PATCH 198/540] Change OpKernelRegistrar::Factory to be std::function. 
This change allows lambda expressions with captures to be used as factory functions for kernels. This is a prerequisite for the forthcoming C kernel registration API. PiperOrigin-RevId: 220333640 --- tensorflow/core/framework/op_kernel.cc | 2 +- tensorflow/core/framework/op_kernel.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc index 5f08c130871..073dfb3e7d9 100644 --- a/tensorflow/core/framework/op_kernel.cc +++ b/tensorflow/core/framework/op_kernel.cc @@ -1232,7 +1232,7 @@ Status CreateOpKernel(DeviceType device_type, DeviceBase* device, OpKernelConstruction context( device_type, device, allocator, &node_def, op_def, flib, inputs, input_memory_types, outputs, output_memory_types, graph_def_version, &s); - *kernel = (*registration->factory)(&context); + *kernel = (registration->factory)(&context); if (!s.ok()) { delete *kernel; *kernel = nullptr; diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h index 165115aab32..52d7d14e218 100644 --- a/tensorflow/core/framework/op_kernel.h +++ b/tensorflow/core/framework/op_kernel.h @@ -1348,7 +1348,7 @@ namespace kernel_factory { class OpKernelRegistrar { public: - typedef OpKernel* (*Factory)(OpKernelConstruction*); + typedef std::function Factory; OpKernelRegistrar(const KernelDef* kernel_def, StringPiece kernel_class_name, Factory factory) { From 4ae1908593ee25b1ee404d57a4b6c6f2f6ac4180 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 6 Nov 2018 13:10:42 -0800 Subject: [PATCH 199/540] Add RegisterListener to MultiPlatformManager to enable injecting dependencies to Platform before Initialize. 
PiperOrigin-RevId: 220337889 --- .../xla/service/interpreter/platform.cc | 2 + .../compiler/xla/service/platform_util.cc | 13 +- tensorflow/stream_executor/BUILD | 5 + .../stream_executor/cuda/cuda_platform.cc | 2 + .../stream_executor/host/host_platform.cc | 2 + .../stream_executor/multi_platform_manager.cc | 207 ++++++++++++++---- .../stream_executor/multi_platform_manager.h | 92 +++----- 7 files changed, 207 insertions(+), 116 deletions(-) diff --git a/tensorflow/compiler/xla/service/interpreter/platform.cc b/tensorflow/compiler/xla/service/interpreter/platform.cc index c9b40d3c619..b0fc1af8b89 100644 --- a/tensorflow/compiler/xla/service/interpreter/platform.cc +++ b/tensorflow/compiler/xla/service/interpreter/platform.cc @@ -110,3 +110,5 @@ REGISTER_MODULE_INITIALIZER( // open-source project, so this will be a no-op there. REGISTER_MODULE_INITIALIZER_SEQUENCE(interpreter_platform, multi_platform_manager); +REGISTER_MODULE_INITIALIZER_SEQUENCE(multi_platform_manager_listener, + interpreter_platform); diff --git a/tensorflow/compiler/xla/service/platform_util.cc b/tensorflow/compiler/xla/service/platform_util.cc index c522e7ae23b..e416c78e7f0 100644 --- a/tensorflow/compiler/xla/service/platform_util.cc +++ b/tensorflow/compiler/xla/service/platform_util.cc @@ -59,20 +59,15 @@ string CanonicalPlatformName(const string& name) { /* static */ StatusOr> PlatformUtil::GetSupportedPlatforms() { - se::MultiPlatformManager::PlatformMap platform_map; - se::port::Status platforms_status = se::MultiPlatformManager::WithPlatforms( - [&platform_map](se::MultiPlatformManager::PlatformMap* map) { - platform_map = *map; - return se::port::Status::OK(); - }); - if (platform_map.empty()) { + std::vector all_platforms = + se::MultiPlatformManager::AllPlatforms(); + if (all_platforms.empty()) { LOG(WARNING) << "no executor platforms available: platform map is empty"; } // Gather all platforms which have an XLA compiler. 
std::vector platforms; - for (auto& platform_pair : platform_map) { - auto* platform = platform_pair.second; + for (se::Platform* platform : all_platforms) { auto compiler_status = Compiler::GetForPlatform(platform); if (compiler_status.ok()) { platforms.push_back(platform); diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD index d4d97087ba4..5c9d85acf4e 100644 --- a/tensorflow/stream_executor/BUILD +++ b/tensorflow/stream_executor/BUILD @@ -37,6 +37,10 @@ cc_library( deps = [ "//tensorflow/core:lib", "//tensorflow/core:ptr_util", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/synchronization", "@local_config_cuda//cuda:cuda_headers", ], alwayslink = 1, @@ -49,6 +53,7 @@ cc_library( deps = [ "//tensorflow/core:lib", "//tensorflow/core:ptr_util", + "@com_google_absl//absl/strings", "@local_config_cuda//cuda:cuda_headers", ] + if_static([":stream_executor_impl"]), ) diff --git a/tensorflow/stream_executor/cuda/cuda_platform.cc b/tensorflow/stream_executor/cuda/cuda_platform.cc index 622a4a4edb1..b342e71bdd9 100644 --- a/tensorflow/stream_executor/cuda/cuda_platform.cc +++ b/tensorflow/stream_executor/cuda/cuda_platform.cc @@ -209,3 +209,5 @@ REGISTER_MODULE_INITIALIZER(cuda_platform, // Note that module initialization sequencing is not supported in the // open-source project, so this will be a no-op there. 
REGISTER_MODULE_INITIALIZER_SEQUENCE(cuda_platform, multi_platform_manager); +REGISTER_MODULE_INITIALIZER_SEQUENCE(multi_platform_manager_listener, + cuda_platform); diff --git a/tensorflow/stream_executor/host/host_platform.cc b/tensorflow/stream_executor/host/host_platform.cc index 410dc9da899..d16cca8dcc0 100644 --- a/tensorflow/stream_executor/host/host_platform.cc +++ b/tensorflow/stream_executor/host/host_platform.cc @@ -103,3 +103,5 @@ REGISTER_MODULE_INITIALIZER(host_platform, // Note that module initialization sequencing is not supported in the // open-source project, so this will be a no-op there. REGISTER_MODULE_INITIALIZER_SEQUENCE(host_platform, multi_platform_manager); +REGISTER_MODULE_INITIALIZER_SEQUENCE(multi_platform_manager_listener, + host_platform); diff --git a/tensorflow/stream_executor/multi_platform_manager.cc b/tensorflow/stream_executor/multi_platform_manager.cc index 5b51398d8ca..bbb56071f49 100644 --- a/tensorflow/stream_executor/multi_platform_manager.cc +++ b/tensorflow/stream_executor/multi_platform_manager.cc @@ -15,62 +15,86 @@ limitations under the License. 
#include "tensorflow/stream_executor/multi_platform_manager.h" +#include "absl/base/thread_annotations.h" +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "absl/synchronization/mutex.h" #include "tensorflow/stream_executor/lib/error.h" #include "tensorflow/stream_executor/lib/initialize.h" #include "tensorflow/stream_executor/lib/str_util.h" #include "tensorflow/stream_executor/lib/stringprintf.h" namespace stream_executor { +namespace { -/* static */ mutex MultiPlatformManager::platforms_mutex_{LINKER_INITIALIZED}; +class MultiPlatformManagerImpl { + public: + port::Status RegisterPlatform(std::unique_ptr platform) + LOCKS_EXCLUDED(mu_); -/* static */ port::StatusOr MultiPlatformManager::LookupByNameLocked( - const string& target) { - PlatformMap* platform_map = GetPlatformMap(); - auto it = platform_map->find(port::Lowercase(target)); - if (it == platform_map->end()) { - return port::Status( - port::error::NOT_FOUND, - "could not find registered platform with name: \"" + target + "\""); - } - return it->second; -} + port::StatusOr PlatformWithName(absl::string_view target) + LOCKS_EXCLUDED(mu_); -/* static */ port::StatusOr MultiPlatformManager::LookupByIdLocked( - const Platform::Id& id) { - PlatformIdMap* platform_map = GetPlatformByIdMap(); - auto it = platform_map->find(id); - if (it == platform_map->end()) { - return port::Status( - port::error::NOT_FOUND, - port::Printf("could not find registered platform with id: 0x%p", id)); - } - return it->second; -} + port::StatusOr PlatformWithId(const Platform::Id& id) + LOCKS_EXCLUDED(mu_); -/* static */ port::Status MultiPlatformManager::RegisterPlatform( + port::StatusOr InitializePlatformWithName( + absl::string_view target, const std::map& options) + LOCKS_EXCLUDED(mu_); + port::StatusOr InitializePlatformWithId( + const Platform::Id& id, const std::map& options) + LOCKS_EXCLUDED(mu_); + + std::vector AllPlatforms() LOCKS_EXCLUDED(mu_); + + using Listener = 
MultiPlatformManager::Listener; + port::Status RegisterListener(std::unique_ptr listener) + LOCKS_EXCLUDED(mu_); + + private: + // Looks up the platform object with the given name. Assumes the Platforms + // mutex is held. + port::StatusOr LookupByNameLocked(absl::string_view target) + EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Looks up the platform object with the given id. Assumes the Platforms + // mutex is held. + port::StatusOr LookupByIdLocked(const Platform::Id& id) + EXCLUSIVE_LOCKS_REQUIRED(mu_); + + absl::Mutex mu_; + std::vector> listeners_ GUARDED_BY(mu_); + absl::flat_hash_map id_map_ GUARDED_BY(mu_); + absl::flat_hash_map name_map_ GUARDED_BY(mu_); +}; + +port::Status MultiPlatformManagerImpl::RegisterPlatform( std::unique_ptr platform) { CHECK(platform != nullptr); string key = port::Lowercase(platform->Name()); - mutex_lock lock(platforms_mutex_); - if (GetPlatformMap()->find(key) != GetPlatformMap()->end()) { + absl::MutexLock lock(&mu_); + if (name_map_.find(key) != name_map_.end()) { return port::Status(port::error::INTERNAL, "platform is already registered with name: \"" + platform->Name() + "\""); } - GetPlatformByIdMap()->insert(std::make_pair(platform->id(), platform.get())); + Platform* platform_ptr = platform.get(); + CHECK(id_map_.emplace(platform->id(), platform_ptr).second); // Release ownership/uniqueness to prevent destruction on program exit. // This avoids Platforms "cleaning up" on program exit, because otherwise, // there are _very_ tricky races between StreamExecutor and underlying // platforms (CUDA, OpenCL) during exit. Since these are fixed-size and 1x per // program, these are deemed acceptable. 
- (*GetPlatformMap())[key] = platform.release(); + name_map_[key] = platform.release(); + for (const auto& listener : listeners_) { + listener->PlatformRegistered(platform_ptr); + } return port::Status::OK(); } -/* static */ port::StatusOr MultiPlatformManager::PlatformWithName( - const string& target) { - mutex_lock lock(platforms_mutex_); +port::StatusOr MultiPlatformManagerImpl::PlatformWithName( + absl::string_view target) { + absl::MutexLock lock(&mu_); SE_ASSIGN_OR_RETURN(Platform * platform, LookupByNameLocked(target)); if (!platform->Initialized()) { @@ -80,9 +104,9 @@ namespace stream_executor { return platform; } -/* static */ port::StatusOr MultiPlatformManager::PlatformWithId( +port::StatusOr MultiPlatformManagerImpl::PlatformWithId( const Platform::Id& id) { - mutex_lock lock(platforms_mutex_); + absl::MutexLock lock(&mu_); SE_ASSIGN_OR_RETURN(Platform * platform, LookupByIdLocked(id)); if (!platform->Initialized()) { @@ -92,15 +116,15 @@ namespace stream_executor { return platform; } -/* static */ port::StatusOr -MultiPlatformManager::InitializePlatformWithName( - const string& target, const std::map& options) { - mutex_lock lock(platforms_mutex_); +port::StatusOr MultiPlatformManagerImpl::InitializePlatformWithName( + absl::string_view target, const std::map& options) { + absl::MutexLock lock(&mu_); SE_ASSIGN_OR_RETURN(Platform * platform, LookupByNameLocked(target)); if (platform->Initialized()) { - return port::Status(port::error::FAILED_PRECONDITION, - "platform \"" + target + "\" is already initialized"); + return port::Status( + port::error::FAILED_PRECONDITION, + absl::StrCat("platform \"", target, "\" is already initialized")); } SE_RETURN_IF_ERROR(platform->Initialize(options)); @@ -108,10 +132,9 @@ MultiPlatformManager::InitializePlatformWithName( return platform; } -/* static */ port::StatusOr -MultiPlatformManager::InitializePlatformWithId( +port::StatusOr MultiPlatformManagerImpl::InitializePlatformWithId( const Platform::Id& id, const 
std::map& options) { - mutex_lock lock(platforms_mutex_); + absl::MutexLock lock(&mu_); SE_ASSIGN_OR_RETURN(Platform * platform, LookupByIdLocked(id)); if (platform->Initialized()) { @@ -125,10 +148,90 @@ MultiPlatformManager::InitializePlatformWithId( return platform; } -/* static */ void MultiPlatformManager::ClearPlatformRegistry() { - mutex_lock lock(platforms_mutex_); - GetPlatformMap()->clear(); - GetPlatformByIdMap()->clear(); +port::Status MultiPlatformManagerImpl::RegisterListener( + std::unique_ptr listener) { + absl::MutexLock lock(&mu_); + CHECK(id_map_.empty()); + CHECK(name_map_.empty()); + listeners_.push_back(std::move(listener)); + return port::Status::OK(); +} + +std::vector MultiPlatformManagerImpl::AllPlatforms() { + absl::MutexLock lock(&mu_); + CHECK_EQ(id_map_.size(), name_map_.size()); + std::vector platforms; + platforms.reserve(id_map_.size()); + for (const auto& entry : id_map_) { + platforms.push_back(entry.second); + } + return platforms; +} + +port::StatusOr MultiPlatformManagerImpl::LookupByNameLocked( + absl::string_view target) { + auto it = name_map_.find(port::Lowercase(target)); + if (it == name_map_.end()) { + return port::Status( + port::error::NOT_FOUND, + absl::StrCat("Could not find registered platform with name: \"", target, + "\"")); + } + return it->second; +} + +port::StatusOr MultiPlatformManagerImpl::LookupByIdLocked( + const Platform::Id& id) { + auto it = id_map_.find(id); + if (it == id_map_.end()) { + return port::Status( + port::error::NOT_FOUND, + port::Printf("could not find registered platform with id: 0x%p", id)); + } + return it->second; +} + +MultiPlatformManagerImpl& Impl() { + static MultiPlatformManagerImpl* impl = new MultiPlatformManagerImpl; + return *impl; +} + +} // namespace + +/*static*/ port::Status MultiPlatformManager::RegisterPlatform( + std::unique_ptr platform) { + return Impl().RegisterPlatform(std::move(platform)); +} + +/*static*/ port::StatusOr MultiPlatformManager::PlatformWithName( + 
absl::string_view target) { + return Impl().PlatformWithName(target); +} + +/*static*/ port::StatusOr MultiPlatformManager::PlatformWithId( + const Platform::Id& id) { + return Impl().PlatformWithId(id); +} + +/*static*/ port::StatusOr +MultiPlatformManager::InitializePlatformWithName( + absl::string_view target, const std::map& options) { + return Impl().InitializePlatformWithName(target, options); +} + +/*static*/ port::StatusOr +MultiPlatformManager::InitializePlatformWithId( + const Platform::Id& id, const std::map& options) { + return Impl().InitializePlatformWithId(id, options); +} + +/*static*/ port::Status MultiPlatformManager::RegisterListener( + std::unique_ptr listener) { + return Impl().RegisterListener(std::move(listener)); +} + +/*static*/ std::vector MultiPlatformManager::AllPlatforms() { + return Impl().AllPlatforms(); } } // namespace stream_executor @@ -141,3 +244,15 @@ REGISTER_MODULE_INITIALIZER( // purposes from Platform subclasses that register // themselves with the MultiPlatformManager. }); + +REGISTER_MODULE_INITIALIZER( + multi_platform_manager_listener, + { + // Nothing -- this is just a module initializer definition to reference + // for sequencing registration of listeners with the + // MultiPlatformManager. + }); + +// Listener registration should happen before platform registration. +REGISTER_MODULE_INITIALIZER_SEQUENCE(multi_platform_manager_listener, + multi_platform_manager); diff --git a/tensorflow/stream_executor/multi_platform_manager.h b/tensorflow/stream_executor/multi_platform_manager.h index 146a128e85c..06f5ae2c2ba 100644 --- a/tensorflow/stream_executor/multi_platform_manager.h +++ b/tensorflow/stream_executor/multi_platform_manager.h @@ -67,14 +67,14 @@ limitations under the License. 
#include #include #include +#include +#include "absl/strings/string_view.h" #include "tensorflow/stream_executor/lib/initialize.h" #include "tensorflow/stream_executor/lib/status.h" #include "tensorflow/stream_executor/lib/statusor.h" #include "tensorflow/stream_executor/platform.h" -#include "tensorflow/stream_executor/platform/mutex.h" #include "tensorflow/stream_executor/platform/port.h" -#include "tensorflow/stream_executor/platform/thread_annotations.h" namespace stream_executor { @@ -84,9 +84,8 @@ class MultiPlatformManager { // Registers a platform object, returns an error status if the platform is // already registered. The associated listener, if not null, will be used to // trace events for ALL executors for that platform. - // Takes ownership of listener. - static port::Status RegisterPlatform(std::unique_ptr platform) - LOCKS_EXCLUDED(platforms_mutex_); + // Takes ownership of platform. + static port::Status RegisterPlatform(std::unique_ptr platform); // Retrieves the platform registered with the given platform name (e.g. // "CUDA", "OpenCL", ...) or id (an opaque, comparable value provided by the @@ -98,10 +97,8 @@ class MultiPlatformManager { // If the requested platform is not registered, an error status is returned. // Ownership of the platform is NOT transferred to the caller -- // the MultiPlatformManager owns the platforms in a singleton-like fashion. - static port::StatusOr PlatformWithName(const string& target) - LOCKS_EXCLUDED(platforms_mutex_); - static port::StatusOr PlatformWithId(const Platform::Id& id) - LOCKS_EXCLUDED(platforms_mutex_); + static port::StatusOr PlatformWithName(absl::string_view target); + static port::StatusOr PlatformWithId(const Platform::Id& id); // Retrieves the platform registered with the given platform name (e.g. // "CUDA", "OpenCL", ...) 
or id (an opaque, comparable value provided by the @@ -114,14 +111,12 @@ class MultiPlatformManager { // Ownership of the platform is NOT transferred to the caller -- // the MultiPlatformManager owns the platforms in a singleton-like fashion. static port::StatusOr InitializePlatformWithName( - const string& target, const std::map& options) - LOCKS_EXCLUDED(platforms_mutex_); - static port::StatusOr InitializePlatformWithId( - const Platform::Id& id, const std::map& options) - LOCKS_EXCLUDED(platforms_mutex_); + absl::string_view target, const std::map& options); - // Clears the set of registered platforms, primarily used for testing. - static void ClearPlatformRegistry() LOCKS_EXCLUDED(platforms_mutex_); + static port::StatusOr InitializePlatformWithId( + const Platform::Id& id, const std::map& options); + + static std::vector AllPlatforms(); // Although the MultiPlatformManager "owns" its platforms, it holds them as // undecorated pointers to prevent races during program exit (between this @@ -135,57 +130,32 @@ class MultiPlatformManager { // of any platforms registered with it, and leak checking should be disabled // during allocation of such Platforms, to avoid spurious reporting at program // exit. - using PlatformMap = std::map; - // Provides access to the available set of platforms under a lock. - static port::Status WithPlatforms( - std::function callback) - LOCKS_EXCLUDED(platforms_mutex_) { - mutex_lock lock(platforms_mutex_); - return callback(GetPlatformMap()); - } - - private: - using PlatformIdMap = std::map; - - static mutex platforms_mutex_; - - // TODO(b/22689637): Clean up these two maps; make sure they coexist nicely. - // TODO(b/22689637): Move this (whatever the final/"official" map is) to - // plugin_regstry.h, along with the associated functionality. - // Platform-name-to-object mapping. These platforms are registered via module - // initializers, and linkage determines which platforms are available to a - // given target. 
- static PlatformMap* GetPlatformMap() { - static PlatformMap* instance = new PlatformMap; - return instance; - } - - // Holds a Platform::Id-to-object mapping. - // Unlike platforms_ above, this map does not own its contents. - static PlatformIdMap* GetPlatformByIdMap() { - static PlatformIdMap* instance = new PlatformIdMap; - return instance; - } - - // Looks up the platform object with the given name. Assumes the Platforms - // mutex is held. - static port::StatusOr LookupByNameLocked(const string& target) - EXCLUSIVE_LOCKS_REQUIRED(platforms_mutex_); - - // Looks up the platform object with the given id. Assumes the Platforms - // mutex is held. - static port::StatusOr LookupByIdLocked(const Platform::Id& id) - EXCLUSIVE_LOCKS_REQUIRED(platforms_mutex_); - - SE_DISALLOW_COPY_AND_ASSIGN(MultiPlatformManager); + // Interface for a listener that gets notfied at certain events. + class Listener { + public: + virtual ~Listener() = default; + // Callback that is invoked when a Platform is registered. + virtual void PlatformRegistered(Platform* platform) = 0; + }; + // Registers a listeners to receive notifications about certain events. + // Precondition: No Platform has been registered yet. + static port::Status RegisterListener(std::unique_ptr listener); }; } // namespace stream_executor -// multi_platform_manager.cc will define this instance. Includers of this header -// should use +// multi_platform_manager.cc will define these instances. 
+// +// Registering a platform: // REGISTER_MODULE_INITIALIZER_SEQUENCE(my_platform, multi_platform_manager); +// REGISTER_MODULE_INITIALIZER_SEQUENCE(multi_platform_manager_listener, +// my_platform); +// +// Registering a listener: +// REGISTER_MODULE_INITIALIZER_SEQUENCE(my_listener, +// multi_platform_manager_listener); DECLARE_MODULE_INITIALIZER(multi_platform_manager); +DECLARE_MODULE_INITIALIZER(multi_platform_manager_listener); #endif // TENSORFLOW_STREAM_EXECUTOR_MULTI_PLATFORM_MANAGER_H_ From 7bfc20ffebc625e6fa8ae22e33f1515af3f28f67 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Tue, 6 Nov 2018 13:11:52 -0800 Subject: [PATCH 200/540] Creating python objects for Tensor Forest resources. In TF2, resources / state need to be explicitly owned by python objects instead of just residing in the graph. PiperOrigin-RevId: 220338075 --- .../tensor_forest/python/ops/model_ops.py | 74 +++++++++++++++--- .../tensor_forest/python/ops/stats_ops.py | 76 ++++++++++++++++--- 2 files changed, 127 insertions(+), 23 deletions(-) diff --git a/tensorflow/contrib/tensor_forest/python/ops/model_ops.py b/tensorflow/contrib/tensor_forest/python/ops/model_ops.py index 596c59ead34..290c16fe396 100644 --- a/tensorflow/contrib/tensor_forest/python/ops/model_ops.py +++ b/tensorflow/contrib/tensor_forest/python/ops/model_ops.py @@ -17,6 +17,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import functools + from tensorflow.contrib.tensor_forest.python.ops import gen_model_ops # pylint: disable=unused-import @@ -28,10 +30,12 @@ from tensorflow.contrib.tensor_forest.python.ops.gen_model_ops import update_mod # pylint: enable=unused-import from tensorflow.contrib.util import loader +from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.ops import resources from tensorflow.python.platform import resource_loader from tensorflow.python.training import saver +from 
tensorflow.python.training.checkpointable import tracking _model_ops = loader.load_op_library( @@ -88,6 +92,59 @@ class TreeVariableSavable(saver.BaseSaverBuilder.SaveableObject): params=self.params.serialized_params_proto) +class TreeVariable(tracking.TrackableResource): + """A tree model.""" + + def __init__(self, params, tree_config, stats_handle, name, container=None): + self._params = params + self._tree_config = tree_config + self._stats_handle = stats_handle + self._name = name + self._container = container + self._init_op = None + super(TreeVariable, self).__init__() + self._resource_handle = self.create_resource() + + def create_resource(self): + if context.executing_eagerly(): + # TODO(allenl): This will leak memory due to kernel caching by the + # shared_name attribute value (but is better than the alternative of + # sharing everything by default when executing eagerly; hopefully creating + # tables in a loop is uncommon). + shared_name = "tree_variable_%d" % (ops.uid(),) + else: + shared_name = self._name + return gen_model_ops.decision_tree_resource_handle_op( + self._container, shared_name=shared_name, name=self._name) + + def initialize(self): + return gen_model_ops.create_tree_variable( + self.resource_handle, + self._tree_config, + params=self._params.serialized_params_proto) + + @property + def initializer(self): + if self._init_op is None: + self._init_op = self.initialize() + return self._init_op + + def is_initialized(self): + return gen_model_ops.tree_is_initialized_op(self.resource_handle) + + def _gather_saveables_for_checkpoint(self): + """For object-based checkpointing.""" + return { + "tree_variable": + functools.partial( + TreeVariableSavable, + params=self._params, + tree_handle=self.resource_handle, + stats_handle=self._stats_handle, + create_op=self._init_op) + } + + def tree_variable(params, tree_config, stats_handle, name, container=None): r"""Creates a tree model and returns a handle to it. 
@@ -102,18 +159,13 @@ def tree_variable(params, tree_config, stats_handle, name, container=None): A `Tensor` of type mutable `string`. The handle to the tree. """ with ops.name_scope(name, "TreeVariable") as name: - resource_handle = gen_model_ops.decision_tree_resource_handle_op( - container, shared_name=name, name=name) - - create_op = gen_model_ops.create_tree_variable( - resource_handle, - tree_config, - params=params.serialized_params_proto) - is_initialized_op = gen_model_ops.tree_is_initialized_op(resource_handle) + tree_var = TreeVariable(params, tree_config, stats_handle, name, container) + resource_handle = tree_var.resource_handle + create_op = tree_var.initializer + is_initialized_op = tree_var.is_initialized() # Adds the variable to the savable list. - saveable = TreeVariableSavable(params, resource_handle, stats_handle, - create_op, - resource_handle.name) + saveable = tree_var._gather_saveables_for_checkpoint()["tree_variable"]( # pylint: disable=protected-access + name=resource_handle.name) ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable) resources.register_resource(resource_handle, create_op, is_initialized_op) return resource_handle diff --git a/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py b/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py index 44d486edecc..9184198cd4c 100644 --- a/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py +++ b/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py @@ -17,6 +17,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import functools + from tensorflow.contrib.tensor_forest.python.ops import gen_stats_ops # pylint: disable=unused-import from tensorflow.contrib.tensor_forest.python.ops.gen_stats_ops import finalize_tree @@ -25,10 +27,12 @@ from tensorflow.contrib.tensor_forest.python.ops.gen_stats_ops import process_in # pylint: enable=unused-import from tensorflow.contrib.util import loader +from 
tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.ops import resources from tensorflow.python.platform import resource_loader from tensorflow.python.training import saver +from tensorflow.python.training.checkpointable import tracking _stats_ops = loader.load_op_library( @@ -84,8 +88,58 @@ class FertileStatsVariableSavable(saver.BaseSaverBuilder.SaveableObject): params=self.params.serialized_params_proto) -def fertile_stats_variable(params, stats_config, name, - container=None): +class FertileStatsVariable(tracking.TrackableResource): + """A Fertile stats variable.""" + + def __init__(self, params, stats_config, name, container=None): + self._params = params + self._stats_config = stats_config + self._name = name + self._container = container + self._init_op = None + super(FertileStatsVariable, self).__init__() + self._resource_handle = self.create_resource() + + def create_resource(self): + if context.executing_eagerly(): + # TODO(allenl): This will leak memory due to kernel caching by the + # shared_name attribute value (but is better than the alternative of + # sharing everything by default when executing eagerly; hopefully creating + # tables in a loop is uncommon). 
+ shared_name = "fertile_stats_variable_%d" % (ops.uid(),) + else: + shared_name = self._name + return gen_stats_ops.fertile_stats_resource_handle_op( + self._container, shared_name=shared_name, name=self._name) + + def initialize(self): + return gen_stats_ops.create_fertile_stats_variable( + self.resource_handle, + self._stats_config, + params=self._params.serialized_params_proto) + + @property + def initializer(self): + if self._init_op is None: + self._init_op = self.initialize() + return self._init_op + + def is_initialized(self): + return gen_stats_ops.fertile_stats_is_initialized_op(self.resource_handle) + + def _gather_saveables_for_checkpoint(self): + """For object-based checkpointing.""" + return { + "fertile_stats_variable": + functools.partial( + FertileStatsVariableSavable, + params=self._params, + stats_handle=self.resource_handle, + create_op=self.initializer) + } + + +def fertile_stats_variable(params, stats_config, name, container=None): r"""Creates a stats object and returns a handle to it. Args: @@ -98,17 +152,15 @@ def fertile_stats_variable(params, stats_config, name, A `Tensor` of type mutable `string`. The handle to the stats. """ with ops.name_scope(name, "FertileStatsVariable") as name: - resource_handle = gen_stats_ops.fertile_stats_resource_handle_op( - container, shared_name=name, name=name) - - create_op = gen_stats_ops.create_fertile_stats_variable( - resource_handle, stats_config, - params=params.serialized_params_proto) - is_initialized_op = gen_stats_ops.fertile_stats_is_initialized_op( - resource_handle) + fertile_stats_var = FertileStatsVariable(params, stats_config, name, + container) + resource_handle = fertile_stats_var.resource_handle + create_op = fertile_stats_var.initializer + is_initialized_op = fertile_stats_var.is_initialized() # Adds the variable to the savable list. 
- saveable = FertileStatsVariableSavable(params, resource_handle, create_op, - resource_handle.name) + saveable = ( + fertile_stats_var._gather_saveables_for_checkpoint()[ # pylint: disable=protected-access + "fertile_stats_variable"](name=resource_handle.name)) ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable) resources.register_resource(resource_handle, create_op, is_initialized_op) return resource_handle From ff0ef344397d4b74a7770c4a4aa888dda8e0f6a7 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 6 Nov 2018 13:31:50 -0800 Subject: [PATCH 201/540] pack: add support for int64 PiperOrigin-RevId: 220341536 --- tensorflow/lite/kernels/pack.cc | 7 +++++- tensorflow/lite/kernels/pack_test.cc | 34 ++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/kernels/pack.cc b/tensorflow/lite/kernels/pack.cc index 24fabccde09..479495c875d 100644 --- a/tensorflow/lite/kernels/pack.cc +++ b/tensorflow/lite/kernels/pack.cc @@ -40,7 +40,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // TODO(renjieliu): Support negative axis. 
TF_LITE_ENSURE(context, data->axis >= 0); if (input0->type != kTfLiteInt32 && input0->type != kTfLiteFloat32 && - input0->type != kTfLiteUInt8 && input0->type != kTfLiteInt16) { + input0->type != kTfLiteUInt8 && input0->type != kTfLiteInt16 && + input0->type != kTfLiteInt64) { context->ReportError(context, "Type '%s' is not supported by pack.", TfLiteTypeGetName(input0->type)); return kTfLiteError; @@ -110,6 +111,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { PackImpl(context, node, output, data->values_count, data->axis); break; } + case kTfLiteInt64: { + PackImpl(context, node, output, data->values_count, data->axis); + break; + } default: { context->ReportError(context, "Type '%s' is not supported by pack.", TfLiteTypeGetName(output->type)); diff --git a/tensorflow/lite/kernels/pack_test.cc b/tensorflow/lite/kernels/pack_test.cc index a47e9ff40d0..4f58debc5c8 100644 --- a/tensorflow/lite/kernels/pack_test.cc +++ b/tensorflow/lite/kernels/pack_test.cc @@ -113,6 +113,40 @@ TEST(PackOpTest, Int32MultilDimensions) { ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12})); } +// int64 tests. 
+TEST(PackOpTest, Int64ThreeInputs) { + PackOpModel model({TensorType_INT64, {2}}, 0, 3); + model.SetInput(0, {1LL << 33, 4}); + model.SetInput(1, {2, 5}); + model.SetInput(2, {3, -(1LL << 34)}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(3, 2)); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({1LL << 33, 4LL, 2LL, 5LL, 3LL, -(1LL << 34)})); +} + +TEST(PackOpTest, Int64ThreeInputsDifferentAxis) { + PackOpModel model({TensorType_INT64, {2}}, 1, 3); + model.SetInput(0, {1LL << 33, 4}); + model.SetInput(1, {2, 5}); + model.SetInput(2, {3, -(1LL << 34)}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 3)); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({1LL << 33, 2LL, 3LL, 4LL, 5LL, -(1LL << 34)})); +} + +TEST(PackOpTest, Int64MultilDimensions) { + PackOpModel model({TensorType_INT64, {2, 3}}, 1, 2); + model.SetInput(0, {1LL << 33, 2, 3, 4, 5, 6}); + model.SetInput(1, {7, 8, -(1LL << 34), 10, 11, 12}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 2, 3)); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({1LL << 33, 2LL, 3LL, 7LL, 8LL, -(1LL << 34), + 4LL, 5LL, 6LL, 10LL, 11LL, 12LL})); +} + // uint8 TEST(PackOpTest, Uint8ThreeInputs) { PackOpModel model({TensorType_UINT8, {2}}, 0, 3); From 3908ccbc8ac3c9f066933ceeabfd9d30220676c3 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Tue, 6 Nov 2018 13:34:01 -0800 Subject: [PATCH 202/540] [TF:XLA] Enable multiple streams for the XLA_GPU device, i.e., concurrent computations and transfers. This device is used only for unit tests. While the original intent of this change was to get more test coverage of the multiple stream path by enabling it on XLA_GPU, it turns out that it also fixes a XLA_GPU-specific bug introduced by https://github.com/tensorflow/tensorflow/commit/c4705c30d577138017069a2c897e8c9d66eb49bc where a CUDA host callback calls done() in XlaDeviceContext::CopyCPUTensorToDevice(). 
CUDA host callbacks are forbidden from calling CUDA driver methods, but done() can deallocate tensors, which ends up calling cuFree(). In multi-stream mode, we call done() eagerly, not on a callback, so the problem doesn't arise. PiperOrigin-RevId: 220342021 --- tensorflow/compiler/jit/xla_gpu_device.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc index 717daadc4ac..6e2ea900453 100644 --- a/tensorflow/compiler/jit/xla_gpu_device.cc +++ b/tensorflow/compiler/jit/xla_gpu_device.cc @@ -59,7 +59,7 @@ Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& session_options, options.device_name = DEVICE_XLA_GPU; options.device_ordinal = 0; options.compilation_device_name = DEVICE_GPU_XLA_JIT; - options.use_multiple_streams = false; + options.use_multiple_streams = true; auto device = absl::make_unique(session_options, options); // TODO(b/78468222): Uncomment after fixing this bug From 5f1379e4a626c0d84ec377eb252afeb1793798b2 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Tue, 6 Nov 2018 13:40:30 -0800 Subject: [PATCH 203/540] [XLA] Move ExtractSubShapedBuffer into xla::ScopedShapedBuffer as a new TakeSubTree() method. This method seems like it more naturally belongs in the XLA libraries. Simplify and generalize the implementation and add a test. 
PiperOrigin-RevId: 220343391 --- tensorflow/compiler/jit/BUILD | 25 ------ tensorflow/compiler/jit/xla_launch_util.cc | 40 +--------- tensorflow/compiler/jit/xla_launch_util.h | 11 --- .../compiler/jit/xla_launch_util_test.cc | 64 ---------------- tensorflow/compiler/xla/service/BUILD | 1 + .../compiler/xla/service/shaped_buffer.cc | 19 +++++ .../compiler/xla/service/shaped_buffer.h | 5 ++ .../xla/service/shaped_buffer_test.cc | 76 +++++++++++++++++++ tensorflow/compiler/xla/shape_util.cc | 5 ++ tensorflow/compiler/xla/shape_util.h | 3 + 10 files changed, 111 insertions(+), 138 deletions(-) delete mode 100644 tensorflow/compiler/jit/xla_launch_util_test.cc diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index f98ba487354..ba86f2e247d 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -651,31 +651,6 @@ tf_cc_test( ], ) -tf_cc_test( - name = "xla_launch_util_test", - size = "small", - srcs = ["xla_launch_util_test.cc"], - deps = [ - ":common", - ":xla_compilation_cache", - ":xla_launch_util", - ":xla_tensor", - "//tensorflow/compiler/tf2xla:common", - "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/core:core_cpu_internal", - "//tensorflow/core:framework", - "//tensorflow/core:gpu_runtime", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core:test", - "//tensorflow/core/kernels:variable_ops", - ], -) - cc_library( name = "xla_fusion_optimizer", srcs = ["xla_fusion_optimizer.cc"], diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index 6e51bfca4a1..504bf51f2ad 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -191,40 +191,6 @@ Status XlaAllocator::Deallocate(int device_ordinal, 
se::DeviceMemoryBase mem) { return Status::OK(); } -namespace internal { -// Return the 'index''th subtree of the given ShapedBuffer as a -// ScopedShapedBuffer. The returned ScopedShapedBuffer takes ownership of the -// subtree, and sets the input's buffer pointers to nullptr for the subtree. -ScopedShapedBuffer ExtractSubShapedBuffer( - ShapedBuffer* shaped_buffer, int index, - xla::DeviceMemoryAllocator* allocator) { - const xla::Shape& on_host_shape = xla::ShapeUtil::GetTupleElementShape( - shaped_buffer->on_host_shape(), index); - const xla::Shape& on_device_shape = xla::ShapeUtil::GetTupleElementShape( - shaped_buffer->on_device_shape(), index); - - ShapedBuffer sub_shaped_buffer(on_host_shape, on_device_shape, - shaped_buffer->platform(), - shaped_buffer->device_ordinal()); - - auto& shape_tree = shaped_buffer->buffers(); - auto& sub_shape_tree = sub_shaped_buffer.buffers(); - sub_shape_tree.CopySubtreeFrom(shape_tree, - /*source_base_index=*/{index}, - /*target_base_index=*/{}); - shape_tree.ForEachMutableElement( - [index](const xla::ShapeIndex& shape_index, - tensorflow::se::DeviceMemoryBase* data) { - // shape_index is empty for the root node. Ignore that. 
- if (!shape_index.empty() && shape_index[0] == index) { - *data = tensorflow::se::DeviceMemoryBase(nullptr, 0); - } - }); - return ScopedShapedBuffer(std::move(sub_shaped_buffer), allocator); -} -} // namespace internal -using internal::ExtractSubShapedBuffer; - XlaComputationLaunchContext::XlaComputationLaunchContext( xla::LocalClient* client, xla::DeviceMemoryAllocator* xla_allocator, bool allocate_xla_tensors, bool use_multiple_streams) @@ -391,8 +357,7 @@ Status XlaComputationLaunchContext::PopulateOutputs( TF_RETURN_IF_ERROR(ctx->allocate_output(i, shape, &output_tensor)); XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor); if (xla_tensor) { - xla_tensor->set_shaped_buffer(ScopedShapedBuffer( - ExtractSubShapedBuffer(&output, output_num, xla_allocator_))); + xla_tensor->set_shaped_buffer(output.TakeSubTree({output_num})); if (use_multiple_streams_) { xla_tensor->ResetDefinitionEvent(definition_event, stream); } @@ -457,8 +422,7 @@ Status XlaComputationLaunchContext::PopulateOutputs( ctx->allocate_temp(write.type, write.shape, &output_tensor)); XlaTensor* xla_tensor = XlaTensor::FromTensor(&output_tensor); CHECK(xla_tensor); - xla_tensor->set_shaped_buffer( - ExtractSubShapedBuffer(&output, output_num, xla_allocator_)); + xla_tensor->set_shaped_buffer(output.TakeSubTree({output_num})); if (use_multiple_streams_) { xla_tensor->ResetDefinitionEvent(definition_event, stream); } diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index 81e205d13f7..ea4efa0722c 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -223,17 +223,6 @@ class XlaTensorBuffer : public TensorBuffer { Allocator* allocator_; }; -// Exposed in this header file for microbenchmarking purposes, but this is an -// internal implementation detail. -namespace internal { -// Return the 'index''th subtree of the given ShapedBuffer as a -// ScopedShapedBuffer. 
The returned ScopedShapedBuffer takes ownership of the -// subtree, and sets the input's buffer pointers to nullptr for the subtree. -xla::ScopedShapedBuffer ExtractSubShapedBuffer( - xla::ShapedBuffer* shaped_buffer, int index, - xla::DeviceMemoryAllocator* allocator); -} // namespace internal - } // namespace tensorflow #endif // TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_ diff --git a/tensorflow/compiler/jit/xla_launch_util_test.cc b/tensorflow/compiler/jit/xla_launch_util_test.cc deleted file mode 100644 index a45932403ec..00000000000 --- a/tensorflow/compiler/jit/xla_launch_util_test.cc +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Contains microbenchmarks for performance critical functions in -// xla_launch_util.cc. - -#include "tensorflow/compiler/jit/xla_launch_util.h" -#include "tensorflow/compiler/tf2xla/shape_util.h" -#include "tensorflow/core/platform/test.h" -#include "tensorflow/core/platform/test_benchmark.h" - -// Test ExtractSubBuffer with different depths (depth of ShapeTree) and fan-outs -// (cardinality of each non-leaf node's children). 
-void BM_ExtractSubBuffer(int iters, int depth, int fan_out) { - tensorflow::testing::StopTiming(); - xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {32, 64, 128}); - for (int i = 0; i < depth; ++i) { - std::vector shapes(fan_out, shape); - shape = xla::ShapeUtil::MakeTupleShape(shapes); - } - xla::ShapedBuffer shaped_buffer(shape, shape, /*platform=*/nullptr, - /*device_ordinal=*/0); - tensorflow::testing::StartTiming(); - for (int i = 0; i < iters; ++i) { - // Extract a buffer from approximately the middle of the first level of the - // tree. - (void)tensorflow::internal::ExtractSubShapedBuffer(&shaped_buffer, - /*index=*/fan_out / 2, - /*allocator=*/nullptr) - .release(); - } -} - -BENCHMARK(BM_ExtractSubBuffer) - ->ArgPair(1, 4) - ->ArgPair(1, 8) - ->ArgPair(1, 32) - ->ArgPair(1, 64) - ->ArgPair(1, 128) - ->ArgPair(1, 256) - ->ArgPair(1, 512) - ->ArgPair(2, 4) - ->ArgPair(2, 8) - ->ArgPair(2, 32) - ->ArgPair(2, 64) - ->ArgPair(2, 128); - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - tensorflow::testing::RunBenchmarks(); - return RUN_ALL_TESTS(); -} diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 04b2f72ac95..cc3355473dd 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -811,6 +811,7 @@ tf_cc_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:ptr_util", + "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", "@com_google_absl//absl/memory", ], diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc index 56952e3adae..28a30b5ee2d 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.cc +++ b/tensorflow/compiler/xla/service/shaped_buffer.cc @@ -157,4 +157,23 @@ void ScopedShapedBuffer::Deallocate() { } } +ScopedShapedBuffer ScopedShapedBuffer::TakeSubTree(ShapeIndexView index) { + 
const xla::Shape& sub_on_host_shape = + xla::ShapeUtil::GetSubshape(on_host_shape(), {index}); + const xla::Shape& sub_on_device_shape = + xla::ShapeUtil::GetSubshape(on_device_shape(), {index}); + + ScopedShapedBuffer output(sub_on_host_shape, sub_on_device_shape, + memory_allocator(), device_ordinal()); + auto src_it = buffers().find(index); + auto dst_it = output.buffers().begin(); + while (dst_it != output.buffers().end()) { + dst_it->second = src_it->second; + src_it->second = tensorflow::se::DeviceMemoryBase(nullptr, 0); + ++src_it; + ++dst_it; + } + return output; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h index e1d26da4a20..f5210c9cfa6 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.h +++ b/tensorflow/compiler/xla/service/shaped_buffer.h @@ -176,6 +176,11 @@ class ScopedShapedBuffer : public ShapedBuffer { // It's the caller's job to ensure that the memory contained therein is freed. TF_MUST_USE_RESULT ShapedBuffer release(); + // Extracts the sub-tree rooted at 'index' and returns a ScopedShapedBuffer + // that holds ownership of the subtree. Sets the buffers corresponding to the + // subtree to null in 'this'. + ScopedShapedBuffer TakeSubTree(ShapeIndexView index); + protected: void Deallocate(); diff --git a/tensorflow/compiler/xla/service/shaped_buffer_test.cc b/tensorflow/compiler/xla/service/shaped_buffer_test.cc index d69e6362e91..ca64bd3c8dd 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer_test.cc +++ b/tensorflow/compiler/xla/service/shaped_buffer_test.cc @@ -20,6 +20,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/core/platform/test_benchmark.h" #include "tensorflow/core/util/ptr_util.h" namespace xla { @@ -107,5 +109,79 @@ TEST(ScopedShapedBufferTest, TestMoveAssignmentOperator) { // TestAllocator's destructor checks that all memory was freed. } +TEST(ScopedShapedBufferTest, TestTakeSubTree) { + TestAllocator allocator; + + Shape s = ShapeUtil::MakeShape(F32, {1}); + s = xla::ShapeUtil::MakeTupleShape(std::vector(2, s)); + s = xla::ShapeUtil::MakeTupleShape(std::vector(3, s)); + + ScopedShapedBuffer sb(s, s, &allocator, /*device_ordinal=*/0); + sb.buffers().ForEachMutableElement( + [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) { + TF_ASSERT_OK_AND_ASSIGN( + OwningDeviceMemory m, + allocator.Allocate(/*device_ordinal=*/0, /*size=*/77)); + *buffer = m.Forget(); + }); + ShapeTree buffers = sb.buffers(); + + // Takes a subtree out of 'sb', and verifies the buffers are as expected. + xla::ShapeIndex subtree_index = {1}; + ScopedShapedBuffer output = sb.TakeSubTree(subtree_index); + + output.buffers().ForEachElement([&](const xla::ShapeIndex& sub_index, + const se::DeviceMemoryBase& buffer) { + xla::ShapeIndex orig_index = subtree_index; + for (int i : sub_index) { + orig_index.push_back(i); + } + EXPECT_TRUE(buffers.find(orig_index)->second.IsSameAs(buffer)); + }); + sb.buffers().ForEachElement( + [&](const xla::ShapeIndex& index, const se::DeviceMemoryBase& buffer) { + if (ShapeIndexView(index).StartsWith(subtree_index)) { + EXPECT_TRUE(buffer.is_null()); + } else { + EXPECT_TRUE(buffers.find(index)->second.IsSameAs(buffer)); + } + }); +} + +// Test TakeSubTree with different depths (depth of ShapeTree) and fan-outs +// (cardinality of each non-leaf node's children). 
+void BM_TakeSubTree(int iters, int depth, int fan_out) { + tensorflow::testing::StopTiming(); + TestAllocator allocator; + xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {32, 64, 128}); + for (int i = 0; i < depth; ++i) { + std::vector shapes(fan_out, shape); + shape = xla::ShapeUtil::MakeTupleShape(shapes); + } + xla::ScopedShapedBuffer shaped_buffer(shape, shape, /*allocator=*/&allocator, + /*device_ordinal=*/0); + tensorflow::testing::StartTiming(); + for (int i = 0; i < iters; ++i) { + // Extract a buffer from approximately the middle of the first level of the + // tree. + (void)shaped_buffer.TakeSubTree(/*index=*/{fan_out / 2}).release(); + } + tensorflow::testing::StopTiming(); +} + +BENCHMARK(BM_TakeSubTree) + ->ArgPair(1, 4) + ->ArgPair(1, 8) + ->ArgPair(1, 32) + ->ArgPair(1, 64) + ->ArgPair(1, 128) + ->ArgPair(1, 256) + ->ArgPair(1, 512) + ->ArgPair(2, 4) + ->ArgPair(2, 8) + ->ArgPair(2, 32) + ->ArgPair(2, 64) + ->ArgPair(2, 128); + } // anonymous namespace } // namespace xla diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index 17120e610cb..d0c35d8dee4 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -74,6 +74,11 @@ std::ostream& operator<<(std::ostream& out, const ShapeIndexView& shape_index) { return out; } +bool ShapeIndexView::StartsWith(ShapeIndexView prefix) const { + return size() >= prefix.size() && + indices_.subspan(0, prefix.size()) == prefix.indices_; +} + namespace { // Returns whether the given primitive type corresponds to an array shape. diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index 191ab04759f..a7a3026cf3f 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -147,6 +147,9 @@ class ShapeIndexView { string ToString() const; + // Returns true if this shape index starts with 'prefix'. 
+ bool StartsWith(ShapeIndexView prefix) const; + private: absl::Span indices_; }; From c076822ac0f6ed15c38f24b966738b3b47ea228a Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Tue, 6 Nov 2018 13:45:47 -0800 Subject: [PATCH 204/540] Automated rollback of commit 7137218c2dcbb3963415b3a09fed64b96dceb3cd PiperOrigin-RevId: 220344381 --- tensorflow/compiler/tests/BUILD | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 194e710f1f1..06501e2177b 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -714,10 +714,6 @@ tf_xla_py_test( size = "medium", srcs = ["reduce_ops_test.py"], shard_count = 5, - tags = [ - # TODO(b/119059212): Re-enable this test in OSS. - "no_oss", - ], deps = [ ":xla_test", "//tensorflow/python:array_ops", From c89d3c799ff2cc63b46925a423db95120b4a7341 Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Tue, 6 Nov 2018 13:47:31 -0800 Subject: [PATCH 205/540] Stop setting device after execution. Placement is done before execution, so TensorHandles always have the correct device set. 
This change was made in https://github.com/tensorflow/tensorflow/commit/897666ccefbc852e76037b486c1963f278af48f4 PiperOrigin-RevId: 220344743 --- .../eager/copy_to_device_node.h | 6 ++-- .../core/common_runtime/eager/execute.cc | 28 +++++++------------ .../common_runtime/eager/kernel_and_device.cc | 8 ++++++ .../common_runtime/eager/kernel_and_device.h | 2 ++ .../common_runtime/eager/tensor_handle.cc | 11 ++------ .../core/common_runtime/eager/tensor_handle.h | 4 +-- 6 files changed, 26 insertions(+), 33 deletions(-) diff --git a/tensorflow/core/common_runtime/eager/copy_to_device_node.h b/tensorflow/core/common_runtime/eager/copy_to_device_node.h index 953b3580c2e..5bc3bb689e0 100644 --- a/tensorflow/core/common_runtime/eager/copy_to_device_node.h +++ b/tensorflow/core/common_runtime/eager/copy_to_device_node.h @@ -44,13 +44,11 @@ class CopyToDeviceNode : public EagerNode { TensorHandle* temp = nullptr; TF_RETURN_IF_ERROR(src_->CopyToDevice(ctx_, dstd_, &temp)); const Tensor* tensor = nullptr; - Device* device = nullptr; - Device* op_device = nullptr; - Status status = temp->TensorAndDevice(&tensor, &device, &op_device); + Status status = temp->Tensor(&tensor); // `temp` is a ready handle. So the following call should return OK. TF_DCHECK_OK(status) << status.error_message(); DCHECK(tensor); - dst_->SetTensorAndDevice(*tensor, device, op_device); + dst_->SetTensor(*tensor); temp->Unref(); return Status::OK(); } diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc index 0fcf5d93877..73b8dc1b8d6 100644 --- a/tensorflow/core/common_runtime/eager/execute.cc +++ b/tensorflow/core/common_runtime/eager/execute.cc @@ -333,17 +333,10 @@ Status EagerLocalExecute(EagerOperation* op, // input handles are ready before executing them. // TODO(agarwal): Consider executing "cheap" kernels inline for performance. 
tensorflow::uint64 id = ctx->NextId(); - const MemoryTypeVector* output_memory_types = nullptr; - output_memory_types = &kernel->kernel()->output_memory_types(); - - Device* op_device = kernel->device(); for (int i = 0; i < *num_retvals; ++i) { - Device* d = op_device; - if (d != nullptr && output_memory_types != nullptr && - (*output_memory_types)[i] == HOST_MEMORY) { - d = nullptr; - } - (*retvals)[i] = new TensorHandle(id, d, op_device, output_dtypes[i], ctx); + (*retvals)[i] = new TensorHandle(id, /* d= */ kernel->OutputDevice(i), + /* op_device= */ kernel->device(), + output_dtypes[i], ctx); } EagerNode* node = new ExecuteNode( id, ctx, op->Device(), op->Inputs(), kernel, maybe_stats.release(), @@ -776,17 +769,16 @@ Status EagerExecute(EagerContext* ctx, Device* device, } } DCHECK_EQ(num_retvals, outputs.size()); - Device* op_device = device; for (int i = 0; i < num_retvals; ++i) { - Device* d = op_device; - if (d != nullptr && output_memory_types != nullptr && - (*output_memory_types)[i] == HOST_MEMORY) { - d = nullptr; - } if (retvals[i] == nullptr) { - retvals[i] = new TensorHandle(outputs[i], d, op_device, ctx); + retvals[i] = + new TensorHandle(outputs[i], /* d= */ kernel->OutputDevice(i), + /* op_device= */ device, ctx); } else { - retvals[i]->SetTensorAndDevice(outputs[i], d, op_device); + // In the async case, the retval is not a nullptr, and its device is + // already set since all TensorHandles always have their device set during + // construction. 
+ retvals[i]->SetTensor(outputs[i]); } } return Status::OK(); diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc index ac9fd187b34..192d22dfd5a 100644 --- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc +++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc @@ -156,4 +156,12 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container, return Status::OK(); } +tensorflow::Device* KernelAndDevice::OutputDevice(int idx) const { + if (device_ != nullptr && + kernel_->output_memory_types()[idx] == HOST_MEMORY) { + return nullptr; + } + return device_; +} + } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.h b/tensorflow/core/common_runtime/eager/kernel_and_device.h index 4b0f5182a0e..52dac94ccca 100644 --- a/tensorflow/core/common_runtime/eager/kernel_and_device.h +++ b/tensorflow/core/common_runtime/eager/kernel_and_device.h @@ -69,6 +69,8 @@ class KernelAndDevice { std::vector* outputs, NodeExecStats* stats, StepStats* step_stats, GraphCollector* graph_collector); + Device* OutputDevice(int idx) const; + const OpKernel* kernel() const { return kernel_.get(); } Device* device() const { return device_; } diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc index 655add00e9b..9d5194f45c6 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc @@ -174,17 +174,12 @@ Status TensorHandle::RemoteAddress(int64* op_id, int32* output_num) { return Status::OK(); } -void TensorHandle::SetTensorAndDevice(const tensorflow::Tensor& tensor, - tensorflow::Device* device, - tensorflow::Device* op_device) { +void TensorHandle::SetTensor(const tensorflow::Tensor& tensor) { mutex_lock l(ctx_mutex_); - DCHECK(node_id_ > 0 && !is_ready_) - << "SetTensorAndDevice should be only called " - << "on non-ready 
handles."; + DCHECK(node_id_ > 0 && !is_ready_) << "SetTensor should be only called " + << "on non-ready handles."; is_ready_ = true; tensor_ = tensor; - device_ = device; - op_device_ = op_device; } Status TensorHandle::CopyToDevice(EagerContext* ctx, tensorflow::Device* dstd, diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h index 4f2c1a31a47..adf76823e9f 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.h +++ b/tensorflow/core/common_runtime/eager/tensor_handle.h @@ -121,9 +121,7 @@ class TensorHandle : public core::RefCounted { // Note that this can be called at most once, and only on non-ready handles, // and makes them ready. - void SetTensorAndDevice(const tensorflow::Tensor& tensor, - tensorflow::Device* device, - tensorflow::Device* op_device); + void SetTensor(const tensorflow::Tensor& tensor); Status CopyToDevice(EagerContext* ctx, tensorflow::Device* dstd, TensorHandle** output); From a00c23b9f3ec79b4f52f800aaf9aad06bda412e1 Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Tue, 6 Nov 2018 13:52:21 -0800 Subject: [PATCH 206/540] [XLA] Decrease intensity of "XLA is experimental" warning in docs. PiperOrigin-RevId: 220345625 --- tensorflow/compiler/xla/g3doc/overview.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/xla/g3doc/overview.md b/tensorflow/compiler/xla/g3doc/overview.md index 6a172c3ae15..d3428b72761 100644 --- a/tensorflow/compiler/xla/g3doc/overview.md +++ b/tensorflow/compiler/xla/g3doc/overview.md @@ -4,11 +4,8 @@ -> Note: XLA is experimental and considered alpha. Most use cases will not -> see improvements in performance (speed or decreased memory usage). We have -> released XLA early so the Open Source Community can contribute to its -> development, as well as create a path for integration with hardware -> accelerators. +> Note: XLA is still under development. 
Some use cases will not +> see improvements in speed or decreased memory usage. XLA (Accelerated Linear Algebra) is a domain-specific compiler for linear algebra that optimizes TensorFlow computations. The results are improvements in From b68d02a98af2f22ac698bd6958286e0bf7e35162 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 6 Nov 2018 13:54:32 -0800 Subject: [PATCH 207/540] floor_mod: add support for int64 PiperOrigin-RevId: 220346035 --- tensorflow/lite/kernels/floor_mod.cc | 8 ++++++-- tensorflow/lite/kernels/floor_mod_test.cc | 11 +++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/kernels/floor_mod.cc b/tensorflow/lite/kernels/floor_mod.cc index beddac2174e..878716a5b4a 100644 --- a/tensorflow/lite/kernels/floor_mod.cc +++ b/tensorflow/lite/kernels/floor_mod.cc @@ -81,7 +81,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type); const TfLiteType type = input1->type; - if (type != kTfLiteInt32 && type != kTfLiteFloat32) { + if (type != kTfLiteInt32 && type != kTfLiteFloat32 && type != kTfLiteInt64) { context->ReportError(context, "Type '%s' is not supported by floor_mod.", TfLiteTypeGetName(type)); return kTfLiteError; @@ -107,7 +107,7 @@ TfLiteStatus EvalImpl(TfLiteContext* context, bool requires_broadcast, TfLiteTensor* output) { const T* denominator_data = GetTensorData(input2); - if (input2->type == kTfLiteInt32) { + if (input2->type == kTfLiteInt32 || input2->type == kTfLiteInt64) { // Validate the denominator only for integer. 
const int num_elements = NumElements(input2); for (int i = 0; i < num_elements; ++i) { @@ -144,6 +144,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return EvalImpl(context, data->requires_broadcast, input1, input2, output); } + case kTfLiteInt64: { + return EvalImpl(context, data->requires_broadcast, input1, + input2, output); + } case kTfLiteFloat32: { return EvalImpl(context, data->requires_broadcast, input1, input2, output); diff --git a/tensorflow/lite/kernels/floor_mod_test.cc b/tensorflow/lite/kernels/floor_mod_test.cc index 9d75f5ce2e3..9d78673f320 100644 --- a/tensorflow/lite/kernels/floor_mod_test.cc +++ b/tensorflow/lite/kernels/floor_mod_test.cc @@ -80,6 +80,17 @@ TEST(FloorModModel, BroadcastFloorMod) { EXPECT_THAT(model.GetOutput(), ElementsAre(-2, 0, -2, -2)); } +TEST(FloorModModel, Int64WithBroadcast) { + FloorModModel model({TensorType_INT64, {1, 2, 2, 1}}, + {TensorType_INT64, {1}}, {TensorType_INT64, {}}); + model.PopulateTensor(model.input1(), {10, -9, -11, (1LL << 34) + 9}); + model.PopulateTensor(model.input2(), {-(1LL << 33)}); + model.Invoke(); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 2, 2, 1)); + EXPECT_THAT(model.GetOutput(), + ElementsAre(-8589934582, -9, -11, -8589934583)); +} + TEST(FloorModModel, FloatSimple) { FloorModModel model({TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}}, From 4828fd514ced0cbb8aa4a55033a81ace48acb8a7 Mon Sep 17 00:00:00 2001 From: Mustafa Ispir Date: Tue, 6 Nov 2018 13:58:16 -0800 Subject: [PATCH 208/540] Updates default value of loss reduction in tf.estimator.LinearClassifier to SUM_OVER_BATCH_SIZE for TF V2. It's SUM in V1. 
PiperOrigin-RevId: 220346638 --- .../v2/tensorflow.estimator.-linear-classifier.pbtxt | 4 ++-- tensorflow/tools/compatibility/tf_upgrade_v2.py | 7 +++++++ tensorflow/tools/compatibility/tf_upgrade_v2_test.py | 8 ++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt index 4b5de2e2450..4acbff2cfff 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt @@ -1,6 +1,6 @@ path: "tensorflow.estimator.LinearClassifier" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { @@ -21,7 +21,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\', \'loss_reduction\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\', \'weighted_sum\', \'sum\'], " + argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\', \'loss_reduction\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'sum\'], " } member_method { name: "eval_dir" diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2.py b/tensorflow/tools/compatibility/tf_upgrade_v2.py index dda45468fcd..779fd6ddbb7 100644 --- a/tensorflow/tools/compatibility/tf_upgrade_v2.py +++ b/tensorflow/tools/compatibility/tf_upgrade_v2.py @@ -96,6 +96,12 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec): "you need to inspect 
this usage manually.\n" ) + # TODO(b/118888586): add default value change to update script. + default_loss_reduction_changed = ( + "WARNING: default value of loss_reduction has been changed to " + "SUM_OVER_BATCH_SIZE.\n" + ) + # Function warnings. placeholder inside warnings will be # replaced by function name. self.function_warnings = { @@ -108,6 +114,7 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec): "tf.train.cosine_decay_restarts": decay_function_comment, "tf.train.linear_cosine_decay": decay_function_comment, "tf.train.noisy_linear_cosine_decay": decay_function_comment, + "tf.estimator.LinearClassifier": default_loss_reduction_changed, } diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py index 6a0c3a787da..cf41a0eaf1e 100644 --- a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py +++ b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py @@ -78,6 +78,14 @@ class TestUpgrade(test_util.TensorFlowTestCase): self.assertEqual(errors, ["test.py:1: %s requires manual check." % decay]) self.assertIn("%s has been changed" % decay, report) + def testEstimatorLossReductionChangege(self): + text = "tf.estimator.LinearClassifier(a, b)\n" + _, report, errors, new_text = self._upgrade(text) + self.assertEqual(text, new_text) + self.assertEqual(errors, ["test.py:1: %s requires manual check." + % "tf.estimator.LinearClassifier"]) + self.assertIn("loss_reduction has been changed", report) + class TestUpgradeFiles(test_util.TensorFlowTestCase): From c857061d2a0835ba1e86630e85d1877f0bb5a3bd Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 6 Nov 2018 13:59:42 -0800 Subject: [PATCH 209/540] Update OVIC instructions to include the detection task. 
PiperOrigin-RevId: 220346878 --- tensorflow/lite/java/ovic/README.md | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/java/ovic/README.md b/tensorflow/lite/java/ovic/README.md index 9e3ceb7e18e..368c486f4f1 100644 --- a/tensorflow/lite/java/ovic/README.md +++ b/tensorflow/lite/java/ovic/README.md @@ -97,9 +97,17 @@ filegroup( ... ``` -* Modify `OvicClassifierTest.java` and `OvicDetectorTest.java` to test your model. +* For classification models, modify `OvicClassifierTest.java`: + * change `TEST_IMAGE_PATH` to `my_test_image.jpg`. -Change `TEST_IMAGE_PATH` to `my_test_image.jpg`. Change either `FLOAT_MODEL_PATH` or `QUANTIZED_MODEL_PATH` to `my_model.lite` depending on whether your model runs inference in float or [8-bit](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/quantize). + * change either `FLOAT_MODEL_PATH` or `QUANTIZED_MODEL_PATH` to `my_model.lite` depending on whether your model runs inference in float or [8-bit](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/quantize). + + * change `TEST_IMAGE_GROUNDTRUTH` (ImageNet class ID) to be consistent with your test image. + +* For detection models, modify `OvicDetectorTest.java`: + * change `TEST_IMAGE_PATH` to `my_test_image.jpg`. + * change `MODEL_PATH` to `my_model.lite`. + * change `GROUNDTRUTH` (COCO class ID) to be consistent with your test image. Now you can run the bazel tests to catch any runtime issues with the submission. @@ -115,12 +123,17 @@ Make sure that you have followed instructions in [Test your submissions](#test-y Modify `tensorflow/lite/java/ovic/demo/app/OvicBenchmarkerActivity.java`: -* Add your model to the benchmarker apk by changing `MODEL_PATH` and `TEST_IMAGE_PATH` below to your submission and test image. +* Add your model to the benchmarker apk by changing `modelPath` and `testImagePath` to your submission and test image. 
``` - private static final String TEST_IMAGE_PATH = "my_test_image.jpg"; - private static final String MODEL_PATH = "my_model.lite"; + if (benchmarkClassification) { + ... + testImagePath = "my_test_image.jpg"; + modelPath = "my_model.lite"; + } else { // Benchmarking detection. + ... ``` +If you are adding a detection model, simply modify `modelPath` and `testImagePath` in the else block above. * Adjust the benchmark parameters when needed: From 76e5b5d632ca845f3708cda20bcbdd0e9de01ace Mon Sep 17 00:00:00 2001 From: Guangda Lai Date: Tue, 6 Nov 2018 14:05:28 -0800 Subject: [PATCH 210/540] Fix the variable name. PiperOrigin-RevId: 220348163 --- tensorflow/examples/saved_model/saved_model_half_plus_two.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/examples/saved_model/saved_model_half_plus_two.py b/tensorflow/examples/saved_model/saved_model_half_plus_two.py index e1231708fa0..dfdde445404 100644 --- a/tensorflow/examples/saved_model/saved_model_half_plus_two.py +++ b/tensorflow/examples/saved_model/saved_model_half_plus_two.py @@ -160,7 +160,7 @@ def _generate_saved_model_for_half_plus_two(export_dir, x2 = tf.identity(tf_example["x2"], name="x2") y3 = tf.add(tf.multiply(a, x2), c) - y2 = tf.identity(y3, name="y3") + y3 = tf.identity(y3, name="y3") # Create an assets file that can be saved and restored as part of the # SavedModel. From d129383be96e29a840789adc024283dbdd862de1 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 6 Nov 2018 14:09:56 -0800 Subject: [PATCH 211/540] Update visibility rules. 
PiperOrigin-RevId: 220349112 --- tensorflow/BUILD | 1 + tensorflow/core/kernels/BUILD | 1 + 2 files changed, 2 insertions(+) diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 11b42f349df..859dc3b8d77 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -352,6 +352,7 @@ package_group( "//tensorflow/...", "//tensorflow_estimator/...", "//tensorflow_fold/llgtm/...", + "//tensorflow_text/...", "//third_party/py/tensor2tensor/...", ], ) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index fed0178176e..1e0d069e63c 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -22,6 +22,7 @@ package_group( "//learning/brain/research/sparse_matrix/...", "//learning/faster_training/...", "//tensorflow/...", + "//tensorflow_text/...", "//third_party/car/...", ], ) From fb2f956281bc0b813753df427a46160c06e27f58 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 6 Nov 2018 14:20:25 -0800 Subject: [PATCH 212/540] Allow the XLA device assignment to be passed to the XRT compilation machinery. PiperOrigin-RevId: 220351155 --- tensorflow/compiler/xrt/xrt.proto | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tensorflow/compiler/xrt/xrt.proto b/tensorflow/compiler/xrt/xrt.proto index 5678f0905ff..6ab77fbaaf0 100644 --- a/tensorflow/compiler/xrt/xrt.proto +++ b/tensorflow/compiler/xrt/xrt.proto @@ -6,6 +6,24 @@ import "tensorflow/compiler/tf2xla/host_compute_metadata.proto"; import "tensorflow/compiler/xla/xla_data.proto"; import "tensorflow/compiler/xla/service/hlo.proto"; +message DeviceAssignment { + message ComputationDevice { + message DeviceMeshCoordinates { + // The mesh coordinates for the device. Usually (X, Y, Core), in the order + // in which they are returned in the TopologyProto. + // X = value(0) + // Y = value(1) + // Core = value(2) + repeated int32 value = 1; + } + // As many replicas as there are in the replicated computation. 
+ repeated DeviceMeshCoordinates replica_devices = 1; + } + // As many ComputationDevice as there are computations (number + of cores per replica). + repeated ComputationDevice computation_devices = 1; +} + // Options for an XLA compilation. message XLAComputationConfig { // The number of replicas the computation will be run on. If this is @@ -23,6 +41,11 @@ message XLAComputationConfig { // computation. per_core_args_and_result_shapes is optional for a // single-core computation. repeated xla.ProgramShape per_core_program_shape = 5; + // Describes how replicated computation instances should be assigned to + // devices. There are num_cores_per_replica computations, and each one will be + // sent to and executed on the set of replica device numbers described in the + // DeviceAssignment proto. + DeviceAssignment device_assignment = 6; } // Options and XLA computation for a compilation. From 0b5dd6fc9948cc710caad031ce3a21591fa6236f Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 6 Nov 2018 14:26:51 -0800 Subject: [PATCH 213/540] Improve timeline logging for distributed execution. - Add DeviceOp (e.g. GPU stream) logging to distributed execution. This is on a best-effort basis. If device tracing is Unavailable, continue without it. - Add timeline logging of RecvBuf transfers over gRPC. - Add bandwidth consumed measurement to click tile for RecvTensor and RecvBuf.
PiperOrigin-RevId: 220352522 --- tensorflow/core/BUILD | 4 +- tensorflow/core/distributed_runtime/BUILD | 5 ++- tensorflow/core/distributed_runtime/rpc/BUILD | 1 + .../rpc/grpc_remote_worker.cc | 40 ++++++++++++++++++- tensorflow/core/distributed_runtime/worker.cc | 39 ++++++++++++++++-- .../worker_cache_logger.cc | 16 ++++---- .../kernel_tests/tensor_array_ops_test.py | 8 ++-- 7 files changed, 95 insertions(+), 18 deletions(-) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 26dd295d0cb..932a0c3819b 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -3049,7 +3049,9 @@ tf_cuda_library( ], copts = tf_copts(), cuda_deps = if_cuda_is_configured(tf_additional_cupti_wrapper_deps() + tf_additional_device_tracer_cuda_deps()), - visibility = ["//visibility:private"], + visibility = [ + "//tensorflow:internal", + ], deps = [ ":core_cpu_internal", ":lib", diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD index 37029f3f1a7..3a534dcfb77 100644 --- a/tensorflow/core/distributed_runtime/BUILD +++ b/tensorflow/core/distributed_runtime/BUILD @@ -15,7 +15,7 @@ filegroup( ]), ) -load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cuda_library") load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test") load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_tests") load("//tensorflow:tensorflow.bzl", "tf_copts") @@ -189,7 +189,7 @@ cc_library( ], ) -cc_library( +tf_cuda_library( name = "worker", srcs = ["worker.cc"], hdrs = [ @@ -204,6 +204,7 @@ cc_library( ":worker_interface", ":worker_session", "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:device_tracer", "//tensorflow/core:lib_internal", ], ) diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD index 4a10d99a607..d122016d3ee 100644 --- a/tensorflow/core/distributed_runtime/rpc/BUILD +++ b/tensorflow/core/distributed_runtime/rpc/BUILD @@ -87,6 
+87,7 @@ cc_library( "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core:protos_all_cc", "//tensorflow/core:worker_proto_cc", "//tensorflow/core/distributed_runtime:tensor_coding", "//tensorflow/core/distributed_runtime:worker_cache_logger", diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc index 6008462d044..885c5e87c17 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc @@ -33,6 +33,7 @@ limitations under the License. #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/tracing.h" +#include "tensorflow/core/protobuf/transport_options.pb.h" #include "tensorflow/core/protobuf/worker.pb.h" namespace tensorflow { @@ -121,7 +122,44 @@ class GrpcRemoteWorker : public WorkerInterface { void RecvBufAsync(CallOptions* call_opts, const RecvBufRequest* request, RecvBufResponse* response, StatusCallback done) override { - IssueRequest(request, response, recvbuf_, std::move(done), call_opts); + int64 start_usec = Env::Default()->NowMicros(); + // Type-specialized logging for this method. 
+ bool logging_active = logger_->LoggingActive() || VLOG_IS_ON(2); + StatusCallback wrapper_done; + const StatusCallback* cb_to_use; + if (!logging_active) { + cb_to_use = &done; // No additional work to do, so just use done directly + } else { + wrapper_done = [this, request, response, done, start_usec](Status s) { + if (logger_->LoggingActive()) { + int64 end_usec = Env::Default()->NowMicros(); + int64 step_id = request->step_id(); + RecvBufRespExtra extra; + response->transport_options().UnpackTo(&extra); + int64 num_bytes = 0; + for (const auto& chunk : extra.tensor_content()) { + num_bytes += chunk.size(); + } + int64 send_start_usec = start_usec; + // Prefer start time reported by the sender, if available. + if (response->send_start_micros()) { + send_start_usec = std::max( + start_usec, static_cast(response->send_start_micros())); + send_start_usec = std::min(send_start_usec, end_usec - 1); + } + const string& key = request->buf_rendezvous_key(); + logger_->RecordDataTransfer( + step_id, send_start_usec, end_usec, key, request->src_device(), + request->dst_device(), num_bytes, "", "RecvBuf"); + } + VLOG(2) << "done callback, req: " << request->DebugString() + << " response " << response->DebugString(); + done(s); + }; + cb_to_use = &wrapper_done; + } + + IssueRequest(request, response, recvbuf_, *cb_to_use, call_opts); } void CompleteGroupAsync(CallOptions* call_opts, diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc index 1ea19c48f09..079c09859f4 100644 --- a/tensorflow/core/distributed_runtime/worker.cc +++ b/tensorflow/core/distributed_runtime/worker.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h" #include "tensorflow/core/distributed_runtime/tensor_coding.h" #include "tensorflow/core/distributed_runtime/worker_session.h" +#include "tensorflow/core/platform/device_tracer.h" #include "tensorflow/core/platform/tracing.h" namespace tensorflow { @@ -179,7 +180,28 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request, request->exec_opts().record_timeline() || request->exec_opts().record_costs()) { collector = new StepStatsCollector(response->mutable_step_stats()); - // TODO(mrry,pbar): GPU tracing for distributed steps. + } + DeviceTracer* tracer = nullptr; + if (collector && request->exec_opts().record_timeline()) { + // If timeline was requested, assume we want hardware level tracing. + std::unique_ptr trptr = CreateDeviceTracer(); + if (trptr) { + tracer = trptr.release(); + Status s = tracer->Start(); + if (!s.ok()) { + delete tracer; + if (errors::IsUnavailable(s)) { + LOG(WARNING) + << "Hardware tracing unavailable, continuing without it. 
" << s; + tracer = nullptr; + } else { + delete collector; + delete out; + done(s); + return; + } + } + } } CancellationManager* cm = new CancellationManager; opts->SetCancelCallback([this, cm, step_id]() { @@ -194,6 +216,7 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request, opts->ClearCancelCallback(); delete cm; delete collector; + delete tracer; delete out; done(errors::Aborted("Call was aborted")); return; @@ -201,8 +224,8 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request, session->graph_mgr->ExecuteAsync( request->graph_handle(), step_id, session.get(), request->exec_opts(), collector, response, cm, in, - [this, step_id, response, session, cm, out, token, collector, opts, - done](Status s) { + [this, step_id, response, session, cm, out, token, collector, tracer, + opts, done](Status s) { if (s.ok()) { s = session->graph_mgr->RecvOutputs(step_id, out); } @@ -210,6 +233,15 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request, cancellation_manager_.DeregisterCallback(token); delete cm; + if (tracer) { + Status tracer_status = tracer->Stop(); + if (tracer_status.ok()) { + tracer_status = tracer->Collect(collector); + } + if (!tracer_status.ok()) { + LOG(ERROR) << "Bad status from tracer: " << tracer_status; + } + } if (s.ok()) { for (const auto& p : *out) { const string& key = p.first; @@ -219,6 +251,7 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request, } if (collector) collector->Finalize(); delete collector; + delete tracer; delete out; done(s); }); diff --git a/tensorflow/core/distributed_runtime/worker_cache_logger.cc b/tensorflow/core/distributed_runtime/worker_cache_logger.cc index 95ca3c3b4d1..e0a17340870 100644 --- a/tensorflow/core/distributed_runtime/worker_cache_logger.cc +++ b/tensorflow/core/distributed_runtime/worker_cache_logger.cc @@ -101,13 +101,18 @@ void WorkerCacheLogger::RecordDataTransfer(int64 step_id, int64 start_usecs, const string& 
transfer_method_name) { NodeExecStats* ns = new NodeExecStats; ns->set_node_name(transfer_method_name); + int64 elapsed_usecs = end_usecs - start_usecs; if (details.empty()) { auto byte_string = strings::StrCat("[", bytes, "B] "); if (bytes >= 0.1 * 1048576.0) { byte_string = strings::Printf("[%.1fMB] ", bytes / 1048576.0); } - auto label = strings::StrCat(byte_string, tensor_name, " from ", src_device, - " to ", dst_device); + float mbs_rate = (8.0 * static_cast(bytes)) / elapsed_usecs; + auto rate_string = (mbs_rate >= 1000.0) + ? strings::Printf("[%.1fGb/s] ", mbs_rate / 1000.0) + : strings::Printf("[%fMb/s] ", mbs_rate); + auto label = strings::StrCat(byte_string, rate_string, tensor_name, + " from ", src_device, " to ", dst_device); ns->set_timeline_label(label); } else { ns->set_timeline_label(details); @@ -115,13 +120,10 @@ void WorkerCacheLogger::RecordDataTransfer(int64 step_id, int64 start_usecs, ns->set_all_start_micros(start_usecs); ns->set_op_start_rel_micros(0); - int64 elapsed = end_usecs - start_usecs; - ns->set_op_end_rel_micros(elapsed); - ns->set_all_end_rel_micros(elapsed); + ns->set_op_end_rel_micros(elapsed_usecs); + ns->set_all_end_rel_micros(elapsed_usecs); NodeOutput* no = ns->add_output(); no->set_slot(0); - // TODO(tucker): Maybe set the dimensions too, but then they'll - // need to be passed in. 
no->mutable_tensor_description() ->mutable_allocation_description() ->set_requested_bytes(bytes); diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py index 91bd93712a9..0188eb246f0 100644 --- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py +++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py @@ -1398,10 +1398,10 @@ class TensorArrayTest(test.TestCase): for d in dev_stats: if "/task:1/" in d: self.assertTrue( - [s for s in dev_stats[d] if "/TensorArray" in s.node_name]) + [s for s in dev_stats[d] if "TensorArray" == s.node_name]) else: self.assertFalse( - [s for s in dev_stats[d] if "/TensorArray" in s.node_name]) + [s for s in dev_stats[d] if "TensorArray" == s.node_name]) def testTensorArrayDisabledColocateWithFirstWriteCall(self): with ops.device("/job:worker/task:0/cpu:0"): @@ -1428,10 +1428,10 @@ class TensorArrayTest(test.TestCase): for d in dev_stats: if "/task:0/" in d and "CPU" in d: # Skip any GPU node stats self.assertTrue( - [s for s in dev_stats[d] if "/TensorArray" in s.node_name]) + [s for s in dev_stats[d] if "TensorArray" == s.node_name]) else: self.assertFalse( - [s for s in dev_stats[d] if "/TensorArray" in s.node_name]) + [s for s in dev_stats[d] if "TensorArray" == s.node_name]) @test_util.run_in_graph_and_eager_modes def testTensorArrayIdentity(self): From ca1636667dbfc7eeb35545f150916751742b5cd2 Mon Sep 17 00:00:00 2001 From: Pulkit Bhuwalka Date: Tue, 6 Nov 2018 14:30:50 -0800 Subject: [PATCH 214/540] Fixing example Android project build by gradle. Removing conflict with bazel file name. 
#22167 PiperOrigin-RevId: 220353261 --- tensorflow/lite/examples/android/build.gradle | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/lite/examples/android/build.gradle b/tensorflow/lite/examples/android/build.gradle index 66a62a921a7..74dacbcddbd 100644 --- a/tensorflow/lite/examples/android/build.gradle +++ b/tensorflow/lite/examples/android/build.gradle @@ -22,3 +22,7 @@ allprojects { task clean(type: Delete) { delete rootProject.buildDir } + +// Changed since default name 'build' conflicts with +// bazel BUILD file name. +buildDir = "gradle-build" From bdd6af98f02307e1720fb7928225f436b3e9e59d Mon Sep 17 00:00:00 2001 From: Michael Kuperstein Date: Tue, 6 Nov 2018 14:47:21 -0800 Subject: [PATCH 215/540] [XLA] Simplify scalar slice of rank-1 concat. PiperOrigin-RevId: 220356488 --- .../xla/service/algebraic_simplifier.cc | 131 ++++++++++++------ .../xla/service/algebraic_simplifier_test.cc | 48 +++++++ 2 files changed, 134 insertions(+), 45 deletions(-) diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 85fc42f7475..a850e8cb273 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -306,9 +306,8 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { // Tries to use a kDot in place of the given convolution. StatusOr SimplifyConvToDot(HloInstruction* convolution); - // Tries to simplify a slice(pad(...)) where the result of the slice is a - // scalar. - StatusOr TrySimplifySliceOfPad(HloInstruction* slice); + // Tries to simplify a slice where the result of the slice is a scalar. + StatusOr TrySimplifyScalarSlice(HloInstruction* slice); // Current HloComputation instance the AlgebraicSimplifierVisitor is // traversing. 
@@ -1826,60 +1825,102 @@ Status AlgebraicSimplifierVisitor::HandleReverse(HloInstruction* reverse) { return Status::OK(); } -StatusOr AlgebraicSimplifierVisitor::TrySimplifySliceOfPad( +StatusOr AlgebraicSimplifierVisitor::TrySimplifyScalarSlice( HloInstruction* slice) { // Only try to do this for effective scalars. We could do the same for slicing // out larger pieces of padding (replacing with a broadcast of the padding // value), but this is probably not worth it. - if (!ShapeUtil::IsEffectiveScalar(slice->shape()) || - slice->operand(0)->opcode() != HloOpcode::kPad) { + if (!ShapeUtil::IsEffectiveScalar(slice->shape())) { return false; } - VLOG(10) << "Trying to simplify scalar slice of pad"; - // Check there's no internal padding. Again, we could handle that too, since - // everything is statically known, but it's not worth it. - auto pad = Cast(slice->mutable_operand(0)); - auto padding_config = pad->padding_config(); - int64 rank = padding_config.dimensions_size(); - if (HasInteriorPadding(padding_config)) { - VLOG(10) << "Not folding scalar slice of pad, pad has interior padding"; - return false; - } + if (slice->operand(0)->opcode() == HloOpcode::kPad) { + VLOG(10) << "Trying to simplify scalar slice of pad"; + // Check there's no internal padding. Again, we could handle that too, since + // everything is statically known, but it's not worth it. + auto pad = Cast(slice->mutable_operand(0)); + auto padding_config = pad->padding_config(); + int64 rank = padding_config.dimensions_size(); + if (HasInteriorPadding(padding_config)) { + VLOG(10) << "Not folding scalar slice of pad, pad has interior padding"; + return false; + } - // Check whether the scalar we're slicing out falls into the padding. 
- bool in_padding = [&]() { - for (int64 i = 0; i < rank; ++i) { - int64 start = slice->slice_starts(i); - int64 low = padding_config.dimensions(i).edge_padding_low(); - int64 data = pad->operand(0)->shape().dimensions(i); - if (start >= low && start < low + data) { - return false; + // Check whether the scalar we're slicing out falls into the padding. + bool in_padding = [&]() { + for (int64 i = 0; i < rank; ++i) { + int64 start = slice->slice_starts(i); + int64 low = padding_config.dimensions(i).edge_padding_low(); + int64 data = pad->operand(0)->shape().dimensions(i); + if (start >= low && start < low + data) { + return false; + } } - } - return true; - }(); + return true; + }(); - if (in_padding) { - VLOG(10) << "Folding scalar slice of pad into padding value"; - TF_RETURN_IF_ERROR(ReplaceWithNewInstruction( - slice, HloInstruction::CreateReshape(slice->shape(), - pad->mutable_padding_value()))); - return true; - } else { - // We already know the output of the slice is scalar. If the padded - // value is scalar, and it's not in the padding, then it's exactly the - // output value. - bool replaced = - ReplaceInstructionIfSameShape(slice, pad->mutable_operand(0)); - if (replaced) { - VLOG(10) << "Folding scalar slice of pad into padded value"; + if (in_padding) { + VLOG(10) << "Folding scalar slice of pad into padding value"; + TF_RETURN_IF_ERROR(ReplaceWithNewInstruction( + slice, HloInstruction::CreateReshape(slice->shape(), + pad->mutable_padding_value()))); + return true; } else { - VLOG(10) << "Not folding scalar slice of pad into padded value as they " - "have different shapes."; + // We already know the output of the slice is scalar. If the padded + // value is scalar, and it's not in the padding, then it's exactly the + // output value. 
+ bool replaced = + ReplaceInstructionIfSameShape(slice, pad->mutable_operand(0)); + if (replaced) { + VLOG(10) << "Folding scalar slice of pad into padded value"; + } else { + VLOG(10) << "Not folding scalar slice of pad into padded value as they " + "have different shapes."; + } + return replaced; } - return replaced; } + + if (slice->operand(0)->opcode() == HloOpcode::kConcatenate) { + VLOG(10) << "Trying to simplify scalar slice of concat"; + // Only do this for R1, there's no chance of this being useful otherwise. + if (ShapeUtil::Rank(slice->shape()) != 1) { + VLOG(10) << "Not folding, slice is not rank 1"; + return false; + } + HloConcatenateInstruction* concat = + Cast(slice->mutable_operand(0)); + int64 operand_start = 0; + int64 operand_num = 0; + // Weird loop structure to avoid annoying off-by-one errors. + while (true) { + TF_RET_CHECK(operand_num < concat->operand_count()); + const HloInstruction* operand = concat->operand(operand_num); + int64 next_operand_start = operand_start + operand->shape().dimensions(0); + if (next_operand_start > slice->slice_starts(0)) { + break; + } + operand_start = next_operand_start; + operand_num++; + } + + bool replaced = ReplaceInstructionIfSameShape( + slice, concat->mutable_operand(operand_num)); + if (replaced) { + VLOG(10) << "Folding scalar slice of concat into concat operand"; + } else { + VLOG(10) << "Folding scalar slice of concat into slice of concat operand"; + TF_RETURN_IF_ERROR(ReplaceWithNewInstruction( + slice, HloInstruction::CreateSlice( + slice->shape(), concat->mutable_operand(operand_num), + {slice->slice_starts(0) - operand_start}, + {slice->slice_starts(0) - operand_start + 1}, + slice->slice_strides()))); + } + return true; + } + + return false; } Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) { @@ -1907,7 +1948,7 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) { new_slice_starts, new_slice_limits, slice->slice_strides())); } - 
TF_ASSIGN_OR_RETURN(bool replaced, TrySimplifySliceOfPad(slice)); + TF_ASSIGN_OR_RETURN(bool replaced, TrySimplifyScalarSlice(slice)); if (replaced) { return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index 7b3e957fbcf..be958e7e94e 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -3249,6 +3249,54 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalar) { EXPECT_THAT(root, op::Parameter()); } +TEST_F(AlgebraicSimplifierTest, SliceOfConcatScalarInput) { + const char* hlo_string = R"( + HloModule module + + ENTRY test { + param.0 = f32[2] parameter(0) + param.1 = f32[1] parameter(1) + param.2 = f32[3] parameter(2) + concat = f32[6] concatenate(param.0, param.1, param.2), dimensions={0} + ROOT slice = f32[1] slice(concat), slice={[2:3]} + } + )"; + TF_ASSERT_OK_AND_ASSIGN( + auto module, + HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest())); + + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + bitcasting_callback()); + EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Parameter(1)); +} + +TEST_F(AlgebraicSimplifierTest, SliceOfConcatNonScalarInput) { + const char* hlo_string = R"( + HloModule module + + ENTRY test { + param.0 = f32[2] parameter(0) + param.1 = f32[1] parameter(1) + param.2 = f32[3] parameter(2) + concat = f32[6] concatenate(param.0, param.1, param.2), dimensions={0} + ROOT slice = f32[1] slice(concat), slice={[4:5]} + } + )"; + TF_ASSERT_OK_AND_ASSIGN( + auto module, + HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest())); + + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + bitcasting_callback()); + EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + auto root = 
module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Slice(op::Parameter(2))); + EXPECT_EQ(root->slice_starts(0), 1); + EXPECT_EQ(root->slice_limits(0), 2); +} + struct PadReduceWindowEffectiveBroadcastCase { std::vector input_spatials; std::vector symmetric_pad_spatials; From bfb4bda0ffd2539d64f755f65f088c1c6aca2b6d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 6 Nov 2018 14:49:50 -0800 Subject: [PATCH 216/540] tile: add support for booleans PiperOrigin-RevId: 220356963 --- tensorflow/lite/kernels/tile.cc | 3 + tensorflow/lite/kernels/tile_test.cc | 199 +++++++++++++-------------- 2 files changed, 101 insertions(+), 101 deletions(-) diff --git a/tensorflow/lite/kernels/tile.cc b/tensorflow/lite/kernels/tile.cc index 6d13f9e92f9..1b747974743 100644 --- a/tensorflow/lite/kernels/tile.cc +++ b/tensorflow/lite/kernels/tile.cc @@ -182,6 +182,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteInt64: Tile(*(input->dims), input, multipliers, output); break; + case kTfLiteBool: + Tile(*(input->dims), input, multipliers, output); + break; default: context->ReportError(context, "Type '%s' is not supported by tile.", TfLiteTypeGetName(output->type)); diff --git a/tensorflow/lite/kernels/tile_test.cc b/tensorflow/lite/kernels/tile_test.cc index d12a7c19a36..a88ff66f075 100644 --- a/tensorflow/lite/kernels/tile_test.cc +++ b/tensorflow/lite/kernels/tile_test.cc @@ -34,34 +34,18 @@ class TileOpModel : public SingleOpModel { BuildInterpreter({input_shape, {static_cast(input_shape.size())}}); } - void SetInputFloat(std::initializer_list data) { - PopulateTensor(input_, data); - } - - void SetInputUInt8(std::initializer_list data) { - PopulateTensor(input_, data); - } - - void SetInputInt32(std::initializer_list data) { - PopulateTensor(input_, data); - } - - void SetInputInt64(std::initializer_list data) { - PopulateTensor(input_, data); + template + void SetInput(std::initializer_list data) { + 
PopulateTensor(input_, data); } void SetMultipliers(std::initializer_list data) { PopulateTensor(multipliers_, data); } - std::vector GetOutputFloat() { return ExtractVector(output_); } - - std::vector GetOutputUInt8() { return ExtractVector(output_); } - - std::vector GetOutputInt32() { return ExtractVector(output_); } - - std::vector GetOutputInt64() { - return ExtractVector(output_); + template + std::vector GetOutput() { + return ExtractVector(output_); } std::vector GetOutputShape() { return GetTensorShape(output_); } @@ -74,16 +58,16 @@ class TileOpModel : public SingleOpModel { TEST(TileTest, Float32Vector) { TileOpModel m({3}, TensorType_FLOAT32, TensorType_INT32); - m.SetInputFloat({1.f, 2.f, 3.f}); + m.SetInput({1.f, 2.f, 3.f}); m.SetMultipliers({2}); m.Invoke(); - EXPECT_THAT(m.GetOutputFloat(), + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1.f, 2.f, 3.f, 1.f, 2.f, 3.f})); } TEST(TileTest, Float32Matrix) { TileOpModel m({2, 3}, TensorType_FLOAT32, TensorType_INT32); - m.SetInputFloat({ + m.SetInput({ 11.f, 12.f, 13.f, @@ -93,26 +77,26 @@ TEST(TileTest, Float32Matrix) { }); m.SetMultipliers({2, 1}); m.Invoke(); - EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray({ - 11.f, - 12.f, - 13.f, - 21.f, - 22.f, - 23.f, - 11.f, - 12.f, - 13.f, - 21.f, - 22.f, - 23.f, - })); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({ + 11.f, + 12.f, + 13.f, + 21.f, + 22.f, + 23.f, + 11.f, + 12.f, + 13.f, + 21.f, + 22.f, + 23.f, + })); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3})); } TEST(TileTest, Float32HighDimension) { TileOpModel m({1, 2, 3}, TensorType_FLOAT32, TensorType_INT32); - m.SetInputFloat({ + m.SetInput({ 11.f, 12.f, 13.f, @@ -123,7 +107,7 @@ TEST(TileTest, Float32HighDimension) { m.SetMultipliers({2, 3, 1}); m.Invoke(); EXPECT_THAT( - m.GetOutputFloat(), + m.GetOutput(), ElementsAreArray({11.f, 12.f, 13.f, 21.f, 22.f, 23.f, 11.f, 12.f, 13.f, 21.f, 22.f, 23.f, 11.f, 12.f, 13.f, 21.f, 22.f, 23.f, 11.f, 12.f, 13.f, 21.f, 22.f, 23.f, 11.f, 12.f, 13.f, @@ 
-133,7 +117,7 @@ TEST(TileTest, Float32HighDimension) { TEST(TileTest, Uint8Matrix) { TileOpModel m({2, 3}, TensorType_UINT8, TensorType_INT32); - m.SetInputUInt8({ + m.SetInput({ 11, 12, 13, @@ -143,26 +127,26 @@ TEST(TileTest, Uint8Matrix) { }); m.SetMultipliers({2, 1}); m.Invoke(); - EXPECT_THAT(m.GetOutputUInt8(), ElementsAreArray({ - 11, - 12, - 13, - 21, - 22, - 23, - 11, - 12, - 13, - 21, - 22, - 23, - })); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({ + 11, + 12, + 13, + 21, + 22, + 23, + 11, + 12, + 13, + 21, + 22, + 23, + })); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3})); } TEST(TileTest, Int32Matrix) { TileOpModel m({2, 3}, TensorType_INT32, TensorType_INT32); - m.SetInputInt32({ + m.SetInput({ 11, 12, 13, @@ -172,26 +156,39 @@ TEST(TileTest, Int32Matrix) { }); m.SetMultipliers({2, 1}); m.Invoke(); - EXPECT_THAT(m.GetOutputInt32(), ElementsAreArray({ - 11, - 12, - 13, - 21, - 22, - 23, - 11, - 12, - 13, - 21, - 22, - 23, - })); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({ + 11, + 12, + 13, + 21, + 22, + 23, + 11, + 12, + 13, + 21, + 22, + 23, + })); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3})); +} + +TEST(TileTest, BooleanMatrix) { + TileOpModel m({2, 3}, TensorType_BOOL, TensorType_INT32); + m.SetInput({true, false, false, true, true, false}); + m.SetMultipliers({2, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), + ElementsAreArray({ + true, false, false, true, true, false, // first tiletrue, + true, false, false, true, true, false // second tile + })); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3})); } TEST(TileTest, Int64Matrix) { TileOpModel m({2, 3}, TensorType_INT64, TensorType_INT32); - m.SetInputInt64({ + m.SetInput({ 11, 12, 13, @@ -201,26 +198,26 @@ TEST(TileTest, Int64Matrix) { }); m.SetMultipliers({2, 1}); m.Invoke(); - EXPECT_THAT(m.GetOutputInt64(), ElementsAreArray({ - 11, - 12, - 13, - 21, - 22, - 23, - 11, - 12, - 13, - 21, - 22, - 23, - })); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({ + 11, 
+ 12, + 13, + 21, + 22, + 23, + 11, + 12, + 13, + 21, + 22, + 23, + })); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3})); } TEST(TileTest, Int64Matrix64Multipliers) { TileOpModel m({2, 3}, TensorType_INT64, TensorType_INT64); - m.SetInputInt64({ + m.SetInput({ 11, 12, 13, @@ -230,20 +227,20 @@ TEST(TileTest, Int64Matrix64Multipliers) { }); m.SetMultipliers({2, 1}); m.Invoke(); - EXPECT_THAT(m.GetOutputInt64(), ElementsAreArray({ - 11, - 12, - 13, - 21, - 22, - 23, - 11, - 12, - 13, - 21, - 22, - 23, - })); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({ + 11, + 12, + 13, + 21, + 22, + 23, + 11, + 12, + 13, + 21, + 22, + 23, + })); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3})); } } // namespace From cf02d61a83bf14d553b81e1cee6ba604f05a7758 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 6 Nov 2018 15:04:55 -0800 Subject: [PATCH 217/540] Upgrade the version of Eigen to commit b4890dc6bc34. PiperOrigin-RevId: 220359861 --- .../contrib/rnn/kernels/lstm_ops_gpu.cu.cc | 2 +- .../core/common_runtime/gpu/gpu_device.cc | 16 +- .../grappler/costs/op_level_cost_estimator.cc | 4 +- .../core/kernels/check_numerics_op_gpu.cu.cc | 4 +- tensorflow/core/kernels/cwise_ops.h | 2 +- tensorflow/core/kernels/deep_conv2d.cc | 5 +- .../core/kernels/depthwise_conv_op_gpu.cu.cc | 2 +- tensorflow/core/kernels/random_op_gpu.cu.cc | 4 +- .../core/util/cuda_kernel_helper_test.cu.cc | 2 +- tensorflow/core/util/cuda_launch_config.h | 10 +- ...gen_tensor_reduced_instantiations_google.h | 111 +- .../eigen_tensor_reduced_instantiations_oss.h | 7 +- .../internal/optimized/optimized_ops.h | 8 +- tensorflow/python/eager/function_test.py | 2 +- tensorflow/workspace.bzl | 9 +- third_party/eigen.BUILD | 1 + .../CXX11/src/FixedPoint/FixedPointTypes.h | 4 +- .../CXX11/src/FixedPoint/MatMatProduct.h | 136 ++- .../CXX11/src/FixedPoint/MatMatProductAVX2.h | 1085 +++++++++-------- .../CXX11/src/FixedPoint/MatMatProductNEON.h | 40 +- 
.../CXX11/src/FixedPoint/MatVecProduct.h | 103 +- .../CXX11/src/FixedPoint/PacketMathAVX2.h | 73 +- .../CXX11/src/FixedPoint/PacketMathAVX512.h | 96 +- .../CXX11/src/FixedPoint/TypeCastingAVX512.h | 110 +- third_party/eigen_reshaped.patch | 48 + 25 files changed, 988 insertions(+), 896 deletions(-) create mode 100644 third_party/eigen_reshaped.patch diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc b/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc index 057e851aba6..15ae95f13cf 100644 --- a/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc +++ b/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc @@ -141,7 +141,7 @@ __global__ void lstm_gates(const T* icfo, const T* b, const T* cs_prev, // const int gid = batch_id * cell_size * 4 + act_id; const int cid = batch_id * cell_size + act_id; - Eigen::internal::scalar_sigmoid_op sigmoid_op; + Eigen::internal::scalar_logistic_op sigmoid_op; Eigen::internal::scalar_tanh_op tanh_op; Eigen::scalar_clip_op clip_op; diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc index d8ebdeff5d2..870b2f24d14 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc @@ -84,13 +84,13 @@ namespace tensorflow { // corresponding stream have completed. The following two classes // serve this purpose in two different compilation environments. 
-class EigenCudaStreamDevice : public ::Eigen::StreamInterface { +class EigenGpuStreamDevice : public ::Eigen::StreamInterface { public: - EigenCudaStreamDevice() + EigenGpuStreamDevice() : scratch_(nullptr), semaphore_(nullptr), context_(nullptr) { Eigen::initializeDeviceProp(); } - ~EigenCudaStreamDevice() override {} + ~EigenGpuStreamDevice() override {} void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream, TfGpuId tf_gpu_id, ::tensorflow::Allocator* alloc, char* scratch) { @@ -101,7 +101,7 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface { context_ = context; scratch_ = scratch; semaphore_ = - reinterpret_cast(scratch + Eigen::kCudaScratchSize); + reinterpret_cast(scratch + Eigen::kGpuScratchSize); stream_ = cuda_stream; allocator_ = alloc; PlatformGpuId platform_gpu_id; @@ -185,7 +185,7 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface { mutable unsigned int* semaphore_; OpKernelContext* context_; - TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice); + TF_DISALLOW_COPY_AND_ASSIGN(EigenGpuStreamDevice); }; // This factory helps to ensure that different GPU device objects that refer to @@ -292,7 +292,7 @@ Status BaseGPUDevice::InitScratchBuffers() { DCHECK(streams_[i]); if (scratch_.size() > i && scratch_[i]) continue; size_t scratch_buffer_size = - Eigen::kCudaScratchSize + sizeof(unsigned int); + Eigen::kGpuScratchSize + sizeof(unsigned int); void* scratch_buffer = gpu_allocator_->AllocateRaw( Allocator::kAllocatorAlignment, scratch_buffer_size); if (scratch_buffer == nullptr) { @@ -304,7 +304,7 @@ Status BaseGPUDevice::InitScratchBuffers() { se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size)); bool ok = executor_->SynchronousMemZero( - &mem, Eigen::kCudaScratchSize + sizeof(unsigned int)); + &mem, Eigen::kGpuScratchSize + sizeof(unsigned int)); if (!ok) { return errors::FailedPrecondition( "Failed to memcopy into scratch buffer for device ", @@ -692,7 +692,7 @@ class ConcretePerOpGpuDevice : public 
PerOpGpuDevice { const Eigen::GpuDevice& device() const override { return device_; } private: - EigenCudaStreamDevice stream_device_; + EigenGpuStreamDevice stream_device_; Eigen::GpuDevice device_; }; diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc index 76e5c989fca..0e552092385 100644 --- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc +++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc @@ -311,8 +311,8 @@ OpLevelCostEstimator::OpLevelCostEstimator() { {"Square", EIGEN_COST(scalar_square_op)}, {"Tanh", EIGEN_COST(scalar_tanh_op)}, {"Relu", EIGEN_COST(scalar_max_op)}, - {"Sigmoid", EIGEN_COST(scalar_sigmoid_op)}, - {"QuantizedSigmoid", EIGEN_COST(scalar_sigmoid_op)}, + {"Sigmoid", EIGEN_COST(scalar_logistic_op)}, + {"QuantizedSigmoid", EIGEN_COST(scalar_logistic_op)}, {"Sign", EIGEN_COST(scalar_sign_op)}, {"Sin", EIGEN_COST(scalar_sin_op)}, {"Tan", EIGEN_COST(scalar_tan_op)}, diff --git a/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc b/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc index 87bdba14550..f9f10c1b42f 100644 --- a/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc +++ b/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc @@ -60,9 +60,9 @@ template struct CheckNumericsLaunch { void Run(const GPUDevice &d, const T *data, int size, int abnormal_detected[2]) { - const int32 block_size = d.maxCudaThreadsPerBlock(); + const int32 block_size = d.maxGpuThreadsPerBlock(); const int32 num_blocks = - (d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) / + (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) / block_size; CheckNumericsKernel<<>>( diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h index 66ba827a901..3f7aa0dc399 100644 --- a/tensorflow/core/kernels/cwise_ops.h +++ b/tensorflow/core/kernels/cwise_ops.h @@ -656,7 +656,7 @@ template struct erfc : base> {}; template 
-struct sigmoid : base> {}; +struct sigmoid : base> {}; template struct sin : base> {}; diff --git a/tensorflow/core/kernels/deep_conv2d.cc b/tensorflow/core/kernels/deep_conv2d.cc index 1aa8c72d667..f9c8f16cb9a 100644 --- a/tensorflow/core/kernels/deep_conv2d.cc +++ b/tensorflow/core/kernels/deep_conv2d.cc @@ -500,8 +500,9 @@ class GemmFilterPacker { typedef Eigen::internal::const_blas_data_mapper LhsMapper; typedef Eigen::internal::gebp_traits Traits; - Eigen::internal::gemm_pack_lhs + Eigen::internal::gemm_pack_lhs< + T, int64, LhsMapper, Traits::mr, Traits::LhsProgress, + typename Traits::LhsPacket4Packing, Eigen::RowMajor> pack_lhs; GemmFilterPacker(const int64 rows, const int64 depth, const T* lhs_input, diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc index 76afd6f18c2..1398c876625 100644 --- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc +++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc @@ -764,7 +764,7 @@ Status LaunchDepthwiseConv2dGPU(OpKernelContext* ctx, const DepthwiseArgs& args, const int max_block_count = kKnownFilterWidth < 0 || kKnownFilterHeight < 0 || kKnownDepthMultiplier < 0 ? 
std::numeric_limits::max() - : device.getNumCudaMultiProcessors(); + : device.getNumGpuMultiProcessors(); kernel<<>>(args, input, filter, output, num_outputs); diff --git a/tensorflow/core/kernels/random_op_gpu.cu.cc b/tensorflow/core/kernels/random_op_gpu.cu.cc index 3393b39faf4..edb2b10e3d6 100644 --- a/tensorflow/core/kernels/random_op_gpu.cu.cc +++ b/tensorflow/core/kernels/random_op_gpu.cu.cc @@ -217,9 +217,9 @@ void FillPhiloxRandom::operator()( OpKernelContext*, const GPUDevice& d, random::PhiloxRandom gen, typename Distribution::ResultElementType* data, int64 size, Distribution dist) { - const int32 block_size = d.maxCudaThreadsPerBlock(); + const int32 block_size = d.maxGpuThreadsPerBlock(); const int32 num_blocks = - (d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) / + (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) / block_size; FillPhiloxRandomKernelLaunch diff --git a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc index 732ed33ede1..2b035ab0e9c 100644 --- a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc +++ b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc @@ -131,7 +131,7 @@ class CudaLaunchConfigTest : public ::testing::Test { protected: const int bufsize = 1024; int* outbuf = nullptr; - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice d = Eigen::GpuDevice(&stream); virtual void SetUp() { diff --git a/tensorflow/core/util/cuda_launch_config.h b/tensorflow/core/util/cuda_launch_config.h index d0d95736d3f..080d4067cec 100644 --- a/tensorflow/core/util/cuda_launch_config.h +++ b/tensorflow/core/util/cuda_launch_config.h @@ -128,12 +128,12 @@ inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count, CudaLaunchConfig config; const int virtual_thread_count = work_element_count; const int physical_thread_count = std::min( - d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor(), + 
d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(), virtual_thread_count); - const int thread_per_block = std::min(1024, d.maxCudaThreadsPerBlock()); + const int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock()); const int block_count = std::min(DivUp(physical_thread_count, thread_per_block), - d.getNumCudaMultiProcessors()); + d.getNumGpuMultiProcessors()); config.virtual_thread_count = virtual_thread_count; config.thread_per_block = thread_per_block; @@ -184,7 +184,7 @@ inline CudaLaunchConfig GetCudaLaunchConfigFixedBlockSize( cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor( &block_count, func, fixed_block_size, dynamic_shared_memory_size); CHECK_EQ(err, cudaSuccess); - block_count = std::min(block_count * d.getNumCudaMultiProcessors(), + block_count = std::min(block_count * d.getNumGpuMultiProcessors(), DivUp(work_element_count, fixed_block_size)); config.virtual_thread_count = work_element_count; @@ -213,7 +213,7 @@ inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim, int block_rows = std::max(kThreadsPerBlock / block_cols, 1); const int physical_thread_count = - d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor(); + d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(); const int max_blocks = std::max(physical_thread_count / kThreadsPerBlock, 1); diff --git a/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h b/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h index f71ddbf3220..6461a5e5426 100644 --- a/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h +++ b/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h @@ -12,25 +12,55 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_ -#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_ -#define EIGEN_USE_CUSTOM_THREAD_POOL -#define EIGEN_USE_THREADS +// This is essentially unsupported/CXX11/Eigen/Tensor.h +// TODO(petewarden) - move this to a common location in Eigen itself. // clang-format off -#include +#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_ +#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_ + + +#include "Eigen/Core" + +#if defined(EIGEN_USE_SYCL) +#undef min +#undef max +#undef isnan +#undef isinf +#undef isfinite +#include +#include +#include +#include +#include +#endif +#include #include #include -#include + + + + + +#ifdef _WIN32 +typedef __int16 int16_t; +typedef unsigned __int16 uint16_t; +typedef __int32 int32_t; +typedef unsigned __int32 uint32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +#include +#else +#include +#include +#endif + +#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900 #include -#include -#include // NOLINT(build/c++11) -#include // NOLINT(build/c++11) -#include // NOLINT(build/c++11) -#include +#endif #ifdef _WIN32 #include @@ -40,58 +70,53 @@ limitations under the License. #include #endif +// #if defined(EIGEN_USE_LIBXSMM) +// #include "libxsmm.h" +// #endif -// Because some programs may link Eigen in through other frameworks with -// different flags, we can run into multiple definition issues if we don't have -// a private namespace for our versions. This is a nasty hack, but a similar -// approach is used elsewhere to handle the problem, so it should be stable. 
-#define Eigen EigenForTFLite +#ifdef EIGEN_USE_THREADS +#include "third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool" +#endif -#include "Eigen/src/Core/util/StaticAssert.h" -#include "unsupported/Eigen/CXX11/Core" -#include "unsupported/Eigen/SpecialFunctions" #include "Eigen/src/Core/util/DisableStupidWarnings.h" -#include "Eigen/Core" +#include "third_party/eigen3/unsupported/Eigen/SpecialFunctions" +#include "third_party/eigen3/unsupported/Eigen/CXX11/src/util/CXX11Meta.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h" -// Beware: the order of the include matters to some compilers. For example -// TensorIndexList.h should be included before TensorDimensions.h in order to -// use index lists to encode tensor dimensions when compiling with llvm. -// We're defining this ourselves rather than using the Eigen Tensor header file -// so that we can alter the macro definition of TENSOR_CONTRACTION_DISPATCH to -// reduce binary size. + +#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h" + +#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/ThreadPoolInterface.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorNonBlockingThreadPool.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h" +#include 
"third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h" - -#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStats.h" - #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h" - +#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h" - #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMappers.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h" #include 
"third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" + #undef TENSOR_CONTRACTION_DISPATCH #define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \ if (this->m_lhs_inner_dim_contiguous && \ @@ -102,8 +127,9 @@ limitations under the License. eigen_assert(false && "Unsupported contraction formats"); \ } + #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h" @@ -125,19 +151,18 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h" - +#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" - #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/Tensor.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h" - -#include 
"third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h" - #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h" #include "Eigen/src/Core/util/ReenableStupidWarnings.h" + + #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_ diff --git a/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h b/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h index 5e83b7b846e..f5576fbff70 100644 --- a/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h +++ b/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h @@ -94,7 +94,7 @@ typedef unsigned __int64 uint64_t; #include "unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h" -#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h" @@ -106,10 +106,11 @@ typedef unsigned __int64 uint64_t; #include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h" -#include "unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h" 
#include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h" @@ -128,7 +129,7 @@ typedef unsigned __int64 uint64_t; #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" -#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h" diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h index 6f7031b36d2..00be5a9db83 100644 --- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h @@ -3151,12 +3151,12 @@ inline void LstmCell( // Combined memory state and final output calculation gemmlowp::ScopedProfilingLabel label2("MemoryStateAndFinalOutput"); output_state_map = - input_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op()) * + input_gate_sm.unaryExpr(Eigen::internal::scalar_logistic_op()) * new_input_sm.tanh() + - forget_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op()) * + forget_gate_sm.unaryExpr(Eigen::internal::scalar_logistic_op()) * prev_state_map; output_activ_map = - output_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op()) * + output_gate_sm.unaryExpr(Eigen::internal::scalar_logistic_op()) * output_state_map.tanh(); } @@ -4367,7 +4367,7 @@ inline void Logistic(const RuntimeShape& input_shape, const float* input_data, auto input_map = MapAsVector(input_data, input_shape); auto output_map = MapAsVector(output_data, output_shape); output_map.array() = - input_map.array().unaryExpr(Eigen::internal::scalar_sigmoid_op()); + input_map.array().unaryExpr(Eigen::internal::scalar_logistic_op()); } // Convenience version that allows, for 
example, generated-code calls to be diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py index 781c3f0a18a..6c93f977291 100644 --- a/tensorflow/python/eager/function_test.py +++ b/tensorflow/python/eager/function_test.py @@ -1570,7 +1570,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase): if not context.executing_eagerly(): self.evaluate(variables.global_variables_initializer()) - self.assertAllEqual([[[[4.0]]]], self.evaluate(y)) + self.assertAllClose([[[[4.0]]]], self.evaluate(y)) # Remove reference cycles in model test_util.dismantle_polymorphic_function(model) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 7b3e17fbb98..18a0ba6b197 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -134,11 +134,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "eigen_archive", build_file = clean_dep("//third_party:eigen.BUILD"), - sha256 = "d956415d784fa4e42b6a2a45c32556d6aec9d0a3d8ef48baee2522ab762556a9", - strip_prefix = "eigen-eigen-fd6845384b86", + patch_file = clean_dep("//third_party:eigen_reshaped.patch"), + sha256 = "d66cec3b54b3dfaa4666c1d49481a7197f93fc078cd53c54e2b4a8893a529c9f", + strip_prefix = "eigen-eigen-b4890dc6bc34", urls = [ - "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/fd6845384b86.tar.gz", - "https://bitbucket.org/eigen/eigen/get/fd6845384b86.tar.gz", + "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/b4890dc6bc34.tar.gz", + "https://bitbucket.org/eigen/eigen/get/b4890dc6bc34.tar.gz", ], ) diff --git a/third_party/eigen.BUILD b/third_party/eigen.BUILD index 759f8a9be92..194a2272d54 100644 --- a/third_party/eigen.BUILD +++ b/third_party/eigen.BUILD @@ -65,6 +65,7 @@ cc_library( # code. We use it, but we do not rely on it, as evidenced above. 
"EIGEN_MPL2_ONLY", "EIGEN_MAX_ALIGN_BYTES=64", + "EIGEN_HAS_TYPE_TRAITS=0", ], includes = ["."], visibility = ["//visibility:public"], diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h index 5ab36649187..ff359cedced 100644 --- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h @@ -249,9 +249,7 @@ EIGEN_STRONG_INLINE QInt32& operator/=(QInt32& a, const QInt32 b) { a.value /= b.value; return a; } -EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a) { - return -a.value; -} +EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a) { return -a.value; } // Scaling QInt32 by double. We do the arithmetic in double because // float only has 23 bits of mantissa, so casting QInt32 to float might reduce diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h index e6f4080ae12..8477933e1ba 100644 --- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h @@ -15,11 +15,9 @@ namespace internal { // Accumulate the product of 2 QInt8 inputs on 32 bits to prevent // overflows -template<> struct scalar_product_traits -{ - enum { - Defined = 1 - }; +template <> +struct scalar_product_traits { + enum { Defined = 1 }; typedef QInt32 ReturnType; }; @@ -33,11 +31,9 @@ struct scalar_product_traits { // Accumulate the product of QInt8 inputs with QUint8 inputs on 32 bits // to prevent overflows -template<> struct scalar_product_traits -{ - enum { - Defined = 1 - }; +template <> +struct scalar_product_traits { + enum { Defined = 1 }; typedef QInt32 ReturnType; }; @@ -47,14 +43,16 @@ template<> struct scalar_product_traits // signed 8bit integers #ifndef 
EIGEN_USE_OPTIMIZED_INT8_INT8_MAT_MAT_PRODUCT -template -class gebp_traits -{ -public: +template +class gebp_traits { + public: typedef QInt8 LhsScalar; typedef QInt8 RhsScalar; typedef QInt32 ResScalar; + typedef typename packet_traits::type LhsPacket; + typedef LhsPacket LhsPacket4Packing; + enum { // register block size along the M and N directions // One for the current implementation @@ -68,22 +66,24 @@ public: }; // The signed 8bit Mat-Mat product itself. -template -struct gebp_kernel -{ +template +struct gebp_kernel { EIGEN_DONT_INLINE - void operator()(const DataMapper& res, const QInt8* blockA, const QInt8* blockB, - Index rows, Index depth, Index cols, QInt32 alpha, - Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); + void operator()(const DataMapper& res, const QInt8* blockA, + const QInt8* blockB, Index rows, Index depth, Index cols, + QInt32 alpha, Index strideA = -1, Index strideB = -1, + Index offsetA = 0, Index offsetB = 0); }; -template -EIGEN_DONT_INLINE -void gebp_kernel -::operator()(const DataMapper& res, const QInt8* blockA, const QInt8* blockB, - Index rows, Index depth, Index cols, QInt32 alpha, - Index strideA, Index strideB, Index offsetA, Index offsetB) -{ +template +EIGEN_DONT_INLINE void gebp_kernel:: +operator()(const DataMapper& res, const QInt8* blockA, const QInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, Index strideA, + Index strideB, Index offsetA, Index offsetB) { EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -113,18 +113,19 @@ void gebp_kernel -class gebp_traits -{ -public: +template +class gebp_traits { + public: typedef QInt8 LhsScalar; typedef QUInt8 RhsScalar; typedef QInt32 ResScalar; + typedef typename packet_traits::type LhsPacket; + typedef LhsPacket LhsPacket4Packing; + enum { // register block size along the M and N directions // One for the current implementation @@ -138,22 +139,24 @@ 
public: }; // Mat-Mat product of a signed 8bit lhs with an unsigned 8bit rhs -template -struct gebp_kernel -{ +template +struct gebp_kernel { EIGEN_DONT_INLINE - void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, - Index rows, Index depth, Index cols, QInt32 alpha, - Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); + void operator()(const DataMapper& res, const QInt8* blockA, + const QUInt8* blockB, Index rows, Index depth, Index cols, + QInt32 alpha, Index strideA = -1, Index strideB = -1, + Index offsetA = 0, Index offsetB = 0); }; -template -EIGEN_DONT_INLINE -void gebp_kernel -::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, - Index rows, Index depth, Index cols, QInt32 alpha, - Index strideA, Index strideB, Index offsetA, Index offsetB) -{ +template +EIGEN_DONT_INLINE void gebp_kernel:: +operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, Index strideA, + Index strideB, Index offsetA, Index offsetB) { EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -183,18 +186,19 @@ void gebp_kernel -class gebp_traits -{ -public: +template +class gebp_traits { + public: typedef QUInt8 LhsScalar; typedef QInt8 RhsScalar; typedef QInt32 ResScalar; + typedef typename packet_traits::type LhsPacket; + typedef LhsPacket LhsPacket4Packing; + enum { // register block size along the M and N directions // One for the current implementation @@ -207,24 +211,25 @@ public: }; }; - // Mat-Mat product of an unsigned 8bit lhs with a signed 8bit rhs -template -struct gebp_kernel -{ +template +struct gebp_kernel { EIGEN_DONT_INLINE - void operator()(const DataMapper& res, const QUInt8* blockA, const QInt8* blockB, - Index rows, Index depth, Index cols, QInt32 alpha, - Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); + void 
operator()(const DataMapper& res, const QUInt8* blockA, + const QInt8* blockB, Index rows, Index depth, Index cols, + QInt32 alpha, Index strideA = -1, Index strideB = -1, + Index offsetA = 0, Index offsetB = 0); }; -template -EIGEN_DONT_INLINE -void gebp_kernel -::operator()(const DataMapper& res, const QUInt8* blockA, const QInt8* blockB, - Index rows, Index depth, Index cols, QInt32 alpha, - Index strideA, Index strideB, Index offsetA, Index offsetB) -{ +template +EIGEN_DONT_INLINE void gebp_kernel:: +operator()(const DataMapper& res, const QUInt8* blockA, const QInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, Index strideA, + Index strideB, Index offsetA, Index offsetB) { EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -263,6 +268,9 @@ class gebp_traits { typedef QInt16 RhsScalar; typedef QInt32 ResScalar; + typedef typename packet_traits::type LhsPacket; + typedef LhsPacket LhsPacket4Packing; + enum { // register block size along the M and N directions // One for the current implementation diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h index 66532fb6002..8547dca1b32 100644 --- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h @@ -28,6 +28,9 @@ class gebp_traits { typedef QInt16 RhsScalar; typedef QInt32 ResScalar; + typedef typename packet_traits::type LhsPacket; + typedef LhsPacket LhsPacket4Packing; + enum { // Define register blocking scheme. nr = 16, @@ -43,7 +46,7 @@ class gebp_traits { // Used by TensorContractionThreadPool, inputs must have dimensions that are // multiples of 32. 
template -class TensorContractionBlocking { +class TensorContractionBlocking { public: TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : kc_(((k + 15) / 16) * 16), @@ -144,7 +147,7 @@ class gemm_blocking_space -struct gemm_pack_lhs { EIGEN_DONT_INLINE void operator()(QInt16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, @@ -154,12 +157,14 @@ struct gemm_pack_lhs EIGEN_DONT_INLINE void gemm_pack_lhs:: + QInt16, ColMajor, Conjugate, PanelMode>:: operator()(QInt16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { eigen_assert(stride == 0); eigen_assert(offset == 0); + typedef typename packet_traits::type Packet; + // Use alternate function for weird sizes if (rows % 16 != 0 || depth % 16 != 0) { assert(false && @@ -178,10 +183,10 @@ operator()(QInt16* blockA, const DataMapper& lhs, Index depth, Index rows, // Pack depth in sets of 4 for (Index k = 0; k < depth; k += 4) { // Load vectors - __m256i L_A = lhs.loadPacket(m, k); - __m256i L_B = lhs.loadPacket(m, k + 1); - __m256i L_C = lhs.loadPacket(m, k + 2); - __m256i L_D = lhs.loadPacket(m, k + 3); + __m256i L_A = lhs.template loadPacket(m, k); + __m256i L_B = lhs.template loadPacket(m, k + 1); + __m256i L_C = lhs.template loadPacket(m, k + 2); + __m256i L_D = lhs.template loadPacket(m, k + 3); // Rearrange the inputs as required by the kernel __m256i L_AB0_AB7 = _mm256_unpacklo_epi16(L_A, L_B); @@ -236,13 +241,15 @@ struct gemm_pack_rhs -EIGEN_DONT_INLINE void -gemm_pack_rhs:: +EIGEN_DONT_INLINE void gemm_pack_rhs:: operator()(QInt16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { eigen_assert(stride == 0); eigen_assert(offset == 0); + typedef typename packet_traits::type Packet; + // Use alternate function for weird sizes if (cols % 16 != 0 || depth % 16 != 0) { assert(false && @@ -277,28 +284,28 @@ operator()(QInt16* blockB, const DataMapper& rhs, Index depth, Index cols, for (Index n = 
0; n < cols; n += 16) { // Pack depth in sets of 16 for (Index k = 0; k < depth; k += 16) { - __m256i R_A = rhs.loadPacket(k, n); - __m256i R_B = rhs.loadPacket(k, n + 1); - __m256i R_C = rhs.loadPacket(k, n + 2); - __m256i R_D = rhs.loadPacket(k, n + 3); + __m256i R_A = rhs.template loadPacket(k, n); + __m256i R_B = rhs.template loadPacket(k, n + 1); + __m256i R_C = rhs.template loadPacket(k, n + 2); + __m256i R_D = rhs.template loadPacket(k, n + 3); PACK_STEP; - R_A = rhs.loadPacket(k, n + 4); - R_B = rhs.loadPacket(k, n + 5); - R_C = rhs.loadPacket(k, n + 6); - R_D = rhs.loadPacket(k, n + 7); + R_A = rhs.template loadPacket(k, n + 4); + R_B = rhs.template loadPacket(k, n + 5); + R_C = rhs.template loadPacket(k, n + 6); + R_D = rhs.template loadPacket(k, n + 7); PACK_STEP; - R_A = rhs.loadPacket(k, n + 8); - R_B = rhs.loadPacket(k, n + 9); - R_C = rhs.loadPacket(k, n + 10); - R_D = rhs.loadPacket(k, n + 11); + R_A = rhs.template loadPacket(k, n + 8); + R_B = rhs.template loadPacket(k, n + 9); + R_C = rhs.template loadPacket(k, n + 10); + R_D = rhs.template loadPacket(k, n + 11); PACK_STEP; - R_A = rhs.loadPacket(k, n + 12); - R_B = rhs.loadPacket(k, n + 13); - R_C = rhs.loadPacket(k, n + 14); - R_D = rhs.loadPacket(k, n + 15); + R_A = rhs.template loadPacket(k, n + 12); + R_B = rhs.template loadPacket(k, n + 13); + R_C = rhs.template loadPacket(k, n + 14); + R_D = rhs.template loadPacket(k, n + 15); PACK_STEP; blockB_256 += 12; @@ -476,9 +483,13 @@ operator()(const DataMapper& res, const QInt16* blockA, const QInt16* blockB, for (Index j = n; j < n + 16; j++) { LinearMapper r0 = res.getLinearMapper(m, j); LinearMapper r1 = res.getLinearMapper(m + 8, j); - - r0.storePacket(0, _mm256_add_epi32(blockO_256[i++], r0.loadPacket(0))); - r1.storePacket(0, _mm256_add_epi32(blockO_256[i++], r1.loadPacket(0))); + typedef typename packet_traits::type Packet; + r0.template storePacket( + 0, _mm256_add_epi32(blockO_256[i++], + r0.template loadPacket(0))); + r1.template 
storePacket( + 0, _mm256_add_epi32(blockO_256[i++], + r1.template loadPacket(0))); } // Zero the result block so it can be reused @@ -496,14 +507,16 @@ operator()(const DataMapper& res, const QInt16* blockA, const QInt16* blockB, #ifdef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT // Define quantized traits -template -class gebp_traits -{ -public: +template +class gebp_traits { + public: typedef QInt8 LhsScalar; typedef QUInt8 RhsScalar; typedef QInt32 ResScalar; + typedef typename packet_traits::type LhsPacket; + typedef LhsPacket LhsPacket4Packing; + enum { // Define register blocking scheme. nr = 32, @@ -518,22 +531,28 @@ public: // Specialized blocking for quantized implementations. // Used by TensorContractionThreadPool, inputs must have dimensions that are // multiples of 32. -template -class TensorContractionBlocking, TensorContractionInputMapper, Index, ShardingType> { +template +class TensorContractionBlocking< + ResScalar, + TensorContractionInputMapper< + QInt8, Index, Lhs, LeftTensor, left_nocontract_t, left_contract_t, 32, + left_inner_dim_contiguous, left_inner_dim_reordered, LeftAlignment>, + TensorContractionInputMapper, + Index, ShardingType> { public: - - typedef QInt8 LhsScalar; + typedef QInt8 LhsScalar; typedef QUInt8 RhsScalar; - TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : - kc_(k), mc_(m), nc_(n) - { + TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) + : kc_(k), mc_(m), nc_(n) { eigen_assert(m % 32 == 0); eigen_assert(k % 32 == 0); if (!k || !m || !n) { @@ -543,8 +562,7 @@ class TensorContractionBlocking class gemm_blocking_space @@ -633,42 +650,60 @@ class gemm_blocking_space +template struct gemm_pack_lhs_any; -template -struct gemm_pack_lhs_any { - EIGEN_DONT_INLINE void operator() - (QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0); +template +struct gemm_pack_lhs_any { + EIGEN_DONT_INLINE void operator()(QInt8* blockA, const 
DataMapper& lhs, + Index depth, Index rows, Index stride = 0, + Index offset = 0); }; -template +template struct gemm_pack_rhs_any; -template -struct gemm_pack_rhs_any { - EIGEN_DONT_INLINE void operator() - (QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0); +template +struct gemm_pack_rhs_any { + EIGEN_DONT_INLINE void operator()(QUInt8* blockB, const DataMapper& rhs, + Index depth, Index cols, Index stride = 0, + Index offset = 0); }; -template +template struct gebp_kernel_any; -template -struct gebp_kernel_any -{ +template +struct gebp_kernel_any { typedef typename DataMapper::LinearMapper LinearMapper; EIGEN_DONT_INLINE - void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, - Index rows, Index depth, Index cols, QInt32 alpha, - Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); + void operator()(const DataMapper& res, const QInt8* blockA, + const QUInt8* blockB, Index rows, Index depth, Index cols, + QInt32 alpha, Index strideA = -1, Index strideB = -1, + Index offsetA = 0, Index offsetB = 0); }; // Alternate implementations for any input sizes -template -EIGEN_DONT_INLINE void gemm_pack_lhs_any:: -operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { +template +EIGEN_DONT_INLINE void gemm_pack_lhs_any:: +operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, + Index stride, Index offset) { eigen_assert(stride == 0); eigen_assert(offset == 0); + typedef typename packet_traits::type Packet; + // Get vector pointer __m256i* blockA_256 = reinterpret_cast<__m256i*>(blockA); @@ -690,15 +725,15 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index // Pack depth in sets of 8 for (Index k = 0; k < depth_8; k += 8) { // Load vectors - __m256i L_A = lhs.loadPacket(m, k); - __m256i L_B = lhs.loadPacket(m, k + 1); + __m256i L_A = lhs.template loadPacket(m, k); + __m256i L_B = 
lhs.template loadPacket(m, k + 1); // Interleave 8-bit elements __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B); __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B); - __m256i L_C = lhs.loadPacket(m, k + 2); - __m256i L_D = lhs.loadPacket(m, k + 3); + __m256i L_C = lhs.template loadPacket(m, k + 2); + __m256i L_D = lhs.template loadPacket(m, k + 3); __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D); __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D); @@ -719,12 +754,12 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index _mm256_store_si256(blockA_256++, L_AD16); __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31); _mm256_store_si256(blockA_256++, L_AD24); - __m256i L_E = lhs.loadPacket(m, k + 4); - __m256i L_F = lhs.loadPacket(m, k + 5); + __m256i L_E = lhs.template loadPacket(m, k + 4); + __m256i L_F = lhs.template loadPacket(m, k + 5); __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F); __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F); - __m256i L_G = lhs.loadPacket(m, k + 6); - __m256i L_H = lhs.loadPacket(m, k + 7); + __m256i L_G = lhs.template loadPacket(m, k + 6); + __m256i L_H = lhs.template loadPacket(m, k + 7); __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H); __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H); __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16); @@ -745,76 +780,76 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index if (depth_8 < depth) { __m256i L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H; switch (depth - depth_8) { - case 1: - L_A = lhs.loadPacket(m, depth_8); - L_B = _mm256_setzero_si256(); - L_C = _mm256_setzero_si256(); - L_D = _mm256_setzero_si256(); - L_E = _mm256_setzero_si256(); - L_F = _mm256_setzero_si256(); - L_G = _mm256_setzero_si256(); - L_H = _mm256_setzero_si256(); - break; - case 2: - L_A = lhs.loadPacket(m, depth_8); - L_B = lhs.loadPacket(m, depth_8 + 1); - L_C = _mm256_setzero_si256(); - L_D = 
_mm256_setzero_si256(); - L_E = _mm256_setzero_si256(); - L_F = _mm256_setzero_si256(); - L_G = _mm256_setzero_si256(); - L_H = _mm256_setzero_si256(); - break; - case 3: - L_A = lhs.loadPacket(m, depth_8); - L_B = lhs.loadPacket(m, depth_8 + 1); - L_C = lhs.loadPacket(m, depth_8 + 2); - L_D = _mm256_setzero_si256(); - L_E = _mm256_setzero_si256(); - L_F = _mm256_setzero_si256(); - L_G = _mm256_setzero_si256(); - L_H = _mm256_setzero_si256(); - break; - case 4: - L_A = lhs.loadPacket(m, depth_8); - L_B = lhs.loadPacket(m, depth_8 + 1); - L_C = lhs.loadPacket(m, depth_8 + 2); - L_D = lhs.loadPacket(m, depth_8 + 3); - L_E = _mm256_setzero_si256(); - L_F = _mm256_setzero_si256(); - L_G = _mm256_setzero_si256(); - L_H = _mm256_setzero_si256(); - break; - case 5: - L_A = lhs.loadPacket(m, depth_8); - L_B = lhs.loadPacket(m, depth_8 + 1); - L_C = lhs.loadPacket(m, depth_8 + 2); - L_D = lhs.loadPacket(m, depth_8 + 3); - L_E = lhs.loadPacket(m, depth_8 + 4); - L_F = _mm256_setzero_si256(); - L_G = _mm256_setzero_si256(); - L_H = _mm256_setzero_si256(); - break; - case 6: - L_A = lhs.loadPacket(m, depth_8); - L_B = lhs.loadPacket(m, depth_8 + 1); - L_C = lhs.loadPacket(m, depth_8 + 2); - L_D = lhs.loadPacket(m, depth_8 + 3); - L_E = lhs.loadPacket(m, depth_8 + 4); - L_F = lhs.loadPacket(m, depth_8 + 5); - L_G = _mm256_setzero_si256(); - L_H = _mm256_setzero_si256(); - break; - case 7: - L_A = lhs.loadPacket(m, depth_8); - L_B = lhs.loadPacket(m, depth_8 + 1); - L_C = lhs.loadPacket(m, depth_8 + 2); - L_D = lhs.loadPacket(m, depth_8 + 3); - L_E = lhs.loadPacket(m, depth_8 + 4); - L_F = lhs.loadPacket(m, depth_8 + 5); - L_G = lhs.loadPacket(m, depth_8 + 6); - L_H = _mm256_setzero_si256(); - break; + case 1: + L_A = lhs.template loadPacket(m, depth_8); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + 
break; + case 2: + L_A = lhs.template loadPacket(m, depth_8); + L_B = lhs.template loadPacket(m, depth_8 + 1); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 3: + L_A = lhs.template loadPacket(m, depth_8); + L_B = lhs.template loadPacket(m, depth_8 + 1); + L_C = lhs.template loadPacket(m, depth_8 + 2); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 4: + L_A = lhs.template loadPacket(m, depth_8); + L_B = lhs.template loadPacket(m, depth_8 + 1); + L_C = lhs.template loadPacket(m, depth_8 + 2); + L_D = lhs.template loadPacket(m, depth_8 + 3); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 5: + L_A = lhs.template loadPacket(m, depth_8); + L_B = lhs.template loadPacket(m, depth_8 + 1); + L_C = lhs.template loadPacket(m, depth_8 + 2); + L_D = lhs.template loadPacket(m, depth_8 + 3); + L_E = lhs.template loadPacket(m, depth_8 + 4); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 6: + L_A = lhs.template loadPacket(m, depth_8); + L_B = lhs.template loadPacket(m, depth_8 + 1); + L_C = lhs.template loadPacket(m, depth_8 + 2); + L_D = lhs.template loadPacket(m, depth_8 + 3); + L_E = lhs.template loadPacket(m, depth_8 + 4); + L_F = lhs.template loadPacket(m, depth_8 + 5); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + break; + case 7: + L_A = lhs.template loadPacket(m, depth_8); + L_B = lhs.template loadPacket(m, depth_8 + 1); + L_C = lhs.template loadPacket(m, depth_8 + 2); + L_D = lhs.template loadPacket(m, depth_8 + 3); + L_E = lhs.template loadPacket(m, depth_8 + 4); + L_F = lhs.template loadPacket(m, depth_8 + 
5); + L_G = lhs.template loadPacket(m, depth_8 + 6); + L_H = _mm256_setzero_si256(); + break; } // Interleave 8-bit elements @@ -875,21 +910,21 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index __m256i L_G = _mm256_setzero_si256(); __m256i L_H = _mm256_setzero_si256(); for (Index m = 0; m < rows - rows_32; m++) { - QInt8* ptr = (QInt8*) &L_A; + QInt8* ptr = (QInt8*)&L_A; ptr[m] = lhs(rows_32 + m, k); - ptr = (QInt8*) &L_B; + ptr = (QInt8*)&L_B; ptr[m] = lhs(rows_32 + m, k + 1); - ptr = (QInt8*) &L_C; + ptr = (QInt8*)&L_C; ptr[m] = lhs(rows_32 + m, k + 2); - ptr = (QInt8*) &L_D; + ptr = (QInt8*)&L_D; ptr[m] = lhs(rows_32 + m, k + 3); - ptr = (QInt8*) &L_E; + ptr = (QInt8*)&L_E; ptr[m] = lhs(rows_32 + m, k + 4); - ptr = (QInt8*) &L_F; + ptr = (QInt8*)&L_F; ptr[m] = lhs(rows_32 + m, k + 5); - ptr = (QInt8*) &L_G; + ptr = (QInt8*)&L_G; ptr[m] = lhs(rows_32 + m, k + 6); - ptr = (QInt8*) &L_H; + ptr = (QInt8*)&L_H; ptr[m] = lhs(rows_32 + m, k + 7); } @@ -939,146 +974,146 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index __m256i L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H; QInt8* ptr; switch (depth - depth_8) { - case 1: - L_A = _mm256_setzero_si256(); - L_B = _mm256_setzero_si256(); - L_C = _mm256_setzero_si256(); - L_D = _mm256_setzero_si256(); - L_E = _mm256_setzero_si256(); - L_F = _mm256_setzero_si256(); - L_G = _mm256_setzero_si256(); - L_H = _mm256_setzero_si256(); - for (Index m = 0; m < rows - rows_32; m++) { - QInt8* ptr = (QInt8*) &L_A; - ptr[m] = lhs(rows_32 + m, depth_8); - } - break; - case 2: - L_A = _mm256_setzero_si256(); - L_B = _mm256_setzero_si256(); - L_C = _mm256_setzero_si256(); - L_D = _mm256_setzero_si256(); - L_E = _mm256_setzero_si256(); - L_F = _mm256_setzero_si256(); - L_G = _mm256_setzero_si256(); - L_H = _mm256_setzero_si256(); - for (Index m = 0; m < rows - rows_32; m++) { - ptr = (QInt8*) &L_A; - ptr[m] = lhs(rows_32 + m, depth_8); - ptr = (QInt8*) &L_B; - ptr[m] = lhs(rows_32 + 
m, depth_8 + 1); - } - break; - case 3: - L_A = _mm256_setzero_si256(); - L_B = _mm256_setzero_si256(); - L_C = _mm256_setzero_si256(); - L_D = _mm256_setzero_si256(); - L_E = _mm256_setzero_si256(); - L_F = _mm256_setzero_si256(); - L_G = _mm256_setzero_si256(); - L_H = _mm256_setzero_si256(); - for (Index m = 0; m < rows - rows_32; m++) { - ptr = (QInt8*) &L_A; - ptr[m] = lhs(rows_32 + m, depth_8); - ptr = (QInt8*) &L_B; - ptr[m] = lhs(rows_32 + m, depth_8 + 1); - ptr = (QInt8*) &L_C; - ptr[m] = lhs(rows_32 + m, depth_8 + 2); - } - break; - case 4: - L_A = _mm256_setzero_si256(); - L_B = _mm256_setzero_si256(); - L_C = _mm256_setzero_si256(); - L_D = _mm256_setzero_si256(); - L_E = _mm256_setzero_si256(); - L_F = _mm256_setzero_si256(); - L_G = _mm256_setzero_si256(); - L_H = _mm256_setzero_si256(); - for (Index m = 0; m < rows - rows_32; m++) { - ptr = (QInt8*) &L_A; - ptr[m] = lhs(rows_32 + m, depth_8); - ptr = (QInt8*) &L_B; - ptr[m] = lhs(rows_32 + m, depth_8 + 1); - ptr = (QInt8*) &L_C; - ptr[m] = lhs(rows_32 + m, depth_8 + 2); - ptr = (QInt8*) &L_D; - ptr[m] = lhs(rows_32 + m, depth_8 + 3); - } - break; - case 5: - L_A = _mm256_setzero_si256(); - L_B = _mm256_setzero_si256(); - L_C = _mm256_setzero_si256(); - L_D = _mm256_setzero_si256(); - L_E = _mm256_setzero_si256(); - L_F = _mm256_setzero_si256(); - L_G = _mm256_setzero_si256(); - L_H = _mm256_setzero_si256(); - for (Index m = 0; m < rows - rows_32; m++) { - ptr = (QInt8*) &L_A; - ptr[m] = lhs(rows_32 + m, depth_8); - ptr = (QInt8*) &L_B; - ptr[m] = lhs(rows_32 + m, depth_8 + 1); - ptr = (QInt8*) &L_C; - ptr[m] = lhs(rows_32 + m, depth_8 + 2); - ptr = (QInt8*) &L_D; - ptr[m] = lhs(rows_32 + m, depth_8 + 3); - ptr = (QInt8*) &L_E; - ptr[m] = lhs(rows_32 + m, depth_8 + 4); - } - break; - case 6: - L_A = _mm256_setzero_si256(); - L_B = _mm256_setzero_si256(); - L_C = _mm256_setzero_si256(); - L_D = _mm256_setzero_si256(); - L_E = _mm256_setzero_si256(); - L_F = _mm256_setzero_si256(); - L_G = 
_mm256_setzero_si256(); - L_H = _mm256_setzero_si256(); - for (Index m = 0; m < rows - rows_32; m++) { - ptr = (QInt8*) &L_A; - ptr[m] = lhs(rows_32 + m, depth_8); - ptr = (QInt8*) &L_B; - ptr[m] = lhs(rows_32 + m, depth_8 + 1); - ptr = (QInt8*) &L_C; - ptr[m] = lhs(rows_32 + m, depth_8 + 2); - ptr = (QInt8*) &L_D; - ptr[m] = lhs(rows_32 + m, depth_8 + 3); - ptr = (QInt8*) &L_E; - ptr[m] = lhs(rows_32 + m, depth_8 + 4); - ptr = (QInt8*) &L_F; - ptr[m] = lhs(rows_32 + m, depth_8 + 5); - } - break; - case 7: - L_A = _mm256_setzero_si256(); - L_B = _mm256_setzero_si256(); - L_C = _mm256_setzero_si256(); - L_D = _mm256_setzero_si256(); - L_E = _mm256_setzero_si256(); - L_F = _mm256_setzero_si256(); - L_G = _mm256_setzero_si256(); - L_H = _mm256_setzero_si256(); - for (Index m = 0; m < rows - rows_32; m++) { - ptr = (QInt8*) &L_A; - ptr[m] = lhs(rows_32 + m, depth_8); - ptr = (QInt8*) &L_B; - ptr[m] = lhs(rows_32 + m, depth_8 + 1); - ptr = (QInt8*) &L_C; - ptr[m] = lhs(rows_32 + m, depth_8 + 2); - ptr = (QInt8*) &L_D; - ptr[m] = lhs(rows_32 + m, depth_8 + 3); - ptr = (QInt8*) &L_E; - ptr[m] = lhs(rows_32 + m, depth_8 + 4); - ptr = (QInt8*) &L_F; - ptr[m] = lhs(rows_32 + m, depth_8 + 5); - ptr = (QInt8*) &L_G; - ptr[m] = lhs(rows_32 + m, depth_8 + 6); - } - break; + case 1: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + QInt8* ptr = (QInt8*)&L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + } + break; + case 2: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = 
(QInt8*)&L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*)&L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + } + break; + case 3: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*)&L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*)&L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*)&L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + } + break; + case 4: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*)&L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*)&L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*)&L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + ptr = (QInt8*)&L_D; + ptr[m] = lhs(rows_32 + m, depth_8 + 3); + } + break; + case 5: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*)&L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*)&L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*)&L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + ptr = (QInt8*)&L_D; + ptr[m] = lhs(rows_32 + m, depth_8 + 3); + ptr = (QInt8*)&L_E; + ptr[m] = lhs(rows_32 + m, depth_8 + 4); + } + break; + case 6: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = 
_mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*)&L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*)&L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*)&L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + ptr = (QInt8*)&L_D; + ptr[m] = lhs(rows_32 + m, depth_8 + 3); + ptr = (QInt8*)&L_E; + ptr[m] = lhs(rows_32 + m, depth_8 + 4); + ptr = (QInt8*)&L_F; + ptr[m] = lhs(rows_32 + m, depth_8 + 5); + } + break; + case 7: + L_A = _mm256_setzero_si256(); + L_B = _mm256_setzero_si256(); + L_C = _mm256_setzero_si256(); + L_D = _mm256_setzero_si256(); + L_E = _mm256_setzero_si256(); + L_F = _mm256_setzero_si256(); + L_G = _mm256_setzero_si256(); + L_H = _mm256_setzero_si256(); + for (Index m = 0; m < rows - rows_32; m++) { + ptr = (QInt8*)&L_A; + ptr[m] = lhs(rows_32 + m, depth_8); + ptr = (QInt8*)&L_B; + ptr[m] = lhs(rows_32 + m, depth_8 + 1); + ptr = (QInt8*)&L_C; + ptr[m] = lhs(rows_32 + m, depth_8 + 2); + ptr = (QInt8*)&L_D; + ptr[m] = lhs(rows_32 + m, depth_8 + 3); + ptr = (QInt8*)&L_E; + ptr[m] = lhs(rows_32 + m, depth_8 + 4); + ptr = (QInt8*)&L_F; + ptr[m] = lhs(rows_32 + m, depth_8 + 5); + ptr = (QInt8*)&L_G; + ptr[m] = lhs(rows_32 + m, depth_8 + 6); + } + break; } // Interleave 8-bit elements @@ -1124,12 +1159,17 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index } } -template -EIGEN_DONT_INLINE void gemm_pack_rhs_any:: -operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { +template +EIGEN_DONT_INLINE void gemm_pack_rhs_any:: +operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, + Index stride, Index offset) { eigen_assert(stride == 0); eigen_assert(offset == 0); + typedef typename packet_traits::type Packet; + // Get vector pointer __m256i* blockB_256 = 
reinterpret_cast<__m256i*>(blockB); @@ -1158,52 +1198,52 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index for (Index n = 0; n < cols_32; n += 32) { // Pack depth in sets of 32 for (Index k = 0; k < depth_32; k += 32) { - __m256i R_A = rhs.loadPacket(k, n); - __m256i R_B = rhs.loadPacket(k, n + 1); - __m256i R_C = rhs.loadPacket(k, n + 2); - __m256i R_D = rhs.loadPacket(k, n + 3); + __m256i R_A = rhs.template loadPacket(k, n); + __m256i R_B = rhs.template loadPacket(k, n + 1); + __m256i R_C = rhs.template loadPacket(k, n + 2); + __m256i R_D = rhs.template loadPacket(k, n + 3); PACK_STEP; - R_A = rhs.loadPacket(k, n + 4); - R_B = rhs.loadPacket(k, n + 5); - R_C = rhs.loadPacket(k, n + 6); - R_D = rhs.loadPacket(k, n + 7); + R_A = rhs.template loadPacket(k, n + 4); + R_B = rhs.template loadPacket(k, n + 5); + R_C = rhs.template loadPacket(k, n + 6); + R_D = rhs.template loadPacket(k, n + 7); PACK_STEP; - R_A = rhs.loadPacket(k, n + 8); - R_B = rhs.loadPacket(k, n + 9); - R_C = rhs.loadPacket(k, n + 10); - R_D = rhs.loadPacket(k, n + 11); + R_A = rhs.template loadPacket(k, n + 8); + R_B = rhs.template loadPacket(k, n + 9); + R_C = rhs.template loadPacket(k, n + 10); + R_D = rhs.template loadPacket(k, n + 11); PACK_STEP; - R_A = rhs.loadPacket(k, n + 12); - R_B = rhs.loadPacket(k, n + 13); - R_C = rhs.loadPacket(k, n + 14); - R_D = rhs.loadPacket(k, n + 15); + R_A = rhs.template loadPacket(k, n + 12); + R_B = rhs.template loadPacket(k, n + 13); + R_C = rhs.template loadPacket(k, n + 14); + R_D = rhs.template loadPacket(k, n + 15); PACK_STEP; - R_A = rhs.loadPacket(k, n + 16); - R_B = rhs.loadPacket(k, n + 17); - R_C = rhs.loadPacket(k, n + 18); - R_D = rhs.loadPacket(k, n + 19); + R_A = rhs.template loadPacket(k, n + 16); + R_B = rhs.template loadPacket(k, n + 17); + R_C = rhs.template loadPacket(k, n + 18); + R_D = rhs.template loadPacket(k, n + 19); PACK_STEP; - R_A = rhs.loadPacket(k, n + 20); - R_B = rhs.loadPacket(k, n + 21); - 
R_C = rhs.loadPacket(k, n + 22); - R_D = rhs.loadPacket(k, n + 23); + R_A = rhs.template loadPacket(k, n + 20); + R_B = rhs.template loadPacket(k, n + 21); + R_C = rhs.template loadPacket(k, n + 22); + R_D = rhs.template loadPacket(k, n + 23); PACK_STEP; - R_A = rhs.loadPacket(k, n + 24); - R_B = rhs.loadPacket(k, n + 25); - R_C = rhs.loadPacket(k, n + 26); - R_D = rhs.loadPacket(k, n + 27); + R_A = rhs.template loadPacket(k, n + 24); + R_B = rhs.template loadPacket(k, n + 25); + R_C = rhs.template loadPacket(k, n + 26); + R_D = rhs.template loadPacket(k, n + 27); PACK_STEP; - R_A = rhs.loadPacket(k, n + 28); - R_B = rhs.loadPacket(k, n + 29); - R_C = rhs.loadPacket(k, n + 30); - R_D = rhs.loadPacket(k, n + 31); + R_A = rhs.template loadPacket(k, n + 28); + R_B = rhs.template loadPacket(k, n + 29); + R_C = rhs.template loadPacket(k, n + 30); + R_D = rhs.template loadPacket(k, n + 31); PACK_STEP; blockB_256 += 24; @@ -1216,13 +1256,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index __m256i R_C = _mm256_setzero_si256(); __m256i R_D = _mm256_setzero_si256(); for (Index k = depth_32; k < depth; k++) { - ptr = (QUInt8*) &R_A; + ptr = (QUInt8*)&R_A; ptr[k - depth_32] = rhs(k, n); - ptr = (QUInt8*) &R_B; + ptr = (QUInt8*)&R_B; ptr[k - depth_32] = rhs(k, n + 1); - ptr = (QUInt8*) &R_C; + ptr = (QUInt8*)&R_C; ptr[k - depth_32] = rhs(k, n + 2); - ptr = (QUInt8*) &R_D; + ptr = (QUInt8*)&R_D; ptr[k - depth_32] = rhs(k, n + 3); } PACK_STEP; @@ -1232,13 +1272,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index R_C = _mm256_setzero_si256(); R_D = _mm256_setzero_si256(); for (Index k = depth_32; k < depth; k++) { - ptr = (QUInt8*) &R_A; + ptr = (QUInt8*)&R_A; ptr[k - depth_32] = rhs(k, n + 4); - ptr = (QUInt8*) &R_B; + ptr = (QUInt8*)&R_B; ptr[k - depth_32] = rhs(k, n + 5); - ptr = (QUInt8*) &R_C; + ptr = (QUInt8*)&R_C; ptr[k - depth_32] = rhs(k, n + 6); - ptr = (QUInt8*) &R_D; + ptr = (QUInt8*)&R_D; ptr[k - 
depth_32] = rhs(k, n + 7); } PACK_STEP; @@ -1248,13 +1288,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index R_C = _mm256_setzero_si256(); R_D = _mm256_setzero_si256(); for (Index k = depth_32; k < depth; k++) { - ptr = (QUInt8*) &R_A; + ptr = (QUInt8*)&R_A; ptr[k - depth_32] = rhs(k, n + 8); - ptr = (QUInt8*) &R_B; + ptr = (QUInt8*)&R_B; ptr[k - depth_32] = rhs(k, n + 9); - ptr = (QUInt8*) &R_C; + ptr = (QUInt8*)&R_C; ptr[k - depth_32] = rhs(k, n + 10); - ptr = (QUInt8*) &R_D; + ptr = (QUInt8*)&R_D; ptr[k - depth_32] = rhs(k, n + 11); } PACK_STEP; @@ -1264,13 +1304,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index R_C = _mm256_setzero_si256(); R_D = _mm256_setzero_si256(); for (Index k = depth_32; k < depth; k++) { - ptr = (QUInt8*) &R_A; + ptr = (QUInt8*)&R_A; ptr[k - depth_32] = rhs(k, n + 12); - ptr = (QUInt8*) &R_B; + ptr = (QUInt8*)&R_B; ptr[k - depth_32] = rhs(k, n + 13); - ptr = (QUInt8*) &R_C; + ptr = (QUInt8*)&R_C; ptr[k - depth_32] = rhs(k, n + 14); - ptr = (QUInt8*) &R_D; + ptr = (QUInt8*)&R_D; ptr[k - depth_32] = rhs(k, n + 15); } PACK_STEP; @@ -1280,13 +1320,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index R_C = _mm256_setzero_si256(); R_D = _mm256_setzero_si256(); for (Index k = depth_32; k < depth; k++) { - ptr = (QUInt8*) &R_A; + ptr = (QUInt8*)&R_A; ptr[k - depth_32] = rhs(k, n + 16); - ptr = (QUInt8*) &R_B; + ptr = (QUInt8*)&R_B; ptr[k - depth_32] = rhs(k, n + 17); - ptr = (QUInt8*) &R_C; + ptr = (QUInt8*)&R_C; ptr[k - depth_32] = rhs(k, n + 18); - ptr = (QUInt8*) &R_D; + ptr = (QUInt8*)&R_D; ptr[k - depth_32] = rhs(k, n + 19); } PACK_STEP; @@ -1296,13 +1336,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index R_C = _mm256_setzero_si256(); R_D = _mm256_setzero_si256(); for (Index k = depth_32; k < depth; k++) { - ptr = (QUInt8*) &R_A; + ptr = (QUInt8*)&R_A; ptr[k - depth_32] = rhs(k, n + 20); - ptr = 
(QUInt8*) &R_B; + ptr = (QUInt8*)&R_B; ptr[k - depth_32] = rhs(k, n + 21); - ptr = (QUInt8*) &R_C; + ptr = (QUInt8*)&R_C; ptr[k - depth_32] = rhs(k, n + 22); - ptr = (QUInt8*) &R_D; + ptr = (QUInt8*)&R_D; ptr[k - depth_32] = rhs(k, n + 23); } PACK_STEP; @@ -1312,13 +1352,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index R_C = _mm256_setzero_si256(); R_D = _mm256_setzero_si256(); for (Index k = depth_32; k < depth; k++) { - ptr = (QUInt8*) &R_A; + ptr = (QUInt8*)&R_A; ptr[k - depth_32] = rhs(k, n + 24); - ptr = (QUInt8*) &R_B; + ptr = (QUInt8*)&R_B; ptr[k - depth_32] = rhs(k, n + 25); - ptr = (QUInt8*) &R_C; + ptr = (QUInt8*)&R_C; ptr[k - depth_32] = rhs(k, n + 26); - ptr = (QUInt8*) &R_D; + ptr = (QUInt8*)&R_D; ptr[k - depth_32] = rhs(k, n + 27); } PACK_STEP; @@ -1328,13 +1368,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index R_C = _mm256_setzero_si256(); R_D = _mm256_setzero_si256(); for (Index k = depth_32; k < depth; k++) { - ptr = (QUInt8*) &R_A; + ptr = (QUInt8*)&R_A; ptr[k - depth_32] = rhs(k, n + 28); - ptr = (QUInt8*) &R_B; + ptr = (QUInt8*)&R_B; ptr[k - depth_32] = rhs(k, n + 29); - ptr = (QUInt8*) &R_C; + ptr = (QUInt8*)&R_C; ptr[k - depth_32] = rhs(k, n + 30); - ptr = (QUInt8*) &R_D; + ptr = (QUInt8*)&R_D; ptr[k - depth_32] = rhs(k, n + 31); } PACK_STEP; @@ -1350,34 +1390,34 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index Index n; for (n = cols_32; n < cols; n += 4) { switch (cols - n) { - case 1: - R_A = rhs.loadPacket(k, n); - R_B = _mm256_setzero_si256(); - R_C = _mm256_setzero_si256(); - R_D = _mm256_setzero_si256(); - PACK_STEP; - break; - case 2: - R_A = rhs.loadPacket(k, n); - R_B = rhs.loadPacket(k, n + 1); - R_C = _mm256_setzero_si256(); - R_D = _mm256_setzero_si256(); - PACK_STEP; - break; - case 3: - R_A = rhs.loadPacket(k, n); - R_B = rhs.loadPacket(k, n + 1); - R_C = rhs.loadPacket(k, n + 2); - R_D = _mm256_setzero_si256(); - 
PACK_STEP; - break; - default: - R_A = rhs.loadPacket(k, n); - R_B = rhs.loadPacket(k, n + 1); - R_C = rhs.loadPacket(k, n + 2); - R_D = rhs.loadPacket(k, n + 3); - PACK_STEP; - break; + case 1: + R_A = rhs.template loadPacket(k, n); + R_B = _mm256_setzero_si256(); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + PACK_STEP; + break; + case 2: + R_A = rhs.template loadPacket(k, n); + R_B = rhs.template loadPacket(k, n + 1); + R_C = _mm256_setzero_si256(); + R_D = _mm256_setzero_si256(); + PACK_STEP; + break; + case 3: + R_A = rhs.template loadPacket(k, n); + R_B = rhs.template loadPacket(k, n + 1); + R_C = rhs.template loadPacket(k, n + 2); + R_D = _mm256_setzero_si256(); + PACK_STEP; + break; + default: + R_A = rhs.template loadPacket(k, n); + R_B = rhs.template loadPacket(k, n + 1); + R_C = rhs.template loadPacket(k, n + 2); + R_D = rhs.template loadPacket(k, n + 3); + PACK_STEP; + break; } } @@ -1394,46 +1434,46 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index __m256i R_C = _mm256_setzero_si256(); __m256i R_D = _mm256_setzero_si256(); switch (cols - n) { - case 1: - for (Index k = depth_32; k < depth; k++) { - ptr = (QUInt8*) &R_A; - ptr[k - depth_32] = rhs(k, n); - } - PACK_STEP; - break; - case 2: - for (Index k = depth_32; k < depth; k++) { - ptr = (QUInt8*) &R_A; - ptr[k - depth_32] = rhs(k, n); - ptr = (QUInt8*) &R_B; - ptr[k - depth_32] = rhs(k, n + 1); - } - PACK_STEP; - break; - case 3: - for (Index k = depth_32; k < depth; k++) { - ptr = (QUInt8*) &R_A; - ptr[k - depth_32] = rhs(k, n); - ptr = (QUInt8*) &R_B; - ptr[k - depth_32] = rhs(k, n + 1); - ptr = (QUInt8*) &R_C; - ptr[k - depth_32] = rhs(k, n + 2); - } - PACK_STEP; - break; - default: - for (Index k = depth_32; k < depth; k++) { - ptr = (QUInt8*) &R_A; - ptr[k - depth_32] = rhs(k, n); - ptr = (QUInt8*) &R_B; - ptr[k - depth_32] = rhs(k, n + 1); - ptr = (QUInt8*) &R_C; - ptr[k - depth_32] = rhs(k, n + 2); - ptr = (QUInt8*) &R_D; - ptr[k - 
depth_32] = rhs(k, n + 3); - } - PACK_STEP; - break; + case 1: + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*)&R_A; + ptr[k - depth_32] = rhs(k, n); + } + PACK_STEP; + break; + case 2: + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*)&R_A; + ptr[k - depth_32] = rhs(k, n); + ptr = (QUInt8*)&R_B; + ptr[k - depth_32] = rhs(k, n + 1); + } + PACK_STEP; + break; + case 3: + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*)&R_A; + ptr[k - depth_32] = rhs(k, n); + ptr = (QUInt8*)&R_B; + ptr[k - depth_32] = rhs(k, n + 1); + ptr = (QUInt8*)&R_C; + ptr[k - depth_32] = rhs(k, n + 2); + } + PACK_STEP; + break; + default: + for (Index k = depth_32; k < depth; k++) { + ptr = (QUInt8*)&R_A; + ptr[k - depth_32] = rhs(k, n); + ptr = (QUInt8*)&R_B; + ptr[k - depth_32] = rhs(k, n + 1); + ptr = (QUInt8*)&R_C; + ptr[k - depth_32] = rhs(k, n + 2); + ptr = (QUInt8*)&R_D; + ptr[k - depth_32] = rhs(k, n + 3); + } + PACK_STEP; + break; } } } @@ -1441,13 +1481,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index #undef PACK_STEP } -template -EIGEN_DONT_INLINE -void gebp_kernel_any -::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, - Index rows, Index depth, Index cols, QInt32 alpha, - Index strideA, Index strideB, Index offsetA, Index offsetB) -{ +template +EIGEN_DONT_INLINE void gebp_kernel_any:: +operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, Index strideA, + Index strideB, Index offsetA, Index offsetB) { EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(alpha.value == 1); @@ -1678,17 +1718,21 @@ void gebp_kernel_any::type Packet; + r0.template storePacket( + 0, _mm256_add_epi32(blockO_256[i++], + r0.template loadPacket(0))); + r1.template storePacket( + 0, _mm256_add_epi32(blockO_256[i++], + r1.template 
loadPacket(0))); + r2.template storePacket( + 0, _mm256_add_epi32(blockO_256[i++], + r2.template loadPacket(0))); + r3.template storePacket( + 0, _mm256_add_epi32(blockO_256[i++], + r3.template loadPacket(0))); } - } - else { + } else { for (Index j = n; j < cols; j++) { for (Index i = m; i < rows; i++) { res(i, j) = blockO[(j - n) * 32 + (i - m)]; @@ -1745,7 +1789,7 @@ void gebp_kernel_any -struct gemm_pack_lhs { EIGEN_DONT_INLINE void operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, @@ -1755,15 +1799,18 @@ struct gemm_pack_lhs EIGEN_DONT_INLINE void gemm_pack_lhs:: + QInt8, ColMajor, Conjugate, PanelMode>:: operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { eigen_assert(stride == 0); eigen_assert(offset == 0); + typedef typename packet_traits::type Packet; + // Use alternate function for weird sizes if (rows % 32 != 0 || depth % 32 != 0) { - gemm_pack_lhs_any lhs_pack; + gemm_pack_lhs_any lhs_pack; return lhs_pack(blockA, lhs, depth, rows, stride, offset); } @@ -1775,15 +1822,15 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, // Pack depth in sets of 8 for (Index k = 0; k < depth; k += 8) { // Load vectors - __m256i L_A = lhs.loadPacket(m, k); - __m256i L_B = lhs.loadPacket(m, k + 1); + __m256i L_A = lhs.template loadPacket(m, k); + __m256i L_B = lhs.template loadPacket(m, k + 1); // Interleave 8-bit elements __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B); __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B); - __m256i L_C = lhs.loadPacket(m, k + 2); - __m256i L_D = lhs.loadPacket(m, k + 3); + __m256i L_C = lhs.template loadPacket(m, k + 2); + __m256i L_D = lhs.template loadPacket(m, k + 3); __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D); __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D); @@ -1804,12 +1851,12 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, _mm256_store_si256(blockA_256++, L_AD16); 
__m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31); _mm256_store_si256(blockA_256++, L_AD24); - __m256i L_E = lhs.loadPacket(m, k + 4); - __m256i L_F = lhs.loadPacket(m, k + 5); + __m256i L_E = lhs.template loadPacket(m, k + 4); + __m256i L_F = lhs.template loadPacket(m, k + 5); __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F); __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F); - __m256i L_G = lhs.loadPacket(m, k + 6); - __m256i L_H = lhs.loadPacket(m, k + 7); + __m256i L_G = lhs.template loadPacket(m, k + 6); + __m256i L_H = lhs.template loadPacket(m, k + 7); __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H); __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H); __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16); @@ -1868,9 +1915,12 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, eigen_assert(stride == 0); eigen_assert(offset == 0); + typedef typename packet_traits::type Packet; + // Use alternate function for weird sizes if (cols % 32 != 0 || depth % 32 != 0) { - gemm_pack_rhs_any rhs_pack; + gemm_pack_rhs_any rhs_pack; return rhs_pack(blockB, rhs, depth, cols, stride, offset); } @@ -1898,52 +1948,52 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, for (Index n = 0; n < cols; n += 32) { // Pack depth in sets of 32 for (Index k = 0; k < depth; k += 32) { - __m256i R_A = rhs.loadPacket(k, n); - __m256i R_B = rhs.loadPacket(k, n + 1); - __m256i R_C = rhs.loadPacket(k, n + 2); - __m256i R_D = rhs.loadPacket(k, n + 3); + __m256i R_A = rhs.template loadPacket(k, n); + __m256i R_B = rhs.template loadPacket(k, n + 1); + __m256i R_C = rhs.template loadPacket(k, n + 2); + __m256i R_D = rhs.template loadPacket(k, n + 3); PACK_STEP; - R_A = rhs.loadPacket(k, n + 4); - R_B = rhs.loadPacket(k, n + 5); - R_C = rhs.loadPacket(k, n + 6); - R_D = rhs.loadPacket(k, n + 7); + R_A = rhs.template loadPacket(k, n + 4); + R_B = rhs.template loadPacket(k, n + 5); + R_C = rhs.template 
loadPacket(k, n + 6); + R_D = rhs.template loadPacket(k, n + 7); PACK_STEP; - R_A = rhs.loadPacket(k, n + 8); - R_B = rhs.loadPacket(k, n + 9); - R_C = rhs.loadPacket(k, n + 10); - R_D = rhs.loadPacket(k, n + 11); + R_A = rhs.template loadPacket(k, n + 8); + R_B = rhs.template loadPacket(k, n + 9); + R_C = rhs.template loadPacket(k, n + 10); + R_D = rhs.template loadPacket(k, n + 11); PACK_STEP; - R_A = rhs.loadPacket(k, n + 12); - R_B = rhs.loadPacket(k, n + 13); - R_C = rhs.loadPacket(k, n + 14); - R_D = rhs.loadPacket(k, n + 15); + R_A = rhs.template loadPacket(k, n + 12); + R_B = rhs.template loadPacket(k, n + 13); + R_C = rhs.template loadPacket(k, n + 14); + R_D = rhs.template loadPacket(k, n + 15); PACK_STEP; - R_A = rhs.loadPacket(k, n + 16); - R_B = rhs.loadPacket(k, n + 17); - R_C = rhs.loadPacket(k, n + 18); - R_D = rhs.loadPacket(k, n + 19); + R_A = rhs.template loadPacket(k, n + 16); + R_B = rhs.template loadPacket(k, n + 17); + R_C = rhs.template loadPacket(k, n + 18); + R_D = rhs.template loadPacket(k, n + 19); PACK_STEP; - R_A = rhs.loadPacket(k, n + 20); - R_B = rhs.loadPacket(k, n + 21); - R_C = rhs.loadPacket(k, n + 22); - R_D = rhs.loadPacket(k, n + 23); + R_A = rhs.template loadPacket(k, n + 20); + R_B = rhs.template loadPacket(k, n + 21); + R_C = rhs.template loadPacket(k, n + 22); + R_D = rhs.template loadPacket(k, n + 23); PACK_STEP; - R_A = rhs.loadPacket(k, n + 24); - R_B = rhs.loadPacket(k, n + 25); - R_C = rhs.loadPacket(k, n + 26); - R_D = rhs.loadPacket(k, n + 27); + R_A = rhs.template loadPacket(k, n + 24); + R_B = rhs.template loadPacket(k, n + 25); + R_C = rhs.template loadPacket(k, n + 26); + R_D = rhs.template loadPacket(k, n + 27); PACK_STEP; - R_A = rhs.loadPacket(k, n + 28); - R_B = rhs.loadPacket(k, n + 29); - R_C = rhs.loadPacket(k, n + 30); - R_D = rhs.loadPacket(k, n + 31); + R_A = rhs.template loadPacket(k, n + 28); + R_B = rhs.template loadPacket(k, n + 29); + R_C = rhs.template loadPacket(k, n + 30); + R_D = rhs.template 
loadPacket(k, n + 31); PACK_STEP; blockB_256 += 24; @@ -1953,24 +2003,26 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, } // Perform the actual multiplication on packed inputs -template -struct gebp_kernel -{ +template +struct gebp_kernel { typedef typename DataMapper::LinearMapper LinearMapper; EIGEN_DONT_INLINE - void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, - Index rows, Index depth, Index cols, QInt32 alpha, - Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); + void operator()(const DataMapper& res, const QInt8* blockA, + const QUInt8* blockB, Index rows, Index depth, Index cols, + QInt32 alpha, Index strideA = -1, Index strideB = -1, + Index offsetA = 0, Index offsetB = 0); }; -template -EIGEN_DONT_INLINE -void gebp_kernel -::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, - Index rows, Index depth, Index cols, QInt32 alpha, - Index strideA, Index strideB, Index offsetA, Index offsetB) -{ +template +EIGEN_DONT_INLINE void gebp_kernel:: +operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, Index strideA, + Index strideB, Index offsetA, Index offsetB) { EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(alpha.value == 1); @@ -1986,8 +2038,10 @@ void gebp_kernel gebp; - return gebp(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + gebp_kernel_any gebp; + return gebp(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, + offsetA, offsetB); } // Create result block @@ -2205,14 +2259,19 @@ void gebp_kernel::type Packet; + r0.template storePacket( + 0, _mm256_add_epi32(blockO_256[i++], + r0.template loadPacket(0))); + r1.template storePacket( + 0, _mm256_add_epi32(blockO_256[i++], + r1.template loadPacket(0))); + r2.template 
storePacket( + 0, _mm256_add_epi32(blockO_256[i++], + r2.template loadPacket(0))); + r3.template storePacket( + 0, _mm256_add_epi32(blockO_256[i++], + r3.template loadPacket(0))); } // Zero the result block so it can be reused diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h index 9cd31570231..9e0efae6c9b 100644 --- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h @@ -14,15 +14,14 @@ namespace Eigen { namespace internal { - -// AVX2 optimized implementation of the case where the lhs is encoded using signed 8bit +// AVX2 optimized implementation of the case where the lhs is encoded using +// signed 8bit // integers and the rhs using unsigned 8bit integers. #ifdef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT -template -class gebp_traits -{ -public: +template +class gebp_traits { + public: typedef QInt8 LhsScalar; typedef QUInt8 RhsScalar; typedef QInt32 ResScalar; @@ -40,22 +39,24 @@ public: }; // Mat-Mat product of a signed 8bit lhs with an unsigned 8bit rhs -template -struct gebp_kernel -{ +template +struct gebp_kernel { EIGEN_DONT_INLINE - void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, - Index rows, Index depth, Index cols, QInt32 alpha, - Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); + void operator()(const DataMapper& res, const QInt8* blockA, + const QUInt8* blockB, Index rows, Index depth, Index cols, + QInt32 alpha, Index strideA = -1, Index strideB = -1, + Index offsetA = 0, Index offsetB = 0); }; -template -EIGEN_DONT_INLINE -void gebp_kernel -::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, - Index rows, Index depth, Index cols, QInt32 alpha, - Index strideA, Index strideB, Index offsetA, Index offsetB) -{ +template 
+EIGEN_DONT_INLINE void gebp_kernel:: +operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB, + Index rows, Index depth, Index cols, QInt32 alpha, Index strideA, + Index strideB, Index offsetA, Index offsetB) { EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -85,7 +86,6 @@ void gebp_kernel -struct general_matrix_vector_product -{ -EIGEN_DONT_INLINE static void run( - Index rows, Index cols, - const LhsMapper& lhs, - const RhsMapper& rhs, - QInt32* res, Index resIncr, - QInt8 alpha); +template +struct general_matrix_vector_product { + EIGEN_DONT_INLINE static void run(Index rows, Index cols, + const LhsMapper& lhs, const RhsMapper& rhs, + QInt32* res, Index resIncr, QInt8 alpha); }; -template -EIGEN_DONT_INLINE void general_matrix_vector_product::run( - Index rows, Index cols, - const LhsMapper& lhs, - const RhsMapper& rhs, - QInt32* res, Index resIncr, - QInt8 alpha) -{ +template +EIGEN_DONT_INLINE void general_matrix_vector_product< + Index, QInt8, LhsMapper, ColMajor, ConjugateLhs, QInt8, RhsMapper, + ConjugateRhs, Version>::run(Index rows, Index cols, const LhsMapper& lhs, + const RhsMapper& rhs, QInt32* res, + Index resIncr, QInt8 alpha) { eigen_assert(alpha.value == 1); eigen_assert(resIncr == 1); eigen_assert(rows > 0); @@ -78,26 +76,25 @@ EIGEN_DONT_INLINE void general_matrix_vector_product< } // Mat-Vec product -// The lhs is encoded using 8bit signed integers, the rhs using 8bit unsigned integers -template -struct general_matrix_vector_product -{ -EIGEN_DONT_INLINE static void run( - Index rows, Index cols, - const LhsMapper& lhs, - const RhsMapper& rhs, - QInt32* res, Index resIncr, - QUInt8 alpha); +// The lhs is encoded using 8bit signed integers, the rhs using 8bit unsigned +// integers +template +struct general_matrix_vector_product { + EIGEN_DONT_INLINE static void run(Index rows, Index cols, + const LhsMapper& lhs, const RhsMapper& rhs, + 
QInt32* res, Index resIncr, QUInt8 alpha); }; -template -EIGEN_DONT_INLINE void general_matrix_vector_product::run( - Index rows, Index cols, - const LhsMapper& lhs, - const RhsMapper& rhs, - QInt32* res, Index resIncr, - QUInt8 alpha) -{ +template +EIGEN_DONT_INLINE void general_matrix_vector_product< + Index, QInt8, LhsMapper, ColMajor, ConjugateLhs, QUInt8, RhsMapper, + ConjugateRhs, Version>::run(Index rows, Index cols, const LhsMapper& lhs, + const RhsMapper& rhs, QInt32* res, + Index resIncr, QUInt8 alpha) { eigen_assert(alpha.value == 1); eigen_assert(resIncr == 1); eigen_assert(rows > 0); @@ -110,28 +107,26 @@ EIGEN_DONT_INLINE void general_matrix_vector_product -struct general_matrix_vector_product -{ -EIGEN_DONT_INLINE static void run( - Index rows, Index cols, - const LhsMapper& lhs, - const RhsMapper& rhs, - QInt32* res, Index resIncr, - QInt8 alpha); +// The lhs is encoded using bit unsigned integers, the rhs using 8bit signed +// integers +template +struct general_matrix_vector_product { + EIGEN_DONT_INLINE static void run(Index rows, Index cols, + const LhsMapper& lhs, const RhsMapper& rhs, + QInt32* res, Index resIncr, QInt8 alpha); }; -template -EIGEN_DONT_INLINE void general_matrix_vector_product::run( - Index rows, Index cols, - const LhsMapper& lhs, - const RhsMapper& rhs, - QInt32* res, Index resIncr, - QInt8 alpha) -{ +template +EIGEN_DONT_INLINE void general_matrix_vector_product< + Index, QUInt8, LhsMapper, ColMajor, ConjugateLhs, QInt8, RhsMapper, + ConjugateRhs, Version>::run(Index rows, Index cols, const LhsMapper& lhs, + const RhsMapper& rhs, QInt32* res, + Index resIncr, QInt8 alpha) { eigen_assert(alpha.value == 1); eigen_assert(resIncr == 1); eigen_assert(rows > 0); diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h index 3abd4ee49c2..223ea4d58bf 100644 --- 
a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h @@ -8,24 +8,20 @@ #endif -inline int _mm256_extract_epi16_N0(const __m256i X) -{ - return _mm_extract_epi16(_mm256_extractf128_si256(X, 0 >> 3), 0 % 8); +inline int _mm256_extract_epi16_N0(const __m256i X) { + return _mm_extract_epi16(_mm256_extractf128_si256(X, 0 >> 3), 0 % 8); } -inline int _mm256_extract_epi16_N1(const __m256i X) -{ - return _mm_extract_epi16(_mm256_extractf128_si256(X, 1 >> 3), 1 % 8); +inline int _mm256_extract_epi16_N1(const __m256i X) { + return _mm_extract_epi16(_mm256_extractf128_si256(X, 1 >> 3), 1 % 8); } -inline int _mm256_extract_epi8_N0(const __m256i X) -{ - return _mm_extract_epi8(_mm256_extractf128_si256((X), 0 >> 4), 0 % 16); +inline int _mm256_extract_epi8_N0(const __m256i X) { + return _mm_extract_epi8(_mm256_extractf128_si256((X), 0 >> 4), 0 % 16); } -inline int _mm256_extract_epi8_N1(const __m256i X) -{ - return _mm_extract_epi8(_mm256_extractf128_si256((X), 1 >> 4), 1 % 16); +inline int _mm256_extract_epi8_N1(const __m256i X) { + return _mm_extract_epi8(_mm256_extractf128_si256((X), 1 >> 4), 1 % 16); } namespace Eigen { @@ -34,56 +30,56 @@ namespace internal { typedef struct Packet32q8i { __m256i val; operator __m256i() const { return val; } - Packet32q8i(); + Packet32q8i() : val(_mm256_setzero_si256()){}; Packet32q8i(__m256i val) : val(val) {} } Packet32q8i; typedef struct Packet16q16i { __m256i val; operator __m256i() const { return val; } - Packet16q16i(); + Packet16q16i() : val(_mm256_setzero_si256()){}; Packet16q16i(__m256i val) : val(val) {} } Packet16q16i; typedef struct Packet32q8u { __m256i val; operator __m256i() const { return val; } - Packet32q8u(); + Packet32q8u() : val(_mm256_setzero_si256()){}; Packet32q8u(__m256i val) : val(val) {} } Packet32q8u; typedef struct Packet16q8i { __m128i val; operator __m128i() const { return val; } - Packet16q8i(); + 
Packet16q8i() : val(_mm_setzero_si128()) {} Packet16q8i(__m128i val) : val(val) {} } Packet16q8i; typedef struct Packet16q8u { __m128i val; operator __m128i() const { return val; } - Packet16q8u(); + Packet16q8u() : val(_mm_setzero_si128()) {} Packet16q8u(__m128i val) : val(val) {} } Packet16q8u; typedef struct Packet8q16i { __m128i val; operator __m128i() const { return val; } - Packet8q16i(); + Packet8q16i() : val(_mm_setzero_si128()) {} Packet8q16i(__m128i val) : val(val) {} } Packet8q16i; typedef struct Packet8q32i { __m256i val; operator __m256i() const { return val; } - Packet8q32i(); + Packet8q32i() : val(_mm256_setzero_si256()){}; Packet8q32i(__m256i val) : val(val) {} } Packet8q32i; typedef struct Packet4q32i { __m128i val; operator __m128i() const { return val; } - Packet4q32i(); + Packet4q32i() : val(_mm_setzero_si128()) {} Packet4q32i(__m128i val) : val(val) {} } Packet4q32i; @@ -182,25 +178,25 @@ template <> struct unpacket_traits { typedef QInt8 type; typedef Packet16q8i half; - enum { size = 32, alignment=Aligned32 }; + enum { size = 32, alignment = Aligned32 }; }; template <> struct unpacket_traits { typedef QInt16 type; typedef Packet8q16i half; - enum { size = 16, alignment=Aligned32 }; + enum { size = 16, alignment = Aligned32 }; }; template <> struct unpacket_traits { typedef QUInt8 type; typedef Packet16q8u half; - enum { size = 32, alignment=Aligned32 }; + enum { size = 32, alignment = Aligned32 }; }; template <> struct unpacket_traits { typedef QInt32 type; typedef Packet4q32i half; - enum { size = 8, alignment=Aligned32 }; + enum { size = 8, alignment = Aligned32 }; }; // Unaligned load @@ -455,40 +451,47 @@ EIGEN_STRONG_INLINE QUInt8 predux_max(const Packet32q8u& a) { template <> EIGEN_STRONG_INLINE QInt8 predux_min(const Packet32q8i& a) { __m256i tmp = _mm256_min_epi8(a, _mm256_permute2f128_si256(a, a, 1)); - tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = + _mm256_min_epi8(tmp, 
_mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, 1)); - tmp = _mm256_min_epi8(tmp, _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_min_epi8(tmp, + _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); return std::min(_mm256_extract_epi8_N0(tmp), _mm256_extract_epi8_N1(tmp)); } template <> EIGEN_STRONG_INLINE QInt8 predux_max(const Packet32q8i& a) { __m256i tmp = _mm256_max_epi8(a, _mm256_permute2f128_si256(a, a, 1)); - tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = + _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2))); tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, 1)); - tmp = _mm256_max_epi8(tmp, _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); + tmp = _mm256_max_epi8(tmp, + _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2))); return std::max(_mm256_extract_epi8_N0(tmp), _mm256_extract_epi8_N1(tmp)); } // Vectorized scaling of Packet32q8i by float. 
-template<> +template <> struct scalar_product_op : binary_op_base { typedef typename ScalarBinaryOpTraits::ReturnType result_type; #ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op) #else - scalar_product_op() { - EIGEN_SCALAR_BINARY_OP_PLUGIN - } + scalar_product_op() { EIGEN_SCALAR_BINARY_OP_PLUGIN } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const QInt32& a, const double& b) const { return a * b; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type + operator()(const QInt32& a, const double& b) const { + return a * b; + } - EIGEN_STRONG_INLINE const Packet8q32i packetOp(const Packet8q32i& a, const double& b) const { + EIGEN_STRONG_INLINE const Packet8q32i packetOp(const Packet8q32i& a, + const double& b) const { __m256d scale = _mm256_set1_pd(b); __m256d a_lo = _mm256_cvtepi32_pd(_mm256_castsi256_si128(a)); __m128i result_lo = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_lo)); __m256d a_hi = _mm256_cvtepi32_pd(_mm256_extracti128_si256(a, 1)); __m128i result_hi = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_hi)); - return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1); + return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, + 1); } }; diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h index 2092ce1d4c9..84750c1945a 100644 --- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h @@ -127,25 +127,25 @@ template <> struct unpacket_traits { typedef QInt8 type; typedef Packet32q8i half; - enum { size = 64, alignment=Aligned64 }; + enum { size = 64, alignment = Aligned64 }; }; template <> struct unpacket_traits { typedef QInt16 type; typedef Packet16q16i half; - enum { size = 32, alignment=Aligned64 }; + enum { size = 32, alignment = Aligned64 }; 
}; template <> struct unpacket_traits { typedef QUInt8 type; typedef Packet32q8u half; - enum { size = 64, alignment=Aligned64 }; + enum { size = 64, alignment = Aligned64 }; }; template <> struct unpacket_traits { typedef QInt32 type; typedef Packet8q32i half; - enum { size = 16, alignment=Aligned64 }; + enum { size = 16, alignment = Aligned64 }; }; // Unaligned load @@ -244,7 +244,7 @@ EIGEN_STRONG_INLINE QInt32 pfirst(const Packet16q32i& a) { template <> EIGEN_STRONG_INLINE QUInt8 pfirst(const Packet64q8u& a) { return static_cast( - _mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0)); + _mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0)); } template <> EIGEN_STRONG_INLINE QInt8 pfirst(const Packet64q8i& a) { @@ -410,9 +410,7 @@ EIGEN_STRONG_INLINE QInt32 predux_min(const Packet16q32i& a) { _mm_min_epi32(_mm_min_epi32(lane0, lane1), _mm_min_epi32(lane2, lane3)); res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); return pfirst( - _mm_min_epi32( - res, - _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); } template <> EIGEN_STRONG_INLINE QInt32 predux_max(const Packet16q32i& a) { @@ -424,9 +422,7 @@ EIGEN_STRONG_INLINE QInt32 predux_max(const Packet16q32i& a) { _mm_max_epi32(_mm_max_epi32(lane0, lane1), _mm_max_epi32(lane2, lane3)); res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); return pfirst( - _mm_max_epi32( - res, - _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); } template <> EIGEN_STRONG_INLINE QInt16 predux_min(const Packet32q16i& a) { @@ -437,13 +433,10 @@ EIGEN_STRONG_INLINE QInt16 predux_min(const Packet32q16i& a) { Packet4i res = _mm_min_epi16(_mm_min_epi16(lane0, lane1), _mm_min_epi16(lane2, lane3)); res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); - std::uint32_t w = - pfirst( - _mm_min_epi16(res, _mm_shuffle_epi32(res, 
_MM_SHUFFLE(0, 0, 0, 1)))); - return std::min({ - static_cast(w >> 16), - static_cast(w) - }); + std::uint32_t w = pfirst( + _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + return std::min( + {static_cast(w >> 16), static_cast(w)}); } template <> EIGEN_STRONG_INLINE QInt16 predux_max(const Packet32q16i& a) { @@ -454,13 +447,10 @@ EIGEN_STRONG_INLINE QInt16 predux_max(const Packet32q16i& a) { Packet4i res = _mm_max_epi16(_mm_max_epi16(lane0, lane1), _mm_max_epi16(lane2, lane3)); res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); - std::uint32_t w = - pfirst( - _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); - return std::max({ - static_cast(w >> 16), - static_cast(w) - }); + std::uint32_t w = pfirst( + _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + return std::max( + {static_cast(w >> 16), static_cast(w)}); } template <> EIGEN_STRONG_INLINE QUInt8 predux_min(const Packet64q8u& a) { @@ -471,15 +461,11 @@ EIGEN_STRONG_INLINE QUInt8 predux_min(const Packet64q8u& a) { Packet4i res = _mm_min_epu8(_mm_min_epu8(lane0, lane1), _mm_min_epu8(lane2, lane3)); res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); - std::uint32_t w = - pfirst( - _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); - return std::min({ - static_cast(w >> 24), - static_cast(w >> 16), - static_cast(w >> 8), - static_cast(w) - }); + std::uint32_t w = pfirst( + _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + return std::min( + {static_cast(w >> 24), static_cast(w >> 16), + static_cast(w >> 8), static_cast(w)}); } template <> EIGEN_STRONG_INLINE QUInt8 predux_max(const Packet64q8u& a) { @@ -490,15 +476,11 @@ EIGEN_STRONG_INLINE QUInt8 predux_max(const Packet64q8u& a) { Packet4i res = _mm_max_epu8(_mm_max_epu8(lane0, lane1), _mm_max_epu8(lane2, lane3)); res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); - std::uint32_t w = - 
pfirst( - _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); - return std::max({ - static_cast(w >> 24), - static_cast(w >> 16), - static_cast(w >> 8), - static_cast(w) - }); + std::uint32_t w = pfirst( + _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + return std::max( + {static_cast(w >> 24), static_cast(w >> 16), + static_cast(w >> 8), static_cast(w)}); } template <> EIGEN_STRONG_INLINE QInt8 predux_min(const Packet64q8i& a) { @@ -509,15 +491,11 @@ EIGEN_STRONG_INLINE QInt8 predux_min(const Packet64q8i& a) { Packet4i res = _mm_min_epi8(_mm_min_epi8(lane0, lane1), _mm_min_epi8(lane2, lane3)); res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); - std::uint32_t w = - pfirst( - _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); - return std::min({ - static_cast(w >> 24), - static_cast(w >> 16), - static_cast(w >> 8), - static_cast(w) - }); + std::uint32_t w = pfirst( + _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + return std::min( + {static_cast(w >> 24), static_cast(w >> 16), + static_cast(w >> 8), static_cast(w)}); } template <> EIGEN_STRONG_INLINE QInt8 predux_max(const Packet64q8i& a) { @@ -528,15 +506,11 @@ EIGEN_STRONG_INLINE QInt8 predux_max(const Packet64q8i& a) { Packet4i res = _mm_max_epi8(_mm_max_epi8(lane0, lane1), _mm_max_epi8(lane2, lane3)); res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2))); - std::uint32_t w = - pfirst( - _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); - return std::min({ - static_cast(w >> 24), - static_cast(w >> 16), - static_cast(w >> 8), - static_cast(w) - }); + std::uint32_t w = pfirst( + _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)))); + return std::min( + {static_cast(w >> 24), static_cast(w >> 16), + static_cast(w >> 8), static_cast(w)}); } } // end namespace internal diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h 
b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h index a09eac67070..d3b02402971 100644 --- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h +++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h @@ -33,28 +33,23 @@ struct type_casting_traits { }; template <> -EIGEN_STRONG_INLINE Packet32q16i -pcast(const Packet16f& a, const Packet16f& b) { +EIGEN_STRONG_INLINE Packet32q16i pcast(const Packet16f& a, + const Packet16f& b) { Packet16i a_int = _mm512_cvtps_epi32(a); Packet16i b_int = _mm512_cvtps_epi32(b); #ifdef EIGEN_VECTORIZE_AVX512BW return _mm512_packs_epi32(a_int, b_int); #else - Packet8i ab_int16_low = - _mm256_permute4x64_epi64( - _mm256_packs_epi32( - _mm512_castsi512_si256(a_int), - _mm512_castsi512_si256(b_int)), - _MM_SHUFFLE(0, 2, 1, 3)); - Packet8i ab_int16_high = - _mm256_permute4x64_epi64( - _mm256_packs_epi32( - _mm512_extracti32x8_epi32(a_int, 1), - _mm512_extracti32x8_epi32(b_int, 1)), - _MM_SHUFFLE(0, 2, 1, 3)); - return _mm512_inserti32x8( - _mm512_castsi256_si512(ab_int16_low), - ab_int16_high, 1); + Packet8i ab_int16_low = _mm256_permute4x64_epi64( + _mm256_packs_epi32(_mm512_castsi512_si256(a_int), + _mm512_castsi512_si256(b_int)), + _MM_SHUFFLE(0, 2, 1, 3)); + Packet8i ab_int16_high = _mm256_permute4x64_epi64( + _mm256_packs_epi32(_mm512_extracti32x8_epi32(a_int, 1), + _mm512_extracti32x8_epi32(b_int, 1)), + _MM_SHUFFLE(0, 2, 1, 3)); + return _mm512_inserti32x8(_mm512_castsi256_si512(ab_int16_low), ab_int16_high, + 1); #endif } @@ -64,55 +59,41 @@ struct type_casting_traits { }; template <> -EIGEN_STRONG_INLINE Packet64q8i -pcast(const Packet16f& a, - const Packet16f& b, - const Packet16f& c, - const Packet16f& d) { +EIGEN_STRONG_INLINE Packet64q8i pcast(const Packet16f& a, + const Packet16f& b, + const Packet16f& c, + const Packet16f& d) { Packet16i a_int = _mm512_cvtps_epi32(a); Packet16i b_int = _mm512_cvtps_epi32(b); Packet16i c_int = 
_mm512_cvtps_epi32(c); Packet16i d_int = _mm512_cvtps_epi32(d); #ifdef EIGEN_VECTORIZE_AVX512BW - return _mm512_packs_epi16( - _mm512_packs_epi32(a_int, b_int), - _mm512_packs_epi32(c_int, d_int)); + return _mm512_packs_epi16(_mm512_packs_epi32(a_int, b_int), + _mm512_packs_epi32(c_int, d_int)); #else - Packet8i ab_int16_low = - _mm256_permute4x64_epi64( - _mm256_packs_epi32( - _mm512_castsi512_si256(a_int), - _mm512_castsi512_si256(b_int)), - _MM_SHUFFLE(0, 2, 1, 3)); - Packet8i cd_int16_low = - _mm256_permute4x64_epi64( - _mm256_packs_epi32( - _mm512_castsi512_si256(c_int), - _mm512_castsi512_si256(d_int)), - _MM_SHUFFLE(0, 2, 1, 3)); - Packet8i ab_int16_high = - _mm256_permute4x64_epi64( - _mm256_packs_epi32( - _mm512_extracti32x8_epi32(a_int, 1), - _mm512_extracti32x8_epi32(b_int, 1)), - _MM_SHUFFLE(0, 2, 1, 3)); - Packet8i cd_int16_high = - _mm256_permute4x64_epi64( - _mm256_packs_epi32( - _mm512_extracti32x8_epi32(c_int, 1), - _mm512_extracti32x8_epi32(d_int, 1)), - _MM_SHUFFLE(0, 2, 1, 3)); - Packet8i abcd_int8_low = - _mm256_permute4x64_epi64( - _mm256_packs_epi16(ab_int16_low, cd_int16_low), - _MM_SHUFFLE(0, 2, 1, 3)); + Packet8i ab_int16_low = _mm256_permute4x64_epi64( + _mm256_packs_epi32(_mm512_castsi512_si256(a_int), + _mm512_castsi512_si256(b_int)), + _MM_SHUFFLE(0, 2, 1, 3)); + Packet8i cd_int16_low = _mm256_permute4x64_epi64( + _mm256_packs_epi32(_mm512_castsi512_si256(c_int), + _mm512_castsi512_si256(d_int)), + _MM_SHUFFLE(0, 2, 1, 3)); + Packet8i ab_int16_high = _mm256_permute4x64_epi64( + _mm256_packs_epi32(_mm512_extracti32x8_epi32(a_int, 1), + _mm512_extracti32x8_epi32(b_int, 1)), + _MM_SHUFFLE(0, 2, 1, 3)); + Packet8i cd_int16_high = _mm256_permute4x64_epi64( + _mm256_packs_epi32(_mm512_extracti32x8_epi32(c_int, 1), + _mm512_extracti32x8_epi32(d_int, 1)), + _MM_SHUFFLE(0, 2, 1, 3)); + Packet8i abcd_int8_low = _mm256_permute4x64_epi64( + _mm256_packs_epi16(ab_int16_low, cd_int16_low), _MM_SHUFFLE(0, 2, 1, 3)); Packet8i abcd_int8_high = - 
_mm256_permute4x64_epi64( - _mm256_packs_epi16(ab_int16_high, cd_int16_high), - _MM_SHUFFLE(0, 2, 1, 3)); - return _mm512_inserti32x8( - _mm512_castsi256_si512(abcd_int8_low), - abcd_int8_high, 1); + _mm256_permute4x64_epi64(_mm256_packs_epi16(ab_int16_high, cd_int16_high), + _MM_SHUFFLE(0, 2, 1, 3)); + return _mm512_inserti32x8(_mm512_castsi256_si512(abcd_int8_low), + abcd_int8_high, 1); #endif } @@ -128,10 +109,8 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet64q8i -pcast(const Packet16q32i& a, - const Packet16q32i& b, - const Packet16q32i& c, - const Packet16q32i& d) { +pcast(const Packet16q32i& a, const Packet16q32i& b, + const Packet16q32i& c, const Packet16q32i& d) { __m128i a_part = _mm512_cvtsepi32_epi8(a); __m128i b_part = _mm512_cvtsepi32_epi8(b); __m128i c_part = _mm512_cvtsepi32_epi8(c); @@ -145,9 +124,8 @@ pcast(const Packet16q32i& a, } template <> -EIGEN_STRONG_INLINE Packet32q16i -pcast(const Packet16q32i& a, - const Packet16q32i& b) { +EIGEN_STRONG_INLINE Packet32q16i pcast( + const Packet16q32i& a, const Packet16q32i& b) { __m256i a_part = _mm512_cvtsepi32_epi16(a); __m256i b_part = _mm512_cvtsepi32_epi16(b); __m512i converted = diff --git a/third_party/eigen_reshaped.patch b/third_party/eigen_reshaped.patch new file mode 100644 index 00000000000..7acfdcf9fef --- /dev/null +++ b/third_party/eigen_reshaped.patch @@ -0,0 +1,48 @@ +--- a/Eigen/src/Core/util/ReshapedHelper.h (date 1541195478000) ++++ b/Eigen/src/Core/util/ReshapedHelper.h (date 1541195478000) +@@ -39,6 +39,11 @@ + return total/other; + } + ++template ++struct get_compiletime_reshape_order { ++ enum { value = Order == AutoOrder ? 
Flags & RowMajorBit : Order }; ++}; ++ + } + + } // end namespace Eigen +--- a/Eigen/src/plugins/ReshapedMethods.h (date 1541195254000) ++++ b/Eigen/src/plugins/ReshapedMethods.h (date 1541195254000) +@@ -105,13 +105,13 @@ + inline Reshaped::value, + internal::get_compiletime_reshape_size::value, +- (Order==AutoOrder?Flags&RowMajorBit:Order)> ++ internal::get_compiletime_reshape_order::value> + reshaped(NRowsType nRows, NColsType nCols) EIGEN_RESHAPED_METHOD_CONST + { + return Reshaped::value, + internal::get_compiletime_reshape_size::value, +- (Order==AutoOrder?Flags&RowMajorBit:Order)> ++ internal::get_compiletime_reshape_order::value> + (derived(), + internal::get_runtime_reshape_size(nRows,internal::get_runtime_value(nCols),size()), + internal::get_runtime_reshape_size(nCols,internal::get_runtime_value(nRows),size())); +@@ -128,11 +128,13 @@ + + template + EIGEN_DEVICE_FUNC +-inline Reshaped ++inline Reshaped::value> + reshaped() EIGEN_RESHAPED_METHOD_CONST + { + EIGEN_STATIC_ASSERT(Order==RowMajor || Order==ColMajor || Order==AutoOrder, INVALID_TEMPLATE_PARAMETER); +- return Reshaped ++ return Reshaped::value> + (derived(), size(), 1); + } + \ No newline at end of file From 0113d430f978e87a4e57cf3a320121d2f3fc1e79 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 6 Nov 2018 15:58:19 -0800 Subject: [PATCH 218/540] Internal change. 
PiperOrigin-RevId: 220368991 --- tensorflow/contrib/distribute/python/values.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py index 42fb92014a0..2474886aeb9 100644 --- a/tensorflow/contrib/distribute/python/values.py +++ b/tensorflow/contrib/distribute/python/values.py @@ -776,6 +776,18 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase): def op(self): return self._primary_var.op + # pylint: disable=protected-access + @property + def _save_slice_info(self): + return self._primary_var._save_slice_info + + def _get_save_slice_info(self): + return self._primary_var._get_save_slice_info() + + def _set_save_slice_info(self, save_slice_info): + return self._primary_var._set_save_slice_info(save_slice_info) + # pylint: enable=protected-access + @property def _in_graph_mode(self): return self._primary_var._in_graph_mode # pylint: disable=protected-access From 860ac50033bbc2111d3c229f42fcf240da6fdb03 Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Tue, 6 Nov 2018 16:14:03 -0800 Subject: [PATCH 219/540] Enable arbitrary ops to work on the symbolic output of Keras layers. PiperOrigin-RevId: 220371736 --- tensorflow/python/eager/core.py | 11 ++++ tensorflow/python/eager/execute.py | 15 ++++- tensorflow/python/framework/ops.py | 38 +++++++++++- tensorflow/python/framework/python_op_gen.cc | 61 ++++++++++--------- .../python/keras/engine/base_layer_test.py | 60 ++++++++++++++++++ tensorflow/python/keras/engine/input_layer.py | 10 +++ 6 files changed, 162 insertions(+), 33 deletions(-) diff --git a/tensorflow/python/eager/core.py b/tensorflow/python/eager/core.py index 8fb69300209..e168b4bd5ff 100644 --- a/tensorflow/python/eager/core.py +++ b/tensorflow/python/eager/core.py @@ -60,4 +60,15 @@ class _FallbackException(Exception): pass +class _SymbolicException(Exception): + """Exception class to handle use of symbolic tensors when executing eagerly. 
+ + `keras.Input()` creates symbolic tensors (in a FuncGraph managed by the + Keras backend) while in eager execution. This exception is used to + identify this case (raised in `convert_to_tensor` cause generated functions + for ops to construct graphs instead of executing the kernel). + """ + pass + + pywrap_tensorflow.TFE_Py_RegisterFallbackExceptionClass(_FallbackException) diff --git a/tensorflow/python/eager/execute.py b/tensorflow/python/eager/execute.py index f9b8d2cb5db..6f8c780170c 100644 --- a/tensorflow/python/eager/execute.py +++ b/tensorflow/python/eager/execute.py @@ -64,6 +64,16 @@ def quick_execute(op_name, num_outputs, inputs, attrs, ctx, name=None): else: message = e.message six.raise_from(core._status_to_exception(e.code, message), None) + except TypeError as e: + if any(ops._is_keras_symbolic_tensor(x) for x in inputs): + if any(isinstance(x, ops.EagerTensor) for x in inputs): + raise TypeError("You are attempting to mix computation of symbolic " + "Tensors (computation rooted at tf.keras.Input()) " + "and concrete values. This is not supported. 
" + "If you need this support, file an issue on the " + "TensorFlow GitHub repository.") + raise core._SymbolicException + raise e # pylint: enable=protected-access return tensors @@ -188,7 +198,10 @@ def args_to_matching_eager(l, ctx, default_dtype=None): ret = [] for t in l: ret.append(internal_convert_to_tensor( - t, dtype, preferred_dtype=default_dtype, ctx=ctx)) + t, dtype, + preferred_dtype=default_dtype, + ctx=ctx, + accept_symbolic_tensors=False)) if dtype is None: dtype = ret[-1].dtype else: diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 14e4c0ca41a..36561a8c301 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -1068,7 +1068,8 @@ def internal_convert_to_tensor(value, name=None, as_ref=False, preferred_dtype=None, - ctx=None): + ctx=None, + accept_symbolic_tensors=True): """Converts the given `value` to an `Tensor`. This function converts Python objects of various types to `Tensor` @@ -1092,6 +1093,10 @@ def internal_convert_to_tensor(value, can be used as a soft preference. If the conversion to `preferred_dtype` is not possible, this argument has no effect. ctx: Optional: The value of context.context(). + accept_symbolic_tensors: Whether Keras graph tensors should be accepted as + a valid tensor type during eager execution. + If False, this function will raise an exception if it is passed such + a tensor during eager eager execution. Returns: A `Tensor` based on `value`. @@ -1115,6 +1120,19 @@ def internal_convert_to_tensor(value, raise RuntimeError("Attempting to capture an EagerTensor without " "building a function.") return graph.capture(value, name=name) + elif ((not accept_symbolic_tensors) and + isinstance(value, Tensor) and + ctx.executing_eagerly()): + # Found a symbolic tensor in an eager context. + # This happens when we use the Keras functional API (i.e. 
calling layers + # on the output of `keras.Input()`, which is symbolic) while eager + # execution is enabled. + if _is_keras_symbolic_tensor(value): + # If the graph of the tensor isn't the Keras graph, we should still + # fail, for the time being. TODO(fchollet): consider allowing + # all symbolic tensors to raise this exception in this case. + raise core._SymbolicException( # pylint: disable=protected-access + "Using the symbolic output of a Keras layer during eager execution.") if dtype is not None: dtype = dtypes.as_dtype(dtype) @@ -6000,6 +6018,13 @@ class name_scope(object): # pylint: disable=invalid-name self._values = values self._ctx = context.context() self._in_eager_mode = self._ctx.executing_eagerly() + self._has_symbolic_input_in_eager = False + if self._values and self._in_eager_mode: + # The presence of a graph tensor in `self._values` overrides the context. + for value in self._values: + if hasattr(value, "graph"): + self._has_symbolic_input_in_eager = True + self._name_scope = value.graph.name_scope(self._name) def __enter__(self): """Start the scope block. @@ -6011,6 +6036,9 @@ class name_scope(object): # pylint: disable=invalid-name ValueError: if neither `name` nor `default_name` is provided but `values` are. 
""" + if self._has_symbolic_input_in_eager: + return self._name_scope.__enter__() + if self._in_eager_mode: self._old_name = self._ctx.scope_name if not self._name: @@ -6053,7 +6081,9 @@ class name_scope(object): # pylint: disable=invalid-name raise def __exit__(self, type_arg, value_arg, traceback_arg): - if self._in_eager_mode: + if self._has_symbolic_input_in_eager: + self._name_scope.__exit__(type_arg, value_arg, traceback_arg) + elif self._in_eager_mode: self._ctx.scope_name = self._old_name else: self._name_scope.__exit__(type_arg, value_arg, traceback_arg) @@ -6213,4 +6243,8 @@ def _op_to_colocate_with(v): return internal_convert_to_tensor_or_indexed_slices(v, as_ref=True).op +def _is_keras_symbolic_tensor(x): + return hasattr(x, "graph") and getattr(x.graph, "name", None) == "keras_graph" + + register_tensor_conversion_function(Operation, _operation_conversion_error) diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc index 2022fbcbaad..b06cec4f2dd 100644 --- a/tensorflow/python/framework/python_op_gen.cc +++ b/tensorflow/python/framework/python_op_gen.cc @@ -355,15 +355,12 @@ string GenEagerPythonOp::Code() { } void GenEagerPythonOp::HandleGraphMode(const string& function_setup) { - // Handle graph-mode case - strings::StrAppend(&result_, - " _ctx = _context._context\n" - " if _ctx is None or not _ctx._eager_context.is_eager:\n", - function_setup, - " _, _, _op = _op_def_lib._apply_op_helper(\n"); + strings::StrAppend(&result_, " # Add nodes to the TensorFlow graph.\n"); + strings::StrAppend(&result_, function_setup, + " _, _, _op = _op_def_lib._apply_op_helper(\n"); AddBodyNoReturn(" "); if (num_outs_ > 0) { - strings::StrAppend(&result_, " _result = _op.outputs[:]\n"); + strings::StrAppend(&result_, " _result = _op.outputs[:]\n"); // Special case handling for stateful op with single list output // that might be empty. 
if (num_outs_ == 1 && op_def_.is_stateful() && @@ -372,10 +369,10 @@ void GenEagerPythonOp::HandleGraphMode(const string& function_setup) { // TODO(josh11b): Can skip this if the number_attr/type_list_attr has // a constraint indicating that this can never be empty. strings::StrAppend(&result_, - " if not _result:\n" - " return _op\n"); + " if not _result:\n" + " return _op\n"); } - strings::StrAppend(&result_, " _inputs_flat = _op.inputs\n"); + strings::StrAppend(&result_, " _inputs_flat = _op.inputs\n"); // Compute graph-mode attrs. if (op_def_.attr_size() > 0) { @@ -387,14 +384,13 @@ void GenEagerPythonOp::HandleGraphMode(const string& function_setup) { attr_name, "\")"); } strings::StrAppend(&attr_values, ")"); - strings::StrAppend(&result_, - WordWrap(" _attrs = (", attr_values, kRightMargin), - "\n"); + strings::StrAppend( + &result_, WordWrap(" _attrs = (", attr_values, kRightMargin), "\n"); } else { - strings::StrAppend(&result_, " _attrs = None\n"); + strings::StrAppend(&result_, " _attrs = None\n"); } } else { - strings::StrAppend(&result_, " return _op\n"); + strings::StrAppend(&result_, " return _op\n"); } } @@ -643,25 +639,26 @@ bool GenEagerPythonOp::AddEagerFastPathAndGraphCode( AddDocStringOutputs(); strings::StrAppend(&result_, " \"\"\"\n"); - // Handle graph-mode case - string function_setup; - if (!GetEagerFunctionSetup(" ", &function_setup)) { - result_ = function_setup; - return false; - } - HandleGraphMode(function_setup); - AddEagerFunctionTeardown(" ", output_sizes, - true /* execute_record_gradient */); - - // Handle eager-mode case - strings::StrAppend(&result_, " else:\n"); - + strings::StrAppend(&result_, + " _ctx = _context._context\n" + " if _ctx is not None and _ctx._eager_context.is_eager:", + "\n"); if (eager_not_allowed_error.empty()) { AddEagerFastPathExecute(); } else { strings::StrAppend(&result_, " ", eager_not_allowed_error); } + // Handle graph-mode case + string function_setup; + if (!GetEagerFunctionSetup(" ", 
&function_setup)) { + result_ = function_setup; + return false; + } + HandleGraphMode(function_setup); + AddEagerFunctionTeardown(" ", output_sizes, + true /* execute_record_gradient */); + strings::StrAppend(&result_, "\n\n"); return true; } @@ -750,12 +747,16 @@ void GenEagerPythonOp::AddEagerFastPathExecute() { if (!fallback_params.empty()) strings::StrAppend(&fallback_params, ", "); strings::StrAppend(&fallback_params, "ctx=_ctx"); strings::StrAppend(&result_, " ", "except _core._FallbackException:\n"); + strings::StrAppend(&result_, " try:\n"); strings::StrAppend( - &result_, " ", "return ", function_name_, kEagerFallbackSuffix, + &result_, " ", "return ", function_name_, kEagerFallbackSuffix, "(\n", - WordWrap(strings::StrCat(" "), + WordWrap(strings::StrCat(" "), strings::StrCat(fallback_params, ")"), kRightMargin), "\n"); + strings::StrAppend(&result_, " except _core._SymbolicException:\n"); + strings::StrAppend(&result_, + " pass # Add nodes to the TensorFlow graph.\n"); // Any errors thrown from execute need to be unwrapped from // _NotOkStatusException. diff --git a/tensorflow/python/keras/engine/base_layer_test.py b/tensorflow/python/keras/engine/base_layer_test.py index bda26dabcc6..704589349a8 100644 --- a/tensorflow/python/keras/engine/base_layer_test.py +++ b/tensorflow/python/keras/engine/base_layer_test.py @@ -121,6 +121,66 @@ class BaseLayerTest(test.TestCase): with self.assertRaisesRegexp(ValueError, 'You did something wrong!'): model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3))) + def test_using_symbolic_tensors_with_tf_ops(self): + # Single-input. + x = keras.Input((3,)) + y = math_ops.square(x) + self.assertEqual(y.graph, keras.backend.get_graph()) + + # Multi-inputs. + x1, x2 = keras.Input((3,)), keras.Input((3,)) + y = array_ops.concat([x1, x2], axis=1) + self.assertEqual(y.graph, keras.backend.get_graph()) + + # Mixing Keras symbolic tensors and graph tensors from the same graph works. 
+ with keras.backend.get_graph().as_default(): + x1 = keras.Input((3,)) + x2 = keras.Input((3,)) + y = math_ops.matmul(x1, x2) + self.assertEqual(y.graph, keras.backend.get_graph()) + + # Creating same op type (matmul) multiple times in the Keras graph works. + x1 = keras.Input((3,)) + x2 = keras.Input((3,)) + y = math_ops.matmul(x1, x2) + self.assertEqual(y.graph, keras.backend.get_graph()) + + def test_mixing_eager_and_graph_tensors(self): + with ops.Graph().as_default(): + x1 = array_ops.ones((3, 3)) + x2 = array_ops.ones((3, 3)) + self.assertTrue(isinstance(x2, ops.EagerTensor)) + with self.assertRaisesRegexp(TypeError, + 'provided list of inputs contains ' + 'objects other than \'EagerTensor\''): + math_ops.matmul(x1, x2) + + def test_mixing_numpy_arrays_and_graph_tensors(self): + with ops.Graph().as_default(): + x1 = array_ops.ones((3, 3)) + x2 = np.ones((3, 3), dtype='float32') + with self.assertRaisesRegexp(TypeError, + 'provided list of inputs contains ' + 'objects other than \'EagerTensor\''): + math_ops.matmul(x1, x2) + + def test_mixing_keras_symbolic_tensors_and_eager_tensors(self): + x1 = keras.Input((3,)) + x2 = array_ops.ones((3, 3)) + with self.assertRaisesRegexp( + TypeError, + 'mix computation of symbolic Tensors'): + math_ops.matmul(x1, x2) + + def test_mixing_keras_symbolic_tensors_and_numpy_arrays(self): + # For the time being we treat Numpy arrays as EagerTensors when mixing both. 
+ x1 = keras.Input((3,)) + x2 = np.ones((3, 3), dtype='float32') + with self.assertRaisesRegexp( + TypeError, + 'mix computation of symbolic Tensors'): + math_ops.matmul(x1, x2) + if __name__ == '__main__': ops.enable_eager_execution() diff --git a/tensorflow/python/keras/engine/input_layer.py b/tensorflow/python/keras/engine/input_layer.py index 6f5d1fa7cfb..4e96106004f 100644 --- a/tensorflow/python/keras/engine/input_layer.py +++ b/tensorflow/python/keras/engine/input_layer.py @@ -194,6 +194,16 @@ def Input( # pylint: disable=invalid-name model = Model(x, y) ``` + Note that even if eager execution is enabled, + `Input` produces a symbolic tensor (i.e. a placeholder). + This symbolic tensor can be used with other + TensorFlow ops, as such: + + ```python + x = Input(shape=(32,)) + y = tf.square(x) + ``` + Raises: ValueError: in case of invalid arguments. """ From 60aa63b60bc1a453c4adb6c946fcbf5e889682f1 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 6 Nov 2018 16:21:17 -0800 Subject: [PATCH 220/540] Tests for INT64 support in binary ops and 'pack' PiperOrigin-RevId: 220372969 --- tensorflow/lite/testing/generate_examples.py | 105 +++++++++++------- .../testing/generated_examples_zip_test.cc | 7 ++ 2 files changed, 73 insertions(+), 39 deletions(-) diff --git a/tensorflow/lite/testing/generate_examples.py b/tensorflow/lite/testing/generate_examples.py index 81b5ed80987..2b129df766a 100644 --- a/tensorflow/lite/testing/generate_examples.py +++ b/tensorflow/lite/testing/generate_examples.py @@ -780,38 +780,45 @@ def make_constant_tests(zip_path): def make_binary_op_tests(zip_path, binary_operator): """Make a set of tests to do binary ops with and without broadcast.""" - # These parameters are split because we don't support broadcasting. 
- test_parameters = [{ - "dtype": [tf.float32, tf.int32], - "input_shape_1": [[1, 3, 4, 3]], - "input_shape_2": [[1, 3, 4, 3]], - "activation": [True] - }, { - "dtype": [tf.float32], - "input_shape_1": [[5]], - "input_shape_2": [[5]], - "activation": [False, True] - }, { - "dtype": [tf.float32, tf.int32], - "input_shape_1": [[1, 3, 4, 3]], - "input_shape_2": [[3]], - "activation": [True, False] - }, { - "dtype": [tf.float32, tf.int32], - "input_shape_1": [[3]], - "input_shape_2": [[1, 3, 4, 3]], - "activation": [True, False] - }, { - "dtype": [tf.float32], - "input_shape_1": [[]], - "input_shape_2": [[]], - "activation": [False] - }, { - "dtype": [tf.float32], - "input_shape_1": [[0]], - "input_shape_2": [[1]], - "activation": [False] - }] + test_parameters = [ + # Avoid creating all combinations to keep the test size small. + { + "dtype": [tf.float32, tf.int32], + "input_shape_1": [[1, 3, 4, 3]], + "input_shape_2": [[1, 3, 4, 3]], + "activation": [True], + }, + { + "dtype": [tf.float32], + "input_shape_1": [[5]], + "input_shape_2": [[5]], + "activation": [False, True], + }, + { + "dtype": [tf.float32, tf.int32, tf.int64], + "input_shape_1": [[1, 3, 4, 3]], + "input_shape_2": [[3]], + "activation": [True, False], + }, + { + "dtype": [tf.float32, tf.int32], + "input_shape_1": [[3]], + "input_shape_2": [[1, 3, 4, 3]], + "activation": [True, False], + }, + { + "dtype": [tf.float32], + "input_shape_1": [[]], + "input_shape_2": [[]], + "activation": [False], + }, + { + "dtype": [tf.float32], + "input_shape_1": [[0]], + "input_shape_2": [[1]], + "activation": [False], + } + ] def build_graph(parameters): """Builds the graph given the current parameters.""" @@ -3242,12 +3249,30 @@ def make_sparse_to_dense_tests(zip_path): def make_pack_tests(zip_path): """Make a set of tests to do stack.""" - test_parameters = [{ - "base_shape": [[3, 4, 3], [3, 4], [5]], - "num_tensors": [1, 2, 3, 4, 5, 6], - "axis": [0, 1, 2, 3], - "additional_shape": [1, 2, 3], - }] + test_parameters = 
[ + # Avoid creating all combinations to keep the test size small. + { + "dtype": [tf.float32], + "base_shape": [[3, 4, 3], [3, 4], [5]], + "num_tensors": [1, 2, 3, 4, 5, 6], + "axis": [0, 1, 2, 3], + "additional_shape": [1, 2, 3], + }, + { + "dtype": [tf.int32], + "base_shape": [[3, 4, 3], [3, 4], [5]], + "num_tensors": [6], + "axis": [0, 1, 2, 3], + "additional_shape": [1, 2, 3], + }, + { + "dtype": [tf.int64], + "base_shape": [[3, 4, 3], [3, 4], [5]], + "num_tensors": [5], + "axis": [0, 1, 2, 3], + "additional_shape": [1, 2, 3], + } + ] def get_shape(parameters): """Return a tweaked version of 'base_shape'.""" @@ -3261,7 +3286,9 @@ def make_pack_tests(zip_path): all_tensors = [] for n in range(0, parameters["num_tensors"]): input_tensor = tf.placeholder( - dtype=tf.float32, name=("input%d" % n), shape=get_shape(parameters)) + dtype=parameters["dtype"], + name=("input%d" % n), + shape=get_shape(parameters)) all_tensors.append(input_tensor) out = tf.stack(all_tensors, parameters["axis"]) return all_tensors, [out] diff --git a/tensorflow/lite/testing/generated_examples_zip_test.cc b/tensorflow/lite/testing/generated_examples_zip_test.cc index 49f7b527bb7..aedea52065f 100644 --- a/tensorflow/lite/testing/generated_examples_zip_test.cc +++ b/tensorflow/lite/testing/generated_examples_zip_test.cc @@ -94,6 +94,13 @@ std::map kBrokenTests = { {R"(^\/div.*activation=True.*dtype=tf\.int32)", "112968789"}, {R"(^\/floor_div.*activation=True.*dtype=tf\.int32)", "112968789"}, {R"(^\/floor_mod.*activation=True.*dtype=tf\.int32)", "112968789"}, + {R"(^\/floor_mod.*activation=True.*dtype=tf\.int64)", "112968789"}, + + {R"(^\/sub.*dtype=tf\.int64)", "119126484"}, + {R"(^\/div.*dtype=tf\.int64)", "119126484"}, + {R"(^\/mul.*dtype=tf\.int64)", "119126484"}, + {R"(^\/add.*dtype=tf\.int64)", "119126484"}, + {R"(^\/floor_div.*dtype=tf\.int64)", "119126484"}, }; // Allows test data to be unarchived into a temporary directory and makes From 9abfbf996300eb25cd07c81ffd8f74953ef98d57 Mon 
Sep 17 00:00:00 2001 From: Peter Hawkins Date: Tue, 6 Nov 2018 16:24:58 -0800 Subject: [PATCH 221/540] [TF:XLA] Change XlaCompilationCache to accept XlaCompiler::Arguments in most cases rather than parsing an OpKernelContext. Build the compilation cache signature from the XlaCompiler::Arguments, rather than building it inside the cache. Add a helper method XlaComputationLaunchContext::BuildXlaCompilerArguments that builds the necessary XlaCompiler::Arguments. Fix a bug in XlaCompilationCache::Signature::operator==, where only the contents of a tensor were compared for equality, not the shapes. Remove code that reasons about OpKernelContexts in XlaCompiler::CompileSingleOp, instead passing the specific information required. PiperOrigin-RevId: 220373606 --- tensorflow/compiler/jit/BUILD | 17 +- tensorflow/compiler/jit/kernels/xla_ops.cc | 6 +- .../compiler/jit/xla_compilation_cache.cc | 210 ++++++------------ .../compiler/jit/xla_compilation_cache.h | 73 +++--- .../jit/xla_compilation_cache_test.cc | 54 +++++ .../compiler/jit/xla_compile_on_demand_op.cc | 9 +- tensorflow/compiler/jit/xla_launch_util.cc | 56 +++++ tensorflow/compiler/jit/xla_launch_util.h | 14 ++ tensorflow/compiler/tf2xla/BUILD | 1 + tensorflow/compiler/tf2xla/xla_compiler.cc | 86 ++++--- tensorflow/compiler/tf2xla/xla_compiler.h | 19 +- tensorflow/compiler/tf2xla/xla_resource.cc | 13 ++ tensorflow/compiler/tf2xla/xla_resource.h | 2 + 13 files changed, 333 insertions(+), 227 deletions(-) create mode 100644 tensorflow/compiler/jit/xla_compilation_cache_test.cc diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index ba86f2e247d..6ef8dab6c94 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -21,7 +21,6 @@ package( ) load("//tensorflow:tensorflow.bzl", "cc_header_only_library") -load("//tensorflow:tensorflow.bzl", "tf_kernel_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") 
load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured") @@ -255,6 +254,7 @@ cc_library( "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:dump_graph", "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/core:core_cpu", @@ -265,6 +265,21 @@ cc_library( "//tensorflow/core:protos_all_cc", "//tensorflow/core/kernels:variable_ops", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + ], +) + +tf_cc_test( + name = "xla_compilation_cache_test", + srcs = [ + "xla_compilation_cache_test.cc", + ], + deps = [ + ":xla_compilation_cache", + "//tensorflow/compiler/tf2xla:common", + "//tensorflow/core:test", + "//tensorflow/core:test_main", ], ) diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc index 56b7909ffd3..055de7afcc5 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.cc +++ b/tensorflow/compiler/jit/kernels/xla_ops.cc @@ -286,8 +286,10 @@ static Status CompileToLocalExecutable( // rather than a one-element tuple. compile_options.always_return_tuple = false; - return cache->Compile(options, function, constant_args, *variables, ctx, - compile_options, + std::vector args; + TF_RETURN_IF_ERROR(XlaComputationLaunchContext::BuildXlaCompilerArguments( + constant_args, *variables, ctx, &args)); + return cache->Compile(options, function, args, compile_options, lazy ? XlaCompilationCache::CompileMode::kLazy : XlaCompilationCache::CompileMode::kStrict, kernel, executable); diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc index 158cd0a95e2..3df5479a55e 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.cc +++ b/tensorflow/compiler/jit/xla_compilation_cache.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include +#include "absl/strings/str_cat.h" #include "tensorflow/compiler/tf2xla/dump_graph.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" @@ -65,14 +66,14 @@ string XlaCompilationCache::DebugString() { // Compute a string signature which encodes the shapes of the // arguments in the supplied list. -string XlaCompilationCache::SignatureDebugString(const Signature& sig) { - string result = sig.name; - for (const auto& a : sig.arg_types) { +string XlaCompilationCache::Signature::HumanString() const { + string result = name; + for (const auto& a : arg_types) { absl::StrAppend(&result, ",", DataTypeString(a.first), a.second.DebugString()); } - for (const auto& v : sig.arg_values) { + for (const auto& v : arg_values) { absl::StrAppend(&result, "; ", v.DebugString()); } return result; @@ -84,7 +85,9 @@ bool XlaCompilationCache::Signature::operator==(const Signature& other) const { if (arg_values.size() != other.arg_values.size()) return false; for (int i = 0; i < arg_values.size(); ++i) { - if (arg_values[i].tensor_data() != other.arg_values[i].tensor_data()) { + if (arg_values[i].dtype() != other.arg_values[i].dtype() || + arg_values[i].shape() != other.arg_values[i].shape() || + arg_values[i].tensor_data() != other.arg_values[i].tensor_data()) { return false; } } @@ -108,96 +111,30 @@ uint64 XlaCompilationCache::Signature::Hash::operator()( return h; } -Status XlaCompilationCache::BuildSignature( - const NameAttrList& function, const std::map& constant_args, - const std::map& variable_args, OpKernelContext* ctx, - Signature* signature) { - signature->name = Canonicalize(function.name(), AttrSlice(&function.attr())); - signature->arg_values.reserve(constant_args.size()); - - signature->arg_types.reserve(ctx->num_inputs() - constant_args.size()); - - for (int i = 0; i < ctx->num_inputs(); ++i) { - if (constant_args.count(i) > 0) { - // Use the values of compile time constants in the signature. 
- signature->arg_values.push_back(constant_args.at(i)); - } else if (variable_args.count(i) > 0) { - const OptionalTensor& variable = variable_args.at(i); - if (variable.present) { - signature->arg_types.emplace_back(variable.value.dtype(), - variable.value.shape()); - } else { - signature->arg_types.emplace_back(DT_INVALID, TensorShape()); - } - } else { - signature->arg_types.emplace_back(ctx->input_dtype(i), - ctx->input(i).shape()); +xla::StatusOr +XlaCompilationCache::BuildSignature( + const NameAttrList& function, + absl::Span args) { + Signature signature; + signature.name = Canonicalize(function.name(), AttrSlice(&function.attr())); + for (const XlaCompiler::Argument& arg : args) { + switch (arg.kind) { + case XlaCompiler::Argument::kConstant: + signature.arg_values.push_back(arg.constant_value); + break; + case XlaCompiler::Argument::kParameter: + case XlaCompiler::Argument::kResource: + signature.arg_types.emplace_back(arg.type, arg.shape); + break; + default: + return errors::InvalidArgument( + "Unhandled argument kind in XlaCompilationCache: ", + arg.HumanString()); } } - return Status::OK(); + return std::move(signature); } -namespace { - -// Builds a XlaCompiler::Argument vector from the arguments to the XlaLaunch op. -Status BuildArguments(const std::map& constant_args, - const std::map& variable_args, - OpKernelContext* ctx, - std::vector* args) { - args->resize(ctx->num_inputs()); - - for (int64 input_num = 0; input_num < ctx->num_inputs(); ++input_num) { - XlaCompiler::Argument& arg = (*args)[input_num]; - if (constant_args.count(input_num) > 0) { - // Handles compile-time constants. - const Tensor& input = constant_args.at(input_num); - TF_RET_CHECK(input.dtype() != DT_RESOURCE); - arg.kind = XlaCompiler::Argument::kConstant; - arg.type = input.dtype(); - arg.shape = input.shape(); - arg.constant_value = input; - } else if (variable_args.count(input_num) == 0) { - // Handles the non-constant arguments. 
- const Tensor& input = ctx->input(input_num); - TF_RET_CHECK(input.dtype() != DT_RESOURCE); - if (input.NumElements() > 0) { - arg.kind = XlaCompiler::Argument::kParameter; - } else { - arg.kind = XlaCompiler::Argument::kConstant; - arg.constant_value = input; - } - arg.type = input.dtype(); - arg.shape = input.shape(); - } else { - // Handles resource variables. - const Tensor& input = ctx->input(input_num); - TF_RET_CHECK(input.dtype() == DT_RESOURCE); - const OptionalTensor& variable = variable_args.at(input_num); - arg.name = variable.name; - arg.kind = XlaCompiler::Argument::kResource; - arg.resource_kind = XlaResource::kVariable; - if (variable.present) { - const Tensor& value = variable.value; - arg.type = value.dtype(); - arg.shape = value.shape(); - arg.initialized = true; - } else { - // The values of uninitialized variables are not passed as inputs, since - // they are meaningless. However, it is legal to assign to a resource - // variable for the first time inside the XLA computation, so we do - // permit uninitialized variables. 
- arg.initialized = false; - arg.type = DT_INVALID; - arg.shape = TensorShape(); - } - } - } - - return Status::OK(); -} - -} // namespace - Status XlaCompilationCache::BuildExecutable( const XlaCompiler::Options& options, const XlaCompiler::CompilationResult& result, @@ -227,8 +164,7 @@ Status XlaCompilationCache::BuildExecutable( Status XlaCompilationCache::Compile( const XlaCompiler::Options& options, const NameAttrList& function, - const std::map& constant_args, - const std::map& variable_args, OpKernelContext* ctx, + absl::Span args, const XlaCompiler::CompileOptions& compile_options, CompileMode compile_mode, const XlaCompiler::CompilationResult** out_compilation_result, @@ -237,9 +173,11 @@ Status XlaCompilationCache::Compile( if (compile_mode == CompileMode::kLazy) { compile_threshold = kDefaultCompilationThreshold; } - - return CompileImpl(options, function, constant_args, variable_args, ctx, - compile_options, /*compile_single_op=*/false, + auto compile_fn = [&](XlaCompiler* compiler, + XlaCompiler::CompilationResult* result) { + return compiler->CompileFunction(compile_options, function, args, result); + }; + return CompileImpl(options, function, args, compile_fn, /*compile_threshold=*/compile_threshold, out_compilation_result, out_executable); } @@ -257,8 +195,7 @@ static bool IsMegamorphic(int64 compile_count, int64 execution_count) { Status XlaCompilationCache::CompileSingleOp( const XlaCompiler::Options& options, - const std::map& constant_args, - const std::map& variable_args, OpKernelContext* ctx, + absl::Span args, OpKernelContext* ctx, const XlaCompiler::CompileOptions& compile_options, const XlaCompiler::CompilationResult** out_compilation_result, xla::LocalExecutable** out_executable) { @@ -266,17 +203,25 @@ Status XlaCompilationCache::CompileSingleOp( NameAttrList name; name.set_name(def.op()); *name.mutable_attr() = def.attr(); - return CompileImpl( - options, name, constant_args, variable_args, ctx, compile_options, - 
/*compile_single_op=*/true, /*compile_threshold=*/absl::nullopt, - out_compilation_result, out_executable); + auto compile_op = [&](XlaCompiler* compiler, + XlaCompiler::CompilationResult* result) { + std::vector result_dtypes(ctx->num_outputs()); + for (int i = 0; i < result_dtypes.size(); ++i) { + result_dtypes[i] = ctx->expected_output_dtype(i); + } + return compiler->CompileSingleOp(compile_options, ctx->op_kernel().def(), + args, result_dtypes, result); + }; + return CompileImpl(options, name, args, compile_op, + /*compile_threshold=*/absl::nullopt, + out_compilation_result, out_executable); } Status XlaCompilationCache::CompileImpl( const XlaCompiler::Options& options, const NameAttrList& function, - const std::map& constant_args, - const std::map& variable_args, OpKernelContext* ctx, - const XlaCompiler::CompileOptions& compile_options, bool compile_single_op, + absl::Span args, + const std::function& compile_fn, absl::optional compile_threshold, const XlaCompiler::CompilationResult** out_compilation_result, xla::LocalExecutable** out_executable) { @@ -284,36 +229,15 @@ Status XlaCompilationCache::CompileImpl( VLOG(2) << "XlaCompilationCache::Compile " << DebugString(); if (VLOG_IS_ON(2)) { - VLOG(2) << "num_inputs=" << ctx->num_inputs() - << " num_constant_args=" << constant_args.size() - << " num_variable_args=" << variable_args.size(); - for (int i = 0; i < ctx->num_inputs(); i++) { - TensorShape shape = ctx->input(i).shape(); - VLOG(2) << i << ": dtype=" << DataTypeString(ctx->input_dtype(i)) - << " present=" << ctx->has_input(i) - << " shape=" << shape.DebugString(); - } - for (auto& iterator : variable_args) { - const OptionalTensor& variable = iterator.second; - VLOG(2) << "variable present=" << variable.present - << " type=" << DataTypeString(variable.value.dtype()) - << " shape=" << variable.value.shape().DebugString() - << " TF arg= " << iterator.first; - } - VLOG(2) << "num_outputs = " << ctx->num_outputs(); - for (int i = 0; i < 
ctx->num_outputs(); i++) { - VLOG(2) << i << ": dtype=" << ctx->expected_output_dtype(i); + VLOG(2) << "num_inputs=" << args.size(); + for (int i = 0; i < args.size(); i++) { + VLOG(2) << i << ": " << args[i].HumanString(); } } - TF_RET_CHECK(constant_args.size() + variable_args.size() <= - ctx->num_inputs()); + TF_ASSIGN_OR_RETURN(Signature signature, BuildSignature(function, args)); + VLOG(2) << "Signature: " << signature.HumanString(); - Signature signature; - TF_RETURN_IF_ERROR( - BuildSignature(function, constant_args, variable_args, ctx, &signature)); - - VLOG(2) << "Signature: " << SignatureDebugString(signature); // The outer lock protects the existence of the cache entry. It does not // protect the contents of the cache entry. Entry* entry; @@ -358,11 +282,11 @@ Status XlaCompilationCache::CompileImpl( // cache eviction. mutex_lock entry_lock(entry->mu); int64 current_request_count = ++entry->request_count; + VLOG(2) << "Compilation cache entry hit: " << entry->compiled + << " signature: " << signature.HumanString() << " with request count " + << current_request_count << " and compile threshold " + << compile_threshold.value_or(0); if (!entry->compiled) { - VLOG(2) << "Compilation cache miss for signature: " - << SignatureDebugString(signature) << " with request count " - << current_request_count << " and compile threshold " - << compile_threshold.value_or(0); const bool should_compile = [&] { if (!compile_threshold.has_value()) { // Lazy compilation is disabled. 
@@ -392,8 +316,7 @@ Status XlaCompilationCache::CompileImpl( }(); if (!should_compile) { - VLOG(2) << "Not compiling for signature: " - << SignatureDebugString(signature); + VLOG(2) << "Not compiling for signature: " << signature.HumanString(); *out_compilation_result = nullptr; *out_executable = nullptr; return Status::OK(); @@ -403,21 +326,12 @@ Status XlaCompilationCache::CompileImpl( const uint64 compile_start_us = env->NowMicros(); // Do the actual JIT compilation without holding the lock (it can take // a long time.) - std::vector args; - TF_RETURN_IF_ERROR( - BuildArguments(constant_args, variable_args, ctx, &args)); XlaCompiler compiler(options); entry->compiled = true; - if (compile_single_op) { - entry->compilation_status = - compiler.CompileSingleOp(compile_options, signature.name, ctx, args, - &entry->compilation_result); - } else { - entry->compilation_status = compiler.CompileFunction( - compile_options, function, args, &entry->compilation_result); - } + entry->compilation_status = + compile_fn(&compiler, &entry->compilation_result); TF_RETURN_IF_ERROR(entry->compilation_status); CHECK_EQ(entry->executable.get(), nullptr); entry->compilation_status = diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h index b2bf70462b6..846d0c963db 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.h +++ b/tensorflow/compiler/jit/xla_compilation_cache.h @@ -18,9 +18,11 @@ limitations under the License. 
#include "absl/container/flat_hash_map.h" #include "absl/types/optional.h" +#include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/framework/graph.pb.h" @@ -31,13 +33,6 @@ limitations under the License. namespace tensorflow { -// Struct that represents a possibly-absent Tensor. -struct OptionalTensor { - string name; // A descriptive name - bool present = false; // Is the tensor present? - Tensor value; // If present, what is the Tensor's value? -}; - // The XlaCompilationCache class caches the results of the XlaCompiler class, // which converts a Tensorflow graph into a compiled XLA compilation. // @@ -59,11 +54,7 @@ class XlaCompilationCache : public ResourceBase { // Compiles a function into a XlaCompiler::CompilationResult that can be used // to execute an XLA Computation. Compilation results are cached. // `function` is the name of a Tensorflow function to compile. - // `constant_args` is a map of tensorflow argument number to its constant - // value. - // `variable_args` is a snapshot of the current values of the - // resource variable arguments to `function`; uninitialized variables are - // represented by an absent OptionalTensor. + // `args` is a description of the arguments to the computation. // // `compile_mode` controls the behavior of the compilation cache on a cache // miss. If `compile_mode` is `kLazy` then, based on some profitability @@ -79,9 +70,7 @@ class XlaCompilationCache : public ResourceBase { // outputs. 
Status Compile(const XlaCompiler::Options& options, const NameAttrList& function, - const std::map& constant_args, - const std::map& variable_args, - OpKernelContext* ctx, + absl::Span args, const XlaCompiler::CompileOptions& compile_options, CompileMode compile_mode, const XlaCompiler::CompilationResult** out_compilation_result, @@ -91,8 +80,7 @@ class XlaCompilationCache : public ResourceBase { // XlaCompiler::CompileFunction. Status CompileSingleOp( const XlaCompiler::Options& options, - const std::map& constant_args, - const std::map& variable_args, OpKernelContext* ctx, + absl::Span args, OpKernelContext* ctx, const XlaCompiler::CompileOptions& compile_options, const XlaCompiler::CompilationResult** out_compilation_result, xla::LocalExecutable** out_executable); @@ -102,26 +90,6 @@ class XlaCompilationCache : public ResourceBase { string DebugString() override; - private: - // Common implementation of Compile and CompileSingleOp. - Status CompileImpl( - const XlaCompiler::Options& options, const NameAttrList& function, - const std::map& constant_args, - const std::map& variable_args, OpKernelContext* ctx, - const XlaCompiler::CompileOptions& compile_options, - bool compile_single_op, absl::optional compile_threshold, - const XlaCompiler::CompilationResult** out_compilation_result, - xla::LocalExecutable** out_executable); - - // Takes `result` which has been compiled from a Tensorflow subgraph to a - // XLA computation already, and generates an XLA LocalExecutable `executable`. - Status BuildExecutable(const XlaCompiler::Options& options, - const XlaCompiler::CompilationResult& result, - std::unique_ptr* executable); - - xla::LocalClient* const client_; - const DeviceType device_type_; - // Describes the types, shapes and any compile-time constant arguments // to a kernel. Key that uniquely identifies a compilation output. 
struct Signature { @@ -138,14 +106,35 @@ class XlaCompilationCache : public ResourceBase { struct Hash { uint64 operator()(const Signature& signature) const; }; + + // Returns a human-readable description of the signature. + string HumanString() const; }; - static string SignatureDebugString(const Signature& sig); // Builds the signature for a compilation. - Status BuildSignature(const NameAttrList& function, - const std::map& constant_args, - const std::map& variable_args, - OpKernelContext* ctx, Signature* signature); + static xla::StatusOr BuildSignature( + const NameAttrList& function, + absl::Span args); + + private: + // Common implementation of Compile and CompileSingleOp. + Status CompileImpl( + const XlaCompiler::Options& options, const NameAttrList& function, + absl::Span args, + const std::function& compile_fn, + absl::optional compile_threshold, + const XlaCompiler::CompilationResult** out_compilation_result, + xla::LocalExecutable** out_executable); + + // Takes `result` which has been compiled from a Tensorflow subgraph to a + // XLA computation already, and generates an XLA LocalExecutable `executable`. + Status BuildExecutable(const XlaCompiler::Options& options, + const XlaCompiler::CompilationResult& result, + std::unique_ptr* executable); + + xla::LocalClient* const client_; + const DeviceType device_type_; // The value associated with a cache entry. struct Entry { diff --git a/tensorflow/compiler/jit/xla_compilation_cache_test.cc b/tensorflow/compiler/jit/xla_compilation_cache_test.cc new file mode 100644 index 00000000000..018c7c219f4 --- /dev/null +++ b/tensorflow/compiler/jit/xla_compilation_cache_test.cc @@ -0,0 +1,54 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/jit/xla_compilation_cache.h" +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +TEST(XlaCompilationCacheTest, SignatureEquality) { + NameAttrList fn; + fn.set_name("afunction"); + std::vector args(1); + args[0].kind = XlaCompiler::Argument::kConstant; + args[0].type = DT_INT32; + args[0].shape = TensorShape({4, 0}); + args[0].constant_value = Tensor(DT_INT32, {4, 0}); + TF_ASSERT_OK_AND_ASSIGN(XlaCompilationCache::Signature s1, + XlaCompilationCache::BuildSignature(fn, args)); + + args[0].type = DT_FLOAT; + args[0].constant_value = Tensor(DT_FLOAT, {4, 0}); + TF_ASSERT_OK_AND_ASSIGN(XlaCompilationCache::Signature s2, + XlaCompilationCache::BuildSignature(fn, args)); + + args[0].shape = TensorShape({0, 4}); + args[0].constant_value = Tensor(DT_FLOAT, {0, 4}); + TF_ASSERT_OK_AND_ASSIGN(XlaCompilationCache::Signature s3, + XlaCompilationCache::BuildSignature(fn, args)); + + std::vector signatures = {s1, s2, s3}; + for (int i = 0; i < signatures.size(); ++i) { + for (int j = 0; j < signatures.size(); ++j) { + EXPECT_EQ(i == j, signatures[i] == signatures[j]) + << signatures[i].HumanString() << " " << signatures[j].HumanString(); + } + } +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc index 31cb32e3059..1fe612d43d1 100644 --- 
a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc +++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc @@ -187,8 +187,13 @@ Status XlaCompileOnDemandOp::Compile( compile_options.always_return_tuple = false; std::map variable_args = GetVariables(ctx); - return cache->CompileSingleOp(options, constant_arguments, variable_args, ctx, - compile_options, result, executable); + + std::vector args; + TF_RETURN_IF_ERROR(XlaComputationLaunchContext::BuildXlaCompilerArguments( + constant_arguments, variable_args, ctx, &args)); + + return cache->CompileSingleOp(options, args, ctx, compile_options, result, + executable); } void XlaCompileOnDemandOp::Compute(OpKernelContext* ctx) { diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index 504bf51f2ad..7fe21c370e6 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -438,4 +438,60 @@ Status XlaComputationLaunchContext::PopulateOutputs( return Status::OK(); } +Status XlaComputationLaunchContext::BuildXlaCompilerArguments( + const std::map& constant_args, + const std::map& variable_args, OpKernelContext* ctx, + std::vector* args) { + args->resize(ctx->num_inputs()); + + for (int64 input_num = 0; input_num < ctx->num_inputs(); ++input_num) { + XlaCompiler::Argument& arg = (*args)[input_num]; + if (constant_args.count(input_num) > 0) { + // Handles compile-time constants. + const Tensor& input = constant_args.at(input_num); + TF_RET_CHECK(input.dtype() != DT_RESOURCE); + arg.kind = XlaCompiler::Argument::kConstant; + arg.type = input.dtype(); + arg.shape = input.shape(); + arg.constant_value = input; + } else if (variable_args.count(input_num) == 0) { + // Handles the non-constant arguments. 
+ const Tensor& input = ctx->input(input_num); + TF_RET_CHECK(input.dtype() != DT_RESOURCE); + if (input.NumElements() > 0) { + arg.kind = XlaCompiler::Argument::kParameter; + } else { + arg.kind = XlaCompiler::Argument::kConstant; + arg.constant_value = input; + } + arg.type = input.dtype(); + arg.shape = input.shape(); + } else { + // Handles resource variables. + const Tensor& input = ctx->input(input_num); + TF_RET_CHECK(input.dtype() == DT_RESOURCE); + const OptionalTensor& variable = variable_args.at(input_num); + arg.name = variable.name; + arg.kind = XlaCompiler::Argument::kResource; + arg.resource_kind = XlaResource::kVariable; + if (variable.present) { + const Tensor& value = variable.value; + arg.type = value.dtype(); + arg.shape = value.shape(); + arg.initialized = true; + } else { + // The values of uninitialized variables are not passed as inputs, since + // they are meaningless. However, it is legal to assign to a resource + // variable for the first time inside the XLA computation, so we do + // permit uninitialized variables. + arg.initialized = false; + arg.type = DT_INVALID; + arg.shape = TensorShape(); + } + } + } + + return Status::OK(); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index ea4efa0722c..437db019a0e 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -35,6 +35,13 @@ limitations under the License. namespace tensorflow { class XlaAllocator; +// Struct that represents a possibly-absent Tensor. +struct OptionalTensor { + string name; // A descriptive name + bool present = false; // Is the tensor present? + Tensor value; // If present, what is the Tensor's value? +}; + // Takes a snapshot of the values of resource variable arguments, whose indices // are specified in `variable_indices` argument. 
We snapshot tensors that back // resource variables since concurrent updates may modify the shape, and it is @@ -139,6 +146,13 @@ class XlaComputationLaunchContext { bool allocate_xla_tensors, bool use_multiple_streams); + // Builds a XlaCompiler::Argument vector from the arguments to an XlaLaunch + // op. + static Status BuildXlaCompilerArguments( + const std::map& constant_args, + const std::map& variable_args, OpKernelContext* ctx, + std::vector* args); + // Add all inputs within `ctx` as XLA arguments (returned by arguments()). // `variables` is a map from TensorFlow argument number to resource variable. // diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index b710b38f402..57019b3bc8f 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -218,6 +218,7 @@ cc_library( "//tensorflow/core:stream_executor_no_cuda", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", ], alwayslink = 1, diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 214ca21e73f..e6d7710c244 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -49,7 +49,7 @@ namespace { // Checks that arguments `args` match types `types`. 
Status CheckSignature(const DataTypeVector& types, - const std::vector& args) { + absl::Span args) { if (args.size() != types.size()) { return errors::Internal("Compilation arguments have ", args.size(), " elements while function has ", types.size()); @@ -84,6 +84,39 @@ bool XlaCompiler::Argument::operator==( return constant_value.tensor_data() == other.constant_value.tensor_data(); } +string XlaCompiler::Argument::HumanString() const { + string common; + if (!name.empty()) { + common = absl::StrCat(" name=", name); + } + absl::StrAppend(&common, " type=", DataTypeString(type), + " shape=", shape.DebugString()); + switch (kind) { + case kInvalid: + return "invalid"; + case kConstant: + return absl::StrCat("kind=constant", common, + " value=", constant_value.DebugString()); + case kResource: { + string output = absl::StrCat("kind=resource", common, " resource_kind=", + XlaResource::KindToString(resource_kind), + " initialized=", initialized); + if (tensor_array_size >= 0) { + absl::StrAppend(&output, " tensor_array_size=", tensor_array_size); + } + if (!tensor_array_gradients.empty()) { + absl::StrAppend(&output, " tensor_array_gradients=", + absl::StrJoin(tensor_array_gradients, ",")); + } + return output; + } + case kParameter: + return absl::StrCat("kind=parameter", common); + case kToken: + return absl::StrCat("token", common); + } +} + XlaCompiler::XlaCompiler(XlaCompiler::Options options) : options_(options), initialization_status_(Status::OK()), @@ -177,15 +210,16 @@ std::unique_ptr XlaCompiler::GetGraph(const FunctionBody* fbody) { return graph; } -Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options, - const NameAttrList& function, - std::vector args, - XlaCompiler::CompilationResult* result) { +Status XlaCompiler::CompileFunction( + const XlaCompiler::CompileOptions& options, const NameAttrList& function, + absl::Span args, + XlaCompiler::CompilationResult* result) { const string function_id = Canonicalize(function.name(), 
AttrSlice(&function.attr())); VLOG(1) << "XlaCompiler::CompileFunction " << function_id; - auto it = cache_.find({function_id, args}); + const std::vector arg_vector(args.begin(), args.end()); + auto it = cache_.find({function_id, arg_vector}); if (it != cache_.end()) { *result = it->second; return Status::OK(); @@ -241,7 +275,7 @@ Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options, CompileGraph(options, function_id, std::move(graph), args, result)); VLOG(1) << "===================================================="; - cache_[{function_id, args}] = *result; + cache_[{function_id, arg_vector}] = *result; return Status::OK(); } @@ -658,46 +692,48 @@ Status XlaCompiler::BuildArguments( } Status XlaCompiler::CompileSingleOp( - const XlaCompiler::CompileOptions& options, string const& name, - OpKernelContext* ctx, const std::vector& args, - CompilationResult* result) { + const XlaCompiler::CompileOptions& options, const NodeDef& node_def, + absl::Span args, + absl::Span result_types, CompilationResult* result) { // TODO(b/74182462): We implement this by creating a new dummy Graph including // _Arg nodes, and let CompileGraph walk it. This could be optimized. std::unique_ptr graph(new Graph(OpRegistry::Global())); Status status; // First create the actual node we care about computing. - Node* main_node = graph->AddNode(ctx->op_kernel().def(), &status); + Node* main_node = graph->AddNode(node_def, &status); TF_RETURN_IF_ERROR(status); // Create dummy _Arg nodes. Link these to `node` and also via a control // dependency edge to the _SOURCE node. 
- for (int64 i = 0; i < ctx->num_inputs(); ++i) { + for (int64 i = 0; i < args.size(); ++i) { Node* node; - string name = absl::StrCat(ctx->op_kernel().name(), "_", i, "_arg"); - Status status = NodeBuilder(name, "_Arg") - .ControlInput(graph->source_node()) - .Attr("T", ctx->input_dtype(i)) - .Attr("index", i) - .Finalize(graph.get(), &node); + string arg_name = absl::StrCat("_arg", i); + Status status = + NodeBuilder(arg_name, "_Arg") + .ControlInput(graph->source_node()) + .Attr("T", args[i].kind == Argument::kResource ? DT_RESOURCE + : args[i].type) + .Attr("index", i) + .Finalize(graph.get(), &node); TF_RETURN_IF_ERROR(status); graph->AddEdge(node, 0, main_node, i); } // Similarly with return values, create dummy _Retval nodes fed by `node`. - for (int64 i = 0; i < ctx->num_outputs(); ++i) { + for (int64 i = 0; i < result_types.size(); ++i) { Node* node; - string name = absl::StrCat(ctx->op_kernel().name(), "_", i, "_retval"); - Status status = NodeBuilder(name, "_Retval") + string retval_name = absl::StrCat("_retval", i); + Status status = NodeBuilder(retval_name, "_Retval") .Input(main_node, i) - .Attr("T", ctx->expected_output_dtype(i)) + .Attr("T", result_types[i]) .Attr("index", i) .Finalize(graph.get(), &node); TF_RETURN_IF_ERROR(status); } FixupSourceAndSinkEdges(graph.get()); - return CompileGraph(options, name, std::move(graph), args, result); + return CompileGraph(options, node_def.name(), std::move(graph), args, result); } namespace { @@ -757,7 +793,7 @@ Status ValidateGraph(const Graph* graph, Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, string const& name, std::unique_ptr graph, - const std::vector& args, + absl::Span args, CompilationResult* result) { VLOG(1) << "Executing graph symbolically to populate XlaBuilder."; @@ -785,7 +821,7 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, &options_.shape_representation_fn); core::ScopedUnref context_unref(context); - std::vector 
real_args(args); + std::vector real_args(args.begin(), args.end()); int token_input_index = -1; std::unique_ptr token_output; if (options.add_token_input_output) { diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index 08e00f38409..f10cfbe0c65 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -18,6 +18,7 @@ limitations under the License. #include +#include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h" #include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" @@ -118,7 +119,7 @@ class XlaCompiler { // The type of the argument. If the argument is a resource, this // is the type of the variable's value, not DT_RESOURCE. - DataType type; + DataType type = DT_INVALID; // The shape of the argument. For: // * a parameter: the shape of the parameter. @@ -155,6 +156,9 @@ class XlaCompiler { std::set tensor_array_gradients; bool operator==(const Argument& other) const; + + // Returns a human-readable summary of the argument. + string HumanString() const; }; // Options pertaining to an individual call to CompileGraph() or @@ -315,22 +319,23 @@ class XlaCompiler { Status CompileFunction(const CompileOptions& options, const NameAttrList& fn_name_attrs, - std::vector args, CompilationResult* result); + absl::Span args, + CompilationResult* result); // Compiles a tensorflow::Graph into an xla::XlaComputation. // Similar to CompileFunction, but takes a Graph as input rather than a // function. Status CompileGraph(const CompileOptions& options, string const& name, std::unique_ptr graph, - const std::vector& args, + absl::Span args, CompilationResult* result); - // Compiles a single Op, given by an OpKernelContext, into an + // Compiles a single Op, given by `node_def`, into an // xla::XlaComputation. Similar to CompileFunction but takes a single Op as // input. 
- Status CompileSingleOp(const CompileOptions& options, string const& name, - OpKernelContext* ctx, - const std::vector& args, + Status CompileSingleOp(const CompileOptions& options, const NodeDef& node_def, + absl::Span args, + absl::Span result_types, CompilationResult* result); // Returns the shape of the XLA parameter for an argument 'arg'. diff --git a/tensorflow/compiler/tf2xla/xla_resource.cc b/tensorflow/compiler/tf2xla/xla_resource.cc index 63b09c8f02a..a322eb9015e 100644 --- a/tensorflow/compiler/tf2xla/xla_resource.cc +++ b/tensorflow/compiler/tf2xla/xla_resource.cc @@ -26,6 +26,19 @@ limitations under the License. namespace tensorflow { +/*static*/ absl::string_view XlaResource::KindToString(XlaResource::Kind kind) { + switch (kind) { + case XlaResource::kInvalid: + return "invalid"; + case XlaResource::kVariable: + return "variable"; + case XlaResource::kStack: + return "stack"; + case XlaResource::kTensorArray: + return "tensorarray"; + } +} + XlaResource::XlaResource(Kind kind, int arg_num, string name, DataType type, TensorShape shape, const xla::XlaOp& initial_value, int64 tensor_array_size, diff --git a/tensorflow/compiler/tf2xla/xla_resource.h b/tensorflow/compiler/tf2xla/xla_resource.h index aa9ce1b171f..857b9a928bb 100644 --- a/tensorflow/compiler/tf2xla/xla_resource.h +++ b/tensorflow/compiler/tf2xla/xla_resource.h @@ -18,6 +18,7 @@ limitations under the License. 
#include +#include "absl/strings/string_view.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -35,6 +36,7 @@ class XlaResource { kTensorArray, kStack, }; + static absl::string_view KindToString(Kind kind); XlaResource(Kind kind, int arg_num, string name, DataType type, TensorShape shape, const xla::XlaOp& initial_value, From dcccc7cd1cbea00844003c50775774829c858b66 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Tue, 6 Nov 2018 16:25:18 -0800 Subject: [PATCH 222/540] [TF:XLA:GPU] Register one XLA_GPU device per device actually present, rather than unconditionally registering one. Also set a GpuDeviceContext, since the commented-out code works fine. PiperOrigin-RevId: 220373651 --- tensorflow/compiler/jit/xla_gpu_device.cc | 33 ++++++++++++----------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc index 6e2ea900453..44197016958 100644 --- a/tensorflow/compiler/jit/xla_gpu_device.cc +++ b/tensorflow/compiler/jit/xla_gpu_device.cc @@ -53,24 +53,25 @@ Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& session_options, return Status::OK(); } - XlaDevice::Options options; - options.platform = platform.ValueOrDie(); - options.device_name_prefix = name_prefix; - options.device_name = DEVICE_XLA_GPU; - options.device_ordinal = 0; - options.compilation_device_name = DEVICE_GPU_XLA_JIT; - options.use_multiple_streams = true; - auto device = absl::make_unique(session_options, options); + for (int i = 0; i < platform.ValueOrDie()->VisibleDeviceCount(); ++i) { + XlaDevice::Options options; + options.platform = platform.ValueOrDie(); + options.device_name_prefix = name_prefix; + options.device_name = DEVICE_XLA_GPU; + options.device_ordinal = i; + options.compilation_device_name = DEVICE_GPU_XLA_JIT; + options.use_multiple_streams = true; + auto device 
= absl::make_unique(session_options, options); - // TODO(b/78468222): Uncomment after fixing this bug - // status = device->UseGpuDeviceInfo(); - // if (!status.ok()) { - // errors::AppendToMessage(&status, "while setting up ", DEVICE_GPU_XLA_JIT, - // " device"); - // return status; - // } + Status status = device->UseGpuDeviceInfo(); + if (!status.ok()) { + errors::AppendToMessage(&status, "while setting up ", DEVICE_GPU_XLA_JIT, + " device number ", i); + return status; + } - devices->push_back(device.release()); + devices->push_back(device.release()); + } return Status::OK(); } From 85c2885a661c2b2b206e6c10de63c63d45d4b628 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 6 Nov 2018 16:55:00 -0800 Subject: [PATCH 223/540] Automated rollback of commit 0130d4a67a2bf933710a0e575ad9713f1862a8f2 PiperOrigin-RevId: 220378109 --- tensorflow/core/framework/op_kernel.cc | 2 +- tensorflow/core/framework/op_kernel.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc index 073dfb3e7d9..5f08c130871 100644 --- a/tensorflow/core/framework/op_kernel.cc +++ b/tensorflow/core/framework/op_kernel.cc @@ -1232,7 +1232,7 @@ Status CreateOpKernel(DeviceType device_type, DeviceBase* device, OpKernelConstruction context( device_type, device, allocator, &node_def, op_def, flib, inputs, input_memory_types, outputs, output_memory_types, graph_def_version, &s); - *kernel = (registration->factory)(&context); + *kernel = (*registration->factory)(&context); if (!s.ok()) { delete *kernel; *kernel = nullptr; diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h index 52d7d14e218..165115aab32 100644 --- a/tensorflow/core/framework/op_kernel.h +++ b/tensorflow/core/framework/op_kernel.h @@ -1348,7 +1348,7 @@ namespace kernel_factory { class OpKernelRegistrar { public: - typedef std::function Factory; + typedef OpKernel* 
(*Factory)(OpKernelConstruction*); OpKernelRegistrar(const KernelDef* kernel_def, StringPiece kernel_class_name, Factory factory) { From 9180bafa25282148be3f333256756217e039f2ea Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Tue, 6 Nov 2018 17:18:53 -0800 Subject: [PATCH 224/540] [XLA] Account for subcomputations' individual instructions in xla_hlo_profile. Previously we'd say that the "cost" of an instruction with subcomputations (e.g. a while loop) was equal to the total cost of its instructions. Fine (I mean, not actually fine, a loop can run multiple times, but at least it's something), but we also profile *the inside* of a subcomputation, and we weren't saving any information about *those* instructions. PiperOrigin-RevId: 220381684 --- .../compiler/xla/service/hlo_cost_analysis.cc | 48 +++++++++++-------- .../compiler/xla/service/hlo_cost_analysis.h | 19 +++++++- .../service/human_readable_profile_builder.cc | 10 ++-- .../xla/tests/xla_hlo_profile_test.cc | 18 +++---- 4 files changed, 62 insertions(+), 33 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index 108aeea097d..3a7b432e500 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -269,7 +269,7 @@ Status HloCostAnalysis::HandleOutfeed(const HloInstruction*) { Status HloCostAnalysis::HandleMap(const HloInstruction* map) { // Compute properties of the mapped function. TF_ASSIGN_OR_RETURN(const Properties sub_properties, - ProcessSubcomputation(map->to_apply())); + ProcessNestedSubcomputation(map->to_apply())); // Compute the cost of all elements for this Map operation. const int64 element_count = ShapeUtil::ElementsIn(map->shape()); @@ -285,7 +285,7 @@ Status HloCostAnalysis::HandleReduce(const HloInstruction* reduce) { HloComputation* function = reduce->to_apply(); // Compute the cost of the user function. 
TF_ASSIGN_OR_RETURN(const Properties sub_properties, - ProcessSubcomputation(function)); + ProcessNestedSubcomputation(function)); // Compute the cost of all elements for this Reduce operation. // This counts the number of times the reduction function is applied, so it @@ -311,7 +311,7 @@ Status HloCostAnalysis::HandleReduceWindow( auto function = reduce_window->to_apply(); // Compute the properties of the reduction function. TF_ASSIGN_OR_RETURN(const Properties sub_properties, - ProcessSubcomputation(function)); + ProcessNestedSubcomputation(function)); // Compute the cost of all elements for this ReduceWindow operation. For each // output element there are window_size - 1 reductions to perform. @@ -336,9 +336,9 @@ Status HloCostAnalysis::HandleSelectAndScatter( // Compute the properties of the select and scatter function. // Compute the properties of the reduction function. TF_ASSIGN_OR_RETURN(const Properties select_properties, - ProcessSubcomputation(instruction->select())); + ProcessNestedSubcomputation(instruction->select())); TF_ASSIGN_OR_RETURN(const Properties scatter_properties, - ProcessSubcomputation(instruction->scatter())); + ProcessNestedSubcomputation(instruction->scatter())); // Compute the cost of all elements for this operation. For each scatter // source element there are window_size - 1 select computations to perform and @@ -574,7 +574,7 @@ Status HloCostAnalysis::HandleRng(const HloInstruction* random) { Status HloCostAnalysis::HandleFusion(const HloInstruction* fusion) { TF_ASSIGN_OR_RETURN( current_properties_, - ProcessSubcomputation(fusion->fused_instructions_computation())); + ProcessNestedSubcomputation(fusion->fused_instructions_computation())); // Fusion nodes that produce a tuple also produce the entries in the tuple. 
// Ignore the memory accessed inside fused ops, since fusion is supposed to @@ -595,7 +595,7 @@ Status HloCostAnalysis::HandleFusion(const HloInstruction* fusion) { Status HloCostAnalysis::HandleCall(const HloInstruction* call) { TF_ASSIGN_OR_RETURN(current_properties_, - ProcessSubcomputation(call->to_apply())); + ProcessUnnestedSubcomputation(call->to_apply())); current_should_compute_bottleneck_time_ = false; return Status::OK(); } @@ -624,13 +624,12 @@ Status HloCostAnalysis::HandleWhile(const HloInstruction* xla_while) { // Since the number of iterations of the while node will not always be // something that we can statically analyze, we cannot precisely compute the // cost of a while node. For now compute the cost of a single iteration. - // - // TODO(b/26346211): Improve the cost analysis for while nodes. TF_ASSIGN_OR_RETURN(const Properties body_properties, - ProcessSubcomputation(xla_while->while_body())); + ProcessUnnestedSubcomputation(xla_while->while_body())); - TF_ASSIGN_OR_RETURN(const Properties condition_properties, - ProcessSubcomputation(xla_while->while_condition())); + TF_ASSIGN_OR_RETURN( + const Properties condition_properties, + ProcessUnnestedSubcomputation(xla_while->while_condition())); current_properties_.clear(); for (const auto& property : body_properties) { @@ -647,10 +646,12 @@ Status HloCostAnalysis::HandleWhile(const HloInstruction* xla_while) { Status HloCostAnalysis::HandleConditional(const HloInstruction* conditional) { // Compute the cost of the true and false computations and take the maximum // from those for each property. 
- TF_ASSIGN_OR_RETURN(const Properties true_computation_properties, - ProcessSubcomputation(conditional->true_computation())); - TF_ASSIGN_OR_RETURN(const Properties false_computation_properties, - ProcessSubcomputation(conditional->false_computation())); + TF_ASSIGN_OR_RETURN( + const Properties true_computation_properties, + ProcessUnnestedSubcomputation(conditional->true_computation())); + TF_ASSIGN_OR_RETURN( + const Properties false_computation_properties, + ProcessUnnestedSubcomputation(conditional->false_computation())); current_properties_ = true_computation_properties; for (const auto& property : false_computation_properties) { if (!tensorflow::gtl::InsertIfNotPresent(¤t_properties_, property)) { @@ -680,7 +681,7 @@ Status HloCostAnalysis::HandleScatter(const HloInstruction* scatter) { const int64 element_count = ShapeUtil::ElementsIn(scatter->operand(2)->shape()); TF_ASSIGN_OR_RETURN(const Properties sub_properties, - ProcessSubcomputation(scatter->to_apply())); + ProcessNestedSubcomputation(scatter->to_apply())); for (const auto& property : sub_properties) { if (property.first != kBytesAccessedKey) { current_properties_[property.first] = property.second * element_count; @@ -725,11 +726,20 @@ float HloCostAnalysis::optimal_seconds(const HloInstruction& hlo) const { return GetPropertyForHlo(hlo, kOptimalSecondsKey, hlo_properties_); } -StatusOr HloCostAnalysis::ProcessSubcomputation( - HloComputation* computation) { +StatusOr +HloCostAnalysis::ProcessNestedSubcomputation(HloComputation* computation) { HloCostAnalysis visitor(shape_size_, per_second_rates_); TF_RETURN_IF_ERROR(computation->Accept(&visitor)); return visitor.properties(); } +StatusOr +HloCostAnalysis::ProcessUnnestedSubcomputation(HloComputation* computation) { + HloCostAnalysis visitor(shape_size_, per_second_rates_); + TF_RETURN_IF_ERROR(computation->Accept(&visitor)); + hlo_properties_.insert(visitor.hlo_properties_.begin(), + visitor.hlo_properties_.end()); + return visitor.properties(); 
+} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h index 46b4bbeef22..19d0af1de1c 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h @@ -153,7 +153,24 @@ class HloCostAnalysis : public ConstDfsHloVisitor { // Returns the properties computed from visiting the computation rooted at the // given hlo. - StatusOr ProcessSubcomputation(HloComputation* computation); + // + // The difference between ProcessNestedSubcomputation and + // ProcessUnnestedSubcomputation is that we expect to get profile results for + // an unnested subcomputation's individual instructions, while we expect that + // a nested subcomputation is completely subsumed by its parent. + // + // For example, subcomputations inside kFusion and kMap are considered nested, + // while subcomputations inside kWhile and kConditional are considered + // unnested. + // + // Another way of thinking of this is, kFusion is implemented on the GPU + // backend using just one GPU kernel, while kWhile's body is implemented as a + // sequence of kernels, one for each HLO therein. Backends don't necessarily + // need to follow this same implementation strategy, but we assume they do for + // the purposes of this platform-generic cost analysis. + StatusOr ProcessNestedSubcomputation(HloComputation* computation); + StatusOr ProcessUnnestedSubcomputation( + HloComputation* computation); // Utility function to handle all element-wise operations. 
Status HandleElementwiseOp(const HloInstruction* hlo_instruction); diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc index e103222b55f..9acd1cf70b4 100644 --- a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc +++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc @@ -90,10 +90,12 @@ string HumanReadableProfileBuilder::ToString() const { op.optimal_seconds < 0 ? "" : StrFormat("(%12.1f optimal)", op.optimal_seconds * 1e6), - op.flop_count <= 0 ? "" : HumanReadableNumFlops(op.flop_count, nsecs), - op.transcendental_count <= 0 - ? "" - : HumanReadableNumTranscendentalOps(op.transcendental_count, nsecs), + op.flop_count > 0 && nsecs > 0 + ? HumanReadableNumFlops(op.flop_count, nsecs) + : "", + op.transcendental_count > 0 && nsecs > 0 + ? HumanReadableNumTranscendentalOps(op.transcendental_count, nsecs) + : "", bytes_per_sec, bytes_per_cycle, op.name); }; diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc index 376559500ef..0f4ddf51f32 100644 --- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc +++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc @@ -91,8 +91,8 @@ Status ParseOneProfileOutputLine( string match_usecs = "([0-9.]+) usec"; string match_flops = "([^ ]*)"; string match_trops = "([^ ]*)"; - string match_bytes_per_sec = "([0-9.TGMKi]+)B/s"; - string match_bytes_per_cycle = "([0-9.TGMKi]+)B/cycle"; + string match_bytes_per_sec = "([0-9.TGMKi]*)(?:B/s)?"; + string match_bytes_per_cycle = "([0-9.TGMKi]*)(?:B/cycle)?"; // The underlined part is what we're trying to match with match_opcode: // @@ -307,6 +307,7 @@ XLA_TEST_F(HloProfileTest, ProfileWhileComputation) { string profile_output; ExecuteAndFetchProfile(&profile_output, client, computation, matrix_shape, matrix_shape); + SCOPED_TRACE(profile_output); std::vector profile_output_lines = 
absl::StrSplit(profile_output, '\n'); @@ -318,14 +319,13 @@ XLA_TEST_F(HloProfileTest, ProfileWhileComputation) { ASSERT_NE(while_body_profile_start, profile_output_lines.cend()); - auto while_body_profile_end = std::find_if( - while_body_profile_start, profile_output_lines.end(), - [](absl::string_view s) { - return absl::StartsWith(s, "********** microseconds report **********"); - }); + auto while_body_profile_end = + std::find_if(while_body_profile_start, profile_output_lines.end(), + [](absl::string_view s) { + return absl::StartsWith(s, "********** microseconds "); + }); - // We emit a blank line before the "********** microseconds report **********" - // line. + // We emit a blank line before the "microseconds report" line. while_body_profile_end--; ASSERT_NE(while_body_profile_end, profile_output_lines.end()); From 42b7d78e321bbf9fd62354c57ff519110d9729c2 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Tue, 6 Nov 2018 17:35:57 -0800 Subject: [PATCH 225/540] Don't auto-cluster control trigger This is a limitation of how we use deadness analysis. The right fix is to change how we use deadness analysis, but this CL prevents miscompiles in the meantime. PiperOrigin-RevId: 220383866 --- .../compiler/jit/mark_for_compilation_pass.cc | 15 +++++++++++++++ .../jit/mark_for_compilation_pass_test.cc | 12 ++++-------- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index dae6ca4ad24..6b8ec58619a 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -61,6 +61,10 @@ struct OperationFilter { // seeding behavior as TensorFlow's RNG (b/34749654). So we avoid // auto-clustering stateful RNG ops. bool allow_stateful_rng_ops; + + // TODO(b/118970344): Whether ControlTrigger ops are allowed. It is unsound + // to cluster ControlTrigger because of how we use deadness analysis. 
+ bool allow_control_trigger; }; bool IsStatefulRandomOp(absl::string_view op_name) { @@ -225,6 +229,9 @@ bool IsCompilableCall(const NodeDef& call_def, IsStatefulRandomOp(node->type_string())) { return false; } + if (!op_filter.allow_control_trigger && node->IsControlTrigger()) { + return false; + } if (!HasXLAKernel(*node, jit_device_type) && !IsCompilableCall(node->def(), jit_device_type, op_filter, depth + 1, lib_runtime)) { @@ -455,6 +462,9 @@ Status FindCompilationCandidates( op_filter.allow_stateful_rng_ops = (registration->autoclustering_policy == XlaOpRegistry::AutoclusteringPolicy::kAlways); + op_filter.allow_control_trigger = + (registration->autoclustering_policy == + XlaOpRegistry::AutoclusteringPolicy::kAlways); if (!HasXLAKernel(*node, jit_device_type) && !IsCompilableCall(node->def(), jit_device_type, op_filter, 0, @@ -469,6 +479,10 @@ Status FindCompilationCandidates( VLOG(2) << "Rejecting " << node->name() << ": stateful random operation"; continue; } + if (!op_filter.allow_control_trigger && node->IsControlTrigger()) { + VLOG(2) << "Rejecting " << node->name() << ": is a control trigger op"; + continue; + } if (!op_filter.allow_resource_ops && (HasResourceOutput(*node) || IsNonResourceVarResourceOp(*node))) { @@ -604,6 +618,7 @@ bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef) { OperationFilter op_filter; op_filter.allow_resource_ops = true; op_filter.allow_stateful_rng_ops = true; + op_filter.allow_control_trigger = true; return IsCompilableCall(ndef, jit_device_type, op_filter, 0, flr); } diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc index ef4f1ea2b06..82aa7473324 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc @@ -817,14 +817,10 @@ TEST(XlaCompilationTest, ClusterControlTrigger) { std::unordered_map clusters = GetClusters(*graph); - 
ASSERT_FALSE(clusters.empty()); - string cluster_name = clusters.begin()->second; - - // ctrl_trigger_a has inputs with mismatching deadness so it won't be - // clustered. ctrl_trigger_b is okay to cluster. - std::unordered_map expected_clusters( - {{"const_a", cluster_name}, {"ctrl_trigger_b", cluster_name}}); - EXPECT_EQ(clusters, expected_clusters); + // TODO(b/118970344): ctrl_trigger_a has inputs with mismatching deadness so + // it won't be clustered. ctrl_trigger_b is okay to cluster but we don't + // cluster it because of b/118970344. + EXPECT_TRUE(clusters.empty()); } TEST(XlaCompilationTest, RandomShape) { From d818065679c3dbd58ba1917d52e1f9d5fd94cc01 Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Tue, 6 Nov 2018 17:36:16 -0800 Subject: [PATCH 226/540] [XLA] Don't count ops which run "faster than optimal" in our "seconds above optimum" total. In other words, for the purposes of calculating totals, let the "true optimum" be the smaller of the estimated optimum and the actual runtime. PiperOrigin-RevId: 220383899 --- .../service/human_readable_profile_builder.cc | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc index 9acd1cf70b4..90904ac0011 100644 --- a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc +++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc @@ -99,13 +99,20 @@ string HumanReadableProfileBuilder::ToString() const { bytes_per_sec, bytes_per_cycle, op.name); }; - float optimal_seconds_sum = 0.0; + double optimal_seconds_sum = 0; int64 total_flops = 0.; int64 total_transcendentals = 0.; int64 total_bytes = 0; for (const auto& op : op_infos_) { if (op.optimal_seconds > 0) { - optimal_seconds_sum += op.optimal_seconds; + // An op can run faster than the estimated optimum. 
For example, we might + // estimate a fusion's speed by looking at the size of its operands and + // result, but perhaps the fusion doesn't read the entirety of all of its + // inputs. For the purposes of summing the instructions' optimal speeds, + // we treat the "optimum" as the smallest of either the estimated optimum + // and the actual speed. + optimal_seconds_sum += + std::min(double{op.optimal_seconds}, CyclesToSeconds(op.cycles)); } total_flops += std::max(op.flop_count, int64{0}); total_transcendentals += std::max(op.transcendental_count, int64{0}); @@ -116,7 +123,7 @@ string HumanReadableProfileBuilder::ToString() const { print_op({is_entry_computation_ ? "[total] [entry]" : "[total]", "[total]", /*category=*/"", total_cycles_, total_flops, total_transcendentals, - total_bytes, optimal_seconds_sum}, + total_bytes, static_cast(optimal_seconds_sum)}, /*is_total=*/true); // Sort ops in decreasing order of cycles, and print them. @@ -157,8 +164,10 @@ string HumanReadableProfileBuilder::ToString() const { entry.text = op.name; entry.short_text = op.short_name; entry.category_text = op.category; - entry.metric = - CyclesToMicroseconds(op.cycles) - op.optimal_seconds * 1e6; + // Ignore ops that run faster than the estimated optimal here, as we do + // when calculating optimal_seconds_sum. + entry.metric = std::max( + 0., CyclesToMicroseconds(op.cycles) - op.optimal_seconds * 1e6); total_discrepancy_in_microseconds += entry.metric; table.AddEntry(std::move(entry)); } From ee1263ea3a0d8cb0bccf824d78d25d6c9d545561 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 6 Nov 2018 18:10:47 -0800 Subject: [PATCH 227/540] Delete unused 'on_device()' method of `DistributedValues`. 
PiperOrigin-RevId: 220387985 --- tensorflow/contrib/distribute/python/values.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py index 2474886aeb9..257d2e76144 100644 --- a/tensorflow/contrib/distribute/python/values.py +++ b/tensorflow/contrib/distribute/python/values.py @@ -75,10 +75,6 @@ class DistributedValues(object): ValueError("Device %s not found in %s (current device %s)" % (device, self._index.keys(), device_util.current())), e) - def on_device(self, device): - device = device_util.canonicalize(device) - return device in self._index - @property def devices(self): return list(self._index.keys()) From 74cb85181583f541114c2f7352677d59903cd3a7 Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Tue, 6 Nov 2018 18:41:45 -0800 Subject: [PATCH 228/540] [XLA] Fix invalid assumption in HloComputation::CloneWithReplacements. CloneWithReplacements assumed that the `extras` instructions came before all other instructions in the postorder traversal. This is not true in general. In the course of fixing this, I ended up removing the `extras` parameter entirely, which makes the function easier to call. I also augmented the interface a bit to make CloneWithReplacements easier to call when you just have one or two replacements to make. I will use these overloads in a future patch. PiperOrigin-RevId: 220390779 --- .../compiler/xla/service/hlo_computation.cc | 90 ++++++++++++++++--- .../compiler/xla/service/hlo_computation.h | 34 +++++-- .../xla/service/while_loop_simplifier.cc | 5 +- 3 files changed, 108 insertions(+), 21 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc index 01ae6a55fcf..e2579217fdc 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.cc +++ b/tensorflow/compiler/xla/service/hlo_computation.cc @@ -33,6 +33,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -845,14 +846,46 @@ std::unique_ptr HloComputation::Clone( return CloneWithReplacements( /*replacements=*/std::unordered_map>(), - /*extras=*/{}, context, suffix); + context, suffix); +} + +std::unique_ptr HloComputation::CloneWithReplacementPairs( + std::pair> r1, + HloCloneContext* context, const string& suffix) { + std::unordered_map> + replacements; + replacements.emplace(std::move(r1)); + return CloneWithReplacements(std::move(replacements), context, suffix); +} + +std::unique_ptr HloComputation::CloneWithReplacementPairs( + std::pair> r1, + std::pair> r2, + HloCloneContext* context, const string& suffix) { + std::unordered_map> + replacements; + replacements.emplace(std::move(r1)); + replacements.emplace(std::move(r2)); + return CloneWithReplacements(std::move(replacements), context, suffix); +} + +std::unique_ptr HloComputation::CloneWithReplacementPairs( + std::pair> r1, + std::pair> r2, + std::pair> r3, + HloCloneContext* context, const string& suffix) { + std::unordered_map> + replacements; + replacements.emplace(std::move(r1)); + replacements.emplace(std::move(r2)); + replacements.emplace(std::move(r3)); + return CloneWithReplacements(std::move(replacements), context, suffix); } std::unique_ptr HloComputation::CloneWithReplacements( std::unordered_map> replacements, - absl::Span extras, HloCloneContext* context, - const string& suffix) { + HloCloneContext* context, const string& suffix) { std::unique_ptr context_ptr; if (context == nullptr) { context_ptr = absl::make_unique(parent(), suffix); @@ -873,18 +906,50 @@ std::unique_ptr HloComputation::CloneWithReplacements( 
}; VLOG(1) << "Cloning " << name() << " --> " << suffix << "\n"; + + // We want to do a postorder walk over [replace(i) for i in instructions_]. + // We can't reuse MakeInstructionPostOrder() for this, because that will + // generate a postorder of plain instructions_, and our replacements may + // change the postorder! + // + // The postorder we want here is simpler than what MakeInstructionPostOrder() + // does -- we only care about operand dependencies -- so let's just do it + // ourselves. std::vector postorder; - for (HloInstruction* instr : extras) { - postorder.push_back(instr); - } - for (HloInstruction* instr : MakeInstructionPostOrder()) { - if (HloInstruction* replacement = replace(instr)) { - postorder.push_back(replacement); + absl::flat_hash_map visited; + for (const auto& instr : instructions_) { + std::vector dfs_stack; + HloInstruction* new_instr = replace(instr.get()); + if (!new_instr) { + continue; + } + dfs_stack.push_back(new_instr); + + while (!dfs_stack.empty()) { + auto* cur = dfs_stack.back(); + auto it = visited.find(cur); + if (it != visited.end()) { + dfs_stack.pop_back(); + if (it->second == kVisited) { + continue; + } + CHECK_EQ(it->second, kVisiting); + postorder.push_back(cur); + it->second = kVisited; + continue; + } + + visited.insert({cur, kVisiting}); + for (HloInstruction* operand : cur->operands()) { + HloInstruction* new_operand = replace(operand); + if (new_operand) { + dfs_stack.emplace_back(new_operand); + } + } } } std::vector> instructions; - std::unique_ptr new_instr; for (auto instr : postorder) { std::vector new_operands; for (auto operand : instr->operands()) { @@ -894,9 +959,8 @@ std::unique_ptr HloComputation::CloneWithReplacements( << operand->ToString() << ", used by " << instr->ToString(); new_operands.push_back(context->GetInstruction(replaced_operand)); } - new_instr = - instr->CloneWithNewOperands(instr->shape(), new_operands, context); - instructions.push_back(std::move(new_instr)); + instructions.push_back( 
+ instr->CloneWithNewOperands(instr->shape(), new_operands, context)); } Builder builder(name() + "." + suffix); for (auto& instr : instructions) { diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h index 2cce866e5c1..fc7d2035e5b 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.h +++ b/tensorflow/compiler/xla/service/hlo_computation.h @@ -319,14 +319,38 @@ class HloComputation { // the map's value to replace that instruction in the cloned computation. // // If replacements maps a key to nullptr, we remove that instruction from the - // new computation. - // If additional instructions are used by instructions in replacement map, - // they must be passed in post-order in the extras span. + // new computation. If an element of `replacements` references an instruction + // that's not already in the computation, it's cloned and added to the new + // computation. + // + // All relevant instructions are cloned, *including* unique_ptr in the + // `replacements` map. std::unique_ptr CloneWithReplacements( std::unordered_map> replacements, - absl::Span extras, HloCloneContext* context = nullptr, - const string& suffix = "clone"); + HloCloneContext* context = nullptr, const string& suffix = "clone"); + + // Convenience overloads for CloneWithReplacements. You want to do + // + // CloneWithReplacements({{a, std::move(b)}, {c, std::move(d)}}) // ERROR + // + // but that doesn't work because std::initializer_list is not movable. 
These + // overloads let you do + // + // CloneWithReplacementPairs({a, std::move(b)}, {c, std::move(d)}); // OK + // + std::unique_ptr CloneWithReplacementPairs( + std::pair> r1, + HloCloneContext* context = nullptr, const string& suffix = "clone"); + std::unique_ptr CloneWithReplacementPairs( + std::pair> r1, + std::pair> r2, + HloCloneContext* context = nullptr, const string& suffix = "clone"); + std::unique_ptr CloneWithReplacementPairs( + std::pair> r1, + std::pair> r2, + std::pair> r3, + HloCloneContext* context = nullptr, const string& suffix = "clone"); // Returns true if the given instruction can be removed from the computation. // Parameter instructions cannot be removed without violating invariants of diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc index 630d71e5ca2..d039b818e21 100644 --- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc +++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc @@ -253,7 +253,7 @@ static StatusOr TryRemoveDeadWhileParams(HloInstruction* while_op) { // Create the new while condition, body, and init value. std::unique_ptr new_while_cond = while_cond->CloneWithReplacements( - make_while_computation_replacements(while_cond), /*extras=*/{}); + make_while_computation_replacements(while_cond)); std::unordered_map> while_body_replacements = make_while_computation_replacements(while_body); @@ -266,8 +266,7 @@ static StatusOr TryRemoveDeadWhileParams(HloInstruction* while_op) { while_body_replacements.emplace( while_body_root, HloInstruction::CreateTuple(new_while_body_root_elems)); std::unique_ptr new_while_body = - while_body->CloneWithReplacements(std::move(while_body_replacements), - /*extras=*/{}); + while_body->CloneWithReplacements(std::move(while_body_replacements)); // Add a new while_init instruction that repackages the old while_init // instruction's elements. 
We rely on the AlgebraicSimplifier and DCE to From c2516aa0663f015d021a958012b7bfff1bcd78cb Mon Sep 17 00:00:00 2001 From: Saurabh Saxena Date: Tue, 6 Nov 2018 19:54:42 -0800 Subject: [PATCH 229/540] Adds a max_num_elements arg to EmptyTensorList. This is required so that XLA can know maximum size to allocate for the list at compile time. PiperOrigin-RevId: 220396133 --- tensorflow/core/kernels/list_kernels.cc | 30 ++- tensorflow/core/kernels/list_kernels.h | 3 + .../core/ops/compat/ops_history.v1.pbtxt | 4 + tensorflow/core/ops/list_ops.cc | 1 + tensorflow/python/kernel_tests/BUILD | 1 + .../python/kernel_tests/list_ops_test.py | 231 ++++++++++++------ tensorflow/python/ops/list_ops.py | 18 +- 7 files changed, 212 insertions(+), 76 deletions(-) diff --git a/tensorflow/core/kernels/list_kernels.cc b/tensorflow/core/kernels/list_kernels.cc index 3d0c193d9fc..5f244b1b10f 100644 --- a/tensorflow/core/kernels/list_kernels.cc +++ b/tensorflow/core/kernels/list_kernels.cc @@ -42,7 +42,8 @@ typedef Eigen::ThreadPoolDevice CPUDevice; TensorList::TensorList(const TensorList& other) : tensors(other.tensors), element_shape(other.element_shape), - element_dtype(other.element_dtype) {} + element_dtype(other.element_dtype), + max_num_elements(other.max_num_elements) {} void TensorList::Encode(VariantTensorData* data) const { data->set_type_name(TypeName()); @@ -63,6 +64,7 @@ void TensorList::Encode(VariantTensorData* data) const { core::PutVarint64(&metadata, static_cast(i)); } core::PutVarint64(&metadata, static_cast(element_dtype)); + core::PutVarint64(&metadata, static_cast(max_num_elements)); TensorShapeProto element_shape_proto; element_shape.AsProto(&element_shape_proto); element_shape_proto.AppendToString(&metadata); @@ -74,6 +76,7 @@ static Status TensorListDeviceCopy( const UnaryVariantOpRegistry::AsyncTensorDeviceCopyFn& copy) { to->element_shape = from.element_shape; to->element_dtype = from.element_dtype; + to->max_num_elements = from.max_num_elements; 
to->tensors.reserve(from.tensors.size()); for (const Tensor& t : from.tensors) { Tensor tmp(t.dtype()); @@ -140,6 +143,8 @@ bool TensorList::Decode(const VariantTensorData& data) { core::GetVarint64(&iter, &scratch); element_dtype = static_cast(scratch); + core::GetVarint64(&iter, &scratch); + max_num_elements = static_cast(scratch); TensorShapeProto element_shape_proto; element_shape_proto.ParseFromString(string(iter.data(), iter.size())); element_shape = PartialTensorShape(element_shape_proto); @@ -175,12 +180,19 @@ class EmptyTensorList : public OpKernel { } void Compute(OpKernelContext* ctx) override { + const Tensor& max_num_elements_t = ctx->input(1); + OP_REQUIRES( + ctx, TensorShapeUtils::IsScalar(max_num_elements_t.shape()), + errors::InvalidArgument( + "max_num_elements expected to be a scalar ", + "but got shape: ", max_num_elements_t.shape().DebugString())); Tensor* result; AllocatorAttributes attr; attr.set_on_host(true); OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape{}, &result, attr)); TensorList empty; empty.element_dtype = element_dtype_; + empty.max_num_elements = max_num_elements_t.scalar()(); PartialTensorShape element_shape; OP_REQUIRES_OK(ctx, TensorShapeFromTensor(ctx->input(0), &element_shape)); empty.element_shape = element_shape; @@ -198,9 +210,11 @@ REGISTER_KERNEL_BUILDER(Name("EmptyTensorList").Device(DEVICE_CPU), #if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER( - Name("EmptyTensorList").Device(DEVICE_GPU).HostMemory("element_shape"), - EmptyTensorList); +REGISTER_KERNEL_BUILDER(Name("EmptyTensorList") + .Device(DEVICE_GPU) + .HostMemory("element_shape") + .HostMemory("max_num_elements"), + EmptyTensorList); #endif // GOOGLE_CUDA @@ -237,6 +251,14 @@ class TensorListPushBack : public OpKernel { " but list elements ", DataTypeString(l->element_dtype))); + if (l->max_num_elements != -1) { + OP_REQUIRES( + c, l->tensors.size() < l->max_num_elements, + errors::InvalidArgument("Tried to push item into a full list", + " list size: ", 
l->tensors.size(), + " max_num_elements: ", l->max_num_elements)); + } + TensorList output; output = *l; output.tensors.push_back(input); diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h index 12581b15b1c..c2591f53141 100644 --- a/tensorflow/core/kernels/list_kernels.h +++ b/tensorflow/core/kernels/list_kernels.h @@ -56,6 +56,9 @@ struct TensorList { std::vector tensors; PartialTensorShape element_shape; DataType element_dtype; + // The maximum allowed size of `tensors`. Defaults to -1 meaning that the size + // of `tensors` is unbounded. + int max_num_elements = -1; }; Status TensorShapeFromTensor(const Tensor& t, PartialTensorShape* out); diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt index 916deabd6ff..d6984fa7868 100644 --- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt @@ -20924,6 +20924,10 @@ op { name: "element_shape" type_attr: "shape_type" } + input_arg { + name: "max_num_elements" + type: DT_INT32 + } output_arg { name: "handle" type: DT_VARIANT diff --git a/tensorflow/core/ops/list_ops.cc b/tensorflow/core/ops/list_ops.cc index 7d79df9c1cc..88d6d14c306 100644 --- a/tensorflow/core/ops/list_ops.cc +++ b/tensorflow/core/ops/list_ops.cc @@ -22,6 +22,7 @@ namespace { REGISTER_OP("EmptyTensorList") .Input("element_shape: shape_type") + .Input("max_num_elements: int32") .Output("handle: variant") .Attr("element_dtype: type") .Attr("shape_type: {int32, int64}") diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index f0b677cd0ec..c527fad59f7 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -118,6 +118,7 @@ cuda_py_test( size = "small", srcs = ["list_ops_test.py"], additional_deps = [ + "@absl_py//absl/testing:parameterized", "//third_party/py/numpy", "//tensorflow/python:array_ops", "//tensorflow/python:math_ops", 
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py index d57012dc860..92552854aa6 100644 --- a/tensorflow/python/kernel_tests/list_ops_test.py +++ b/tensorflow/python/kernel_tests/list_ops_test.py @@ -19,6 +19,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized import numpy as np # pylint: disable=unused-import from tensorflow.python.client import session @@ -42,36 +43,84 @@ def scalar_shape(): return ops.convert_to_tensor([], dtype=dtypes.int32) -class ListOpsTest(test_util.TensorFlowTestCase): +@test_util.run_all_in_graph_and_eager_modes +class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase): - @test_util.run_in_graph_and_eager_modes - def testPushPop(self): - l = list_ops.empty_tensor_list(element_dtype=dtypes.float32, - element_shape=scalar_shape()) + def _testPushPop(self, max_num_elements): + l = list_ops.empty_tensor_list( + element_dtype=dtypes.float32, + element_shape=scalar_shape(), + max_num_elements=max_num_elements) l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0)) l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32) self.assertAllEqual(self.evaluate(e), 1.0) - @test_util.run_in_graph_and_eager_modes - def testPushPopGPU(self): + @parameterized.named_parameters(("NoMaxNumElements", None), + ("WithMaxNumElements", 2)) + def testPushPop(self, max_num_elements): + self._testPushPop(max_num_elements) + + @parameterized.named_parameters(("NoMaxNumElements", None), + ("WithMaxNumElements", 2)) + def testPushPopGPU(self, max_num_elements): if not context.num_gpus(): return with context.device("gpu:0"): - self.testPushPop() + self._testPushPop(max_num_elements) - @test_util.run_in_graph_and_eager_modes - def testStack(self): - l = list_ops.empty_tensor_list(element_dtype=dtypes.float32, - element_shape=scalar_shape()) + def 
testPushInFullListFails(self): + l = list_ops.empty_tensor_list( + element_dtype=dtypes.float32, + element_shape=scalar_shape(), + max_num_elements=1) + l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0)) + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Tried to push item into a full list"): + l = list_ops.tensor_list_push_back(l, 2.) + self.evaluate(l) + + @parameterized.named_parameters(("NoMaxNumElements", None), + ("WithMaxNumElements", 2)) + def testPopFromEmptyTensorListFails(self, max_num_elements): + l = list_ops.empty_tensor_list( + element_dtype=dtypes.float32, + element_shape=scalar_shape(), + max_num_elements=max_num_elements) + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Trying to pop from an empty list"): + l = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32) + self.evaluate(l) + + def _testStack(self, max_num_elements): + l = list_ops.empty_tensor_list( + element_dtype=dtypes.float32, + element_shape=scalar_shape(), + max_num_elements=max_num_elements) l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0)) l = list_ops.tensor_list_push_back(l, constant_op.constant(2.0)) t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32) self.assertAllEqual(self.evaluate(t), [1.0, 2.0]) - @test_util.run_in_graph_and_eager_modes - def testStackWithUnknownElementShape(self): + @parameterized.named_parameters(("NoMaxNumElements", None), + ("WithMaxNumElements", 2)) + def testStack(self, max_num_elements): + self._testStack(max_num_elements) + + @parameterized.named_parameters(("NoMaxNumElements", None), + ("WithMaxNumElements", 2)) + def testStackGPU(self, max_num_elements): + if not context.num_gpus(): + return + with context.device("gpu:0"): + self._testStack(max_num_elements) + + @parameterized.named_parameters(("NoMaxNumElements", None), + ("WithMaxNumElements", 3)) + def testStackWithUnknownElementShape(self, max_num_elements): l = list_ops.empty_tensor_list( - 
element_dtype=dtypes.float32, element_shape=-1) + element_dtype=dtypes.float32, + element_shape=-1, + max_num_elements=max_num_elements) l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0)) l = list_ops.tensor_list_push_back(l, constant_op.constant(2.0)) @@ -85,10 +134,13 @@ class ListOpsTest(test_util.TensorFlowTestCase): t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32) self.evaluate(t) - @test_util.run_in_graph_and_eager_modes - def testStackWithPartiallyDefinedElementShape(self): + @parameterized.named_parameters(("NoMaxNumElements", None), + ("WithMaxNumElements", 3)) + def testStackWithPartiallyDefinedElementShape(self, max_num_elements): l = list_ops.empty_tensor_list( - element_dtype=dtypes.float32, element_shape=[-1]) + element_dtype=dtypes.float32, + element_shape=[-1], + max_num_elements=max_num_elements) l = list_ops.tensor_list_push_back(l, constant_op.constant([1.0])) l = list_ops.tensor_list_push_back(l, constant_op.constant([2.0])) @@ -102,11 +154,14 @@ class ListOpsTest(test_util.TensorFlowTestCase): t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32) self.evaluate(t) - @test_util.run_in_graph_and_eager_modes - def testStackEmptyList(self): + @parameterized.named_parameters(("NoMaxNumElements", None), + ("WithMaxNumElements", 2)) + def testStackEmptyList(self, max_num_elements): # Should be able to stack empty lists with fully defined element_shape. 
l = list_ops.empty_tensor_list( - element_dtype=dtypes.float32, element_shape=[1, 2]) + element_dtype=dtypes.float32, + element_shape=[1, 2], + max_num_elements=max_num_elements) t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32) self.assertAllEqual(self.evaluate(t).shape, (0, 1, 2)) @@ -115,7 +170,9 @@ class ListOpsTest(test_util.TensorFlowTestCase): with self.assertRaisesRegexp(errors.InvalidArgumentError, "non-fully-defined"): l = list_ops.empty_tensor_list( - element_dtype=dtypes.float32, element_shape=[-1, 2]) + element_dtype=dtypes.float32, + element_shape=[-1, 2], + max_num_elements=max_num_elements) t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32) self.evaluate(t) @@ -123,15 +180,20 @@ class ListOpsTest(test_util.TensorFlowTestCase): with self.assertRaisesRegexp(errors.InvalidArgumentError, "non-fully-defined"): l = list_ops.empty_tensor_list( - element_dtype=dtypes.float32, element_shape=-1) + element_dtype=dtypes.float32, + element_shape=-1, + max_num_elements=max_num_elements) t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32) self.evaluate(t) - @test_util.run_in_graph_and_eager_modes - def testGatherGrad(self): + @parameterized.named_parameters(("NoMaxNumElements", None), + ("WithMaxNumElements", 2)) + def testGatherGrad(self, max_num_elements): with backprop.GradientTape() as tape: - l = list_ops.empty_tensor_list(element_dtype=dtypes.float32, - element_shape=scalar_shape()) + l = list_ops.empty_tensor_list( + element_dtype=dtypes.float32, + element_shape=scalar_shape(), + max_num_elements=max_num_elements) c0 = constant_op.constant(1.0) tape.watch(c0) l = list_ops.tensor_list_push_back(l, c0) @@ -142,10 +204,13 @@ class ListOpsTest(test_util.TensorFlowTestCase): dt = tape.gradient(s, c0) self.assertAllEqual(self.evaluate(dt), 6.0) - @test_util.run_in_graph_and_eager_modes - def testGatherWithUnknownElementShape(self): + @parameterized.named_parameters(("NoMaxNumElements", None), + ("WithMaxNumElements", 3)) + 
def testGatherWithUnknownElementShape(self, max_num_elements): l = list_ops.empty_tensor_list( - element_dtype=dtypes.float32, element_shape=-1) + element_dtype=dtypes.float32, + element_shape=-1, + max_num_elements=max_num_elements) l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0)) l = list_ops.tensor_list_push_back(l, constant_op.constant(2.0)) l = list_ops.tensor_list_push_back(l, constant_op.constant([3.0, 4.0])) @@ -162,10 +227,13 @@ class ListOpsTest(test_util.TensorFlowTestCase): t = list_ops.tensor_list_gather(l, [0, 2], element_dtype=dtypes.float32) self.evaluate(t) - @test_util.run_in_graph_and_eager_modes - def testGatherWithPartiallyDefinedElementShape(self): + @parameterized.named_parameters(("NoMaxNumElements", None), + ("WithMaxNumElements", 3)) + def testGatherWithPartiallyDefinedElementShape(self, max_num_elements): l = list_ops.empty_tensor_list( - element_dtype=dtypes.float32, element_shape=[-1]) + element_dtype=dtypes.float32, + element_shape=[-1], + max_num_elements=max_num_elements) l = list_ops.tensor_list_push_back(l, constant_op.constant([1.0])) l = list_ops.tensor_list_push_back(l, constant_op.constant([2.0, 3.0])) l = list_ops.tensor_list_push_back(l, constant_op.constant([4.0, 5.0])) @@ -182,12 +250,15 @@ class ListOpsTest(test_util.TensorFlowTestCase): t = list_ops.tensor_list_gather(l, [0, 2], element_dtype=dtypes.float32) self.evaluate(t) - @test_util.run_in_graph_and_eager_modes - def testGatherEmptyList(self): + @parameterized.named_parameters(("NoMaxNumElements", None), + ("WithMaxNumElements", 3)) + def testGatherEmptyList(self, max_num_elements): # Should be able to gather from empty lists with fully defined # element_shape. 
l = list_ops.empty_tensor_list( - element_dtype=dtypes.float32, element_shape=[1, 2]) + element_dtype=dtypes.float32, + element_shape=[1, 2], + max_num_elements=max_num_elements) t = list_ops.tensor_list_gather(l, [], element_dtype=dtypes.float32) self.assertAllEqual((0, 1, 2), self.evaluate(t).shape) @@ -196,7 +267,9 @@ class ListOpsTest(test_util.TensorFlowTestCase): with self.assertRaisesRegexp(errors.InvalidArgumentError, "non-fully-defined"): l = list_ops.empty_tensor_list( - element_dtype=dtypes.float32, element_shape=[-1, 2]) + element_dtype=dtypes.float32, + element_shape=[-1, 2], + max_num_elements=max_num_elements) t = list_ops.tensor_list_gather(l, [], element_dtype=dtypes.float32) self.evaluate(t) @@ -205,11 +278,12 @@ class ListOpsTest(test_util.TensorFlowTestCase): with self.assertRaisesRegexp(errors.InvalidArgumentError, "non-fully-defined"): l = list_ops.empty_tensor_list( - element_dtype=dtypes.float32, element_shape=-1) + element_dtype=dtypes.float32, + element_shape=-1, + max_num_elements=max_num_elements) t = list_ops.tensor_list_gather(l, [], element_dtype=dtypes.float32) self.evaluate(t) - @test_util.run_in_graph_and_eager_modes def testScatterGrad(self): with backprop.GradientTape() as tape: c0 = constant_op.constant([1.0, 2.0]) @@ -224,14 +298,6 @@ class ListOpsTest(test_util.TensorFlowTestCase): dt = tape.gradient(loss, c0) self.assertAllEqual(self.evaluate(dt), [2., 4.]) - @test_util.run_in_graph_and_eager_modes - def testStackGPU(self): - if not context.num_gpus(): - return - with context.device("gpu:0"): - self.testStack() - - @test_util.run_in_graph_and_eager_modes def testTensorListFromTensor(self): t = constant_op.constant([1.0, 2.0]) l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape()) @@ -241,14 +307,12 @@ class ListOpsTest(test_util.TensorFlowTestCase): self.assertAllEqual(self.evaluate(e), 1.0) self.assertAllEqual(self.evaluate(list_ops.tensor_list_length(l)), 0) - @test_util.run_in_graph_and_eager_modes def 
testFromTensorGPU(self): if not context.num_gpus(): return with context.device("gpu:0"): self.testTensorListFromTensor() - @test_util.run_in_graph_and_eager_modes def testGetSetItem(self): t = constant_op.constant([1.0, 2.0]) l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape()) @@ -258,14 +322,36 @@ class ListOpsTest(test_util.TensorFlowTestCase): t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32) self.assertAllEqual(self.evaluate(t), [3.0, 2.0]) - @test_util.run_in_graph_and_eager_modes def testGetSetGPU(self): if not context.num_gpus(): return with context.device("gpu:0"): self.testGetSetItem() - @test_util.run_in_graph_and_eager_modes + def testSetGetGrad(self): + with backprop.GradientTape() as tape: + t = constant_op.constant(5.) + tape.watch(t) + l = list_ops.tensor_list_reserve( + element_dtype=dtypes.float32, + element_shape=scalar_shape(), + num_elements=3) + l = list_ops.tensor_list_set_item(l, 1, 2. * t) + e = list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32) + self.assertAllEqual(self.evaluate(e), 10.0) + self.assertAllEqual(self.evaluate(tape.gradient(e, t)), 2.0) + + def testSetOnEmptyListWithMaxNumElementsFails(self): + l = list_ops.empty_tensor_list( + element_dtype=dtypes.float32, + element_shape=scalar_shape(), + max_num_elements=3) + with self.assertRaisesRegexp( + errors.InvalidArgumentError, + "Trying to modify element 0 in a list with 0 elements."): + l = list_ops.tensor_list_set_item(l, 0, 1.) 
+ self.evaluate(l) + def testUnknownShape(self): l = list_ops.empty_tensor_list( element_dtype=dtypes.float32, element_shape=-1) @@ -276,7 +362,6 @@ class ListOpsTest(test_util.TensorFlowTestCase): l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32) self.assertAllEqual(self.evaluate(e), 1.0) - @test_util.run_in_graph_and_eager_modes def testCPUGPUCopy(self): if not context.num_gpus(): return @@ -294,7 +379,6 @@ class ListOpsTest(test_util.TensorFlowTestCase): list_ops.tensor_list_pop_back( l_cpu, element_dtype=dtypes.float32)[1]), 2.0) - @test_util.run_in_graph_and_eager_modes def testCPUGPUCopyNested(self): if not context.num_gpus(): return @@ -331,7 +415,7 @@ class ListOpsTest(test_util.TensorFlowTestCase): list_ops.tensor_list_stack(tl, element_dtype=dtypes.int32)), [[1]]) - def testGraphStackInLoop(self): + def testSkipEagerStackInLoop(self): with self.cached_session(): t1 = list_ops.empty_tensor_list( element_shape=constant_op.constant([], dtype=dtypes.int32), @@ -348,7 +432,7 @@ class ListOpsTest(test_util.TensorFlowTestCase): s1 = list_ops.tensor_list_stack(t1, element_dtype=dtypes.int32) self.assertAllEqual(self.evaluate(s1), [0, 1, 2, 3]) - def testGraphStackSwitchDtype(self): + def testSkipEagerStackSwitchDtype(self): with self.cached_session(): list_ = list_ops.empty_tensor_list( element_shape=constant_op.constant([], dtype=dtypes.int32), @@ -369,7 +453,7 @@ class ListOpsTest(test_util.TensorFlowTestCase): np_s1 = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.float32) self.assertAllEqual(self.evaluate(s1), np_s1) - def testGraphStackInLoopSwitchDtype(self): + def testSkipEagerStackInLoopSwitchDtype(self): with self.cached_session(): t1 = list_ops.empty_tensor_list( element_shape=constant_op.constant([], dtype=dtypes.int32), @@ -392,7 +476,6 @@ class ListOpsTest(test_util.TensorFlowTestCase): np_s1 = np.vstack([np.arange(1, 4) * i for i in range(4)]) self.assertAllEqual(self.evaluate(s1), np_s1) - @test_util.run_in_graph_and_eager_modes def 
testSerialize(self): worker = test_util.create_local_cluster(num_workers=1, num_ps=1)[0][0] with ops.Graph().as_default(), session.Session(target=worker.target): @@ -407,7 +490,6 @@ class ListOpsTest(test_util.TensorFlowTestCase): worker_e = array_ops.identity(e) self.assertAllEqual(self.evaluate(worker_e), [2.0]) - @test_util.run_in_graph_and_eager_modes def testSerializeListWithInvalidTensors(self): worker = test_util.create_local_cluster(num_workers=1, num_ps=1)[0][0] with ops.Graph().as_default(), session.Session(target=worker.target): @@ -425,7 +507,6 @@ class ListOpsTest(test_util.TensorFlowTestCase): worker_t = array_ops.identity(t) self.assertAllEqual(self.evaluate(worker_t), [1.0, 2.0]) - @test_util.run_in_graph_and_eager_modes def testSerializeListWithUnknownRank(self): worker = test_util.create_local_cluster(num_workers=1, num_ps=1)[0][0] with ops.Graph().as_default(), session.Session(target=worker.target): @@ -440,7 +521,26 @@ class ListOpsTest(test_util.TensorFlowTestCase): element_shape = array_ops.identity(element_shape) self.assertEqual(self.evaluate(element_shape), -1) - @test_util.run_in_graph_and_eager_modes + def testSerializeListWithMaxNumElements(self): + if context.num_gpus(): + # TODO(b/119151861): Enable on GPU. + return + worker = test_util.create_local_cluster(num_workers=1, num_ps=1)[0][0] + with ops.Graph().as_default(), session.Session(target=worker.target): + with ops.device("/job:worker"): + l = list_ops.empty_tensor_list( + element_shape=-1, element_dtype=dtypes.float32, max_num_elements=2) + l = list_ops.tensor_list_push_back(l, 1.) + with ops.device("/job:ps"): + l_ps = array_ops.identity(l) + l_ps = list_ops.tensor_list_push_back(l_ps, 2.) 
+ with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Tried to push item into a full list"): + with ops.device("/job:worker"): + l_worker = array_ops.identity(l_ps) + l_worker = list_ops.tensor_list_push_back(l_worker, 3.0) + self.evaluate(l_worker) + def testPushPopGradients(self): with backprop.GradientTape() as tape: l = list_ops.empty_tensor_list(element_dtype=dtypes.float32, @@ -452,7 +552,6 @@ class ListOpsTest(test_util.TensorFlowTestCase): e = 2 * e self.assertAllEqual(self.evaluate(tape.gradient(e, [c])[0]), 2.0) - @test_util.run_in_graph_and_eager_modes def testStackFromTensorGradients(self): with backprop.GradientTape() as tape: c = constant_op.constant([1.0, 2.0]) @@ -464,7 +563,6 @@ class ListOpsTest(test_util.TensorFlowTestCase): grad = tape.gradient(result, [c])[0] self.assertAllEqual(self.evaluate(grad), [2.0, 2.0]) - @test_util.run_in_graph_and_eager_modes def testGetSetGradients(self): with backprop.GradientTape() as tape: c = constant_op.constant([1.0, 2.0]) @@ -480,14 +578,13 @@ class ListOpsTest(test_util.TensorFlowTestCase): self.assertAllEqual(self.evaluate(grad_c), [0.0, 4.0]) self.assertAllEqual(self.evaluate(grad_c2), 6.0) - @test_util.run_in_graph_and_eager_modes def testSetOutOfBounds(self): c = constant_op.constant([1.0, 2.0]) l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape()) with self.assertRaises(errors.InvalidArgumentError): self.evaluate(list_ops.tensor_list_set_item(l, 20, 3.0)) - def testSetItemWithMismatchedShapeFails(self): + def testSkipEagerSetItemWithMismatchedShapeFails(self): with self.cached_session() as sess: ph = array_ops.placeholder(dtypes.float32) c = constant_op.constant([1.0, 2.0]) @@ -500,7 +597,6 @@ class ListOpsTest(test_util.TensorFlowTestCase): "incompatible shape"): sess.run(l_0, {ph: [3.0]}) - @test_util.run_in_graph_and_eager_modes def testResourceVariableScatterGather(self): c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32) l = list_ops.tensor_list_from_tensor(c, 
element_shape=scalar_shape()) @@ -524,7 +620,6 @@ class ListOpsTest(test_util.TensorFlowTestCase): [[1.0, 2.0]] * 4) self.assertAllEqual(self.evaluate(updated_v_stacked), expected) - @test_util.run_in_graph_and_eager_modes def testConcat(self): c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32) l0 = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape()) @@ -584,7 +679,6 @@ class ListOpsTest(test_util.TensorFlowTestCase): list_ops.tensor_list_concat_lists(l_batch_0, l_batch_of_int_tls, element_dtype=dtypes.float32)) - @test_util.run_in_graph_and_eager_modes def testPushBackBatch(self): c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32) l0 = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape()) @@ -626,7 +720,6 @@ class ListOpsTest(test_util.TensorFlowTestCase): "Invalid data type at index 0"): self.evaluate(list_ops.tensor_list_push_back_batch(l_batch, [3, 4])) - @test_util.run_in_graph_and_eager_modes def testZerosLike(self): for dtype in (dtypes.uint8, dtypes.uint16, dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64, dtypes.float16, dtypes.float32, @@ -651,7 +744,6 @@ class ListOpsTest(test_util.TensorFlowTestCase): self.evaluate(t_full_zeros), np.zeros( (2,), dtype=dtype.as_numpy_dtype)) - @test_util.run_in_graph_and_eager_modes def testZerosLikeNested(self): for dtype in (dtypes.uint8, dtypes.uint16, dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64, dtypes.float16, dtypes.float32, @@ -692,7 +784,6 @@ class ListOpsTest(test_util.TensorFlowTestCase): self.assertAllEqual( self.evaluate(outputs[0]), np.zeros((2,), dtype=dtype.as_numpy_dtype)) - @test_util.run_in_graph_and_eager_modes def testElementShape(self): l = list_ops.empty_tensor_list( element_dtype=dtypes.float32, element_shape=-1) diff --git a/tensorflow/python/ops/list_ops.py b/tensorflow/python/ops/list_ops.py index 386626e6a95..b4a1fc6af61 100644 --- a/tensorflow/python/ops/list_ops.py +++ b/tensorflow/python/ops/list_ops.py @@ -33,6 +33,20 @@ 
ops.NotDifferentiable("TensorListConcat") ops.NotDifferentiable("TensorListPushBackBatch") +def empty_tensor_list(element_shape, + element_dtype, + max_num_elements=None, + name=None): + if max_num_elements is None: + max_num_elements = -1 + + return gen_list_ops.empty_tensor_list( + element_shape=element_shape, + element_dtype=element_dtype, + max_num_elements=max_num_elements, + name=name) + + @ops.RegisterGradient("TensorListPushBack") def _PushBackGrad(op, dresult): return gen_list_ops.tensor_list_pop_back( @@ -42,7 +56,7 @@ def _PushBackGrad(op, dresult): @ops.RegisterGradient("TensorListPopBack") def _PopBackGrad(op, dlist, delement): if dlist is None: - dlist = gen_list_ops.empty_tensor_list( + dlist = empty_tensor_list( element_dtype=delement.dtype, element_shape=gen_list_ops.tensor_list_element_shape( op.outputs[0], shape_type=dtypes.int32)) @@ -63,7 +77,7 @@ def _TensorListFromTensorGrad(op, dlist): else: num_elements = None if dlist is None: - dlist = gen_list_ops.empty_tensor_list( + dlist = empty_tensor_list( element_dtype=op.inputs[0].dtype, element_shape=gen_list_ops.tensor_list_element_shape( op.outputs[0], shape_type=dtypes.int32)) From 0da937014254b439d4e628b456478f1f19e609a0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 6 Nov 2018 20:18:06 -0800 Subject: [PATCH 230/540] Update ops-related pbtxt files. 
PiperOrigin-RevId: 220398192 --- tensorflow/core/ops/ops.pbtxt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index f590794f305..309d43c572c 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -9605,6 +9605,10 @@ op { name: "element_shape" type_attr: "shape_type" } + input_arg { + name: "max_num_elements" + type: DT_INT32 + } output_arg { name: "handle" type: DT_VARIANT From eee31fe6ab8b89e0ba3402c3ccf7c2902147c437 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 6 Nov 2018 20:30:16 -0800 Subject: [PATCH 231/540] Add convergence tests for MNIST using TPU Distribution Strategy PiperOrigin-RevId: 220399204 --- tensorflow/python/keras/engine/distributed_training_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/engine/distributed_training_utils.py b/tensorflow/python/keras/engine/distributed_training_utils.py index fc408dd39fd..f939b7565a8 100644 --- a/tensorflow/python/keras/engine/distributed_training_utils.py +++ b/tensorflow/python/keras/engine/distributed_training_utils.py @@ -493,7 +493,7 @@ def _get_var_for_numpy(distribution_strategy, input_array): input_var.dtype.size # Calculate number of elements we want to copy per slice. - batch_size_per_slice = np.ceil((64 << 20) / byte_size_per_batch_element) + batch_size_per_slice = int(np.ceil((64 << 20) / byte_size_per_batch_element)) # Copy slices of the above size starting at 0, except the last slice will be # smaller. From 5990afceae207e395139e702979fcf9176518cf9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 6 Nov 2018 20:46:13 -0800 Subject: [PATCH 232/540] Go: Update generated wrapper functions for TensorFlow ops. 
PiperOrigin-RevId: 220400279 --- tensorflow/go/op/wrappers.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index d65063fe794..801448a63c5 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -16738,7 +16738,7 @@ func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (o // handle: an empty tensor list. // element_dtype: the type of elements in the list. // element_shape: a shape compatible with that of elements in the list. -func EmptyTensorList(scope *Scope, element_shape tf.Output, element_dtype tf.DataType) (handle tf.Output) { +func EmptyTensorList(scope *Scope, element_shape tf.Output, max_num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) { if scope.Err() != nil { return } @@ -16746,7 +16746,7 @@ func EmptyTensorList(scope *Scope, element_shape tf.Output, element_dtype tf.Dat opspec := tf.OpSpec{ Type: "EmptyTensorList", Input: []tf.Input{ - element_shape, + element_shape, max_num_elements, }, Attrs: attrs, } From 71e495448c744729c64fcc38b608a029c15828b4 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 7 Nov 2018 01:02:23 -0800 Subject: [PATCH 233/540] compat: Update forward compatibility horizon to 2018-11-07 PiperOrigin-RevId: 220421900 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index a553424f350..984ae7af416 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -26,7 +26,7 @@ import datetime from tensorflow.python.util import tf_contextlib from tensorflow.python.util.tf_export import tf_export -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 11, 6) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 11, 7) @tf_export("compat.forward_compatible") From 03e63a291bc95dacaa821585f39a360b43465cb5 Mon Sep 17 00:00:00 2001 From: Yun Peng Date: Wed, 7 Nov 2018 11:18:53 +0100 Subject: [PATCH 234/540] Explicitly import tools/bazel.rc To fix build with Bazel 0.19.0 or later and it won't break build with old version or Bazel Fixes https://github.com/tensorflow/tensorflow/issues/23398 --- configure.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/configure.py b/configure.py index 2eeeceb3399..2d2bcd8d583 100644 --- a/configure.py +++ b/configure.py @@ -1555,6 +1555,9 @@ def main(): check_bazel_version('0.15.0') reset_tf_configure_bazelrc() + # Explicitly import tools/bazel.rc, this is needed for Bazel 0.19.0 or later + write_to_bazelrc('import %workspace%/tools/bazel.rc') + cleanup_makefile() setup_python(environ_cp) From 24579bc55c379ea1d43b3d3b9d319038aa12f091 Mon Sep 17 00:00:00 2001 From: Anton Dmitriev Date: Wed, 7 Nov 2018 14:34:41 +0300 Subject: [PATCH 235/540] Remove unused dependencies. 
--- tensorflow/contrib/ignite/BUILD | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tensorflow/contrib/ignite/BUILD b/tensorflow/contrib/ignite/BUILD index 5587119ec2e..2698b83a56a 100644 --- a/tensorflow/contrib/ignite/BUILD +++ b/tensorflow/contrib/ignite/BUILD @@ -107,11 +107,7 @@ cc_library( "kernels/igfs/igfs_writable_file.cc", "kernels/igfs/igfs_writable_file.h", ], - deps = [ - ":ignite_client", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - ], + deps = [":ignite_client"], alwayslink = 1, ) From a3bf061545b8d10b540ece7e7754c6a916b89525 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 7 Nov 2018 05:44:37 -0800 Subject: [PATCH 236/540] Add user provided constructor to XlaClusterInfo - the default one is deleted with libstdc++ >= 7.3, as there std::map does not have a user provided default constructor, and XlaClusterInfo contains it as a const member. This is diagnosed by clang. PiperOrigin-RevId: 220448936 --- tensorflow/compiler/jit/encapsulate_util.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tensorflow/compiler/jit/encapsulate_util.h b/tensorflow/compiler/jit/encapsulate_util.h index a3b193eea74..304f9f31205 100644 --- a/tensorflow/compiler/jit/encapsulate_util.h +++ b/tensorflow/compiler/jit/encapsulate_util.h @@ -117,6 +117,22 @@ Status PreprocessForEncapsulation(Graph* g, // Information for XLA computation. struct XlaClusterInfo { + // The implicit default constructor is deleted because host_compute_core is a + // const member whose type (std::map) doesn't necessarily have a user provided + // constructor - while libc++ and libstdc++ 4.8 provide a user defined + // default constructor, libstdc++ at least >= 7.3 does not. + // See also c++11 [class.ctor] p5. + // TODO(klimek): In c++17 we'll be able to initialize host_compute_core + // without losing aggregate initialization, which allows us to get rid of + // the constructor definitions again. 
+ XlaClusterInfo() {} + XlaClusterInfo(const string& cluster_name, + const NameAttrList& func_name_attrs, Node* node, + const std::map& host_compute_core) + : cluster_name(cluster_name), + func_name_attrs(func_name_attrs), + node(node), + host_compute_core(host_compute_core) {} // XLA cluster name. It might be different from `func_name`. const string cluster_name; // Name and attributes of XLA computation function. From 1062c2475740397ee09fa23f1cee22b9c92ac5e7 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Wed, 7 Nov 2018 06:58:06 -0800 Subject: [PATCH 237/540] Fix test. PiperOrigin-RevId: 220455669 --- .../converters/error_handlers_test.py | 29 ++++--------------- 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/tensorflow/python/autograph/converters/error_handlers_test.py b/tensorflow/python/autograph/converters/error_handlers_test.py index 29597e1da3e..1f6c5a68217 100644 --- a/tensorflow/python/autograph/converters/error_handlers_test.py +++ b/tensorflow/python/autograph/converters/error_handlers_test.py @@ -20,12 +20,10 @@ from __future__ import print_function import gast -from tensorflow.python.autograph.converters import control_flow from tensorflow.python.autograph.converters import error_handlers from tensorflow.python.autograph.core import converter_testing from tensorflow.python.autograph.core import errors -from tensorflow.python.framework import dtypes -from tensorflow.python.ops import random_ops +from tensorflow.python.autograph.pyct import anno from tensorflow.python.platform import test @@ -44,29 +42,12 @@ class ErrorHandlersTest(converter_testing.TestCase): def test_no_origin_annotation(self): def test_fn(x): - a = 0 - if x: - a = random_ops.random_normal((2, 3), mean=0.0, dtype=dtypes.int32) - else: - a = 0 - return a + return x + 1 - node, ctx = self.prepare(test_fn, { - 'random_ops': random_ops, - 'dtypes': dtypes - }) - # To simulate a function without origin info we use the control flow - # converter which adds a function that 
lacks origin info so we will not have - # a wrapping try/except that reraises the NotImplementedError as a - # GraphConstructionError. - node = control_flow.transform(node, ctx) + node, ctx = self.prepare(test_fn, {}) + anno.delanno(node, anno.Basic.ORIGIN) node = error_handlers.transform(node, ctx) - # TODO(b/111562364): remove run_cond from traceback. - test_fn_try_body = node.body[0].body - true_fn_body = test_fn_try_body[1].body - false_fn_body = test_fn_try_body[2].body - self.assertNotIn(gast.Try, true_fn_body) - self.assertNotIn(gast.Try, false_fn_body) + self.assertIsInstance(node.body[0], gast.Return) if __name__ == '__main__': From 2aa655b1387ba79821cd4942b602af916b754f4c Mon Sep 17 00:00:00 2001 From: Anton Dmitriev Date: Wed, 7 Nov 2018 18:25:06 +0300 Subject: [PATCH 238/540] Fix license check. --- tensorflow/java/src/main/native/server_jni.cc | 12 +++++++----- tensorflow/tools/lib_package/BUILD | 3 +++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/tensorflow/java/src/main/native/server_jni.cc b/tensorflow/java/src/main/native/server_jni.cc index 51db74fefd2..323d9846a33 100644 --- a/tensorflow/java/src/main/native/server_jni.cc +++ b/tensorflow/java/src/main/native/server_jni.cc @@ -63,9 +63,8 @@ JNIEXPORT void JNICALL Java_org_tensorflow_Server_start(JNIEnv* env, jclass clazz, jlong handle) { #ifdef __ANDROID__ - throwException(env, kUnsupportedOperationException, - "Server is not supported on Android"); - return 0; + throwException(env, kUnsupportedOperationException, + "Server is not supported on Android"); #else TF_Server* server = requireHandle(env, handle); if (server == nullptr) return; @@ -85,7 +84,6 @@ JNIEXPORT void JNICALL Java_org_tensorflow_Server_stop(JNIEnv* env, #ifdef __ANDROID__ throwException(env, kUnsupportedOperationException, "Server is not supported on Android"); - return 0; #else TF_Server* server = requireHandle(env, handle); if (server == nullptr) return; @@ -105,7 +103,6 @@ JNIEXPORT void JNICALL 
Java_org_tensorflow_Server_join(JNIEnv* env, #ifdef __ANDROID__ throwException(env, kUnsupportedOperationException, "Server is not supported on Android"); - return 0; #else TF_Server* server = requireHandle(env, handle); if (server == nullptr) return; @@ -122,8 +119,13 @@ JNIEXPORT void JNICALL Java_org_tensorflow_Server_join(JNIEnv* env, JNIEXPORT void JNICALL Java_org_tensorflow_Server_delete(JNIEnv* env, jclass clazz, jlong handle) { +#ifdef __ANDROID__ + throwException(env, kUnsupportedOperationException, + "Server is not supported on Android"); +#else TF_Server* server = requireHandle(env, handle); if (server == nullptr) return; TF_DeleteServer(server); +#endif // __ANDROID__ } diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD index 85514b8629d..f3806e6b329 100644 --- a/tensorflow/tools/lib_package/BUILD +++ b/tensorflow/tools/lib_package/BUILD @@ -193,6 +193,9 @@ genrule( "@protobuf_archive//:LICENSE", "@snappy//:COPYING", "@zlib_archive//:zlib.h", + "@grpc//:LICENSE", + "@grpc//third_party/address_sorting:LICENSE", + "@grpc//third_party/nanopb:LICENSE.txt", ] + select({ "//tensorflow/core/kernels:xsmm": [ "@libxsmm_archive//:LICENSE.md", From 4e4fc3b889b9861f81b444ad88f6906a8792c64f Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 7 Nov 2018 08:11:10 -0800 Subject: [PATCH 239/540] Fix a couple of linter complaints. 
PiperOrigin-RevId: 220464726 --- tensorflow/core/BUILD | 1 + tensorflow/core/graph/graph.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 932a0c3819b..b6e818e0a70 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -2169,6 +2169,7 @@ cc_library( "lib/**/*.cc", "platform/*.cc", "platform/profile_utils/**/*.cc", + ] + [ "framework/resource_handle.cc", "util/env_var.cc", ], diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h index 585afa2d008..6a224ca4a23 100644 --- a/tensorflow/core/graph/graph.h +++ b/tensorflow/core/graph/graph.h @@ -425,9 +425,9 @@ class Graph { // Constructs a graph with a single SOURCE (always id kSourceId) and a // single SINK (always id kSinkId) node, and an edge from SOURCE->SINK. // - // The graph can hold ops found in registry. `registry`s lifetime must be at + // The graph can hold ops found in the registry. `ops`s lifetime must be at // least that of the constructed graph's. - explicit Graph(const OpRegistryInterface* registry); + explicit Graph(const OpRegistryInterface* ops); // Constructs a graph with a single SOURCE (always id kSourceId) and a // single SINK (always id kSinkId) node, and an edge from SOURCE->SINK. From 821602a38483c60dd05ef1a4b157428a9891b004 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 7 Nov 2018 08:17:54 -0800 Subject: [PATCH 240/540] Certain ops like AddN have dedicate exporters to TFLITE but they shouldn't be used when allowing select TF ops. The dedicate exporters don't write NodeDefs. 
PiperOrigin-RevId: 220465584 --- tensorflow/lite/toco/tflite/export.cc | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/toco/tflite/export.cc b/tensorflow/lite/toco/tflite/export.cc index 489c21295ef..ae4bfe93916 100644 --- a/tensorflow/lite/toco/tflite/export.cc +++ b/tensorflow/lite/toco/tflite/export.cc @@ -332,6 +332,11 @@ Offset>> ExportOperators( std::set* variable_tensor_indices, const ExportParams& params) { variable_tensor_indices->clear(); + auto is_tflite_builtin = [](const BaseOperator* op) { + const auto& tflite_builtins = GetBuiltinOpsMap(); + return (op && tflite_builtins.find(op->name()) != tflite_builtins.end()); + }; + // The operators are in execution order, so we just follow tf.mini order. std::vector> op_vector; for (const auto& op : model.operators) { @@ -360,7 +365,15 @@ Offset>> ExportOperators( auto options = Options::Custom(0); std::vector mutating_input_variables; - if (tflite_op) { + + // Some ops like AddN are exportable via Serialize() but do not have a + // corresponding TFLITE builtin. In that case, when flex mode is enable we + // should export it as a flex op, not as a native. + bool export_as_flex_op = !is_tflite_builtin(tflite_op) && + key.is_flex_op() && + !op->tensorflow_node_def.empty(); + if (!export_as_flex_op) { + CHECK(tflite_op); // guaranteed by the if-statement just above. options = tflite_op->Serialize(*op, builder); mutating_input_variables = tflite_op->GetMutatingInputVariables(*op); @@ -373,7 +386,7 @@ Offset>> ExportOperators( variable_tensor_indices->insert(variable_tensor_index); } } - } else if (key.is_flex_op() && !op->tensorflow_node_def.empty()) { + } else { auto fbb = WriteFlexOpOptions(op->tensorflow_node_def); if (fbb) { options = Options::Custom(builder->CreateVector(fbb->GetBuffer())); From 56e2ee40d3730e1b692ebd7f5a0b970ef3d314ad Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 7 Nov 2018 08:51:44 -0800 Subject: [PATCH 241/540] Apply EIGEN_STRONG_INLINE to all tf_kernel_library when --define=override_eigen_strong_inline=true More tf_kernel_library are suffering from #10521 So we apply EIGEN_STRONG_INLINE to all tf_kernel_library if --define=override_eigen_strong_inline=true, this reduce the maximum presubmit time from 2h to 50 mins For release build, EIGEN_STRONG_INLINE is still turned off so that performance is not damaged. PiperOrigin-RevId: 220470017 --- tensorflow/core/kernels/BUILD | 13 ------------- tensorflow/tensorflow.bzl | 5 +++++ 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 1e0d069e63c..be448ed3db1 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -41,7 +41,6 @@ load( "tf_mkl_kernel_library", "cc_header_only_library", "if_not_windows", - "if_override_eigen_strong_inline", ) load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl") load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test") @@ -3058,9 +3057,6 @@ tf_kernel_library( ]), # *impl.h are excluded by default from the CPU build, add explicitly. hdrs = ["batch_matmul_op_impl.h"], - # Override EIGEN_STRONG_INLINE to inline when --define=override_eigen_strong_inline=true, - # to avoid long compiling time. See https://github.com/tensorflow/tensorflow/issues/10521 - copts = if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]), prefix = "batch_matmul_op", deps = MATH_DEPS + if_mkl_ml([ "//third_party/mkl:intel_binary_blob", @@ -3128,9 +3124,6 @@ tf_kernel_library( "mkl_matmul_op.cc", ]), hdrs = ["matmul_op.h"], - # Override EIGEN_STRONG_INLINE to inline when --define=override_eigen_strong_inline=true, - # to avoid long compiling time. 
See https://github.com/tensorflow/tensorflow/issues/10521 - copts = if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]), defines = select({ ":xsmm": [ "TENSORFLOW_USE_LIBXSMM", @@ -3529,9 +3522,6 @@ tf_kernel_library( ":xsmm_convolutions": ["xsmm_conv2d.h"], "//conditions:default": [], }), - # Override EIGEN_STRONG_INLINE to inline when --define=override_eigen_strong_inline=true, - # to avoid long compiling time. See https://github.com/tensorflow/tensorflow/issues/10521 - copts = if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]), defines = select({ ":xsmm_convolutions": [ "TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS", @@ -3687,9 +3677,6 @@ tf_kernel_library( tf_kernel_library( name = "lrn_op", - # Override EIGEN_STRONG_INLINE to inline when --define=override_eigen_strong_inline=true, - # to avoid long compiling time. See https://github.com/tensorflow/tensorflow/issues/10521 - copts = if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]), prefix = "lrn_op", deps = NN_DEPS, ) diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 74773629d29..8e5ab94b536 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -1167,6 +1167,11 @@ def tf_kernel_library( copts = [] textual_hdrs = [] copts = copts + tf_copts(is_external = is_external) + + # Override EIGEN_STRONG_INLINE to inline when + # --define=override_eigen_strong_inline=true to avoid long compiling time. + # See https://github.com/tensorflow/tensorflow/issues/10521 + copts = copts + if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]) if prefix: if native.glob([prefix + "*.cu.cc"], exclude = ["*test*"]): if not gpu_srcs: From 54c7d9e64b0b1cac0ee9f5048ae46120bea72f78 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 7 Nov 2018 09:34:31 -0800 Subject: [PATCH 242/540] Update TensorFlow Lite 'Updates' section with 'AI in Motion' PiperOrigin-RevId: 220476495 --- tensorflow/lite/g3doc/_index.yaml | 14 ++++++++++---- .../g3doc/images/landing-page/ai_in_motion.png | Bin 0 -> 579541 bytes 2 files changed, 10 insertions(+), 4 deletions(-) create mode 100644 tensorflow/lite/g3doc/images/landing-page/ai_in_motion.png diff --git a/tensorflow/lite/g3doc/_index.yaml b/tensorflow/lite/g3doc/_index.yaml index 43b5e3cfc01..1e9f17bc165 100644 --- a/tensorflow/lite/g3doc/_index.yaml +++ b/tensorflow/lite/g3doc/_index.yaml @@ -182,6 +182,12 @@ landing_page: background: grey heading: Updates items: + - heading: AI in motion: react in the real world + image_path: ./images/landing-page/ai_in_motion.png + path: https://cloud.google.com/blog/products/ai-machine-learning/ai-motion-designing-simple-system-see-understand-and-react-real-world-part-ii + buttons: + - label: Read more + path: https://cloud.google.com/blog/products/ai-machine-learning/ai-motion-designing-simple-system-see-understand-and-react-real-world-part-ii - heading: Introducing the Model Optimization Toolkit image_path: /ecosystem/images/tf-logo-card-16x9.png path: https://medium.com/tensorflow/introducing-the-model-optimization-toolkit-for-tensorflow-254aca1ba0a3 @@ -194,16 +200,16 @@ landing_page: buttons: - label: Read more path: https://heartbeat.fritz.ai/community-spotlight-nuru-a-mobile-app-by-plantvillage-to-detect-crop-disease-in-africa-28d142bf63d5 + + - classname: devsite-landing-row-cards + background: grey + items: - heading: Using TensorFlow Lite on Android image_path: /ecosystem/images/tf-logo-card-16x9.png path: https://medium.com/tensorflow/using-tensorflow-lite-on-android-9bbc9cb7d69d buttons: - label: Read on TensorFlow blog path: https://medium.com/tensorflow/using-tensorflow-lite-on-android-9bbc9cb7d69d - - - classname: devsite-landing-row-cards - background: grey - items: - heading: 
TensorFlow Lite at the Dev Summit youtube_id: FAMfy7izB6A buttons: diff --git a/tensorflow/lite/g3doc/images/landing-page/ai_in_motion.png b/tensorflow/lite/g3doc/images/landing-page/ai_in_motion.png new file mode 100644 index 0000000000000000000000000000000000000000..df605833d7926cf76068501782239647cd3ba3a0 GIT binary patch literal 579541 zcmZU)WmH^Ivn>k29YS!I1lLBI#$AI1cXxN!Cb(+|?hxGF-QAtWCAizockX@XoH5@1 zwRTm_s$KP?M(w%Q3RjeuL_;P-hJu1Zla>O;a-qxRK4uEIC2V!6#!7(74UWSUW2B{| zZ^jiXpkrwLqU697P!5MR&`YV&-ANu9QOk@gCcf|6O?B4kzUh7VHkG!%)zad8IOzj8 z)PWj~P4n?XW`q->XA#BRJ(q}#h{@f_f<}yk!lM1e@J)8u9S!{+A?)CM>#l|@T*76@ zyt*8U4s!<^kij~X z2_WVkdN4Qc6_JR77uE^` z3Q8UkfTrYndbRTQ9MrE`i9>+qZblk=eCAG6pAI#D5uY?+LYVJS#3*)uL*Vpw`HPT0 zYFU^?iL>MJ52!Y&oO8KiSY>oz(jhx^_cy%*Y_F)?j(&bJ)xfO8U8Z{!fariTAa1OY zm0QC{Kw55V+Y?@Z9DiB}wHnqe07VF%hQYYV;eq3w5Sp8U4SW#MPK+MiW1PlVr1>jt z4*Hj8sB`wy+uOZjj^|Kx z>eV1_i3!Oh@Ux=m-*^F5c-(GyAQavlf)^A zkHVi8HX_6*7d8wbJjZp~tOS!jXf#(;k^dIpg4+tK8~U7sFpbH9+yU#eP<?}6!urjP>_5u6}tnG>btlEh1o zhy#w0L{cPDge&>1!ubblA>>44Ll`Y@P%(Q(g9GQgC{aGiw7`)A2TEqlX2513v*c24 z`b_2#?-9i<@R>IxNT~N+Qx}JH1NrOs*zlIX5WHlP)d}@lHb*lil4C>M2GVrruJUgn;b6YmrB6aP<< znZjm8BU*x_%p|W-(^2J78l*qL;jp4XOz~7o6f=yD+Pl%z!apQuRhC(B=70tt!BA3?vfcb$vt77SVOn$6#OfoB3SS8z))H8B4>M>evdToL}rjtTY zlvkpq;!Ypgn&uVk^%F!HnY@r_I{a+}ca${sJf0*qF}^M#GHxuPERL6jn>IFKDcO9u zc=#egkZz0RLtI$3M>R0NS~pi|r?IzSmr@*0JjN){C~=p6fJKJmt3shkA-@chj6%}P zXvnB+vN{8qN@pHlqm)~GALU|RkT?w};vZtQW3@CzRpmRi{j!u&Jv9>L zM0IB+*&hsQZ8=1K*311flEFD4BQ3r%o1BX4w_|RolSbNcD>F zLhZ2WfbST3{^D!?=JICs)&{Qu(<)rs0}a*_U7lWLovH4n9ZxLaV<8^ zx&<)HViaBE=-2fvx2)_hXV$&g*c)dX`p>>AAQ*?5gyy5Oq#iNtF!I!zEGx~$AA1Zt zPKO_SIlaxF%c%*-;>ZE zNNH}FNNk87sx!(O&ONP;pw5>~EDC}ezB~aV_F1Z0S|`>~4lCdJ35$ITI~QwI=5Lfm z-^`5C4r_hKBvQ0lrt~+vMC`>EBTgBT)r-pcx@4SioEJ(pL_pKPQn(7YYXZ;G&ihKmu&QJDAyQXiW`m^${R>dITmfTxa&7v-*VBW zLz>Y@c(iytd8mjwxEF0H?PNA@tt&O{l>UZJ#HX)atK4uMpxIw-&c?CCvka3hpHw$k z5qS~0Yfrh2otv(8;Z>v9eff=(^2KkXXVnpca6=~J|7(+=tIj-iq7vLz)8^Us@&LRj 
zWk_HiWhUGts%~oerKY3!YPj{scS*Kf=FdI$L*zY1%5!RCie>6K^NUuNhF?{0MPBKT zLMQM2J?Q(uH3VIBF11oM95vsk_!r2V=w|5eGS@O$bYl#2Qa?ZHU8yC7+w2qTBx_{r z&V_3$gj10d(5da|Y3v{3jRh#~FhYq~Cc#DTb%=*`z9S^}+fz|5_IB`5e_6J%|6jpl z_gcu8Hx-Cw=3KIQi8=D`+yl-D2vax{xMHTl*W7o?t)8bo^J!o&b=8^hkbZJ3bIUc* zgW{t1cHk@$`~3C3MiwbGgjda#ua(&K$?kA=GK;}ay;AqC)vIN!;dGaC$ojByO<&L( z?3ce^G%;~tclsC5rS;)eG*ndllyY9NsMogH*!k5L&yIJi(`5)UxzHBX#(e>|d3uU* z(Ir^zV{j?(@;-(@hp{Oz?gPBDfEbTMf@ppy$YlNDt9YN-ZYmN^@3|2r72TK;$>t|C zL+g6ieSBGpZ$C^r{K_(~Ki^gF;b}K-UH`C9t4>w*R8>59{So(`e;xgvKIh;1y24M& zDR8ZJ>Gr%4^B{9!EvVsp^b0Y|fsYeVpCaDdy+N*6_^m*ZPybN5wlQ||Cfo2H9w`s zcSUkhdnZ$JPG$~fR!RY6a&mG$ClfQ!4>5`V2mklRPif)e;s9b{ad&rTb_X!qJDIbv z@$m4lu(GqTvorm(U~=}fb20Q_vU8^TzfS%?KVqiN#!i+FE|&IoToE#_*Ao$7Z2`~YIDJZVP@@-z= zlmR?IerMX1SbxW1mUpA~MT@K$x7Ig_J9IDqHx`}lfVEH&J`5;^ zbYSS{7*E+E)6(y#r=|6K!28?~2x51Jt9nCWo}+P@G{`x=P9E!9C{yq2qv(!AiR{5f zA7UqZPkkvr!~KHyN`}tMi_+Ki3dRUIuQ)8RGY3-tJ=!L>1TR4XSHCcwh^`Lnym<)l zt&*jrqKDQMtA*a-oQ3$q(7if1R|C%&%hCOx%KUACzPKd66{1t=a4rwK1QTKBH*j4~ z9^Q8ZNwWm}vOu-9wd{-(A3v!~+kj^mWT;zjI~bw3Gz*>(BHnm>QE>_y*(AFTZCk zXq)mny?2%>wKp*hWOJ$LxWuXw_!+;nv9V$3ZEo(tcCW5yfBqMB&6~H6d5PD%tv)#; zQ28(g4;rrPR_j74$0?X*634vjo5hOVSo_p>{ZlhS``-^F&#Sog;M%17hK7b6S$!Yx z$BnJG(?0di>&eT(3E951iMc)sb}x^I{mY&ve;}XV?I+FLFar6jFB`K0O``jMqE&g6 zfBz0)-txY2^D$xlHd%S>f7o*^qcA!|zK-gK4wOxo(3C$d8{_cBB3>PLI;-;e3HSG> zs{Lv`pC@qvQ_?}JdWg2Rwlu&FO=o`QMLU7Cv;b>U289zsEkK ze4qW(Ufixw^__b-e`&5hMRkbt?3UCr4uPnE8TYlf`g`_O55g94=iSByDh8=wcS=)V zw%c{#!+XUJ_s6LEReYsVs`^NsSeHD%UZW8+hWVrpf#2cQi+iZ^;nHqBi4u$LQo|{_ z)r#M4--S;q9Y#LBzQW+hjux1H3(JnT$5>07L32HtuiSh->7fJK0B z!ZAGm<~NhD&P`Ax)CE7Pv8JB9C99{eG1Ym~+k2h)R7l5Lhhf@1QBe+*E`_@#+t%mj zGV815BFmXA1qtN@9AM`~}oie|837|0(a1CYJkDp6- zh_xLiak|;zjkS13#BE_w4z=+3{U_7o@b0gh)F^)&aV>(i6xUEQ{jGi5^QK#m{Ty!;rb;AaYjNm0s{SE?S%&T=YH zjjd8EAlI5b?Y*2fTV=`!C!cdrwQ}e%;ES)Zn{2fcbuI^gN1_KzIurikANY*_m?}iC zN+UBg%H?c$WubA<-Z_{9MYH?b;@qd1kux#z9Ug|L@AyucKn|Fr(R-lwi%)APu%TQP zl((=yN@M1kkPYrTp~=Wu-ca(FB+yyz!IN`Ub6efyCYvpo^xDmPfw>_w5z678C;zio 
zvfkf-OB6G{Jy19KbN?bY5j6qdr+s~MkZmOb&`?*G{!)>ON*FgpdI=(Y={~o1e(JOR zq<$nDs?FC3_$44oFzQYlPxkkz?A26uG3}4>5Q4k7y>*>~dxoh;rjQ@Pm@P_?=i82C zQ|QOL|05Z}B~xw_+HEagc|{faMc0)nJkEtv7thHCJF@!N*3A`b(*$$V)X+}EKV2`9TWaS|);Fv@N*tT&%38 zJ2yz}*|{G(PdW~8&xF1h3~BU(7c}JT>^+`v?%R5e{Wz%>S#_w7@jXWi^_J|D6ny;% z`8f7}Z|hrn-}dfXjPb+3=Bck6byK2oFsO#o1?kF8Gr!fu&xC0C&u*~4lv^9})>raz z$4u%U)@k>wRaJLVcjU;6oat zbY@Z3D&A@(u*42QgMbNwukBmDq`lqzaJ1ua@$L(+9Q&^mWV3f0T?%=kw`QO^imX3Z zb^KEJZDaig&?TCy)M7D&1A4MmcqWvc6ptvhONO@2%A1YHUT>2E*!4vjh^j2Ju$>}u zq8^xGaTUo2iSOSM!;9c=|;gRvWedGG95#xuv9ZHg;R6^lw;#iaj~( zS}fy8+NdMB$HR!IoG^A&P~wzz@v+W2>*L9zaQd-|(bj|NeiSc;iXsEGBx)qfy8oie zs#)=>!&3kP)4Rq3gdn!%*gr8D4R)%bGjYQn`ih+yW zWTRjzHA=n#eBJ~=&*hlY6TNxiA(B5Llw%CLkj&R2q=4faDK;zrg0+i}#MqxPd(pWCa7o!IvyqP`A<2W1S$hmZ|03A>7J?M~gCZh`o! z;CycXCksZjKMjQP7iBq%u4~}{nxe0+1p|q7y(F!6=}l(DDAkz)JHgUWMW^+3p`Q&U zS?meb`j9%e*=bAq)m(5kyJLXqn<9GqjKlcCe~23AoEtM*x0@>ajT>7T_;e4VTylv5 zPMO!7-C2cs-{s~yV%CM@tXpO&L^?qd&22a3vlP+9qt#ah4x_2CJhzNd@{wi8A(QHp z&8-Z&Fz|fXf65+6Z7(*K`sLX3^Jp%P~Boh1TprH|( zD!&l7AML{mJbtUmW!RP57I+6s$9%re9ALzZ;eE8dhED?LZL_2T9&GI`(TS}ARBg`o zkC3X5zMYT9mk*D})g?i~dvpRmK=US9W9R^kOGwqODbVDXNFzp~d}>bZJcncJabAi&7t zC(Uv=&oK(enftyOFV1G$1_X?14xtE{p5qJ^^_3KC2m0@ZX^0h(<8XPUBs@}L)U5Ho*u2aTTwC)BX;+* zfUoUN@EjnCe>z%SOAW0;PS}fM7UG7>7G&yV8*jNr4Uz%0{-9tYcRLXJq>`5` zCxOwd2F2cJp}FT=G`?TTL5klj(omrZTWLnm(>4(@sE3chC^K5Yp{Omwr#MyUsMqB|C^`NP|!3y)9%Fr*^gr&WF7}P>&PEDs19@LN`+9 z#`O2Zf@wweN54Lz`Q*GAf!S5sRt_H^E^VOZLJ#QeF zJm=)ta*SX5ZbJ7{AIAuN7=n-R*S-QiFW?JQ{f~UI-Syh92ZdnY{rB=Cv8nV&oWsd% zk}N~_sbyy4VVPj3&u)8?S;?NEmFk;g`eOx=f8gj z^b5Z$lrut2B7xUSD&kvj%Nu(gZ|7#GC&D%CDW$1+XxH{@BMiF8XZWmqsuRk4`^;gr z{dkPKul%`4JY~*1cWtjkA9J`RW#|CEdGp4e0uGZx82^CvY_#mnHt@{{)S$9+yYZ+JR{rLQM%lE|d%l+qleY z^5EA{gvtR`Q*T?{fktYuwYnqPVDU$iv0Zs+a@972V457Z25|MywoH}*66spaFxjQ3 z!Ji8_y_&?kqv{jOkywwgc;aOoTZ7sNXx2hU{r#`dar^VC&-Vkm>~sj|3iWHF`7S*j z=ZyXbcnysyeqp>czQ=ijh zp!YvYj8gl!rV=@GAXpt>T2EmG)UA}I+woFJeMD~{#yGX7Dkm= z_+W{O{D^`nWN1T$A(H=2srUh>_<#~gX+%PV{;Oor#L%OE_AJ}?MZuCI8+D0UU*>BQ 
zmMgAV=*qI&C15W7x##|KQ>`KfAi!N|)@#TRT%rteuB0ojJamLhr}$*4rkN?o%jQ>6 z7Tk?L2YMg|PKiXZ0sHTdOSLry$RP>nriDcwmZE-K~YQK|* z2i6$7n9?SFH^F+(8D-Z=aqBiSwkJw+`*97fUlp+v|LwMd-m@v}$F&i$mqHDXQ9P9) z6WZ2T8v^N1K?7~tC{Q}vTmQ1-v_hD0I^!s$lP7!_n>Kx3pP}{ks`s}UFRbDvYZTq>3Wz_)UlDed?}#)rICPo{su*0%7Ex2;ihql z(?RT+AVXSw<`-ZX&yjd8l(BWSw;wwuyt5Qg3;Ie?54Eu9|P_&a{yvU*a8 zAUW4n?-^VCxY~4uwSan}&NDgDGGt|&!~G81$5bY~+Na_mOu7v&N{7F4u13Gh$@K9W zuZyGoX)ql)Xr~DsH+@_0_nGJY*J3CK2Z^sdA-bgDZ%BgKo#kcg7;QZ!ubBnWk#k|J zB7LR^&v9e;SYv_2aYHA;J(dzM%FXvs6W^h!%w zP;-oW1BrD!xM7_1?Ce!C_&UE-=&GYOMCoHYRhtrua#)3GF0YSYsngoymgT*3)$y=N zvgvbEvGe6z6!rI~e#7B#GRK(`n!iIIh}x6p5X_O~FMbkgW4}|?oyxuTt&|p6;H@Fs z)83|#lakuTidx^KAvg~gqizTiIF{G_KQs#;zQEHhDNLB8X86A;`o3?1NkH_dKfq(J z27bB(_jRnik_%&Z3BKVrnN6>4P7hShdQ9oF8^GpEC|CF92Bj?wb*i&NN5l@F@oGIM zZ_vE@qgkrv9P-V#!h8+2Ch9&lJ?w`}ixk$Z#hz5z%w{I z=w3ok@`-Y>yg6&@$c;l;`KE38%j^LC3AT@0)( zRQLu&c*9%_#E?Wl2#>~~0-u%WP%zxf4dCm}ug^3k+)0i4wn8P#LN%Kg_({dlV~S;F zT4_|Nz2K?a0M@dlL8TI_G^Mzhx{DoHR!(oFFWz36kX4mD@^LUwAB0%p%iz-mgF^jV(y_XqHr$ z{-9P7rIM#b6IYC+;i-c@*X{}Gtp-H(tY&)&Y&K~U*!P)h8&qekxDt=?@PlGj<^e{~8-n<(5<9ip0+o=){zJ1#YI}2ZZ%RLg6|H6G9o=XkU2FJ%qX(eYq>G*)(-p^?m zjw;)#eE-0agKnzZX)|kC5bmT{r^iTNN_%_Xiuw6MXx2tnj`6mhdlTomP@7lECXU|a z^weh;o@>7X?4HpqK=l|%dz1?u={h-KM*d9Vx;+?37fG2*^emN%htbcLxKqAGaA%p; zD`&dGe)OtqkBl2FGR9}IwcSKYokjGw-n2?h!;;VaWc>0SZg%QcfJ4(zqGp(hSmZ9128mu$ zRgZKdkvyKoUB}*Hz#P@lfbc(T=Wee9x_&)ZSKjoYS~YU!f0Vl@-P2Fm+W}bLZZ|h} z7SCft*Z3T~A9_0YTrOl5vHoq0|D{wugE&C*b|c0Nwi=)#H`G#QI z!(vN@KWL>~c3Sdg1)`@q+!P|b<&jtJPQ{wfG*wr}aZ79(bFm0xjBH`4f_7R%t7c=} zZsR!AnHr~!wrRo$zGNMOUe`bjxh>N;mo?%3K7_RDxTU-}{n1)d$2{U!wN*^z$Pg^6 zNbgIprR(a9Xv%I!crmA`5th}Gj!?dyHwQs-N({tYY{Dv)9-(Dt z$vOt#y~o9hP~|CzU<H;>I(s408RoN&b56(CR03j+Js(dOV5#p1y?s-YG#+( z5}Z*fxwg`xqsGb3GU^k=BVhDQZGKwt%Ix8pcQPJ{bW3C4#WSO?rHSs~*NGFqoP4>p zJ*RWZ=t@9J3eB4BMPnrYV+2z=uWN85n+sn{7&Iv{{1n=01E04be{iC_xUq$Yp)N(- zZ(zrsdeQa1P0OTxBTWkv%;C|OnEbU38yE$^oYjzl0Esd0_E$CA!Z{`)&qps2d8=b#2K8V)ZYp-}dSLRx#O=vT8^pY+`*w{StrmI#I)9DPzAR_^&w+sX* 
z$gism;Z+w95y>;jW>x`qX_D>KLZP-U1aqdSqEeN&3RQQ*A=&UJ|FmNXt+M&;KaRn( zl%a_;eh3;z8xAkUEo1^9N?hKw<(X=ZB99SECP?~^Z{YKJ@O}tsb0@nl$MV{XHH&{; zwHcA)8Fs(Yvn^VhrHI9$_vMO|G8xm<`I^##%BKBDjL^A@8!o8x@;b_^ZjXA=`GQ`C z!5un=$1-RpfyZ%e;l(H0*Iu3{yGlvp=zQzCbVXlj$)fe0cV0RWc$HRlb+1WzxRhLF zIBSxtMCAP$(9@&YN?c8R zmcxkXsCaRbZ7S1co77nB>!KUab@R|t^`h4tgYBQHdN^CnZk=&0K7p=tc+O#-mdP3) zl00XgNK^I&GHQ`(yplNtVmSZ8hC$J^syZgGh^r&%&zg;+{CPG5nvAB6t=oQEqBFg! zXsLI8jSI~7280x9f-+t;ayOi}o8)vul~yCpca9T_`@l3)9=Ml14xz%^zKzU0E!Cmt zVrupN6d{Rw+85*2QgvUrUazV%83+pppd?>^u(UVjUNM6YdzDtg4*f�Z+l4JeoBb zN*D0}Q*V2sO8H;{wuO$B0Hg?1k$I$teows-?Ih~n{L$n09*5E_iJQ!=9nqP!o1)=KXK=qyc4b zEtPRPHc~rJR6BkzDeQS2{f^kMvY-+!tBTA9ZzLl+%z0!S6!iH5=KeYqhVIY#%03AA=&~ zB84R=xtVLzem)9n3Fvc-4(G(%3xd)ER35IL^V;{ zQ0=Q-${0R`GlU890B^EioFL;LR z0cW%3G*}0HMikpid?W?3$H-i^<^iACtI;l6V(eH>JJ%Sqp;18Bt{L^iWn=qx{I#Zh z6GSHK-o9Rm=lC4Alf+Neb(!TPmBOJbp+&YZ1N)-0@Y3p)=+*aoEahJ+n5cGaY4}@> zA6lOxGa{#rk2gryB8AK>zURKG_^o=tnQjw%3#tEAa$}c781E59Ck`WV$xS?RMoZHz z8c%4dE1Omh%Ykm`u9_NaK&@p$N_(;ibP@l{x+Y)fM3+%saw&BF6NRB%rV4ebGIPI~ z!)K?11=YUe4!5eKee!ILXRjsU?biK*kMupe>TTxo;eB)KkTV|0bL;Lzt-YFO>gt(b z-U1TID(tv3hUYBQkvrXPfH5aQxpK+s$I5xxZAx9wY+%voeCDYqs_e;eRTU^owHj9y$o9sR&)7$OI{50KD7Ze<(O<_BkI3#4S3R(ps$jQe zdU=_@vq;m#o?g*cKTJK&#ej+8i7xJ4Hi9+InbcVTpKlkF~6E@ayF@v9O2 zz2V!g;Kg8{o|56P%@E@)P3eU7c+>HWz2=YI(RivkpShQ8;mCMmEH>BzEN(4$uT48)u6O z9&crGFCh`YS<9S&O6@d&Mqxzo6E{o>dh@YN|4V}G&)6W}#817>U+TLiIDS+~W@~gp?QJfQ;oM(olU-a`n;waWVhrB04p!1JqL6Xqbz+D2i zc*{Rky;#~v!2!fwE>5qJNiT_l6OGB@$-kT?U$)B=3J28~USEjMyL0n?T?iQBskEK(YdbFa|zUFutg}bsQaH_|n z+_dIC<`N{Yx>+3d)6;9v<&1T~;|*(x8Pe`h@oB*7<-rvt;BW~7eSB&Gn;~TJn?5at zF3T?x0H{6|xj~dk4~pBiK-vrL3h1%`js(rO+S+}=LNEadC4So>2L8rF?cp(+?69qA z=dsRM@|z2{Wt-VcPt4%=8!6B8s@C0v1fG_&3i;c>x9-9QB$wC--z@ybO)oa=22ZqzvK4tD^mPlHfL9OUvw9^HTGtJ47w7A}y?=5OLC2Kb2B(_afr%sYUynAP&2{)bMbd+viis3zk!W{pcR^i3H++g0*0WPwY^nQenfR>9~Bgm z_Djxd^{Mjay!zh0Icq*XUq_Fde;ifx>5?_kCu|pfHgYC|gQgP)9AF-|Eu*?7-5OA9 zG)+|NWIAiX6Rswl^J8yH_cCk>MrYX}fj;eYw<_Q%IzHS8Rw_u)xYvUPz3Jz&ozxh* 
zo1yP{AJ8#fA^oK1XIa>4)O$$G>~99yT>{rEP>?*+JDoQF1th@l`Z#k3vX+h7M7BP0 z_JHh}TEZ(82eUv!D61 z=uB7{*4OC|JF#*o8i7bHHt1pjpZ1V;6n7Pl2I z_cMw0D)^gq88r%Sue>AG_RL0`T3OibUj%^#&zmu5z7!N4TN2rWjCQGDc#biOSrTsl zpiumX&$qfUB{wu@N0FhS3oB=(S;)pp%^5P4c}eH;4;s2=>@R}iU}FO~PWP&}*%Jlr z>S($Zjxm*?*DrCer+_-i`;e$#;9{Qdky!X-$dsB33QsT_qY2D|BG}`q>FC;RsyrNL z%^Vlnle4(^V7%1WBTAK^fEXwOZ?>|GtcNN_ilmepnIVLxiKIHF8id5%BH{sRK!yQ< zH4MJ>8`oyxfmJ9oU8+VlII^f~75rlzsw5b*OmqvzjPn9=6gZk+6t(#GlQC1Z!*J2d zGL@E{JZn&k+mv!^rBQO;jgB)wP%NOa=lc^&X7uADWal|Loj^*XXCowW)C3FuT0(E9 zAA;?_T!o7dQGOd}QzUsa|2;cg3C}JOsc4;Oznr!z!B|xL zmc3dsslqlpRIm#+sa}{z(6sSmF@f<_PEZ6Iw)K?(-Is-mQc*<4z+C{l^ z*gZ5lVHveaE+hsWsqZ#3&ZBwqgsODMKsk0P(S@fxnzi`tdJxuQc?kI-D&tXxPol!5 z(vqvo&6V7d8-pls_paOJPR;;Da7_Jv|I=6fnOee7)uEZC+z4faNd4fix(!yl0be~^ znr^0_Tt_yUWbmf+hdRm&qUqRLWH0mt&VI zEK_AOQgz)H;PVmran?onet+@rB-C}G^Rblip%!2=7v?7z=HI;}nvW=T5dn|OkxQDb z*Mm#%kHaO8Kxi^NFy0zJLcsNUqhPYN+!tOO&*i7l*oXU4OgPT3+%JrtR~q}nXV!XH zqXCM=-NF=}K5$jodfb&dXc*{R8~PjI+nNx`VlAEH$`f!qx33kze_9>9lqcRX6wjuL z(Fax9_>|?$dH?e`G8_gj*PvrDhL>8_ZoaGqY#9~r&HjrlYaz0SZ`Dl!^A=&JpyR~# zTx5EX{m~?@oK}}@wL9&HTiJN?v);#L?0T&Xik`mXpBvB9`|}+LFLCxJY1TCtxok8E z1k5p!lI(?IYrrDW!l&i(V-^Howa6b-4-`J(>Miq$yZQ!hi>wr5#jNqgE`4@RE z#U_)%@A;9UE}^nfc1F$`-3KBdaE`>fHYWDq@6NveMANRf21+S53L<)Rc|Q`?;i@F& zjAyKr!7Ywh&>tHkFL_81E9;eBi=zkg@nH1@gR!Ad8!un+EmXb#;;tV^pWXP3@X6jx za?Biv1|{`r4C(H{_+XFOqS8P~=US_N3Fi z7q(UlOuV10@e;hwG`4+J^%fX=kmWlh;DT6SXP7v}Wti_YKLk2hj@}ztJb(i&xosZk zifoEba0X`N7h$!(6~<*J;vUoV_ zp^V8cr99m5Jt={j|3@G>tF>4hEo)DVjPMo85&DHj<3Hx^7_7VIbM;#yqkW8T_{;Ly z0@#spz$dixnq?qeujg7E_vD1Vxsax{iF~xlh}W2vecrVYq%b=Ftzao-CM#>h*V{-(2Bt#&`J>H9$Fek8(k=N2eKh}m4FktAbxT;~#V zn#b=TnEk%mgwZC#!3!opYy8QYSK(r=(;{epS9)-?{DHDhSGd(31Gnv1| z2bLi>mI@r%6gSEqUO1yy!kaK)d3>b+N@%V2*pU|YP0rU@N-w#c6-Y{Qf(%*n{%u+B zG0XuUXLIK4K=ia({ovOE{ZL2qEd(~f-28*FDVM0#7X5zT=cIoPq?zBx<+M%n>g6)j zCiU_1`0{F~`z9o-oLH0eRimk%bgSAl2hpD=i3-dIX}P0Nw560wfOU5&Bid?7#5nzLSzXj{-^3ywu`_hrG)x0fp< z=VPL6m!ggWtmoRP8Jf0%x8xqFw$lH|9Rj@3|yzgMsyBU|r 
zx&omnIVgLg)9f-ob5B;)jmMC7XP@#JO#_7XO3ZLsvcjtep62A}p&CU;I(xY?zTJ}i zYZ(bN`Xum_mxg&#Zm#F04U%I;iQv!K#-Xkm5BedEE|VGJB2Ut)fxdkX$yzLB$JYer zisS_!K>=9YxP*AmmK~bARM2aNw4Ofsu390b+)g=&xM}YjHllpZoib(k_uiqDa=HYo+a5$>ZmbT-qhBK#3CekIkuNXvp!)@e?&hW{QVmc(Dh*B~f&p zqr9bcXMZlXvGMqjOYP_0RR=|~XC=DI;D)N|d?~ODPa4u;`}co$0!FRi**T^)DRL@m zDBRr2iMG!iiyhUQP$nq@6|dHmma#yh^}o#(lKe+ToI9SUnz)W1Rn`{yw3c2^vnX9R8|+IxFqTd#W2RsnD2uGW9V|l5ts5k4f^No?!dd(Zxa+ zlLHn4Iw6uwVi$|^TMfG#WsdF<{M74iNJI)BC)zb91g+8gayqm4sX_b7Ep4zx{J{Th z>pgYnMey-k)oW*uw)Hah`1pE9o4aTC{pRH*Ma-D!(bhd2yujYuCp}Lp%(}*tb_5J^ ziK#DqPxpaMn50YveG*k;M{MZD=wx0K0OE{|yvB&X03nw(sI-O_V94;d7z%(rIz7vc zVfEO!6E9I3IvGzu#e#8e3-ihfM*czsJwC-?SWCw=IMz$Oh?R1lQNNG-lLiL*jNBaZ zxsHBxmqqC#PfV@pSaR1OM6-j(!67b=E6RIEYwBd_0)cP0A#;|BV$fJUi2i zg6MLGKGpu_IjM=M<>V48)cZ)^rUQy*eH6gZI^eMI)!EJSE4FbAf_@`;N6Nk=K1;^o z8Zr0Qb{k5nCvI<*-k7?l5jIqCGn|R0$?XpZ(a8Azp z$FbqhFg&|AY|RB2&$+(QSlHNSFb?CYUEl33?C#2Y%+obSJCHwQTzuQ#M{P$2$(5jd zY%{qIbEP%|S44u*3-Y3p_oY&o7M11z`0;>B)zY#sBhKdNEst!xOCEuc4`Y)1bB%IZ zr^kp@v`ITjCx!%j(NTlI$7CG%EKQ|9NMF{<=i^bGY^_UB=t}*pgm_}7;m7BY6LBps zywqGrRCWoXJcQUVa(Qb^+vpYQA!|O$aQ@5Bi-|^hYn#^k8&%WR!szfSfaD}u!&in8 z;xB#CxP@1-L3xHUalrSZSw%{caw^7@b|WAjQFDyWvd|Ea9s~WYB zpz}2Q>a54Z6QUvr+N|ow9~Y$(SKZup=8|zeKUdaV)^)DJ%hn*z6i?5z=yHuGsa6NK z-$@^romKE_B+N79kI#LG14dtW@jxZ&_yHJqDx;P1)N9HTn|Y#paW@RF^ugGmFr9wx z9wJNzkK|I8Yl1)Cg~3x)ei^X!$2E1+(nqZk0=6o;&*}|!J7IrX=nlJ1Yzdq9JbeiY z^R37i{Lf&g{|^9OK%u{>?vB;L5m(1G{1p4F%xUE0+VsvhEpq~rhHHB7b@HTbI+>zl zUv}4YHGG$4-jj62R*{-4rLzuN@r-RA2Hvx+)&U)UXTY?3myT`ovK^Dg$bQ8#Totf= z5ou=`q@6Y#5NzQybKu#r;zIKGNZ3w!=*@t>+T6w;+5qN~~i%T7TUY8r2c=gY97NP= z)p2@^a{yr*C#;T_gNHLb80xG)l7|Dz@j&)l3>5F-eBNT^!Q+!VpYqk>=kSRA@MpO- z$V2vAdoS(HbMx8s&g5r=`oSd5@us6I7cUPtE?><6;tcfIK39O6=tKQdgujW>No^P* zOJ~|a2k9`Jl%>vZ{zx}XJX5-L0DJYt`XUd_N5eON*6|D=q-@k_rwOZ>Xk(!xzx{*G zxs)e^u>4}P*st)NFiR6Akkce15{{S6_Hx>r+YIJ!@v7ZBey@Y=XX-L&ttRqRIrUDX zJMYD#jy!gFUeMJG>uJ8TgTLvubei9jB$=;B%Wp*(t6X(jMhJP_7KCHWUVLK&zIIky zv~I6$GH4*LD~Jwe95C3HTmP-QP7e7}gjHaC=w28KN?zLuT7<$&c@|gtWtD(7M;daF 
zH+;co(y_N1*c`WAX%}9(M6cu7XMn9guHyLtKDUHf7yZzmtEKwB!q}2hE`J+fHB+kSZVv=HY$&+aJOt#I?kVUl!-pY(MaV9Q`pmCNEXRW~L z26LrdJAn@o5~0#5_hCxu+$sB%RE z<|Kx%89l{Yc60^~*n%

NBvf&}y579J+05nk=_r?U+u6PLrn41s=IDh^z3}As^*q zZg=lJNJsvHj@%ANak#qDL3rEI;3fLNSuHvpJD%{K0S~JW3?v*l#6cb%E%S7nExvXF zJoikr>4tQ&((E5+^Llh>IhZD_`O>~aD^Uq{oCX~&-@xM_&u2As00fESz&>!Fi#VWv zs%MH+Pw~6pL>I{1GN@?dFd(yKK3wO+ptC^=_;si~Qm4Hh@%s!y@{5Z)a}t-^DA8DCZh8o@-Aevyf6Y22*ZTzMa2^T-kx9wgt%nW@ zoETv1sjG=JV^YM$y~j(q+=?eZ_Nc{oQjFpucPjsWe%>|hiKaCVL; zbeClXzfT~+e6V6v6b%c44?eP6-hmIfL9_x^fCd}jrYZNiBLgs*Wk?5S`SZp|18ikN zNpOj;1Y2~7l^T@UMLUgW?4>jP%9YD})sZ(h5n`xPCMk-gNHXc?Ka?6z&`IPNUjv5M z0-eY+%Tn=XG4lyw9LIG0?iGZeNBnpRw4GtIadbGRK8*3!BZIeb)Mx<{huBj$Ia_b3 zB1QxgbK#Kw z=A~(}d>!DTL)gxjxH(v4Aq@ZkEhqk&k6-@n5WQ$z+!Sb%i}PxCY|BhF4^9CYyw@?# z7MHY#wsis(??vS~0<>7SVGvf?TZ3D4OipoG_B*~d*^`HaTdJG;qD&gDV49!Jt=vVlGC-aftyD^P_ zJGg4weU8P4WlJW-W?e7i5G=E)*K+9|&?fO?zoTOF@H1&TRCzlR4nbBFu%i_^@C+o7 zO`Hat3TgD{*DjiPJX7B~qzb)%No414X7arvWZKSpQk@oyvkMy9szd}!C+Bn zd@N=3E3`837+NM(!u)D zugQ_PmgWqg3LSYf)tnq@7fpOMDC5vhk~;VMFnL~uHX+})L9f;w>GheRv^T(}K3S)@ zUZCzrSKuEUeJ-gxqZ)h+h#UMA%vswAT2Xqk_3OqqwP|hy%IdF zLteY0V7>7$Pk5w&ZD@1=p`0BI$V2GUH>W)kKHHdFl`uc_I(Z}xpJ_|^(#St65u-6f zy<{6({M(XhB%_I#p3aYWu8pPbq}k2(QNzUq9~ zVv1^kCk+|X0Hbm1AETlr6qo=GV)MYYAA>SPs1jQL)v5Ua8|`o$Ea0S45DhA?9pu`F z6wSNfJMdAk*x-PH3iPnWO5rH{R*)USHlt#nlel(+*Z;B2z%tFKLm_~2{*P`59=b==+gS7?bg?Y)s=g$ZCJG9Qd8Am@6~SfX`LN)+Go=Yt{N6>phpAo zy6Q~Xk#OJ=o(r!%i@So29~E-5ir0q#06+jqL_t&x!8fK^Z*0(v>#_?|LyvYE`mD!d zm!G~axN@Y*?h+$d7e7Wc{-&$XCRay@{0P>2(~^CpGrW!XET!mG?oeahPfmZ3tDZGy z9Z@K)HZg5o$=pn~O{9!?Y^!XLe2a*$rsPddc1UzMq7sh|gAS_f+h!VXnc1(aEK;=X zP?eF{ZrgNl=4~xpkF2E3bRynLU@Ed9}-p;i*za%&YxkA_hx1To&2$_H^dp% z;UDcNZ727B1kU!~=g!=>Tsp$3hY8>9ZwIB6eZV{&Dx6BcgNMj}pEn-S$q5r(k?#;b z!m~8xdbAzraBsS}LX#5Vznt~L*`2AlY|q0N@)3`;C|9?^F0$&~FsL@2=OHUW>f;m- zx}V~;oo<=&Rf*Y>z}aA3%=RT5$Feo(;bkI=^f44e`^Vr49!!&#KGdW$tn#@>Io`W< z_9BkyTi0(6Z@lp)uWmifZ0365B|pMWnSh2*Z^WBfWBB7ge06f<7f|(9wqu-5A{os- z<{52B1LNQPxzOf>kq_)Gar|bh7H8|E2%K#XkHknrVU`wWVar)UbD7*I+hwa8``WVt z!CrOkAmmuSSV3!?um1I0AwECi%x@exwvE`nWri2n@P!WY)}@E_!0iuy^rPWZo_l!0 z>)WNLZ7{r~1mpo{nG6{TuJ2=&ECn%5hpOy@uRG%)o;`au&!t>KM^{-LI>kzewxC^g 
zh?nOGPnSuCRQr7ls2Vte>-mS%Z*RsthB!uhpsS3Oi@MZ2~r4hMU- zxqbLHE1ysDmD(P@8L3~XZ9D8>Ui)SKfC&Uv0^~<|QKmE~Aq(@_DcjJqJoo$d<#lX@ zSF!q*K5#tencsU)_lMh0csT$zzE9(8+emA`WIFw#jWPYf5swD!9&M6`?U;u=kCl%d zQW8wM!mDH76A(2qUt5usiEXiP(Kn9<=oovyyvVnO;9+-ni@NbB^=}7S^IQY~hr7^1 zTjU`Ra%5ZT+aLXs_6qr7Ff=W#`0zD@x>kmQh$ui*B2gV2+*o)rdWx8=AVkVALa{T6 zl2Vuub`rqoq({IY;p`W7%~l8z03C(Kak#6ZiFbUl%&1Io6@_w(DsYA_aPpeK(3cEe z5hR7bvHkNZdM$Fe15ERxq=M%+J137>Eyp5}GF z%ivpoe4VP>WOX{)akPAoniZ8-`Wk4k|M6Mttk}St@kxVD`xXaT1|5q-_6xYL{|2iM zZ@>9w998erznX9S%Qh$(A@tjz#A*2*x^I$+(R@-~R>1@klsx=!l7_mjbEk1m8*W-< zF<;q_{2E*Qn40IZ8ZKG<0Vr}Hoj;v^dENDadtRGcTbOikO5XCg0DRu$#cZb!vs;jZ z2e*s#N{~2|zXJ*FXpe#F6Sf}TWzg^ud+2@h(w)2a8O-W9>6_Gmb0#848>x%jl8!B@ zfBDmq)?qArVrrCo_q*S?cAaNat_+tiT*{lVd?2<4k5#9pdJt!Vx|viteT>jFMz-E& zI#U`Wp?x;Z>*Mz##c#2kP77T5GF5otI-Yl)ZoKwYgAeLCf7(~tjL=c_npol-Bq=oy z%jT>-aCI_>XPTg?jVzr6edRv1Cbs-9ZDawTEY81(f~*GX!dj0k&$h~aHP%5L>{SLH zBn`|9LvF1CnAbspcId#*L4aG2@>Ox-T~*Ysj9ceOTb%Zf62KpMSsFU5GPNruU|p}e zTt4Ieu+9VCH!qzYe)aA1!;P~`yq;BO8^0KN@9l34k1hm;)LO8xdEQsrtav@FVcN;UJ_SB+=5t2^$8ZSQGizxW<4P z(Q#P0piCpPFoKo$$nR26pqx3IVgu#F*D<(pq`RMGG0I~W!pDsSrj?WI4I~$t;P`kh4Fd{Uq}9pTB4cV9uGb ztp=q;6OlsvB^Qdzq5YF@9Vc}$t7&0&tmdsnmb^Y|{u zbJxwXy~5x0RF3Jd>2sJ)5#Dy9LwFu-Z|X=7&MLlT z(dU2%U*qhvDe~8cIqZPEKNh%mMmrDZ(V4l0!+Kb&5IP>_^jx{T_jm=*h&0X+{5Q`9 z_nLCWDpJUiF3X@hAzxgSoJW*tzYC8YOqE|ws1UXv)rsl!WaiL@ouCe?3=5H_(!IsT zp`xwUhHXoAT(xWC{gSmEqXUQxK7emramGk9z6THvKuphJ-V8o!P@;~F?_gb!#$Cz` zylsCPa{3?QrA=Q+`7L{_u#F{7hT9j)R+NQ&iQDt^upHw(U`d8%u$E`M+rr%LrQENu zHRcQNzDuY47W?b{Kp74yvE7j8=DE7!_M$4H|2x4bS> zS=3B*7@FZZVC4y@f>Ur~hAuBo5&i7e3*@BTfLrHKT69j$A|2lEv+|(6wFSFew}I65 z5mq0-E6#kan-vbfH}V|#yS(btho*n@@vXe}bBk>n_Un>S_($E$BvIwbg@YgE6S+Va zmWcPj3~OfKc`k+3hpVhQIC!|uAmSVYJ7=gj)=z~-C2QW6{q$^__wv@~jyriaT^XJX zQ+$10aC6woYi!Ar(i<5 zy5!F`WcDn`1h8e%e0qN@jQ7^vZC>K=f`{P83Z&BNaF$~X;YGWouB}GmzBIzyamZ?--PDiAuA8u_jT8`vT@`Eo|cnmgXg7GW|^mNq05!md}9Mg{Q%#R z+Qqj(_J{YbtPj8O9e5BDBFEHQXCOY2cStMSe3$3oo1YoLMWyU~+cq^X(f{cPRqAt_cI8x}ej2cK^pi%{F93kO` 
zg8^LR%X0fT+I4iaz+0VMzBxQ(1m+S7hP#L`4#}c(zvz*f9~vCR3{5IZa&+cxP>QsX zzLCy0P9T*!^k@{S+ycPd5$B~Ho3v3haW?*Pts?AvzyVzLIl|bJM^7?ScJ1c% zswgJQp-dBI^2puK`A_6k4V^mbV06jMw2qzg)^YNEJX8$1h%Wmn=VHuj^sAYUqxk&p z^%96qNJ54-npSq-ah^d04VNo_HXha&9V$CrJADiok&|hZ`&8a9aX7$qAO1$(pQ==%GYP%|%S01h=NQZP$LL3xF+PV7Xo(pL<6upJXYX_-B23AhAZ3m6cR-fmsOV_Vm8{TB_;BzUbPy6Ng_3)#7 zv|-_+V+%|Evhmh`pDn9oS?~>;O#Dc1*RPJ7p3U31>bhv{c+XY$3+{FCZBOFuba~L} zuAoiVz}L3(fbA^13?fdSyF@*(Ewdhvpq!eg$kcdv%yobYP^rwa$iqB)u9}XH7S`E% zWtz7O(uVr6I2~g11|Ez{St;nPNWS%GbvQkKl(YC5^+6sVct6D|qALszNX23BtZi^n z^^CJ|VZ~qg@f5uzQXRSq>MErla&z^@Ho|8hs7uJq3>$38fA{*u;rdw|?`0>V$>S=Q z@=3}1t;4WE`*`Eh#&CI)^sVQ^%_|p%cWz!}`xJPQ!NaF7hYxN&9KQ4Mli}fWbc77l zvAWq)Kgh_zwlA@$=N8-^%0PwkN+;F-?eA6FK=U;mQP%Rp_VC8Vwc%IZ`SsMn^Sqka zT|ZK{`{L#B_~~=L;?BVQQ{HOEYmsA{Y!ljK5OMAtb#nDNTL(WL_MhAxUVgfTcTXiK zv6WEc@G0OhTP%AQ2Ryf=DBJ>=LArj>Su(nJf3`sGQ}YX_asA1f@?a_oV|9XiI}i8X z`Bhkr6`2s{tOa@5Q?C+TK~VC35z&WTG-*iyeVCtjf+HEyXnOEFC~{U0J(QlY0y`V= zx(a4vP?fkE>|iFY1{F4vnYprI(!gnDpYgt8OSsG{U|Hh{Eml&M=h225B7Tm4N{uC} zCbQzEI`5RpSs>(5TIkd7`(VB(00O{YyKE^<(AWXbDvEkxNEm&>XH zc`k?#C6!!A;r$%xL&9GQsXXa}EQ(oH2MVF1a9K|bcKo%fCq8U}^0EgX9r>`eGWTd5 zw2tL_rjL>K^sr2*3EO!*pJrh4m`C9iPxO)gjfvXi$8Eyt3*w3$LWxVp@f zG_>7}C~|5T&nJ(DGas(8LbizLu=?JSPomu>A}u()tMw=SmVX>7^kCeI?}>&Oi1ysz)6uqBCW2ZqAy0B2hgGyF~jYePENzM;@<=zeo$eO-8T zIQE&^RBD-Ygp%HYCU%9GfZqj+CeUdJK5rMX>)xzd-*JF*x|5= ze|qiwDPH61DglGzV6nmE0H-u(wFkN`oL(M&?Y*89$+PRMI^YBI7Q5#c zm*VU%SWw^Y%dGu*+%*gKi5fBeGB{t%Z(sgyAB_21bc1)(YmVyCQlvkWg6h)-^ta z$b%gjBp6pc253+fN=;4}&7i?ozPbv)%)bwbW?7k82V|7Jxe9BMC*8?fYa7t&Tbt^^ zU1@4MiavisJMpH_UqDAue8hf9m2k))#SylKXMW|;6)WXYrj478zJF<7p-Gq*3?%k3 z(yKSG<_Z2P3oI=>F#ddo)*)wTjdVC`Z>`a$TUO+!c?(3clk1qc`$n7-G!uJioS&cn zH6(!0ZT{@^9OOEQbQt@zk#onLGfB=y*tj?|NJl{j*xDH@YzPp37C$4Y8cT0m=QDc- z-cG{!PV0`y`q81&`rh+2ZhAdUo0UW4^jer{FuzB`^z})+a@Y1-b`1lsgyV7I>|+i} zdD=1moi;C1sKQ}fAM?DEt4Y1E&H){K-OhBUIPDCkRM#UjCexDUPPw zd^&d0F7*wgyy8&N9@@saJ)kF$OjD*6#=IwZF8p?CBN(TmN&A@7WUBFXRu+jEVI5Kc z4-LlK4_K_YAiXmPAkk3#ci$i0dh@MJ 
zL~U-mJ%)(@9yay^sR5thHM4pMlBCY%H{J%f*G*er_u2R!Fd28!eu@v}ET@!YX9wke z0Xf_1(1Q-VeWkeWFiW|yu{JzpRlxL*Kl&)E1H$Nt=4AmqNblRJ>_h#kxewN_;m{oL zOvIgAyuOq7`1`8Xx88ndc=N5d@D<#Zfb3O@EuC_YnJ;FbaF2n=ZMH9c@cr*GTY8&U zzWQ3+l9xFg{6YTAW{FBTF1e1K@JgmNK3a8=)seT}cw@M6<@#`e*X!m*D(VrQwfe$wh3*6X79XR zzGu_R&yg>7fUpflo%sq~^=DirqF8m)?tDo=2d|)j-+7#Ax#Oo}h4@LK_?bh6R&cxW zVdXv5koqpK52LIrfrz!Dwl(h z3{FdhGE{S-Aq6<{3hQCL>}zenw-n_Kb%zXwnbrT&`)>}vc4KomyTJq5teW_up|v$8 z@u0zBM}`sFXcS#i$zYfIwXw82ynka2QTy!(S08O}poKCvxxRV&aQG`O=>gW|rBx=zxfa;x7{m>&ZBw$bKzpfAetL6Z_>I5x#o^!o?XM1BqP@As_hwEp z$Wg{_cPg2i13ViC!{*^SD-N5(n>ViwUwr$<@ayltGyJRX{BZcgfB6l*+3|#OpU2jY zvirf64J#;lV-s%MNzg=$oCk#wpukEnNBt8|B>(2N=}ZVy^^po!3*0&sMflwZZduSl zi5MuOF)OGHJB#X8Cbu}be0AU!Cq{a{MizOF6j@p5OvuC7YgU0BA&6f=&V<6XWf?rn zm87_;kZYfPKv`wTQg5SDM=r>uf!d=`(zv{>Q2+#PS5fvDM0|4Vli}L+dUKQINJrre zO0MhtbMve2#v|=>T0tm>&@{;2%gAd&@n!lb+wjIu^C;>uZ(ws>mC?%vdkyN(qke(} zl$}ynzJA!u2RwaiMu~W|dvvHH;~QbrWtLD`;S;DHtkm|CBc&C-?`<`v$kDV*^J>9ixWkv83<9KTBctl!t+vfN{3htQ@}OtJKl9%&SQ zZjD7}(H=aVM3k{&8dF6!4`W?P`xx0Ar=w}gIS7i!G%N6LAP0>-b&7L2n}|4!gTpXGhY3g$xFh=_+W3+i3EbAIa^cXf1I^i$0l9P% zB-;~6vJ|(Udd~8G=qUz^cI611QRxpjrwp+MTX2c*&=CdtvB(TS1QbdHC&DZ)axWO= zg(s83oPqb%tzY`$mxebONI28#DD1U3v$WFkB&za%q1`}Ke0kk_zb^Jv@GQ< zSns{@_Hd2uOFo}+jsb)YtC0KSTklqwz|178Wr!zOz+_dk(A;zKD$R}X1zUR2chPT= zE9nRjSIV6~W$sE)($rPeE9osrty!E>56^)sJgyk{4GKR_Sc86bS8a)Og=duSWNj!? 
zMhHZwSbe$jkktq7rB&LNnB-l=S*^(luz(v2O+Vt*D}ku2+D#LrRYE85#YF)%mf|IV zGL$V==XopK<~{=kx3f7|pblpHI#bAWeFO$~9MU3fOH102syc*F{H0PE;ckc{|)gR>+z;XufZ@HN5}!+2J>Mc>3Lo z3~+2FV)NRJTDI-b28+woJ&W{qWYujq8}#Kz9^h5J2l%(@eg%8jV3zRenT6r2U%Wp2 zAzPbvUuru%iwn-s$jYm4|AKq$2LjNWBDc=3t{m_@&za#r{GHz({@Pc6eRvz&KFgLQ zCk6Hx0QfOh>x8l!Gs$_#b+UZ}dtbkFp4E}f;l|ag!@CSB{;z-j2gA3%|D(XKXdBS% z_CGRND*rMjQqCH!yp}<6QFok8LGVg~4E`B}^{3#i!rXw!2QxB=khmz!FR*ItjOZ{5 zA7FCdV>(SMItH_Nu#(Xu(xmYvADm{?rNAW4qO5=#3PR4z$gZ=mXjZ9I=}@LXq?cee z=ruJ|V@RfCscF{{9jc-11-F4vPE|VjB;FCCW%2C1orpYAWAh4AS3#ud_Q$t}O9=Y> z`Ez4ljn2{xMRQ&My!@6d>>QPsVaD?)r$mnNFe5;s@;kDWsPS4hryS>c%+t6&KmY4W zK)u;W)-tr%O2jrQ*e3VYkXHxd3}MU<-BevwS*;~uUw1u^6Mvk?iSA9E+sfR|r?|}5 z!K@Zzy7?NP_tQMa8-)2u;j>_l$J;Ph8OP(Rv#4U`WhozJE3t_MAon9NC*hl);-O?# z@LpHB%}2qA72K#K_uP8+$it1Pi=E^6x$@6tBR`ebxUr2Zs)>N_w2wL@@-U{$T1Y4= zby_+IC4VM!nT|z=c9Geewi})Dx*q8V)HX9)P0a=$6==up*ytndl*H%HA#G?~*S+BL zJWd=5sBJ3~+gb0xqsQP)cy4;Yn|n^N8#o|e+fcRj;vpS2^MGKOb0Ar&3ULi{@RIW05ZbMnuCRR`R>Eb8`txYu5X30{q^DML8Ax%Rc<+HN__3_C`R2C(5Bl89M*Fs;L!A6i zhP~~lEp~`yTYQw=2yza?MJM{WxxF5#a(9ldA5YT=@%}Z z%Ov141|oS-nOEn&V36>G?|;Ay=$YZ>)yr(1*yii#%#7p6Jp%uaKIFq=w>}*1J>jE= zj~~Z5TV7aWrO4^M(zBeD-2CNQN02L-s?{KgF(T|mR&I}R<>AdY-eiCJwc!FQ5w1Qc z$3A!tHT=|Iwc_Qkf~AYl_%SNClbca5OZb;7oYAGg>n&OoQn3M zNrTq?!H!MA!K!pUo^imD;_Uo4%}%I=rdU*s8S3yLDArt5aWLEpA35*uPtZ zfI$xNKZ!rUruPmQnDFrajq^*x7jIq|-agB-Y;2{<^CbA2e*Dw1U?%XaYvaHWV#i=z zHNCi^2m(Ju`er-%%V0#h*w*B0pD%4L4_~={ez<-A+3@5g`U7u1P?`xiWTIaxV-^kC zRd|2v+PUFB`J4a2@LRwB8^bl8YsuTuXs>;)I0HiPE0g#O)_?854npEW5#@`IRopi& zp5+;(FJ&U&?AfmmfAlBc4qv-GH^VDl1D_w;RqzB!bvEEgArv|SRyCF3F(Zaqj=D;K z2nGcta%hO01w$}3bL4|uJ}V(!3gV$1-h;paek!wu;|D%OaY)BO`+%V%P6rk4_lDqM z(WfDb$y6w4&M1-@%?f1{R*IsbNbpyBtZdzQq6bECj2fZO5u+W@5vw38j0$_?>&$_) z=F8d{4SPLyR?3QwYae4X+~+gxzTe!}z*c2X^v~f>lro`l)aC0a9SsU##9ngmYwwh3 z!URQZC9rAQUWfJbxIc#kM(LNhQyQvVVD&?#` zG&%Fmv0`4iZ@|g9>eIvkCTUU@QyAkKUpsQW3eoX>)`t4J(S_x{dCj@=#Qt8BK8+Hl z;XB?YZC1u8?1pKaCuvGvSyu-zn!}rj<2dhK&dGDir0^_%gTr>n!!q01Z92?*oR)6S 
z=JJ>HzH3`ZnM9^W%6lTD*$eETh{)Mgw{29Xi?d^22b}g~hR7g@rNwsFK?4|ri&J?( zC5!Y4(r7Zq*Dyq+z2)4zd2g7>ku93ojwXZ#w94kioNuW;rRi2%Do2>*otEX{N-8xfpn&z(S6Ge5OTl0)Y&KEa-dNY(xg=VS$aKW4kp!w2_<$B!NiJ3KJGy1q7S@zxx- z5&3FJXR&vIb*9%rgM$dma-X#h>@NH2mBShXD7R~D^Fa2q7w%ohNyUkM^ypEXIQO)> z+VQ8~_+z%QTp7-tzr@3{Y?b1Jaewx~2g8p(x;MPoVcXO?ugheM%S#3r2MjO{aKti$ zuOS%5GaMjyBj7K6xc-Q|R-36qsvh2R8`aG>vijhIqrTXnZ6mp%J+2NqjDuw)K^f|I z)C!+5Wye$fCl;8FQ!aJwta%6kh-~l^MTfteUoO$W2DC{%g zy|eo~+w<6T#Dm&=sr=&N@cy-Rz5(I$8S1Utu9^r5fv^6kCxToUEH~#-2J)vpxy3%V zAOy6mqQe^uApY}zXKi@E=McX6$%Elv{OJ$)7#~{PWgxuX*2pV@w=rJH#hGeP(m`%DeA?Wnp;!h)Y4kP*8Fjk*!jU>lnmY3b2WvD=e0lF+3w$rZWQQP=NDgAHG#V7w+R-^#=P#(NT1<2ZS4 zoW^@yoyjQdSsaZY?Ma6&BlqM{rv_zVI7wrdVR|__#m6*$%vj3i=2#g?te=*NJXaq; znK2#Pxw0+u+WzX06=k;5#lX7q$$z`xp{?~!W-9e@!WFg@&Y{hna0<(iRMEVcI@2Mwy3%7cJ?mY`&>hc z+e&#Bs>RmT$P zZ1;0M`X+w&wEId_w=p?r+SojkH{=||d0iZ~cs62n3kR0BA35k)q)%NUpRdV00PiZ3 zhPzvjaqfBb<;MBEspk|AgYIqeF|XL6)Z329dpIc>ps_qlL!1txZ?ExetGBkgG5E1K zS0Jukxi*|*tCa(aUfoqb9b!A;N9=iRd=-&N$GNUb{(*)6rlah~w7H(jpV3M8A#(f| za2pO59CT4a%JG`BTiQSicph|o#_V9oQlGW%T+wP>xl>)Z|`FugWkfg3-qfByp-eWS^xnbpX$k*=ISi8A;X><6k zFW(rx{O--+9M3#l;4Mn$&ma?gRZJ`OP1-v&FMS`wpzID)zKkn))3u>G_R)h|%C4{y zaP!Q{@M3RaxOokGyXHRDZ;;;VZzdc;JaG6)%)^_H= zA&II5MTN3IPG`x#Qs=)}y`aLK+guyI{MH-8)8GD09=zu5Onf_y)g2WN75wMRicm5` zIuT44HtKbbeP$?fQUruZZasqxMWb>nuw?{{is5LFL4c!qcP%B@V2{guR-#pwh{Uo- zqb5y@d+ej6qjKNd(UD8QBg0mPaS01K*4P3=v(cCCMsJcqTy40v7tNcUUM6aT7h%(I zLJl1%6)}}kdW6>`T_LF%E=TR5k48bdUp#v;e0q3ixP0YeW@OAWEz|p-mk{}Aup>c{ zRdi&Wjn-Ux^59_@Pr0ag8!pX=Kj8{0aN(V_A~;OQ7Ka0~r`W2ChlARrZoLr`!MacfZRB z^5?^AuiYHpdFNf0a9$%mbst8?3McYdvzxX{Md~D@;ubIbOQxOgO}pPM@9;o?boo9t z>hi6HU}GpkoKKD4eiy#u8s=|`U${;yyqod3hVA&44gElt&XUWHrptni(|CMuTw-&n z$QDa?-h7iwnV5mN#wJlOt84w`Wv2VaZPz&ZyPrkkSjIY z9tb-o5IAfy>dJqP(`EGa zHhV_cta|^mYs2DQE~wLHV04*ntHDRb7yTk>jA$DeB|(1Ej>B!*a^+o}hoEN<$~LnO zuMJl&U*UI|lchLChmNuxJCKMVJme*RL^{_Gy-1tHTYp!tvV$g zbBV^42#JfKs4e6u@(bkXk}poTTe-T~X-5Q3k-Ps+! 
zy2~YRtE_EVVlzL>fDVopIa+ua{@weh!w-ITpQUlkei84K&V)<#>=q4_u8zCOJF;pO2^xU6oCPD1D;zqw|D*@tfq zf9KoZ8g5d?T-g&x-s#YqF&pMGSN6T6OPG95=m~8qit z*LEUQeq!=_jzaqG=!-83R)kZe;won(!a-}H@-2AsZbaEGg_Y5bn8p{pWi@A|cm81?I9 zI=(m?k8OaR0G)N0#)!*H#J|pFSl3CPVsv=%)6voX@XkALV;H|a+~AZCm$N*2SQl`* zsn%mXS1}@YxQJ+tdfd9f3#&5Q1iMpSn#ZZ1`PKf?<9qu_uXS_2ZCfVc6;?bM-f{XV ztg`T1*YNom|1=#+!*G8&PLU}6-8rUgR0;jt89$vF@^yzJO5gq3yL2MnWZgfPGSQf) zp-#L}_9DPFj#(P`cU#lC3lwmHp2%mOwtChy3DvyFuQd6bzE>Mp={Ky$dp_bg)*t`m zd&37G{A{?(9Sj<5SJ~(Hl+o$E_kWy=u^vBqlKmYnPqJjGBkpgo{mW)e_?K?)R9L0m zw!M}<)AU{&b!Lywg?O)A;jRcad_LtKiHGcUSZ6cmCd);ha9r#zC&avVG1JgW;Za}ju~}4qcbb@Wl)m0+83j)X9f&;+48&p(%E!r-P*at=Iz&p-JR>1 z5z3wva6~r9+*J_DqVs{E@f+GLt5uE6WE**#F9PBcuU^jK=sXm64{9`gKkb|^GwZU$ z&Bl&0ue#TWd^=)OFDu7}eFjob;l=o{>F_IK(2S6yS01~&!b%BG%Q*(_#L;Qe(hxWA z%~v~zb{t}OL-0Dw(sFoB(%{JYr~AVpV_mmv&?+=^E`47JHwc(p@tda(sys0Bobu)XE6+F;{hD7Gg zjJ(A4vjZm23hK7OfH4j^Sg+Q7qDA(2oIbBQd5z^uyW1~@UweCZc<1Y1A6~n8D>u6D zAM7KOli_cD>l?!z4sTea>=}~`NvdUQ^C_wMH%&7j1#5c5QE%WYs%orx?0P-D?u>ncl#CZXyFNPfBW_DviKmZFTL7&lOUPqe~l5*!4^ zu_NK>3^Z68>TZMq#FaF33RBq4O8bVKtwG*d_hd6%H6_=)Wq5(|pBGa|Nv#sX&-xMOAAJEnH9w?G{c8eWuEIdY6S8 z(mwg*(;QXis7uK=`h~xp_tU5Ex_+8=h3h9R zUqJH(kB-}TD*iMg)J{J~44u|RgjRw!qS6U>F8%8_@X>IWX$U^zP^piXL9+1+@|3LQ-+_7wT6ny)sfyUS^M&^=6hsU<@E! 
zc_~9yQxAEQJ^E=VRG9V4c|ZAbw#4Nw>V3m$H0Gp|QO{y*fXAo2375EIyXB|tgz0yf z;f+t+(r9*hsKyKS=e73C#pqk%HEzkUX1;FRx-oqHov&g5b7CKz2~RZYbWUw2jDYJr z@ZIqK_NngncK~WYmA}Uo?!0SEqi=tX{2EWWv^>lo{mMU5Cyk2hp2MI0?5D$@{^@@n z?tT6-#t0io$y4cY=Hc!=PE4YacHe``ceNYV=rBHgO8bkPoqcdWga)g)kQ~p`jPy%` z2M0KN>gLxoL_5|ox#_RSEu&SiGDFsgQUuSXX*mN$b#`3h_O zpx)Cni7-08AMrQyp~K}&ty2~dwGcvPG8De^tn*Jl^|PJf5Y4B8AM+UPy0m^a++yVT z`sLN(@&+4qk(cNXuq__$oeiHpI2;~4!*GU1%hLMt8JCm|biy{$>2f68WjN312|PGh z9G>iRN*3=ft(G4t@TvW>!bW!UX`g*1dl<=w$Q(w=&kgP_S;N47F}%j9KGzv>UuS8e zq=f9^SWM$j9*XC@_&&abPa7e9wEH>$pQ(ID@beGzk<}e`HV;@nC-v%>vd;$Mly~x# zy$E;i+~V}2#o@}T1Iw4O9MM@;sTe19|MD997C7DL>ocMcjU>ekA){)i< z%v#U`sj+P2tMjkyw%ql5nTs5Z6WNS$;axpe#|;Qdf*jypU)*Au74l%!&F0Sf@X!9| zKNvPIzdfvPZ)fS-F?)pE3~!yYPI-zgoIEgfk*IN@`jvTLF-Z9uc?oG`US?YOuwHbJ zh5mm={1&DrpSJ0DBLo_ME*565&D5=?gYD+3OY$$efX}_v5U(*N%Het}_#d%t{=c z?6YR=h@Mg?2hd6yJf#03qiReIg(gG;?IBJ&LyJVW-Hn ztKxxF`PGg6iYg#Xl_BaVx}$kkNJ})DXG{;?`|SR(d;NJX_pmZ;S(P+`>c80^E2MM| zYc&9LjPj7jVl%XL!S|Vp65nK_Pk-oy6@^7h92VlZ1nPs|-vow>s{gM4T7I0Vc4FUh znj9MoW%e4GF$&wfB0TL86|Y%Q8(|HijFZRtH|A*zCD}w~&hk1Q!~ww0VX}0L(|R$B zJI3qpi{bs9MrH)^z5P4hG`#&t5;qdVlW1`rzZ>|DD^45p$f1FQqH$FG5lI--&}}!> z;jABSIfM>n`9->oTXdvaC5LPf7?hxP_B`#9A+=hVVU~pSe7Ej!7PCeg&DEdp5-0SW z=Ow)3SAhxdN?XBZ`Qn1!R5D~yh6fSfYj z@A1O-xZrEyNAC~6{p;^?T;j>_gi~|YHo0txe=km-(SSP?v^RYClPALimXN&h1|#y9 zZc&eKP`{fc>L{a(#C`sPr5v<9x{UZDr82xUdXXP=Dbs}K+xX3AA<#+4Rhj!y(r`)4biRvOipRnFb?LM#p3Cm6Ti z{?@M!KmPF#hdqqkH*a4a{_Z#59R9_>|Nig+9o1v%t_|8)+x9bP)X1Dk2BAPDqxlhql)I#Qef z^x2bz;lt0F$>Bb&8;k_Med&l!J)7N^DT{VYl*h)4;SsAi-hZ*dN(hz~a*~gGPT5eR zwvcDs;jzyWvpp7>9bkAPr%m$b_3KxMtKi%s50)6*^|s~pMVI`bA0D?$1HZ`V{cwhz zW;sA!a4*~NuS|-+bdY>E&4|TReJR*P@C%qY1*kJl^ou$MBlU;X1mqv+WJeTi{`t(2M=y;IozC(MQ~! 
z!9dDDeHC&Mw|TAp$t$(19u+p~;*7`mp8sQ*)lXphIf5*>OBz&`=NS&v?`+|~s)+z* zi?**aTXK82^x9j)5}iMBxvH7}BThWbyQHJ9I`6+ggmm1>dZvN_YHabU?qh)m7_2^kz7q|=hJtlr` zDOBJppk7NaE59pE>jA&b*4Bf5wjTF)^CztM^lz9B^QoU<(|9j~o5q>HSKKx*(_v_w z0-5+1oPNdIG)SK>K!QUG16$O`eZJb zG089eP>!}U!R#M?tuw`gd704JQ{;9i-TQF3zsIQ?e;T928*86&6V3O81%MhxU(E@5nQE*Z`dx2NdI zDI@56`%f^4869Ti_|m0I>0mrYZ@%`muMHpl?1SMkGhSc&)o%{>_D+ZU|KU%#pNWxb zj@r}~sDTsn7~>1OL0#R3hZ`K(>P*<>OIPT~tP_r|!3U7Mhn?`^n=+BccG4Lq-J_8# z-*Y(9x{xtKQx(9kh? zjuGr&0JRLuM>t_B{%aV|w)K`*{>=gx5F8^6f~Y_Bm}woOM_TNs*%yF{BRZ`MQVsWVgl z3%wDH^+egs;+*C6_}<@zLkJx*CT`Vd<yAs&ZNw4Y=%cfKkspuuzQq0}ZURRj zc)DKB`x-JjRdMZnLUa+1oQk25suqgn#{}!!h$uCRSjWD~h$_x^6EK4v5vCVUCu`wT@P$gC$_? z_>|IqexD0+m>Rx*{Te;p8nds=GSza*@vq-izRfa80y@J+V&?h#YU)NTi8|%t0;*Rd z_+`j1kU&*l8T|s%39ofVk*w}jzhhM0;8;U%{Arm1~x@ zxG25wrPEV=XdJK7UBjngFZ(u)i~h#xpJD!{uD`9sg_G7Poc_8mrhVRTinHI16I|^P zSGHtoJ z&&$oy-}^HSv^oLr;e*ekPn#Hfjy^j|ZM~KK63F@q-#_zyoU7j&E^`XV-od9?g5ch? z4VI}K($R3qi^~%Bxg_c2u1gLMhPS@*b&d*U3jVouDTY&qXiB%F(0of?+IDCVr#;cH zWm>wRtc0C*E^Iu#X}bK&h)!LM!62`*e4WYJG9nx^Cm5K(yRhJ=Yu5 zcKzD$RXPu!-@iK?vz*0F?7#a@-y6R9jWag$Uu6`28T@C%z9Y8iPC8BU51)4CUVn`n zr(eI#QlDM!o#0Li@-H(w#uJQf8Xodx?`F` zy_=gJFmuGuP0u?R`(J-^Yj}$d!CM^pyG*%J<{%5rAalZknLYe4iPx|}u5*L&bCx@4 zC@;C^2uNp+OiwxZZ;7Sd*El`s`7WngG9z*H`5}kBAm6>m>}|ogW&m~_o8&9kFAd*$ z_ttQG=ZyP1w2jze%(9PDI%nyKYeUhA5P7k4ULTvqxi4_!Z|ILqA$t@=!FG(2$IJdI zcL;&+|DVJXGEV3*`C;x z*?VNH5$7yhZtzyRB-pxPp8H;Y)z)-a_#n-C7Tla)Tj#KZ}Ul1+QnFAJ~m_pg( z1hXgj*`v>xRoWZY$=lu6-yF8C+(czq{UTtZH;(ypmaZ}$z2&3py$<(V=l83eow!YG zjb$ec6tq)P3fCElI%NS*oVYB28Yza$i4T8c`Qg2Ul=XuVv<0gZw}65pnqfRE%qn<_ z#_2C1)0c1*A4A<9zJ4cG=qQK{4Lc%$bsZdql@5hVo9yIhD0zeC3a1I|@2|53hQfp5 z<)&+ckvXA9S;BT8r{gRzdqb^5>LV?(Qe zR?J9w2|p+-6>C>8`%m|VdsN=nZd^~LEB>yCOnjrlf@ zpIr`w(RCbg>DQz5`un?x%kTD=8}T)ces&z67v0pa;nmrSGq2P9o`s{(JrXLu=DVNG zQ_>AfJV56pFgdilD&!3M(_%r;?Sr=3aKOg4lOjOO}oaK zajdr-!F%t$KW9Xk5lhxlAF!Mt$GB4Gu3@y~U<(@Nr(7`R=}j7mCk#A&^yz0gE^~|7 zhYgl3IiTVIkH-qGunEz&WPy#h+0cn0^yJaq;nRdwfP&9ihqsx1xP63hRftP1OQpOFc? 
znl9YUc>~>-5dnIT;Z{6T6_~Q1VqLa&SQMd#-0wy(pPR$r3m5`D}yB@aTbgrapkH z^UG)WM#iR7aa-QxO**N-kPR*Ye4C>>-?+>YuT|^=Z5I0qq?P4IsE@XUMyzIqc2kaj zU1mnbzEe6+cFG7uu22DC5-JLBGdLWsQf^)H^V1KwP5k~dHcsy$D3*BbY!27iaQw|T zNq>j^7@KtFqo?QyL_ocY#uJ(;JFWuZ^QEqNH&8jM6IGr<2)-4u^1t4V6IWuc_wg^H zBQKdr+Sp;Xf!mU=91Q>TAN}s|kN@F681CG>!~Tye!}FZl>9QyDfU-&ExEcS&`nQLh zYuA|E{LjO~kG_YlF)LwzaebGO`VICp&_P8f*4McZc#-8@2y%ygC_C3~4@;Yu;T1Wg zEUE8Aq$hEru2Rn((*aJW#QP;6f5yW|w=7A!{_@&-SHZ=oe)Avj;sHRvxqOs+{WET* z6DVNF2NNOOUoLEBx!2PN*zMHiE9+as(gwU4Q5<&Y`fHlDzVN9{H>aoGEhEOA^3gEO zm-^rJF(1zS}lg9Y$wEsGu=1~+inf2-vo4!{=oDY%DU%J0) z3D{tPbJR%<8;{-WvTEV<40IbED^*qsF!M#@#K===e$f*>PX<*cCI3Y`%d{%<^Y3{) zjiSb7yw;10uoYwe_shb~W6a-Pg?sK_eE8jdKAW$`H2t2!^}BFWnEGzH%yH{)oTM?v zX&lX`Fu%x^xSRHZw!UzN)%rx&_>$HrZ~ZRPY5kJved5_TxHLgS1`PF2L&JFJjQS!z z4KU*v_x<;OG~B!QX-3>^e0l>gRTr0o(Gf+dP8jjrSE;~gld*{5w#QnD|MaJSHoSG4 z<2Y%ky%-6Nq#ZeC57;&|u%@1wjU(C>S)x{C^z8BQ>Bm1AUcdA8VHso68Hsk%)xQOd z#FkM%(~yC`M!CDOuKLx25u+Ml;pE?E{pzF|3C zcA%81voY6qcNz75eRzZ44x1Z=^uMmK`BomCUl*T3bmutG67VHm(JVYXkY_D74Y}r9 zW5{3eY;KnGG@y?^{&4ug4}L)UbPCDWxC+ygCCz z7oBjVLA(3xG3)Ztu`*mt1G?bl47?iKc4%!Ey>!kaM_-_Kk69|W%lY@a7$UAWzjgI8 zwgjWYj6l1w_QMZ8;C_v_67QY2?r>xEPlkW>NB`IGcmK}s4c~nC-Qkaa{60r^I*7MM zTgvRi8*h>yjL5RI!_D8)QhZxpYgD>&W-jBRcamEZ`L|A}!^P3~9Cbndx)Wh|!~5*c zMDm3EP?n~0#*BkU%6{dwO*Z#(?*7`d=ve|s{fRAvM=fh2mJuy*KGJF6oo~w$+B5or z-<22mEM$)n#yG*dYx~1GdsogF*xcN9;F2XMn=8Yc*VoxsfDz5IsORLLcAR-;F-UTJ z#cTX)sIL6#J>NuVzT-R6nRiLP7$E_#{$u%YHtU!U!1YT@!{7V6e{cB1-~ao=?bq%w zlW>)Y?9H-pDl5|RT7r~P?86f}2fJJVx5&P&XHP!CZgd&g&Ed)|XBsY{yX-_q@0M6H zXDsz`#hJ8K^v{z*`RCI2Iz^N;u0EJc(-nwhNA3$W4w&df`u2TS1!!%0hyNx+_i6@ZDDm-ddCp?H3}PQKDO zB6ycf3FC$~8{JJh6;^ECE#Yj+0h_KDm|}f?P?e~iYiB+9GybH{_jdDllP!VytZ+lE zko3r&5eOs2WTqKq@7armjAy~F_$VrJuy>zK7m2~ZxUA{pk3Sx6dI1s31-cW`nJP@l z3HZ2?kpBFydP+z+QwhoKK~R$!FNN_Jb%`>@19`puCGh-3)BPVF9vMb2%(K#mHV(3} z&@zLex4}Ba?2}L)R*Hb8eDW+f@D!?7N=!X$T`XPh?|D8p4L)r&b-uU%E0KT60P!>p zKBwRE-RTs}JiJuaI-+Q46`sORah)ee{;EtEZ<@Yve)li&zGSfs9y8*CYuY*;D?E+- 
zrq!oE<$KX@ib?}cnFyEl-x_)R!V8sO8A9SpML0gi@PZ{RpRuNxOEMXWHol|Rb}&4F z#A-Yv&a8{Ckt=F-k0D!!Ad5dORh=)zpXFIe-DV#15a!VVYDYGCB2AK^ab>GfN$TRST#Xp|?*Dnd zi=*?=G|i*Rr=lV8|AK!^TDi$nJU<&Q zVN^Y4X_4H!7)4LSlhv&3mQ-N=pBzxzAC!=-mmhTr&&-x&V+zxY?f z_kQr+@Y}!nw};1v963w}^z!8!bRzBySFYY<@MN1bumPY!JzmJ_1C9DfDYPV?k(JJ} zmP~?6^GRJ!{wNgl%71{=Tk)1zC?n%<9=1z|1X)z)rRpw4E{q+cRXR}1m#(vUe3SbI zI8o^AeGD*CWC@L@@5GHuSsaLsMaGgXY4X3+EpZuI(&e3DQ=$+a0!cS0G=j++jvw`y z)9q!J58hfE4py#YfXNdvH-`P;<|Srsy#K+?)6!7PfG-56oiL#VY^ylD4I017Z}K;C z0FQJF_hnpziL)fn6JD$*mRXYZyT9@F@b`b`?+kCg{x$NI5q9d7Gx%TdXn&-`Fftt= zl9PXB;eHeTUAlT>c>5dQX4dpED?Yw9EN@+-S6J6jGgQ{kbubSOBeCv~oL zsLPazXh+`7b8Sk?KRFT^NO|S2^vb(=&SP|bk7Ahl@f|qB%2TPIla*`;GfJbtMpz#t z9<2x+#vO674DQV3T$IP)P9CTS($pRMf}PVk{hq%!K~q|r-saAyd@B#j`SSkV_l7Ii zU#FH}NrXL8DndIIr|c^T0F?!0kqDtVk!3QK`B-aOap|l)tgs}+hDIT%gH@0w@eK$O zQQa^eLSedyvvA-dG>sFcEji`LWkILW8rH6z(iqWkSlFHy@o3zk$3AQ}f(t_e zsYI!CUF&=5Q6#2G30Egikgr z5~q?W2=Jo=5U1S590usg%d_En!P~6DMcxINpeS121S2j#(PXBFKQ2R1hcsNN)#$|| zbI~TPG2$;De?bXQF(5!Ytm$Nr`QwO*f;AtaYpGxgbyaXyDE@#U%<^0LYMdsaDw8sR zl9L?}nB*?H`Q@SB_1u z@cx=EpM}F%zDsi%V}+;TD^9~U4&~TzQ=DBFHvamC9)N9VD;>XzVBsoVkLKe(9LftK zP5HC~=!mClpRFIv0SA3F@@;T+-AS;#)n-EQ#K^q6_MSQ`@H0+V zdX#Utq-tO=34>8sBoX|M4wihXKB$Z5TzU#!gg8TSh4-hdrOzdyEJImj&AWWL+;bU2 z$ql8;tm|Inw@Tgp^x-ENgD(z)He=!Vr)_o8*(d375p|PF5Uix%K+p>c3b^V$~>+RtxOCcO# z4;|*8vWu+E9^~5y@Y6ND?d>z=F*;Xq&xg%vXn$p)^C`ZP4SuvKFVGn~PV%LZUiSic z!cMJ|Nt;ZV?lYDP-Q(ESWk$x`_i#+SWj2{EvkdaVlaH$N6*U|AAPGB-8dK`w8Edi~ z$<3x|e4Z|yx{H{9fWV(d1F}rCx0;fi~r-h!@b+D58wWkZ*k=NrD6NZ4VI{E zqf;0I(E;0a2S=4p>7a+9^fKhKS0G}RUscAXQ^Q?nCk$WopvsAN4`l5(G@XQRj8K-j zSpJ2y{SRM!rL(ro%oH}&GRtz-xR+w{3O|-*ZLqZO%?<8fc=7$=#WMyu$+LwMI|8-j zEQ$NaG9W>gF0Bi5l8OWeZumcHTFIn4%apgh7;uaf{vmn zcQ)wIV?eW;fDW^FF@S&6EmJjrO1DghP(A(mC=t~Mf2+*76w30X{<)7v3XFh8z?+@# zawC#_kS4gGJbYQE%{TeFab=6n!<%%}Z*y1!CnX`=*k;(l3)qpz zFFdLr5=h5<_GB}WQv->iaA@!rQF`Pj|E8~tPhiYT+2D_!70j#O)_>v`#z0(vrO}ly z=(8{`vxk;-zRTuDkLXyZGnBFpHqugOjBaPuI)@jYg}0xTMERMg?V1MR&c90%b@sx; 
z9}RnF2gBNpud=3$%mVoWyWOpn(?g{&EzwisgBWB&r3AqiPLWvfMn(}_go~dARbdx_ zt?@E)&|Cbg zJS*(us3QhV#th}>lgy{g3=i%*3dp+9-K#MUvMsr0e)t%v^gGomnNvj5t3dF=0gR@v>ZS3jqf%B!Ki zT*$nv&EYTlnO>(khiZXTxrxT<1)Hsp#`$?JI0G=3%|-aeF{P<#nLaPZ^Lth)iym4U zN9D;FgRly=^G#jHy6H2Gcj~0m7%tX13{Fs~Eo2}<z(nd1}1 zvEoTQ)R!)kU2xecb+j^A#!`96i7L0YZpCP|?yi9vgGpZl(0|ZUzp8g*cw57c{>`V~ zzAI~unflSdQz*JkoQIbl6+3jSAlRh^olohYgRjGtpQi@7j^9qiHcNV3iW){0>rCGY zujx!($scfV&fljHzxds_uW}g_AY2$b4fCHyj()Z(Jjp9hPx4NP+WX*gGmpDuHlOq5 zHRL_y#Lj^-Tjx#z^UK+Nk7m^guj`o(??W;TJGJ_yZNo<$e(>NyWMyYd+4|Gi^qwXC zJjH35_jN|$l$i8f_+*YH=iI zMwrzJ%89%no#ry;=d@GWYnT z&Vy2ycxBIs`Dcd!*TrH=*E;u>v?~J|*Z_{@uhUa68c@ciRk*Y70Wez4s>^24kU}KHTpkDdZUyzC-@WPJYHtrc*~-$5c2ze0u5j6}_jzXdtzIW@F64NiDj<^UJpV;mSU3x6j{Cgur!dnH z{ik8R>$)EC{i0!}pRQr+%Q$-0Q#A7mk9vZ@POIU+>~l(A$4L_uIaXz>k<^B&FrC&c z-HG2tx~Z{JwNsf9kI`WUOpchQ_T-xrMi4U)VLi?0*g7ZRIFh^W5((;=smW?u(_hs? zl`oNpsZa+$!ccqgkW0j@6OmKX+I)t-F&~=W*)!QwIQ6rWbHpY@Pu)3YTHPd8=m2TB z*fGfA7o@4N>tLvZIg1$KhX-6fcJPGt+LsvYbIDT(p`LW4%xr8uu4j!Vi{9`Mk$U(d z?VSf2waP-9-lzF9l}+>0@%=U&VW(7>zg}vl@p}2vWemg1EUnm#0c@PW21&`HalcGg z&%+z3=$D(R`y$*l^cC;W;z(^y)ydONfO@3i9w}ytneF34^8G$11!}~5>6^1$cJ`$C z9!HV)QNy;=uAz7Nk+9O$Gd{{S9fI*`m`=ZT+6}Vk1#@;lYL<*P8Y3|?@zAD}FEeWof>vsKE=l@g|(=y>Qi!!_pT46lKK z7=7|R3!T5b)c+!VbDoSC(F0Q4IzzQG?6S}3%I;0>Kww74LF`DPY{0UmidOYI@uatL z}6%?FE?W;8I35=;U1Du8UP{_Zna! 
z4z$K@{w(^8H6o8sKvKvuTXTFKf^Typ^v##L4K|h-mm@=o=Z$}~*7C2zl((YpuzZYSIbQ)(Ubn<5 zfA~s1`7poc&M!;uR1c^6(6}px8Xz5W9Nm~3-tng4?caWFh;Bkr`Wrs+lOcsq7~MQhRr=^-G|~YM{F0k98MSpVqAvMz?z6H*E~;KB8cer;rZ?tb-kxv{zv;Nn807Ku zkMNm^R*wuJ-7dq1`z^aLl5?-cIPqeAiD*`|SmFPA#i5)+IvjHs3M z<>3hznK{aD8qTx?HFarb!sU5Pv%<#)EPhSP`8r1WTdw`IthkTPuy(~q$;Z6XFrM*E zx**B-79E_cS3NG2`v$zIXeyG~Zbd7WRYqIV=|BSHWsdfw5>sEGi49ANsLvm@*+O1)6>y2UK z`kTY*)whPN>#wm4Yj;@B#_r9Offd?%4-;}Zl=`=9hnmjy!WozKx$K7&o^$txWhiv0 zcQ6tjc+YdBIpHJC@^Fz4gN*JU|0NTs*Rnwyzbeht;KPRSc` z5u4OJLYUNh`h-{CG!m7ckAe(sW~DYtmgiE37Ht>uUnEoc@@+h+MLamlFef>!GP|@( zy?aE5@ZrM;o>&4t^Ue54Sjd@dj2XFzTVy=_CxvlA4liRI^Er-3@l~I(g~xbmq^7YL z9OTz_+hWGWPQ*Hw&nApD^r*gdGV)V&+n~>@x*B<6~O|6YxK^VHT8KG&12)p&Ey0Dh9%QT@0e*FKQ*SFyJ^cMSm5Glvt{UW zv?InmFsvw|V&oXuXX`Y~%w~YnBQRgEt}z>OLI6M%C=+ji@gaG?(1jPG=FDbwBd!E5 z(WN6)B!uO1BVaAKqK@p+wdHs`8eKP=UB7-k>nEk(2Jp+!FedtnhjPY6+~Yf+DntJg zqzGQh?>vrZ3WT_Rv^)2Kr}?HIznA0eh`)S(SqVgr{IPS^1-gV>SzCBIug+&k!>56$ z!WZz?^$s^yV-P_9_~2rBO0q#NROe zoW{TSUf&nWa7WCuD#P|F{_E=%$u(%6;y2D;)Xj6lygMITbaRyAr=sALl}jv}+2ky9 zFK}~>``)w1Y+$9X*7#@Ce%*E8;Ib~3E}v;Qk$t}P8x!-nL__0KV{xBZ07p|b?py9s z)$x3v`uDfbQUekm>8=hf%SvhdA3c76(d5Y;thuKCTE%EOV$Xy##7m6E3LALOp=Xr$ z7=BnALL_%D(4h1I)9pm_K^n(xG2vK|9MYwHE$Q;g3lb(pr@U- zt*)P)2?yCq*VPFJ)9$jo=qEq<3CoCFKE&mL*~8*8h{eoM$e;$?Ix`3I??&qEdw>CX zuoEFYre_)VsiCFA`@Hjel!G5piQzF%SAwQ_;rmpc2{Yrs zX?rYP61{*v(yii-0`_Mx)OrFf-mkEP&2iw;r0eevOV_^1M(I1;cKq6~Oh;yw4)Mk+ zCs09$^|@H;f^YE*<4&a*JQf zih6RvN(UG(WsBnOe@?}~l~B&kdt}Woorh9zl7MugL^3=B!ASKOJzr6#(I1RqPJF_o-ZF3aH;pg&`R`FF+g~&zju|f^4En+s zA&!2tRSaQY9qVP7{zm)3O3m_&l>r%4+>tk<2r8Gp$c1@SdehIj zZVCPKBq;7V-uauk+)g8}JHIjB3`lUns0Mn>(&0Ngr}v26u!1&?F%F39!4fdyf{oU_yiP^i{zIzpe ze{cBcqn`|a{^$Q?xO?|w&VApfe%s8YTGqQ+pHI9rB=DeaoHwFJX@XvZ()0J9J@bZQ zcbKIGh)&FPBV^=H`%UE%9fRH;ophbM^{JhdTbx|v8T^Ya;Iv9G#{Ip*6+n@2( z&xQ{wJZm6~Come(UZ}Rado3LZHy~$oHFV8f2;r+l8}Iq+jKJtd0vmpY;MFgoeGH!R zK8?}5`LP~rG@3g;Ui89^CH7bm#+>_ zSjO~_<2nyc53|DHlTSVy?%bi%Ks=4K`}gmqljG7SJFez?DG)@M1ut<;!z>6q(k*zM 
zl7zj$N`+nSQn<3gOax1SoQXK~U<34facOrrz4`0I>RZ3f#_8=W)$-U@R0=$*;&Jlx`JWFnL$OyHg(TWuMLc4#|*HQ(q%f+W<){eaCo2~o}Y;c?C=sHc~1U&wo^G$bucdYZzl=qT;>Mb&1 zI3eHnkM0f64^M~fo7i!i+$Vx;bs4ocF$wEug^Ygr5K@i-%U?jNYlrQ(+6b{Otv_Vhf%bUF&Az-lm#Wi|X2O+`*j{j2Q-;!g& zSuRKF$-An1EYpTMYwHZPu`KWWiTuoCm-C4l(!R(j*}}dj+%VeWR&7qBS{S&D#ga!ZS;*^% ztWiFa5sZEpoh@K1S}O|`BpE=VmE1F?DbvV8i=zQHTqF)XR;GkA&0n{%I<;uJW(;wm z-(z+kKVripN^*%ybTXBfq(@nlkf4P$`5UK*lu4YOqA?_1<2#SD^cJ86dE1?_kVU-4 zu+HJ5M*GO?1c;#Vj<9?Rqd^j8!u!}rjLQDv1wFYGE=NHMMX7J&3Zdhl3Yq#O=@~!M z5Yrx$J?Aanm*JiQjdGZJ{de*n-LgWGcmA1AnGK{P?=&zAMEJ?yI4Wrc$fwni`Ivk~ zSKCFte9UDz>Q+Q}QI6`R<=)taK><2wh35AnX|-f46bQ1k{HC}{e`4%*41&e2ldqA% z&|N-bxiXwIBZolOAHTaSPQ1^(PUBc~O0UJ|yHuKnQRes?M&EwFm{#$sY{X#fdPeZ# z4Q}HHxv89^KYS;m>Cb6ca=8Tgd35w-I5^@2rO!Sc-hcl+MqYms1Iy(qHma5^VGH|+ zyBL8I8yXv4{r##uw3hd@oFmQ`w_$qDHNMTC&;A*sI4bOoM}5ggP+|KY`Og7*!)Kp= zK794f*EytPJx95&F`Cl``yr#u)=I04>`HN{rfythkJ_ULpA1`UMs+44r%}OMQZ+5o zu`$&MEjm>!LM0>){j*b$dRzJlhn~pBdKtMIw?@r9*+dwOJ_d9!onw5OFTI&~lO+OI z+!VXDlg-A)1rR*8EIj$yP|tK~Bm*heKfc355mylcV=9#g>9BOT_~55?tG$cY{wf&$ z#-s8GvY^X>J=-f!#c_7MW(aJP#EA~r-oeA;$4`dOK4$3{r_HR>0sVx{$>I^dX0Tjh zhF-bxR_&t=61PTvWH%bcF>KK1bP}e{TCWW2kN2pDJoRg7`26li z!!Z{mZtgHkWFsFMp@+1SQ9N`lU*B7<`pq0fq^(Zd$c^iq^tijdIBX#b%h0NwUDA8L zbP0QbL00y~th3B*k-G^mV*FirV_5t_`B zEMydCDX8mJB1+#?@1Fmx_(0 ze1gNrDm0H2yS+mS^Z|S>!2S zHa0r>E}0kn=og9cUJTVsK=ttF-+JbQKhmf%_~|DfGUITM(~b@>r1pl-S&H(M3%Xny zR1LO`bW-l9#?G&J?|dvjgQ||lbir_RO>dStXaxFi!dn@NMOX7Z3nzt&VY0=ChLY{U zW7gU~dE&)u)xbMKw#CS{$B4RA^WcE}cciz$o{me~JH&xv>e)wZ@V?Cm?ULuVQ#aAt ziahnVv~&l;_ZdAvCO@UyU)q9tultLv!!;%hF8+Mlfwk!9H2j1)=5jm@6PGW!KVzHQ zx7WGjARPz+6=7Uzw8+uK+gqC2MfQG}u#TsplBnzVvB@h>{x*#>kI75(+=K{Qori8? zYxco*#9wHTK7QUwbjV2lXPo?FNmSoUoJNsa}qLFK7@8IB=A97&^DVM@Q zgL_(1=119$0u5ib+p^v~HOXbSAanG>K=n=?Q3&pb&@=aIV9@NN6Hn+c-240i7x-;+ z+w{`#;3@Rpd%)cfoX!J%2Pd2+#NGt&82N;mip;!$bB~k7PF?x}{qEfiA^tHv9=gi& zl7-*OMA!b2d;IluWA6g95F5kx`io)BP7?5obg1n3H~{B?0Vc)78a4$H&?U4;+YJ@; z>C3o29l?40w)RFkBiI%z;&ijHdWdBYWu~5=QT+x@dG*sz0;Tm;E@8&kFl;!!XE7>? 
z!XG5y(@zL+$L?ro{f(IXGps&^@4PkU%n3`H)+cS8*B&Gx#=bN|H3zjcghLux6v9|Lno=?1T4)QwCfd#6DiV zGHku^wP9=bGUbkXSC+F{rY|*O9`RSA_K{z(@$JRa&$up~4RE`h|IO(GXRHlVI4;Fm zV7B2o6+c}?dQ&VD;$KWuDsvU7I}j%)BVM{tQquB(W*$`T?R%+39 z!`(Qnyq#Wt%x$1Y(B*M>nXv%Y#w{aE$h4Zif%4cMPMaXk~Nwl zEjQ<Q+xV47 z>v&@@^@6vO@_mdv2guubTo(fW8-Gf1=vJ=it*4kMQfbaim>CFAWFO!JNA6a2Q-|~t zPocCtI*(iupg|ae(|D0p$v*mG8pb#AMypJdfCN?fH!m&lJ`5AS(-Kr(iXZVbBHJyL z(MTiAm(7!ld7AVK&E#)U(>O22X~38Hx1Y~`Z+M><{a*(2GPpSoXbaJiR7Qwhda}pf z1&`&d5oTvM$s1=Fl=+mFQHJuC`XQEb3DUYj7h~UZ{xLK)ntRq!ebAl57oYi`<|i{? zdKb#%scG6_@Tls~A3n|v&Wnt?t#Z=e2?n5}`~`!-4o}`gap>|V>#?=k)Vt5}s;$c` zpT$_yz!H5KIdi(i(KOEJnRYBjd348vAQTGTXsn9g?!_O-X0B;W68JWq1u(}$z^wQfb{-Du8bL+mZwb8Ne9F(opLuJC zK!17L-)HX&ousEv_lB#xS8~GC2OoVf!&;q=!UIM&P%~rXOc-sYOL#QRJRQu`*XUR! zUzDrVlSVoR(h5D{DJ=4*$b}CFkZrG&w;fmO9QpI;(M|Tm9B^6TJx(J#qQiW7xP0|m zE@6GL$G#unuU@-8ym;^!9@vkuCT)oX8f}#+r+jGaec}Lm}Qs31!%sq)QqEv3aPUUC4;4eUVPScpDHjSfipo}&F z&&)Q13+m1BA(tOM{)}UDInI}Tx*GlF=|w!jrTk3WjPof^8qCUcl7KKNUxW*nIKGcA zZRq&<#dPLy4PVbn+dMB_BSoj~!Rv|>uND}X{XKo~*>J|?p3CUu(xuJeAyc?_?|m}d z-dG-XS2pMLoJaoUzT`QspC(^)h2Ml}GuqkXLphAHY&b!h=Z@21bGa*nN`ng%M*-rM z2z*;p*y3oa4E&&5;rePFkdzT znKq_y=uF2>g;Q#&^hQA=N0h=09n%kNYAD`DLM^Ffm5*fe4VA_rfd9Z71taQ3MmaG5 z?GTXG$qDB-J6(t(F~d&dz$lBSHWAs=n^F%`NgkH$0-_%dbo{I+OlIz9 zSiVcZW19L3Z(*r-|JJ*7m7lWzMNZts$1j5XZxb$i8a{R)*HCC1`4{B>b2__jmaPT| z13(>fNr=lsmb~nz0wAYMDv;g(u{W@hhJzj`+Ilw11ukjCk`l;xddFV*~%i zyf(aLRItV#dEz43{jXlU4NV@6YIV44j!Djl(FUs$*SuOT zE%*D}8f|{auVvaYcbfXHWEH`Ye&MV7R)c5ePdsJr2}$zUcsR#Lq9r|X^=}@0R=v<^ z2nzg2b0YCaL)1==jsC+Y&$ztoY}jV`$#wQQ*hp<~2Z5c#OBm^?pP*m5j=WQXdfUx` z@BQ!x!{7SOWe)VvfF0|Jk)}>(`kcuWPw8yFTjo<9QZ9f~Zqh1`4mhcY>+758KzP!L zgEqA!rSz`T5Wew?qn&;YGfl6FjjmV;;T1&-UduW?K#VYrH}RN$%P{=PGe7lQ z8)AoE#Fl#~<;m3-+A!Sg?D9ct$P_7;-yko;l(`26AeeL(E@KRCvK%U_8farZGT%0Q zDfAeBla(D;>DRKP8oJXHsXyTW;X#iSan|MaHb*<+ zs=Am!VND2H9m!t6W@vrNtW#1poQy1jM%j&ZNa7o8`7JueB=eZlWTNv*NufG!~Je71^ed1e(3$N=q zjjz5dYzpf;ViZG7tTM{I2oLhIvVpzE?d+>`n0Hv}bpO+*!v~)|qRwQGjYInrdJFTH 
z``faK?66ry)O3nrMqgwi$5XlZrS4UnX}pAt{&_cpaQ;mvB3~E&kZoyjmMB{CnT~QV-}-^_^oKwD>w70RA6k;QC6k9bYe&1j2Vbk zo~vxQrv9UFJJMw3;*ZU9@WjCQ;?fy%&&&kEnsvfHI6;&98h-;aYK9sDKmyMrcExXd6A~gDAe9=Ym z3s*PXLe1ky5;*f<8La7kiyZ7EOB{E!+Xr8)H6H22+Iye zI*?dHz?jo983)P)__8PT5c8L+&I`tQ}i-^-}NnL!8PF5tR5^rz~p) zQ!`lNQRC+M+7dR0Ow->ipG*UtY5(TOIIntFr;CP*_)N82$L(kP`)t2{wx2Xv7F+KO zpTYH_STtpdoMBkVf-3ag}l4=Crr*E#4VUnGhLqb<$@2q&}EO$Fv78o*kl-oV*hn zsq>A``!Rzd8iK11c(9&)V{Mg26Jv5~X?T41Q*O>?Nz2V`jL)K1`ldXX*3`|zm1FVK zJO~#Zl}_ktek~vF36bVamMyKZKQ8xe(1B2nUEdhDbv(Sm^E|zhN7JHzyM}4~Xn&s# zH+znLiW7`8Uj1s<_&R*bU&C0oa#9O;4yX@2y4Pb|Pu$}$(x$u*$d`vKi*xV8Du#?Q z@ZGvmI4}CLjaUa3$&AasJj?dC7+FkPAIzpVO&w*KE^R&y= z?9_lsgjbbxM$lZ1BIce#ySLDD#-WSJ|ywe3Hn^RTpiBez+ zNpySIIO7sR?9x+o{p9H}r+wLR1>ShCOfw)q|4$Mxkj|6lMat&<&gIc*{yh0~qLB@0 zFhH$h>E=kyD+cM|z`nhwCulG*H^I8f(?@&5vlDK$yh+HVTND5cZ(tW0xj$vP#xZR( z+2u{)pF~WfM+F?gurjY7jgKSzkyI(DQmb~NB#?L`eMMzH#y{z*kfLeI2Y;!ovWbZf zgikvWR&pM_dVIp=OdE{e(?CH2vl5<1%`eQ_0NOCu`Pc_EcqfdGfC5Ja5P2{nQ^bww zzw8mN34nfH`>P?t6*%C!y`s=&n!j$e%;0$=r1Ax08j8z|=6(Cy-_GpAzx<;=p<$*W zpw4y_Eol|T#GjEEi17OwNzk(o3}FHe6>oq73lT>}av zC_Ho_H%J5Mn59_%`9J?=fb9GcSQAK4d|c;bx`!#B09!B&-UMK$72u%sB0g8-Fl{H{ncU%6kKk z&xqoJ-xoe#3N$9%4-!c|Z(5yR=ox50AJ=IsG^nnjUqB`r1^G>(rL!7+28 zI^~@jEu)F(L>xgj&hn|A*vS*N;iqb+c7pp3l7~==sF|ZsqE=U42uUGG5VCobcMPO zBh$6d%9MYm)O1XrraO1$S>;U|hK32Ro_b!t%W5qj?tc)*<)L;Wyg#T$xJ^%-NF%Z; zp2X4oiL;U|9r0Uto|gHy{!Mek_{N@TC%a3_A;?=Lb+Y#m1&>+DVW?326U-J^Inwl>&vb2sHuU9e7Y8J8CZu9H_%^PDmp&7%JBUIAsJ zYu%L()QoiEodGJrC}}CnE`zfDR8L*Xw9G)Am#>}LJ_B{=9vN`2*g=4Hk5YB{?$)Nq z%VS=n?{;8X55$~$fEiX>(Wg(Jpf~r?JB%~Fp^TrzRGvwnd8$34Yu(Vl>yUyrC+tKm z(ph1a*QL)00K3mc2!)Y~Sp8N`>*(x_Z8mvhPdHfVDUjX=vC6EkCySi|+n4G^G-JC1 z0&E^;Q`Wh2WQQe9Y#z45YzdsZ`MY_L{g`@%7oqUw-5E`vUvOh|gJX<$=8OY9iDNYV z6P;)MAN`OS`cj=kk(i7+>+7hyiC6xnU*YBH-{epT%k^dK9ZZyMJKa6$$iz^Jc_ z$f;tJ|1cAsV+ou~(#?ayRjG;6oYn#E&Jx6z+Gi!TzO6Pu`u#7o}A(_z8M2Qv{8Z9)dl39h8;ZxN#MXkaq={p|DM=u>)3bS73W zzd>JPo6I)jjnB?=jt^Ys*5w6`#j{Y$jLU{BkQw!7TF_e>UGGR8kxGP;!c(TKpsIna zj0Z*2B~t*5*KeGHt5!(c 
z8H_bMJFIG|wT^3C3hz{}27?_DuZU+!hh`XQN8~*6u{6|i+U}S;xt#~~~F-tgN0AXZWaqzQ)F>5#>PNUVvEgcWujiWuf zLam*>0XB5dqTy-hNH~pUM{QjiVrRvU$O5;bEHe^DN{qJg9P@37k+~!0DCs_R zh49j*zsv59m{oCP*bA_vJ2dcSqo{L4H8LWS@q!os;UEP88Qv!};MgFkV>N}jX97v$opIH#01JH)~{o1jr685&DEq+Wk{4`ebcJ9yta%9;x@a~h?B7e6qWX(HA=QMl|xi8{pKmGCWtKYtkEZmb|LgF?} z-bYvYP~R_mzv$<8Iydm3p)C(vbRO(Pm}iFL71#38&!2j)^Sx;>T4;Nu+kPe`cy!+h2V`m;kRhFy|#Ws zc?lx{V0>41sTbgBgQay#Y;dJpH7p(i>vC=KWQjplv&*D5XfrO;fmp}rGp!mZF(2vL z1fTURNs_MVg9=r~c%&oij0Qu{!;8%~Fe14*{OAE`poi|m1i5Dl0G?xCJ-w#}Q%o2c zryLR}kKz=SpQBR()DY!Kfbs|c#CUp@uxU>c(+~c)e)1{J;zVk(MV`Bog9zgFpqi}q zsAxtk)D;2R;FDTI-BTtrx`C?vAQ;#0(U%d2mYR2bnTwgVNp@N%D?8FbXBJc@WgC39 zM{0EdMGIcS&7h)|ze4iYIJB3sU*j2H!n7Rq&voIF;!J2W8NCV7Acwe7?YdF758ss|=iA(^Oq;(!>sqpa_=W z6D4|;d{fSfs6lu$&qHqfMm>}gJVQ}rC{0p-(V173cGrDL2e9xXO+gN>F?yCW^+yAX z*5umxM)|@~kO^7j<|D;Hw!>*aiu~Q#E#WBN>(`{)wc(*US{Gyj5dfBg!jOtY3Zmbxsob`@?tt=-&<> ze|8_kjLl#e&KZHQQWP6>^@#_IODDt4YrDgpH?Ff`4W;EsG0RLUVF)UI#`H)~YQh%J zcCAbx#j9v+_GC`)Qc%ZNJXK?X8ISGZo4@*NS-&ry94lB+f!C<9Q&|myPYq@p{^#5- z?r5y=`dJ}o^E~@hv`H1B25>fRBFz}L<|To4N}RE( ziN+fn+sYp+3erfWUwNT?bzGa0_!@k|coLAf?5rv0bY$>9MJDTGTUSs#TAw?*n#jI_@}JVcGT{zH|}KA{S1XXV9oX_>yq85 z>xN6$P#f3I#Q|w)JgX;uYpfi)#+!d@?Lo7`1(p$Y!rM>%T1 zFGGftO<87$tsO#?SZMziu{@%ZcaA6H7YK3q$>WRGZ`07Oj{kotRlI+xpm&p7=?J*kV{95pi@@Xf?4gluKj#YUBh*_VanNk z0G2=ZKDdNwgUi;ck*=~PoW^mF-WGS~(-ya#y*T9{{3wLflVg~qN^P33QCf*(CWH-V z4k!`sX~N5TMBFX`vo5iU(CJrsI)11AV}{DIhVW8Om& zl}4619WpCPTGmYuSk@$+&R%Hx)BI%#4=c-t%Uqo1$oj@MN30&RJcqpdm{kanTXiM` zT4QZz7Rh#-(RXN(Ioo@6W8FGpnT_)B{A`aKnla=yxV-K9)g5ez=fnM{`@;j4S3RS? zID+nqYp2;Z7TqEJO$H~Z3=(!3y|B!_yS*{I{??u0M<4$`Id;?&zjU^Zc2fKoo58l9 zZ|h%Y!=2GnR#orPCOF6msSb=d=w=y>S27eF@<^FX{@9YLlL$km6+ zW?IUHIKc&}DZA35UTJSUp*=ffki@~Q7mF-u+Io{!1K^?JRi1VvPJy@c^eH$a9gCi?R%zvbwax(x0_U&L@BnB%kU}g_D`XOpUmUTHPHTjbpTbd5lCF8*&R0;FTS=6^ zqC?2#!^ZGSS6*ipP<^X-a%}l`aLjfv1HgqOJQ=;5I7L6_r2I{3OgJf{C;2Y_z>j%b z?^VB4KB@oOvGW+=33y%c2M3V{>6$Oak)eUf5w903;e>>ULR8=Nm)q$=Cbb=@b7~N! 
zWsj+tE3S86-J(EYJOH7gbV7}Dc*dnj8<$gJ&{m=f{9}aK7^#+yT;_##I{SZp&B(c7fxogWz$QO6sO<2C;8L;S{#n@r_uC{@MVc*O5?ck`@&btG6@n}$gq)D1fj)xof`vH{(vQP_L- zq^89Qs}cXI$kN3_f6g#W7BC{Ht4YT-!=Am*bRv4N&WxEhXVLGoDoPt^%5@60#-*JS zJGUA~&RWrqj2#S&r5MXp8}f=GZ(vY)8i!?|8XP+m6h3g6e>LLAx^Upq=#hAZIm+%d z{xCe{M4o4ys^YD(I~;|nerMAyv@9`d^@BhAamvIF2GQX@%hQN=#_1t$zTM)MV>?b7 z=?8q*pmTM=B6GD2S~b=pPYRLij7tU>cL+(N&x{w|#>Iw4W7A8b^pB$w++Z}>H1`m$ zqxu@<8o)NJ8n@4py~k#@&N>4Ah|TGpX7OB%t$#n7Ad6Ex#H=lIj8o4>Rpur>!(r3rj20$K7$ETe~JEDGDc3QRwo{uu{ zgsb)Iwd}+hhP`Jj_plNoF=;%agW=Jr-GL~BPkibOvpLZC48ySVbCpqK6GQT$LteyF zWfM~1lTv`bkcI{l`9yU?UBEkK#Tkq<21)2nV9aQcXzUjsc1GROopT{|w2@D|dRNcJio)e3Qhlhu|hldAN-e+x(c z;|Kk^T!WdrJDDT3qL!ERD05Va>42tc?ALQ; zEe(5MB73*M%pi)%0LGYJ+^gDGH`3f1`@7s5APnSnv$aN{L*UgtxeJ;aCha)xKlkQm z)Ag>2bYW~L4fM4!?nwhpiL9U%0TFT3wA;Wyfy;<<$PVsZ1|wM{WU_SI*E^6pIR`|; zLW2;>u`!yuIy$Rqy60*dpiK+~aN{^Z>9R=XM|nlwYiKi_Q-K-5R>czz60TRu6P~bK z(w(bc;={4O0eb2zMO5I`)#iToiCjQ|$FpGjZnV|DlBBX&11)IiRBMek2}B-$+ywpc zw;;bW-^wM>fih~iB^Bc%7vV#5^QRzIX}rW;q=~|_jHWW9jN`|%Rj4JhN!L+ICT8fW zC@h}Fo720KLNm;cv3yY$_s-?!HY~H_4(tTqX=NbnR|o^$(v#YIFQn#<{_wUi45tQ= z7;Hh%Cdb9AGGb1>(h9{pE6%ohI}A2{K-w?u4to3Jh}2y=`pUTG-nO|V}75R zg$n$V3;RI0^5e{>s>~bNPdV=>fqK@#EGfDzYn2c_aszdZ1AHo z*-Bd=gnG(aLl}gS9y%ao&eSEL#_|u}nb}7>Xh;^i1v4OZR*aHB2n;W7akDb7tv-MV z53qd8w2qIoZeXcmGnDdM8}Ahm9egb)7M7^{0dWYi3NXQOx~gzU7#bFlYPxar z7K$Ac3W@}|z_CIL#6rh@=gl|L8?U~CHIRO>qnX(6-JJ=Gjp2Ipa03Jc2QRIXGLp0K zmvEeEUIA;K_zHv?i z*1~N`aQgbXLlAD|{F7Gjp>+~S_%IdF0k+k4hD#CfDA+W}m@S}w+ZvmN+XwC^Auywa zIVM$(V%5d`y6-?H3hmO8s50w{K*!BDCMkp76c{Cul437PIJpMLgfyc2P! 
z!e8zee=7v}EnI#qe|2>=;8Q>gfzaF&6S&lU375cy1tE-f0&?&u!=-v^?`rdED3ZP1J6O54 zBDzf{eW$=|Ux<5}5JHj8U|Qyl4s+Dx3XVzxxK(+8Cmtg7h0v{VL`Av?Mn7@yKl@Mo zh!f%Tpizasp!Xw%r@R_%(p|=4cdSXRQ_H075_mqv?1JF9Z&G?Jq{0RRiWT_ zO!V?wf#)i}y|>@YU*4BI(P!mHq1MQDZNOj<331AQyq-6hDPqm8M5x8k3GSNPW=x7NNc+Q&vxXram~R5$hM3b@Or=#Fl~my2fI4>LLohx;Hj~4_9<_X*#NK$QZIm^$<5u zg4VR+NpOgp7nM^kZn@+&zjFdLa-Zpu+vwl;7M6JFFKC2$zq}OEy?hO*`Hml_tF?}0 z=3<`Zl&14e<;d1C$Ap1_`wD`Ru$Yo>@f?@;0Rv_6?CcoL2{NoPcB1V3+joHm%8UFK z%Vg`u`U-nXDoq%UnsGT~6MAdcK+r0qiu$FiqGb3*9((?_e#>@#%kM+w%FT)PnN2vt zz>`O6qzze!cUkEu847p|?`)?jN`qB~rATGH6dr~P>$V=ss z(=F9Et>G76h`aFD;W;%+}QlQ1evnz7KsW}v&EMeAjo|kBvjEMm>jQx5#l|-(jbnF zxF8?2iEfzc=zx951sF_q>+v7sDeA)i96I2#ejmpF_?>ZR8_wdoIrU3e zWhs_y{_^8*ufPdRO8mHKT!pO%2L0Bz-@wBI=J6`^tnrclnpus|QZIcYgkh0XVb210uK%R( z04;A+%(#@_0V4RwzTx(*LT7>Q5DsKK3g9viEzK_RTehxZ_FwEd;h8H7Vx=9yRey8y zH0_H2mVENBG(;Je?M7A`ZTIhBNiV}rcH#~r?YY8{g-8S(`q#3gmvS6fzwr1i?Yl>) zi&Yr~iSn8BCT+xuf(;crl=XU9w<{ki2VTNu z`KufVSt>D>?8*zIxQ;7%mR-L-b*3f$acamPYt0Q*dj)K2-654@>=S* zue_)ALF?uhe&I35LMwu*%8?dbhrk~09`?TntM#%{p#WB8?6^Cq0pHUs)Z7MMK)@Ef znO*}H+@}~5x<=aGSb?Anii>ef6Y}LVEbaTyb(0D|?J@G5!s#(uMF{DnHM#w1KKYK% zL*V1vQ7(VCO@=^RU7k(DoTyx3Wm|NLlNk5Zl}8=Cn`h+}?t1WE(};(XE=&`BhQioJ z2ZRDlUq)GA&-w!(3i4Tbna7sKlW1R29}j#A6-HZJRm;Ly&g;g>n4~FoZV>_Y!GqW& zytcI*?nYazl$~8k>hA1L)0Zx#UQRw99O$R-Sj~Y~DCCy&o-hfImu0&;>B_@ACz!8t z(3(++gvnL6Os6@m?FmJmpN>C^(7@q1s^QKm?T6bM&kwMEI{&IL=z?V83WA*NaV5$S zxbh=imqrn=$NPHHgQw48<*tGeL*-|PvmCsHPkNDlkHdTqf1ndp=)2$xJ(SuKh^unhcw?y5K0f{3?P;)0-qJ{KUK~%e_clVwxN_lq+KAJ& zF*?u}t_a&Zw#D{uutI(B;r%p=T(VCa9XA?k!YC%sMLyZZ8slCPU78wqP_;HHB-bvyu}yHdoe2z);_4^x!}RH@Pb@h!)6Q1DLj^~$76Iwuiu zrzeMyiUY@4N!uYP^GtC-rO14~gAz#X9LS5fvhav_guH{Xi}ttjE+6nlS_`0tC^z6L za}{+$M<+$)S0{gCY*}ZK!0~K(asgq6Ffr0`Xr-%nuJq9NogFBc@Rd3q1lFrDR~}OG zTH%Ox(5B$;{4T#OGI$2>G!#XndFFd%MqNxx!&+IUbCf4^J2#s?lmg4-vyUONfHdk< z$k&S^KLba0u9uCmY5egv!zx^URMNpe|9oz_Dg-|Fy|ROuqFv@ac}J1pEz5^`R1kR) ze&-eCP*zdxN!c-%0H5PSW1#7%j&jHo_=4|JMwZQrwgU56{wf@L1(a!5%p(%Q;0 
zK9>Ed6J?^6??ablV?!OWb3tHi$f02o7ntv;{X{_HH~k|5RVM5!@DU%wv`UEPkby)$ zfTVm;u`3B&S(qkMbOas6z&}8tI6Pp<75An?PPsm6z~JDc@H-|QCq5=rv^OWUqsu|&`;*@` zlYgg|%#$aSf6NlJE)*^;zxLAg5ZZqB%XhHC;M#;@rsa2N4BX?7E;u1@qg2R=28Mdr zwAr3Ei4@@S?+&Mj9$-yT(04M>I$#}VVajbd1BR)+d(_QAT@<*PZodjItU>IPx3by1k0Oi zoQk^4WLf1fA^fn}sM-cSS*!5GKZF^XMz)%RCv_2i&(1uF1Ch3gVV(tlama@R3KGKR zAnRnGfC2%J(gtojK9f0E2315RCo~0RS5VxmA)b%;sMxeP!77ZC_K3dSt0OpU9i-vW zb7^_8p1$~e76lvzvQ3-VFCA7!=-;u^j_@1S3F_0eq#0qPv1e2o#-H#}y)JEG-J-Rn zP$zmz9NWiPcqg&xnV91l={*zXD6CU#ZZzIM<(t-V7{2`ou_4^@Kuf8Nr~?5ctfN#C z0y`aJ8taoLb#*YEg1fFRya;;ty?qz31G!3t&xFNKIVQX}j=ifw3eJ|JV6T-(jv#yr z-lkKr@!pUA;0i`s#Wi~9_M>Z+R(RpjitkESS)b3W(|R(EqENvgbZxS(we_{2g^V@+ z5*5wzowyY6R6F!xr@#=cNX1M)Z+VG)sE&Z-M)Gnadv|+l`VX+=9&WCrhfn6xFcJ5<2`+JAaySj4P**tU z=+lQ!(xlqsYwqVN46lR!>9C(*mTkZwpQ@wiZlT0(Bji7Saf)_glwgi=U)45z{eV7KIPXDe zIoDrH&tIBkenS9aU!qFK{QN>%c=9M+8NZO;{q#}V@83@k9zIC@!xMpjnsL%@_h{{4 zXxTTS8Dp&#gCTa1m>a^iLjf6=t9@LI{4|cX)2+LA)8iK}q*@Pj>Yfb7@-ci5s3dXP z>B-*vX<>~y5)#h>A#|=(&+36!xdI>^o?PJLcwa@+(JA6_9i+@dR__{3(?Ej24165Q zmG-xFx1}$sPoCWdE);rQc`WzCzyWg6#W)P5g^kF^%pvkM+Z#eD_2qw1ruU@J-`>Ex zH6%b2_Je(Gdt&ujIBDNldm|0@Gm0a5WRCoinLoKW{>J+=?JX00<2QT(oG?PgY^(sw z4L5LYdyEpU(%aqJNWBAltgf=!6yw1tlNf*D@y-nzPHdyRA?DGTZ?kbpcx-nF`Zk*X zwoBZE&`cjV_sfHob1c{AC%i;HYPl!Rx#zv*L>iy_lRxqdFYhhKZr454(EHBO<*KV? z$p`#4hjXJcL|KNgWd)Cdi}IXboIB*FA#XqrVp$`XWjW3RCWupbyeuzhFX#^}1P|dE z<1mb!vgv?Z+#z$r3;c+yl5dpqQ?8R5Eu62c978jV16S0j-G_p7dRe&Xb0*&vzq!+D zrA19vNtNMiuUS5{IuJ5C#;JlZSqEOO3oWG(ez$?MVNH75~ z)7v~HKlM4VWpGCaEb)6domu%);e<)8%`$5DO~^o;=$#fZC{rPDX+1%JHNTW0{_^EB z?@Z0*kGLsrm%dwp1GYo7GbUNwleV{*aETb!*rom`*S0Hw%0#7A zt$3}WL>kFS3mIaU=!i>P>t1zw6@A2m=r`IflgJj4{LG0(flUiz7j9GS1WS+!T9-nc z477o;>gvz>+7fM3z*8v21%gS~G>(4-aPcJb*S)~i$So#}3KO4Ee5#Bs({^;Bcu=CIBlS?b~ABr5pq?^4N*}$9yb{Vfc6dVIa%J4hJCw0VRA~Ex|%Ql~&wRc(g9c zY)5&hAvrxiI)r>Og9f^u&f}WWeRU%Gp#uX%jS5rwGq2)x2WqipDNN zpD;UKRIq`@X3$=~>93PKgCE2Wmg+=6?)mjsI#Xa z39f(r+jof4!)hawrK>s$Y2wncDJ={P3`FE7pLsdng*De94HE$l{C48HqjIjx(4D(? 
z(i1Gxo2=GIlQrn{-1vC9bn#Le9T}yKY*fZtFD@0@bwOHWbzPU3Sb2hm+@CTuG!%-o zed_ZL1kp)clSXmvZKu8!R^HcGDcIRnnO5-2u1yN~(#ja~WFK?j!rTHUE+Ys_TlO1s zkt-HmU0H#M85g|kvgGQ+v$Ux%5p-mg zs9FmvYq;>Bm_eMniMlVW7saiYRdH7=B@Jo10ZkhAb%p3l=WxO8t1<4O4GkvytbX-& zat{tV8N*`(y=i)UC=Efkt`I!;{P)vG|K=y~3AuD7&CkrIO-@H&>m<4nZChVm!63#4 zT*}LI0)D`b^RmJkoFEiZSKDihZ{7ajqTmr;6V`i$@=Cf-xwjw9r-=&}(qJcig*M5- zW8R8(z<-$tHZbrs;XbSzRLnv0O>m}w>1pk8IanDV(R$!;ywJRuSL)zv8@{=L(Qe0x zS{tkmVoW$TXblRLbB6uZ3LZQ7D1!hx^d`S$Ow=);$!oZLCvSJ0NfM=iZQn-{{_LxC z@-2KZFN+#3hK?$3x0CV%iJE6&w>M?Mn810Rz1 z8?Ib;ehJ8799EU^xzXXAWAD$drP-ww#_b$^cProoNajP7Dd!vL{{R`&aU;CerV zpB&B;x&+FD#HGdnjn3l6^4(KG>VU(sBJBQ?FUXJbFY%Z!=Ow92-sbACWtM#;P#VsZ zvn*R9iEVYh6JFaWZ}Kufwed~EKmtkpNqTAowyY@dj(d5tN~?2~_zYtWb(eiAk95xI z!jRa>TpdaTZ@=aejC|S9XtfOnfWu}YqTsg&N@+NP3dGfdAYF7NR-Hj6@!d54bT*i( z8@R!D|G;AXG9-G5KRz=tKc(!r zC^ypjkpJ;s65>Y;V4g>Qe*5hkGT8nY6fi57S=HEUoJ{)s^fU+GbfjPW@)zmp>^#h> zo<95RcG`h~Pn{o!NcK6*XD9Uxj$&QwqMR(4DtHNps*?WBTA!m@t30^^fPg&)fAQR= zLt98hJ^@Fu|#r5M#k~fBJjwW61>3Y* zWY=-q*#P${O0EnjoE^i&6;#~7c?1tIA#zLuj^zaeZO~xi;ZjL0(Ca%@oP@d8j{?@GsELYjD z_Q^-TPS>wp2G<8^j7_AwtO9JKa2*;h8J>J)q!&x(rSw;y+)f*ejSijwSU{)0J04uw zQen5P{N%s<)jsDpIB@&~z3|g^G!O>B6mX-|fX_OX>Mg9_t^laui96e(>q>>y(M$a! 
z2!MTQ{L(}k1|N`qnm9MYipg&gDh}8SG!PTwI>Mhy(ILSERIFDw-D|KK<+)^Ulql&- zm|{|PJTUP}>*8{C%`m5%C<+@f-aY*5{>=S!=l&f|*T&5Uc-L61(BiBA{5FOyg?c(H z=)yAP{g0m9j|p7ZR1l<1p6bq>y8u(d$9C3IcV;NT=hE`L4$MZpx}5%(ok+iL04|)KtW6 zhXQ4n$}Z(UoZm>xM23PQ*gxHuhEdFi7{h&Cz^@^Oe%V3bQ}C{ne+hT2@sUARZs;p` zl%drR;NPx(LuJfamGt~`Q)z-U#{>yU2ZW4Aq1anqzz8=7 zCb$Zj$7(33^sD=qI-!*|6dqkIoeK;Oq2lCzGxw*&o+X|;=Jh-{tRtu+u>AbSqjcf& z6&>HlEidHBIF%Du>fCu>pb3BR!;vwUa`Wch%b=b#y zVGm(`1!K$B_8N-jYV6(Eczicp?uMbAQ54knE|n9F2U&phybebcRtMUvv5-9@xRHh6 z2!C>122TeU&Z*7NWh-q4xac!^Ld%i+ZL%;+qMTm-#l64nSB7u!Ou}{seYCf8jtV$b z<~e3Ql#$!d$_DOJb)3m#Lm8!?Eh8TCH_GF^-@=w3o!&=Y(*tirDuTZ`=e1H_F9w1` zXiUUjLlZ|ogwxIj+{4q?`7TY z6$Uxf!+2&awKE=EacE;jSXn0Kp}<`TlF+BCj4BZZL{V9#4*nm-{BC;ru=9F0AQ)h0 z2Ny7pH_(mG?y~>=LiTUE z%FfTZttK~4fh+o-jDC9sjMl=X)yc*#yGMfXeeDcn1s{pe33><1u*>~Dy+aU(zOUI* zS?%g&(--GXIO#KCY+?ZlpDlzcdICg9gfyN?WqeM^3gbs;nl7_9JHYuZfh(}P!BOy< zLV6c6@h*&(Csr&23#Ip3BlZ{~AoZAI>yOaBj) zqw|xK>5u;KkJ%&f9#*)!M5tLvYglhS|NILCuAx}Q7n3_JoF5TdErS*TO@YKu42oRj z2|V%BIx~0yP)YRukY+MmCh~V~RZ&cQ*|rnfBbjYGp*3@f46owL^2MhELj^)hi57gr z@|KL&JY_!N79SR~$nv)nv0C%sU|%pC=B>bw>Fv|JR0`9Ej)Y5{Z3m0p&6~H_)NE8A z8Iw)U{a~7?k2(&74Z1sE`#CA zhkuY>dii+E+ka_PcTKm`7Mz_pE?QTAVc! z6H0r~IygeuQ?Wii#O240vIv7vDD`~U8N)4Yub>cUM3~{!;uR)e6d;&oDP(GK38e$qfu?2; z`eKgcujrDu+VI5j`WMUqQC0tsP^Rg_02FjoUgewMXTJfM^Hrj4}unfMi7nMRA=DDo+UW8LXaGW_Ck@Q zy&6BvwEI=An42FstkzVCgo0GlF-OzG+2~ zY=V|ut*Nqd)XnNZJ88SwG+jdxU?QPR$_%JfUKm9-cGOb`78XKRaPu*+TAD<=#Iy$j}ArMmTp! 
zmq!r6e+7fqRt|gPQ$Zj#8e^r3GI)#&4KN)c{2y{o#IJt$n>5%*=;`tEC`0fq=9_RI zRz3)&1o{DX1O@mk-wcdUtmVbHUb%{W0Q|b-ZQ;_kTVGG>D;Q%|SwUHQ3U65@bna2Q zI@XhVue=ossDTg+fy^+Z75t$rX-?Wd<@Zno^Coy+-o6OL2$)MA=M$R{ zUjb*nFMpdSjA15{jWP06hFdv*V3PdmQlyZ!wG zL8qniT-FqAq(4J$R3-$m@=@K}r0Xh1;i8JjqLQLg?#b$QsrV3djZ*UF*3@0Jm1t$o zGw>S6SydX~UVbKTDEW|zdme`LA#mE4;!8O|n99n@N}Y~Z=rIu%*-TWdtSox$pGp-& zuQ@WZxf8iyh%)7Q5ssMO(u!{YnKroJ!TFsBIkzRzui0?rBiotF@;l2l00)lX9rU?i zN{>+lDVz(+f+~M~`C9<bq~#mPVv z4axP<9KdwB&{Y8OOjRI&KLA$JaJK$Snz=Ir8+N;&(|Z5_KmbWZK~ydlJdx{xFix1( zlLZErb=WJRvX(8E5|~S4mhaRE3h@CxW?9#^fo>|*0wA2gD8fO1e1Y%)-bclii8Qw% z_d!nS>+;<?|=WhFoSl4;yYMA*617X1%v7W_iiZL z*R2GCS9sG+CJW&T;!gWu>x6Kl_`m%}1oN>Ck=AAeJ>>SBwkl^e-IqQ^i6{BZ9v$op zZ|oN~eKJY!;BKL!BNx+8P`~t2iOKmH01RXq<cBLV6=OC!?DztrY_>c-A((q zcGKrKaFuX40GFr9BQ`kvX^#^xD^d__9D;_`%SppN^in~P`ReB_tBI$;t`KW`ytFDA zwo^sMXO4L-8M+W1ATYR@Tiz6Bhxif}c9ko9?la|B7@exmSXc`j#DVoVA$ zwP54moJXQIla74@exdBJvd}pCnQ;m?6c%yd1nGMtBiW&D{`6dfBLwFo6ek5<8T&qP z>t8Pa>ZJW*)_x1ct0+sj>=!4DShe7}my>yv5%06`9hP0-)`ILFlX71}Y%WD03acW0 zV3~Db7LWT(f`+s$l|9|ZA^-ruXpMLE#yuAs2(<^BZb)U~1fR25zg7Mg7Z%g@+Da(1 z3P0VfC~ZS0P8jZUYo&j`_xzPKv%Z)f>pKshkTO&Pg8mh=<&)x={E|(KrI-M~Df~nJ z?6`8=wG!W5-e^D#tzJ9$lW)Ou#GT9WU0C4OhHBqcPBAnJx+C$vpyMgd>JBGZ-p9BvGbR4yEAj?L4J%<@7v%u^J7=UzDJMT61i z8p;EGzK^9G=$V2lM8eW7k8riP?kTK0zI$0^un)JlP)rbD zN)-~kPUdCa2dv;C+9nLZo|TA9%PI|StoJgaov;}8+Z8<(;Sp948g*7@5aQZ_zXJ(N zJUdUs;e*sEeeC1It0E$%Rxr%ynmobkRaaXr4YPuxwRi)3%n&hbmvNz@?TSei{73`C zBO~Gd;|fCs+=j0|!$W~;fW2gsC~u<^6Ii1eTX-|5yijR6W?xt{F1A;$olo!FMd`B7 zP@JXxzW#ptrZ4!ixKh~PC++tBEPD->Q*-+kK>~n5*JODvXz1!?-&5CKxSTZ(m+5H?~03e zLgwftMcT>dU)i43jvfJX{EcD|ucFR@a3;o`+yM^r+4C|5yV zQ01KQ5B}|V^`UcVlU^sJw|v2fHVY_iiegO5PlamVxr!Htt2SGluJizb2p#BcOTz!H-n>4nqew3RE%snlOwQMT!szw=>*MR}}nlzkG) z0prnsr9mn{Cykf&T0_ij+(o~$5~4_Y(l;g~xh*JfGIq#Y;3{4LcC?3Y^U^B*5{Xyqcos{A%hTzuXqD5L;ee!55vu zZWCbI;fIN|w7rf$I0Lr>Yl8mybPw$bk%)oEk~;C6TiKwg`^q*;%nF5}4A43euwb4F8AP`sn;EKN>K4A7Y@0h9rVMj1+mw^uU*>nve4-Z(9!V2Ok z$;?H$F7gN;Q$1w@H_VG{c|x`4U^GEIAU}6~CZM8b-kre}_kvZTE?&knVVAMwyeGId 
zL6O0B6d3nur1y#W@`Q8DQhP0Z_w8?o_WY|~{|2iFKBUI#XX1ok({_aqth5xA>nI11 z1fOAu!oTUy831w{@`ES~IQ->(Fvy}UCq+d&av%AuY-2pOm*|)V8`!e~fsZj0uLPaX zUJz*zb0O55KTnGhYcAV6Vd$NlK1fBhbl2uIvQNY>#v zxbfKyT*@Y)r_2zv^1u7e2lz+Np)fP?GN3>Chz)j!?)%tF7cY&bmtX5akw7S`^ru_5 z9@!cYuCXGp0Zw%d>cE}A$y#QnkFlGq_6`=)v!%aeId%ZIG2_U0f)ZOLVKdmsE^`ed}aS{Sx~{?$)mf;4g&KmvE-Q} zqo4W_PH1EZuexNES1H7~q%6NN%&U`fO#0SG!aPZrIpi0`_~qO36>h_729Di}p*7jf z+yf|43Yxh*+$FF~swC@vk;BF!03uu|Bul#siwpGA3LpDUC5bf|TrZ3~)7RUVI(pj> z-q~w*xQ*M%Li*vxXTuuM-W0W^|#1U}Q z#bF!KuU9_m&;)h);z20cJgm`{eq365J7p~LAT~?l_bvYQdCrPUEmod}a8ns#U)=ZJ zxSoFZ`F(^t_&iERFJsmDR(id1^&0ful-7IJ30g2rySmf+AG{lVD?PbFFvea4R}`d0 z&jVS*YOnC`9(6&YV>v zT38qsq$@1=Yi`PhRvb4f5{?n+NQHV2JQ-lXzEhFfV_dEyKq|18XE^A~?%78g)f`tI z!YDrMfBQqFQzc*FTcyd>d#< zXdX(1Mn32kTI#^Su*#i^7P!zlVg$CDA+Mop=}uQ9 zL(YzoZiv;4HViECO8HYoZvgP18>kBDJ(!Qx@yOrv9kj0v44#)Kxw~mJdaD)R}e@u&Grmdh=Qx3E)w zU@^rNJlMjy&fc3?;Z!_StMm=~llq3vF$bK3W^uiuPa_Y1vNT8;RH;eBF#y^nk-&d8 zblCcwgV7>50h5`mC$}}@(|N;?*1FoBz)y17T5e=yCQq&;k2UySHh%|>%_sl&pB8^T zA_Pi#Xvvo_KAkIc!xlz)oPDMY?DIYM#6Z6QnE4COoX_|Q*hHo6 z55?8-PLx!Qy_TPwOgZu;=4m4~=9LlKtMQJggYcjOKF&eTg&GB&2joXaOtlOz4GhXF z@)6PCJREZ-^{dbczXtR;vDyq@>Rf&c;MZM(8UJj&7=Qr$@>$SHK`fD1X#9;;!{v2> zT!%S#F~R6U5c%yk5V4KbOGMOqhe?0xrxrz!uNA@X2!`5*k6usTQ251fu}OOmp+;-8 z@tXm`3b~hHBv6Rcc5)vu@;)pDFvn0r0!TaFw0uVabT(Gxm@(Ph3Q#Zs1PM1>qe!@% zS!q##RRD|H_@*8~Chl`H4CxRHlOJ6?Z2n1I08zSC<+cX^MZLNHKxyPD-{tSJf69C% zv}LMjdhx?F3N4`#bFa4OJ&u2?6 zYx}qO{wjd}wq0hmkzBclit>3^Di86p_~RgT;;OR@z72DBRZ3SL)6&W)PfFF2CA*Y) z=E?K`VSJ6cTXn`FpYhln@+V*Bk`tSdTf1buTJZ<{?bI_-g+bk298>IEC)UWs`efYu0 zAy8j?VH%}kExrH#N9m)FZeh7vP2=Z=X)6aKLQ5Zg_y|F!mhRtMOb_nY(hq-hi4{Fg zg2kP#xnlGnCM4*+qVHvuGg@#cHacgonEi6M_stya{XU3t&tsASrD5 zPQg^hBJ(){$I_6NUImZX|3Is?=vNUMM3%neZg7_&>#lgr3c^Uw#dPh;8@LjpK9H*^6K!Dq&JT|pqtlfd5Dsk@-zUYIMwZ&8?06WODM$jYbTa*jXLYN zGc6-bBgye`g#%+q2siYPt65lzCu#e4u?&BF|8ZIc?go5TOIZ~j=;$l`hHaaZpY)=; z)CiGOJU_sweBnL=qufJyuZOYcFb$~8!~c6&fjB?dMZEu!G|g)1`_R%Jup3}t1*N)L zd4dAapB6b|p$R%w5e@ea=vSQk5r3}YNE7j4!VBRUSp4>#pS(xJ4fIU*yldORHHVb~ 
zH&$PmoXVDLx^!-7A$@xDM$pyhC?QxcT#Qx2byf-VTh_8H4ZE@W;p0cFpshup4-O1M zZy3m!7hGMMV-JISFT_hfZ5SrKhb~G(b8L`an8Q^JgVZ{kucd`{LfSTS2%En1Ekx;( z5oIMpfiu&Qd`RIUT>Mb#ZNCbKeHbpkl;gAD{l&dR*aVG=JKwqA!uPJ2u%;96HuJs) zjWywRth-{Jl_#h&=8Xpa)-4Ft^gK!ov?5&tn}U$z=Lp4Vll-gfOWI_G<_JEwWdI7^ zU!T5^COBlyFx;d>;m{QXd=S*bZ5A<%C?mV?>57LU42>RxKTM3#=dOMMF!V_~fu#yf zULcy_cP^YuAI-DM${am3(4CeR9;Y>UhvQm>338xKE9s?cgqufUu&xR#3Zb|ukDx5* z+IGY~frGnv8uokqk zNuUV#>aDQyy}r7L0`Vy3ALW+uu|brFUR(;1Ep&Mbi@Jh|qX@vnPRK7ZB;e#9F9W|k z;m*syIcD-xf@T0g$bMnZy&T;ZM#Kvf{5RG`N9Q1YDYQh15L&ju8W! z?6b-fe2u!@Pov?{@(wh#3X@|B+K&y$e79o!a?VqMF!I?d(f`7246zjbA<$+%d5-}R z>|u@1HcK$11z|9#%_hU&w&~i|NxQ704Q2Y+U^n7B1`fA;tMrRI$F!a&mL19fWw}h) z1f9}+lJZR!Wt!PJjCKbbl*_)xb`J8xK;OnnqO0I7)xqFzuGl%|qld99hk*zpgQX(F zGB$aU`M$hCu%v*0+sm{;v3^U4VBt*uFP9V`1OOz)SK9EXoZ84>>%w=^!0aMHB6Kd> z#cHYlV6Bf$GwueD>`uniVLA|D?C~b7{=NOJ?5BX~F+nOU?%{73U<9q15d;TuW&&3u zbP37lBH$>N9Zwsb$Ag3esv(w;MnSs+#f>YP;0}-UiN$ISBRFP%1l!X-~0a*OX6Or{8OT_>>!E7XEx((AI0;Wr6?rxpws;`2BG1 znltIR^>w`BPllCm5np5py2{u%DOLr-Fs_?SwsHC>f@XH9V;EANq#@i9gP;D;!mU>A z0bHI2+{UEiDnvU2(NMQq8?{m}LwyOnz&Qo3FvvdndCru1-I`R<*fZ;#6i;IKlM8rD#0N#7=!}N<^{S1rQ z8gQ|Z2}@rKt6i_XIYmG9q`&!BKL=(ab96GnLURxA$;=niTi=;PKs=x3X1~a&W8Z@t zqQs|2nQ+Dorl08lV71 zNS)Rl`&ndi)H^+OF}?KSi|Nk28=SU&Cj{%KPoG3T7%F#cd@N1jqwi+SM^B!`SrA%w zA3k^xs|KFV{rCR(--D)Kh!bhQxc?yCzIPv?dmSNQ82{=?+KLhc9Y`m^bb&#k+100D zxCnMY1Fluf5CZ8VCzL(fy}iO|+*nw>T+LBfQ#cJerynZ_-r_az>GQat5O8Re8HGk^ z%RIK-v0qx5B8W0Zel3Cc>OAYDMp|L56x?&Pd>hpKK|rZ+&7>V zE#(>rw6wqe#;fU#H(yE*AKp#>=l}NSxYI!AFb-*Q9)b12(}$^rO|^>?z2PoW<9w9a zCo^&Ao~|Pf%_?o7N#TZ~>&ggL*YBO1N^RIu-g$&&Rcn`x09Iul+r{7Dv$3I>2~4ZC zCmB~1hVeyip&aa8z+yx3bGpUQiy8iC>em@OfUP`y` z+{I|Kmo7|Qh_PH_|BHtM#>LQ+9~ffTh->n;aFId*o<*tjUQ3&TzO-yOP*(x8oMnaC zjjmYg;S7i>t^_QhSg9Pl??P*$(O?Ey9a>$*eGjF>XU=1x2vfe3eHFnDt%6Ojv)mf%d%TnwxBf(G@hZ< ziSYiy#WBuUKrn$e_UWT8HqI-sw4zA5dgy$nanG@4p9r&l>cX&#Fy6)rS`~%*ko}3$ zm!D(y9yT>Nvjlg_mbQBO4}bVpy7@1E4j#JF@Biq#sb^?}@>qeROvjRvQx!Th*n^%5 
zRL7;V4A&0#567c?OL0!ch&D7CDbEqgd@gT**mcQnG{c{e-#z-{9FVo(Eio;A@}z{E)O>(ux9Z-1En@@^a@o-&wFT)gf|BE=QT@^wY##Asos& z@_K2MjO2iCv$te=`bGAKokx%^=8H&6vA`^B!Y$U6b@JtR?zebZJoT%j*L)AG^pj)J zJt=JMhi?%;WaA-o<1(~>J885z%OU28x`fw%^t1Dp_sV)2I7m(%r1u|0>Oc4T=vm^1 zcI7)^4rMWLkPSMf3!W`5Nn@nTz%#$ITe!wd4TTxkqE%kgz_94}@97^&hwR(g#K^D& z&vCr!+9#g?P;dv&V-B=lS1}C_?tQfXq%7$LMT1p*d3q|~R;uA<1Y z;#@aN-9Lpf3~uuD7+a2{45$26LgXA62|0JjmpfPhGf|wp2WXe_T3}>x;0G3yW;(RG zw_fxv<>V|vcXDODi}GwdfVG2|Z6XDAw4%kpq<}UMW%#V)9nL9W(0BF?Vs${{hxr>7 z2l9py*^z4C!mJgAvY1TTwaPR`vC`@4qyXiTGEGJJbEAa*p%R8xWEofxhvT0KIPgFk zS019hN*a`H$3fdLOqdswVG|oxl{8dF4*4{&y1G|FCKL>jvSmC?2vnSDUZitFs4f>Q zoi`i)?#y720Kkx)E;I8-7LFZ!Uiod@fpaI4vOnVWNebaLZ+>s?GAGs1ZeI&C??FnO z=>c(HzW7`^hRM0?Z$q`9$zZ$!f*63_k&pfm%J?r8Q^T2*W(n9bVf#xWm)QKEghU?l z0Z=H7@jFT;9pAPs#JD04g&S5^Vy!ngP-$}+fng;Erg*ULGaIHJ{%L85P3(e3Nvrtk zYPnUA7-Vv-G58F-reN)9qk7ScYu$gWnU(BN05@PVAAfN>eKGSG>%B{TOaKswlfFz! zfh7)VGhYxo2ufs3@{GIG?lzmS zpuOoAcG6ov_Ph_4+2IcRbyhz1Uv*P>+=>HgZiV8RzK#6$s1^7k|5xCT(3NFtrP``3q?g{jmj26s_@`-ziD2`1F0IuU(_jDZ@26EZ<1S)V zdo=fmKo(fi*+lyDhcoFv`u3Ydl_5IK0nup~J1v9@Zo$nAdK9ip+=eWhB5SY6UojHs9T&3u}QSKX524Fjr_&ft1~2e0k0k7 zg05n%xJPwiC^&?tu>gxA(2CN}hrZVB%vF|N%G@OaqtU*?P~|Eg0+9S?ja9Zb1erEq z?EG-h+A8xlg! zWqrIun zvf{(MTE``27rOVejnTvY-huDkdiErEf^(A!l-0$&8LNP_k#eJQ{$@Y~hC6_ZcKDqi zourL-rw#? 
zq}vI6yvoigf0y6!viOq$^=~dL9-G}qxtMsA|CHtCU%EzU)s*4wBIs%HX(Lo$2%Ui) z1RsPsm)m=YL*2^eOSL!;voXR>8M5DW6FNCra)00X1LoQ}PQLa`EU zk`NjNW^$Kg7*#TCvY!$uPIzVea~M)#R`9nCYNl*5RM#j(5$Xi4_XSlyXFpFEU>E5dv_IQgDhs%ZrYRZ_*UZrG(FXk;ZQc-UnZ~%Cx=( z!$C|YF`oo8dH#IdJNRUP*piKQwhZIwl0^Qe{HxWe9}S6U+qw%09CpnQ=qAq&J<6xrs?zfxu9L!r*fm zl=zN&^65jnv5mXW{w8H;eZ+5(u&1xQ_$pTVO8V6=Zl=#Z;gs1<%Va{rGvK$sn@P{V z2s34+YYSJUu?a#VzBtWbru^M*1jhC>^I$FA`E(XnJblKojxn+6@?}hGeR9RLe$HVX zag6M(A_#Y~Z{}M>jk-=39AXW6AFBq6KIz)oly4*?t)Q2tFQsq2^}S$d|MUO!KSpS} zP6QDkqFr^YzfWUK$*`=uBUT`_w7antW~&RL$~H`B2NyNt=nivjzXzc6|M-C}%^s@qF;!uhX4dH;BT;CP-Wo+R*A1WL5fHk*K2;q1r{i zwNGD0AVm3MVz(^&rJQ_R)o7-kJ@yN^5+NfNW`$Ul2x&_=YP3mfcb(<;ZThMSL&F~Z zSL@_pH^#l8sa4ig5OnKMa5SHFg;fiekKy4Qe$~i3(xVfw?qnXgrMprM!7ElWf!%#G z(vAN7TJpp4jxfCoF%0&PrvL08{1>>=AmlP(bl|!%J#`s{?^h^v1_i*6A71E&aFxg- z6wf_`-!9yo&a-l~M?bGYgYsePXo0_(7TqJx6V(3Gu&0^3k41Nkt<}QZ1BEw?~63IJfF7aq4#qW>185C z4YI-@{(^6jR@w==k?L(IA2+v4bGnCy6_jVO@(6Gy5lHi0zH}wVzbh(+Zq=5R@~#r+ij|l2uj1n6~<@&Rs9cqp~HYiI=x`lifimqP8a`-zx?c-Dd7 z)5q$FD;XL)+yk?Xp<#>F{Ojk&(#GCe8fE2llIUJ4#mlS?EVCD6eVzH*@daP4)p}A# z1=ldT7c&d4Ss4DnuS$fWUwc`JFanbbjP9q-;chZ_3~6jDFL`oU%@N4t`SOrfjDqfm zQqc%v5|7TOSWd-n3*)Na704g`UaF2w;MI`Fr(-pEJn0=%K@`rLAVD0SKFav=_nF81 zI)y9JlEr+ACE3u$yjc{J4Q}=;a}{GZlnY=BC4wrr+6{f|V z{Kq*u_5wny@@eVY)%G36hq8(*5h0*cl`e7CWh5x)ln&+u&wX;w<+cGUn1^BKj141T z;at;366;S~W=&3)Wow>Q#<*y_eH)B8Ixy0pCa|lb)rO^d3JJJ5`Y=Y`}|T+^Cvf> z$l{}KxXf=kcDj?P%&dmCrIpGvCG_phj8ZEC!Vt6ACXBojqHaf#aPyoUHaqhuJ)5~t zoN0W85e6z9`aE_bP!*MG3}eO+h!OeP_mYTXV@{e>0KV-gGCbE#+Z2Yf(T zVAu*zs{Y!WcrC#BliOVoJTR2-pTJ8p?`^#w8*79aefc-&zwj-F!3Fckh=??nZa!hR zj}*wX1b7&oKFYnUQaO3qS1JQ`S)fXwt;2jgsPp~s@l5*U#*H{(R?EEQNvBQ@S`5R# z(Mc|lA+5lii^9ZJX4VW)iq^t>OkUmCv7wC22M;3p&;Qf^B>myv`(uRpcC6Tod=}EX zzj+_L%M5W#ftl)N*w48l-+t>m>En;s*8!vSB-~9_0Npp?GN^*P?mhO`1}CO_>AK@s z(<(DDK9b(V8h+uzbXvoQ_|~o4QNFMRen~G*H8>~`Ik~yvcyMSMg{74~USM3WPzRPJ z-rc`*n>OWenx1RWO`2w`qIFz;j(|fy@HG^dWoV-&vqn(sq%RfhWu^)??mH2F(O zR65fr?kC;w5ot&`T-^%4bOdUJLpOvvZe`5we`-a^5oT#aT 
zsfL@3LKbdQ^o3_@>~`0~N-W(E!s9I0i3l^$O=pPF zFTS{spiVzfrYo_ombS59tFwc>2wkJ;DJ~(G*jHACuD6I+UqxtZ0Zx@SEmr!T=kTkx zFS}puK%)zb9Jt_B7qReJg!_ z_YQa~!oYF>8V4(&OdnvlSwm@H<(IyyAsMxbQ^uHW=;oO)&%)O_iq+4I50Sz} zn}W~M{teu~=4a>FcXNpHhpUy#{95aZc1OSy=(YtV|4;t?KjJv$i^u`chiFmCqG0}V z@y&hy$1f*#U-;$HYH{WM_lNxzzf2PS6ipll(r4r%)9?~5j#yf?k@d6jwM2Y=$6 zBfbNE=(H?Xxl2oZ$)il4^IE#|=4su$@vTX?GN9!z5Ptf~w`Xv6_CB)u$2OIjNgJr7 z4;>3`T;Cu{p?K95G4>Gv=T~YBILor-tx+6_!UbD;;D2@*Qqi+YahXdA5ClWG;}V{1 ziVveUrIl$dJ689QOEjYRUZq9(O}E7@+=Qfc4@>q81?LmTt%jy-07c$X*h_!z6! zC=VXo+TBF}41x!_0@nn+IF~7><%oC64;d~nM7_$#@);?|-vaLYQyQ_&rnB)X58>ih zj13iaS5=&IdI^o(%YH2lTgu@zc&YQ7#mUDsen~4~@RY9xj!toB`92LLq9nF*kOWu- z(RyDJdVC9@75SV}ZD=fGk-m5w@4o4ejSpxfVgS?cOUC9zE;9}PtteY-4$Hf13rrwv z{GzdH_UERrrtQ`FG=mRy6CFALbMGA(L75^m*P-aDyEGg|FDRaS+=NFRL&j}qd1jqb@G}gTGD5? zu|6W81-}&kz~r*2>9h)}^t#gI=09%fZ(t{l&od6}Lvd^Wk`BSrvQ6Ve&z#Jyw3cz0 z>t{n{$3YrmCGPDXUL}0tB$G43Gyb6s+zRIBA5qR03UmkI58GihG9h8p@^O$x$63N3 z-A~V59!t-ja`wlkoHYE1jkQetT_}g$xF=MbSr%;rX6RR6TYZmP*?=o!#Y-my4}T90 z52iQqt5@gQK_Pg`rqQ|CXW@^pkZ?eH-EtbZDt!9sClTj=7i)NoTilsc`fRs*CA?&~ z=5YmN4`ElqQQy>l1cQI_PyXw4`8tu57^4ls)&7hB}ap2#02qYhVK!{NM zRM!xgJZr-J5iUc^C|wn`FQg?cNp9GF{_69w+L`^gP2*&yI~CDcXu}T12{-w7pu~&} zpdg`$xeU*)FYa9}L3tw5l;vBtf}D)l%d|WsjeX&&g9l-$Y`C!|bP9&e|Y;hoL6$h8Ag1z}$oEENeS2AArOsUvt8a7+7R zSfDEis)-;w`pPR%w=u4J!9zFsD)a$4u(o9*g+jcOnEPP{iG|H=}rP6(`VTlERCq(I|gD-a1UhZXS&xK8UgO|By{N9gmle+qPnZC>;&3eF@Y09xBmW5Edj!QliFy%Jc-?WUc(AiTr*RBwqbXub~L7H)aHK|u^O zVm(GXlJ1ct!YCm!U2)eaZuG_ucwr4AxdtTN{jEiOhHONB#{>WQjlym;De$dPkRAy% zhf)!=V=ra=+Miyuj=vhTGZi}qUB&gRBnU6gB^f1J7&SyN5&26CI4dvPFP)V1VBeL> zRipu4M~wXYgdo03jAdb8{T9RwvTy8w+#nfu2LG!$^6wMEL;DCtiNWvKYiF{k;3KX; zVhn9#N{dg!y6Gca$A8SQo}F0Fnp}!!GQ9HQn`!p(7o3XyC@nm`mwJaL*`PK`m{=lY zfQTxc!?*?cM(EHzf;|kJg18Cu-a^paM{5+d(tI!hsl+GI2HFGwS)22H9$>aKIvDF< zMu4nB0hRVdL9~ZG_aRVdiHJm&2|I)+Wm&m@Ey*&pVCF#@X@TLJ@MpbVCpe5>$ydqsPT0=@T;u}^P(PDody zLZB$pk5P-n3?BR{F9)zp%R%SHDC^b@$}0~v-a#rBFY{6uu0z*|V+`cXl+0r}g;r0D!h?EpU(zQo;`t7-VW~9D+1{4q=d(>9bF9Q&{26#Ym1CcVR&L{zILe91t}!JO*=B 
z(IWB*!i)!BiQgUkpW8Vwtp(w+6~W@d)EKKoL}|g_esA|UEpkHl?9(UUM}>;=iaKQ8 zdC3(EGjKnB@&v0rzML?5&y|oawzcZeu5Q9iicu8;qbdFA098P$zn}hB;WOO=4Qwp2 zrvk;{5f*IwNGq%dfEj&B_o1L8^2&R>H!P|OsFS^uf-r=lp&$cn`a0{L(Zgo6S8@CL z{oj9^6_qJgMo@-+o650H`25nPOGE|g!F}gGY4oKg zBq;o84n@I@q}wQi!^0EQOSGcpbyle6`S#E%-5`i{kA-)I_Wd{i^uI&Fng%~>>9bF7 zv6=k&^y-V}(yxF0Z|J*C_64a>w{kATa2n_yO>1unvJ=UfyQ;{haHyLK&o|3}|r)ALX~)um7C zo1waoSY64+fu^*<0h~8)-(-d3b{Hg7Al`iW^>k(WQXJF=aj>WZ(kWBi#~*GS>;4Xd zTFe=|6N!GjJM`mu*P+!fVA=0A__o0T((4z0saLZ_xi=sYdb~r%9hjr(V_gn>RC(o?M|1me3=PQjd%hx=Mvd;c7 z4Tx}#ls7N1(zuOquJ8+hLC>y8gdh)oos0YgZlOa1IT%^VJ+NB$E69PNyjd)=q*3;= z@?fQTt};x#t9rViN;~oe`Ml>$X;6}elqU@JT!;7RO6tBxm0*oP-s?hWlrmSL4T8~N zY#o!|K2b1Ni1*wcdA=(o;#jHAy5+ALOq;3eh;ggj8D1&Ws}VuI;aM%s(76VI*pr|_ z9U8TA1qt}53>hdS6eRi~XxVo2PngBu9&owGOT|YQ<0`U87d#fw(hfsQd8LZ+$-XM- z!TT)th%*W-KABFeTW9^^@?m~qeW%10ug&Xa8o%O)-v#mHuj1*~eVM=tPv7M-WS$H- zKCK|XnbUjqEpw}y8XYLf0~4lm0K?r{VdG;zlK}!!3lrb@D=){A=iI~FX>* z(dZc(BU%F|ULur;DC4WkoLpY&8^m`P%hn!D(74q`(U6G5WVAPrm%Ce42}h zVkj?)JSj8(4tQ<2c%&~jaffjQLiYuQagmGi$GahG=6m{e# zM61YbefDQ&x(@2QckiXU_aEYR;fg|jXDt~Mrq8Vki$%a8qhWG@(J08ucw~AqtH?)k z>j>45^wL}hOzm_@H7jV5m_PbeZg8sWHrC~R_StALesFt%34S5H_4buCJv{*(H>c;W z^r5hJs8qEQ zoS=!r;I!mx^e=^y{2f0UlP_F~{xm&q?4 zJWDsf_~!^{xRo$jL?~eBM>-Ra*5$;b-)S4l;vg#lJO<_*7E`-}Y4x36TJoHN2Pl^- z8d_oSDot*@Wm7eZ4CYSqGRZXB(*g?U3f5wMY46JIWgRoLzzX>ZEE~69v zQfqV<3P3N`<3`6Ydkse7H0H2|);X=0C}JW&fv=S_4pgcS;c>dbbTjEV7TN&jfKy%_ z4_YOi2(@mzI<33Uu?KoN6qtVvy5WXlm+y6@s6rc;&J($(1uOEw8t}1S1A+69c5KtW zAN-x)LouC9ohS|Lt^|i}=vQbwCA|H&TzV@9mXKFS7UJLOO^0jpu3zo4Cw9 z$PMGr$uHdXXourZYrm_4_HU+Zjp_%`y$ZyQbnpHbF%GU@zfNe->*?x+%MqDL8m2Hl zl*^an+xE&CEGKI9%X>b-F|5CMrX^IDNbB{WKhKE?-+A~)tU!=XtF)&xd$nTlszPre zY$BXQWB99pBu+w5Cz7GAx{fCmh;pAn=2w~KT#3CnJ)O=^o)7o8EzUI%?jH7cy!5Tt z(~tk1AG6BRmp;FJJ3af1u<5MCUj5PE!FQW8b_ki;&%UE=+}+kuAeT{o8dN^&hEK-Z z3ge;^p}3Ah-9h`>aT(L&e~wj-1vZy^)p(v1hkia5CybLL{m&7Mn%U10sFntl;G+ZV3kr=={mwt- z@kO<~iiG8hlfKNU^WU<6<5%>xc_>&oyo2Wi?a=~em3U`cP*I6mLkj+@Jhc+ub>DFX 
zpR!-}h>jyaipzN~V5L0i$95Fo8N~d|=ed7#w(_sfzD|ko{na1|6f;rZLj1zOH@gAWlFy689GPML#*sm ziC!i97?gF%3A%Aya=8M&#d#dcdMXu;6%Q>oD917QR!I%xj=Y%k(x2nYdJSi-5>aKt zi^dZ}Ztp@@j=DzzWk3XYm&PdNjxJW(oqNK)kl(xXz5HG#ARgth zS|6p2xv2*roHSD|7MR03PGUF^*L@| zpghw606+jqL_t&|39qFA=-dh=*t3J>?7hiLg3ehbCyS340Iz@`Z}n+ex-ZL%mzh4w z2mH1SNjVR?utM-e7o-kc20RPEc-2}obD1(9CQXEn&4nkaHZY!=*$Z*ZUW1;|OKimI zPbv~yOH70-M95fumR6qKO*NE>{-KE|vxx(#SevI#kijHX>$5I=aUle)Dr$rJw7FC& z^B3e$7Ssl!S>$B$&r4mtRs72EYi&@|4W!a*S`n=aRhYbVBp-1U%iY2Nfk9$?R9?h{ zZZqCmQ0_i%0$;w(0F=0u;4CYD#}MLGlpi^Af11WJqPKh>KTpffuO)*2m$5f}(mXj1 z^Q!N=r@N=8dwT9`XK!GE1#vL|0)iZpLX>ERWKs4PE5hNRKKQTk7f1L%u)|W2@g}GvEXA66`*+oum6es5m6et6qW@qRB3|G_ z#E2ZT$K&=Z{$xH;F=AxO$SKp?INl1c7&KxFuhO*0wBnBl&ndgu2`|RPwaDORas-Ac zgdG)aoeyZR1kb_j^87B^-(fZjjxj;Xlu2MD(y_c6?XrbUdl{^59=-SAak+8pcG!`- zl4O57pcN<`%$8fnhlEXBcTfh?MI6slFip0_1xthWj10zjNn}yPM%zZ-S&mop^`8tm zGb839eQZ$Wmiv8H3zk>*I8f_udF|_C<&_ukV~=UxXy*yeP(WDX5T}=3?=RDLcgvS| zFvGY#OFld0m2Zq<+BA$y*LgS&g=Q|hY9pE6gP1{RU|u&MB3o#vb(h>fh<1G)c!=BJ zr3|cu%KD*t;hh8&;s&Fjo&Pr4TTk0oXb=uVBco^{*c6MYkVLGDjohvfm|hKtf{Snz zhmx-Bm(T9pjXqLir0^A!tc)qWnbtG z<^TA1|Bl1QCh;t^h+DQxl*bbllyHn-B@3kiEodw9OA6ud>|z5m@CsXfU=b^<4PIKFo1Nc! 
z>!%^?*gl2L9VWgWT~eaON4J57i_NNi3LxT?CVmRl^eK#+#J`vR@+z!4^y~%Mks1B$ z=7(sUbrrdd5B*cXA06@*n~<46@rqC1VVZ(AeguU~I}?a`91gRRw04^b0YwLUVD!S% zv%^XtWoyfONZk*6+yf!rQwhe`9)0WPWf{9};(02tu0%Z;a%~qF(I>B*=g>mTyY9{4 zKmb#?2M;;SZ+8vT7)@``WZS1`#Bqp)YgjyHL4&v0uNc| z%QmT1OFT9xIJe%jn^e)ofFSz{%ehtHz3!Q$IIO zN^?e8<$Cp1&ukfrm9qUk6m&2XmZ$gHOsh=4G#Qj?gxE zv-5DcJoxZ!5Q>e|NQ7rujq2}n(!eTQmaX5@%f2ax)TjV6WPdv*~_V%SI+F#GwCMX04*RN#-3XAu^`3 z4);btKf!Cs=>fE`gVjgt9OkXy#lWm)Po9;;+y-!ZP{b(+NqtnhtK6~f3P1{{qLOog z`zpwmRT9k|Q|Pe0_TV~#!ya?Y=H@(dY-cBq?eZ|j~RPhp1ef-*zi)cHbz z%y~RkJj|atSe{5;1{KRuNMy<9~7Zg4iMA)#r*>N6+=nA@+L&%;g61^GYxij+&cj{IOvSXCty(X5gwA7n%;Om(#;Uhd>=2kHyAJB6OX>}l~;Zl zGhall>c{4bMEQ9kQurs6HYJeT(?vZmX}%*}_8mz~=AQ`l%7gE{@F%NP3-ql7Gr!5W zYWlLm0beWlDi20c<}>m%P3;SAn;`Quj{u>azf1R?a_i>%s!fTs7GNcV7}9@C5Na4^ z*qr(yn$1~OFg%-|iEL6S8F<8Lu*$P8$9K-+@yX2Elfp&t4T6 zgiDuWf_g~cwReua6RrpfD}D+N5-2q;jzfhLnSB{1wRl@wq$5Ko5n(LOuOP&9z(knl z(W3{LTfL8WArAeaf9*SQMM4`J?-44j=hij^nmp7HS&q124@JLH|4Jl%rx}J!ko~pE zL^8AUr2PAT|0kT+F&V2G)3_?Vg>V#RYZBq6UYd0oq-#!y;dP39oDAGNsQHq>@jMRk z&a(@A-?{?fdud#6fw&?X(+Z$;Yyt44$CL z-(Um&5Q337?j&lwZcIlG*3pn^@9<;)&>R6A;6wtbWlbcHs6H)k}9VDMF z@?3|x+n_!k5;jCz&SPIce(qw~|M)I0cz=y468(+l?*x*p#OqTxepZ`I5|JQ%T;c%xhk-?h?^^CrvKH`A?gr#GQQ6<yz0TRtskvAuz|g@vVE^8(>@f)$tR8MwFRjNZgk;uHyPqVbYbk zil;R%ua@tx9}^-sF{@-TzNVo%R?xi6b60e=zD=G>(Av*G{mhiHv2yy>-E#GnYh?#7 zF^`FRg6oLFHsXO=Q9T{1U|EC4xpHF5_YqiCDkvyOQ%<1>oEUCr9|fDT5uR*DoW0^| zS~CwGvRS$xT0QLekghf$6mEbeekwFJ5l-BY>_&gb6vHt3R@gGw%hvr*%jfUDSMJVj z;$VQ&%!hFNFg3}UIoHb}IHEnfG+CH%2VQbdKBiaD3jtAj&A-r0Lj#0;?FXynFZR4N zl?-enKOdd4+i`;$JPwshMIld&HyvVBX4?FcuF8ld{UTekXmBYQY2WaUyz98PwKaV8 zyNuy?(>^cwEz|$@*W>Sx%07zsY@ShLs}+Fpn)NjwHP>8P^@#hUte(1yBfU+oZ-7hWqe-iDw*&qTW1UcZxz}u) z%{n@1@Z1)`?cspy2PnVT;H-ISC&KSG0*bU*C5)a0{pD(_3gk3pQh4wTAq#WAk*<-4 zDAV@uu|eK>S5Iy_&QM5HDZYWVgu-ybH}^`YOx9KR0d?<2;i9nMoTEE_>6TU*j`JP% zDM`C~z*qSs_eAXOz`q8s;QU9qz$5UPF{-;--GGN<4f@7CZ-M>5YTJ2^#=A=Gp2EFt 
zOiG!r+_$J}_#Mg)?~N#uJc>a_DI1tSb?YXd^v)S7HC)ATuGNI=xkAKoq)IzdYOhY=t=M<6^NISUbFj%qn(b_hf5AYDXI{P^aldECYtTHJdNI9yR&?0_l3<``S$r3Hk6!BZS=^|)NW&Y^0TPNIEVp(TA~bd1f2 z2PeYJ>5yqV7*90AIJAzH0$n^VbF#5Y#lew1&LiOTV>WCrEke*ZKR`2tt`3M+-$D7> z8!yCao%=Pqz||g@zdg(cwh>9Q~OVbP*a}clc>prg|Mpq z)k&ZDT$v=}`?GK33Nfo8@{TbD+!=}35J2!;K|#sTHxyo-f*~V;SL2(9_pYAQezZK9 zxL^VreDj;}UCHoJH;HJNW~elvv|08ey#L|HjYi5k(+=tqSG?BPr4n`pCLd%h%uhyN zVMKdc1sHM1vNXNiX423_v8yvOfm;ZJ8B!2(wNwF4p(s~m(p1eVTR$gpZQmCWuH7@? zq(~C--=)2*5K*qgUrn9DgqzI!@Mfjfdx`H|tb7b%hwdtp^{?>Km8y3L$5O3#^sg|% zlqWr=B@GUog61nIonk)t@ehB3V6n#@lvCiD_W!iij0xsv{0IX2SZKZHon_Ww|G4i# z7p`;jv(e8-<2>`!#K1IFcrY*PD<%i6f!=skNut@3E3V6HOTj=@T(0F+Sk!iVwGOw~ zjKPhMKPjuM*j>N=N;%J})&2WVLetyM+@}d}*l=TB zp+e03wyEyFI`HTPeKagivxqNP_ zT)Tdu{NNvKm$5TfA$dAbKsZB@A`{;Zo+^yEVyGacw?zdU@!i8P9i#4{vJX6{G|jHS z2FX0#D_kA>V3?1gn2A3MLPl}S^M!0!`EXo24>g)@Ub?cG%vi;)NcI(fGS8jn3E!E4PI+Zp$v z*(a`ZYK(&o_&gLB(mnprU(#&^66TCBWnzx(!OT|^I)#SVluiBIkFf7bJOYB|R0`C+ zD5O2BM@5J$7ArX0kTKVF>JWm8-xuTTG6Wci@Gu^jx|uuo%!mDMo<*cGLr+d8*qhcx zoz~zzH&6&2AkgSRE|_#R|BSnRgqKZdu6u|F5RzQ+R?)t=I0v0XneHkJbC_fDkQEte zf`XCry^c-dr~~FQ^D@4tW9x{7fe@?aj&F{$APovj#*Bj64s(h76BCc62h!Jc3X(L4 z_(vT|>Sw~U$Ynn}$vTn9F{y1z&1)sH$5F-1=we^_hg!%kN1jm;nZqowb>-1oFa+Z+ zHJog?22h6p&81iw6MR$OMkis_Ik0A%fq!D6Tzq8^JK@`9@!>QkP}pWd za1b-fE|cW!I_5Gn(=3sl3|E!Icy?;%P`d*puUPIR-vP8QZtB~iledr{-8@Ug4nxh7rvoS=XJHPan52A+ z_i8IP%gx)j%9oEGGC{F42orHYzseNViP$`n$U+oht^!0MKqTxONDesT$AE-t8b1b= zlOpNGl=$v|W+00%HYUD#-vl-P#%5d-WE${9h8^+b8SM7itgDxYZM2E*o0xsPS>F52 z6YS;D7Nb2ma|XiKvtM3(c?^8TJ>`?`vOLek3sKePe)7yus&1m+fKY-kM?g}I^aJ=j?3Ds=!2-}wuS6;5i`6l9JK z+0zIXqG=x5sE}meSJ;VLIbM8ML%pwfD2#mP&q>2)O$L2-vaL8`9|S|rjd6Tmf029h z-SOorpCgS3FsgiwI%YZcK@en#GQa4f1b+0P%)9y4P|FDKCFJ8L`$!_JprTntr*@?6 z2w)*dy3~ts#boL=@Tb9JwNkOM6yhvogtPlPWNy@|xr$+VZs^;`Y{-f6IJ~TDFmli$ z>n;Sj#h_i%FER@u!lm$~CGYXK&qi`LDs~{ycpC6D%;tIE z&Pbxx3U0PpYUDp)M;`h)*NjQ?C^ymS;&&ADuzof&%1gCjnx^J>A3N}Q1b~Mhe_TeU z&X;ez@ml%wx87k=64QYlRJna{4Ej41tHS$?%{0#;59b#DQQi)x*v`(ljL#B36R5Bc 
zy2;gX=dZ$`UWJ7r$1zux5fJ!zCQlLs@28e9s{KPzwkxVMaP?#JAdRXk9w;lb+^Cd_D8 z8MiX2GH3?RzmZWL)WnfGxW~@1x(ich9EBtZy8oPfoHJ< zfgT}3WkD(?#8csru&8)RL_%Oy-o|}%+q^ehX^Zhh-LDE!hbDq}@l>upH{XnRJk)f4 zw_f(MAq4?3G1CUG$NR)hW0+5tVp?GqrrP9R^Nc9|d8W)r&Fe7LB3=-9vsw$pE^o!n;Hn19$ch{r#YEo=$&RlJMd@GZpM`xiWSHMk4DoZ z2(biuON+n>1xZbq23SdxkBj&Sv2Eb0bJ8{|6plStlT~>68vtK4(Q{nt;Gl3Q@j#FJ zD6FGy=kcDVS5RS#g@6qGws`?Jaj*lrFHIZ5yfe+}b zP6iPy0+-OZHSfK&@}z?pQsv&Qn`ll)(SA}eos|Pa{4&!Fv9#qgKatiYK{w~EKuG(j zhuVRtihv{99vbXbn5vhBH}-W8tFZ{*#2-}GVR82M&? z(Uz~sx~BE>@q3@ei5SL~alAc#jI#J^VXbh4XO`*D$w1rQMOGo)2rEOw>j}U0f;4@B6-|Jl{!tWH!M(Gl_N_hhvy7HNspno!B5l3|@Wh zJOdT85A4M!CcE&$#L|2l0x;+3t#N?X%mXHMmS-oYTrPEzgEQM7#%t3|S)9i=`w29+ zEYVJ$?JbwCz0CX5+gQg#5vM}G`^(2@3L!R_{)HRXb+p!Otm;&lrtQ%$xKDli zmmjjSae{F>jz)D3O$me&W^4<4W#b8Ao2O?e?}bGRe%$ATlt@tf&_1b5m+$-;&v)kO zeSJ2+0Gzni#5H~N58ndzaJn*<8P|9TX8_`jU_S~x$f&f#wOF2cn%7ZikdGIcUw)C9 z=1t&1bf@(qpHA#p$pyljZLrVnN3~ey;_U)=Mf5Rt+Q%i2s3GWpia zyo?thu;ic-9$c6EIP%*O4w}DhRV&ulkJhW1?4)<`&5tAsm_^+&d4nNa*-Ot$o1754 zh8gJ?I5(`K2j(!$KS(Bgcgf!+Z})0!&_->8$vEJ6?#%hHZ{LC0+GpYy>Kkb6KAV16 zW;mp5eC#gf1(XL=wqY{65tt5W`$7My@B+1pX~8Oc(*RIXO~#tz&Ax~2q#T(GCPl_x zY|=$25I+8FPyLY~?uB3NKd+8-nZ7Vfv16i}iNng9#H$PGN2V!1@i%~elt<0i@8%&W z4dyK~5%8hzx(S|nf}qbzxJr%`A`@RLu6Z?$!h=kMbB5U zN(zQG7~h{ESJ4s<;&lz?>paOu0SuBfQAA0GQ$aWeKL{%g2w(Iyj(11YJ!> zKM=ozjo9JJ8*XNMjr~6CTX1DU>bQrN8IC8l%QA-u4dPH6`PrT!_S(0xKSp{P+HU6g zeHgbkoUF7XoV@b#6*gE;mjC5{{$Cl7TQEycFq0c9KmPrH3}e|*rk!6{5%Q#KPXo73 z`_#{{!6tC=XnPGs<2vsd;KFr(i#-=XBT!fdUm*_}#9}tMUC>uMXqvcfTJG^}Sc~_Iu`Wge%A6dtb*; zy_(iEt-41UwH$lJni#SUQKq;2Vh;%)Ydwr*+?u*+h_9RWk)O22XZ{J#^OrA`j$gmW z9E8ab9|Dh<=LgW_tN1|L9Yc(`w8?)Ft%9jUBqnBy^3RF zxNXH7=>7qYJvdPF@uQj82ca{LP&_wqRgh_ip38_Ul&HLLL-!E*>#0zB-G!-rH_Uwp zW7gHQLE0Z?PS8Sy2hSXVt>jzMEyro>RRr2@mR3n1z39^7hO%sbAFCt1-JDhrT&n4r ziY@tA1{mFAPY|eQ2!-7CvLdcv;laJ~GXCrv@g}J|2DRa0HP7kPNEkD(iTo z*kfSp;&mTnqv{Y4CKwQ4ehG$aWQt{;Q3x-#&@lWbPM(308ewDI`(^FPeJGVH7$B-t z%SM7wTbM;{tSyvTe65clRh>9}mD6x10%!uum5+8hVvyg!@E~Q3gdnl8N=Ins5UfBl 
zh~ydw90NO=V?o{k<7g6{Zlq(70zD`r<1~P1Vvr#IOHBEa3FF)GKfbpq2CucB@;M1w z)Q1=LS*#-Y@tI)uO*YSb$w%|8`802$K-1(eQy;I7@v_?d^+Q}TA92l}HoMw|d&0Q( zcfuo6C1J^iAC;|%S(q^~5h(5F#wuI{Hxp;Y&kBc1NQL4wzNbI>^b<@f*i1-=gl0QB z#-?QbTa2acrGDm<0jAFnhvY_zYUPU%qt*k4PS*mBNsFFJOZpC``zX4gM+I zA5z{PTD_GOU3~VG=?631FUMr5ptp-Aqn$bj*dThLg(rQ&DkmnJn(fWD*Y*o*<4J`4 z)%W$jzRy0fUm~Lr$RhGHf6ObHL*mk(RKH1LALT?-`!~Cp-f`lhf~Ku?He2al4XwT+ z-XR+{&6Hl|-iR>4OfDqP_twE5%&iQsX=QfZG)j^n*a0k`nK+`(j(y@}T~Y%ilPn%s zhytk3=2d^L))?Zv2{I1h$vkM7%&6bTMp3A6kdU^sw#T&Mt!7r931Z%YlvJS&4yv@b zS#B@RfJYl;rkCY$R>Rs6vOP;6G=r4s%7;S424i*$Q#M#ACTKQvWBN9a36?&{HxOni zrHpcVwH}5Z!wBDeIENeF89e)ND5$_-E^rTQFO2IZCu|==t2YstU3uxEz9F=uO%&GQ zZ3i~fIYb7dO|uui@i*FRyZt#f&DpLw9_n`@fEt;`h&eK)hteGTlyAb(e~Ba_!||;0 z-go?te-XYL2v3W^;IZaQbDCso5>xAwcxL~Yw+z3(=o4^8hC!S+ADIBn(L9G_(eoRC zwTzFLXJ3cE_@?k$Pt#Sr72jhV83$l!)*+JaqhRw}536RE@H%zr%1h9g!h!Hfci z!xL~1*#IwWktjkNJIn_6?X5XPIQkU0@8DRVjX6)|!}7$1Lm0IU1Ofl;Fz`Hc$JK)Y zHp6zqr0D>|cFTl#28?v$-o5E^3eBckQa79GL`7zY1+38e+V2n=Ta0x90u1-v^gVn# z^pcWLJSlS=XifzV9k9n{f9XMa`K8y(o8SIc`OE+GXDCwU%G8BB<-2U=?;Fs28!Hsh z|6P=hDl}CFZm|cb!mS_EKQI#Ry$r@j+DNu!RZI=YV<@<;qMWZI;ufrm#?pL^F#zwu4j4PvTv z_R`gI@$x0+shcr2q-k!R_Lm3!vXbi@MH_)&{4xVJ(}PoL?K@b>)l*X&X1r^7GIPEb zZ{!(ts-d%iSan46KL~!bwds&W#gcRs*r6Qi>t$6D<+CPA=~0iB5%^E(yZg25KFytW z*%Rh@8WW?Z%D2A5>GsUszxet4z#``gO|iOBnvafp8vRfRt>Ta#$Hcp-sncZ)uaXKX zI!(|ihjp-x3M#`W!-au+n)=uuvdTUt!M)*W^NbBw1Ee?ZrPG;*-q+5ZyC9N;P^D7I zc%n`R0>z0FC&A%7>qX0hbu4ZibHZQe5*~cI<_aJRg((h(?d|8s(F*pbiY!gF!bBRn zrHg#Vh6E_^jDAQ&hz^N+yU0Q0fG|#jDx`@a0uvF3t>OK}L)wn_sBtTib7QA&wA760 zG2yfK-^VrOgL3iJ-+|E3fsuI>z#nq$fI$ z?F+6F7c~qrUHW>DKIMH5=HvNh+S-Ttl(0Rx_GgOaaM zaP$1~;uHFly$Nc1Y?CnJtL7urlKirc)>RGB{L(yo0`M{dK8#>?=5n8TNl^C8S9GCB z2cWQyC3A3N3B396=BFGeHwUekSb#e+fuuE22yztfBoYELnLvZ0zD~X=JjmGC7bM}Q zWG@^Ikq>D?%guWyqB^#sS$dp-VP{sQuHu4uqlv&PlR0R9)X%cw8*jcIE^Z$_M2j)A zf@#W3dFf>c-0G7cD0bA)Ahy3}hs&4unH2Gcv<&07fqR?xe)9lruKObN$b{(@W=)^o zbfYZ74I^)nJrAq6Mcmaz=spuX0=RH;V|ltX&N`uIVA4mh*o7e{s+!Qw?%neG8&`wS 
znr3AQKi-cRi+IG?!@Nnxu!T#)4gzCnz!UfNkMYDqe@;4%6LBbiIbOtf2_Fp!IPo=@ z*oFc{8pns&~huN>bwtT7*b**hM?o+sLLvoO`;$?n` zpEbVaSdx8eUKI|t&ADG9C-`imSWbN6%0z>Fc~6C$_~N~p6U?Iq+Y2I54!CS#Y@6wl7THz>2L_2|1N@d4dmA{R*r1PR=O2W|`_Up3F|Jil4Rv9@0>dwZ z4KwP*re`nrRp?cxM-7#e_F-R{#dKws31uGT${I`EBhcV}CNIz986Uy-IOmt>Evvt; zjY-PYHa14HFKvaDFa>0w7-N+tF|h0*+YZeW=Cng5KLj+c0oMBS{0f26O5iksHaI)!qG zOhcHaO?uEudw%KybELrE*Ed`)U%XzfUcJuwFH>x$9|)yK;8o-cK6!OqQ05`4^Ifag zI9!l@7&~*k7FSuNr2Gx$+36>^%0=phLnl)+S7oP6WgSg*DxDdQQ$E z)(ftRXuGc2muJzac~w*F3T+5}Fp6zX3iP=O1Yw$N553?R!)n6bNqiJLzyYi3d*?xw zEwsiyQ|oM-MZquuq(S0D;2Q5zfQU!mMyT|vJ{vdEw7zHgq$b~|zeQZ)8)`XCvZ#B^ z&%h|gi4|&aUWFO2c?37Cta+P*I1=}^*k^HA=H6V11N}B6Z_~ty`0+ab0Fy8?G=HLa zuw_@XiQ1I{s3(1~O<&+7XOX!MlR8TeA&_pkXvsO$6#|tV_J{Mhf`fF_IPyIs zF!6&bNP>Q-2t%0xADPM-X|Co(Mg^k1YZJ3+p!986`{|g3K&lw;Ht3TD$5#52FF{t;|2!uMy}+l5IB4B zDyGLM!!iFCK9yDo^f>N-!J;|S0$wmbzx6pQ%DBdbUZ~XRk}vLDKzLs)lc)PR%=0A9 zIWE!N0E#+knz^v(VU>&(?o}kf)no`F1^Ak$rwXo>uwxRDl5wY*L{Eegaky*cQ1kIS zQzn1Wz%Z#O4734+_kRA@!9c$8?H{1^It9_vCYVW37i}aqP1Z)DV$S4*q|^_AYDaR? 
zylDi<&Mz`}dIWSb+QFt*&F=t$!1(ydGBf=Ea|{m4InS~!+Sm_&@#mZ=aSH;6&D{XR z4uRr8?F(tzV5-^>C^oj1${i+D_ezXUT`1#c&m#a0ptMyhryhecW%_zf;2CNFZ4n1x zq1jkw0-=|vM-t;8&XPco*+~MD!VhA>cZN^Cb3zHgrH~L!%^8EZ5+?_ZaIJkKBy4Lw z*YC~Sh@1ChAN^J1*>_DIl=pOQHNS*&O<%7yUACvjtzWrDX8dU~HGZ@yqWIF@w>}aD z`k0YxA8A|X!DSMOtgCSxJeHC7zL&5`5@XX5X&f*ou#t4*sVh|mC*&+wX2Tvb1KtB` zeMR3yfVhkI90yS!OO$O8Aekbmi2G|K{FWtC-~chiWYAna**09Nk!ffjm^_0?keHG# zu$+2_9lJ~cZ|Z?F4#1q8eNClo6XQUwqBfi6D}lRq^=i5QfZKi`RE zT0P(FVS3QrhsIM|&qHlHVW4S)|8y;v{iLa(aMg6c4ZI!Y%=z=wi8Cb@bZ8)~I1wJ( z(IrllZv&1t%RYNH2&3nPfSLWncgL^KGTbuRdSele%)Ruz{h9E!fGCFyy$WN4>Ag3- zad~lcA2E#I;6I;3$oHC=aLvANqJHHiN;8cO^?EE16D91dEB7KXA?n_gnT}BFn|MxpBHyS)eg~%$_j9~im~jnJ7WXpQZiL-u_4ek+?|~1i z<JA3FS4BYKIcMuqun3?d@VAmC2t12@Q zA>d$vyofe5A9zJ?*`g*I_X+BNPi^P%YYGqv0~1x4LyBSpx1HV>@}Q3E1kl=LwaMf5|wg`snG>V@mCmh)#XqY*t742A_Oq=j%rdxWcf zEZvcY9Co0=pS)N$AKono9D1oE6PbSZ-0UDMEFmy2i%jvw_NAnP56g4 zHo?0&yO&=y6K`hgn0YcY;0)qx<*)wwr$O7Rwz_(JlcDTrcjIj`n;s~+)9EJoCDL!P zisTtF2+lCHu3U*b3TV0xSEC(F8ITgb9R~=6(7sTF$ZSJ14;d?J!GkGR5klHt7{mj{ zfvZY3T5}mW0XjF7P2v+zfmePvc(LE@lNLV86XoZK z9naNFtY?myEF;P$O{_vhp=RW-8Ej3he3NubdezK(6q-W0?KoE9NOSzTsas^?Sb-;i z5ni0-tPxyhi-#WGX1;ScL7BszUz44|!4om>JJ#G!;JHwmF>NC3rIJQr))jvDE;9C< z^AQQ$y1Xp~Tc; zV2G81-~GwA%TNBR--j)xB` z8XC0hFuWk-!cF#~7@YZzsU;XQUQ-0^$-!8xW z#YgO?V=|3lJqY zn7Gs#OW-o8AOkdq$qs<>?ew|JWr2g}pc-X#Vyb-eNB?!X|M6R8`s2STYph)Cx8wb3 zkP}Rq&|>*tgrm?HgxJ_UzO{tw*w66VG{)w@i)9S|;T@%t<5K#o!Hi^1TjCqXsb-#no6lXqx!l(GY6Qf9%_O0mu}77 z-DCOo6KpXdBs*wlA$z)WyZiZGdHWZ)$~V8s znJ0K%Iz2I2K7IGIveOUyg$c?&gzoCKYvswzeKagG^K-lh23sqPjka?3{MmB%i)BEu zUw-(bZ*aKS5RA%Nxry)mFK(>S4|q7k^yK{c(|CBA!UgACdG}YJB4h}^oZMaJoI}&P zkB~h!){BchJT`2gVbyC3W3H{84QH%$RDLgD-}Fa#y5XJ%9pMSL6(1EMgn=6T5GMG> zl9Dnv+>AEJtYSjg6)HkwS8**cqsi0@q0+C46B5m;KaXAM{1! 
z4P4i>L;Kvv;Ee~6IeG2TSMz9iXV^fvh7G;tc)E2LxS`jc-+cH!g70QIH9UcaYaJRo z3KKg3?xBri@`~04Q2k>|e>(^Ve-*dtbHL|V8ppP<$+&s%bJE@;9H}KZ&mD#1_eVH- z^u6BYhDo)CvuL~BMD4P^{pC+QwtwBz;AF2EhGrCoij&bl6enJ5uH;!onlG&~e|RvR zj9twq+gR%(e4egnJn||aM}LPBSUN*rcH!ml%$f77KCpTE{AFm_2;)stDos|JxhP<3 z?SL`Jvd`e_@JGiU;6}agg;#Njy#Sqh7-MlCs{&~!BbRZCO*b&NAJdzWOqrTk5Aox) zc-q&mlM?pNxQ;u-+&u#07v2_WV^v_-R_Q$ylJsy4fwWmaMkh28MF}RD6p6+*6(Bp9 zsvMx?Xh$*92L0Y=+2TL{Uw>M@`FlSpfAN=pUQSO<#Y%#-*TX1f_MMQW&U@ggs}Op# zk=8!s%%FvZ293*3(waQi#)gX^*0u6STi#dka2eiv59!Vyl3C%~r3 zFK7mFd2FSDLKr3D7_a`!v*sIdq9$hNlW72LK$5@w&U56Ig=f4RqqVMawHlZP#1O`;IJE%-GAjcbA0L2ReEK76&un| z=W#vA?XrJh2$P~-`f-#)5#=2e&h?^Xox%hy8j|yY2g|9IKE&IXJl>uJ?r@DNGT)&P zlypMrYpjcE}4ALoo4 z4_9^dz`ax9IShfugH#<8j-h^(*g7h>ckdQ;fF4j;Vcg4D(`!|jZP5#K3Abo*u-iiwW@tD{1t|$AKWg}(|0+yXPP;-3p42}(1~-Ur@yWIkAM8%mp8ug zs(>Mnq4MEJ{~NsWYB(5jbyPu1<)>pnMPGPl5*Ne>J3Nq+YefP-wSq|CFB0TiD<{)_ zge>_!@@XQ{9`Z$MVIT*QCyH(byZgpC42A=<-ur2}^vauMaN-i1MHp;!m$q4UaMmaD z^o6^~4c#Ek38>`~cBmdQrfs;@QcKJ%2or<}%fdD$NoP4k=PceCX3L{5ZD)1`jQ_O%s{F@jx=YNyJ2j-Q^IgBSRqSCLSJEm=yFytPgeJL$^{xfWgIP#G4q5w9EIlkAJp9 zFy~+IK=$UbzK;P+Okrfar{LtPCe;3{&t63?64XLkpNwmGZr?ZWYn+JK{KcjFQn{9>xP<5=??yYH)%sblHMDXP zt!`FF>@CaIY@{Dzqk!V6yQOUd%?2AenNHP^Z0?YTvYkx2Sc#+XGO!Q+w~t1DixU#h zPd{8L^G`O)*S>xh?Zqgb7ADFTT3?p*V8qQ;*o>H0OER&B1jqT`a%*!*7+TGwdhYTx65tnezYrr(aPf!UFE* z^gPrvcu>w?9Kn;z)$-0SZ*vReU>frfADJ7U3P*Wo9OK$Hf8tAyJ3GVd z8u$?>gePFS10(4|C`c9Z8ZF^uG?z34KDgy|^^4ogM zTjKQVPrl3z({4!aLvVJ&8OOWN#R)vKV}jrYUYWu{7%~q(y7A@X^7GYO!AO4N>#w3c zL~($f?MX6?aR~XV!SS1`HICDSsRxLD}T(^K36Cp^H7vnuC(If#aGJZ%h$@psmWk|I;1m^#u4Q&lEk&v z&vq+JxF=?p@zgzazKmaaz1;tZeQO0Kg7Ws58&_Z`hG3+7FuCZ(bBlDxcbk|u#fDMZ z-HAfr2^-C=p^TGeK_224H3r{>n?i!)w?kZwaoWJ1SR{)$YY3&`yn!|YFvW;G-> z+JaU4ynHl0Tc*F9F5g$l$l~PXYgfq6Gc0fzfRGooo&0vGlbYRCXr^#efLlWd-a_!< zbI?+pcVZ9Ez6ao`VrkC39`uv(flk!34@NG}FhJ;FUebin^9&AE8iCJYW<$MA5nggw zg^}Zn{TR#-TlBL^Rp%x9O`JI1=e6&T`bB`$P!W^dNn3-saQwO6dwpJhw#87t)q9_R z>le^5K85(bN$YdQOk6M_?6Osa*CyzK6cCB$0skRk)C;}y!n%fa_il{EDQHOV9k`%HCfu(h1h79 
z*N>@&oe`56gvbp&?QXoHBOXFb9^AfF#;_}$m^#Y|+MEg-0ul{&AW-%o2%*HQ0I`Cv zV9l@e0-*LfxviY|03iRJES*K|H0z!1jXsF?4zh%+Ut(YeZ2i*!xpnos{afEZn=ap1 zV)$DeN-O`G-V%>OE8i$2t7={bub~AYX{N4u_&#H)Ro2FS346SquyKUxsnL##$u^oN z-(nSlNm*vm!;ds&lIavb4A!aE%d7F+#5cgirhjA!yUT*0jIrYv@fG0-;!A{RhD=Rr z6YUS<#egA#nCHDrOjbGn$_%L0+NQ5r42=oc{1hIXTqKUnx#0B*CL?%$K#RYG)^_j& z2XV2ou)oX3=-D+ULMtGgY!_GBVO_K-rdDNf&T_WPS_H+%P zX&mJsDLnHGc9siQaLG2n#zb7KkB_Jw94Pa+$$b3f3ae%MRE9RjFECD6J7j4Gzvu(vb)nVnU^!PNQv8rec<)4@hL(rWB#|KC9cd#y<%&=I z0tXPDG`BG*Ksf1CrY>Ic$BD)C5<;#7Ta8fxcbE7ZItFQQ#Z002M$Nkl^W2r!UVX8g!lmzBJSr`EUW53&E6rw7M?*A~wb{OOAU=jd7$VvBrreII11B0ajJBUO_T#xg!_j1O8>fd!?dfIS#&IQ!hB<%%APBokBs3mcOCQI9T@F? zm_1E+Ws>!6ufP6SEd_s65NXO}qa7VGQ0`e%IiqEQ+E<0dU6}4Qn9G- z{kw-(Y65wm)r0}|K)MlD`})gQu0;LT5fC&JGhZpIdvlg~w+syqB@nb*`W=c+>K6@dd)1?aW(3n3>a+L9^UH79$F27! zA^lO>Bb>0TR$jyr&xA)L@Ed)qP-NSTnBOhsnZUQG=QFQ;jPaM%jAWJ8&ei~;NH<4){MOSo%yU~DCTIWHPr^IrBFpjc zS@4`Xs7Q7d&S2MT#&igtd*RV4zt1`Fj@|W0$0w)Gl}``AXdvCQkCd0n)3YyvMrn8U0iIl3u) z+YNKoy90=tgn@nl7B<;>JCOz~Vb z%y$;?_YGzq@iP%)J($2lG?P(c{COrrBB>FY!xnFS9W$m_>5(bV)umc=t*dQaL(4xO zo|~{igjf%wHC3BsXvM4bpiVM5j5GSJCwv_TVNlZp!SP!*RCorZiD!w7LGUTzJ5l+@ zL8aiW*3BT?UHXjEZX<^I5RddOuev#!X%Vdnc~tEv>6{q**gSRa@+78kJ507{Z&y4y zb~FUd)kQQCOo+Ri)?XZ>P?+;Jn6WSw3LLRMxJ?|!3mIOU)bO?Uqxoa9>PDkG*oU2G zKOUKeFc+9$1LpvzucJ}kUPGh9hVE`$A$Eb6h(~2WA~0Oa`O{2>tOR+G7{r{XT>1WYCd&@((4$K-@(m1uvCGM} zIfMf>()o!iIBoOzCk)-tCgC*x7FrR{gVEpiIs&~Xl^Vj98Nw&CpnY}i^H}1h4AY7y z^i+KFyMly4qUp0g`_6ib7cvJmACn%BW4Btr$kT5|FfLYeeV6Dflm1U0F1#lTmUfvt7kM)LdOi%) z_rCW%v|^WAFo<5D0gkQOhG-yvvQHYM;#qh+8>hzoFI|syYeY4#fNzs8fDX9ZC(^Db zv#dsAR#DHW5rzWV{s>K7lTN1HI@MsFGQ*M@?eKL|y|#%hGzt~Hs%RFNLIe@`f4rYf zdptB%idV+-C(UqMG1S{LH0`SAxD#V`w#Li93&0TBh}T z-+ifQHn2h8dYJW8&-MK=ZhHg-JdH2UBzfjBXkTp8k^=I~HnwqHn~rsHdZ}V#jMD%1}EGuVL=IjzH>0ck}UJLh(rWt6)+w ztGVF#NnEm;2AZruY5~&4;H7xdl+1B8j8b=dTNAUyvk*oZ3Jd_yUL{n{`;K$Rr|-IY z+vu-*0TwTsxrM;XwH-ll8G+jRx=+D|=&)h{^YqS{2jtt<`pc5WZD;C9d>4=VD6i#%C;GmTM9bsS6(7;&0ubsU-3IUd_IhA8%ggUtj 
zXnQfr=9|1H&60-@*uo-^#JuHadqTM&{XR8$Cia&IZxyMU;NN@jkONdXtQm&{uEe>b zddOI*{UgM*#LxwNw39Z3Ig=j@0^i7%8x`lP|>N~5XKHBM3O@qwHx9)mju0Uo*P72fgXf{ftP(`N%nJ;7@1V&$V?>M&kOcz z^k?)rF(^FqYbs-&7Dprr><&!E5=3+zct-wqX(CbJfqh!@a7t9`=bi~q5Z6uGFy}$Y z@VoIXCur|YlE^2s<)q>UMkiC>$0Xx~m03qKgi#7*0zxfywwv1^My@&u;vh!xj_)M0 z&hZOD*i8Irmi)2p-V)RXYb+QVC<`MnB{7jXSxKB=`ZB3da(GW4X8Gsod_e{2q#0s-Du#pa7ci#z_{tA&MxyF>KiEnBX<9id|J_^DNgnjQs6^vo@ zE8R$6h9)0DE6Ss7UXAO|hMGr=tER7gU-3FLt4+IO&zI#u!0b_B6!`%8PymRp&A-Tz zHVChvujWm@6;`HmLRCl(+KS|?CR#H(na1!#E&dwLCjG!t0;SYYV zy#CtjOog;Xsf4@)Q(VbUs5#+=M;Gn_mw4~k%%89j*iVOim+znc&O6f@ovXBr8-9*= z=`KuylRJH`(D3jfbp8=e4={s?5dGL_$^&0>lda=lny!0Q``^`Z+uEJ-Ba9 z4?wZs2b^z(s6#&4*GcE{Iq<5fm!7I*A_C9E6EuTE?CraD?G@ZRUPVKBK1?s9JTmZx zz&R=s_!@*8eBrwS=sx|eW0M#w%xjMy+%JzG-(znTZG|3g;ni@xQH&he9=yABGWYhv z40ZRo>63%!coqjYH+N!>pesur*yU=x<3k0En%fqffk~V?Y@b;N`dfI%YM~PbsobP3 z4fhNH$EC0p4hPVyO=#Ob-z7KqAI_pV>Mk$5d>y;){_@@5{oTO9$D9{&Z+e=Qu$6L> z)s$Oz@7tVMA=aEP+|NT3d#S5}-w6Z=6&iZKQKPI$+5wE2&~rZ03#1IDLX*|jdyw=) z*P7~QcZaAeG+k3EZOk`ihPgjEei}uR3d!#BtGE6Z6Vg}9x4-pf`QU?(SV6%n8hbgx z?54k^2mI)ppv^%o14Jxo zx=4c}f=Q^)QO=o%pW__~T?sbMvmxpb^*7)8waKZzit;kA@EA$NFfl_hEZujU>K>C&p<009}VdG1Nl34+29-&WzT^wgw)ySUy!5eH?GpZrnz z{8v9M(>LD1XSJJXAYAPbK-&twY|LZAvCaxfHwTKa`(3F{mv2nPTtR^C)iT4G6Z7|P zmmxM{kDk3&MoyiEv15q~qVFmUjX=N{DZL0IgSbI_0)dl&(|q6vbRAFtORZ8!mjOcF zJJ9(b%iP{Kp{1o18WERdd>0N!q4^Lmnh!S0NPG$caXbp~>SXJH@mHTS&zgDkz)>;H zopCcp^VbUuv4^BB9G!n(uA9c{m|w9%5u*rOz=rXU=sH#-uKw`5b-Gw3LjR zIwU-O5Z9BzOoD?@Pg$S<=10A~dWc|hMyu5WVHq49z{CiXC?-aZ`h^I&bPofJ5ZwuG zwZW`O__kR>_elF~gb2MZ5d#dZv{s31O!Ki0YDzUqys|Kg_ebF zix5eA4z5tQu;GEthC&|>sIwvQC;K7pTYW}85JLMwW<8QMI1vf?<2S(@zxqty9H7B8(#<67nPF|s1;wrpr`Cer#O~cEeX)%guxRXIlK=hlzJcWt>VrH<7 zPBy?)-0EgR)5U6zVJV!`CvhFx)L)mADHLU19;o#3ryn*9P%cLcuWQ#Ym)Bo?0n^T3 zLZ>+8K|zEnC=9Hi@eL0p)VB>=bH~>pjH51t$5UJ-x52-}3;u#` zj&;XdG)>6R%77a88N{jMKi}5pCdzX!;#*`K!#DG=kNnwhp31B#jOI)83kzYgW7&q7 zufVP7OP~?{WA~Cl1}l> zdima2kLY3g03974JI}#!Zl1mbqc8?Poi401p6RTy?aQ%bUN&QJpqrC$XVH%OjXJXd 
zerscqm59|cjJ6X!M6AHXVQbKfa9s^kz2~8=^qUMsyIMX%JAJ8O(c``v1p6&LOz|$5 z)TlP~whhdK(p)vjo$fuPT-|i-!N>I14*Sopqxe`W_aAe*xBLO^CTl|Cymt*iyx~^7@)}RA&yM$_ zaC9G_vQ0n zU*glTI`|yrXM2Ng@y@X>-Y}HtFX;p;U4b{m=U%fR-6i)R80ei)+^7P#xvOlXWfUl^ z>@)Dnwo=C^-Fxdy9OuO?%q|sFVpSI&&DQO)uQpaJ=udf&I4IG6wS&kc`6lK&-;uAh z+5;68K&(HROO0WD_||D%nV*?|RZc3PiYI2|->8*vaPFgzsR(aLPn&~RroNK;81L}W zNB|*pJ7>s&#y`vC7iD`Bc_gmLTS=Sz7ETSQnP<@~!(YvUqjMnS0^N(*=Yc&P-eB0zreZ-)F%OgdF!!K`J~3I2_Y znkO5=JN^sEJzPb`bBz*NG=zwb&?IIT6Qt>1|4kWx@%1uv{wlb@^1Sxfeiz6D(ZnC5 zC6LplD>u8R|UlU{A*02^lUoje6FF@R;KeoJ~6P5S#czd|rzXvII;$Q8vJ$wTV;! z8l-KY3*3Sz^RT2kn|l=Omfn82Q2DZd%~>O`Bw zjeBnVCP6yhX5Tq^9Df%Y?szP+yper=*vw|SK*gsmD0QhNhj=2GVvzpNswVC|0o-zTuuF6oTQ9BTRH@8?eW{GwQ zyU*c4n9_o+r^FVmb757Y|L`Kja|8(D0+RF;1?J|+49MopmHpNL#y(7J&c3OA;ng>i z_T!<#JSJKC-1Hvwa0&>>>u*BpKGwL;!nMXTiABy zq$r71`0gP*vRo=xFTGUGo;er$5j0f@Mi$(QfhLIQXLAh(l(Zcv@J2^CxDsz{2YbwE zc%sp#z8d%*aBA}OM0h`3TtJb4_Oesq4@TLI;Ew%$aAbwP+{I&&_}CBb;IEFpU5kzO zGF0Nc^-ku6F;AKlk|`enSNu~**a3exp>gZXs~%q0%f?q>=gOn)h`azj$pdEQ35X-z z3W}K7#hKEM;%NNBWfX+aBEEU}+e0+KUw(;}U(-nHXrIXp9)h0=b()~6nAEX=v_K_+ zW5J&=52i52C-jlN4@MDs?6(>mpXT8h^|fYd#+AV}{_Na*Xw3(PINS~!cA8wi_WEn( zr*m(W_uhRMXBl5BU;oA0lbL(pPJVdP| z)z%T_MkR@^{H=@TtN@pOLzzZj2ygR?IMAXPca|k>HLobvIHEpvq#gEk)Ss6sA*IRU zf^*&u3J~$b6(>8VhrTp_l|)H*g^7Jjwi3{Q~G?~>8S0ADZ;I01@&fIV> zj>tutm-rFm(+Wo&Z4byzezq%xMDG3PyC^quHGxSI4skC+X73nJA;GJ81wfi_j=SV( z9BUhKZkDd87vX#LxTHy+X#&pF)m3htzl6e&=E`GfJ)x=J2ug5G%iRkkmrBk5jzFeV zr4>X}7Z0NCVrl&61pqS~kK0NlAZ(f6tt1uPGcB)Tus0Dap)^hhjLpG5ZZ6TLKEClb zL=JnhGcTg*8H_}s(I;NGm&BIaj3kmVDf7P1iH=TPVF}Zs20W1v?a2W|FI*g=ZCPhC zTlb-^J5QIv_MS4sGUw9m&tUd0meW^X$F6+5eDbqDFAJY>@*sA<2h5!9l+&&XmI+zc za(WV2(FU%2HI^y|(0+twHtG)w1F$``B-5Yai~Y+x*s|k(>&%6)8$LvmY(vU(6Zb^# z`LZ&zfPdsCG-W%$*&+7P76BzpbfuR7<{i%tewrNZ6NFA=yvjhMQQgmrQ1@uW|gA z$)CQ47J(1KCCqxLv4_3&^s#rL4_xJh+yR7HH~Fn_%CyJOw|A-Wa+4otmT1g@i%oII znZddpz}V}@T0%3w@HlX0YU(m8IxCnQJZ7-7v6{B8uDZHG0>)cc+X*xT2rUPktAYS- zeRO{e((;4gNsPg48304HjYuxs9g=MvfI)4l2Y<-HHyFT?bs0)VEVZdOE0;~%&{ 
zKdxcQw8#qD*Z^8UG~dIV_n_8#55>*_HsD`8oGv@}zC=OBgh`!t!<=BdOkfuG!o{oP zfj1rMv;rgAffn)HxwA~v_fZx>KkYw&C(}7MKUaSK%bzo;ZIvJW-jCQ+C{qaSbvzir z-Kqi17e{}Ezf6MoQ^)7g*WZ3;-xX(Kj0^pU`y67$rR;N4tPfaXIRgRcXD;82Q(DZGn5Z z-$h|^`O+)p8lIC*ozydouFqxY%n#<9v8@$=rV{PE7bd~P(~{8<=E#C6CFdcHu#f5h zchPz_bbr(=dj`b4*h2`Nb`P5-qH00+meJZONT4ulXCu7=;o3STacKGbS zADj>uTg)G2M^kMZzZmPxap9#7c&xGF?E0 znt_JSuI!XcufI`lJ)SG)X{+|=t{}~`FJl8i(2?rqZuhon?kEGP0?>L)OisjH6swEW z+i~PiIHzJzbEzC#@-D*AtKYmgUfm00o037?hjD~X!WS(puCQ{+$}adqPY{>!g(m3b zmtQJ>`ltV)e0K9DW8pMRSwFasiH?+p`YXIg1)(LfPr}1zuZb(tJmVO^iGZ)8wZ`R! zhoDo`(SMFh8B}8=tx2$mWBmyi5458I#P!TfAr&BtimyKGWcCcs7uS{--ZkZE`ee*5gWgbWCeLhSkB-aM)YC2?8kg~BT8Z=O;D+tR?8I60qD57JwfUE_=ut0ktVF3#tX z#)c5j_8a_Wy_Q)4ox#L!9{yLy2(G@k$3eOoJQ=mM-Mfqv^KliTi-UPj;BnNI2e%5u z;e4^$Ld)p?*b`yHv$9zr_2F42BZJ=!RFhxRSYElz>?et41wf`qAW??^s(Cxn2Ee1C zbe2q!1(4qFHN?b8W1WcE7K-O4+^gMqADhd~GWEjir5lew{*kG2`lZ40`0mZJ{OC4&AePJ2*^A|y z|L|Xx+mjc{=NwSi&H;3LNW*Gy4qR;j=x%})CY?+;cEmQF((W=F%c9z=cW|JRdm`Fl z91BatOZS(`%7f2J*T{I8xNs#lV|Qq~&*VKeg@+S7LOgjq9UEMf64W{=OBf^%{*$7_ zr0csN2o-)ABLNyo0=~8H@{NCwV14v1?`pos-zQ!qaj19_-pb*4_UlsGg& z)(|W9S@ier^5e91JTmPdfbX!XH9C3*S9#jyA~d5FcEgxN)G^@FyC#HI&kbbuy=G9l z9E5Tc54dh&&Hw|wS=bZg@_)Mwkb*cSdR^n3kHRVledOkEPr!ERbDPtMJ(wm!051R( zFz{W%iFbtKeGxC>9e)C!^3Cyn6lb0noCFNVbm0TOP?Zc;_w(%lSMw$CY~ z)bfsC;Qoi0$RbTLGl8Gv>5A3uJGaU^@BETeNk0o)?0x!1X2uo-Q%pV7oM{&F8A_Dj z``XK~!qW|%yL$c{o2$_*l6qbOU5GwC7fsuuKJb_eZ3{q4qbAffw1&rmxT5 z8+;D$EtoitU=nkU&C?e+MD-Mo4!UuT9wjzo$GDCQP4V2H6@rJ)z<(4I#qO@X^6>uM zGR%hePR6$LH|rXudz1~+%wIY=m`96lOHPaomwo0?m9<^a$t{$6tIUxKU;XThncsK> zLxN{T%$WAjR#H)#27Is(&@>{LH)}`VGV=IG6z0t}=+Y*2(u86gx}vSNC-92XA*3;1 zD6qvLeKLT;%{ejf9R>%yoLT08xb^+=-4|XdSHHt4*jOIS;XP}P6`xrc_->e;e%fQ6 z&J$4w`eI^g68E>4k&)N_mBBEo$~q9_q$c z({P0cf4ZV|Q@C?R^o6vB&gqQKqrM`Rm5U%gv?~78D;}t3B#v9_PE-?Qcp6!f;J3aL zElw7U4$87>e6QBUoSdxO%%m-u%PD)i_|d-iAuc87%H@~8hlX?v0fQ9{>k>fC4!1!J zQ*z)h#M7m*L6&wmH#);3gPTe`;6oa(ROo>^J1dI;u;FpsV?vZ4-TeZ7|4F%z-~WkA zSIX6IeUHth7t7m!_J5bP`Oo3wu+59jH4+Qz90;rJCk%zaK%=w1%qj=K?(NUh13Ovb 
zKTvxEa-eoDJByFYy_LE0=!?Nvoj7ypYUyGVc`xQJ7y7Q08BS~TRKp-WNYx^uOrJ0c zGZF=*fV(iXym^&q3ti)*jW9Uwu}9&^*m$Oxuv9@HcM!lQ`o*P=Ig}5+34l z#iz_ZQ#M6Cbur#Jo3kcyllF8PX<5S6l7o=tV?aB=$a5?cQho$psujqZJq;C2G3+XF zPr}fSd6EwYu>!GCdPXEZ2$T?ngF~DFkXH~4givvemeRj0>-KRv`t8SPbT_e0oMb?= zM~5*0>0s{-ZWgy@%10l)%L+$-dE;x}VgoiByR9^-$ciUnwZ2Z|9w67pBfNv3xLYE4O^nygPc|dL1_vQ-Z;7KDF{DubBK8LNz$i z_}yX@N1_f~Emaiocc380n;7$6{T}r)cGgd_SQAU)>V5N>7fqgVM>^7nH?HPOWj-^Z z{a^7|SVf5u(?8ZgZyqtB$&eD-Fz3_7=Jdn+;(>aNHl>t8&SMdQniY`g|}Eju)njtSE4J972LBw+Q5s*(APrGH)mJqX!RLT|LkU;;o^i z+1T(vZPF=-pjUv>Jk7B)${vg-39QN875y%BRkkAs^BCIRFF~|FYdZ9x{fWnbVenPv{4PP0=vB{JbN+s)aw(N zPFVsK^Gt2kiLnd*Ezsp@y+ zsAKc*87e+Y>8*!2Wl@>4nOM5xj^KVk{yFxnp8{&K%alhN>4mHP9XN8R?c0ZO_G1Xg z@`5~Vfurl;O$^}`Awqb`6pQ<=8ae*$YyYL0DXFMb)WLcXX44KdF~QgM5oCmSOJ4w{cG59)>3! z;;>1*@ab7BR=%wZ!_aYWuq@76P7wJJb&5|`JA>8C-nE)X!G@SLkRb9n{`FWqZd+ft z(S}Wkw%#2@ZQbpG#qNJ#FO_ptX=Afz*CEU~r|jLp1?B%w+nYXXdR&KnefNEx_1-)C z41fU%kQ70Y7HNg1;*@E*WG7L{G!@f1hu8|%)Iy6`sqG>`t<2@PM<#g(pP>H$M6ZwA_1GZMSdbA zDhGOLFbN^zFyr4lLTu|b6az#qqBr0cg{wzslbu)!h)}Zpbh`|l!rkc7jk3WG*UuK8 zmDzj0fazQ;V`r|GZ~v?Rx!n8FAC=EO{44T>;1HbDTtf)x0gDi}gPi{2y%51M(`4&d z**2DYV-a>g@qs&u$`#6ER%C|gPWWj)efTiC zWv6REY#)voz-b)`*G?8a-ge;LBk_><$S0DEM}8j(n#*YO`aIJ7eg0TVEh)bbP@6|8 zHo~SG*>~`B?!ZF%?mO>Br)@xvk7Poz4#Gukd}}CI_u0_B!Zk3#*el$??6<=2z^C#M z7I7XXnn)M~dQzV(D>KM?CxF7{0G89+Fdg8nhs;{mXr>(whKX#3o|T?jZE96N%(B8(2MLtqlYJDolosq>vSK+Axk?_UUj# z#f~j4;%-of1q^)Ry+?WAvV+a=4%ws!|M%YR(K2)4wQ{sM$!rnpBgWXoZ#=`yZ5wF^ z-{iEyRs1!$7Ma2>9vVu_N6mXq*&g+r_^Nyd@z|ucOK|n(81M z3KsRzMboabM9ln;KfP7XYaqd=O}pI)gUlE%ZKgeDk?bx4S0^LJ4m7jl8V7{CRU+K1 z>~9i3-Ef6K1I!_e0_rEgvU^O#J<{xtqgd<>#B7*~rVH*3FYLhHAeJ*3P%&rh99W%= zy!H9#<=_6>e_ej}pZ#9Be&uSUvHs%9wR6wm*5{c8S6_StUV|3%z1S5e`m(!$+><9y z$}?O9L*e4Nd`AnlWjc#Pi4nyo6foYmtbXtZ+uU>>6)w{UZ8R@Jk>EkXk!kDS3Y+JX z_cTr%j9c)jix;nz8|;pI_Utv_7=lK?6>B7pfzi6ymVV*);!>q#2bx4 z0ETgd6y0&b32hg_RKg7~7xydwAY}n7JP-)ob$T0`Qji%SL}drK)OBfteJL!rA34E2 zNMg-Tds_owCo{(`_IKZ>8#i7qbIzb53#6M^7Y0N2-GFw+PO+~Ew4{J!w2$6+HqkQD{i<{v$$J9(_Mw# 
zNN{v?INYn;kaT7>O0kr8mA&aowaov5UM4BRE%P@VgM8r7>xcx&%IXA7i~BRG_$a zqg%_=5R$VU;F{;|n5_cQubVo!5hofJRAN*PybfdqHD!Hsd{3Ff**f{Od7H$(Z2Jm> zKM&y|b-a&YBtGDY?r6eoJNbr|PO^q(<1#+n_ zYXm@~YXv%t)S)k5)Gf~SPj)~4s^w7kLl;IfRiYqCi-VNZ5Cz1a?8P#4;c+lOLfDck zvep=&^)aLKtllY~{A8iL^u}+OzDfLQRR&1b4*`Ub5P4arBuEUoFbDn+o^li>wgF;~ zu^bt_!GY|EDJ0N<#g+Em6dW3|2mA-Y#1t0z1GiF_jp~er zoiTn86BHHKilK1`t9X@@G>k=JP2w~gfRBtgH;0dS6L;Cs^My~$OH9k4f*bKLrc9~7 z`6~SSt(zY?rLGU}x4-=l7_^4Vn{WOg8Ey>0Fe_8kii{%vObzm6w5@A@Q54wz_IRiTIJ|J z)O&!aU!BBacXj}(r7re6l9A4Dj!4(vJ6bMHeYZTG`=oq+|A+Vm@07<+mcYT8GJSqL zW++ZiOyN$_10Dx)&7-eEHx3LP{9MpzPs2D;4m#Iq11u^BSQ?I3*~OJRVzW*Cvizst_xX$%ME3aL=RPH={ zQsyuM>>r~Hz&mtv=wr%$59RTIzOjKo*#mD@2vg|mV>6{4cqS~q-2M0oU4OrP<>f2P zc6F6!EavNC=4;0_8_o+(s#%{8& zv-iNOF{|tBVW(@WpqXQ%xyjf0OOgDTcv(ZZWT zp`?Cp+_nV{EK~Z}CUTZR9GnGK_!=dQqiaNTWi%%Jx#=1B#2i9AyH|@t+|!1b;a8Y( zH)hQLEXHSz1N?i7ajFa4Ior`iTRPrJ^T$|+oq2VnH$19JiJhzu`M{tN8XIj^X;UN8 z0q`8b)7Id-NAT7GnAH%5w+~VJBAOVqxz4)6shLX%zixOpf;;9qf)K(T0{Sbjyb`nA zx9{9Xz}lu=bDs#BlrqvCMxLz&SG=^hp){bt<=J2G4e-=@4&32#1rA+z85=Fa7hU6_ zdx$i$%*K0bxRko1a2M;xv=ASm0J=^i-Ey|ex4!jN6z2bjnd7b4wYeK*#COtRjMPG7&XcanXSenpN#DYyP z|5x}{ne)3`bDOhK*_D%LSdN?5hC4mqbknc9;Ri#^Dh;toXP|@fqAj2a`)%+M=$GWp zd-x9dgiOjAJnCe`rQXZ{sgCe0@eJ;Zmu_Xu|kDk-`|s*R*2 zVhqIB&KlqOrzt83Pbg}JJU?P_p755vt@6pc|D{}DO83;|S3n{S%b+NtQlRBC(o2wD zShB*6vj8@@@%azfY;F)?*4EX+i^#^5CRQ^LKTO9NWhcKLi0$cvJ7s5mp*+5|#5#tT z%K6JT%kTW*|5kqVC;z(aKmEA$Y|k+WGP@$E5`;{hVl7oc-9wm!ZD)&)=@c-6fN}Rn zVTAom7xfxJ3hRnfO9$&%-+unHa`nq!1wLZ%UYIG1Xw2*E9tifr;uJVEteyo?j(_tI zS;MWWC@mj)mw!&y6g8$sn+`srZ(nLGu zpsOmJ&& zF}9fv6um=W8rYkhVC3@*;7=%=*^F`)!T&s+9pV3A4Qml=9R`?j>p;08-t;>AYd9k? 
zHg*YRYnzB3So2U~WV-La_tSFt<(mM_j;k|MfkTz6HP`*G!E$ zxHB+OQL7eKV(Y5lr3JM{Jn@}q$iE`gJAMKfSGIWalDLnyN?|M#eBmGID_=+#ad00k z+u;G@QyhB2ptQ= z`;I!2r{}PY(q3`4lh7W~@8g|A1P8qN5EqfB^ULM^+jq(ovkzkkYYffo^-#(<_s3cx z8LSr9v0x$?);Rg%TR!VzZC2oyE;gsz!iw2X+Cic#xw!k(#Atc#^)Dl^Y?a5XH8{;+ zwHdP-vus>ed1NBX{OXvb#Z31P1^5TEkIMh`$NvGJ-i7kL?|qL2y!h=RL}|TNfU^x^ zAO)6sv}l0`nY(S5g0g2Vi3)bkKHR_02BeJ}rTt!4E046$2gj?f{(SZ&zb~v-$L9m5z%^?rJ7hZ)+(<@*Mu)z7*vnTLa z_E4bznqLT*Sf=gIQ!~?Sls1Rb=(-SuQQXfaaDM`T$Y=7@DHd(BE`hZaqgZGMQEuIV zSXUqSE`)_*f@HRXvUT?vV8hJ7jQD`>ZxX`VF`G5ptmmP9*{!!9*PkH_H134nfkAGL zv2+C&%BvTri6l2z&P-#3VPXF|N=*kYL%sdok*NgYx!|@O0pQ1NlT@S00wu_Y_X;3$b1#gk@#~QCyrcaST-`+;cX8HWn8B z|D6r-I@aujLR9Z)8m|Q=qfEXxh@Fv>ej{EKjKjFC>_LTVzw}#a~WzyJV107*na zRQDRI_)+jzn0E%l{whtZp)fp!zs}9iVOUy;je0fESSJHOWM0q{3a)`SV4XbgW6c@L z!>Q8|b!mXv2ix8a6ZhcJHn8pNAxt5E=WYD;Uh`-msv@g(7Vx&j-uccBqfc7P>JP4> zkRXP(585vC72okprenSvXt`quU0%m~|A$!t8T0MK1s_$oTddz+fc9NNw zzgiBcG_=k4l71es6*>r1nAe>n7J0FmmK|tiWibYkiAl__uCCzYU=jo;+*o-60@*u) zcQaRSlm+&ZIM`&d-RB>cfpf2vv8hYtd%yp`l#k!~$A(8+= zocT~P;i5s5Az=3inYjQ{@9dsuPl+>SlJJO~2%ZNJ6{{J`XFq?tTwt--5dkrYI$d zon~V|>LfbFtD(@n`GX&nd-v|)%7hyez&o`*jB@7&STG1mQ)Rczp7QXAQZWJZt~%xu@AS4HZW_`P%l1 zHU^g3c9nMWU1az|PW9IQ(w0Z|kXGIazkSR4lFmA|rHSX(fq(vwG{95S=abCe>txEA z(0-y|YrkGyUo3y|gFg@DVVeQo#m?ijTL;!Rt!^93tL*yxxIFpsY~$0*h=3K>(4h_( zM-eVE;k3@yUpVy@mj4BQ8ekNzde{whk6l?mxO=bcUOZPWvb*#UJmwHsR3cpPyu&Qn z25W5gId6qeu{_8zfugCbib@B{bsv7+hgdXq?{Ra^6mk%B82DW0w6uug&7e0(bRPvi z!#{h7qjo&S#!?KB&hqY({wW5YH-Gq-v1{}n{ExqnFFt2xkh<@vgy*tt+1Fi?)$I&>mXNwM2$H82Ko)@FBpT(1Ga^rc767 zuS~`#&m9%sn!dsw?MPq_-CJM5Sa$Iut~mH83yZ>k(oJUNH;yn0UC|d1sCsBO*At8& zNcZ(HdxT;qe)=#JsD!#Be0&E`3Su7K%TxPtUD0}d8bM6D@uz^~*sUeo{%wDDjbR_p z1_t+0!20n2-bJ~Xffo@K34C6>#t!HR(sL-nYxsr_;PyL!k%Ng71P_ED1wH;5_Sao; zxeUQ=0KsMuIL0PVqoBAK4FV9dg6>Ld^be#h)w2=1G)NSw+=9c=?cywbXXLzVGZ$m;k_z8;lVcr8xEx9u+u#FcI3VLG2nBc?HU zh3>4Tb4{jUYYmz0-aoon4d6OA!j{ z7#+{@w3b_jKOP_mGSdJr79R0pAM;v)a-Z3&Fh&615K+{In2|a>B0`@o;*>3nj=_ei zROw6=~sc;j*q@vAC*h7+ 
z#Tno6PvZoFB=4v>A4p%GLs=z_ZK47voZ;rjGdBY@bo3$$yE72V5i9Vun29D`APyKT zKl^d^X9HzSj18AFXU`IGi6AY^IvVXN;DwKCXz#%22&;?3Z!12`%cH`Ga+e(^ALn0R zTCMmM>KK1@U0UpzXNOHq4bojEezEe)S`>tAC zeBhrf(;7E$MmZKAe!0Nn8;l~c;B$b5G;?=9Cg0t1{-v)nbD)b7L?Sa#gAzJ4I4Mt< z?O^8H1+pg5yd*A>KS6|v!~>0{AV}Q~umUf^Y)A0*Jb&#=L~go+g#h2O1;P@t9^wp^ znAgAkyJh0ytL2@4|G(o7g$2jyQ`-STP^c1DK~GVircPX|08lZjO9v*-^;4_`JDcU) zh0CRrP3$5>8L&P6^kdvn_L%*+h6~h4Fe`Vu-eAL`V=UfcEG(H(t8A0Vv}u8~%<;ql zj`AGLJo5(z>W|K0+XYc4MaO&c{h~v*3rDSEE`!DfLfM0TwuWY z{`bFM{^*bXs5~P^y#9u6ViJCZB&dX2^RDIFR<47PAzFA^S(C_X*|i?^oBG^yedpPq z!h%1r5jcr*ftUUsb!-1dxOHDMlF1R%;96RDpoe|JU9u%Z=|DL>g0B0uw00qmZz+#ZH^>%21GDK>G|xl)$dJtlL2*Vls?= zIf{!P{bDaI704cZzy@^#Y^X>`+GT`QH))!8jnGpzO@hW(m-ox`xo&nhHsTDz1>f{U zL=kES9)s?zC~c$5pY^pJ{7G{Tx`B5po3v@tM0TdjwybUE*`u~wrpf%oXnk**T9(&Q zl^4Fv^!6Lx@~e5~w|-$iNV=%y_^!PSuj_Y_O_Y@IS$-U51#Pu|moGA1{l2CRSSee} z@~5Buvi$hRf5~jmDweaJGKHUNFZi+l?BZgy18+Hv_2>yZ-n)?09mDH76P-pB*qo+uW>KH;L9Cl#E&A;G<{}%+?Ltdfeo=V#g&7`E;S^?n4)o@G zj72*7J2dPXihK9(mLL4ZUzC6TFa8DfrtGF4TYkl>=k|88S*10f8H#ldx2i{v9}y*I zF)X!#tmd0oh>Mk0WlWgj8-zDuiuR^*p*-bB!PC5~Wh%@EeNkho9KxM>q>z6i9BBUW zn_6G%>Y;+u$L#df%+>M=E=*I?R~YCgo3_io;WxFeQBTT>z62N32|=7FB&@L(q6fie zaBv)f3Rfe>4FyuiLV;5#4J88jLRh!oihJ5t-eyF%6u5Yfpk=wbaI7P&C|nw?$r*cb zrBz$^9gRQ+2XS4*S#~Q|P%&iW7BfAknW0!k*g0V4?1=iNYY9pN!m4Amb#NzL-75E) z-8?4WLumVe@iB$%6s8m)2YRvUvygiTWwcL&Bz0P0;phnN!uq>!uy>04r>N*TPJ(S^ zBzTgGPT|QMx#(HfDi$3hgw`YIF4`;=NpW21-0KwB#godiZKfh%6G_LeBg%I7{VuS| zo0VZyzJ|fqJl0|;GK5|?Rn>iDe7vK4>6Mque|ho~+6qfD?!DoTC>=FN(h+<#sv->l z7lN$O{&4OAf8bC_I}XP->rI`3*{c}$>96J(8*yTpSBa>!Nd}Q76x6lRv?bC-ISD{X-;^g_my6KAM_e$@x@DiluG;CHxidx){QD84JltTC`%qfBJkiV3#>sfvy4yAbu z(+fp;Kst&%fUWX_pf_mSz8ImF`A+(C#&>Rkou(J(!&OT1W)o*|{U3&P@q`b~Z7*{41iX%TkIE3_OJ()JP7GQJ^$5UQHIqZrfi++Quzt3#zb z84QT*@4|Ze@Rwy9;rhnw-zYsqtU2hy+9DOu!BUCvbAik0uPz9oi|Je~V1rCM8c$j` zjhMCtfu`+{*={4u(Rf2J@bgz+DJxGNmA(1LxE#%v+1rckDRH%&x%hJV-T&!-E$_bh zr{(kae+==iq7mojP44XwG2X2=KM6*{aQYO`vJt48Q~!HRxjp=Ri**pMaBpM=ta&VO 
zT$;U&RJm48U3(QK-vuoAucKA36Imj*U8lo{GW!^a;!j+}vwY}d*hdUdw!q1hx5-W2 ze8JWKzlYG-mJXF(J3VRCra$~oi zTm){sbUA#TR~9!>2B1GCEV{82x?{DAByFUPtw-g-qkHT;yHBkAMXZYJFcn-K$aih` zS=rr$5g(3}bK}=h{9q*rR}KatpTqzIttq{|cqF=8PsMx1y*O!W^49bEzWtonetSYA zKC5Zk2Zj193ne}2JfxlHkK{SeKO-Hy#xE*;wepjhM#GwA<{4*sRNLXRN{?RGv^h`+ zKSDWe^!|rGFYmnj7D8WNxqRg!7T&e;<%?70o5aC49{>OGCx6V^0^AL7A=4VStIsO} znh`)8Al&%fUt(wB)eJs#59WVrG2Q(C_5Wo@7eHYg!_g)3Fk5ph=oQo$D`f0R53M{qW zE^3^)a03^nFNXrr=h^|%h2ReieqYl@w)_#Y$kUOA!Q~VScn|5fr%#{aF*8>PqzcS# zLaETP1-~++q`Wl4#?tcON2_c%<4g~;O~S3qhE~Z<7UEjIwmw>e zMU(5Q+>}!?wB9NJDhv@ynzQaVK9`L@e#~Arz~Pvu3rH6+?ql?=-JCSmx0`xO4>m=s zF&vaFkV85!&x+?9W55yhtcS41k7oubaE59RzU?M<9!4`$rdeb?#MHSn+^5@#c$B9r zpeu|n(;mI9Wx}0$dS;4xpx7}ESZ300(pY2)OaPT!mApOTkqkR3BYnl=fW3a{*S@MZu{JzDsiid%i_r1GzPrR7_$9$#5dL+@Fruuv@DL^*!^_dNWx!s?ZEcQ(#` zWtpz^(o(Kf-v#j+Fpi+_2zY^zuM#?l+Z&7up6SMZ(kpdy<^UR&&Vy!1D-SZ@jxw(F z7=k`ux*n|uH#nmZNo&Ewpoty~-||*}28dBPaF)sFZ53{5wF{Y66sfqR0+D!D<_$Rz z8WerWCA<)`-t(950GD5u&Ew|}8ddUi*2O(8R_J>RMDWxYvWn}H%Bu9GjA;-UbG0mC zIY4Qa4u|M#8q8H7hFPDgtfeQ1uqH(%G__#yk+hWUAwDbbY4r;#?8qMl)(^e&uoJ~0 zu~`A~x3cxU?<`uZiMYtei!9X#&mAW5>x`=na0u_KSk%v+Ig_76CXrDN3N_E?askWa z&qOF`)Yo7r2uutQ<#72Z)=6A?{aa<2-J$h~G)<^doP<@sU46)6;KN#HSTGsj2-h*7 zK%7IYL$Gl>(Q><(zB*~XLMBo^j zJj>#4YIeZ31Z1f4yB_BA+{f8&zdoCj_u@g}&L@WQ^?riI9tB5~^xbPF>KSR0t@yPB`(_}7%zWYIhmb9n@gxiP)Cw12oT{=fm0 zY4<)X9Y-wg#wFY+zom1Tu&pdS#c&K9%?MUAtA&tzXE)4F7!bhwSXeD{a-rMW`W)A@ z2=C9JV=c_212*F$z=6;9jCum?jj|*AF79#%0JFc3 zr3xl&q@3Bgdqj3Qi_$z18_OMO5#&4RLPnxr^9YvXr$l6V{A@PXJ~&&}&p>Ucw@s9q zV;0@&f3HAsnt1h-1Oe#Uh1RiDAGl(h{uDDW(qY??13M4w558(}-a^X@Z4jS|n@B2- zS}9xC*&Z)GH?8GRNIM9Tta)d?^|OG*`gs=j?ou zq(1k?>m<+o{^Z>;!tJB-lXrg1^z<5Sbd_`t=3V7$FTGU0`tnU?f^dC=_P5}ja4!^% zVXXNqVAU@gdhB%i8AFmYPTkDXD)9JofJwR+{u8(ZxVXh4hntieAVQH0RPdoCloR^l z3Tv2*?qi$nA{5Cp`x!7D6c6|7^g4qK{{lZ$&U~t6O$CS8CFM(B`cnDq&O=f;(}q== z#jWPQi!cj2qg2bzf3=SqLBxKhb=yc&TD$d+*MIoyZ+txlKWDh33Jw12w^29#gz7mONh`^_(5~X0f8^qMri`Ge6d-E5 zdJcXZsr&$V`PG)tifaWjRR{fYjQtC)mrHCScN#ZV6$sPXhPuVZsFA?L^O~&yiiuDL zAp&{*xUK4Jo;WE~|| 
z_hG~C>gM7`moCPn`JfACUuiG-^A2t0?9x>4ITV(q5Y%)faUZ1Z9b9XG)q353a35FR zy)uH3Gdj*}8;XKanY8pCG4^)DN5VJ+ogAPL$n&+R?z0EZAq&-MF6cX5Ql8MJ6DTx> z2)7f2MVFu}pdsia+QI68%vdB3(M6qkkP)D$e=wy$L-d>Y23XC@Yx%zTu+6--O@ws- zT2ax?1MyWxVr5ypwzrM39?>w3_^^Emvd?NR94=Qot>ky(TEArs^5q#$uIBkBZ zTe^C_UkR7rLUR>o-oJ1N>)&)_shR2|xjYhhX*_?<5)GmV$aiD53t8w8C4x?i3mcBT z%$g6%qqtX&+afZiYdlmO^ls?VoeTOlcz|w^d!ax$GpnLTIaE|W*`ZH9$ZIl(`FU70xQdfEt>>KhmP>xA>m2oYe}NA+btCkWPauPvVmkhkTE=i*_^{ zG1B1Ibl&=e3^w~6m zj1R)6Lhd+|umlG&Hvt4P(u7=)a0K77K30aK(nPnw)#}8Hczu$<%si4HtRh8FGYi(( zZy})H1eq>C$Zz6--x@{2=rl%W>SoFZzYhUjWp~{zgpOhEUHxx|IW$AdY>3D_%-Tj{ zHY;G2umc{_cWY_jopRdI=1RHyvmcZTH-Do{umJkFe;h`rUl*2Q)=|V%9*oH-D0V)P z(2XCl(L(h9(!#Qe#nnxaBq#?;{(wpvkQstJ!|!SemGgp0@I z`734S)o-&P`&#+wfBrXR^}z>aYYk}@fV36R(f%$l!wq;h#h1qLc3uU_d zLfK^CT4o{bCW1jSJa{kzU1!H>{CDT^#lKx15t+>XI(6nsIX`}tzS3WAJ^m3*2<;pY zHkrDQ58#f)9upn?Of7HGQ`j~8m{}L6oQ>9D^B9WL4!Wbl56pcmxTRH~Sx^)B0oTef z(xd;PKhUB5jK|WHZ5FJ{CW`hEciuqLFotB*e$O-0^CqWaP4AIy>J`Os)bG7MIf5HX zD@{9DhJDoZ;z{_LQW`o6s9NbWC(H6G{-t7zNxNnIS#~JuSzk zzkPoJK60z9F+INq4fNq!#1aN(j0Tv_M!*pw_?m;4Ry1i^da%zZ<;W7MiNoN$jN6VY#_Uxiy@&h~ ze_FssSif+z`#Cde@-xD4;tu1Yzi30++t!6@0#+CGIWxSE;JZmESqi2tfek+6K>iIP zNWJmK8zH!cy8`7LJAOo)>8lLD!bq+HzYpIvYKi-1%%Vgrvw@y%8Z*P>>7c9opY$I- z)dvo@Dqw8Wl%`e!Io|*$s&Toy5-Li#H6+j3IRfed5Ci@i^jR< z2H|n4T)+0^a_!nHxET;s!+JYjWHsV@^RTQq;zCFr8_k>YxV0UlN$3~bL1GoIQPpd=((F9|Zo%L4hmi#8Rl2QL_$NOCf%xFVl?;)25D#@XN)8rE>brshBaH zM3Ja8=8TaWXDVP19^+3>dq0_dim;3!j@gJF6foCSOyGgw@7}$82y-Zi;FeWxZc7YZ zD%5Etq9=ebDctpq?&+XGL;flZ-E4Zu+*I)0aV)GVG76Cjis6b;5#<^9O&#@>ju0d{ z(GOHe$RAv(uv-s*?TUl_$!B~XE?-)&MR1HAyt7TU@iQER04@)4!M`|G>2ckIiJVQ* zs=bIC;y8WSD1|QQ-zAM}9uu zi=as12d0cHqEEy<{8+k+zAod9BJz82q#`ZuBdmGDyLlQg$k!vkfX8cr@ms1$9fh~z zupQ=vf~3uR`;Uju`I>*3hwr;omKi&g_j-^GM~4l;j?#!*?LP9N3;cz>4}PTbFv4%2 z+0J!?e_$M-L1TT)&=4~dSn*u5AU$qljM%5Gl^M`+gZ~J-vQO{}@^tL6TRQ|n%F$P- zP~a=}gAsoIb;8%=xsI`Vwj7a%W3$TQ9>%n&XVi(*$^;yNQ}Jebp2b_V75M}oN?4pr z4(vokH&yfw_&)0%)}F=2$wrxSADswhVRj4#E^m$?3)C+ncF{ljaqpiP$6(01G?fRV 
zHM(BJqUmNd7|Iy(jD0NDfLLML`^X4b#Th*KguM^>+4|I9wyS)fv}7ZdlF&NYh_$Kn zp2xrPJf2Z0D;7smK~h6LdH%1-B45$M4dsL|kLCvq%tmam@Mk2x5e8F%+en)y{*tbf zhjz|r^n6XWR+2{Lz3-w^F+*_YAwVfG$M#x83u&Xj-n zdw*DNeeh1XO$4buH}^om-^F5L2OQ9%f&gzF9dHxn-dHJNaq!wDyzE_wewT<7SJ+2q zguo{>7e1%6AAefTUpgogQTNU=KR)MZ^3LA;0F?M z3Oo*UF{P;0pbgG|#QdOwxR$axOyf_#N@p#kJMU{}Filtj`Id+AJkkUK@~o{|S~8+4 z{OQIj-w&7RAx{Sg-99WM$|huK5OI{tC1-nDyfcFr4j=*sXfl%FV!$9%j&vc=iZW7+ z$RbnN-kCzcTn_)*_ul&vg8ovuj*Fd2w2XB2(GwI1qouH?1B2@}40n9&Z0R^PT;_J! z*bB>y3ZpE^5P46ZJR|)A)?lqV&?TES*+-rbr(9)Mpgjw|yH$^$W|oA7-o4a4c4B3Z z30OSLo5iM7&N$M0EV?Z${?>vK-$$)MqwPUH1|P76LNIY~vOPG_jY-Z$?GK?*@Z3j% zbu#cj@W&JLiuU4`{6fg*St#i71t0pRY16oA{=_rVc=o90e9iMF?@BvOBGR4YrvRRG z#9LtYx*nO|$$fq9InqT&R+t}oD#(jxt8Ywx)9B~zfYpmziS_-DZ~w5Aw|+?dSQ~&p zYw#*Q9>O{&U=hcgfh)k^K7?6dROk)GP@c!VsH}>G<$Bb?T>oPQX$1)Ro?%FBrx2X^ z)|rk224Uf=KjF75Gd92YQI-O-?jp`!b8HuTNdLAh2MU`L5cj6u9cy1!G$krahDPmdiBdcfBrmuP+^?C z7#Q=M*>QK;c3Nns6Y~pNppC6ht-By-P4CCmO+F;vDxFDBrp*PzxEnK*Wm~S|9;cg; zuo)Fdnz?xKV$5Vbe)K4WFL|$Jc&I=b1@7or_YwPBwVI+x&~Cc6nARB!`@;@G=W~I3*cZfzR{>>KfH4FN_pq4pOMx+O1sKHMS9y| zyFPa`4*;<_PEvT)dfdDf4&k(Z5u&?!ZXNSynQ~wMwI7*N{IxLC2aG;xb(Z!I{@Uxz z`}q6^gfm+M;12;d)-*W-wTDlxu&5-v8KlCobT1FE z-E{eMtWZ`|T69sqZ7;3pS{-9Q<$CtmhyR3ujkJjqsFruS$Vc1rO@z>ez7L#k9q(@2OF=*URP?n(ixM#dH^s!NwVz=qFrHE$f+DY?t(?Sh9 zu4(DQJ0L<&lb1$rLma#2!F?Zk3Etw&s-}fL7VuAw_LtEij0E6Acjhherr{5z=(rnM zoPH5(3d5r`qT-dZD)kfAIILgRmmRBpQeWij7yk;dhZ%?j8!Er?KA%6AxrtvmOlLNM zL=e>_z%<4Z3pdYAGB7geI3U@6HnK-8H*P>U?Ms?$(0rEp1w8aOg(NFPERV9ng6rdr z<#O(|Z-7)H$@EMhte{^-zzrr6wEO!b=j z=V3ih;IQ-LA|Z$!>1^rgBP_ozj+p2hm``(rVGp&X0ZmbP^_=+mixFiJ)L`2&P<+TwqXXQ+K=Toqq;aX zpx@*UJ8C~+)=j@kcb;9w(zimtauM%H@0l{*dxgHmqHyv*KpE+WsRaV)FZSe^DM|#w z$lQ>O84GB7kN%?a)z5w@Xk)?XvlW{5h#P4b{PVM)WHZ%sB9XhWn1#@=tED;xLV!rT zXR`PkS=t`{e4iOUmqPqYGrp^IX0n>Lo@?p#W1rXWTJOlm<5P4;VP3X3@cn||^dB(&a%BYC5di7W zbp^j}#wnC}VGtg19ZGVOb*Fi?9Z_!UzS6CWq?s|ms;!k@%fIeNj<4I$m^7sVAzf!% zf@|uNmTBmoGA&zWFz`S+&(rw{w1Kg&3S%kmZ*!db>J z2C-NZgJ9_WtFPV+*k)%Rm$_$iJY!83v?q;tL_`|+gKkX$BN-+v3>Uyim0C*b93f|G 
zeLM?~c=X!CI(gK(NIRhyw^UG~4`G-IrL@75d`KB0 z*6lG)_>BwKwW1sPbqs@n%7n^~8R>XY)^RX>!bW(6P4(So=}$fvMK9 z{dK+eD>0B%nz)d+gm4YKLC}%Hx@S4ho!c{NU7IW4@?9Ddl0Kp%WDzyx7oOYeBJ2sD z^@uFtYZ!K!k#f_%J!Br~R^?MZG)Oc-4QJ>I^j|`oqCgl*SDKa|9@FpL4c+xPDrGSS zac@7eEb~N_N#bFHhK!X@5)8fw+eY=-6k(bY~LE{X2tPFOSg}J9?Zvouxt@1sr zn&#Z=9HLbTX24iq2)lgD2FdQhg^1JOCUGd_9=2n5G@upyf}@=Y3{pyAg>?Y35lnpGktBZ3)P@&=1pO$4J^^8oOWnYPx%Xj|!Gv$}> zyjdQ9^wV;@wMgS&?W1h1z8>v3j(XauX-44`9h)m`GKEkv#RBdDB3rnTjRM8ulLs+H zJz{D7m+Q3xgUrWG;CA8icb0j{CS%J) zLScqMi>tBShXziunEG6~bMFJ%8zzE~zqZ^)DhzIJBHY=(qaQVi<8<;pAn|?PKS$G~ zvrX%>lQfyW@*|VgBWS(Jt3GR;d45u=PrW1mfLUQJ`Op%3JAClL&&v~BCUYMMti^6P zbqGU;=YW3bHp3Q*3nOCfJF`T6jgYg=Iv58c2eod)!&yO&D)@LUoFTk3z}eRvTs2TA z)P&WYG!tjX%Wr?@x62unh!I>ZR*3fT>8;P;M#)3Ohd=R9`+BVNg6}A#>XM?Oplf1J zuk}Om;GfT21TIffcnP7Mv(fs5%cv@bBvf(okYaco{7eO;n||=p<>~URuYVT@pQrE$ z3|KscCE%w{TKzV0C)*+KN*hU=Ngn}wlYgZPbFf~ASPSbB%|jEJ?+g+7uCb};S=RCO z_F@Hu&xKDm^|4-(ih0!|x`sED<5AyJ3fC$nK4a}6MvYC@FDQH&8B2jm-$jcURyjaJA zEK4?ozudih54;&7SGQO5u{|l1jSs^^dP`nA9D zxivDq{4(f~xe=PQdMFU(${A;9avhkaSYQy;v|kixnz$=`R89ewW1xUPY|vn>oiy~l=_>8g$5 z%h>0k@g(ShLS6II$CweWiL{xnNm}w5p6IXiJ?jiCj$g`zbg!tF=!pId@3Nn|UykFM zvPSG{1?=*-aQoyLBVyQ%qZg5vICADk-eQI-!iKK(&V?8oC=T0%i(iNLZm_B77B2RC z7>o_5(#`njER5gTm+iY^TsgXLfc>M`cg&fG0o+Fo9j_v0i;HhJnwug-`1R?rGTMi% z!TPApRf6*%TOLMqNsN2cX={HzhL1gj(Vre+YxNMMS!&k3Hrt3i=IUdouwdg zi$q;8$|C1`AG}>If&76}7a$}<$-$HucH#gI4^I{qt}k@>fi46t7~>YR5>RIN=z8x! 
zt)KfoKBgNC=0h;$A$HF0LTg)Qcj;~fGNUPNJllwfJtHu~FMsd*Wpesb`SeHssqC&i zLQ_TnfOuX!EI%St#0%l0EF_M7gvw`kJ_Qc;j5t3P-*!RlJ4}T>-1`OYP1hLsClEOC zm1D3NXMx!^)AlMs;<@6>>srqj%TBDF;M#AZt=kelcYl(!Ed`gNlb@$AO5*chJ+i~b zbK!+S$k^R}{oCL9HcHZ1>?Z4!a0ri%lpM8Q6;=m_c-~Ne4eC3e%kXR2HC@f4C2;aU zCN#?;J?koy(Sm7+LZ=gl@ne1X@L_rV=4;GI;bV+@Q7=Ny%nWN+5SV3J`bx%OHP)Ijs%fi4fQ2>~M7(Gk7evYs2L-J=4GXh5+yH=k* zwMh23H|@Fs9icCY!|wFCmcec|iE{uoN{-BApUn`ZtFCTX0r6OmJ2r`{epGHuUP190 zf)TUu62*9xL2n1E#MmB|4E&r|u~PJ~GgKc7D-V0XJ$=Zj+;b=oXD2U}f$mcXa=4Qy z42D7qLvtXc?69QE#3(N!N`e0v80dFey}R*s#c%{H3T+C!Dnt%OX$^9~JJeDb_@m`O zWztmAitVa0tw7O5Izvd>H)HpB*$CxT0<3frxF?@h(=?4y-}}4@;LWQz`M&)pZPagS z88v;p<0jq%u<(2B^ZHZp<~BA2ddfoA+qXZ<-9=F#SBSKcvEg+k(#8%=vsz#JwXTL` zbWqg~*Sv=jZkO3P+*-x#SwhU;EZ6U(_bfa6hi(D68I?E#hv3ck9`4g}>B2>X&EfL; zmtQNl@7{^>d|&Hq-;_6bwvT85)ZN1}9oUS`Kg1p*gADqvM^JcDm^B)ot|$GhX^M3Q zU%FYS>s|-b?1S=+-}rJle|nO^&lz1^6Uc9AuJqh^Ic6$! z{Zw&x!L4<(zdk~NP?4}N0DRy_9PZ-k2dL0v3Xs-YJV`UPjy`jMbaTu}!s)7@_-?D= zG0}kFoJaXhT!@u6x}z);M6?vn2hgj6hn&c9$o1#W-3QYoGV?e3=1qMMpOk7!HrYz3R6 zUROHzobE!@k9yg*3i96XA!LUyL$CXdGB2eMc_WkDDhb7Q%DtDE5tspb*pJe z(s~x@17yLK^S_zIMV!pbI{Uol$Aeb6UPl_s<)|>mjeq<+ zUgbrU9k4-1u8Y9PCVz4(fB|^H5Xz|Q91R{KT}wxXX_ZEfaLrYzT}AHkx%-Cn(MA@n zYp8oWL`Z0GDh;~vq1`6JWJ|XQ_s&p;dl%zb)Q2-HjF}C`De5U2A`M;wjC-Aw>n`r@ zKcSJ$*$wH+9w=%ex*Tn-*Fsh{>l^OX^ameX-Y$kX4eE;ov~c6vSU`?(r~{Y5Bl6ag z)E(=F%>^zY-*&S``zDHmuCU_-KGCS;Y>mclD4KF{qhmer0%P}6IG`L>2u>R7S9QQIiVhz&XChINOGYW*&hB|5Irb^E#5u}QvX>YEx;hV>C zl33cGo)C^X{`#Z7_3WVIbW8~I_{^@Xtd;XblISx+2=8m!R&GJ4Jdd+)CD_*6xR4l* zKtwd2Pa2`F3oF*)+FW_~(;t?pm%mmfuDr@(Wca?D-8`JvG-z4>+T$m zaCmN@)6b$`Ej)*6g%C;yK4qJ$JEeD&&EO`^!HtMGg*9=1o5hhR7+afMKfy&|@^ZQS zrEjvK+;sWihksg@?ta93V+!XHzTQ$NWhQnVW`xW$PpItv!A5y-_f`z(r_Nm@+764* zc(=cSB5@mkS9b26WcFiyt&0Kl8P@W#G6pfpSYr@v%F2_-%MzY@95?lyH!mRBC(j|i z)*wF2KYU_U{<`B=a&5jbZ^ay~d#3ST3;%C?{p)Di6XjC|7b7azHWiNc>li5LJoQLW zb1*So1={O9xg26j+QcFoQENg|o4C zVRh|snK}PDLc!@6^oDRXdhp<5VAC?gj_rHwa^3h!x{yp}+s{Trcp71mT!WVTaRQCA3Yn?&wtb96R3`6_?_}D3?qT}Y5nv{RZ#^?j z>n5)P2HU9h?&NtDNbIlC-p#kRseSo`(rUg) 
z0ZSn(crj4RE41o4AU|aR^6Rg?UjFYt`;(aUu!`nkpU^S~YvHHGFvDe9mV#ci0Wb#6 zNk0gk8tqHrF9RZ^kngt|1r*>W@d+Pe0Q?l3xHf7?ffb?DNKEc>yv1TkXO|TuI+(pW zMk$uo-D_Zuy-TJq;RZx_Wy2zGZXJTp`B*?53WS4g)H7jI!4(g>T4f(sNKx=@XgCNs z1!A#F3 z+<`7!xE4IS+d4s$gXxg)R?tAlhOf{WK z7sc|-zKTE!PEOL~IwNO-LJ>j%(UNRGaK<6hQeOqy4DONZE*xWq!H4xV@>DuGH%A7KsuxiXzsn?Lbhb(7baS%7iPE@7o%Z@YFCFBy$$_8p_scX)hA-3Mtg%+ z=vkxjff1BC(MSEU!6oKtSPcq4;_WVNu)D3qgZknV0IvHyDVrWlAbldloUqk}ySU=zV${6@w`BEAJ-bXGMoJMH12%-g9>JF0>JQ;}$pYn%I6 zH*uptYx2`Ta)Su-v0OV0IX3xv^3F22goZuG5q-kvOgHfW^p|}$IC}W;yD-u9a_*(C zl&(P*`H?;>C8Udmu{4yJ@;i@?uzWBpf#0eFzf(+F>a>=Gn+AZo8O$C6o>t;M_LDfp zrZ=ky27AnabQI$16J=+02L{L@*Rj(x<(vQXUzK|wzf(SY=Pye)I~}V`>{6!I3K!jp zPiAU?gFyuZY5mQ$x->At*TRDb)P>FUE?+G@3~rs_x`7Y;?O&Ar^Be44fr8Oxe^6a867L=8B(`cfs|?fp9Bc=n5qH+sm-YIU>*egZb7g|H zCzDL$yVLnAue@5GvCaJE`WC?fhA``8IZ>JBPt=KPX)ti$UmjY9!prvIJ(wP|3_Kj{ zc?o7o+3dj(p#i&D-@*>o{k`J|^HT^Mr|Bzq$cr@ytd(%;e3p3Y8yk1aH2&1RJ>#t7 z87zzJQ_(f5rzC5GcG!T1^*B4b&@p-GbKcd9ixQYJcDvyOyKoWk7{QTrZG8!!N@B}{ zA4Ag)AdyVKD0gGZlg)>Scy{jG`Lf0Wt&KJM7qlJLA`}UkY7e{D4i3{kM}(BbKz__l z*sgo%#-gG?WyY?BkjNF9y5I%JM&RKUiFvIRQ$Z2_!Y`(xVPr|Q@+>h5-w3O&))<+) z44eEM1U!Nl1O>#cJjj75@t^b+FPcB?^aF9k(@O7=BY(t46;6U~%q`l{v-txDJjz3Q z4}lQaI`l1#@}0Uq!m|H)`S9cSf%!NV{LU}T0ef!vwa+t4w~FSKvoa~LSf&DxR$(no zE-4fo9};z=(3izc^7rJz8flL@pm;;2(oeIa}3> zTd0)gqHWU&SqH+H0RygGz0Cc!a{t~P-r+iktIs%t^Ly`q9Am|+^DE`j^c2=e-RH0j zN;1&T^85p4JaEOrhy2bbzaT;jiv{5iYwIfK(1#5tQo*N+g2JBGIv?u(Id~q!5YhSU zZ!u76MB-;XjVh&}ZU47DI(7+SaSk7C^C92JI_M>7N_Rf${7^= z5rlF>BfBH<9%Y5aH7Gf-N+z1P7`usb6{DU7(OA+2SLDgH<#3%(+&HqUd68u{K7^kH*3bc;)xjv$}sQ3*a#0~>%ERb*L z#Dz&T#te*3MMMT=Q2X(H$^g?a@(g}oT_9rr2Ag|cq;+JSqMeSKnL zG?Zx;OM&y=a^=0>FJ1ruKmbWZK~(A`7Q#Qput5l5j5mhtv=8+6r{GU^7;v)RIOASj zne0pUN9&>@V}S-&YZd9K<0xrShC- z@9oJk6M>>13K@CZS9I&zWS92axH3sEDj1_9gAv$5eC!agqPJ(WoH{jH&R#rQ9zDQS z)v=lWDH#9|w1n{lWnX%*q?ARx=0Oym{M;wvsZxpxu7~5a_^{7wv6jlDvz$qZqR1Os zfS**sISLm5JA2fQzrxZylWxc`bl}Pl<6V@O{INby{5PN`NMvSPfHm?nGE*PpgJTs~ zM6OrxcTN_VpkEg}i(6@0WpWQAl)P;hMIZ!U+ItYXl8y!jd%4y+FP~F#6=v#%P-R{* 
z^I(;PQ#puggi%-Jp9T^qacH5OxEC^vueAC@yyGv)oi{L`}Y z^g-!mYW@KKqVAeS>Pub0X9(PO)WEMe!x9G$? zOW01J<3SWAO$7~^CeoWUEn1bn9W=a_K}}E3goVue)2G;hns;Y7+wQ(Uf2Nn6d^5bE z14SUu0-ig>cH2Q-i9_+{pv~F`U3%yPTL*WTlD%D4naP?MI}g3!uC#{`MQ6Xl+Lyya z2Ft}yVirudml@#eV}{|gTYLju9(6FI2Yl{e+KJU-erY}yeoHfh_)N|%Ao#;S#>V2iN|QkmH-gzHJ}PZl3Wk^Ka3!pV>^b z@vqWVrnT%Uh?o>CXA$0T_`ZIdqz}|V?=FzvViURlijV$-7#vlE-Ay;b6#_Ulj`~<% z59!{#^wo8+S24-F&C6L#2UY|y_*b}+5H8yq_}qcDTjAFKkw-q$Z(M7uc(ct+5dX+u z*tN<%eDshRVH8JZ05zQ4xOOeVCMx)gYlT5^Bz(?Dh!2G{g{asdi#)7@<-m3sV2-hZ z4%zmFov?iG z_rbvsii@*=(v%iw%+2wG@(33 z$D}Vp1FwDc%S6j@_5z~{)^UZrW5)MA`sEIxyth$CT!(?78C)ylaL5}K79#g7*T>#N z!2yRdQSeTBCt_*vqq=_3zUzC(TltFaYvLvNCwZ7lJ*;EmmZ#iUh5S=f{0(Q)pFm39 znHt=vKNVKe(1g^oVY-tNVq40>(2r{ zaDzyOK%ebq@P?pMp1TIf{+WF+pW8;ksIqe8A$=yT*k19PceXiS2W?rWJa`Tcr46#h zM(ca%6W+MP%=;FC|0d@hltj}Bh=G9)j=Ux<`34)aFOea+aW57`9A$GY>;{BAKq1*A z-(Hj)w@es8i5fwsoE{!1mu9BQFtZe^7|>NdT>~L=wjY|6;=99z_u@^N*k333IA*JX zV)b%+!i#Y@rRGl^p_sq zLYP@E*Is|442{pA!KJmZQ|d7vf26PYh-pt4PGo|jqwj8S-+l;~F3eE^pc%^{vmNWW z9SswrZ|pq2qFARlmzj}ZlP}^jpv0_{`A0i!tbx;8Yfo_(m zYp;~4D=(LY$M?#k`?qjexl{&mcUpt_XjP6VSCnR*)b@4lvdq8tK|X)uL0d>C*$HGC zGNnW-H~G2G_}RXJf3DTM`4ZuyyPRWvLeJN}RzCXZ3$&!)KY`is90zWjU+uPJh2NT?mjY}tXE_jbu#}ziGP}^wa(2; zqeb<1Uqd-{F(pdi+E2^F*$;5%IA6~8UMT}dljZKCAF(!q$Xk6=^pSbsU{`AxKtxW# z%D+!&xfv7!H-LJwJR5s9jE*CaGcCJ=i_G5PW>|I4oc0;kyPYMhi$+CTU`Of=zH>kY z$1n;R+yGG<^!Z)n{VCSj^f43RBJ3fQhHj>XovB#-d>Q3w8tcUf0$#Qgfk{Ik+1Xkt zyDah_9A(B481|T1l+iiOdW>SRMRciyU4#(?OO+krM|b9127LzwU_L%{@uiH~0iU+v zC-qDJ(M}*zOMkdS!9!HuX@W?XkvMEmxl)7Z_|9k95VhTFvm`HRnyH~YVHTQb5*dg)Nu!1K*6rKn8{c3o297n_!}668RAekqLD6!2?zz%* zRjdsF;LLF?!}oq?T4`Tg35SP$sM5XnzVqI+^_|bW52c&$%-4Ksn|kdxD)aIuH|?{$ znxEyX7)bAq4c5bN#G`rnt!Z@Oasjyt|EjZRxR=JP7->hD4W5~qE-Se20s-x0fDCvd z7H#8c;O^22gZziDBD8XjYsw3_c+y3kLS7;NptDE?Zpc%Zgv~Kc zT2?`GQL{5Z^^o#1V{^|-adl#RwZy-}(Eu5@&Y^0MFqk~5jaKg#fIxy!k!?unVG$Z9n8C@z8A$R7Qk#6J`N1s>@K-%Y` zGua?79zi6w0Ed6015uf{=C7|z6Ud8HCx2o<_f}v53SJZ1x&lJ@kby9*849#p?zGEq 
z7>1sDmu2O^ojHOf%2MOYEPW>wVlQ3QH)kh^)WaRQgDM1g*N8c)Kq!xyZ;zr|4RAApZ-bdB}xnIF9xwgU||@F z>X~hD-KeYN?N;>=4B=ewe*PH)=1iG};dE>HC4Zy0JiYbva)xyhr-?h>gBAMG-Oq?c zevWBs7OJ6{Z!!fOt|8RdPG|>mfUC#zb+1nopT4U{;)i=-9_S=8w_4nn0;@$nm)f# zI0z&|mtC=wsG!JH9iXk8OjgEP>ydSk?zM_og{)AE=3C|SlJ6aCJS?}S^}~*1!Ve=j z9UX7LB=J?G!>(>TK)|0O63r#{Bw_dH#rrTDtXPZ^qr+1%OE8Gr9Il_Z3XPzM3}LO= zE$b}yH>ytuiufMoIn6qN20Be_{nISAcR+Irb(?hdodFg|PfYZcXY==AyjUp^s`{{C zY;GMf^D@Ki$0^EKhV)Q!U{H%s)`?DKq$m^;tS4h5^jnmWAwtF)vE_i>arcfG3|T+4 zxN0PpEf#X2XfUAK-}U$!C(4kUE%hL@$hcg;A!A)9+LfV7-KEx0vf@ZasUbkNB-Klv zPQ5x9%LcljW7{vRsWwxjI;gZ_n+F`cjH%}4+e$xS zmBRWHppmw=t=}+t6!dd|YtNg1`JZt0ScJ#nR)&J6@M;}GL4wD{!O8E#MNdWLAwx-UOwg`}_f(7rn*=dfT4A##T&xDkS8KZK6Rgt+G`@nhTDOoM zTnZ*9Mp!h6BD6e&vOF1ebMba8dPG2SmP?Cq%udiB=09T>_;W-WV-3);_=tXr(gt)w zUh8SCRCuJv+_Q)Wfri94Zz(JLdC*XU2U0~Awu#ZIdJ!0U5%M;eRa7Yp!IJ0l7H8A4 ze0dpVt28)x16IfhlpS&F8{ErlEX$WU4=5BIz-gUbYc;I@_Zwk$5tymM$+=!tj~a*f8Q6B-zyUO_qpGiX5isJ{q1SvEpBxO(MMx%tW~&_*|TOfcg)AJ65zu9Xl! z3iL^P2!FJB(78C^zcUV%25TQtptlU4`^;bV<5UhJ9ca>*+(o4}RefLcsQt%#5A!n5 zsx0_ip62FVDm+?gUFRT^vQ4Bj^AL8^YJhTGiVB6he%CxzkldJbkM(HM^dPf9@&Gr! z?d@Z?c+#)BA!xKLWs;Y{E{xVTFf>MhjA3Tu)}YB$BC;>TH=u3lQ9Q;0EFtuAV_odR zD$~N)*QBu?p#Xt%4GttRb)*sJe3soih%8=4TQmuy-Xu;)8@&0Q$BXCuI@^#=#jS8! 
zOZhKwhwzUTU)LVsan{xl*v@|1PizOsoHoQww^7oZ4YzK`Sfw1FHAv94nmt#X#ZbY( z^1nlH3eLOy+;mVyy`=yEZy26!cI8X&tvB_N_LErbci#C9hi}g9Tl4k{@f;1=T(z$AMZB%)fvWj}@5FlCN0YIZ z04oeu)m!hrSuVZ)jWTiJI&I28LfMuRWqBQ~=a(EHRRu$shw1DaSPwQ}(gRpz91s-J zw5lE>C~O}slzvysGvIZdzku)VY%ugr6p6hK7J(Tz`!;(@OkZSu!_6}G2ZVzC(GSb~ ztq*BjqAKaLMv)ksMMdI77$g>myH#CgRu-PzgCiIQ_7VfBVIk3a@eO`<`=fGd>MH9b zF5zl7Ql8v;j}5~X%GsG~r3XQLnP^Tf+zGutIx>wZ?wh05y`JmyIE$CIwD~S5@VR6s zY4dTim~6FHzCjDkXHA|pC(m*4T*#6Era%8{ue}}&{IkzKi+1x}3jQZ`@LS6{p$Wmx zcJ=HxP6;1inb7yvv<|Mpa2>#-@a9hdla?DAA<~$>{xVfU>~~=;fRa2y;Ebz;Cm%wK zEU4Z`@a{kepE>gzxDVYZORM);Bu3OF7FZu*t=NH~Z||+JH_16HRR~P>38>5^gR%-J|%fTh1JL z%+nVX>~jEjWHoGSxW07kp!hP~e2PV}`-UYYVhnWZT8-UZ7}5^H3$zPo2w!^JMh5L{ zO|&@(ldbw(E3{VpEnIjMmQaIz8GX~<8y$wT#t9F@ojVA+ z_Q1iirEpNbgS>~ZXz(z^pjKy(hHC|X@?KcPoq~!-Sp4MOF({cj9!B}r!IE{kaK8Vw zFR|WdX%08DKHLbWQECYyz+h`-73kvA_&n)9?#(3do@Kkn!wsDjOh*Yyv9pEmazlR# zN)?yH(_rnz0M=~761qWNT0gtG_E2C(%CqHlT<`XA&vQ>9+y@!-wop8Er8~v~*}bvD z!tCubiY0#P+-U+1JR%>$W+LF}o)a78sXTxSg>?lK_lNL)i$#WW_a2quDe|3Qts7%H zO%t?aZNy0!IN}TqK$FRTIa5Cpa+dts&nYyE6WckyafBwrF6c#y4eT>wyAG}o9BC9X zNw_PJ=O*pGiJNN-0^l$5prBAFNEdhpexC46h#S<40iD5JR^AEijh|)r`GyiQ@!*^q(3QD%t<;y_6ARg7Xi2+2mTap9_)d1?I(+M)a~NT9IUW;X|KmgAcx zW`ck6&QD@NylcmSY@C_G?EvjP)*b*1Q_K2m6UFN zqlQ)oN73K8sC~(Mg>n0VKl^}inAqq|3X!&jmT8anKH~HGO%Y5pe8w|`YuZ-Ga<)On}q>m$5f}vhz9< z`%drszM~u602&(!5ZuH?(rB}19FNCxyp3HdCsj$RQsoc%5dRaY%0G~=`Id506OTQa z8BrctqQpg%D1rnDVn+kr4fMY6+wIQp_niASn;_|#7r1@j`z~jF&a*$yi6@}7A;|HV zbqG(ocCN3HJxpI>67^J**rx&^5b-#d6D8Ak=cZh4#1BTL!VV{Vycg~BI(@QH;ujkl z%FZ$_3Lc%dyOt5b(hjLyxA2JBgQwWX6*)e5A4r3>{+F2RIqn+^8JXSeJ9X8fTsGjp z0tMrKnn^Ru1Z87IsbYt0g}X||xs@2vOp$C8hYRD`!$F8e7dITnta*zb(22Mg&-j3~ zpFHesaV>gozhjXr)mMQvkI-=D0a6R5q_zuFRG7$pw|M`HGsiL}`9xLuw{5k!T$i`i z3oy@sU7idX1c_;sN+2C~lHQx%L?`EI!!o{aw#;`pd}W+96p3_jaJ zgSLfcwFzxqC4G|4>Qj4I!e?<6L7}-l)wOh{cGR$|(-Z6!LC`BEqR^6}U0!Bq>U#3~ zHW3*3;*Wov#y{>(cRqSQm5>@mQThhvw&w5flM^WkMBFgg+H5xqWGUG}S|2z|05WxI zoH{Jw*Z$-Iz`UP22ZmEEfxuH^_tT^MciC}!1O!1U*j)fzp99`-)S@oF|MtTti;*gx 
z{_5MsyyU`po(m46pf9`?5EuEixn~I?x;T6}7m$DYw|q^Fw0^S^FSSv3?%auI!e25f zg-{DA;1-jEMC$a?#u*p z+S9(4Mu%UAN$X1u`>koObD6jag00yUuDf%THHM2Enh0}h+ba>C+JrTb+Su3>u@vm6 zj=Pe=L2r-RFx(cVmI>H?g!;XjG@6=`S5r$nF+ix-vu`TU#oTE%Fukn8*CD~#v$#{8 zCV@Z)f#>_AhdbbNNB~*D3G-#toq^NMFs5ohYN{Jyd_bfyQ#I~y05fFH?Fy1DI6h1Z zqsEvhOz=Abyg-`4ShI$_sK!md``8KCh3@2(!H3mzJb%JeF*k)qjw0hYIfs$I+cG!` zIP_a{x6>$WNaO-01Z3-WKKT^ocwg#s|Ngx+JvG7jVMbOMh%zLBmf}I*C{&{3w{v6F zQ7V0wkXugF>z`+jd?PaQ$OuWa92D1;8aCUMnFTeZ5=j{P@_m0>qA)D%MdlKS1}qRb zZ3BT4ZJCM|+IH>gm2~UQEyh2WsUpZm72{)BnRYl1-U1%DZ`}tZjF9dcOkYBfbdLon zRkY33FH_x&*}bm^mrMw-eXu>jka6CFW#ZoC9Pb>F6>Sv(6x>Pwpk-t>0jqOz$S%cg zt@!%GSac%L#_rZAYc^Q}x-pqHlZ!AAy6h=D*mE`O+{1_ZXbYE_r)1x2?n?*w6YDaf zW>P4KPxRF}QwZUkHv22)yO@@k3ybA~|NZvm_?@pYC~>I@I1N^i@u{VKr%2@tm5}RT z{{!Hg_cmbQ$Juje69GR22&UF%%OLpikj}(E+A4FE(;Ev*T%d1FxHWFUSRY`r+}j~t zIo4+tB+hLG!PSjeYGlCoxUVHZW_Emx%_rGS*+Vy|Zs^j`=^+@i>{?=AW%z&7b}bOk z0~;Uy_P6Q1ci)YSUe8~-5-{-c%P+--X&-P;t9@wXnRl%l9<~+W05`FYIo~E`NPaS6 zX&5udPx$dY*L$(H#M#0~F!QXJQv;`Bop{ZUFkc7}!ldsN-z8&Ku&!I6G(`)~5!O}P8aRX5*Y#>`buEN6 zpP{&D^TN_{768C&j&Y=cg@JBvZcWQ8cf;)zKMLw}@PH#e`nKzq`gCfNx!R%~3@L)b z4sh>S*v?{&M*Dd+3$+SzSr~FWOp&T<)z((Ff;c~pv-s0}Jv<5_i^};`hCcz4qi|>P zjpDFA=i={j*r!l%aALh$hO;JK!3a^r5%}os04CnY?by&?JsIKv38!0b9_ZBa{+{CcOYRR%7jNX61;u{z9^1|q2u3Ovu4=!YhN zHGm9y2S>0-pjpOki6p%AnF&FV*+k2`i*~iP1M~1}KZrdgK7RYJ)7tdo@YAgU+3aGi z(LF%I$IBUbIg7s095W59f>1Pxo;iP!&F*?&KE*0zKVgUC{q2?1d+u^N{oK_w^>{o2 zJUg(Iv>9;2JnSKvYX0+ZJoA{wPp=dwFf}}(`2~mhB>dT zLOY0A5wOG3&`IGir(|h*Jo$UHiFfcNe-3Se=`r!=MSF1j!6!6kKb;*pi{_>w%`eWy z*jH1|u4cDJq;65_$$7km3T7~8oyB8DPs?}+aa(*jc> z@eDzE+(RSe6f?RRrpt^q>oQ=PKhFC`M1g5D^!r&*?MJ|#`#D*YR z<}(x%^sSn47OOs&o)FvqT(ks9Bs;AA%p7GxE7RpzIG<{OJ(mDG4mlV)udyZq+S>@ct0SJO zo`8DusBdCjy@3|_?$}r`S2HuS>CHFaNFRNC9md^&Kn2_te7GS>!ZwxwQ7xU}4=tnZ zo(EF}ajkEKVP%vipG-%4!uMF9qwT?%Fz!8F#5t%DL(^)&tEoz(y^K{eiOuoe06!kF z8K1kQqbNanU|nu{D3h^|W^s>q*U>UIp)Hr-yDRcxNCI*VxNrh+?xw!QSg5ctn7%^z&YaVc3V@*T4kH&?+2kv&Rr`(O#jKHHPN4y1tS12d&~;WtM$eXHM_m 
zzZ1zM9*&I>7dDJ#^AtivS1@%79|a7CAORhc&WW4;@Q?l|-Mf1)y^A1WoQEk|m&p)k zA`IOl-+ueov;+4h;HRFk4-GN%Z{6F#+a5@B1vt|kSBpgCxMyrlVeC9uuld&)ylt`1GOS&|o~~H#YM$dNRF3Gj z0o-upHL6>Z$zWt|ZI5e73$|-o0YOGv*Gyc*fR9kpAn>{doM1KT3bwl5tRkrCM&w7O zfw*A?SHvyigp4&&K=+zK?92>IJioE1?dS#rjAtc>GiiW6nTN0*DBvwDVrcU6$sB;Hi~C4e>%-_wB0F}3o+Dp21Y0zm~CmJTVywgjHw zZY2VO^{X5(yDKw7nN&$5W4x&^te^fak6pB%vT0`2pJQCh?er#aspr8FvA^jMSOlNc zfUA#4z@XJkUQ%mhV*^%fH$bL;3cvNhKI$IE%NRJULAv-u!x}+Y)wEf>>Y_6RaU0$Y zk$I8o{!WJg>7gbZd`4#^*bcBg6VbA`;@siLNAZroB)4975te9t0}ijvNa{N<@QhFl zh6s0=z^XO^`tn!-ME;8brF05#Qu2`j+xrf*gX!7v;mipzycCNmZt*nVxhwfi_pWKFt+DPYK`X1H+e9sSMD4b-qqiBf$uG~j!LHgrgXbmt4SMCuffiR(IsF{FV z8x#nT2|EG-?C^dgx%S(;36|pC&e}W#!M-y|2xJEYz8uVfY+(L%LHf?0{3PAK{(c(! z_`OtzMWKO>M%n>o zPnP5&=k<7x$G+5tqCFzPtnr~%5tW7VM+ECL&ZTqE&t3--tb6w#z_6KaIM;3eB`W?F zfqK=$cl~&+lXR`-k&br)sW|Tt0|;-{K$kHOPefVqU`B3Hm<_VOeC>-^MX<&Qu!n?; zQ~yEJW9D|~>H~sz+|<(YZ+CAIxLbm_X5pZ!l5ARg%LwZ=>C7o&ZqgRot2wk%UBC)H z)o3eC#;bK=7#Ai@-}mq1#>CFm-92oKg@4FC@d^7_sMKgF+1O7vbebXPba4r$AOG?_ z;v)bl0{GmdrX~Dv*I`1OFa35KNLkwpyxPWrQ@s#9v!mEm>c=eF3Ik!l-{Q(-T3MV( zJ+!A4X0)wtkUdcb(}Sm(E4U4=tdWI^y3P(?NWE=jiy(FACpUhZu3dR94fLKv zQ@TP@g*ITi9haBc7)$-wmuahs1L|SUw|3mug3Z`U2<)|;Fu(o4zJX#g$vH4GGY2A2 zm~@do^AdYW2q+xE{Lvq7fbhA%TnS@lL$T5z2*8<)LIHO&FuoTI8}A8+8Gc}RBus$| zVRDdb&H8Zw5&84K_$_W}OELaFn}to_khPy7(AH3+?c(D<*Mbo4^Bekj_P9AQWbS0?Lnj395ApbgygoVJGVx|EB)^O5;9 z;QsoJ&(c5sM}N!{n+TbwV`HpMf|fO5H#OCxWj;%v*vpe(P-9#iA9r_m44Fj6b}s}D zl`3ZZnwg$T9qhZHMJ9Z)sa?57!ClQbLlz2<*lWV^pug^fuEn~qt0hg$!2H2ftYAUw zXY;pErZB&|z^N9h*@YF>TPrZw9D&@%PgK@8rzmQ~l+14n_*cPGL9Y>|j)GGa>(Y5O z9%pr8K6SMq{Ni$Vgy5=Rjhwma_!09|67*J3?4cLvI!u>qEBA z%n;QrlW}lCHFZ2+K!58^(S2JKs*f{`EV+lI;c`g6JGO1;BL}uX=Gbf*bq`cnGBymJ1mi%ig0t zzlAS{)$78yu0HM1**2^#OKh;IKppOZl#g8(7}JFh2xo3x2aUM54Gj%O(u}+1eJx^L zGC(1ivn~twLFYV#ddl*sWZgK90Vm)U-?JaK%Q|A-IMa&irgUz2CZp!~M9dsB8p18@ zcTX(WZa2bS09Bo*7KHZqLw8%8L!hWK6BQHnrkT zI)F81i?yk6q<2S2AqSk~`eIb$NreHG*>S~lIuv+; z7hK5dc?P<85LvujyR2>6y@XO@VUu`M@WC$Q3$)5w(JnYfg>|b7+Uy8;?jMxId>tW# 
z%U8K`wAOZacTN?!)rY{~9C9BXP$lRQLXQ2k@Zc?gU6)79AZ5n7K)5rFa<=@kei0mM zaB~a6#+ERz&PTM8HdN6@6Z<^SKgT2U8jQ2A*d!%p57&!y#Qq2gHYeJY>6>FO z9a|^nV|9zTFmSJvH zu*B>9F-yTNM59{OWYm!dFh5XK;L{oKU3i|y720zQGu2UY4p{Jh_^i)p)+CiFn&%h4|KrrxKgb4kZ>5Sg z+#=A-h#-#RI|7KJH|0jhc1d6)1O{Bc_~JD*L4_$-K`W0*hTGh}XnnaI)-STpCTU;^5%A897E00jt$d%!Gci4?C(t9N0< zpWwRDpSs$Hhz-DW@$Fv zy*CkX%us>2va8XPr;lL_S4iR$-NxO;gMB zXg%2{0?qFR#C4uju|pjrASX5%0}=J39c)5Nd`NaOV;vgn7+0)x`i&kUgdO3&RnK_T z5%;mnZrUXTqxpqNT(8hz_l&??W3zx6ksvELs0o&Fl{q@Z@~MT&Jo09$k$Ev-(S2%k zrCLW};buI#!K@QwQb$Ta11mYbWswIs8yADtFGJ%CPYn8H8`gt zYY#$D(0xH>4oHo*6+x4>*=7|meu{k&3}jzh+#s7#OM2njXc{`zNwbMDL6f??ya7Bs zMTmGO{p2Tq!afc6(%JJUHDCm9-1;mwb6O;Deu}_bgZq2gRo(#J_4Q4H?n!ga_`7Bu zH;J!16}!%N5$tz%v2?)b<~1L*Fak%_I9C7B`xq_RFUnB5T0i4luo^cdN$GYqO6uianDmbYlc8%YI zWAK`pQ+CmM?wWp)YYam$-0l~m#e_;JSNLfrg`|lTOnKRiTVi(v1eN`<~_$p;ZR|_C@V9`Bt=1xy>{(dB;{CKB<*my zY~yQoas6^#yB@{uzT+B|FHkF;g#sp=!)LrFKG3~n8w;6r%SY(4)QnZZGK60f0kq(y zATYYGP*~e&cg9C3L4pqII;1tYmhmP=j(61-%h(ODQP|u9a|kud%3J%X&?$`pFHooM ztFtpx;6h-KvB~|ldz>?hToSRz@904JPuvI3nalzkQQ$nf!PrZ8<*g)s8HaGf?>r}Q zTK*klLEmEBjtjTK;>crG{QBy*&*qPUbspQCU8@tXlqOsZRh-Fqi~j*xlw?@3Xxvbq#x}w6I<|uVN0}OH4eco3Js!09FXe@>wDxm#B)yI8@?t zdcc^QP^Z#8!+1`7WgX66D5Mc)z36opynLfL)~up=ZDzbEzV-Jyv~gk&VTS$4c3A6` zY;4OU%yBvP2`v!xEP%I}DitDJ)QwDv zrgPh9Yy&n%cDP<5Ko%lK5Q|87erg;5id6yNS~e#4CH8@U;qM|>x^WuKFiW5O@;|1P z$M>*EXv#zI0Khfj+x}+{+h}n2^y4vn=Qr6;;vx(s5;llJgf;!>UfS(HlX?cvB9R_p zk$4DHIRSaB;(g1AzCK%e@!NU-YTb@C|Klj`7IRX(X}QG_z|Zp{;CRt@S^R$GIrPa~ zpyld6?{A6b$@)d;9x`1vD}rx9!Xc<|pz|&S3e}kbwkkq%Zh$$>4%(Q+)$kMcfQ! 
z$_-{~#w6RtPT|6gSWeC6E)3}2HdYOU<&Jh-s{m3!t-st6lf6nnoem@8>GMWZ(Zi4;qjy*sdoq{kf^JRjpq#8WxC) z+9sJ5-QYk$T-VKjwNJ3Psgv~&<%IDM=!kO(iaXQ`tDy;{w_a0odB}UtyW06m=30OH zrMVfHE`uY{0%7)*L5VfMd%|uO9y|rUPOJrCRB-ZAA!P6qoGDW!(*PVZ@8YYh#m>*E zMT$5Do)xbO1C$c=WX30FxM?E+#O%}paMX~#_uVUKyV*-*ZjNj^Q@|nSZL~^i+VmZM zygWmy)Q$ARAAFB~AEb+Dr~l^d|H5X11n>`@Vjk+T5+O`rVUS@_D|PjSE9vQzr)ip0 ztRBvVzRK-ri%;PiqbrmQzs#LMyt??Q>2hrRsBL!ybSJRCyJdHl9xM<2q<@`RAnOu> zpR@vnWb{?nFqI1n?3}!b1?BW$7g_Pz()QhZ4i0d`n6VM!HfeSb>7!$Rnzn}<67f1m zS_Odx*M2n_O|~W%(@{G@?;sh-Xrj2P1e3qZyAo?-f+T<22nbuj`ycVOev4?4N^gQEP zfzqmp*)Z#wWlH?2Uxhzl%9mATUL%_(NC|?wFSZAaeCKFWXda3Hg=qw-STC8DIS%$o#$T7KM~@!GI(8n!@g@A{ zt;=^jig&3v`Wo$sJ6hUwPgQHqx`BxaVcm)0SzQQQNxWivB0d5bl@7U=k7YP7;u3`o zgTWoM!NI|BTUv!b@DOjfPE5$qim*R&ra#6?CfbzN2biOFpi^3mo6r!qV`W=i!Xm}< z0favp+QRYYzwdc(RWHC;IfI=K~nLLI?h~!-4T|phJs6V>{4bZo_bd zpCLtx-Y(t^wl1dB=S5iBi?qF1p?YBpr}1Ws*^BeO%6;@jey0ZLyV|| zj38=MbNakq82PF32lP=*8H@q|RmE<uDr}%DX!?T{#>GdD~PwDZ^-=upV{1rQ0 zW03%nNHFMWxvvy05~D47JY;mY(%jTIgnK8QzwmsxQ@JP%hMj*p0fHjny1O?mq5<4o zg8>o%PfTOcPrr-f7$%Afe4o63;_^417S9)pIqI|y#)i@$kc^d8@+Swv=RWyf{*)i( zSzq}Qtj63khRlVtYmlf!Mgqm#$8C}c^YEGsjhQJNKX)|Gsrq0fF-fX9J7T9;xQ0NQW2s3Su+|ga;*__#RHCySZRRoLG0#mj~trQhjFw2*o zlL2S9bo}0DX4b%A2A*M0;CaoO?VWWncl|JxYw6al`;-B!cb}o27Sh5FrT*?dm>h&m z7{&1kch*Mh4wBv3gu$bnn(97~Qa8KSUP+Ip$kHaE~%%I?m@4WK;baD6^>xFD)m{u!m+7M6(_}*TK&GhPP2I)f` zjMNN-lihwnWPN?35M;F8%$FNRsfkmlGj*|e#c|Tz#a-ZKw(8iySB77P2ZKn=r7+XX z-WR&7IY#vjYR~Z}uWm-eo>Lt*9;tWH=9%@VvZ5s| z;i7c!-eelWQlyXl6|yzynzBSTn8z@dwxAV8=j`aYsN>-6mhCos@@=P4e0BeTVUXHVO(Q>An`$rKW&$* zS8z~cSxJ_uO?}Ng66mGgd)3i+OV#p{4&2 zSGc#{`fIFO6Rf}5bmQYs($%Y1)Af%&3EEdBCjl!jt<@$fLSvg{j5O zeyJN;6_)dK084OTneJryM zTqLNF4pD|mHI+pwS9F<{R%%7=>KVWVzZYHuFPlTkw=v5Q|BU z#@eV8M{&$|P z3pV2eXBz^@K}?_n?#Ix@bI0XdcD@{!3ZcWs1#p(%angK0mr=anH=p?Fcb?p+0L#Jn zo$CbiZKO*te=i;30-~8LBPc7)ZDac5Jx3$_jsfZlnM*V+>-Y?d@N}_I3Ux77Kv?!5 zaEB$#N2JIaI{$o{xc@o#ND&9HRhVZGm@p813+?nSl5rP4$d_OFZtB37^24`(n)cXK zt`3Gx2E#TBHsM3Zw+qUJuO_D&0N$9NOP@n*FFk)XwGq7S7Q*hlx=k4q1bM5Sfy)bE 
zq!BSbdG0MUuK0(;v55VQy;b zr5;jMV)fc2O|U|zzO^MccEJyQ|L`gE$D&=G1%40&U{(wouhDGkz6_)ZHgL*)2Mi}#4G`WQ-!m(Z($oSQHzKID zG~fbAKXtv(x^!mfRS+Vj-wT7lcuD+aTy&8N#sUN_;Xg!UoCQ*aM>1A{cq_Chu?~&v zptdwQ{+PMP#SAU502|Cbb-H4XiL{7bUA#+^iKhRjL{=F+w9 z!6VTo+oiB#JAK~6vSn;6N5;i_YHgR7U?>~raZh@Ixh|!L54C{7U=sh*OghvGEM9dD zXnPwD(u1*2(jR{R70?npBBNQ|V+{e`=@OPrncyc+CUBp-P1&_+Xz+Bj4wKyl z;gPttnx~vLxC8b0(le!qY>7Z?Z!bLkWRBNGuo(BU$`}r3B|KK~{P4B(`E{ci1 z^kni0d;7c`YeC`PEJkZsAhz&2kbXTOuzF^CIt`4Rj`zje;zPg1{o-TGbqq~a`uuax zg(9V`EpO@u#usF`o9VlAPzcq+mGB~r*>Tr{%{Ps$M_4FakIbF-d^X?%Q0lrU_1SY zHXI{Q;+m)2SBWxox-Y%??N_l>6KES_#aH<3Nz?K;grYazh*6H`94E^w#*AjzggBy3 z&is~EdldKLhYjK=%CH=t4O+@rZqmmcg!*j+=QYwpZ!_+whZrLTRbYX_bjNWnWPT&- z5{&&W)=J@}vd&taGd|Sg5cpEStYbkDRk(|_cJv}(w?!ObY4bs<-o}y&2fmF$5miQ> zUub&l@lay_8ECQlcWC)O)FK2)(f!Ni8Dox8(c}yEPEGzHbl)u@YG||B=qgbtR^lRA zLxU8YHKgpbV6_ft;gR&54gxf^90$!4?np~m=IMkcK?G{=Xle3quMhvfRu8+a>*KglS zA3vN(Q?sOtg%&kI@1J|_%9q`la*Oic0su~4^E`u(GvH*z7tL{T1Hj*Kgn89Y^MZ1( z?`d|CfYh_d8XF<#>mbZ1lR*VYHp%_n*h${_Kpc!>n3ST%n4F@jxZ-TZ)eOu<+Z9&w z_53Zo2h7LMTvi4t-s4)dOQHi)R6<%5P({>Bu4+8N}C*Y*J(T7B&1yk|a=UzzDW4F@MG#WxSsbl?sK+&9H ziZ5+VriCein8Fp+~AcpwPg$JVI-J-qz7O887KYjXiy7}2>5G)zUs2~uxnl1YXFamS< zT7nH^zOyR{LIk%0IWeAjzR9f&;#b&EJ1g9)nbbd6DPO;9i&q1n7Rm*8Ms6(qrs73))!v?^Nx001p(aEWdcHBD%8rg zv#aH4n0&K9jDaW)hS8KAEn*^Pe+m$$T4$UPQZ*W)-u@$e<*%n1Hc?zzLyOqam(C2l zh8rMSe}dZ^YUT-WokvT%n(jWllb*x6{JxHfcYx;vfXI?qRLmP^(M^46@DC6T7g@0$Cg7vk$_# z77*D^phS@2y7w|3%4YI>ZyV-q-d$6w>t1rHk ze)XH*V70mpiZTA61GkW?X%07)d-v}GQ#CM*#I8X2r>37Gv~(cQb}%+t^kDc|JFX*_ zoPwAN7xANY$z*xBZex|xUIi}4LQS*TgNBEPaAQ~iR+iHUaU8m;c_^r>Oi?hLUt}){ z1Y*k;Mk-*w9hM{LjEM$JTddobTDAjzAJJx%urUWEm}4!A72w2;r;DkvUF!~T&jzCg zxvTkf*3=qiw*?w58DM-Bsv_ROIV@k3IfMmpUq?E1pm zL}4NxEirF0?Ut{brLxJJ-rduCZ_j za?E&F0U~36R1e_JUiZL2Ip!vXrk|`P5S@XDwU%*jJ{LZW$8k{I5t=TS;ZS(XLV(w_ z8d<0CX!+LZC*YI&_R03>q8MWW9`&AdNb4Yz3#>Tb)|Xw~ydExnjBRnWuwROb$hkQZ zt5CnnF5$KUOQ$qJ%i|((A@UtsE)>3-NndQ;?cgk3>$E_L$4!^KxJax70%SYN#etn+ zXa&KvD8dflOoi2)rjwdnPvRb~y2h+mO|)YzJsKQrPv=L6p%-TBk=_A=fR>xHt*w%- 
z9>HDw!sFa$KIo|U#eQ3+|9tP{VLcvxhvhB5eew}s^?$sYX?#{Nn|`<+`qxMw?Q1(Y z{S%Mm9F8)C)jsCWY*29pqU(8QlLsYqS{AW>|-x{9FD(aK%aqK37Rq zeaO2Tlc{3kHfdisxY-UYlH3Gh-h>;iJ&~8H#8O?-Z5G^Pa*ztL6Yk*Nt2?~wSw5gf z-qH0Qb&AIj48Q@{$G|lXDzL)9$DFWcY|Os?O|(~cZLI(vwpGJ~ii8+u#y^7KDbEWr z{yY=+R9=WA;s-Cu6H?a^<5LOWQQwAjrw-wYvZ`5wZ2Sat!27`|?7)PI`;HJ&_HgOz z9Xg$!d+}uy+&x$ww$q0<@1>u=`);~5@ieUg7rJHYd9jOCdK%Z=h>0jByJ(ah@F=ba z5Jcm0ryQJ8&Nd`BGHBkp1H0@a#O>C6B$U1VI{vjv>lBr zWg+d^hCQBNVPmUKh@9FokB~fr+4Z+Apt|^zCr^axg_A9aWyCeh%zex6(cpM4yODs5 z6;Mtz+z5HA5J48%19s+T(yh1uEM57|-%s`3=dd8dQ1Q|c22 z=u0iBlkwF2-OL1PY7CT!IqeW8X#-}ikzlsL3s)(tF-<(Ug_{VPJYpr#T`<9D8INEv z=bixAyQ!5-MA!cC$807vly3g&KOkhdF&T9OIQxtPGAQ!_@TftzLG(ql89S7v@q0Is zo_Evmne$8rNQu^!FlkrQmVFr1c-Wz&{b3^GiojmLMga@QzmEsb0i<5O@*KoTxtu)i z-YZXClvltPSM$MRq2-MSx#VlbIT+1_>fw75Y`=?7uDhrpU*`!fS0XeGf~f29{e6U{ zZmwKL)?|te-4x8ZlbKfwqH9Pa8n})QHebQLO(bArB3+#7cgCt3v%i5*&ZX>Ln3m%y zgal&?Sh$;;5Q25Ga)G3)jTeA;G}oObTLRWB79qGUHB|Mmt9UO6b2r_&`!Pa>E<4Q# z1e0kNCbzm4rVif66rnYZl`w&4Fxm1V%uo&1qoz>^|Mp@KvL{nZXp+!+sNtpE3-ep} z*3yoZP5hAwOa*z_FLzEoGu%t?=~9}RU4$uHVtf?1aOnch^f{h>ItR0{&E{RF5C+}%No;4%~YCR7{|nk5B-(Cw77xQWlGmAgeVB= z#MBeuP`DyR90a1B{ZnSi7Ikz`LkiekvWZ;?_qzR~_VoP4Rstvc!jd=rWHJq&!KDHh z6q#*_Zhz|#>&dtS))g6%>P%Y+JT$pxg^xNmY&^i4QjOr!k}Vv&S_-i0R8%3HAp~^w z5wN&M3aTudnY1 zwV>UhgeJ257zZWe7I+3YG~jL*t+T@pkcy9A$(0RyX)P0%{{>*D!K>DM28m@ebm6fPLJHGO>jlMo!v zk6yqfW0EyC9=oFtjtmpL4PIjH%pypc>UI;YsSKEmOQcnVVCvUztcN>J+jjU#h*|D~ z*|n5zKDisWS+SwFrE=%F%hy>=srr1!NgFwPnmD(>`_z$f36~iui7Zc~Lv{(?e}bDWLE4AR6UG7N%EmGF;v(7_|CnRj z5p9>3q*>eo22IqFzs=YsR0zAf5NfbOxH(Z!2mxWe;9(iZf)Owti1cRovY3SStL96V5ji7meFmKSenor$(LaD~@ZG?&y1Vb71L##eE)T6KeE)1lD z>v~op6fUETx0cowgdr`sCU9^*Dk#(Xq_?JK*YYo%X*G~xb{*b9lS+3&ATbul`aFC_ zAw{7|4R&a;`RzJ%{rQ|szU5el<6dGtampG6MmcfqibFRLF3x}z7UyVA(YHq5~&`CGA z>+0&K&rRUZp)|wZQ!`WJjO%iSA+WBoPU534=%uhpe}VxwrVBcS$3zVYEKry82ACQ* zz8YX?V_j9dN*u=y?m~nDe21$sBPz^>0!@Cy0WKfHsddB=1D{WeYe1o(9nVCHQp!rc_Z&Zdrmk#zQz zZ>NsIk@UO2`SY|nbqgkHBQ!#g&`cQ1W8m0C&uN?~Xdm6aOAN$D8oK-~?SU~y`yjdO 
z|Hw#`$D5}8oezeGBWj7S_!+;yxd_MHBhOEHuunW*o^hVa&xyr~r($@y9DQ>>a|4dQ zMXt_6EJ7AvJ3hTvu3SmCZ&w7f12kfp(8nJ}9)?meOJ+oEa}`8abGvRWxlh&uL4l}g zrq{eHL9fM~#ae8gb@^1G$xnU+AOIQugLNG~@hzr9Gse(<*slcmQd;_+yKVLcd1*FU)l6T2G>u-cV&u*WKXjeGTI0SOE?h_1W4GZ?eA6@q~o zn`#JC6O70<%*W^V?|?MM*~>-kKNblx-I&cqi^9|8yXn#7Ergv7cG_j%6*Sw8SV0E* z2WY3+k4Qy{U%Xn(C4ykpo{A4{-FuSu&!8>6^n+mPHR*P=j50dR2uQW^h%6AJ1W+R2 z?Bbr`e7crQg5cam_6tW}1QY)y)+x0DAX?ZA$ab+yH+UGt?OH&KvWD+F_@)&h>j2ln z5wcCyx4<+xndL)}@*{kP))%~_hSqtO8Fsh&ZG@^wB}@4>!93@wDfS6_NDz4yU`gaqhL%22Ck-0Km9C%sokXP+-OI7GVzrD@1=eM>U(>8!GAJ! zwYZpJVdFiSGspFd1A@!)iGXYHk;IqL2qq$@yd;nxm~H`oj*={34lU&B>ITe)mMMe# z=qtoFCvF+D2+8}5oQ6x5Y;G zf@x0|M+V6zN5&@@Esa;7zV-j5pZ@!QpZ>wW_}A%`H~%127`FiPs@sD*3s*9?T6zO! zu}%q_GUi%<-B`@{EH$kM2#i^a52Hi-6cB4`S#!XJdo!IH#0tx}pE+}uEMh3d=$l$; z@j|SH@*44aF*klIEO=j~h6y-?%OZ$dx2_#@A$6$%={xOMrQJAQlKzG&vL7R2;3&Av=_q_~Pwz&p( zPMH@K4z<8V-l)ue(SNRo#$31*_R4_l7)4+&*TbSFZVeC?3&7Z^dl50XxdF7yh;H4YT%T%)L*-mE)0$mC&l&zGrA44{+UyD z?4Y(-kc<{ZMoU6dEM?0j4CSG(+$@6j?G&&B)BZOOc z2-l3-4K3NBh$+U?Ik7_F*0;(4;d_p?ecvO=z=P>ET!g1n%RbiV(`;WrnVY)}xL4Eu zO5CX#=gT9nq${t#2F|QXV^8MOPk;G-`uPVRk)~Qpa-NHQ0^0*uQk83ApE-#(M~lmU z0yu2Q*BxArxv+C_UViQy9zR(`yvKzw`N#k;U`0S_BbtW>;t2LY+tx}a(<0iG8c6$oNE6=o6cmb!H$I>5`wrlVan7*se6U)5b~%6(SVxZazyYe6WG!@nBMrK zf0lmzv%exs!8Xz_E-`iFSjYUX_fj1|xJKq1v-EWI4yGUe)Bi5r`~5G|gX`}C(3pzR z&<5T3n!aiMHeJ{JlSw4tmehIfDjF;Z4Fjb++1HQm*A!H~^yRt8zlv*l)W7-NZ@N@ZmI9- zJKs)5BDM9wuz8E?q4lE-5q&M1XM=rfm_s#MdCWjr5Woi9Hr#pF`XZ$wK2e&2b+3>~ z7Y-KYaOFXRfAz{M5z9~_mFd&zdk~o|n6WALe7H-(fu0ENY@;nL5bcS{+VpVjcC@3r zvyY&_bLpLT-XnwmX8n0Rnw@@p>FPr9muT%T~4Vv z&9HApCEC+=#%&$J{D4@3S6_J%E6-XQo1Dc$0s|rA2`;jC1yN9#Q_~t+Qt*T8-XjPC z&)Fx5g)kfq_8u2Q0O8yANKl^<;eYP{5zto>zY-0Cc+N5P2gNM|;md>3!3j@E1#cH14xKSZIs$DkIP9qGefzZWTwZ{rOt}B^9 zU0HQO^3WZ|`P+j)8k@6%=6m$aaC$O6i6UhqZO;Ph%(wfwXw5M5lfsf?+}6fsYRr-M zva+B8*HEl&#Dz3y&W|w`J``3ma zS>kzjxpm?!Lt)N)F@Ll{G~(O}10LcQcYOCfpXcGY7C21?1Xso4aGs-$w6g}Es<2$} 
zj<92yzFulPBI{IJYR5IhK+^~W#l=YLmu@V&>d6et_?i-0&AF~XwowMfQ#=eT@Kl}Li={LXrRqCSu=TG%PlX1TU2i6fxj>Z5+j4W<#O=+O}5=f{seenC= zq<8=37wO;p+y5oCtI<1y5_>Plog=qd+&qY(-h;%DR4 zG@baI+Tfn<9$f7{#LC(XgNKWf|9DQpKseI<%IA1I#*j?<~syV+T=gsC)z7$ajg~@ z=mSSPZnr9rIrgV-zoq6;#a&Xo>ciIYyhmlX}0J|)6{^wX6g`n>JcU( z1rdKGT@i0Yf&`mDf5H`zD*2ha!n>}P%)sVK-J_-sUg&V&J(=9wBg4CkQf0S-s$L3P z8>AWs9$FDTJCHh>$Y6L>*~;d^xNcT%vX2ZggT#?lW6@PHR>8)o$T7GL!Q&cJp(P49 zkjKM>#k+ifn{CF+I(Og;w#^JiANkL7WxPUJjFmkZ!M z`K0I8>&H1LH|n@877@*~z&P;@4sXC9?rY|YpyY})e)B`5{MB^jwI8M<%x~^=8;or< zhSzeAY|bzPp2iA!_Mx3^!&H~QR+H%F|ZDckztoM>RHDp0e2GR z#s-)btPJUZ_z;+og*klL2XJw~C?qOb2XV_3ni)n5ne#_0LG}s}1co92SMwePhLx~@ z2j@X1%emX!BC{0z>FH-qnH!AEAZ^}2_&CD-seg1d{pbI9GdAcuV7@Wqu?Csk6NhzL zhp-rRIN`v6fQW^tQ96PsKNSFg%EieAM~I7;=LGKiKmRPfu{zsZNnI;Wffoo zb|@|gJ8)U4&y3GgxQvh5Oe+#@3*o`FCez_~_>9*>bIfng<2po1I1R>vb*)UG&%4Z* zjQ+P?xtxCc!Oe8>(z*1#AHJME|NLQk`@eh!+{3i6cZY3o1FyBUt<>DUnEu&6|49U@ zYn>Zq%>MGNzf=L3PGN;oWBBONqhOMTv7G8ZJ@)Wn>g?}Lebmbgz9KFr!&pT~9!;yE`|) zwCh@!nL1o#agTD}2~%Zzt&#q_UxTqOnJE<%MHIqH7A6WjX3GE}XFTnYpT8Zm{?+HM zrfxG70fk{ni8o_`qFarJ42^3ny35V@=6VPVlQ(SeEcbl<)@uf z3yy!bInMpEzWff}jcA;~KhhdF7&mmS3!F|Fd{?sODng*(eJtC$mQ|$-hVGOvwjSO-@}qos|6i59ktXEYz;HQ9b7d@L)~7V{>gvy z!}QtRN9q6g-~YGtKmM=(F7;tOYQd7Yy0D0)0|wvn!M7?df>|sNim)K#TnS@V^j&xe zMF3;qm?;1%D7=Ps%(%1_;v8(R7JtVnXe{vM+HmYWyzUy3!Jn9%48~r@%x9cSzm17< zE_|jCE_^qa!(6bQJXhcddMbc;hzkOTssznKhBRYoiTy5sN%x0PAh4e*vn=0__e|I9 z8g)JUvF|=>{eIj@TURvKjENOI0wfrB;lqB}3O}~paqt1lc9XgeEXBj;E~bYFEr+;$ zMr;(YP~Z>8*s-h!Z{fE`k@)^=xq{EBDB)s5KPnVBSzj4Sg|!Tew#7vfb;ghUCt#IR z%P!7yJM&!FzAvu(VEJQ7oV)Hb4dn;@i?_THHK|}Ir8cZU9mMEu;EKD2qWK7X5iV%) zJou71frSR=QWi)X5jxYe7V`9>5?nmpb>%LfN?TLMHipbs<{oa2WGXD7z_ASeAJ3g_5O%E};WCrSlan5;FaIeT>}1+83o52G-pSK!cTc|d zmGknD;_!#h%e(`W%ngVX|Iw#+(+xDgm%shTsRkF6s(KhEo{iSU z^S;9g@5sE^IFT6;43fab15HJf+7pPK2v1^X+SWagyA#w(Al2)C^yBpThwrBkKm1*K z{>qDN6x0Ur3f5>Ggq7`;v^)4hjeIHE2ycSvCG zKX`!4({yO6&W>D$Ff_1%)Li%#ufuRxV!m?TyAe9Fu!s=5S$dP`VaIFr;leeO?x{Jm>wgg|YdLywhez%#?` 
zwdvbehS>y?yx{w!43%+JoBWjnsAK+I*BQ9@VqKZhM~g}|!SXejPbxWgw{RJOeKfne zDT;i<%0Rs$cgI2Pq~Bs*;Yba=Ns>al&^Cl&$$x8=Y%mArl<;EwXd*=lK>8yNx_ z4w<6(rcN124;d=!0sJVFAX;GU{P_#%)6ec8NL)>K?-L_LVBB-K`Hh@8&nA9k+`__Y znO&W2xGN1}LBi^Zw)az9;a*^qt}D-9!!7O;0H~6TI@6595yEUw)IUy)gu?l`b0g^? z%o>tG2%)iuhl~?AP9J+(W z-v8((OdAB#*IcU%kxW@179j<-4y?ECJgddWRJ>ZY9Um25YN;J_M@0rnB5gY);9{6K zr)?yd%*xI( z1sDG5uCSejB);k2K5O6f$!E{L9Bp)$-wgx^)4|F#H?Zc!+aaL(JeG|-r+MwT4$Nfd z(TEa6rru|*+qxVBH;fZT{Oxte-uC#eLRW5^zZoCanQ){fMIl$1bG-+wQMc|Y_&!r8lKU0q%ala5mG7c%;CU;rt85l)jaEf=JZ^Q{y9GY0dHDyLX zg#M}=OXEAzV4*&s&}xtWc`=V!F?Nnwabzg=k&`{JwET=$UH3s-89U>X#P6Y4;2Go1 z6cFkuvqj;@bq&1gPU~Eja4VuIk;niF%za}F=mW4L4Hh3*mgC=sb-WvTRfAB1qZo6k z<<@5qB50pwSHLGUQEm<24SY8K^odLIvi>I-R;O)KYV%K<`|f{ZlO@et?d z{UBV+2mzD{bYoS8pWu9pF0bI;v6BT=#c3ZIvD=$XI@H^9;5zqMvi3Ypvm`M2PMMzaDlaRH9Ji!6 z^X2m4E1X7o@i(7&rQ94&bGgMU&wf)Q|Li~OnAwO;MtDfVSc%zvYvXD9-8(-|*S_@! zsi_BGAZSH~RsZzk(cs-+dbloPQL}1vlv+Cd*L9h2S6_p~4$<;d6I@$Ej^s`J+UvUp z)A`r`9zm!L>EpNmbGmqe9Ys$M0vJFs`TT1Meln15b#^T6G@@~ZIs5jH|A#bj{jGHG zcW@8Wk~`7#}!D|b{O#2fL64F z^q;n2XK#l7ZD6(@)!o`u_>V`Q+59{>xW%-2kLbYLQiiyrtJ<0&`O#lpl<L>eoR31Ea~*{F9q?L&;0pI7hnKL^}4$P@g>dO{vz7!1!5Ei5CjMc#}B@f!0Itl z7T$Sym)HqXgFb%3T5d{@XJxvwl}nA+CP;n?_ZVG0Y6;kNgP>1tzfXGEm&rCY92*Gr zbdr(|>9~=-3*LP5rI@xVkfIEPsSSCRxlm8)+yWf5Gp{q|Er39H;o!$1L2n)HFENjO z?AS>FTPyR2;39KXiFv;chU@HUHfdTwi^TIX@d?e*R}Ko)lx}0J%X$PgAc-`(q{W5d z>Ljpng}4vlU~6_h(#r}f3Jx*|x{ru6WJvtTOi?1`xmG9@ydeF^ur6a`RyQ^xgy|M^ z)t(D~!m~_?hgwlL?ecvET;WyMpK!0_9XB#_opTr1f(YVc{oujvbZ+Ea`uxsAvMpU? 
zZwNF`1j=@FH6UP&;7dIm489wjJ$!(W@n{9Nq}j+cwT7^D?u>hM5cfh_UGbO9j+O@n zkI~U{z!000!R(wK8X_2*%?Md5rU=%xX@yvrZl0f;U!bj-NwhDHee{9$=Sf8S#(Zxp zoO7)jK|+nLTEzpHS7E6Q7d>~WR!9?_%18XGB5N1UP+?m4#=RR>{oxNn6Y8Xh3B(0!|s4E1#00^fq<5KS#>t|=I_kae5 z1vm(yb@(bTQEy=5D2!|6ay+S`e#gY~m~EfooZ{mBc$915m+mtNNk?i`dD@!cNaEbh z58*xNcU>C2c!@Z>(KNvvEzeEk*PrRCI7ucf+-CSTt>zYV>BILwAPGc6>cfgqt(sI} zm%8BX!f$#D=Awc)ynYfdtP{(1M0`b zLf@SiQ^>kLWS(6YfD#C$+53o33*)&y1ihT8@J_R|`c!JDkhVOhfCuuRv$n zXU?2SDq~!0UUx8UgE)E;p*P38Wy_xM6L7i1M%V`rFZkb1u|{&{ALmJWRUa0Kl?5`qk(N8t!`y!R;9S(<&Ro}k1rmH{+(|g7 z1iqtC+9}iPd-4%chgHUCL_?x~_AvzL^8DGXuRENPfa{n5pWt`-K`&X48KnY01kV?b zFy71qZBcM*L`l}y)kCaaFE|D(5_`jl=cxn`2r)i9 z;8@TXnRhTCzdX;ltk^#ya1YP$3U%^4Yu>cHF|=O9%I|m{R-#NsuqBBX!XUVnJ}#Cn zrK(aJaCIuRs$3)+B#la+-FuksqF6e>Qom2akzM$PeKvx10a%u82)GdzsJnvAkE^{= zEWYBK7MGE1o?fpz8GehVLfQq$0%h(ydgna| z=ZkNoo^#hQ3n9TlXds`oiw9_&M1(cX=(&uTXkOJs+T7L|rc|@?v=K9*KU)Q6G8eNz zv_r1{1~dvi&wo384-?+?xBhQZxGko^OW(qz1{4A`tS!b9y%}i&muB};Ti59{{KkKq z>Ia@nx8MGYbTB^#Afp{YV_Jc^uEILN2C%e2V1Rl0rY02TEatcvf;^A<`AH0mF^_RC zKT^hAdEClvj(IQcf6aICzTd}ia$IE|+kD+ADJI41wzGJL7s}@`@&5FrJntu~RRE29 zT#UtL|9y@L>Fnsn*D*8nGW~7^uQq)T#?Xu&#t-!O52Z#nqH(??pxJB)5&j*1MPcIus;NT)q&A|Vj$Byy5*w84yPIh3#-B_EbKF2q`a>4IyC*l2 zv;Tk~)Mgl-#_E>z!udZSdlOm&+Fyg#qM11?;di;l=4&+|RD&YTIjt6Fiqw>5S(1pF z($?%-8~An~tS_VKTF2j!Jt#1jxhdQ$SD#O#XGdWq7a)2tc3Lc0b4zFpWttnCVEnh( zcngh91KPXk>BV5ewh7*T1miitxO^YYv#wQ33maI$n%Q`c!0b)aB{IfrvUTa~MRsuC zMiA-DLL19L#;lf15T(6(G6p@D-gxnK2uBq=*KQ+ZEE1d1hMzUeF+zi@cH4!6U#40m zXb2+hm2Qy z>>=Q;!!oIE?000uHS;6xhyyOybch9-a6Jf?I~xS4gU71S6qQgY=u!jOL6gId^OPe2 z5Fg2$x$lkINEv2vfUpz+eijN)il1C#8Qudl33?;o6qmQxUOksCoI3|oL3TCV(54Z3 zpFCM74fGztly3HupEpP|U>!hlcX~)bXzA*qe45&4u^bHacj3}8k)FGJF5SL24#Uzz zaQsNR_jo)#p1g_GryW6pJxLH!SII7>#dLDw5!SI+n9~v&w0g-LcPechY~seYNH(Si z2ovpL^)g1{^yy(NRSS{g^(6$*7PM{CpWjTg566gyA)^k#+6vKi_z#yTZ*Nam4rmK- zt)M8bWGWHjfko!)1cW zHVOp65IcvID=rkzd${K9i&m$~GQN#XrMQgq4IjR-MpVn-02F3OEvb9}ZXn9shvM_T zaN@I+;)`68h}%xahL8VT_eU@#HPkPCweCvxvpB>8a;OE3ewdooOmhT^q!F@fCGbiQ 
zn)d|~B4`OWcBF(`o{S_KtaSV4CuxWIJw^OWJ@`dfJ=7h;bwOLVw6FpjMjL@OVSr#> zQ3M}?FFR2B?UGHXvHd80?+jh%)3l8hl{uw zg-^NT$MZo4C|zZx%&P*4hvVzL5GEAx`5%I*Q!PG<@_`@5p-SsCk4pNTU!q>?|vr$EqE;V*)GV}ImSCEL*X^R zQFW{*oXVgpjE##01B4a;`zPkIir`uW0^yE%_m1sSV3jBIS@}iB&`;(aT#30(E-^>E zZxQ_SPNcr(TY5KSSQ_STPEFkKN1(y=3SJk zx>akD7v_x($o)2f0DCylfPJz#Egnv$1>gyh7bA@^Ix2h+5~vTId(dC@AWf^#n!qUp z7Ib4B>V}T&R<_}eBwvKJp0uwj4ek3OG)X!wjOivU?%0=S<)VrXM@wO-62Vm~i0(q7 zlAy`pwo0rAyu`q;o){^~9nB;KKzYKSdX8eOSW zSsjugN`ACj+=>fgsd*sPkZr0Fx2me@{WJ!C{?+5BtOepVw%B6|o<494b$t=g{e^H( zvqP{Kg1+_;Ir{G_2OV?}Xa$o- zJI?zK4B_<;*d=u*oqGP2bclIZO@PFaWgc(a*~;?bp%FHfD@T!4LH~H&n>5RU%>!V3 zsRZI!Cx3nuli!3;@cO^}cj<$-{-3n)+h3%M&pn^2I?iBrG6R5wi+ad+yfXVBHForn z{E-!@002M$Nklzztmuv7H2}q%7QVSu& zr#bn?jq3O5CeoQ-S zYDL$R2xxeOj7?_n7<-7ObrEES1_Ys0`0F7<(>6Y$5`!i7iD+s<*doANt+GDpl}B}0 zIWW(IfFC~@0}qta;1C-lLCm|*5UrvRxPzv^yoj!8%h2v1|y3^Nwqn^A}7Qzr7Z4c=Ft; zVsY9QnYzQ!+;?`If+*lN0ijT9h_?d*BJLM3WHL4e_`8vnFdu7Er4FML!vnOn3hTi^`sBuE>D8Ctgh|S- z4cA_LG5y6`e-*AA!>4-)!k>w{2m09mV+z592D*u;{>AQ6-3t@+tKVM-2HhjTEOaD< zAbs)%fw^vCRs#dP%e6||2py2IcRlJNqDz-+wG9P|T116N-JE(!^9gIqnu1vp`7xI< zRkTD4%_e=>Mx)M1#``kk1{oiLkE^gE0s*leGja48@h9Z~lbJcE$TIUJ@m6rYe}5dy z-atBk0YQR(i8tb%m}%SK`~3IxixcWH-bP>wh~=JRj9`zlO!ql@5p5TJn*!C>6#$|f zeuTS%-%hY29qsffK0F_|1tBr|%5B%J_}p}pl@)6GGxH>k%`R~%ow;;ATZh(`5$5Je z>cM*NAZ6;&5SeJE(%qZ45#AcHh_(i!x=;D;%&qlO>yQGU<1ACNLcB@||7ik^Y%J*_ zNuod-jWg?o>AFoYaV1=h!hz!^L!`DiYnF*E1Lvr@w|nAbpYfw2!jEo3GK8b&MgxD% zl4jQQ&%&J+Dy?z8=XfZHxWS);rGOz`a2;5O%(*mCJQiAi#v{grckkW17inf)PhRs7 zm$;r31PyAIcK!dPz3Fpa*?Hi10KC|DfB>OBi9EXig;qZXPcmCh@Q?j)yqI zGRYR$QCn?(WjKp7zsWNWUyeulcl?YDbDeUcW8Xzq#VvnxSk}>zX|vq#%xK2W_9@7y zLTl}{b=)Otl8m(BZ*@$sG++j6k{iD5V8J0X?RsGV6t!2HLaKS(hIrl!1{9M@e0%oM zj2rFDK2nHO$(&=ffgc46_5o|Rgg@u}F-XH4>kylA?u@1SiqSMzJ%Z^Nu*WzGfq^>$ zAK5S&hRjgWQm!Qwxy>i`k)S;$wM%22a519Ikq2ekUgwuR95h7vxBFWhfcK)3X687F zOL#8hl|%XBQ->ydleK@1aPeE~gkz@s8mt@|2!2q4p|8N4dJp<$8-c_8@(g|u_agl8 zBK|>zxvcml6a&;p*zmAk#UY%;@>$#{&L%pZu(&VO6wgF#Atj*)LUC{`L1UD+Ei 
z9Jw;ymYc41fyW$R3khqkE{qhNs#>pJZ3IX`Frq!j>Q~N##uSIoW3Y-FakfoIH@qyy zI;!Ot9})3C<~ZIz4qi1#w#074lea&>hH)XCX3?!}>L#~L>{uZsB-Y;vP*g@Bb8uA0 zZh>qZyc(DS?|UeDwhuP2i!7nFDFFcQ|LEVQn?L`H^zieKQ`f*+8a#8J1sJmt06xqY z0Hn3qaZFIyak;B6edCXQl%5Xsrw1SYEbY%vvxdQ`LlEVq#{RTc{IUxTd(eeIy;k_cCFErD2qbfggwFM&xq;p>p;4b)a6(gP2O_zvQ_^gdCVuODPD19ci!0B?cieKrvaKy7 zjtk=+2D#57?aWcHNXG)@`!WwMRx*4A{CW2Hy0dMO7?cV8>!1D#A_h$% z?`zAiEqO-hEE@a{&8u1Qj`_3nnrqbGDXql60~NmJc9XM<^z#xLn$FlHPIIKE!0!$= z+%1jm5G@@LS|@SntVxp-&skI^(azBivtup0&F39zDVw>VzuoFRqR_^kxG1_`%jB6Sc8_LY6M2XmpOKw_*4 zdu+835<($K1eQt-FcC<+H`pg4V}6(aj9+)amy4L=Y%-7R;xM&f9#-*Q5MuO`1h7n6 z-fKZzkjLR@Z0tdB18H^mmXOqp>(&=vVh%yHA&AkCo(c?+l2a@w)FBTmck^LBrV%<1 zhlYGZ8XI@b1CoPEc*UcTDrWGI(C}T3BHzH*{x0Xt{XO9yB1%&V=bF1w8}`#{iFJlA z@--8emJ%Acp$l)aF9h(o^J(k}llBY`OMHBs*4>@FA2SDcvCW}LwT;&AD{s>07}+-x zddNH=UdWk3th2CMg=Tz{Xj-#4=a&!yu4DG+W`CM|NF;_i8T}P|@<8A(QYrDUM6*P< zhk1Jl@2;Kw^toDA{SDkKRa0R1sBp}jFU8>)2MjUi3f%6w1zc@}q=FbPh$7$Dq{;G3 zSDbzBM_(J)RpCWgk~s9Rt)W3IKeH{G$IAS=TYWQ(uYxBTV(a5Zm_;Nq%d{-d;x+|{ z`fP;LHTPw7tgp`1)^YmKHT(Z_*e7njE56+>Gh5O|n&Vk_@y9xQ zYfV2Ts0`ZED)4nk_IXBc0)uGaS)*Me+&Itu5tteDJErHXm+pO31%1?rz^qa!J->}| z8FUllC_YjEWWQCxe2M;TSMk~0VgpvdbFTvb*H-#gIvABf7P>Rf7^@tDnHQ~-uo(SP z9xLvdms)gZdu)uyJ8IG`)3KJGv>)wFX+Q6W-&{E#CQm%GpG|WJkNh$MWFp$*&MNr! 
z`Q5ZIdMzywxTON&n87`28&9Oo*8bFR@?vTlxR`dE2GgVYD$>lRn|Gf=x2Uy2m_vVS z)|D9u=_EhqFCCGYY)keY>)O$c;{av?kK$f&9hte1mVwCU1*cMOJIyaovS^|U8=VD$ zi6|=|K+_2<0oSA!IL*g3q?NFcs-kz)`}2;>#VV>m(TV;dzFQ1vl)Ivp6}(Z z^}aPN4&NnZq>+Q-DjpT;Hl>=;7Tz(RuRDtS)}dS(Lave)Vq^~h-XrGl+_Sql;B2I` zZ+;t%KsSU41PNTOhse;QsF(Seryp(dHF?mZmb?JXj&45Uwg_0zO8 z`Z@VxN65G3+UAhoR0UbKzaNq`oeof zJ&Nm>pB49orQ*mn%`oP@l1J7P8xg^4h;Fc{K6npqfG z#Kpb4OLhEKdV*;R_!UWCHy;*X2&c=JFQ--P;%{Rgt}lh~z6PRqK$}s~t1=g^8<2Pp z48F=cI;km@gQ#fi#FUFDY%prAI9!7V!))uu=YN?lyrO1jh&t`0?|=K-5rck*wmhI8 zCm}@F(4H9ir?Xw@FSX~{3OHeObRul>tMQ`{4G4#5X-OmA-~vavEi_6?8`J69!>u$r z*MCH@jQL{bebBUN*{c1Ee&A$BttVj4>5^tt@ytN8*K@J?0-bvqOA}~z#K+% zF@`RYw#5ZPZC*75fDX_KSZawDW*BAkGHjfOy>~OxZX%D_|1$78{!XE(GmNa7VDp!V zV=$thV==u99(n#z&^#B;b9R6Jgf(V+{f(9Awwe3bgTR%4~uJM(l=du7U#f zG85sY`0Sts&{VPoa~h+kg^x|Znn^PgYn{Rz*fxR#O?CY^QN&mv!wQOVK z%~`XAP z=4CU)Ia`dU(t3$R56wK4#?J#UGccVt)Al((v~M@`wnVwo;VJN(@FuYxLIpGM1&*LM z%6;SSU|RV7`0?ZT$dowm)H>Kc548y%j*I2_-0_cVo)v4H{SfAN%=KU>m`?@qXF2wp zzjcl-qNd5%TUKET?Q`??p3JNaf@AJsdE$)XxeOBxqt6L5%zhj8x6SnIee21IfBv{B zK947+@j-qVbuw+F=dS%4ZRA>MzLCt_NRuuNl<0c_qZAgx6LO0pqXJO^Uk;>ov81R( z-j?GQ$7k9}jm<(WhI-Ykye*Dv48$xKTo@uO&oDUb3f-uwH`h4XQiggbeO(u5Ny z;ptg~%biH4tGeGp+)-Q4pa#l4D$^=N2bK%UgP!Z*bsmVK7DfFWaIbw{+{+E={u z51*a9+QJ!8Lt;S(nYF1W>F$SH>CEM~Qd`eyD3)fx%RaNO?WuCPyj!%!vw$zqy2CZ- zC{$C{pu{4&%K%r9_s;S&oPJuOQ+tK%&&F3($ z;{#QG_wZTy_wm!q&wkhMqO5?wsEU1h*u%%O{Mu_LzW)wMEz+I~ue`Wwk3sUP zl@BmwxDNq!B0YTg1P0+jnt|x5szP;I+mwEAk>h= zsA6|mY>eciN>PcFcB?V)O!|(_4VZx@Bv17a>aRpO(_=(c(qxB8YE0~!nkKXvRqQ{2 ziOD?L42Zva!uZv}_=p6R)NWEw)@BGr%xsX-PP1e1(=qJx(Ok4b5gWqM*y>x9zk!5X z=U$^nZJ=?`QGDps0Cu?S%*i6%(8LbgEb@~R)2XkwE$%&h_z)s|BVzQse}#J>h@3Rh zTV5m-W;;Y61XC0K8#2Hp{5GnIW87FrDA#Q?Ae8<7yWQ#W(|f4x=P3iE+^~@*r|-Zh z?WA|#`eTSQ{eqbPm}AXCc+AX9paD69GjLDDrgx6`SqJ$~PE5jlw5I21bJVbij*T|u z&f}egD%}9P8S3`^b0B+|fhEkPR>*sF9)t=J-r9PZvHu8*3VG?IpF@})&9MTOUCg}}hT}u- zs@Q{gVeu468}TWJIfbZH8cG$^QsjjP=p~N=u^WX*gfET}o`ZmdP?8viBuu9n%RRz9 z?yTdRfG@`ki}gCpWThrWs}*%8z2)y;p=R|b6mEL{l-86?k*13^0 
zz-T9Zy^=ou^cu`F(qQ|FLLWRDr~OM|qNyOr4b2`uc@%vq@wbMl3t>+J4z2THo56YJ zQ3d1F05P+Iz5cdlt=^}tE}}JR1R%Uhj6)L}Ex!HgQ2O5Qe?Oh*KFR#)BF6oG`ak~9 z|B!zC*T29#0egLZ+X)htMs;!d-bIUe@k}4(;Aqc0CHolD7}e-B64Alp@k}hbL|RY# z-F{VTjI3Z`3I=tF=qiA=gG_m0!WQFbx%tRrWmFT0X8{f1t^vTbciyIdwk4)*Igf%d z<^B$3L`;6>a}ZOGXXML6%W`0OM@^}OiEzLSPwz24jYx5AuSx7rg-_APzzE5rj0|s@ zd+et--|xlT^=aM3MSNT0Z+DF!9tZA6VZ27%x8HICDfW&NLWTWl;KQ>-Yr+n z{6TXt4K`1K0cMbHT(7>a+tA{4# z5UCRUb2tju;lwPKw~uYBM7?t%8cH2(cQLo3PUPpktc7A;QKt7}4wI)0g>AB2=UwDQ z+4>!L_*-q5INUxDe;CRZR@@gtBGm9nEO2E3iQwYniu>r46?|o@2aoHO(>(f7%|~;=DQj*fOFl+GcxlyUE5u2F7?C zeQQSLqH-VmJ8m3Z(95xNaR;|RY(jVS6=z;>-x%uS?bg={O=^^S|D|mVv93|{cgU@!Ie6%~v znCK_9==5+j+$vcg>k?y+R(`LkFDMLo@sGLX6$s5H{&YMBB z%mz#;8tQ9_%%P25K6Lt8q$66Jut8t|j`A;36nW)yZilrke<<7Ic>CgpUnY4eo1*PL z{@QnZ;8D(Z2Ng-ElJ^ng3COf%hfUwsXP&SL+*%sCw3)ikTs(~2bXfoL`*x+5MZFy$ z!;#4)Z!u{8-iH_~;WTLLxA4N-U}LW)%ryEhelyiKx2I44`Y+Ob%x;#^N(?{*H?(#} zvviiO!dYhv(*yS!sBh~|m*4wA*!F(*$tPeL`C7UGKsp@Dxbrn%R`PJ%zvlU|TSc29 zPsi4dwy*grV1U~keiivf0&jAVDsCSw2KoNq_g&7}_e^Gyd@LbP+fa$RVgWrCX9zS}DufaOnF=Lw7Krn5xc^5JEjgmtbE-qY4 zB=Pj-;+si)oP#fgrrD6O9nwl@P-GCRE(x#}n!K z1SWBVL>gdt+&s#d-EGus5Oag)TEXjYHMu56&fsHuaQO0#Ybdgi&`Mu8yff0 zsnfj?foB^Npb2AZ)47iLBBl!`v1K=mrkfx>xc?}k(70#7&L-pm^_`x^3<-i>1S2zZ z>(<@W&+f7=XiB4ndnr1Ks;}xBUDzO1qajxPpPFFM>S)! 
zy?~wnI%-%Y-JgASGfZXPeeWI2G%C{|dF-`9qyPjvINoit0iAI;rsmPoAURdCxnU6P2m>qBD})=rKR08%Odk|0x31muHU$s zzV+UBB4U(j22Ty8>)@L3$p_j;HR1) zpfQjr&|G6~23vTEw5X_rt!4{ZYY;@c_=)}wqK&f*87vcGYoxN~JkqzK*OiR18B9d7*iRSa7%H5#~ZYgh)6%<7Dq4=E%}Ldd`24lj9uD$d>0= zDQE!(gaoSR-Cy4a&MwM5h$v^>J5sUmHRtywY8h$P@{xV7MflfeB+7<4TD(-i(e zk1!8glz7FL2Elzg0FG)fDXYT8-cF z9cD$`h6oa)NeH7cpxXMkMEiJXpP)YDU5Fq2onC$OG`>oUX?@{ln9XKi0vH$(8pMFjGt4&tm9cpW)+CVlt-=8_P@aPN@@0I+j{5t4x@ zh%`5klSvVlgQy2T$Sk=Fw0=d`uYa1}fB#$fbIekp47el=8QI!0fnx!hrzklM<~|7j zY$~Oh(SwH%5xfAi^t?%=MOR8Eiye^wzITy|^x*_9x@QGqSMx^w4qb!9K^vIM8uhFJ!A3nkGL34#VI)IJAGv@b z!7)3SC)wAUF3sWFHcLQ?RfH04Fc0TWoo3Gyb{0oCwT-r73w*e4df>$fVKM_wn0oqN zW-zqDRLr0f?pxGA#I5z3J|d9G;Ir8!A=zz*$^VB_rWsd`DWKT?5^j2oQ~cI7~#DbAnIo1s9+)(>nhBkyAa;JdhTRus%TDg_;)VM zPq8-y7Czf;X?(stJs6)wJHgsr4Rb&C5MQU($lvh3I|uYP)-Z;u&!+Lfs<25TEyJ>% z2hJ-GVa~NR4!QUMUTe*f2iS{pOAZIy=iXCMEvga-_>FUwNo}P0WS}Q4{|a3r1}Y^0G5L`(_bz6%9ZqtOVq*K;iAyegye3h8o!|6BWLcIf#sY-1 zE}gvvS9t&0&xqAbn8YjJ#H<4qe{6%!;GiB45tKO!ywt~qA`TYPT98NhNwApI)e&h# z__kOOr?uO}_u>HJ>_q=a`u3mxdAj+lpQQ28``BsErjg6xGpS#0cfRAeXHqm*I`_M6$z9`~7sXhW zSg=G=VAfdN?K3BxniU<0_YGQH9*?39{_(qi}q{#%wZrF_9VD zEvSLYnw|9e8^h^aEO;x3zkl_2f18?#k~1|v4?(WoauvI{V%pOr0g8<#$jNb?n_VZu z5~?_0t*#o0KE%l$+KN@oaw;%=IzVfXBYCtUb;KbLCL;D2pj~Y}jV#=i;j6HMros6B zZP*Z3Vz#3yc4rIQM>GaY%WJ`eNC57F9GkHn-6SHz)YLc;SMsi<_1LU;cGVKC2PbP5 z?@guEj9ouFS6A$vgP7ce$zkVe2&kU!K5XoV)91H7#Xfo+jYI>XBu@t;u?|9N2Uum( zmRZbp@OSFPAERTh5oV0I=}1SK5e`&Ux2I=>mb-iZw=ictL>_uhAFu~S(;pIUuMZVB zaKqeO#SdYV2tO;7JIey0315Swa`{TCWuZ{p)6&2KM0%M71fw8C2_g_7 z@H4**tbm8=uz^}#rG`Y4+JZRQ9UaBL2`5uF3o`6-CFW?n4#+XTMI;irEMj}y+|@x3 z;$8>YEhJ*Mzq|`U4LpM;YOzOmF9T>GU&@QK=tH;$H$_yD0`R=e{UC(6a8dElAH*Go zW05K2FLDa}WO@lU%M;&f)?v74H$8N30DHX>^Ae+XEThg(m_(o$4??8ByOb_nK9&Cc zfA}Qru4mk1)T|wh+c!}b==W}f7JXDVU_5W%zeit?CkX2K^XKWxh1b&UJK_!&LGX*F zEHa{N;BA?Sc1-&=@mG+TzYXJE2aK(;kcVFzZC_zw??ZbqH@n39*kq$giv2BsUvZqy z>U%3N0#sm=NMbTM_Su@Ee~xWBb)Gr+lV2k^f#{I%tf8&_y*7WYcw5YIe;SqMKT#zJCH!dn4>3`_V{ 
z2%0=QHPi>rS;JSVE!r=EX!JNG?5^uo)J=zK+gThEI`)-oxQ95pzrP=$%VNN={o9X3 zTfZ;IOBzTw>>Ih-FO#qkmO#(*1l+2^0 z*kYW*B80LTYFkmRC*ww*+c?zDO=Ft6mmSw51?9^;cqkCa;bseZEP8?`z?YXCj=y+D z1QzhegIM>uXB+a}G98y+Ird#159`Na+Y7oY-?3lGLt24nj@1&3^Db~v$40|W-wN>X z9^ppmOy+T3XEiOXf{<4Wrio?_xldgg8)1JHllUe5Oz^`K4{73h_3UuEgrI776{ej2 z+hHEA@)?EDUV~_E0IzGpWK|6Zy9F`d&gOd!_*HBYIcW|vrrpZHL?G|bT5KE53=O(< z)N;lRe%1WJn@0btnRE=qkK$Evhs3bEaJvDW5%XjqijSJpeC>MW_~g0ed{ax}+GhSn zTZ+DOOu+YKMBelS`PH-j_BU6jUIL)3%n_7A#sXOCMEEy!vL*GOU@ZY=?qKP#x6Pab zj<*m_yMAbxWb`IA#^ShGu&9P`pe!f+289Ja?sRSVU^Q1X->Ui*--iPnLtnm^ALr*@ z=fh6nUO*bxbW}N~czpOifMGJ}^FYu%m3$QNO_RU5$S;^Qh^?9u_W#Z}fIlNkL>D>0((yUWl^2NOZczH;^hb2eaqNy$fN+-=D=M$-Y{iVa!7j5tn z2`t=vBQ0F9wtaGusL*d;{{W77Ilcbw?_-z}4)Uwn{meE2tMaO7g@8@ZG!Kt|TJ z3V<`b>juubs#qIa+R}xKm(t^V515}ih35KualGwT=^u0Ck;vcW&fqmORfP;Ogfyp4 z2@Q*cv9AUE17=`(N<_Lmf-ja<CJDcNS9r7R?d&;G-YJzBIeDTA3`MqMy z3pmj+>+01v!#vH6uC^G5S_p?&=vaU>Pl|}&qCz=E-}JCDPo`$(XX zg9w_QeU2^rEC_R*=ruX;zy{2=jPn)*Rs))i{{B7?91i;|>_)4ZL3+9YQLPg&cr>J4 znhYpSZ>1iZw%AX0ll!)~>vR*+Aft?pKApg6cp)Uc?z&xrd7T8~?96l^-O;fJcK@2%|-4)h*&s#Ewp5ZTa%tSgemw z#uv{IlBN!c`dC_289|>NlyH)+FaQ7xDNLKQLr3wk@$tL~78-=hm#(CL_kaC6ULXuH zM3_ER>nz;bxa(MIG@HS}K0?y6vpHsDPbblEAzjZV2{JjV;L)-oZPe#&Wf}Yqk*H=& zRr(@M%raD7dr0U=#5QBnD*a=Gn+c-xY_Yf+{Ggg==C=xtQF>@3GM5O4Oo4%OTBqYv zr-tno@`jCMp2plaQ5+d}OBlusaP0YWT;e5(fc%S}2pZ_)b3+4!Azw>3o{W)awq@rL z*_x?iD$%J_pLLJn(di4ImP=(> z)PcG=KfSh1!indA4IJQ&%BE;g`IN*jr6Y6*`U_1Nroou8$#gbg+uhu($&A{avM&u| zwgV%h1iT*HH_up<(2Ul)-gq!SY@4u4wtQU4=^y$%{34LlR>EikenGS`5u$F@Xnlr< zdea-=rjKqvAt0>8HUt(3sEW9-wd5n=;+iHgBZ1qFR!IWT?p9FJgaBp{A%Q*mlBr0l%whC^1fIgw0p8QRkzCpT1`@5$1X5zrni_W0n_X*r8mP8)C19 z@Wv3P)$|2DM!pCo*`MY9^~n&bZmDepM>5DEC;-e!m;dM>o}m?)drq`1{p9TRuO3Bu z&1H@=V5T&E@yYd~EQh|FQD7|kB0xO`pveF0N-CZmoii9o-{pdG^Oelg#}LFjm~`*1 zL#X2Gvje`1Xk%z*)abgSwD{J3m7;X5m4OHbkXz4LYv+Lz%_n7mUmYAK=)(}9RaqCI zrJBJfi6>D<<;L_8vN2>14-be$()$vMH32HLd)o?mCJ*hI)>gh&MgZeG~lQG@VoB|Soz zxvv>6b2AVGd8l14M*R4=3!g{kl&(+W8DLr_9hf74iC~luuj9z0Sq6@;J|_ 
z8@GOOg+-yME4}`mKfz3(J$?4!Ptxe!&(TWEr&E_+PfZYg2LQI}K&2(ys0JlKT)k?U zH98{YbuX7|`O#11a}-$qdU0HjKRI?yHuhs6N>(v@brS@*iPKLLv58Cgi0rbc?*YgM zd_WT!SY8^JV>y21QusN7&ZEc2>wa`MU%i;#wpyUD6EpI#-%TwE)Vp)kLI zh<=vFCdWY_)D_8wZzxSxQkHY_+gdS|A)wn@Izeoh^|TRTYz`ZL`~bEf-i+#^Il%05 z7fsA|I_ikF^gLj?l330s*wjj!Xpxd&5D|u0$krh|j7u(*)HVS(EQ}LK_mnP5EU-ay z>TK>zy=Qufwp5?yvDvQEWC{}=+r5f3S4pvw5VbAqYY?P3*tel77y-d5^{z(hVuYkF z)ci^k7m0?`2kD<`xo?$z*xXQ zKokNhFf)y4M(RS7(F`+ZdnRVkJ}`C?O3gT=YYHiDaE9nNT?4TsLJnAVL}~X1#Q!u- zW(`&B7!T~9#4#ImLpcm9=1===Y_27Cbbz6nf`(umf`H;yh8LbA;o4hxhLB(j%}!*5O0p2R+vTv4Y%ed$L;+f}IK*U+5C-y;UM9E^0&1Ib+^1w7 z!fkG0DF~}&G5Ia*bIf* z3Ew+@I4^yuw63kWHeDL-CE&qKdWKdZXd0fo%=_=c9^y49>oqhwOAsHun07S)zgsNY zYwXr;TI;Q_+l~1&JMHN0f=PvFT*Ramri@M?4RtBQq)}jGtOC%fpMB#darzReQ8M0h zz!L!Ho&sCo0Kf=*17p8}iOcK;CXSexnFa7+_+Qtaoxl|bsfRg#=L>`bXsMdf5?O3D z^)JClM8qM8u?)=M64y#YG%==*gOUGqh;`hyS+jT0qE=!G`R2v5Fx8vs5h0hCS@-lK zq&V7{8Q@yRRi;wMTFs1%VB_RF$)=#LeG>5}&|ZE1nVR1$z}a!rFZx2l-SRx5pXiff z4tdY#G7ERmo*5wG^y$+vt}^4^^Xxd=XC8+P1lNYrwtPRP5r!rHWDqDkn=q25PKJSV z3qBnU%nM;xr)l$4I2rlVM>IK(qP&1{@)ZbdgY=DM2xh)7^VEHkU=(Z!w#T^MyLAsu z$^x)}e@;UyLLGeLU_9%9DK*?;zfJs}7G|f?*b_{*2ns^C@_vS)a`DGb8GdApFeuEh zUpy^WHlOl$W{^q1&$vUqW8U!N#hFrMlF#vsb{-!W$Jc&{_n9dY#|AQ4!W^<59JaUG z5rG2S!=jpbuHb1Gq1qBS!uhW-)JdR_OF@R(G3TIywN-GZgnVaPLwapsAa%f)tPx$Q z0;`S|d?V%-=Wr&5>1)D-8FTTuInAEdQwISjb}&0s$gkvopS5VOaU^ZkoZ=n=fNg?x zEFpNs=MhYw>br(ROIU-j*0mK~LJ|qmf$2G`4Hwq$0 zCasEu-YFJ48%oaJij2gqW}gqil6Oh$W+TO6Mf}9O<(u)FTd|8OjkUzEEjE=MP%gvV zq0f-M)#AXZ6CQa-nJJ>4w6&t%qv8huPe|SXy39f6GR?~u%+CoaKzWY)MFrz*`WMTN z-%%fn&22vZN|C2|70AXx4(&xWKbLE|K;U^r0l;aK1C9!4aq@Bc=of40>bJg|P7GYY zOu~Vq7x@UFbm-v`{ZjrFJ0z2@5^4Cr=BGMdUjujYT?LUzp2%Fg2x*HS&QONeM0Kg8K%bg5!ud8JZKz|{-(~E$ zO}ouITQ2Tw^o3zagnX@mQSBk@=Pv#>TS$OfupKuV&n7V22u!Kom9f#r)P=19ZOFN^ zXOQ3#6^g$pcFOL=0dSXyK0TN==?76=*^EQ^N_z6-H`qfzw9Ak%&qA=c>u6(o_INg( z?tdo`_uqZ^3$!Mq>H4R)Am~f!2Y>M2rVAri=v$DS^Ou_H@C_}%VTstS*7F0HQi@nx z+a7IG(r4;@knjcsnYclJ4b7okl(!`0ki-o3G^AJ0z8d|i)(Cc%K?6>uTbLs*r(6ae 
z+;i^wNBtdV$1nz&h08_IY3t&1^TuaE`~po7wwPpM*Sp2y=R&8$oOr-L|D9h2dn;%x zk(tDaw5aGKVc$NmE-qf~z`cPf!u+#m5W$QCIMDVt0jq1kuj4LrpoG${mCzMu8VtZb ziZz46GmgsTL=XC&b&0Mt5AWt9@CNV3W5<%bm0}JOqWL0>#QYayFv>790Fe=6Gy~B| z-i2D~e;rN1y~zc3cqR%FjMj(PCQnM}*ig!G7x`_R@>dAA*nQztm`KPFZNvCWWJr7n zi!#Yk7prAexo0R`eF(OR!5?F8?zqjePaU?Fr36)hjpOG zZiHt#v2LbbGyrcvkiVEuM<3H3rNR)-;AYKgkX?Wa{7pLRqee+g1dh@#u#UMap40qi z7j0<+b=`!aR2Zfm`#YClB$N>2OU8>Z5i!bf?N|uY`X3kt%yrfNHw>|Y8@pbHSq+6{ z$Rs-t9LuozFAtXUMp*IiI@Zead~0W2^qx$tCHsCa(Y<7Ty|0Or41`CKhfK5#gs>N8 ztDJQ@^?hbg%yYF7on770juy=NEHg{WB^YIXvo<8)j(1|tQ+CWP+GrF?ahHB^C4efz z2ZKlK_pYvXOa%=oP1H(MT5eMIIdf6`R$~28XxK+2w3u_i)gJjsNRB^c51_fJfI%RV zwU#7Xr9z} z>nKOR1#xj`LxD{NZ5?R9yV;a;YiTB}qB(AOqZc8{PFi?|K#aYV8W>wcOs`SKJa9IL z?@}998oQ_wq%j#h%Fu*|@Yula%(Gi*bNchNKKFpS5kay4O4_^leZ2qAk}lH&u1jIw ztYkpiSZ%fyr7h0)eT0jeCp-7-|5|)GS`mV|A*N$ugwhp+danHni>rkd`lbfcE;om( z1s^)cH1XfXcTqv2IHH-okt)$PF2H!!v3cGm76;SJ7a7E6{I2?dm44>d<@4v#(nI!+ zKqzD18^*W_oaG$ILXte@)-8~vfKFe>cu^1Oc;~+Jz|H&OX$)w*L0S?qyP|RV%QVLi zbLPl!6c@h#_a7FYfhOujSpiIzDKM4GHV-))%@w9Fy%2b`OLmcR!=H$DbrT|#Z21#g>vFa%&dlJl^C26%N|5|JMEp@a0h zj+Y0JbT8M{e2(Yk_3<=$B+Jd`{UU8qWKPe0;I9P&`L)-shIF-}G)@>v7I%=ij=f4n z$r$Ox3XJa(T?=uK#As@qJrBAXu)AgRv=goA%{NPgaGgbpg!ZAmISme+Or1U0t3%WV z5>1%dSC*{@GiE;D=_2RJ47vl@ERQG1J(}0Y-F6>p;f8Q5Jt5FEkFm6sq~>(gczLX zvk82vt!S8-NE>Km8p-bs>Jz2}ERr3aZ7gnVsN`Z#`r*?jQ3qGk52NYcLo{3awMfX= z;0)7@hfnUNQ)dT4ifJ^csYx{mjp^Ip`T?r>gY?s1{`>UBolhdF&O2}a0FA>L1O?SB z2Ke#dkg54p1xV|QyZ1r(>*=+N=LzT6it{#%4~wf>6#Y1q)VQz+b5(Uvcxc@WqkG_P zos45mJB-j&31 zD7BgOU?}1p=UFU(v`a~tlDZZ`xo$jMOm~36hQ?lO;I}c`LaK_tYaIk%SI=tt!M8fn zH!ifNk8jQc8`KSyps#|&+tl+Zv>B5TYybK4$@JD+Z!pde*a>?sv=Evy8SB1ppa)#D zkluLXYWnD-Pk7ddR8W(O!#U>KUTEVSn+z9uj3tEp?VR%PTE@V9 z1_lNKx4F5T$3+NJBmKCCjb>8O9_sF4J|5olC}6;AHGm!p{)9D|bT>wnc5wqd8*AWb33iCwFm_WiI00P6rm}P-whT*7~B~e!eSjXW#8s`^t z!_+|#a~-CZhYz{$Gu|)WE#}9uB!`!#C1JGLvLoNXjkXrZbMuxTlVZzzL-t@_HgvbaRYQULQsO>2&Ktg*B)Ipl{WdoCIdl1&Z1zd$ zMHXw#o}yKLSs-UGCBj9DC6KP%tIjd*s9f8SiKB3eLE2j~~bDe*AuMo%=WcDz0;N^SIbf 
z4~JEb)p3?sKEU)rX;mfM{tl*1Za?1$cU;xdNBc4nVOkrVbKsJTn2eEQao4jw<8l1D zxC4wj&PC<8{la(_ua)ySo-XH*6Nof(?v)U-UFn^7-eH0K7#sX-`+aI?I1~0v(p$G~ zV^*g243i-c{tjA!sj*?t^8XVP%fsr25vuY(9y)70!Uc1s;&QD9>( zOx9FMizFKfldAMeg2csU4b}AW+B{PJlQB|;z_Y(PSs2wMDH%L~KzYt)Z%XRU44(?q zkj)JoE@3#VK0x>Byo+a9d z&d{0))q|uKV3t;qB+t>7v59ed5TuEdSv_HgUpaR=9B6Odxt5+hWoKC;0=aplw$|FH zwzr+ctZS8MYa8kOg)VmFzK_7*QhNR3J4kdoF!!rSgP1X;-P*Jc0aa741QiEL?AJGO zs;z|hf9=&PM8iT7jW$Kw*M3YK#1Ga>2j3Q=-?Zv80W~F#$Ro%og7ZoeY}nNcIgmhQ z&@n}Z{{u90oA*c4+60VPl@du7A;R|RtQvsmi_Ft1v^(09?!u&0HI^9rFT-TWIqD9& zMp4l|TDWq-)2vANJ2E}Wuv@^W%))UP6c=Ya=6$avYO-`yhQ{ZSw|SP+20jE{eXcWp zK}0#2E~pY=yX<2#GhdTFzBZFahTGFgOxgCKNZp)GNARxJ{WNm2I-ULJucgoam;aI~ zVeqmv9g`^9^ytasU`7p7dg+py92uR=Fo}D*(PkN%cNmpF5+oKOLwAOY%JlPHTe+0_yV)ZO~q2V|Jb6F^Pg8E&)na6cIW;y|2z(nmWJ7fraSCQvo#st;n&A=AOkI2TYB|0A(Ceo z2$zVN8!#e6ZFC)XYp#Szt5kcUKN4nw_SF_UkXN8dj%Yt1IfWIH7~ycqS|L`J;Ap^6 zywnEaLw>uwzdDD%1pWo~y=$fa)=7=jHiQ<6CA1*sC(PJ>C8_qQ?Z_Y>V?y8Km4KfV zo3_dDFh<)D{PwecS10@W($o$yES!Wykv&WTeG7Mgaeu^Z6ZXFIbA}F z)o%2*>E}^~>0=jp$}0}XNkY=GDi{o(WriEy3T(o?F9V_G!Sr5Vy?Qk^hqF(Lv>7&N z4{3_I4R^&xaWInF9gFnrNeeSH5bkl0{GjHzn!mj^q=8O z6{1-WwGsM_7^Sfeg0X?k6>I6&U4n3wDmT-g{da$aR(v}B_T!H+4Ld-9fk`uK`||P> z;j)1fB+%iX63m{&C=zGzy>L>@gJ=g|&MgHU*2T`{XTX4C=?p6B8P2x8WG1F7Ulu>4 zgDS#6|3zBDS6)6M3jqxX58qWTqj<#hYTb#ckgh+P1wIfCLG*3{6JyKQ5R%oQ;cQAz zCdXo(VcTWuN_l7sggdpE25=BpQE`MM6SS{@)TV6<#bADBNBqH zDkhBh;~Zwz)#Ux-$B)1z&?g82H^BYOQ2K}P;B$}D#Fsy%C?b6gpGg;Ba%yl=*v3?^ zVt*Wyp}sVa-^2mFPqu*x3GAbp14cFpJ-)R;fP^jfh+@r9gH%aS95*-Bq+Mat8soFD zN`DADv_(ov{Gp)NfJ6F**k$hF;Mg+BRfNZQFwH3|3uw2PD~2U@v&nXZf9o(-YC5{x z@xj8vU>D|vP9kha_CGU;`8u+l+8I%wr6feJ75__kLk<5`+w5-m%oG3!_rd8q7QDVHWOx`f2LyKf~Cvn3M4N|d5zR;V_jCGh+?)xl^h@L+80(Njwmsw<+t~vDttMr$CAXaqCGL<(TJSW z(S6?#x$>^Bn;CUFfkgc3)z{NUA44RMF2NK`0+S#z#$p*np_DG9H86ClFP{+wVU8$7 zrP2eWYmGtNwRh0xEUHS#MF^S}EX-kIv^j?`Lx;!abmeSk8t5Hjq93HO$B)wU`Ee%u zKE4R&QV;$lE%j(OT33mpvK$+EP0u3X1vyCEi!>%MgHXl2Mku#!keFdmm9!~sEJ3_A zA(cJ>1G4VsGfdsF$ypX{i8#$~kZ6E>-ne{8^nhjoEe42D)2T;nBzEi8!*qIhh%mwR 
zEcPHNi1{ict!gEVJfX#kN#`cHI#BP&zgHj8Mxo72^@&}|Y<@fi#jCKIQSwn9kt z5fZey@hW+-Q1Z91yp<8tnRB zsz-~n#NHjj_(1Gz!lYJ0VQN|s5lm=gXpESjq9Uc#E{+u;vW)?baWj%tBaBe}zzHBv zoCfwHAIKZ7Ne97wz zLwri`C?!{&f8+KP3p5NXM0gLI2fnh)IDvOsAX?hc;#@t~o_=wS2rw+B^$$u$ou2k!SiO+i1%raCr_SY7PkdaOn;J3A;1&X9b=gUiE-~2CPaBo z9S}Zx8XU10_{_N_Qy{}~`}Xaab29k8uQphQUHB8`Je-pf|Jj@=n_caT2S}&Sn$M^~ z_(LfDj!>-NSs4J=4Pn#r{1o-HJl_-GG~o(jjXd0&$Y?^AwFsQnAVJpmNZ1e4VDLPo zYx=_a;RpEDK$MEBxWBlDKL(EIGVS6Ns!e|j+*#OINSW;~>+4a0wpz)#`wPV_ z%i+GnZ}{W!zOyLylj1$z;6uj3A0KZ;zO=*leEM~V_j8Hmr#8scQLaF2ef>Zq0W-^( zKCfbpq8V)yOl=E5`T3W3LNmRN8BYZk97_4M@X%am6XLvuzHaJfeVpsY!MHKC5GZ3} z^iHZn1KQo+n`-gFt9gu&Z(y83>BA&e1)X0~u1`NK;Umk#)OCjUF&H-@ra9ko#J6 z?1KlxT$cB$z{hihqsL??)$`gm49nson}h;enij5b-w06k&{$8OFM&^BX6qPxgpBNL zLianH9V3yi@ZtJG{|MvM;ea(SMwog9Fp}2sl6({bict*!aG@~+6UN!0CYc%JLXN#d zesM$s)0?7PSdlLS6q0zJIDsrJDjHW7Uz8s?1p+V+BQlgq%k16>zyP!YmphRv;iDf& zA|QKy@%?fxaXQTV+3t-RSG@eV?$a{k$t;ae?#5Z9DAOs4F>$v3Vv(t5nbYVrzKI zbKpe`TX4UCeJx1!_3!<0*lFLw_Iwvdw+El!z)}BooPh4&d-Bxzm=Irl^m80{F-19neWOGz_JO+_?5@oOsB7s6 z2da>s$wEZABC(?*7Vn~8j*V+1$&&fE(9_5hUD)!2$xZgeaS=RTNBYjK%k1QtkPU*JKg%?OPu8}dm+EM zS)AN4L(n)ul`Qm|As3{Ps0WBwX<7Al!TU z1}MKWogF@n#J@3(jg6xLT+I=Brq}=uVzjAsIrf22@~jz`VGW;+JwYYA8bpur`l~_G z(;(}0v<@BzgtL7(c9)1u8|n2oM{t}z38G#J)%!Hzif85<(z)K#;lnXK|1^F1%NyxW zfB2`^oi<@cGebzvxpZpa9Oejy3MJ$ogpC^j8hmM%e3Sy8>^{K`;+PXatnGm}+#R_F zVp029k*T}+GF4*C9NUcF9N`Q%+SgYgu`CgRNW9{(8>(*^rT{((dt3<#Og~^O-0`%9 zP_Pf~+$DbqWC%$miY@@gi!Xx=b5m8e3vTp6z-Iw^&bKmz`7q|6_kEz)j4Y{=+1I~Bxi(FP@6I!`YwK41@&s`P6@ z;`U;T{O;B2^wx#5>DHsq)AXVtej(pH0~EwK4_p)c*!B8$*{_BKm$@mRsFB1Mhx>rlS$stZl?q)~Wq4pT zeI@X~4!hP@*JNf$qR#CRsY2RgfhQvXjeTB7+j+8|bVNSGDIGG0<$%h_C~*?c2jOh` z@){tHjXKNdlBc^zSCLMPp3ga{>!<6#``h&Pf@fiPy&;|A#*bA9qZCf5H5ATdy46T1 z)L289H$!y66IciIc4VPR5E#^<4uVypKERUxaBS@#?WT>g;|83DKSmxK^R|wL)FTLK zW^TUw`XvNsx6`dBQ@jD=0D;zo51Q?#a$FPW0NCllZ^3m=XIc;YK&i7%=*|t{CwyMY z+O6?CVn%J{8PUf9tp(S`mn+4d7n?9p|C>2v3%m z=NtZG%uzKJMSz30)@I7mG>x)j{%X6fU@61AgsXX2zWK>i1aktOw+sQ%{X2wxf7_1= 
zXB;;*`ZBHNC1bEMN0d7>4#J@3hy$3&zQQi@?f6f)-mXC_xuGG&M16y<1BT2qmy=;h z+Dhc)D(e$CPiBP-6=Nx)a1O=E*%&F}{iukQiR^rqFHQdU9T`@O6u`A_Qoe-2AJxW}6{-~9nu(!nP68DUj zkE>}@9sSn9092s;-(}5OVXf7Nv9<;cCd`XW&j$HEoLWjXY(TuZXCRH-2iUOWu_kw6 zbRUMpXPHsw9&=rxj%#G5%Yb+fJB6n$Xlk^qBaw%DxdO zEb3e2X;pHya;1tomy3&2aj!Ug(>lIDu4@*L3mfd#P)1B<-l7kMK? z6i7*J#sQ1LE|TF*0Lmy1ue|$DQd4_h`t)c25i_cVG(S1P0^gRZJBA6piAl@;O5}dL z{YAeN=fkpp=S9)xoWQPht=nfZs72~Mdo?w&c-;8yPe9%ask?75q-#c*0!^lMoS&*7 zwriNInywsFLNrh-$3Gyg+#fIX#0#}4#>SiW*6|t@?T9w|zy$xm5qVHbF(cK*o+ne)gU~nkZ@XPKQf#iJ|1h+uMBtVD-l~8jq67Juy zh~_|oPnd9LY8Qh>81imo&O@*jrf?!(HU_Qvu+z{CQ_K?skJo!1mYSft%dlW zTbNF#+d9+5*UzNd6Vwm*J<;8PvT%_O`#-I6--n!Pd-3WxyWbU?cu;p|JY|MNTG|P(6ad3g-l|}P6<14`oZ;~ix8c)Xfa$&-Ctk{ z>3DHOS(ImqwGjN8hNw-eV4;=xJWxuAL^tCAQdu|t?JF-7eyDYd`<_`q-SkmSk5N!O z)S5=6Sbz`ec2g_N)ul^kFrgq4-zx3HoTrX`5q%jUkOHd1^h5S1DpeYIyW~Vx!1vTcR@#W2n)tP%s8wt|Ms801WNOxucCU6nI@|q zv`2X0FG4yag+oq-=4gkAYiLS{Vpa>z-CV)LgK?3N)i*-3kXD?LPxkbsdw1{P_c5PF zhI&#R{h$rM8&9c0Qfew3D-{hpbl}i_RA^D+;~c1@y+&~Jc`eZat0}J;Q>Ay`cnxBl zcUiBLv`$P-r3U(?@PMH3{)RD$33+Ycc~q=f}sS!S36L0vjmiu6mu~8*Lf$yoMFTF z>>r)A)!2Dxs%9OnkNL^?_}N3a1dkPCBRq-MJdA{8KkHO2G3nT9%4#&MlQ3AVC}gUe zYtp&y)^w>K+&$cbhPOYpA!HiGx8S2|pAigW8D%0P)!$28MUO^y}?K zY#x*D6<`w6GseVPMBBY`+*nWhdt1^FrkHyKFL7^@s=CRr64+h1m->L$Mzpjo&@hgP zripB~K&WQSdCVY1eye_tTxpn`wIdNqP=`a=)WW{E*VvtyFXJ zGU1d@qm5uKrTu~8l=xE>@x1%2fW2WL)tu2cT8v05(SOeE*mF$cpO@ViX&)cinmU+p z0%PnhPsf_)2+&1n3|VV9KSI0n6fJQDdynklZ)gNM1)<2mV6KGg$Uep>!#VdD6B5Oo zGC!GA*A3wq4VE40LB(x_R~gp0a|~F@&wO57nl9eq>;e{?3R%Sa#k(ZVjK)!3At`gU zj`ySF3{a7G1~4DHT&g;icp(=E0Eurr*O}~F080br7ZP6a7v+}QkSkzSi&{jT?U3Tg zSGG656F^N}GjJcZ{3`%oWWYIqk9Usx?I{0a$;cBjN&tuk)A>g}Dw+IZu2RN!nQnViF<^FB?Q7CMKnZ zj?{Dhjr4mEa)0xm|8F!BOiBQ{ zsh^h)>-h3f)Gwc@6=}RK?nfhz<#+tq*XG9y5^S|Fw{Kw9jb7o z`W@}50^xA8yJ?(Z*Nh55jD9duIKw^~eH=u)4zL~UU9nFH(9U-JHXz_+gp~Am0#D+Y z9Yn8tni*kQHU4xut)nq$tZ0A;VQlMgVr%Y7-@1G`_0~0~n^X7L_z;Bstzpcv*Z>TP zS}jqfAob`+;`tAqPqXJi$T-q9!2lc6dLHS?_{3tbdQgp{^ZB$ z*NpKV@T)WVo8SCqdhdHbOdUM~p@F!2k8pn=cZr3t>0ZnmW9E5Yz*@#FntOz5n2Hs! 
zAuiCwsesA2PlrmB6`N`Kj?zxkD4`05$cAx*R+0t8KTQyfBqE_^v@r35^DvkX=9bsR zQIsc)Y*(?d6V0yzA1Jt3M;rR%Pe#+XUfWLJ{mwh-KmGM@!vx8&t=f^hQ@66q*REVi zzrm!!el=$uG}~JS!c;D#x&j7IlvEoLOd`z)V>{F(p3@c?Uh5$7CxPf-xMtW7eUQo00Ty$$oWzE0E?_GIZA?qbD<5SEy!hmfR>?nhHa zsODvM{~nuvjP_cIB)b;d<1)e$;1C(&0U0r!lewGgPx~@jqV7SgWi5)=nyP62xJZPW zQWXR{=A^3&5YZ4NO1L$pSVstOBQQC2Y4qMLw271HJUEM@8CTa}B-i#S{h%hH z=>Nh&-U=xFub|#;zE%rSxW`x@kgwYF4m2LW|IQof5#~mc+81Eb+lh44JoYS|V7|2= z0BE~(G5zIV{$;FjGST_&|Kta0^5LB{bfPi!bhV(x zYD)tbE^?AYU{6s7?TWq;FWA#DWQ;R)WqkYqK9nVF(05K9#<;*4 zbW|G8ppSSu>+6R=nKF+(`3;)YreR&2;&JJ-v_TXG$A~N z>E5J|jJRerK`lFKF}rQWd4C5pxW|vLry+a^-@r6?d#Ia8V6#!T2-yp`*3jQU2vfGU zF0hVhEiffxe~8m(PsA?It^s@71T9%G+99D~8NUWiv(boF!<6Z_0&9W^Vs+~8X-cp5 zu}(JP`^gxsvj0nIZy`19&!D}?j^3Jv>K_*ypBl^nEqmHS%Yr|Gw3w>NeYH}U`%0UO z0QV6PD)gx2UIkj2J+v5(1v&9Sa7+y@F-As~DztWfco7gwkHl?P%;q-%l`ESw(KmRMZVizE# z#kAMHD)P6=FC8zp`KT{nOc>o^H3QqoFVk!Z7$(U9NOfxkwLXO5TR-^M>5C74nx5SG zG_Bz1JvekakViNcf*jVB8iF_*m=Gn#TKh$h@Hz0Df5rI4C12Lj<9F4~XGNOBbiQMy zIsdNrig)a5XST$vW-agGXl;a}-~7ffrbzt*83H|=12)UB;k;2QrQM+#BO`aHxma4N zO5>QbJ$p8T4RSL?2F}u5?4yBw=Kb*}5LQImm>5qNUpbZ9K=u-NhIgFWWR5g}>~u2Le%<{ajJ70G@z&ML5ZxG50PIujHPH;Bxpd(? 
z=7Jk(a(ohmiM9jL#5fW<&9AhZ-{4*CmHT?nU?>R2!S%C!UfX#yrkmyMGxRKtCo1doVt1~d$AkMi>_F(w{f~qBb z`@4S8+{vBR$Ou zTiIwwzL!h!-DptYN{WqleP6=IXZtu-ALdh(X#TWYJNzKtA(X{WO8-9i^j_B?J73-g#tlb}Gz}t1UoM#} z358u4UA1O1w9Y6uEAu0X8-v1E;QMG0?O-g#Yvvc}8Luk54R^sQ!lx3!kTcl+;^1BL zu=lnO(9%_-z3v~>0H8|BcYSv+Xun%D%ac%vq4?IE;L&KOiiKm~>Zl@dgQM9$P zx#h?y!ZZ?!6MEg52A#%N;(WN0LiNv%N@zyZAj z18I%0p(~7EGnyv-^4t+u+$Qs%9l0%&vSc18S6oyHQEdNdVz^)D{H8#NZ>7Hn8%wDf zBJ2{5>D#RJ_5FS97GFnu@XcX=nraB^CH%QG%g97dhBb+Ig#$4^Xq4%_4&e=D6*PNSzzPi!&jy`yhKEM4Kjms1`+nUi& zz*nVn2!>&cwL^NHVQ&c&=^OY@l`8fW>YwK_O|KTI|QKTceu;Od>m8NSE z5>?n|u5RLEV^pFh@P+hHI}ESp(95hzD`?H{;McN+DdgMD-(xLmOt+riNT2=eKc^ot z|5}+(%f0xDVE&4j9vs7%u?8x*VHxKDw=k|D2&YfxFpJ&7SD?KYKY>-E3qBcVgEq9< zL|)oPYdVY5HF^=4d3MDHekz!g?z3`gXdog#0U*p*v~)1b7(I~|9||;UP0cH8qB#+AV=V@W;R_f^?(iiJ$_Q?V7YI+Hb+$SRA&8Kr|tEoS2F3%9Wgh+i&9n=Aa zoD%lw+rpQKi(p`7q{X|!y~3$j(`bWZ?Km4LE(5!L88yl$k!3vKXE7Ob3SQ0s{xp}I z%p4h%qr;q!|Mp4wcceW^Sv)D~QasVyyq*Q9Nhu6WgdlCbBz)Hx$Tmzd0$Evy^2c{y z0N`U)IG)D3Slt)W@~+9PM{yMI)oPDSI3#uW|ux-*Gci>UEYUzn`iDDgNbSG`8t}R_*Pm6~7mUR2(--M&y@bDQnt0E3Gh*iQ)#%G<$TGxnw0E_cJ zNSc<$Vx8-IGy=JAibhzQuO5;5=#@zD6&N6+m+u|s#AKDwp(RlV7vS%I_kAR2d+FzF zHnxJDLM$4#k4YGWB$LqSGmw4|m}@YI*hJFbdk`Sns8=!B)n!*A9neI_ouaEjKoZWk zAKnXClR^0YJKqVZ|L1o+N zd@mF-)S>;@1hz!7?pkdxD;=!|*?P?=#aI>#o9Ap&kYEDO((U`~%1t@?;Pm$PA>_fN zjVL-nT#?5s=P<{lj)rip4hdWCV}%+rY7)DK$CViB!Urs(0+C)OA-4rswxWS_bB&d> z7Yu>KkE2shpY*|S@9toSUPmb44K~wb^TeV*ivG2I?%lA>hEnz2XTqshCZ|Gip#8;j zP-nbnX}&K4%fTC>CIKh;K4XZ6VCC_>^wYol5N+Umx-v4H{_wrmQ*#HzA@8rTLE9eT zPnyvz7*(tWBB2G-#i$R@W0-^k7-@hQLS{Nuj9*U&%k8P|9Gf*Fc@Nm7s37R5k7Lq6 zU!FO2I^Dl{J1w&lyacTk6~1Jsbdq=Hb-D!!sP*<4r(;KEmb`(~n{S2?$*MkUm_r>P z%|#n{P>pjal282)l!&|GrekC9jXmaEGXWXaFgdL!%2_F*$DqwddTJO-;clP#EnaaB ztU}}|iEn~HZo`}?m_dXgl@QNWXb1Ko4tI&`uhV`7rju1f+o{Ia#vOhot`!)qV>0yc z;lpTWu^ty=tbivV2&Hqcg+c~FkOZShN|Zze=eFD3y?isXX084U^GoJpF6L@(y4RX^PmkQPWveY) zY)MoiDUtvKh>R*!K;@kCJkLJgTUDS)+3tRUufF$t!-@Osv-581hH1sjteLzDm@a8A zj~N5)X(GxNd5YRkrJ)N~$z}5qIVL{AY)~JrtkJBu?L-4qb7Taysu(YXveB{(|8e@n 
zpOe=!2FZ)|iHCmo4=Qk$vgDgH#@w)_;srmNAh1jWjwdB7y=wP6p~ zUVn&h1Sa8xc&){su?8k*Lz;xWk=-rbHa86spsH)OB5ZOQ;cUyQIvEMZg1vonr; zPLM8*<0|bICHfkmVEG(@XyPnik8Avi+b(o3@aO1x&-2xM%?INlT4gi&mzgWSMf3Un zxS45pJ^#|H zIQia&SW-$3qjo&mQP&s!DaOmXT{LmnW`FZTLO3L7jIn^_Ru}fGn6Jo0V51Z^#brbz zIzaL~bmbKgaCLh3mp>sv+f5uY2U90e63VpWq&hs*r_o9#}GgQ(b93^mWRBwFk zt8A95X&n1`BQ0zq3D?9)NrzJDmIfpfOu*zEn=u6{ojc@yPhV}iLIS4q7rWTNbt)q~ z8-#H+422HS=g6(F53*Q6op^8fVcO~^ZalDDytj#on@szK3+FHknxlX8+0BOj z3_f!jWP6zI-+#QOC&z6v2yMp#(2 z70sa`tZjXQY)CK{Hi$-wkWJL1?@GDtEIprz6MqNb<#o%mIZB>*Lfn=6nYE?yDn;_V zi=6T;Tgj+!wuq06W1{3ga3Vka77n8>o;}ouM8XoOmqjQARHLI$fOh$J?%so`>qo-X zg$dm}cGh#0Ukky3Bp%{$1xCvcqwey&EsKq)zBJ2MCl8Vo!!mnVFq^ncn}gH zAvW^v8-XJ31^8vT%U=n)?3oX(o3nStDT51 zG=j>b^|4oIjtB7rE;nz0M5zERL$wEE>v-8l`=D8xXUL@(H%{Wndsx7T*B}tKQEwR0 zNv8e)a|k85)!>T5Eu4^v7^rWL4N&;t`Y&fMHXE~9&Lw^oF4STpzP*kUIrFL1Gki^$ z7spjPKze}UL+DpsMeqj3o=u&2`0N2_ZVm}dunhyc06bL^%fGIhMk6q~{$QK|ZGgdP zNOx}=z~Vk_F?a}Y1kRPv(ZsI|94Uh%Eg`W}Faw!kr42!oFmE=3^{-aUJy9)G;B>01 zMnL0uNib;O$ko28`nTHYd zoj{qiU8ssu{Gz|MDK5qQdM*xf#b$W(W{fGn9Wy`09iKUQ$e`teZ~7ezj>mTp7_HE6 z_suLcU{7Iuh;Niu=xYTyth1Y(Ms3~X66(XW_8@)SX@tNP1Q|=|7w^2yKGBrk|A(Ka ze~$%2EgIwU0n7|JDQ#~2xTJJ?bNKM^DUie z@bFX`h8ZcVJj*=Sr_a_V(=3T0SJ-dc+99ODCu_j*1~{M|A)wLRD4DU;ibVgd2Eu8> zOuz&!5y4KHjMCEShQsv20KQ$VG98W}aOPG5epw{1jM|PH4|$$3Y9*oYse}2{xN$R0 z!7wdvqbaB+Kmr=8dg#U`R04IFZT2IOt8VPX2`dZCQPs)Q_5nq0RH`Hx5_pH>iI zqy+?Ay})9_@hG>c3R_Mx6QYf%!#1v*g(9K)`QsSoeJht|e+14r3Q0ooyYx&f7TX;U z(JD@2Q5;>fEZgWJatLz&a-shA%{*>CCz4Tmt0d8XQEiNs#1|_sq~HSHl%ld`giHpryr*-YyfybhriK>E?i1fg)+M!3e10K+s$0LmV?3u0+b!fs|(#B0-0; zKBB}dtdPqBXW9*%y;}*<+rmaxR@nrhM22h--1M9puPN3jk+t~OKs@uN`X!j0YVtQA zCBa$pkg%@hm@sX^Ty4WdHn3^e?IPi9M;b%OT($M^SSR)iJ%@YDbcJ0emY@xRtR7q!3DV!Yr!Xqqp08LCPDQ`DD$y? 
ziTO(4PKlIJoNnJ64YQ^$HqEzPc2iwlF@ADY3nG+=DK4zHnhb)aW$ldI(4s;QM($g zrJLY3{th|^E153VtwSL(8PadIItkECn5vyUHphAh`hB9_Sk=+6_=fP~hjZ4I1i8qi zOyLesOSD!~M+KX2fB!&g80bzH&kRDKAw8#$5{zbiJ|ss>O+{;3+(whi{v@84M6vzl z+CB5EyA}#GumC1JXt@q{i1q@r>2ux+DGB zlpjMKW!snBtK(vLftp#5@b>dF+l2d_N7F~5NKA;d!!CzdGwZj+dMyl#+E$5M%}pGc z3hI^EmKZN%pE6&HaKNyXU|2~Rwc2jlV*{K+qekgSuaAf3O|T%WOBhnJ*z3pCyaE0L z)|&g#VuG(`8GwlimfK36M zF(+d#fD8Me7*(h49ZXUc24(*ow;6<$0h5$S+0L8{lIzZZ1X+m4+K92wRFCFB`uS{; znS2{as1meKUU@#0p-3+znqi&ee%uEpZhKoc%m4c zd{IuZZ#r+@^QLzGhiQ75KAs#&lVhKz{#K>=?U-tj5ES2z8uBew)2=#_fwnYaI>!LDRuB^Xkcdhf zk65#J)8q_!qo6MiD~DO5tU)w;Wv(mDk=Xk;37k^_>>2RnfIWB}24a}d>9YvoWEwY0 z?f{MK={2-hE0r~vhHb`31~p)iF%5hQJRT^dca5`V#Bq+BbLZbmf!sqdcw*AMP&&MR z@+bhr>nHC;8S$WJ5^njuqqk3%S$ZOun~A)9Ui{@@23)|dCDy#50N2^1qwd^VoJ3bs zM?(jXN&Ob@QFC5F`P;U8&kse0>dYVh{(>VOjeb3OGdAt$Q@$QtGS696QJ;5?$D>qX zsTiLrK0HJ_Yzst$J4K9(i_Wt0Pv5R1%7DbkKGLiYe)N}V^Y!nhv(JB>_{7-ffxN14 z9%2gfOSUb_$!)M5h15hurA?c}qjid?d~_e=tlHG~z}@*Ml~MZj&-ltPVB4QKD5ogO zZwZXW`PuZzhj-Hpgnw-9>ZMY+bQdvC9_+B_J9^TaMB@3?+dodDVlaq!nVjQC$X5)jji*n?dEiF&(i3a^v0UO70YaimQKKQCtQL?;9r4wlI@6z?zdVpekU+eHZR;9g>sT2eop!VV<6)b> z4e=tOrDmeDs|5$o4kA&Y&cuXDlerC|IT; zYcn7$vz_dRHW(^ccrGwU|q@d+GZlkAlH{^(!PlWCK3{p&qV|VXsYF7x&WG zEXf_w)_m=$7gGbs`Nl8b54&**6^V}}2xbwi#FsE3jtFx$<|Gpb8Tl%8H1e8003k)S zo%G$~7=Ogi_@lXxVN%&u>910Fr70PoNI1EsLpl^dTq>gk{#`hr508%Xo@<^5L_D1P zj@SHHXHl=OHhtDMDx0se*WICAtKFj z(tr&%`AJCdq||qr?6G04g!I!oO?0Teb9N3_!Ub`U1iRF9#;G_f#v->Pk7qV95pU^N zSq0Da&q#pTwV;g z@qK`vLCN!F5;s^K-a)EJxYIM3?49n%`JYYPaIH)a_js2z)xhRnC!rc6XqkCz^KUBQ zVSO{oAd0rZ2_~B}Nmwv+C&t0SIiSpC82U%}p=jE(&lnECyk*lH;7H5|48@1Cggro? 
z@yzwcUw%i|DDaTR>Oaxl)12y&ur870aTRCoIszd$*5;H^%9u^m5QaeLT33b{<_u{& zY+%fRQgwqCXr}6(vx3iCuqnW|*i!Z_Mf%1UeGseLSVg zYKi_uJ{*~?hW76C)<6DZbR^!RZO*L(wlMEu8*Q_&<2BppGiq&w72(oDEr28$o*eNp zkBs)^WkfKnVeIx0-0Yh-hH?UaD9f^4dwv)7L}8_oRGP0=qz0juW|Eq!E@3uhI9VgY zY4)kvQVy4kHh{LNt&m}m;jq0+XejhUkOXNX z%}j7E$CCcZH+Ue|cwBIP+uxj(>I}6Szl}HDhI8l=+@-dgl8PerHPlWAha`o92>5(6 zkNObBWk4)y&3!60!iqnCL??Xl_=e@{XjF{H(Ky8-$h|4<9lw>&q8Q(I;CYlTL?4eN zkNynemuINReajNV1c+z12o+Y%Ci%&)-hyaZPA`1*JK=b?2W6xrE4R~=<3mI$Ac*+X zLWrAdY@ zhuAlS0~X=vbe3g_)n9Njqd9 zorUQqgGHJ~+i}PS`f&6SOl3FYKsM4-y=h`$JdIC1VpD8Nzr69EXeU`nLClpk1L?(@ zYp8FX8Y*Sp-B`Zts6e{70rR9889<2K2u*>e6C>AC3uQJz9GHoB6RjrGlID>h{@@4i zq;qF*MkLJa8s-9X<6}XvjKJiKLc5>8a5kN~@*;3pnXbS09z=s7bB_rZvqDE0Q6}M# z@pFwU$roQa|IT@qRx7dQIxr+&iF@=(nEE^{YM$s?l`!+rZaA;`V*)D7Hdr5)@90RJ zi#MAvhfw=yrf%zsHNm$>0l%NGFK(9p*nX;}T>kKOIdJvPZ{APe_}Vv+LMz~@OJ|49 zW~4^B-DFgk!21ld#H-IgN5AY3xJ(TVZ4(#R3JI^hklgCbN!v<&Kd#$;gZ-wCZ|)D^8L;dtJaaPfe5W6qEmOY~vWyofY-^xo~zq6{Dnb`3e+Ciqh*f}9+t z_#n-(Cf9&ZL9|p{ZRR3O*@0p84>R8K;$eHg#RrG*U7jxV3UwWV+OH-%({ zZ-Ms%UTD`Af-|)v;s~`++0PC^37h8Qg;@kV#Oy~ix{r{cj=clS8B$`)aK{P;pYjXl zC9f@eO!r*hJZG*7iF-44E0i&_@)~gFxXC~?W9nzy981S#3I;=1@$mg-^3e^0l}|*9nN> zdNLYX_7|Zi)FXi|OaLLlY`38$bV-YwxD5?}_K4Y2SfJH}|NO7;&-VhD@^stuFF5k^ z#rS{0198dw9>toDf=lHW<7@xLjfQyjpNaQ`Hyh;jP0-&qJ^_<=7x4Xo$X{89fj~eO z3NRQ)Fnt-c5!es=1;Q5NpR|i0ZoLkZT>@`NL#SDWnuxx;K5c@yL!a|4`d^7~&2cQF zzS$iT-mUJX)(rxlRGkY3w1EgvU2SMGXjdb?JiBNN_Nim*U@uv% zU?3i)IfUuvpcy)IF1`N3m9(gC4%yopo28*?8&R+#@gRb(;6IssTU#8SMRGrZBBoLeD-Rq$QasAH zfX9~;TTE^N0L6E4&)*VnQLfMD*ZG@|c%MNMiFq!L6!E!WcGwu4R470mj#kBYz#7j- z=2f0I3X7RZbr_A>C`a4o6+&`#w1P+_R2hV#&DLy*1yXfEdRT6>(BJt7UnRs7mBwT7 z$IYExh&Jc3aFC0mKq7f_wJ4q`eTrwe8rS3+{HIFa=OsFnoVg$lMaB+XX%^zNbnbg1 z9G^3jwc!_MZvHa8zcQU({O+HpgL=a3aZh0Pnb>?vmFID0;5clTs$h|!M67C6f!N-t z3KvXh4u6l2+@}1yIJX=Rf9G@C{RRC#SrG4tWSvJ|3INRu9lroV%;78G_^YO7x&js?qz?DlSaG1beT$gRNXBZ25vjDciCFt}AoTMWp5=M+ zHV~(OauX>!$V@+nzP>)R9jIPWr)sKF53=ecNmwP$qC41rOF-)AwT+apo(;4Eq@mqo 
z6aFUVerQDMB{G|yn}hJ5qw$!&z-V+5WoBk}8WndYKIdnbiHw3Xwx$-lEqoI>X2*nW zFmSXHo+Q?B6=Sa!#HdKqQ&Xu82}fBa%tZMDaz8e3Hqze6hKqUEA;duk3=8%YwVDU9 z0l)XoJL#)meT6WS%_NNLA!5{WdiVN&M|y}IwN99oNPJJ9W+N>{WXl-GX7fMKs4(dU z6bI+4eBPd$ApWxc1I7ki;xk*C6@Vuov^RFDh?2FH-h20MdhQv_ zcRsyCHucri-O-<}e)qZbM^C>VG5=fe#oER(apK-XBuc1`4KH z%%M0-{GcQ`q@|9JYrvN|Y?pYWz#Y~roJy!koWGagoSAt0vo(4{#?x((M*RHXtvf0DvOQ=Noyl;DQeYA;!t~~v8*cI>Zj{uJ@3|=a= zEgHz5qOEFm9LpVqDi}G(b7O=X)x5*Qc>47)H72Z!`oQ0gsKcrZx`z68(F|7TTh&5T zGkh_`%L*|ZPmkE%-A5cJ`eP_e`?at*OXl1AM3TFi?tT0*Qp(lz%%!tv3dwQ8Sh?;s ztF^5=nBYw>ZlwF04~gi~0HaU}HJpb$>akjakl`aWm)^gAGyVMIVFV?;=~d<;gl9^J z90$IHnYo4e^zLuoO=FluuA*JkMSdBAL5a0vXSm*kIrk0MnbLMiVBt+aI~hB78O!vaW>!A4 z4WXlyyOH;+vG)#B85*F;-{Jxm-!MN_SP|$?G>`2_4W?*qFi~dcHWTkYA<648rMf+Q zKgxFuY)|g1vy|K5cq>`=$bHOAj(AqaI|P5sr3}4(fK`}`Y#{}HaFaP?{F&YuV|{fh zN}Bp8j6VUVlrOL3IP#or}q00d)YCA$uhuM|5wHFP6AzwX4nX@_O**q03Tyy`` z&%v@Zc~UD;SKkI+Zi{+ccV6@R$N%_t0I(TXNQyxM@)T1}Eng#D?l_dbobUjh%$N~`^@!^atD z9%Vpz{>g*i0ei0fTu$+ACH~8m!}~MPu_0V$(Od zx5YT>-&2O4+x8(^Blfga%rmllAGS-cf(`7cyxyF~b)1$dt_+L(ttno^Esq*Is!o!v6}R(oeetZ85@E z8FSk^WPn-p82h7XZ1gdGH+T!0e6(MNmDTr>2J_J>MFtMI2Hs`f2$Bi>i?dnl;zSm^ zrllUvlOx9xa?&{c+}zt355MDK8+YRTx9<5Y|1y|(_sb5;J#It>s3$8|{cLL)f~Zp@ zWrD8;;)IKMSueR`++W7@q7|p1=p@|PQAwWA7_a9#>@?R(N9lI)^9#S@`O-_45&tNo zC@)^}xBt9fdiDzovoE%;G=|Q{Z#^HkZSQdj!9?(|Q3GQ8=H}e^NP6qP{-^Y{KlroM z+s|B0Xf#A#IcCC`YA1v-{5 z9QO9ca{BC(_d%k|=^D=6WyGnkz)`@l3`m80zIg3xsTQW;yuT!i z8w^`q?cMjWht#RFrU9G$rgqeHi1pQH=FWdXTk>XFK+4JS^5FvRZAWfPe)3C;J9@{b z=^Yfn7DQ8H4}|BducdX=^_AF5uOgjN3a5lfGY@wrO^j5RYQs4*skC$&f)vRE$!^~K z=F{OUEa9)S^}#_yTE)(C9WhJ?4#lf5&l5yi(B$dtMIyQ&aqet0F&sGQ*&2;8ku~= zxYAywLF5#fBLv>m%oq}}zI1-*S?n&maQf?^|NCj;V4D76UIOAiAanNIry~SEkjREv z#@4!~wk`F&_;Ok&gkBv||4f?!ERF(=%)*9Q+rEkCa zr|H_WFU6)3OgNjCxXdU)Vg6B?X^9;-e>WcuPuAIJHY<@fG9EjaYjjL5rL$`XaD4lWn{j}5#%scwpDf^zi3Wk4IO0C|OUH2~lZR+lBZC;xXuxU4x1rv@Ox z8en2~A!rTyAsjJ^QI3R)?Nzc~hW5sc^CmmH_1P!sZ@>2+$>p<@Y8jh;%*0x#vzAR- zzW{|Kutki8b0`ynpAYuOhX>eVe^H0Hwg)`C|NhU@-~avJM81*T*~v6IN<_3y@K}8l 
zfkDaJc7%VYJ4uLST3;Y_^`R{)8|6&{e zRuE-t8}q3fvnFhNAa;~WO2B0XU$_+}H0wEZ^Q8>iV7U?X5Am&m9CmFVni}ELaL{|K zom~iDo!~2x_=~S4v<)o_y%LOw{#cHcK`7SL>OuS|#c%EEpeZ$J0wzK`^0>j_}rkMH}FXx9mJC&GE)hKjAt+@rP0PPTL^hXY^bQF84 zxWu|0U+XH?MsrQ-s zboo0F#)QP*SiqbPLU9Chq%zp;!{)No+<8XNe~O*xDGm?p1oYZhjJL4w!YppK)>6^iTGLd5h0xK zr8NYP+gmwHyZf8rx94QafX8yyss6GiG<00}6U?aX7M`SG^_K{oZdX{B1_$AC1{kj8 zBlj^DX#hm2)7~s39dVtP0~oj-zoQ(%#)AR$dYs44_?_<+&v=vPSTy-carfvg3Q!gY zQfC3kT~{)SBEi7b4B9Ft5jvC^8Aje#IU9Ut>_;cJOpX47mMfmx>qQ6|^KeSW;x5Y>#Xqr-KdU6zL;qA}; za@2S4{O~W+3uAaQ) zQ+{&41dn{BR94*0easKVUtlB?BmtmIp!V#K9^MIQ`g59NGWqvY!Oa|k@^Oqr#{RV5-|o2Wv@yRL7=b`(uTtX3L!441_i zN8?H@%~HE?c_CyaE_Sv zScDRd71hLtMxs$hvOp0@CF$web3nX@_=SuUR<#LDz!pw*M0`R5i-06F8)_{!AoP2B zNygOD#{C5(e6vK^=tJe*#Ba=o=4ONN>^1ogj#MU*+lEnFW6ijMDWR@`I1O_JHa5)~ znwqfhC5g|Y(W$h7WaP8yN70sc?3LTvdQr@+A{lB$ecnnRDv6RY1OwAdeJgBkI0m9B zZNXV(aukUSV3 zWxa2ufAg>ZBE59^Ptsuf86;#Nd3i5QdDsNPtt2K!m@6iV4F=@r<}WePT(+D3s0-!> zKZvRgX0LTG-5p#`ixVU#L&LCvDp#VMZI*(W6Nw)--^VkX>HaXj5JUo7#U6HLbT2*q z67Tbjnmd`l_uhFw&A^n&SUIQRM9Jpi#ve=rb1dN(Yn?l3g=8~wu!C-DwGhqas5X6SKs-=z*jx^K@1c2(8w_oT`g>C&V^wrCAw_4xEw)W ze&4YH{%_vA$)=1+BF{L$F+5fz68y4*TW(@9R<5C&w{EA-R${t~HGP%$qW{s@e8D^m zlD%inoIxYuMhW4^xMX6}itL|~PRx~v#jo^Mr~gJI=4$>Fc1%q?V*eo#Xj?bYP%s%` z12wEKji)>$@R&1b7HSZPR_vr(y3VChG#|Dn8g}B_YZ7{kGn5H#hb;p3H*nM+CnNX- zsoYOtibl=={<%);(8?NZOU6MBKr=|fL~UxEjH7ReRr@4vDXMW8!=%f;ftM5(YKkD9 zTv{U3(Py_zx^FhF8Xrm~8)6;e#f3JEdS z9b0jV((E9(nF}MhI2#Hj)YvJZZ_*JS*i|aKf!R`>%o1(if{ldFJ>P<6F9AK~Zu%SvnUV z7zl)^WKH4_O&`PwGDFU9ok0LFS&BUi*;x}lPFTN@D}!~{qlWPSArJi`heS6%d5w+0 z1U{G*a38gdh18F@)dt&+jDr<6WKEjHF%C_z5S8ia-U~#eBQO9K6Nvx_TQF(Wtnac# z<^%_36UH?lsM>)uF>uHMCd%uWDK2TQ>w;v@ZXqAb;RNFdW1+T!i2`tn=KW5Ka6)GHM4nZjoIX^^LanWlWR9Fs?AQEt^tHha+ zhj@&^_5mp@UhrubxAth|m^?UmW!S^}EY0FCL|EMCN!|&SWDx25si&(k&9ZTCQnnnb zgRc#nSvNbe=XO(yawIa4Zph(1;iMLAjz;8uMc<+qyjMC-{0xU5FC#fOEMH+G>!o76 zM9jvtb};i{0l5&-nEg>&>hr)ZK%xDiAUYQ|<%z?-onCn5JE@tCw+zX>IRp+5m*`75W;UpAzw>A5`n$hOlOwm&_&wrWV_tCi 
zxffVyt=Q92w!|F~S~lm#!r|In6XlcxnGS+Q63a$c4pAu>x(PdPfJn>_!x{5qi;BZ> z5Lq1``QGsp$9Im-EC5Jl0{lZR?8-&2}ar=CJWBN)~%F3 zq{^(uCRC*uC#C=Pj~`$f(uG>J83J$_fuDQx57@HXCzU>>c2&%hn%Gr4T_9h4$AC#=XBC6T$UGRL9JZ zNW8bd1xY;-cLpy_}W{8-xfjDl+lk90L{n@wQNFB8#&SQS(c0rDZ^C0C(v?SAPjF&)YLd;%jD!-bBI^Rfx z17`_G2O(-C3XoUjD#)^)bzKLuz`!416RRV88AwxCPs0aSVzNMIfI)1!nG3`5V?l-5 z8V3%ng?}5AcWB!VK&Hht)kcg&BNM${1rcc6hj@N4I`Oh|^je z_DFEFvY$S?^ARJ?XM5 z)}2quoAawQOynt5_^*HC&Gh^WuM)zP{iU%r^|baOSvR~Sk!+Z=O7RL~srf<`@VAW_ zK{?v2J+z5Ut(8PGYfO(}NXikrNBC6`j()ob{*iCpBS6G%LmF)D$4L`np2T;RyO6wv;zq#r-^FQ)G|kImZo$XB6=IVy+QhK{c$wy6Mlt931Z>X-xA=(b%8X@ z(ddI_jEE4JC;ShlMpNh5t+WdaY*NnDT21=?CT5a2lfHm|Ng0e(!8G_SQ)HPE{UwN^ zTuf!q4#-%UgJ}=Jk6}_Ln6vMF@4vtZviH(|iFx0(eQE^FYolPJwWE#k*vFUQelURc z$9xzv41VmF@Vd>uY?fxV%HrsL{2spY%8Tjz*|X`x4?ZCH$b0Ee|NPI>izGW;Ud3S+ zA%S~{{r3I9H=OHNAa0kQt3bM*&7YO<*AObTClT+EKH#4=@ffC+U>Fe2RltHi1r!(a zZB$K(=L6{=#KBaiP&q{yXgX0_XPKBjb&@5Wii+xsO6M>J_3xpW^^jqr( z^CoSwH;qX1z{I);+n^B|#1}NdDBX#5+lOV#{wXB>jU>Z_soTR}py>|L*I0uX*QORE z`3FQa+nyRvn+wyaeENQBICD9*uzr`e!MjMfDiI9sAhZ)7$vk(nZ`J`XYX}-#2aIxw z4sp;}o&m=QF=*TP1i3#(@1{Q1${yNt^A06)w``jeNlhMe+Jk1Jo=BGq_{i=fP^`sl zvH`!Da_o=x7uRSjGzN@F=Vn(L8JoL=$As(yz+#5}(uq zgNA))q+>0pxl_X{T9wArL^0@W2aNuRe+kj<1RhU%^yrB{xs&;C=|%wh6IbHK=dTnC z?}_4yBKQtqqH^D(710h)a+|nX0z_|lNZ`~1TpQRBD}C_=7l~0T+89vdI}2n&cW#Si z#H;y_=S5|uZ`<%?M{%q8{<7za>oJs3T&^g;$AzMRQh(eWnWF%lkaB>a-71U6PPV8h z!@e8ZQDF0MY~5dFNIv@T-B9aae&IC;D6*!qxg9dW5;0a;*y`r@D4lDT>o}kMc;C8h z{TCl!e&_i4{7j5RTnrQuQnb@Z<{#fbjpNZ25^|h!+0tY-Ma;_()jxUbN9lPIstumK z3?fsqh&qx5zQrCsTpYMgjhJ=PpHy) zq}@BHya%bfGTps1j#M1GeE!KCHiHcOG+>)*4ulalSDl{2EQN({Hg6e1H!!8&1{aW! 
zwI4cMZ=>Ga##~Jp11%#F#xwx+<|#JnCeA1DSGaX|m<^kEzyU^b@|k7KG}cg;8uG89 zt`X_?3KH36HaZA$2oN2&7l_uR{q5-Z!*q6vya7Y)FtSZ(4yKR>Pm;)Pnhh4DNx3ae zwJ`U~>1V%M3QdU{ewBV1Ai^>n+?1vmv;S~?H_b2J!F&kQEV7FZ4P8rby!ti#VaURb zhDDR6P3p!b3fq^3G>Sh%&-pVD%S2G(-_-grnZ0kMUCL>yM~c_d9VP;m*t?f&SI*|) z+H!^Ia7R8ZOq_Njq~s=1S#}{>ci3c=NK3#f`q+SYSX!9DF|C&9HAY1#OJfuF(VDfV zOI1&$`Nzv?`tco*J9wwL1HY^_;_8FAAv_{-kWxSvK;V{>VDtrG6>wOBm11-KZJbnk zzMgGc*c@2`DUkpXz!>Y3(1a~^_Nx{ma)Yd+D97)Ub$T+`gh`QBt%{ zQ;(U#P9+*Ln6zL1>?i35-}_!#m?80-#1~B9-H$&`{{`m3X15XM^w~>8sjux4(Nd5) zLY%aqDX26=Fa%c_ZGq3{S#TQfH=r7C#f+k{0obKa`w+28ttE&8gNwY;w(gL#=Mq}g zfzv}sG0~3HLL8v!Q&2WMHkR7i>#A|wo?pbbh#&yp{l<6Fw%HG>9aqLE#xpPj!|o-v%<^)=wZaT65}0CXX_PV#Q&)EfgfeX>fJ1*zPgDTx=S#v=%1aVT{_TfmU$EDhd)*l{jmtsDFy;L3eEh5&rE1`e4=s|W#u z*;F~WqJqA4UAh2453$5N7*f_XG&6&s3qQ!R)}Gk+3Yhd;qF!Ruk6HouQ0vU~WP&iX zF0OYza36_SMC9X{NnJ0SpQbT=yx517x;Gt+U=~X9txJq;rQ=B-KVO&STcpEi#f4-0l~CtF<)Kw`NQg zXnNgi438TMYH#2vuMvuCv+oZLm*e(Tm<_PbevVK(vn9Q!F|r}ZT1tRyNK zFy|P{gbOduvCRsNALrB->pT3PN^>D12%jjw`6#mUTKuqq9)2B3;l1SV_+tgR!|~wB zKl#<;EBxk3Ui1d-Y(T!JQ7cr8W^35hmc@VRvXft`Gr-02&t@6DekQ@X}g`M z#F3nU*(eV}h3irEm(?BBeog~fC~T$7{#2sq1&IV?zD#gvBUn^MZZrDj~BfzLTo32+;{)&|C&Dd)!XSI1joT9 z5_YhZ-0n7<`yRO@ZoKys=993fS6&9dAyV0LLxm}^K*2A({tb{qE76;N4&ZO5&p!Sr z%snn#yh4~YBNMQR(pi(VtS-)^TBQCp_yvG%XorlI8v$eBSVUVL`C>l3ZfhOaVjTQ^ z^rN`=JHH*zn0v|!(=A>>_;ySkJ{e*RV(Bx2E(r!Eo-a0$ECX_yG%6=gGRtb<85 zl2bjIM@NPq!YnO;d>SL$L6gPCxZIII&M_R;7>y5#S#7k(wvPMuwDQ8|lUhVP{() ziYCSvLxp_*!3RW7>Sx~S(- z<1mj}o{)yxwHj#(+ip!NHn%2V7_Nc~k*dI44E3Ld=tJu=NrWUakhatI`k4pm!;jue z-3!=?ma}<6gv?dU;+zUow1$>%=!Mk5I7?u{sbKQt9J8sw95~AdoNt1t#~Ak+1P?M0 zn>1Wls71Q1E~6q$;qW~nl;Rv?ZN!W^>|V#QiM{duuhU?6a~i+@AZ;PNtPJOI{8>;J zuM%xW$Lf#)Ql3&gkAS9tDSrt^w1nRo&b)Sfe9`rzS9#30i?y0JoV<3eBopS=3r^uB znI@l6kPy<_62We5Mcq%}vC{RE_fFm~+II4q_1wVs;l*n&!tgcY*xv#3n~7Az*UsUL z7(x%8%K{CcoVIEfQ<%ltb~h~h8G@2%yRCfk$mNJf#D6jonrY}`gmj=L%>E&T)rz=?MR&E}7O__yi(w||}1@%72lXSKwbx%+(y)znZ$0w8gtC5TieR=iP|u-i&!zCz46X7$mfn 
z%qbt>y9+|q(5f@!EU94bWflYUfb9%|j0gC_VR2;kU|be3<$L(>7IcuXuBYO5!fi9l&CV9qb5#O5imh>D;{FL zom-O=Dvj6a*f7n8e_qD7YXdQ7$GP)qX?QrK$cBqum?H}_c&nCmT!nM2Ouo#D?-wu{ za|?`1+&OO&`d$;aDEKfuWnHV$J7084(2n5BDE3}!@s@x_;6 zM%O5xEY$qdbjvoW1ymap2}Qxp(N3;|HKGsV1${JNd2k+H!K7(3m5t6KFxtn|31Ltp z1ftAHtRUG%A;@21Q_ubBx9cq5^^$n%UwlwN#$oQy*&J<)JzQsH=S29l4)F(r2Q3Kv zv3Jxr@22+Cz(|S7i}p!R@t{luG(5pgFqH+y3<*vgleCB+u7-73h1t=zLOBXIdL1MP z3<&GCitC1eBH0_^41x14{1ePJ4(y?&+h*NWA$-uHqJoH7OeK-#@C72+g8Ffz&aJT5 zudFR&8dM+5zC^!_<2V9TDvS0NBUZq3;5eBz*XZO#ni;+mz7L1ccgT;!cS*R@w@2SE zpVx$GoADUsc{R>89bF6%a}4a87ADUk zl!1<5J$#4nlQ@@q)@S`5&auz?fr$_tN;4F<{lhgTq3GPv<}7KQnq6)8QA zYk;fB)9X(j@mA?-ah^F&UVP$FTNlWJ%a+uJvq&9CbOkA*K)-ifBJgyl8}IytES_@!DhcPnng`esvJ%s6l7Oudw(Ys+zn1Ehh_HA~2*U}| za~!Cic>w~rCS8Z9TP2a!*rSoKLA`qQg@{@t0@mC|W^!wNf%35-#IYFBR%jvUaCA3% zS5nPNgv4)JYM=5aciwqP^D+Gxm9cOFs#sY;nG0gJcc0#TKjyNPJQ0VLIJVb-%wZY^`Z0Iv z7)WPMbwZ@!M6TWT&Ja*2K>}e9L$ZOXMi~r5sM)zAVW@4U_S8Cgx3sXvY3CwNY^MnM z+68f{bN%+C57SrQ{Nq$f`yYJvE^7Sw^zr>q$wffqlX^mn_CHVNPc~!-e;sL!>2CxK zC0lOHdOxgfU<-}bLX$EX8zX&af}p8ZI}DgwAhvT`g%pYC6liRkyO87)8g}^M6b|r< zX>xR$eT4vm_vD@AfvRN{UwnlJOMOE@S`(2 z7x-iti}!#X?$4{7#bytEtm-HTZF(js}d(Fhus2tMg0#*z7wJfjs9pzeU zsU0WV$mI;vEEMYmEYk4EFeXMMO>vz*Lw#DRz|k^*9x`=BbQRZwel6v0`rr!h zsV#{}AQFG}!uC*qn1j)2+u-;}z-kKO$02mR*5a%$x$KM`?PJ_1`e;8a=P=AMc4*t3 z^yB~b1DNgFbbD?#{qXoLiVn_aEG6j7MP3dmu=AV;mQeW<%wWLuD|Xf;ilR zA#SV%7XSmINkafD?Q{&vwC4hj9d|Wz>zfD6%S`(0_Vv*4=?AcYCRFM71^lb_z#)R1 zQcgcIYQdN?4}R+u?Y`@7;vpfui$ncB<;20eO5z)U1Bqyv2;t26>%?!sJTEc{^W)=b zv1}o&5e#DuLihk7Mjd8GTZEa#uqx^@M~Nnk+Kx-S6nNEsNd#G@up*3m&2NcG333_= z)#bXzET)dJt=dSF>ow`#-WA%*I^1M#S&VJe{|N0t8S6u=Zh7v})>jZrj)QeeU=8l_U;(E8Q6B9-yI~VZq;K6OoND1Z1+!*1>z!t8t6`~(nMnnK; zK$pK|qh9yt@LAEchjZsef4Hh@s9@T;3x!QFJY&8avcPviJQFo*58~50#Y6t}k1!CI zIM3@V!>k%I>^64l9HRmq=EdFAjH{{a=ChA)8O5 zv1uWx>Hs-42)eTe_Z~h#^etcnrmhV~<3`MU^qpJoS2JT?|ZFd%midQmI!8R!qQM^=4XYnoa&Xas%uA`cwvXej% zv-_oG6zyt_!cJQqL~=)LI`{96VqSm~FGTdp+CiFMBJ|(dP8!6Vq8I7z zh5pkZJ&+RS2Rp#N@tv!2pxOj^=*ZoTy7XJ${@3(vhX@a&_(Wg}iX_4~)pJWbWYL}t 
zCv6kZ)ne+?-`|DgxHcMxRIaNMD(mDmvM{|`>G~;h^fDgeOS_BCO5s2{Xv5!;Y z?VF>t;YR8oI)~a8N#9^=8iaAE#NOLn7iN>+VzZtjH^eSdoxYy)T!*-~OcDdNclYDV zKtDETW0UN}LAw(beR%~cV3@r&BrPjzgop*^8<4tQIDaP1RMHlZG&v?yc|#8-9*by9 zlo;c9N`$9o{2lf&zgmUKnw^_WGuSC>hGnewj?OM3oHXOexg)!fT7_X~sPlmg_b$==ikCXKjsir;w8xV6lY()Cp z=o{s_RIs7xp|BqjZ5T7J^FKCn9u3&~EnE?!`#b&sfDFKO9r`yGrJpuOMU{+;ApjUNbQ*ZX>c0 z#M$m=$+Y}vE|ot<^VeFI#xb|3W}Q}P z+EtGBjVib3!`S^tXb})}Fg7|%*TTqd0B>u+=Iz_}B0Q*?#)+{p9J?R8X|fj(8u#M) zboSy!j4~jO*aO=jT52G!7#a?VJ;#GCKroi+C&9i+mDK4vAH~-G9{rS1Rl>PVWVhDT zg;TuQxOHe12l&whDaM|6YaoKuCIry&+d14|4tBsnN_x$?vX3B0Mr>8nB$x;Z+YKbM z60Poa62hxQ#TuJhz&iuN23)s$XuuFO2kh$=^Rn=nkawL(8|Wx3aVfr*QIL?AAott* zemn|dNI}UyA*it{!w&|=7G`g4r#?;6+3WK+BDC|?JTdgQI?^1WZW|Fy^)l`?NT3yF zNKlFI?6+kXbxE%T0Z3h8s}Fq02)NYy&=M;cGpx3n5Pe2`R$qv03TAj);1!e3=GKFtBXL{n za(x=WCkGJ<_p>%Ypwi#b{3}u&9tFOLAO917in$D?fwqN}hHH>uT`xtP!_BzRE- z(g|nNbB~kXC(6z@^WUSs96fyeU~$iH`w_oSynIxAwBH*t4fah0XFK^u%~ZVa7%0(; z-#qQ26o`s)ilY5(eM&#HA9gXxiN|90J;u|K;R0J9h;V5-UH|1<>HIS*>HO7~@HK7b zYDUhEY3@-`i{Hh0QDgD_W!H=Q)}7zB1^Mv1bj}yej>9WV4SRW>!z~s@9exhq{KG#> zw?6$S-M#+vw22fPMxSn|KY<~dB0KZXH`28iUnQ*TDG)ObUr20ZHul+gS~~mE8{hi9 z^x-={W0QRZ!MK%feE0#TK2zxm%tQxBJs37N6Zn#}u{fEkw^8{)AeY0AP&Mny|GaIsC)l`-Ak+&G+z67>0;m0Xu@c&<3mk z)A}kjBZ)uNQ=hsJMGWHPbsA~>GFf1kft{MR!E~jr4XF{?tl9W)T>pT+?S_n@8!+YtYcLiQNGMMA_oPe zo9RTlc)klJs0RNQi6tVkEYFZr;z2l$cXkfKaEzzwUWk+m!aK5FLJduSl%`f!kx}4Q`R~CewaSJb2EMR>4#}z^gcv^VfY$T zcW+-hcjf{TVUp!dJ_-lq7B=xUOt&`hA(6m)`RR-4n-`FXVm@$)uSh$IiGFhZPI^o^ z4eV5|q1z82z%aE~#esAc$&>yk+a;ny3Au0*L=96eco;rU;5r7Mj1QiGCxGblzXPs( zNx_3ZggH)fG_|-A*E1t5@o3#@9P;L*34moP>6ZZ3RzJh51sxwg7zR%5TX+O_9Zrj-|B&;YugADnkraZ_|1&pevp>2 zCEkLFv@B#7;Tscz8JK*1A`m-L7qbq&sgenyU$$v?V}*oIIr3C{S38X5G)|B+1oyz4 ztEnP&H9|nrg01ZZ80bY}yn~gAU4Q7_gnh;SVNJUGXd>O6#CHqle|xJrg^tW>a%3PL zBWWHT8>4-gDN%pq{eakf33c|JmP1Y_%xFI|{@rP;jMsXN^P7p`C?rlfcU z4aFXQGL5wd>DfX21@O_(?%ekS4od{K=gxUyE~7tz>mUd^#$V~MjHbdIlY^=?ot~Y; zM~dHkgx7-E3C0Zk+JJwJzMZa7llob{W=aj1PigvO?hnJS3U>@R?Zhk&Axb;r@$n}& 
zF=fgE850DKkB_IPFP$N(*Az_WU7|gWr>j?=OZ|O)ybm)1E|<8rz2PDzlgJp`Pd~1g zGNkGf)6y#k2OES;A5JR^NU^8Tmi0WHo;`gof_gai0b8{92+G()^BCoCpI|zoPoDp- zBi1w}@n)2#pfoHjpowJom9nd$koa~SL2AY|06F-Pnz}DeFz=^BvZ0Xw7&Z z`C+Wr!v<%IJTF95drVZKbqtOwfDO&Sw!jGz)%*A_Y_SJd60NI|z(5@`p{^Cm++{sh zQYU9vkl^T!(CEg6#2r=T^;*n5S&J=(WZfhkhGnjZEs z2E$J*Vzip#nixsxx46OnO`T5-bt6!y!}J&JhD-!ngDw0aq#tTTuJhancOQf{^FA=RfpDbhJ|?+6S16xg05B1(qA>F^H--4j00FUIIRpGR zdYfxijYCYNZH$j!oVy|}T zO&ebYfhRIE;~uz4%9-|Mt35I8i?;b#8FxlPDNW)>`_5yGYh zgscS8diIF(zPwd|DSiS}j6iDt%+(m5BRpWMz>Zvv^aK8(HQ0ESVm+TvX%bkS3gY`FF&0wK8-58N&;HvIuQ;k;I^Lh z8q)pGki^{jcG#&u^~_7@+=UA)Zb>aAR!GM&$vMQ8ZEu>QuH_AEamfr?hUr2_hjG?1fP!J! zSI{z?!qjR%{q^_%zx2V4Pch$`qaP4ltTD|5 zmaz$)8J|eEaROG#tcJz_P6~s}v|Wk0Ty-tZj}3g|=TM8OOBrhLv#hbP(NSzu39Sey zm{@2BpN%DhZw<&|;s` zA=N!2v<1SnbEDu7No_^fsZ_CK=uy@J+F+fu4_R0>nAEHgS;how>~cYDD*&6Gokk1U zl{(Of-Zyk7B^#nLK{J%$k5q|vd3$ae!m&IJke8+JnP<}2^$%hEVYo``rGQbHE9X?$ z6<+-v&j3jK%vnf#(Jt)Lx5EQWEn#{F!9~14xM!IT2q3gVYG3w&M<2;EiogX?2?sX3a##wt!JNRJGhp3P-s9-Gi(H0B!jw8D+riq*Q;vi^Q zd`xrp(8>K00{F=bR@Gzy0W>FSVe* zkWrxAcQtF?BuGGvALHSE8|a2-X?7HJxQD42CbZ4?Zqy-ws|DAo9SLnPb1rjqfIzN> zb*<^@6b$Pe4%LffW3TC=&10;){SvK6otnQ^vac6cNau^Bby$(h}{+#+3@sV01 zdfzm3%OqiTHyRME)v*tsI+dEy<~PkEG{8ZB7gJy@AF5=`r|Bbpn`M|)?Spen1dG=r z>)E7qYQSkT9WF;8xJ|#x%Xd;Yep+Yo`P?H)*#aiUX0%>`fiP-QE%ei?UwJt_`}FhN zcYh|j;TFC_AaSP~f#s`^DJ~qigz*J=Eow zCr3zDYoX9d;Zt5)j`@L1o$v7@=73k?;D>mVBTsBTk2O}h!lXTUtMth6MGItZIRH*u z#u4v$J^y^-Z|M;R?}^LNunb_u4d3>2^0&kz4rMI3M$9zAUQj@^5iC0Gx!dugm(V_b zk5ahrCCiVqqfQ-PFLffS%!}JH@;mt?{{9dDiKSNRlFjv#2O%#e>yKt1^|#bsALL#p z@+B%jVoYZAkL&)6ccZ|fc_1cFdCtt7s}P9SUc(`|t)JIq@YrCY5t1b`75=j%|K>3% zO8S?47fpz9j=ox^Ek}xn>a7fpQhP?{ioJ?4pnj!Gm>PAF)$#ZKv^RbF8!}XX^gfFb z8&3&Akn}dTk+;cOt!nrxAs6eA_<_TCANCWKuQrd*p{>wthDv zIKV#p+;fpV)DU<|Mz<;3jklp0n}mmJtlvysy&a(?SkPpH z4BFdB^M@a`r?W$vS`Y;X>DmsVE@!4!$pF0=63_M)?v37~e$=TIM7~&s8Gu1(Kzh^H z(T}8PGY$4!gg^p$E={Fgh?5MH3|4IDNTkXL!^qT#_!j0KBU#=+a@Q64FMja1Z>7&} z4Wkth5%N!B71(wP10ETuAfnkU1kH5_El5}3&j>mV>;H*;d&Y5LpjVv20n}Q}yWkekL{87~Jb=&y6 
zj-$KGK>^be_hylO@ZdqZdif&P8DGLKJ|25O641)@^{-w{Z@=>?`8IGoWNgDc$jzQM z*C9Mn)vqAME*JgEZ79|pP~c$lg!sZkM#(-&eA)jC=LXZ*49>_19h9OKN1k`b&X!9c zmSgHTlqRRfvLie=$+3$C$;3zLjDe<~nt{kP2@j4->@n1@wn=l9N+i7E;RTqu5#jOoq&fG=B^JTcZr1*&F^ zi|>8Ud`?T$7q}Ob8iLP-2E*{R)C4ZCNkdmIrs3;GJKI15v6TAIt`0SIr3u2my4I{) z;>Nc5T}b&0f=75zJ3#XSHe&x_oE^JL+F!wV7%)N~h#gE_LlO&~FdEPvL}F340I69G zF9zNWi9K`aeKidF8n{zB4&rm+ZV7_VoGxan)-l~@HDT)R>Wusk2CnFABc}lVAN72~ z)QC3exNj~O9l33%K{2LbJY-B1s(MX)?)_zaE{4!BcXV`NqPjq!i;47TVgx1~xY=9A z?jMOXS(|6)C({KYehKpi&yX?Prhl6xrrX?KhGA8Bg2OrjrGrB zv7W%WIy>nKM>GN0@=xhFd0t%m7k+1a!zDO7mVPp`=opIwio(GI^vV6dis(s3 zP*PB&P^^~cwnJM7egHlKRv5n(G=~eAED3k;(&^xAZEBi=hzB;ecuxNU*SFRWGD+p^ z39kJnLZerqxpGmhQtrBCvp-j|hRnjg$6mIBzeuOKkgCBY5b+d;n^Zdn9~Ca#fF?;5J%J3SwBJ$5EH-!%)FP{rV{q90eF#fGH2{Cy!~8)ea+i_AGOc zkZ@*(gs{{jZMRFie18qRv`8N}SSJc#6?(J~e!LNr&vng1jNiTon+o`qh4EsyH4^u{#%J`mh zRT>7qkK16||G7uJ7h`8xr4~k4o!9*|_V7Lulxj>%2H4PaR6;W1ob%aXA(VsEp83kR zQ+MC#bmP}QP0J)**(DUL?c3L}_3lj;!$x}cg|DPeo+(pWK>tKGW~)9&^q+>N4*Vp3 zO5E#Vxb;<-j}MU8O{M2vdL^9SB&?hQeekz2zuBk!ibg^lqT)3RtaI#aa8l|v4|6Ka z*eBcK@Hk#4k7CS<@8VwZJR@>cqC}^XrGZm}=}-UgkJ7Jj1b#e$Q~^8w$yM5lzk>G7 z)%dei)%D{b#fH5=lo&R9b^~C6B=$tQy7uggFaUoC(w|C`laC<~U@q`SP!$e71i@^R ztf&vCtWgjL{uCfvd!pl|FdkAkBC^yGPIqYFO!!93A^lhaR>EGAJ+!@@xaO>>6|_dH zGU@%yACaC8&}f)+8X`Z$K17Ad>=@ECgLH9eaSF*l>VI-W zs9vw7-VSnr?BnOqh0jFKscV>KA*n@b)ZAdKbRZN$3^2i;L`QAH((U0Ffiz{0nEz@5 z{(sKi`^&Q9NYKmj-d9&udy7V+;T=g3An4tYoLSAz?rOBs=p3Chzvu`3OVXZ|j*ez^ zc6PYCn&E&12ZZ;wyU_;i%X{yu^nH;pt6n$2A*I{>>fLv5+Q`Vrh{%Y@FRtIjw*3>n zIiLq^Y=k9bY@9ESg$$oVRmRTokX`g_fNeBZG8W7D-n7(rVbjX^!x3z=fY)%YfTnZJ z2w+A#*`$AyECv@pzk=XoBqnU~DMKtT42=SQxxy5IiVIU>y~4iC+mg<91KqjJN4Nh=4C8cTb#W7UV7~OIocq+*VEZf036Fp-8nT|>htBWjs45MK6E7(`pS zSAdcs@_?{Tw3cl!@@oIx^>wbv)T))W^D+YpF#L$$o}#_P?zE>!OE*1b+_OYl8u5!# zh(_D&Sq>g%??s9>GVm%5_X28$Fau1{}s`j^`dc zsKD+4bK5n<>A!t9oq6zddf>>R^x4f}wEoh5%^cg($)jj?G4;9l=9}r_l`jZ;$|0A~ zI+;X;GW#Pe%Ez{NA8n+Uj&tB?<+-r)vTyE+EYnw?ojdnnJiB!9^I)!|$zH;m-kL^^%?WC(W)n63=5bHBBPZ99a14WYfE 
z2L&siR@TFA{OH;c;PF6Vt|5prcAkztwm43x>Z{~fVP|A0ojLI+yS;;HWqTYNJjQPD z7`_6__~YFGClPXCrtN$6wMP%CA9`PsnLY#N{uKZGZ#y3-n&+zHn4QSMf6P`bHt?Ts z=i)Ya+RzCP5kiEfNDA!IHwAOjGept`l@;sTglq-I3YWDg=(bOw=(@~q^a$$+b3@Ax z$AMf~h+rf_YRKRO*mcSPqK^P z(MTO_2oQ%)^`w=_opk3ngg&1qIwdu9vePiX4$kT~A)I7J{Bl*Io-TxU54;;E)U-56 zJoI48@M~kP-5I_OUqc`v@XR^quBEWjNqiK6VoE1!%GK(liJig@6t2zewzssw8?Pep zvs`ZilMQy8w!!6m1#oGDYniys1&sLtO|fynhc?60{`AI7%{be>0TibG(5F72Fi3B>3hA@#6bDtLub1DVqF;bJRgo(C= z{NY4S-n?K*)Tk)>Vs$bZ5JB>4MzA<>H@*Ikf0rKn=6BPP(~l7yCC5gNHU+|vg@v!G zXIA|OFY`y8`6l`(bKyThBh<9VcB^t_hBFG@cq={l_}5_gpG3Uoa2+%u3QtYDG8inl zY%dQUe<;1!)1N+j|IKvs(g!TgXt^L73L@ioZ^E1`r$?UtW;%QvQ>Es1;0Geh0J4xC zIC3(5_s1RSeIh?yyYL~5<}yqJVN_rJU3&Q3SqSk1Xo4`aQ5XeAyEt^ONGv!otClk* z>hP6tp<=~|1-Xo@W1h!Aki>xA1Di7D)ep-vQ+p6r05`@c;{qc@2*}f^|Hqx_laJm` zmp*!zu&C{+TN`I~XO`Ke*n!!ao`J9fyG^249XoWGs11Z?!xXE#i|~<-`g^CxHvXFCvj7EcbCY+XAk7d$g`?A+vJI)1Dh2f!l;CykhD zFlJ+u5O&V0z(=JE2CIdFy3k(M#R?s`Y~&4hZRWte%h#|sMUavKG>;2Z&;-hq?g`&A=EXNb zd?@rHf9E!M@4PSRZJ2HIAgiOm?izK;(D*D&8EAVjMC7xdnxqtZu~$X~`N%?hpo^IN zZ0U>M#m~?%U7W{mgT*5(mlojtsnUT8b7ET+NDB0+Rmt`z!A4DV`8CD z*TuH_8qD;>*eJ>z?9U0~{J`-+Of8nMf!C2<^9}?_&izn}TL-SXYh=BJfh%n)-@c1= z#c`!Q;bXzeCF<$sfHF-{PMlo7x!=rQe-)`ue`0E-|1Xo02O z3K6T|Yd*&VK?OBJdq0`z@udp3hhk4d6l`8DipxhHrR(Q9{{MuqCPY2%q zqbE`ua9Uy~)t!)oz|Y;7qi0UDxzU?Gzj!IVedTiMgJD`mK%r~VhS1P67G(|zi`bzD z4-}ZaLg?e$cYY2c=o~q6Bwe_85jewWbCR?Snz8#=S8>dDgkwF<0=(rFJg_|L;sFdl z+FaM}fJD6D9q$>7SaS!aM(113x@D0o_ci)XkvhYj~iH@HVA zrQMdT_IC~q^gKZ$6hOMsBtQM^vy9m-lqDah=U;e^U>HZB$)4pQK4Ko#Lu5XbIfoy_ zOlS7T=0A2hWn6n;W2aS^kdeGuB zbWQj+))6=XpNbvUAJ;?2ybECiZXzx95DjU3DPizS^rX6U7dOb*DHNkA2h@o+)H0{! 
zOQzX{oJX-G%v3ViXaB83Z?0auoUUDQN8hy$nSe^uTFY;9$1fp>7cQu|>r_X8kDvENWK_tozvn7>IRLZ(1zatJ(G&2#~ z*^O9ZH^S37<a7g^IU%5V-l?&%9-uq=+eJ|kr!@c%}Cau2AD`3lF zTm3Fdw}?unN^ybIc4qInnJnM-@-J8r+r3wx1w?7p#qZtc;#tUHcHD-Kwm8m}diUX< zv$*i-FsN=_SgaEc_1)k83a6>*^zhTq;~Ym!f9jT*trU@Kw*!@{lv#fNzVE#(WX-uN zWdtICpiGbk?pF!N`N3;1rRnj}bp9zKFAW~0m7d0JF1(Amt1Wb}%r=jP`1pqz2g0c42pG zeGTW}nb`>zziGaw(^Yn?JhieG;@yVtg=bbQ5`TV#MRApRHGo!m4d&yRz&1{)p(QVm~;ri z?eGwqOH2)5GGwYl7+@ikX2=-Xk3!E%o#L21$#_}$C5-{aoPKK98#HGkzdJjjsU?~1 zDpzhp`QS1v#%(HNemQ;643^r|ilMm)$1#Mh=XFs70 zIeZB-pRc;iqw}iHy_!}E2N`@#^`xbSPRzRxDi35#oGA)o!X8)!lOaI)&)B+%D@3|8 z4(rD}#59lj63bKkF`euUG!L9MT=61K+=i>3q#er$qVBxwG~3A7c*cst=@bIaBz^-j zO)|OJnLFc>@DU%y1sP8n{SbU%zzrUtxuRncbjxr4DPS!v*iZYXkJ%)zf8yKUfw z3az&G0Zh;EN1(oX7~SSpY%y44x9F2NDT5Uea{xra9bp{10?^1!>hEMnaR~?OaWuY~ zGh&!SUiwcP=P12e+gmbk%!OYli)#gFs1m-^srlIP#A) z!G{4en=Yc-4IS(ydf|~Y*g3@EVzcQ2fi^z9^C2{jKpyz5)ib7sTmR_t`vB_W)aH&i zG|sTX&VfeeWjhCL+HRTxQ2cMJzI+9=h|Cxi9~{P5sV6 zed}saknE!P)6d9=kc|}HgGp08>pWDIjsVjDu`@OIQRSA3y%rRL?$*1$OM9Ga!XpF< z6*}T16)9MRzyu6hVD9@7Qs9{BqHYcA4=g@(?Dw3SP0UO;;d^$q zZlP|2tO&HI4_pYiWTGM(2%Lbax4siELWrPAK8gf;33(Ff^UIte5Iz#eA74d!fBAz? 
z47T6a70>*$f9?IEtTJI;2s~9e%$dlS+pwQ!|IU|DXbeOT2OJp#cMFM7DZ)3)m{WMB zfcFX(N^AFAIeT zL0Jg0ge{PR0`CLyUY`9MkQoL8!8iumK3clS?-lU(#`!~MJSo+lG_YN)HgN~W^U`~- z!hkNNC!YTi35#iy61jc4w!YU3x2(83>i;E~kL!A>6`NZjP4Y8{v2aHGYU^f3p{EwQEd z=u_Xo=~e9+nlz?b3j}+&wm&`b-1kyP&tQ5Bfny$%AAJX|e{n616J6(#$DZWehI0^5 zh`QU92rrry1QBH+h@A;EgH>GuH}a_N!y%7rz{XL?KdV4f$<0@cD?k!1Ibr@@N6YCt z+DRPvQx9?)@|_#lT(Xz|k46NHO&IFcomn)OQ)v2)Ox6QoGejHbKd>llY|gUSVV}*Z zna-m+i5EiI?vAEnICie1X=l+kRHJeHr-*c7DC83-PNro9x{p46mqmUiJ@xF9>4T3r zxDQQ#-+>Oc%LmxHClu{C0*M+jyFy#%7K~MHWGiig=ZS(f1hL!2&YX~qL_Ruo*a%F( zkFjWiShesuiX-LS8^gg+)*ub;~OZn6e5*tx$2b{&oVu1HPw_Td{!~s_@mZ53laeVX6B!!_3d~<$OeWXYH zj8F727fHDY5wxQfoCzTW;KVk%HB#t4E=SUP{`f94vUw+C#oZcH@F)7hP55O159|2g z!w=FkPd}Mn{_U&Ko<_8&gs{Zipq29xWI`W5cZe{hD>yl_^MLTENlgQ%sMgkI^_MWT z+}!8=rj$1NPR7svX^)>_;trT+EVOY($$>O?^>)0EM(!68G~C5r7zvsJ-w1tJp}*S4 z8}3&DqX90(oyr}U4$TW(1N4=!PFtYxsDrZ)YS3(JUa-MVj4-ynDz-=^gqR;|oKy^> zH%)9}6eRPQBP_CO2epe9dWe|EGHo8%uLB9s1w9#X0KNIP1} zT{M@*xDQh%N|Bk?q(fXNVW)VM=N7> z^vGd$uogJm<)cYH2Mu3W;>_qI(QN1qY#Ou^X-%;zPqWTCA+ z;9Lie@xa`2VywU!R7c=0w3f9-Gbib$q1|!6Ls5v;him?+#x%$MCVmVGP|XO>!akHg z;Kc&Uvl&eD>WIo#$AOGZ?Dn)`ioJ~=mpfTa)Ngz6s$mDfd2bkH8bKfA6B;lRZa}%U zNHCMdIs98tuo(=f8yK_PgyVi3f(pHH4Uk6ziz!pf074~$!~CK>%mKS?8sNaRW=g>S zQU4k~5k_0vqMxWjXj5Z4gy~!xFk6PUw&Od|jHSjJ?GR**C~piB(STw{S_Y(pM-*=Q zWE$?gp0(S4iJ3Zl$C!|LS6b&)>312!n1`WJ5+!))gWO&k@0~xmN{0L^|23_7;^JF` zRKAjE)*`0SZkRI$S1GSTEKO~`Sm4d*_76a-uSkNRHSet#3B6KxZbSY--z*_Mb=Xn% z5CU@qcrLS5<(v1r6BCY)lhOU?NlAiI?R0X=@3YyXN;s90yFAmWj}ysz|v z_6r-D9p9EH4EYK;#Z4uPebrGXn(g1{8}|6Ld6j$J32lP|+pL!v4J@z@_)<{xt~}`Fiq9aDD5cU zl|uK*FVCqm4+&6k5(yWV&X^V~Znh6On0(-Y+mp1DRf2Td&(5c}U-|oVYV>A$@QD{v z7tY2zOoSQ;h#DQx0$f6Z?jB5EfAPooN*qidzx`_LW;pR&v_F0Sb!>O1(zEPZw4>3l zVZnivXIpjzmIu#0i2qDudi~W`(#XxLbaexUbeiA)gp=&8^!2ZQ4KsuT0UKxtM3!)B z5a9%YZdNd1%O5>aw8PDV=m&M%Z}G&=hr^@~9KQ-v8RJxdCX9r=-(-oral;&ekq-Q! 
ztsMiW5Qv#S*toj=?;%(9*laH14>8W}OKV3z5h>6<;3zhRBd3SStz#~7`Rco9+Kv%F z`Vp9a&eQ-m05RH4ZIN--T&G=(|2$?~mq0973^8)YoUh5DXyb2VB{@ zhA72+2?mQ1f;OlHr4TIscGG*eM$(@=c?Ka%Vr@S>plY2y6N(ZIq>Ec8%$8an2Q%A? zs_A#ffQ3)Sk_iyIKGf^;n7`mfVM^2@=Q*13MVtxr056z^e6rb%jGKrOehiGF%%cmc z<;dj5e5N1%_G7t$V9qbfipXuWMS)=pEu0LMrz!`IlCoU3#ZMMOa=$Dm(;sLBiMLV~cF5alY#=S2+a`sR+ z!t)h;jMj;2Rs*xvHZVxcdHjqN2rv(9KoOv?mO(HqgOd*9-42QYqpXPsEA&?fTp3`U zcy)-}k>RGh5#AK$$DPOR=FQZK57UJ&dNFN$EnT?rF-*@4Qy83uaouGXafuU(*H&gQ zG2Vs7;1tgMUc-Mwr*B4?Bcjf6bv`@X@(Z3 zWrHW0{)C-7?Fs7vaff@u(TYEgzVpKN`cc?WJA3lf$zbwT7#%rwgz)g3G#rW&`os`~ z@IXIfZdK;GLUJ+lv}93VCfD?ayLBg_jh!%kS`-=viSw_vS3j2yl$|OV8l}qsES&9W zYlQ4X_-oM=7kq4inH9%ndQ}p&vb%JIvv2wibf?cR6TbD%b=rF@V6{qnw_zR|8+S2p z>PLv^W?XSdhu&^15m4pQD7&=?+KfUgK`q+R+8bWn(90XF|MCtpr5--%$>6=5aV?CE zq--4pO*3mz9k^d}wkFL%+ZCcc4ZZ^*j+%nLPhe6wIX=xgN}UL1_3n7IbEXHda*Rx8 z*{~C7jp%2KDC-)KQjF}Ch#XbNcx&3UJd2Ql8Rr0SZK`QTxrR9}ii9P`juHaTV@+@@ zz_~FTvkxP53oCa|4Eby%EcpRX*^ch;(sLnHK8u6)cf+~K1u_tIV?IXc!T zTNrpj5+fz?&PXAc7@hbZcNOG_$JKv$9t2-xu#>zKI4-d6F4Ul9@>%s|Re>@PAM2F> zbPCp-i_LcF!AN7aW9OP$B44ZLQ=*`eQq>6Yj?FefEc>xs5b}C;8rT zTyU!{`(Z87cW&al>V3e-_ckgnVMp8fZ9TcImTwyK`#%3Qc`iFI*ba#Q;<+#UWv2VC zfNj*m3({rt8q3Ia#3Dn5B0z3OrGF&ftJ(Lq1h{GT6K#d5nz(yAUHR;@)PY^TFuh-> zy00mJ#Lr5{MN*jN=$4Pnsiww{6N0v@U6K)!hDlKvAn(X2kQXT)q8!kT z@RZ;@li}>cPZ8x}D82p0t7#O1s(BL~N;j?&(v{tX7k~6~2+Gfj|1-U{GLB0qYpDs?4PTU5knBAVThVj*8t)CU|tQ)xJ9_yEg~|_ zPibznK)agJm=mj-=rWpSXx2vencUcAb7n!W&SOwYKU33_n2Vr+ff+TV>^e+E8)0o* zI2|=LEV9Gk3D(#~0*iVG)hcEn14Mr~eey&wCJHI;@SS@I9}>1b$2s>w=4@sTzXw9d zEnu3`|L7wew1a~*nwr7>p43bGYG4#BYl>5FdyZiag|21}V!n*_v6s+=jfC{A<@D}4 z4sYvjGiEuFC@?87vf3mjOVc=qJ^9SfBVPI%0*qU)wGBkOK&!1*rICZ7ZhUk(4Uf=1 zm;wVUc&c>XjYP1-MtGM#4I%tB!k|sEuq+cn!v(uCJQ50SnH@2SxQBRn!s`;BJK&L_ zN!?QRfTtOw-f-WIS(S{FIBcD9C>dp{nGE}D=B~^KWIg7W=`_=EUIxJh*Yh0J^di$( z5?1ECxRK|K%$Dh5u281mGVTf&L7PcwYDtAO#F_0C-Q!Qpx(cdgg$x)*UNxT$E`F7F zk*V^Bk}ZJ%U_hV0GIAYx!Okypaw&Q-y|9Q3OBW>H??5|*ftv1B6gcykO^VMOoDTlU zBl4u zt>sQVb>Ww#X<|EJn2kHXyvR8t*q*PkrPe?IfsNy|pSdZmYGNm$2UyKqypXn>gGAJ_ 
ztD0+U5W`;qwGn@|78q@XiADS|G`Vm$M$J0i30$Cw++gO1dBedYE$Q2>&!-1RkEQO5 zPapvOr*!*^w`2{%ugH^ZTVSg6F?!(G8E6Wl%}z&&&f+-8IH@uBDr;TF1fB;|#XGgW zd_4HEg;ej?&w zq|Z2L`Q>t+8ObVy)~JTEdG9=SJ$7zsUfm31*261@BLl{&^2Hlz>@B}Au+qO1+V$g)IwIIY&w>#3#J9lo|apX3#mO_q<7u4EYGz# zw&GzkrtGbp`|U&iBr3*p+PgruM$NQUJqVWUmr@J63PuzL5bFt>q7Ad_9wYXvWCFVaeBRe>G$5%&WcL_&Ox{cw7 z)N=qq(!eb$D)`9f1|HaA{sZE`$-5{b2H1sfhBuaft|Lza0>8T_@+0feIU~5a*0>|m z$$Z&jZmm(*3eRmz8_(LAe@)@r1bu>^h(IZ{THe6GJOCoresMYuMwJ>+1}zDEEPQ3q zXWAA53w^cC;*?k91$IC*BZCB-|6KmDgV9eP#FdeVXE{-R=7cT3t%y(llk@!Ezoz4I z>nPupTCKD4k{6}U@}2Q0Be7`ZMDE5maLR2)3nHsiUk8%|9M}atjs?y(<)-Co%b{=wC1bW=D$Xyp$8s%BAt8u>0o#rOPL}~1A@tB;&%<6OyB+~2M6JZ{0WZN zYfN+tS)7?lFa7dAq;I_NgY*DSWo}(!*|z zeK!2y3|hc&{zaQ~c4VCNaA?jngydWxzbO5ZCR(g3OzF6>o}OTqao_A=dd<} z`^`nR%ONs15oqQK&wKFrq4epeS76+l)AL{dCJxvGFm^b)&X2LhzMXm+PZEXXEGOrl zgo&7hNJI1?QTik_qS@VmfZqAyN_y{|chT-FP#?k`+F%{$^i`;5(XK&QI@;5g{`~RB z)76=k^vY+%YiEwgUK^!EYAy3C=_&IoMu!2Av5=OyP$_ILaHhy2TgG)T0L@#OX^Y(e z=MMx|CRqQ9(4ZsDIfj<$Co}ziL)NQ4;(UxYY04GCJwHun7-MnIGWY0X#tsY5ZL4Gq z8sdsub=}h^o?A=U-NWR_jK!R&q%Ub*yz@-0DyZr)FIdEdn>eGQ$1C@RcGyQMEN!c5#@?-MEw;i%bGC%w+ROI+ z(IW>@C~yKaOj7tHMB@W=vGuP5eIs*cZgL^CimeE#i)i|raFF@l@l)v+4?UU=;LN{( z06!0NrSRJdPPed_M*t7|1QU7yB~?o=VLZX9yD$k$;NKV;#}6-EPgCPF>BIvZh+kDJ%R8;<3xYfh6D@9mgGXx7^sjT?$q42uT&5QU002M$NklFo2 zK?+L7_;j>m+JHl7cl!f`xIC7|Mqto6Q)UaHX>o&{Kw|RO6MA$CzaSm8^J%v1>IuUb zcCkhnW|}Q-%0PG(tjIXR1Ms^Bp}`0-& zsqMwYvQEDa7!*U^UcPz-g@wW>JNIyB9=a#P7VAgMA^0)U`nwum_V9-~3Qj3#$Rul) zIfdi;5Ifq216Sie&hDAYER|MSNXYZoyk3^)JYNk{IOhp?Tba*dt~jP9Sl^%j`LF+{ zP|Ub<^67=kq0MhbI5WaqL}r4HH&TzZ!7#5nb%*mkw4)iEl=lgSj)GUAL$*0iz@|J8 zh&z7hXu7(7BX*%%;d?G#yP6Je_ob8P4invO2=l?=z>g&kDI2?s?-9zIEf_|u@fiuu zMnMr}5P)vhfTM>G(gw}g`q+tCg5KtcUDnSgMaCuIMbyLmkY4l8 z-5Q@MZ*Q({gcL;SyJ2_mp-Y8_&Yn$Mmqt~>>b%{sfpb9-m6d``+7>Cs~#QNF; z{u_2Y1X<}O>y+>kPCDm%0`-bQ`aHf?&_M_rJAQ4f{a7L}7j3HoPwd3;Tm0L^;zILH zFV{PH9(USzP*xP!xkH=b5A3Ly7rf8wnRCyNCg2?OmlvTj`Jz->=p2fH%~%nS@A*T5 zGSPe<6P9nuO-6`&QpXRUM;@6j;mEbuNOMt_%vlj6d{@#$Yaoc`0|unYW$CcQ{0;Ld 
zmvQYnMTixMmJ=|SMVnmYqdf0rB5XjuudZjWPd2BjCq6H*Ds8GXFWTt)QcgU$?@zol zgK*=WWl4Op5yDC8lGdL$E zlJORfwwFG93mf>6^wig%$9(Bn?AJQ&3(Y$4H}DV0C^)H7ip`s&NSO0zg3?xN*d zTbN5P|LQ*@5KN}$zx8cQcd}MXQw0xKa@+m67rq@1Mz6l|a+*MsCBBI-pI^C5Z0)J^ z2!|UPOJ3Nzm8Z@729Yb=M%TPaL1-7ILhVEVi$02$!nMH9JNsOHWq4)y`urZ~niPZ< zTAH3PEz=xYhgCkcXzSWpaO*gfYa6ZQB(|@e-2%>;`L5vpJ!WURC^Z$qQE z%1PK8F!e4{5&%t+jJ(uPhmg|HnP%{5SX)KdKy#(W(8x^o@$I+XMp)S;_BFP#9AY;= z{{>Cce8R{pIGmy7-QruJb7p-mUFM|D24K{I*8g~SKL_-{B%)0hpnLV|Jfr~L( z`QG1tv6oVIag%XbljHysAW>466x2pjVr%qAv?Ftd%O z-g$zx8G^Aw1%f^J9Z{{U&u+)$$!#k2xKPDa)y(ovdKvG**-%C>Z<^7{k6}(UNyM;0 zLSJg0Fpn@DG@pXh^0@%IBQuB6_gi1Cjgoka(5Vm8QaQ+|%%MJCg%e9mKfpai6WF8QhuG|?r^&8F^N`k3augYIhY^n2e0NwufwFy!weAULOjIGuC52)qC-bg1E*fE zO^?w|p5eb{2+#op2v0ViX9rJzC(Eei3>CGwpiO+V<(5|hb(!i_* zzc97Fayh3Iwwf91@4WaeLT_`X1@!6C=T{J@Ie{8e+IF>>aOO+%1kFIva{$50b7_na z)dF2`hh_ndzf7X0TI~o^!cB%=GiN{+bkOz;A!uH_dKu;r`UkDojII;s>aK2f+z%bY zycKgVh0R(b)?pUgD8918rIY&v_$M8hA{4TLBTjI>N+UaJ?#|3{{zT9>=-nnJWuESC z1UN7M#wTb(WesDDX_d=Xei^X$wmk5?_z($sdhhf6cK@&Als{#+#?teZ>7!&>Sc8w? 
zV*5}FLWFw?)CmoNJ;>>)PvuKi6!BcVi8mR~41^#vy#o$Pd-RnsQk>^#=pSWx^L*As z7BXys$|)5v;(g#6b3_3|0fULhC#Mm2J?xRqxL8kITPMd?h={h9raK7!zqDX8TCjreXB_rM9m2u6reL4|tZGPMs;d(} z5op08X3+XqH4g$S=DQ|*o`PP3$*us~L}+3ei*XD86B;b83IFOV)-~`p@SRuIYZ~#y zOX1vgGtUwp++To+EYg82h~TrhDwUbXck!KHM4o$_Qpr>KVpyZhNMqsLxPa|FF>Yx`V{Fi)8aEf4w49(x8U|m+si|-k+u|HNuo1P zEjDm!^kjxJeDcducFD5$?dxcFeiC(%H;6RvxXt6~P1RNK!}Mj`t6tx8yPq}p&zICd z#HHT!?LYlDsqWW*Px#dj(EcI3Fp)Gnk}f+Rcd(J38o8NXc?q);A{L!_^cfg;4kKd4 z3xQBAgv%C=yMu?1r=S1Jza*UNYw6-AA2If9>BCTd^v>&Xa`m^q_d`y!Mhj2bE{<*s zD;zxX*b``hI?}Iy{g0US;BZa*WP+v;KHeu9$Q&nrKKS5+Ov?5kLfQjvqp@3t7_<FyEq-$;E0G49n3%tRM`iBZ@8Q%8I9tXiM%2m@Azq-fNUiZeloPT>A+FWhge9r zFTRzojNeS%IMI2sriW!|d$~g!%(Dg4ZZwTXgr+7!J*v^z*u)$QVll+P$Y(#ztGE$G$pDHghxG=N`xJv$mYL~156ipH<8g%G~l`hNhKrB_hs3=c%B99GoOH!a z2)U+`!8A$mD-4Q6xGaPcV&n7Y6K`!&o^NHL(y>y+w4RJAdEQmi2p5r3cF7-U1EV8b z1;n=r>X%U#CXR)gG<_P(W7IH1i*6%0AL#3eP@C%O zoF5900XOP#N30)4tc}CF=3rE3U@D)$S@krG^ma4aOhQI36E5@4=y-bQ@y9TW?I2Vm z{zKSeceDF&%+g<8$*RvntcpMw` z%fk~Kh=*AdCW7JE4jtQ7lganaHbONHuA1D%c?Co(Yi@iDmq9eGIJ#~mf(`%*_QT@* zfdRD69_Xeyk9g$H0SZj!KD#3_4eZ)A;LOfrW};+D-y{T5g$c}~*4h1wfB@8e=+Iyq zJc#cXaBU}Ofee^uA!M6*84JdU1=^1$AdZ9hP;kO;DOZdl;oM!RMu2KDXWCg?UV) z2HxL(bb?oyuuSntMofi(J07JT%P?=Arz@1NkD74FiUS1?+pJTsLa}BrEzmoo;i$-I zqE9VwSL68MeStu&SyGt)F;6U`1{ip#SRZuw{Q2|g>_d;IqtL1u)*j8gG@EJbXorsL zUY<>O?4vs<9?Ca5HNlDD_(q<&$X@V2{fu@i_ma4q+cp zD^Kzlm=if2r~9uc*+Ju#aF@B2A@>R)nQ^lnfIRT0xd(w*=HH5;wT8Ma!bS)-rte0QrFN#lH4<#5wrn!yF} zQ{m1l<^VL!fIdny(nkNSE~B)V#vGA;?9`(G=o!MtWjCGQVP|doi!_E$qY!a->Huq( zj@Jqky?woDVrd%Qwu^PO7akTor)aGPI#~nt>zKycK=UYZuLhtrGlp&OIt?g9hFO#6 z2;Q&>zSg!$yEEPg{D8glVSnAo>zjin+?|tUyiHf;m%>i&;d-n{M+RK=f z5P1vritl{8_iR6pch$&I?#uUmuT4(5mTB?AKQ1*sidkHx-g2LZMu3voQ3j#dN6fHA9G*xVwwNeTfDOzHD}Il0!HHMdx#Fpq7`L{j3-hw@53 zCzS8+zrydKG^*+)srT|$J^s>dz~?LSM}s1DZg0NjKK|1;1{j?+c6x*9+dupB)Q=P7 zhi|-0yS48_^JCPe*qwmmN8`7KIl+fSWEva3m7e~_^QpW45M~5y>!~r20c65iIR3ur z>)-x90>zQ^yWhTq3D78EMc3nmYr{Ey>-#^#C*mZCkzvyS0no(f^x20}8x!=mFaIV^ 
z*DavZQStKUpCd@j;16(~MOJ5R1s5+g{R@Qo(|Ha8(}c!fcq+Uy613aCXDQv(Wh}if zCR_ZI9W`Z(0+2@nhS^;n`s=*>tXmGdf;qo zCSM(#i8{A|UL2uY@m)Bkz57!7_%jG!tJ)cu7Z9QmzXngwob~`44wKtfbH|C#>l_F- zh1p04V?NC(s*5umK*oYSLBxtPn1X!osUIU)cGI86bnM7OXoNYS4m@sHVyn8gn;olR zgtaNc6y8e6aG1rc5vAmEI>h4o4V=9@*!8${>n?q)i<9afKR$pd6WhB86m>j1*2`A# zKr8XlN7D%N%sGKxJYb$h?ihynjN-DM#++@+=>Uha0f z^P%a5T1qvYo{`|?;^KP<6wCsGFYs}+qdy(s!$l&;P_r%JkbgS@ z8f6@Z;sTI8^Y+r(pR0Wm?)0 zT0n5eE%Te$u($r;hhb+eb0`(9!ErZ#o7OJDKq+R*DDuw$3Tonng;%bhH&key705(8 zO51!<+GYwF9nJs5LFdr{7U7%haOKQaYI!o5oDR*7+B9Y|?cj4?{{RPH4iOfy5l2iG zOaw=x^0ha2r!)A89qwjljD`AW|7tpoU~-zYkDNb4cw(4D#!ZbV?#wXoXmerI)c$E! zfRPAw@1&0}-Gr(8T^b%9PS?g2NNIy@k%POxv_xxRyrAz7hcJ2N+75;>(7 zrgZSs(?n?_gfio&0CAw7$U;a6&1fsb0Tza~r4^GJxYpJ-=8mSO1s?&ac-K{_PPR#YrYo`t;c)I@!PcdTSWqMPvuQ? z!ucWovX%pIcU-nGJ^SM7P3YAer?GBt*$+QLxLm`xqXne^6P6u!{WesdZQ+}8lz=aNjM);x#ul`u2W6ek z@qq`>Gf)V+hgl#r>d>*1sU5oKdSbM+)lrm)@F@+>Nnrs@g|$iwm1M3%D#sMiyqsr_ zhkX?uwo}>?J00Y8TpSB8%k%PgEfx?`)@YQ7;%Dz7VH~;6EUP5hd*zLl*qOZx#ziV+ zHO3*X!Sg$`ehi|GGN~UmD*Oy$DnZ5YJSJ33Wbdi0w4)$fsl$$VF=hD+L9rfc?9$1G z$+dtvjKC7nw2@u|S=!Oc$moa+mKP;3z~ru!mdlKSc@j*aZM4%~m2ZCsBIc*@+(pq+ zB(s&m0*!MWJfa-)n>XL)`eR#|2X;T_gJ3eCdg7P*;+ul3KBks8k=$>f6=mkWnJ6Mf zL}Dj=B(3_+4`S>q@08GG)^mtT$KLkXAnV*8gYvu5&?r+IZqo*gtSmm17vJO%FnfK= z@67|x;Y)&!w55l?{#}^)?)2(E{*qYDiH_`>k3W-+ zoqd$NIE7)yEF@h>YayhyFe<0cJ(dm}I+|X8`Pb<(k*BmnzH{qZ`mcXGm%jPp_tQfV z=r~{x=CVoO4Dt;ftW7`p*-z64@4S;PeDWzU%o++O+346vXsgb$BXRQ7F^Dc&9ALJK zhQ1cf|0)i~%ybAq9|XpB!vHYg;%2}nh?+3XV_v{i;F#e}zP&5MzWj{2hY3J!*Maon zpZrB?8t5ey>ogIQ#+m3G?nLATHH{0LA7KQLEePd0OwcxSNZ2n!h^D9S;&3^Q8B+&` zkyCtAItQ`t%-E>P=D%%pt@*YC#2vE zj?{D}T}Asp%Occ^)36MQanoDs`-w|Fhz6AHcgz$TiEgul(B{rVb1P>_q;|}purH_V zRkX*NCmlO{G9Beijy1Gmw>iaj8m(|W9cm={QB!>nPNry<5jN)8L0KVs%nVMS9_BPc zTNbbfK6i8=ef@!Z;2N4&V^qMp{6L>f@)*DjD}G>l!N5#5|&W+83HLa>Sy<`^?bTwp^a`w=hsKM|lv0ow@?c zC@arld&)ch`dcllI4z9~CrK;g;w^u;_rraD8n0q*TSo|2e)EY%BZx?#d#;l<$y~Ou zGx(SP=3isZF_3=tlOLwHKl%_Dum)JDeP&*0vS zZKe)RUY}%q=Fv_b$I* 
zf!a{C)zd^}o5lpjC@~G}x=gYIQb*Y4fxiB9h?Aimt4$b~5SoFTcod;y9e3*5RLB4q z91roqon7%+#zK2`-vMoV28|_;^PP4|J5_#d1)o6wNmtJp$pmI|EQ2qs#Ex+e%e?RJ z5a-*N&n{7qc&UI8bAWb~w!~Nm{=mENTmiq}jcv_4gkl!?Vd8|jg0K;aY>(@^>!`{- zckRL#0;P;X!aAmtHb6hERqlgNphD-g&fxSO<(8%*8|n5Xc6f(xB@QmmrXD<3A(NGv zlpk||aUl;U1=0qDQH0CiGOsCr zlrS{9LTX=V&}XLzeF~%6#^I|G+L*L;z{@5J2Vo!ooLdfzS1>$|hv4KbAMd?f2lpo6XmARp!p^mvnXD9qaFgF8N3m@VdAJK6(YIIlwS1RV&IH}M?k)`QoY|7pe+ zV=4@sH_VOIT25}?#9Dzqs0^xM*J0@J!F1^OVa~)FicMJ0PPs+ov5ziagFX@t7zLj@ z*^Y~2wZSfCExSmA`1KkvWQnsJYK-PYo9t^lCU7;o-3W;uREe4J!NckN)6cN}wxmhU z4(aVBnj`~#H1zSv1^Ms9+A16Q}GVmWex~C6$eHt)3nZX3i{zB&b{*m=01K4 zYv;NLWe7gof-j5sn=5={tWDY$Zzb6cttIS+&f zjA5A-d3_$;uWnNyNu^$%OB@_P2UZnx=JB?5w!Y4}4iT3{9 z+unFwDaZThE8n&(!XGk|YwFuEReCnIXJ38kzl2?I&Idt*nFs<;dKaEWwq4(P{S^)t z8;7{Q$RRU^34~pz!D<<{px)wHQ%g_!!O#91oO6$-mtXpO%o1i{J}1(z|KUGV#%y}# z`4^ca+VNqN%OX+BWHjvF3om>JbDDwl?%Qv`7~|E%C0^;|dhydw(WZ~52OoM60?Pq) zE-t z$I~~z`Q!BFJ1@hiz`!(XH`$I11ChcfLeqm_>CY& zY#42V(=g}nU{iepVdOXqCc?(@CBkoVKqH!cH8LZ!W5m#2OwaT^MU1x2an`k3!*S5f{qSHyYaKK(io7Mo#YYY+^2MGvS*8sHUq z`KB0PXtX8kM+1+h)sC)F-Uw!NXAr8p2vs_VX`J;+cQktv?%OcZGvKQS1~w9kd>r3} zP4cRh8z#*n5yF1(=vjPxh#tq-^O*MTrc;CcFqK4GLD*ep2LNcM%i{~_rB5%WVT4I( zP&AAX%+g^Gt5kcujY*-}N$K<^LOzXwDMMMr?|SBUgyN0f@tpO~v5J1nTmh$b7>Kd) z8?j?!9v#sYSZ)6rcJm5qnK=O0uX8Zl$zB4A46*ar##~`db*GWo?~TwUqDQ924)Abi zo-nGdsgtNao|;uh1gXB>!PJdHL^x!`1hUK}X=SW~B9aZTah1!la;}QAGTyh+jyE?m!J5{co_X}qP$EsEfSE>1t-qXSg6J#ZrQo^??h1cl+{TW@z+f*z z>MXFpl!y01$Uz9j7QOJFSNzSl@%yX)kbvwe4@|m;KJg=$8~J4NgNC`Lt1NMMN2V5f zPOeNlxC4jXKGJ=tfp-dT;(@pq!VY+se`EE)+R=ov&>f8x)=030Mnm`EFQgy(M`#i* zeifT_Ohy{H!oT|znX#t8I>!S(cwLMZf0Sa+}TQuA=I3^dcq*%m!cb>V4 z5U~vGr*7Ox%>-%KLg6_NFS3F;T{C{7EeF_Ap*l_2)=+|YFk>UQ*3E94J`(aEtF*ll zp-P55cm!%TDwU_Z>xkV?pnwkWMEd06lODWkG|L_E+Y9hS+kzg)6XBSJ0(lhiT72}q z$^&--g`xB-%%&@IBmhO9Gd%{HlpC<{Pt;N=JSKKL^RcP;h(JDbbjju;ub3e5f_o`V zyvS*C3jf~!d;d)?wEWHmSq$T3Q>nJH=v2O=JtaBczV{=6=;VamscF%~p|yj%*u@f_ zaf=DVq>qjV238gDrQ3{vr5h5LfA*>XqO{BJ{riAaln~D5SO0 
z4TH4FF3R*M3^Ia5YnbL}r&_K|aQu}uE%|wdUB>s4vV6xoUOBlHB4{XWK_iK2(>I>| zUV8KGe@aswZ89AiKC5c9gTlaP^Uq{haK9}fczquazuodW}1 z5T-jU8bn6fx{Dvk`}B935Ut11;PzvG&Ekj#eu5}WA7A8YJ+ zw2bax)Fgc{=I7Y;8DUog2ET!YtqtO%7Dls(9nQh%<3Oti&(}sg^A!Y=MI5dT6Z^vH z=Th(ChtmcGas%SI!LEuYfU@}>gl(0DS`(3FB5>&>zM=*V=5Pg=JImFcG_nY2*5tg9 zkqM@TdG6_};~cElM%m9jbS%sruj2GNzLGV>p4u%#F9V~{3hR-M3?8DtZR}()H$bM& z-X6|Jz&;tRT_a|S%jD+ObA>%+934Jm00sj=eiaWLm}zLv1C-M&hojZ5(>hOcMgUmD7w1uY+iPGBvTY8jz@c4yA21w9 zS4PIu0wzMUq%>kn!cOEcF++uDitb0C%%q{~PfOe?FH{0oMK*nmv z5`1znxB>@wANT%_yZp;Bj|eb4)VVVUaFAI?gRhc+_x5K`NF=xAiKl_@6eDcxca+Gx zf`5LSc2B@1f%(KEg_ir%mGUgdbTaOi>rR4WBOTa4D5bODE_8&l*TAKKIluFItzeVQ zL7dCN!#3DA%Zc$uIl@T#2EmoJ-TP>#5uA+jg_42sQVHPs5N&9iThSm>6N-|B*ohj) z$@-)B-h;k2q<{NY{{wZh3rxS}Hy6^oH{PY~9jUt(LD8LJ@VEnCw1*x#oz^~^LkTkn z?FE57$dfR_z3iTyVvT+01N_X`-D+c(3L_bmA-2)o8nBwRFf`QUlMc4BBhlB_fdxRe z46)CU^hjU+?vJxi?!AObfq%RKVWdJl<)7flx%a-~DXjq)#D_Ne^7q<_fm|TFCsPJoIO>bkZs0VL45H{u0Wa>izuJq0R$&-;4|A+{jT~x%5(9i>?ppf zjNGS8sGJzlcpBmaiE=BADRtOJr(SN$eoLdAN;^cFAZ{#JUZ(Xb@GE8QHzP{R?c0AC zdH(suWQA#3+iV~T(8=`OU;MWS5`Ukre)vXc14Lo< z@Dc`*T?o{$MYu3IoL(ms?8xW{=LtNA6B*iK7XpO>;2^YIw7dHbrhoAlf0;gb@9p&R zuYU!BUr!&t^LCnHEB~i|`Y%#H45NoRxfnAKVSKvN*$2;|weL)?z4j_$5J!V4i5Tb* z(KQ6p3!i-k1F5gX*+{2W%EPiW-&>p-CW;gB)~PoZHkU{yXG~vlFSHu>LO0-c@1yt? 
z&tu%}j?9L*i+wa&{AZqgG5zkHS0D)bE3C56Eu~8rK1wIfVCn_qV8oVrv>d&LUS(n0 z1pnJvs7{gHE~@g< zbRZ6uT*Y~e$&Tg*jhT!#VPS&E#zK!BGHvVV=9E_G!dfpOWYKbdJ`IrsPKLAe?2YEu zjxfnG&imODr_%rtQ#RO^UO*sg*F1@Nxk6>dC{NA0cjQTH-E*ThJ1A%_n>j zkuVr@=eGiY3v@6n5aT=-02A~>d=*y;qwTJRv)A1%C>d~OA;me*wDF-lnID-IFBv0o z#d`cU&Hj}qWMN`uthBF=`2?LPxcB88E*$&i2~QUlFWc(&cGTsJBy%apIcJUelj)1{ zzw?;h=C(0Uf|k)%7f{RF7b2MBw7=9H<&dtFAI12GXHiq-o$ZT$(-z^k1_@A*7#e`d z0)B4!McFSF6!IXJZORr8-r$_@@{-+N6= z!w>Z(YD)ob?jIr$3jb9XheS0+U8gThiDMbV@ya> z#-_lZ=12Rbd-g>bgy7)Z1pn4C`PzYbm1)*F*!(iuv^+wniW5{7aEp7&b9{xPF!d7W z#hb{>JHLG|+`PQ^y_acY3@XoUXIW=UKA(AC@(2H5x#V$ODev8N5PRJ@a0kfCVP;Iy7QLatS?N!P&2nPCY(qx>$>>5E;lT~>a58~TgT1zke z>Nn}=$usHO&%F@4$iIF2w`s9<8o&}=2_g4j|H*Xq_La1$ji_V_tgXxmouxe(RufG__shTP%cco%F#XPV`Wn7*?97K*miXd4a_+MB zRpWDvf#7ifXea+;emh-!r|{ea4rof{eAR5v-So&uBI~^W@|_lZo$w@AjS2qP3Ntc;mG2ua|Yj9e=_mm0y9h_NQ{)-=1i5CGVHgP>$)%9*c%B zapX)Nl~X(KG6Ru`+n35HX`=}?tfa8a{KGnA($wGvlEsRd>n^EFyDi3C@f-NKmXXQ- z_gp1??te~S>bobQ*?qvD`$jI3<-TV)I#y21vM4BJhhT^8uqUO~HFc-wzxRvOgJam+ zulysLL89p}W>S-=8_k>}WZd&jiR1fk5DWVj{tn;e6wi}k2d|%l5Z1=j7|}JDpFH#2 zw=i!ym45lR{~_JJek)zQb|uaJ`~NNd{LlYoI(g~=#)Qc#VPfLcA&}G`J)VB>!}j#X z>#uQ$-Q|Fh@59`TNqZBU{CVtD&z?J%jvYG=VS&)Hqq8jo%fdZ3zD;C|K8QGB4QY(b zry5VAgvBrhWbXY_;9dPRU>DGZU;$q>4>j~@p!ab4?sGp(pI-YMSiF=6 z4IPH5#rFV$B=KxPvpPt~!5#cfCdVdNJZ2CSAcl=I>_(uOp@E%k=U7Z0PIo6p(%8i} zAx`V*5DZus8mnHkQ4v){J8uZaBIX8Tlrx0CL?`|R6PS){Z!aJSLZ~@t&iLdLz+!8S z-4G&iZDJZ?Xjcz3YJn(@jV!U<-;)0Nul|=b)OUsjkG3w4LAa*U2AchqS;QiU*cK6` zB!(?8_FLGCsu?tzit~5@yH%aDwS8a2CRRm3BXeIN!o@o93qXd(7b0(XS~ujIuw_4b zpf`;!ji(wkxV`MuT-pAD`Y`t*YS$C^%ya=9!{5!Z&>Fd<7dzr!gv};)tPDRnGdCQ> z+TFi41ctKLaR~cT-YEc-ZDYq;ytci3#l7E#@KlRr z{Cxdzd>Q$fKQ3Bj9(r%tUM{efQ!eMCI8{9-^)oWMhh^_zx)7BHe&xE&Tkta$N%GrP z3W|jyOpr)Z>Njzz!}e{!kj@}rcQIbVUUL?MA2@HaA|Q&|%WpZ3R}pGL)6MVFm%=n! 
zN7O_5y>I5X{f!SWw%x8fl2;w9m(a4?`yPB}7GaoZb#mZA+%+{ameL2(Mjjh(AB0E1n=)v#_W%%@r^yb2j)Z`F5H4^n!_P6{ z**44v$_AOQ27D(Pa14#MAhgI7Zf#(KgAdcKOTS60i(jPv&eM#Oga!IacH5%DWJ?p^ z3L|3uhBtP{mesmt~u$T%`=hM+Jz1 z5Ej`1wT;0Gkk<6#wMq`}Wn2}gqYc0?3vw`e;K~|100LF>AJ-KX4KiCQ`t;*mhas%n z!mNq;=&9jG7W2#yIh1Aw&^a{9POTBR{_dB*Oiw)dIEQdfaO(KQwA)HL+OSbW$n9Fp zGx^-$AW($}B8o&v&sx@^MubBUCcXLLJ85`&gq`Rne7HC;6sFX0uKIJyymnz?+8R5l z`r0tg;E>L`!U7{j=pCAH%BccZfGu#7aiXJq3gz%;HPvy>SKh@}Rk|ppq;*^vN6Rt^ zyo^exGG4;iavXo}Te=B$jO7V(SBy#K*P!vxpiXw)S~-#0XjMDXA9wWl7wurKtCX|N zF@{v<+ANK7S3zqU!>B5pXzChx5wZmNZ6RwgucBS%ao*`4VY{JbnI2g@byqd`$>r;*+6&y<(S> zdYqF1Gx})0SYK$PbHL}-pS?{g8&|G8)DEbOkH4zlmQ-K$ozwln)MBK?^2vjl8O&0m zTO=7=`Mu9+N_yBllKcPKt}kEYvwYv>|B>YWl-BW8`SfUk0s;bRqh z<&bCt2GIqoozR<&6-FS&iOCBF5c04!o&WleQaju2uVUl5wLBFQJKBlxAl$rH0&ZY& zo4R{F{oUUTrzgMm&GgjQo(I8Mgk$s=H_}3K;B6-S!rrw5X4j}Qo$!Y3I@YlW!VX|&G|EZSH>r|X@I-m( zKuB7IL0;P&ftg&3U5YNu8&bn2!ol@)^X_F9Q6kk~Q@+jVxE(}LGJ4GX(yH%n&t*lKFEdv1pW^ZZ51)3QVix{{9(PmQs<_E!oU5D^DfC#EVog@0m9NHZj z_cjETTDIk#JN1m6>6$R>0>xq>Rl}?zRXlBi=r6LhZ7A1{_I9?DJMlT`$JBtZduXkP z+48@7_XbSYMtc10IYQwMpl!y!_4CW2rBQR#gRQYK-epF#WtC9J1yS*g0)13G30x*$ z`XZh3RADu0;*txrd1dSiV`P38icQT)7;A~SeiSlH5v_)&mJ=5p@!t31jdhd-*Y`08 zm{XAa5ERwWxs|U5+eLU2d=LvmgXi2ekHF!wJQs{$j;KS%uPhus^U7(Vqr6v06R`Ga z2LBo1YktiHQU3EPzc-CoU1)Z}cQt@U6Y-t-?2~D-_AN7orOwhuf4f-5?kjM#?+Ve& zz#H-s3!)m=0B}H$zaD&dhS_ij4i?5p*Rqut*kVw;iC^hFEv0!<@J_TK@A@d#1vCB2W=pd&=&R9 z(ytB8Vrb97gGXUt`a|Gw*8p%+ChbxY<%fTkD}HERlFw;i>`ln!ILNd_|7c72{xEJf z%K5QwS>%@)k6jM%Tob%@`k)Ecf+k=vr`yy;AyJO~32iKAD`@-eWxuS;dSvj7M&!Lw zW5f~RpkVA3_)1%CySS0@BbzpPUvNjf)V$DgoS(Kc<6Yh{P@Ak)*F28n77V#PMfjbfU)XfpnHG_~{N$9C*BchLEMmf?>890RynL&40*H-8kWw`sQ zt$#D{twAv}O%%TQ)!Fp_yzvs|oCpoG?ARiJG_12(edV7 zq&AcpEBL%kq5U=QVD$QIIj6fJZ5#XhGZbY(8JHx&3b3WIz z<1UUx7=7m%mpJ53hJetAM6*PrzBxw>dSDj_#|C~G|W0B=sU_fSi?C< zw=vBf#V3hS@%B}p2Tcg|C+lTRkw0?Rs*PYl+w2MjzR|b9H^x<1>Zd1dZ32(`F=teW zkZ!u>h^NBIHE^B&c;%ftyGOqQ_XtFRRY4Z=#oN421ztPOhJ0?;v$C}^*|N^$XVmx z;wM9F&5_5dy)376$shaW+9E&uU-?I_qul?#?2M5A4o$?W}QXr0g4*SmBraG 
z!|CcDiiO4#v-bv;{24|Was_?`UVi6iq>DR#fg?V3%GR)mXs)rabTnPU1fda4*ZAZF z^_``cy(eA2Ih<-Zsk50a z>2_?fH}P?qo}Pjzx2A7B|5F_6+c-?{3qtr}S_QFR$J=w8BaJmX+1Mo{9U9lrgvoqC zP&MIDVpoZ8MHY%_V%I0F(Qma-330JC?90g4Gz-ouCU##P4Fh;IZi)6xqB_ma!{YPsgr1k4nW8oF@cfU>cDKt-4N%k<_p<= zRA$!gaBYbh3m6Y~P1N8y<4Y(-?;{kf5opN2MO!G2Ly?a2(HbFS=kScSjMZ=1|$UBcN_ zx3uG3?708cpZy~J-~aJHM%_WoNL$(*j7X(!^H`>f(-uzc3}uupY+Nv9*XcmrDMh3nVT0?v~26<4)jG>iYLO+c~Vm^ zqYWIPiFKq!Yz3eSHbgzcydq>d=5u}13V{t;F_$0>inkE*pp-P#+M>;t85d1(7(&>j zy~4Jj{{R3$07*naRIzg`zWQP4xe1119XPfbil}nAV-aQXC8MmVRXbsszxDj{!Po{4 zl2>h~yGAQ$L1nu1Wzjy|gSc#i>-Oq4^ADK0J_u8n&%_m( z_rh!`e8>!m(}gJ#A6l5}vqZ2`ps`NNasF73A)O0sf`NjeZH%4h3U`VJ2JXM<+OC-O^5dcs4XaV;rTg>oAilH>B|`jP(}GaT8JSKEHk`z5D6= zERXCevF>?-c_K(gZ4C#+L0>)mv$Gq`JnhvHR@e(0@w2J7J$92-aOo#wzZ75w2N1du zw3kq*EH17Q?wIIpj8#ksut$YMEg?<~pX@x(pclrKF^GYPs(2)XthK!LWz|*nqQGze zZ7#3;4q5@O$t(1+Yq-dc`6>dB$`sMkU6CdjYiPeG{f{NZi01S?n!?<%fvby*SMTp722%=$d3-- z;+G#@QH}}_1YBs8J~rLJuaWv%iJ}#g5MPIS=4TV<&!|+@%ul6VCw>hIu#=oy)g#*M zup{BFJac^){P3THph6;qJJy$acF;ZhYJ+uoQ~W_WqFJfCz}eIqVHlkDx5BDz6yKds z|1WEA`kd!=CHP(J`$m8uK@bGEZxSU^v?NNFWyzA)8K%yJq>q`M zOfIgMux8%LbVokisj=H?Td6F8Dts(}u8H?C8FEL~ktzVkReT$H@FIhb1U$ELo=wgF zIN2FIQ@bHC(Kp`?jzxE7%=gx=_`r$6e#m{{5o$n;DGlFuG^Yo@c^ZN<%|;)6m_!L|+0Ja6|+(!a8z~@DA06=?(=7%hTPCM;^A+Dj{&?7KO5s`xh(uR0- z{f0YCyJ4Gz)977lY$k*w*G;%<1OrGEbG3c5b1Xo`#O|nQWl`HMOx$~Evu;9a4Bfif#TuC=4 z*ku&JsKILh538Xg(i5XOp|}ONAjCwuQ5czAuTO`F3N|<>Ub%@BAz|se>-X-Z8!UF4 zblnCma$_wS58=~Gy~wgi)D|^PT3c+G z1lmbjMj@`fXRA9e&eUN`H1o0qEV^P{eANFBQs|J3q_xbT>m(tKU{Vx1~MLmiu851MA$moYV54dPM zWFD7TccB}ned#XKZX&pVz-*gj`lKa(pZ`GvV}L{BT+AIWmT{`%nUP(vaFVfOIMk(7a0$VAFS|4k z?C%Ya0A~K6HPTumR4u^=?CNNN2KS}I(BVS|_NRlS-KRzx8e_O>%W)oywFdMtfC|f_ zJ*UFQqrek!#!bb9sn0kuUNRnf@B4yrRB$ngM}!`Wq1>>F+wmvcie^(k;>Vx@Rr` z^C@6Av#8>Pwn5jYX?Hbi(^B(V8lTm56u}po>C8U|U0g<~qGnv{cZ^H=xB?u|vVls7 zYj_(-i2YWV5EW?7e^F~Q<3b)o zy5oG~194pXTO5o;Xf$J*{?kLjFtMgFs+jmD9S-ZAUxO22t+f@nN4r@2;plW4xdLnm 
zZ@#CyK6MlNe2V$z(>qh?Hk-j2h0Nk+u(YV?P+2rJ{SXDnJZmT#(!OTGpJS=%nc?@4`FDcAD04)nTW9aBszxGh!Kdh!=pha% zxGZ9wF@&&uk?n9y*q25)6i(45uc@Svla@&j>`UpgG%4s3EzF~j#E%{(_7{ieQW`Vg zaYR8;Y&foE&ZtH@Vkzg2z#6B5KEV46}MTiHf| zkNmNGE6(A-jm2Ap0E>L1622*wDxA5CdT{Mm-uqNSuY|6Dp1gJta8fW%7)&KXO4=+z z%B)*=9&nh)A94iaP<-_N@jxOtliM1mN*hq*9upe{X_a{6$3IM;U%ioD$GT9@J_Hdx zfrJu3Gq08LfGGTz|N7siqsNb@AN~#L38y#5MEc$Devg>!y%2~TI&Blpq>M^W?*aCc z_|skKfv z(l+uvwffS-R7D$2?_sN*Oap@(5gE^Q?`+NFQ2d+5^HVOt))0*oSan;G% z5qGs#6W0QPs%Ni`={a358q?VLAUjoGi|38F9d$CH>)&3@ehuAhcysgSAULM04#ETz zo$C;Gv5}i`wi|dRfi(>8EFn}|nmY)qI~SAO{6a-~;_w;5(|!}Hbu}6~8TYYZ>@?x7 zqNx!tWY(i?z)}H0A6IciSBMaRX}c3)EMD8pyM|V|h#V`zJgz4X*|TGf=sNY4GRH*s zz`d!F&9@qvT$aGCC7At6%BiSrO9ziXlg^yIkS<;Pggq-@?s4(B%bp;2_$|U*I*x?3 zeKZA5+{2w<_a1iig|_I!yb<+d(qdf2Zmy)2?r3pPd!}%qrcdpWu4B`yhvI`C7YY!DNS;I(wV$MU0x=ja;Y0gX04RirTU7`J=Hb0{a{Q?XTiYa1z+`5q zmtoFT$k~=Tg#5LIiS*{*RkB&+;ndaDfvd{HG|iao?>m5{6h#qjl-ZAlQl=y~G_&+k z4H{MZafLEHWEj*wI-VlE%wF_j#*LzH?04Orq`_MM{LQ0XH=BqcMl*#+HyhanqidVV zDVIVY^S$s0#Q}4wl+88^SFN4xow&7(jp9r0V&d)j*t{YtA2hHyd@ud*Y(0d(F!Vk@ zbG&V*_nkQFGslNa;xw9B=VZs1ilKUx5^967NdODNKx3&;YutN(u zc}_Boz3NZA-jV9dZ*XUdwDi3K$Qqj9AwqxOx;YYKWEblTcX#PRJj0kKlsIjvqYsu? 
zpk7&vg#txP4c2q(U~qtWct72`v>Hty3}R&AQ0tB&g#L2TvdnKak&V__o8&xH&>Hs+ zQ9-$i09D|3#+U8J_-!zoq*VK_lEU!7u32>LVFhl&H8iN5KrJ=31VfnFNFQ7oL21T3 zOc0Oj516x75irFlaMi|(2Oi?VB8o2EoED%{V0$Q&nD_uIxJ*B+(C*b$A_g*#G_cOE z(kNWWyzAhs5eH|X1Fl_Mz%|$aB9&oDv;MSQ*D(&QkJ1>&gqAk@D_dI_EAR)=+4N-& zP{f&n?c2xwghfe&k9;DLh5R9hz%|nX*!i1G8ouoNc*~#MB@!sX#6N%|rS+7NFy~X` zACDjNM<95ZE~4|0vzjKgFbWTrByn2;QCz%KjSsbifDhtblwAUjh#Zfp4{+IWn1A^Z z^?&49@xDmw`Eg0Mo||Q4Yi_ii#3AWvs?3j~U6F|Ua(G+T*1b3lGCLVd1ceEZjc+A} zc{3Nk$cJ^(IurRmu7X8EQ#|HN{_Zb-+tgX5`Ez%%a_uh z-n~NYcmxa`C{P?fZT`UrR`JewXedjyhOULkq zM+b*?8HS7Rn#of$>8s!Rc6uAvror3S8PkP^kBOJTt0rO!2I0dGewPLg9Zm=O`(ed;C>K3!$i(CAkA`n9NTd!~Lsm4^f!d zqz(AqymKQ}u)uo(ra}wR!V&}+X10@wXcJ=8PP8FfhmUj>Oc8EDXiIg)X-0rBuKX6hedF8S{KMX&iTTwJl6$5W>)O!_-%Irf1H4 zi~edxkb;md5QdVFnvVBip2AX@@l4kSaYUR@qglZO=hzjCm|p@H6|!A#P(vH5&^T>U z-YUMpbwrEOcXbJ-coxBAfe@FQ>;+H*&D>yu1woiRp$Mp)cd^mlr%X~KSZf}Fdowdr z?1eGL1WjmLv^CNZ`^-Mkx0u-nOAOjXh^`F4vH}TksbK!tH!?}#DnXetvL5AQOW%6ai08K3YqdcuT5Ooe)Fx?Rur7^E z{Nlg)f)p8_fSj<4G0cS;N1wP;vzbyw;1QI<52AeA>=EC@RsIwBS$bZii8JHRFM@6% z#NEdf|L1@H=Y%6g(?&m9ft=%ZKKUyDwVV)MfR8D&b~viXt+n4iq7DKh4j60B8E$GK zS7SY#Pm8r>i_m=C;yPF_G|T3Qt;D}a+K6xRzB1O#ahK;1vn}B7*}id2&O`9wcUY;I zK2k^Hbh`O~NMfY5&CzbMkzOg>IRPV9#J%w4AxaAc3KEG=qze}@ZoO8+9WatHq3+Vb zIvE+@p~XQBsAIslTBfAz(oI9jx|yCXpTU$`*rpq|5q<3HZImdt7)LogETkg6{*Bi| z%RW9nmM*cc#jWAH>}b3%)7cOJaAkp!iMbFqT=+C zGwJxiDJ+A#(gz>EpKgs_CY&;gCBAZpc$sZoZG?xLf>tv>n&0>sYi6()@W|Ne*3(>}&6&b_xr@sY0K$NQm=`D6*Q^^puVU zIdIXPWxtt+%u|fnB?R*kTehXOX#popa*pO#4 zLP8Y^K!e0^!5W8@#>oo^r)o^>ootq|LAwoks~Z(M6O;_7G@%0(RZ36vFZ1Q5bD(pT zbjUu04oX)XHwy5=#7$qNXSr8N=3$>lNAk3|=3I=i_+Ev_fB3=ke-a;*J|gG%Ak5ob zn{Y?Qr1P0aarWA>)|-i0-MSA@s9B^TRM-Yb-FfO)675*)!1XGNUKU3#GvOd9`W~ht z%j>)>$}XLG5)HDAuZX~9n}oMXfe6yXIPl*&7+P%KxuQxTaP;%tnaH;}cYN=gdWckja*^FvmfJwrX$-t1Tn%(yh($UGvaC+iiXcMd) zFwhcm^5ltlKR7tZdVnjDZtoso5|cdbV=}3s?&9da-o8ldT8aqYM|*9*KedIfhnQqx zof}fAowm$auq{sUy)H(Ek<;XCodaihBb>_nsk{%0=bpEo(`Jb~&4sD1K`>}%(eqgP za@MR`5)-2#Y5hhjK 
z%w)<8{U=UZKtHf=CmPc{lPsWTr*FnNgwF|gVNT7vW^gB_sx*9WJhahmgxLEpzWZ-d zZ`WZ6>ch0G%aj6yrc;=GC(k)1Qu|usRg7uNwp`n$+m5*5JNwKrw}}6_kz85{kN4bH zUqLa@kS2*NwT7R%0+N%QxLgf^ttH%UO*`&--TTuJX5Q=UdQF&&G>!)M{YxLR=ynVf zFj^bvg=TFh`6|{*7)DdvGi;|3-&$FazRY-1APC&Fuc?ba-D>o=*RM4;%!1)@G8YeI z+@)? z+%F@nC|3o}TOWTGZhpG9v_RK&!_cgMdvFApJGnbyI?gKyG2B4`tEp#q2T;Sh0Nw?q zrEUsm9?}N$m8mMqr~t=Q4lMC(2{;Y0Z_2wLLr45 zpw*kfSw-?dN(@}5fyh+X*b zKXvXaFyln;Vm(zOOr$vM*tG9ebchc?FmON@DK|}Z15t%fcdcGvzS6x}dVBKpF~Yav z;)HN915I&mH^7ABL~Z#33Y!p08SnLkbnR$|M)2L?-jnIXq0_0gp^Y}pK#LGoP^^rP z67+(#J_`sRbL;^z$EIkIbjo89IryQ=^cqY%-J`s1Gt-ftgK1#Te)>Z#eV*%raM2eu zBET!&>OyXmMG{886|lAs{bx_al{o&$VV}s@TCV%=eDukcbp7UC`o#LUrq%dqd`E&+ zT)1XN7t+Xe81oxMz?)&AICCHVVh9!sBw@^wW)U~1;DLxX7_LS#*Yv*?$-J4@mMi^k zK!HXCLHI*yhLOgsOSFLx3`6aF<$NE?O4?fBkgYKfX{_{k4U5$Mo6G6uM~i80oV;K| z*AOu2T0o;4D{BZ5SPVd6&>wjwU16=O3!@zyb4UwQ%&YeaAw5AKPZL3KYVT~iMQ*358#@wd6)D-jFIGzvg$K<+#D%M^=WMWa`LxVK*{jg(NBVR7mXFQoafd+jD9&ailUXG9 ztDGW|eaW9Zs5R?a(P&XwO7kiev-Pw(?Wx$o@MsFTmQ_F2G+KQbuh?m3RK3V3Nc>luGaH~D+ppWo*Xzx;2Ju7F#4N=uFW zISc1d<2KOJV2kqtC$$MI!vC#u0OfAlwL zePSs5+?{0AT-(+8unY*9W>N^shrj(5u12@gH@@{<_J-I4v1KPm2CCrEAJZzjuHc0i zzsAJclm7naKTE&*)z4V`J(^zp>Z=jOM21D+w4v2h>-gl;Pa%9Xv%~efgiB;?2k#w> zPPF>?D%jm08BX&M{-ZEeySw+G@j!rrP-^O2m{R+{z*>~uED~cK1$jO?>6IF@bqd_% zu7J6Ul02x@njoan=Dwb8go0}jO?SYY3zKRaR*yUP5g?c#ok+}KnmrsE>UTl(5bTzj zgdvJ^`1CFy`1rnUUuQagwBIJD8`rPIB-+ES(48GFV^^#rJ?M%oomSU&G~5exaCC9cJ3Wn9=_3RF&BzW3XpUz`g8sh~NY zi)xs^c%@sIeW4XK)=AmcvO`$vrl=<0apS!VYQ#IIUqT2ay#lC0r|x+J2YN$zaxuK{ zwLV7kk|`JdYPZEdVPGBW$h!^=?JE84!cNcvA7jJqqIIMnTs*IDu*^fayKt?hGrOch zpK+bink4N|NTHhXjqP(Sm>T-%Yx1o{V5mdzZ6dN-B|?RJCu}m=N7OF|vvnem5Hi69 z@yE@fYGFw1%Sv#69!Ai-)iCQ)=o$$ZEZfcxKKhtv?L_fuqa4bM>GJR1 zN$OZ%$*(T^BEZz~-uxX=vmw$_3iOnucbCZCzbd za4ivh;Nciu#Cr#d9;=M zu@|^-dBj`>OdUV#dW%H43Ic4hhx?_wIoo%EdsxuL)5%*CY5X?(49&C0(~D2xdOJbA zQ502Ub0M#)3SFIWm&VO?9m|dcPemUM@iOM38lqS=6CA-!OI=g9DXt)(tx>*x9sg*$ zf`>l^t(dFnO?e2{7TKpYDB;{_&UGjEZmp#Ux1rxK@wHewR%4;HV*{Ef9#?{2;6SDw 
zlmLy8_BY~M7lJvs!2Xxm5vrL_s4K2X(xy#eBclT`n!;K$_y9M$bel5BgUVzDy!qlP zhfvkCA3SRw1NW$VB?=@2MB!_nW;_(~k{9RHO#kx}`@{CiCrKksL;hU*^9bdWxd4+A z1&HURKP5!TY2%qsO5d7$+*pX$aR!Kj(m_&w#CPQnxQ@z278XDSgIYPTiiCc}&-_bA zlwkub$H#G9yv^rm94`&Q*Nj$h43m)Iw|s7=s}*!|tzu!Xhq=X>Q!up_$%8BmQ4-%2 z0L!iO*;XavTmR&53OH~N;C)uAs8?l2yrJ;zb9M!?1V52Ukr-3954M0(YImejXz*uU z@&xFa&*I3!Lmn`ZMe$-QugF9Ic(MI6-4~k|DNLEu<}3ehWyfdcQXKhu`$BQ2|4^xA$=R_P_bB(mQYb=XCe#=Xs*# z`wT}jrDt9H!haVo1EDlNGL-()|M?%&^I!W~dg|%3L^E;+VHf^Kt(ep?KQr)7oH)ti z%kK2{+rLhqv4C<0ss5E$Uq|!K269Y{n&`EhNWc!EwQGWbd;i^ci3vSMe`Qmk42rH# z_Ja(}CzmdwA?!^({fB865vf>orDb9bA!2rX6w3fECiqH+e?34Ia?uqK8DX5qi|z0z z;Us-;y8`my0!R}bVibtypFN*`cky>=GP7VEG5z|dS{;l;3G4#fJV{HdxnOFVA7m;u{tom^yJ#x)0^rKva2bJ0B_$O zN!M=8;iC(KTHB1};z)Y>iRTE>OL^=~QL#yAN|>S*%3fv+x_R6zT9|um=+}&9y+Gb@ z-2gU*vRs8&hTDmK$WJqK#FU3*`MkBQ6F=cSm~x4-1T%e*zI*BGud}20ajYo#qq4Ye z8UI-Os1iC*$-+Sy9NRQQux@uCZo@>bg+Q&1KIy^mVUpIWpw(;>m@Ljrq~V9-sgH%` zYP77YW86vHwF#~_k->KoRx2svUxjc+_eB~vjFb73X%Wwj&J#pD`Wku#Gng408c1Kt zkjP-F^|IG3GpsO_>-vffEOyPfo;gW&qk-PtyFVSrC9fJ6ib+E5s5~8xfV?s7s5Fr3JC*oH)@r0&rB(K0tX4RGI*>>3mndvqB4rGSZN~&>hz*oAN zh+)lbySS%sz@-%;Te@e1crTNo5<)??iXa5G%LTG42t?$<*flr@t}tja=WY=R37oDX zm_rqA88RZ{SOQ1K+1~{VVk?Re8EzNmT8FUu0D=yDxWx@IMmR*DP~Vu31Q2744+3u9 zZZVhC-~56?@!q=leHDzJ3ZMwJd*m?nF;5wLKqg*p^(S9)8|Bc}=w&Ox&yz5g?&{ts z{A_#RBpFq++qQe>1Y<>7`@nTe*2lBT5x)`dt#90u&OWgsGDz0dcLxs+q*q^iH55fl zsONQ8brHPv4C|G3BYjkxKN(UmP_|W#uWdj$tR|GU)6bzZ=hEj`ejW}!Oifrd#Q}eg z6@A~`Q#9|JXPU3*!t5m^MO? 
zw=<`#uOURR{^1s3ubFbdJ9b}R8Dk^SnKV2+LByYV=y+S|?XSYkwg(}JP^0%y8WBW7 z{Aq4MsHI-B`1c2mM!Rj0kuK`gjpeL4;{zhk&`iy5D88W8(Ll5(lGKyR3 z9c&{dL9=`sZhVxzf>K!5CI#?K#+e(8R)FubqZR4O2h1xKZK>zb?zHc~Nb1~un>^+j zGiuvaWGKKo&h0;WCUC(1poG8y5ip+9PUaTVY_~Hr+2W^xf2>s~TtMf*BT_m@{bUu3 z!N?$M?5<*YSim~n43EJJ-PJ0nyZo{I!5CfubqEHiK)_Rl3+5B(f-c20jR0B9M>Ku~ zywO5ryO$WtS%82ROB8~(3T>1n1Q3ZD+YZ_ZIAH9C`{;AC}b1>X$-ums)Fx}zmu z91KgElR23gY$E6g&VpWNb8wjbu){%+IwN%(6TVF)R4x`f4nmfk;3)|k!94a;DRtUcm*V;B-R4Z zJs^nb_<;X*Wi1&7hhxet9mHDP_`ajQL7Fz~b-qZT1uYSu-Xe&(>h`^^J zk$}m$k|Y1K|DRcV!iyO1rZ>+ZpXCJ4ah0wP_g)NWO+Uix6`#)BuRLO6`1Y!$>7jt_4xV3!ely zAdLDM`d*W%TF5#4)Qt=jJq!VeE538?k>KjC(s%e2+8OqWxIdg~&=_UBa^iweZ|X0O z=9&pczh();2AblVw=aiXcBL&wwMqo>&^`)V2< zzm|@jd@{Z8-0Nvi*P+k=t+Dt_v$jNaYMPCju6~+s3=KnTSQi3Dx%kt_I`zQHJ`!DH zgCW{o#bWbDH2xjCy3j~2r!jU%H^NN+{sUx(*_wWB!6e;)7C5wsj9+PvjImZUHR5hC zrO;#Lr4_LVQ+yyz^sRlzp{A791#=m$9;6NJj+$^7TlcT%Kug?)E1!`{q!$h3ThDI> z4Y_U(hJIxRNKIHq_9Fzhv1VnE@WX3_>{TPZfe@#4wu`7*bxf=^)WaQ&W7l}v?tgJd z;aD7Xt%Tz-7<^txuVbR4uBH(;{Lw{WNs$IMknY*1P7~^veKi;dAF%85x0oilB3M4aNBt7dz7$~U^~DEw*dF z<6dT%m@^Sf!58y$KIfVOu3WnX{mdJ@X-O;kOPsWSK$NJTXW<@w4}<8q%hn0!e)^gW z!#&n?yml>DFz(R53g(X&pFf|vcJE>wif`1OeiZi3H+hD$^45v7Pk860V|gcVpSYqT3RqdZR~;4z;5iTtlMxu94)Fv^e^{I(yFS(bq|)A6Z?;(UR<6Praf%6 zY*@6OG2ytBj>In8GWIc-(-hz@jD%I@6-=40eeSb(?&L#06ZvLzk1_#8p1Wv zzRcf}#=PP?W?&}!D)@tyRW{|M|7vmJYTbK|d0~K{9=HZ=GFRh?P|d>Y2w59-QkZ}z z2u2uOfHqYluxoKtV6p6VnDymZta-S2ts3YLK4Vd(GlulY2ScpeHAcA#+q`9Uf z!ZCvbdLCT+U_RaX2>7$+u*q!&D0_uG6oj*)fqL+7k@-YdYv(<2-!ZRZ%Dz}ffQOjlvj5(vCA@1l+wej=`UZ9f;RHYz67Ttqy{?ap7vbtKER zC|XhEE#mmUfOl>yPeCFlWcao~{I8{yB{)HJ(!P; zc~;li)CR(-BZpr|u(Y1Im|Cs7n^}2i@?2u=!o(!P8A8n(^_ylA-^IQZz5Dj0sqsZz zmhP~=XcucLt`QaL9^Hl;OLJ9ft8Y(VyYM3683%CL8Ve4jWx?qMwD@q#ap73@2+85dK;4sJn zg;8S)hAAu^grZc_-hvj<2}fAfu@8sVp?eR9SvbECT2z^a78vW(yZ5F~2JchOX6iyf z>__N1gC_o|-Rz!<=CQe}HNB7D@&EDWZ_^Dz(x?#TX`_N3 zpo-_Dz@3e6=%+fE7!+F<%b;h zw-bNCOk`S@shDHT_bx8iVxWds#$WAV&?n$1syk6nfqT<;AEJrh7x~6yE@NTeQy$sG 
zVLy8m<#=uX*(cV)b2ILi`}_LRz`#JdIXGNmByfjf;%9_Ln`P`2BEt&dw=uCOsH62I zvWx<(&eAm~Zkh?_s9RDF7YwuDIA>hSNW^4KxeBKIr=2L9=%-FLJ9M3no3B;ztAj@B z=C`Y3U)s0-Fy*1qtnI>WaS9Za-r7#*Kw*Ipfw14gVpK+YM2)F{Zt@z}Js2ICXgJnV z7YG853?Rp*YkGu<_zOS~C|tmqg*|6XpW83ujRJ-9ux_B{=QI1pe=As+?SC*Uga_-m z%9t#533&~Ok>bR2Yq#H*vFQ<$QYHEU33#ogoq{AT9IKRw6PtUpiQWYIpTEZ?)3L> z|0XR?+-Ik1m{b&O^Ne!~U(><~k(7oRkuvJKX&C7x+?%G-jjNk!|3D9AV=ZMo8qi|o zp5uIw;1aE2ac)A0TxrUdSI6lrp+SX}`RNklei^O>PLv+{SP>-dgzB#^c_{v2z?IFx_$J`xn~9KHw>ZZ79KJB6r<3U=gPtilq|k*3-V zYduF)H1_<#T@{^nDCxAAW@`h50d%j!LWMA}fgrawk7aFvXKMHDbn8z)7e@v3Dikf! zd-~nY9YgVrYv6;?*))881{`9Y7GZgHk1_~&)Sr4wtDGwghO$OK&!P;NU`||Gtf!w? zQ%d@9jV5Q6je&+#L8}RbAp zMrngShc;qq$HKKhK`zX$VEii(2mJ(7oRa_#_mofit?Z-7+()MUf)HUP?7X5v<=44_ z`F(Mn&sIK~ld+|^-kInzcOWMNwvN( zRx0uX0hI*Z5m%R{I(*24NR;RIUdMd~ez*%ofu-ie`>ht1-aP&!yuR!x-bIn6WDc4b zAQE3E?0A`Tuzp27O}+ybIZ5en9(dN<$zJ9p763t5txf^a;`<_BCumz3_dD|Y@|&U! zrid?d7GKDt=*Qel2r}vDZPJyPkbslVWia@u!7sr?t(&_rFTyyjEIYU*E}ebpTSPqSPQUoc|D9H0rd$wN31L>tZ>2<+hz0Lx0LJ2vdsMvn;~%BL+k@$) zmtRi1SqouEI7x*}u1)gjVyEkGe&>5cT>2pW_P6igPBNLEV*|Q+HFz*)npWNZ%jg^D z*)dyd#pfS?2=km70EvyQwS8_Tv_g2jk$d;TkG1!}ezXHboM7{`=y%p{%rIFjHQ@@fru0wb#D_I0gbQhTbwSQ_JC&zJZVn^|jp^4a(W=iV< z{nOY`f%daK9qHel#+al|9y$ymX0l|nuSM2b%%ZKGCIq4`6*6XN9w}vLIyPMF5l^toGg@PjwZX-Oh z_GE({x4{F6Hhf7iTbl6IT<=4>W#yxvQB4*}_1fpuZ%vYX}Y?OxztF3jByo4*5ogR>nCZ zD=~JYQ8FVkz#ZWI!08jbvyI|Ev;(a*YF?UdI$%ON>4kLBdw=4Du2C*f-rPhGqP~q- zL0xEXh)ZEzi>0Xntz|v^U*E7NojP?6xZ#$_LQnS;*@PypF}4>LCs>O#2>#whb9xUv z!|fNw(b)8Y*uWQth0Hh6yOe2>)JNC?3+q$BQkckiT2K2xO}2ez8tI2?p0c%#ho$yp zEhggz7gg&XmM>tk=H_=K^XH*OiSr1d17mbVhM*Sm$%=Aj~ zsYcBGhN9ljx|qo+wEHmgcDW3=0^Z4!CmD;nccO&`{VJ=%)#G-&jcZRXJ z%=cBeW_32Vre_8YrQKMDcGGu8U^|C!xsW4&X2zQ1MZ9ODNH&q?!=gfc*L139M=*P-tt~6BJT`E%V4TuKB%jA zy__5L>GFgno(M_zZy>3izCf?MzXYtX)XKobF^z6K=$pT|OY+UGcly`#Vv z1}n*6Oq$%=%=iNq5A7_3y>{6(hn!*Z;x zhdGqjE8M0SU=C;rlOZQns?eX2zh-qemC}`>4x;V1MH?f(s1kpyIsh#iU=yq(slQO4 zKy2Qtg*2}szv5g#*8831nQJcJccI0Az^l0mzkA{r>(1mmQ$BhWdF8hHfW;^!3-j86o@{AjGypccUOw4rnyo&pr2i 
z+TYug-u>;{X$C>!!b@Le?|^;egZ6;FufjAgNjdcd3$D?eefYcgU}|xjpd6$3C@3p< zG=h^;Fw?UzA~$a=rM(M1n2dYS3gJqoD+8KwF zQ9Q!}Lj7GJYaZt3WF_Mz!`p^ik!ygw_I;+o;+=vbE$3&M#rSl>G`$kBQKDwqG_aX8 zC+WL0=blW*`uC&tT*GAoW>-P0cNYt7>(g_G_Opw3Q##z&m0I?65}x!%Fd!RDx|?^2 zmeo3+jvP6V{+mB~IsI>c`nQZ#0h^F0cY2&CD!0vWs&4+g)XHjjShXDmIm*#@G)zU?ezj) zYMK>bg@=dz5%U4}@rZlLwDHk}l_4p$rkT!vG867Z?HU|+Vm)%Om$5`>O$1r%M9l)e zTEa4%dE&SH#7EDz(Y%X!NVg%GDJ?s&gLeP`KmbWZK~!qzp1lAwQlCZ$YwLS8dE$?P zq3%nT5Pisb9d4pt!IZi8{MjdP3B;NtQw|dtT6?KSX^hLz2eS@d*ic&^RxVMK-zHi_ z+ZV2vtn=|4LYhn=V_~y$3}0(TCO#Q8wOBQ)U=*YWdSGLXt*xCQ${l^*Oxe3wH?XdY zrhP^l7*I8!v-nQi4^brT&Ep}%B$y%V#!H?k&SZEBE>)E3h~B~gRgyHgKi9gBcQ7RDDwXq%= zjwPA367#jtKNNDC(7+jjToAZ5)_!&3X1c2IMg zU(eoCT~(>66SuK0cD-&9Pxyx+PG)}{O9Ubbyu7@H!Xjn!v*E6Ha}ARly+vqw@UaRw zNhenpD$~6y3u*W&JFbsmVP+g*7(}qEVDq;O17Q@FWg4U6%t%b)PcZL*z%i`FQCjlU z6DOke7cYID9!wIz0lHNIp83xCr8qLnR6wVAx_u7X%f_Rf0u`pEQ_6&QMveT=X2^3nK4NC zCrOa`yLT7HZ{Pr7$A}cb015$w$|!h7Jt7l(O3fuq0D?l33^w=EED~#HA*qZ3BbR5O zE`MW@5&gXvGyeDg)qj_M@zcLf*FXQ5B&30X$obl0t{AB9;GDY*SVKUE5juW~Rbpi5 zsq~c#FF=f$)MA{Ssnv?~ z9Xi2e*P1>i-T25bj07$(a1V!1Kaq|dK9YX&!@rJwF4PL%8yrR`VMCz<{Si4upJylH zH5g#OZEj=t%JxqDrS+}MjI@lVnrO|SMgP)U`~!}{#tZS?HlVp4*w>%F{k4CVe)!YB zB6`qhxMR)ZA3Z!gMA_46fIT22a9UaVd-kWVzx-O-jgRT|8=vE@bSnr(3rw$J3txQh zMarm2fB!z4?mf7VR=)?U&MelC`qbNfnDD7EY`6-ot+*IctDlvD@nlj^JGQ<_NLNtC zaDYw}5~HliWMTyO!-A%dro`2T2!A@k@kdvOX?_)-Up16hFMm#7wX+NNXu3Nzg0*8I z?K%Z+wV?H3qbC_RbJ2I+jGEZVyb=bi4Gnz<>oE2Zp=Rj*WH7p7lVd@k6|N`3Rtt=a zRUMBZv~t!W(SfDtJKy`~=`a8CFTk5SVKvZ-QfPBrvqI8nlZ>R_%ZN#Mjd$(>WQn)# z7ZUowP0FXGF_9GCc(!V_RcMr9?4(BvSYJBOeqA_mB7N_xXi(9tEg}5rs<*(-v+VCj zIhe8M9)^~Geu9aO$*cxi=pxh_7@Gxh?p{x)5B2jKNWZ^wosaXB;281O zf^h!7{LlYWEVLAlGp6~Df@GR;lhQTJR@}Wi94;vK*FXQ>cj#Z(S==I={M2cUu~_)8 zFI{GXzZNut?BY7olivHY|HwP)7g}~^bIQz<7xj$qnBawjP;yLrLH_oS{TmTZ7{@OB zZbEo5YLVJ8-)os$!JTamAwnECzleBb9v+d87Zgg)_K`pP%f1pfEn7T|x{%(y$0sI; zT6R4R-o1M zf+Tpx90j6gT1B-)%dAwf<++~L)$2{xy1)_83~}6h@{t5A&4MpOXt?RG3g^GFFA*|p zbX!89kiUaETaIfJLWqpyQ7>VpQofsZ)k~71oF&tben&3mAxc;1v1B@G(7o 
zAihSQ`jD*bdl^5iJ2eDA7-j9&6g1EJyFSeN+MoU=iDWd7U?9#a^q6-R=ER8{y=w_Z z&Ty>eYv2cInF<|URBX4x$V2FIHw^m<7QzbBsJLh*VwtP0H?TUO*Pv{eb@Euj@2ZM+ zcd>B)Ld)KCq!K}0?=ZFG)ZcMb$D;c_lrKG4s!zb!wxCG+^(vvMvCtcE#C12jQ1ZAb z=mzvwCD9U^bcGt-P@GG2-)d)_R5dVIM7XovWjVa@Meo=Bg+KcDVn<=I$ASRswO$2Zksp=BSV88-G4 zmz-NvfG*=ssM5nQ;GvWWd;?FZqic|!IM$nbdk&yboF$M0)+_o|`dJR!xCVd!siPLR z#riJb)Xwb zfvxA<+l-vDl*E^h9#Mvc@M0_R&P2W{4$GEcTa&0qT=HRgqOHVcnbXB<{^a_W62yIe zMt6menLB*WsPXV!{+a=WA1FVghs3c3lt@IvQpMxMAI~;u3r_hq`w9A37H<2H@S5Jb zWnXr0bBZEeyp!PbncoK-7_1`L8jPqKT#1cPkGxFB&+j6wFe}|hUS7w2PG?mkUx}$N ze8hES>-AH&RXW zE}l!stXByZ@nQaxbEZB_z76=(|s08KL6rNMEE%blOrQQ3cfMDS_vbY zy!g`Bh{678`rUi)u#gQ6=h0JPMvch~rlS@s*PeaV={(kn&oBNy4c@xJ*O<8X^rXEA z3oQr*b66vw_~Z$J!PVmG^_yt-ds6q_z2R3L{ft_04%54Xy_IjY%2ZWi@vseBxJb_8 znZO7l0KuoR$`VepG!htBv1|0xY5ld$^y6RrO`4n=qrNaMjhT2%&ES)1bPpqvFzH{r z`gvOKKLCS0h!t-+UA&4FWBx&E-A!}{?7jMo4iAsA>*i8=@L&QLt`007?8i{m&cbOn zZCfNHBRj3C^;%&9lL@naPO$EWqPbiblZJYf+}|4+Vv>89u3o#0dkJe%2!+>N(VZHr+SuU~B97oCW0uYKF4iTUxC}2Ey`Jt4 z1i=0@0#~aQK^zP?O6ZyREjC99L1Tp#rK6!A%@NEKlfT+InF_-}zV_N{>6gFwS-NxQ zc3LIuqV1F+k>NLu_11@6K|!DET`*MQob<=`*=XsXWyisNX2>uR0}Cf`?`7Ovx1+zj zCj32E7cMCLP0Xf;Fe31r&>AKgCO(DP74p76g9<@LOZqbKTlqG#M^E`Pv zs7gW$&*Pm>sDsy@75Zd`mstO!-~1Fo2c}u;qu{%YmeMo|%evA=^fk%}!60ZK?E+T% zt3MncO`8XM=v(}h`G!(~gp8WyS_?Z9xx+u!LF@#7Y;!RE`7tn!m%IaigfaA0A<^;f zdL2d_-@yDp7p0|)M2a9U2-+r$(FQIv7FY;_ZnU_mFbJcWuqYOX%(Ms-7Ah;qkp%)> z843ZJz64*z4O94BxW!s4#*h6I^E_`$S)hf-ZSyeux1eP4Fp`*IEoq223%-DCBlqn4n)SvUf&Z&Dqq6 zaz^d)!o2m(bvCVwhD{G@!i|7R!5P-4xZ8C- z8h$wafRT)87s{2BEuCpULO=y`!n({q-zb1qfZN%HFe_5pY0tT}`g8-=nO~0G!U}@m z35ZuQ39&FeQ%Ug0(yN(p!k7*rYd=yygXg$K6C zFwe|D%Yl!Fu#E=AUFnZq<*{}4!k;bANA8pD`{Mc5H$2$>93d2;EOG#b;?RyCxR?bA zwBUg^nI_Ex+f~*?>hjr&0!jCuCGbVJ;I^*r^x)ng0=&X6?+^f{@2_DwVBHD}s_Pou z2$XdSD3@x9s^?x0^~@74N^d4Cavk(<1^!@T6PHLlM09Jcqwm`hMA`|gfwyp)y}g)5 z2XQT9)W$julxhYasX>6LXRa-$@z!6M1kDOK(MEfQ_WImkJvex@4=;ifN76^vZ?aI{ zdEGHwcI>z*dtP=ay=rKMFX~V*g6`FTlbA_VRJil?3V10^kOo$&Tm`?Qr!1@BgUQdL 
zqHd+R%4?5YU&d|)lXG$8uI6trB>8D1-+=?((0S#JDAGHj`C4v`ihS8o5N6Y^;$J4s zt6=BF04IZ$ws34$)b#n*XPHn%5{oG6K_>BKnBp1d5Y3JzCeA538m(`U%y3zYn65Or z+AVfwR6puVCq_}ZGbK{~SF3HCB?%??1puvs9UvjjX`K>sm28`NN4Wq$()z~?Nbfgc zAK~7JMdl#E-ZEF20mQqA{3MYom0~8P4z$-L6xPjyyg?A#W4KSpzXJ;HE4o8uE!q@m zO<&ZJEVzqbB(|QV`y%&vQ1+*kWGi`SZ+J+R;!9n0LMsb)8cm^wKDzYYyO>!Yq!(WQ zR%+kVNB`p=9fM9Jus-~c0s|tn4CE$1BY(71w#No#n=t-0L>jm@$R2I@-&2YJ-zwMpTKYrN59PBzuwSPOIupe zXtCeGy|MJCfBt7^bcommQSD@%#3~vvH>`43WaE?Hyh->_CS$4__!vVN!JR3ZWVX&GtLt z*5D3MSv3>BqV@RwH{(Nm{LqDTu=i=e=uW;(L|EC6(A5zWr+EtwA>Gen(XYRD+=xy3Ohzh>LMDD z>#B|&8c6%vevmGH^ilft+iw#-bP=X}IVPG_gpP>(LEbH`&B1JH)yND7W0dECvNl@$ zh6yu5m&{%bV+A*8fB_wo(Z)8uj)14H?mW1EziM$n^S(llg5x4qiLuEU zV#MPrLID-P&IR=!T`|fHOaT+!D&I82{JLk%v@ffNvGiGE8DfoF7R4G2M+YGQ#YX$`R9g>Yffg1 zzzlk#5^0^bRD;7584D-xz`&h7vy$qav9T1y9Dy5>%)iTUH=@o)>&rdCMII}>4LIgX z3e2buA8hrDW4`E<{8IYs+-F~9dSgF0NBDIxF2K-^--JH7&P!MuDThR~A8ZK*n*O65 z?vCxfM|Hn}5znRQE#yvcuqosjRMoqaGmm<}C2OnHty z=7?xZ#xvJKRYNQEBv_ZuS#e=u&G}FEadyVFZI-I*sJq##%@s`HBI_GqZA7}Uu?HxO zprK(QRoKD$f`#Mm{fBAd7jLKU|Fdrpbq>q7LIGoNoq3~zwIs1RLRvzdTr;L?RyBV0 z)hH%}zq{PSkVLLuCdqvHft4(e+2w$_?K2-x61lXBN;VOufD03)Wpuri!E3t z*vqHUwG*^qoJF;LxBAYgjy-zuaerX_R__{o;nanElC^w7cOf0tPGKeGTy z1aE8qIZn(-x=H@_lw|}RmR?BD!O7rJ!AlR7GD;rMTUztjBX@Z*@N<%k0(@y1abz6H zH>Ik8M2|MB?CY{DthGL7nGgnp&p zPpLwsKF{r+g;DmA^Upp_@BH>R>FKA> zM^q$*8Ht8kA$`G5J@F)(j#jj3qlB#+jJCF-@wGI64(RA>I!Fzyky|!tpTpQAYRYq& zvJ8i85vgJESUuYO&=gXyj3cEyi}vL#7+(lim{-r8K8HqqF}?MhH^RJK2X$#}RU;|m z+K}e(lb&Z|qcH@dgU$Ph@bq;0#}D2G-pgrpdORK2dlX;XqY>Bs($!B8km}Pz7V6#` z9!^gldNwQ>5+)ZSyC~3~KDX0ov>r}Q5s`^SVlZZoKci8owY7!8cmM~-eVD7!iUhML zEFn*hSzRM)2I{x#V77F6P?qq zkzQ@CYXY3m8nCLk@s*R$8cdI@vv96~zPZjs_pe3nSMlkc1;#V@JG;~J0_#%Tkj+2^ zPN1tWS2DBfXk+acT^QD>%rX;RXtQNnw&;bBqn-glsu~0&pS{j*UqDL+l%$SWtwdP> z$hc~{f}zmeXcK?yYTB{BiV$66cv9f%n4;Mp3R0^~o)#JNPSA|f%bXz3=5y= zD>w?*IXHf!KLaLIi*Akmh^(2FW9-Sy$_N^AIwF-3G=PsGlK_uCt;8+D4`pCv`qyzW z(f?cA5f+vo*JS9z+M-SYqf#0gM`4+^Wv!-n&h6U_$5u8zqaB8u8(pFb&syGY>5k#sSM{A1{R~e>~Y@|M``^?5;_1eA-9h-W7Rj{UdM% 
z3;xZIR`APAkb;CGeiCcdF#D^t9T;GiV2towo-e|KQx{?3&k{Yt$^?9)53Gsx^&f4W z9iJrh_!z6QKsW38Sl}-?cx6Agx%#dW%pMW5YSR)ZGDofxtz+ zw&nTWQ*i{|7B|IN@Y$bna1;3J*;+W(GHiRJ!fL=2l{d-bsZ@p1iKq0rW0k%~!BK&7bN{}B>|TBVA>vk= z!d1!LvQ5{@J1z2cSR?9~V_Mh`qPm8>5yXQZ(Ib@c;#{E+ooPE?LVpbf9U!1wh11|$ zBtZOJ$Fhk#yX&;HUWMWap(Dlw_4derqH+N3l*t`XHT;G?XE3Y`HU)e~_zc=GJrP*#e6~%_zV2 z*&`M5#m;(|-HyY_K^Bp9(EG=LyDa=AM`_ycdE&Xld`sj*=tm={sh>Ij>8v>Tei zB|4edWWZUZe73%{MY{#6*LG9}Y;LPn2|${u?@71)Eaz89j2YnZLDV6q^!wc58Bl&B zp^`W=*^_5XDt4$vMDAbwli&I67vBZ)LO66Ei9YkMu#OzRnEwB-Cs9IFm@CflGjj&j zI)h7d`wZ?VZ~mu0OV7Rf-E{1UvuJ*dz+}z%AkT+^=kcd#JkOo*nwTh-(Dse7sIeWB zrA(58G1`a^hV5QkZe%ZkbLn6I*L&07{?%WkYgaB|31Py9kW}%bCIR+xvz=8en1x{& z<2Qfux9OQ@&ZiTnPsb)e;%km%paB8gff0D}%+nztT)Om8I(GDU+Pil@7%VI+TP?ke%L4J$=ZNmK zxFqAUg4@>v&UJ*K?N8M#B#T(@F^ zCio#NKdZe=4rn*+Cl?PJfoK6{BmA~O9H1zMiwO8Yqd8@=t>w3ZHcx;WQdmJk&0`EV z+7PsXbKqj>ETgUv7c{VRHc#&zsA0YjwDVg0%+I#)uauASR2$aArw?|gTVn`1l)DZ* zLLbLBrg!Ysf(IVX2f;{FSIf!_oOlpWk2Lm-KXEKGyF>$Gf$2kBm)5Yd$pA|F#6^9@ z7ZA|hNO2yk#4>9VD(Oq387agT2i1tfw#Yn-m)0#9VPGx1twYq0Ynd-;K|~%yNDAL_ zvU9?J@Y_+@QxsHzzsS|4iAOt0a4e|2Us{ng}f zB8q9yiDup{qsX|iGM}2j`vU||sGVXx+t>)zCyE;uj!~>Hw{PBwHn$MU){vjhNv$Z@ z;z}u!Cecf7!!dE91Aegu4{1}Q7vu& ztYDlY7p|?S<|b2co%tYN1{VSzKszb7q82(2&!B}yK`h2Bcn~}{X@aJaf#cNv4q?b_ zN{7!3g0jIk0VhiGncqppw~q44S)JlAVfniZ=tCrAFGe#~pTr;B$~V@&$r} zSpMgWit_Dpiwawp;=0IxyKec36H2ZfgA|`Ocd=f<*HG=0$h<`7I~gm9+ZNMWt9;lQ zrE+saqixpN=N|D*sh!@3NxF2u^)1gGnBlWudKY!_+K((zFmU67TqV80Z5~9}TaYe* zQ>vCHzq74dW%$56I6Zp!Od%v(ahSjYMo~7;igpy4?)bP=AkT{SMkUMAm>2(}{>8i8 z7V9QL=RsQbNlt8k^Ex)0g9%&5m+`kh`%zlJFaHVLOAL3oiCYS^Iod?(?Y3DSKc=?G zs}T*bTCs76L>I0_4%Q&*nC#RjRAL6y^xWFnm;U(Q{rmLh8-JZHvAeXIoq&nF6o)Xk zJq3{oIPu)`0@jZA-hG=LoA0IzUwxVQ+TGB0Cn(?Rx2{B=A_7SpX5m}6Zo?3)r{3P) zm`H_*IM#t#UsoI>>!|6-{+tC5X+tcj(a2>Nd z3=Xh;z@)F6*2Wrw%H{;?M4Hl-yXoUWp-<3 z5#YuO;|-ORt{Ik9xj`f<83dT`3VmkTD`Nv(TOprynCSf-x)8~vAQ-^-Y_cF+5yS9% zRcLPNu+aF^jBDL!A?-leK?ow_cp{@HUTPg^hN=I`nFHy`6FunxdoSD>8cWyj++#n1 z`!M+n;D8z&rVVh@L$sA&LECiI14zN3>hmm7=WpQl^$|8YJ8>|zF>wLh 
zSXZMab;8i9eCKYs(jH;&R3Z||IO)zX2a~gaHL4DH*R$?N=0GOcK8R4rww-tOgTHVK zvTmjO+&|unhok{8j=P9)4h?hs7inx$L8~0gjkNVE#}1^UhpN)wU&K!t*Ns}n#Rkl5 zkzXOiD_lEHg?%wL3;Z^J>2X*?$TO5Kk-h@0AZ~34j4~2yu8p8vK*wf>XoT6L#m)N= z#~7z{0t}mr&+;U%_AGn=%XKb^`J~kT@cp-ZGX-{4&TEoYrQ7Z!Ty2t%+qJmt(zZ%U+j599m
+J`32cUyn0w|Vhkch4bQJ`SO!{|tBlE8Jx~ihj(nmiTB5 z%!|ymFe*HF$cWPYQNFnA5sW!$BBEG4v5#2b&SK+HwEfO+byzJriCQGnue+jaOSD97 zzyt*E2p;K1sRhk8*w@uE)xgxlcG@i4;5;Y(LHXmE`FPa&4BT=Z#3&nE8eJ{E3mi4y z%(GZGpSypMIN8uFlO;}44u2xbnB&&Ti=RTC80f)gaoBdL@Hon+hm23{g{zFiY0B<` zVRtuj2g~(4w{c3XPG5cf)in8w4-%$almtXgqFJ`vUI~1I?l1sQuHmasyVqbO*O_nn zu%ONW<{4d8y4rA;+en|zuyN<@&r<)!02WkSz)&dMe=r^aIa(qJ2Ud~c`>6xL+Rf

a4Uu1wcdj${k-x2xhD%?7F?CnL=31JSDfSGxIZYrD=D7hZ zPRvhKH^#oM#j$Qu7ziw8mpEudltrG=Kc)VXMtVq}eXcda!+v267z3kph`j&Qk@L*J zDNpMYa`#X9e{vK=CY}DTt9( zgih4u#fMeP!$B_u!w3G2}|dq-nm8Fa3kZKNx;;%Lby1#uZkyMb}sTSWd?DV zoR$IZOR}D%A`mu%A1oqqM!-ys}K25}ViqfTXw zwS7f@i7UE4+_-){jg3#F=U#Y$aHhv&(zDfKE45%?p;3cS96Np-&CXP;ffyJV2m!*p zgjFNBR=v*T##pR`w+Ts~fxZe+l90q?C9#vp*~St~1G3)Rp!k`c`i2d(YxlzZ+|YpU zF}|V_T#usuMW07rzHdi87HqAoSbh1J(?TuvuWol1A^-eBS9NP2+Qu7*Wp zmB4a8E(d*mYATn~S*#e-xI2CH(ZzI{bsyd=v6YKWma%9UiOQd#32hNYO1vH4 zJc)XV+u0n73l#(@{47cDWTp49W7kENS*W(%ehJJVRz^s2F|EB{4fDUTJf1dL7tjcy ztzuHFTp3UE^b?($)1YZKc!;f1BfG9iq(Af)}>HAJGSPWNUJGU&q^7(269Td5Vv zuznHZ4_Ir_2xWnNMefl^dx7-^(KaT(U}ErHHY&%V{ksSYyf2+T^Ay6`GO_Nb(uW^@ zm<9*$fIl#(c(k~;Kn1Hwi@ug|k@>gZgp;&+7D2Y7sTDk={g~OS@YBAE5BFz;?Cr;A zR+>%iaAzX)D-l0j6QKqYtO91xJ(y<~64$|`=}25le=QIO*GP52^eJ!%;M}M!+!5bm z;-z0a%EyIFpk>B{%=gw`VIzkNG)yQ&H!?JhnfpE92VOO-v1vu<)xz%NS%a{@>?A9E zX>%Sg+vWLf@hsO-I_txXTYlrM7BI_d!MfFiH3|g6Dg!=>E;LYA4qd%=(I+cQtlyD{ zut{4LYh|kOb1n|?FEh|SW74M&f>EX%VQm{_GVH6+NRvk4d{^f#EMs@^X(tRUf@TF4 z4uw@ms=oe42{Lc@NEyc}`Lmz>JpK5MH<^d#!u8~(m%g34_8ug<&x15EejB{A;`u1r zU%;~b-s^w|e*!Gh8y{?)xsDe_ZUH)y&_0-=>e^PuD#{zK6{=$A?Esp1p4)Hxgo*X{ zCrlI~gnjS~l%+z&`e|8n>=eITziegd*|!e?3C{r5gw;V`d^gP|oWgT2K_z5ntk;@M z8u=|O7#}m^1-u;x&m&FICdPwfpZa)qJw$Nh)T0q4gDowr21e9Ui5M0N>Rr@bK|3}Z zBefw;hYKuG z1=VG^4josSJwAj?e|$R|g~{JvZDTf-lith+>IK8`#_-)2xCC^^u59|ZP1#OI+8eO# z{CU~us{4_k&81z=putL(^|5ZBHrZue%8jSIy}_XYxt)2%LobkJ@@}3}1itxV{TaJ> zZt!d22Fa(f>#|Ie*&7d&9i6Gci0au`P{+k zgL8DofBXE4!>LE#!<|JZhoi?24i`^xLK*FwsheI;GxUrI5`P?YDwE2%`x2dnxa1v2 z3{)yNwmr^_@7S?tICk`r;qb8s*k8s;R@Yx1w*Tep%z&~F4MeV1m}ju=;KAd=ISw+p z#i@kijjc@?M-E9N%e-l;TxP{Q9?ceQ6m5>j!$1G*_pv3;SN$S%Y!i;<&qA?yw$qk; z;L!u@?VB0C^3BtEdy<8T-y}Rn4iS=!3K*e9kvckST{3--t%72<>_UONq_NE%cs1kYs%xAHjRyGBF$!ngi{ zQIgBUM}Op}I0@(^y>t~zpWm2Y3bUk3uv2$H`8DANlp@z}z0lx-Y|r*Rx0hI|EI zWewxN1!^Dt^pEGp-{1N5&jNos%Ngo!0Dy4vt@FxfHupubbBYa9i+}LNFAS$%cy9Q} zCqBl_(tF9Ly>pcubTFcCy*PB}V9qyx_0?BUA_vpCv4delyZ~J`OCEda6saQ#9!FPM 
zC9lp@6jP&Et85#0@i)j05)o-Q`2f%BufD{H{`FxW9i$mAt_oD~yyr(HT}r={hDQPJ zpd&Fiw>TVOre?LW3<|}D&C$mgf#gFy zWG6vIW{rIpjz)UU{Y@IyWy&^26g;oZ zZz_Bge=yVNMoj(D;TzpZnPHG^;Rw!~C~{}7?%`xE|6YNAfBI*CoQ|&>)#sQUNXIc$ z+Ic{&Gt%jP8ZX;(3C@0w*j-?XeES+puHeCWM$!M@Z#*-6;^AW)Dax(0jtbks0>=eL zAvGk9uj-?7vaPy zps=iXSU%+q4&jUE8syYBmXX4#EN!%I*{NSv4=E?ak|)+Tlb=HebY~~QjzpF-VOYs0 zjW3^eB#tn+V_Q}*8t^^gxcGv-fvm1mv$`NfxB;?A1~_<|Vl z)RG!yzse)3CD6tu3DpTYD!R?%Qkms@{p@SG*vP&K)@5m7l$o`Vh#nF0d*bAU)GCF(3s}y4yp<~+6Pt2y2Js+whqx5(!W%CT z6*;9Wogp{`zwW(fKdhJ?4lH8iGqdE1lI)iw&8u9@_AQp#TDDa43@mt|A4R;*t)oeD zaUqkR*rR+a*IqujOj|IA9DDGFg)+}*zWX5`c<@8R$@|_z=i6mXU#I=}Q|vZD9GTxb zyz8;|4exp4UBlTcUm3Qa;Z#HBpVTK?XfVodATiROVa91Ydg0**ALTxuBeZMt!z0J` z42vw4K1Z2t-O6$-Y*%^SUHBh@@ev0ya?jDl;m0}h`;~K7hNr(nyU5_AgNkdMH1^_O zouflR$$V@NCqEs=K0=t`>jj<}+sRRLS~@wO2*#RwRL+lWi(mW<+2KveX-j6*uD ztrBs~M;%+ovt_XJOcJ|7Rk@TBEdxdWFr!J0c_XiEw%f*3|5bWS!0}C;PN!n&#u}q$ z)=wA+UQAkOEQJYFaG9PsiGrI-BS}GrqjiKyoZoT*Si|zH94BV~5!7Ngf5JJ!V&Qlq zi-gIZ6smv_Z=Fw`8U^yLyQDMvo`!)3e|fKw>{13BQ~6`z3+Jy<<&pT!TlktXZ~1NS zf_B$=pKs%ukNV(O{veo%2L%>2J{j%DfN{fQBJfP@*I#&M_`z9+nV_@f`? z6r=kvUODm;8szm>l=!l1>1Vk)79(Vd>0^b%Q7CU&ww`zy*G<+?y2*7F`e#YsqPJt~ zX;$)zTm?4eV!q8M|D2-C6Z(M8h|P7UqgmFp%5t6E2T!0p8CBHavBT16uCmn_Fg96? z@?e&R%V)6VhNFk>flNBwyJm(b-}@m(w7$*e>R0$?%`8V!E-^xPod)s?fA|HKJslc8 z_}-tObMVUW5_d@)+If;?JTz7`_^uO`g(@AF>*Od?$~k-`b*E|_$#JQUYdy0c0D3eI zJmS;iI34BoE(TA_lGbFxN&$V;8kTVZ1G7CP$QM=4SQHVFw5cXISP~b zE}OGVH6cwiC$mXtF_tnyhe4lnV6cJ)?gkwajsnF%+e%0005|v^qyu`M<8eJMRynma zZGo)!uCsN&c1~Csb`Y2S19S}B@Thj&xr-SR*3Umn{6F}cXNRBp;U9?KRrq~~j^Pq} zz%Fod(EZy_z{jn1>J)X)JpnH5oJB`(Lq1#@;E|qmoye0l5<-0E>7_?fWqE1FDBI>w zLur)_=8jhD>H6Q&83E6p8bP)Bj3ArUFIE1kGmLrxkaOB%^b+Q4rpFDda+i=H}peml?v4trAG_n&(OT}-F{z^>uPf9xlRfA$#;m*D7MH+e5p zu3OvXeO=G?r+o2!{HI^ZwoH{%5G9Y}3z4jVBM?d96MC4vN<%Jz&2Rp3K{2=MbH}^%X()ruMMUGpXIVP_jy=sluDRV9b zv-z>yUB)QqGg6G+vpqO{mJ`58e=i**1;u5Cmv1gG>q4iJgpYmy6$Wm049jegUO2gb z`0hT8e(vVE`}keM+%E1J*>jNk&avh$SERi5bL;ZWXCmSQwyR*Q! 
ztam5!bzpIC8ZW{R83{kea@hU#>+-6f&hLVim*T>O|LP$&a#L5H zeD}L^A*&}MoqL1&i~L%jKJtP04j=u%`!XX@ds^f>g9PLu`xe+Vys)qcFBo;-vu(I| z`CHUm_c+w-}1H}vwhc?W!tXhv-?v+3$9ISAk#wpe;mlyTuO1!`<%g1Q%jy%$yi@GHJ#Puj{z zy8(k@zC&yZI@~ddsSt`+Hxa-p{dInRY!uoxror4O>RQSi_=;ieZE zSx%);1<*W#b3Je5L5KIf1T4PhuT`G-HadId5vH8+D_!BUaY-NERi+cT7T-8UgKv$G z9Vn>!M?K@kk86#jQm(-%L56u$A@g9>H;N|Nh-3O`(12cqwK&Qv7*sj!wW zui-Tc*JA@sK7MGYTNpT(&z>57?>B#ScGDZ1OTb*6lUX^W*SEP{U_}+ zRMy?BJ9Py8EEl02WdFkCL=Y7S>CQ1Du}lZ{_{oQd|KJz@li~mRmH&PC_BXy394aH1 znPk*lnu_qqW0xsuHT=wLXNNDc6yf1V9vwdPv5%#bV1;x;ST@JXPYh){6&g0|0Knj_ z2F}qx_Y-WPvynzu!yCk|Q{BSIEDhlc?2&kW z_~v)MIlTNLQ?gX(!^gG_cb$BUkuUZ}eEmCgBEB;meb0k5hBtCUwRKmToQ*Kwj`&`u zF*AS8JZMLF8JDB%uX8M>HwmwBL7hfpj0s5Qf9j;vngz?(cyKR@^b*T^4%3l2!bnvH z)r`yNoe)A>bIhZ^k!?dj)a51lr+l(xg18TzI5gaKX!r2_i!40=q5WJtyGxvO^k+|h zcR0HL(c$o3@FAaN2+=UW&vM{Ju*9F)S<}_%fNq!PI8)3bbS^ph!^@6d=3s)m z*4|?7U;;MGuGtQ%Msm(?tZoH z3~QPnN9g_Yecpy%y!>gmF0(A^JQqhk{oU_!aoP%}$ZcaZcsHkjaWUiu*O~wT06+jq zL_t)A>#RH1sJlx2*upZmBf~HL+`l(`>Qg_$`t#k9Z{;U+QU;(5kHoD@1K-{|55{eQ ztf&pKh3)G74+jSWZ2u!EsIt3jx%Bf7ivSR2cOJw z^RjYgXCMZn^-mr!Q>0#(GZ6vOHgClb%cH@iJlU7PKCM%y*oOhZh7k*yqq9PCIRJwC zkus#tZ)3UNPIT_MQ!g|7_YkmE7RD);R3rD+@$g7PM}E)|y+LRHG&4+#Tv!@iL7F#7 z%MSb-=({s4i#fyq*B}4oUvu>Qx#5FPK03_NnY_tb{`p7l8+P1(|FD%~tLIq0IF}>C zF!_UmCWa-99XEjO#9LeLqQha||pf*>>uc?;W%&^UM&K~Gb0$|6dr%JGvqPc)1Ztr zg(bz2D#GHCWx@9*@`~6^St&4pdWF4_TjU7+aN_8(;jUvx>3FMqJbsl6R+)``@Z`zi zfqU;yhujo1@L>Gryb}Z~9UPX&QgCG-A!SfH*(ArhgnWcn>oTH^LHzJT53q-EA9~K2 z(q(3Eu5u{K1x^=aHb^;x376@_U8Iw7`6~BQ!24HTJrzB+gU*E_c7itL>e8j*Z=U`J z9auKGgNIg;a-qI7*}chTc8{v}cI~^59pF;fBf|+!Gjo7q8UGcQoO)@koBXRz$9=s}Z8SI`C7 zV@`k|03s-G4JF-_hN+NRa6QH}+=gX3!et(M8safy5f&q&e4Mgdq(Qmj5nYTPpbDb| zbMf3wDjp+rrXzO>mxaZ*oWuDWrID}xQ!s&RUY$?%G`#ZEVLJU5O7dlrElqpj^1`*n zjz*`^RU%vY^i*$q7K!n01%t<-$jbv4ih@f8kgp-ThM5fJ~Ge`RXyRLtM_21|}`DjN01g)__=umolgQ?3grA)ObId4fq*BK!_ zG8{f~H`A++4PW`%=Z9~9>lsGt=;&}S!9n&Amls}uk=`1PHmNP`~YaUC!!(4DGDryz$n{d`Qb;vF*YdUyn&+zqA zr@)1Q?W_i!GB0WS?%DIh(=VSMb}bxY^vTB5k}Ttqmn2s>36gGcQ>J$Kl8X57KY#He 
z`&XW$v1e%l>%q@}*He^Mn6=o3QFWG!;ci@`2M79W@&o0xLU5q@* zS5Fzbf(-sopZl%hUwq~tvG;5HaFLG2b}pm3#q%<^BP+6=!M`12<07M;j#yD#biS0$ zTiieT^HL)^s>F(`#svVcWP~zXz<~g9b93=DGka~Xprar;d+>iw@z4BjiW)j zo3rtqy>iriC!4FU(~;T7rF^flu6T(NRl0uA#q1yk-99$DI`gaPWgFmZ#5Uy^dGb^w zS`5XAd?{0wL3JW-+|VGaPQ*$(hBZ7;VL;+FE`|E~^7F$5Hby^i|9!(h{mf4ffB5Au zGyCANzZeoOwPJ77Fa5$V44?Y=N7KpD@Nx9r5z2BP0-ZeQPB}H5jZWU`EvTc#LRj7! zu+!M>%|~!jmXuvOEy#h1`OkX`>SQ$sHyELZuhqsX(!_ut-roW!Wm@C!h;#f_;8W;> z7^1PXV@C#7>@2FQoGtZaB$tU@;S7C8>o2j?$E8!&`@IY-Sei1?wEek8{Z@~8thMXM z&oFzv3thJj*}KAk!q=aEdU)ScPq5jQ&MwPd*Qle{N#6m@%U92{*XvyJ0y{Y_e33fC z8^>|q``AOnF;;M7h6~S<}#em{wHQ0;rD`fMR0lAPWHdpDc;Lo ziPSx2qK+({7!I(fOy^x8^71qto>g?T`_|- zH4XHw%32&*ZVq>6oR%f;E-8x9i_9vE2zpXb7A+Uk*T_a{DO-3#eY1S)Ldv$3NWi3G zU8(ox75w*}I5GUlCqJFNZ1VC3e0ZJq{g40jPuS#qv2c;1%vXnm$>DUUN z)UyuAd5gGtIfvYNg4X?s%GP`O6qLYvdaZEZ97F>iPF-g`~{xw%b@tgnnH||d=v<;1Z z{HccDmZKri*WdJCD0#>0ldg;T&E{AG+M>xG?aA2Upn|3yKy|j z#@_`uH9AX3;qVF7ypns1hl3ND>>7Y=TPJy^kCkxhV$a^)j$5-;nKO+hYx@H z$C%nX!BPlmq&Lq<9|a3!xlVURsK9R}-OuQ(n^NaFrqi2GrN^mX^IPYkvyOT9el}eH zG;1yQ4}bLge~?>7Z8wqwWI%<~UCIg^#e*szah~PUv)}!l-)3X-V_abN5tOi_H{k$A zzLip?j6$Q~Fz=4U&M{)7HLmeM>rb8-$>d{(nE_{w%$Fm7b`*?nM@U1*QJNb00}*ur z^o)pJ%jqr)DA3$Gi4k?p6NoUj_Z;HVIHt#)kuX<|C^1hO*^t-PQyZUEPPf>-2+!X2 zIDESYJPX4MTs(K44c3R@`)>9hT)FWY>2GJo;v7ojd&3@#+a27rdV{sb{>dMgcesJo zO=-I|SZTOYj|sEPT~g=zWKRT9QRdjqlf-e2FUzo8wiP~G&tZZWB6)nTJl9yfj@6I`ks$>v3MDSG4#8v>6WW<+$$CGuJwxQmQn?%{qB+2s*WGDae3bgF-|2Z3;LT2^*Q+-st-as24;j>GX>PQ&RG@{TU}C#{_5?rGqI~2q!3E{oPJT zHF8}(=+X`iA&q1AMcDXzym##dK;IDdeja>Bz``le;y})jtm}uAF(D+t?R|XJ34M_)Cruz6S4HLUsrlKY9Gv@c0Aw zM-O<@wH@v`XACJfkil1YLnGd0IK1OaSG;KkJKypFUw`G#iOnoqU%*4Ye{ZYt%b;3&1bC($eU_&GEbL zN*$IGPh<2*TnAS6GteV#p__D;SiYD23+NR2VXYE}>8StAPj#Nm7qZoOC5#81OE(oz2I_o>_7Ut)XV@*GBGN#Blya+6gE;MB-I#z^vBZj?UF0LlGy?6)3fZw>lU z`#tHBBvrdPsFn#ZOZmLUz`*Lci^DF;?>f5gD*8=XybMhj*pp=E$9*DOorNaeB1dlpU$2^`-&%bD57v0iSw>K`J+WbEC93MtijO zHD<2Qa~t=TIc0-b;7KQ^62;FMP<0G7n*%O9F)bIJhHk;tNsUm`*ZScZ5X 
za(aSI{_a~;H_pSW-PjMaluMSZ(e}LZ%4@^7pLuRL%(Cch*pAY$6TOTekS;QcZuCAY z+sFqph*kqu((LR|Up2pEFlC*hlAJ;&U&*%F?ycFs1bAcML1`JH-5rm!PT$-lsc(RwK;6Hi`>iOtZ%q$OO<- z1#6iJmmm0m@@qUpDDbz3zrt;kUqP8&9zOA-KQ$~KIsvcf2p|qEMPHI`gk4{dYcSZ6 z*vH7{MW!MzG7`LxW8XaPGq?Cc10oO<3)~|8u}}UG%Ycp!zx^8=H%sGYOW{bBO4V}d zF8m9P$q1BJ1zbYee)TI~;l}Gz!^b}HX>Q6qN+VDWqsuiG6}rPpfGm1lVFh(70fib?ar0q)QiszPdxS%2G=?6KKLFD;D8 z{LRh;%PScynW7{F1KK)Abnx*Xxv?y*vzRR?_M6;6arw$tHmWTRFLF`S;lu8Y(CAWb zEgur-vP(QMrhY2M$jmHsyD#D%mJRHGV(+lZ=~o(Sc2ZYqpfCOFU*lvfmY`wjdM>yp zyLbSA#=TR~b}T%r-M=#!!Sc^jk8)}V#-2J2hI4nst>GJ-NcC^O`TX!C)9B||`m{U` z?ePBSF_wHGH#Zm!wftXV8P%4e9*N59-(a5p^x9*l6 zP)g)cZET%kMBZfrvLI}T{J7GDV+7G*Ym|#+)^$Xl!tl@iCa$N|Px%#p3?KOKdyJX( zOrn;E<(~$x;Dxi@sHHGf9)4?Zc@d$5MY6|R*d2Af$g!3r*y!OXL}p8NGXr#kwaE({ z6XoW{PygU2hadW(PjT$%p5b5pt6u?^9aeb546bX6-4txYTSkDo5*deQZmc$3U0@n( z&}k$7+zY2T1&u=qSQ@dWyknR>cJE!oKl>*?!5*Z8lo1^g1aO`>S+6ZG(}+|4;R_Pg zcxZn8!y8w8s&8eQc!t^Q&b!~GEA*GgAuTn%@T_69IvbSp;?*etTzcaI##)W!I~%3} z<$#+!Ro2qhflt45{HP1oPdgh99GHgb>38@jvxK~>JI&h?GO6y{hWuJi^iPMA96716 z7s-OeMm?jq7uh?24u%)`vyuX@5qu2e&LwhP77UX;ALxJ|j2vdqJTv=Q*M4bD#g6;lcO4f7o^6!C@yd zvBu2tbrQVkpbt7=)tP6~zs8K^O>TF$T%9FzhMKW@zUxG)MdBn@s-X zjNd+6SYF3JiDy}jJ#T+^v|By$;io<{yz9v)X^)syp?%rHzL0|l4kKnROH?0T&olNT zXiVFS)G+&bhd>Lh!1VN`+mbaO>MUpVy)9c=@wCV6wE)=Vo9fve zE;lB1dAb50Ujp~{4!?|Dv4>e@t`cB|(WRZ*ZO+)aa$xV?z1W=$q)CzO3$kzg)TbCR z%!2>tfRpNX1lJJ-4K=4XzXa1-IlH8oKwS~sSfte$e(Vl&WSRP2nlyIdZ zY_0+U7qGtbv_s};g?1PmptRJ&!&b+w3jrolc*?i5$R|4v)-yXz&aPPxEw6UOYo$=i zs|kq27N?VO6RYyCUjmCN;c$L60N>u0lQ1Kxs8pENIPYV`&$yAbTsgpVB?6mfT z3Kvo2*g99j?WqR47k3QjS-0qvwaJ;<5;GC5&ECdFC-)_oXatM*tSe8(@a9>V3i%}c zA~T+yt~|vQIt#wV8h`L!!et@afY>P7cqxdElyDwg;EN~X+YtRNK08lL$7LsuK4~Cy zC*HpoH$vlUXW2&Fp%|f-NMNfr`W*zo4%Y9;&z1cQx=e{1LP=)2!fkNz+h z)BW0T`jt}=KQkguouV;$=ezkPFHtVwawFJpeg6L*-v7Z5a2EV~6UjBI)d<_@x}*Z* z3xjZ;(||-RFVcBn2BDXSSu;U5Y`fRO_41ygGeaX{!ca^8G-Nis!020X$16wWUAuf9_+?IQIy&5a?*qec z{qARn?|tv9+1qgL>^79=`C$>oHG3b1JeOx(12<+Iik=N{VB5vW)$6Razr^x|Bgc+m 
zU~qOiBjDa5Y=^<`RcN0>xy~}f^6c|33=iFRca}HVsH?DggC@T<)SQLzl&D1vzegW_ zga&_SE{wdf3QuRbj7^~d)_>J$HD6}0^od5fvj{eTmT5E@#A-avu@A-b=l!yGA>6Gw z8eO#a%`4~&rqiE2cXoJUh2>!mG!Wk9Ezih@$dD@aSJPYRC^9;_B)!VPQx&x z?7#ip=Z05bKTAW%sVy7}n1;g65Ov>CDhZWFmxZXeattOTSbOOdyVStFGxBfCti~~N z!0f_r{N>*aAAa)Zhb^qZeS=2V{a&x1^r%cT=GPoUM&?TWB-@TL3{~4Czt$9EJ zeAP&}f^52xKl^py#V+jV|*N6A8eC1sqc$_1rF@!O8vRsS0 zoMk1}ZFuDp9QBq4cJTr3!5sPPGo45I86N3pppeG*UBfuqFJWGy=%9-5%DB%7eqiOD zb?!2Ib!>w*dMuZ@IoG1wX+jo=A037cS^+kr*cdSyEI zwmSfFfx!lEt*<3&)(Z?Q7@DP$L{>apie`N`z87KUgg*1f|NX!pMV}mC>5m=mc=D?u zWcesNhH2a^a{ALA22`BUIdbeUGVQ2obx_mRf>5*?L-J0!IM1?`*T2N6X$(5uWMlEm z&z~9|Wk$+FSEAnm1T-#B;xqA-dND&iUszlmRu3!=*DkWOZred-n|E+9!>i%-Zk8i? zsKPCHanqS%)~ByxL@r%s27sA}eUCiC9w(NfGGmesC*@E}k4pxm*|L$BIsJ|@$jCHp zrn9hzFjk*rkYe8=d~3EutqQTM$v5rC3RgYQ&uBaP zGWM7}BFB`zh=NPWvvrj#RR+iuuaQGv?&M%v=bs?O!-6lpuMTkr)WN@fDYvv0oJf}n zg&iQAm(AT|sn10^nx{{niNRkwO1i*my;R;g8}|&yYH#7*saXaBI^dv@;`c5$$Wd#5f^hHT+s?*hg(vZzgC$Luv$ zDC}dU1L_4{QhxZ$QwK|}fAU^k>o^$wmtleWFH8=8a8!$QoGDgLQe^(%07)J^&_I03 zz3I#{J9QlxS}p+OE=Y-=y_CFWa9SCne%XmwVB@}VZI@(?bx4_T)q}RDGdOlgQ#O`3 z|LurcztUlckS;@XB6_1x%5q9*6|mnCkcsdp03||IF%>DBJecG!$YoO2BShTHd{<&( z8w-%RD}i<6Z?;E{OIcXH(*Vg77~kWs6Iwajoc=|FPtzi8UNjDNu=l}D)>kR`C{n^* zo9&XOy=&VsCV-!!0;VAi9^VaY)rYW3#{$yVAH#K3Q76$`p;mB*Y0pkCf%b-jiJjhsKjvv39q?8I(P7!sB(-oW!JbKs3;b(sCpAWzBxz7&Y z|IW9xsf$nY1$wHy&95?E`Lmp&i12eH;wH;%{`^nAG`#rYi^C^B^{HY1L2lFr*ox}f zb}OEpQ%obCPFp(aKthRcITuYDNGcVVFWCX~7G`BT+{SMWWm*;7TX&hx1pdMWZZc*R z=>W?$oIcN93i5wr>5btm>Fh)y?&9PR4Nz0m)sr-YL&4mdhfo?PJ1k%O>fg{gd66mG zTBfjs;=X!?lPM1FXKnXl_G5S=ljrff4&GUjSLsMCVN@Mu>V7ZE_B9OIx?!4zAG(!s zjf+*L?k`=u#Psx~;o8w<3~9?m&Z1SZ!n_P@4Fy^TBqMRu(FHnyZc3l!$ytfRwmpZ2 zUHebKKY61bA~*1~JCDqC^j{pVzt%9bVaS*cblXJEg72Oa_YCiQ>~f~CZKPK*1g|df zw9wdZ;^j>@QP7T?y7`z4R1(SnxJ@^&GSf0*Ny|-i)jT7M+gb88>jjE7Bw%>vxxX3y z+yDN5#L!>iLMYCzhZow5jslAny`5}q!G8x0>S_47%*=ru3NM|z#xf@5$rR}qa1t1! 
zXYlTIHW2^H@BQI$lzjjfxjgC&ql>5LIBQg2K;O9s<1!t+L;Dwpm(Ozc`GqSOU0ZUL ztrvXBAB_}eB0Sw}E2E|w6w0evG|TeO5!)+l8eHOT4F^3?+k~a&VW5|>Mls8X{Huym z94b9dT5w{X*`5XV;_N{u-+lBjx)+H@22C%;$dB(D72N`AUEmnagLmy6 zo_Xoz;noYUvWR3y_HD?!DaaR{gZv$^5Zr(l(x+-tE}raF`pde>##nCY-r_04)B}xnjaH9+9Xq&g?ikTj}hxI9&b%6fhc4H6rguWM}$%JY$o>u)u*70MYT> zKO8u67kZ0L_`B&j#tEf<>QD0E7qn?q?M8-o99-lCNS0|*XaDk#z7(X+fM92o_ZoF&R)lsFZmYvBUMZZJ8zwkQxNxkNv8-LE!9_M1X z0|$1qN&kM*z=$Rt^W#t2EES7#L;aPN+M~Ar;@0p@WkK({G&JoGZ)s})T19907rRP| zK%nd)40#uy-YEhJ_Iu=#e&QIY+W>%%Aqfo}S{Z|01D5VDH1bEgY1^R;Tc#d^Y#Zw4 zZwIf6M+74aGaOE#t>g0HH<%sDEI+W)5gMfc5-NXukg|4y%eMTxVY>h^HWOQHtDxh^ zkweVvUd(KrGm^HI4rHNrvv0)CuJ~*l%$+jnk}MAa*+HTy!X#DFlhYQ#Z~f5brxXX5 zHl|$!&eaMY?vZ^p;SBZ6y!(#?GVqx~^m5Hzbhzk2`@Te|?rU->_3j8Kan`fSN zlq4|=mqIGGFerE{W~D5hhLH~>Ps6ULE?WH1XF?BovjSVMrNQ2?ib1|rCx&qO??31^ z-dv25WIo^HJodbX2JkiVie7%rA)*24a$AMXFZ|X}&Pp~EYG$vZHC1jeT!##gBdCwJY{s-g!&APmtzGNP74aIalBH~ik`e~lZi zmxd=k{KKq`to;Y(so$)t;WOuGp)(E$eHwHy?%=M2?VJkoJQwmfqLt0fR2~icTuQgb z2p}Ud2aX&c{=;ARB{p3D=J3~l{$-RA6`Bz(k6+bgRL>E7%Q1Oow2d&!R9<+5pFM`> zo_}e0{j;APp8Ci~xR~v6K!{w9TmQDBfX+DE9>4O7a$d?E1*Wfa~ zZ5>O>-eAcQiR_`%cVPegaL?U$r``n@VA1Qs^OsId;#NtRew=H#4)7qm|tsQ}3D=Ia)F-yx~P#!Bt%%Y3R0-<7}3)PX#2ujskm6 zMCxy5evDO49lQJfM~7{9-ER`Wxcu@8C*8T$o%obd_yKP`np=lj!NFG}pd=RNcMLa~ zZTa@MzmvEN?zN;YX%spOeVM_CoE${?pS!6Rp-Z@H9^F!>U>k#|8sTnB8?4Dx#h-4vd-aw=9@ zdL?&N{^k|FgP=yVq0{^-3#LFAiJ<@GaZy>Zc@IKis(j49j?ehzqy1RU;b8!Xi~f+d z$SX8gYVt?`WRsWe4E`Kq=BanQOwVO_8pNi_#CHaMz(SOgNAa0{cjPLAAnLJ2+Cw)) zTV5_vG|c@CKFK7%8rp1_$5xzWkj(bMwm2OZ^&QJ?=h^3?+^@2{(WPVF-?NZ*-bAMf zhc|o@EqwCZRHEli5n1(ya_v+H=d@pQF`Y6j@fL#=QBORkcL%fGiLY3&C&WB>!k>6k z2el#8F$k ztCJ~d15qz@$?PtK^beRZ7KAVWWV-%tR>LxW6yOT&2n-Piz94ecLSk2B6q)rZa7T12 z^NneVT38~uU^J~X4h^mFJs&+K6Q;9;5-?5IZ?SG1L**jJ?OkQcGVsK6Per05HzJ*q3 zum<^^{3@4#_HV&E-<{4=^&uHVMg_6~_TPT-^TWCGXNFJwqaPc#a$d8`S{%8hd8fQZ z5=aID}*l3coS&vRdT3FmS{JTH>bHfu){UG-wa7hp- zbJC1TllPQ^&%COvHR+3od6Nd~61cW9UA#sGT*NRiljI`FD#zCt>8fdKDlQRSK2e&5 z_~Mr-B)&|e5haUanL!rPM$oam#T2ja`;MF#e(=YCiZ#cq0q2hzMD(VFd7Jk}Ub7Fb 
zte1w~$f0tpMl!89<xS5?7WT$ri*Uyb;wh0~PPgGSG=CqsM%~C*sMYaUIv#E+bm~pixoEOcB1! z$YW1O87r|9mwzZv@Fj2lpipPwUryjc$zQpC7Bh;xgL4Z?HnV@?fH>=pdhrTy;+JEh zuxNZ)VWy)RZNlKIQf>a`NjHt7x(D9a^d$sb+n80*abOks=<)-N)=N{s-!$c4@v`L0 z^flmhz5IlWfon{SB)+}5*G`3W2 z|GqA^mrd$)OJ{+OoTJ%>`DY%)VS_7X`D58Q^Wdm`_>#_))WRy@#Yd##u|(p`6VA&kiS^gz=QdJ38aFbo(=U z9KST~q+L`^F?75mpGjJxYUmOidVFj0A7-?OV?3r^@#L}2U*!uYP9FiNP)kN%zsN$u z#MkpM#tq0iS7{ZV!cf8EGFlgbk8z%Q{GQfE*mw3_`3Rq)1gf0Hk-B2Nx1LV;S9vW% znH9v;P1oFeN02iS4!AjT?CD765c@uIYzA!e^MRbGGj3sgzH#dGu)tvlM|SSd2}9Zy z4iH@dm;0LLX$PKo{E6X_haMl!Ghpf{w)N4nkzeY7=$}?^YWB2vTTJuazv`&l(UzCa z>R2}&EHO?7o0{$ecw|dd-C3dSpcbNf2a$f^hsJYTf!I3$HkJNv7Vr>~;C@+ofm+4miuJj-2Xvv?1*-2TRay%NO@kr2Vv> zZ&II?3GX_%`>wlk?5(;@8$!FtbSxVOR~+RxEl(J;O;Il!rkt4n?JTvO1J6ZfGR>AF z`nDK;RGnC#S!t_3>uKrd&Jdf73$$miz4m%$sBK?d66mtLC{MyIcQ3E>!qA;`fNmm~ z@*$@j0_(tlbSmfk6hleJ0g&8LL6~SRzlK$~c$y={98k|5FCbX|dGx^#)d;=AJ_Jk)f>m__8tOO80X_&xNC8&H?V^?r02r2_g z778grXvXmQ`k#MsxN!F6;U|9PmxkG$PQ8;XKXm~XYZL^TKN~3I&!I4Of)yPKhk%tyWWd;8rGPp=1c|?v~_LcA&(oLHFK$gE!;|or@?by`2Sv57*)Q)6ns|@#9Asv=<)wwQt`>g%f*qzmj@c<_M-hI<}*h`njtioR{vu!X}L6ls@j<^n_No^zej zQ1XMkA}>me^-}{dm*XW)8MlV2&m3IIoh#v~`d%G5;VN zC*ZN3SwHAfGIePRs#`acbYj+RFjvr^>k&9nxhaF_w=%iQsPTNKo zdQwkhBDYnuH^=2rH&|}z44SiZY;K#N5#FVH9@Xo~V$y1R;KjQctRUQWs5?=UT6kr< z>%h8FY4XjpG=F$mjdfsKO^pLZUY#VKqC({MDNcpeK##ulS1is8*FQi`h z$zKZ_7iN#6aU_v`+cQJu2k)Ey0GkGzcrHh=F|5Wx>F0YxgHcbx)HKSN`}350l*Jr+EUD zH=h107U85#X~8GM-Jy72}-{y znJ;;vzUuiX8j}){;Tmx*yBcXUmUP?6z>`m(F0%?TdYK_y*Sg`02%4*KBZ8+Q7GDSt z&y+vXbjh3@0pWWYb-mRM?Uc&|brPS@nRJ75(zJvE>r8Y7Porwc`A@!AeVXPWv>Ifb zz=#5#MDCj(;XTjtvcC(tnd^KutiO-rjBz9nq&ZgySB+}$DpS=(NVb1*_jW+ls99r*2AU&!4nrq#VOnondj;0S`QzELw-; z)he5)TspnNJtnrj9w4y=UGCD<&YNBqNt7SIiQb%bukcEPyvP7mLP?|RZoWGn`Njw7 zls+?l&r_oDGijD1a}~E}?hy41ExdNz4#c?8*YAX(n{5wa6nkPgb&3<}k}jMc=g;@> zk9Nip{=ID4Kg@F2?aV?si0mCBE4+)_dt^M}@f>>`Jo0wW40p_suCvbW!I2?uaz4t^ z;a$79!J5v8^~u2^!hCVspKA(jXY28Bk@uAR(Aux9)wqr(}A z2k0gj5?w}NY-3}dCo=SWbX;PpwZYRgCt*p%@VCdqNjzVSOT>DplJXXiKAIQ<4xf$O 
z5Xf>{7n02CSFXZ~F5gu~3R*VGsG!2@R+c6W3}2J{7vHL3>^SoTHk~IbsLM8DhphwH z{3GNN*NQYLKGOt;|8%`9{gNcTo%T(K(@gM3KU^ztSiW$2_~rlkzZrh)pZ?r%-(&A% z89LL7jKl}G+DWYp`E(6or5`m(rMDdFpHTr)3Gh!GmD$^6T!<>m#7Kgg z!M-@!~QmK1YGEzV6Dc6Oie<1T@t!58wR!VT0F*usX;wn}OaYqrnvpDI`gUGa*);!7OM(NHbb`5k7NpVFd-y$(nGQ6@ zGxB>Uuj6#a@0Dlesp#m>cY^!PVSD~MUpvd1L`D|$qefayn=75wzy6_~RmT5N*vEL~ ztFl~ejw9>q(>6Tiu1?1Imlav=2w!euSZRxSbTz+`K}UU6=UXryy}#fF1CZA-vaYh! zcJZ#G!yb-Sb{yQ}m>r36Ijc5~Y1EQI@k>+b)*1FsWuGFl?x|wVsH&f&Yb#5vH7?b| z&Wz~4rV%{Jf-klu)lsq?X;lWlrohY*5yZF3Atg6gu2g4Y#eIWZ49I3t2PkYe?&P{8 z-6{NU>;GnWC*`sMxqP%d`#vb)blF5r+j^6e>CT=xn@bB_a%6`|5Os+0%xBt24E9}g z79M`+A(pvwx|Yi-39scn%)m37!|@pzJwlrR@0bECvs$&Ob_+*ZOmS&l|C?(jl>f7z5tT^kkxDPaqSW z>AgIQbX+Vuh15Z4PcigTS8Yc-4P{r@uCP>({lI$<>>rMB%=b?Ae_X%%M);wgm4}^2 zp$L1Y?z1Sla=)7$2bX*KXBjxyr2qQiBJ;G-$$6ZQNg~Z}Bd9`ki|Ip zmWB#$_6>uXJ?O(}8>5+7F$Q@Brt;qgH};@XB~}1nK%c)Az$^u|gAGp}@i%jI+Dlxf zaIezqTwV1%cvBf=Sai_ZQV;Ub?OXCo3e;C#H@G|He+rh0($kVs)8WT;m4|D@Slt?xT(bYwEoKW`<3&Yn-6-Z+`iI9X|DwOT&{N{S+125ndHX z~V~8r3e=Nu6|z>3g4Yj*n^ZCr@Td3xy6l5>)q`a^wL4$rEv<$)hhjE@K9d zL5Ab{bMj^51IpCWJz>BBdK$auV2966?|`N`Uddpdc+;Iz`QM z9opOddi_Y*fGn~3DcX(9M?+~kF`;;-*o6-)&lYCxOhtU^v$*i2(AY2SE_`mdyPr>a z-ZLdW6}sguwwmdwNclBgUX&Z1@^-t^v(eH5H7xYAOYQ3ip$y}T@8d-D=5IYG*MAlG*sn!s4Nn{W*m#x`fjhS3i5I**I>&;%Vo?hj^QUcA3XZT>6!}U)0qu z14S3OR(zG&ZqzwPGlK!#^PFhr#f$2FPZqKw$G}53JAk6zv0ci(Pn3am-j0NRy5t4f zuMUJ^8oXsJYcPA_(6t%vOaOm!MSKbNJu?e(EpKmwf=GAa!BAE5Kag)u#EX8yn zaApf5)8zThIuRf#JoWN-{*_L-Jfe`V^{&_Uv9Iwo!h1)+15%P_{5;@8<2siaatDw5 zMBK~hC3V8+il;7Vb5Fti@W{iD3?F#^2e=R85c{<3U{)tWIoUe1EvM8SBuxI)pe|*L zk?sh+y!WEWlnwS{%8z9B?dB+TTsjMMATncRx%5AH?yO`Tcu{hn6T>jbT}t)+7oX=s zyI0vmgKc9R;3qQ~ri$Pka?>n1PHR{B<)9Mt zEKfY`PaDg8kW6S#dB~^ycYxDQ2r|gPvsagRjJ6lW_GQE7&sn&+T9;J}mI*S^KP!Ov zRN~1djbg2(wO3pc_S@vu?92rpm_LBqg|{j0TE|0DTVBNeK&z2{S-*^Pt?Fu zIN&sYPAl1DTcET2dD=_$Qa!cFA0yzK>kw}nFRn>j@K)1{EMEH43#N{3lTh( zp?Clw!tN+Gr*`H2bsn5t}*gPtd!V;%z-VFo~g!jH85$_uiiApqA*J!DHb z)e?H?CG*caHn+4BnbthPE2m0G9{Crij%SuSkT2y_9F-pV)bM8XeZ>=P9Eqm_e(_)b 
zZ^PwRpB+B=lRrx#bJhGU)}?X_^elOva}%4D0-hJ@dde3Sd~hp6jat9WLF7N=6isH$ z@{*2*$}cI4vtdx064vo1#1G}dXUVtxn&Y7LxjBy$)t~fJ?}RIy;ki_SFk@sU*-tT~fT$EL^c z0f?yvf*roblnQBZA-Thj<0AXkVTrU1yF+sNYr@P=FYV-0w&FxJMORRIw6_K%j6P*P zd5Km4e6+~w)yQG{p751#-{W|+w|q|F@x}1Ug=*>>c?v|2XY_US1c|)WcXltxjShmUh7gzFUe_lFD`aXFQ50Jdlq` za0;i4mbjSSlwRIQ8faxB0-=ny_|`w)TUh5;=kxg4aZhP&_{2v3)b}y4@~vN`!)MVP zdixJYrZO9lIdOH8kN=OU5Zy~1Z+Ok4F4d6__pC9_P3?7$ zn}!-j8amb1-qY=Bw7K%_!5eAE(=Lj#>?EHKF0rVFnF(wd2Gwq1Ppxft(1$@ZXi$^p z)Iw+~y>=DyND~`lm8Gf6+_$hY?;R@W@hyA6$FZLrcdlV5$$c=5R~-4l+18YNRi=8W zCM=OGL*urxN_p~axdEg+_OdH{h8e+7^?g0VO69>=;!XU9n*wx`@pd*c+Yn8bfngWT z!pjx*LteOiA*TtgWEl`J;ISNH3-NSD;mCo#!zVuU(axJ4 zU^kqn73u}v@;rHT2ANkmSOQSNT@dmlECo`Q&YmR}K7`Qg3s zB$0x&p`9eKJ`wW~g$9GYtH2JfM|}R-pZv*i=F-jKNB_xx%(UBfl#OMJ@bf2zPERSw zq*gzzpnCW;1fP-T=yil`3O#*W4_W^DRX%#zo2(-U&ct;iWQt?sTjL=(jMvx{pvXLL z;&7T#oB`O3jE?rr!g{6~UwV{!5;02uO%Y6^bg*#&Ohc<&gb8aZkHw`9)B`)B?hhHz zH|2wT>E9bLw0v|9_U9B+y>&jW^G~w3ryYV#Zyj*|*bLqx>Ci2_(bZ%5oWhi^b>xop zjw4Rq#Z5Y!;GTfo=>___p_CCe8jpA4ivK;2>tpI`Ud&&In?5_NP@D6vLyW`6Wbos&249+gBg8a%S}n@x zVXR=7&=Ob^bx#mMy(89XvrSEa#OPW&kx1G_XApXs)YF{cI>El@L*HgB#;<+?OB-8; zBrm~FWCM5v4SLm$tOiMj1BNS{UMKh5Bk7T!jt~ni4caZy`Gm#jO&t87SG1Y!16D1S zB~uJQX*byZ*TrMh@3pNMpLCSyEY-l?v zihqwJa4U;6JLX6J@aNK+bRt|HPDIvr8?TZ8f2G$8B_o0(G#g)~P1qKl z3Nx+v9fvsG`>a)V+Nd zzkm7y?^KaH;awN!S@W53fbX3TMxExvxJEbr)cIx7V{c*nQxIS}lA&-IU4MbwACEV4 zaqqV;b}@n$>|?`Hm+(QzlZh*>JgG**Z0&uK`#sdvmr+{=r(4PhUHZogW zT}M$*!{=@JYUGo8;@lR#4%75wGdS`0w1iwwd+Dsr;!Uv@Opm+K|E=PV;eGG4^rwFB z6o+x%mXm!Oif5dbH1zL{&mLA!j2d9YrL2_g(MxA@N;v6kFU{dTc?@Z4fbh%ro^K=8Qmh_VxZ7RDFaIjiWHh)8 z>*414>e3SXAuh$%xq&@kS+C0tJaI|4Jcx}aDtPFgiaZIA_HLwf+@|ipbmUbccu#(L zPN>J4S4%$si+NT(NL*{jQ)$=J7hC%2q0a&MYk!4Q;w^TMC; zy8Y1YD~a^o0e@#89PlijYp<|+#tNbw&g) zPJiMnFs9Wvn4~h}m!mMPDD?8ugCerHOTmbj49Xz#J?Zrku772yi1S^){DH$wNFZ4U zW0+T=?_T6w-%Ff~aD(s=%h8n_Yk7^eU%PhJMOB%Gv-0zAWByZ2R3&}^mf7_o8U(%;jRpB5Hg;j4nuoiP5%Z{u|0yaZ*Q z(U>)QsHU&5E~GP8F()(;SJ%HcJ`D?6VbJ2@6I?rb*7iIm{sENHrx-ba;CUr 
zTK-dJx-&1GhLlGO2xNnIqni?aQKt{#Y2_l#jS%2EzMjTAd2SXrn87~|^W8N2 zUr$4}2(Q0oGO_1kfd{Wg0~-ooegp&R3G z{O-_q^6%j(duSDIRP2*HPvNF-W1NZCGTZa&v-@|yY3#T628zbJa3GPyDZY*6ko17) zGpy4Xg`>yowDjkg5@G)K?(Y(JJ&n>4U5$>hVM*RvzB`R$`un?KJ=|ya?|JHe9cCOZ zKfCMU{oAMdP?s{{+3S8hM?dT062(}?(e?D}Yx+B;Ltnpz?SCCo|BbL6ru+Ln#xaiD z-+gwt{_J5J@r+?R%$?%&@c!&@KF4^+-`&5*@!8`|6sIpOU*otvjyT5tJ^mPX^jr#q zSTnxj?I8zitp^2T`|NvKNu!rR zrvcdM){pr98Am#d?_b$c$}8ZE(q}z8e78R1lTEwctFX*6a_{R|ozXxn8?4GesB)cV z%g8(?BzXAwm`TF8BmU20KjUv-A^r=buNA zIx??aBoFkf{}rxZj>M&nT;g)pWtI+Qql^hTa&Mke-g?N-P}!1AMhv*4o6v4FkX4LM zZ~H2SRJg`Af70{<`ytL=IM30p9QwmjED5orU@yoeY{!ot9gZL7VpV8bbNL$hU4@aP zz^47^2};V}s>j^Y+03!8%p`cn#m;RD!?8nqhr5p*85Y03FuZogOAe77(`bB`YF#?d z#CI=9os3wiIQx@!0sBpP%4KsErG9%EmmA0{y=7w`s)x=49crQ?^9)$e-v=1 zTRc_*oj9aT*htYB-RQk&QqdTPfZ(k-;!SjsG(|~wl^A4POR5`eG zM8n^_={kS?_PzV}?{N~F`StLMH|1UT|NDM+8hrNmjcJsmH5nWQ7LCmVQrCTt;Kr|S z2Ge;rhX4D1?`3kwv^K~=hwIO`PmlZ*dXq97d9CM%0i(cCV(Z~X0e5)Kzzxdy&FFqB zoISi|vWu?nxA8e9qlboMMqxB@--u^pZ2&ZLvH8ggwJmjg8<-FF>M;!ipt5ZK$^j<;y3m{Y&0T=udl zoY3P4xPR;eotFu3!q*j8zAy|vKugy9IchS&(c%=cD4!DMqOyUlqqk@w72-@fdW{a#>wkq3{JktYvQ3_q zUjs&&6hA(_yz`L+J4}1>80UEdmMV8Wah;F-nV6;%97tNd1lGntr7~F?h$Pj}NL7vz zWr2U|pHBBi$Vm=22E46rX)ZYwzm>F>G*#~&=})+*EStANc{`28D|zGyh4V5ZxgGAU zFyAqJ9jeiXul=(>gQUv^bTt3uF?8W&8Ft5OMzaJR`x8Ev^Ktwh^_@LOX&SMA-ihA{ z??CHQnnc-9SXIWi=VN{B(HHrhq|oy2cjc9rBvh&C-#06YVLh)BOYgW&U&qz`*F8Is zp1uo=GY{2>ly2;Ks?h0gXf*#Jto-%sbR~!|ztwBv)b;R<&%#jQra29-5IZCi+Wj|! 
z-x$_mh?IN*Xz3KbAZ$P0{FqD&_SxTyp88ss zk9hghXiVJoX@$dhg%0wNr5wi4qZ15@{u&gH{F`?zBykhiv3B{%X@-OqIL4rmM6-p7gBKiY=eQzVIIpP$%A4lL@&PwWKJ z%TgS0)!j%&)|Jfo=H;Uv#9}&yTwmHu_h9A}q3TF%;qVjlp7cUQ!EO8vf20Z59`*4K zf0GXRsPyg>IhjwO`83ZyJ$5ulpt2O*e9TX1hc6le@?T!TM~SqK^iG7w+`Dqay&<>g zY>6{O@bF$SU6%o$U1zlbP*>S16B^Y%#a~9fP<*HS#5JYALMU3pAO`0g`@V`9gyQrkyrk19AM}@pnUSE}vL|94b3`_B6NbQy zTlDgZ83iCRT?vy+=G&af8L4UyJ9mCsq*jXTL7-r*1@_2g@lA~Ql{L1E)APy|^WgA| zfl>s8z;u$9c$({eofe;C7*Z?l=F!8%$3_tfXa~m>?VxAou6U%7xGrh(21x&wSvN+- z250`6AEP!#_Ov6=p^b`4{wBgmnjJpR6qjsK&@gt5Gx;Z>Oh>^iM+GcekI?j-;jMF= z4gk{vE87BTVS+#J7OvzwvS6Bio66L6{uq0f(+y^b;Kh4Uo>4#rJ5`X=xZ}5R-pd~x z;koHTp`$A95OOp_IlxugQXdJJ=&3s$<;8PJeqEyYzl*Vbkrhej`%0|(wUw576`X;^9+a4;s zJni_Pp%+}rp~b>4c&z)vcK%cer{0Zem|`E%(^+m{(lEbc$nwvusIY!+3@>kf6{^X{ zx)myv(Q)=TArnva!EH1*K92DUl(?gE zaWHG*QU>)EJrx?2Bb-0tY;;GTR7mI?w-eEaA&w~0M|u{yX?MO)@>a=#qap-GIR1nQ z9Svsu-orb50||-!-Di)})7uDN5zJ#TLLFD4$afSbNLrqKgi2t9Glt*6-9EzF7SeH& z8&*V68$}Q`Q`&BgJkPGxX*UzPS|S7VizQi&<<9Bo9~W+ z2$6@JyhD6l_z8f#(ngbyvDHYRzq^_wG|GE%Ob}Li6m!cnxd{y4#f-ocwPY}9Robo3 z8sv5w#4R5}lR8cMlkdDHKOI@)hG0#4 zAs{DMcfuD2a0oAMef0J{hHaIh?U(6VkCd+50|7eaLt|LXxhtX7@#L}P)%Tp#gngl% z<|y(qVWHfUp*XcEvk{Eu&$G~|eRk)leMS7Sh2*%(kkMhhn7PIN#w+7lx}*|A=3Eh8uh zKcpdg)-d?7r7f41&YaHFg`jc>Hlpf?o@ag}nitVF)TC8#5eUPg3Ghqe)2NfThFW>( zt{Xw(xCtW_%DM~&Apa+cicqQbn72Q>fErQ)DlJpEDZPOJy}Qo2g^zV&^H->3a8e<= zg&}wK>NRe#M$izj;E?D|aZNpYzT{L1$29HD-jXYSf?|XR>Cc(RJ}Knz0Jy*lE`J;E z$!JHA{Pc7>Hu<0vK5s1_=H1r{t*3Er<6%!HST_8$$kJEIj66C4NuhltSnl|u6iHWm zQ)`N-gBA#Ugs-oJIK70c0#+$LO^ zNVG{gcO-;$ifo{JT{Kw5TIJ_6nz?bhM}B+H$VwGx$<3q;s?5ga5%yE>!+8kn^iGP5 ztR)s6X%qO;b-0c135@3yCa{U-PZ9~(7)Osc_N$b7$P{1aUxyu+lYZU5$L!xd-bVk> z)cDoI#yH3C9ao22_mpYi>2!D4_22!AnA#Q_iIUr6=C4bDk5P1AGYT7+3J3`k9F|-q zP;#Zt@Vlka(@uKA5<(qTrzhr!C(;z$ElzzmJV~GIpzCx~`ls(>zk-R$v<`kFOal8) zWww2Vx(%$W{C9AlAvNBkFkV$v{Q8r8_BR{k80#%m-6ads7WlNDgjZF@qKR%7UL@k? 
zQ3;d3mQEQe2IxleVap#F{Wat(e#(@DgDXFUKc=YuQ4Rc5MuNjqlh*oaPdz&R;Pb=r zQ*rkIK){o@;?tWnlLW70ml)F1Xc}P%M*UmRI9k1ny5a7D`3w>%YAy(poAt;>h%2uZ zCv+0o;5y2-K9qCA(*6+IbDaI25o7`ZEr~5&hXxRdnIX!=3T6;#AlE1_umL2b7y5+s zDUpKsWuBYJRDu(M!QU`^?XYNAr+oDZeJv5fZ*w4>&_g@nt<}pMnduhc@a=&AA8BtE?AdkQ=j}TW-S_s~=x%g38UsNBBt?)SC6cmCiMC=%v?a%q zY^!WXmBGnFD&_bkk4aT3@oTD*%0mX*Rk59lV=EqFTckCVNRj4Dngal0AOHer^gQ3O z@09QRt+Vg__XR+gV69g^X=G?L$oyxJZFLQcqfmIHd5&D z;@6~mur&7y&FK5y`>*Ge7s4aP^+4}wEO5pWu)k{lSiaweDERePnW)!ao%Ga+W9=cn z4>3IruIS3$VV0rZP7Q9(%-D|3Xj$LLR1At~rXFVbKAIobo5paGzmFW^NMEti;Rp`B zUPY}@WcOkWHLZq~%ip}RCn5lL8>%Y%U^Y^w<@N|+lL2e@>csP|R|@0kU9W-k#8XPj zwKO)Cth*6}%J5nZE>jJuM+*d>{=lQeVpk^c3Fs1FQ}6?nkq2Tfa`Z#-@~2 zz8m4WyW&%Wtq%beygIyIYTzbur7^sU+|AUJzT;iyHf6W^6gt9`S^XS)FT6HIRb$FP zyVkH!hoXFZ+Z7lip#iCIE7*0f_;;$?NY3HWe8UATZKjJH46NN{g2>+O{>e4y`c9c^1-{X1L_&}1Jh<0{^J71Gm--(Ea% zJ;6R)9!Rq<&evUd8GWme9=1RE1L3-&hUJV2ye{Xjm$>;xf4{s5u)NVt;PrmyE7Z$yRpxA3Yw!dE}Zn`s7I zwETqiK-os;UxvqI%8`d;u6|hs@6VH5L-`JF`{PI7`*XjK(r`4aKU<%=#Hq^fA$7Ol zcMcCq174w7>TX>OtcYD&8lt;=xGZ0&NvT&~nVpaf&lHION_bH$bT%)UBDX!4sd~GSyec(;+Cu8NB<5H11#RmE5MPT zkA{^auwaEJlR2?O4rS-o+h!=Kn59y8xWXK=7~!)~;#$~CU7zX-U8GUJhKSBD-qS{g z59!1?VZ0l!)6#d_qvs2_q^G>(t?bSwh?6j4w(*jFvx{gUvO7})4E4K|GfMAH7t4?i z!HGVxTeDsvB9vnqjH#P=`sg2@4fHL6c{jw196`Da24M{EsWWxeu%RMS)b9h#OuM8T zLxO~KPv|0my{nhC7t)_fM@f9}>EZuw_g;@wys8&~)=PT&^XO5JYgL{&i$BI`MhkrB z%P&xuQ8e^J@>Cla&FtGA!s2QSoydOmBQ%5;Y?4vMK|B&9t^Dd535ph`E4URu=rjWj z3@d}(GOdMw3YtysV+=NMUuPe#PQ>mE4cuZ1+G(3*_w((&k3H5Neuz!g9A)o}lMV|n zSwmoB%z9WaXEK~I8$*wERfDrm^6fl(Do&qzs2#?@I<#=GT{w5HEpW2a?Nz=X68aS% z2I#&&RZ<0716S#&s~Db{2%{uQ??A~=bsfU(z@*`iyl+%!VMeZ`563}VDv<_GC^04; zxs0>HW0jM~X_tb45iuN7=BMFPJbz5zaT{do*jc`u69& z_9*XH!fhwp6_zSkwjr>oU}l$6NI@tJEOh8ZJXnuD0FAcu?sL+Za|A~64DV?dN(2uL zE;mhCd|d+%`C@fRyGO%%H)?-)4;#H#J^}CHV!T%m>GukK@b73DKJUNmm|0(G1ohah z!f3+XKr54Eb+;|KdKtlWD5KZc`#Y683cf#AKkYD;dKJ_2kKRY0d;b0Ro@RgeJH7X` z{VouKM1 zr|GuM3)EHR>^=7~3>l3ZeedBENt&WW4DayvW=6Va!FwhVwIB;w>A`rfDi4?}X_T^W&a&WE}%K*vx2G 
z)zt=-ieBCc^q*_wu{&HFTXZnJAk1%Ouhhw7tt2-_1Hhh`?_-LET0LC$l@C)0THi}C!gtbz>^ReHy57cO`V;wpND`G6k% zL2TrkUH&A}B`1MoLgL~`1HNAj_RGJ$owrn;_*eT4g21jHsW2JvvOHk9qF3cpdf1y} z;LaP$Vj7IV;&anjk0C^C1KVZMdS-(!%a#1z)D9nX1B>*BDHTsHaTKx7t0kDU=<2g(*s;b7S(> z7M}2FKDJA|?co*FxEIgxPH>cPQWW0%+ADeFmCBz=`8d>%r`5ln3m{2+Eq|kRdnz ztD_!0ve+JZ_-s4K%*!NS8c*Fiy-4fK!`iAF^9#30oZlK6wxe`1y(f;IefU&6a(FSb zHz)Ym-f3nx-n@P-IIH7|Xs>&RCEmser#~B)C3L}aC^XC3HH*N!fVKw=Y4-V9Mgm=Mar*YOJo>)pmon>IeR!3+|l>`Z2DgLJPY3<26#Q4 z45>0k~oL1I0_;fm(b=WsyiOQ1>XOsphg35Q;=I_^oVZ9DQ z+pFN{TW`QH35E5eaJ4#(>+qy?2GM#*aXo${sH3r4p6p=B>5#zhEoH)9H=jqZPS0NU zZk&KbzF(uy2@8^W>b1@60|nK44>if&v*pyUI(%Y>umH=SJz6}7SHY>xYI~^z!!m|q z9?WCiE27_%eh3?dvKMb<2Vf%!-X1a%`3{eJ@G8=d8XkCuw7Wqz@H<@cOuu@Y`QD$s z?=P>Xf46&2A5!>-J~KLvYd+KLrF(Y@n5N@0T3*jP8fLma;HC`z>~oBbCf|L2C-}*~ zuk2o?_x^0U{rN}ZIvnr)(%|ctc}@RrFnWD5$dJx^&qKx<7`XSVG#X7QwYin5`z!zey`9MbDb2*S3Mf@(|45FamPve#Z2_1tF>D?CY3W^iQS^FI2k z6s17<+;K{J%j4?l^D^>hmqtUHirS-7wLvd$7>p52yNUr-j6yi%m!2vXxm46bsNQyl z^2}eCvUb|5%vp}#5*nYV793$c!#ywHhVXW%-5AzF}I*{{s9ZcJ++|>hwLc%z>_Z=v-E(5CLrKl zI(Ay-op{MEj2@oDRZymOFY={a^9O(7*j}|xT@S0}-DlN63@2 zg9gVs^;cFp12KcLwaDQr^DOg_Mr(M4eV(76%aUf7%2?)>4o>PN53H`P2JXV4QpV=z z<^zuf=d5YJ*DhXvvn{XRYjU zWe5&~3K-KmA(5x)!^mOlC5<%@PBDvd`k@nfk0rYm@9nfAO*0A12!Up)2x=Qj{l5Z#sXw)%| zpR%mv7KVf!kniR8_k#2Gzdb%B4|zMB#3hVUWGouv05cBLoYJ(ynlc+f;k4|B=U8HO zkfX6+L4M-Yuf00;vU-d-3f>e|bm6~SuCyHDlgNw%IP+>f%0M}pNk-Coq3j3CDf(3N z?8Oy*$Une0&u+eCF=B|c++0K0X!UwjkAL7@5ai+VR$%MB0{JTfy;2GEl`wwEoA~`n zs!n@2@6~HBYycKeBW3QEVN3OL2rQ62_8sqUgZ+-FMg#5Omc2)?U}D{d@;aYN1FQXR z*B%7H@n&LqAU{iZAVqN?Leq zC^uz~N|J{>kF}=z)*LTf0o4_f9P0y-v^#-rx^5C zmhQCAeeSdE#+%EKBoqQ}wWW8FDm5Y%;d?-3oUHTJ+VSa~_RLRzwEg5Kf4Utwh-a9p zQaT=P;RFmIx}t+_igz+KL|HIhc#BLqAbcZ(S40ZfV@ zX&IQaY*IaY<`n1Y&$lH`#u0`m7p<<`ZQ~rHxxkW`jg3|4v7IF(QP7Al{$d$S)|k)a zK86yp!@(GD-oDgcc>RTTYwK!TxwYC(9C)Z5JA90SZ5><2Hussoz5T?Ie+d=W3?jS{ zB)-z4`UFYLH@uqzkadL>1mthH`OKAD9xHArY6ve{3ez_5i`e)1%~~)pdtVd|A<3#H zR5%5d;W?p03U2={msL#uK??-!lt&n7`nPcLg`Yf`Hi1W{2%5q$w12CEXqaj&nMT9b 
zHk8)Bhc}UL(|oadYJv~83DxCH#K9q?DZU8}%oRdf&;BfJ2O7sHFiK=a%Rl*_^35Zz z1&eghx5iMN9=I8pB{55K@b06vca}@z{v0z1i;IWhPd9>Va8xXD> z#2C}qz=*Q0Qy3y=&pynI#$m#s3AE5ja^d{h@bA5K`?z?Oyi?|xFa3ynBEQBdabV6D^3-BreN{k}v*fv%y$;+Cy%jg~fkW+71MRgVIAVp=y zupm8}YaF7kvB75*A#yOCtq3_}+PwiSRq?)i4HdGig!gw7fU6CXO6Tq~!QAYB=I|f# zfHh+i?e-Gh8Dc;!UMDwLZ}CyOvB5g5hPTeb31n_{CuHQRuv%}~gUwez%a7pX9>vUX zXrOSMl6gTg10F5AK^r*ZHEh?{Ni(E|i}FC6#a6mNl6(iX1iIDsWL z$r?dWZ&FN}!r0umIGt&mHXCzF5c+MSTo4)0o`$3J^td}LMv+Ng`{f3Moo~YOWCdG7 z__R#$=7o^rDRHzsY2ZLuc-XOk3owF7Yq)n#|57m_+z%@1D3RWQ1f0}Y!cY?2m%Qe; zPoIkdo9B#Mb7*x>t0?64)D2vvm4^-p z!#HUvt<8(DN&ho!fhqeFM;F_|JRWz-xsEsZ+RfXHOPtz*Gf1zV3Wo$*U|le#M!e2P zj~s26uUu;{zw%m}om$!umL9`3 z6ma|1D_7rYC!RRnK5+U2kwuIgKBJy!Aen`!Cjg;9ZqYv+(_9N*W03S3Ai6j| zyx`I*((pV9FQRhsfjME(rz0Pr0dP0gSK5R}+&hMf41bkDu805Dm zxcfBF+;Fd;lP8}mEU#p(6QL*te0V86vH&G}Ekh+?+K-!K7JVs)kwBO2Y3R#XW%r&! zm{!#(!w3HZbN9n(;HoU#plw|GBmam^^%d(#e)!HX>nZ^8h3$4P$FUwSU+!I2#C1CgcB#Bt7XS$X7` znz9{P7?%z&%(e3mpMj?r0Sd2>c9L=6B+HrR*%xtZ>6Rl<=#%2focda4e&Su@hkcVU zXsk}JoWSMMvs97!OVAdTp^eCB$lfJ@e)YIAKp+Hps(X^jbP_qqedd>EavLacFObOx z!N9|_d7Wa)HRP|CA*jFfg8Msn8?1xhYrLV1A^mQ=`FJrlKjNKz@Z%ekFxq|BlED;O zbgL*Ft809C4oRZ&V2%zy&w1x*1Sg-v7z2E6BTqeM2&EGyeeQNs1*w?QQ0Ah@isxY; zO?rJQVZrNvg8?wR0K>q!)dL^6SDh=Uo(m01$g3q4j=T8u{2izX?PFvt z{xOFa39NU$?ACLSmc$Q0i(N1q2-Ig$iX`y4kn`@T2zT$^X-8ORI6>!> z?lDq=1Oj-6b&PcG)#a7;{qKCUU3~3jK8&OCJ#a!U-4Sz;*CiLs43V7JmJ7 znnBH3Jo}b3vOoDX&2^3~R^jkujWKD$@_-dQy{4(ZXZXYynj z2RtEPWItIH40#=D@n=_VSO8b@gTFLeXQ`&%ViQhsd|({qu$uLjXS@qTKJ-^CkFfwg zyun{JXccX0gJW-F2WOwXM&(cXsId6wV4eaQT`CL zGY7)%@Ku@|Om(=pZhvBYG4B)b>dgGXwgx{g%a_*IGAMBtUeSo~XlRcreB}JO_S}mv zSD(Rov@?T43q?zwoy2gsjpzO@{J$gb0&^B)L#-k!a2=SpO7XaM`EL7{zw`O_{pY^Z ze*Mq?a{Dj;tN)_?-M{&F+tjTn-dtf_73ih)vDlIK~!M6I04V6=-1cvrTQ! 
zv@6Qr6pjpMBtXF2R!`>eZ{(qS43_w))*Ag!L&J6P;#vxovZH7u?BoGA)6_FKP>_9F zd3YB%mSW-kI9v;yp4z00m3VefaX$Ydn^847XV{$TZw;Askj<{vTVugI_K9=n&$mBz zc)D#&K!>IEcJHm5ZE{tG7WgEBJA;S$Y);!$%EDC8#24C5I#C5*jrWg^R&y(N%+Uvk zI{6Dq5A+*noxF;vb+ZGBQ@^B(xvgvZn0*LYAKC?`oMW3%UY0afTZfE}OJw!g8idCZ zkOmbyGYQ`)!zYo|?th5< z)*vwxIpigkXL;8AIyjd3CPVrddF8<*25$1CvPCN)06OLQAa4oQ@`pw=Nv>q$iJX*E z=`26|NMupQcbkZK@3xfz#^z3b(9m&nc<~UkSw}N|CVqfmVdea*P-g<~F*CQR0nz&d zd_D^;&Ody*ont0q8o9HHuvey7j$`YIqsQANjOJt`y_dKQXykpD9+WMvF{{wKri zrK-%nOPrcvNLfi@m^2%`iZCH(I#ddV4tvRbk~n+UUWf#)e_?ir7O&x6(toR`YJ|Nk z50<(Y9&8gSvrvB-3kGucx*M!FR~%fmaNXUG#|F>J0SAl`s)oqa%b;Fvzo(>;!ll`0 zx`HDXdso4ez6#xeCcy8hjQ9D0GWN2+{fYEFZSn#$D2d1()Lp!K+P4GR3rUCBdj*;N z9}O9vN6XkN?SX{i=fA+H`vdV++XsVq6LPL-HvNS%Bytey6x)mIPoW?M-d|g8U;EM@ zw%a#uro+w89n2sz$(q*heEnei)X)7~`ZSBm6WAw0to zYI;BJV$ti>bXu|cOQqi)n<3>kp2+d`(GNY{mM_2DUV85PFg!~u95etHgCPpMMP+ES z=w4cQH1&*uH1LYTfYheDb@`Qc@6rqHz3=;AJ9OkYgWY^ggRQOF?Cun1a76*%V8A~H zp4%u+ufFhortoibj=XDyRl<555xSEz?f9Vs?T0&;+NHPNY`2&1$8+O%RhAh@HL#Cx zM4pEs*iTd-^lBZTAI;BAGuW;~iHL%S_fv&zmVTy}c?lTetrD@cvPAuAJtt8K7gx*B z_)s~yhw-Dr<&u&)Jip9Cw+-{trp{o(2;|;c+ zMLWNL5#s@RZj#w@)){0p{G>S~L&*>>dGK~whs!rGYHr_cKfH9cedF05w4eRR2iqr~ zdAc2ACP5|1N+?6EyGodfZax-t03`^-RX!)6liu|?W;Rq#Jf2VkB9}Z4E?I9*<#>y1 zP?49$zWZ~XJWK?y>b+7qlb2Mg2se&P+6ch`)`*h;YCx60M-DK!EVf0KaW3F--G*Tw?G3Lzm^OGgF== zPHAg+8@LX_wzXxr@ld67oF8XJX07}TjwRPuJ3};a^biX3YFne>G$5yESmsLGntzLP z_BGx&Hgq1Sz>T%bKf2Za*8lwX+s}OJnf9B1@z1v-vxnR7fA;ejwJLo!1I7`}p|Uec zK;$&El@1$t$LBCguyOFNf{tEe!YPTn6dD@VR^?IJ7%o|@Q#tlP&o2+#UM7Nn@6%sb znL$~?7?KuxK3zjT&N3(0!f!eQ2`|}}Wl&a$ztQdmuVfd5|r$disiB6E&WYWnW$`}oZ@EkIu@~Su& zZl*EJpV#Pn4=?8o$kBE>M0_7d*|N_#Q+_{93b3EHuT(0FYo9?-B zwrd&3&>Rj8za!yn#koVw*wAk#;FC8uUujDh*&Bh9n$6Lvonx{6pY|*ixbGQNwio~T zqXPk5#MIVBUR9!}W83b8!FRI44tLkmGf&Gad?US~gZ;!SJRt9+Zz4lUlCdlbvvd;v zaHrAa#r%?6gv7aErA>!;gA7fKe+2RW<4U#w3Z#@x3+}-lU`jVA%NyxTN_$z)31%T? 
zFto>^gEOr&^i2)nSj`aGq@g-|X+5QDG}bgi6~Dd{Q$O)efmG5v*RWG@6pEHellkIK96FJ<|a>f3j2(>nUHp1YX?yA!tG zjUBCc5ASs_-moyJ%;UfG+N~V(>Y%#3%sNOqjUG%>EXC_E#|KPGAdL#Xf>&kwyjyl6 z45+Zqs0j{DecH<>PS#lRD!diAP7^EK;+>6Gyl9X*N!tS8U&_sG_o3I@(Ei+c-K6G@ z6>K2C@BwrHjSBF#@9H$HIu?wc#C*atPTG!MDNh9XNjuY7Q5DvAyY`IRN-@+nF9B7x z5AHsSfAH&N(6-h)WetTmNO}6iR5oFu4DaYMXCUVzhto(7PhHvMV=Z6!z0bDgTi4sq z{^BpT#bYNBwu)ZjX@k1#BR*9Q(}Zrgqt5af!dhigICG1~ z+NVGB>+Q&iQ|(`W_QZSJBac2&%Pas)&EO|a9phQS`Sk6sjJST)l-?L)WQt`FlM^VA%r3bkNEqLH@nXAp@iOvptNp?!KLQ@m zE+y9=`NQi7r)-=!w%C>~vy6$d#t_;nkO$xuy`3v7%L9@lzyJV107*naR2XKgPuB>6 za-;n~*~>VYu=FE&MX>U}H_L=>>4DHm&#LtI62~NlP8tU1&m3!yoqdRzBG%F3QJ!Y7 z+1z;qL+yUMerKh<{?@hj#>Fe`7D{K@&;CV?q-*E`tt?t)GVq~k?+3>B%g2%z8d2uh zyL>m!y4c?R-4yA|DG|ESc+Bkl%um|9^b2cm#_6jIw*S?bY_ ztb>5RSi*vVV0xG7Eg<_HRPSM6A3u7eJp zaygLvZ=d!=DX%yme1Hy--9ujKQ+>tAy6lM^&&a9JUE0zjJbP7qrSE@xTz|JkQcSW` zKHEDZ2dFF8ys8#F401yUumxx@Be`6!=fFZucQZJdGdw3XdK&iieMBP_vtgyUQfW8g+-NSV@(&DF#|asH9^ z#i?&_D8PEqkUxc8aG@bQ9V6EPo-3FK=e$C9Uj`%$GUM)#DinGmEOQ9K^p$z^?*o7X zFCn8z-u5i<`?@og1BH)*D1!=!cOscloOWo5;K7~w;P*_L#T7wa%=W4**EkA@PKpmIeIPoQ_=r$DgHq7<{lHX+D;$S6%dll|> zCkLz|e8P;RfUYF>xa7&7O6#MSZ@n`HF#{=2Sjqb??P@ZwUax+y(9yruUPBc|BX*yy z!(RU3d7#s7qLGJ68_ai6u4qcJATDti?(Ri$t0E8Wm1S~Op!ALi7Zo>YSM8%0Px>mL znU+V9`_|XK(yrgW(LVF5zuHcodKf;%V{BQpjV0DqG^PsG4g}9k&$VO6rtv0U!&|u? 
z&$6Yf+BtRXInx_>7$-Rq@6FoqWc zgdzSDmE6UFK4dLc@%}&m{B!L(UibGs^?qh0jv{3B*x8>2EKmka7sRp`Xkzg|d;bSN z)Lwu2*>?BlWp$Od34It_P@ox<7>G_SPPXG8exjXX$aAO3STtXH z_^I~ht+n>r8*jDCSFdvfwx{6O2ato%*d=Asv}jdjrJn&`y7!;-+)oGJ4Hvg=h!kJg zwo-+9B9k*i%_~W{R4h>nXbX+!o-Q#%d6I_tqqd`Q$FJ%Q-BB|1OsX`>Q-xcqktBM( z*XcK>PM&DjU%d#voE*;*v!~wAegyKo`s!=#t;?4Lzgw^ zbYI2on=9>y%dfZZjy;P4I*D^(rv0_Q_E+0$ufE=X_h0@#eGcOtMOMXBny9GiDqiED zkLSPreEVky{&D-Qzw#fnhfbWNKk{KMkPv^{}a?RJrYTHADUj;~HM9FpLwLDW9f+7O_ z=P+31RuzBSpwD8Z9#v8rTK?Pyp7`6&_EY7xEWU)I)A>ZFfJ^g8Mp?xV9ahKF_x`g8 z(+Nl!lf7Z^n1+BnIM4WglJ>vA?Cc4a73vIeBd*GpZ5kTk^bn5~W?(e107P9iFzs8S z!oNI}tc@Zo@1p2haHTOJJ&{e@6WNr&@>0A3S$Yb6H5i0p8@sPhC2k7%Q_yb;d1PB0 zKYAGBAGyF%#1DV$>2~tm(Y86e(^i>Ta_Jrq$sa6$rJQt`+{We#1_U!udO7UN@jMTG z$wCRX4#AIi>&jc+H4cok4SWCbZh}2$w21*Do%dZm@4nz4`x;zG5gdkl3})pUv{{2* z+pElODU0Co`xwS6ZHXD`jW|cfnK`#o=`WTfzFcCbu{^~XlsfQUJWlQ4*M9Z5;xEeF zRiXqXEPnMrSvqZe>3RD-3HVd6hPUt|?F?w!zZ{N{_!*;Skk1c4d4^n53h0Z%p`4WKXxh9$)m?u1%XixvT|bdMu=cvd*sxaw!nCGoAG;FPbF9HGwMgS zt?&^1RVV9B8P?vi?CE(hV@J&AT>^#NiLEDrjaQLivCkvuNDG-X1SOLLjr$@N5gs>gv$0495yx3lT{R+n-;~B$nSXrHJSMl6#ZmeY^qFxZ`Bx<9rly~YMPZxJ1rx@Uv zqsaPqXNop9IIo-vXK4lfFkf)T`xdVu17+0`zOgO<^kfR#Fc&mi<(n^4;GG!>6y#0d ztt=r|Sh8V(8iP9sP=%%KZ@LoTv=80{6)4MBxlmz>MF79JOlFILe-m%=1SjZ>>(OHh zJxV)u)eAWb-CW+)H;fiifFNE^(9eG2r`qSg`hC`r!*?iP;yDLDZlIWW3Xld_h?0!< z4P})1p&8z9aTnhmpGh;Y3|ygXGC)IX5cs@0gQ5cDk{}0LYMP@hJ>Js6K}Bs7rP<6v zbm_T zw4Q8~A?j({_eP(MXATg-2s*|$GRV6hRThFuy*Kpj6F>oFTNh5>G%HNix|Z0#{rIPN`{ob zjwgH_gUY3j?$=mh_I!#p^v>?l@M_KB8|!mlK1OMCW-j{$P~_QW+rIMUZ?#W+?4#|! 
z{jdL<_TT-DztP^fbP--2W1q!Lo5q;diRIEid&RfD_|^8_(+{^_`PDz$w(+p4%xBg@ zrs$VbR1Z+s`Y>Y%pH#RrFDyU2v&6m}#x{b94&T~dl1imY{%|Rq#+&V89ZI3VxFe12 zY~S#1oXgETctHoC?clznG2)$hl)folTHA)e_nxIrA*g)~=M2xCe~3<7J2R{!!M-P4 z|HY^Q2^DkO!p-NIttV~LkS>P46|+9z=TAP72Ez8|$7D@kXZy1*93D8#ajj=hx&O*9 z`eLlyy@_#mRU#tG4rXRqPp|d!vTS?H3`<3^T~alko2G`Zm;@ElP#$5*T5h}Wed+&r zsEahD7+$<3VC4PBJPt{{+2uyBK&;9DvgEF_+V@sCSqpi0pBe2<96T=BTiJB>a60l> z{;aD5=g$9W#;3!FkEBiIla90XNhNzOcqQ~RlYMRgW3;r$<(+WT_1_S)dz0t(U*T*T z&URWy$4m+v(wS&v*db12I>U0KSqxSC;X7TuyfusfXX(=BIzf={HG6uf9bpziV_r0^ zpQtdLQN&g|0S(TdJ%_{SHvJxh85tlwYV4CQ9W(7e^%Dp2ESf0tg0(;sY!p8`g#NBl ze5z&@L&Q)~l5y=4#sbL0C`Txrnxz%4v#ngfwDBLfXc#L&nh;W{35_@ehABEaC(am= zbYLumhq8cEd{pqN5q$>Vp$PBdpR!{;?G;?GOkm+AQ2@!Ixh&=f=T&59(Ld|si15VmE@0L(`{HyR&5qI;W zLOlI(h_`&(Wqi!3Z`QEFd)q8&3Oph$N99aGDzZ*1-@JOY{iDDC_u8NMrO&jdpZO@8 zB3VC9!oH?g;aeX50Z+@aK7=1*Db&3s&JBO-63ZaiaLrQb-jQt^;cw&NI{ny_?N9#u zf3f}H=l)5%dHJ<0OY#JToU}n*#3yw}h-6t52uo=P>9p}csd1*Mjgj`Xgn+4bmP-Bb z`R{RN`Hl9Tr#{eT7mmPB8pj|5JnOQ7z#7BA7{^O`^8EYS_{@QJ_03nH;WDY91^->% zH$7FSb+BuwJ$dd}yTFo|>$jHM9hU!XjnB20-niJl`Rw!U&b?*UX0lWQCFlVCNkhf% zsK;MVGDzm?0%i?X$}iig-`9rCG4 z3C0DFuJMTI*7lD?l{VsPAHW~+e<}Pub#{B^Z+ig4`+pa zKl0iNIR7CA%9p=zx|J?2SzcxHidU_8d)}R z>vDVfi=S<0KKRpZ@x)p9on?Q{2)T?2o{dIHO)?zF^B|nQq%(lpD0l9$)M1TT*83i$ z0BtsCpYRRzl6#`O(kRlzrEs>3ZRBbJmG$rk?{4@nueQ8t1FMj0EMi=vG|M~kd3Xx~ zNwaKpg%7e$-M+WOeie)jmor-46pkHP(Rw5wk2wJ>81PJ=^f$pO0sFp}^cUXl5-Y+q zel%3XiT{QGq56YHtZ_OLCi& z5t?$hJ<{i6L<}<%hC@%CVQE{7LIzDULpy_#X+Dc%RDm|uzxttaQaWTwAZ;))HkakI z83z!YTay@l9NU!x6*LerHYeF5Gl{N)SVE3McOcNa`9}e{r0VcboGc<&J@~#GBvaaf zxP0m>N)ixwsZBvMBS}5V*b_VDOmhVUkCr$Dg$6EHT&Ldm(KmO-Rw6Hz-rLG+GR?qO zsMeu(9C`3g*u%o9oMX_+`t7zc&R#C)HMPYOL&ir>+shHDgxMwaYD>D}@w@en@LQN4++SIObnvb?@jiai*3NRh|YK78_IW}T*Ji@r>(N)aX( zIKe#N)!4&_8xA>^kM_~0l)-bz(4)tXv;~}e88b8UWSI&H`v%MSr?%RokDPB``Tk4b zs`CuK;b%GWQ5+UDigc)I69aA<8m3bt7M0!0M6%0hqnn{$p@%Pq4KL|fb`*(C<$c#f zM>iP(KLW&z5Zn+4m^wDhhGICT=sKZkwUswN_P_=B^6KyP=Nw z8^(KX^%gqv$aL>gPJ?o<#^V?gC(b+y5bEwoPdOebtiW+f9aEtlKXoB>y#AvfwEOq2 
zwRz$!I3L==)2fkX|31Lx!h;wRx9_dCZ@h4+z4F?t?a@acVL8=Sd-27W+bSN<8J9LO z@JmOH1=}_r5(eksHiX5y2`*K$@=&@_)eWpF(h;&N2rlx?2l%=;dLKGfAT!zCPX$|Y{74Zrg5eY$=3g_qlFS8n8- za%Wola*(vQZAx1ERp&2Z2unVdmYyP)jmJDk0R&SA8u`v^b;Z3ZJ2~;mB^x(y-YL04 zyuv>?4SZ@@wsNo}yY!@Iz=J40z&B=!^LiIhTxYS)j_gdfhdBEB@rRBhug5VYS5RhV z<80CRyLERZKR?*1a=n3arED$yQ8eK-9WrwqJ$RJ8Ant`ZfRjd+Oy31Z*P|cd_*_)j z_Uc=+?c$Av8^>pKc2mo^0D}9`&HMDeAk$;BV8X z0SU7m<+O47tN*rPdp+suR*24{*!l<>xXo5}H_63#Mv_I{xY;t^aX=d7vIF-@+ zF2hlYK#88DpJ`OtH{GukrvNl_36gvz4Q4T{H74Zk>_rhj_;?Hh)w0A_r-}UQ$!_*} z!!^d_@pWbvJse@%)2;j$zE%F>4517Sj*O4sEFN;KORBuY+bc?sa*y=Nk3}{~yC329 z&FeX7X_^xlm9ulu)n%#5mu@uLH|=A3XCr%bAW+tQ4trR-e=mE_rkK$fgNB|;=j^0L zgfpylR5;~QKW8z;*R}jAhDTXuvN0)5QR3b^RT&;5MKF^}L;;z17*g&sc1bBGJ%^>XCj zePn^ln4AH4=)`f368HNMyi+gBdyszMiO}@K!xavF*l-UZjxF0)=iMyrsQjA*UM84h`dN6pvJFi0ioYc<&vRUXCpn zAxC0BJ#bZeJZ~5fe*9ycK{(bIDOI4BSJpD!X@janst>08;J6>!1E5nMaY%abs(bio zPxuSZe~yIQO|Qb>X0rPnooRC2_w8+u*FzCwAk#|{I*`^=n#AKEAK7xycC)cU z(E&z7Gv5r2M^n7fn_Cs`j>x>O=)LXM}Hhen6K9|_^G}CiWJpH z1Asd9%e<%x?H~%vfBsv4x&7_``?up!cAc^(g{1uk9+Xz%=(t3--AkU5KJr8cM&2D1 zy&SCK@g$#eV+C4CGnZX-CCW0Qa8XWBeWVs>t7DD&FaQ9X~U)%~6zml|)a|(FXQ= zC3=-}3@I~Z=)v(+&bvg8S-45?xsTHD{L2`>@Z=b?(9`faj7jM(V?nu!mJVH{6iCis zM9<>|Zf!Ai^y8%Y;29y|ckU|~SNE6&b7SOtE}U=27n#if-Z+Oj%+EWk0Y5Xdv&@F} z&;9d%(*EV={za6=Y5ME}is;Wgb)kLWEGMnmQ1q!CjE#&K5qBa}C+pWQ(hbNRJ7);8zo$-yfG;s$$<(asA|9}{s zK+a6eF`IOZkI`)@do}9luR0C&n2WIr+6HpLyHtz360S6-pC?@g2IHXaAg|nLELp-2 zq}8i$o|>H?RJu?vH-0-~GBte&d0%kD-4yQLGCBx{KWUum6tPAS3RDCw4Rvsg($*6H zt*?c{PSnK^cVT|!uqwBx{P9g%` zF_xw}#*c9tUSu5EoUxI&))?oWfA!UN?uF;uPab=QC8an+n3Y)G!O*514`4{n@jai} z9h?bWC&TEK0@Bw$-k-&<+NUVcarVu&uXJQ0Bb25-Jkxe=ewHWQxA(#Kysy3d+>0#N z{2{2brvruW{3DOyOgY5R&+N9&1|dnSfu(IrSQh@_sCAq$Pn>(GJ#?t{3VA}~HIl9MtACDd+A3QA;Ju*@&sj4clBVM3(Abb zq52%Gi#$uE5}t;tC;y6NF_v$m1oB-X<<1|8V*32$PmfE2cm3n=V&0Gx_%sMGC8&b6 zt}5SjUIlk3H@AAO*njii$}a;H#+K(b3PnT4jfQTn9`{gfg)BJgVOqtjmQ8d@odP|8%;HrZvp>+=1!c4jOvPEnHC=@Rm(6$n|16tj%`5E3_-VWx zCm8fxq5~~7s2mum=L8DW&+}}0t4tb-R`Y!w}18R@3+^l-fP#^ z*aV8Qf+~UlTxs9>*0&g 
z{-p?T);bD{CtfXbPWBp#KkU!43fh6Tb18uz3cAZI(QOn+J)q7K`g5Sq(j@oPaY+xh z8%8lbInRg4PSx^*#PP>9pbn}oCDOaxJ^lEeO_Ka>3EfXj$i3;vwKt9|a+LirO=4i7D{@p2KvXOqG7%kcA?SFX|%G^pi6 zJkiL`O*UvdJ0(7DM9d)^@R`P%ed;93ynMcL{YH4!nYU^7&n+`saswPEsng!P#hr25~?JAqwJ;m!F zhT1fBJpdox{N|V2T@;`TKlQ0Ld-eit>zGysdS*{(tH=&Kk+$PKPL8tueU5TYy)v$7 zv_T`EZCCH6^Vv(UyYbzSFfL0-;3xY^j6V3^Qx&C;@M}{w_=NT~)0c4iyE7v4ZH{Pl zNtnh9{n_&F;V?##2oG-Z$(!eO-r2~+$>L-L$ zKAWS{mf(|7OC1W?0u^(oguo+X^2QL5e1>BrIF4W@@~`@=9uyrZj-O-bK6Qu$hJ8b^ z1ShJ_1z!MT_~S#p_9w~;!lX^IB(N4KxZ;ww{~5vs4*ogOD=P(%FN8?}X`i|e5M4?a z9Kp#M@kNyV1#oueApA?+PA;6paG-9;zFK-5s+&#?YwgKYH!4Aixb+LmQW_N^t0Hm4q#(_3T@!W4k0OF~-6JRhRG)ac-Q4Ju+@- zL|lFUYbI*pR_N_?Eqgqjl;fnzrLqLb_tYP$H3Q=q$sVf>;l^jH*##r&l<3Tus*NI#N<{ zTNY@gOat;h+`FeL2?r!Qtt+hl_B_5>PN&)EUG^13XqGa0->tRzjI*x8&uFfKE$!+# zv6VP+U0~;q&@#R*G36h0DZ$d)x%gHTq}3Dyy53jcQ?>wjw)0DK`8=~pga_VGfOz$v zIg_7!3al892rR~&cI(RJ_J98E|E>MTU;Ilf4LS!b6bc3=BoMfnbEp@%q_gub|2U|- ziP~jF%S@#@2$?|zM8nxMZuIv3v8R8s9l}WYoqzHV+u9wDgH+L_{H)hcUHHS*c_8Eq z{(EKjHne)#upLU$YLt2B_VxCSul!+q0>Sn0`3r3pPbnD0JE&qa?fP=|1+Alu9%K2_ zJ<4CbcbiEF_L88eP2goabN;b*WBs*u7zIpWa|fl#X?d3jeDm91=YSB`JG+#{sd|tA zKbJkIkY$-31Gxr*8yhcQzMM73CR5?nXmFsub?a8{8VB<%8|oY89pF_AZ0FUTQ&>^n_)v3VpjaZOXxT zE|E$1*O@j(*Ii=+-*HZIQT>~MmI`Ktsm$SK?QDDoFZ*iRpT-fkZRC)xQjb3siN4xi zdg)oxVB)O07XqY#ClB{}_V(?bV&N!5#J4LqDTDXDz9}EO(buIsM?;HSex(>5(}>6{ z9CfmsY%Zl-^Y;8^u#C`Spo?La>Cel-Z=Ezzedr$(DE=EBnT(M)$w_7&MnNF`f_44G z*(B3hE<^i3_c=Uc&O(4iJc9^T&{L_z;GhjP-knL9V+L#zIEOGg<}f&J(|58idFv3$ z8OAQhs2*mI0~w->Zn5k|dZ3!3*w#3b4U5R~ZIt^R`k3E?S;4@;eMcEb+1tT>g&TO4 zF*@h)`so!{>C+HfXPJ=7YYt;@kfonH6C!Oih-gx68eDQL7_0la4FSx|9 z&9~c6J$ zU$&?sCu^HV_|EFhQHN>hq`{y}u+K%VlhX2>nT#QA*_5Thsp4;+wQo2R>Dbuo(D~QB zG?t-U(C~;-1PuF+b@htk!@KpfomZLV-{P61)SvmA!Yti3IdF`!r@o;#Zj^ zT-v^!Z07O5xD_G=4j?=TOZku_m9D>-+$7$;4fs}`k@V2Y8Rf+ThiNZ(Q^kD9*JY7) zplQP^{GlIo{Xp#a$5`XOoMW?#Z4NoK%|5{m+S*d$wFjW>k^Okcs)OutT)n~`p?jQo zHR1Q24%N~pV1yBL9uB-{GI;(N_$4!{0Y{Ui90IDmy#@|^@HgMAoBNpR_QOJY0qx*!o^on>3a^qL 
zAqQb=?a#*!FE|=HFb|Kx+xzGhqYI{RAoc2WjW~fOWr-63Y|9FM)W4@orj%b#{janF z8b5JUKo$JF@F$^x*F*O1sS+uAS$i>Wf7&bI?Qsct#{@xc1iJ_GTCSHkhmPrRm8Rl| zPnzNd>p8q8QFw`V$~O@h@SH)26ec$LN$lN0UOIKJb1%;_{ng;95~ATjM~Xl&j{FQ@ zYYhH>?{EFh_9uV&&$MSg_6a&A-a-@v1#$qVyyzIg3zP^iCHDkJHLAp1$8&j%XImtj zGO3HhZA@9d;c)!SlkL}k^FL|-??3!M+Vbry^b5Y6&LEl1!2_PwWse_dM=$(UnT4s>#_B8kF%lA zGu_3+xXD$+k$f-dME_!u6y~Z!0x-7?h2>(*3P+C3d3T&XK zUM@XutksR7pdtz#P{5b(D@&`-DSK6r=W*0bG04s{6LI(6y(q-;;t4hhuHZrT`wbT^ zV8q_I#!S>L>bO9A>Aft4SsEY22w7caDGadgJJUh?xF_HM>!Y7OcfKvLT<9hz)ZBCi zjCS3m@9wxP4g!{e3Xig{!jF-WG2?=~;5Q>wkad8SSDhI)mfWbAEMgFQobZoce;qo? zpY#yu?NY%hWYK$`ya1>rJn?)v8YBJkm22P#&!W&e&K$s)fBf-B+s}UV{p~vYN}hlA z<@UmNzuSKQ%U^2Wd*xXB@O$_&I(w^*QO+E6MX=qVBd-xt`G5LYL}@w zyLpN|Kkkpo*Q${p9oCgAS0a1-sOSOu*E&k)1Z&YfOhS@xp}b}(2>j@W>3mj==pQXr z%DZ=O9fySUv}cKH;j7e8@(8E8lyYwWrL9mYD`syTAhiA$J?)c`DJA0ulA!#V4<3!| zL*{^Y=x>-YE@PTt-wu5V?*L)^L#~ITM$l0=DP2f-h0{oM1-{GQ$U5Wb|LVgtl(oPC z9?o3X+WnF{X)B17I;s~A1;{u3MFrR+YuzU_i*evg_a;kbz2uwVNt|P`%I4} zGwr>PoNYgP_E~0S#@d;)PcV*gl95Xp;R|UJnFeor7k)-i@(&}a3E(A5@xmb3t$d|$ z)=7NTi~UtQW29l}Z{p~+p*+t0&rl#e?XeEPaM2HO_lqPB9oG z0(FwaLF+!_-Ys;B+sLT{)OA^>0l%@;20%<|v+#-UIprSo7-v z?uMwrg`c@fAW1!!1-bq;$7F`zD$aq%OP=$@6@_NX)Ps?vJ>k;G7mk1ja|IE&XMIt zzxiALar@^O61T2iqF$73``eBbBuuZAnWwk^v3X6KW+vKt$Y0}QY*s{2Ub*r{yY_`U z?TM#8+|E7n1Syb; z(3bCDI54eX^r-Nw1ZpU(;{8w`^Moa5f?Tt&!Xig{PQGj6S2)dT2XBf3)Aj5pkJpEu z^gJrpQ~*3?(RzD?pPrH#Bx$~NfZ~wNkQj)jTY#Q!sGVim%{U7B9HZtmVdlYK8a|O) z#ZOifrO;-uzd^6Q6wMbu05}7`NRDx4Ato@grdXryDJDYjB)1uqRgcu1!CR_=DO}}F z$+ON!2xF8ujR+FAzr?jw|5ze|d4aypj`V0d`nHqAdvOVn_;L@v_q$31=1XtKCwe;nDWS9hF& zbqxNVWI5j~Cqgw8DBE_L<3%(GRFQT#Jw?ReY*sBHR~hMba>ZqiGNobaY?)YLVq&nd zt{Fv6um3DWpS;gI^;kmTo+9JQlI!!yM?b#K_Zw7LkFvz+5NQr!H2Nj=qL1s}nNeez z-3D#vCj9rDz0m%%|L8w#|NGznJMBjpAr7#vd(WPiNoa@%b7NwirCQQc8W$~GYU8&R za=h#^{hL@n_+=mR;0*^~dGp$}Yi$Wbr|74{0)B(1=a2CPd0@Ip*pFddymp7acdspQ zTGs}Kq_CIn-D=N$?{2$y{c?N!(X;Ire(KToCqDLMyTr1q7vA8=W;TMp{?^s@$tUdohxnjt?l*$=x~4WM0?~XKH3hQJ6kT>sn;ys!9*j!)6 
ziGlLY#!Y7hRq(r~-n;cx8F$H%GB|akziM!3OiaSx8YMbD{GP@bN}aOQc>A!Y7%4~1 zn`L3p)_-#LEFNMB=a+Ro)yjUaXB5VSn(#4soMEe(*_vsn{Stcnof$B$`fg_YpuMYi zyt*M|9i(&i7{N~}>DEalNaJ6}ksrHs?Dj}hodqg)*6BFj+f%1b;6Pb~HOsg=2v0d% z;!K4wT;k!J9+zsFHjk-Ff1o9)n^(uj^tNl>gUb+5snQqn*zxqmxtAhey!J|4&e7HD>}_Ca(xW@=%*-<=`^YLr0H?W@$ua8noKd zVKR55ZQo-y4VWv8&-POLqceW^2lpyO(uPKzOC0xtmvkUzVwZLBYw| zIXslUPoS(FLRq+V6C#oJKGTg3&I%@lYW7S(?CoLtClsMD3EZ!EO9t4|(Mu&E(_Nrs zXBH>(co$1?k}eLsvvB#{W!5r7zjex&KC@_2<|6~?t09G3LA{>9wgU)}cB`T-pL){a z_@5)t(m}LVQLHu*3aACaLwH`n3Ba8{eDC2wwyKn{;3G~!q=JkNhJ^-&`i}C5C(xy0 zlt2)H@IvZBx?YA5EZ5Cuxd4W|p$||-+EeHQy(3M8Mk`nb1NoMUrLd`&U$LE_ovgRa zzsU(7QqQtU5n;1K8kZyq3M%E_CW^dbcZX#USQ|+Tp#+jaQ@T-^_Tu-x+-_WXsr`k& z{9A2e;S_aXnw$;DF3qrfH`ICZN#W~_{(vx=<6~c^NOR-Lo9*z?6K#ggRhiayK$hQ8 zq*g(Ca(c0S<}du!_D7%lXYG||zYPUBjtqD$_O|G)YkH?_WHA5Z7bK`YL9*3V|X(;HG~0EWk}^<0t0#zqiGYx!+x@XfwP0+ zypDJKC`#VR)5Fp%3QF1X0qr!FwdKU6Z|J$CZ!>TgN8{p33UfoB=3B#4m#1 zZ~XFO?Kgkr&$J)@=u-Q_AAGhw&xZ8t=pZwA)1tJ(V|r|@i-xLKd8LG@*W6=B*{%!o zs{;yC||-{<70$VV|Z@ilSy`*`_Vqn&i8m;6?r%FO|dl2*$cp-SVK$8 zq4;_5nkC+T?y9IAHaE`AZnPuJbgivY5jNCc!DILA^FL_Mu>9w_7hh|4@a}VP5lR6H zMwB+b)`~KD>cr_bc5J+T=LhtgjfL>t$&-`q@b=aA%OC%V_Vs`82N*tlK^#M4oBbUV zoRkBSti+mUSwJaJyYvVZU~Bkyg=13P+v6;PhJ&d4jUkO!X{QrPMOI~aiOtZLE^f68 zM=zkzbACOWc&}euWrpuwJILOLqPZSS_?n-yUGCL*?Tsrr32K4%I?7Dgr{4E?`}rpy z;X5wZ+Yeaow6JlI4ebkU8QxsxLx<;%p%A_Ge0v>va{g1l(B{uQMZYPs=6G4?`O51* z%9pi|aKhB~hGR7s$omy*XNYDoI^}UUx@%ypITMJoyUw_`&axkd>9)PH#D-!{<3cXz zDRp_6%BtQ{Wkr@7@$5!++b~P>C`MVUaia0Ch3v3Djcc$mo~|<{xGCP*NMUJ+3oEpw z|7J#q{v(5E2yDVv_sO?{@nt#F`rkQB5~GvYDUFd}(qB$VofbY<#b|C0`>OK70<10La{ zm;JKBFs>6d^y#6%I4JhzF@J|Y3d{1K9%Ci5bkR>V6rn5S>pXQisxrXJP)e0cnWj~i z7*jqD;Ybq{RoWCH#qHE}MlTB2Xw+IixAHMeW zukn45yY1YQ@58zG5Mj(b!&8hv$YaJ!tsO3_Zhca_)T`6pxSpmGN%GOK&XXSSarx?7 z?b%nJ<9kw<;snyzyw1|osYAC|^>VY?oKlq0ez;Q5!V-qT3W-$FS_JLvpotl$8YCC2V~oDfGar0z1CJHtUMGkh<_ zeG=1ie9RCh-u;zZiOcs+==<`RS9Fcki@ui-{`7|J@4baZI(+9qv&$H}sowsO#6{eS zG~eD8hRrMCqiHEerj!{x0!}Z}HUp0WR>q8St%m4w5!+_;6328P^VhFOTVB=Mh9IH8 
z{nvnEVu+9OPuWu_etN%F@h12|jZNIlPD?Ih<;~+snj%bMj)F}6h5(=6=_)+lDTc5i zj~z>ItBqC)NUs}7hHAV+P2L%3V2VpI9XDX?y<65hWBPV*!JS~s-lHeDy*=sMLP8I5 zssBiN^Zoz;KmbWZK~xb$fwM37ZW+>Xmmc*TTnA)F>7ELZ1XcktL_9`>9nC>B;DAm`8FzbXueG88oETqpp#9`QoMazxjy&Kmo7}m(4<&M)XZg-kt90z8zfq1?@hkG5=bqN^(KMUpVHWAs zsWZT*PPEkmGW&ffkMg452vH8WufpG%GiQQdhvU*Um*XuAd%EaogCR_Rgcd2w#GyU? z)R_i37np^mOGR`%=-?>lfOwByc_lE4IT3-Gal4+S@90(U%9)BcuG7+I`S8bGqiKD1 z9L+KsFwV>>?*UGfbx1w<)9J0!KZNcprj*_U?zT}?);f807%>dl@8+_a+9y)!k|^nF z{Ip&DnZNW#((%qjSPY9fmL$FBv8VY)!YTH#oMA~0j$fS%%n;pU3E2#^d;f(CESsEZ zx0wCjrr*DI>0(+@W{@1ZA=kDq%D z0~vrCPjyHM4n^waY?nIo!w;X!-X`U-GYLA1+@JRUk@l`#dL`F=-|22N8tBITIs;%X zAUGG^MN%R~$`WOX6lGhIZ8>?9yhuJxK10?@@=94Nj^iXNjB_TIItYS;apcdrc_PdT{3QcTLL zd+;_NvL}l&?4Z)qCyowZ{QT#JcQ|tVuV4RZmSoORf84!u1DSH%H1&?{oH`cW2qlzj zI+l}8MEv;E0em77efag~RB<}|egcKlIY^mym_76k0u!j~cCHRp*Y~_oAW@ zS;0(cNsOJ2rZR7NKDw$P`->EQomoJJ+;wLtUG<1X92)aZyvWNk-+`D8lG9g%pMVkQ0qr|oJ{xhnRCN3_fUAG zt4k!RQ%p-wr`hA?X4sdp%5?f?zWS@f5>u0Z@}1uwb{+!HP?nv9qSs2S^rv(vz6qCg zHOc95Qx66BsarEtWDBgDeD2E2!=>l0Flx4xB^w%rW_?EBH5%y{+@piB0UgJw^o}wL zxj_Z9V}}M~A!~r0vEdY&T}GY7?ujMlx18rRDxS_VMAj7giFQj@^1?FZJGxKVl7EX` zF!1cNis?82Q%~h2tc|fe*RPLiW`n0akmgL)+lIwjkQHD&2roKN$}%7$DQ$!pIqY`OedwhOLuURjJfEEp=TE7sMOC6FJ5G- zdGmRWH{Bj?-MPmlew-G=$m=$^Jx=ufdn_a4{*ecc3}ajc0l1y%S);+h1jqQ^<<@EP zlf3lKgq?>h>%!Q(c<};xc|FU?q%-z*x6Zr{HZT<|)?Nd)(zQMswZB ztI_owmltU`Cf_hV+}oCA9T=$Ux0|ig5kfjOVqcAsjg$tdMvHPSiNPAi{MNVs?(j$d`a8qdzWg#9tWPld%q%0u(qX1v4!Zu0#&w;J(%PC!QCFCe zc|2VG>D$9K4Db`jPY?g?Kl%s5fA>H9-f*9h@L84}ncJSiqAvO2NV)?JE-ljl){ItG z8U5Dig-5?1Q$7|MxzqU0iA#Lj(QvcvAx0Muqtj2(z@I+%9EP(aj4la7PslGj7aQRA zs9N!9oM@z~MhvC0;nIwtjFInNLh9P@>nsP_VNcGB=PuF-I5d2~Ncm$rnd+=d$u`K3 zZOWw+np7Yv;BW(7d7ph)n{_23|p6|ZDCYt z__|^EIGfE+96=Y6#U>pHH(*PjdgLsFG}jD6bx_B8auEC)d$rNo&D?Y(oc+R}wj*w> z@))Dh8nzF}r~~4#Q=u)O!DLmSLY92WFP#iz=s-u0qn_)pU(cJsMVR1d&b8il2Cs8S zM$XZvV|JTmXzyLSmQI`HRwK%DttFj!#k$wCU z@;CpP#%;u*pXy@d6o)jNJi{hx>W6vCl=s9O 
zXRzVqi8b~OJw04Fcb;~bfdb;(WMqG1<2~vDW;waD;o`;1STIJJ06zfIxW(OWGUtXlHybK=M{(BdGr^^TVu8ebiVO+#JTW#J%>dYvD1 z3UlSkOT*v!JKsoMFPt46FXXHxz;yk9=C&kN6tnQo_nOqyNa!{SJ;7IXYt@ zmF7ju{_~*k{I7Q|czV?lV*J*RZOJ@6K?lb6fj?A-sF|*P4n@D(O zNStZJS@NSs$i&W>j#te$j5qYmd_*ehT8wr&LHenKnTb#r4O6bd5jUMO-j%(%?o!Lh z00=AW-Z71|{Dbqd9=}gQ}lgK`C4PS6U$jK z3?Ff-&%480?_D2mbDQ*ohi8@S&%E=z0ua?JSqFo>;4uEP?+UM9-||!_x6$`Wu4^!=)z%UZ(Qo)nsAT&bf(KF z8i4cYxVSYo%@-R>pXMZ@W+s9S+)ULUrbFb3Ww$=KI(+eSF9v=)@zDV`b6ce630FVY z5f0}@#gP9hvlc)1?AhUicdrhg`qZa+KR5isr!EhF`0Bf?!bT1(O;IoN3Q;za*6F<- zR5pEWzWHXDe%?Rf2>l^CI(8h~IBecJtGwzqb+3vj<0iM3qj*3VA=rUI&lFy~q7U>%431-J?Tk zhwUhvC3jdZF^f)F=9TXqmMl2>zsT|(g}J%G=4W93;9q}-i}(%=FMje9!9)W-lVgD~PRb@ko1X2HTAGe9dZX+9j^>*W!kfNN<(P8Lx32pcu78X;(%7;? z1U+udx9ysxPI9C45$d=zTmpCT;zgF{F<8Q>aCH!eb&AFnquGq$uG67+8Qx(!=_gr^ z>I_6S_fiJJ5_KS@CwnboJ5ZO_U_!NJc4Cy5G+7Tzx6bd0^IdNq`Ukeb3De{Fl)vuR zap>1!s?#2j55aX`<0;QjB0k$VjU4Hah~|QyG{s$18(_Gyw*~EF^9GK_Lgd%!G{z*p zc#v)7m3da*W4=4@ouAO!_$xnW5UGEm)tT9&M;3=ye*D_-K1+tY$kL9%ECXCy4;~M9 z9%tk+Y(2tzwq2k9Dwj*It$iKQt#la_0SSakky}VKX!g0KO*W+ zd5esE!4>VruixUtY@`@kCu`vATXl(|4jw{uzPp6D^0LyY<$KtNIB5~m88p+Ww9E&> zdY&XN)iY!auE)S0X3v~6p6|cUa%yyV*%?~e2-{dMe7yeN)wJo(%G#mtm*8qJT5x=g-@~W=dN!upvxn_}3gu5`YQ#z9F_RE15>sfYSb#C(0vyy8 zF22y|i-8cNfc2aDC$Rl7a`7<H(WjH6k6U*0A%4 zEE{7Nk!19`rQK}%1zQ~07Z*D znm5YEzqlsTDJ@N|)OhnqBAAW~$o>p!m zH(tlD0i(CyiEE^_s#)%-}J+8dn3y#P&R>bVQS%_=8c8`k2P51VSr@{|v^= zfBMe9!N9*aeB+ycf0$jAMv58{#1%MjrSPj$!CaO7F**-!PP|QL!VBtRio!(CL$r;N zJVuwFdVcu!KlyKl@BG34IjmfN3nQLo2#z!acBIpiO68d(QrY&B&e|2HF&}%zZa*KHPEbIJ>{>D0s`JKY7EN z;pym*&)R##2k*eje>mr_u)Ox#UeYiGUyMBW^2yPAU^E`}kq(PQDCy;?1kt`w6U`ICUrc{~F zU2dk=!%C*k6u>`lb}BW5gg?yk&UJ1~by<#v`woWG0!xZKFkpjDksSzcHl8OfPjo6F zE7Q_38*8+3NB3{S(ztQUhl@}Q2)QBOKZp^RPP zb%+S(PaUAc@a*u7FMWPE$Wo$ZIv^a-GJNglK0W;YD?dfgxy2!QgI-gSWtK7cz(vDq z{zg9F&oUQyR3;j|c4##&Jr34-MLr@!@L9io=Chw3p8doN!xpEm{o*%%hvfx79)9rW z-y7a|{dLNnok(v+fBq zJ+d=44A<`5%zl%F>+cVTu3h6c-cO=W&Ms`_c-$?P=GqBeK$dee+~AGAZ9(%2dmq00 
z!&ip?$kMM*KEvAa%a@09Y*b$)FU>>Sscn`;&_#rw>S)TJhZH=8F``at4HdrpAi#*!X!3jLTRUJjkjB>QSH!b67X745Q=7OM`#x*jou#Owi{emFb@CEnu7k2J^v|0Z9nGhzV{ z#}xjxqIOySWtwTo`eJ7qLY+0^v-MWK5xTKIyJ+pkheJLG!%&CDZ;bv+-kam{x z7P*}s`K~qrfB!+IvQb89(992PN$p7kT@I{HOqQJ~y6`G@c8rQ|zb5rd1tlC%+!Q83 zP3+Y+T6XvWcmKgQ_2JU+^)Fr-{+xXvKfC@8dmcFufljoi$UR)WGpw=PNm+XAdDojR zdvPlSaXHpm4$dab>jT5d8kqwqaQ=xTiWtG-<%`0Jpp2E6@vT4mzsuJ`T?O05SxiYK zv29dzp=pwcif={U{PR|StbFoMrY%GhGAeBgOnFus@?prtJF&%8@WrO1BUCb3qwBO+ z!Bd^1m^J5QM#SC}3ySN!(0 zI^BNve5|K=rDyZ^=|4P8MgB8;V8rPscN~`pp~Z;!`+t&-ehqj0M=%1a4{Eq>3YPe4evaG|7oh&B2TzT~L{VZ-%-~JF8TBMkv{>H-F^Z@b`ZAzaIYNkNyvqX#AK8QU&e!C9F}M zH4PXeeEVBGc5--`QP@kL`~sU^kLH~EO-63`U}QuCl?t2+D@GhLD8t?WXdOpiI$g*P42iAvHGN7zA`Gvh0cYkvlwvFW{{Jli%cl9T=$( z#mJ<0Xq1&Qa%rK@GVG+G>XPsbJuxiI5T#%o{}o|8E5C(No>ixqJ}yOB+VnR!TPdmsv)1c=0yNFj=y~@v|qnMf&Wyr-s+w zcxQN@sd;ZswgH_#$Z@`qQL_hi3R>|CUF2K2;yq%B{Az}v&g!>wA#52=o9-SZFE#8v z(|?{iGs~OEcX`wJm1%XQX8!n!7@X|_2YBpgWCR=o&CKdEjW*UA2-#dqS5h$}WsW5F z)Evqb=~mrghrz5vLN;392RiHhzxvYU;S*Olq2%!jM<*@}cewLp`_6ri{sq&)6BrM4 zNZ3bmK%FrgiL72I|L#1kO*%Ei%@B>k>_@>5>y$kWY;nf#C`U88_aP*}aBNjgzU2{<+=RK#o+|UKdN6E z>AHSOoaJzbrC$0wyX#(?IB0fe#$Oq!w4dWLe_%?R~GKV9iTUPF`qJuC)LbekCm_N<&>6X|h&WEOO7uy^K_bc{VL zMFZNUHO@*L=kzSs$1k$c8kGud>hnEj`-a&^*%$GcW300m!WjjB&>1a7KF-28avY@z z2I;JFmf3R^#kQeA7}IooIsh|vy^Q+o@15uVt<1V#@eV9VVk6O!H_Ts$w__C-np`cmmBGh6rK)l?jeOC_s*nI>{jZ`JR|%v^r)CE5xuuzc~EBg4P_ z(^uKxeJ7`nuG4|@z8yQn&XBvrG3`5TcRK$N(|I!eI-SWBbxn!N@Fmp$*7QQQuxgyE zL4fM;R?29`A_97Vp4?}&k_3#wgB5%5DKs-jgMsoPh6R@~07KJQmp5TxJfyL-=XHQe z-jJjy&Pjv8t90Q^zcyf&utL`)tY9Vvr9?DGEO1Dl@g%{_lOi*OH$YvZeuQI9j#B47 zT47x=M!XHS8}sfn%CNA&sSd;yYcH6cE_FR*kl9u!77{&3asnFw(hmV(VS!bf77Q;a?^>f@Mm3LSLEzyVLvf({F(I_fB$=}JlO z@VG8nIZBg(hMTSU5QWAJe2l|`0~CzFzW?Trhu{BSuG5kDuZA<{8S!6W$q@9qE?xsU z`1zAUDbC1`Qzkf4b@updMm?6;acx7Cwf4%EG(B-?S7VdS*i= zjWmcfiB&mq8i^;QxKt&b2ToF(MKNROg%3L;$b*WDTfSJ9Z70H!C3RbY*_kl^kU?mL zX5#^#hD#+^8bv-&D{ys7o@5Cu$p`d^2HH+nbdbRC45ae`^-hF|BhP#fj!~TW35VPO zP=70}0P;=RmPd3|xmCwX5G!r``#-@&-YRfuh_0BbGL(B>i4|Q@3v$FJ`Lngn(grUK 
z^Ne%eU5AXAcyh!ZqiAkARPQ&a+||*8jP!fj3-%<2U{&-tZ(V1U`W{pB9B+GAc|9J^ z9%XOW%g+p#&K_Y?A*0=?QyJwYzYntsBjw5R2am}cDy}MHC5IT(_-X8EEDE~%a6O#^ zPwmOYV)$=ybdPzIV|UqzxJg5$vut>B6GckKp*5w&>K~D?W?qe@VRt;{fEf;t%E&x0 zZ+F#8UdL!);E!TVzCj_1L)>l};9VTA5jewvn&ZxX0j-bQOd2=QtA6)5$z?XNwG) zXVzmE1DF59BigM6E+tuDR^;&F_V6?vir@IPZw+7m%$IUPo42RLC}xk-_Q0u2=T8j3 z^35y57rykh;V=H|yTf~L|BTIy*P#RXa$fvnIxOyknZ?j|v#GN^k4e*x#zWRB-&|p% zHzK=m^7!!X8cQKp*#ONl;19T+d;S`Ct8l{01q}N;9Pzut5*%kGv}djJtQS2?f7dpE zygz`kz!<==HD5CcwhMp#?v3G$2g8~W23ys zD`TFzg>c(Gw?m{2RFc-)O`?V0lq=8G84!lS4Dd|&4a0l*DthrZQ}Q3pMISvCE4Nac zQ=~1`CAH#RdiT&-h$erH(|HQ7BLrov%Y&Z%H;GzY#oIr!p-#~zjQHq&(^$~eQ9-ET zg}>;opu%U8iEle-MXJM3TO_ZexPF^%rEUB!1w(YE)84?Pz1+yGP^&WANXofUwP|Om zvn+=W^lZ{m@Q$bjremFzxcVL^LahPo02X^14|?(-~4 zR3a+zUT1vi8$U%Bynyy+B~o!!*(4hpDk0Zp+c45jHeAw2WA5?~glh%c-89piPqADd z{at=iK&HwmK!Qm!g|*Qhd7}f<_utOg-l`4M$*8L0Vw}M_3=tN)jLC#jEY?Ct!fqnD>08Y@^bmtV!<*+xU9t1502w!t%f{)Bfak#-Xy49nT~7{Bmr znvQ@vlATd+(C1EfUBZ~ajn0Moy#`9yAx2<9vzxCGJY`FGFXz&4Zh&@}6HXqPP zaB|G!hZxG7G$#&^9ENVHHBJcfXiJU=WsQ9{4maML_JlU+lgEe=PF%Kg@7_w5fy~)C zg$AeBU0>UIU1hJvVZIZKyi!*bk0aN1H0*ebPaZScn!GMqMmB_LOo>m!(5H2<>*no- z#sfb)cXlqsWj?zW+9PIFyu%zJd+8j<3$ssPn+=usR%SQ?ae4UQ_JiR^*Os|-ZDu&f zn)f$u-Xv?-CqM(st~U@t6SHStDk)Nr!__!aN3x^{*jWsC{hYOvw;5*f=ILB`l8cSG zcXa&Y-}?6OZ8j}?oNWwQ2^=}dM?R_nr%F*izxA8{kP8fN3_tw-pAT=o@!D{QBN}a_ z?Klj(bPmu(uFHTkVxfO`=`eVA#X&YcyXWK1{ncTK%lK|m-*~L`CUu5oLmA(>eLH(E ztaDxCY&o}#yV*T^1jr+g?bPT{kLP$-$IhNToKbppoz@1h^_IfEJSSDjQU z2dfzVDI;Z&jykv~G079_4F_!ew!UyVmGGKOI`c-|vn}oV8;q1WkGZt2vLmRmYUjk6 zi8%~<_orOFeUDRX-pf*?74-Zrx_s;Qt(2{8+Ni_ivA=CPde3oesAVIa5M-?`xSvi+ z%vl$xD<6;9Z}?s>Vn+&7o>sT=lxNioxVI_I_&_!(&DLY?n`k5oLZysHeRJo zH>Nor?N1N)v(iBR(uS^tz9vV*)86sSz`*F+FkPk8-dnmIzj6JIXF<&09;fq{*rwlK zd_sdd**jw8<#&VPIca+XUjD*aPP$VDohNbn-szg~rfi{IT`VlNTXZ1U(W!YHCQ7gUQ2f(Y5Z%13&s#gPeUZaTiIn)K|C^`AC=5UxoLdirZkV!)8C(N z|Fg>|X`9vv#!Pjs>p|9iuR96|uo>m%Fdh?JLCl{{OX#TNfDvJbD3Hh7~ljZE-YB9&f+PgAHj`RcsJ$2g{u0Ap8rlX%j&FU~lA-g`7*15VmJ 
z+h_bf^3MpUVj0isqE({%u!QG^`{U9OP8XelzmgZ3v&Ttz*sFqy49rI>8J|@!iG%;1 zbj2N+Kx^`ck3!XC3jqHsUy?+Lmj>C!%|?1>eR=rzfAoJdv#>mT;pcyer2|Kaq5;{= zH4==Gn&`;F89Q)#hr%gNV_LbtoVZ6=hwI3d$;a46Zgw!{X$+rx>GLdYIZj97k2nMR z4zzh<%(3Ch=e{z0@}IHp z`+%&eKs9Q-8^aO9+(i<(Vu;bG@)~&BSe(5ID?o98BtNhtSA#`c3CTZxfGJm(7j!)r zM9ADFp(P9xhV`GJ1uMp)dfH*NK`CAmFR(WL(9e6<$-Yb~X^W!OQAq=|u!Wapk+|}Y zapF&T3ZwMGOVTgesuNLFx+rBXC%*AiUmX?R-YA_kSlPS0L?c<<)KFKLvlw9RJ8=1g zOFUMXdANS_7PGc2y(DcLflcB#^0a=C*(&mA1L3c6nbD7aa&5SH>L{b~Cv#85I!AgQ zW7D-8g4KaZ)WAXAx1{tFe@I9xvgn_>ElP8>+(TlRoiofAh+49~vwncIm6<5>75o$%@`V$H+yDi zGx&<$p$Pn^6Hdc@?94O6uY8MJbeXyMPi(+`<0r4MHhDQ|IXaJ$B%f>MO~Vv}oD){= zkhdI7H5^#vm{s6r>C_xKdyb8uM^x(JCMOTA<|xam=c1_iD^HCRyWsIKV8b262wMh0mY8f0t!XAF!NZI|n^@l<#@=ah+lbolAul-TaQ8qB&g1#*wac zR3EVk+YZAHd8$lILsxoWeN>&&EOnyZu(Vh&DmzM+g)6$CPMY7<@qrP;p}nCa@8~GS z=yDI4GcD?v^*kLh>N)o+xb#kab*A7E9b<3#_1IcBi-U?bpN@rjX5J_(PTM=gCTmaN znsJFU<@5ac^RZQ2oKi(y4MuO!SL+n(26a0!rhFO}9Z;vNJ4whNzIc=@A`t$>zcbW4 z6L<7J{_gR7ew4o{kP>pLw8m-lIQ6V@uW{_j$NJeM>Go1NO{>SR6bd$fO>m#(>_xQcTjh}^Mio3@f!}Rx#%Q%rk!>v6}=)S-lrY2YJq#OcqlDV@OTk8Y}i4sIP_cEPOo62x_uHM)0g zm%UfBJQtYNb^oLo9U!;u&Weffon$`zhp-x|`*2X>Z^~p5Cj(Ccu7=rV4~}5S3}tBK zS|t5=?Kpamc+&H3VKSUeBq)%DPPHvzwgRUV^bK=`y9+h}{P9~i_V4dKECBrJ8Rd6> zk9vR~&fv$}QV$P@*o?CA0O5k$iY-SeB5Y63aKv<+`5kxcC*Uh>=rN;PF&xt)wHi`I zw)l%jM`l#ktYstaWY&Z54wDL%hN?wLTBhIe3s;gM zPa<9U;-4I)uQX%()~8>znTag{ukpBUj;9Q$3T zLH^S)HKCbz$}u>kkC#+vDzb)3YCE}=eEtz$qX|V#$ymRN8$7_)s2`0q(}Ry04gKMt z{s)#A+#i1NSH8)5#1k}7CBHd30hxMCS`>Qe(qJJi^XK@9lZ-as<0O%LG|;DFNGKCW ziDp>><0WREdUoc*^TRj){y!f6^n3pyBg3Ej{1-7Mo+q3}7oxOxl}n6r=(9o3Wl;EK z1R??>&BmYH$(wo?-t9!0s9%UzgRPd2C?@!i!V|8c0NgGk!;bnTHVru*iMyLlk~&an zX@O~kRT`v;QL2mh8%*U-$=&4~5s`lc!u9e{@8EKDFHRWanWv>|foWlOgkaeJIN#MD zqM#4zy5CmANe`5D1g*(}(-`1Q5eg`O63Y+qwQnM)^Fo{^4P#;m*~uhtFkaJxzJk2pL4d&NOt6H6jC6hKDn^-`nU8Z`y+>ziAbU4~>w4`-%y5+7u}f#O zJmdQq*5CixjTnwz8g&Rm@es2V>5PHb6L{Qg9X_F3mG5Op+4}Qa#FdS+$j@b!&S)r09Uuxe-;pmK=827U zmdlW58frGI^K^jhRGr}Ngf*5EIh*kOGaS{(1&GSz7MqaYxyof>)Zz0i-+G9l$XFaQ 
z&mqIytpv9htb%au@n%=xs-r&39ES0=UZgYe?}oSE{Oe(t zr9kss4(0t9=HCvRM6-B?)2Qa{)uI<3k9h!s4zMKd=fCuI*7P66VA$lo2}T`hBa~(K zDaZ^onFs!AKqBNNpL_HbuI)+y@`@2qm0_6_pQ+sfTb zxWMT~C)1&DFU|=DKpap}-w!d;Tv9DrI3uu)>?u_ktB!snD`zW`@8~n{){k}wwCFuT z+C74HByxPL#t^*O$$3bJK;8FLF88U}$@TW_bb!+Fr97Jl;4^QlgJM3?j+4%zW2aa? zN@omt<)&VAzz(wamq^1b>*Bp}UAZ~qlvxkRjlss#IwR6WR?%I)<4lr&-5KUjuWDZE zyVqk0YrqrO9(m-y&nq(1Zyeum=@s`!`H$lShPvANDfWRhLaW&lcBH%W(%8#G{0&5W z#hdXex8uj336JJwkI%EcLTi&vkHAy`h0LSOi- zG{-Re(+5$Hny0WGM}Hb1T@^Nx--PQaik??JZjUE~@h9JA$Ve_z3?)q11AFDUB-SZo z9!kM1CQD@3S67DRt1Rujy)oRq&HWr+E(oA?xiNzpJ+R)D7TYAlyNjP6ozNbTXL8fz z_X<#&bt61^k4W$wi>4r-@VDU*xyn@|dBEknWMTtZ>o)O>F+iq-nt~JSm>IvjOXmba z6CRxPzMoIS_xb@mb}|b?!q`}Oi{|X?8U_xE0GvOMdD~{;tv3zs#Z`zpM$=Me#_>-{ zs$hs{B6wOj@Uy^48~?1tYed8F>Rh?bGzO1&G-O0vnDbuWf>)Y7OaTYIe|<6jC~l+X zi4S?}cqag&F=<@rZZ9!FDeHDN9}aK)_y^pG zx;%X2*MDO;%E?HMb~-Cy1DcWgP}xT#;Dv{2*pGLby)qNvYJ{lV{jEbr^F~e<;(!5%c0zeW-j33Y5iQuC;xP}|1EP0xs|9N+Y zMVEA?0VvtmbsYr^r%Zz;&q>8=py4f2;us&l<Rd4$e5 z3T<5zEa6?6WlO%`5q?ZpohcbJqU)$Wh8$Ng;5^ks!-@`pdIj;jbTH?@>vB-FKYG+; zZs$Q>lP6F)?$9SABA`lB?DB2yh}Sn^G)NC3`->MY4bQWo`~=3k^sFx5z`)1ACat^o z*M^&`%v@jqYDnHjCN8B}WV7rIW^lf7a(4LiZ~xM;%!;9P^y)r$N~}NP*iVkG_4a~? 
z9D(|P4a<)hEV|EXAA}#{u%ljWTXBI!>p-Jwj`qN(Ee&#Q~emvpk`e zxRx!;rxg+6B)?`kafhTB>_ldyQ*CPv-&I*F-~K+)A-uca zI9~VjxgS=)v40Qmam3SMGT^E5N^25m^z$LD9WFubPr(@jo=wXrKKZBIji;N?5R`$A zU+H117x0$8PFsI=nmeA4I((L$qn9u$1<8R~oEzd2Hg5Sr3Gq)LdjV2bT^^A3ZWy=`|kN#e#4t(oJ zuMBVA;!uVS><2EPa|z@Ey1m5!pr`I_Q{QcHH(J#fv`v%=@x+mTv7{tw4`T$R+K|s= za=}ZdilFPv_#P(l&ojn>fQEd8i~LMerbx*jhK?Vyl=xcwIx zsUFcRC7OFN*EL!MA#A`-4F8)5Omztb_M*G5f3Y%r^yYX4nqD1 z$1Edgt_h#z%6H!BKKKlvhAr`IFey5;8Z7%gnZ1BDmUGqrQ>k?%<18&3r`PMF26h+Kl#+K&5|a>n>{S|*)L(fN|&8d zm#;Z8?@W>%uUb+j{Uv{mIWxg9vYH>|g1ahmvW>#g;71=1umo$@&Ey#W&UD5(tLpJ# z=^!&eg8wX=&pn3NS&;; zA@tN)PNuHP%Jh5sW;qhJmtDWRlW$#z>HghsOmCv)&otd$MuqEWgPo6QDuvZ=NuI#< zmn1e0gICIoGIiFgFjbmk+Kc`2k9@)Bv&*H!3OAN#_b=Y0N6MtXenoBlE$m|eT0o`0 zpL!2Xy;Vp)do9oJ%PZ&;zfQoh^0Hd&MVIqnV3W_$YNs<@2 zP*FgGYQ4q!%4`PXdy~prh&gwQQ>fDhw@NuiN2~B;#>Pls)2j50>8}b)@yQpk#VbKL z6;0RqPb~j*JcMIcC?U>d-yI$-aGD3x)C|K&5(>|hC!a10(5T6I_=zn)B9|Iv-0AVz z66MXpoB|NV0#=1I4q!rgkcw+gW0XLw%TF@&Bs?(6a9VgwXUa|s=r>N%^=ZOgjuJ$OJ_Vs%`HK8()q)MYj`|skKLVpS%Xe-L-~IjnZTR_L`nBP? 
zPjT7X;z=6q4U8JsC3cJu;48FhxI#S*;_h&aWdaXa2b~Sqtb5j=-Go0IPYb!JZqXpS zwseM3%D!VSQ9EPmCk$jG2Av2t=v$N@%Y^RkU<)phr#TeUR`HYF&qpo@D zt#^mBTzaJ8e&NzHF-9~<4zpax4#my8_lDPgc6E6FgX`?2d5Gb=$!6l)+!t{>7tGn2 z{8xYcU6vQsQYx1w&9dRHXr>cGg{*ud7kIGT+9@!9G!&WS!zh0+tS}ogPdW40P?vVP zS7(img{KO!cN;muJ;o!$`BmN`XF5f|ZfazKs~RfONGF14gNf`&SDG?%27l(M&Z+g| zT()?U(}K>PJca>VGe_INy8(U&<9X%)4JBpl^`E}UMq@hFbMATA9A5my`Qg{U^xW{d z7cWrnShv!_CZ02w)Kt(z-cpY25Y#u!THb-ts7K1fEoOgqw-4w3j#)Yo8*Y4V{Omyg zp`3W-QH{*DAnUni07$!fC67^rutz#Rkp)3<)fquH;Ff=#v>Z8la(LlW$A=eQ{%jeq z81t5YXOXi?!^+Zl_vur|-Mxf%dL`8z=SUNA@c*HTNIbn&oi_Ajg$Z|yC zvNvZeJp~K9A4gouswXq$2z7k2`59VstZpy+qzxm`OZ{!-jk9CpKb7Pyjd|K8?)<~HVk!NM2|2i@YFm0d!06+jqL_t(>X2`e)(04D^bh+Fop1s7a z$mfv>^%^d?MUuRx1@vFrYPTEHCf)y{~RU04-@77)Xo7CIHGYJ#249oj2`cZn(;+rzK ztGz7hJO7E={uDd@l@{R8p`F$V4Gl6JxmB8xg@177UuXp;132-m^out*O@s=V;A?O` z#FV-oIC+xBP?I#Ch!4b?gVUgUYuinO6D6si!)pgc&XD<|^j zUBIRwjfBL}pZ{stpuHQfrASpND0A`Zg4tw&P2RQi46|TMU%7%;HIZ`jDtVirK1Fi_ zA3X~1$m3&Q!4lpIU+_vFiXA*AQW?ohr5GvY~ ztr4zBiEBQ(bS4!z?@5!q3@wdVD?Vz?hE-2oB8sTF@R$hY(O&~nAEB)kyhuIXyfuB0 z370i}_d9=>WfGtM;#YEOwEE#*0_Ek};N&rXt*le|fXC5U*MPfO@&N{djdB`F7mg6e z0-uVVGMD~Ooz+j0Wk#X%M zFfH?r(O2=~@6od}9f$iNA}cC;MyP`~2DD_Fp9lTyc%UG-#^M{Rr{q-qQu7~tD9cD+&9&J|^9Q=ZTn z%Gb<#rI$QyuqG!ue>&?h8$0u}$`U-nyD_&gym#gnhr`EC=W?q@tK* z=jk}wiCn(NO~hdKWEU^Ja`}ZL^lpT`e&gow&JTagah~^v`;R!2pU#6fAG;Js(OJ)) z;1rI-%w8Qld~{gItc!BRI3ykOQRmVWmw`Al;%4tU>C3!;0b9FJL92vq*>+MuAk{S>fhlD7hmAi z__L01$HMsJ|KZkmTDU^L{tKr?$1=+G>1YMA?P)brX)V?A9m(6HBuTYOUM48&m!8|9^; z=+nARU6AKqzLLISb{q)raN_bOjsA4%rN+PChIz7-m1Bpsli@6ldgoGQn|A9KPf@yb z;r#H_`P0LvUixHsa)kFoPI>h7DjNYYZEMJ>tGX8-vbEDTeF9C~VuiqG!idP}6w&7Y?7}lz=SYVDw%2#t$FgyG>I7q=6mP zUe60D$P#;>jknZ`jkXWtgD24G-{~@Mv$6tY;uyy<{Rv*nOiRQ3upYB<^F1?n)}#D( zSaP^Ur+%Bd_|Wl_!}h@&blegXSB%6l1u>*D#$8kT6wP zgiXWhY+2GVGL20k;XRZZv0jNgbtTDsk6wFtmusFDY+;%UuPsiDrR}xhhu{07Vfogz z;cLJ8?cv~&Qy45*j=28pwZ zqZs(Eq^KyRH$Kz9dKe)OOBq-&7Sd6$0@h#Si%Kb&tIF}b5 z%c)A9z_d)K;XQ7pzIy#8^%5iF7)Le~=`8a+!g>Ak82EF$OT)W29&iT*AQVWN;0ULY=o#K*a zxtq9GsongkRU% 
zTOq8oOzS?!as~(c1H5l&2Kg>=?*Agcn_RfJ!~O{f!=UDLCCZC8nDalhS{719=q#8^ z=4(WQJd`yN==`~Kl~D3N`ozqKX**-A9ISV36Wq*Pbx`MR?^B+lG-i5UA;PDhDUxUq zVgAN$|Mxg7#Ev`^pLBREul0q4bwN(PN}Hqq=D&Gj+j8Ilo3??|h}O_AW5=>@M^3)d zl`~J;&Hj9gKq0AY08Wuo-xqi(+)@1Kl59C zaj?XL1B4N`c!aYPkqa>6@3O0slQfx*c#PvuXWXPiMP!(`{gs>giN`dAO*l{YXdXSX zG+ciES=Nm|$_2E~am%&3QaAcmXU23VMt``*ga2`mm-av){wp8T=zjevZN8^q2+B{_ z`H()tJHO-aiGK^9d>4wz-{5awA9{}%%`sd{JG4w!FIQ`qWV}AGJ>N3#3$LW4Op8nnScb|rLnDNth=}>PvPWc$e?dfHONE3kY!IR|p zXD;a$F^Km%*rGu{<#@Dteco>yrQhU_hW8(#o>zA5** zyv1rodAP6SvZ7>P!-Y+N6(7aAaG2KF?@CUSAM8B;3Q6$MRb>rLoOkM zr^1d=fSw07KGYMSLK1Nq70jDy63|GBT=CCmtTBO*69gzmxXpLLl^ld9T1Z=kFt&LQ ztpEI;*=E_2?;4L(o*p|&r1&K-s87PX$Z+98A}V^}7oHB*EbHenVW6oXHLka*+%@nP z8KlsF;fSEf%TeD24E}YD?pI!YoxL!3h8HeBKb$^umX6o%aEI?ZoNIrNOUFDV$Jqck z-OiEU2g%8Q+QbM_7$Xh)#1>5^VXyCe&zymZPW z9V+Ob#n^NMsv2!t-chkm$KnC|0K9bW;1cO$+}WAnKhC8Z%2}PofMf()V>3qw6KY+i z4p$wQT#H|gx+&9Y$i?KWi=)ySxY_iI!S3kx@||Tyx7pkZA19U$(OIA~O-E*R%~7`< zmOUM#!$h6UeulT$Bk|yi&(n}%5Yus+%ZR(3q|!B?HBe)U>ek*TwJ*z~SRb#UV0mOF zW0#qR85-U)&{Ye#g_ax@Dt{v3S2`v>W!r??-uH$XUS)j@CrpR)X;^*x9zwz96HRUS z&AYqH%bCTgQGVi(G1fcIE_6KU#FM5=4tbCtF7=@!1-i_NfCsgw-BA~627-imXNC*? 
zv_190HS^y5bk@jD-nKJP!1a^7w$*LYU3X~{n>G)!^esz}h$AwOgq3z@vwWK8;>2O{ z>6r^GZF(>)-@Y~c`Y-=dbzF%nOg^Sad}228TvB_-WpEL*2&Y559V z{|-9-jMFKd?IF{rF#dEMe*aUv`@{X#zr#8Moq=5Zi)`a`2AYoHC46iZbl$S#cXxUB zJLy$@X_&Mt=P@1P?%%@n_X$0XX2W!b_dG10>0iV8uhZjqfA5eTx8Gy<{yu(6v-OGj zCre{o+5<&ir5h=Qh9I8c?s*pJk8tR3S`}_xRP>C~m}A!ZqnvSd9Z#pb+G@%u%kW*M z>dAjDb9RQ|_SI{{8?U}LyvM;2%d1?R%z*(%xyE2==F#vHH`T9FcJHisM^Vi{ntRgY zQe4|W1!1b(6JgTj#suqAVIzPB=VL$YRJVjzggW1Q{INgY3b03#Fbg|=skF=?oQ;iS z#z+Wk#p?nKP`swf89xZ`sro&BPj~?((-IE+n4kUl0}e^llc_`xc}pi)x2SDxOdq2F zj{dnRFOA_i#l~lBQ2r>>@GY++!g~6Xa9*UtMxa+}O2x{~0t*36BKqnIcfAwVWLVsX zh%f-gf;R1z{)h|{FmgvTc|K7llhBWTDLR^b2zL!3i z?_M6J(1uRvhOwIUkRolM-QnhY?+pL)pa0(QtKa+wY`DHc0j5F2SZg9Ma=k|Y`AoW= zjB<(#t8TGDYU$|qaE#ICjg|L@m3Lkn7HQC_-)jXB38k|@e9vlkRNM|~`k3N`k{W?W z!;NESBzg}2#GPe%k|)B*Um6ah2sp=4r2OlZR#j$1plskRb1Vsv3qOeBz6jq5_Mu#* zF)qAWsWt;M{=y236E;#XqMy**1Qbv~0#=@Y$49<>uWxy%cRN?57bZ~{vK$NmQ%dNZ zBe0}TCn8oBV6JGIVPw`$SZ_SVDPO(LN*erePl3)e*IB`ye8(4z&<@VNY;2QCzUV`w z7*Ek7D(suL?hUW}=(XX?Klkb3sk5grVCI<(;C6BhqI2BDY)4>$wfIh$h|?QKPqKvP z=`&}zE8+ZblnvJ!dtds@I>*AYF?(eNoOg$JKe#n4+W~>^8X2uoG>t+#+}=uFH_v0d z!Kd^nFGsd(O*iGny*d!R%H~}Om|5}&P&TA81E3M*k+@zIrz^ZRmUiMH)(ixv9Rp?M zPuB-|H2T&V>DHKZ^!Yf;u}*Qh834lu%Y~M2-(ZOshYT=M{r>y!51)PMO5LVyN1DAW z?$<_MkD|9e|evowB^Fs58S8_8tuTni`nbPBH ze?tCQFKD#Jd_XNIC(@1_lurz84O`M^ToR=I`DIeVOn==U&%)tz1ecG|AK@qMLZXzh zEJ(*Cerv#o9`9;dqQh8)3>pYiw&Ji3klxxGLAgU78n-bZ!JRT6gRiGy+SCt?2Go>y z;HYP6Pln}4qn=xD$yd{`6JW>0TdmF8ZS=R%^ww5(@Ir)??@hO|J^IIOE0-Wl7E#Z)si*z0?KmQy& zaMD!Fl-=RkOP3@#afBJ?rNz(sAmtE@P?Dtfrl`y+t*R@4gNCAGq$lA++DdPF2P%{{ zKi}o8`$@lebmM2o(cyYp0c?LJFyIaclu^_JM3w#+w!h0?#gm39?bGo7w)Z(n%UGbd zp4HiIld8l|=qI39QC``5L|L*9_XNF7?4X&Q zjo}pcpqyHo8;%~}poHC(fz_wOaZWQ^=CryXpls1Wkn@ z^IZV?N09;$Am2XTDRFVwNC#Kph#Equ$$PY$BP6>ixeD9dFxpY3f* z1>X1WFXp_rG}`xme16*~E-;GcjKl`VVYxe1fv(xZ+v12jX^k{YrP1k|6}A5w4K;G4 z5DcsIg{~MfX@|uT6QAM?>y<0e;lGkaMe3~dr}27P6*)12B?YuR8@_yeL_;OiGY*!lBSx!TEboc$?U;Y07I(+q)zcswfi7akhEZ5>S3?^8zER1s;H+kyx 
z0><23MyPKO+qd2r79VrWr=#aa<(G=O2qi7jmo5Dzt4YhoQD+qW(kMZtqzmT0fS}MX zQ2cd-22bx{Fs2n=RlcqqCmjFeMl;fVlrEK=l2T-20>&@Q!4r&NK|ZD{&*F~L!~7|N z$}fL`jiFOsCHIP(WMsO#_8Jkmrz8~T!Jn3n&%nuzOMv*y<$8|l$}7H>(+E>?%(Kcj z)7B6(z5@@rP%E7%j49;e-V)7Ot;SvDV%e@|s^2&47bMjtBZ4I1AP0=E*tfyy_ueA9t-m@o<$9TVa>&|fP z>Q#k49AHiV<%^ete;b+q=5PG^@cRGw&xXa5=P2Wg!~Kni!|Lv_;nF2wSh~abL3sAU zF>zFFZ*k6W7jya$I7cmNN?GmorEndz<50sb{ls zn=o&PK8Rkp_re(m;mmIj2ykQbQEpV%ZDFvutimuml*-QEJRMc{Hdx*qX_Y=@0vNK_ z_Ng+Z4hwJm4im(qKMq=!JZ)-rpyd$zj?U(GZaeu7I4y8elc#}sZ&dNy5SRxYmvsPi zz-4R$1CnRH;NP9`O-EaRt4Dr2%jlsK*542wxlHJ4$@F*=`3VoKu4uHLskgCgI-VZCe>b#G zd~_I}w0sj7zojnAQ`M_U$n-_{E`RerxCl1C$ME{gOK{Q{^Eu&1K%uoWdp-)+UB}=3 zd?pwWaXOoA9c=w;+0SK*EG1p!NbIH4r-q$J_lIN5;9g>&X?C7HOM@3m-p#&WPOcsf z(UD~L#6-uOO&6~}%T+oNMHqx8<*AVL*A-EFi5@dh?T`5RQQabiC6NR0ZJAsH@TVmtvPpQyFVCGe$+eCIe>e{(~gu8@u##oN(sI4MdT{$ z!YEQ?mr+>on+M35lz~NpC`<7zEhGxORjc;|MA&$UL)H?WZ#-}@M(Y(hXqU9@4-=1*C-K);hww|>5;={dhoIyY8&>rfCV6E*T*s9V zq65W$C8LwBk|7^K(BTt22}@Z;zK|R@!@B&9N2|l;Lr?r+ze3V?GiM!-YOVJHm!)03 zc6E61#TTgT+4xGOzOmu?`>f5!_*-EQfa}g(3U%S)MK(Gw(ipHImRW&!-u_^?`rggq zCNos?)LT1DV;{yqQ?T;8i{d#-??j14V+<{Lv959LdJ2yW0q<144g7uXUO0Jbi3269 z>j@%E{j_dW4tD$=V{oaFYMC94S(g2X-@azQAj~wBne`9Hn9m%t6I z>ueIAoq0${ihP3qGt9~yvv&JsB+GGm<_~NNy zefiPwqu1UUKJntq$Pao*OXEfQE&m#_^6DDrbS1!1JXyC&7f;HRIy+CNL_Y*g)zfqq zfhkKnl;(BHk}z?t%D1+=lz%&op`Vuu1(U3@&N%tdU6(5Y0TQm&up$AF&t3 zTbRw)ASREpIoVIrmqzo=PK7^vc;>tNOsGafr{sqh!%4?{D}2&|SJF1$?MPeC6`vYC zwu$=NfmvW_oo)+16~UPbWo{g2)=bR36zN1FPwQf7OJ_5GfM5S%$jf``9a6L%va@YrkC;p|l$6AHO$SIJD*Hy|o_A@L2mgrMG~-d8 zqKHd;;U+1RfAXSIZ*5W}f!v{_5=<1)Nt<#=ntVYOOhA8cqpgGqEgo9@qzG(yGvvuG zI@{9&r0d8SIyfSWSO%kZN&G_AGvr9Gp8E4%lr?Pl2fv=psUL+O58jO@he6wc_B>06 z_3Ct)ob@p(7J4Ep(?BljN5s<*hBktv!(UG?D;Y$J-U*|wcGuGh*@X2xGCy>o7yP0# z?TKsoB6ZSOvel-N7-C5Soo)9oc){EbdyzJod0koK*k|hK(~Ad(lNU}6C)hJ~Z{^Og zy0XfkI49hZ#Mz_F<`VxsmOHI)c-%FEKk3Yy8rXTya;DC3Cn`LT7@P>A1MrMs_I&Fp zV`h*{_Y9Je5&|!KT;Zhv3bqKNSyliALAhd!VDQvoWpaF*bwXe00nil=c( zsM1kx`c}C2gcB$oNy_BR6*H8djf$D>WI 
z5TThWzKQd_fBTgBR^dQI*4zT`hHa;iJhxmN)dTdk%XosY;VXLe?aB505`|+HRS<Iz#DxEEpWbT=gZk7jv0*IYhj}TAimTzgy142#eo<9 z2`@Rh?w2AA80q8XEWwyx*UH-n*=`xdr}S8mEg#O1qyk7d1_x&{AL3m9GiT2aXU;!8 zoH_S2M>%qG&XJSsZSY)lmT_p6bSItEHqATlCI2V!UTBFzD!Y^zDR*TU6)5uWV~{dJPlxnZh@0e8|I8{ zpcjT!D!$c>5IzKirWTs&!te<%g-llWx=>q$?hA_!0-SuFgiT)dqT#FK?(seZ@DUzr z4v7$2owtFF?=u=Cjt-+As8jSueRKrZnTwykeKjMsM^M;JmiDL)8ccI+9G0WQ81%YSY28hHarOoqc`%i zu7LX84`@oMN8T)`l9Ig{lf zqujjoWj=~YS&D`Sg~oDg8PkvSFj%F>r+^xi4+ytIaqI-kwN5W`LevRnMUD-B@!Gqr z<9*D&k$c10W8B(|f~PZsLHhR9w^)|)4EsPH4mUXJ*7|ym(Pd>0xhbrUl5&N>%aM9uN&<;F1U zm;OD~Yy7KFB>I#7OyUse`y=?R4b0>b3Ds;*c zob}I21Z7Uz#wXc?yAr?y2%OPSw%tW{fcKxB(~>z~GOrV~3!GPV%TZc?tx+T&tA6wL zaSt~*eu7Qi<`w^38g`k^yqyT;QiDsykZBc<<~NC3_u6a5KlQ)q7D1uMyw3?y77W6= zJga>d=hOu+aXqcC-eu0^cEaeVe)&r7-cuLC zcU)v9kqxj{x&{V+d+||3`IfZuqMva5>wLz4^2Y?!Mx%$wV@Y8hpP1Yd}n zLiW6tR9$34Ih5y;jv)O%bzsaj;mieP$P(e^S|I30#CFFyuLlY?{VVs zMP266!+q)^0bhH%Ng&Cj9x<^lzb>alB|mELlVKZZ)X#Lxrg`_U5zo5Q!I(An^}PMY z>l_vSGxT?7_}oh`B;7|G7;qrvZxJ|WFtE$9XO3~Tfy+L(jxbPS`gTab?L9#rj!~^O z9^C;maV5Bs$0S6-J;`6c62=Jh$*&SaNYdcV!fYmIBpA*Ws#E!y{+uu6QUO}tN61g2 z!FXf;6qZOii4`r8aHX$D&%gxkW6QAoNZ86n+@8gMWa-Foe{E%W%o(~mGj5&6(zTm& zUh+dc3;Q12f&92X%FgU*GHk}M^{%+8@~F@>WM+M2RXj{+?fkZ+N*+bKbwbW=Mp^2j zuxVV7O^EB2jB&d>JAOPuM81aKHXWtM+l>4kxde&IMp?;6{elj7H->kas88ZZ zB|$>*uZ#$IC{T>VAyCk$i=UBg^j1m(*+}hWr#>}aCyw+oQW82$Ta4v`v*Ae`>FD7} zufD?%{>Cgmrq7Pc=R|LToTM^&8AB4^!uGdtrYBxqPy26<2E0)s2PIXboGy`}FlN+M zV@<HTN4H#a^nSb6Tu)f8<9Q=)EneZo#*|^Ws{Tb3BzOh4E zX@RWYmnb_^W)1zx+$z^P%4~058Al z*X!?u3)7r1{({A{6UWCCazs*M?G2mq)@>|#=y8g#Nn(w_`tEt#VZ_`0T8zj={|=+S zGSO9Alh+BHGL&@VbbLN_($w)xpTZWs4J`2rYvJjz9Z%shzxFz)G=l#>Iz9=q zL{tE62<5^hOO`JW9q{BvZ;SUN#!Y6gSGn>24hOLK^Kgwb9MEAs?d|W6n4Q?*J|YM8 zjmz%5k|pT5mWVozdYSTO-E14S!{Hz9*W6R9bi#a&Gh)p6hd3L1H0BXO1To=w4Ab9* z?e7q2W0Ulo^i0zZL`Fj(Hr^W5<{}X_jEXKZuY}&|*WdSwQ}Bt@)ALE-UcP{96dBL3 zDIT-7E0+0Hr!;xVk+*MdC80VWeI4N+;@otxC5~x@VCg7d1AQB(U+*n^A06_V^{Grn zSOGXa92yI=3G8v5Zbj8)QUD4Ls+!=1i}4tG#9aaXX`J!MoC1UB?9p=@>{DjtPC@(Ay{O4x_jkg~8 
zeQ4>}aN^|I;r!DVhUcEY!f5d+mMXEq^$)RZ{HE6Sq2Bu zq0N~Vr};x~X3L<{Fqk#}`OipfHU8>Im%IMySOL-XM6Yz<$P-qkfWL4>u;PN(EpM*H zqaK^VUn`7;rsEce>EKC6enj!+^p@5*Si>|Q6GgL7Y?9=Xy{$fXP^CD znxji_WZbls;lmQ>I(^zJ8e-u+j#m0R`ici~6QKNcoF<)--%1(Udm0VD0Jt%H$E9EY z3Detd!%1W@VVq_3;OJsTJAP!*O3gz*={fh#1Y)& zT~-3Ly2*5%EwL^P?=A<8{#4zS^z)}tg>00e9Wq^&qP)g`EZ^V*cI>B|I)4P!S7a_< zB_(q`K#<^)ETmfhfYX(%fVPbclA@h}GF7IegTLT?No-*{0fZU7JbWo9PSenJSSjeP z^HnhN*F$J$&;e&hbz{B*OFnhH9|~(I4EJaF7^Vf6jss$4f z(hyeKE6x5aK6~6wN2hE2G)$fM3B{ANJN-UQ2lX=9q@%v-46BZ{?^R!!*Y`MV;;lD+ zGF*G>bvg$R81!CDo$pGCHQI*__QvgECp@6zaEQG>&amCb-{YFg){&B#+)^Ean8`_} zms>>3W@M0vdVP)~VOfSc&ABi!mR=*W!xS^2J;ZnTD)cscvT6<#%*X#N1#>h)cnOao zHBpe7Cbf_coSAQfU;?2wf5zDL*qf-d`U{QSYT{c(5LKRg;#7RYdfc8u=QCx>q`+sv zTi~eIgX@f*I4ukxEdsLDl2)`NmX<+^hpYhXN(v}nVgu=*r5?3TyHl99zA|~ICbt> zjskqHElWB{N5NZG+d5y2LYEae+mK}xq%!M*E$}1f%FFL`E&zl&C_t>QENYq1c zTK7bSkW9*g`J;qc@?yLj*Y6Bp``Rh!V7VQ25*nBC)^ow^>Lb-MY1@zi0`WCz_cGS& zJv&PpY6o0{4tPF#)H%cHKx zFQWnvP>4qPrBT)^c*7xvK^Y0%PaNZS@%i2TI)350)9-`p*E2hwHmJj+Clwd=5q2H7 zTY07a*(q=q!F;jKv%}^{z4cvBr>EW12Vs&mzs-O1(Y-G-%m6tmD&Mw0^0kX@+9`2X z%-I4j3hYLdcSiLzZ1XnQ%W{A^(|8X2L>sNt)JG~ZYc%h07&~nSeN3<8H9WMcgwWbM z5Biy&Kk4@PV&j+#rhhQBIV-k{A$Evcx98HCIS7nRR5#o}T4@o_C2Tv;$9Qg~Y2rH4 z5UwOu<))Vb4Yck$|NW`|_}%Yd0iFDdwaedUF3aqx_>JuSc_CWoxAaiHk&`u2^ij%+ zx?XlfEq>GQag&Atm2#pO-)2m|$l15>TNp#5!TVgt+v8d86Hk0Cet+(Znoo7Br#I!P zQrjP|e|LEKklmgmh2OaND7Q|(aQd3zxbmL93nv{uOV6tl?zedSbv@Lt|AH*|Dt`a2 z@LF*y~DgDzmsu zz5WTcAGZ7HKMIqE_7kGeKsc2{GDnPNNGzoa7p~k0PxspMBp%=Up^`af@CY_(;34bv zJ`NSh|4Z4M{dTrx_kD+Bk;P)o_tqWz_Nc*X*^m?4vVZ`AU-FV5Z+XdM{saGA1_B#M z5CpaWBS-=RwiCs&rk2!F-+8EcBv~Z$`Fz)2=N$6ZZFO(*cZR*!UejKC9?rnR&69nc z?tISl`Nk;3Y|(>JwHIy@Po#hn)F+jZ73Ou4yPKBkw`QvIOVox#pz$OGp4EA z7UgL%*5=#0|=;Gac6wEKbHf-Oey~rsq@wuSa_LF&ePDetBPEQ^e-BjZ} z1rmj^UsQHl*rtzh-9+is_a=?_tl4uz9d*7Sh``|~@JMVwQ(UW)K34Bhw zyH=W$U*MQmH#Er~)BN}(t>K~eR!Bov6-e2E{F(P4+OB~5Z3LOVYE5|lg)xfqdDAvL zEeBpUIeOT@$d@O<)Tc`YT=L~#xk$`65&TX85r*$T%PXt02G0hKaBURn7U;$$td;3H 
zPK6cfrHr{4cl8#**7!#Jx)lqsIMrXh9Z4=#5P}llz^Szk|ESLCq|L)&U?|t;)^fljsaP6vv{0SRA z?a)5u<3KN2hIIAH)#>{mIy0edR(a`2_}yD6L! zUCZ;!b&CzoEPcwIV>_`WYq=W5uVqzMj4sC!kDG7ZRBq$jeP}Z4sF)29zgzLir=K!Q z@f+E_rv2q*4t*t_&htF#W#&$H{#`Kadho?p+d&RhJ66I%ne8|LCk zx((dM$=K2qS1nYY8ythcnV*jz*;5OXR}sdpqO#-)W!c5$v8o^((wyp~|y1YYfBu>Aa@{ ztGr1A^;>ljfeEicse(67cYZBdhobS-E}Eu5{d^fuY#qI}Znf{ru5DC*t2}We z%||+_`91eG3?9v+9Y=h2`mGma*LP1f`_XUycHTe#clP0Aqi zs^J2g^=ug}Hew(KW<#4sCxS{6&dh^?377e~AH4;W`ji(gUNAp##_-Z z+~vo0FqIp68v=%Qva|*WBA?wPtX@*6TY=->;>okW@GD^hywBu((bYZPf${wUV4woo z$b}fseQ%(|=2F3h$Lj@0pz`IygJriC61v>Bx*@1Qj*QSr*kC!5BmBEG9=2V2#8g^! z5{4m1BY;}~M=w_3#Fdf!EQHh=1o#hU;FpAri(U3ac=V{tpX4iza_&~H!X8;UVVmw* zIOI6Ad+&XJ`tXAvPY1W&pEj>tql4h66^*)lY|0JW(4tW;=@W9H4IM{^#UXz4M>*sW z2xnXZ^Q;QcOy128?W(ijoAzWHi<2%%M!S^H?yFU-{1%KGp-$wBcQ;4r%Oy&fF%uR$ zpZSu8`ccnzz*cTV+P0&Hx35yxDavbXRNfL;-iGiGA5BHoz5@L1bg zn~2d0CG?E&OI-oX<%+N)j(i8tXvCOjm95=Kq8YyZ`(E}^Z=!6yq%atjvY~}Xh=OuZ zz{;KkWaF~ySJ8EVXdtna!kK|T{KJ2nHUB?i6Q>o%3mRme>+QY=H&EN4xyA;1zvgRY zYe(Q3%bc#W=f;nmrGxF7e=A}b%c)azeqFBQ=n)oA9<&ke$Asv_;Fk@u+*5`O#$7__ zk`i&atmQSQNxgmVUK;&}hb|2|$+9Lz>ls&ZSwVZklsd8ewta2rLW6wA>%OO&ij@aRJ>1IXRc4;MCjRYE`W2)aU>5 z@Lcj)mRaUjCrG*PVCQLI5-|2{S(qE^2 zyX4Jxach_9B=CF<8}k@fL4~ud+hF&*Fe<@sdRP)wgNi3~X4zhiBRdJb49gC-ogK<) zVn|1QsMEH0?bD8{<4=2hXGaC6vBq-4bfgG3f7yZ3R%>G1v?v>@@1EE&FZ^!0mxkq* zdP#a27RuabhcrOdrIm{f984Lz5R7}!%s8XZL4SqO$2YpLc_^pa0qrVb-4+urr`!D;@Z*Tl}PJoV9G~HXIV>44S7HhCt#% zQ$xUMlja@oC4;u;U`mAxlfecb0KjC6-|&w-`ooGk(8-G%{yoi69-LsdEu(#>{R5vT z$5*D``TPH1`iZ5T6ZeV@9r#vFn-qE~;}j)1-EM`k>&O%F#TCN9sZ9BaleO_D zPr+pF^{10>{PP=mDTb9`V+LqO7uG| z6FP_o4R7gaDJOi1yJRRZwUz%NlVQa(MuGvp81Kf_2rZ1QtQ}Px0)zlwK%u|2u;-Iu zjSB$PVrFw0zfHk&D<1yRh#3f^xzH{C`h6YdbChM~v^14Ba0G9wOGe;Ldq`O9!u7%G zirD}v^;4E{oO3Mj7HRGAde#Zs9KU*SZI24XlZmuy)4@vFOe;P0UoC+8#<&q#av$By$ezdQoA5U87P%!aogq>8W#<_F1XC!2Vdmk?81@d^8wK8q702jd7mUPQY% z0zqnOhLsTzP@X-qvgw#ts>H9BSXB<>tN%|QPf@1Etmoxo}gz? 
z5mP=W3#yJ+u9lxkt87q$qHCy(Xu(XaWfq}&?NHtZ-l8*_dw`_@5jxl-6_HLFS;HGX zG%>=+8*nPs;>|GsDCjgdD}`5Vett=V_SG>wOWC3+YCge?knbtHIvxJ@TX(0g*_an~lNN^UfNz^w zFJ~r@E@ikga^`7>AuD>OjCASe34B?B)?z3%LVlrqx6|Qy{L*(uLA`h5O?VrG@@!tq z9y;hge)wei@VyU`KVNdlg|t8Y;^Fl4cys#k?K{)Xm(M7NIi`@tBQ**G1J9>#o@`C` zAJU1C#60UqI>Cq%`)6RKzt5z(Wf$osw>xPpz)K>Oy{I?skEQ~2oV5gw^50J48D(7K z63^Np!tA2+LETK8Fz_IHcum|@gbPcQpxZRWBd^X}y<$_PmA#FCD;zi>4mbX0Y3PvC z2D@@+&lzw%LT}NY87|7xT{lpU&J2R448np)2g)7G8TlrCOh4-;XO>JW>lVKc5r-V| zWxmu-)t|raK;eu)LFw;1`F>`fLb^GkC3owX4wpaxkRA zq;ASphqzR(e(P!0tMxU>V<*H8xsC5_I!oHQa#?Tea{$I3NBeHV!`WznN<&=r3yM+R zLos!ea#tBdY62%W(O6_jys?@gJdsAppXkP7k{2o&%MBSBb|I3PX33$H^3n}H-*w7o z+}>lLpEBy?BNfI#p>E4d``2^asYhow%C!oeeFyAfhlZ3*E-;ayUm>1v6)ESCM-kPx zYEx(W_n;tNT3O^!PLP+6adDO(YCq}8qbaT{9x6{n*MpY0s*|A|7Tm(0awmG?lPP!+ z2cE?D&w&wrU78%erAOrz8qy{0!YAw{Ul6xD0^%z4@*v)#l`8?wd(vabsr?n}+K}xF z?&wIHD$#9o>yIwz=e002M$ zNklX`5dm6m`V2Hw&rP-vCy zyE!#X1124Wva)gXOvdz7!3i_im*IR4OnY3qhnE2$rG}NyC~yT|3K3r=Xs%|QHr_Si z#4F~^TqcMGtckdP<%CU*ezCVQ&=DVGvWK@#BeiMF!hlD=xd9g%gSQq~7&YKO;cD#) zg|0?xuW-RhgptupTO!aJ`6YVcPzOBC7oQjWJU6T`zk-|fC~Zj=_V0`t+CoVu1@9?S zb2cWPunCa(I7kIZ2jMwKK$rMoI3Lk^bfE_He6z>+Z)%xYaBC> zbmWIAK++*2AmNP5ceuzcE%`NB;z2%89{u&&I3hy}!>EEwh_dvE=be|+8DB~E1T7nV ze4rs|zeT5Dn@-m`dqO-471TH{67UW#QOBQq+<1Xb%>W3COT|!plt*1D9>e2cc)k_- z3y9}vTB&kfc4+JpFF@dBS~cR5;Zn>IUV0w+10_axqMRe~k`h=ONu(f>uHQe1SL|rS z6$7(4F7pX*tuNk+QOl-s=8;M!Af*AaK*bY0y!0rsF*3uz-7;Y(0yaNrC!H^$#5HV+ zfs?o2eS7-L`yWrAeg4Jt)n{L2Q?Q$$Z~ovT^20XEhHg!t-sdaiG+g{S3VMZ(!1Jec zBEfm%ww-{(%rN+^2uC(w(SfiN;SwJkYw~DA&6zR3XJ8&OFFBg5&h9a@@Xd=eW?ybfKlk3bH~rb4{Ylo!r{ktR{3nlU2R(aW8Hp-l8Pc(FStD>G zzZ!?1e5qI-8NFs?3>mW7UIqY283-=rP#%}YIPzx#bo#oP+)l(>3y;=lTr|7!Zv z@4Yh}zWos$2Rrz11`h4VQ>(t_WT!M{O(F?<8~1wqu4|S3Hj+<(w=z~X&BMlP=K|{n zHv$*`mKQ#jENjdu&(6^yt-I*-QLNqnK%0~+QwuEcAIRTNzPtqA4cj2jZVB@`2k)L38 z1YUDwYT-=8z+9-?jEHNxOjVgx{J+r|U@~ajr7R6l0qjMp-l4jJ8+ZUo zfH02SWRxkM#xOB*8ZTf|F<%Qcv=rluqU4}(dCznhg^2}y_dL@oRab-A#89r4 zD?P0|@+fWLr005OJ@KPK(*ULiD($dr7ZiB5U8b==0*)9})j#2*!1Kk%CSBN3 
zdGjuStWelzNso--b>fJ80?ViSL8qT~GWE{vzr6eA#{!Xa5R zQvJKX^&4!+Kb+n_{~o6d-N;g#Ylqh}Kyh%5FNk83|M!3V$tSziOdBjN? zj~{c0gG+NBrtxDV&I;3`r>#RBlrfL`bJWp2KsDk!HZRYWtC<9r;ISL<5q{{<_%4e% zVM&MjxO3;N(DVBXp4D$&%Qq8X%>7H|arvmqTN_dzrgrJs^fbMF_n+4^7{dfMpl*e+ zL2Ki(%NccQ1E3m3;KvS4N0-o;e3_X+1{;6}bR-_I>HPkSThrhC`OmXq+&qHOLuP}o zcE98Z-Dg!F8#zus`#G$YeU8xeOooDX zY;&Qx4nsge;)H8G^u7JUGkVG$=)N1#BO5coLVVL+-I^|z6_rlV;JpU#7;Z70(P8BW zM_PjrSkehujpMS?bR-B=ESGR-AEu4C)r&tnAvy<0n&-aT;C9)Nokwj`m>wBdPwH%s zj*rKyx^z#wROhC*wA#L;nOSt~OMN$O>DbnCz)gMBiFu1GE=ATECagbkR-VBgDR48I z4-YQM+CMnVvMj&!zQ>WuIy+^r`ITLBM@OMgN5nh}YDEE;Gs-$4M_NNK!4@k4q%0tP z3a;?TPw`geWB37!e!Awn2q(p2}bS4Zdd%iL9UA z{=vXdB0D6s&_4X!=e7Pl5B`i=Y@_C8q;fg5{dH!Id-yFMrxnr=Ojq zL;%wA9?J;Nv*(N}s1h%6;i2WJckFlMje=aRhBvG_*ZJPg@QXTkCIZ)qQ-VeNiDLx) zLKtslYsD*|PQMBOIW#lEKqXPx#0r+e$V$Y<7|$*>q3)po_P9a9@DqnRw^G`$vc;)> z6UMJvW|Uc8vSg^}u6Y(dVkR+6Pz%aR1Q3D03`su~5+xBwx{*x;adKVqHjY+kID(%l z6dhrcPZF8J^k8p)db-7j!J$>^K4*CXqYy{ioG9TK3y|`&`ejrb|EMqFz6@QfDPT2f zWj77ERv|?LByvP9BbL5|Ei08Lu-Z@|mVRa6Qo?~t9<#<$`9~f!qb%zK137p=CxdT& zTVMLciGxS^Oe4OqSJ=4Du<\BS-xZpySl0C5wZG*2aOf}j!F1ze@eNDw3Lx->+j zDOoG=ELE~WVw$xe=*{Iu+u+|ioUYyakaukg-sl+QLv%#X2LWwVUAKVF*M?!D0SuRic_z#u}wC##wH1XkLjExQ*f|> z{hYtP&nD1Mr~BXh)%4Zpe=&Xe<{BMxe84UR$jvX;oSQj7Q zRkB!*vf&R^&N^oa&C~mNHlFE|3ONl3e_&t`HG!Sis z<=YYJhVt*)HI`#g31p;|g1HO+PLF<$oynvMjkVezUT^HC~YmL#f+=!5){Z~llMxU3)M-cEkyRrPx8rynt@1`o^){mFy; z2`3G`O13PckK*nQ)c~=k0Ble{?pF}7HCGD2Z4y@zO|8Wl&-&U~ zH-r3nikbUU4w#9sY`S@adlHwnOUmA&EtX1mZNRb$JFV5?(TT`XZW!141>BqDBlXp7 zih?PAn=33Sk9Jhm=j?dN>yY7sEYV5D6JEaj^>QX*TUX6E9z#)_Xdv8 zk8TV@QfSG7y2}h}21gg$4Lmm4{3RR4vv3BlI7ZsZPX;Z>izyR^eD3AC>{^Zmw;EQG z;z?W?yoqmAPSO#^U)r(2YB9nKYx`22`W)YSsV0hk`Poh=xY$|4+xQ)>`SDd;zNd5> zc85CRa+e0--F}ir*PkI5TOk5)4&>;`4#t_Btx_6Sx%KOw2 zJ=XWWdhv9+##&y_ov#L!464suDcSKZhA4uD^BqmZzYC%&Wn+jNYa39zGbe(W9&KMnkpGx)#aYr7v$ zUw`@W^y0~<)AJ_}82xp*lO4!ux@tMahCi$-UvtlJcK9=cJ4+%9@^{H409qr1oN^-1 zSD$`NXHK5fr)f7DlXlw4!_Kt+1ki$pZ;H2P&N2P~>@&IIjY~-(-Ap4raLqayYZ&J{8fHK6Yyh0Ouw!qth|-2%f;{vUc)^SRh;j>_f9Hi 
zKY-_`|0x^v)aMjZ7Lkt1_bB7RA?C=&o(6i7IJ{-UH2ZY6DS?jVaT<-Hok zJfhN}Tb@rmZ3LP2Xym@f$s}Jtd_^bhK5~*{EPTw*{Mo>}&sVMeYWEGkc74G~Fg#!I zUA<`g0u?S}3Nu530hl%_!WcSrsZ&<<|7`~(|E-TXExrvHpAIhyT#0ML zROeoqJvtJ8yQp+qVGiC1=TBm)M%%G`sPw_^3EX59-p#@uS6Tx^+7C2hOGQgLt$38H z(^j7B)X9gc^j-X>W8?3ANXZKvpplQcVSmTi{xKsJ31JF42Qz6c@2*Vv4vLhxB@@)tG$;)Q0H}Uu!_J~9IX*^5elC&e&efE>#@gkn@a?MX*LSu<%9R1sVo8PscFxr}a z{!AO4u=SzX(%}-z(<_H z{3<;7!&u=rC}|T=0tAStWjzM3qJR6*c%_;_%yc5Ijb_&aZ+qOOT!D8>L(~j;-=x8~ z$+2wvj8tt>uxzkI!wjI(t4AKhiRZ|qBV;`-nHr8@<>2ZieCOd?mmyP&k^A5^aTi6b zqectS!4g08TEoFpawQ&*9{1{c{ui8^8$AN+XpjY0y$hL$iJ~^*1Dt<03QY`sg))x1 z$UD#8eP83S@HNDOO!#07AW+(!<>@f=1QI=R;H^vAg{5lDm#;A4AUK6-UJaMdx7;8+ z)4vQ5z8B9O_A2bSrk390xn~d4oRVQTe#(`uzEi*(NVS_BE$H?g3hpbrd`a}`;dF5O zN7IcvbQ*5muG?V({r@p zsd%P_RAqbflvx2Yd=$!XOg3v*!v^aOuk%F(fTMI46bVYY62`=P_^K~QKBI&1kkQhw zr{|CVczX8uOTHxekmEktS7V2pyx_U>ZdBdM2|Hj-vY-FRIUulZKF@T2m44UO01!-hugj2j22?Xe#rFAV2*)+P0l&wUzFyw7>VxBMwIJmj4*Cuf7v^ zoAuTk9GmyH5?mU%%%`iU`kjm+_2*AGfwfVxg-JGo3N`U8iZ zves7`%K*y#ce#R<4{V2YHIC_OBlXCACcN9Bj8p%EPg5Y6OEr1RvQE5h%qyxLf)Z09 z^{Y;j@tg^&Mx7^MQFiecUIYbBKWW=Alz#aYKgP46-~Jg$NV%YWlr5T8b`H;7!2k|B zBQE7oPHka_)6Dd*y(ug)f>zs_H87nV3@0Kr?X7Xcr z*$k2Df0m=+6H(O4#I0n|!)Ls?7xs9~FIhbjFRg(;iwYqlT*WU06?^zp3=zk}%LpHO z9aR4A+Y&W$Cp}YU6~5Mqh~D(9occQm7*>EQT@^>gV(YKi={O1Vf=eB!*t{7BAbqe5fK+Y=X@O#lz(* zi5o|7L&#JbP>k&EGJSiM-)lCm?wm2A#^z$r;)isIirIz~xt;hZfwSse*KH$m-!BZ) ztT;KZ^%v|dh(i#rQp!?H{_sE)rnY%lrUkyE1mim-hvMa%qdD49Hk5u zwN34BhK)gKBK!xEF#UwrwE{smUE|!{7mN@;0r4>~PN8jbf5vGmS6J`A?`Gl5#=r)o zlu|5G4gL6_A#jIR=*w^tmwP0l!AE%uybJ+L0dH~13g^Jed;d~B_dB6+1cx3Bhkh|a zV}6(h|~E1|aBuzZa)(H2v`d9)o zLb{EukPcg;0z74(fpROCcTX1C?YUzgw+DpTf z-3`w!*|QPELFw6i=VsOC5C6aE@%=AoFn+?O*st(^LSv6kgGUaQLDEekPH; zHk?_{DoWk$Y;Zb{X$Oy6juxtCU{$+uVin53D;AG!ky^-mYKh2q2>fP%oukClu8kj9 zuH=m@HG@{Ay&=Par&uaGYVVkAFd#xHIp(;P5||k3hF{qOBnn9$9BpxFfNp~{KW7uA z4d!%|O+WeLXOzMXufCfxaf+6E@Z&#ld*W6a%&=eUqAQ?e(oRh6m24o%`$ezt?h>R1 z6^qugcD^a!;&=wI?Z5m3MFD;?-FlY~5$^M$x34~)HuKRQY?gd*#0f(-a<+3o0B3rG 
zbDM9@Y;CeMg>sc9G=mjKX-KKl2Tz_(pMCXJz99YXd+$wGIniv|U^&$5n>4cED(CNCltE)V4Sz!O=U*uT%791zU}v>Hs94z`J!$bu208fjNSPmGp^W zbW)PP)jQ>ae>P@y?lp+DkZ+RriF4>)5ahbDb1?n4|J{EzJ>o0eC(NE$LLQk%kn5Dp zL9u{roHR9g3@m$)h?6o^NuX=8+7WeDeeITXl<%@**XaBG3_IRgHUVy9naVC_vq9^( z62#k$L~+);WmV^gx_6ke5&=Bib?C8JlR4hi$#C+e3x0t{kp7vOq7!k(T7HjtbvXpt z9^IuD&PcoYDVvi~Oz}v1fkvI!$akZq@;d*y!3R9u2f%+=Fs>i`RA4**X2g)^ZV_M} z(O+9q7Qaq?{z_d~hLs-AIDP4i^!1w$J%i@vf0xXupV}*FQ(75IXMNYNB#~EV6o|*H ziJcK=P&|&>JszsaG(DyKb6KmrDVuoQ4^rvLiUQ($fQcRCD;#c+bfVrpKLxOHDZj?xHTgWPp!kcW>_i->zjW zho*Y4oHMLk(-pC};|ND7dDpK_5vow@9P^rblfNFwO}!-_z~|(o@QY!`N1n(VyqQ*+ zO^QE0@Kcmc$L317l0{o+&LWpM^c$d-=iNB z*|^eO$6WmCN&otJtp+#TqdN1*nt#(d8pU>so>9#Qq>Xwo63(=@tkVutjwSu%FTvr_ zC)=mBKb4mbERfiDa1y6Jz>Ch+J9y2DMG1NFE7IkY>!95HMIY{P;6dm5wUY3!5S&XbL{E z8kY#+-A2|3YWWfdv3Q4ONaYg0p#b^q47otmQ-MflptoSV477e%IC-K_-8|>xT8`Gp zo5!X);>JFPE!KeVU%kPG>$|K8e}B4h^F5AueT(Hveu>sjB$+e?7<_31G=MvOBxT_i zZlaIY?j-@Eh5l%-^Vr>G1lourrR^v;lSUjAatf zzGV5wFBrZ3WP0}Wb50$3LSv530=U$?aA_D*o#*5RY$mKJ5sXE@K8)wKYkAOfGiUT# zaDWkh%U4c)8g9fN>=%%6(;PtU$!zAQCGDDy1(IrLo9VllOD{#E;f1xr3HhNvpTkY= zrk8oW;#EM9#UlqQ%t64>fY_jev_s+VjGhH|j*ny?RYrMIQJCK4}?|s1R(~}&Hy1{G`W3u^^ zH}krq$mYM3FTTu@6u%GAVLfuFE?zK$@r#c?p1uO^T{d68|H1nl4f{64X&}MW7P!?P zsAwqAkY;4qb>@CI!ckx=HJ56@4C`ysV6)2uVIRA**Wtc<+pHC!RmgMFN$Z-xvgHx zTd=0xvh>podoFEiP5tVZzqI_>|M{0p0BaEL z+31vASnZ)4I^(*|^JztLNWXgHDs$6Iz(2?~yytjx*6k19hg0ckeRJ?FEw`xow5s(o zO+M1?exP}@4IDH0Mk(Liz?mK0D)0%etWLrQuIL1@9K7s)xo~F6r#j>on8*XPWiZ zc*{43XB@-rN)F_;67RU8-H9z{$2Z>65VmF3?_bfcd@&nlhPPhNd5Me*Z)d7bAjZXT zb~aWGf2Qe*XjPI`Xsrx@69ga(tb{3rrBq4> zS_4ENa6+3@*hxX>{IqR-j75mDZR;WLV2UUW^)r!+;mr-J1da8WdxKv)Auak^}?PdFV@$r9EPx|K0KsYCxE zf=N1wn=cUI=G%T?ri=Qw`z)B`(+;v#5Sn)E?Hu&A(*~_a(=tu?q_@tcK+qcANN|)L z6=r)jEdVm06}EX9ZYQ4_z|=q>=}!FE0>OQP1F23Tj&{shNMK;G@vHJN?$)=1TQ@o2 z$!e^FW(GTVd2(>bz}a&b4%iMD+&IwpT$)|FuMc$q;#-ikm-6_Qq--5RF~eth)6sdh z8J%XJ0FC4%8>T87UKU2u%Gl4Mb z_arVp``mK*^@p5E;PsR|2{;GMci;OYTc8+BBfOc`3N6Gw56jkYm!OTXh*P5KvcbQd{9 
z{^AQS@Ewg?v5`Do%E%vdTy)FR`|!36*q+dny5=NA`p3z0OP%Kl)VJCJ_gxC6^>1C2 zO*mcaYQ6rD56mt5U#H$2<6!ku%k!L{dF%amgMT9RghTk$_v3u<&+?m{1DYPxv@dPt zP~JTz;W`KD{`T08=94@u!P!sWOmmVqh<(B~Io{aCL|5mljolJhHm6>21BveBd1faT zY$KAUp8jAU`l@##aI#(2)99*WRW9R-7gDitkk`H@w766jfCSoZMf>PAGK{2#)1QAw zF?M3AKpn`_8t`0z5Vz!O*peGzIzh4G^tRe%Ql(g*hL#C!vDR(mhLL#XVpq#7_N~6W z^Yk@&(-pS$OsZ{WxV$VNVbW~pP9%uW@Md>g4BsLy6mV6(rgvQyUi|jWdy<=B49{@Y z7i|+0>#s1T&v@!+{mFuqWnvlebgZ24Rc}E8)HbCSs$iybCo4gVWOXq@jl$)-pk=_qY(?8o<%6#U z<{KbdNsZ}BtiA=f8JEgHT9sY#h)~e*EfgBUUkYYOyMDomxSfIdyKXd@kzBFJFaa7b zT{>}CV^VFmF;!MsMXVKn*xEE3uqP)&b(nxf(>c;VY;;mVfkiY+zR zkkfm5f~>H~*LXe>H3qWUR~P*gY(!v_QiGMFC0)uhZgRC)iBrGUvE&*x)6A7eaz?pE zKk>+KjMBv$3@OQQ&@m<+Bt4=((&EZlgVd&OrZ*09heERPZ8=vzVbFZ*8uD?oU=VWa z)@{NZV5N@c)s+C=C4At)VQ-Ak$(v{#yisrM_q@4CSKHOOgJbz7#9g)>z4YP>4B8m< z^IFbROeFHESO$adUw$7+d0LT`dFl!+Mmjmi5#y^#30pcLKa-pDyt(Q4$+K*+I$%=3 zUU$AdaSAzJdF~QV5?J;1@G}F*D_83Yf&Gm3aC?(Bm*P!c5&O*37n#7EgC?&hWt)Q993{H}?nfd|Qx%*XtY_f55J0;=tJwu74dRk_sQNkNRKUNDKNrW%*LUND#=A_mS z5t`?Kx$0ise>dB3K83e5o<7a@Hkfp(=Sl9t*4?Yu*?z}2J5Dm-Kf_a`r|e{EuWmZq${FhZm8-Y-s`|Cm zf5!w5u*1#UJkev@nq{wvALT%`?HM~z_re4FWw8^Qu_-)5-3T(F7=Xv$_|1Se#hH)o0eJTm;BZ2XIdGRY7%xri(L z9_=z`d&p`zTbIZ|`rK~K>wLv(7{ghN4-;c*S`1Zz?SVY59xjD0zyOY4@~&wzwbN)f z`VVeXw31jnrH9Do zU=(HN*YdUO`mM8YVB~6Ag{7q-Euy2&xSc`eG<3lXhC#vd*$*C-U9UBv&w!Yhly+Z2OmQ7rpC{+ z4!(6rf=}_Ww^O{L($)QwJaO=#tgH0C2GBDPPLMNTgAYSFR{S}ry%KRXBg~d~D+vf$uRWk?w7tK*k}+- z>09>QA8+NIwkB=*rtSspUwTDfS;!7;!#d-etGZMJPqhwX5$dNUq)#;G06~*aN zwO3zUE)bo>UU9x^(xq5^8S6|h(?9BgwiRs<1eraf> zMN4Bp{phsZS$qgZ`AOWn1kFU~NN`4f)go`*L51WBqHmY%32(>ut4% z5sjB8FHx$-Qi`1I?G{=}p;VY{Y}ND}8^hSLe0uT4s69yg`YRcM4g)2pbuz zV=aKb%3xbPTT=_=J2;h1 z-F%9q@~^aq9M(}AOEtEUyUIB5EaY^#R)CVwebF-TFE=n^x<{;{#75B2M#WPe)u4pY zE=m#?dk&>YP2L#1H^^hY@(tS3ZyxFRkY}a6298o_9B0K34;jFqOAq4IE@*?Uu%(AI zI$VTTzUC;?>SGlM`TlR69i3-#5H0o-(qH8kZZ(D~)ir}a!$>D9a0YNa5mPpubs^9r zp44IUw*65VHK9;5hmiW*OY>`>-S}2NB)zpu;lf|VQ=f+1!)|Ln*>!X2eCd?k2|d}( z#uQ?Xk8*^0lJ)ZwZ18Tsi5su9&7b9Sg>hH!#?_zc8E)97JEt{X({I>aI>sAjH}6UG 
zc3~P{|4Hr$U-bjX1Eybk^()KUZ|^PFZro(RbCP}kH<#-Sa_+FA|BwNab=!}l9ozem z0h0rimXl6@?AP^7f+PHsjM#F+p(OGOZnia@qHpwOk_k6AU?dz zyCfS1=6R!0-6yU4oBV{JUZqxIRVn)Yj$ixcHb+KujR9Sby&Md z%lNE!WEr}a9NMt@8xB^NqGn|9kbq%p1HVBqb*3()~s52ojFkDNb0!{!zsS|-n_+BZ+!8OFN4TE+G^T;R|p zgGd0vbHZ)7rj>XI6}4SH=35p+YiwC?N%vw$7MLyF`0S+FbR*Bm?6CCX2Atv6xnT;R zt}T1#&3=a)emCCfPwId>Z}yv4%hPRphS?3zJR7fJI?r_N4L!fZqosR%@DbPNYkxJeiPsX1H?nlRY*Svw4v|7{99lD25%O6 zX}Qa}iw{2hum<~Vaq{~>)}ixEAin?o?=Qdg-S6_YBR)` zzMNz5f5;Pp+dM7!;N6e&s@8Wty1}G}SF_TeJ;RFg3om?;Cnx8seM0XhgUW0R;1!#0 zA$jQGRkonmFUa@6?+l@*TM!TVSe=Uw2MqpdC7BGG>z#WH9(lFv`yU@HAAI^~`SH)* zU0#3l9VQ+;aRG0icJEqEin)8#p*nXXW-0Dk+|zCoWt{e?Cr z#T7k6p_>v(hDodIayDTO8&BC6F{4NFck4?4c3@0veO$qpeq;$v3y!*_xwv@7Lf=7} zW&XRr^F1bC@8v9lZ|2IqIORO#qE6VN`yOjOFx*czAxr&kZ2t zo4XV0jfDNBizCgb2EY~9)|IXe+Fl!OO+q)^)}h)-O~kF|j!|mjAZ*)TI&PnvUDI;w zq|SKTuw`f%$7Bx7U1+wBWMDbfW%2y+_0OS&Z>@7@ii9Z}`XSqes!Q8nPX;n>p>U*a zzA_Wbge-z_(Wd=$q3tW2G~{czY3kb-mh9aItz$(EP@EUu($;3w_!iFW20S?$(wq`3 z$_KEjPq())Bw|O)|G-gy3#nj><|t~@l2CcgJCAAU6z9~!gSP45MJC0q3<9h)aK%og zQqqOBGR;kt4Dq?C)SvKjDnhkY%dj%zMGCkj+k}D@xAD^0hIQ@Kbj2~6C*JgpbU_%F zN7=@}$b(n(6P7eZ^Nk?^;95@IOo!7pM;o*nX7eo@f8kZ?CDRV1)61$Gj-kQ+XyQ;n zcwMesDp`!9PM-N;73J0THCP|%&7><5=&|onGJ_8~0GIqKpvYLDNE90INn_)2tab6> zogB8)rX}sL1q~;&@C=x`j;M(LWh*gqSPvOXKU0qCSmh_pR-~j!dypfDZDVt~!5W@n zo)jr5-swEH}L9I3OrUvI=%Hqb6|>tdHqYNgpTA4v=qdwpA&C$ z%nmJmBFf$Rm~VLHZGw)pdeU~3*bHZQrZwj^{q`kk{%(9uH_zWp(|prr+NQPZyvBVJ zoOw@jGpu2M*ICtn_k(xXv&~048JOR{Pn~5Te2!IjUYx|c?YU_;&4C>CXrRBNh(xqov+dsoSV}Citmy7Q|@Vo=%%Z<&qQTai+W9W5^tvCC} z7;s{fE4=aP{SU7%SFbX=BJI|}!{yDlukpqoNnkhAl#* zF}Y*F(_5F#b5iR>!75V&k61cq9_-JxVcXWhQoX{|sYp#Q+-CRtKllOlm$MiQ#(bjY zRw5^do{M>vN$!U~x*WZoNMt7gG^IspWPns<;~AIwCWSXY+k=w_7crl?c!t*k^Q|l7 z_4U7Zc)H+IDJR*UF}ltZz8)O;6v7F%_Onb(w{H9PGHqoW*kd8D%caX9cGn>3@4FN> zKQd}_|1$O{+oocc$p}`t!W=|s(SF-mZ>vnJ4wydkp2OyKy)eDYp#3?oCPP(D!me~+ z-Qk%A2G2N75s~o!SJ&Fd)tm^31qT$`7y<~ok>zKz}rX=vcKpTs9#(BHAa+Fl0z z6;^tsbq4;S>!j8Gg}nS1Pf>dDLB6xI9CV$Y{?h~|x0y^#ZKm3eeVzS-c$O90?`^b% 
zZNp@6aSY-1M<-q03yMeVqkI9e_7O(@G(-nQ#JaSXQ((&?~yeCq`P z;U&u|C6mk5O2aUKf=P%*8(|}8Lxw~#DzGi%^FpCf1@$ZJUtD&2Qpwf)^Ayl%@ zas*(=A#T%BU+Vk7?*=0`=~L+GfF@p+la)6G1>^9Guu72*GWrqLDMQIol`)C)#eCw* z)RkxUWuJy6PLZ6kx3FcwM+|)$m75e#CBm1Eq4XmkXla!^8)`!85y7#uKG3`D)S#L< zR3Li74-8Vrec~1~1wp)`k*6~0=!=ppN*C-zSRN%$byjubFsxW~{^R@mEEm1#Uy6=2H@AydgS_Hj6H+Azo6qF&IE_t~w#%;y)9~#p zv9#v+ro{-!c2ZAW9NhfquisdH{p(+4PGrWz){rnCB*^`r-kJ~`}&Vm+mh0?7$4%|JwoDY6E zfsl3}Dpc*lGzNtnh`#c{`>go$#v8Y)u*JYOeUjfZPdO0d`wu>fKY%VG`bT2o^mYbDBlwd;_ohh`ak?nOpN!J?|$bS%NLnA98l*@z{f3C zPGB6=TL+cjmtxeauwOi9V4s0g4X}NhlEd=!QBIWoo{C$VELY3mE{Cz5IM^2K`#pnm z?b`Lc4a_1E-wCrX81QhvgG1@~#7dRcts(XQ_y((C$CKJAVPh!ZUHER)Vxh}w=;SUuH764wgs=UUVR7x>UDQ^F*cQ9C9G#}GEYdTqGu}@s zk}wLoWNdk~SB2><0Gleq8tfXA1O!wz0mn>mXjVL8JWZ_dMoNjm48x`X3VOwn1>|DOW;egSwI%Ta?~|N)rtP9&IdvP zO^grZ5YW2eYOD+TXU;rLtvtzL;|m;qU<=4G-#)`5du|BaMwmh{kQ)QTLnh(P)0>6# z9WdId?=d((&QpPWM}pT{Vis&bTX@KMgdU(!j`mZo5FcZ*?Gp$GfF8cpCep4CE*2NX zw$|vbUH&BJ8qTr}E$+qyyHwN z4m<$P#O0B1jPYeu(0<4Y`HfqAdw?$Ov`-z#c*v6ozw1zM0y@T_?-!OYfAt&7kAL#Z z+P1{Giv#NYBY8dKuD%Gp3-Np;?h6gif_r<3*$65oO3j=H!aT<5^YTlxf$cyjO(c z)%P9ct^tx*qs}U_leWcXfDEm9u_QLs#!9REIbhf=& zZufcO;iN#i52z6K-%b=Ra8Ac^a1!7o;VctmTb^%!^2vu!8*cbPM$+z4m;G3wlZ9Js z+48A`15Nt_2h`HZBm*Aak9;MvlZiv?6Wi;gTU}inA0Gp#6Imm#Y+_OpHHcw<-A{tk zWu^W2C%*$mOzC=3_7z@dJ9P55m1UB-6wVwoyz!BzaqG4|#g1;$QP%dWoojJSpuer#(#THf8vb{z9IyvzP-}+a`^;c>SyU z{>Hh}y~kMxrR`5#Fp&q3Hrk6DWF%dl_WixE$FtS#%pM%ZR7nTP2GkBR{Exn_!H!;TEkUS2CMN(ru{{AlZRI z@-_f%<4w^UcVNmRjjHA&#CdB~_!cdVnL>m|3_fIN%B66}$^dCVmcM|4nBqX93$fuM z2bW40vI=L_+oZUvB*nbps{%?}5iQK7ACU`jR4$+t92Z$+Y$#vg+$@4*s*~;N$ZLlT1k?R0+imuyZWMy z-BM)vnY+>=-Ad|X=X$Lk^r^T_FZBTU*&fnf-DEq2S>gH-t#IP5bZpe59pmPbFR9Zo zXnANprgS2rKh$g zC2)3FRW1eBygOeSI@3i6x1_WUeXXdeYhy3r1G^1$f>M%%Z^ipsa@m=cj$IxlCo&Sq ziss5+xs|EPQY#eM8NYO~Q(E=b2smxpudQrxQPzAfhtdjVZC05r~|Kiyj{uZ{ywkx zypNoFtbY68;?%x3)w}Ek=Ad;7UK_YtR{e{E#b+-*!vL1^7-!G%%2H&YZ$XF*1djt( z10x0qOp1I{)5AmLXGK~)4;=iem$Iu<`V#AO*G=+LhwN{$7H@v?!)*8O-R5x#GPqwq 
z1sCH74+N30wOb2n-*eSrSLSQ)dDn+(I{~*Mgcpnk2}`3~l|l6nrgWQ@Rp*)Ba?8(j zUDy8(8}FYmP9z>O!Tj-0-{9c>wcHM}odE>5@<`Js751|y=u^BE`|4F`WU}fY8-B_X zM?n7SF0L+v6Yl>92e$oNJOQ|o?=XD-IYwpE002M$Nkli;ziFRUn zhY!qM_~@eVz^k~nXg=P5fP+wNJ0?9< zkc0tZzmxO2p=M^`hp+XRl#QN1mH{<@jF1?52!Dz`rS;P9AB8vQmOf2Szr(`XMvB9z zCvZ@Daqx^^|uTLOc)b{D^u4`JRw`@+xo380C zi(#E`wwLm&zBjzE`>E3amN;UFHoQ?#F@gq7yc8;^aV1HWS7eigAAeP-BT;X-Q8r4Q zlu}T{>WCVy{2jhVWr<5p#|PgvJrG-LHPFzl2!eU|+nF74lb)tm*=!I}*eGu4W!uVWIqV#~0h-zx?b0*e z@DsHJ3sZqg*4_Nii_bI-``hI${*iR;GTf73&DtO;Om|x}+PXFh^w!1?N6$LrgdbF7 zW$|o$gla35ToFax31pD4Whcw%Bi=XPVz(`i=(Iv65Ghms75&U>(J*@}pLztk#T$9m z0YOVzZPK}$?;v}gaEsTt9WGtexNI=%Vh7XCbpNdTb7$G~gzK<0g?CbsKN5`VHZckD z-#F344qrOCnwiuMCUGIkd4qT4Fi5XFI=}tmQFt4_l5vHR*^U#o!PWr)r@CVCVzkDNGJT##p0s$>PEpJZcGHSl1kXNoAwJGZyM&y-<9Hh z2JGiK(0qoiL2gxQedS%Y9%A67oCd(EtEMz9GFrEx?Tzr>o#o??J`9dqrW`P4)oVH6 zCdovBo8SSfzk3Ym>{o0PImZE^S`*~vi=2I9%CMeC^{xtE;8mY)Y0)W14W4_2l3KBM z5?gT?7QVi@$GTQ<_{s;)W)p9&2tVv$dv}Gh%SK;VX{dv^{!G`0U~skTt=qSFdls(< zbucJRou--Kr6WBL7rVNr?AoIz5~75zQc0pNCO_s1?y=-TA-QF52>F_}%06Jsm(~$_ z@OI$!@y8#MOP$HM%DRZ|L$$*5zTohldd95q5jJW!4{og6O<(&OcCh4@x(7@$J>dTo z1NJ}q!#`ZU_O-7t3A~Vj&KqyM$#*@T<)eaEIEepndHszyI2eBrdp)GioIJ+4h}07% z0GU+ym39PVkd&2L?CTS{+iYFB$#$h%Jk>a)ZC&9T5DwbAtg9@lj4fx|zULvHdHMn} z@FFmRdt_^^#Vlv(WXF5nOBvvw^0^Xa4X$*Oo*S?0!_e*GF&aj3CZ9K z+$tOIL#a%GYFieHv2#Y&5vvOI}TupJKEq1Ry-Q4znK_+RN7uGBPu~4AulY4h+o#)c+oj>e=?RbF&KFO* z;^~C1WdWRTpIQ+Gr6%Iwa2N6*whL4bRH5v9)t41Wge5 z39m`%uk;6k@k(nQi<^z8pV@66TWvc=%(fmsNP)BEuf-M-0H-pY2u;SETGw z0;f1UjY!;n2IExOI2d+9LFW#j)z5*=(+tQjJ^c);%~yFh{uSVD-_`j~;*oaLZL*h~ z=6Yj3?z``i=dxe>z02!2Dd#$ntx>(9Xta{$RQh)N6@L6_13x5u$bE+1ug-X+)Q&Bu zIVW|WSF~zu5AOPnh%@KTNKkV8r0Kq0;tT@cdHZe7D13qKGPQNhGO``kN~FBTFMO?Y z+NRn<*{jM!y_9P%AJddWm(^3Bu)+I)x1^lBgq#dEbJ%e}$yfY>w_msJv$vna@#a;2 z_4fsHv<)?H^VG9#-!vOXe{tA7;`@QADQ)ibiKKm@Is?;Oux*vFq z&W?+!cw@SpL(&AK`J`))CmqV=HYnSle{qcjY*JUVLT}Ws5RnN#JzbZh;j*`(%_U7( z={6G4Ej$Rxh4Q$^3llm8C?P{E{c__g_s}VsT*TEzZW(hD<|=r;x3Upy(A8H@$>>`4 
zlB;Ctyft6FNz_!5O}=S~IauD>sW|OgSN-N=p5Mkv7e}}UxeeQJmVJlC=`mL71a+6p z2-A2GML->S_0h!YmwEs$tltHZuD@;zEno99-f|V6&6wWDd-5W@@?^prwq?0|Ssb{g zS0cgFepl-in8}gBwkWK8`)8hEopKr$&akqzeZ-T0^D2?>iyKRCXk{{if&V>baidY zrE|rkkw;X3TSN^bR@jmz{8W6!L&R17KxgnGzVMNQWWlOa5vD3hSDx5k8kWQO4e_he zXW$^7RLXV3>;YC+PH)EIKFLYD@t&0bq_kN^m5;i^r)8AxkMUH1(nbB1sa?}E-n0S( zT<1YvR=8Su==9)JhSZI2j;L+J+bW*qaCA-2#C(i?H}!ehW# z7In}XI@*f6^ha;K$$>`RfCCQ)c=wp>c#zc_OZFe*`>uDl8Yu_k!fZM6M#=7~l&_N= zw@&%W(cG-8Da6nDS94ht1K$Zx%dwa0$8U<||(<-FU`qE9UFmpRW@} zdPTGSWaZ)wyj}C}xN_gT3$VmA~+X6FN+o;^&NyFya(!mtk;i z6T*tkr6>q{)TN#Qt1Et04kIC+m)0u_GvPV12s^Nw2tA2(6+&H z?Qi@WcK{rbt87r(w7iK??$K6$DvX<9n^qPRM%`$cw!E^fbfAS3^~wrQDDhNBL2nsk~+%hXTojQbO|^=ex;zGAj4)^sgy zT6+nvmyuWTGHB7$@XV#@aOOF{XETqf7BsN|X)Zkk>Pw5@Zh-6^Kt8j_bwgPy-MPD*Y zZ(#Y~wLu9|bRt;LN4)u(#!s6+hClL@!Ahbn^y8jr$D{dK{VnJt*CTX}%)ycL~5?e7$^QscE<)vI+Q zZGfZ`Yq&gB8B#>A&@9L!@qw=uTI%37FY!u*$RgXK0{kva$A{nfO;l3O-ql1D#BMNnELP9)9x3h}@=|*36nm zd?3=GXAMb`J_7~aR_+ZWr4CoVYGnZ7q|AYut9#rS4mb?@(;xk4xxs+wYu+19ilT6e-X2e`TU`qgZ=BYBQMdEIh! zhe4SGpB$KE1v!IQlzvz>P7o26x!Cm&1(J(zowRqhOWJR?xI{Gdsv ze%^Swox)SO+r~=45I!&>2%CD4y3zf`1|Oo+MLH|3GRT^{MWwTqrLy+|j{N#&FQRcB zW+S|}eD61YgAc%cVfokp=HD*2s2{Ou>LzkB2$xPz#+5tu%>mxVJl|h1?$61VFxEvm zxpM!%|Jk22X*iXG%P;cH5bM0(J^Aau`N{I!GtVrqzVzzy;U|}=GxziHEcLanJ;g-B zH$utE$*7Z=6MUHNCQlx2+_=45W*ZboA*fHRHZ%Ku6nH-)UbS8qKyqc#7XN)K91Vd}CPK!Vf4uVsMvjb>lpa z!dS1R>tsRr!mUA=ZNy3LJi%>x0o=KC9z_AI%2ciz|Crk8;TYM(5f(5q`4w2VNx%v#4tXhiQHm6F9CsGq=Akc~xK4Nt`($p;7blhlb(KY(ts_*g?uVR| zM~0*irf}>w-FfL>;pd`<6O5`4Lte!lIyWzp;*_c3^_|3tTV-JVt|t`@uiv^gO?$-5 zwqgdQ*Qr>A>bgdR(p6=X!R9bYI-ik((7If?=*MiJ96_Vg9iV~FCA~d7MoNq?PvS*& z0u4|V+R9}xD6gp$PVur5882#&>#@=Aknqx5VFy_~sYK+Ch*YEvJ;4=n$*>Wc?P8U! 
zN;Zg+89RVv;jMBE>TB9WWENo!Pq-_N!umgGZ|R%pD?Ul)$AA&N8do5T45SRSjwP@8 zGyNy>kS?xuOKhYiV+t3f!2CvJV3X9jW_D*F1=IPF+>Z{`qoIXpC#z_n9tkLa2MfdG zJjobxSz=X7S1Kbw+0j9?UY)!8qs(HTHIV@6vC~L2p~82tDwhEXWev1P)WmIplq9)9*$p9YJHBXi&lxf4tnFK4~kz9_@kZJJhXZd5*7_Ur1fE zZXMtEtch)C$wv|Iad7*939EH6lW64gkgiYXtYd9P@% zGfcWqbBNowfLX_FOVtKzfN%Rx-Ld6{4)p{6wqI!uvzDEaF;9zl)$o1ZR;FxK534>= z%RvxExj7I{TXCCTwt$9K4W!AIYxqF8XnO@FdHN$&nS7U?(QeEko zZ8Isrp-Z)qF}ENe{Io0K!iB)>OdGPozw6lyVR*(g90`Tz$A9GZqnjDjS`LGNwn|$R z2-o(MPnBzF*XdGzysea6bNXk=SzALjihPGXBP0DO{xFrfn^MT4*^iVN;#S{Gdt!oV zdyr>-qaF=|m(75;T%?jo6m>;;^HZ91n&u0)=~#;{)-PavVTJWKuv=NIbOybmQdend z^OiAo1kOoxFCb(ArG!gaba|LW&&y1D2o?DWTT~sE!DcB1^=A5bfsi;Rw1t^h5~jjg z0X3Wu<&ZC~#%Ed%ik(P~T}15}PDbPbAY$0bND5PxA=|LF#L%#yHL`TX&#gMJgKK#a z$chrOl{O+nyvS7gOB#MQrVVnHD~2ApM=Npx@9Hl7(tRKeuXzOvq*`H3OaUT;u_JIJ z+q6!PtRueKHMR+I_as+EUZIHQC0blb$?mOjbM+ zg4d0|Q5*aJQsA3U)PU4)v5mEU%Y{|DN>`dfrJn;Ml>!qWaKI;A$8<3 z%Tzi^EtHSIsZ+Z&wh%_z{f2G0 zU0<3r-Q9S@&oHK0&JDYprwpb&$+diqqn{3Z7}RppwZ$5~X?1!5ZG0?)U07+i9C`Z@ zJPJnHs=lO8aR8v>uG=)cfv;|&lYHJd1YsyhrOVhG^Z_(DB>ITglit0t{MBFm`EvQ{ z2g|pA_dj0F@j6x(L|M`28wGqj;htM2=;KZ?DLBgj?ZF*hfy#u&)!gon^2RaZ4jw#| zZnLoc*~j<>qWf-1VPJOk(@&O5FTF@bEP-lX1T*xhm3!?FktINY0S)-R^~-NP96$bG zx%$a_%N_$CdE8@yaFZt{w^$Co&l3hjUmmgQzlW^%;JL>K&Gru&aC&f+_Ln#Q*!X;E zVqXv$hkYO3%PQDe4^4A;+JQ~qr~r(N?m@rG+l1U=uq@C|4-Q#rKhF0ylpXuj zSu*)q9$ZsDa|g@dh>Xfx>r|zrT@bA85;7S)3Ce%n+)c3CkTAsfRlg#yu7}AQm${Cs z>kLzMeN+M40X%ABK_1zmcZ!s{hiGZ(4)y57^e6rAkEI$NTeG7vP z46U3_cIDxd1GnEjWMG~)!p8!wpPzj4DHDWa%d4-wuv~iP>FDz5^{c#9iMNq)xY~*4 zJ?Jn`WROjtjg3#8_KAdN9h_YAMQhrDX7<}AzJ2WM8D34ySKd7r;m7Mdr*l9Zal55k z>K-~+A7hHsXO8o<>&)`CS6*Je^6E?ZxZ{1!RvZJzfkU3QK|E!P4o*%j4xM)rb!>aa zM_0>`l1QF9+8&n&cawfdTX}GJm*OOiv55DfR21tNaQE(BV}i>a*s`>aYR8l-`TRZ1 z;d0J{c%MpMp*!rcPja#(y+~jf=lKPNcH)N@r(7AkA*-%+==xDLuyD+G3#0hf6yKJa zp<;I&#F{R_IO(L~#&$CgC{GiQs()1{i+9P={I!AoLk*%OIrT|egzRd=h0)^*;e7on zAhBEPr*PG`6};uA6HgfNbluk!Ucs8~;>sSz)_y6&W7KDb(1BVex4jE%StKXE(b}0` zJWAijH4WSPC%xFZ^eqZCJ}dpTf$$}}`K`xF6B-^W2QWXko~92;xhtLWA*Xc$nAV}w 
z%(t$)2xWdIPqeq<%R&G*3t^kDzWsqCtGsAWMFOlc!!>fpXWzWt{KUO04H|Ydo3jDI zLzWC^)Kh_+D1_t0R~+>(Hw-Lh5~ZL6i-gdq3|{4F(lD0#1Z?QD-Z+As%f@3%ix?$P z5~bg64J-HH9ifQ#z)_kFr&tqulc_YwHr$#q1s=OMH_8a&kyM$`x9wTIe znY35=LA1Ch>2JLVJL~d91Pgt~%}L_79d~JcR$TZw&#=Fy)4ptHxX()85pRt){7hrb zoB|D}KMz(WkjKihafJD-@QnYfg#R{qXIstT$9Y)7*qY99t{2rptaX(t4R1g?!?D50 zql=pybz#)S&=?gVHN?9M0q?E(_}ZseJy*Sa^=n^UKK#j#v-0ix@UOFN!@b)!0sENz z=*!%mgqRH2Jni89r{uE@3FD2RZ?5;;&(6ZnF}6d!`Q|T|FMspf;97^o1>5=+DhLlQ z|5F*YLeU^wm_B;{y&BN)de%eU%kE05ze84B9c!HCiGzz0_VYP}8rt^Xv^j`7*tD0< z)Sa+TF!GTo@=voD+OUoxPs-xB6ET)S+s+5~?z3`yj#srRJvT9KiU&+w>PZdX(m|zu zB%esAMa_hiW3Gb@)9GDD0IL=4*p!a}@&v^Dym0EPeU-H)8`x*uW6alvZBpqZKIE9a z1~YBrZx|d1beeuU?VvNw@>bqeqNR>O1eg&o>yJp=@i? ziE_&o4npV5ir@cm#EML{<%Rl~!SyDKJe*Lt+FC2FOadqu!%hZN;T&%uJI6L6U)_4~ z;?vo-cjLy5<*KhV2GoxwRWX90|MrYw;u(rp*5qI1=@2I^H;?{h{YtLm7@?_p%J*kv^e-Zbg^ z6?VUC;l}Rvy+Qp5Ly5`7wxy+N2XLta5VRmd z*BUnJng{0%o1r66^`!=p_Aj9v8<2ejxKJ#ycCq1ztf_0Dq1xXv^NKm%jRVI%ua?58uoVP<*F%iws}$+))8&=%DF@q}@=E@&ZL z=)w)j61LAPJ)~Rm3b$L%vHgxdE#EGz__Hgzc21mdC7e%YwWD?w%?(gPM?rN#DS?Pdw1ldpHgIlSjUpO=5BaD} zDQ-oEZfKFkEPMsE@W`vCRsr8lUFK$;TWOv)&eF!4K%f0 zGUS9;F0u$8uR3+r6-AB_Z#R4>SA zxvd%2xd>(b8I;U#-YcE|j{+O-l68*HVcC6_9Ln}2*cP$#cH!pq+1{piUejp5)Ac{g zCJkf0H?t?XhB2?h?LP_Lw3(l}Sg#{qP?#pn@TE2oFIZu0gQSASh4_5)FDXK}!VP&JA0djg}rLA&jj z8*2tZOg0|zT262N@VLkqjC~zyYzTbh&w!z9>Nb{ZS8qg`#O@&)o{9s(y>+ z0fV?vn)pz6)l6gIx{NAb@)_=Y<;?aeVE1?|j(% z?N_F6a4zP=iCamt%>A%v>H+?Fz9I42D=#fy`SMqJa&bDZk^S?(_{-&IufMTeyK#%{ zeV^oe6L&cWvd0YmcYpJ@a+c>I+mI}SwrBbP%kmt<;J`e}WBs65}Mj z5KXVTqYk#uILnl>gX9sHA(L$;?bjXvz+IrD7>TK8+pe-DEbbBDBN6qIv=cWc4>dt| z5}~|`?}S+tBK&-ESaz#PSeJ!-V>5_#npiUL@|nx4(}WP-c<=}dS2W~de^7&Orsy4%=rGUip|WeA8-QS_vz&yp$84MEa9crkx_;Aczbk?`hE3Ps+!jtHNQMPfl5}M;x%mcU#KmctraQwWt?e)n z8@}Nhul?|bu9DSN#IoHWvFgw`o&Gpv<*0YrU@e{*Wexk*7T=RU+o|Pjpfx^Z?DAgo z2`KvtXGYsxTcn(;teTILUXcnEBDX8^y>2UuYCoQ zkCsn9{4ihWH87eTk32=DdV8Tw(pHq87!|Zi_)`%>T=W!$%k^i>XWw? zVN+MM%fb$H%62{A@oU<_w=_^l{OBvO;hJa1cYWKHFzvK?yQWw2Zv9rb#m*PL!_B{G z>UW-gxC25d3-SS+^KK2tglMn`? 
zPA084z8x_uW-{bYgJY_n^YkR-_YhPckU%i|s49{M?n8EC6wo{$q zYt*;=P#mxeebK5m|Z9X`ATj@zL403j2NJA6U? z-~ZK5mY=@y?((JAURu8Qo8Mmk^MCnImbc%2d--?&$6qYJ{P{1Ir};wmL%!Pm?8P(7 zmtT3V&JlSKV6vk+xge^o9rX7CLYIZGhWlq+=Xk_AcarLrm0QkkxaH4w>XS7pZR);H zP^6u)7kCw}x{xx{{@`en(1n-jIu~vIdiq23e!vz^s0sex!Q`2vMhFH52QjUQg*w=Zmm_4{Y~E{z%Xv+~7(irAx;3t6t~l(Ctouty5Gz}M@O zVoOv|f4ELmb;TlH2={J7+!5xu??>zmc zJ>v^E!_MK)wl`$~o_*vCFTcFJ$j?st27{~Xe9yt}FL>z3frY)FtKzcImCwo>16P~*$eOeDN%TJL%l&Ox8fbe)435A3?_gb`0B28ZMcmm98w1F~Lx z;RObPXENxjdOX)~2T^-GxnYZqTbV%6F1cyK-2p}>F6iqw8XR1D(_7niM!wZoo2I>W zU0&@ckxOj}b0wS)-!VY;{qXmh6!{v^ldOtM!@)#W^cbY1PK>^%>&NC7ua3*3;0FS6 z=_!NhWlf<9&N6Q&T)UzH%;3M`TONZ}^W2Tw6*_tI{J_;~yk7W|k9h0Rb)F1R2RRSn zDnG(dM;*{JfS?X(GU@rP5b0qUeB~}XDx3K}RWO{VH2G?}_B&kOdW(Vk1-?Yi;KoUx zXD;sYG~$(4Ut6AfitS1V*Qz$lgNn=*hWXFa2PZqy_k)CPUDDm`Ng!3f_5?@#Rv&>&$5DG2f`ZsdQV>@3DoUIozd&W}^ z^3qDkrw^c=x&c2j&@SZV(FkQc9JJFh5EjNse1s&v&l#U=<0PEz#+0=%4;dJfRCbw# zlI`q_t1n)!q|ezEF4?84o;tnAOF-Yn2p1jlWRX|<`h5~z<78V>P^+!jhSjgza{J;m z55wtPh8CtbTzChdcr>5Vq{`Rqh7(tQ?Gla-^cR$Y<)gFzMI-7B?Sa1B#fDj)4j0`j zv!-iW|D@FechhV`oj0fZ{fsJE+4>PlI%23MtR(^$1y1I5WQ(+Q`34PUvB zU+4U`N2hhX>2&(zRHm{Xyl0pdpTsV&&{_+=Q$@C1S}En9DVe>KFtVi=D!)RRwmvD3 zL8o<|xi(yQOg4QI?2Pw!39q)9%B$06{8B z!C=YutjpMdvUgv2`g1yD2fMC@dgFQH=dUj>fB8$H6@=82!mYU1(l6z0ke}V_JHL1% z2fT0JyUjq84~H>OcS{Q6>-2qvS-5pmv`-v^sO*ccwy#{ZO~QeSg9pAKeCG7>jcaTb+sBgDpH8ZtL{IATz%RsM-aJ$2eKc>K5hfPd+*lr z!H4fMiMYYN$EQQ}G(qZ4^c-MjQiZQg($A(0g|JUoo+{f6JYY~;%p3qC2W9DGmKP}T zMB#(wl^0*iA!@hOsM}>GwwHLy@F{O_I>v;_vP6*BSK`v>f!H;khG{?Rvri-thJ3b0 zF`4zrmV;1E_d87B`weV!tA}2ClH*sV9VQUUwN5r zO>WscmWgNUAT%Q+Owk$gGuaa^^%OsKts>IO!af0UmB+rMe}w@(@X}RT zU$U4bArtAVb04a=IH`NlVWDM@H!=iDCWPXk6hb27ti-DkDn_EIx~Q9mFU|@#1eFne z?CR)WFbi49Zr+guR}LCMdhSljf!5BSh9H0Ky@myk>lc zvtZO`w;;O^M2i)q16*+;C6$RZA14h45u>ECu9Pz`;bGsAE`41damJ{nQxds&6Ar(o zlMEx$(X*o9gKNAFBojp+ZvfFLysPnagOp6`CQLiiTBbEE0hkO013%$3T>ILvRw(V5 z82p61ddO2537DpCcs-FyCMl{D!zECYpYRNS2^4MxxI$Q&)h(~>)9C)oOb2|IZzX*8 zwe@e@woUUW%&J?>Bl-s~EdTzA%k*U-1TQ+Y1lyv0} 
zf-Qd;A9=XZv|IB!L0`Hk(lkxlFpQPk)vV#$e7zZLTJL`0fWB;JvZDaQ^wa zsk5TH&nu4(9(V>pTCNOPuKp}O2PB5dhVpg~^h0i1wW8>NKo+%7;6&uDx8KZ}3v19L z2IV~8FqEPWp>bL19%s+EtMS(#Ew|qPh_~HvXqA2BcW>Wi;Kph{_m6!C*N7ob25%m+ zrd`Gc$e1m&>N@&U>%46;lN|8;Li!1wQoQ`iE8Hu1@%hMcerM+t4U$U!&)>IGq zx$pn@$w$kTn^&2jTw;)Wg*>)Q0p#`^!?`yMUD96<{JLjIttLwP^7E!$e8&o^gBQ|u z-49oP?chOtUF939z4L~2;11^?5wUUB34fhaO{d?qy!iQ4hvLX!*a7iF&N^7m+R3t%cVLV|Tix|*@}=zPja+_n;e!vaF;P6f zoIiVlvo&XyJtmiasL*eiIGFDP+)h6IbCFA%IXRZDHu75(^7e^`Z)iHr_a06k@P;YA zx8kd5eZ_FjA@CGLMKfvjD|RUR(>LDY3)y#h#VfoSJfDARZ@KW&7opEYf(b}$lQL63 zZG}$09(b6u@=f2`zB1{`An&6A1TWT_(6zX3nDMl)IjYAe#aNmV5^6_jxLnfnxy8a-i(%nz#G4 zvXANYk@No@w!kl25*@Ta_oYIu#AeKIvi;PTx=< zi5r~TS+x!ARPg2(E$GCli#D*0njkMlYQNf|Y|YD+h~?EX^`eM&&p5|=gJ8x<2u|_{ zODAl`Ws^nt8nZ;lRp$Vvor!6z&SF`X+MEPv*4Uc*M0-rFa!~_j(HmKP4RFy_9O{^P zgxr!UflzjG;2$&&Xp=HiCKm;|UAN9sacd|2cQpVBwzK{RdobA;!%E}8(7#rG&EH6b zl=+ff^P7SGNEfDsYB*&oTzQn)ir+vK9sDRQPi-({YPzO~Tv-AcA9y2uXkK~KRXJKF z4L8SEngd=rdm%MM#vwp>U2;F>ZkqnyN(R0MM^BT!4# zr&tLRc?eW$^A%vgMG!D3T>GF1W*R?nrB%KmlmB1R-t$SbB)#stZEM=i^q3t#J8Lj& z0Ik6;MSu)KAp|J}L*Jy(zo`gGq!bL01VE7ph(Ul9u;c=3#?QL(rfqd~wa)LHJoo1N zc6IM8m{s@9%qJh8{#xWGZNPxl?#9KP^M#ep@sN=^n|y_jRc6_x(+u%4=S=I`X6s9g`QY^E;(UqPBEGvs%=F`V{*5o^;Xh zz%ma(zwzelzDjZC5neGXeLMqUoq~0NL)w&iE9>VPv}ix#VdZmrkXmniB6yLhjs|aT zeOQCX6A@}0=hd8?x#wzw7xO|w78@F*`Xg}~C?9|PiRH5H%LgaEf*ZPUSUbUUiT&cm zM|DL+XSubq^c$cghfN5JM;21NL4~ubY*d&ay&%QOQ&{~EciWu=l3Xv~>mR>f=Q)rw z&*|Wr@fW}N^77I9zfk(NpK*Biz4!eAxy+YY09FciBcC~n zjS8W+b+E8uvB<4ZY$mRpzT!;;4}^286pLbXus+P=5KF&zbz9V(qeqvkm(T0jj2phy z51!{}x3tr-{s0}>oZM|2LZ8+JeCu~`TOY-Z7m-R!A3+K11lCP-HVw!PZ(;{3f56Cl z(eHw+Dp(vN!)<1eP+b4J{wJP1@XiLB4+<8~3H?-N+8g?!DD~Pi&cQ@tLIg6(G;C#% zLK}#AkHSsPhApebLWe-e#i#!2w;W|YKqr2IRlYWfO!ms+mA3EM2W|hYD|CqOwmPc~ zR$^q3Lsihtq>^}N94&6^+jz7c^w(9K$^lMj%)*_rEx(l&MqIwg?u#biGk>F|e0;(| z;~b)(wQTFU`5zcpnEaYxw+H;CLjjr|ZkU(>O!LcKoxGyY_39#iGk;U2Z;+g>5sZ>G zO4Y7eC17qx&A^e4yS4%Zrn0gK-%55<;1C+`rxA!)!`aponorAd1FC#d9%$`J$sOzBpoL`p>6OU+zX{M 
zdYD-sdDE>zw{#Yo?*?$jmqi?#lw4}wbfC|2KXVsTiOV(a@(B%1e#g3s!FlSJUV2ey zg|Fa-wJp`JrDU)8AH&c?ZOFMD$WE5^)R4H|VjiP-8J@20*~9S-JCX3}Ye1;wTp zzwKr}9;l95Mh0Gl8P;@TP`HYNZs+9JzxTcGF28*D13h&7m%2TTs|z{M(Py~LFg#_A zaxj?tH8RJ7f7n%D zM4m-WpA#WXd*%fN=gvLiH!QL6;<=dXx;2mu1vbNzv5!p!y7RWDE0>=12H~TRKXz46 zm%S|-8XuB}scU|+t#w@f(1Ky0t>%G0$>5D`@edf=iHuo4{6J_tmyvG$H3qkStEj}=0v{Yg>t@y|C0ki} zfji59raH}hsuOg(S4}g&cOcfbb>9B_jVlDNS`GQkJF z(kbmi;wiM;G4xP079Q}dchbm=hi>|BBPs2~RxAuPsP?x!`l=(?M%j&&u#7zH(t+=` zw#A|-m-J0mX5(zank69w^CYQ*@wXcTJmy=oz*9g=bT!T>;}#HKS|qPO!W!=icNssk zy`<59-)``L?FT@)4NKZ=ZnF*esG!PlUcSN3@{246mtto?Yxyh_7`~nLq_Idq$vg=cp5X5E+ksel`*Ha| z_fR>Ljr(};tnBh5E>F_Fy+HBh%NLhVZhy9%*O}EjI+M!Vfi$4#P$9=?64aUdl*)Jo zCJUqTYbVe0E}lkzWk*x>!V`&V4U*$ zF`Nw=wi1AITU1_6nFmLq?E;taHe*cmIIr7{4#({&EJ*4X;*Lda3w7>{gUq~H==5?! zSBUu5!Ka_8c`Hgf9!F8Xuw zFuvwPg`CwqTyAOu`08t~>X`_A;7!jm+|ZSeOINPg7c}smdF`B(MgKMi_N*v;Szyr* zI9tlb_N=~U&D)u{`r)sotE#|*2ROvusPM7@%?(dJ`M7S>IlWxfRWRBSn_ZrL*$HlA z<1JIe_wcN*i3zixou{*kmg@0FQ+o1hQe(p>DZzvQkGRr#_+$Uvui@GD(ActttAeq> zXWr19r;V7$d59FMYkgXuln-%jBE7-KMlwt+N8FS{*(hxD^~3%0`=PwhWteBO`Ygbq zyB))Lp}x&VkK5+P_~0=RpZUO#u?cyWzjBY1>R)}T2lP!-MtUDs-C9PzzAWxZw3HRM z_{Afh19*hSbJ?sPaPkED7i00rILq0p@~kU%QwDbEh4_KMsJPRWM*1u zu!DnsM!##=$Gh)Jgb{|3C2DNbCkLG51Yq!JS@Y$w9~W58zj<&*S6Z6w={)lQ zd56zLo$8zbd&zx16qlD93-+StfTvuO78{GyKcHV5RE3;FGjE$Ciq%}DV;)jA-@>#w zq^*P#%~&gC+CT<><;RX9dhZgN!^7%uZA5N|{#o@vPo(aL&_q6yN9@2sZeYS)#sC07 z07*naRL(YVqXRZ{tbvMs7S@z>f=!Nj$3ZUD&l+4;&LK+ywoZ<`K&?*~BS*YGkNvA3 zARHlF5mLxjz?yJWq=BF?Wf3EDHyFADnsu5BX4>lXAY9sP@b~5I`r$tQ@=HysjN;7l zfmJqf(}nNT%)B1Nr!;e1zWXv}oTg90?W8&5G<{NBJg#%gZ`r`mVmg@TISQad}m@Td+3Z9#$^%#q;rgYEAt}P%r9)(*eHzs~qwkqpF;B zJzn0_t6smRH`S!ciDN5v;LUuH!9aFRXKVTL^c@{sjo?-x9&Wv@d!|{S-P1e%`Qovk zQ_$p(4%uXgxdd5bEe<--RY$i+;ePe2U-bgAvaxe+q4sekS>LS598GkM}MFQn>saN#9P z)AC~*@d@%rezhAz8iszeZWPV=ecemC4=+t(qz-*pK%k@YkLls|Z@%)?l!Q4sLNiSs2@?CFg&@2w&!Nbvfjok}g!TC_39y>^%P@csu?S%G4 zpiSM;SD)eO_M&*U$MU2fG}m?8-QnSRy=GPq*^AGCzGTi7GrmQ_Tg7OTcXfj1GhLr1MP4?&1$N?I)Ep 
zSet&8DSPAFQI{Y|FK*={?gp&1-9*ejEU(H*oXrSL!&eMr>S!^@8URO0rzJ-cJmF(| zp;8`w8nDfB1j18l2||%?1ZSgp$P@VVbrDDtek^i$)qUUmpyyt*2A#;TPlx=1+ZV|r zH#Sq3#fZOWe?ha_F?5Hvyjli_@Y{GbU!3MQjQeR?2Cmg2*3r63`S7*i8B@rEz1{88 zb_2eB42JNVFN_fh(q5(QSPg1%Of-I~N6X;Jyo0=L6EOspuytM{w;P`sQl-nmF-HRR z7Zstz0h_{x?D5_Bi*i)x9JhR?ZMg{4fHp^hQ~lBy`{tUTv@9cwuQp*GEBr6r!} z>5#1erZFajbhy|RTvye^LcYO0b?!*UMKqcTRXBa17HCM)Xy9#B{meL~9!7-M@RbUR zFN(G9z;}HKZIGG8!+!8h{gVeD2R1vV6PT>Ht8r`~SjbusryiUQ_u-O(nMMrlF> z@2k~P$Fm+g1-~Y_dzgd_vz>SK3#R5>A-P*#L6kPwV~<-4(SH-A|tp8JP6Z{_-Zc;^|J@B_@X?w zc%0H0qn(6F#RVdhnOr}WF;8riRbKUN9u=7=-+J>+$9`ZM0);kTr-#1bkNms(JcJf! z`Vbxy7K^{zdOm{tmY3zW{lM)$m}EvHf+xD2mVJJmCSxWxvq0c2Itq-Vk6yW|FA_f{ z+?3-N7~vN!`7~OnFY_+}+!Dj9Rv6uYY-Y<)vr5Y2SED z(GvpY8%_|&R|W zj>U8?gD-*D`YJt_N5?3_?TGi(FgU>hoHxX|p_l`NBQfbs5`N&juDp3J!ES6k`K9x; zq^XWYr+PWmZD;q6dB~k-sdD0h4La*99=<+zR&V7}Kn`s~2?UE*KIT^)Dsi<}Xilp@ z=+@P>BWHO-E7`OQj?_tv>v|V?n`d>Weydnk9i7&<7T z9q~^p__|Vg93>t5}1!_~xOq@Qf|UCSWr`Eo2Cj zZ9A8bT^z$<=Ub0d7+`<&vu{~n*Bk4JX#8+wgNZ?)CBfj9i43AF8)NtTJ3 z*PkwWE7mnV?0)<9(?X4fI1@Wm^1$R=O!f2pGfynv{OXH(21d^< zaQl`P8D1o?*iij5jPm^v^n(}lM=GElvZ%nf97&_{ut;^F_sI*+gPM2s5dG7(^$}e$ z_~642)E1@J8271z){z&9!Yp6n*pLb^dnePLXp2J8O8|7`LT~NjcE1a{P4LW_E55pN z?S>YovWVM7Z|Ut~?|t~`@`Jzm+vR(|^IOaLi! 
z@tfY}rnlIcLIt#F zZ8asv>-z5kf5n4#%FKM6cFk@4cZ?SudFW?4W!C%-e3H{7^fVhF2n56mwrLz*;2H)A zi4h_ud{dV&7}^gezWoWET_D`S3mu`aZTq^;HimTK9YWhVx1kwtXkq@V&ZR>~(hany zIlsb(FQ&2%e4teKJXmIRcKB>O=@+rLtV16f{l-5nl)TtKIeV0$HGJzI;>u|p-osxL zF7VQkZkTxh>tY>#96iHa{5ht4=2;zP*o^<1rpvE%`??m616$TM=Q6pW@Zbs&7=KD~ z^^LRnr!}D8`1E5vAo`gmz;nx`%X(;-d$iq6S={B#8n?&fFS8xs!heL(E;sB7?4CORf~q%&rri#%o<{BX72 zs-tzla#^qU0#l!)9?UzoV#zRK@LAf_%db<=p zzIE%SKa9rQ*G<7d_FRH~(Geb4y2~rFq`&wQz-`)q0c+Y=LvhmJaxmtUd+D7Y{D74= zaoYoYyP&8pp2v|f@J?%f;A48<{q3(WufP7*^4xC#0l`1F=A-L7&`3wEwhT))B99R>082fL`$5z3n};7GU2!Jp8@ z4dth`$i&yA`}P^y8+CS@SPN0=>uaz4_VWC5&n|!QAOBL%PskoUH^A#_dElGxP0+{S zW4|M0ta0!Sz}2TJqJ2sm&(+Wna#k4MXJPM!qu`tX;4e7T0Wy>iTl3BBz3MbmXkEz= z^QMReqU%oM1M&ztZY zV+7^p1wJ~|PseJ@$RGf--O1QSY;H(7h|xD_DTfDqCDjyMfhph9I%$M|_}`wHgicU& zgLZ7@(>HP=3H}~8f*Kw&vKp~WM|D~OeJ%Y*eS^6PJlSJ;-rS&HuYf5QvHBgI@N))y zB!)ijBZ=cEMj-^#v6LD|VE2&;fbqdk3=a8|LLEw3=}3eBAR5X~3YAIOZ$ewmh;hl^ zHO%tMqr98nkuZeKGG~8mG)--o=8KO;eT@B`3?IIm2Jccgc1NUA_d!t^PxqJK z(ybjlQ^393=O4M?k8V+>WwsYOOsX)po{Cw(dBZXk1nxyI&nKvyvyCh!Y6p`ZXEOOf z8*j7W4M&ga178I2(}$f~?Llv!*^?ejF7%&F-b`{bM*l3-(V0MupV&wc(2)f{fyp_G zdQF6)K`(4#vj6sXzO6ISdZ^k;7~4LC3kN4@!H1sXSc{4O^?&)Fmw)!p|CxB`d5Yh9 zSqsGHm&c!Ybonp;<)?y;vy&`-(Pi`z?14|~V@_Kt6*L9#xS!q+7E8B(@<0Alm8xFF zpFaXPJ(>px-djEkU-J9yXdkb-HQq0ud6I4@e)ElIIO!gg=K@qS1|BUKJ(1- z`m3+`xrnP*uiC%JW+UjfBDZyOLYsq*a@xvW9l+*5o9@ZWUX`;x+O)d;Y4OSJQ*Xch z&hp)FeS10c{bl)o|HGea-F#Ov&n!n^yTF_E9JbwAM7SSZ6AS=dN$T|qniF^^T0!wZ`EpdHU4 zW#Xm9K=I~s3`eH^72=q>2UyA|*Fin&4{hCupa|ih+~K@tprQWIH-S9=`cvJmi+g0a zfbk}6ENv}Rw72ChH>@FEZB^qN59u?e@&Uu1(7<2dQh)cf0p+BP1m*Cx)5u9r7)9oZ zPU5Y$-E|HN_6AOd;6nm>VLQ3PfY)F9zTVHHW9t_>0<>&Y&;oU{L3B}I#Tl}(_pY8V zIl80gOQZ?*Za$WW)ZUR>orno-M{VEL&3V-m-l1)?x#88{(D^$NkgKgyY)1Xr%~3vr zgi}_*Q3i(=X9|vt8wARl@7I~ad|Wbzh$K* zp36zkI4#?U6@C(*xon?b@!{998Gn}3H02wP>Ve(E6sSp|Oo2>lx;z?#)HE1Qk48b; zA-}?vDsJUFkB{_+T=O$U+MfY~Z?%BNX~l%{a%TJ(XBe@cE{$a?Gskc&KRR8*j8oVe zPk6wJ+cf#|B%aGx9wR~7TweUf#h-BsBd(ko)-+{v`96<1zn^Y?rYAjWKzt*mc;uQc 
zP2=j*^5RmykJEvx11RM)U3n4j)6Qjcx_Izux^l|9@jhMgfR~o^%xk6vA3WoB@e08R zZf{Bxx3z)GqMJ#N*QQ>2^r{v&=a--96_uQ4Q_mR4;>#cnz8B{0A#50!*wU7w54P&I z6J2dMILMjG*{6Kx;bYzJ{_G1ch#$DsY$!TKESQkQ#D=a+pxAlLdW5rfTCCl@%|eN9 zLSz%c;ti#9_K*c+>XC^DJv^Z+=2i|CmQO$PY0=fF;-SqJoY^3q z(W`JTUpnx6{(ti0A1+U51M&UueP?;&^>-wiMJ#^J_aeCO8XFo`eWU|=lII{aoryg6 zWOJtWhR&Y<`@i_h7oa%?O1R4!zSUG*0o>GSPREFGA&d zu~ner!r|FK8aT4CZw|)C2P!*Io&L_(Us-KnS<=R+;p^j0j@!;#M5qd#Z-xmTTb z$aIo3>9Gm@;6Lk0dtkxEHyNJ(!c*RW@VZqVdghF|AIujG8;L{RVgx+_9ZAxTs6a0i z2OR#^8T*1hHX?j~B%9l_>PI|G&22;Q7%?zV+?jUS9dSzEh#AEPwlxAM3Tk z7nc{GRr^DZ+9$N|HE*SnFORJ{0k2PMtI&36SY!njc=55Te2t)d`;)jL#o+pG#)WqF zOL?|V!F8Lp!jX@^>RWhnJ0PLHgN7rw5!K(DoH*7M3=xhnNLE;H>twB?x0&z#ROi~# zco)rH4@k+hEv|xUv!He(6(v#fhrdt-yXzA+R;2k>4RCJ~7}rE|=Z@Y2cBHn>_@uUr z3Dgf83_g^}$CjaCqemg`BY@L8Xn;{qF4q;ToYYW_o!?HZCM4BuC*JXWXgBkbj8Ui;crhsuU@#-CZu@c1F6)u-`HGp9@6n6joD&w7-$Whdp0 zKPv-I5Cx-}cLOD@)%yz5ELuDh3e`SI7rK9T!eCRNxA9ZOJr;QD?G++n#vUs2a zE7ab`vrkADZsh3)xX*qf9G(R8-gJb%vdgz|`LjZvxuQB0)jsfe4zpU{deEfLPlrK&g&|K z{_bg@&(b%OWv+=+l0^VL$Xft+rly9R{7i6RAg6F*a8<~!buRD9bHVtjy=1}bLZLLk!|8Z= z_6uB%czOASeEX@cT+ss9P~OtB2HZNvYk6sne3iNz5@ez`0iJC3P|fX8ZG;DAxw3=a zhk9@peXx(1KwMpFFSrESFXiw`K5(v-u{b1n5slB}FIz%ep*`u3t0`~3^E1gkTE6j( zFUzNA^+5k+Efnvnjqscg8)Dj+Sy_+zu-}5IZjC9QL-Y8`{neGDQ|N7wng4AY7i&z>en!5vo`vqWmwvf8rk-? 
zRPjUGDv2zNk%sjBp{MyxKQ>i3TabMHp^w4P+qz&F6Sn%}A$UK(!7>?}1oPZ0T5eaw zPi+&V?Kd_ay4p$~<}MqzHO3{;@G(mCHtuPiq)}USdzCyD>rmTLY#kT_1O#?YeU>Z7 zer>SoDG2xruSh519f>dc!2r5iwIibK9|zg}8-c)A@>ras9> zNrhDQjT5(Mk}{Y zCFl^uE&Gu^~{h?sc54<0Sa zWWCCr3x+O~Z944IuLFtb%Cb6nK&tMIVWv=>NrTTZ`w%z*ytadVnqbgw(g-SI#w$+E zqU@w{Pcj3k`fs$gY}R{gTQk4fQrg6+H_G=KTE6&0 zYqzf5_M(9MARf^@p1TQeE3K6R-5vy}nGQ zAAInEdBa~7vzk*o@Dx@eZ?s0sq#S#=o#jXy3a&mdp}TIiKx2{bGl{}!eWJ%5>j6VV z!7=IYeCNB%V^^P0yU{|POYF#FFp^J{5GDtUd{H1p zgTMZPZj1WP^2#gUTz>rH*O$-U{Ds$XY-YHn;mny^!youAXVlj=k1pbkRP1xgj#YgN zH_vnM!0PexnZ9fAoEF$ZbFhC#W^O+eHea8bcL`Nq`QX)8(82mZ3tLhwk~qqPau$Zg zciq@EQK)$KFUTVx13rZKS6?C@i*FW6oS97_i*fN%J$~m~zq9;A&qn;}{g1VgxT(dI zzBsLozy;lIN1c;q;mvJGUF0L5%^(%MJ+BpQ*l=()!hWEgsBYL)5YWwwJMp7D_F+>k z$rUvgeYRit#?dCAc7e$mWnLM}0+JXGz6B2W-CN5$dW|yAWIXx!RkdC5I;RJaFRFd$ z<8$~09>{Oqxi&O80Xr$&565>WV9NhADc@%=+X=L6heksN^}?ox&5++}rM}ZpdBpe& z+c~-!qdwe%RSr%M==0nF%4H!<9Gx`A2-@-4GSJ2I)@BSofj3)M(j1IK{sWhP!ED#C zNeUQd0fvPReO44Q3Yn*~8KiHY<>BhP)T1_@dbt6?FaPm#FloPR$_dzgntq`+WsN2G zG@D8J?h$TVVysoWOk0W0D2zR;A~rzG*of%F1DiPCLaFwRJGe4>_MA@gz0A1HFP7dU&OOpg-OY44fY3pf+U1 zg^wPbL_$T>z%GVVt87_hDBIdTyc}RAokkT$R|>F0IGR=lq7|UAVvx$*Zbum~5A5a^ z929Rc5NP0~7@p3r2*r)Vgzn(R+Kl2k|9R3ilGwY*!7H@rN|=-k;B)@sO#5hXfZwKn zWJgAIO&lNM2YW9M=aQ+j*YE8 z#MN(wiLB=Dl#4k76v1uOgLvvCN@$^=zhiZ}ZQP`M)FlsDm9rjhVv%Q84tZwHkA%zo z)lOS^z=ouOph%%Qq&~kj+JbA;at#7|9N}YKC>Jz=^7})5)_BD*ZRkCA zN{N-1(tTa=DYYjS7I%11S-0UZ?&Ft>k3YT~y{(5MH34cFEPT%T@ManD5fU`VI%!j$ z!;OM7tt@(2^t=}hoN;CG0N&X%x^+YoBX6boRA_I)>-Ti-g{qPxqeBU?kzum=l$jTzxzFX zwfI0^vA(`M|Kjt@Yp=g)pRzck!5Lh9c1Pd2z-N8{R(?k>Hdd14 z&B~cm2mU6)Y281M&v|yk?N)N_De5u_>$2Yt+K#Y|GPm0hnBKBA!Lta;!a!_PQd}NH z=1P*pC{XwGsa!n)0xm)BOV9+h9?8dQ9$rix z9x~RP*Tb>r^f3M7mVc->g`L(Lug;x2SdO$QupeYY+dBC(@KWo+|3_?ouu*M9>pQhK z7Lplb7{h&bS@$R5Gx|zi6$-9y=@8BFyB{wffAX0=Om}Me>Q|mu`_tx?cETHW#EC#1 z;|KgD<)jl1CWCoRTSMcyAjM9qQ-SG25~aC*)$UM=+eui&dE6R=em-ST;7e>iI@Iks z;>iLSk+pGT3}+lgA2uG)a8^2Xp*AHOSTx5DY8(#T+`}*sZui18Hrs$YSAq={wduJ~ 
zB%U7!4yAs>oPegupdsvVR^)(uLQD`CgNS`~eevVw!M1Uj;0gqCq|z5CZ;24fFxxjezLc z*xb=a65Tc!BLrh?Ki1fE+OL)RLb!OE}%WMoe>j&=ZY=*3rF@6 zqd)lYhbuY@Y$QJR$!632#vtj&2_w-66D;e(+x-$$Mn3>f|8c#g1pZ*7WB7p^-}VYL z7?BN)HWB>fAc_+~X~To3jR4=cz4xmhR+1}9LX429VB68 z8l8}-;cIZh;dR`M`!GS}V>b=GH+wdDGmqdJ!o)dr0*VIqrPKQ~8=ZY72gYL?e50ri z_?_ilaYihe#?X&>xH5)NrwtwK1}>x<`6g;g(YUdBwtKBN?9{;k;`cox*$tJ!g|dxN zc#uzrC3a^41RvTHze8{1ivd|mEZ=ybhw*OPOUGPg4!fYnTK+7@{36c-x7sQT20W^v zNk`6vc^r`qlrHxes38Q4!CH#yZ6O52L>qKf$JQCT)m9Z!PiWx2Rpen~IKYGgkYleh zPFrhWa2T}sjKKu=;wu&(MpyZ%Q`}GN-0FYmdCpK&KVOJ^Jw{*rai6Ck=AEjRW0)C1kp^Z0Ww=+=sZ<&IQ%@{2F)!B%~g>#--6 zOScY}+ghC6)?|95y62UeJV(K;97tI0QHG({6YiK0RcyU!H@t_Rd&D^dd!ScqN=eT% zw0z!-V=F|Debg)Qq2#gPK6v5_%kdX<@A$3j%Xxia`0+338w$GD_oB|~y3XWR^AHWU zHgR^EjRpQ>@y6!mwmOa%af;E04}yL1%P(s|pb5)*bM*v0?M@pM zoP|?iYkL!4cmM~M+uuktjw!H#9bEaaVdynu-qYa?=adjH3R&06ZmwK-=#XVp+55ge ztmvx-((jb~??s@f&TG@)x9+IU(CfBtC&S;khq_V?FjA`xYCIlw=7+Drq5Y};n7gFj zcpz2PhS7I;tJtwN(YM6w*7EXmPcDyWV|1X~2yfrhgSm_Y{Oh0}@v;E;dP8fciHFMO zP_R@QfdwY@!T6yuP@5Fm^}%`Bq>T!jz>G~Qw}R;>5c_0@;$yY>dyF~y^CEBXMl=U- zZpWeTu~F_Kl<|_QdbH!VXWBh^Hp);Ecpin295tjv#h$~IqrH&#=2mHMdL$Vbot1hJ zOZi6|6)#|=r}_ePZHMMR%itwujsvyTh?Ne#xMG;{OJvp zNNKC|6I~BYzhvBzt(-iu9Vk13`+y&#@iQ%HO_JoI_=Vq@f?gUzONW!Z$z*pe$u_&6H_j}y`< zy;-s3(Z-SM8r17o*+}3wHu1K_d}w#MQWMV(xo~ej7 z$5qQypS<%!d$>ZNYlAGz#vxTvwlU%K(r zis#@0bO9Ls=i>cz_<`3W_5}|&Wne5FG6{}{f2xy=R{`B(R&=^%Up-&4Yf}-%{=Q5(nPQn5NSNe=k$>S4j4!*1ck_06 zVEA4*3@&fJ{pRwiJ{Hwiy!w6Q^21dvhIs8CZ)muqv#h5ykzCOI)V#_P+Eb@>7E+Vt zvHXT4&INmWDy8r79{NdN>mdM50-rdqC_bdl4BGqI>Jq%#0}}pn5S_3r%JC%AK3XSh z?MI(s|Yw7nmPEwS#u4O;X|B=&pQavuL^~ ziQlo4#qW?IL0JgZlIl)8gxEqlQuEdhO4na?ER>k)In5b$-AH;$tC~kxpfjCl0$_1= zMhgzQ4vUWSS`6@7aV3FeqUj5QnUc{F(_>42Q z8~a6@I;GC+9V<*~n2URU8@G@NlKU_F}hm;el_y zGJcQeXcXAcrrzDmwAfNVXW=E<3wovPw{^AUz;@C8@kIQU#oirnqS3GL*a6;2LA0vO zo{T`x_w^ykU;OHQed%3aJ@-coZ>n`kx8uX*4}R}^dK(tsr^w@}@J80^Kk33IV$F*o ze=H<@=3c(a?PQ~0$){{0@NE~aT&3VpE%<5!-_F1v@WfQEbowO*lHGYHZTqfXJ^k6j 
zGe`K8?S1~K8*W8oqs^x5X!+pNPxaMt%Gr$4M#3w;5?E!(vrM^-ywQQfSuySCy?Q$M8xU>)0{RdFYKE$f^$|MSXzhLPR@4b>LOQUK0U_ zkb(`Z=)19T1~bwe4+}H?!|>*^R^ALdiO(k6Zr;(w_K3l3_{{I5bmTHv#0!<_*cG{X zXOpc8Gev&OcQ;pP;#6wWY2;?Ii{{)c2LL+G{@E`t3+p^pI&cTRnfVSzf(xoqp&h2HmX zv;9F&bpu^Wzi{P=ylQZSY6jLizP-f%vWRRGKzM%z`0j=i=imB48C95|N4Bo_xoki4Q&CvM?Gv=Lx3+>ZbRq zu<_>~<4pRKPZ)N}uQWt1daxKcuSM{AO)6Xx=e}OeFhXZ>tqFxmMQ|_hWFX!r?KAe& zg^$!rRg3X$&>~m->T^eXs=VMRYmcls7?Lzmox;>U& zR3#e=-%IcMU;-B}rSGV~k%K_ z(?W^o27K#+%D8O{U-4Q)7Ev??9)zq-Znw5g{);x&rMETJPxuh~k2#Ahete_C{Xl27 z>G#|kb)@^WSH6Q6%}OHI(6$3NiM210i~Xa~yB@*Y+A?LP&<}c&&&I?0xeSQ@B&>Ak zl3N{+qk~Fl*hz*Xy!qpb54t%Zk5!!bo>m=mRf#(E;!-%inyCiML!`cSN~m1b;)IJZ zb;X8Tl7z1X6mL30U)qB}x;l#l#y93V#lY3ZX~VvSL`bnkv1H&A+Ui-I1!f`46(N;` zhxA5{^_Cn|Mo7l0JXb;g;PyE-<=Dwsnyaz#E$yATK);rIM_1Dhg^v$u*zj(9K(Cux z^ofmmWkZWd7Rnsa%j#jK0zSB>d-$!h>f-X{%gb-+1Co3Ikt-x`{p?rEYkEkZ{>KKL zxq-GTYb@7H#!u=wnYI=9q91g0qqDjVsck`Ox*oBi3&MF}t5VrX)7F?GT>Qg$55MA@ z7mPTFROTZ&rPV(qJ33_iXt~@)b=()q1Ky+wM8<``^7M1s#5whAHafbSOLD>to2gg3 zTYZpQ6Z6bO#t8bA>(8812bGn2if^&8x->2x?WDmz7#YS0${44pr{WRUmipKyG$e!e z3lFaTQOA7T6I`zNoqdGc)#Pt|6!1{E@M+!Rlfxr@d{K*b#y74)(sn)GpVn0cd?i}? 
z=Ww1t@(p(CR2{^<5cIj`AaBAwk~qG}v=Sld}V|&U}gsKDuRrr}G|DT_Uq` zU|kwu4iUElU(14#U+TuXq0b;iH~lfmre10I*r`7z7!9O%b++ovnG4J1$DY(H+b(MG zr%OO3Kgc(n^*F6b?8dc^me+sux69kFy{dQHYcZudWdh8l^axZ<*K$$gXY2-wT})6) zOrcZ=i`~X_x_$u1`r03%z`tQeofk%(bFJh)F>1}SX`q>J#8|&oj@v;NIUbLML)pHs zZ6Qva0y&?}HVnJzKk*?8l?Lizh@NZ|HH)6fKHsh<+)X#qst?_t-muwSNaEE#XOTVl zz;8lD9Js)GKRkmylhYi6;To91tN!*6Mk)BAKKT*8y}blqAS-?x8mLX#N6K-nM_3P`--WT zBOcOLIe4ve@@G!pk~$iHHNG;LDP`SHX`#uH08PeB^ocB$i`PB%5qnPt9Q&5AY!Z5#l(R$|NX$ z;Q5Dyg4!P9T&5V=J(vc#ErmRUX_NGK<_Ytl+O!z*nPDmlFZdt1E7zfNbl^E02h4RcdD<6wtijiIhfvO(8CM&` zIP((%grh4(9zWoN_M&auUY{{TgcNaosoqX8PYIvfKW=Mr=Mye&d+Hz5FX^2SCF@i3 z!QVXwE?D%D7higD`Pw(WrYrA{xnDC+(ihHf`X5b5NKu2rE%jiw_7rHYwdY=&K+)X>#GR(|+Jy&W4VS z7mX^LXsEZbvh|N0`nSzskV#8`R4M2|P-xr7-ueP~Z>c0*-+FFSo#;87twKQ{lp*N~ zI0=vPzk5LW2UE-J)zA$Kf*8BOrqn2!^)LRcKjqCgh6{e=%xwnV+4f1seo~c*u9Z3S z+edA_IQ!=R8tY2_z_@%Lgs@-kJd?-NkbV<8R6+J54U)#!L%rN8louJ&9^6N|^+GLk z_6HuV^O=s~Rypu}8_buAgTGL{kr;IXZieFD%1rXjkAR?}+1_r>&o_&dOS$jVtROZP&?DAy?62TxDQktUA$M`BG#o(pcqqLB---3%X%cHWn&UxT{5L#wu_V+!ieYfw9h_ z>uB-K$%HJJs`Gj2YA(i8YRKauWaw>&CP8nw1aY2)%8=4c1Pi9qN8u&U=>uVGiFzq%zrp&4jhyMZ=X7=Z!@v zeDrI4#v}0}h}DIbA0Qgm2 z12z`fXb2#AjA2&GZAFM?#t*by72>Qb^FZnxU)lBM92+bJc9ZhK8RL-KKLoTHAtFY6 z{EU$lCGEv51uEuhleD!2Q$bQY6*p{GzJN$CwVN|Kc&Hzb!jtb{;5+KyvIp_J9)K3M zEwApNda+Pca;q(Z1*cQffV#5ngCFT$}~ zW$+|VU2}93(J`RpC&jg8r%ZA-IJ)2nX+}F5x*3WsXdpWh&;tu7rTu`P64&7<;c8l6 z@9IT6ePIOZ3jIBPNRRk9?RMKZgef|z)0z@G%Dvcj-{I;)&dlRq^AV;%3c}UcNq^$1 z8fy=$z)3&qM1RcVk3R9F+K}Ex#g#YA6u$lLDmk@$_Q@yGlaro0`!Ac&8+*>_`Ic1| z;j+Oa@CG$xuf83=!G%;aY|(WsS023^C~I(=F~x3M1ZA7~g>5G(E^%WR&CJQ46lQ$N z@8>6238|1_0P&{opc_^WG`N-0F^+E_1$|-2#kkNa!QeCSXF1UC!)d0!K5Vigi7{vJHSEA%X@Piqk8)+O#o<%+}OPd=xQdp)JM zhUg5p2NU54w7=81T7?9{F>etqHql`By<;wG!(;T< z;om%%8k|iBA!)O~+kv_npv+ebG|2IRt;j}K>X1@+Xpljc0G?b^aZGg}xGYig+n|(M z22MRQIR?KzKza1HA614d55_`dbAL_?&%u>K5D~S?G+)uQh}-IV3l#dce?ZU;g2`2e zzd0j^d4}m{a5sN=S{}H*1p!}V@Hd`*sBeS=rxuwsd{)qUvY~)8w|wZ#yKvo>xq{lW zNsG?!^lNhFXYB#7fo6w1+;K7o|1i#H;W`~VDOzd 
zr86|>1>fn)#Tr_qsYxAdcFGX7f}euxVqf7k(zYDBqMMg)wX zbrYn#pDT&B(nw<=cIcOXe!>SH6E8DUsll`h>7zf|v~^XxJ2=qXfW5X^%q+irm6hJH zPjog6W=I@kj^+>+c+~MJ`Q~SDy|q03!i&1JMcRh(>=KPFB!rV8&v$>YfPjle`FF^42p?YKCTbRcx+fcmtO*sVW7892b6$R$Dr{Z`&K}P?xuh5eR zq|qH71k&J8-`QAj0;1z9cG7mXP`WJM^~{YnYS6>mS83&2#t-^2ez4CSR~GGpQ;)PS z!NlI;cYPUS2c4)A&(5;p?+sH1NTN9N;UFru4R0O4xw8-J9 z;U`MG*btrvQ!u1D0|yfx_eYjj{Q!s6FB%xV8L$qLfgeP#f6MjR7wps@8zB~r(9#BY z2fwe_puB?lIF%pY1Megc^l5Pft>~B-J)jO=C9yBa4F2%TWR{-TiO0H+Hfw&uL*KUb z8X&O!4v&kME_%ScrZ-~L_AVBYS1%);!L}PScp?`XWSEv1LlCVP<~p1XeiX&hQzrXx%V(M^{$i!WN}4;c@xQiz`jmY;WZNjGtt1Ab zR_IlaP6LB^GH$EwdwCF61;UZXjfbx2XeKTSPs0VYebl?~qN8}5e((``Y7>E&FzgsQ zz?#tUhkoTc0w|m=Dy)Lq6m`jgKK5M_%BW4lGvXS+=j&4sXT`n9v#pFnY0A=Ysz%Q*44eX%VUjjvw76mqxpk?t;i-o$CW%{ z^e>z;p-Bq;7JavRLjzv7tQTGB?Of{b=m0z3W+x}Y3;n=LTYy{Au|mJY9NIxMJ>iqF zL^hp0FdBLgxGP`_%vVFkV8_{f%M(G?GifKP6~Wu5N1xDu;q?bs+@w4{f>r~V4ryH+JA9aqyzTh@F#u);%M|Wk{?eQAQ6cr4F^uQbu;)PEZ2^ADw{}W@@g;vw$1Tl?cW&_=mvg=~igwO8 z={lA^I&*5<;8Wffq;(R{u`#FFF3I;B_3n+AvkbqK7tT0aC&8%cDm}v@$YT7npWjLz zoI?ICoxC_2Td8P#bdt(m@ugn_Npv-i@Y7UM*&CLycQWKnST?^t{yWm|>%Q7>qQn#Q zJ`wMJV13Y%eL2;mu&dXFeuZ{TF6kM*B61(!Q-p1A^Z-uMGmRx}65sgXM)6zp^~@{1?@s)d@9-sB7w= z6)~8|mfP2s_uhVedH3y~=z&+2X#)G>KmGINfB!%J#qz-?8gS|KdT+dMsi4ziF!B`v zBjEeFKK!0@4@lKJ#?cuYpofV_*Rb9m1%IsoQx+cB&7d^`d0XwFkq0*+fb)pXjBpR=fj&M34rTKs1PL&7s;*3IJg{|m z$V1)HKLdDhp*f?$mojMQfPTPufZTSekcpJDE+_=7i)rj()e0Ynv2Y|k>qNfwB<#UA z6#%W}4Bn)Hv1lk?>;>MqypVwo;BkhiaMGp~jS|+IiGl?q59OB+I6lzNNATTVl(a5P zcwzyY@={7={IaQ_O=1Ii0zTERe!Q>?(7&!XWYP-;0rIOiOxTOU(;Gqc8^bId@lYxj z9{2I0Qu&m}J>Bf%mu>%v*HAMMj>_$iurkKKJ5L(u8!#WIuH9#4lN{0~ny4Wh)I_X9&snz(#1Z z$hCU@!H@jy&^tG9Mw&i@URG9vZTYTn@n&NSTtCL^nNnaZmQqAI@?2j@T8i^ZQ4gGT zrtZ*>K)0t zOpT7wVe^aeq*uoe@Mzz~%fqJSB0Sq5&ePSORZsY+9iSKd;KSlI<-}2nR>-T0k@e=! 
z-&{`L`Ja|={m!>!|Ec9|U6J_0Gl%{o=~HQSXHz!(Nk3yvEsA zJXL2Nzm#XpVOUrL$V0~sW)35}Ur0WBpzan@x7dpg^MM_NeNMF-g~KXias3o~X%q0| zIfZSWGC#rE7Sr~_rF{;1{MZn_w3Fyh9{-oNS398RJ15M4oP=a9WJ7|i%0;J<8Hw0k zUap_v2VmJG?l{amH3ExXAIX#~4N0zF+r-968(`#O17n3ZFKUZN`i2nVv5#O?^uzu$ zzEU${&7^YbAAaj5!W4Tyb-0-g%gTHY^>b(2Kp|Lp4TqEo@Rl`h*&NShHheDJNy9II z9hza&4(gl{Q#IKc^1$@$E0B=G0lYGs-^bl&x?eu$%VVFW@O@d}&Gh@$IKKx3zkK%P zlviVy4?8p1Xuz|xcPA|HD2L9+V2o%c8vJaYqkI%QDot$!SQv5PF++YgRW=YB;UpLV zx?FwXaXs|*((=?7p3|hIfz%5R79Z(2bhz6$Z!Euf?LRM{{PH!upPE7Z-two~K>YQO ze!5)0rNKt`tsm(2C+>wmtpT6G3LQ9M>nlkTh57YgV=M$N)OBz|&Dkg9H~h`U+Lu8M ztl>kLO`*EPCo?ZRBKiD^rh^gqENVG3(nSX{!WJ39pF|AZ!yDb%Ncd9sycwGGLigGP zJ~C(dEyFjwVW+lc;j^_Nbjai#ZLs6U4ZSK+Jmq*VdMIHqB-BofAWbN5bf{e5c0`17j>&T0JXzl4 zlZ7(szytfECwY*d;!G%h^+wh(%EZ;oRIV||>7l2IL`T^1c|i8d>Z<|6L-z-6XX2JA z>4Cxu)rDOFf2fdlj&R%~4=M&bK1+Rq4=p$>P6*UBSJikw|4<4dml1Qq0=wI$%u}Vr z)(Kqc)$&k|Ulyqd8E|Cj z2yeJ4C<~6u$VUMe$Q`OrovmlFhF^GAN%ZJoCerH+&opWLqxB=_!%uH4fAa7Cy>2tP zyZpD`|6O1Cc<&cKUk-0xU!HzmuQ-=4vB&C(=&r6t(>8fcDE%T=B(z9nvB%Jat;o}X zjtRp!WvqAoG0Ks_A`zdrUGTk>ZXj69(jLfDj@*t>ZA0#jqh2XP4(ebxV-K4f;GDH4 zW|N$9BSqhld}xHu=+FN!Yv0Pv?lFdn23U25fAyaI;dV9J4)C_Y_ynr(U=E{?;}Z@} zWE4NR^m|$7T_ALkM_KhC#_t?#4%j##uf7Hj59;Oa-Ml6`obnra*p)gZ&p|*AwB>1k z3zqr=f7=i;m%E>;(}k0-V`n{j=QK-`I%#8N86gBrlDWQhl7!hm9_tTVDFgH}sX%$21QJHYPr9 zgY#RMgn8@6C(Ao}{^8pDZ!ed0|NR}kg6&WKpFdmv_NQ;^qeiEf8|s|!X3+FvRW>l0 zSU>3?>vc;MwrsvaHN4n+ud$yGj4`0Z&a@Zotlg%;7KPc)lmpLT*s|G2z`&b%d0 zUSoK0lz-Fcc&IZ-zI9}jmxefcZ#vb-g%cC1BQZD)l~eq}>s#R9;LoPZYvxUQ^LcDB zp5@1%xtz6avXNC@O1M=GOsjzoo#+e@FITZ}p)0N}jG2b5dI*PrZq|EwJ6@#bRqLQs6>>HgZr3Uqo-xzzOC*;-*{H*oXt)x6?_qrL*XBn`yn_p6~^M z8kl*u1rvxxDRl_q}(`3*bnkAIfC1oXd7(u*2OI^$fD7l0LzuHh@j-{Xps>v$Utw~z@k!D z*beob73}X^%C{5#{6GAK;77|p`Uk(~hqyoe_!E6F@{X>Azo79|S5I~2!uDdb?TtTl zSxMqlrvAiPem^sWF2b{YRznu5ecC?33G{8oR`7tM&)nZR;X^17@OttHSlcad0>9-o z%i#nMwsQi&hipXdfo$?78UKnOwD73B#=x6av4drbFK;g5q!w3XSkF}_Q(MVH!@-pe;M5rb8sN;q@J2RAU`nH(D&W|+UgN*`CLd9yz0#K=>yFy* zsoUtQ4ejy*4sV@BbjJVM>Qt3%2+{@D*c+T?A8h5?x`9 
zR~`>YSUtCSH30oRjBYKHb5Y4;56}Z~D?YfQ;n;^8@Hv0r_g~B7BvJ6Ju9Z%;3GEpPnjuN8lw4}jg%Rfogn zkN?fTTYmJDSC_LF^bHApvGx>in$en>cRq9567TuOso4-=pX6y#f@uu2EOHrS>D0V# ztzq?%oyUMR23+M=dt4TZ`eUny;Bh&Tv-b=j;4rw9jJj^%-S&l}OZmn5Tbo=5AB{Zl zCcvAs8iceTJi9Tst%86@ux*Nq5!<)+2@dj$zso<9A_E~f@PeK|8rY^gFbZ6}mKhJQ z6v(nRUcDDY}_^okTorsomg<7vza!?9=C^y|GfjA6IUu@#)IK%Ews~!NtL%eGbi3(pk$P+W3%?7d@&$ zzw!{6BF2f8L}I|@xdYX>5nMKGP-%jTR094SaPHV7n6rt^Nmv^)>=+BMj?u!>TpXOr zwAhVIcwukF66PKkORijmhFx^HkcS95U0<2KFSYgsu;i2xY)&%jW4H_0-~{t#GZ2 zX;dK6USVV*lV{OvmBnw|#0)Wrb3!pZG=xm$M4A@^uyLMzRs9!2ed=d@Z}(| zD8*j-PT}p-6oof9`@w{+2THbfZK_dQp+P3~yl!#<8y1~|mESDf;~#v1pYR1|`;lQ^ zN>>)(FoO+4WMU)!(gp_m74QiUY4e3` z+O@A<=vfL*qH^oi9pSR+A<(aQ-RpTB*bRrBO+=J8?)rn^!bzVL0^fDI@j$2Y^|v3& z6Zo7i-YmDUhs7I@%G%X+Etd^`7yD$3>ug6y(OUON1a+kssD5a?L;4UUSA2939z0jr zL+E~8(X88wD*-<1Hslm1EZO7DI+HZ6=5&G8rgYVNMrqkT zY_)$bSkvefc{6{Ct&&Tvlv2?jc*h5HHh=3j+cj(*?KQlF>pytc*ORSklMe9GA2f*c zgU%^2K*dvNW+1x+KZ%aTB@C<^6L0ztP}L?SBOojJ=$8c*3b^izrugB$+z|1lH5$q~ zFMeG#w65qE?VC{3=k==t8_sUB(%3e2y% zdd4#g%wvr29zVsv^P2R-VAn%Tvd&mTu$Z5N$qz=lnpm&oSA^(3ZV z08@TKKxV_*76#CMbeoQRWVp@CF64%9>asiQn3HANQs z&cD#KoYpwG zqLBx`TSrtI{=aqen%fyF-M*s_Fe-u0yoqn;nTQx>{nnou2IG@L;FPgTrh@PS3nPK_ zgY)-kt0eqdR$0wAE*xBk_yFpV+q-S63 z+qiKWDe*xI*qxSOKRm5G&n0L%Dj3O#V4P8KXvEK^69l}x}_-tJP%K?sP-jj z1}piAvtxW^vWsmr^^Tt2t-u?oq=kdr?YBU=h78OEDX7ciYZf%y;=2BcFXJ;hKbIGG z%kXOky&=wegt1IAzHpfIsI;Nfyok446OwfciQV_c z0M>`1VNn~ZH?XPq)d*SAz?6OGViTrGe0!2UQ%Qk2(T@1p>kC$%Y%l zeFJ#ujB7lg2!;lI8lt0EJ3zVf4NAT7YA3y65q2ty>;Sg*gDZW?G^-~?&>o5?#13iz6t5%CS2 zLZow^DT-PHPkluv8H@wz^w4eR6nLWZd{lp69lz0|d887ok8sKZ(WLPwbe><7XW?L6 zQ?XEB^V_-&Jqm*dtv4M?JHStVP&C<#MRnF-dO}pa)*|un36eO@_N-+CWZ}*U%f>G^W)0GdHWokO^#?p$6Fb39Sd>Sh3Jsu@l8G;O&LC<_{3AFENl+r}R+msCBu@m&sU`KPX^y zLUkOjvA@hp#rCD^)DGRT#Wp}RS6m5!x@_}jq-}ZZ+qIEJu;@@t$B5LpxxA~l9h|$o zT)1>qLlFbLd?Bd1tp`6|=c@hoa{bfymN$O#gXNyi3|+iAWgV&w1! 
z5e8K{?yhgJw1Gh@eu;0W+m!A4DzGkoVw7OQo&8!lz$=$J^+d~rFak8e6(SX&e=h)3 z4we@*9&j`W36~HblrD0CBL{rKs}{QS2c>gwK>@HZ|W&B2Q#>AAAe= z`d~++HXH$4_j5UE+c}KKFWy|Ljmp!hFNw5NmDO*J*Ls_clj5IZb? zrUjt#+C;YP85pE2_^jvDW^gj)>0^uy(U}D@eOQ!cWoB!7fE=Muab|eZwU=DlIq z<@L!vml>z&#?gKE>IC|DVX)_q(8L$1)98e}xH7(lXZg>%6xKL&52u;w?WKt~eCj!5 z5*je@J z3E0H#Q?qTP(X&_1@P)4q*yfBu@a)wLaLMO{%;rS+v<~Y7@`a%Y`d}Zp^TC@aRpPqr z6%2e3yY-?OeHEMAnD7_v-?C`JgFjbW2;f7Hj%}!%38~J$zyqworiZd53F28#J{pFwk-~VFy)w}O`;9&4* z-P7@S?=$VV{$!w~!{>|ygVlC6!urW3@N?#Yj+k9N`tREatfBn}+NR4daqKTXc6B4c z15c&d$~Sp4AIix$klx9wO3^RT-vZ|}QdPQR)_8lZaQKU$M1UGS!MR6=S!WVNf;U(+{ z+v5md*jK*sKV;u{!VZU}@r+1G6ew;IDUbvR5CF07OJnK1ySl5p{rsLcpL6m%{WT!X zWYzgH^UZh9(MxRz$jZI7jC<`6ql4@b4 zAJ1jQD}3QYSn5{Lf+rmw$&Yc$e0Gqo36?WwoI(~MG>HRGnQ42op2|ck4+>=(G(cYo z;D%=vtd4W(myFN3ioA;7spR(r%HdnQYl00QRcnw{2ldb~{?}5)`77F(I(_y+`^)eB zp#3M`{KIzNJ$LE~!$v##-Z9_ebeFzew15Bhbmcf3Gt?&si;R8R*y2HJ+{T2v_n=9i z@s%|I6MGg&U+S z3bb{o4@GiC;=%=PW#w5#+Oo>Q(X0SJ`xVLT)N8@O8;3Y}<@RM=DMBuoL>t@|O-?}k zk#+zq{-&-7lta6tw;&PftxPqEXR*T^0-TIfQ=P2gO&NHX_f=cK{)AllNIS{Ggt><; z6fXGT5xwylJjt_Aq^4L5M}O>rE@MjMjWM{>A|eKi2yqH9(!m+wl%a$$RE`>QHl8D; z7~J58NAkf--bDH#A6RgDJ-Sf`k2vKcENOVeA%XqJ#?gy7vPnl4z5vb<+;uWTld>g# z_@sPYydH--aBaBJhYoq(EfLhaf9gyENzbf8%a^e!bYr~KQ5;-sTU_^aUTRA1p_+or zmvrG-^rBEh9DPNdk&iCKNlQzm(UnH8g!(IQRp5g+d-T16?L$Y~rWO#pEG{gl20AWP zYE2XW#mUb&;>Vqfq@r38LTA4i?Q-k)kJ853-6^a$rC4CJZ9?TDIfI<98Aip zUS>a1KH3<3@IvE6rzZ^#$0!FL8K|Qpxa;g0WfU)SZzk!z1|78C>AW6OXMb za>IX&$CF@mrJMkt(2nUUoy#f1MnZHX4Gen1I(p!tBR$FwKJggFJiMVvh%EAB`;H#L zC13o}YsQ6s$EMuTIZTi`2{e=(Mu?HA@!Yld(HplWeZ_MGg6ybBD5E5oamiOWuuJD5?qBEJ zVMlQ5OWo3f9{8Pr4jIqt{80LX!zl@W)iV|KrDk-Kr-ABx!bj2SRl$4piq9)Luli3s z$M!+blPl_QFRVODQwEKLkvdE~MxRdQg;~6$qp$jFa(m`9`se{-*@eBx&w_?;R74-f z32bA6&LV<41TKo+R~0qK;dSg=d$cg~7*AiVTaipJ*_`J&GqVAAUXy*!_4|3t6!QnA z^T%cxvb6DT*^I@=1qN>>@@W^Noyf~#hjQdXn+-pBaKs+csYB9)&^a^Y(;kUK2XFKt z&*Fu-0kAAI!Vetivak_BU2#&3E|fzTn}8AEOB#6OPs_$28*}j!f&w$(%^e9Cj*>oZ zI+PL+kPzN0Fkw?c3BgI87+7zPg^3`;SNR)wjh`5Qjbel2jM%wO7Z8{5;8PYJ#IZ9l 
z$`jVXC>wc=>Chz(|8+Q#TQ$*r*IjJak=8-f??Q3i)u4ogj!a5+tdn7^)a_U~2uL2J zUU#!j04vxAlEM!w66|E4r@r{L{_$#MhlKB z+F@vVp!(f6UTDW&e_n6O5D#^dXJ34!{rkWEK|6Wslm{#iRB}!YtPB*=ONZm~oClh` zg@ew9-_R)?8JV=Z=vy6+PFEcdV}J>%TxNU9=aktt5<>{D#ADs0V@JMuciIsT+R#%> zI_1RE#7*{~fA|G2G~tuJm2%<)Qk7R~gm)b%V~`^FL?(F-pwo^PG|>~f$Vr+w<%xlX zKQL?omY8$`>EuIC*+@s40AAAZb7)3BBVOptz(2Dd;T8E;TpAZ@TU z`0M0`0$vH=(Z;Fk)OYb;^;vM@6JqeIe}r4vLXNhU5WbOntvEVER~q);S>3Rl>eINg zXW14;8zMi%DN7vqYK;90hybA>PDtIvH?fa$(kTyoNgF=FN8ixQxO063AN@+lRYljy z3C%x0c4svie6bNc;68A9J_7w&y!g2e!OH*TH~2Ux!aDK=8sG*{YAO0zZ37192T`OgQe%xa`@BQ0 zb1-#)Y;H?rj>3fBFFNtF1ohkjlVc8a=hoRhs*Amv58xB(g=Y?S^%`(KEPLpXXe5e- z3W2u^L1S~Z!N~Wq>jfdGQ;Dk+ox(FSAOU|5byFC?S@5x#rY>~ z419F9-nOyk)YV4j%{B1K2Xi@$fQ>PD32BeSY_K8|IpIaj0+WL@<=}Go^{u_mXa95c zW?Cvt?G1U*OeLknU~M!~Oqo@*q8LLVkfg~gr_Rw^P60mjcQ$&7R{RknoUT$urH9p6 z5-sv&&+P0)@m1O2Q$RYE6vUK|@W6m6=C{{TWH87yh0q=ETluh|4v$*i<690}r;T;~OOVwjw)@$LH$rdpSo^7xJHbNA(&`<8mbTa00_$_8&~di!=iQgDs)>cit)^FnE zdSoC1XObUW7GhqQm0s3OWzds`M+NDET3x3ObOc`L5({w!mUa?4Y$8M-=tw8eW$={4 zi;$CcWE6h+Cv@P&S4C5_q{k2*t1xn<_gJ3v7)RHU{=?GHLW2_>b>vZRaDq4cedZdO z!6Odd=ua_Gol1^K&9Aq2IFTm=-x>XcC;9LsT{@GxI)qnf<7w>6ngbo<#Yx#sS>`D15E_FL|rHp1hfab!U zGR`&9;hlISp6qiyj3rg?$YdPxUvG@5q3FIs)s>8_RlN!m*a#2(I{P9c{DMo2TgLv8 zGA4ZVoomS{+Nzr+&g2jChY#ln)sy}h=Q+4th@p-RuT4#K&uUWt*l(X{r}So@Z$JEy zZ)M_xey_azN;{`r7eOq2T)C;bd+-yTT$+!qNxfgP(IQ!?Lk$ z#6Q5nK?b)nfdd`+z=4tH2oLH(S?kKY1Gy!eIx3%nj~?MuJSeYQ!_J)19T$C}75%7t zXb$MLuJ{U`EYvWaIJY`+Ue5vozEVsdM=ZJUB5aA)3R5UwbtC#oeZX~c{;{wsF2TjF ze&Yn^%ooQfXC)9*dFHx=;xi`-+zXA-f5dAIbi%;#3FXMDj8PvuB1no;StHI+?BvLI zECase7e0X}hA!!K{89$qSf2D)PMRZS&=6K}d?X6dzZ>NsE*bZ!C{bqaosb~Bhh?9Dq zYju+9%;q8O8Jsy4?AgP~fZY*2Y^DK;XDWE;nYW}|&>+U^PQO;h`j}&+&VgoK2^Y*G%6&n{sF9Xs5t4--oS`wIhY+@#B~BH z8TA`F=Q^%2;bRZhNrLk5%0t5wYn7GUs!PfVfsHcY5xwy*b)67=@@WI9gWwR`Th^~U zjlQ(ISSkVTubu=Agyn%w-{GN&P6PU+=VZ4EtESMoY$cxkT@um`3LXi<6!CcSrJrD5 z{4o2e`UZYXpLUa#tkGYiuFz5+*Ro^uW9%8}$27R%6I}SGPD<9?hSPp=4nFtQ({hyA zjyxhKwuaXdo;>lja15s$!Pu|A8jl2|4d{=#4)V)?OnNnDYI1sB6T&>S3npI(AHPgJ 
z$)ya(7EAL)md&e^n#;zY)Mv?MtXT~rSKz!tg1WBuTMLOwhXTn)?|`ByU?(rT$q1R- zfPBu9*bRM9&h)wxeo5yK?x@|P^O>HwGYu~SdFEMt%v9&0Z-4Xa?S|_PY4Uubz5Uwj zngd^MAG`fFonwS8ni`_i?q%~O@jIv6qWG#XyXRZt!&r}-_;@24*Vl@f7?=My`#G}4 zTlx><3eQ-GRy^wAe9p1TL&QW{40Ckz?X$T(lxKSN^m@Xd-D9XCIa5AN3{A#J_A$47 z`D@RbVDH(gI<|at)j3o1c*)QlLOL=PfG@AaMV#NJriA+^ivpRL=RC?U_>Q+->7#AC zFj75Nqx8fWUYGRx(#aOo6AtHi&h^cbo)M^#pl{?OGklqI1dhHe z9?qUUYu$mDZumg{kxp(mXrOro;uzn}Lz&p&%}aX;E2Lx^3vpFn!C?RwyNfb06LuM= zbcJ#pWd!30w*s(g;N;_YmxjEfuF_fe&LW7F{ixr_2%c@4bPdaE^BnaEZEQ<^l(9^Y zd8CCld=p2%b@`NqK0L>hSm+Xii`#7~t28B?H#4Ofbf+yX}2H=|di;cvEQ?kG@Oj6Qc$TR4>yMZ7fl)^v3Nz058VhSPY92cjqyZF9gR9G z4;}V6E>`A`O$5U`&NPw4ejS9w%XH|Mb?sL@mPqk73%IE5%p0$)+_J0a&>1QMf-5uf#1`_z_$+~Z$^r{r+W#m!xWoy;kDZ23_RqG3CitOG z95``sDGNM!iGw#i&t^YX9}RuRB7B0MvHgHvVsYcqqwPQa<8Qa0Kl*6<-3u>x63BR0 zW0mB0A(25AUcfOe?Jjbvp65jubR5AWO#naX%KO(a<99lsP?-M!15OxuARA=#15Q6B zPkp3M)Rljomo$d>y9JuO@pbn-o4PVA8Yc43zw&0={kxyGZ+-2H?beUosEORg_Rd>x z`HIACcib+&9g>U6kCv}ZHJ#ns4sTb~#P$Hs1L_syJ{Q@6Sb#oO#=f9ETT_a3USOAd zuNNMSw{`0-`f2Z52(@@ldtLfY2;Ek2u^kA3eCY|?kjd|bYhV=^yYlv&C(g)nyO&P6 z)fzpx?aB)XCf5?kgUN-<9RZr0YlF{PNsK4^ob6?&&!0-xM^}42VYk#Psl>G1de#je z)cL8}5VE|{DEzXsCed}Zqv+6&u_B8JXyBJk3ew=jpGZufbvr93SbyV4AvQ8`q05y7 zj{RP|z`M(&9J!R2U+QWJiwVBxfoyOn<9ik?(!7Bwd;+&+MK5B?IJgZgy1|2h9ABkS zo<$Q^qjhVY1Pv!hh(c&8{m?-O{Nn7DAbdPa8AcIMcFDs9>wd0cSV$9n=N-}s9zc&r zIP#Yvu*75eNEo~_UNyE-G1`ZpSAOPQv<8?X`q*$ zqg>SX0z+?dg12`xXp1?F4;J=j$7a>G;NshKLhym76H_+JopmS=mAtuLXTOOER+BzS zuj3ba#2G9|N2gJ~nGSs67oJG0-Kv@lMK9<%COoCbkaEk@7_ov!W9jtJP?o_kdL}J;I|aamtqU3f!tFmXL`)G*;I?r7&pr zyhl58fZsix$Nty!k_Fw#F=TK7kPVpoEfUN&H6pf!B!A=E!so&Uuod*u^ z3e&;`tv2(zTmk4mp8N}owo>h-^l)1tiH^ce+o>@cn`*p{pP)q+SAKG30UF>eCO9&_ zhnEN(SdcLB9H9|!&$=8NWyb0U)nX6MM_{xT*?C*aY&E( z@L9&tj<{8h@1XCwB;-u|*}+xlU)Ro6CD99%=wkG%xQHW6#imDvHvUq89{j8E6+bc* zR9KM#SSJbF#sWfkn_K&R`x1)?UYVz^MS9cTS)5D_qZL0dd`abs*i# z_|u<1+Wz4me$mcp<-N7dV9R76k}J-jiZ>{<{lcNcfa{KI)Y6Lw(>8qxl-JEMpd)`i z`KNBu@za?zKoAqa8+{Eg21V$wKV@Dlts8vABfhi0M3X#p*?aei~`VG(4hL;;}sZ#xl}@BY}KQ 
zEF<1JK1-dGXSXqYp$j}bC+po>9^QX_W7oU5+-TU$Tb?!Ut-TCU? z(n_1a2w}(h{1tuML9+Jgwih*9^Hzz+6efv0LqVOVT``el4CNpIX9CH=q*td$_if51 ze~{_cbGy`>Qwn1fl;k|vhwVUnOq~^%5rGXo-wNjruDS!e^YpDfOzZ^&uli&kcd3s) zNiKiRV^jnK$1cXIu}{Mv+tL@nuk*^xqhz5CS3l|_cJU)|Ch^cEoyq+k%~7P~G$99{ z!*8A%&f}>A>Zon7U{TcD9%*O@=$gX-{bc+N4tDO}CN$0ESGC_#-&* zAx%iVV^8=Hb8z*C^Y}HtSZ$YqY*sTfh{L!SIWl8}b|86z^XeQ<%K?EKpr7y?VI%)_yqCD^aKdA>A+nbEX)M6k+Z*zWC0( zXm>>m-yGn5m<@Jm7fCuEhJG2~!$e$>+J=544=mu(bBYT?DB6LzqWcN=9n>ItgBAeS z)!<3FcyS*)gQy2BHU4v2Ks^7{uiEaJciW+Dt=iQo9{JIa{Wc8WC#p_{x(u+TYz?C5 zO6Q>y!VBEJ;tei7w>2nE=(f>;IXoeeEfo7UqHYfz6hA=EK|Y-}c8-1u3=!Zh2MBPH zMF?HN<~Hg={e)Ni3mjPRQy%=pfrE$685|>TEb*4HhJ*F8(2bQkYA=pSZKl{^ubA< zF$|n_dWQ$N!6A)(V+bDc*dD-1*)k2kxlHzV--&}KzT9&2@E0Fyx7>KNojG&d6REF! z>7n-2GtadP`hEhtn#%w`SF_Fm%aA&mWflhc5oR7My3KguX?)SiDFLn4g`gA@a_d}W zXPY4fhc-pO*x1lp12yZdjRDb#wsfnQg@oULq!J#2My?VLc9HrJKD`R-_kB}p4K>2lWr?z**lv2b2~4)^zwpba~lx%zB3V| zE@>Z(>G+Ry{D^G{)FldYC1aduj^)^6cXh+NdkUqYfTw|#m|u5aIwO`J2aDS9UljC1?c9ynhMMyPtJc2oMzV=!e* zm>F{zD_HOm&=nb!!<#@HKbKqZN*?|${*e!TLU>>=euWAcN42H80ug-b2fLy}<_YA{ z0UvH??y^gJP#!oq1k%Ba4B*(pgVx}45Rj96U46(RsBWL)S&{OF(M#!`oT8P6At*xK zz$Oe_R7L};GP{5rpdOZ-0uN2aqfubM57>u==rrObJ;qDD!!z*k349rc*Ao68l8?;L zuH#KbkMQ_ngpKs!iMwdP38te1+l_R=)R$mN52zuTV#^9ruLgKKsUM2q;zr9hfuDUT zg6IP?x<>SeSlz<_jX-k0e=?ANh3YU)_4-COmozu^p66}d!nF6m_5K#b60yi zV9kB@Jd$u(mvZ^2%&FtY+snUuyzQQSzg>S&J1QD{|3>fRe)LyQXdwc=B={D-l4|fnD`)P)Ja0~28axRp~WV& z0zGyYDSn>YN$_VaW*Nv@mIDiX=3CZ}x`kix$&Y4_@X`MxozvZ+@h`B@Mqc#5*lU#m z2QPIWIq^}-!;|vxB_83S1s3=cCvu2syJNs!;KUboav$?zsQ}-Zf=Lwk(Pu16I(l%D z%1L?VP(CaAv4xHpm4;8`06}iT2oJ1c{Id#K<|&Piq~W!c1$=3Ha7Nm~n}4HM$t*?I z;R9RpTZd;YIxcmSd~{w%18nrwtXYnr&} zR#VkAZ@hWnzWds}ciq{3`|R`Px8#+)by!rt*mj!LHWwX2s|xdE(L5$#Xy+Kb&<2;- z(4mb8B%)$OIr-A1V9ND=Q4>HOBsU+ukb}5BX2#WVwNLt;Og26M06+jqL_t)C7D#$I zpdRj3-(o>4-yYZ1hkyL_6S@j0xsB#hd;R5C^#Qa??Z%sL=6+kD7Eh1y z=phy8M`zCLwu71wQt#{*^E|}Zm#IH)r-~tC`#?wPYv=q0kNG}#4Sg6Z*<|yC*SqOD zx0UNC>Y@6TthH~7e-)8-z;O!pjbP+tjrC`vk#i0Yp>v)iYJVG0$axLPXK?US(ZhY 
z5gyuIQ-2)hga+@KMK7~R9e5iZroB-RzqxfbadC5DWrjVDyw2;S;eD6=~0yEh(PX% zlA#(NCoV3-Ht{C{@a7QOE*#EXk+)q@oyzSj*u*$h z3vxERuIQGPOPgARORl<;hFNUU#8Br!J}Spm8$O)&_%qM94^EzL|KdB}(N*>Bc0sRx zee?C#^g#9bcFUdjY7(3&9Onp~6Q!JL-S+bh3MSk8bP)4qA4#gT`w?$3#9w+K8@o79 zb50N*c5iC;eXl-f2Q3e8bKXt=!-cf1*%x>d?$s*r3p^-SRJb#;pgpL}57ZM2k#AY> zu{ie}r5S^$7e2s;ew;h@ZSQxR+>wv>2)3ma7`1yrw<)8m^^yX#1t!m&)2R=}ES_n| zSO#zAKj6SuVq{}o$v{;WO7uf+f8N%uc<|uG00e9zuatGfUS5|=Kt6tAf(<0!QksUkPe(5_v=avZ(w4w&n-#dAs-va!>`8T>J#Pr>bsFy=iQRM zm)n}qjk$&5eX`HWGGD_J`e)&?jXx#7UQ?`jmh#aN8)<*CQZyW8n-X(~@C-zA$()+X zI{j5_#G9rz;mxEU3ag4JZlR4+VG-R1b|0FI14A30AsXcchKwa%%9e5T2~G0gEqRP( z>&gQQFY?O}p5c}Jj7Oy|1NNk<=bD;R)f8wC6JN2Yp#WaOWb|-@AzuMWuH)ndpmPvP zFBTE~kOJt?$8mFl-a2S9PXT!b4mH`feeg!Tfk|(sP-E2|L+OGVYSL{-k+>9St4 zx68o7;>OY$Y{~}{AN0@wl{ZPy)-o`;!y!}wUxbD{>C{``iL1Rb5SDJ>SExac!7{wV zcjOnCdZd9aVT1?2V8|tI#FFSvetNxu7@0RDI67V&K%y zXFnnKekExXO*%QFAM4|O9Z1Qqx1$t|s3tm08tsBAc90&+0w+di%3@P&goXfLq4hvw z>Fbouab&I7CMNK)>wAn$a}Dp4yP%h$V{ljD$;~nL!fV!9 zl(WHxkNKIhQbkvKjeKmU^cZ1Ii;T$+rtCYguibU~t?glbq4~TX0KO<9QPg;#S9t&4 zr#{(k{@G3K^>^NdO1i`cB@YdsM|I?g`(9#tW?he6yOB+5$6hF?PhEH2Q9n%1>uDK#_v!f&$_Qx>w0j;Vru@nk zo(ZWpC|c-bZ3AUo}tx(*zD z(ihENhO?Pgo0nNs;Ef8*QREiWU?;W^h)bU+sDC6LTF>DGJ zZ!wqJv>QHPNLyuG-@ku<+ZMka7EnrW_60DqIZvnm;^$h%mhF`PvjU2yvLKu~+k#6u zxPT)*kdy&WdKrgZ;Gr9Njp-3DxZ$xbKbHN6q$8U+4J5G0qk@wrraXF%xJeVkYotqA z=th3PMmS{&p_8`*nqxqcrT?qK8tlU91s>}{1+44fOPzE-=h=Pf3kdnZFsA41{zC%2 zWN)ewGni?bsTbD?t_P^M^y=2_gNM}t_Gwo}<-o<0L^WMa-n5nT{#$Rf=YRW;ZQtdy z>hN3o3g}Mz_ka19`k36a>VT}1)lSuEScD|tlkzbWi$HNjO$d=i$L>;n>xS!t=Pz`L zb#j<>fgXzL?8L>pPTb_jx=9-uVWA0(iCAE117n^6guyQ`C-f8E$S1&$bod(Bf8fi2 z5dOghf25ym62Jkp2 z8)-;?SXfscl4Y4?r%+6%@(!cTPq2kh_lE^6xXT!{C9He|d~QGBEqxalF#$f|Rgn6H zjuQ`c7o6zNi7fR7PUyi6%~%KUN(gT9#Nki6s*XA;c=R9XMt;zISO7OXf;aL47Wqqf z@UJB=_>5ncft@rb9awBS%U2r~kiYUo;T64XH8RpLx_FTreMp{Ax)QIp@zhgct^O9x zJQhT6u#gej(asUadUck8C#Gzq1C2>6eOXhj_RVj6wH>|TdQEsvdb1^3Gnu(jUuXW* zgP&~gy!*Z<&Ct$#$<6H-9n*J#gEt;&dz^y;SHfx6joTArBGk~=p}pc?@++^l 
z|L=R>Yk#b(Id|N7o3D<1@cyxOPFE!EzV`u764@0NTgj=qz<&B6yVs1X2e@a7Qo)hG zA({A#Nh-VMz%Od|nV4q)*d)OYCZQ}oIAhn|JQHEDQXXRP(0|ff^qg}m(2ntc#-q7i z2Ln8jSS~!Ud7pHx??2QP>%ze!6(WlXJ~rahoXfD4a_rz-Q*C=gi;nB;3v4Ce1L!!w zXX6ul)4pm2Ul(`ov+0dZ3De1C+0zLo{J!jj$S~>p3uXWKzWWW z5(p+#&@(1J%k_`t?6!k2tP3;xJ+q)&RJ9eFOx$x}Y^ z4o=`B{Sq%Yv1wU8@+Td<_$}qcgi-H;)UWv#!d1FTdm~N1)VFM^Ggs$Qg*~GvZ9=Hk zI#%0A@tjN0i$4#a=}4v>>Q^$jglZerNYUAP_$maKVYO4OuTXF6;b|5SOxSsrp_Uw_ zhe@Q~6>Yt|_s;9>g{L3WO-~o}_K!X7{o^Ovzy1CX+RHD$B#Slh33>0ny1(zzMZNhz zAssk_MXYn-@Z-gh=)GHvZ@Pc5*h#Cdfw~U})EAwN!G+GAU>kd8Oan@OtkZ%qI2RFm zrW^?T8E8g0G#uy5n$RVNK0Lr%;s$TYKl$j6yet%=XZSL>GB}b4 zZ(aB(xs*rNNI&vRpNib2J)?}^1}8M(9XLMB31yU@@^!G0N78ca~LmOD}F}EAMo6Vg&fNZ^2l0~6I@r}5XUN1$ARdf`pEh{WzQ#(RP!mh_QB793 zYANx?G7nF*1n7ecQ_hLE5Th6T1BYky4=lb)KKRK)vkWO4QdEFkPHwfv&pBMS&4p)ZxW2aQlvVGxmpKXsl{zQBGy%P{y7W^$k zzEBIXYp`#f9wW?Y(Z(0(1B?T`6#2mWq^sOwD{X+2prFnal7=pJufl>_ebq1gU_rt8 zjZHGO1^R}sNa^apW!=UEj7c_&P|8ZjQjD>M+f~k=JJa5J@4fbyfBl2@t#5q2ed2-p zv?RIQ&gz+o7oK~j-E;4K?Yg7aYeB-5Ue0qxj}JH)pIE1Gbzw_yZ-TEE3cNU^ny2M5 z4!iEe!3zydD!Cd_w-Sk~&d=_aEF5I`wDu`+F$N2`ryKW_Ny$Oa~ zZZ85K{eh452-yc8d|;p9qwvjm1n&O*Y;j@M8xucerTcJ;5IwWVSpni{3r{L zF$O-(bSQ|j*6N#Z~`NJtq}a+CJ+8P80Eo9d3XZ{mh!-t z@sf^mXmDf#I3wPe9&t!V-V!J9;1jRI0Y>AaLxjfh8efmRTs#bnl!ZQcG@fEpgLmMG zk-bV&c5}(Cb|>P8+R5+$NtptjnXhU_S3j4)0K%nJJ@>Hx;89(o)oNH*5lkUPeEAhS zMO(*(cI?ep^bqxL)QB%=^}MIO_4eEC-+lkD_2Hh^d`Xm@Fm_Ptb$S{!y3XjHm33~r z8tqEj0~iCDbhAG+FsG`^b_U97*O^3u$B|AKUdW8kC?~{!(T4#87zaC4_-w>;MnB+( zzx6WyDlRw)kwZT4z{a*arok;euI3xsloJOA9Xe$oLSAHsXLJr;>;^A9!Ce>iO`YoOE!AIZ__J z#7o>UkL;1w$odH^`Oqx+Q9mi0aHU60iVK1dpc`>xb9A0)hXgcT{mHiAM~|QbC!{>^ zrT+sT=}Cv4@{&8lmSy2VJeG}c(xVJjvg%M{gvN_2I(hTfI9~q19xnGBGihU&kCQX^ zexKD>iqDl25+-t#7oC-Eg>_(1e$7A5f+u1x;#X^X1Fub-UAT?Vh{tY#*FD zr~Ccuq0PvsNgrNWKR!gK;w!A)*VgHd+>%{!($=QBQJJ*nH~hyUT)4u+g=IIBN}YbB zb%JYFVdK^G2hqRD_gx)WfJt_}!U zrC_Tbv>1^Qb=X3|^hX%|6qf=^NWCVX7@q_WN!+i`d6htW#D07l{=g0$I^_N|PTVue zMpvH6+Q*l=ozB%#Dou8}&&W@jcTkDe%jaAzTJ-R`AMREe3k!c3P&V*=3gm*DK7u~T 
zPkW0EiL(I04)iBb2Al(Z@i%D}JN3YO;bhU_+nD@JOTob>a3uj8_=X3#_zD^0K{+~N z!$q~LEFO@LJwg|Zg>9WHu_^jmn0}UZNBOjf2+|mf?@E9{39HVW!oZVHi~%SVXU_^? zT(Bwz_oKonC+X2iAH{?6S%;63wpzZVi!9=F*5Hg~;SyNo)7es54Feo_1b4}A#-3AE zM^!K2hX#!BnX!|S{IPDw^gu_AEQ-S5s+dVWI}^ynAMpvVup`ZxvSLUFeRUEce@S>e zIK6%7sNTX2mK6+FQroYnvMM z*pcC(u#3C8+$Rax;*MG)hguvRKTJUfcPN+e&HE?m*^=6KBmDRVJTa}jnsd@;Q$98=R_5n#Nj*U z!$;%_v)-$?tM*5y#Nm+`{?{rC-^AbrcNvFYVr$WPgFD($Wg6ks-90JOByLN)?)x=i zyY05y+g-Z-8ul}(qhy%HYScQ zJl(B^6&6EX7q6A#Vm;yZuIPO5B3GtO@T6zL(h{1|NwM&~*JK&|l(5TsIS9{FU*Zau z{DHrtf5`$*=LHGjxjv@TOla9z|2ut4;^gU5?TfmCv#l!&+`InD3(vOObk*mkTW>ZW zCZ$T)6jbJ$Ml49!p}urk4=1bMkjL#xzP(s!L28d&Q%4 zMFx0$-I{>C@C66^6H{h?kVq&8S8an>)nBERUrnPl@Ubks*2g#+nz@uTa^V3#Limkw zWTl+49F)c3q?fql|EVD|M;@UKpWpybLm@rtGvb;T2oAEd?=aiHUAMx=F{MAKkwYAQ zz>@}^1_B)4R+#W>Y80JIFZ@A=td2=1;?k+ss_wwmsjyqE zp#KR)*GD(?4TuAWd4(&tFsW3HRr?`bBX;tE9-3yrtwo5K*{MaeXR8Je zu6A6waAkUX1EucB@K8WrWUi~vB@7y1V}PDcJMu}nab3?QoM?eB+bA@!9?+G+XB|Eo z2yzdyzyl8Qb4fTnK#9JQnSAKU2NoMB!zKcKxNw|xnv+$@sPU1a9h&!m9S-x0 zzx0K6|NRfN=U;faz4*%Oo-p`U5=oskD_Q865c^l_4T_lx9ho%XYINoYot=hJ5^j6d z;tM2#9%gQV%rRKBEYrw#`W)i~GT8P07vKF8-HU#xojHHP6EDsK+)6=kixeS~J3iv| zN$srPe(SC6rB`0jV8DX3#sx6f1?*n6ea0%il81h_hjiK4%gw&ah$x|l2a|YI;JX*Z z{VqH6ImE7G@nGCXes=bmfRlG$Kws&^!9<)1I)VOC3m0-4JD6h#M`Yq%3k>W< z4hL{HP_g0E$y4%s-EKtdU|zyvgfgyhX5mAZmgZ`DYER;gUc5Fho zGbDKmS!LC2B+zn9mnplF@E{}(Ik@3TdXyO)V&I{T4B*fv04D@5aN?1M^2nhqd1CMq zQ?}$E84L#gGIMVnB^p?Hl}^d-`qcky{%Qw^-BcQlgg7!On->nHW9g=opRuGHCqe`g zqyRecizsmYppZJ0Zx^Dm4RrwCwSx6>u^N~5Nc>2H2Bi%q@x8GzaMDrunk4T!woGKH z)A9vrz2mN3(*3Yo2M@LF1Gm`U8`_%LkWb`&H1vW%>vFx(L%M1=>#Y}`Zf`yJL_4^* z-ZTEp^RKqQedNcwH&P$d(d|!M%4I_A6KA(Ec5rz#byIbw5^NAY^+|9n7(SzFP?HC> zclA|jb_vzht*7hF`26sX?>HGU@qiOQ#k1<9WY7^K3oO!O9$b#nugatI$Ol>iwCl!k%>0_C*T^hffgPuUhc z)2Xa@t2t9gv;Arr};WiImHPG1x@g{(uy7wh|;ZqWipes69 zPTlb9Qg=;wJ5;^KmJTdB;ZR>5d{qE1*Pr4M-!cpX4&U+%Co7wXZFLSbofqQ1?T=O` zJ5gu|@f#rIxK5=9Hj+ST{=LO2)p}%Uk8HaE@0Qe#FwL55#`O<_`jfbYF?2j|ueKZZER*Z$HD7 zptc|pI@CUNgZ+m35Z~C_r~~VFRjN 
zIj_E_-CmtPxs8`|iLbhIRZ9#vbgtx0f!OH{3W+# zw;n8pKSy-S0)aX@efqS`;EhGdq8&0;Fy2!RU+|-A&I9;QEWM)-?~A%cYL8Zt&=jAV zxR-6nrmf(Ex*9*-fSW3s4p<*h_x8oykI} zd57YJ1K@~bImQrYkFaGLo*Y?iLxY0?FW&l%T zju|&L0ST|s4z%G|YBJKzy2E7-g;OUQ(zIXP!32hWG4|z|s{BYj7e3Oj4(Q{D6=fqQ z!RQkkpy#+&sCJ}LO(!-_bOyxmN=y;yk>6NGIdXx~j_^mNXWfBO_EBNUH~6uO{Kz|F zf>|R&HdBtS_Mc9ke3e#E5u%(9ukO?qP)HBNj*2k}Jap)~_N9lv*}n6wueP0&$J!gO zztB!-$B?yF-2<=P<8$2WU+0lbG|ryYwCCbMZ&uuK+db_wpZsFG@w(r&8*aX-J@Leo z?Y;NkM|&UTFC}}3|0*j%ix!7HjNMSL3(yl1gIAD(;S|t`2$#QRVvb}c6xpI$hpelQ?({(BPQU`G5ioKR> z?WyOU^Q~gv`1;q{O*iOvU`>qQ)k5N;HoxxD)vp73Zh%R&C)nait?{i27JckAvuV6n zlWHb~wEunjrU&OTWOB~h((S?eLX%~4jzPaoO@;x7PJmXOHSM>-U`OGJacGiP(?8q3+1}wOd4L-LTAs3v$ zSwwNA1A273q{RpA1Q;~PB<6^(NLQ#iBi{_6?t#xT(YJ2ff(QPM{>YjZ5ipt=(s(=X zMKt3SejFGy@&w{4qu&Z#5m13au>_v|~YXCQ9SE(QaYI;iw! zk|b@*9^td|a8aHA^=F@I$6kG|9nrU5u59eI$A9}``_WJTp`F&M7|&kTZBBk4w?t}8 zG`+wM_yj$#4)`twq~~C0_YMAbZ&Wc@BWBt&luUk8H^`(jmTY4D0G9CJt1#7xbM=$5 z=mpNqV|_aeIkLf>@hPc#c1mJXX1%>~ElI!&XSOad;L#(n#1nk-%xc>Xo|)@s^jByp z18>GHn~Mg0%5lLbUJ2+vz~DnR{1T5eBdqY@Iol5Ih*OQKJeT|LjM{wh0PS4rk{^`ztML*$yZ{3t$wWMnyy6^=;dgKLtUinA)(oSHhKNi;L zL*qov@^;xBNl6U~4TLq^~B|>49IpbK3n01$g!&yL2F2;r0V2 zxLP<}(LKUi&E=zE6tf}2uI5)Bez^V5|I7c{F72FZKm4md*IRRTbmdfasl~{qZXejM z2dlZ=gaxW?u>$J9b0(v7$&$uv#?_nI5K9}@O0$$vqla`umXd!W5J9^{w@)L4!xRlg9moJ7hSv{}U#9qE4fXcNh(p3pv z&6Q1_zGzHfqN=RuI5X?d`to)n?UL(um@VDX#4R%Ol|u11!Km?vq{SOoViT{J-OFw9Ae<$fKRaP8n^sv{LULVywIzBpl6YEbp;qaSZMCkRj288!%n9nnze0%jzB%} zF~lv6Z(CfNfdNKVz!(UCRYQ+)!Eht$0!WVmUUMj3wO552t}<7E?JRgsV7jNwKA@_s zQ~JM!kq??u3r`L zi?7rObwWhPv2(Gb2QLnIv4Ggvqn(a@daHuo!sHi(Y4FNx5G_}DTzwEpyLjegd-=Ji z+VMAD(JNflp)Or$Pd@Y8_QSvbr4|sp$`0PFl)LX3u;WLS&UK2KrLj`>1!Y@YtL}OH zlY?6b*l{6%mjNEP`3svasUK+2l@7~8Yo2z<4}|!Uz7+9Q54H0#mzR%{my~W-b=v4v zvSyvZ=ZIx#mn9qBv6aQeDh(4|N$~ju<-^y!x*wuT;@}W3L*aB??2nlzxaj5fuk^?# zyulgyj&=Ya%aV7!P&f8f$)5Y7Ir1~nhDTx->IWT)ClKv4bzcFGennGur~k#C&<7t} zyq*3B9x!xEdqol<{FiBP3m-n5aQbap;uSfeu{`NYSf&GWVPz(nv# zJ(dje1iX=5eB`bR@fBsECk8)<#||bi3gLs#z#H2^{9QOTHj)ls=tD>PqXPA_$;55o 
z!HL)>ZH-Y>*X2#!1HGj-!T7SPhf#Ocjx}jwXy9P%zVEKP+Mj;sJMD%e2ih~QJfkn? zp7l;DUo76aa9MAg+t=>A^Y-?>CdS7KqfF*-zl;ug*v1-}o{9As;`M zZ&OnEk&&|CPFUQUqejZ@T>H27wL^Li;5*;`MmwyBaT%|7G>P-Q*u9RKtZXuoT?rnI zG_)I>$xJyHz>4YuATNP?Tq`;-rgovNw`Y>qJKKU&q^?u!qBJ`+YTK*yvd#|&fl8f4EPtd`1B@< zIk>vZAX<4%cwp*gV3ZOv8xAl|FhXIuQc_ms;S>H*cop9LIkOs4ksVNQE1pkC(T8}X zAoytT@ zdg-6w73*_SbsXF=u9}_O9#ryc@6rP0;fKH6exQW}E!eknbR88bME7bR@j==v7*$iGRzq+{ zXxc=|%FgZ+x8KMsn0L1&zzZ_%h+p2l(jNZW!|jvz-{*<*Dc!3*w(&`p71YmEe>Kir z*1gyde&PW?lz!^@S2X5pa;@{1js7wCjmztQ;lIlDKFLW#YBu}`t=w;EuSf;CQ>KWU* zQg%>R*6O2zy1RhqZ(vA3O{bkO=;|MMk5Fr59-MR0ivZ8L4M9_Fh5BZ)jGmld59--U z-grhDI1^sVm;iG=Ja|ZpT9uKe-EQgXi!3v5_@k~^S{`)pC*)j;KBR%?Kt|D)j-)9A z7ag#J+nLIj#Ty;4DKbby6W;+x4tPxDBa7Q6Lu1AClSKhIp<^M1eN6h<*e!d@pYdhr zC_`7^q<7BOt7_rJ)eg!yH=NOiE#KV8!VURb8W(rf--i}j4NAI*At=2u(6sE_a%c=S zh7o}mZqXI}%x9G!^x*-{hllkDkwXiF2M#2JckVilyz=pu4S={oaE* zfpSX|0}_Yy!w;y503D~bfOz3|&$P2|zt)Z((O0N1o@&2-;y3NlUp?7Qofe{YM{1HN zd z6dJ-bs!UM@Dt;!q^DS@Nzc-%egelIesfENTHoRo9LMSy8sF{1pjHg$ zs?Un6?OAUXxk|)ywOR_FNm<4f%BKgm@Ge>4%yv(0$Hw|+p_|a*D|)vr z>cI9g&Q%*B%ZZGDJdeG?i+_PdkK$$2?w7JFHdV>|3tjjGY88|zRkWk2$sYX%yY_3G zYWr_!`)~VPyYBW+Nf%AfUU;-^o`1LP-G8v1Ij*lOpM0;~eaG$XUcCwD-0=_E`zJnV z7tfz=H{Ezd{br{f&=rXNXL)PNZhP(ZSKDKcJ?@=bUa80n7;ig7hbRp(LcT*?=*GBfolOeqz02$tU{o(PdX+Q zK(XsXbqgZ-9y`;2Qa9dcW9-*KKRK$M+wcDIAL*?>`*b_fj=qCYZ_Yud3e;z{BIkSk zBBko*l5XQV)IR?Qf6!ij^^JB`-$n3A2KugFXXI!EVNddCp0w4U55-REDoJ6HQfS^5FKq6OF*#e%5cR$U9G z{)Er`%_Z?>^3TTQyC+ZUR^PvGJ9;hX7ysb1dP|lj*=mFD=!zH5rrdVNZEdf1sLfwG z@6oxM@tJx;{i{y1X_s?NEjE}8+h+5kpVwGd zwzF_T_jwFLp7j@g#ueLEid36d8T8E4n3!`U#HUQ?+0iFt(G#74xeg^fyk}W;1+Er8 z+&ZQOt9*|7$f?Bv3jpxA4X)m(MV+A!e4!;x4F4R+fF6Fp@D1${-tj5vY%bAq&*(}U zcUxcs2Yhj&&S6#25LQK{vRF;A!lQ<(gJUqEDvG!Ugo;TQZ4@D{$t1Wm&|2XJ596y- zlp~9K(dbuYfqZlvULS^u4g}=$=X`+*Cb%b5$Q5q;y$8KF-_( zih75SEMxN{y9sJrowJ!1Uze1QV}tBqH{9}N$w79RRShI~Or#0%b>J3{Uh0T63{My| zaNxhrNO0NCAIJ)hx>?f>C%#hsV}q?Atwu$|y22G0?g!Q2v#B>Za7p{zj$SCYc~oPr z-jS~FL2xNmTPg1NIAM?8_Mk65NFVJWoH^BAeeO4H=e-y8%*0kZv2(Wl`ZrItNA*g( 
z)2GksM9&I8_F@al^}3VYmpxdZmF&;}PJ?G~B+tKn;sH{)!sl{xcNfMeUA)2fWL)jG zI+n$_TST z>8Rqu6B}e!ak;?S%le=lkf0Wfxjnba0X2OaKg5 ziVvQao|@4Q)ko3CQLsJVW@YAh2aoTyZhmp81#=mE+0fBYAh)z{J^aP?z`b|1 z%Ueu(F6vgMbMmX&ugcLNn`MZK?iP}q`tt6@b9$@OgY73j{!u&k_BqjrzX^&0eM(n> z1)`l;ReqfBiWg(Sx}=W8+oMF)X$lwiG(%p&s(G^9qrG;GZx;@Z6xqBl!ujLJ z9&0DGGyjc;zSxfF*d5*s`Nqx5af}?^AY3Zn{v;4I=*EnUFS!1tmjI&YQY21;RQeYS$@l+06yH(hRu}i=ykn^ zboGm?0DE<7oHx~E4Q|CYa7bU#TfA6AAiLghrRQZNugdASEC{)~L04kr2kJ9r_?Hm*<4Oc=1zNWmwT~?fA)`of-52H>(N9hc2$(|&Bp)zwaNt*E zBVN&`%m}70c=!}uN)xMjb)FMsAXt_KXBnd(xRai^rSm0r8AO~cO$-dfsv!fc36VND zbRd=bolno0wig$p4!Jst++s8%1E(^_;0hA}{d0gvz#Tp5vKd}F!OQl842h8G%D^~L zQNEN1s&N&8w>Avm;#F5oeA0kG1K>q9!V9{@ySuAZx&|6-g^YQP^+0jC^nxB>WRkRV z>UewW)o0uL$KGg%bx-FhJvja7Zys;I(tvnQCs^B|6KL63_KQ=`(`GZN@sx-nm(4zz z^MUbZC_yJ4zhwL>(R z7q)mz)h`$}b)if=XooeJzyyI%5zlQLQP5cAgi~m|iGu)-Zn{`>pBf-4u_FlwcLQdv zJ(#Ns8N?DOEm`#ooK#Id#ZEi9X@``dGyL&0103yuDK}eqA3P z`0En7i~PfcfW<5NqlaJ>9PCy zJ&S-n?e2R%)z0dTOwZ_+N^k7MK)lXlMJ5J8#u%sZaX# zobk!;ed<$wrt0aZpJ~@0y{=u*{pNhI4cq_Vv%jy~IDf8hNxUx~;fxy7W2=nnW0E{4 zyYg4Ni)A=`s0GOF=pN(qsh`3J07nrdf(QmGN*ycXcE`kZ({Bd)lkfg<+tS0U8vH$B zvTd*ewewSQN{=w$v?E@o@v)n3YM*-WLER4a3oXc0U(#pxB2w|W6kEmVQ zUt_fb@iyt_PGxZ-vT3&jv3$(B0T7p6t_*;V=M3?`C~Y`#grWEXus zkMKB-7uSTz_iM^JMefBnZ|MXlcw=vkqw=vP#*7XaXg_v#l=BKzVgfWgOJKMryw*{D z=|W#{SRJw1)gocUjgMjK}#-#x2vnv$;Bu0)9Cv_`1=jV2g9@Wt&WNWEYjfng} zC6mC;&U_@40gez4#i}hCw7so&zq_M>P%_yaGQM=4WK#q$#U60TXyD;2Ujq#^T)FF6f*F1@y+mwZw7g$ge zsvQA_I`S#8pJV|2W`tMGS-RmhAeC+Ot(mt{wvWs8I$?GHdFM|0(jF`aBn2->oG!l^ zh!(;XDHz4j1qM{Um)VD+=MOz*KZ78xiMrW{A#dAM1`PqYH)J@e%5Lh6T`?Bd*zH9& zNcHdbVD7$LWfrbCU0Wt$kpv48_@WjntL@mj_3_nQ~$wd zX*e~k=XOGYC%^@>4|Z>C`wpCJn@9evef3}eZ|&Ysf4rS` z`_lu48oz|8jSao3Rl9VTFYRaygU4RInp%?&jdOeS+SYg8IO%cjGk^BEc3xkje(kka zb)r3~mqJ`>ufP6Q+hJm$-CF1w8@NLCwLkn)d+(i-?T3&2M0&qNkG^6Gf?s=vGO$6tN8-Kwuy@6&_er%xVluV^xRMk^1+>n%z8a zDpAHFfc{R~q0Z0?zY+3Kbm=7H6`*14#n1Gyacl!)pKh%N2fqg&P1%i4fFE@N9`Xpt zBu`8|q%KNcEqqE=ooCTEH0cBI3LKt1gAjW3;jN~;&5aimzzba9;IK&Wo1--GPZ>He 
z-bSCX9YKqKh&f+e&;oGOC%my0z2HNd19=?qr3@Xw_n~&ImrGOzBjO-n1Q?JG9GJhH zD8h9ibSVoh3J}Pr65;9HGRNi2k*>OULBeOrKlpQu-fAwA7n+pChLStWBOm=ptFGMn zB0m*G*+o69L>Tp;u`}7=B!$0lG&CGsg6u=R0HwN;f zjuRi?)LoZZ-kK*Np-C*|9VYIb?~1%?D#oMH5-!uw`B0K}UA$|h&cML3Nx5R{uK}6~ zy64zx9%pn<@vF~0-cIWlru};_>A~qU?dQLGtUdOd-|F644aORRpP>O;R`;n723?9_91V{WT;720;sMzui2UIi@584zNb0}Ui+BE2nZRD#S!L9m( zPwa$>_6HY!yogmhp}sjGVzYP072D6|!NR(Hl{N@DeC$(a5N+2 zRr`5{5yN*<`sWQ!`nh`b3bap5Pd;^-+5QXZJfs}AjAR7;z=d1L ztv-mWI^_A`FiH*C#M&me{Oo?o%eYI;4+MA_p~3R3QP?g zkOTWkyRxt^@fST%r3{1{{5~6%lA^yJ`L*65FX%w8u#ey&Y~W9xeR?{%vYk!!cC=5> zC|F)|{*rt!xnAXciB@O|JuHj?*MJX=R15i;@|%J?@!$g2NKU&A)?lx50H@)$OIg{UUh6fis+V<+i5gUb0=cVD{h4i+k zPV!kt`c@&b<2Q}``Sw7X^4aoro9=U^uVsWC-tw7FnmTI7;1iFRP8s@5g#O~)B~#VL zi@#~hk1)oqGQi9rB3Sroqi7~&KsOgy7IeaNd^6I|0eHYq!x<&&fj+qAah-I?w}9*L z`MqH}Y|fWqbDrjLy+p*YrF_)@b){!|EB1C45XIL}JvvKe8gHiSeAY>`{8lc;ReT~x zynf*xBzMPo*OiP*eOKtAUvO2i0`H-yJT?pSND)){c9f5uCyO*rY0T3QA=-fj_&Og7~E%qaO`9vcpTlb^8}lFakY z8N|~ZX?THS|NKyn_T{XN4T;W8l>58_w}V}_A+B(cHMQzQ_@n4bJ*8sBcC^LfNqS;Z zRvJ?(1v@6m1^AMeT?TIT$@|i+IU{@x)ad9j7&Adts@SHnBFJ4F^`N$oAj48COPE-h zvKxeq`S=!ID`;PDhoi{uEC#bTC_w(fp0&0%aBDOY>PL6Xbd;Ml%9e0;@~cy23h?ry zJt=FJPlbQ-TE${x;b$g8DoPUuP8wBF#vX*mZvC36qS_tzN`8W0{pO3iq|&XeRmIDx zo)Nh7TR8P5zm|h~Hm{Kt#K|uMvD+F*i_IA^6IMrPg#m*z(lpx{e5#ksev!_k!tal- z_J*!H%%$A^6VDy!tbvX$WJ)KFaTn638mKrCQZsDQ$ex(+OGD?K&Xf8n`qEP$Ho)RQ z31dHnUz`tm$&LJ#p6Ocl3^)XmXBuO_mTL8x^|r_wyC@SNop!D;V}ihR;L36egZJB{aW=j8r$`Ro77^aWH0Kzc#kp?6`_ZbN9uq-v<*&Y4{PaxdPo%>{6Z*wl5tXA;fQV&p0v>$w%K|UU&6#r zS?y=bqGio%6*U%w4BT6aKR%#@RD-c5#Ax6?DR)odGb33 z-}%nBr+44~fTeSEe0ZB}=Vsr5SimT6CX8P`t4%(7vg%*o_>~js5^mo&3B$=#yYNfC zIk~mVTc~eeyfnRVf_Tcs7MtXcuAZ9K7#wXefZS$sNxgZBpliw953tSiJ=zO9?dvSz zy-TO|HgRCgw~UT&qkVXC=@;Ta13s`>2SxIi{scJWDQyJB%t3^MU&GXy_tszgy{nf< zg5NTkyllY;vlzYq>9y(q<_OsDZnJNNkI?0t0PLZ-!O_F#FPzVjpw{E5D>|)hkhbMr zwQ)I^ItpJsF=d@;J0*XD=$g0Kbr`Gg2mW><^r1>(dxnwD&a-+>7(wd7Tz3*d8uLjv z(`xt07wwX^Bpq)CIc;x^$Gev$LT4JC9b;99I)mS5)0!WhKaInVba$*x?|vo~)cGWT 
zwld9sr!%j5&{8l!jl30K?TM&889w_4K|XY*6*h-4@Fq#z;+XSkd^_WPZrYK9uwC?L zcwge^Wxb{?d1|kTJeXcxo8D)y;vL#e%zG2+dkaWJRGqvYncJNz)kJhT8=~PiZQ<$2 z0W-`j`W+eA^cup9Xxfc$8MpM=8H%MXgOVu{a4KDeT2I1Pq&%T`|;oWeEK;9h%H{l+OSp{n-!}ME~Z4J8yg%FD54Imbx%O? zl<)=+27l+3{?EFBMY{l>Yw=GpK-i(U5MBxG0n36<{$$XRe5hH21P|DxE68Xj z9WI_@Ti{s84?-S8*G|XW_ChmJRNmOV6|eDDI!RC3;%_uI#3>U%X^ThlTX@#c$i<9W%Aas?;q==nX=g{Q`6IT2VTznm2jK5PhmVq0Uen0yS{nK)Pj+(+ zjLz@}h9EmfDL>??1G$2krp-scJ3aSD|0R16E^$H6=jmL~ePqAZvFYp2JO`dnrmG*n8kBTBSth`ma+WQ5RO!u|8|>r454{YI;TKiV zIc9UBZ!$>jJ~8W;qYO@t;diaGAKTht4~8f6*fFtmta3j8qmNyl9)0N2w9cCi-NG)w zi#mp@vKP5Y7u$~>s?Ab)mMv=~+c}d7n~8;?H|v+cjX3GlWiU2`9jq`IN_nBIKEaZo z-{)g{zKON3bN%KmbUk zeD?9Drjw69FcTM)P{59IUe z2|8mCl_%>-C#m$?ZxEO8&g>tj15I$^;)g_ma-2todyL9|- z+*zOg>ia*KZhraJ^sTRboo_FlU@ynsbc;Mwzjj#d=cp9Z9H9)RZYRv@MH*RYK>mnV zm$poOqwwShVZtr>LBS)-lHh?qqYeP4WnEol^#b~lk7^^Gu&p|I*@;~uvyFe zg-ac*&5DaZaf>rFtjlmb27YtXXY-}xZ`i8$>TQAW;*y4U={SklZ}95P!%}|hD&chV zG(L;h^qq&&bwD7`NBJy(aq_AhhPQl?g>b`K7ULSO`EETlzuK3RcHMT{mSm!WFw=Oc z4Dq;kqjm3H{4KA1i90I~6hr_6`!Wp6;fl`(f${*D+KxYnD`14pj69~F5oCz*r=mI-uB)_AP*B%6#+3@$SHn6_ z?>NfWsk7mZFMnd9%ue2)uCw9r?O(pcS8lFTu^yXl-oDM|!I!32UgLvJ%)(lnsRB~B z7Z2tSIR`W8^jYcSH2l`1&uXCftjeGZXv-z6`mTF*Sd}lbgEAU{j;CpyV7Bp*OMHZe zyR^$@1(KJ-V}U!)=?PZ)o~9yh3!C9hQw^p9Fdcc8{02t4%Hngm!l^$ngqu!D14DXo z`rAM9L2N4f0<dTv;^aYDTc7Hs`8G^{I`^5i zghA6Lo%*Tz&-$g)s0&>`2Ob;_7CJnYJYjJ%qgT|*$DjJqiK7Ts16E0__$yu*I|qX$ zXe{Yku9aJ-Zc1e3DZ4GXNDqB^_FgcZg0Q5m&VJdK{(sXPOMK5LO<|!-ZoYV7;@l%WU-#mlwIdny>mk`;Vt5ALj%nRQ%yJ zI`G@PS>OoI`}f&+xP6Z|5w~g3*-T3w?(NZWrVMyF$T)xe$aI&6euKe+D=gri&c1K( zD0|u()!#m|`bRg9P3OVK>TBvQ>IURm;q)H%3s1Yw)?J21urF`qFGS&zq)ioIFy(9UDMP z3GZ{f_wni5|L`l*U;o&>5l&zjYF-L0kJ?7nB`_NsNI9}?P(C2#cZ0V%Tec~Ifewn& zwv^NK);r?fqpjL!>DoCuq`yO*evHFCd~>nOa-i#c`(c~G;Z8bR?D=5XlJy$~LY`H} zSAE^(`40I7TJn^)TU-t%LVBMgZ++v(Hzu%<>4+l^;f5{ym2UIpF8fLBU)0RH`|O;N zugR^M!?ySe@%j|59epioqE5+l?Ea zV#nvEQ%r_sNs%-7;_@1$&-?yB*oaV~184i@h+ug{Zd$;NuNZEm2}=5OP^pyIjb2veTdV^m>9cl_JVo!mS*UVQ{Q~3afCb;A*!Gp}G#+u2 
z8XQ$V4L6^?i^p(b95i)Y<$ZPRg;9QW)4FOqF)kkjB1bRcBDB=&4~#e$LoFcHFhc%yL-i zGgEifI` z)Nu2=?5aAzS*BoKa9IIJzzVSj6sQn?@=h|i=FM>Ju`yu8evu*P+6 z3ZF*F2wd!&yE>Y~7$*dzlUOR2FJ-IcMK$BGK6`2vjS_kijm3%|Al<@2a56H(*_BT%$_k&>w&MzM0~$W*ALoT9Dx?#w1YTKXF21 zLwQ3Wap7bf)DB*|vaSlY^3lCuf|SmcGsuEZ+M#EBp_9aff3^k2A@!yTU0Z94WoUsZ1vy`5l&WoS=hgUr-G2LfUOQS{rxnPMb_@ zIEWCb$F8m%bt5AK0Q9rIzMJniyw1keE%IoUc5Ve*JVL#3KQqdk)6+*!OrJl(N9mw< zn&UlJ)_4nnYLC10#=*mp4c=w!Pd~r9GrhNQZ@SA0f|XtPJIa~<4Jq9FBTAO%^Nt`DuRTCau`Ft=1GWfS7w2X586z!`ta z#G?;CGJXAPUxg+e8agiPyh%w13pky*5_e?KQ4UJ?#VwkfivGY|D!2EWF)4qkix z9XhVooh~72#}`Ft1**AKS6K-M~QX;HHyHxoPHFf9khnHJ{2E*Vi(vbQpcBPw8>_7UVX^4*#bg z{U~kQ7k~G~>D;-q(;f`o;8d}7meM`)=%ZP_q;8jZD}B&_sY+P2zZn1{uYb1j=G`Kn zMKY*2F5?X5S3rMrapP^^_H>+mMurx}0Fw6%0vVB+w(WL7gLKC45TE)99hq0DGeP3g zLI)J5PS(jnvC(nLkTmVPiAxyMSRRC_^zvHmcIm!4-4V~|;%r8I8FMGsOOOjH0V5{JU`Dzau7gO&=n z6(mFL4=Djaxd=;wGg;v%xfQ+-u}g>i)A!$=-v13t z0BD@om;(HWSJi+0pMN~P@tfc9HiJA=@fruNe*#M3(;yxQcpc@2^L?K+m_Pq%(?YUEMZUpNeHNyJc2ICtV42Mx1vhrUOw2sLn?~D(&&Kza;wxjgIUfn&NBF)B zYutSw;sv~E&EZHq&*_Ei!Zf{3C(iN@ek`j4PFSyG4PnZoPCBjl+L^j?4?U3}N9Y2Z z4q|X1|Ev>Zr(mB-)R|ZF)Hhn<4Q}qvp81B$*RT*JvN{(4KnZNhUD zS48m*q?_2hq~pw626Ch?f0cV>G`t2#m2YJW`o(R1Cf%h=2R5{qoit-xS0o8fh8?!6 zVzKJ6q%ob1GZ9{$@5UV>oA}PtxX>g&K@W zp?H-fn#!~JC%wKJdEQzkj|xx4S3auWU=z`B#<$*Dr#%>p+yl1bM8N74I>6+&2X`Jh z&CD?!^QZq{dg%EV_&AqK2bc(9N#crc0650eiGv;YC9t%}r7Jp5vGKT9-;`lCbolqD z+jlpn>#YC3!*?Wj|4%1}%oTiPXL_7L!+0b# zdaO=Nn4bL{*1P&?wc0sY=xdgyV3Gsq+;T`Bk!Y8W#t8;X-}w61r}NbRkd1RWmb27V z7uu0OY^5{gc*QSYao)J*DU7;w>C*Jvv(IvL?yrGISzMK-Y{E`bX3ha_e#L7_6?TH!TUp(AEkY! 
zqb!}!g35@U+~vIW41i z^G@EC&vNgZ@~#&QTeiz9nfW)pv*~t_h4Z;>K>Ke!bsf@pJF1{Y5442e2aACzhORinEj6oDDAX^+p7lw;t)>PafZojmuch0sq^k-oOtD> z_2hHjl(P3rdg2pKS(_*Q#_6Qtl|e)kZ)TFCAShRr6d0FAU=#%zI}2ztn9-<-2zSQQ zLS-f5EFN+#6eEaf>UFA52VFDVp{-1xo7cWK&o=b=2nsMNro(#x(a#Hyo=1MDkpgi-vHTJG3`?fXjdOFAH9KL~YEvlJq!{#X) zXowD&-+;wgQ12R;CR2`Nhp+|bG3io4}c0AZ{fy~`9u>9@Mh8}mZfd?o)zs`f`_ z@M)>VVwH)?ucgg4PUT*vU6!G$`?NE$RDQ%W{3(B=4HbxL!Hu(|lWyTNo%ts!>8Y1; zCA48>1J&pj%t5|{Z$5>cJD=UNW_`E1OQ*vnQv0k<-+gR4_xvADXD?r14gXy}zQhOP zT$%#?JW`p$!W~W(-w0rTyw0qKy^$HzIFwzi zR?n~OvIh&ID|V#p9D()@r$o8D$xeF7RXOQ$8;Z}u8Rylupw4uCH(#`cx+)JlaqICQ zd=ew8Ii=%Z5l%09n9jXl(@uv9J;~Rm-@)KdxX!C2`Fs7CO6yssl^^>OU;FBpr`Lb| zs|?^`P>O_HiO*ZDiBa?i1pia+sX*`#GuCD9*BEz3aIH&N2=yt6)P z`-3aOjrpvL9A@aqOY6Yo3!n~`x{VQcx6848%A#r2N!zTZ=%5~w=m9_CIgCTFC4TuWYzgDR%EGH*8g1cPrfodp z*Qg)nRrX_#J)Sj`uBEekQezYO#n^x|%o&$^x35i~a#Z0bAAdM8F-P=~>8lDW(I*nY-hfq3MI9ZK)Y{S4gZXQoOOKojw}VU6GLzs=3z zI$cp;rjn+4mag5<8?hpRTG=z4yo zap8?c=z}+}|Gf)0Xqh&6Z2bB2AL6;mqkM#4-~`oq@9+#n2&)dR9i{<9SdM5Vy{?uz zcD^{+NbnNcWQK9C`XfX%`OxwU=_Zn{_LPf;S?f(1XoR`j5G@-IDJ}IWjpDKMq3;c2 zyngj0q%+pi&>e;q&ag$=K-EvjSuO~(w-9Y_!E!O)?A*h_|d)y(DGU*mwl_FmR($FLqKb}vkmY-izD~X&`GCHbG!6X zbuRw!T>9RjKJrNM8=Q`{bDwualo`jDn{;ZmPTw|o?D!`889qYh4>{KL?3y3yqVpfv z8boa1Unju^*A{$uIWSz6SS|v1?mmX~j&+%=JD~7mai03*o*(r(m(OxAcP+PiRXv>t zff*0PKCuQo_!-<{@pK34{=hHBPusrdn%SfpLTG;z|rukk0`JF ziPLgxTbSi8yIXmKUwjljiJE6~9>WMf$$!DrKUc%pxi1{c@~#ea4P$)OpXR@_`^)?{ zPB_yz80r11;x%0TdBTx<;xe#+xVB*Z=-mOaGC9~VzE4P!UhBT|!!VckHC}1!iia9} z_$;1w^3^LLibEb$} z%(8{f61z;)tW#BXqSYBbchYPHcX-1T4oz9d@4=)EzPL+$$-yExo(@mNN2TZq8@k3G z1@mlXm4XxfnV$u3#4K<|)}@>d3c~85qnx6M?CP~s4b;~5^zB0|!y3>UJD9zjh}oL!*QSr( z|6tm@dy7}|b`FnDufP4?^n)M$WP1O@PcrkOPAeT%TSg)!xcq@MafvS`$pZ_0RVs-T zTz&fA@^V*KYS({h$g~nH;-X&EwYGp+uZee9$e3BV03$Gj8TCd_&gzA77 z#x96H&EbT#QmqZhOX`LPmLL87-1o%E)XaSrZkev>H;fih`9AQQKkXu{50%%|p8+r|#H38`)8b78 z0TvCdaU0(1%|;BS>Q7|0L94w!MM=KBYClF(m(p;^+_JT9m0j1*1F{5C7o&U{F0A#l zuvHHTR=R51onHNQzF6)Wzu~&vm6_)elyLEsM_n|(`1#WwXBvYSLkD=0bskqoqGc1J 
z@p_LOk)cx$B(obYx#*11d8ZBBF)mzi;_uigPKldUXfBB&^v2lmJ5S&uOTZSVn zC-Upmo@V7~aQk%V96azIfB}SMV0XICVDR@?X7ko7zv0xI_o+A3>Gt$_HdlY+k@Kvp zC(Zez)9r^Jntt)&&h(RC-I=a^dW*pTv-QlNUVh}_^o1ACP2c?7!_$jTxZe2s^we}> z`omi%rw{KifkDT_WfFct*80}YfpwEgN}esW#q;Q~&-K#zRr(?s=@$8=4*eVXElkJh ze~?qVmA7no=8M{&|NIN1J*+&q$2$AY3s4UyjrNls@9Z%V(hVQR34j~7;*np=(Ko;O zjp<+i>wkmn;&kQ#O#ExR)=%w92TMCFxqKtVM^D)(eV&;f5SR`A>TIW-((Q6!C>%HI)?AAZ%>c25@?M)*rM|(KQ3c( zIg|S!*g8oX?2dMy*lMOED{XAjHxW6-P`RUE^+`Pe4L^`}AKC1H+V`kVB*_jdv4zil zDZ9EzA8eM-<~yQ7n)(J7_GsJO!}0oCZ}SE9+tcrT`HKv0pQOC3auV3x>BHR*r;A$; zPZxOebnFCmv-zds(M6SMmgceX`wSbhw|3wWKJ5fs7wbx<&)(biGTAufwtn(tDm%{( zR)q2DuRM1eaqHK{ZJWD3RYz@u-AT6GKs|3Qx>Fp!(6L<$W$7i0sSm;HvL5kBNB(t~Nx26>!?hlS6<=S5 zi$^&Pv!Bv~7jdXB(_-_sJ;uB z#*Km{OgGlXH^UJKEa{+t%s5@}4xuWf0<8q1W#BqII7Yf8{w+VkwOjZJTXxKe4`FY5 zxCRlsFvHs@me2AcsFeU$JDcRvg=MBP*NlmAP-$e@4o0FY*2?I`)vx;#dEiH^x}sSU z$(g?K1cJ(>;yDd0VRJpTp2gI<=zLKaG0KM?D`5@6{<-aqU)qc#oK`KDm0wELQY#|n zt))gUbe$&f$YnlyM8kOEBx{K2R0|_DdC;0l0_=%Kcm2-o>7$SMSkT5@UX}4mjv2t$ ze)Hz^pMLnG>B^^{QhC}zFtzE#GX|^RN(a6LfO^Rbr1ChEGr|@+86qk0BCLQ)=x{dD z6Ap=zOTI}f^-T&|)^}z!PK8E+tNf=DV$fjTdt#HvGFsuNGFg%L{4aR05|V~irz`nN zM*Ik2T05d6I{A|LQnpD(RLSatJTD@~bLahpZgqkz=!PwY!xO~?L4XRT!X;%)-|>>f zbo#srhr9&!jE~Ub@VP4+?MHqjS82pkID|A+{Xz^*kb{6k5E6fuOz70U3v@97G8!`-PWl#DGSDukYrw!;@mMR+w zBCPP+nNA~`PK{xvDPe^dU(+j+#$n;ytYgHZeU&P-kEW;o3QyhDZ|lCoNGM982L6lm+}!HJB4i2g`|P^3er9V zC|ze+5l=}#+7hwk8le>%R{fDy#kizImq8Hp`9Gv1?NZxACy6ip8&-0#>;YC-(1dPI zupeQ4T7T=6Y4aa`4=Q{ZZI4b1ONdsEGW*IGha)E)N$>LMWSVa*avUf5?h&qcc-!#e zH@`gHzxl!R@BZx-zK!rPowJ?kduJ|9kDXPT!w+t<^og0# zZSXT4>?Y*9%=E4ty)a#P{vkSQ>@#?8XL|9{$?37}yVJYulsM0X#iLI=GJW-nUz^@| z?~M##oS812E_EQ2)<=e`H_N1cI}GZo?6(GjJ_~D^P(QW{-j%i7@@`)Hr$f8=xJY-A z{coq~G`dIR2!k^Bj%6tmcw>UV$;n|C+b6Kv#*vS~FFenrPdZ4(b{Yvw%YNs%r>7?# zdu+P;$&GAkwlPx>I9&eM@c{?P(+Omf>>{G8F z=Z%kV3|`u}Iem@8LXJaw6GL=qlDK^{ag)iE4Q4O=dbNXvE1NshMczuOFTWRYmv9FT zyqv1H5cr+!*r87IMWmS$ize+Fd6bi*v~lGg`Jmluvko%SCUMV4?7Wj-OGl#U(0l2v z^Rn!?2I1O>&M@Dwr5}cZ+%IIlx+(m_tDo?-?d=SHo_*>`>L=vbyggn0z^}G%PG>JM 
zn5EtK`#vB@9lJ_>dy-pO@1Wt<13%!ZHjqdmr7e)yv3lL5*oI(uhbcY z#oid8Z*a(fYUsg1^Q}#Ue{ku{f9=FKPHeK))I-+_7VI0+Zpc+L5+tooMugFJ1GJhw;<- zC=Tg3Sm`opm}S%syMDvPDg7Q$8s7sp!-Vl!KDR5|2kjW9gx>Q=3 z_A)s!d6w^bPZFfUi!PTvy-4QE3DqO z_$vQcMz_Wc`NO^j9?ih3TpW~XK{vsLvt?43BUm;=&jN)f&#pfd^W2HA)6e~e`CKXe z6;IuQ51EiQ4e}b5iK|#McnU2F_Mj8op;1(wX?TRqR_B?KsL?$}8N8%{W~GEjWv9^I z`t<5_h2=}ze97||8?>w_UwiHC=|8{p!|Cd^YZ=I-Dzt-?ftyBNxWrek^fVqKm;N%~ z0TI6E{-yoS^%9)~R;G2@@}S3)`i{uBs&$k8L$XbwdjBCP+zjSJvRrP2A8cyFl z&(voZXgd+}yi0D$7BHQE9pCuGrhXD1GU~=8Wph}q$;;_6t`kROaYy4xW2}GqAAm^E zDHe&FA0w<3b%v=ID<)-Xxx&j}Lca`#3jREkR-R3l{LoemC_%W9AF?1@r14uLT_&a7 zbUN$^7J&^3JBB_}^s)`{lX{I3pY_HrSUJPL2eA!EHEj}2Da zQs|JbIz@wT5h8$bM1((ily}3ZMGAtjnzgD*zGhgL@BtDNaAjAd9;z}j}{&ahKnq^We?0uM?xHR2npz-qSSLjgPNS)|pkl%wy6V2&P*FU^Ay~gs9 z$Dev+`qonyrMNgnZu;(bzm;ze{Pfl9D98?^dMI6}lcm1O zj!JGdQg&~=vx|n$b#1(HU8Xud_=Y{eF7a3rdh{)%C+B!F(T5&)$_C|=bT}v>((d*2 zx9E!}+`!9(Sl&d`NlSOo!Dikz&=CXeO^Ml+_d!m-|i^`H%PR(1cj?h_m*^Vyh zC^Igfx_a&U^n;gvl7ZR_PdW4nZ5aFq?!bG139S+qUs1pE%l9T1;7&6^bRS-u@0*crEDZAV+3Yg6`ts{X4y zu#;YTF54HseC#W*|`m8r{n5d%Wc;h!5 znE2s6Z&++t&;>;`1$L1p4U(wpY+hv+u4dfO-I%?UQsE0~)w*zoH?0KAAI`AIlt+K8 zND@Idj5IznC1Ih{wGO;5(hDgNzQDHr&ieT&v8}C`Zb?U zQy6>$;RfpZ#QhM?iUDFJ6NMK#+sw3G|K!Saoi`1Bb=6nMckbPte)a3$Oh5UXSEp-y z(KVHr{6nJEWWmy)=SE!+5hBdAjID#>@Uy=SH;+k&Way}9mX_2<<1?M|+Hg8!jQKsP4iWJUUR*zFvV7b!iuw#VdvG3N;acX)~Yi~sYABF(y46uoH(&Vl4VPg9p>Rp zI`Y)Cl+(rt%_QFE>daOiM`wvAy>6yQaWo#Inzl6BZ@BrKU&7mI4GCTb@mRKDqwLYQ zJ=hkX&hpg$#xo)okLp4fI0i_jw9X4p^@L6c3rgDxJ>pe2N$F3VxJ>6&mw6O4*EF+yqVj?VrY;`S6?jB81=F zVR;LCA$l*=F*aN8(rMo2^d;XsDD#S^EbTy-PSsrwWcok;>Xm8jA3VnCbdOH|#rJ-H zy8r8kd82T3I&q%a>nATv?|is6-MsMzXY#YrF>id$Zv68L!}b*2 z1M9uvyj!Q4Ki#qDd@maHYZYQg);O2etfLF{d4tX19z1dB!Wm}V=}>R7hh#P(1345g zpJk817Pfg#UX~2zOk{{m##OlROZ&H6v!8`L_`(-noPNYI*c)t$HQ&9YF_&wu7h{)B zecAcrBrZB%^an{fP50Q-uy=2Cv3G9B013`X#`wpAr&-l^mEbqC!zr{vsj=5yj zhJ%QE=w%NhyGMR{tnW!Ww=Q?GF4;jAmovE+g1O-gKu@yNO_MhLeNOUPgAWH1oj2Z7 zPmwOR3Vust`X{vA`@H!vFT9je`4?8M;~9I 
z{rBi!hR&ghu7(YF{z09G?H4|{VZ6iUcm4M%ke1TwWE2Gz0z(nU@JMRo$)5_(S^t1$ zR9#6L+|YzHAE7@)YhFi^-+Bu;dUD3WnZ-DH)uWMB$lv+fvI-b~SLTt8N?W6|lBY5~ z%IwQF6?7NsMlR2NojQAtucbc5dFq_Nl)_5lAh5!rAtsSCOAdN=Ftlr*el*?SErgYe zv-#^7{Lg>!3SXlB>2&vAt(8=tb)0KaY#oVjMcvD5@<8lE2cfgng0cx;x~}l@7fydQ z`sg4nR~g|PUYNvlm!|N2g)+0w7N`s!;E)Tv3;+N?07*naR9U_=P9U<=i9@GdR3lit z#GWcDI9s3dbS-z^gJBKirV_3s@7q_qpi_KMj3)ktqcjqJi!FR`- zc7EAdF>#!-Z@4_HaOG~fz&Y7r`pobp(vB zn}%anc{624XOGP_42h@xkAE}W-@G}UIQ#Il#k%(Gm0Q!j8+WFSTergh@iVL^UT1cX z&T`f@GYc%X?Ry^Y=?3Q`)6I>Q>HDv}JN={Q?@xdHhu@&ilW{N3w& z)3eV!HvQ?J{l@NDZl)*4SO?Xi^Oov|1xdalF!Dq&3Sinz&#Qt zcpG5-va`yDU}V6`sTQpZ=%CUrzCqmgS(5tBj#=!P+g=gI& zfAZ8nLVBSW6{qjeU0rmZ_q~p}hUjpXVRdqwQSvJstGi9-|4X0$9ACF)(16`u`QROJ z9G#wc=9xO2KpS%@Ej&8WDhK37+VOK|PEQ!gw07g>be$uLm2*K!gGm9k4kiU{wEX$S z_S1aTyWdtStNhsBxJSrm2L#@2kG=abMwkla&vbH0cZ^GUok_Ptu3tFqLfnQMCjImY zx%=jz4dfK=9?&&Ydg^BbkeT3c$TA@ z_y8JiVmrZ%(VgjM zuf8_@fLV=v=m*)g|EDx=U)vT)q_{tLZx5gsb}@lS5r`oDhHkS#>{O*`u*Iz9foZlmsp2SQwhy_ zZd%1hhbeRWfmZ)qy1vix8)LqTx8u7^HJ?d5(l<=s#np0zti<^j$6+^z?YItW9F4c* z`)<5(ijo0DiL}tAaN2^dakhVjjt~=#6;5Dg(%?@XcWwP~c6`EvWhF9cQ_H*yZFIEe zpwjX~gCa!h=vP|et5uL*w)<~R<%~uaBPxHS*Eou{4Xk^>&IbRfJW5|J(-y~^st<9@@d&k)dZ`0pI&FyX)9DTG zINO$jg;nS)zXOJj_c=77tzKl&x3_WB<@GLd~U^e^I$w%3tOXtA4mzCyQcXnA0^_%G?-=o;20Up-e|FJ+41fcOY+fwbQ*e(`hD&tHCpvPy@H(||1J&A&X) z{mSoibT#;v*1ogi$m!aCKl-;z$H~dN9X8NA!QfZLJ%)9YS@&1&?@YJp^gl)2aFmS> zeoXEza30y}o(Px3x!PmLw->-65`$FxMGh()urh!+&J42y7Q%#gX^+dpR@5~Y9EHc? 
zAKA5GWKmhJ6j9SXDrF%WLEc}G=WQO{R9?pxS551)~n*hC%j7V(jOXx7hZm8 zEM+yX5Dm2P^f?V^(MVcLI!=E%@s17!^7Ax;p|ODIpe3&@YbplU@6urEG$Qq;%Nrwg zGIVI_12~7i8kcZO3PF))lT)1du_E(9D08l`nO?)tfF642q3IIWNsf0*;T4{WvSyeH zzZJe8-Pu^bJ6-$aW0o;}qPl3TSEo-ty*~ZqXTO;K<`=I{x0vbkh`1FBm9K`?NO)Zc zQuO7tgB+atRzkSGjDL_b56f0e{a`o8G57ZxS{uB=0srEIt_8Q(GIk-9L;rNb8Q%W% ztMY_a32%HIKccwF@9b&zFi#^yPxGJXE|xqrpF*tA zmLSNTAsOG%t* zb6)_OgzsvYO(&mTHYnm>XA}8Yc$zBt7qCf8Hz3cx8$T z;*q;J$w&)lyb$}GeZL3)l*pXWw7){fhYtDuk6xbc^5*pD>1X*$?%CD=W@)AzpnkEX{ju1)W5ar`rHF3voA zak|KxhAT&3V?*`((kLb(?$NQaBe%4-xemGvDObV=iKvVp?+nXW zmc#5WZ#cFX80@iJi4vVRlSdgDxZmLxgT`Hs1-`|X$}b|_aRyAAoG`e-@~RCwSMGP% zH>a;LfP47DIbe60 zEM(&}gMlZXeujH<0oUgR0M zw1IOOu+%KM{2C^cif#YI{!#Uj)D?C`toq#YTPk#j!Nj24&p5iItd{R}$G6kliPp~S z%(s2oneTedFmZYJjf3#U$*(xHc{|s4?)Y}2;7l)0m*4HOx5G)89yoLdKJnC@8q(Hvenn1jOWR8r`4W%)lpa8c%Q%`rA~ChJHe~# zy&m#X@xHmK4c)B|;hbDxXkjroYrOF58O8(7IBhK6ROkj(grjLF=c*)Tn{l&yJqC|I z;xfLJQjNnPH`*8zES*msU!g@ouJskZq9`wh3$O5KFMgGy^S-!vZoKm16=YP71yk`c zMk=YJD7Rs9w<-W_Hd&6IV~5+wF8G>>(|nk4;U9d-pPQ|Ac}?DD(x>AtESxsH=*(t$ z*YDMD92$lo88ar{{0y(SEa=Aa`T!MGg(1%dTH$GU7g^8vh)1|GTSk?NKF8YvfSh5G zD^WQ@y~pd`YwUrzd-Fz?FCC$x_>?u3KmFOu)6ZY|C9i;i#qgEK**p^x67C|Ktc2fj zkIZJDOb^DBqTwoRo~9d|S~AgaB*)F+Dr6aEo~Z-VSN;`*erG`}XvQ^u!^8XNq!j+a zYr zCMhai#fw-i;be=O^;$z@u;oMI_?f}VXX-%c*=^C8zS9ysLcHsgv3&At8P6L=>bB>Z zZT#N%zPBt3Y0PV&WBuo=`XBt@2h*E$DxG1>mw`#I{)e@Nt{Rnt13bv6u4z-&^o9qS zz&RmRJSBz@?#U0s@#-&7A55G_=@oYGbPesevg~4wS{`&_GQOse<}Kb>TjSy*p-qSFPwPW=ylFFfzD zdG|E+;P-z2{Pg~NPjZy?N7L1h+?YyTN}YL^H?(_vvD*4`RX%Rtn?Bk)GOfeox#u36 zPM*2Ihs>@{e4}nUbCM++z|xgWr?y26A%4`&VP}|jrQPXDpEclX_|9+RWACJKCU}cw zOLk_QDR;17J)Dm_CPC$*C+&Ge>i_z`{qJaObTFtd?GmieF>y1dw5x7ahw@l_ViwuJ zZYF6XJ$YmKImS{hm(04B-1N&FWF{2+&VJv6uItQGhJB^vTaY1dl_e!A*Ru9AfS69n zP_z2zUirw-=n8)9Z0(b$o?2vA^az`V9YEaW*xy?$DLds}7TUCJNN-_#d~JZ-s0FY;9$SLzRh%5c6#+I}5 z4d?@|;|9L2|8)?Uv$-@ud zdz-RynSlfc15kfz-x@;RMgebri}N()>hVV}Gkbo5?|odGZZV^7d!5?J`WRhDY+9Gt zAGT;7%a{GIT6*SE9yhwzrekFhx1C}2?6M_0)H~##IJUWtjgGN+j1!l$`RdwBXQ$dY 
zZJ;~V;&sW8opaMl!?f0``hDwQ8aw=_I7Q8{{(h8tQ@6tFgz>F`c$%jDA@%8T$>KDv zld{qmPWp3xo`WB0G*0jCm2e4_0}2UXIn&6xa$#Sl|)AvM~1 zE#?#Y2{Rf*6L^H4job`celEcBzEyo|OF(W7WHr3xkyxbhEJ5H8Z;7ym+s1E2Bgx$LBH z#74A}q%oaTE3J)~YXdE)UZ&TDw-8WsMV}PmH30YQ$=jj0bIWKen zVLTmI{FO}<&hm$s!BZG`z)#E6>6FKGmettIu-C+s-cB7t6%2Nu9@+(lbonzMhfXUV z9RO+^xeFEUsAG{jesJqXx;j0C2+QLFus{k7UT5Lm=mbJli&I0=+q`Ie+J%w)_R*te zu^$*y2IK8G&0WF=9&w4a?bEonEyR-X#hdiSi;u!so=jiWS;6=me&scy!*5l- zLk$%#T0A`VyH(r@utwA6uDgLXab2J};GF8*i#?7y0xdiv=!XwuU~ ze)Q@(=DTeC-Q_Ta_uqXNnk*?8OJNo=h@E7a@q~($%ucZQlgQ)-OlcLtjXx_&NtpVL zyMrd{A)WjmNqvMznpLu5n9V@HnSY~s7x_QkFr*s6U zGxnG$u|u$R%oBguBXRoNbZYHpHmlxbqwYOs4*l}9b(nR_$&=1hvt$g|4c`edA=&K875229L3n z$4*S)t4TAEBw_4;C1CE2FiU*ikdz((bxTSUZB+r)5U_x32L|dqSFUKCkbE zQ>S)b#G@S5DFYV1!_Q3K;T?Fu9GYeO(%zthb(c<{y!SGuE!KqZGf3EH@F)z~8++UL zle~m)^3^!Mr*NM7tt1!$JS&^Wz3MW6fo1jKcP5S;v9H3p{pl|h zj4JlhIiT@FW|kR`tj?g;3ITww*IMVon@(B9uWP-ivko)dX9urxYaM2suqj_Eg{#Ud z-tfj=g`LgmS8u(^*R40Eud%`Xg=e3J!3~xHZcp#OeV0z<Y|G{+vjte6Egtr`6VV-c2t(F9#$J5WLHm_|$;|44p7K)EJt|lrZeo zyf}&;SNL9+X%a_R^fzR8;@CKGV^QR1^?@yK()BGx!)L%xO;`pgc9>CQ8!L@g1nnfI zGe&a?N5&j1@}mh=)IMEn%#0kgbyrgTX2q|z1G_?{J0Xv5`U zVKAZ|{DnuJGrQ+~@H)rc_zi1$yw9H-rqs1$e2%ZvbUOKIS>&T#hXt+!xeL)a78K%H zJVmaCQGeu^!jh5#)L=Z@Kubq4&R=>sUj;qINb>|E>x7eHQJI^bhEE=yrCP^muU`3> z(fZvA1IdRUU73FL)1OVhdi@P_Un5yi4A-=l`HUpQHA4l~IyBshTD}eQn(MLQ`flox z9fgSBPFHb7R~)&JBZI{Sba^arRv>xZN8U8Nu}y1^w^DV*t*tNfr{b2+&Ho%{!!+)G zp2K$7Z@V|XIS#}AKCbC3(vlsSA;K;WI~y4rmzMu94a4Vl+c>>1>H*Ctybb=${$|zu zZvSE8M|03Tv~WolU(zv1)p+Lb>Pj9NJ1 zJ8e@nvUri70&V>?-vwhAR+}oIl;d!EkVawXU-_^2gZhw``YE_&Rbv2G{wvZMcVEYU z?kiv5%k-BhC&(%fM$0frvhpF}PPWw1zhaiA`kL2k1HI8f0T=Nr>)c)pG)x@PE%(;z z{4JAdglUI{`75m5e#vI^rOXOTN5%mX-3@436Ee8iKQW!zzB1iHblGJ&2{#he0RXJ>g}NH zp~E!z>wW=yw>luuuMUMXfYpIEZ%b#^m!)2pINV#ihYlR5XqEg$-jZ=HyEw6yf5sx<2_&R(J*Z#f>BmcVO37v9}wO=Ps+(ha?Gj@V8ShqPYbr5X&q$9ulrh>M& zM?To5*%>wsHNt@bgbG@;t)W;yi}usUk-uG@3j#ffP_el|KK@~{3J@^yr1LiHg-W!N zkKdMa9;mLmtSn>#p!L)Dd75RIbY-~eKJlx=ciw$}^2_bo{`1d11se>K7-U~%*`Axl 
zAA0nW^uL00$W-!{9IK2VtW&hLt1KJ5#$g72SkJ-Skt1svFdSFsAh6HY`R2^OP8-nQ z^^0!l6vMVk-&UUCxYkcd+c=5|$ z^*76Q!wc!eBab@s!SuT5kvPk?xO_HFJVn!*soG0%bRK#aRv0_->7T%-PJWe7KHbME z53Z!p#*gP~>TUn^CdD+;6le1weO-qc-?B(oS(?x0WtYMAQGs-=oH84-kHRx|BGD}i zgaxT8zgB9)_w)R|z(4Q^OEZD0%oGt%q)X}FP{TjB>y1uf|}?ch@5=vZVc%zBB-v(L|8dSp6t?gFpD>fAfu<_PcWLGhtwKy7>( zjPbn(zifHq+Nb%VWj>gN8s20k^2a}UX?pAJcX|C;qiFL&Jk5i2lcN!1$>UsK4PO>e zLa7&F8c*HDTly4g?heAKwpk|@Ja#6Q_?Icohb}CgC#E$_H|JrQDlx_qzYJ|2I;3Uv zKKFP0;d`g|-tdQE`gsnsfQ6UHNMkdPZ@>3;`sbJ7^Jj6+pY=CPJOh3n)-WC4dFXTF zi@N4Y(?8f+rYFF#0k;e<%=71#-LM6X8GgMLekkprna<`+7X?)fm;T1v;qx-Pq}{O( zz6;m9HAJWNSsJaYu$3P*g$<8-xzs9S3OvLA;-&M`cfR|b=_((qdjI|RxnE74c3944(Yw4I zb}hHzHfpA~?9K6*%7H>=J5x7WcM=M$lMeF6gD7x3V>)FHf0q`Hdf8b9setAi^g|x^7XYwb!INXbvonfAdV>ca6SDWbb{r{x)#7kPU)yq{Yq!W-7tJiKuozBdlT?d-pI<;t}C(oeJT{`1d2$5JIu z^kE?JvEk_E(Z^Y~L|FClN_ug0emF388IN{kJ~)ur@>?U|PUoC`L+(|ybMBzK`*e2b z#be*kaQ%kq8n^eRWyjjiwmLT4JZ;-Gy|fMwF1nv*M_p%H-#~aby&ZK2CNArl)6_36 z;pUxskAan@vWeR`@qjt_t>;XuGp$a(l%Z)GUvCCn{&apJyRgy}r!d}IhxXw;j(?Xr zS{=09#&_K+4S8u=;u4RSy7#`o2d(Dc^o4@~%&xC22D36GG8+<#9{w^^1u-!gh2X!4 zNI>{@Nz3!$aE-2=xQy2Yj=K$Ghs$smxrPgH$P_I}25#VK1vI>IDy^N*4YS};Ik=?d zE3};CVT0r-KUD(WnQV>Ad(&xL^SHl{ztek{H(l$vpABpJiW$lAn5I7v)9JjI8~MY{ zhBN7suz2>_>Q1ROFwJ<6HM*xb;_!)M=ct%4C@M&oIk|RNJ-FF4Gi7oB0+*28=8W!Z zoM5uSw;!wwtk|5Lc!@U(&vkT})aEjlzLaI?y%>rPma;B8>{^1Ac)QNWljU;}phlFY)>8-SXc$GDXvBK01BJ z`P?!TFZJAT+Ca@b5pJsp9piM(SK~+^KhkP?bARI$&hW@e>(_C-U8B~PX&nmF@cyuwv{rC)aP8mAi3FTAev#OLBK z!3+96X&E^&E;23iDsjApDY|VV<^;HbbA*y98qy7|-n>u=dNLzuQ^Q7sQP)1d!Al@ z%iDD3G{kTGv18S-vV-epa63&Nog00+*8#i9{wS80d1!)zzswkeUzUZl-Vk^C4PgSm zck%^IFWO#Vq-$r(G%i`mI-aEQr);7oFuM+M&|Cnq9Mmb_wd80dl(v*g)zNk9PaduB z)j{iFq&jd@j@Fm=PTxS8Oo8{JQ~4#nz-hzyKltclI<^du(c$NR=h^A(DeM#kR~Se* zcz)FHN-o}zo!7~K_ZP+MvoMC~mzQ?>9XuE(Z9CfHb}5m# z^-IrmZ@>L^CRLt(`l;z@mV>%aL7FvDAkWKul7G{A?ePZK{zuDb*jyHI`>dS$n-~4! 
z$si2=^41J~<#|5PQWkj^PJYB?y4GRG_vUKptB*OHFh2Y32z!bkG=tlqeU^&mP(qnN zgBMmp{M5Z74*e4UcQ^$Qk9W~}#;>WW&{SOL3_QZswXBd#C@aBX+A#fh5S~XDY0y~c zMI?>}Fj9`Z4L(QIVZKIiZpIykpU3NOm=5zZBnyTb?KNI>D?7-A9Xw~q`sjm5|r-Q#m9eGL}(T|3!o~i>_ z5sSR%G31qT=+L;1&t0Ur4)42qx8PPGuUcFg!ov(-rGCI~@CrW&JP2gQ%i(DGE&i4m zQjrfi8a^;Q^u5EqHx4^WJP2iI{_Aud%a&0!aBXgS_@BjyXjtjtqhuHX8;#}gdsN$(%58xrNpF2MH z7I+po$8T^y&p?J(xy378{qj?tfQB_*Cyv%h^C4`z@;5UIl?xyE#dTrwf z9m-4Ni}r0Rm10R5?!t42dv6^yuRd#Uhh53gxNNgV#|^REY`f2QZ%n(*+frwfozTd4 zL0rNXVYtWcBxiR_xMg|he30+3n4bm{j?OCs2%WHf4TuBb;xOL& zto7Gn^Yn5SD;V&?wCjOHKmUF2a~$(DO~2#iuk%?rpO;P@9I{U$yWa1(z8_4x#MSXj zyyc7SVYh2+d_PLtcb}zb*+`9@Xfx6gr*}J9mH*b4@Wg-WMQuPhrs1-QSk%d5+6~&i z3|{FZt4HAoiQnS7!hpM-`D49T?Z4rbxAI=~AtbeLY-%6-6~bI|!*UJcOIN}xm;X8^ zNaU6O$Wmby*?MWc^*u2DBvm{Q{0V&Wi#$XY_cZ+MhfkmO;`HKhMB`B{%NJ+~Liig9r!iwnaL5_N{?sz|OFK`$j$M zp1hdlnfaz}Ed!li9NL`x>le3h$seA@br`nsimUU=@1c090_!*VkY9Py875xoeB~=& zNe90-Kli=rMaGjX0d-TobiBI0lozkg8|9Gqw#WX+oZ-S4r=ImYpfIfifJ~}jKjM~8 z(|akqa1Mr~*|cKoz?pyT#A#Y}=H+w4>tCh5KWay?W?b!<%|=E^v>g4Lpi%43^NiS% zXhU_#!Fw~}UAhWy204>tm<;N~uPYLr`8|lx8i}NkI*QlXA<=cZJdB^F*xf{@O>=B8T9ZeC-$2QB8Zd|)IzCul- zi}Cq+`q$rlYx>12uQDsa$s-sWCQV*Q*g{)|XSl8kwOo`S3w;tYSQRY%nrDJV=KNH+ z@!F}8oxbz8@4^^o;R+rD;)$lntnM5_~h^QiVI5+U87^_4@(W#9=@l z#G4@G3D4fgwa{(zy5PJpcDx-omk({RS+(dn$g%v;)x*|y8C+=B%H!tQ9%S9$yFIPa zSJ|Zw+;kZc9fXx5%P|LGGMjmd57*J$qlc0KfWc`r zTTDi-k_o{oEOO{w!WmU^7f8pLqUtGZ3I=3ZWBs1AcJp5;&1_l}r9OS|N#d;1&Kd-LbLJBNlBAItQH9hRZ-8xwB$$r}e$ z=|qjki0f+8Z1*voG;>z`2ph0D%O4n3FK=$xwajT?D||uPEK!ipO>)Z4LLKr63B8w4oy98nI>9cHA5&W8V;#jW`i-b$_6FGETovynR6|G zk@So1z$%^KlwJ|jaN<*TDZ)fDh(<^w=|U%-Ij;6gqhUgYch}K@4?)~|lj+i8r>&WF z;fiNBckM6(mI9YfM>q<;;O-ysl)wk#a{S;ziIsO%!Jy|=3}WpJ*D2)V$g}&5oS7k$ zcW2$G{&KGxio(h3`#V$!*RHTen01e9d^l&5lNx^c)_W{rdU?9SXYG?0>NtE=MX&4{ zrTjIIen%t9Ag)hOQR`h;gfoqswvu-X8A{Hwsj?CCrtwO);|_;6ObS)(Lwj3Pzwuk> z8!jR!3HZgS(?%@#$p!SZfF;0Cv%oOHy?bdh4cB+md+vIkgIhidKBu7(w!lFQ7{;^m z5ulwX)N)#MdGwzjaK*E@m4^;gH-Nbd3x9U5ny);Q_v&RgYck+}AajN~v`Mo@1>H{G 
zGSUit30v{hJ|&}kMu~btkK}PyLRQ?#-x0X*^L!w0sK|6C6~?%vAF|6==euzFBU5Nu zrxfqnL^}sC_t`L;!SYB`ambtjSLKJD;nt@$PrK-X2QiAP>lp*oLgg91J`5*MZ4dh8 z(~eB>T6s)?3{I0ITn}*42-cyiw#obCYk?HLBo0%gpRWUyuaWVwoyIfxAaZ)<@rR~= z@!$N(^cYJJYFUt^Ex*FJ;U^FNtxbKt_S zfBok4{qO&XhKqV2^|uTTr&u^!nPN#D@xbM?jcLN=U!@c2vt1|4azOn-aF3w$gIGan zy$`&$l0g=Uc;{MgC9v%}YTCht17qx69rGGug*^>x(BPd)IyLthAZRx>{Aid4xQ-Rw z*T(Hg5_$~o!Ibm=DcQrO!x=o59ciyNZ=S5<77EXkw_VIz6pTdCluvjEWy+(J4hHv$ zcU|)uCxvZurszKWm?x&m4`vx?WuW{o$_LTbS7rAq&D<&)ZCjn6fF_-Hu|>BbP<$nf zv_&aEpp#zfxbV*LSytP%9y-kDhHX3zD_Pru!}0Spc4{K$$lvC%Sq(QNArKjW?aSOf@tK|a+nB`p?yGsZD{9{1 zon@Dz+eZ0@zz(~oY#l#!CSrcVvTMH^;WsKgX>5x7)t)@91dyDJ+R-Yu~MJk<0F*z z?`SVzK3leJU(CW+-Y~_4#2NE&lpj~fbf2N!@y4`^e9z`W1_|7rM?Y!K8vyx=IOj#6 zEibmumND(Ff8kNx6Xu}8L5BmSXTZ6VY#s2*TI1ik$wxF9ba)xTBX*%UnDF){UvAc% zL(*4|K<+n{fJodDYGNXFpJ17_0no1iJ`otSW^PJX6d)m$<32BhfS|a_R0C9LC0}%_ z;cr?f6+UcmTM<@08(&AFU2%LF@1T4~=;@3TpV0*wCjB)71_w=$VQLMIz5@ zU|<7$*6Alq^W3&+b+^s}Q-J=WZ|SNdjSUQiEBMtV&d8|zHf&Z;U~DMmW0(^Q7L`>x zvmCR#K-Vw>_m98*-ExbqP1`)kqYZuQWsbL~C?|r1U4NyfIR#fC!~wjD^aHpAOld_c z{rL&Z2%F(81C~dZ^twEl&a%3e)6;xa`r@~7owkkZ$C!p~yZEa#ouB1X{#gf#U16p@ ztpwW45jJR%)_PF%CfGPv+NNrwteV!9pyt~#2S{aG+Qz9H>ssr`NRyh6#+ygYcl@T~ zFL_r4{KZt5X=#HflZD3@7Ev+2ki^jef#qlA#rUebt&3Y)mqg?mZPF2w1(pfxgyH>UCKu_` zxuM)S!|w~p4(^e(>q?&8>U%gi7uWqbUS%p6s8?*DC8sQ9LVP z{?Xacjykcf)*k2D2P}`UK{@6nj~w33?{*Nch*W-?JUizaRgpo$qfgcOKl~a5)_(dP4Qis&d!m&#IY$9Q2?ha%A9- zGLb(S!IT9}-Zd5%Z$HU48{EPPFvABZ8BeHjBCuQq8&nd9N4w+WRo;SXdnAcvSa5Y< zG6~$8VUIw72b!-_((9fcPs1)Z=W@&e_(n zx}fP*>1)@n=X(KeUFss->1-G6+-u}fBVH$H;zqV^7gFEtqfi)(Pg>$-qX!W7ZIrip zNo3mf(sSz~Ws*AGdJQ#kVPXYs={12BlQ{WIxf{)>T({l|-|An}mMdu?(CwSDzIGK3 zdS09RhmZcm?D*00-~QdNmbba(%y;WEZa4ep>pE-j*3FyrLFkaDEtKEVnb@Nrcyn?m zH-u#}i)Mp*{M;=JJaR{b1APYvwm;_?{CP;*522l5VECBtiag&tT5iD4 z{)NkYAP>5vbU_iDfk6-Yu7W#T|i1&uOoRI=%pZ=BuH;yD{rRoeHEQC6OYx^!y@70%8&*IB! 
zJ#CfwVwvf_O8oGUgVVIn8pDc*sxRusr^B>`aUh`E?Nx4#vfZ`~b&!nkgfox=r(v!F z$6#s^Z`rJHS2vuVGa#D3*0o)^u#lp#%-AN zR1iaJvQ-@vKQ=e@$#xF95N@I0#xwidEu?ic?@8#YpE9^3Yw_Z5bmm(Rn!?Ffb;8Yb z0Mx;mIy@wfrlH#xf%@14rd`_2Km{m5uOl08RJhWz`8{HDFVA3KRPr z@7%kayV5hWNBT!Bf&GEm#ovGML9S<$&o|NUPDkgFFdFl>^Ev?rhi>U3VF|+TKTm;d z$Cqvzuj$lFH@@2HO=BK5k8R6CS2D|`e9E(r?pepWXF5R|uF6#$S}vqFKgJgigNsfu z*&Sc9DsP+Sq3kJQgR>#aUehXX)K`#p05HYg7@CI73K!UAb{pX6FRKZb%W4ZOuC7w{ zYGH5NFa3}c;rRFY>a*M6d|_R@cmTC~r9+wF zsSbf|@;DvT3P&4ZO7<%TXozyxc#Aw?@L+db`H+eM0$Z5&x%$~}I=Dh6-_()kPmh=X_#gkv^5ow=UfzH2 z$Ls<3))@~*^5W+e#{?!h6KmWwv0;0GP!W}FSX zO{>G(mM3AFH^aq0(=*<%#@}}1&~BJ==hvT8MjX=V$4K3lsbf~C(wQQMp7Kt#527*6 zHa`gmZu=b~&*GzsQ3l2_;U%me;qfZ)Ca;1+ zrZ9;cF@;MRl(XZsSA4p+O~bZn-oC)2>jW#kM zaaF@>nqNI<=Ksva=>HMhmR_EHwp?Wp;?0$NC#-DHARM0@hF3LZS=nKihi%H!r`*Jb z{?1<_kGzC|)s5HidX80yvu-0tE{kDC|vBb|t`oiyG!VSC$??YJ~G!Wma}13vqpio}fY>4Ili zR|dm1X_8>FVH!RoQ#4s$@x(z#KdUz>wAC0i6Q*r?!8!1?a++?(&9?5CktX^_Xt(3I zZ&jiFBBC_Hqe|3GBK^|X#7$w&Q0)Vorkz&b!`_bkEaY~ud>g!G>rbZ@k9_)s4!8`{ zs551CYXqUuso}{O!i=-}kg)VX*fFDKZ~IUVxwT_uCOKlaFutPG?>>Xv6V%~R{gGGdsf%S-zWE_dNDzV#=(oj=Q#rs)W8 zxMU4$T$L9ACCoS;yy7!%E`ME#tUS6k!nB$3c9?eaS~HAYmP}WC81F1e%TTBUP1J;nJ5BeKl!fjB+-P?Sm zU;1|zrs|PlWt6w%&|kVQ`khbObEmxXUii~6atf@2{!N@N;fnvpl|X)~43w|)M4x;p zuacH~)QRZ1!i#p{H-st@I))eC(sQ=GAY-&7BjL_Cx)_K_87iTOcVZ{O#8$jKCx0qIOq!3G8$K@>*`7mKb9Vg zHz!o&M>%{)m@bnnqzj+0?BK$3Zd>_+mzgisu!%0LYwGEUa0e4IJ(mTS=F7lE;)WiR zf5oTU1e&K7KZ?ZDv$!2E9~yac1~IgtVLID4J0j1rm%2VSs_gPd0}D!dP;N9z78o1@ zUp3gFGk|O^@&DU@`1|GG{@q_Mk7ysiNW|Hps+Pg#X=w)I^)N(apH z$(L=oFLgVQzEjkV_4|Z(Wby>!4#?D_v!|{EzTlqwCr_T!8R2^d&Jbd(A7>eSSay8O zlQ)?T5AHYUw$q41BYdaP?sn3+hDn|qpyGxRRvHasu>O^9Lzdm}4cDYW-zD~zUE_@Zab*^X^Oyd%N4CFg) z2gyU=NZ;Jm0D{2{VCGnIA=w$n#@9aCgSeI9KG_iI%3YJJhlByw9b3bO;P( zzX@CEa$fE%&gPCiGvD;EMBF`1nTj{<#$Y=-hXwUddW>w#QzN+ z;C*snU6d!=KHq;DKd|X854Ih`lX^HlVgSN9E55ju`G!6FbNjRd4um*w<<_XPypv8` z_1~i{VDO_ZO#6&5b*0YKi)F1RI?ab3Oc>tzWGCOW`Zc00) zjZW&?c3)b$P1gep-vh!nPxMQRfCMEZ6Ed#bu&z|$#-C)W*69kALNJ)NrlFQi5<5a2 
z)p%QY)A`6~3teQYA~c=kTm7IR;y|d1@~iT}ah6JP4!*OO8Q9>4wmdiPrnkX60AxU$ zzqkywzya%MklPzPSQ?UqOS86L1Tv{QV+M*?m2DkDAq9soBI#Ke<>M7ElkArhE&gl) zBi@58EElwt(Y@)CiHy9>eD#(Ji7nnVW{{g-&s!rFEb0R=OIogAkTO~-n(-`qYs*H6NP?7cf4GKRJ_p}aJa>_K{fb{guq1avLd6! zY*6|mnleHc?(juS*=xMeMMGKF_TDezdPvy%pz+1wW0`P9t?x@$w_X42&^n1=J%7nd zZ||5{Iob}{vpgL!kk;+TKJ6ffw~c`6GU^0|m=Ruf>C?0wE>D6q?IDE0TV`kjOC7(m zy@im2okR8&A8{RJzI9>VgpBY)R66O^wq5m8<+x}O&me>rg`20|T7LEGpE0@eV7d9| z;_~pFgXO`4C$x!AXd{nv(DJh{zGPd+$LN6p3r_|^6zbZ-F{nYyfnR7^_wXyDI!{M8 zoej5Nac!mp$Fap`OW=*yHm^FydK$cJ(`$djh~rI0T#b8mpO$+K7emS5*&L0XM7QeM zEwFwFtEn^tPl9;6lHb-5%VF~uSQ4wBbKr8{zp1Pp7J(=OtMk%0(^GJ8Y1SFytPUCi z4-G#+V)akaHV@UScJXOC2Mdq_F}>|e%Mvy6&P%^uO>>F<0@vq|b7c z)dty-CVeVa8Qe;>&)2O_SP>A%V{Tb{nk$@{>|kqPZlHrV`F3*kDFcQR>be6%&xdM1 zWzylrE^P)kw=t$5ug@+Iq4CwL=Uf3T-pZN0&xkjVc!Pq|)rY=i$%!2X4y3$5ejl8B zz5M*`OUpa^ua@^Na+ZNfi5JT3$|NhJE>pgy9rYVDN4A$JxBGi;IeN?%axX}+<8JxM z;0as5y6Xv^?r$l(rv06drmgx}d8^x%1?>!~&|w?tS2?MErf&162M_B2BU=}0v?GuJ zb7k*_sF{#JyW2Kh!jo-PQJuwWli|dGkqL1&jmDu6y2Z7@H~ft!=xvk|fK4UnB5vLA zQ|Hvp#QW0nzh-g^?*`A7qj@kmUr44BxP4>WZ1c6vbMOo~E3KT`0GJ8g`M1Mf11uz# z!Rl|sSrsLN6@uOGroJ2$S=iivYzNGO;lLv^0N|(@Tt~Q64)Ca#6duU$`T-r6&v;&Q zO9aJ=PMQS;SXh&n%#M*KF8BEnxjWx}$r3$ZONFjH-1+Xo@&N;g-+%lOFUtqvAqT49 zO+9zMfi%juBj5R}j#_Vt7i8#2yWJh9?MpO`Dmj^aG*G!Z-6#QiZOD2ctZtkUb^xK? 
z}F{7>(=GKhS#yxWk2 zPa&E%s6vIXNqPyz2s)dlx#4W!DjYyUPfYqXmA+_pOqCTNj7<8llXQi@>ShfZN=L!P zPtmDKhvo$&gO2Em;nsh{sD_PO(gR_@DZdFZa5#+)keN{qq6&AdpcQD~rk|!?40T!n zaO%^@Tbse|$iFH-i6=s8=@a29_n?jVS_!lc#wZLl;017XK+@DE3$ zne%prv7EONWp`jJtnyIY!e!vi3(sZxmoJ^^^7f9}!ql@)rnA!#T*`*q{l1oUCm(#F z93Qj7DGY5(PvrPz`&@$w4;g%cC$bJXwOYoui^uxyusM&CaX~U5!<23{-<46klv@-f zj=Ft}mmpg!l4*%ib|L@Cx`A`2R6Dxk~$_|bL zSl(%LL_Cz~HM8pUC3>(!BJCGr@<;+s>~x#1KAec@Yd__;UOV!~JA!ZZh&v3aHHFbM z%jL_jby!;JN&MRKphjC8yh+uG8K&RE%Ry{+H$QtnKm*0Y)lPotKAQ-p-10^`{b)zW_Wb)e1nQ)RFq)d)j=r!nG$O0e%M5h0NM7t5Zq8Rv~bIXDewFTgOb zjjRbnt1u!X{eEsHpUNJn^ygP+2$y~ej*g;~X1<0sI>o7Vb=*id2U%;dzayZ4uq z-+sWA$;Zpj|LSL1ol}4Jx#`i14Qyxpmc%)>3AT=;7hDi^g}_I-Ub0>3{!`9fJUFvF zeD)~Yl~`Y(-j)6=!|LOZ%U`OlkO$!ieCS*P=55ZTg8MZW$|$4X3GqO26uoe0Si|6A z*9(6btnAa_KSro$Y=QK0gwn;x(2+L|7s5Yv^w_QS&_PV@yj=MD%(C2JJJT1Bzgx~8 zK3(4B?8dL(y|G-{;R<2SoIQQ=fScfsmUnMlTlVR{oM9J<+a~?0x^44!=*s?z15YP( zJlkb`x6EiObJu@aPrmSDP9+az)v`#KtF1n^hw4bZ_Ns>~r4Gl2fN4J*;3=4Wvb1ZK zO>n7$;=!$EbQBBu`qJsfN4wo%I%T>-8n`KJW;ki|SG=(hg!%4R)ofJ>BwZCCY{#3T zap-Q`!q)7rp%vb7381D022U#1fRVo8nzrMa{Ef%>xRs%SCb$l2xNiJ1Qz#A7^QcKF zX4IQ-WCCNZS0(%$+pSyyupvrA6MYfEXxr5k3yCr`e(AmD1r?;gnuec@bWhx$%b?(l z@leUEG8OAJ+m?1X`>?xz6|@|Vp^!TZP@Ghyuo9$rF@I$uc##NIcVpo`<(AkEx54ealcaN)AAs=|C>hDu;5)bIib1 z6ou$Vr@AXJe-i_4U6C1mnxFmh3#1LavHX|JrfO=WLr>O;s;|UFZ>FTdhX3e3h>T+v ze8^-O?-#b}wHU<|sniS0hf!^$svZJAVwAtwGa>v-Uwb4ny?JOzhmk3PQLfy+Wt@>g zToh}@Qn11hbcw2XVv2&g56$uxYT6s4Cq9ac;zP(O$s~`uEWGBs7gM}E;=nNVDa+Vy zab(ish)%KO)Aq(L{_`RRxA|e+^`&(9f{vACjr#T~>3Pu+o(m~h5mhTyPsAtfQJ#2F zYYgl1rR3t3c)|cLzhM~sDF^5zT0$AxwSs^v9{gnNzAr*PgmuJ(O15t1E88Hj{{FVK*h*ORq-(LSxWZp<<25Zq`4Tc3 zKjEr=SGsZwUun{=iae(2ulP(~IOM_iMfvNC;h{9jSM#WRg~x0ll)u{gckr5-TRd)q zaiuBx2BEwp_sE+xjnM(=@W`)iszyD@z^;SDbjobIhyXrYv&6-dAaOE4?fh_()2tUB zDCpa^Fs|67Wy91OY5OQ|7~0s~mKrZ_8ycgcrG6C`;2FArei?BobKy+Cg8C92 z3qYOjxO)#CE&uq3kC$hhm-zKx{c^c>h3`Uu=E;4&CGmpoL98s?cjkm#o0-p##SlHp@r61H5tZH3KcnOJp*iEn}8r@Q&e=3aJ9o882`b zvLsY%Z!ogZijF(pcCBd|e==y$(%ltGyJ6lyW)nn`WMBoX`1QXz{pkwSh~5laI2uN~ 
z^`SkdX}94vzIMuNyBuO!{j^X4(beqFEVJ!2*rxu>OAEy{POhu`}danx4&Va z@F)dU=zI6S=R-1VVfxdDIguaY{tO|+<-bNNI*G5`2}kzU)$G^Z}XMgzPlh z>IE2OOG7KCmOX*^H?H|fK++6ygNe;-8vep>S^;NM{;U0CygM^@IhoCKoI%G3&P?NVAF&^u#E6Z zyOBc>o&v43jmz+eYorkV4`xl@twLS|g8aa&e0k<9M1UzA!9LNh@TMuuMveg`>4<}E z**BjJvkejUkQ>>&E;aRwnS0-xISdR?XoB3AliQhkrQ+nIwlg^~vc~}3^J%$4H8f#X zhM-ZV9O4RpM9g{=M({S#T@qJlbSA@w5ozdX8j~dPO^irsWaW`I(|zF8>8(6ce#^q9 z4Jef0p))ZtEfpz-7YdSFTbztgMhBZt#--~bKRl65y@sRKacxbFzJse^ZJ+v`G3%k+ z2;e`sl(%q$FYS#%P#8Q*QxhJ`75F&$*~q9Yzy?3IsnkIpWb5~l=>iu9^u5_sdnw(L zr}!>^^boc}1(@Q|G!54<^1bqI|GGNeQnf5rT&0iDLm$SIjWI(7!7bhqioXEFp(=|O z9*k)Pm7mBM;QQXX3P;QW>_11d50rIcT5GUSzhQ`{_Mo7 zXU!m<_8*?)ymec0t2<>g51Vh(3Q2wjjX&0}ipLYvI@eae>a}EU*~@NtZE3V=*y4j< z2vpuQzIvxLyl|%fj_X1H`Gb$>tiD?Q=CA*XZ$ezpd&h6Pe8=`C+mdU}#QW82-s9cw zs}oD8s}bjD`>zo9_djD5kY4p4zTv~S&pf|CXZH+Wpw3Pl1a?*hqFPX=?kFSW2VUzP zpzh>T<&@5v-?~YYV4^bNMnrTdfcW%aYS2zV=b3bjk~DuCJq5h{=J%DiZl=93&(;4C zAJ;qP{F6IB?%%u5hkL(b)#illr8``eeSZ1LI~SMVynkc)6<2UyapJ4*wI_*#a3z8hNX#0sD8B8N@Ik!SvsROcO#KGx-StSFGNEWqUkroH`3d_(RkBW%BY zppGp}R2Dl4{JAwMD;8`+IHtq!=)pa%aOEJ>lZP3&Kw`PWp~2t&{^RAtk3U|Xb4W>} zj4g+D%iLj`_&~2ehHJF9%Ug#BVEi=vcARufudW;rSeYzSWmX!UUVGNRTkO<%H+m!C zXh$r6@>um@ed!clX=_?m^CEj9N4`vygvt}KD>)Ts!-;4x17Gko?vf=m3P^&X*)k=# zaSoVNYTe;cAxMK)FkRj<)6@-2a(=4J)eJ)#7~g>4@Jx+*Qnxc*(-meM!))tCziCSM zP~8w-U&u2~(Y&cl1byT&Pr`BZ(6CKou(8q{X*&Fr-MolS9#2Io`l{=eBOtAn38-*MVQjfrYJwo|&jJi@Iw&TLvP zgmtUVHEwNE@TS%H?b_^e?&0p;yUVRxx9WwMvlL8Z@O_1&qgvhaWvSj&U8`RP#8Z#q zbBwW?Dcu{QJAEL{pCR0A3!^__)axDmUg7(CXN9*zsIg5gV;t0Fc#CUd66C6uZHRjF z3)vY&;CFJPXeDu>gZIK$d8NKfzvY+5P#S#B{1<}y?Q-An?T2#+P){dd<6=ZD%Oy-~ z-L02SVwhqR{BuV5mm0jcGnoRytostZl4D7_}06 z9=7m(}s0? zQdMG|G4?pNVsZvUp6%t2~vxk1gNlQAdh zGm(Uj_efCUl0IRgTGfKb=$aU4@f?V#uZ{W|9$Q)iVc2p|;>}Uj=F)Bud*Si{vbo~> zVtIDy!SaNQ8lF9Pki+R;-F~`!ef#_6_n+KZe({T+EdS##Z!W+2$vev>I{5pnroIN> zE6#DeqSK#qQyD-Eitz5440$*?tZndn!jkXl+nix57n3UTlI`raX|(5OPFU&i8f&;? 
zb1Ua~eHVxyZn5J|c_Dj{-LkV8L`bX^tg=c0x~k&Le#1&}t+DbOKI0H(HgCdVyl@TM zcpJ9Eh3l}kXV`YnaJuL8@%!@^Oylin{hhw!I!wF6GLvLxjHfbc$fRteVd%6ycxavt zw_(!nDSf!c%_?7Rfv7D4aFUMQA%_Ev^UzCpa1`lX;;cZN;m+do7q78=eucpVH!(5V z9L^K4*WdB#SpS?q z!h|2;M;g;Mp60D_X!k6TaGJn|QFBS&yIvH1{aJMibmOM{BrP40p^sjAwd2*R0!6R# zPCC&6R4h}l9@#3Erqit>8LWU=eui#~Qt89+9K5Hju6lqGkhmp98g2{IwCt@5QOa{0 z#xD_Lo~Er=OD3u@d;;V#sPk2Xenc%BfNP*n|2>y zm(6?ev(}xN=fC9cG~&uu82}sKsWgMOVRO9hHnZEXmg!CLt!L8+*Z#KWFw<}2eiP0& zg>|~dtG~l$c=KsC4byhVwO`x3Z@=*=mqVVJ|E4WWuMnE9VN92Ur^Flf%8!+05NUtO zjwWTCU+957=o{9&t6C2|p^-O9rc({#MHrtz32<&ic47UMQdBac>N?MIwz_~4J9&_2}u`=7k`UivlO z?soE!G6n0Nx4d-avPh(GiiUOa3}-v9UEEpz4Kwo>*($jI)q~|Lwh0}gfAtQAk)|d( zcIYG7cGZd@x=x!yp3x>eVk?kad)W7#cW%W|PUM@Plo#P~m;RK(SWVj$`%aQOP@86EgTfT4W|{~vyGf4O`AG50`ST7G`% z%yOL%$}LQWc;Q06UxO^F%a!5IlVc{L?M~J=)m(9~e6hWg$ss>pDUXVF)~%+p#+i>- z48+h(4J611acK@2OqjoUSCFrUs%y6l*HAcMr3e#s!Vw(*9M*+#+ZNui8HWW*Mdi0V ziQi7UDR3kGT&No-3B$bpPT#aPFdJ$*R=AnI?u|*lS}aOgGah+pKAL9JZCjq?T|P2< z8^13iJNL{E5ZAIbu5SFqVW#v`v*75eTAyTi?iIJ7ky(d~D4*wG4a-@C`2ukidc-C- z&qiFibaUB1Uu;COKL3Ih2tSKn4Wjyz-&)2i?dezUwx!i@x;w7Jglk=SrQ{d2_HZ5t?6<;f=O24+jjAII?gn6xc)b#+lCSDO~x{_ zonARKb9lSAaTuO5(rEx_+Qx0mYI@_loa_HfxI51Hz^L50BsR?vUwAC3 zwg>+ljN10K?l=4?2M8KZhiez=l{e{BE`?%#Oy?uKJazikL&vqlz;8^?^d<7Rw@%`h`x z6~0Zi{HF6g+@^J4n!>Lvt4xO38Qf($+!Z@YB=2YDZh22DHqk06=qmlHV*}?f%d%%0 zG^vZD%}R&1GB$Dkm`>a^kww5zu~zA$R=UL5_11K4X1X0_I_VbfxePT#*}@f#HQY2E zKF7^&dGuYXI((ZwM@Jui`Z*m_zLdbW$DjS=eP-yrOs{Luq;h&*17S;e_~_ssGi{;%IHKYsfH z+rchycCVaQSQC-CD0Uv{MJV;LL@=2q*ygr%Wy#$n(Vwi~M2o+N&x^BMfr3%>TN zIXc&`F`ZuxbbHQO&OV&KcxBnYa+6zJ8O1W1O{a&^wJ&-Lw|G6AMKQ`K{{=64uW7Q;ft`aRcbQ|V_>=R8@BlYLHNMgWh_x5@Sn%BFYda2a?r8dfXG2X8S#KFUI|r2 z?w5(MRS%4-_!=~*kCG$wk=KEsc~Iral*LDtH_O(?8(QHzv;H!Y3FpCu*Q#_7pDw?tK++j3T9aNj}@vFHgKqeZ5pXhCFFoA zJi%2+lg}O5;ca*Lo9rro#jE*L92KPL`!S^Osb|fb;tFP)|AMbHt2-#-b(X&O_&Y$q zz{lbIGQBSd`HB3LsoJhn82l2nD;M5R{{H*#XAq&D@7?<@Gk)en@jlug;^bLVa;$a* zCWas7>+%8o(7Q6tx|s7c=}k4tPy5?8ys*wX!wmp6!IY#Gy2`z)FviwCb;IR{-|>iR 
zHYJrIt1I!D48ld?3?#Pr3tvONt(1eGHMC_@uj=BIJ_e3Wn>4W^hECgd>rTJVTwaoN z@Hx|&!yAubf62TFW~Q}~Oj?8gsBcnk<7m5SCFe+Hx%^9d6}I6*YsfSTLSnb&RxhpB zrll==wzJJ<;s_%r(#hSY1Gk*1+stZv(<;+8jl!$mM_YxXFUEMy*0}Bm*$(ylYQ|A+ zdFj4TVXR*#FC5%+ty>oW$RbFVRQ>O8}-w$jYEIKmTihD9Zt)o-aS%oHmSB%=ZhA~Og$dC;jME!ZL@b!aySm5CxS9goc6&cU3}g9pNMGYio! z=KxR@)fsuYmd{K%7hI*S14kJQ0G}_-0_^QOU=KO>gN43WSVoj*<#z^7c4}KmH%gXH zxMx%tHYFQQ%2qegk0<6^7G%x#0tOE%@-+uul*K)P%Hw{vT}PD%KW6*V0cRlmun~G< zH6eU4fJhEX5MC8s8c!K{-@WzCa_5^*mdB58v9v{@p~3o!kHGwv8<_t1$tR3Z{kW8> zwDGAAv#2UfSD^B(zoiSciQjF?9RINnefl_ng513ym7mhO%>P>3p%G#G`LcySa?%ZM@-kCg8LZ<@de z7y4TAS)P`u{K8uHOkE~r0R7}eLIpLIVe(`V^=7HF-Vy;!W=x0|e+G%b$(O5P-4T|K zk1yLEsCW&fID6?`T;U7GW1DyWK~I}N|A*}GWBD|4GHN!nW=b^h*bz%UO`t`~goUH| z3n~)|i`B-U4k)};pE4*?Z3K!lo!kOWUL;lG&|ScWjKf`YM<;hf5aRx;FZ4qSWFi4! z;oE!&g9W1sWY0E2($0)UXM`Kmc=X9{>7v4mRqH&wgSXVF&P({@g9$OZO|J1qvfUEw)n42F=4@p!$Y zlONtT2Vzo0{7Y{BlWHE{DzQiW~fqRbuictty8EB(Ck2 zUm4qM@D11%SEW_gF-gJ?-cTxMsYa4{3bynv@919MOkqW==;(OXeboV|h*&)N@%<%b z11p~@zVg-lPi)GFh=t>qvQRSUHeG^KUg~{V;nwxD@&-)91p`0P$w=7x(QbFq>2yWF zxW+%juK0^qaZ|s-&GClLG#a+U)C(zA7Sin z>Px-^jsQT0snpU;-6y=>-$k>-8d~mcYqdq<_PvF3Krg8a&L9__*3%>>2-K6@cK$Z! z&iC!w=Ccxb#DGL2T~(v5+Q@To1Pz`M7hVRC0~w~6(?>9i`hefyN}YkZr`=;f|bRvQLfq2Az#HJRxawOQ&#OmnEos|IkYY#P-}>5DPwb4Ddcb#|MTF z(80mg$IHXJ43Zu`VJrHRBU;gTEf3>{z_KjtSey|rv9I<6A zH{(IWc0w**xs8l1OLZfa^P~^Xk-XS`25CI?qJ1@%MEa@31m6QxEaK^-b zyfP`xt{G8cNkjLDB2eKpT^J;0koDL?U-g;7qap+)UfY6VCG^0E2R2XkXy}NuQTD}j z1|jW`2bYNk51|-D1DV7zwVxc`kgM8b85vnc#;`WS>2$LS?xoMm(LJcnBvoS&61mzc zd{h<|1U;ilGp~vB~Pp0hQ3+412gc{f~Ky4x=sh7Ux+VL4DP zpT;FzzEBPy=CQnVc`X|p%Iq&zUMi%|pg0m14a<4-ugnAgCO=FIZu7x%S)Ja_lj2|e z!fQ$&MKv-sPU6(3cq&{Nd5e~#BRoS!@f3}^3C9@2g!j=@d|veMtPx-9sBA+Ue>H@b z^ITKs$k15Qs&~2D?Ev z#1dQdD^KcKe43RXeO0!)EE%E^#e(TVh$=B#IFi_6DXT}WZYAyNa9FGaWNUDv^?2SO+Pqyux$F!4*Td5JBY z^n!P-OO2;()3*_}{j+U&!_V$cCv1mznD&gX!)Lrwww@R^hrbEeMu&|^{H7o3(^oj) zCE>-BVZk@@ta%brZ}okVdxNuDB`O_Mo{%2yez?fjBRXmN!r%M z%vaDrx64xFEq$!{MB+sC&uJX1oE_h`F`HN%g~d|9tx9ZZy32Q{PEI~XSJbhe{$#m! 
znJaW@=MR~L@ZttnF0Nd?&i2aj1^A87;6gSL`Hs6tXPf(Db92^(^K`KJu1gPq9X?vv$<2hW%1OoW`!nf4nlul;5V*T}wh z61X!;=4W;2*ASL0DrbU>D)0hQ&SY1LPK8Ad)8O@dOW^V3JxaL!v5VKQFZ;C7hg`M) zm=%{N-`&T}_EEMlJ?6UHFFyKgx%uvomz@iC_&NAbyK7l-!Z6>rKz_G&SuQ-Ymc9Ty zs9%OF(4@@qJ+Tw|m}mJovD={h#>Gpv(G!9TJ}ghj=!HG$(~zFEkBU_Ku29hgC{x&x zW{5c)LqHHJVAZyWEQtyuMRmZaOcvY|JEp*>bo?)=9Cm!Xr`OC^K{d)5=!Em}M~;D) zmvjWl)QYPp&mLs3fTB`CNM)nm72H$!;`Zez9`C(XpzP~{r>ZC?%VYPQ@vQ`vcsjvT$-@P)JaG$w9oat6ISNb;3gOT`2J zj5EvpWoDQl58ISpkq_ADjZyUB84fs;xtyM6>B|AcTkM_l{DUKOc@d$e@-e&-WavYNC$9vv=>b58C#%0=0Z&;Vb>`R2dmMyGUkRh!+mNmZ* zf)AKld66eqm&9YavEyh1kVeRyp)R$|#_JcCTTC_w@1q22rn~?%8~HTcyQjs|yw!k) zG{lRycsJ!}6L6VUc#WFd%XdS{Z@7GDn?5kWcW28KY23{UU|L^|ujM5PCOKrB($I8#+Z`tU=F2eg&3?mXzwUHS2fOki4a1vP-JM2yvwwCQ zH}hk9f=-1sZu!v2-)zL!bPW$Z?FN#DGUC6=4I#b#LB2J_5(kcSh^ifGo{B%uXdUM1 zFPmCUEXQf@D5thB^6AUlhyU=gt=rI>>#u2Qx0&4?ragzx?#4C4o^lWQ8;5Cq-juF+tU*@EZTq)vn*N+O zjj!W6&8odEh=^il`-!N3V759#e!8}Nw<9CbYMb2lrntI{<^X19Y=Q$8;f9o^KZOwZU9a!Ieu=@v{&M-|RaPeMEl3c(y zj;uc5x83($ryebjyE}N)06~D^h{U*1U<*>=qM;)IrHrD?ME*QwZ)!)pXBnnt=9Uiq zA=iQ$Z-?Zpvsu`7t`*2{6l69P3A~O>7LFO^bwgn~f-t`9iMNv$h0=j^+ym{fDc@rd zapD&^z$acib=p>9XX4xkcFuia$Yi0*o-ame!r%?u>s`Or%E%S?6Yj^|WuS2EX?0`) zQ+7(_<5h65L(c3n(#vcya-EDXle)#o5$jp{g6G`iMx{_66y?lN=(aQKNH~1Jv)e-s zdHI5r++xr%8Y2tRA%hb$(8kG6pTfBV()BkttEl~My7|WzvHo6{EnX??$Srg@~;>$~A;2nfQ zUpOn?Y>GY|EAUxfOWw6JxU*BYAsBbeY>m2e5a0~3a;Jur$VofnmYy9tX=p6xx~0)P z7rX;y$}v1>Ym|W$ag!kF8c)!z>M0n0Ez{P2aW@_9PA6T>9JVQFBfR8a>9sDCaQqn7 zD=9OsmaB0Hmo|VE&!#PkcFT_G8@_4GVY=JQv<#d5;%NVN8ewM|#&>7Gd1)AJ`BsNM zbNGz6;mofNZ##*mpQf?e#e0Sh@(t`IUdnHABZvn?H8U>xaK?&Kv*9T~mOjdqWxC70 z^+3Ppm2}f^#m)SxKGp7ZFn2q4cEHtG~#L9HqNw3 zk!Mtw23T@wG zmJsD^Ic6D!(aiko7FWX=-)Xf&PrN*{X&BQrobm0~Zhafy?#8p7-n2D)Tsmqv!!^P; z?oO+&WXtf=MxG2m9jdQ!XwUBG-tvM2sZZgQp83`?G%t-^Jn~}r<^k8NSKab@a%jIu zeRS%w#j$PVX%p%@3K+|UZJ2`y-$T-!%T(iN`5JHWTzOan5c*a0N$gv>I;g_7<&x(P zYg>Hj%c-bsGh{eTa~p`-KJXeNthl#LLJ0Tl-*@*PEWbPYi1V1o%isL^mz;-Sl9aaN zIfJr0ckWW>PnNgezR4A^m%z1_{p1&jsom(QCMzznO=*uSOZ_7DE8d^(xS?l=o@ql^ 
z8r;i3#D1k;o@bW-0+Szfn(0|BKfeA;z8-yb`M*E=oU;=2!Co;5Z2yrw$V5?usB#tc zka_lD%|umGGOdyt6+@c|qVTR?odli)ff@b+F8w7!gav?3(IYbYvEPfkSC%~nO_$Kq zV+LVI47hH8$&H$KzM$**oUL(u%jP`S{R6w>B;|P~pea-POolu8oPG?QzM9mgtU|4yxo;4JS1`=3=O7$C#OOZ2JkkLibaM$X+iyg7VH4239B3|#wRQe+8qD&E6uFJanQ044I#yKr6Sn<4Jp(NZ5IiQm_~~Hpau2Z_`bEBc z(b0Hhdb!I?87mKdi7!0E%?rBweqGc8@tS3?D)e|rqmE9FEP@tNmW9R&fc)*kzp#$- z)RhXh@VfsThN>}ONzCs*cpa;&4uGQ%vV}&*L&mM+UG#g7?MWG#LOO$}=dYH7lWTlv z#yF1J>K=nW}%LD@!mrs#blX?R&JBz)FVYup+B~A0%wq;4T z9jt!g2xoa{yzg^Ljj;XN+wM;D zCVH!U^4Zw5^_0d;_olQ$HSVU_u*PY^j}Fubmb8_-v{?tdh^#fnD_fTx%ZY7+^pkdC z$`4+u~)OjAe1a)s0}q$IM&vJ%`EW9Nzxcz5LWbakPJONtkJb=`ztc zI=wJmX9GX#TgPi_jGImK)->ig-5uX~*ztxt_&4|DH2e^Tqd_+A%oNYz1Hv@h{OIqn zwu5`5=`g*sTYtmNcxM>h4Ks(Eb~X*ywsE@WG&7zJ+@#x%@3h*6`K)++2dwR^F#j6U z^x8I!xeAB8hs=Y*(sV30ZJMu+>+r^9KAWFz2kbD@*p9lO{L*(7bhofJ*eNgZWb80LWiy)_IG^U`sm|7vK8sY@^2VOyz|c6Sygv(_x^YH(s8}< z)(KlM*Q=)^kMi=qRBDYoKBzZeP9z!2x#bQK-REoXH&fBZ_TW+4V`$ z3xa~{UuZ=>;g!R@8vP)5^7yTugKK-7ncx66-}yRu_-J|f)mO`z!|&-`yF=hw&Qv%_ z6Vop&*m~Y zl0n4FojM3)M7BF%Zxnzw@(1H0P)Yg~!cTA?F>p9y_UVYu@*yu9c3Gt}B(8JzR?pi( zezrUXwq~@bT+XBtFU)K9mfOJzbI|C{|2aB0=MZj>(cf-5I%pXBcHo%@sMZ+8yF|_q z0({KaFlh0xvm8pp(UKh}$4NFoM^q$5kZ!ibpwQ>9dbI>xdyK9avHLi43s`bEg%WpV zI=u2{;XHiJQW3NLe&oYPye}E?x>})3uaNBox$O+TBJ-c{MZy26SP4uba#1C#8%L~i+wPO&Xx zzPz9E9f;fCezAP@`A5)sz%^iJmdB5tEr0s#i{-;Veau;iZ!1qx?<;sMPnX`pr>|eS z9aD55g3+x>K}R>mM^tBQ;u>CC{e_OPBW*_4RF;A>$yx>zHt@{lvzKOt7N7aE!Hc_^ zjpSbHe4ql>$G?QEM7)gF$5T!mgc8B2D6+^p<&v;u_M@NVsA=c@LisVk38?>+i7NbX4=Bd zdRpNM_gP;~2z7zmH!e^3kSRyfXu77)`vzO1zupnZ zWG0U=;X^CsuwfH@iU(n4)BfhE{S@ec3GJj%$~JseAfctu^79*h(>}qU9Y*yeEz2Q) zx_z2X<52=)2rt~H;*A^WgzvcE#ouLGopzXTcw&(tX~G*Jwl%Y9d>s}X1Ew@vF%749 zT|PUk?Z&bChpswe%a+y(TXN3+)8NV>?<-dgyKQ&6E*mEC9VDFPNDQ4u9(-)m)fp{* zl5ej-H_jj0%}HyE`FVuextM&A)!# z5aJ(MEv)_;Hq7c0taviMbg|0O@Y)Th-MChiVC3I2;f+6!>G%BkOFF{PcUQrCKYll> z1hy~vDBZZA#tR|rGqri0OJH2~!+_il?l}~2!d0-oQ|$k#0Vkm|ah4q+PNccIL?;{C z+4@c&>>Pal-8dgUzNCrYAc?p8!Cu|G5_V#k=$3M%3KemIeA~Vzr!)~R2#UXRgrZ|CWi7vj< 
z!Z3+GZvO(`?l;fBRUJ`Igd?yU=mH-C1UDUG81U4$9rqJDu}}SExA%oRyHCeA*8ocJ zIf%rI!p<`wczX0K4I-BeWebn9@l4sLF{#?-sM0BIlSPE*WN|>DFfh0hYs3#@i}`2 zf4ZfU#*Q@S9Lb^7J+>;PGMICS=?K<=5oEX>pqBI9!)KAT1L#VFHy(9o{53CL??2w< z3xZ~b+w6{x(GT+NFkpyWq`Smm;nM!woE2hJeEB!*-D4GkL516?ltCWcLWVRMM4@dm zoI=V?0+0FX;q7leU%vhNljX&;2aL!W<+EJ)*`L2yKKS^P<<{-n*#@LMIeY~!3!jy( zD>(v`oGNL!<2dzGjYp@Er`1%PjkbAFx-MJ=Q)MI(NTHpOA)BZQoq6w|ade(dY=95x z9E9{sPz?2gD?aLlDj!V|2;AAqM(51nS?1zXdgAxLWT-7@;-<_{{*3qOmu}17c2Sc> zc`P%#JDoADGvj9%-R5b_{YHQ4V^cn={HdWzMA`GDL%g^_Fk-}l0ZiUi#|TNVVH^2~DLMrtTw@6e6Hlho z)(TYV$wRz0yf%vR9eJ=iX>}mPu{~#U%Jzyf)TZl)s4U;D?+$M`<62+F#eh@%3S~&@iDWSYqCEa4oF8o? zQ;3YfW4q&@;rw*F)k(qumZ2X206+jqL_t)GpuiY5%a%~nAYk)uSfwq<;@7kxY;=q? zWy5sFXp*j+M#N=$+n%*OLDmf}X@~8kwSA#v3_O-R`6`5w1A@hmVY)}VQXu0uiogx7 z@F)J_(DFZ=fer5o7&+umrjD9C&ahEqzCDooCzkUo(^czy8nvZn^MJe^~yX&)5!r%onDC zJ)18@vyFL&KI|U&=+GOk-dy>pA|s8_2B`%Zicu!DR{=ULh7~4UmC^96Te!fJL|#p5 zSduG~wak7At11_`9OFknTi)V>!wxUhd9UBugY{PG{O{bg=xmx!+wS0QpO28;vI4jw?{fj9? zFJ3%do*X?%BVa{K2N2wTb3wwcd|0vMDO-pL^J7Ui&W>>H(C?%o+9h!KfIOWg?EG?l zxq12L%iC9f!e`Do50T|Jd8k6D?!@I`n5@RY&@p>4AAkRBx&8H@S%LT?_2iISn0C2a z`2O+GOds4?O4n(DNn%9aJn_ZYi57D4L81d+IDxk8vIzE9S{^g<5o9e=&nfV7N=Y1 z96VEJ(X%wg;{n&q5)b|iH$Obc?Lbn@{0Nf8rBPT<`|IVt>@v!H@BQY)8__ z(TLDR^3^B4aE`-J^=6n@G|)_SEIt}-r!Rl;Z%sXuZJmX)(t{yDYaSRb&DnIFG0c1` zBh8_20#-f9FC2h(!v}dV`G~i^Qr~zo`NCk-vo_9PI%u%{Iz!u#tw^{FcgZ-2$9n0@ z5<0#}9JC4lGA|p>|Ip!y7{n`0X3N}8nDRjWK&!+QZ|PQjMyFWH;7V%?VZQht(jSYm zR(cYy!lJcNZY|GU2gKIAIC)cb3RL3o&A%~KU(hm1O14eZa9HM)xy#^oygY5kgm2D?WTSvA+O97*ffJS_IM!VfC&h1bp>V`Z;v!Uct9TlDe8JR-(S|<0&^*dU*TEvqSo)=oYy17tS+tzrS2(@OFtq+E2J?^OqcE{`S`1@`>Bj z(T{DO*P`04%d;xZpe^N8iL9LD4S&SqA7;pwcW8+WM49A0@daYKps*N@EV3jJ8gy&o z7>huBTm~SZ))w9qEShxSk;!E%KV)LZTYlx`F(30#QQ{T9BQ7h74Y6CQR76!q3lNG%M?RII zMx(hrY7!klM;VmItx|>C!HCOmHL`O*Uar>y#&aFVT(?RB=w0B@%v;O#3-2v&UHUO! 
zWMlxs)~*y5%UcSOG`X=yIjBe{oEve3_?S(S19>=uK)Vur+?-f z5TA0L+IR53?y;0_ACYeGOJkTdz5-?#Se0GUj{*%y!?$fJb#LS0%iB&+(G%A6L9j_D z{7eI%L72AXwCGedL!05haV2uVD^oQm;*_rT`~XN^Q5LttXMbR(_^wE88{cM?ZDi7j zm>W-FxChi%KsD{c&uZE{w8Gd=#Mfc2MM7 zzZYJ(P4hb@EFSvhSnxY2XjN67npd=q&FF+%?z`hyG?ngj`G`)Zw5FMHSqIF+?)bM( zTYuX9v`o?xPkynQ-vn&^n5NCPTYrZ&y@oyQpKt)6lrgh#o50e&@y&MuCpW{aG>r>w zc~~(vXxYG|BRJ?(I-MaV*y*xTP{m_n0ZN*-SAc}~gjRT-blUs~KJ(kj+m;9W zM77<XBw80a4)Stvm`7`s`I%vCTcN%e$^arNlwQY0#Tho+o-K}#- zTk_?%x6((nF8}2H_m`_&$7;XzFZrcw#$!$*F%G5?4eS&}3@}}Q@{zG9ss3V@jXawyXnnL z$$zNwCg6vGEEDSQK>YnZbxiY^nZ5;)RDit0dD)~3+V8e~)~EJSQLT%6uL z=WJ;aIcD@?T9?RNRdBSI$^!%Ep7S|xwkKIBs)J6tbbuK=aBhOvRnaA*U-6-foNqv| z6IMK)aX|YyODR?mHM|GC!^h09j}i3|X}^E|2vJ!{V8r@_6$t4&I00!YhVlsSb%c(x zTza>=?=4IoEOb>QGF#C2JEo)U%8DOG@f#HKd?XK;r!Q)2R$g+Xa|=MUhupqEJvna! zB5e!)Yrd2yZi|=$5C=6GsBz7h73heiBp_%g_H%~h%K2-{J6HaCxpv{l+0tZ@u>z>u zx$-%P$w!~S?*|AU^RoVi8<_6g{)~eCBn`y(gnj(!r^`nSAntQ$$wnm|OmKF`KzaSQ zkLEz6Ujl&qi)Q9UK>gcKgf5Nx?hIg3mHrqJc-|@5EGPj*L3mR-eF8F8o8JGOIdXl!zwsT-11&DPV2?lLo<@P7YH1i>?k=2E>kGtZeF$fX}-0a#tgSXW2SD5#Ag2{ zB96wXM_jYNA)DQ>H()yMU$UFG8K!9%HoF5e_-@&XmudrN-M$HLr|Gzsqr;1bmSp&l zWn-YdopzH3uv3#f(dU$QgQV)xk$EKDJkM9nvSVe#~Z@Z{s>$Y;@&zCiAZ+8?Bety)$E*PvdLap$UwB;kq5sf6Roc zokQ#GGdj%TvR{*rzYW@r&;E)lGjBA8tK4D4-A~K3`U^<#MrWLG+R-2AwrVq-b>yaR zBoBuS7W^K~5&C=1Ijmp&{1?kx*HJ3zp0QeS=eFOYdAYoGv$i&gv+*}Sv#rbyB<#=H zUU`etg&nTS=WN&oI?;Bd-CFN%nWw&U82mjydbAucWB;oQ9K7DSvV8Xa!{yNtu(Vxg zN$*7y_1;vrp&#(0sCtIFS_ff%yt{xNtxUCdf~uPs2Uxl%ee0HYWhyy^7l-9dy;V{K zz;SLlxcT1lp5GO_bY;2q`~P=2;;h8MkKSK?`49iy^3z}cEjl=#v$@qTJd^GB;l53Ge_OS0gSk>U7y@p8>8nD;O8JQ-68Q7gt#!kiVSifPY827B)N8!aM4<)7}*z%Yo>_50RYVAWl@s zb|XLK;6Uk+g1E>O{{c6j?cPP@}cN7FF6twhGpGsnpV zDsZnn37%$W^r72T>X}h)_e|zZ}MSHtGulb zQ)|?r;Xbp`?+d@|su#kVezrT!nipYglg{`qQ^NQ(J^5{a2wVx|CP2wUL>!kSkVLr_9B#T@dtV7QB3){3+wKUDK4l5e!C&9$8_?mv> zYTIz}YPY+25~kC&ZJ6=a4e8jiD}Uiw<5%6!xQ(Cbbe!RunNEdN-s@IQpP9dGgH;Fc zlx^J9vyL|}+dbpm&{R*Bdlh9}vF*n%PV3oACR@R_!4YU+y$x*}uQ=VZZTo56%o!-# 
z=~7bK17SK{CM-AQD2#|cMi}MX=DFif>!9;QBiU%eRjXmoK=v={e~E%YaY4*^at< z#t*)^-6}-jpQw#d?Xxk?usq~*w@XPh0Y~Sv-$oRjSWZ+@&T-t^W|^;hyQ;0(NBK?&wN}?-ofc)_)Ge;u5{J;(>fogE_ca;&j z9aw1h_Mb;|;uYD<$$=R5aOro;&=p2XcIG`dAPLu0MDb%>ces-dg4rrZ zc;jgt!p=BXzu`q^rqSt?VaD|)m<{ZtgHAKk?6fQG4k^3yqtkTk#@|XM!mye>DBauX z(=={zx$=c(H(?t*0StKJk`Ku(!iEeRQK0E0K-KRxPb&5Jbs#*Pr)VTBr2w?1t@OSU z3%`Er1vUZxl_}WT*KIwj7(9zniz2h_iZ~Mt!Qe zv;mpCuwC*DmoE<5Eb#bP|9xtpAmLR``H+rr!dT8vSk^B;VYHhsIooyj-gnFGJ9je4 zbaKL0O(u1U|8+2_-}GJX#jS~o%(&#M^opi&2}1XMLt6$V%^#g4c@tmPKhw;nb=3B% zn?Y;_9`2dHhOa2oRECypg`?d~;|(bPLeN@Hn@;1<_9@(@58Z~P^MX4Dzgo&KL7hHv z!VEkh+Qc3*m2sc43#e{ zuF}hP`cAv5uX!!~G=0-qrrN))55qfd14o!0v>w!Vy|Ru8)3n9iY3DH8Njs%EyMTJC z4d(nLeGhMvY<#AVLHO#smF$hvIJZQ-JYhElQ9UV~Zz|ad&1DNUd3Bn;DGQgbR6f1x zIRPD59@F+S(>C@^;XkBRaP}cJn~AAV+o4TiAi=jJ4*81m-~Q%rmm7Q|!G6>cXF+e> zx)pugym^!DO;lt7_uHIz*}HLR`Reh*<%ykf^++E1wi2(MawpBeuDYT#nDW2Os=!V8NIxy% zzzwi9K?jh3y-MIq0Ml)twMr9u^r9S@z6vwt~UPbGB2ty0Pb{(*2|3 z?~PPmMeCW2+T>)TSGO@KtBppD3f&S_dk>AbL2+=SjvXa>F5=R`h2`qOJIlM*{(8B- z_Y*RO)eGVgD>^t~b6~yt(Z87s&(0_u@m+xjd@}y)&p9l8?{joaB_RIpy$9UD^heG> ze6l>`0MZVFqzqWd8$UJz>XEOV-z0-C&#~B{u&}H~7<@dqM)@dsVG^bdkbnNS(J9{T z#>Dt;JXSk_Brp~u(UEqAQO5-!uW6_QTotovS^c^ZH|-oQe*K1<7gn0Ct_|1p^lR$3 z(j{Q}*}wohy9H?;baz}~YdMPVrq{d*KfC*7BaYyauc*3aj>F*aH>~9!;FL9AIJL6o z@}$@`0|(`$acyiIf^Fp9xF%hKJF`W=1$IRYjZRr{p+?ZE14B-mA=2O_z~Q3mO3lGR zJ+uU>vIgaq6wdNU$2xLKMhWDR?vxH^^lvnLPpnfkEGJ?SuK@X{^tjOh@M1{j$jo8fB4m#5l&F; zH~rRx&!Su~sZf{a+<Z*(| zQVH<93p@Ok2nYV{-@HLn?V#)%EXuW_DThDpN*@sWBsWtTiG_QWXQI=ga8`y>&R>g0 z|CTk*wUUb54n~KW4N83mUglS(ZLIF}Hx}o?vmlcV+M{XBV?-Lg&f=7C%T_-Whl0gt zf>)ePt{4gCSxzQ~i#pG$R(wye%#R)<~YV3F2Dc7A85PB%YXU@ z-bBR5GEW$^zh#W}@ZopS!7^|MisR2M=yy`&h-=_~g=En3rwhEmr-|>9G6KwAF|9YJ?ML&;0dBfeebcAb^T;*s5~u1p zH})@JuEHtEAWw1N-Dp(SNkNvkagdALemw0bRu*0`pm*iL{cHSlA@O3uJkAS72C!_Y zV438QB`Ci}_=Y?3nK-a=aO!Ja*#=}`Ub4sR4VBujV*8=EyebQSSb;R^HlHYWCgkdd zUyMF`z`JiLuq+wkyw}P^flI!#aLB;>gwL7R-eB^$z_}03LBFz3nXp(nY&7{yIu))S z%*HufP;XHA?x5c}6C}5U+&p690=>^vyuG3UlSg?lfjVDaGNJN$4}IuLM^-_s0NCK9 
ziA^O~;b4`oSM%_~5fA66!_g6Y@QxlVAKv@z@{2qFwA{Y&0ekVPLZDjwY29KMIL|o1 zgBhdWpK#U1iRI5f@*Rk;{&jhjFBI}T3K#D0_;{SpK>XVu|HNAdUg9q&Ae4(`;cLn8 zld7e>(aMU@;SK(;x*!C&d0tGsXphcJ{2s7a-|Y;g>3R}=(%A=b%0$WtLA(sik|K|? z7T;eD6GH;_xI=>g-}UUI22%V*g^k}Va~rA}48HPE?%>yabEh6OF)<;*flzhwCA;qK zl#Ye59p%$*J{|dFTd%m=Q_nV{^&2?%3xb? z66iqC`sT#Jx0?A@4%>(|smQ$ZOPRS9Z}>~TZIc`mIt3{ZI}5f8#cC`6rrrb7R>>8E!lLcx?tMSd`uY^-sj?1Ji2aV z@;vamPbvHCZ#BZ;dBl|+{}o?(t6b`kC96wDn$`oRe@$L`+kfT`BB=lEp@;kj(vJxgBsR&=2uWp^rBCM zDsj(iHhJP!eJ#Fi+rA#nvt{&IIbAmMwJdTDY$qi{JU!$f{VJ<}bt6xj(#PDPHvq>t z2IUI1y%^EwdHWu|GtW0aT`>l;53t`)0Lklty!DyB@?19A$A~w|A!m0T2!l>x)NQwt z=UOTZnc7=Eh&CWuwpC};-yf)q+r+mYB^_XuxkYWeDtq*SI#$qb9KyvNUeT7J(5bkAQsUH<7m^HI7_e!-Zobn>HgZg2Yhm%m(YLwn-}FY2N1l27Vg z>_LKg-q2updUXE`nIGc&x63!*e#eK~&Y3Jw7Csw69lCU!$ph(6pW;{K+vojo@O-&X zf8_+`i=zuBEeFewyk7q$Gl{FzJFlDto=mUoWaK&H-s79xoJP)Z_UVmES;kh!<21oZ zgGttIT+Lu5YA$W<=)0QXxecL!dL(93dC3NQ~niSpZ;K!?=dGGINXdLqYQyEe=`11RT)U&|1_#rxtK z?7f4w7N~V!5#pbc4Yw8Ck;>*c80d?($#b@cVrNYjx3;(GCFu}8%CUIMxdv6a*wYvM z_~`NSzy0t3=koCfciCh7!E)yg58!d(xXG)wpYqjMA7r%8(3fr>m92d_>kF(n*k7_D z%*yFa<_gbvE8eqn@V|e&JpBHLGL)URlm@*go@+*KBY1!VEIyzzg(zn30 z+otK{g;n9`xYQ-^0|wvgSB)*0NYjpb0NCR|@f^!^;W#dFB6LWq9|#6-Td!R2gx*1Px?O#it{#pgV|@d+~JaJ78?=?B@e?$+~1tRNUZ{`jNiKF?0v zXvpZY7etz?phdhJv2%o%WY|7V~`MbC{;uX94pZ0;$ zZyB$-BjtcsqdL)ez#BAgaehc&_kOSnltbcv#xE^Xjw%2zu@QMp5kbo?Ag^Z9`EtN88g0bock7{@-xf9F3x&H3dMK88%(2l>6B^W)h= zzJz^u`QTIh_Y1a%;hQsFdux5IZ;yC`&uwJf5O<=WHC3^#)YSpJ;%n3|dFAL!mUiqA zywLjDZ2K9%l;<}Zp0TXtwjz1FNGZpRsC?JBi>s>A1wHc23SFeRjdd z_k!8X5f7<2aJR#9K<|X&EpZ;N&ekNiB6YZ61;San%6r$#iWR*`+gzOF7;F>_H{r>) zBeo*lf~VV;-tZ<8b-AG8xjf~A+48c{Is5aKv_8ZwpBJn;*r;xuK?`06%RHQL$iyiV z2JFa;9a&a7I|nPNcM6<%{u)Q9OzE_s0QArDaho-ItQK(K`XanYv- zNHXb$NO!l!OVW01oE2aBsz810Jb79tdIHgK8%8yb(pmV*hxptnB%=?YS#hrm#luNN ztwQufqItCpXrrkLIvr0vkMJ=-=7Ezfhs(X9zMH{L$( z4$_3*(OG92oTJ%l?hf zbo1Mwn=K7%9C;hUXZ5?K{+<9%%dpNjAJ6J*p5<%kx)R7k|KB;@fv`FCTrxUAeS#S59jyscqC7I`)_6OfY`nrp0q!IN^m? 
zZ~eq`Sl&dr;%!gQ;b$Lc|LR4eg_&cUlv(Qoa$83%Ae>oVv4t$5RTzK(a5C|te6+1Uw6~{GURa`Y}VmAoH>7=-aKLI(QU$X*Z&!R zpLgpn5A)s)T3F>h^U=+wrI~rO+_?`7&k$%kGWa)HH*?6PZu>045#xbl2B7X*P^w-J za^lwb8vILKHz=KeYj;g9l^^8A{(eaq7C(% zo0^Bb-xoTs_-}PW-R%ZC-?O@irx;mQeAmaboS9Z-z1DdL-7)CD|M8F6e*Wskub1Ea z=GR#n5Xb)Lb)L`S6|me{zjK$5tKolnDK|e59d>D_H>xwvpbxvvxbM#GQzjCRnJ8Q^ zE_Dn1{pI|j6N^h;|H>9BZd7Z>4?jHOyIwbzFPN-+a>h5mxJmqHo>l#s)sTFmz^sKw=fViU!3zQ6 z#>&I3yH6O8A1qJN>ow!?3-oqUc*$7zC95${fBb2A&d2?({?ixBJ#I4jp=EdX^vjB} zvnP8?6^Ij&$-tWe&L7P9ONF5FJ7D%zfDZ7l&QS!IWT?q3kB=_%v5F%EWB>y%w>?=o zUGCOz4)bj^KFpFmyc9$(6%>LGx!j^aEEEUs&b)nVlB)+VndxYh%j_1ID-%v&(je3s z2gy^%Nb5!Hl8KEK#Z>}jIa@lv@jP#1dW%!t`jM|rqqnOL?jLt%f6j+T&iMkj?ca$` zO*oh=J>_+&@Xec?Mu93ncWkK4mps|;9XC7N*9_n>H@2NIgSGL(tGQlGlyPyec}Ekm8{qn!&lfrwrf1ND5pXbzp6SX_FG3@yAP=!MP; z9^F4$*`G>Jd@2uR@7rj4_ILYHxPwiD-{Dagf1O`C+Yc4Jcq@N#?K?+^9i8CZ>DI-w z?btB;L-<^l#-&~5fWOP5>YSXem(hc-Zc8O|pQm~lO0TUiX;?D%4s}PF>!@3f`Af0c zhae&~Wz!XM{z9%yHZKrwC&*|&%+rP?d*)2o*>v%Lwyyh=$2lMy! zH&6JC*EpX2b!d3=&$hHtA7Ft@`nvSXB|j5#>SjHINWTt}v{_}KOs`FBt23a|bz&ETNrsbmkM>xw;vSrmVIRPRq%4g4VG72bvX6m zX1@%`{;ON?aG>le#KA$mj#d6nApBTeHvm@{{7Bp@S0OA6{;WuTx1qN5_yQ32X;CrN zNjW-1URE~Tx112yu@1DI$-4@2!9*f=%)oUiPfcyGF5i33l011Ys4!iW7SsWi$BKFj z8+@QUu-o-M-2*;*f5g7s8|+)m#q;uLxp%_z4|hLbKE3^GJ_7RTa)<9h&=m59CY{=E zBIrhF!<47Cynew&o{z44{iok8kAM6NJbAS%`unchO{*N{%P4nZGfR5(J) z_Qt7HbJ(zq4a`*!X-co_>>R!+rQ?U$)|O|Tck5lfeZzsVckSw7C}lQ{@%_+wt%v!W zkE+hJ*4NQCUDsQ2+MYgFh1o7hEpyGE81Ltqo?(sJu$I^G>(1h~udd5)8@vD-&u^5} zET?H3zi}JKw4u|D-}0NT;WK{6JJ0jXr|Xn$;+?#yesrFFSku=)w_4g(-}T8ElMgfyGL!m4-_-TU zyQ{g00LZ=4PJLlO-zBaW-Wurbl*h0Ugw-y6)_zkTb!h*H9?`HA-h(P{bkY5$bmxJh ze8WpMM`+&d$A-`6Qmcp>f6njteqN`&&goAFc>DUSv>o~X@x0lD?yB!e_b0582ISm2 z&%lQDx%2kZ!W-sU`JMi2H0Cv~=c*Gz%jcc%`MNO6*eEtk^JknbM<2XPv5%`x{>04$ znELB~tr(a`y|jPk=kuP{Q`=^KP8^_hoRaqHAdWwA`Zrjwn(WuN8UM_tD>*$>(|C+SQxG&-h*#{nC(|h!#pQBGW zJ9WSR?TkxrGNydSw#{>25cG=gMBHI~`IH;fjKA(Y_-MK73wGYJV}>!=InPKO9^T|; zG_r`xi+k_c)^DF?xhQwPECG$s7?n&{DOC&}uz@KL;r`^gGBqW>=LoWWkz|{dH>)pv 
zewhi+6*s2s1M?bMN6kE=yBsiU8DIw(=T|wgKt1k2w0T@ zR#v8X>Z)P{S1{c3=l3tZ`}Sv^f%x5W{^VP9;q|kuc(?-bZ~y+)^2e|K%>HD)PK_LD zMk?;^hhpSc<%UQI)G>PDxfivwiQ5#z%qZ45m2)g7L7 zVnDg>>QpHwb(W2*5l&K5Kggme;j0eN~An@ywE-w`I4%eJ`ohz ziNggwlBc<4@1RLo$xZH_nd>m*4aWuD;5)qN0iGWJK`&=x?u1mX|)P zd3lSwYPF43c@BcSCbX@wfzB#7M?y_sWuMKesQvZP4X+QmdO%m@pqK+LvXL2W^`$ld z6PJXgT%qIIS^Djt#nYyScRwJk2VI?4XXV)jyeM+LLYgSCfY7k833YT>9(%?ZMH3wo66ir*y|r ziAFM>i~q$l#5K8&4xz4S<$(NI$!+c9ZoGYHYeS7hF-rVGegx|0|{byc< z?;ZGi_wVNwus0a9B@Cx33oGI6P7cPBjQg^pPyF)galS|Pg0g(Zhx*u6zPxp(j5~L@ z34L+En3o5tUq2^(!i{NY@&X>0RY$*rXcdkQkxSoby6msAw3JIHag7Z);8DyJ7~nxF ztZul73c-p;#6C*|`Z24JL*bL#jHOOVFp0Wk?5i&?xmo`DIX8|NL%w|SJ-ca`T(CRm z4P#8lpoYgkJZ3WXdO7ngPHcHSJ~?Om*C}IRS24KrftYk&R(NNkU0_x&-D-Zn0Y&E? z^Q3p~ILV!h5;x>=@&yBOAF^hvk>0X^9Uy>GZ)HIkOV$p!y<6()M+yu1en>5MPv zUzI)r?}X(fXNj(SxaH{8bDq&)Rl@#XJ)KRtGT~d7qK5t>?&v3zO)70fQOZ?>rt%o-w>9aA}63@H4mKl-=VmCuzY;>>*bTXzve5{pRj7{*v?8%!n!U+MNJ)>;Su7q^=f8c*x$o2Bl+>SOEZ$s%dFU;yKt- zhHn$HekEk4QVqs|hm!^;4_16Hu18KHf>;;dBkGtu^_2W8lRQBr6&#&FO6QRBE@?F( zF2CE(e3;0$c^Uj}hZELQ%H<7{vSa8@SlMykAag==K3zVzdB#f8ZI&?omdW|@!v)XM zP;b=vkjD2a57N^S>9;&!g>+~g_A_NCAO;sEw9D+8 zRMn(GpV}VG5a06hZdlqW^&AL!p0+w+J=R$zO1=&ZDJu}IpYm_nASUSR{Gwj-96nxg zS_c8h6D>kak8&CQrjYDu$b<=Dy_0TQo&<9$mL#Y^_@}iOoXozXtsc1; za_-5oZ6SRfusk>}Ll_iZ@k||BrL=w64U^^_9n}f>ZEtjeSCtjK@M%8A7qqGcHCYA6 zx=g;ZtBk1^>N{)~zLQUx8%ku91_LZy?uXs$oY&v>Jq z{mU_LG`XbT%~x(6OoNx3OKbg+25?x&1V1kLH@nc`}7=>r;# z(ZPVP>-NVX*Ot;BlHrHo%V!F6UYDgX%dmY=6Zx7%G!Ogj;Q;g)dPh#2vdUD{zKn*+ zYYvs}2~*QqruwUJs~n-CJk(d-&3}KeCcNqT?rWGqURo->>SSWb*!Y!yE3JI(HArb& zXgf{Xwh1=b_`o8tUgST%EIoGZ6-Rt0l_f`+)mB^YKFi*C^G{j<>2Q)6TH);n@v~Dv zVEah{Nz1RI9#=WC)JAFj8)tDc&M_iJ;DzVJxAfoq9W-im>)ZZoy!6jpFZ5&IfUPaz z)%JQo9(?Me&tg`noee!G(Jnq?b@hc0Ghz)eza^wKvJxa&dr(Ji&|3wo&#eqMX5 z|N6kUo)xWUVMlx_Lt#h`W%Za(;gN z1Q=V(@{kMiq`^5(Uc|9*q9t<&XYpR7vhUj&VO&@ac6-$!#``){8;O$yZIfP^*{=(4 zD~YdREn8zk`C|A(xDCvol}j59^04Bm^Gz?fR4|?GM8~%S*f|}tDss#=fe+3;;X`tt zE%$je{uTq!tuv^q5qn^5A?jbRyRPWGeEM{G_{}%Vmw)=>^3xCBWuM*)X2D;6{q^!Y 
zzC`^c`v70CmCK42{h&()v;a#AXRS6ZZ+)fzNUax7b+DpwF4N46{z2prlC-o?f)8S9 zu^p8N{1n5_4qDizhj3v4OEh{|4)U$eKn-M)_Cn0ruLE+QiP0a@9`LY|Vq{+1zKUmH z!eOC^>dfV5kY@>nin?Uy30P1Du_c%4haAe2ytl~HKDyQA?rT0?czk#H#obSMX6H6# z%-ap9SO4YKXA9rOqz$}Tezd9jyx29M~}zRX$ubCD2Uess&6Uq{cY7R@JEmx4gKX^$aqqre{} z7aK)U5kh1kykvqu$Zb4l_mNd#fbi!K0D&!9^ClXpnOSCgWbd+DgD z)hWg&=Ep}u6%iphuXPuAIjD4yPyB9$OMr^3bnf^f^#Mo3rR_8G^;@wos?;L^>JS8( z?C2q4>!dCI<-!Vj^&P`sJrI|b{6@p5L9Q}}92GD+q@Wa{+YafvWE&^i1+Lw>&?;-F zELOyEh}4MTKmv3^gwgi65YiZ&h*Ur4;NE^zoRV*!8X|n5JK|%r3DxFGM%87@6C7oT z?}_$Mbz!S>;<)|5bN#fBd}K8r`FGvu^NwuFIecZ@EKAG)06+jqL_t(QnCSx8gIV%r z^>uuk;@xx0HfKLx*Ql5pJ8b(^wsGK&ubQ)Z$^b<`y1%E-!ef`t9kv9buI}lSiKiZc#_P53z7da;IO^Sh=H~rG@ zjkNqV*}Bo{I4CUtsj~?D8!4b93Gy`POMj--)3)cOz0>=?4iN`+G;gb`n~xf%Jr`rp z+puUFolAc7P&Pkp==|16dGFJ~JpI@wIka6uKS2X2(b(+T7Ij3 zBDmpNJThT`p!n5w(K52YuXvlDH-Y3uX^g&BI`Bt*RY>#zmU0R%!!CMlrDbax#1_=G-Z5#%;zC)oiGM3B}%cbC@SJz;vT>`$QG8$ z;GJ6U@v#EnDuCOGbn=l~j;NS#nM`CKy90Dq72Hnrib+NXXNXn-as0M|72nyTcfS%a z#^L@;-}(U$KTmGuxJdyXpE)hYVL0U`a@+@JS^K_phMTN{FqvVtd6(A|VmT_m9S=j- z-E8IhF%2D?*SBJm@A%bjCn@UUMcTOgz-q!&uyhjG>h>CS&!h={R=q1GC_cRHYKO}R zw~p_muuhiyd>rp%Ug7#VD-fRozsu0ny?E9ff1c``!Y1j(+lQ#1JbuK()88zA`2D}K z0`W5w6u!y9Yq-Ar`m4N+;A+EZg>+U56dzCl;A_=#Kwx7rRUIRh zF11%1nJh*vE220AD>m@rOWK*O4mTK;wxUUEijY*u@C`UTauJ18`F3>H8G{33E3|%> zpJYxz^TXFp5S7J$Lg9dC#|y4pI!^@o7Hbq@RhGQ2?Suik3$LEGXqi^N3e%3YX$ze(Q!T<>!JTdCCZ7 zg@YqJ^B;dd}mk_%zPWKVV%}4YxY*NN|r~> zPTSl-Yiy-YnUsRz<6tZUu6n711g0~t0ZsXqt!JqmJDE;-n+LSEcUOMb%hp?dp&d?X zJv?_XKBq&&VZ&(5I?J!|m6tcExf_Dqo;5qBGgSW4QdIeTCH11&2Tyu_Y$tZBiP~7I3stskUt^E{lUvIKtORkS0#qZXY^!m1&!FcG~wieV(C8;}qB>RS9@CI>KA&CpKw!uE( zl7}wbuB9)0_?xa52KrS0qC+DVAp9v+D=>)cTk)9=#%0HBLp?q|TkhPvjX&5y!>aU8 z-#%QPJ>p|^*^&p1av7>0uuA>*k}XY#JPsinm2&2-5_am^gX~JL7XuxFVq_b0cEWl> zU_KEa7jSS;`<6kqPk^5o#{l;SYEj*NLZH}V@Z$=BckZ3R_j3tODwJgb`{1(^2nY7Q zk;Ly&_)6LE&2l^2sMy2hRt_I1_0_bm9=}*_;259D&?apY-+`%v1uv4Ws`!wTZ;y)Y 
zOeXRTiA*wFiP0Yzi=I`G&R@8=dLh6!S-mK$Ngq-7+|VLjp_7!1^KHdCv1MRJmLMl55N6p`QyL+>+<-g@9_N;LgAR;VD1(7@`aOtZbH82it}N^CM@;mY)^UGQud#qr*_Y(x+(1 zCr{oPZ3$FS$|Id5@vKUbojadoJ7^YPUIsh1Lzt)hk?MfHb+yKk2usVspASCOWB|Gz ztbnFiPNI|WV6g7_Bb^M-m6M1t2?6@xkQ%Mo#FDL3n1~*;`gVNDdzCM4Ex)||`SQQq z`)|uHkH6qy{LgUr2f5pDboCs&zN239PG$C89^BxAflN3~mwR!~0bg6^>6GQ-4qpyG zzgWJ0#k=xJU-E(hC9A97Dy)kYzCd(zFqcUDOx#dq13vnT)A=dyk)J_WW`DFs%^Wsm zIZGnaPKFIt&x{E}6B&u($+U6;AdP=b8yr|0%H=BJfBW2wfPxc|i2*BA1s6j25$7yt zWu=WJC6Rfat(Tq&gEqdf;ThnYI%LG8P8@XMs$7J({vilL`L}g!p5n$IinkrmK5DXC z{AF$(Z{L)1mJ537^c79PQK>bnGE+x>kte=34Ijjgp%AUGSO_y z-IXoy9f4eQ!!CJ8US+r}FGyaH4m$n$?f@|`@U1`gH4^5cgg!M1EBfLb3yE$z2#=gX z*on`1jpJFHIEKP49>TQEv-)`MpGd8D#~Z)>VLJUgX9r6S_e`S71>&SRi_cMZ36<_t z$;+Ven*7G~+&B%}<=_0uE>oSaJ#*T4!;&!_>zxfck3|28^6z;H=kRs_1X4pS|85*8oy!IwJl!U#fj>4R=QGCgtZT69{csd zX4d_3m!J4l$?(Xe!RsS1z2?!rS={E;Hr9<9v9{nUMEIg$*Ra;V<#m}!WnLt~rt$i0 zzBIu>hlUef;ayhImyBGe2d*&l{W;k%t?ahLbb>k%cOv1>fwp+cntslTNNHg@}s9G=x>tWq~X0K-_;jzN^=hv-i4+*p2fa)F?e=gE&zmNP^L^)@*JA_=>zrP=QewTy3DkG zWcU>~-9h2zK5ww}Sq@egf2p>XKbAe%#jE{I-e1LvG}IL)AgLTYB8bXT|S|RW`9oz@SzG;#i#Rz}~^S z@Kh`tkdDD0se~}MLRm2#{G&@I0E+f!o&)#i&;8P=Gam-SY{wAM$%7T$hpiph_KLvk zgICKLOHSu3O{tfa%mMk~Wzv>$kx=Oz=kYh4qMYm~ERrnfVO){bf zPF1FpgG?5%x%Ev!l5hpWZ8mw>m&W7-NIP@4T%KNR7e3Z>z!KRJA3ize3)IZ)Zt_s| zVcrHn;b%~68s`IFaN*<;xVERmY;Y}?+@ScZHVt4u{^g> z**`%Vl`VFR3Ste)fANjZxfoZcT2L#kot`q&1%`+F3N1rsd-$I?e$Xc!+#79B#88)b zTTq5>~y+M@lG%tOj_Y2+wM?PBMJxJ}WDuAuGczNRuk7jcfmdDasZ% z!Sjfk7T!KSc+_--zGV{vwX&?!awl(-3M3y%!JezXjN)`1U_g_)hw8=Vk$iPSVDSCf#@&)?}kNG$uZ*@rHLm@=pm^y)4 ziHYX%uLu9qro^Kx0+TG3c$s1)L!a!MZbzr{UE*il6cdiv( zjGG5if1S0rG02l1GxWHh{?AoS+ME}NQ67-Xu0kSf3c9Ub1_2rBt-O}08ERTeaU49< zr$Xu0ZvnDi+2+L;$4S5b3A#qPCJ#sZKl`IR(;+MTRDFrK0fffEzd=~{Bc)HkYu=VJ zRys)&R$67lM}FJZ_@=Gj9i*9YjB8Kxl8x~Wwikmiy1QX}4alS=C-!e=)Chb#&-~0Y z?YZ%^yVIQ?p+j!POQ*ql)p4J9)9s&|ueP?_`P}*M<)=?t_+TE~0c-l1r)Vk5iLUC- zzIL0h-p#4J)~orKE?YS@yhLrjjSc`;dH@?$B`A#{wsLyFD^R-rneMvc_18W_w|J;i zopXHz5L?ZFy)8y~!H}4WR<^b7tB#XTA$uo?-!$dVQ4hgvc%LPfujxJu>wH%ZyItB& 
zyoSWyK_{-EClAJpws@Kb!hd4RE57s=3sN~NuY4pH#q4D~x zXvOy3d$dCb2gtN@&VD!R%Rhg^ju^gY##Z~^{_1nS0l_l>5I$zxlm5y|G`8Jh=ZiaJ zpoU@oi?{Ux=mkDGJ2_mwxX+D4`fj&g|L!|Jt_iK%qr65rHpc1k1EY)3IsQygQG&ec z;JN!zpu|Z*E(SiJ&&`b>=zK;t*CudM*W^LC%C4EW`2w|DlTLXpt>1vK?ER`V=oz!7 zPXb?BcMkcii&s=95jPqC=K0{%Va6K3USZFh`^$2VZ%6n%MD8L+E%YGYPbfTl{&+dA z{rkL0iM`(1>78&L?S1W)XZWEvw>>#P z755geiE`o)S`2mI_|n%3QpgruF7nX2THt`)O5t0T@-;o>!o!byz!@ECMr_eIb8?%J zmE$Ip0aqfN5cn{+s}JHxt8=}8>nP*$_YEcvrf=}>TcG3OXCt+%AYkV)gT=Z_;#EmM zLhFo=KPNl~ENh+c48)zo2dqMTjQ{H6CQOTq4+duv0c~yTP$T0!oWmsIB`YdVc@Dx3 z@9rJnAaI}A^ULM#N1rSoef-PixBrX}eP~+Up>aFITN_H404#Pg1mZxGeBzlQuNU^M z)3XC~f{+I$okb%{*>)68xWW(mw+x;u;13PgW5|-X6*B{WD-S^72Mosbw-51H2RvIg z2d+jO@v%Mepo{;AuW8C6ogRGkvv4BCQyPCN#XrKMQldj2^9x$qs-HiQgPw+rZK1D2 z@w75fZ+x!c*3AdYgQHKE4=(ta3}2jN(;b+U9~}oZn&)h&KL`IrgJyNY$*%Pe{`qn` ztZNVa>*be+ua?^v=gV*TJv*?IdbR2sJ=IM*enXX={MPjFRlYZBzWfY6 zqoJ*)a}fa?f0;St36H<^m31QiA%E})Dmd|)G@WmTylLm2lQJ*=;sunvx(HUf{onY_ zZ;(d6XkR9TNSyjjY6%;16(;@cx5C$7CAW{AOwG|BvAdvQk%QPOP;#>xN84CCeshO@ z@;7e3R(Bz5z+9?re642U=AtWaX$0`IuS{IAgwkE>x^6aZ<;UelWrJ`|NhmFGd5FYdZ&@cI+CF}b?=>VQLu;NvUFJSax#}g%yyh2sSr%-# zXgMksAPPzA_(zV}iWrzsWucd7No|MfYvi#7iMx$U_Ir)9JLw6Bv@hZ@tBEz)Rt2l0hDR>>I3@&TrQJD3kr@VZ{4+(7VLvRa_& zlGVIDH(kC{(2ScT=v~80&gujCtFOC7xGC<3(7fUEn(Ly^FZ`KZ9IqMMUR}Oke)Y@G zmk0OmG9KX*0lfXv=WE^J^5KWPvh{$s5AmF9L~>B)5(`gXDsp9Onl4YrZKVQ9Z+eZ*t=pU*Ug$gIQT7ic6<_aecXBx zJ-%T%^9aCGzd+50@0cCj`QYAi%0!}GQekCKpFso{9|zBtaB7&_p219@3Kr%-%&xSRnljn{2L;&VCRZAlE$t>FNkQ=qPFP?J-c zGuYQzH(fHT>g(QapRGvH702i<%JQ{dPK=I0)GlTA;3;=L$(PQ- zvID@}ouae0g~w^cv#(rg3sHd%`H-Kj4ORy1$?=E+d2?$yJNS6HbK@R&w3!JWvn7XZ z5niavx1bxS+`i7Oyr?|&r(*%4FYnyDv)sPJ5|RLIqa19?nJIHt=NRgl(8(kU0zP@j zO&JicMqYG9ZyrC(82@K5LHx=)=AMO9FY?BS;Q0Z)lDEk*hF{(Gh2HwT{d&mT40;0L zK){J$%7sb962DaE;|fmsRxkQ|%YcWqrN&?UtpEL$OpodljW`s%@`7SC0h~snTvt3` z<(tBiw}_ckG67;E9IFLPfK#up8aoL zPMlH?20k7tilI;1K`{v*h0$V42O;7~61Ll5IE93lh9dN`u^YS03aPs3#ya|X$dz3EBor`0 zLY1NfK`O2dnrVW82 zfR;~x>wh9dfPUZ|r-lzm6H_L2NcUTP0ZYg_oBkLlnDFfBfvNq9w; 
zJjJ07gX2nd2Wg$4&msI}{RKS7>tUrM|~AU`Eb0&o~Dn_E|xo= zU=O>1PLFOa|NcD#at7F`hZ(El7bY`x!J#}<$bb#{OB-SxwtL21DHG+|?`Pa3kl^!a zo87j3%E9p@ug>Ouvl%x(V?1}k%XdDwe~*;}o(18?bjl1}s!mugj{6wX>1XJjXjPj~ zE#?ESnPTuAiR|>TtQjLa{!te{SZBHWak|sbK3`7He&bDg3Z98W2H6yLE%nj>y%UXr z3P_<0Lo8&>UkhLJGWZh-`j{(;l|ijJU+|V1DDvmN^IRaHyOjs8Yar=N&_Q$_sAb~f z#lVM^v%Lnzy=$F(-|dT#<^h{H$wuHnJB5y}7JwbysSlb+K(JGiZ*0;E^iA2Jr~I<2 zfK#ocRk0ORvP|nl2{bF6_OSPW%5(GZljZgSZ}Bhydr|YaPXdWCPb6ESZ09)t6RmvLl;lQ!3Dy!sMzgrLMb2a+LmZUn85OXkR@aMGR z5$}#u8+?1i^#S;xt8$X1fUdQ>ROzs1(9k5w42CJ`m0t~~n3GITsJ>=-&k4^;4`n`0`sdWw-<2d(qK zRgShfK~VRyL8;XTCD-~hu6Jk!&t!u_+l;9kge7=Ib_`ZNNcj~KN2!NDqO&QUbwp3~ zv*`56qo00a#|jC2bMe_n%O@XwoEPxCV6XW17mxD%gyYe>48(o0kA6_T;~Sh_33O1{ zEq2u0WH-uxWn6m3xb^h-=JId6z3C@DNx&z*;v>m(U5TCcH;#!MPmVHx&pL({*|$2T z?yIxAMZC$*N(mFdssdHd9LIW7+p(^1M1ApuT`YV>`{lv&<=IbccxRHknI4~=Q-5fcmKLaIbY1iIdK}Y+y__7_f>(a5uq1% ziCxNb;-Q{?FGD?jp!$S|Wp$iivbF(P(M?;pS$V;JA82k~zJAo#yq+BJ_B$_9_}B`)6YvcKWs{iT&(^#I1ec_eYo5_~fDr7G|{`D2B zc2xS81N-(ZznVB)Vw2mj;(ITSVJ-fU!AZ^1;o&o(RVScx!r-bywp!VFI2gClL4l!f zsi<}Z^0vdM4g}fq(;w4WTFHdw?F42DK%%&CGnz0?mB_Jp4dp>gcTo9Zp*b`lnv@qh@Q8*Eu{;rDuCjID*@fw zCSwry!t&h01F$XlDE>`>-v;BXJ&B1+O~dc&Pm|_?7%UVCTWtdN+r9o&sRtojDX#mtli6 z_6hr8^#`QUn_G|>Y>CfLjj2j=WSV%`7Or$_OM}mlI(QD= z)OADa90J>3+F!vt-?N_*kdNZ!T|BE^rZ`YsyJhR3`{BdCK*aaFu;=5CKjtgdx8jE< zY$5?&YduQLo{P0g%R4Xj)(8ve7pSi{!O;Bv(1UI;lJ@J)GvR2#&f(c;sZZC zS8q>5vy7Q7o6?uU9Cb*&@fidB+z^LvCJN}SEm!$?G0KB4-15qgF|78y;%!u~&OM(m zfBYZ+lAE^uVdW}Ye4-;#KFY6O#_E6#c&N&e0?SQdCPt2_9V_}!b$x(SAM*?;W9EAg z9x(nEVKmUZeiZ@L8PrmW5B)jdb3c{_VoUDGOA{w@E~SLi(8mDR!LZLX$cre-qcHKN z%Y`E)4MTXeLvk0~j*>}>d)h4Y4D`Wq(qLoB794o_0CtvqDC|=v09Vk|gocl2Q1JQS z2os9Zv!3@*PG;ST)gFS5b+Y01C09axeXP$&XlwKpleF{MOP=z#QhS$KdhI;ndk{YD z&!Z;nzhj*YwpVP2Ibewi)bR0Ov`m(Vk_UfmTN_@1*SsPqU})td@u1gt;VftfOO_Qc z4SHf2+M!8!SP7}f;$hDfdk2fghOcj(Xfet1M<|_3aE`V<6+{Odl(YWwv+uR~U`2EV z$q9uG5*#NN&=_}hDexFs4OU)Zr$F({leOn5ls82YuJM(r4+^s=#`QipHqNSl+A?TB zPMs<8HgfQj)8MwFSAkpfo@&KOd}8G%+f)xEs;rxqoYDo%TwqFE2Jy<4J|GC{Vg(;2 
zvlfPVb_DvM$SSVXt_vbrrgBkc^dTjE+W7z!z8MCA68b;<8HLSridCUK-M9l>3Wu)@ zY$p}E9m;nF5)G=y6<(YzqR^_h*{RQEo2c2INS{N^84M*TY37I8D@MTjh&=yICx1E& ze~#&8XoUV#cx~GpY z1%vPy@L&X{Hs(AJER?ie{pBqWiT8idbmlOMrK z<>HiUT`=4tTy_^lfH|~(+K1{?ak+!t{!*8er!~w3zreyKhxgXoxh_mJ#i;-#vv>wl z-32nxL8U3j{H=W$u;I(lCcp5a%ybC9HNP{8pUj)Sb7J?vFVRCs5EMyP#u5bu7+mWSl{rV*4&hcZZX z|Lcntk{8~kUr+2D=S2c}(FFz>tXg-5E_E$Lw*2#x!if45owBN55J&l=%)n@SrSV2^ zk%GfmQC!(%k~GBwmrwAOAzghW&n@J4`3m!GRvlbbu^k#5KRVWQ5&>%Joq4TG^URqK zwy}wy9Vv_leeRP1_p8@0mftg(c>0v@N3lJKK`ne3OYxd`ujRf~DYRzAZ&elYC27+3 zh1z6L?gh_v#MUT-4e}uF*iD*Ce=+JU3jM?%cID+C zF&TZ3do{K$!JY2R+D#wEWa5aiaq0tQ>7>e!_r2lGRL2>lqo~fRJ!$1Y_wpU_h8ge5 zXB!a%!3_Eknl8>kp?4!NPGeFiF}Do1bCsWpdd%S6nGZ_ZrKX&u&8$p-P}TN~S2cj6 zDL!Rvb&u2R7?N1bnGS|N%cdL>gwet-( zRvsr2d2f8J~(#s@&(@m0G$gQi+fandagiv3$Bb- zJi3Vok8X6aBX);%18Er*bjQ8fYtasBSb z3t&<^p>RdQ35J8f@}+sX%M$^bQ+}xnv^f4&Z1k{>_^Z2ptP_p+z{`XJl0jF4d6=hi zl$wf)x}%f<&{C((w~R+yDmHZ#DDUXh8S+FYvSig^E>~e{BwgftB+)o2o0(Ytg!F-fYh9Rz zkmxXw3Uxi!^B^+Y_;+U40p)6n=S(gKg7t1kC*g>Rq#;_6+@ujA7~8c#t;&z#*o*+Lbwmv7q4o|*Wj8R}BFe=V!7B_kvaHHS zGhMaI&F)R$R!*T*T>9bir9M?d0}1`KsxRxcoM+Ezds4T>wR4z8w5- zCgugU5e0uAW_I&Y*RTj%pp&1qJ2r6g&ahdcf$v9&mF?!apz>^I;*Fz=>my9(XZ~DxF zetg3lscQUf?_(Ls1npx0KCvnoOjAi6!`0Y=bS4gF39EFr+Mn^2YCbL}zBfj$+<^kZ zakd8;VX&-F9U`4X8hS&=3P-4W(d>-lK(Wv(vs*MuRq|R8czxr+=PI&P;$ALr^41fG zbc}Zq9jwbw*$(36o5=!$*XkVLhZyDiF}ibJDdm&)1_$_xav9BukonGztOTxr@R&(- zG2gd7n*P~fYbZJxh&15*>#cbOD}e zpp{RTKzKK^)GOr=0DMySXe-JwM84AH37`5EiRHGDI*F+4s6*6nn5}EZ$_8~wO5_qT z__Sba>g7SB@Ls7gwY-}xyNasN2NZ9X_;D;<> zLx_EFLip;jqF1l1Fo8R|$sspt7muPG@X{fyIc()sNTyTY%o}tL989H8R$$9cJcj`* z8nL_Y33eC_Zq3vT`a3D)5MK#1OnRxyS{6gD^7T&#^J@apayKEfeUSn0koFv|#D!JL zX1KX|*BM!1X*uoY$jk1xcz2#i>RoB`HXFzxW7eywJnz6)2Ex-yX4K$X=I!-}Qy-NY zD@D~onJO1mC7DCX5V&QuT=CK#6Y)TZ z4#mII<$}_`Y$jgGt!}oi5QRTF$|bcdxTHJ}Jxb4#6Fl4XP(E+x5jA1L>9SaPiy%{e zX@phw@~Lw78sPOkj5yRlJ@ufj%;n30-!zTg`XmjHu#p+~cFV<=c=e>oAKAqMoSG0- zQvT+rTmhB5ahCt++r%DLg;2TXd7Urc!Xsl-#-eV`hpA&q@pNI8(X|bV*PN;?D;C~- 
zXPo%q?W^UVKjB%~t525`2KoQv)%o)9kt-XFm05vMVaJ>X`InJKj1C!O_~trgCpZWn zww4cL0r;-+)D2%>P~+C+OWt<+h6&Ii&uBz*nFO6f%Cwm3H$%xNy>bPkFVmQ}myG1} zi92+7;g42k)d@KVJp22cF)}YN@{W5AzOw{0IOZ$J@T@H^4k$6GJ4pnaR{p$f ztuHlPc;X~;;S!v9Lm{!HjgSE;Owgtfg`2Gta~fyjsd%ZBgPxPQ>ZC2+O)S3%GhLay z+aP$PU=2{X5{C={crpG7O;nLS+>< z6-E4!UC6b)&+?3&AeMEdo6w3$Sn)6zg(@$3?kY(#0?Dw9)z!XDv~lzWyQ?C@OxZ}b z)-B-4QYYaeOkBh(n#ym6>s`^{=OAGPlC`2hlFr>r9#;<&YkoW-@0P0`q`d&n^*02P z`3pW#6_u{IjUyh_km#)IMw=kdq5a&e8(s zKiI>jWC6&Hhn&sr-^w{?*5hI@=_fxGH;W);UX#35ur8Cr(=WjfHe_w+NY8l#R5&tj z84*{28NUZ6?e!k$iZg0;SVrdH7uDr!8UbAfBYW6Z9K9x~ZsUX^>A^od!Uj&-T?*F& zm||;hjuDqQG)1=Yo^7hZtVI5hGi)i&P!ds}RR)RNeO3%%UK$NrL-{=aEmxKSt1f|E@^oGfh7T@@?mjrAPH-D)T z_UK|_F=L3V9Aq1D*&1s&^x25d@;T5ag%yxIm?ZY?A|MC0F<(u!Hsy%TpGB0mua{LT%BOiM_lhq=OHg-U zY%EMRaJWycXW|gWhENQXk1Gqlvem)NKK^#~8lJ@c zMuiPL6+>N=!B6Dw;C&iBUm|36!9LB^1Sb%;ZZk13LU8h7V7&nurroY}$;!bgj?gA} zGcnLMZSl3UepABdLylW!I!ts%@92=ys|xAGpgN^77;xY=?8{ar;(o^=q14BQVM{^5 zCn7AK0Lf$@hisH=2S=h*y9FIBgk z3^Shq#$g}cJY7X8_m@pgv+6QzfpKT8XCYDrCHXO=DOSADCad)AL5m1Z$Xiwjw0%&< zH^${X)iXgmc>)sPKh7JIXt=7sE35dnJ{wP&*amik`SK55NV}xP8*&mAqtbmAT(V2U zQfOB~>R><|2AXt}wUtyhvGAK&gr;PBX((k2q1ctLb!U63uxBPEV&|wl{V!UVSCZOa zoXiT2grlw`C&H>On`IC#{u;uo$vUUbJLNS9x+<7#Y~zhvwkoGWwWDAR$c9^aVm9`Q zQjvhmMp3)~#XQcrGo~#l)Urb~>|gyrTI^cdhe`dvS7!CO#dGGZ4AX{|_fA@m45tjO z!3GdJLR!h8WB3ew_1$O&?w}j!6Q3e#w&s^EaM-8O_Wlc*w$bkduX2>;p5-!c)GH9W ztlLh{ZM!rT*SA@-6%_lzwTS_VS3JfwY>$uES84pOn~W-og2R?#8|8DY4Whs)c)b+f z`fV~Nq(o&}c=~`bOwzM$`*G8ye-Fe>>-+-hiz)JpxO8%dz;_ z*UBhI$EMU_QnjL>9cGoU>Sj%$79V*N8{LHh-*HM0%~ zWM(0k1Bv78bR;;%txv8dbP;Ncik5SMBGl&;@-Q~DmuxR`n-qpwK>pkwq{-PTf~CrL zE0YgT`^qV2P=55I<+{~LI?re5ck3-*4t~SyvsgWN#cG7XS)Yz@W`BdXHTfLGYk0ii ziFLnP*!$i}{z>+}`DQ3}P@Xe4Cl*%5-V-NZeUeENx@o5$qAPE+%w=a!0XkC~K+dBu zJ@HEijzi0*9LIoGUhR+guj#<7zm*e=t`E}f2iz3C1uuW^<0;p$oz-(tR8}R%@j88) zcCEbVFWcsmL|L6<9hAAl8h6wHUmbgZpyjh~a>oR^RVFMt?pfMShY#hVyn=1Kl-#&| z7S=qBS17-AD$Gc+gu=!jdve+r zGr!iM^HzPsqiTi;a>vrYLGK6e+TUj=bK12^HNsxG2WqhnReV7I_sBeucmJ+5q<4WzT0C6K~0@7$S0>{2W{e^K!QhaJP)j*Eef*} 
z)H%F6-PaiCoVYq>zRYvB3;{DPW*iATb8BeyW9vRxY^PyoG>J;;GU^87&-leEZ{GO% ziq=s;;Zx;nK5%h3$hJ%bKtn6Ax8a>IJBHF1;G{V*{^rvICucX88>f6n>y&Xe6N=Z* zUqHu>AD>9Ka`y8IA;w90)&RH)=zyv)$hT!&R#YIUIo_gMj9^+En`cu(#tCK*m( zTv>3HBhPyfchX?xI^u4b6~pHrOdqle!72RhRMb^j>Tr2fD=!wfI#;j|&>5l|$Gp|a ztxo(IbZY*<*Z+aa+<=|h*A+&;D{BsM>j1>Z#EaZw9x_B4L7e6%Xg}R~!uvweu>Et< zHs^6xB)_(AFLx)o@(Te(sM)X`L|G0$%=$KNpQW+E&%r!tIy7$EC%j>vJKp%6-*C^@ z@dNxn6`Du$-6#HA`O?j_9dEjQew{zVo5wzlY4aMsPup;L&3H4s<1_6H>-els$2+gj zEu(n}uedEI&7?_p&RNMEkmj+2Fy&nb7uGV3x7>!ce&%)9U{;8&-gW-pbKZDt*fXr- zo@cu{-ug-1xXtN(=@~xLH=XB(H*TM2ocG7|MKu!X{53z%*YTQhHooOk$XEI;Z!4o2 zX1rw?zfM-;%y}Je+V|=CKDa#pLjgD0f9~6_&4z3_;oSMHUoyv`VV+yR#_x0EdtDJ` zsDV^sN$=wpPL*TJ=mDvD^Yy6*v0*eFnodybH)UTNZJNfvPOq6zyv?T4*L-0!ye9=S zT)g?(*Rq?g`G_;i5pKFBAEved7`aYhy8e#B+)$!d$=ce+EVCy-`n7eEe8!0#gJ;9L z96D`w+uizyt}7V%)`L!p6CPzmxHp}Nj(eViGFql*>vTV$JmWQvuYp~=%=N}Xs0O?S zt@HLdbDf414`V|Id9^yk^B^#wx+x?+ls%MEw=sr`tr>}_ufd~b1wK*P8uo1!r}GNG z@Ipj1_QN20s@gP887wmy(l&?auU~ni+7}fujzu?bwwEr0@8H>~eSOqLSa@?-@DiEJbVD*75PEJ7FcBNh(bU=OZOj%4Efcc(p|9sX#8!~0kJ}9&+IwM1f{#Yy1 zm`yv&lb7F-aG&<`lcx$RJdiD5Y1DZRDR=-4@3S~to(TRIt?+%kIo-4~Y@bFPL(6zy zzUp?K{InsV&m*~=&~n$rHLHEsfYz`13vYdlcYeobJYnU}O+R5BX55)i=X;)U_Th7S zKVE()2ki6SPq#h$aCx@;#%VbXH*I}ydDg-INRZ!kGX5jHq?z^Wbo1}LK3`||EVpf& z`3P(HPBpV$Y0Jc5T(8qUCh$8tcAk2*uEIM0xAbiK4bMJtYljOe67TDJ#M<#oIZE6<2v~>FZtBN>RY{%rgbvD^&KUZ zY9v(&nT|Vr3u)!k1=4&aYu)iaiR85=EL*QzC9p4H($)=C<(i9sLRH5@X~L&+bh>ggI1ZlWut|~1 zKezlI@rK9nRHP`JS6!V|04wZ~FADtgtIXguO`l~HcEfiN$eNQY*-l=Dj+~RH5{7X~ ze9(CNn@Yk~`MP0H_s1Pm2Wy}fzYQ{CgGW$3RFy0L43yYUhF;D#Xr9|GMbO#1-{fdl zRv1z|z#vN5Qb?I_F`hMj%!`yx&Uhf4hp2DeyPY>w<=GwZ-mtUh;+)48_#*b}S5KGQ z4{j|VfANdu_>4CUDMl51pe@fPFd+A7Zy%`e=Rm*rg*j+;xx_pDdDa0sg}Vw69F)xp zfIKibuIB;q@i01F|$@MTcz}_yf8Em>QPDZ8mVY#md@eLPw;la661BUG8z2) zaJD+QBI2aP??~j?2l8BnaB|}$!tGTTIQI}6q8G64cr%Hx@>w_hb0URQ3q|~jRUw#h zme7gB000Slyp5({omZ)b6+sU;yWXyH@N>#{5Syw_cq@y2J;Hew*7XVE=9_bgE=80w*(b1Z zjkhhkvWhX5iZYIgu7d+p`o$n^PxcB@FyjqZ<}4#=@^c-8PwO<}NZ)jxgM+MwnZF-a 
zJTm6^K3=E8bKnY7e#07PKF@g_-`9D@M~mGgsm5>mM%YbD*9G}5kEYqqGk!s@lk>Oi z+xWdNwlAmBUBCCKW}A$6Iv4MKgKU0)FH-PxJjr(k`oKJPR1S z&am+ElYbE)wb+u3ex3;cUoRg;- z$RJ+h>cLT?!?vXJ=m=#64rAPG%LlC|T&i^y~zJDFffJstQz3n%A3& z#K~YEA%4-h9o1j8wnK>Vu0A0>U#Rx;3VGfD_oLJWZ!yZ-nb5#h3Hj+-pGk1Qj~jUA zpjQxlMQpuoB+o4jXZj9ezxUvS)eb_OG-zZOrjrMfc}V((?EOHV5BH#-^4yM;l?Y^= zV2ixei85f-rM6jxS1Pgvs9pSEWuv-aTbI(Zyon^=%yj8<86d=dat-QK9YzgOyE62- zu(sn0k1@lb@R(|3_*SKoviNU2X89X*i{)N`6F}L=$j$?R0m9+hz(hr}@x;I_r=Wmm zKCvO560tHPCQOQNi!E7&x7pFT!ru+bYI&YF9vg0apeSb5Yq$on_>@kaz45nP&Fk{t zcoi;rAuaf(|0;1OGUrQgXIAGOv?RoB{J9hVTV>mv@(j+TTfqgqU)Pqm$!PnVt7qw3 zx8k+oTNYcr6uWsQUd}Q~sM_S)+D38=teZDb+1pD6rT8|#oCj$!0L`-MiUE-qC4*>o z^lKcj(#>BdFhYVjY+8{+2HU|aYzD6oz^e#{=)ouahR($zGMg=lcS75ytd2L5=^kt~ zkJ6|x8sBr%i$B+$rki;NHtgtl+uCW*4Qn~VwlMr5fM{LJD|a??jyJ9RqH~dQ)GA9S ztdc6%@76`DMMC6R2Bg-qCq8VtUf&{JZ8ACepmIxo$Lm}ch^naQJ?d}6O8k*AW*y@e<3-aciH4@JZc45!)Olfj=|Wsc|*M~+m@TB^5AX#E|-3nWW}y>1ZEpVe#f3?TOubuTF(fanzM#0 zkvUaI3k4AghxB<{G_4V0MV#I?ErDnm2PILY)bfRiLe*QSkk{_V=n1cR$2ynWFK+z} zu}u=iNq6QWo~*k*C(1dw-7(X)f!)0LDEvKc$Wt$v?_CjN_W3{b+%F#m=1o%%y)nse z78Q+y59Y7G^c(}Zh(TBdfz|{wC2MQ;`^e@4y@#|y z8j&4=4p8(B1_bv^e}WUkxU!Lh6!E3$jbo#EmC+x(^HxVfA76Uolf=C0>2PNL?yw33 zmy58!ABW>%Y7R;W;Jegbf9F#Rco47+Wh}n=s;@oWF+~S)BVoU+6aKnF#Xv$l zqPP4GoP7*um&J9R93JYC(?KcqL4_xn)P(2ai+zz%TZ{aj*QeUjyvXXK&2hnJ->1D~ zs9o~~$C;e`Pb2ay6V?UJS^dbPA0Io;q(?N25bcVqO%hfEIpj$ZD;Zj6tlYF>cIDA! 
zBHSZEa?_q#mmF-g%}WC zWo-=Q;;vZ5r_zcSaDous@HMs6?E+c2vmk{!r`eLw#kfd2R!V7ukf0rd`azwy&*9q` zb1r#|K(c5AGj>R(`WNG#dy7>=r@;Sp&v@epb96Thm6!V7ZUBz7C(?Z8CzXl#>uDi=JH#`|4?5Ajq-Zppj>&ZR^An%JE4U z@TA&DxpAsb&pv=5*$*ixZ}ajOW3U#`H&5}N*Kp^>qQg@<<=UWHR+E8O`&_j6RscwU zGfFu9d0ynXkz~;FX`=?IlC?hZjvSWM#RmI^t^CFm<3uN=F0|N{_}~jP^V=}SknkKq z?gvw6S;UJuEsV<4*lG8{jGa49p Date: Wed, 7 Nov 2018 10:21:14 -0800 Subject: [PATCH 243/540] Add AvgPool3d/Conv3D/MaxPool3D to TFMobile ops list PiperOrigin-RevId: 220485157 --- tensorflow/contrib/makefile/tf_op_files.txt | 2 ++ tensorflow/core/kernels/BUILD | 5 +++++ tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc | 3 +++ 3 files changed, 10 insertions(+) diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt index 24a4a03f232..e779eff6890 100644 --- a/tensorflow/contrib/makefile/tf_op_files.txt +++ b/tensorflow/contrib/makefile/tf_op_files.txt @@ -42,6 +42,7 @@ tensorflow/core/kernels/conv_grad_filter_ops.cc tensorflow/core/kernels/conv_grad_input_ops.cc tensorflow/core/kernels/conv_grad_ops.cc tensorflow/core/kernels/conv_ops.cc +tensorflow/core/kernels/conv_ops_3d.cc tensorflow/core/kernels/conv_ops_fused.cc tensorflow/core/kernels/conv_ops_using_gemm.cc tensorflow/core/kernels/crop_and_resize_op.cc @@ -163,6 +164,7 @@ tensorflow/core/kernels/pack_op.cc tensorflow/core/kernels/pad_op.cc tensorflow/core/kernels/padding_fifo_queue.cc tensorflow/core/kernels/padding_fifo_queue_op.cc +tensorflow/core/kernels/pooling_ops_3d.cc tensorflow/core/kernels/pooling_ops_common.cc tensorflow/core/kernels/population_count_op.cc tensorflow/core/kernels/quantization_utils.cc diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index be448ed3db1..b855fe5436b 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -5314,7 +5314,9 @@ filegroup( "batch_norm_op.h", "control_flow_ops.h", "conv_2d.h", + "conv_3d.h", "conv_ops.h", + "conv_ops_gpu.h", "data_format_ops.h", 
"depthtospace_op.h", "depthwise_conv_op.h", @@ -5333,6 +5335,7 @@ filegroup( "mirror_pad_op.h", "mirror_pad_op_cpu_impl.h", "pad_op.h", + "pooling_ops_3d.h", "random_op.h", "reduction_ops.h", "reduction_ops_common.h", @@ -5380,6 +5383,7 @@ filegroup( "conv_grad_ops.cc", "conv_grad_ops.h", "conv_ops.cc", + "conv_ops_3d.cc", "conv_ops_fused.cc", "conv_ops_using_gemm.cc", "crop_and_resize_op.cc", @@ -5490,6 +5494,7 @@ filegroup( "pad_op.cc", "padding_fifo_queue.cc", "padding_fifo_queue_op.cc", + "pooling_ops_3d.cc", "queue_base.cc", "queue_op.cc", "queue_ops.cc", diff --git a/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc b/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc index 221e9b8e34e..d251589b483 100644 --- a/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc +++ b/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc @@ -55,6 +55,7 @@ bool IsWhitelistedFlexOp(const std::string& tensorflow_op_name) { "AssignSub", "AudioSpectrogram", "AvgPool", + "AvgPool3D", "AvgPoolGrad", "BatchMatMul", "BatchNormWithGlobalNormalization", @@ -78,6 +79,7 @@ bool IsWhitelistedFlexOp(const std::string& tensorflow_op_name) { "Conv2D", "Conv2DBackpropFilter", "Conv2DBackpropInput", + "Conv3D", "Cos", "Cosh", "CropAndResize", @@ -168,6 +170,7 @@ bool IsWhitelistedFlexOp(const std::string& tensorflow_op_name) { "Max", "Maximum", "MaxPool", + "MaxPool3D", "MaxPoolGrad", "MaxPoolGradGrad", "MaxPoolGradGradV2", From 71f69fdaeb3d75d5cfb9b37fe6e90b7a20bb997b Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Wed, 7 Nov 2018 10:35:04 -0800 Subject: [PATCH 244/540] Internal change. 
PiperOrigin-RevId: 220487789 --- .../python/ops/distributions/bernoulli.py | 2 +- tensorflow/python/ops/distributions/beta.py | 2 +- .../python/ops/distributions/categorical.py | 2 +- .../python/ops/distributions/dirichlet.py | 2 +- .../distributions/dirichlet_multinomial.py | 2 +- .../python/ops/distributions/distribution.py | 8 +- .../python/ops/distributions/exponential.py | 2 +- tensorflow/python/ops/distributions/gamma.py | 2 +- .../ops/distributions/kullback_leibler.py | 4 +- .../python/ops/distributions/laplace.py | 2 +- .../python/ops/distributions/multinomial.py | 2 +- tensorflow/python/ops/distributions/normal.py | 2 +- .../python/ops/distributions/student_t.py | 2 +- .../python/ops/distributions/uniform.py | 2 +- .../tools/api/generator/api_init_files.bzl | 1 - .../tensorflow.distributions.-bernoulli.pbtxt | 143 ----------------- .../v2/tensorflow.distributions.-beta.pbtxt | 147 ------------------ ...ensorflow.distributions.-categorical.pbtxt | 147 ------------------ ...distributions.-dirichlet-multinomial.pbtxt | 147 ------------------ .../tensorflow.distributions.-dirichlet.pbtxt | 143 ----------------- ...nsorflow.distributions.-distribution.pbtxt | 134 ---------------- ...ensorflow.distributions.-exponential.pbtxt | 144 ----------------- .../v2/tensorflow.distributions.-gamma.pbtxt | 143 ----------------- .../tensorflow.distributions.-laplace.pbtxt | 143 ----------------- ...ensorflow.distributions.-multinomial.pbtxt | 147 ------------------ .../v2/tensorflow.distributions.-normal.pbtxt | 143 ----------------- ...nsorflow.distributions.-register-k-l.pbtxt | 9 -- ...stributions.-reparameterization-type.pbtxt | 9 -- .../tensorflow.distributions.-student-t.pbtxt | 147 ------------------ .../tensorflow.distributions.-uniform.pbtxt | 147 ------------------ .../golden/v2/tensorflow.distributions.pbtxt | 75 --------- .../tools/api/golden/v2/tensorflow.pbtxt | 4 - 32 files changed, 18 insertions(+), 1991 deletions(-) delete mode 100644 
tensorflow/tools/api/golden/v2/tensorflow.distributions.-bernoulli.pbtxt delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.distributions.-beta.pbtxt delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.distributions.-categorical.pbtxt delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet-multinomial.pbtxt delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet.pbtxt delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.distributions.-distribution.pbtxt delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.distributions.-exponential.pbtxt delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.distributions.-gamma.pbtxt delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.distributions.-laplace.pbtxt delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.distributions.-multinomial.pbtxt delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.distributions.-normal.pbtxt delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.distributions.-register-k-l.pbtxt delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.distributions.-reparameterization-type.pbtxt delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.distributions.-student-t.pbtxt delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.distributions.-uniform.pbtxt delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.distributions.pbtxt diff --git a/tensorflow/python/ops/distributions/bernoulli.py b/tensorflow/python/ops/distributions/bernoulli.py index baecc321d38..4fb598aef4d 100644 --- a/tensorflow/python/ops/distributions/bernoulli.py +++ b/tensorflow/python/ops/distributions/bernoulli.py @@ -32,7 +32,7 @@ from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import tf_export -@tf_export("distributions.Bernoulli") +@tf_export(v1=["distributions.Bernoulli"]) class Bernoulli(distribution.Distribution): """Bernoulli distribution. 
diff --git a/tensorflow/python/ops/distributions/beta.py b/tensorflow/python/ops/distributions/beta.py index 51c4f6eb3d0..1d1a666317f 100644 --- a/tensorflow/python/ops/distributions/beta.py +++ b/tensorflow/python/ops/distributions/beta.py @@ -47,7 +47,7 @@ _beta_sample_note = """Note: `x` must have dtype `self.dtype` and be in `[0, 1].` It must have a shape compatible with `self.batch_shape()`.""" -@tf_export("distributions.Beta") +@tf_export(v1=["distributions.Beta"]) class Beta(distribution.Distribution): """Beta distribution. diff --git a/tensorflow/python/ops/distributions/categorical.py b/tensorflow/python/ops/distributions/categorical.py index 09d7e0e6804..33a84356250 100644 --- a/tensorflow/python/ops/distributions/categorical.py +++ b/tensorflow/python/ops/distributions/categorical.py @@ -59,7 +59,7 @@ def _broadcast_cat_event_and_params(event, params, base_dtype): return event, params -@tf_export("distributions.Categorical") +@tf_export(v1=["distributions.Categorical"]) class Categorical(distribution.Distribution): """Categorical distribution. diff --git a/tensorflow/python/ops/distributions/dirichlet.py b/tensorflow/python/ops/distributions/dirichlet.py index 675c30b3833..971ce46efbc 100644 --- a/tensorflow/python/ops/distributions/dirichlet.py +++ b/tensorflow/python/ops/distributions/dirichlet.py @@ -45,7 +45,7 @@ dtype `self.dtype` and be in the `(self.event_shape() - 1)`-simplex, i.e., `self.batch_shape() + self.event_shape()`.""" -@tf_export("distributions.Dirichlet") +@tf_export(v1=["distributions.Dirichlet"]) class Dirichlet(distribution.Distribution): """Dirichlet distribution. 
diff --git a/tensorflow/python/ops/distributions/dirichlet_multinomial.py b/tensorflow/python/ops/distributions/dirichlet_multinomial.py index 2e3151a5ab4..8ce01f6b957 100644 --- a/tensorflow/python/ops/distributions/dirichlet_multinomial.py +++ b/tensorflow/python/ops/distributions/dirichlet_multinomial.py @@ -51,7 +51,7 @@ fractional components, and such that with `self.concentration` and `self.total_count`.""" -@tf_export("distributions.DirichletMultinomial") +@tf_export(v1=["distributions.DirichletMultinomial"]) class DirichletMultinomial(distribution.Distribution): """Dirichlet-Multinomial compound distribution. diff --git a/tensorflow/python/ops/distributions/distribution.py b/tensorflow/python/ops/distributions/distribution.py index 11247a39bca..d551830fb84 100644 --- a/tensorflow/python/ops/distributions/distribution.py +++ b/tensorflow/python/ops/distributions/distribution.py @@ -212,7 +212,7 @@ class _DistributionMeta(abc.ABCMeta): return abc.ABCMeta.__new__(mcs, classname, baseclasses, attrs) -@tf_export("distributions.ReparameterizationType") +@tf_export(v1=["distributions.ReparameterizationType"]) class ReparameterizationType(object): """Instances of this class represent how sampling is reparameterized. @@ -263,7 +263,7 @@ class ReparameterizationType(object): # reparameterized distribution support straight-through gradients with # respect to all parameters. FULLY_REPARAMETERIZED = ReparameterizationType("FULLY_REPARAMETERIZED") -tf_export("distributions.FULLY_REPARAMETERIZED").export_constant( +tf_export(v1=["distributions.FULLY_REPARAMETERIZED"]).export_constant( __name__, "FULLY_REPARAMETERIZED") @@ -271,12 +271,12 @@ tf_export("distributions.FULLY_REPARAMETERIZED").export_constant( # reparameterized distribution do not support straight-through gradients for # at least some of the parameters. 
NOT_REPARAMETERIZED = ReparameterizationType("NOT_REPARAMETERIZED") -tf_export("distributions.NOT_REPARAMETERIZED").export_constant( +tf_export(v1=["distributions.NOT_REPARAMETERIZED"]).export_constant( __name__, "NOT_REPARAMETERIZED") @six.add_metaclass(_DistributionMeta) -@tf_export("distributions.Distribution") +@tf_export(v1=["distributions.Distribution"]) class Distribution(_BaseDistribution): """A generic probability distribution base class. diff --git a/tensorflow/python/ops/distributions/exponential.py b/tensorflow/python/ops/distributions/exponential.py index 6a52af8c33e..8b79a5d4abd 100644 --- a/tensorflow/python/ops/distributions/exponential.py +++ b/tensorflow/python/ops/distributions/exponential.py @@ -37,7 +37,7 @@ __all__ = [ ] -@tf_export("distributions.Exponential") +@tf_export(v1=["distributions.Exponential"]) class Exponential(gamma.Gamma): """Exponential distribution. diff --git a/tensorflow/python/ops/distributions/gamma.py b/tensorflow/python/ops/distributions/gamma.py index 4a2db208d40..57505d1b131 100644 --- a/tensorflow/python/ops/distributions/gamma.py +++ b/tensorflow/python/ops/distributions/gamma.py @@ -43,7 +43,7 @@ __all__ = [ ] -@tf_export("distributions.Gamma") +@tf_export(v1=["distributions.Gamma"]) class Gamma(distribution.Distribution): """Gamma distribution. 
diff --git a/tensorflow/python/ops/distributions/kullback_leibler.py b/tensorflow/python/ops/distributions/kullback_leibler.py index 12743fa23d6..5c6745b0fe0 100644 --- a/tensorflow/python/ops/distributions/kullback_leibler.py +++ b/tensorflow/python/ops/distributions/kullback_leibler.py @@ -60,7 +60,7 @@ def _registered_kl(type_a, type_b): "should update all references to use `tfp.distributions` " "instead of `tf.distributions`.", warn_once=True) -@tf_export("distributions.kl_divergence") +@tf_export(v1=["distributions.kl_divergence"]) def kl_divergence(distribution_a, distribution_b, allow_nan_stats=True, name=None): """Get the KL-divergence KL(distribution_a || distribution_b). @@ -161,7 +161,7 @@ def cross_entropy(ref, other, ref, other, allow_nan_stats=allow_nan_stats) -@tf_export("distributions.RegisterKL") +@tf_export(v1=["distributions.RegisterKL"]) class RegisterKL(object): """Decorator to register a KL divergence implementation function. diff --git a/tensorflow/python/ops/distributions/laplace.py b/tensorflow/python/ops/distributions/laplace.py index 4f6a8f587d1..a96b58ba1a6 100644 --- a/tensorflow/python/ops/distributions/laplace.py +++ b/tensorflow/python/ops/distributions/laplace.py @@ -43,7 +43,7 @@ __all__ = [ ] -@tf_export("distributions.Laplace") +@tf_export(v1=["distributions.Laplace"]) class Laplace(distribution.Distribution): """The Laplace distribution with location `loc` and `scale` parameters. diff --git a/tensorflow/python/ops/distributions/multinomial.py b/tensorflow/python/ops/distributions/multinomial.py index 8397353cd5e..97d2b1b26c6 100644 --- a/tensorflow/python/ops/distributions/multinomial.py +++ b/tensorflow/python/ops/distributions/multinomial.py @@ -52,7 +52,7 @@ fractional components, and such that with `self.probs` and `self.total_count`.""" -@tf_export("distributions.Multinomial") +@tf_export(v1=["distributions.Multinomial"]) class Multinomial(distribution.Distribution): """Multinomial distribution. 
diff --git a/tensorflow/python/ops/distributions/normal.py b/tensorflow/python/ops/distributions/normal.py index 9f511709b90..9acc0469885 100644 --- a/tensorflow/python/ops/distributions/normal.py +++ b/tensorflow/python/ops/distributions/normal.py @@ -42,7 +42,7 @@ __all__ = [ ] -@tf_export("distributions.Normal") +@tf_export(v1=["distributions.Normal"]) class Normal(distribution.Distribution): """The Normal distribution with location `loc` and `scale` parameters. diff --git a/tensorflow/python/ops/distributions/student_t.py b/tensorflow/python/ops/distributions/student_t.py index b69e61925c1..351f5605e24 100644 --- a/tensorflow/python/ops/distributions/student_t.py +++ b/tensorflow/python/ops/distributions/student_t.py @@ -43,7 +43,7 @@ __all__ = [ ] -@tf_export("distributions.StudentT") +@tf_export(v1=["distributions.StudentT"]) class StudentT(distribution.Distribution): """Student's t-distribution. diff --git a/tensorflow/python/ops/distributions/uniform.py b/tensorflow/python/ops/distributions/uniform.py index b6b24187cc5..8fac0167778 100644 --- a/tensorflow/python/ops/distributions/uniform.py +++ b/tensorflow/python/ops/distributions/uniform.py @@ -33,7 +33,7 @@ from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import tf_export -@tf_export("distributions.Uniform") +@tf_export(v1=["distributions.Uniform"]) class Uniform(distribution.Distribution): """Uniform distribution with `low` and `high` parameters. 
diff --git a/tensorflow/python/tools/api/generator/api_init_files.bzl b/tensorflow/python/tools/api/generator/api_init_files.bzl index ac7bc28b2be..49603a6e16a 100644 --- a/tensorflow/python/tools/api/generator/api_init_files.bzl +++ b/tensorflow/python/tools/api/generator/api_init_files.bzl @@ -10,7 +10,6 @@ TENSORFLOW_API_INIT_FILES = [ "data/__init__.py", "data/experimental/__init__.py", "debugging/__init__.py", - "distributions/__init__.py", "dtypes/__init__.py", "errors/__init__.py", "experimental/__init__.py", diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-bernoulli.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-bernoulli.pbtxt deleted file mode 100644 index ca96f4eaece..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-bernoulli.pbtxt +++ /dev/null @@ -1,143 +0,0 @@ -path: "tensorflow.distributions.Bernoulli" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - member { - name: "allow_nan_stats" - mtype: "" - } - member { - name: "batch_shape" - mtype: "" - } - member { - name: "dtype" - mtype: "" - } - member { - name: "event_shape" - mtype: "" - } - member { - name: "logits" - mtype: "" - } - member { - name: "name" - mtype: "" - } - member { - name: "parameters" - mtype: "" - } - member { - name: "probs" - mtype: "" - } - member { - name: "reparameterization_type" - mtype: "" - } - member { - name: "validate_args" - mtype: "" - } - member_method { - name: "__init__" - argspec: "args=[\'self\', \'logits\', \'probs\', \'dtype\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"\", \'False\', \'True\', \'Bernoulli\'], " - } - member_method { - name: "batch_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], " - } - member_method { - name: "cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, 
defaults=[\'cdf\'], " - } - member_method { - name: "copy" - argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None" - } - member_method { - name: "covariance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], " - } - member_method { - name: "cross_entropy" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], " - } - member_method { - name: "entropy" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], " - } - member_method { - name: "event_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], " - } - member_method { - name: "is_scalar_batch" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], " - } - member_method { - name: "is_scalar_event" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], " - } - member_method { - name: "kl_divergence" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], " - } - member_method { - name: "log_cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], " - } - member_method { - name: "log_prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], " - } - member_method { - name: "log_survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], " - } - member_method { - name: "mean" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], " - } - member_method { - name: "mode" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], " - } - member_method { - name: "param_shapes" - argspec: "args=[\'cls\', 
\'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], " - } - member_method { - name: "param_static_shapes" - argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], " - } - member_method { - name: "quantile" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], " - } - member_method { - name: "sample" - argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], " - } - member_method { - name: "stddev" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], " - } - member_method { - name: "survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], " - } - member_method { - name: "variance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], " - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-beta.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-beta.pbtxt deleted file mode 100644 index d0508acd9f4..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-beta.pbtxt +++ /dev/null @@ -1,147 +0,0 @@ -path: "tensorflow.distributions.Beta" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - member { - name: "allow_nan_stats" - mtype: "" - } - member { - name: "batch_shape" - mtype: "" - } - member { - name: "concentration0" - mtype: "" - } - member { - name: "concentration1" - mtype: "" - } - member { - name: "dtype" - mtype: "" - } - member { - name: "event_shape" - mtype: "" - } - member { - name: "name" - mtype: "" - } - member { - name: "parameters" - mtype: "" - } - member { - name: 
"reparameterization_type" - mtype: "" - } - member { - name: "total_concentration" - mtype: "" - } - member { - name: "validate_args" - mtype: "" - } - member_method { - name: "__init__" - argspec: "args=[\'self\', \'concentration1\', \'concentration0\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'True\', \'Beta\'], " - } - member_method { - name: "batch_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], " - } - member_method { - name: "cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], " - } - member_method { - name: "copy" - argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None" - } - member_method { - name: "covariance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], " - } - member_method { - name: "cross_entropy" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], " - } - member_method { - name: "entropy" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], " - } - member_method { - name: "event_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], " - } - member_method { - name: "is_scalar_batch" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], " - } - member_method { - name: "is_scalar_event" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], " - } - member_method { - name: "kl_divergence" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], " - } - member_method { - name: "log_cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, 
defaults=[\'log_cdf\'], " - } - member_method { - name: "log_prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], " - } - member_method { - name: "log_survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], " - } - member_method { - name: "mean" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], " - } - member_method { - name: "mode" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], " - } - member_method { - name: "param_shapes" - argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], " - } - member_method { - name: "param_static_shapes" - argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], " - } - member_method { - name: "quantile" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], " - } - member_method { - name: "sample" - argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], " - } - member_method { - name: "stddev" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], " - } - member_method { - name: "survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], " - } - member_method { - name: "variance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], " - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-categorical.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-categorical.pbtxt deleted file 
mode 100644 index ff0fbb56cd4..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-categorical.pbtxt +++ /dev/null @@ -1,147 +0,0 @@ -path: "tensorflow.distributions.Categorical" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - member { - name: "allow_nan_stats" - mtype: "" - } - member { - name: "batch_shape" - mtype: "" - } - member { - name: "dtype" - mtype: "" - } - member { - name: "event_shape" - mtype: "" - } - member { - name: "event_size" - mtype: "" - } - member { - name: "logits" - mtype: "" - } - member { - name: "name" - mtype: "" - } - member { - name: "parameters" - mtype: "" - } - member { - name: "probs" - mtype: "" - } - member { - name: "reparameterization_type" - mtype: "" - } - member { - name: "validate_args" - mtype: "" - } - member_method { - name: "__init__" - argspec: "args=[\'self\', \'logits\', \'probs\', \'dtype\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"\", \'False\', \'True\', \'Categorical\'], " - } - member_method { - name: "batch_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], " - } - member_method { - name: "cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], " - } - member_method { - name: "copy" - argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None" - } - member_method { - name: "covariance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], " - } - member_method { - name: "cross_entropy" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], " - } - member_method { - name: "entropy" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], " - } - member_method { - name: "event_shape_tensor" - argspec: 
"args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], " - } - member_method { - name: "is_scalar_batch" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], " - } - member_method { - name: "is_scalar_event" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], " - } - member_method { - name: "kl_divergence" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], " - } - member_method { - name: "log_cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], " - } - member_method { - name: "log_prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], " - } - member_method { - name: "log_survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], " - } - member_method { - name: "mean" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], " - } - member_method { - name: "mode" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], " - } - member_method { - name: "param_shapes" - argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], " - } - member_method { - name: "param_static_shapes" - argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], " - } - member_method { - name: "quantile" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], " - } - member_method { - name: "sample" - argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, 
keywords=None, defaults=[\'()\', \'None\', \'sample\'], " - } - member_method { - name: "stddev" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], " - } - member_method { - name: "survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], " - } - member_method { - name: "variance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], " - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet-multinomial.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet-multinomial.pbtxt deleted file mode 100644 index d75e4a2f88b..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet-multinomial.pbtxt +++ /dev/null @@ -1,147 +0,0 @@ -path: "tensorflow.distributions.DirichletMultinomial" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - member { - name: "allow_nan_stats" - mtype: "" - } - member { - name: "batch_shape" - mtype: "" - } - member { - name: "concentration" - mtype: "" - } - member { - name: "dtype" - mtype: "" - } - member { - name: "event_shape" - mtype: "" - } - member { - name: "name" - mtype: "" - } - member { - name: "parameters" - mtype: "" - } - member { - name: "reparameterization_type" - mtype: "" - } - member { - name: "total_concentration" - mtype: "" - } - member { - name: "total_count" - mtype: "" - } - member { - name: "validate_args" - mtype: "" - } - member_method { - name: "__init__" - argspec: "args=[\'self\', \'total_count\', \'concentration\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'DirichletMultinomial\'], " - } - member_method { - name: "batch_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], " - } - member_method { - name: "cdf" - argspec: 
"args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], " - } - member_method { - name: "copy" - argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None" - } - member_method { - name: "covariance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], " - } - member_method { - name: "cross_entropy" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], " - } - member_method { - name: "entropy" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], " - } - member_method { - name: "event_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], " - } - member_method { - name: "is_scalar_batch" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], " - } - member_method { - name: "is_scalar_event" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], " - } - member_method { - name: "kl_divergence" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], " - } - member_method { - name: "log_cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], " - } - member_method { - name: "log_prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], " - } - member_method { - name: "log_survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], " - } - member_method { - name: "mean" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], " - } - member_method { - name: "mode" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], " - } - 
member_method { - name: "param_shapes" - argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], " - } - member_method { - name: "param_static_shapes" - argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], " - } - member_method { - name: "quantile" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], " - } - member_method { - name: "sample" - argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], " - } - member_method { - name: "stddev" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], " - } - member_method { - name: "survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], " - } - member_method { - name: "variance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], " - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet.pbtxt deleted file mode 100644 index b838b9ae21d..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet.pbtxt +++ /dev/null @@ -1,143 +0,0 @@ -path: "tensorflow.distributions.Dirichlet" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - member { - name: "allow_nan_stats" - mtype: "" - } - member { - name: "batch_shape" - mtype: "" - } - member { - name: "concentration" - mtype: "" - } - member { - name: "dtype" - mtype: "" - } - member { - name: "event_shape" - mtype: "" - } - member { - name: "name" - mtype: "" - } - member { - name: "parameters" - mtype: "" 
- } - member { - name: "reparameterization_type" - mtype: "" - } - member { - name: "total_concentration" - mtype: "" - } - member { - name: "validate_args" - mtype: "" - } - member_method { - name: "__init__" - argspec: "args=[\'self\', \'concentration\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'Dirichlet\'], " - } - member_method { - name: "batch_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], " - } - member_method { - name: "cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], " - } - member_method { - name: "copy" - argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None" - } - member_method { - name: "covariance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], " - } - member_method { - name: "cross_entropy" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], " - } - member_method { - name: "entropy" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], " - } - member_method { - name: "event_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], " - } - member_method { - name: "is_scalar_batch" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], " - } - member_method { - name: "is_scalar_event" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], " - } - member_method { - name: "kl_divergence" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], " - } - member_method { - name: "log_cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], " 
- } - member_method { - name: "log_prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], " - } - member_method { - name: "log_survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], " - } - member_method { - name: "mean" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], " - } - member_method { - name: "mode" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], " - } - member_method { - name: "param_shapes" - argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], " - } - member_method { - name: "param_static_shapes" - argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], " - } - member_method { - name: "quantile" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], " - } - member_method { - name: "sample" - argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], " - } - member_method { - name: "stddev" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], " - } - member_method { - name: "survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], " - } - member_method { - name: "variance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], " - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-distribution.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-distribution.pbtxt deleted file mode 100644 index 
6f06b7d50dd..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-distribution.pbtxt +++ /dev/null @@ -1,134 +0,0 @@ -path: "tensorflow.distributions.Distribution" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - member { - name: "allow_nan_stats" - mtype: "" - } - member { - name: "batch_shape" - mtype: "" - } - member { - name: "dtype" - mtype: "" - } - member { - name: "event_shape" - mtype: "" - } - member { - name: "name" - mtype: "" - } - member { - name: "parameters" - mtype: "" - } - member { - name: "reparameterization_type" - mtype: "" - } - member { - name: "validate_args" - mtype: "" - } - member_method { - name: "__init__" - argspec: "args=[\'self\', \'dtype\', \'reparameterization_type\', \'validate_args\', \'allow_nan_stats\', \'parameters\', \'graph_parents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " - } - member_method { - name: "batch_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], " - } - member_method { - name: "cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], " - } - member_method { - name: "copy" - argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None" - } - member_method { - name: "covariance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], " - } - member_method { - name: "cross_entropy" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], " - } - member_method { - name: "entropy" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], " - } - member_method { - name: "event_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], " - } - member_method { - name: "is_scalar_batch" - argspec: "args=[\'self\', 
\'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], " - } - member_method { - name: "is_scalar_event" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], " - } - member_method { - name: "kl_divergence" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], " - } - member_method { - name: "log_cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], " - } - member_method { - name: "log_prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], " - } - member_method { - name: "log_survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], " - } - member_method { - name: "mean" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], " - } - member_method { - name: "mode" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], " - } - member_method { - name: "param_shapes" - argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], " - } - member_method { - name: "param_static_shapes" - argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], " - } - member_method { - name: "quantile" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], " - } - member_method { - name: "sample" - argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], " - } - member_method { - name: "stddev" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, 
defaults=[\'stddev\'], " - } - member_method { - name: "survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], " - } - member_method { - name: "variance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], " - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-exponential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-exponential.pbtxt deleted file mode 100644 index d34f9cde5d4..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-exponential.pbtxt +++ /dev/null @@ -1,144 +0,0 @@ -path: "tensorflow.distributions.Exponential" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - member { - name: "allow_nan_stats" - mtype: "" - } - member { - name: "batch_shape" - mtype: "" - } - member { - name: "concentration" - mtype: "" - } - member { - name: "dtype" - mtype: "" - } - member { - name: "event_shape" - mtype: "" - } - member { - name: "name" - mtype: "" - } - member { - name: "parameters" - mtype: "" - } - member { - name: "rate" - mtype: "" - } - member { - name: "reparameterization_type" - mtype: "" - } - member { - name: "validate_args" - mtype: "" - } - member_method { - name: "__init__" - argspec: "args=[\'self\', \'rate\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'Exponential\'], " - } - member_method { - name: "batch_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], " - } - member_method { - name: "cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], " - } - member_method { - name: "copy" - argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None" - } - member_method { - name: "covariance" - argspec: 
"args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], " - } - member_method { - name: "cross_entropy" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], " - } - member_method { - name: "entropy" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], " - } - member_method { - name: "event_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], " - } - member_method { - name: "is_scalar_batch" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], " - } - member_method { - name: "is_scalar_event" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], " - } - member_method { - name: "kl_divergence" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], " - } - member_method { - name: "log_cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], " - } - member_method { - name: "log_prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], " - } - member_method { - name: "log_survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], " - } - member_method { - name: "mean" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], " - } - member_method { - name: "mode" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], " - } - member_method { - name: "param_shapes" - argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], " - } - member_method { - name: "param_static_shapes" - argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, 
keywords=None, defaults=None" - } - member_method { - name: "prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], " - } - member_method { - name: "quantile" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], " - } - member_method { - name: "sample" - argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], " - } - member_method { - name: "stddev" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], " - } - member_method { - name: "survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], " - } - member_method { - name: "variance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], " - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-gamma.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-gamma.pbtxt deleted file mode 100644 index df268b8d99e..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-gamma.pbtxt +++ /dev/null @@ -1,143 +0,0 @@ -path: "tensorflow.distributions.Gamma" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - member { - name: "allow_nan_stats" - mtype: "" - } - member { - name: "batch_shape" - mtype: "" - } - member { - name: "concentration" - mtype: "" - } - member { - name: "dtype" - mtype: "" - } - member { - name: "event_shape" - mtype: "" - } - member { - name: "name" - mtype: "" - } - member { - name: "parameters" - mtype: "" - } - member { - name: "rate" - mtype: "" - } - member { - name: "reparameterization_type" - mtype: "" - } - member { - name: "validate_args" - mtype: "" - } - member_method { - name: "__init__" - argspec: "args=[\'self\', \'concentration\', \'rate\', \'validate_args\', \'allow_nan_stats\', 
\'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'Gamma\'], " - } - member_method { - name: "batch_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], " - } - member_method { - name: "cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], " - } - member_method { - name: "copy" - argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None" - } - member_method { - name: "covariance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], " - } - member_method { - name: "cross_entropy" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], " - } - member_method { - name: "entropy" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], " - } - member_method { - name: "event_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], " - } - member_method { - name: "is_scalar_batch" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], " - } - member_method { - name: "is_scalar_event" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], " - } - member_method { - name: "kl_divergence" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], " - } - member_method { - name: "log_cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], " - } - member_method { - name: "log_prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], " - } - member_method { - name: "log_survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, 
defaults=[\'log_survival_function\'], " - } - member_method { - name: "mean" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], " - } - member_method { - name: "mode" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], " - } - member_method { - name: "param_shapes" - argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], " - } - member_method { - name: "param_static_shapes" - argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], " - } - member_method { - name: "quantile" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], " - } - member_method { - name: "sample" - argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], " - } - member_method { - name: "stddev" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], " - } - member_method { - name: "survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], " - } - member_method { - name: "variance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], " - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-laplace.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-laplace.pbtxt deleted file mode 100644 index 303dcb4ed3b..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-laplace.pbtxt +++ /dev/null @@ -1,143 +0,0 @@ -path: "tensorflow.distributions.Laplace" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - member { - name: "allow_nan_stats" 
- mtype: "" - } - member { - name: "batch_shape" - mtype: "" - } - member { - name: "dtype" - mtype: "" - } - member { - name: "event_shape" - mtype: "" - } - member { - name: "loc" - mtype: "" - } - member { - name: "name" - mtype: "" - } - member { - name: "parameters" - mtype: "" - } - member { - name: "reparameterization_type" - mtype: "" - } - member { - name: "scale" - mtype: "" - } - member { - name: "validate_args" - mtype: "" - } - member_method { - name: "__init__" - argspec: "args=[\'self\', \'loc\', \'scale\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'Laplace\'], " - } - member_method { - name: "batch_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], " - } - member_method { - name: "cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], " - } - member_method { - name: "copy" - argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None" - } - member_method { - name: "covariance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], " - } - member_method { - name: "cross_entropy" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], " - } - member_method { - name: "entropy" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], " - } - member_method { - name: "event_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], " - } - member_method { - name: "is_scalar_batch" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], " - } - member_method { - name: "is_scalar_event" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], " - } - member_method { - name: 
"kl_divergence" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], " - } - member_method { - name: "log_cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], " - } - member_method { - name: "log_prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], " - } - member_method { - name: "log_survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], " - } - member_method { - name: "mean" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], " - } - member_method { - name: "mode" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], " - } - member_method { - name: "param_shapes" - argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], " - } - member_method { - name: "param_static_shapes" - argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], " - } - member_method { - name: "quantile" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], " - } - member_method { - name: "sample" - argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], " - } - member_method { - name: "stddev" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], " - } - member_method { - name: "survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], " - } - member_method { - name: "variance" - argspec: "args=[\'self\', 
\'name\'], varargs=None, keywords=None, defaults=[\'variance\'], " - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-multinomial.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-multinomial.pbtxt deleted file mode 100644 index ecda8acb15c..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-multinomial.pbtxt +++ /dev/null @@ -1,147 +0,0 @@ -path: "tensorflow.distributions.Multinomial" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - member { - name: "allow_nan_stats" - mtype: "" - } - member { - name: "batch_shape" - mtype: "" - } - member { - name: "dtype" - mtype: "" - } - member { - name: "event_shape" - mtype: "" - } - member { - name: "logits" - mtype: "" - } - member { - name: "name" - mtype: "" - } - member { - name: "parameters" - mtype: "" - } - member { - name: "probs" - mtype: "" - } - member { - name: "reparameterization_type" - mtype: "" - } - member { - name: "total_count" - mtype: "" - } - member { - name: "validate_args" - mtype: "" - } - member_method { - name: "__init__" - argspec: "args=[\'self\', \'total_count\', \'logits\', \'probs\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'True\', \'Multinomial\'], " - } - member_method { - name: "batch_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], " - } - member_method { - name: "cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], " - } - member_method { - name: "copy" - argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None" - } - member_method { - name: "covariance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], " - } - member_method { - name: "cross_entropy" - argspec: "args=[\'self\', \'other\', \'name\'], 
varargs=None, keywords=None, defaults=[\'cross_entropy\'], " - } - member_method { - name: "entropy" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], " - } - member_method { - name: "event_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], " - } - member_method { - name: "is_scalar_batch" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], " - } - member_method { - name: "is_scalar_event" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], " - } - member_method { - name: "kl_divergence" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], " - } - member_method { - name: "log_cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], " - } - member_method { - name: "log_prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], " - } - member_method { - name: "log_survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], " - } - member_method { - name: "mean" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], " - } - member_method { - name: "mode" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], " - } - member_method { - name: "param_shapes" - argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], " - } - member_method { - name: "param_static_shapes" - argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], " - } - 
member_method { - name: "quantile" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], " - } - member_method { - name: "sample" - argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], " - } - member_method { - name: "stddev" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], " - } - member_method { - name: "survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], " - } - member_method { - name: "variance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], " - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-normal.pbtxt deleted file mode 100644 index 92b9eeea223..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-normal.pbtxt +++ /dev/null @@ -1,143 +0,0 @@ -path: "tensorflow.distributions.Normal" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - member { - name: "allow_nan_stats" - mtype: "" - } - member { - name: "batch_shape" - mtype: "" - } - member { - name: "dtype" - mtype: "" - } - member { - name: "event_shape" - mtype: "" - } - member { - name: "loc" - mtype: "" - } - member { - name: "name" - mtype: "" - } - member { - name: "parameters" - mtype: "" - } - member { - name: "reparameterization_type" - mtype: "" - } - member { - name: "scale" - mtype: "" - } - member { - name: "validate_args" - mtype: "" - } - member_method { - name: "__init__" - argspec: "args=[\'self\', \'loc\', \'scale\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'Normal\'], " - } - member_method { - name: "batch_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, 
keywords=None, defaults=[\'batch_shape_tensor\'], " - } - member_method { - name: "cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], " - } - member_method { - name: "copy" - argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None" - } - member_method { - name: "covariance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], " - } - member_method { - name: "cross_entropy" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], " - } - member_method { - name: "entropy" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], " - } - member_method { - name: "event_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], " - } - member_method { - name: "is_scalar_batch" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], " - } - member_method { - name: "is_scalar_event" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], " - } - member_method { - name: "kl_divergence" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], " - } - member_method { - name: "log_cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], " - } - member_method { - name: "log_prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], " - } - member_method { - name: "log_survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], " - } - member_method { - name: "mean" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], " - } - member_method { - name: "mode" - 
argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], " - } - member_method { - name: "param_shapes" - argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], " - } - member_method { - name: "param_static_shapes" - argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], " - } - member_method { - name: "quantile" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], " - } - member_method { - name: "sample" - argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], " - } - member_method { - name: "stddev" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], " - } - member_method { - name: "survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], " - } - member_method { - name: "variance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], " - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-register-k-l.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-register-k-l.pbtxt deleted file mode 100644 index e3db443c2bd..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-register-k-l.pbtxt +++ /dev/null @@ -1,9 +0,0 @@ -path: "tensorflow.distributions.RegisterKL" -tf_class { - is_instance: "" - is_instance: "" - member_method { - name: "__init__" - argspec: "args=[\'self\', \'dist_cls_a\', \'dist_cls_b\'], varargs=None, keywords=None, defaults=None" - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-reparameterization-type.pbtxt 
b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-reparameterization-type.pbtxt deleted file mode 100644 index 02e8d576ddd..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-reparameterization-type.pbtxt +++ /dev/null @@ -1,9 +0,0 @@ -path: "tensorflow.distributions.ReparameterizationType" -tf_class { - is_instance: "" - is_instance: "" - member_method { - name: "__init__" - argspec: "args=[\'self\', \'rep_type\'], varargs=None, keywords=None, defaults=None" - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-student-t.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-student-t.pbtxt deleted file mode 100644 index 9aa7f9a6346..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-student-t.pbtxt +++ /dev/null @@ -1,147 +0,0 @@ -path: "tensorflow.distributions.StudentT" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - member { - name: "allow_nan_stats" - mtype: "" - } - member { - name: "batch_shape" - mtype: "" - } - member { - name: "df" - mtype: "" - } - member { - name: "dtype" - mtype: "" - } - member { - name: "event_shape" - mtype: "" - } - member { - name: "loc" - mtype: "" - } - member { - name: "name" - mtype: "" - } - member { - name: "parameters" - mtype: "" - } - member { - name: "reparameterization_type" - mtype: "" - } - member { - name: "scale" - mtype: "" - } - member { - name: "validate_args" - mtype: "" - } - member_method { - name: "__init__" - argspec: "args=[\'self\', \'df\', \'loc\', \'scale\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'StudentT\'], " - } - member_method { - name: "batch_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], " - } - member_method { - name: "cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], " - } - 
member_method { - name: "copy" - argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None" - } - member_method { - name: "covariance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], " - } - member_method { - name: "cross_entropy" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], " - } - member_method { - name: "entropy" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], " - } - member_method { - name: "event_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], " - } - member_method { - name: "is_scalar_batch" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], " - } - member_method { - name: "is_scalar_event" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], " - } - member_method { - name: "kl_divergence" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], " - } - member_method { - name: "log_cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], " - } - member_method { - name: "log_prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], " - } - member_method { - name: "log_survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], " - } - member_method { - name: "mean" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], " - } - member_method { - name: "mode" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], " - } - member_method { - name: "param_shapes" - argspec: "args=[\'cls\', \'sample_shape\', \'name\'], 
varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], " - } - member_method { - name: "param_static_shapes" - argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], " - } - member_method { - name: "quantile" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], " - } - member_method { - name: "sample" - argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], " - } - member_method { - name: "stddev" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], " - } - member_method { - name: "survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], " - } - member_method { - name: "variance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], " - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-uniform.pbtxt deleted file mode 100644 index d1b9d306962..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-uniform.pbtxt +++ /dev/null @@ -1,147 +0,0 @@ -path: "tensorflow.distributions.Uniform" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - member { - name: "allow_nan_stats" - mtype: "" - } - member { - name: "batch_shape" - mtype: "" - } - member { - name: "dtype" - mtype: "" - } - member { - name: "event_shape" - mtype: "" - } - member { - name: "high" - mtype: "" - } - member { - name: "low" - mtype: "" - } - member { - name: "name" - mtype: "" - } - member { - name: "parameters" - mtype: "" - } - member { - name: "reparameterization_type" - mtype: "" - } - 
member { - name: "validate_args" - mtype: "" - } - member_method { - name: "__init__" - argspec: "args=[\'self\', \'low\', \'high\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'False\', \'True\', \'Uniform\'], " - } - member_method { - name: "batch_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], " - } - member_method { - name: "cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], " - } - member_method { - name: "copy" - argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None" - } - member_method { - name: "covariance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], " - } - member_method { - name: "cross_entropy" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], " - } - member_method { - name: "entropy" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], " - } - member_method { - name: "event_shape_tensor" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], " - } - member_method { - name: "is_scalar_batch" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], " - } - member_method { - name: "is_scalar_event" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], " - } - member_method { - name: "kl_divergence" - argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], " - } - member_method { - name: "log_cdf" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], " - } - member_method { - name: "log_prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, 
keywords=None, defaults=[\'log_prob\'], " - } - member_method { - name: "log_survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], " - } - member_method { - name: "mean" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], " - } - member_method { - name: "mode" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], " - } - member_method { - name: "param_shapes" - argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], " - } - member_method { - name: "param_static_shapes" - argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "prob" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], " - } - member_method { - name: "quantile" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], " - } - member_method { - name: "range" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range\'], " - } - member_method { - name: "sample" - argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], " - } - member_method { - name: "stddev" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], " - } - member_method { - name: "survival_function" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], " - } - member_method { - name: "variance" - argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], " - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.pbtxt deleted file mode 100644 index 
90b60ef074d..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.pbtxt +++ /dev/null @@ -1,75 +0,0 @@ -path: "tensorflow.distributions" -tf_module { - member { - name: "Bernoulli" - mtype: "" - } - member { - name: "Beta" - mtype: "" - } - member { - name: "Categorical" - mtype: "" - } - member { - name: "Dirichlet" - mtype: "" - } - member { - name: "DirichletMultinomial" - mtype: "" - } - member { - name: "Distribution" - mtype: "" - } - member { - name: "Exponential" - mtype: "" - } - member { - name: "FULLY_REPARAMETERIZED" - mtype: "" - } - member { - name: "Gamma" - mtype: "" - } - member { - name: "Laplace" - mtype: "" - } - member { - name: "Multinomial" - mtype: "" - } - member { - name: "NOT_REPARAMETERIZED" - mtype: "" - } - member { - name: "Normal" - mtype: "" - } - member { - name: "RegisterKL" - mtype: "" - } - member { - name: "ReparameterizationType" - mtype: "" - } - member { - name: "StudentT" - mtype: "" - } - member { - name: "Uniform" - mtype: "" - } - member_method { - name: "kl_divergence" - argspec: "args=[\'distribution_a\', \'distribution_b\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt index 7c865bb0022..e6d8cb64401 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt @@ -216,10 +216,6 @@ tf_module { name: "debugging" mtype: "" } - member { - name: "distributions" - mtype: "" - } member { name: "double" mtype: "" From dfb95f134c5b12b0b9e9bc79587acd4071ed1e33 Mon Sep 17 00:00:00 2001 From: James Keeling Date: Wed, 7 Nov 2018 10:46:15 -0800 Subject: [PATCH 245/540] Improve error message when explicit non-existent device is requested I used the error interpolation framework to add the node that causes the problem. 
I also added a new sentence that appears if the user has requested a GPU and CUDA is disabled: "The requested device appears to be a GPU, but CUDA is not enabled." PiperOrigin-RevId: 220490021 --- tensorflow/core/common_runtime/placer.cc | 16 ++++++++--- tensorflow/core/common_runtime/placer_test.cc | 27 +++++++++++++++++-- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/common_runtime/placer.cc b/tensorflow/core/common_runtime/placer.cc index 305d6a3b1bd..f8d933b45e0 100644 --- a/tensorflow/core/common_runtime/placer.cc +++ b/tensorflow/core/common_runtime/placer.cc @@ -32,6 +32,7 @@ limitations under the License. #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/util/port.h" namespace tensorflow { @@ -378,11 +379,20 @@ class ColocationGraph { } std::sort(device_names.begin(), device_names.end()); + string gpu_msg = ""; + if (!IsGoogleCudaEnabled() && + str_util::Lowercase(specified_device_name.type) == "gpu") { + gpu_msg = + " The requested device appears to be a GPU, but CUDA is not " + "enabled."; + } + return errors::InvalidArgument( - "Operation was explicitly assigned to ", - node->requested_device(), " but available devices are [ ", + errors::FormatNodeNameForError(node->name()), + "was explicitly assigned to ", node->requested_device(), + " but available devices are [ ", str_util::Join(device_names, ", "), " ]. 
Make sure ", - "the device specification refers to a valid device."); + "the device specification refers to a valid device.", gpu_msg); } else if (specified_device_name.has_type) { return errors::InvalidArgument( "Could not satisfy explicit device specification '", diff --git a/tensorflow/core/common_runtime/placer_test.cc b/tensorflow/core/common_runtime/placer_test.cc index d5e98b8d9e8..69f1611c1dd 100644 --- a/tensorflow/core/common_runtime/placer_test.cc +++ b/tensorflow/core/common_runtime/placer_test.cc @@ -1199,10 +1199,33 @@ TEST_F(PlacerTest, TestNonExistentDevice) { EXPECT_EQ(error::INVALID_ARGUMENT, s.code()); LOG(WARNING) << s.error_message(); EXPECT_TRUE(str_util::StrContains( - s.error_message(), - "was explicitly assigned to /job:foo/replica:17 but available devices")); + s.error_message(), "was explicitly assigned to /job:foo/replica:17")); + EXPECT_TRUE( + str_util::StrContains(s.error_message(), "but available devices")); } +#if !GOOGLE_CUDA +// Test that we inform the user if they appear to be explicitly placing nodes +// on a GPU when CUDA is not available +TEST_F(PlacerTest, TestUseGpuWithNoCuda) { + Graph g(OpRegistry::Global()); + { // Scope for temporary variables used to construct g. + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + ops::SourceOp("VariableGPU", + b.opts().WithName("var").WithDevice("/device:gpu:0")); + TF_EXPECT_OK(BuildGraph(b, &g)); + } + + SessionOptions options; + Status s = Place(&g, &options); + EXPECT_EQ(error::INVALID_ARGUMENT, s.code()); + LOG(WARNING) << s.error_message(); + EXPECT_TRUE(str_util::StrContains( + s.error_message(), + "The requested device appears to be a GPU, but CUDA is not enabled.")); +} +#endif + TEST_F(PlacerTest, TestUnsupportedDeviceAllowSoftPlacement) { Graph g(OpRegistry::Global()); { // Scope for temporary variables used to construct g. 
From cb6fc1dfea89d143faf4efe9083a6699a07602f5 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Wed, 7 Nov 2018 11:13:18 -0800 Subject: [PATCH 246/540] [XLA:GPU] Disable lateral MOF of dynamic-update-slice The emitter doesn't support in-place DUS when it's in a multi-output fusion, making performance a lot worse. Disable it for now. PiperOrigin-RevId: 220495950 --- .../xla/service/gpu/multi_output_fusion.cc | 12 +++++++ .../service/gpu/multi_output_fusion_test.cc | 34 +++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc index 9427d3d54ad..d9b06828e2b 100644 --- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc @@ -140,6 +140,18 @@ bool GpuMultiOutputFusion::LegalToFuse(HloInstruction* instr1, return false; } + // The emitter only supports in-place DUS for fusions with a single DUS at the + // root. Don't sibling fuse DUS for now. + // TODO(b/119178699): Multi-output fusing DUS can improve performance if we + // share the input and output buffers and add support to the emitter. + if (instr1->fused_expression_root()->opcode() == + HloOpcode::kDynamicUpdateSlice || + (instr2->opcode() == HloOpcode::kFusion && + instr2->fused_expression_root()->opcode() == + HloOpcode::kDynamicUpdateSlice)) { + return false; + } + // Do this check last, as it may be expensive. 
return !GpuInstructionFusion::FusionWouldBeTooLarge(instr1, instr2); } diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc index 1d4856e0cae..58ee684c473 100644 --- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc +++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc @@ -621,5 +621,39 @@ TEST_F(MultiOutputFusionTest, AvoidsLargeFusion) { } } +TEST_F(MultiOutputFusionTest, MultiOutputFusionDUS) { + auto module = ParseHloString(R"(HloModule dus_mof + fusion.1 { + p.0 = f16[50,96,1024]{2,1,0} parameter(0) + p.1 = s32[1]{0} parameter(1) + p.2 = f16[1,96,1024]{2,1,0} parameter(2) + c.0 = s32[] constant(0) + pad = s32[3]{0} pad(p.1, c.0), padding=0_2 + ROOT %dynamic-update-slice = f16[50,96,1024]{2,1,0} dynamic-update-slice(p.0, p.2, pad) + } + + fusion.2 { + p.0 = f16[50,96,1024]{2,1,0} parameter(0) + p.1 = s32[1]{0} parameter(1) + p.2 = f16[1,96,1024]{2,1,0} parameter(2) + c.0 = s32[] constant(0) + pad = s32[3]{0} pad(p.1, c.0), padding=0_2 + ROOT %dynamic-update-slice = f16[50,96,1024]{2,1,0} dynamic-update-slice(p.0, p.2, pad) + } + + ENTRY entry { + p.00 = f16[50,96,1024]{2,1,0} parameter(0) + p.01 = f16[50,96,1024]{2,1,0} parameter(1) + p.1 = s32[1]{0} parameter(2) + p.2 = f16[1,96,1024]{2,1,0} parameter(3) + + f1 = f16[50,96,1024] fusion(p.00, p.1, p.2), kind=kLoop, calls=fusion.1 + f2 = f16[50,96,1024] fusion(p.01, p.1, p.2), kind=kLoop, calls=fusion.2 + ROOT tuple = (f16[50,96,1024],f16[50,96,1024]) tuple(f1, f2) + })") + .ValueOrDie(); + ASSERT_FALSE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); +} + } // namespace gpu } // namespace xla From 6504db67b81c166a6cc734f4cacabd763b3eb595 Mon Sep 17 00:00:00 2001 From: mbhuiyan Date: Wed, 7 Nov 2018 11:23:13 -0800 Subject: [PATCH 247/540] adding comments for MKL DNN --- .../direct_session_with_tracking_alloc_test.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git 
a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc index a48ff0541d5..6a265c468c1 100644 --- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc +++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc @@ -107,6 +107,16 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelTest) { EXPECT_EQ(2, shape.dim_size()); EXPECT_EQ(2, shape.dim(0).size()); EXPECT_EQ(1, shape.dim(1).size()); + // if MKL is used, it goes through additional + // graph rewrite pass on top of Tensorflow. + // In TF, every time a graph pass + // happens, "constant" nodes are allocated + // and deallocated. Each allocation calls the + // (FindChunkPtr of BFCAllocator), + // which increments the value of AllocationId. + // Thus AllocationId of MKL can differ with TF if + // someone changes the relevant codes in BFCAllocator. + // Currently they are the same. if (node->name() == y->name()) { EXPECT_EQ(13, cm->AllocationId(node, 0)); } else { From 449eec6dcaa1c0ab828a7c1a0e2383d49db435d1 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Wed, 7 Nov 2018 11:27:38 -0800 Subject: [PATCH 248/540] [TF:XLA] Fix wrong output for Pad with int64 pad amounts. Re-enable some long-disabled cases in randomized test for Pad, which oddly enough turned out to still trigger bugs. Change test harness to form padding quantities by subtraction rather than addition, which stops it from trying to build unreasonably large tensors. 
PiperOrigin-RevId: 220498750 --- tensorflow/compiler/tests/binary_ops_test.py | 9 ++++--- tensorflow/compiler/tests/randomized_tests.cc | 15 +++++------ tensorflow/compiler/tf2xla/kernels/pad_op.cc | 25 ++++++++++--------- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index 4e6dd6abfc9..d52d7c35e73 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -18,6 +18,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import itertools + import numpy as np from tensorflow.compiler.tests import xla_test @@ -996,13 +998,14 @@ class BinaryOpsTest(xla_test.XLATestCase): expected=np.array([[[[1], [2]], [[3], [4]]]], dtype=dtype)) def testPad(self): - for dtype in self.numeric_types: + for dtype, pad_type in itertools.product( + self.numeric_types, [np.int32, np.int64]): self._testBinary( array_ops.pad, np.array( [[1, 2, 3], [4, 5, 6]], dtype=dtype), np.array( - [[1, 2], [2, 1]], dtype=np.int32), + [[1, 2], [2, 1]], dtype=pad_type), expected=np.array( [[0, 0, 0, 0, 0, 0], [0, 0, 1, 2, 3, 0], @@ -1016,7 +1019,7 @@ class BinaryOpsTest(xla_test.XLATestCase): np.array( [[1, 2, 3], [4, 5, 6]], dtype=dtype), np.array( - [[0, 3], [2, 1]], dtype=np.int32), + [[0, 3], [2, 1]], dtype=pad_type), expected=np.array( [[7, 7, 1, 2, 3, 7], [7, 7, 4, 5, 6, 7], diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc index cfccf5f3d2a..a6b58020126 100644 --- a/tensorflow/compiler/tests/randomized_tests.cc +++ b/tensorflow/compiler/tests/randomized_tests.cc @@ -2466,20 +2466,21 @@ TEST_F(OpTest, Pack) { }); } -// TODO(b/31741898): crashes on GPU. TEST_F(OpTest, Pad) { Repeatedly([this]() { auto type = Choose(kAllXlaTypes); std::vector t_dims = RandomDims(); - // TODO(b/31741996): re-enable DT_INT64 when bug is fixed. 
- // DataType tpaddings = Choose({DT_INT32, DT_INT64}); - DataType tpaddings = DT_INT32; + DataType tpaddings = Choose({DT_INT32, DT_INT64}); std::vector paddings_vec; - std::uniform_int_distribution distribution(0, 7); for (int i = 0; i < t_dims.size(); ++i) { - paddings_vec.push_back(distribution(generator())); - paddings_vec.push_back(distribution(generator())); + std::uniform_int_distribution pad_distribution(0, t_dims[i]); + int pad_size = pad_distribution(generator()); + std::uniform_int_distribution lower_distribution(0, pad_size); + int low_pad_size = lower_distribution(generator()); + paddings_vec.push_back(low_pad_size); + paddings_vec.push_back(pad_size - low_pad_size); + t_dims[i] -= pad_size; } Tensor paddings; CHECK( diff --git a/tensorflow/compiler/tf2xla/kernels/pad_op.cc b/tensorflow/compiler/tf2xla/kernels/pad_op.cc index 3f5445b4821..ee03352c582 100644 --- a/tensorflow/compiler/tf2xla/kernels/pad_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/pad_op.cc @@ -29,8 +29,8 @@ class PadOp : public XlaOpKernel { explicit PadOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - const TensorShape input_shape = ctx->InputShape(0); - const TensorShape pad_shape = ctx->InputShape(1); + const TensorShape input_shape = ctx->InputShape("input"); + const TensorShape pad_shape = ctx->InputShape("paddings"); const int dims = input_shape.dims(); OP_REQUIRES( ctx, @@ -47,22 +47,22 @@ class PadOp : public XlaOpKernel { "The first dimension of paddings must be the rank of inputs", pad_shape.DebugString(), " ", input_shape.DebugString())); + xla::XlaOp input = ctx->Input("input"); if (fixed_dims == 0) { // Tensor is rank 0. Return it unchanged. - ctx->SetOutput(0, ctx->Input(0)); + ctx->SetOutput(0, input); return; } - // Evaluate the 'padding' constant input, reshaping to a matrix. 
xla::Literal pad_literal; - OP_REQUIRES_OK( - ctx, ctx->ConstantInputReshaped(1, {fixed_dims, 2}, &pad_literal)); + OP_REQUIRES_OK(ctx, + ctx->ConstantInputAsInt64Literal("paddings", &pad_literal)); xla::PaddingConfig config; for (int i = 0; i < fixed_dims; ++i) { auto* dim = config.add_dimensions(); - int before = pad_literal.Get({i, 0}); - int after = pad_literal.Get({i, 1}); + int before = pad_literal.Get({i, 0}); + int after = pad_literal.Get({i, 1}); OP_REQUIRES(ctx, before >= 0 && after >= 0, errors::InvalidArgument( "Paddings must be non-negative: ", before, " ", after)); @@ -73,12 +73,13 @@ class PadOp : public XlaOpKernel { // PadV2 added a "constant_values" input that indicates the pad value. xla::XlaOp constant_values; if (ctx->num_inputs() == 3) { - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(ctx->InputShape(2)), - errors::InvalidArgument("constant_values must be a scalar.")); - ctx->SetOutput(0, xla::Pad(ctx->Input(0), ctx->Input(2), config)); + OP_REQUIRES( + ctx, TensorShapeUtils::IsScalar(ctx->InputShape("constant_values")), + errors::InvalidArgument("constant_values must be a scalar.")); + ctx->SetOutput(0, xla::Pad(input, ctx->Input("constant_values"), config)); } else { auto zero = XlaHelpers::Zero(ctx->builder(), input_type(0)); - ctx->SetOutput(0, xla::Pad(ctx->Input(0), zero, config)); + ctx->SetOutput(0, xla::Pad(input, zero, config)); } } }; From 0ef9741fb159a8284dcb9ab9645cc5a124d0f722 Mon Sep 17 00:00:00 2001 From: Bjarke Hammersholt Roune Date: Wed, 7 Nov 2018 11:29:18 -0800 Subject: [PATCH 249/540] Let HloModule::Clone("") preserve the cloned name. 
PiperOrigin-RevId: 220499026 --- tensorflow/compiler/xla/service/hlo_module.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc index 6a838b7eb96..14bf17f4be1 100644 --- a/tensorflow/compiler/xla/service/hlo_module.cc +++ b/tensorflow/compiler/xla/service/hlo_module.cc @@ -559,7 +559,8 @@ std::unique_ptr HloModule::Clone(const string& suffix) const { std::unique_ptr HloModule::Clone(const HloModuleConfig& config, const string& suffix) const { VLOG(1) << "Cloning module :" << name_ << " --> " << suffix << "\n"; - auto module = absl::make_unique(name_ + "-" + suffix, config); + auto module = absl::make_unique( + absl::StrCat(name_, suffix.empty() ? "" : "-", suffix), config); HloCloneContext context(module.get(), suffix); auto cloned_computation = entry_computation_->Clone(suffix, &context); From 44fb8a750e563392e4aa4b7c6de5d7f56d1c65a8 Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Wed, 7 Nov 2018 11:32:34 -0800 Subject: [PATCH 250/540] Tweak comment on XlaClusterInfo's default constructor. PiperOrigin-RevId: 220499695 --- tensorflow/compiler/jit/encapsulate_util.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/jit/encapsulate_util.h b/tensorflow/compiler/jit/encapsulate_util.h index 304f9f31205..5e0c4bf6a0c 100644 --- a/tensorflow/compiler/jit/encapsulate_util.h +++ b/tensorflow/compiler/jit/encapsulate_util.h @@ -117,11 +117,14 @@ Status PreprocessForEncapsulation(Graph* g, // Information for XLA computation. struct XlaClusterInfo { - // The implicit default constructor is deleted because host_compute_core is a - // const member whose type (std::map) doesn't necessarily have a user provided - // constructor - while libc++ and libstdc++ 4.8 provide a user defined - // default constructor, libstdc++ at least >= 7.3 does not. - // See also c++11 [class.ctor] p5. 
+ // Add an explicitly-defined default constructor for this class. + // + // The compiler may delete the default constructor here because + // host_compute_core is a const member whose type (std::map) doesn't + // necessarily have a user provided constructor -- while libc++ and + // libstdc++ 4.8 provide a user defined default constructor, libstdc++ at + // least >= 7.3 does not. See also c++11 [class.ctor] p5. + // // TODO(klimek): In c++17 we'll be able to initialize host_compute_core // without losing aggregate initialization, which allows us to get rid of // the constructor definitions again. From bdc9d807853d1b354dc2ae4681857baf4858cd00 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Wed, 7 Nov 2018 11:43:11 -0800 Subject: [PATCH 251/540] [TF:XLA] Bump open source llvm revision to r346295 PiperOrigin-RevId: 220501794 --- tensorflow/workspace.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 18a0ba6b197..a76d7a55707 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -473,11 +473,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "llvm", build_file = clean_dep("//third_party/llvm:llvm.autogenerated.BUILD"), - sha256 = "e13fec9469075ab7dd4dbc046c7e37bd8330557aebbe42e3f1de2bad5adb8f2c", - strip_prefix = "llvm-b9e341e90c167b92136893c9085343aa254ddd37", + sha256 = "b2a8e45e3c6feb2d122307edc4fd18364986c66dec5cb9a19969490256d9d545", + strip_prefix = "llvm-2e8ff631d6999258f3503c72ef8d9d69c7bd35b8", urls = [ - "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/b9e341e90c167b92136893c9085343aa254ddd37.tar.gz", - "https://github.com/llvm-mirror/llvm/archive/b9e341e90c167b92136893c9085343aa254ddd37.tar.gz", + "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/2e8ff631d6999258f3503c72ef8d9d69c7bd35b8.tar.gz", + "https://github.com/llvm-mirror/llvm/archive/2e8ff631d6999258f3503c72ef8d9d69c7bd35b8.tar.gz", ], ) 
From 6ca494f8f8eab3a8603dbd4e2ccc22f4f9e1806f Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 7 Nov 2018 11:44:55 -0800 Subject: [PATCH 252/540] parallel_for: add converter for MaxPoolGradGrad. PiperOrigin-RevId: 220502069 --- .../python/ops/parallel_for/control_flow_ops_test.py | 7 +++++-- tensorflow/python/ops/parallel_for/pfor.py | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py index e86f409d68b..1826924b47e 100644 --- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py +++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py @@ -792,9 +792,12 @@ class NNTest(PForTest): output = nn.max_pool( x1, ksize, strides=[1, 2, 2, 1], padding="VALID", data_format="NHWC") loss = nn.l2_loss(output) - return output, gradient_ops.gradients(loss, x1) + ones = array_ops.ones_like(output) + grad = gradient_ops.gradients(loss, x1, grad_ys=ones) + grad_grad = gradient_ops.gradients(grad, ones) + return output, grad, grad_grad - self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2) + self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 3) def test_fused_batch_norm(self): data_formats = ["NHWC"] diff --git a/tensorflow/python/ops/parallel_for/pfor.py b/tensorflow/python/ops/parallel_for/pfor.py index 5d10860e943..72441908ec2 100644 --- a/tensorflow/python/ops/parallel_for/pfor.py +++ b/tensorflow/python/ops/parallel_for/pfor.py @@ -1304,6 +1304,7 @@ def _inputs_with_flattening(pfor_input, input_indices): @RegisterPForWithArgs("AvgPool", dims=[0]) @RegisterPForWithArgs("MaxPool", dims=[0]) @RegisterPForWithArgs("MaxPoolGrad", dims=[0, 1, 2]) +@RegisterPForWithArgs("MaxPoolGradGrad", dims=[0, 1, 2]) @RegisterPForWithArgs("SoftmaxCrossEntropyWithLogits", dims=[0, 1]) def _convert_flatten_batch(pfor_input, op_type, dims): del op_type From ee054e9826875de8cee089a02091a03c616687ce Mon Sep 
17 00:00:00 2001 From: Peter Hawkins Date: Wed, 7 Nov 2018 11:47:00 -0800 Subject: [PATCH 253/540] [TF:XLA] Fix more int32/int64 literal bugs. Make XlaOpKernelContext::ConstantInputReshaped private, and change its users to use other methods, since its use cases can be handled other ways. Move code to copy a tensor to a literal into common literal_util module and simplify it. PiperOrigin-RevId: 220502409 --- tensorflow/compiler/tf2xla/kernels/fill_op.cc | 21 ++++------- .../compiler/tf2xla/kernels/mirror_pad_op.cc | 9 +++-- .../compiler/tf2xla/kernels/reverse_op.cc | 9 ++--- .../compiler/tf2xla/kernels/shape_op.cc | 15 ++++---- .../compiler/tf2xla/kernels/transpose_op.cc | 18 ++++------ tensorflow/compiler/tf2xla/literal_util.cc | 6 ++++ tensorflow/compiler/tf2xla/literal_util.h | 5 +++ tensorflow/compiler/tf2xla/xla_op_kernel.cc | 35 +++++++------------ tensorflow/compiler/tf2xla/xla_op_kernel.h | 18 +++++----- 9 files changed, 62 insertions(+), 74 deletions(-) diff --git a/tensorflow/compiler/tf2xla/kernels/fill_op.cc b/tensorflow/compiler/tf2xla/kernels/fill_op.cc index e9bdb15aa0c..c7938a4bbfa 100644 --- a/tensorflow/compiler/tf2xla/kernels/fill_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/fill_op.cc @@ -33,8 +33,8 @@ class FillOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { // The output of this Op is a tensor of shape 'dims_shape' with each // element set to the scalar 'dims_literal'. 
- const TensorShape dims_shape = ctx->InputShape(0); - const TensorShape value_shape = ctx->InputShape(1); + const TensorShape dims_shape = ctx->InputShape("dims"); + const TensorShape value_shape = ctx->InputShape("value"); OP_REQUIRES( ctx, IsLegacyVector(dims_shape), errors::InvalidArgument("dims must be a vector of int32, got shape ", @@ -42,29 +42,22 @@ class FillOp : public XlaOpKernel { OP_REQUIRES(ctx, IsLegacyScalar(value_shape), errors::InvalidArgument("value must be a scalar, got shape ", value_shape.DebugString())); + // Evaluate the 'dims' constant input, reshaping to a vector if it // was a 'legacy' vector (secretly a scalar). - xla::Literal dims_literal; - OP_REQUIRES_OK(ctx, ctx->ConstantInputReshaped( - 0, {dims_shape.num_elements()}, &dims_literal)); + std::vector dims; + OP_REQUIRES_OK(ctx, ctx->ConstantInputReshapedToIntVector("dims", &dims)); - // Convert the dims literal into a vector that we can pass to - // XlaBuilder. - std::vector broadcast; - broadcast.reserve(dims_literal.shape().dimensions(0)); - for (int i = 0; i < dims_literal.shape().dimensions(0); ++i) { - broadcast.push_back(dims_literal.Get({i})); - } // Look up the value input, reshaping to a scalar if it was a // 'legacy' scalar (secretly a vector). - xla::XlaOp data = ctx->Input(1); + xla::XlaOp data = ctx->Input("value"); if (value_shape.dims() > 0) { CHECK_EQ(value_shape.dims(), 1); data = xla::Reshape(data, {}); } // Emit the actual computation, which broadcasts the scalar to the // desired shape. 
- auto result = xla::Broadcast(data, broadcast); + auto result = xla::Broadcast(data, dims); ctx->SetOutput(0, result); } diff --git a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc index 4833a9662dd..0f51c70f277 100644 --- a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc @@ -65,8 +65,8 @@ class MirrorPadOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - const TensorShape input_shape = ctx->InputShape(0); - const TensorShape pad_shape = ctx->InputShape(1); + const TensorShape input_shape = ctx->InputShape("input"); + const TensorShape pad_shape = ctx->InputShape("paddings"); MirrorPadMode mode; OP_REQUIRES_OK(ctx, GetNodeAttr(def(), "mode", &mode)); @@ -93,11 +93,10 @@ class MirrorPadOp : public XlaOpKernel { // Evaluate the 'padding' constant input, reshaping to a matrix. xla::Literal pad_literal; - OP_REQUIRES_OK( - ctx, ctx->ConstantInputReshaped(1, {fixed_dims, 2}, &pad_literal)); + OP_REQUIRES_OK(ctx, ctx->ConstantInput("paddings", &pad_literal)); xla::XlaBuilder* b = ctx->builder(); - auto in0 = ctx->Input(0); + auto in0 = ctx->Input("input"); xla::StatusOr in0_shape = b->GetShape(in0); OP_REQUIRES(ctx, in0_shape.ok(), in0_shape.status()); xla::StatusOr accum_status = diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc index 56b80cb4a29..2ceadaf79c5 100644 --- a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc @@ -51,14 +51,11 @@ class ReverseOp : public XlaOpKernel { } // XlaBuilder::Rev() requires concrete values for dimensions arg. 
xla::Literal lax; - OP_REQUIRES_OK(ctx, ctx->ConstantInputReshaped(1, {x_shape.dims()}, &lax)); - std::vector revdims(x_shape.dims()); - std::copy(lax.data().begin(), lax.data().end(), - revdims.begin()); - std::vector dimensions; + OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &lax)); + std::vector dimensions; for (int d = 0; d < x_shape.dims(); ++d) { - if (revdims[d]) { + if (lax.Get({d})) { dimensions.push_back(d); } } diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc index 37b026aeb05..4fdbeac8d1a 100644 --- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc @@ -108,21 +108,20 @@ class ExpandDimsOp : public XlaOpKernel { explicit ExpandDimsOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - const TensorShape input_shape = ctx->InputShape(0); - const TensorShape dim_shape = ctx->InputShape(1); + const TensorShape input_shape = ctx->InputShape("input"); + const TensorShape dim_shape = ctx->InputShape("dim"); + std::vector dims; + OP_REQUIRES_OK(ctx, ctx->ConstantInputReshapedToIntVector("dim", &dims)); // TODO(phawkins): the standard implementation of ExpandDimsOp seems to // accept legacy scalars, even when they should be forbidden by the graphdef // version. 
- OP_REQUIRES(ctx, dim_shape.num_elements() == 1, + OP_REQUIRES(ctx, dims.size() == 1, errors::InvalidArgument(absl::StrCat( "dim input to ExpandDims must be a scalar; got ", dim_shape.DebugString()))); - xla::Literal literal; - OP_REQUIRES_OK(ctx, ctx->ConstantInputReshaped(1, {1}, &literal)); - - int dim = literal.data()[0]; + int dim = dims[0]; OP_REQUIRES(ctx, (dim >= -1 - input_shape.dims() && dim <= input_shape.dims()), @@ -148,7 +147,7 @@ class ExpandDimsOp : public XlaOpKernel { dim = std::min(dim, existing_dims_size); new_shape.emplace(new_shape.begin() + dim, 1); - ctx->SetOutput(0, xla::Reshape(ctx->Input(0), new_shape)); + ctx->SetOutput(0, xla::Reshape(ctx->Input("input"), new_shape)); } }; REGISTER_XLA_OP(Name("ExpandDims").CompileTimeConstantInput("dim"), diff --git a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc index 48a211942d7..c9b324a243e 100644 --- a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc @@ -37,8 +37,8 @@ class TransposeOp : public XlaOpKernel { : XlaOpKernel(ctx), conjugate_(conjugate) {} void Compile(XlaOpKernelContext* ctx) override { - const TensorShape input_shape = ctx->InputShape(0); - const TensorShape perm_tensor_shape = ctx->InputShape(1); + const TensorShape input_shape = ctx->InputShape("x"); + const TensorShape perm_tensor_shape = ctx->InputShape("perm"); // Preliminary validation of sizes. OP_REQUIRES(ctx, TensorShapeUtils::IsVector(perm_tensor_shape), @@ -52,19 +52,15 @@ class TransposeOp : public XlaOpKernel { ". 
But input(1) is a vector of size ", perm_tensor_shape.num_elements())); - xla::Literal literal; - OP_REQUIRES_OK(ctx, ctx->ConstantInputReshaped(1, {dims}, &literal)); - - std::vector perm(dims); - std::copy(literal.data().begin(), literal.data().end(), - perm.begin()); + std::vector perm; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector("perm", &perm)); std::vector transposed_order; // Check whether permutation is a permutation of integers of [0 .. dims). absl::InlinedVector bits(dims); bool is_identity = true; for (int i = 0; i < dims; ++i) { - const int32 d = perm[i]; + const int64 d = perm[i]; OP_REQUIRES( ctx, 0 <= d && d < dims, errors::InvalidArgument(d, " is out of range [0 .. ", dims, ")")); @@ -83,9 +79,9 @@ class TransposeOp : public XlaOpKernel { xla::XlaOp transposed; // 0-D, 1-D, and identity transposes do nothing. if (dims <= 1 || is_identity) { - transposed = ctx->Input(0); + transposed = ctx->Input("x"); } else { - transposed = xla::Transpose(ctx->Input(0), transposed_order); + transposed = xla::Transpose(ctx->Input("x"), transposed_order); } // Conjugate the transposed result if this is ConjugateTransposeOp. 
diff --git a/tensorflow/compiler/tf2xla/literal_util.cc b/tensorflow/compiler/tf2xla/literal_util.cc index 20103ec3ae0..67d08290033 100644 --- a/tensorflow/compiler/tf2xla/literal_util.cc +++ b/tensorflow/compiler/tf2xla/literal_util.cc @@ -32,6 +32,12 @@ Status HostTensorToBorrowingLiteral(const Tensor& host_tensor, return Status::OK(); } +xla::StatusOr HostTensorToLiteral(const Tensor& host_tensor) { + xla::BorrowingLiteral literal; + TF_RETURN_IF_ERROR(HostTensorToBorrowingLiteral(host_tensor, &literal)); + return literal.Clone(); +} + Status HostTensorToMutableBorrowingLiteral( Tensor* host_tensor, xla::MutableBorrowingLiteral* literal) { xla::Shape xla_shape; diff --git a/tensorflow/compiler/tf2xla/literal_util.h b/tensorflow/compiler/tf2xla/literal_util.h index 1db7470ee2a..a153dddee61 100644 --- a/tensorflow/compiler/tf2xla/literal_util.h +++ b/tensorflow/compiler/tf2xla/literal_util.h @@ -30,6 +30,11 @@ namespace tensorflow { // 'host_tensor'. Status HostTensorToBorrowingLiteral(const Tensor& host_tensor, xla::BorrowingLiteral* literal); + +// Returns a Literal with the contents of 'host_tensor', backed by its own +// storage (i.e., not reusing 'host_tensor's buffers.) +xla::StatusOr HostTensorToLiteral(const Tensor& host_tensor); + // Returns a MutableBorrowingLiteral that utilizes the same underlying buffer // owned by 'host_tensor', but is mutable via the xla::Literal methods. 
Status HostTensorToMutableBorrowingLiteral( diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc index 4b1634f6974..227915f5703 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc @@ -136,25 +136,6 @@ Status XlaOpKernelContext::ConstantInputReshaped( } const XlaExpression* expression = CastExpressionFromTensor(tensor); - auto copy_tensor_to_literal = [](const Tensor& tensor, - xla::Literal* literal) { - xla::Shape literal_shape; - TF_RETURN_IF_ERROR( - TensorShapeToXLAShape(tensor.dtype(), tensor.shape(), &literal_shape)); - - *literal = xla::Literal(literal_shape); - - // memcpy over the payload ... - // TODO(phawkins): handle string types. - size_t total_bytes = tensor.TotalBytes(); - if (total_bytes > 0) { - void* dst_ptr = literal->untyped_data(); - const void* src_ptr = DMAHelper::base(&tensor); - memcpy(dst_ptr, src_ptr, total_bytes); - } - return Status::OK(); - }; - // If the tensor has a known constant value, there is no need to invoke XLA. if (expression->has_constant_value()) { Tensor temp(tensor.dtype()); @@ -164,14 +145,15 @@ Status XlaOpKernelContext::ConstantInputReshaped( return errors::Internal("Incompatible shapes in ConstantInputReshaped."); } - return copy_tensor_to_literal(temp, constant_literal); + TF_ASSIGN_OR_RETURN(*constant_literal, HostTensorToLiteral(temp)); + return Status::OK(); } // Make sure we treat zero-element tensors as constant. 
if (new_shape.num_elements() == 0) { Tensor temp(tensor.dtype(), new_shape); - - return copy_tensor_to_literal(temp, constant_literal); + TF_ASSIGN_OR_RETURN(*constant_literal, HostTensorToLiteral(temp)); + return Status::OK(); } xla::XlaOp handle = expression->handle(); @@ -322,6 +304,15 @@ Status XlaOpKernelContext::ConstantInputReshapedToIntVector( return LiteralToInt64Vector(literal, out); } +Status XlaOpKernelContext::ConstantInputReshapedToIntVector( + absl::string_view name, std::vector* out) { + TF_ASSIGN_OR_RETURN(int index, InputIndex(this, name)); + xla::Literal literal; + TF_RETURN_IF_ERROR(ConstantInputReshaped( + index, {InputShape(index).num_elements()}, &literal)); + return LiteralToInt64Vector(literal, out); +} + Status XlaOpKernelContext::ConstantInputAsInt64Literal(int index, xla::Literal* out) { xla::Literal literal; diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h index aa00a454968..3d9499f5fae 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.h +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h @@ -111,14 +111,6 @@ class XlaOpKernelContext { Status ConstantInput(int index, xla::Literal* constant_literal); Status ConstantInput(absl::string_view name, xla::Literal* constant_literal); - // Evaluates input `index`, reshapes it to `new_shape` if new_shape != - // InputShape(index), and stores it in `*constant_literal`. If the input - // cannot be evaluated, e.g., because it depends on unbound parameters, - // returns a non-Ok status. If InputShape(index).num_elements() != - // new_shape.num_elements(), returns an error status. - Status ConstantInputReshaped(int index, absl::Span new_dims, - xla::Literal* constant_literal); - // Converts a constant scalar int32 or int64 tensor into an int64. 
Status ConstantInputAsIntScalar(int index, int64* out); Status ConstantInputAsIntScalar(absl::string_view name, int64* out); @@ -134,6 +126,8 @@ class XlaOpKernelContext { // Reshapes and converts a constant int32 or int64 tensor into a vector of // int64s. Status ConstantInputReshapedToIntVector(int index, std::vector* out); + Status ConstantInputReshapedToIntVector(absl::string_view name, + std::vector* out); // Converts a constant int32 or int64 Tensor into an xla int64 Literal. Status ConstantInputAsInt64Literal(int index, xla::Literal* out); @@ -260,6 +254,14 @@ class XlaOpKernelContext { // type to allow mapping for variant to more generic types. Status allocate_output(int index, const xla::Shape& shape, Tensor** output); + // Evaluates input `index`, reshapes it to `new_shape` if new_shape != + // InputShape(index), and stores it in `*constant_literal`. If the input + // cannot be evaluated, e.g., because it depends on unbound parameters, + // returns a non-Ok status. If InputShape(index).num_elements() != + // new_shape.num_elements(), returns an error status. 
+ Status ConstantInputReshaped(int index, absl::Span new_dims, + xla::Literal* constant_literal); + OpKernelContext* const context_; }; From 5b373961c662765d8a5b9f469e508476f719fa54 Mon Sep 17 00:00:00 2001 From: Alexandre Passos Date: Wed, 7 Nov 2018 11:56:39 -0800 Subject: [PATCH 254/540] First cut of pruning for wrap_function PiperOrigin-RevId: 220504107 --- tensorflow/python/eager/BUILD | 13 +++ tensorflow/python/eager/def_function.py | 64 +------------ tensorflow/python/eager/function.py | 2 +- tensorflow/python/eager/lift_to_graph.py | 90 +++++++++++++++++++ tensorflow/python/eager/wrap_function.py | 41 ++++++++- tensorflow/python/eager/wrap_function_test.py | 17 ++++ 6 files changed, 162 insertions(+), 65 deletions(-) create mode 100644 tensorflow/python/eager/lift_to_graph.py diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD index 3b4fcd2d977..9d6c4f73375 100644 --- a/tensorflow/python/eager/BUILD +++ b/tensorflow/python/eager/BUILD @@ -408,6 +408,7 @@ py_library( deps = [ ":context", ":function", + ":lift_to_graph", "//tensorflow/python:cond_v2", # TODO(b/118513001): Imported via control_flow_ops; remove. 
"//tensorflow/python:control_flow_ops", "//tensorflow/python:framework_ops", @@ -418,6 +419,17 @@ py_library( ], ) +py_library( + name = "lift_to_graph", + srcs = ["lift_to_graph.py"], + srcs_version = "PY2AND3", + visibility = ["//tensorflow:internal"], + deps = [ + ":context", + "//tensorflow/python:framework_ops", + ], +) + py_test( name = "def_function_test", srcs = ["def_function_test.py"], @@ -438,6 +450,7 @@ py_library( deps = [ ":context", ":function", + ":lift_to_graph", "//tensorflow/python:framework_ops", "//tensorflow/python:template", "//tensorflow/python:variable_scope", diff --git a/tensorflow/python/eager/def_function.py b/tensorflow/python/eager/def_function.py index a56ddeea81b..8c43c38e8b1 100644 --- a/tensorflow/python/eager/def_function.py +++ b/tensorflow/python/eager/def_function.py @@ -19,12 +19,12 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import collections import functools import weakref from tensorflow.python.eager import context from tensorflow.python.eager import function as function_lib +from tensorflow.python.eager import lift_to_graph from tensorflow.python.framework import ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops @@ -34,65 +34,6 @@ from tensorflow.python.training.checkpointable import base as checkpointable from tensorflow.python.util import tf_decorator -def _graph_inputs(op): - return [x.op for x in op.inputs] + list(op.control_inputs) - - -def _lift_to_graph(init_tensor, graph): - """Copies the tensor and all its inputs recursively to the outer graph.""" - # Check that the initializer does not depend on any placeholders. 
- visited_ops = set([]) - ops_to_visit = [init_tensor.op] - op_outputs = collections.defaultdict(set) - while ops_to_visit: - op = ops_to_visit.pop() - if op in visited_ops: - continue - visited_ops.add(op) - # TODO(apassos) distinguish arg placeholders, capture placeholders, - # and placeholders the user might directly use to initialize - # variables. - if op.type == "Placeholder": - raise ValueError( - "Unable to lift tensor", init_tensor, - "because it depends transitively on placeholder ", op) - for inp in _graph_inputs(op): - op_outputs[inp].add(op) - if inp not in visited_ops: - ops_to_visit.append(inp) - # Topologically sort the nodes we've extracted. Now we know how many of their - # outputs are part of this subgraph. - ops_to_copy = [] - marked_ops = set([]) - ops_to_visit = [init_tensor.op] - while ops_to_visit: - op = ops_to_visit.pop() - if op in marked_ops: - continue - marked_ops.add(op) - ops_to_copy.append(op) - for inp in _graph_inputs(op): - if all(x in marked_ops for x in op_outputs[inp]): - ops_to_visit.append(inp) - assert len(ops_to_copy) == len(visited_ops) - # ops_to_copy now holds a reverse topologically sorted list of ops which - # ends in the initializer. We copy those to the outermost graph and - # build the initialization op there. - with graph.as_default(): - op_map = {} - for op in reversed(ops_to_copy): - copied_inputs = [op_map[x] for x in op.inputs] - copied_control_inputs = [op_map[x] for x in op.control_inputs] - with ops.control_dependencies(copied_control_inputs): - copied_op = graph.create_op( - op.type, copied_inputs, [x.dtype for x in op.outputs], - attrs=op.node_def.attr) - op_map[op] = copied_op - for i, o in enumerate(op.outputs): - op_map[o] = copied_op.outputs[i] - return op_map[init_tensor] - - class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable): """Variable which does not lift its initializer out of function context. 
@@ -206,7 +147,8 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable): if self._in_graph_mode: with ops.init_scope(): outer_graph = ops.get_default_graph() - lifted_initializer = _lift_to_graph(initial_value, outer_graph) + lifted_initializer = lift_to_graph.lift_to_graph( + initial_value, outer_graph)[initial_value] with ops.init_scope(): self._initial_value = lifted_initializer with ops.name_scope("IsInitialized"): diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index aa0763d115b..17c9533ac7a 100644 --- a/tensorflow/python/eager/function.py +++ b/tensorflow/python/eager/function.py @@ -341,7 +341,7 @@ class Function(object): "wrap_function-decorated function.") return self._call_flat(args) raise AssertionError( - "Tried to call a concrete function obtained from an interal API " + "Tried to call a concrete function obtained from an internal API " "through the public interface. Use get_concrete_function instead.") if len(args) > self._num_positional_args: raise TypeError( diff --git a/tensorflow/python/eager/lift_to_graph.py b/tensorflow/python/eager/lift_to_graph.py new file mode 100644 index 00000000000..67eb60289fb --- /dev/null +++ b/tensorflow/python/eager/lift_to_graph.py @@ -0,0 +1,90 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +# pylint: disable=unidiomatic-typecheck +"""Utility to lift subgraphs.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops + + +def _graph_inputs(op): + return [x.op for x in op.inputs] + list(op.control_inputs) + + +def lift_to_graph(init_tensor, graph, sources=None): + """Copies the tensor and all its inputs recursively to the outer graph.""" + # Check that the initializer does not depend on any placeholders. + if sources is None: + sources = set([]) + visited_ops = set([x.op for x in sources]) + ops_to_visit = [init_tensor.op] + op_outputs = collections.defaultdict(set) + print("ops_to_visit", ops_to_visit) + while ops_to_visit: + op = ops_to_visit.pop() + print("visiting", op) + if op in visited_ops: + continue + visited_ops.add(op) + # TODO(apassos) distinguish arg placeholders, capture placeholders, + # and placeholders the user might directly use to initialize + # variables. + if op.type == "Placeholder": + raise ValueError( + "Unable to lift tensor", init_tensor, + "because it depends transitively on placeholder ", op) + for inp in _graph_inputs(op): + op_outputs[inp].add(op) + if inp not in visited_ops and inp not in sources: + ops_to_visit.append(inp) + # Topologically sort the nodes we've extracted. Now we know how many of their + # outputs are part of this subgraph. 
+ ops_to_copy = [] + marked_ops = set([]) + ops_to_visit = [init_tensor.op] + while ops_to_visit: + op = ops_to_visit.pop() + if op in marked_ops: + continue + marked_ops.add(op) + ops_to_copy.append(op) + for inp in _graph_inputs(op): + if all(x in marked_ops for x in op_outputs[inp]) and inp not in sources: + ops_to_visit.append(inp) + assert len(ops_to_copy) == len(visited_ops) + # ops_to_copy now holds a reverse topologically sorted list of ops which + # ends in the initializer. We copy those to the outermost graph and + # build the initialization op there. + with graph.as_default(): + op_map = {} + for s in sources: + op_map[s] = array_ops.placeholder(dtype=s.dtype, shape=s.shape) + for op in reversed(ops_to_copy): + copied_inputs = [op_map[x] for x in op.inputs] + copied_control_inputs = [op_map[x] for x in op.control_inputs] + with ops.control_dependencies(copied_control_inputs): + copied_op = graph.create_op( + op.type, copied_inputs, [x.dtype for x in op.outputs], + attrs=op.node_def.attr) + op_map[op] = copied_op + for i, o in enumerate(op.outputs): + op_map[o] = copied_op.outputs[i] + return op_map diff --git a/tensorflow/python/eager/wrap_function.py b/tensorflow/python/eager/wrap_function.py index 7f9c896adee..48266437ef5 100644 --- a/tensorflow/python/eager/wrap_function.py +++ b/tensorflow/python/eager/wrap_function.py @@ -20,8 +20,12 @@ from __future__ import division from __future__ import print_function from tensorflow.python.eager import function +from tensorflow.python.eager import lift_to_graph from tensorflow.python.framework import func_graph +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops from tensorflow.python.ops import variable_scope +from tensorflow.python.util import nest class VariableHolder(object): @@ -41,6 +45,38 @@ class VariableHolder(object): return self._fn(*args, **kwargs) +class WrappedFunction(function.Function): + """Wraps a tf V1 piece of code in a function.""" + + def 
__init__(self, fn_graph, variable_holder, attrs=None, signature=None): + super(WrappedFunction, self).__init__( + fn_graph, attrs=attrs, signature=signature) + self._variable_holder = variable_holder + + def prune(self, feeds, fetches): + flat_feeds, flat_fetches = nest.flatten(feeds), nest.flatten(fetches) + for f in flat_feeds + flat_fetches: + if not isinstance(f, ops.Tensor): + raise ValueError("Feeds and fetches must be tensors.") + if f.graph is not self._func_graph: + raise ValueError( + "Can only prune function whose feeds and fetches " + "are from this graph (%s). Tensor %s from graph %s" % ( + self._func_graph, f, f.graph)) + with self._func_graph.as_default(): + pruned_graph = func_graph.FuncGraph("pruned") + sink_tensor = array_ops.identity_n(flat_fetches)[0] + lift_map = lift_to_graph.lift_to_graph( + sink_tensor, pruned_graph, sources=flat_feeds) + pruned_graph.outputs.extend(lift_map[x] for x in flat_fetches) + pruned_graph.inputs.extend(lift_map[x] for x in flat_feeds) + pruned_fn = WrappedFunction( + pruned_graph, variable_holder=self._variable_holder) + pruned_fn._num_positional_args = len(flat_feeds) # pylint: disable=protected-access + pruned_fn._arg_keywords = [] # pylint: disable=protected-access + return pruned_fn + + def wrap_function(fn, signature, name=None): """Wraps the TF 1.x function fn into a graph function. @@ -83,12 +119,11 @@ def wrap_function(fn, signature, name=None): the wrapped graph function. 
""" holder = VariableHolder(fn) - fn = function.Function( + return WrappedFunction( func_graph.func_graph_from_py_func( name, holder, args=None, kwargs=None, signature=signature, add_control_dependencies=False), + variable_holder=holder, signature=signature) - fn._variable_holder = holder - return fn diff --git a/tensorflow/python/eager/wrap_function_test.py b/tensorflow/python/eager/wrap_function_test.py index 0690358491d..b32b6ca4269 100644 --- a/tensorflow/python/eager/wrap_function_test.py +++ b/tensorflow/python/eager/wrap_function_test.py @@ -53,6 +53,23 @@ class WrapFunctionTest(test.TestCase): self.assertAllEqual(f_sub(1.0), 4.0) self.assertAllEqual(f_sub(1.0), 3.0) + def testPrune(self): + + x_in = [] + x_out = [] + + def f(x, y): + x_in.append(x) + xx = x * x + x_out.append(xx) + return xx, 2 * y*y + + f_wrapped = wrap_function.wrap_function( + f, [tensor_spec.TensorSpec((), dtypes.float32)] * 2) + + f_pruned = f_wrapped.prune(x_in[0], [x_out[0]]) + self.assertAllEqual(f_pruned(ops.convert_to_tensor(2.0)), [4.0]) + if __name__ == '__main__': ops.enable_eager_execution() From 81f1d6751b6be87cbe22bd91c5af36a6fef47b78 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Wed, 7 Nov 2018 12:07:26 -0800 Subject: [PATCH 255/540] Use a smart constant instead tf.constant -- one that matches the condition value, in the sense that it will either be a Tensor or a Python value. This change also required adjusting the way side_effect_guards works, which now will gate only the live symbols. A third change adds the live symbols to expression nodes. Motivation: pure-side-effects control flow statements are being augmented with a dummy return value (they just return a single scalar). This provides a return value that can be used as control dependency by side_effect_guards. Such dependency necessarily needs to be a tensor, but we don't want to return a tensor unless the if statement is staged to a cond. This CL makes that return value consistent with the statement itself. 
PiperOrigin-RevId: 220506051 --- .../autograph/converters/control_flow.py | 21 ++++++++++++------- .../converters/side_effect_guards.py | 5 +++-- .../autograph/core/converter_testing.py | 2 ++ .../python/autograph/impl/conversion.py | 2 ++ .../autograph/lang/special_functions.py | 8 +++++++ .../autograph/lang/special_functions_test.py | 19 ++++++++++++----- .../pyct/static_analysis/liveness.py | 7 +++++++ .../python/autograph/pyct/transformer.py | 20 ++++++++++++------ 8 files changed, 63 insertions(+), 21 deletions(-) diff --git a/tensorflow/python/autograph/converters/control_flow.py b/tensorflow/python/autograph/converters/control_flow.py index a7596be2913..f9e5c99fd64 100644 --- a/tensorflow/python/autograph/converters/control_flow.py +++ b/tensorflow/python/autograph/converters/control_flow.py @@ -160,6 +160,10 @@ class ControlFlowTransformer(converter.Base): node_body = ast_util.rename_symbols(node.body, alias_body_map) node_orelse = ast_util.rename_symbols(node.orelse, alias_orelse_map) + cond_var_name = self.ctx.namer.new_symbol('cond', body_scope.referenced) + body_name = self.ctx.namer.new_symbol('if_true', body_scope.referenced) + orelse_name = self.ctx.namer.new_symbol('if_false', orelse_scope.referenced) + returned_from_cond = tuple(returned_from_cond) if returned_from_cond: if len(returned_from_cond) == 1: @@ -181,13 +185,14 @@ class ControlFlowTransformer(converter.Base): # actually has some return value as well. cond_results = None # TODO(mdan): This doesn't belong here; it's specific to the operator. 
- returned_from_body = (templates.replace_as_expression('tf.constant(1)'),) - returned_from_orelse = ( - templates.replace_as_expression('tf.constant(1)'),) - - body_name = self.ctx.namer.new_symbol('if_true', body_scope.referenced) - orelse_name = self.ctx.namer.new_symbol('if_false', orelse_scope.referenced) + returned_from_body = (templates.replace_as_expression( + 'ag__.match_staging_level(1, cond_var_name)', + cond_var_name=cond_var_name),) + returned_from_orelse = (templates.replace_as_expression( + 'ag__.match_staging_level(1, cond_var_name)', + cond_var_name=cond_var_name),) + cond_assign = self.create_assignment(cond_var_name, node.test) body_def = self._create_cond_branch( body_name, aliased_orig_names=aliased_body_orig_names, @@ -200,10 +205,10 @@ class ControlFlowTransformer(converter.Base): aliased_new_names=aliased_orelse_new_names, body=node_orelse, returns=returned_from_orelse) - cond_expr = self._create_cond_expr(cond_results, node.test, body_name, + cond_expr = self._create_cond_expr(cond_results, cond_var_name, body_name, orelse_name) - return body_def + orelse_def + cond_expr + return cond_assign + body_def + orelse_def + cond_expr def _get_loop_state(self, node): body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE) diff --git a/tensorflow/python/autograph/converters/side_effect_guards.py b/tensorflow/python/autograph/converters/side_effect_guards.py index 910c470f978..98e29ec8e1b 100644 --- a/tensorflow/python/autograph/converters/side_effect_guards.py +++ b/tensorflow/python/autograph/converters/side_effect_guards.py @@ -122,11 +122,12 @@ class SideEffectGuardTransformer(converter.Base): # possible, gate all remaining statements (and that may fail too, see # _visit_and_reindent. args_scope = anno.getanno(node.value, NodeAnno.ARGS_SCOPE) + live_out = anno.getanno(node, anno.Static.LIVE_VARS_OUT) # NOTE: We can't guard object attributes because they may not be writable. # In addition, avoid renaming well-known names. 
# TODO(mdan): Move these names into config. - unguarded_names = (qual_names.QN('self'), qual_names.QN('tf')) - guarded_args = tuple(s for s in args_scope.read + unguarded_names = (qual_names.QN('self'), qual_names.QN('ag__')) + guarded_args = tuple(s for s in live_out if not s.is_composite() and s not in unguarded_names) # TODO(mdan): Include all arguments which depended on guarded_args too. diff --git a/tensorflow/python/autograph/core/converter_testing.py b/tensorflow/python/autograph/core/converter_testing.py index 0326103933f..7b0608d03fc 100644 --- a/tensorflow/python/autograph/core/converter_testing.py +++ b/tensorflow/python/autograph/core/converter_testing.py @@ -30,6 +30,7 @@ from tensorflow.python.autograph.core import config from tensorflow.python.autograph.core import converter from tensorflow.python.autograph.core import errors from tensorflow.python.autograph.core import function_wrapping +from tensorflow.python.autograph.lang import special_functions from tensorflow.python.autograph.pyct import compiler from tensorflow.python.autograph.pyct import origin_info from tensorflow.python.autograph.pyct import parser @@ -103,6 +104,7 @@ class TestCase(test.TestCase): fake_ag = self.make_fake_mod('fake_ag', converted_call, converter.ConversionOptions) fake_ag.__dict__.update(operators.__dict__) + fake_ag.__dict__.update(special_functions.__dict__) fake_ag.__dict__['utils'] = utils fake_ag.__dict__['rewrite_graph_construction_error'] = ( errors.rewrite_graph_construction_error) diff --git a/tensorflow/python/autograph/impl/conversion.py b/tensorflow/python/autograph/impl/conversion.py index b2fa2825ebd..a0de6ca6b38 100644 --- a/tensorflow/python/autograph/impl/conversion.py +++ b/tensorflow/python/autograph/impl/conversion.py @@ -45,6 +45,7 @@ from tensorflow.python.autograph.core import config from tensorflow.python.autograph.core import converter from tensorflow.python.autograph.core import errors from tensorflow.python.autograph.core import 
function_wrapping +from tensorflow.python.autograph.lang import special_functions from tensorflow.python.autograph.pyct import ast_util from tensorflow.python.autograph.pyct import compiler from tensorflow.python.autograph.pyct import inspect_utils @@ -272,6 +273,7 @@ def _add_self_references(namespace, autograph_module): # TODO(mdan): Add safeguards against name clashes. # We don't want to create a submodule because we want the operators to be # accessible as ag__. + ag_internal.__dict__.update(special_functions.__dict__) ag_internal.__dict__.update(operators.__dict__) _add_reserved_symbol(namespace, 'ag__', ag_internal) diff --git a/tensorflow/python/autograph/lang/special_functions.py b/tensorflow/python/autograph/lang/special_functions.py index 62ac018ac46..411770692b0 100644 --- a/tensorflow/python/autograph/lang/special_functions.py +++ b/tensorflow/python/autograph/lang/special_functions.py @@ -24,6 +24,7 @@ from __future__ import division from __future__ import print_function from tensorflow.python.autograph.operators import data_structures +from tensorflow.python.framework import constant_op from tensorflow.python.framework import tensor_util @@ -46,6 +47,13 @@ def _validate_list_constructor(elements, element_dtype, element_shape): ' allowed'.format(type(elements))) +def match_staging_level(value, like_value): + """Casts a value to be staged at the same level as another.""" + if tensor_util.is_tensor(like_value): + return constant_op.constant(value) + return value + + def tensor_list(elements, element_dtype=None, element_shape=None, diff --git a/tensorflow/python/autograph/lang/special_functions_test.py b/tensorflow/python/autograph/lang/special_functions_test.py index 206a32d07cd..123ee65b326 100644 --- a/tensorflow/python/autograph/lang/special_functions_test.py +++ b/tensorflow/python/autograph/lang/special_functions_test.py @@ -30,26 +30,35 @@ from tensorflow.python.platform import test class SpecialFunctionsTest(test.TestCase): + def 
test_match_staging_level(self): + some_tensor = constant_op.constant(0) + tensor_one = special_functions.match_staging_level(1, some_tensor) + python_one = special_functions.match_staging_level(1, 1) + with self.cached_session() as sess: + self.assertTrue(tensor_util.is_tensor(tensor_one)) + self.assertAllEqual(sess.run(tensor_one), 1) + self.assertEqual(python_one, 1) + def test_tensor_list_empty_list(self): l = special_functions.tensor_list([], element_dtype=dtypes.int32, element_shape=()) sl = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32) - with self.test_session() as sess: + with self.cached_session() as sess: self.assertAllEqual(sess.run(sl), []) l = special_functions.tensor_list((), element_dtype=dtypes.int32, element_shape=()) sl = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32) - with self.test_session() as sess: + with self.cached_session() as sess: self.assertAllEqual(sess.run(sl), []) def test_tensor_list_tensor(self): l = special_functions.tensor_list( constant_op.constant([], dtype=dtypes.int32)) sl = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32) - with self.test_session() as sess: + with self.cached_session() as sess: self.assertAllEqual(sess.run(sl), []) def test_tensor_list_unsupported_initializer(self): @@ -66,7 +75,7 @@ class SpecialFunctionsTest(test.TestCase): l = special_functions.tensor_list(elements) sl = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32) - with self.test_session() as sess: + with self.cached_session() as sess: self.assertAllEqual(sess.run(sl), [[1, 2], [3, 4]]) def test_tensor_list_array_from_elements(self): @@ -74,7 +83,7 @@ class SpecialFunctionsTest(test.TestCase): l = special_functions.tensor_list(elements, use_tensor_array=True) sl = l.stack() - with self.test_session() as sess: + with self.cached_session() as sess: self.assertAllEqual(sess.run(sl), [[1, 2], [3, 4]]) def test_stack(self): diff --git a/tensorflow/python/autograph/pyct/static_analysis/liveness.py 
b/tensorflow/python/autograph/pyct/static_analysis/liveness.py index ad11057a0b0..451398f1b70 100644 --- a/tensorflow/python/autograph/pyct/static_analysis/liveness.py +++ b/tensorflow/python/autograph/pyct/static_analysis/liveness.py @@ -198,6 +198,13 @@ class Annotator(transformer.Base): node = self._block_statement_live_out(node) return self._block_statement_live_in(node, node.test) + def visit_Expr(self, node): + node = self.generic_visit(node) + cfg_node = self.current_analyzer.graph.index[node] + anno.setanno(node, anno.Static.LIVE_VARS_OUT, + frozenset(self.current_analyzer.out[cfg_node])) + return node + def resolve(node, source_info, graphs): """Resolves the live symbols at the exit of control flow statements. diff --git a/tensorflow/python/autograph/pyct/transformer.py b/tensorflow/python/autograph/pyct/transformer.py index 3b6a446340a..b6830534b3d 100644 --- a/tensorflow/python/autograph/pyct/transformer.py +++ b/tensorflow/python/autograph/pyct/transformer.py @@ -26,6 +26,7 @@ import six from tensorflow.python.autograph.pyct import anno from tensorflow.python.autograph.pyct import compiler from tensorflow.python.autograph.pyct import pretty_printer +from tensorflow.python.autograph.pyct import templates class AutographParseError(SyntaxError): @@ -280,6 +281,12 @@ class Base(gast.NodeTransformer): print(pretty_printer.fmt(node)) return node + def create_assignment(self, target, expression): + template = """ + target = expression + """ + return templates.replace(template, target=target, expression=expression) + def visit_block(self, nodes, before_visit=None, after_visit=None): """A more powerful version of generic_visit for statement blocks. @@ -316,13 +323,14 @@ class Base(gast.NodeTransformer): Args: nodes: enumerable of AST node objects. If None, the function returns None. 
before_visit: optional callable that is called before visiting each item - in nodes - after_visit: optional callable that takes in an AST node and - returns a tuple (new_node, new_destination). It is called after - visiting each item in nodes. Is used in the same was as the + in nodes + after_visit: optional callable that takes in an AST node and returns a + tuple (new_node, new_destination). It is called after visiting each item + in nodes. Is used in the same was as the visit_* methods: new_node will replace the node; if not None, - new_destination must be a list, and subsequent nodes will be placed - in this list instead of the list returned by visit_block. + new_destination must be a list, and subsequent nodes will be placed + in this list instead of the list returned by visit_block. + Returns: A list of AST node objects containing the transformed items fron nodes, except those nodes that have been relocated using after_visit. From fa5dcb6f2c6e84d020a32ce85a0869ccb326d1b6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 7 Nov 2018 12:16:38 -0800 Subject: [PATCH 256/540] Add TensorTracer. This is a utility to trace intermediate tensor values generated during the evaluation of a TF graph on TPU. It is hooked up with the training loop in the TpuEstimator, but is disabled by default. See the comments inside the code for how to turn it on and its various options. 
PiperOrigin-RevId: 220507541 --- tensorflow/contrib/tpu/BUILD | 1 + .../contrib/tpu/python/tpu/tensor_tracer.py | 553 ++++++++++++++++++ .../contrib/tpu/python/tpu/tpu_estimator.py | 9 +- 3 files changed, 562 insertions(+), 1 deletion(-) create mode 100644 tensorflow/contrib/tpu/python/tpu/tensor_tracer.py diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD index 67327d32000..a0a9cb3f31a 100644 --- a/tensorflow/contrib/tpu/BUILD +++ b/tensorflow/contrib/tpu/BUILD @@ -246,6 +246,7 @@ py_library( "python/tpu/bfloat16.py", "python/tpu/device_assignment.py", "python/tpu/session_support.py", + "python/tpu/tensor_tracer.py", "python/tpu/topology.py", "python/tpu/tpu.py", "python/tpu/tpu_feed.py", diff --git a/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py b/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py new file mode 100644 index 00000000000..70baea203cc --- /dev/null +++ b/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py @@ -0,0 +1,553 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ======================================================================== +"""A utility to trace tensor values on TPU.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import os.path +import re + +from tensorflow.contrib.tpu.python.ops import tpu_ops +from tensorflow.contrib.tpu.python.tpu import tpu +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_util +from tensorflow.python.ops import gen_math_ops +from tensorflow.python.ops import logging_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import tf_logging as logging + +_TRACER_LOG_PREFIX = ' [>>>TT>>>]' +_DEVICE_TYPE_TPU = 'tpu' +_DEVICE_TYPE_CPU = 'cpu' +_GLOBAL_STEP_OP_NAME = 'GLOBAL-STEP' +_TRACE_MODE_NAN_INF = 'nan-inf' +_TRACE_MODE_PART_TENSOR = 'part-tensor' +_TRACE_MODE_PART_TENSOR_SIZE = 3 +_TRACE_MODE_FULL_TENSOR = 'full-tensor' +_RECORD_OUTSIDE_OP_RANGE = 'not-traced-outside-op-range' +_RECORD_SHOULD_NOT_TRACE = 'not-traced-should-not-trace' +_RECORD_FILTERED_OUT = 'not-traced-filtered-out' +_RECORD_SCALAR = 'not-traced-scalar' +_RECORD_DYNAMIC_SHAPE = 'not-traced-dynamic-shape' +_RECORD_GET_TRACED = 'get-traced' +_MARKER_SECTION_BEGIN = '!!!!!!! section-begin:' +_MARKER_SECTION_END = '!!!!!!! 
section-end:' +_SECTION_NAME_CONFIG = 'configuration' +_SECTION_NAME_REASON = 'reason' +_SECTION_NAME_OP_LIST = 'op-list' +_SECTION_NAME_GRAPH = 'graph' +_FIELD_NAME_VERSION = 'version:' +_FIELD_NAME_DEVICE = 'device:' +_FIELD_NAME_TRACE_MODE = 'trace-mode:' +_FIELD_NAME_NUM_REPLICAS = 'num-replicas:' +_FIELD_NAME_NUM_OPS = 'number-of-ops:' +_FIELD_NAME_TOPOLOGICAL_SORT_SUCCEED = 'topological-sort-succeed:' +_FLAGS_ENV_VAR = 'TENSOR_TRACER_FLAGS' +_FLAG_SINGLE_QUOTE_PAT = re.compile(r"\s*--([^=]+)='([^']*)'") +_FLAG_DOUBLE_QUOTE_PAT = re.compile(r'\s*--([^=]+)="([^"]*)"') +_FLAG_NO_QUOTE_PAT = re.compile(r'\s*--([^=]+)=(\S*)') +_FLAG_NAME_ENABLE = 'enable' +_FLAG_NAME_TRACE_MODE = 'trace_mode' +_FLAG_NAME_INTERESTING_OPS = 'interesting_ops' +_FLAG_NAME_TRACE_FILE = 'trace_file_path' +_FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR = 'use_test_undeclared_outputs_dir' +_FLAG_NAME_OP_RANGE = 'op_range' +_OP_RANGE_PAT = re.compile(r'(\d+):(\d+)') +_OUTPUT_STREAM_ESCAPE = 'file://' +_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR = 'TEST_UNDECLARED_OUTPUTS_DIR' + + +class TensorTracer(object): + """A software construct for tracing tensor values in a TF graph on TPU. + + This utility is disabled by default. It can be enabled by setting + the TENSOR_TRACER_FLAGS env variable as: + export TENSOR_TRACER_FLAGS="--enable=1" + If it is enabled, it will trace the output tensor values of + selected Ops in the graph. It has two outputs: (1) the traces and (2) + a report. The traces are dumped to a specified local file on the TPU + host. The report is printed to the log.info of the TPU job. + By passing options via the env variable, users can change: + (1) the trace mode (e.g., detecting NaN/Inf, printing partial or + full tensor values) + (2) which Ops to be traced (via op.name or op.type) + (3) output trace file path. 
+ """ + + @staticmethod + def _match_next_flag(flags, pos): + """Returns the match for the next TensorTracer flag.""" + + match = _FLAG_DOUBLE_QUOTE_PAT.match(flags, pos) + if match: + return match + match = _FLAG_SINGLE_QUOTE_PAT.match(flags, pos) + if match: + return match + match = _FLAG_NO_QUOTE_PAT.match(flags, pos) + return match + + @staticmethod + def print_flag_values(): + """Prints all TensorTracer flags passed via environment variables.""" + + tensor_tracer_flags = os.environ.get(_FLAGS_ENV_VAR) + if not tensor_tracer_flags: + return 'Env variable "%s" is not set'%_FLAGS_ENV_VAR + result = 'Env variable "%s" is set to "%s"\n'%(_FLAGS_ENV_VAR, + tensor_tracer_flags) + result += 'Individual flag value:\n' + pos = 0 + while True: + match = TensorTracer._match_next_flag(tensor_tracer_flags, pos) + if not match: + break + flag_name = match.group(1) + flag_value = match.group(2) + result += ' %s: %s\n'%(flag_name, flag_value) + pos = match.end() + result += '\n' + return result + + @staticmethod + def get_flag_value(wanted_flag_name): + """Returns the value of a TensorTracer flags.""" + + tensor_tracer_flags = os.getenv(_FLAGS_ENV_VAR) + if not tensor_tracer_flags: + return '' + pos = 0 + while True: + match = TensorTracer._match_next_flag(tensor_tracer_flags, pos) + if not match: + return '' + flag_name = match.group(1) + flag_value = match.group(2) + if flag_name == wanted_flag_name: + return flag_value + pos = match.end() + return '' + + @staticmethod + def is_enabled(): + """Returns True if TensorTracer is enabled.""" + + flag_value = TensorTracer.get_flag_value(_FLAG_NAME_ENABLE) + flag_value = flag_value.lower() + enabled = flag_value in ['1', 't', 'true', 'y', 'yes'] + return enabled + + @staticmethod + def use_test_undeclared_outputs_dir(): + """Decides the output directory of the trace file. + + Args: + None. + + Returns: + True if the output trace file should be written to the + test-undeclared-outputs-directory defined via an + env variable. 
+ """ + + flag_value = TensorTracer.get_flag_value( + _FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR) + flag_value = flag_value.lower() + enabled = flag_value in ['1', 't', 'true', 'y', 'yes'] + return enabled + + @staticmethod + def check_device_type(device_type): + """Checks if the given device type is valid.""" + + if device_type not in [_DEVICE_TYPE_TPU, _DEVICE_TYPE_CPU]: + raise ValueError('Invalid device_type "%s"'%device_type) + + @staticmethod + def check_trace_mode(trace_mode): + """Checks if the given trace mode is valid.""" + + valid_trace_modes = [_TRACE_MODE_NAN_INF, _TRACE_MODE_PART_TENSOR, + _TRACE_MODE_FULL_TENSOR] + if trace_mode not in valid_trace_modes: + raise ValueError('Invalid trace mode "%s" given to the Tensor_Tracer.' + 'Valid trace modes are: %s'%(trace_mode, + valid_trace_modes)) + + @staticmethod + def should_trace(device_type, op): + """Returns True if the given Op should be traced.""" + + if device_type != _DEVICE_TYPE_TPU: + raise ValueError('Non TPU device type is not supported') + if control_flow_util.IsInCond(op): + return False + if op.type in ['Reshape', 'ArgMin', 'ArgMax']: + return False + # pylint: disable=protected-access + return tpu._TPU_REPLICATE_ATTR in op.node_def.attr + # pylint: enable=protected-access + + @staticmethod + def reason(op_idx, details): + """Returns why the Op at op_idx is traced or not.""" + return '%d %s'%(op_idx, details) + + @staticmethod + def topological_sort(g): + """Performs topological sort on the given graph. + + Args: + g: the graph. + + Returns: + A pair where the first element indicates if the topological + sort succeeded (True if there is no cycle found; False if a + cycle is found) and the second element is either the sorted + list of nodes or the cycle of nodes found. + """ + + def visit(op, cycle, permanently_marked_ops, + temporarily_marked_ops, sorted_ops): + """Recursively visits all Ops in a graph. + + Args: + op: the current Op being visited. + cycle: a cycle of Ops found. 
+ permanently_marked_ops: the set of Ops that were already visited. + temporarily_marked_ops: the set of Ops that we have visited during + the current descent. + sorted_ops: the list of Ops sorted in topological order. + """ + + if cycle: + return + if op in permanently_marked_ops: + return + if op in temporarily_marked_ops: + cycle = temporarily_marked_ops + return + temporarily_marked_ops.add(op) + for i in range(len(op.outputs)): + out_tensor = op.outputs[i] + for consumer_op in out_tensor.consumers(): + visit(consumer_op, cycle, permanently_marked_ops, + temporarily_marked_ops, sorted_ops) + # pylint: disable=protected-access + for ctrl_output_op in op._control_outputs: + # pylint: enable=protected-access + visit(ctrl_output_op, cycle, permanently_marked_ops, + temporarily_marked_ops, sorted_ops) + temporarily_marked_ops.remove(op) + permanently_marked_ops.add(op) + sorted_ops.insert(0, op) + + graph_cycle = set([]) + sorted_ops = [] + permanently_marked_ops = set([]) + temporarily_marked_ops = set([]) + unsorted_ops = g.get_operations() + for op in unsorted_ops: + visit(op, graph_cycle, permanently_marked_ops, + temporarily_marked_ops, sorted_ops) + if graph_cycle: + return (False, graph_cycle) + else: + assert len(unsorted_ops) == len(sorted_ops) + return (True, sorted_ops) + + def __init__(self): + """Initializes a TensorTracer. + + Sets the various member fields from the flags (if given) or the defaults. 
+ """ + self._version = 'use-outside-compilation' + self._device_type = None + self._trace_mode = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_MODE) + if not self._trace_mode: + self._trace_mode = _TRACE_MODE_NAN_INF + TensorTracer.check_trace_mode(self._trace_mode) + self._part_tensor_size = _TRACE_MODE_PART_TENSOR_SIZE + self._instrument_records = {} + interesting_ops = TensorTracer.get_flag_value(_FLAG_NAME_INTERESTING_OPS) + self._selected_ops = interesting_ops.split() + self._set_trace_file_path() + self._set_op_range() + self._num_replicas = None + self._replica_id = None + + def _add_replica_id_to_graph(self, num_replicas, result_tensor): + """Adds nodes for computing the replica ID to the graph.""" + + if not num_replicas: + self._replica_id = 'unknown' + return result_tensor + + self._num_replicas = num_replicas + + with ops.control_dependencies(None): + # Uses None as dependency to run outside of TPU graph rewrites. + self._replica_id = tpu_ops.tpu_replicated_input( + list(range(self._num_replicas)), + name='tt_replica_id') + use_replica_id = array_ops.identity(self._replica_id).op + with ops.control_dependencies([use_replica_id]): + # Adds a control dependency from the result_tensor to + # the replica_id to ensure that replica_id will be added to the graph. 
+ return array_ops.identity(result_tensor) + + def _set_trace_file_path(self): + """Sets the path of the output trace file.""" + + self._trace_file_path = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_FILE) + if not self._trace_file_path: + raise ValueError('--%s is not set in the environment variable %s' + %(_FLAG_NAME_TRACE_FILE, _FLAGS_ENV_VAR)) + elif TensorTracer.use_test_undeclared_outputs_dir(): + if os.path.isabs(self._trace_file_path): + raise ValueError('If use_test_undeclared_outputs_dir is set,' + 'trace_file_path cannot be an absolute path (%s)' + %self._trace_file_path) + outputs_dir = os.environ.get(_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR) + self._trace_file_path = os.path.join(outputs_dir, + self._trace_file_path) + + def _set_op_range(self): + """Sets the index range of the Ops that we will consider tracing.""" + + op_range = TensorTracer.get_flag_value(_FLAG_NAME_OP_RANGE) + if not op_range: + self._op_range = (-1, -1) # this means including all ops. + return + match = _OP_RANGE_PAT.match(op_range) + if not match: + self._op_range = (-1, -1) # this means including all ops. 
+ return + self._op_range = (int(match.group(1)), int(match.group(2))) + + def _inside_op_range(self, idx): + """Return True if the given index is inside the selected range.""" + + if idx < self._op_range[0]: + return False + return self._op_range[1] < 0 or idx <= self._op_range[1] + + def _write_report(self, content): + """Writes the given content to the report.""" + + logging.info('%s %s'%(_TRACER_LOG_PREFIX, content)) + + def _is_selected_op(self, op_name): + """Returns True if the Op with op_name is selected to be traced.""" + + if not self._selected_ops: + return True + if op_name in self._selected_ops: + return True + return False + + def _write_config_section(self): + """Writes the config section of the report.""" + + self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_CONFIG)) + self._write_report('%s %s\n'%(_FIELD_NAME_VERSION, self._version)) + self._write_report('%s %s\n'%(_FIELD_NAME_DEVICE, self._device_type)) + self._write_report('%s %s\n'%(_FIELD_NAME_TRACE_MODE, self._trace_mode)) + self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS, self._num_replicas)) + self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_CONFIG)) + + def _write_reason_section(self): + """Writes the reason section of the report.""" + + self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_REASON)) + for key in sorted(self._instrument_records): + self._write_report('"%s" %s\n'%(key, self._instrument_records[key])) + self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_REASON)) + + def _write_op_list_section(self, op_list): + """Writes the Op-list section of the report.""" + + self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_OP_LIST)) + self._write_report('%s %d\n'%(_FIELD_NAME_NUM_OPS, len(op_list))) + for i in range(0, len(op_list)): + self._write_report('%d "%s" %s\n'%(i, op_list[i].name, op_list[i].type)) + self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_OP_LIST)) + + def 
_write_graph_section(self, succeed, sorted_or_cycle): + """Writes the graph section of the report.""" + + self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_GRAPH)) + self._write_report('%s %s\n'%(_FIELD_NAME_TOPOLOGICAL_SORT_SUCCEED, + succeed)) + l = list(sorted_or_cycle) + for i in range(0, len(l)): + self._write_report('%d "%s"\n'%(i, l[i].name)) + self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_GRAPH)) + + def _make_tensor_trace_fun(self, op_name, output_idx): + """Makes the tensor tracing function called by outside compilation. + + Args: + op_name: the name of the Op that outputs the tensor to be traced. + output_idx: which output of the Op it is (0 means the first output). + + Returns: + A function to be passed as the first argument to outside compilation. + + Raises: + RuntimeError: If the trace mode is invalid. + """ + + def _print_tensor(op_name, output_idx, num_elements, tensor, output_tensor): + """Prints a tensor value to a file. + + Args: + op_name: the name of the Op that outputs the tensor to be printed. + output_idx: which output of the Op it is (0 means the first output). + num_elements: number of elements to print. + tensor: the tensor needs to be returned. + output_tensor: the tensor needs to be printed. + + Returns: + The same tensor passed via the "tensor" argument. + """ + msg = '"%s:%d" '%(op_name, output_idx) + output_stream = _OUTPUT_STREAM_ESCAPE + self._trace_file_path + print_op = logging_ops.print_v2(msg, array_ops.shape(output_tensor), + ' @', self._replica_id, + '\n', output_tensor, + summarize=num_elements, + output_stream=output_stream) + with ops.control_dependencies([print_op]): + return array_ops.identity(tensor).op + + def _detect_nan_inf(tensor): + """Trace function for detecting any NaN/Inf in the tensor.""" + + if tensor.dtype.is_floating: + # Since host can't handle bf16, always convert tensor to f32. 
+ tensor = math_ops.cast(tensor, dtypes.float32) + output_tensor = math_ops.reduce_any( + gen_math_ops.logical_or(gen_math_ops.is_nan(tensor), + gen_math_ops.is_inf(tensor))) + else: + output_tensor = constant_op.constant(0) + return _print_tensor(op_name, output_idx, 1, tensor, output_tensor) + + def _show_global_step(tensor): + """Trace function for printing the global step count.""" + + return _print_tensor(op_name, output_idx, 1, tensor, tensor) + + def _show_part_tensor(tensor): + """Trace function for printing part of the tensor.""" + + return _print_tensor(op_name, output_idx, self._part_tensor_size, + tensor, tensor) + + def _show_full_tensor(tensor): + """Trace function for printing the entire tensor.""" + + return _print_tensor(op_name, output_idx, -1, tensor, tensor) + + if op_name == _GLOBAL_STEP_OP_NAME: + return _show_global_step + if self._trace_mode == _TRACE_MODE_NAN_INF: + return _detect_nan_inf + if self._trace_mode == _TRACE_MODE_PART_TENSOR: + return _show_part_tensor + if self._trace_mode == _TRACE_MODE_FULL_TENSOR: + return _show_full_tensor + + raise RuntimeError('Tensor trace fun for %s is not yet implemented' + %self._trace_mode) + + def trace_tpu(self, graph, result_tensor, num_replicas=None): + """Traces the tensors generated by TPU Ops in a TF graph. + + Args: + graph: the graph of Ops. + result_tensor: a result tensor of evaluating the graph. + num_replicas: number of replicas used on the TPU. + + Returns: + A tuple (result_tensor_copy, tracing_ops), where: + result_tensor_copy: an exact copy of result_tensor + tracing_ops: a list of tracing ops. If this list + is non empty, the caller of this function + should pose control dependencies upon these + Ops so that they will be executed when the + graph is evaluated. 
+ """ + + self._device_type = _DEVICE_TYPE_TPU + TensorTracer.check_device_type(self._device_type) + result_tensor_copy = self._add_replica_id_to_graph(num_replicas, + result_tensor) + self._write_config_section() + tracing_ops = [] + operations = graph.get_operations() + self._write_op_list_section(operations) + # Does the topological sort before adding any nodes to the graph. + (succeed, sorted_or_cycle) = TensorTracer.topological_sort(graph) + for op_id, op in enumerate(operations): + if not self._inside_op_range(op_id): + self._instrument_records[op.name] = TensorTracer.reason( + op_id, _RECORD_OUTSIDE_OP_RANGE) + continue + if not TensorTracer.should_trace(self._device_type, op): + self._instrument_records[op.name] = TensorTracer.reason( + op_id, _RECORD_SHOULD_NOT_TRACE) + continue + if not self._is_selected_op(op.name): + self._instrument_records[op.name] = TensorTracer.reason( + op_id, _RECORD_FILTERED_OUT) + continue + for i in range(len(op.outputs)): + out_tensor = op.outputs[i] + if not out_tensor.get_shape().is_fully_defined(): + self._instrument_records[out_tensor.name] = TensorTracer.reason( + op_id, _RECORD_DYNAMIC_SHAPE) + continue # cannot trace tensors with dynamic shape. + rank = len(out_tensor.shape) + if rank < 1: + self._instrument_records[out_tensor.name] = TensorTracer.reason( + op_id, _RECORD_SCALAR) + continue # cannot trace scalar. + self._instrument_records[out_tensor.name] = TensorTracer.reason( + op_id, _RECORD_GET_TRACED) + consumers = out_tensor.consumers() + trace_op = tpu.outside_compilation( + self._make_tensor_trace_fun(op.name, i), out_tensor) + if consumers: + for consumer_op in consumers: + # pylint: disable=protected-access + consumer_op._add_control_input(trace_op) + # pylint: enable=protected-access + else: + # if there is no consumer, we will add the control dependence later + # when we add the control dependency to the output operations. 
+ tracing_ops.append(trace_op) + + self._write_reason_section() + self._write_graph_section(succeed, sorted_or_cycle) + + return (result_tensor_copy, tracing_ops) diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py index 555ad0f1fdb..cf8b55cfb0e 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py @@ -31,6 +31,7 @@ import six from six.moves import queue as Queue # pylint: disable=redefined-builtin from six.moves import xrange # pylint: disable=redefined-builtin +from tensorflow.contrib.tpu.python.tpu import tensor_tracer from tensorflow.contrib.tpu.python.ops import tpu_ops from tensorflow.contrib.tpu.python.tpu import error_handling from tensorflow.contrib.tpu.python.tpu import session_support @@ -1317,9 +1318,15 @@ class _ModelFnWrapper(object): captured_training_hooks.capture(estimator_spec.training_hooks) + tracing_ops = [] + if tensor_tracer.TensorTracer.is_enabled(): + tt = tensor_tracer.TensorTracer() + loss, tracing_ops = tt.trace_tpu(ops.get_default_graph(), loss, + self._ctx.num_replicas) + # We must run train_op to update the variables prior to running the # outfeed. - with ops.control_dependencies([train_op]): + with ops.control_dependencies([train_op]+tracing_ops): host_call_outfeed_ops = [] if (isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec) # pylint: disable=protected-access and estimator_spec.host_call is not None): From 96b90fac8c58671bc310ca0b2c29e022e72250f0 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Wed, 7 Nov 2018 15:24:37 -0500 Subject: [PATCH 257/540] [XLA] Add proto representation of U16/S16 literals U16/S16 were already handled in most methods of Literal, except for the serialization and deserialization of protos and a few other places. Add those, to make allocating U16 literals work via xrt. 
--- tensorflow/compiler/xla/literal.cc | 45 +++++++++++++++++++++++++ tensorflow/compiler/xla/literal_test.cc | 22 ++++++++++++ tensorflow/compiler/xla/xla_data.proto | 6 ++-- 3 files changed, 71 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc index 80dfdb83c35..cb00a0ab16d 100644 --- a/tensorflow/compiler/xla/literal.cc +++ b/tensorflow/compiler/xla/literal.cc @@ -1434,10 +1434,14 @@ bool LiteralBase::Piece::EqualElements(const LiteralBase::Piece& other) const { return EqualElementsInternal(other, &multi_index); case U8: return EqualElementsInternal(other, &multi_index); + case S16: + return EqualElementsInternal(other, &multi_index); case S32: return EqualElementsInternal(other, &multi_index); case S64: return EqualElementsInternal(other, &multi_index); + case U16: + return EqualElementsInternal(other, &multi_index); case U32: return EqualElementsInternal(other, &multi_index); case U64: @@ -1506,6 +1510,11 @@ bool LiteralBase::IsAll(int8 value) const { return AllElementsEqualValue(piece.data(), value); } return false; + case U16: + if (value >= 0) { + return AllElementsEqualValue(piece.data(), value); + } + return false; case U32: if (value >= 0) { return AllElementsEqualValue(piece.data(), value); @@ -1518,6 +1527,8 @@ bool LiteralBase::IsAll(int8 value) const { return false; case S8: return AllElementsEqualValue(piece.data(), value); + case S16: + return AllElementsEqualValue(piece.data(), value); case S32: return AllElementsEqualValue(piece.data(), value); case S64: @@ -1739,12 +1750,16 @@ bool LiteralBase::IsZero(absl::Span indices) const { switch (shape().element_type()) { case U8: return Get(indices) == 0; + case U16: + return Get(indices) == 0; case U32: return Get(indices) == 0; case U64: return Get(indices) == 0; case S8: return Get(indices) == 0; + case S16: + return Get(indices) == 0; case S32: return Get(indices) == 0; case S64: @@ -1802,6 +1817,20 @@ void 
LiteralBase::Piece::WriteToProto(LiteralProto* proto) const { case S64: CopyToRepeatedField(proto->mutable_s64s(), data()); break; + case U16: + *proto->mutable_u16s() = string( + reinterpret_cast(data().data()), size_bytes()); + if (!kLittleEndian) { + ConvertEndianShort(proto->mutable_u16s()); + } + break; + case S16: + *proto->mutable_s16s() = string( + reinterpret_cast(data().data()), size_bytes()); + if (!kLittleEndian) { + ConvertEndianShort(proto->mutable_s16s()); + } + break; case F16: *proto->mutable_f16s() = string( reinterpret_cast(data().data()), size_bytes()); @@ -1916,6 +1945,22 @@ Status LiteralBase::Piece::CopyFromProto(const LiteralProto& proto) { case U64: TF_RETURN_IF_ERROR(CopyFromRepeatedField(data(), proto.u64s())); break; + case S16: { + const string& s(proto.s16s()); + TF_RET_CHECK(data().size() * sizeof(int16_t) == s.size()); + memcpy(untyped_data(), s.data(), s.size()); + if (!kLittleEndian) { + ConvertEndianShort(reinterpret_cast(untyped_data()), s.size()); + } + } break; + case U16: { + const string& s(proto.u16s()); + TF_RET_CHECK(data().size() * sizeof(uint16_t) == s.size()); + memcpy(untyped_data(), s.data(), s.size()); + if (!kLittleEndian) { + ConvertEndianShort(reinterpret_cast(untyped_data()), s.size()); + } + } break; case F16: { const string& s(proto.f16s()); TF_RET_CHECK(data().size() * sizeof(half) == s.size()); diff --git a/tensorflow/compiler/xla/literal_test.cc b/tensorflow/compiler/xla/literal_test.cc index 3511760ac1c..8cec37897a9 100644 --- a/tensorflow/compiler/xla/literal_test.cc +++ b/tensorflow/compiler/xla/literal_test.cc @@ -1394,6 +1394,28 @@ TEST_F(LiteralUtilTest, CopyFromProto_f16) { EXPECT_EQ(h1, r[3]); } +TEST_F(LiteralUtilTest, CopyFromProto_u16) { + uint16 u1(0xabcd); + uint16 u2(0x1234); + + const unsigned char uint16_vals[8] = {0xcd, 0xab, 0x34, 0x12, + 0x34, 0x12, 0xcd, 0xab}; + LiteralProto p; + p.mutable_shape()->set_element_type(U16); + p.mutable_shape()->clear_dimensions(); + 
p.mutable_shape()->add_dimensions(4); + LayoutUtil::SetToDefaultLayout(p.mutable_shape()); + p.clear_u16s(); + p.set_u16s(uint16_vals, 8); + TF_ASSERT_OK_AND_ASSIGN(Literal literal, Literal::CreateFromProto(p)); + auto r = literal.data(); + ASSERT_EQ(4, r.size()); + EXPECT_EQ(u1, r[0]); + EXPECT_EQ(u2, r[1]); + EXPECT_EQ(u2, r[2]); + EXPECT_EQ(u1, r[3]); +} + TEST_F(LiteralUtilTest, LiteralSliceTest) { auto scalar = LiteralUtil::CreateR0(1.0); auto matrix = LiteralUtil::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto index b6bd919e2b2..683ccc40f16 100644 --- a/tensorflow/compiler/xla/xla_data.proto +++ b/tensorflow/compiler/xla/xla_data.proto @@ -332,11 +332,13 @@ message LiteralProto { repeated double f64s = 9; repeated float c64s = 12; // Stored as interleaved real, imag floats. repeated LiteralProto tuple_literals = 10; - // The F16s and BF16s are encoded in little endian byte order + // The F16s, BF16s, U16s and S16s are encoded in little endian byte order bytes f16s = 11; bytes bf16s = 13; + bytes u16s = 16; + bytes s16s = 17; repeated int64 sparse_indices = 14; - // Next = 16 + // Next = 18 } message WindowDimension { From 752725c6eddc24bfe7c70ae28c1f6b72298d6e03 Mon Sep 17 00:00:00 2001 From: Cong Liu Date: Wed, 7 Nov 2018 12:25:58 -0800 Subject: [PATCH 258/540] [XLA] Add hlo matchers for all-to-all and collective-permute. 
PiperOrigin-RevId: 220508994 --- tensorflow/compiler/xla/service/hlo_matchers.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h index 1717770301e..170ec93a334 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers.h +++ b/tensorflow/compiler/xla/service/hlo_matchers.h @@ -165,6 +165,7 @@ namespace opcode_matchers { } HLO_MATCHER(Abs); HLO_MATCHER(Add); +HLO_MATCHER(AllToAll); HLO_MATCHER(Bitcast); HLO_MATCHER(Broadcast); HLO_MATCHER(BatchNormGrad); @@ -178,6 +179,7 @@ HLO_MATCHER(Convert); HLO_MATCHER(Convolution); HLO_MATCHER(Copy); HLO_MATCHER(CrossReplicaSum); +HLO_MATCHER(CollectivePermute); HLO_MATCHER(Divide); HLO_MATCHER(Domain); HLO_MATCHER(DynamicSlice); From c6218d38ebf3653c0ecb3ddefc69cd833dcaf5e6 Mon Sep 17 00:00:00 2001 From: James Keeling Date: Wed, 7 Nov 2018 12:38:09 -0800 Subject: [PATCH 259/540] Correct docstring in error_interpolation. The docstring previously referenced the old format for tags. PiperOrigin-RevId: 220510979 --- tensorflow/python/framework/error_interpolation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/framework/error_interpolation.py b/tensorflow/python/framework/error_interpolation.py index bc3c81b2a2f..e87f505d8c3 100644 --- a/tensorflow/python/framework/error_interpolation.py +++ b/tensorflow/python/framework/error_interpolation.py @@ -267,8 +267,8 @@ def compute_field_dict(op): def interpolate(error_message, graph): """Interpolates an error message. - The error message can contain tags of the form ^^type:name^^ which will - be replaced. + The error message can contain tags of the form `{{type name}}` which will be + replaced. Args: error_message: A string to interpolate. 
From 100c6184f4f1df82f0c465bcb07be34dc52da614 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Wed, 7 Nov 2018 14:00:26 -0800 Subject: [PATCH 260/540] [XLA:GPU] Fuse reverse HLOs Turns out reverse is not elementwise, but we can still fuse it. PiperOrigin-RevId: 220524481 --- .../xla/service/gpu/instruction_fusion.cc | 1 + .../service/gpu/instruction_fusion_test.cc | 21 +++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc index 7f2b59810f0..43f43b50e4a 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc @@ -47,6 +47,7 @@ bool IsFusible(const HloInstruction& hlo) { hlo.opcode() == HloOpcode::kReduce || hlo.opcode() == HloOpcode::kReduceWindow || hlo.opcode() == HloOpcode::kReshape || + hlo.opcode() == HloOpcode::kReverse || hlo.opcode() == HloOpcode::kScatter || hlo.opcode() == HloOpcode::kSlice || hlo.opcode() == HloOpcode::kTranspose; diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc index 57e66f5a12c..3cd49131349 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc @@ -805,5 +805,26 @@ TEST_F(InstructionFusionTest, NonscalarConstantsNotFused) { op::Reduce(op::Broadcast(op::Parameter()), op::Constant())); } +TEST_F(InstructionFusionTest, FuseReverse) { + auto module = ParseHloString(R"( + HloModule test_module + + ENTRY Reverse { + p0 = f32[50,96,1024]{2,1,0} parameter(0) + add = f32[50,96,1024]{2,1,0} add(p0, p0) + ROOT reverse = f32[50,96,1024] reverse(add), dimensions={0} + })") + .ValueOrDie(); + + EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); + + HloInstruction* root = module->entry_computation()->root_instruction(); 
+ EXPECT_THAT(root, op::Fusion()); + EXPECT_THAT(root->fused_expression_root(), + op::Reverse(op::Add(op::Parameter(), op::Parameter()))); +} + } // namespace gpu } // namespace xla From 487a8d7fb2fba20a72ccae62d67d4a3f8d84a52f Mon Sep 17 00:00:00 2001 From: Roy Frostig Date: Wed, 7 Nov 2018 14:53:20 -0800 Subject: [PATCH 261/540] [XLA] Track an existing SWIG include as a dependency in BUILD. PiperOrigin-RevId: 220534068 --- tensorflow/compiler/xla/python/BUILD | 1 + tensorflow/python/BUILD | 2 ++ 2 files changed, 3 insertions(+) diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index 21685c4a5b9..79603f21404 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -81,6 +81,7 @@ tf_py_wrap_cc( srcs = ["xla.i"], swig_includes = [ "local_computation_builder.i", + "//tensorflow/python:platform/base.i", ], deps = [ ":local_computation_builder", diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 7ff358cb088..bb9a9f6386c 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -20,6 +20,8 @@ licenses(["notice"]) # Apache 2.0 exports_files(["LICENSE"]) +exports_files(["platform/base.i"]) + load("//tensorflow:tensorflow.bzl", "if_not_windows") load("//tensorflow:tensorflow.bzl", "tf_cuda_library") load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py") From 7003be098cc8d10191e78f61dae417c7353b03c7 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 7 Nov 2018 14:57:31 -0800 Subject: [PATCH 262/540] Fixed point implementation for audio preprocessing in TF Lite Micro PiperOrigin-RevId: 220534796 --- .../micro/examples/micro_speech/BUILD | 26 ++- .../micro_speech/preprocessor_fixed.cc | 218 ++++++++++++++++++ ...{preprocessor.cc => preprocessor_float.cc} | 0 .../experimental/micro/tools/make/Makefile | 45 +++- 4 files changed, 276 insertions(+), 13 deletions(-) create mode 100644 tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_fixed.cc rename tensorflow/lite/experimental/micro/examples/micro_speech/{preprocessor.cc => preprocessor_float.cc} (100%) diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD b/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD index 69022b611ed..638ae1467a5 100644 --- a/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD +++ b/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD @@ -32,14 +32,36 @@ tflite_micro_cc_test( ) tflite_micro_cc_test( - name = "preprocessor_test", + name = "preprocessor_float_test", srcs = [ "no_30ms_sample_data.cc", "no_30ms_sample_data.h", "no_power_spectrum_data.cc", "no_power_spectrum_data.h", - "preprocessor.cc", "preprocessor.h", + "preprocessor_float.cc", + "preprocessor_test.cc", + "yes_30ms_sample_data.cc", + "yes_30ms_sample_data.h", + "yes_power_spectrum_data.cc", + "yes_power_spectrum_data.h", + ], + deps = [ + "//tensorflow/lite/c:c_api_internal", + "//tensorflow/lite/experimental/micro:micro_framework", + "//tensorflow/lite/experimental/micro/testing:micro_test", + ], +) + +tflite_micro_cc_test( + name = "preprocessor_fixed_test", + srcs = [ + "no_30ms_sample_data.cc", + "no_30ms_sample_data.h", + "no_power_spectrum_data.cc", + "no_power_spectrum_data.h", + "preprocessor.h", + "preprocessor_fixed.cc", "preprocessor_test.cc", "yes_30ms_sample_data.cc", "yes_30ms_sample_data.h", diff --git 
a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_fixed.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_fixed.cc new file mode 100644 index 00000000000..de60c982f3a --- /dev/null +++ b/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_fixed.cc @@ -0,0 +1,218 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Reference implementation of the preprocessing pipeline, with the same +// results as the audio tutorial at +// https://www.tensorflow.org/tutorials/sequences/audio_recognition +// This module takes 30ms of PCM-encoded signed 16-bit audio samples (at 16KHz, +// so 480 values), and extracts a power spectrum of frequencies. There are 43 +// frequency bands in the result, derived from the original 256 output from the +// discrete Fourier transform, and averaged together in groups of 6. +// It's expected that most platforms will have optimized versions of the +// functions used here, for example replacing the DFT with an FFT, so this +// version shouldn't be used where performance is critical. +// This implementation uses fixed point for any non-constant calculations, +// instead of floating point, to help show how this can work on platforms that +// don't have good float support. 
+ +#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h" + +#include + +namespace { + +// q format notation: qx.y => 1 sign bit, x-1 integer bits, y fraction bits. +// Use standard (non-saturating) arithmetic with signed ints of size x+y bits. +// Sacrifice some precision to avoid use of 64-bit ints. + +// q1.15 * q1.15 => q2.30 +inline int32_t Q1_15_FixedMultiply_Q2_30(int16_t a, int16_t b) { + int32_t big_a = a; + int32_t big_b = b; + return big_a * big_b; +} + +// q2.30 * q2.30 => q10.22 +inline int32_t Q2_30_FixedMultiply_Q10_22(int32_t a, int32_t b) { + // q2.30 result + int32_t tmp = (a >> 15) * (b >> 15); + // q10.22 result + return tmp >> 8; +} + +// q10.22 * q10.22 => q10.22 +// Will overflow if product is >= 512. +// Largest product in small test set is 465.25 +inline int32_t Q10_22_FixedMultiply_Q10_22(int32_t a, int32_t b) { + // q10.22 result + return (a >> 11) * (b >> 11); +} + +// float => q2.30 +// No checking for saturation. Only used for inputs in range [-1, 1]. +inline int32_t FloatToFixed_Q2_30(float input) { + return static_cast(roundf(input * (1 << 30))); +} + +// These constants allow us to allocate fixed-sized arrays on the stack for our +// working memory. +constexpr int kInputSize = 512; +constexpr int kAverageWindowSize = 6; +constexpr int kOutputSize = + ((kInputSize / 2) + (kAverageWindowSize - 1)) / kAverageWindowSize; + +// Performs a discrete Fourier transform on the real inputs. This corresponds to +// rdft() in the FFT package at http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html, +// and to kiss_fftr() in KISSFFT at https://github.com/mborgerding/kissfft. +// It takes in an array of float real values, and returns a result of the same +// length with q10.22 fixed point real and imaginary components interleaved, so +// fourier_output[0] is the first real value, fourier_output[1] is the first +// imaginary, fourier_output[2] is the second real, and so on. 
+// The calling function should ensure that the array passed in as fourier_output +// is at least time_series_size in length. Most optimized FFT implementations +// require the length to be a power of two as well, but this version doesn't +// enforce that. + +// input: q2.30 fixed point. output: q10.22 fixed point. +// Outputs interpreted as q10.22 fixed point are un-scaled. +void CalculateDiscreteFourierTransform(int32_t* time_series, + int time_series_size, + int32_t* fourier_output) { + for (int i = 0; i < time_series_size / 2; ++i) { + int32_t real = 0; + for (int j = 0; j < time_series_size; ++j) { + const int32_t real_scale = + FloatToFixed_Q2_30(cos(j * i * M_PI * 2 / time_series_size)); + real += Q2_30_FixedMultiply_Q10_22(time_series[j], real_scale); + } + int32_t imaginary = 0; + for (int j = 0; j < time_series_size; ++j) { + const int32_t imaginary_scale = + FloatToFixed_Q2_30(sin(j * i * M_PI * 2 / time_series_size)); + imaginary -= Q2_30_FixedMultiply_Q10_22(time_series[j], imaginary_scale); + } + fourier_output[(i * 2) + 0] = real; + fourier_output[(i * 2) + 1] = imaginary; + } +} + +// Produces a simple sine curve that is used to ensure frequencies at the center +// of the current sample window are weighted more heavily than those at the end. +// q1.15 output format. +void CalculatePeriodicHann(int window_length, int16_t* window_function) { + for (int i = 0; i < window_length; ++i) { + const float real_value = (0.5 - 0.5 * cos((2 * M_PI * i) / window_length)); + int tmp = static_cast(roundf(real_value * (1 << 15))); + // Saturate the 0x8000 value to 0x7fff + if (tmp > 0x7fff) tmp = 0x7fff; + window_function[i] = tmp; + } +} + +} // namespace + +TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter, + const int16_t* input, int input_size, int output_size, + uint8_t* output) { + // Ensure our input and output data arrays are valid. 
+ if (input_size > kInputSize) { + error_reporter->Report("Input size %d larger than %d", input_size, + kInputSize); + return kTfLiteError; + } + if (output_size != kOutputSize) { + error_reporter->Report("Requested output size %d doesn't match %d", + output_size, kOutputSize); + return kTfLiteError; + } + + // Pre-calculate the window function we'll be applying to the input data. + // In a real application, we'd calculate this table once in an initialization + // function and store it for repeated reuse. + // q1.15 format. + int16_t window_function[kInputSize]; + CalculatePeriodicHann(input_size, window_function); + + // Apply the window function to our time series input, and pad it with zeroes + // to the next power of two. + int32_t fixed_input[kInputSize]; + for (int i = 0; i < kInputSize; ++i) { + if (i < input_size) { + // input is int16_t. Treat as q1.15 fixed point value in range [-1,1) + // window_function is also q1.15 fixed point number + fixed_input[i] = + Q1_15_FixedMultiply_Q2_30(input[i], window_function[i]); + } else { + fixed_input[i] = 0; + } + } + + // Pull the frequency data from the time series sample. + // Calculated in q10.22 format from q2.30 inputs. + int32_t fourier_values[kInputSize]; + CalculateDiscreteFourierTransform(fixed_input, kInputSize, fourier_values); + + // We have the complex numbers giving us information about each frequency + // band, but all we want to know is how strong each frequency is, so calculate + // the squared magnitude by adding together the squares of each component. 
+ int32_t power_spectrum[kInputSize / 2]; + for (int i = 0; i < (kInputSize / 2); ++i) { + const int32_t real = fourier_values[(i * 2) + 0]; + const int32_t imaginary = fourier_values[(i * 2) + 1]; + // q10.22 results + power_spectrum[i] = + Q10_22_FixedMultiply_Q10_22(real, real) + + Q10_22_FixedMultiply_Q10_22(imaginary, imaginary); + } + + // Finally, reduce the size of the output by averaging together six adjacent + // frequencies into each slot, producing an array of 43 values. + // Power_spectrum numbers are q10.22. Divide by kAverageWindowSize inside + // loop to prevent overflow. + for (int i = 0; i < kOutputSize; ++i) { + int32_t average = 0; + for (int j = 0; j < kAverageWindowSize; ++j) { + const int index = (i * kAverageWindowSize) + j; + if (index < (kInputSize / 2)) { + average += power_spectrum[index] / kAverageWindowSize; + } + } + // Quantize the result into eight bits, effectively multiplying by two. + // The 127.5 constant here has to match the features_max value defined in + // tensorflow/examples/speech_commands/input_data.py, and this also assumes + // that features_min is zero. 
+ // + // q10.22 input + // integer output + // + // output = (input - features_min) * + // (output_max - output_min) / (features_max - features_min) + // == (input) * (255) / (127.5) + // == input * 2 + // == input << 1 + // Also want to round to nearest integer and only keep integer bits + // => ((input << 1) + 0x200000) >> 22 + // == (input + 0x100000) >> 21 + int32_t quantized_average = (average + 0x100000) >> 21; + if (quantized_average < 0) { + quantized_average = 0; + } + if (quantized_average > 255) { + quantized_average = 255; + } + output[i] = quantized_average; + } + return kTfLiteOk; +} diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_float.cc similarity index 100% rename from tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc rename to tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_float.cc diff --git a/tensorflow/lite/experimental/micro/tools/make/Makefile b/tensorflow/lite/experimental/micro/tools/make/Makefile index 5492003e5af..b182c120d2a 100644 --- a/tensorflow/lite/experimental/micro/tools/make/Makefile +++ b/tensorflow/lite/experimental/micro/tools/make/Makefile @@ -62,12 +62,19 @@ tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc # Test binary for the microcontroller speech model. 
PREPROCESSOR_TEST_SRCS := \ tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc \ -tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc \ tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.cc \ tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.cc \ tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc \ tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc +PREPROCESSOR_FLOAT_TEST_SRCS = \ +$(PREPROCESSOR_TEST_SRCS) \ +tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_float.cc + +PREPROCESSOR_FIXED_TEST_SRCS += \ +$(PREPROCESSOR_TEST_SRCS) \ +tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_fixed.cc + MICROLITE_TEST_SRCS := \ $(wildcard tensorflow/lite/experimental/micro/*test.cc) \ $(wildcard tensorflow/lite/experimental/micro/kernels/*test.cc) @@ -91,7 +98,8 @@ include $(wildcard $(MAKEFILE_DIR)/targets/*_makefile.inc) ALL_SRCS := \ $(MICRO_SPEECH_TEST_SRCS) \ - $(PREPROCESSOR_TEST_SRCS) \ + $(PREPROCESSOR_FLOAT_TEST_SRCS) \ + $(PREPROCESSOR_FIXED_TEST_SRCS) \ $(MICROLITE_CC_SRCS) \ $(MICROLITE_TEST_SRCS) @@ -104,7 +112,8 @@ LIBDIR := $(GENDIR)lib/ MICROLITE_LIB_PATH := $(LIBDIR)$(MICROLITE_LIB_NAME) MICRO_SPEECH_TEST_BINARY := $(BINDIR)micro_speech_test -PREPROCESSOR_TEST_BINARY := $(BINDIR)preprocessor_test +PREPROCESSOR_FLOAT_TEST_BINARY := $(BINDIR)preprocessor_float_test +PREPROCESSOR_FIXED_TEST_BINARY := $(BINDIR)preprocessor_fixed_test CXX := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}g++ CC := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}gcc @@ -113,8 +122,11 @@ AR := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}ar MICRO_SPEECH_TEST_OBJS := $(addprefix $(OBJDIR), \ $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICRO_SPEECH_TEST_SRCS)))) -PREPROCESSOR_TEST_OBJS := $(addprefix $(OBJDIR), \ -$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_TEST_SRCS)))) 
+PREPROCESSOR_FLOAT_TEST_OBJS := $(addprefix $(OBJDIR), \ +$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_FLOAT_TEST_SRCS)))) + +PREPROCESSOR_FIXED_TEST_OBJS := $(addprefix $(OBJDIR), \ +$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_FIXED_TEST_SRCS)))) MICROLITE_LIB_OBJS := $(addprefix $(OBJDIR), \ $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICROLITE_CC_SRCS)))) @@ -158,18 +170,29 @@ micro_speech_test_bin: $(MICRO_SPEECH_TEST_BINARY).bin test_micro_speech: $(MICRO_SPEECH_TEST_BINARY) $(TEST_SCRIPT) $(MICRO_SPEECH_TEST_BINARY) '~~~ALL TESTS PASSED~~~' -$(PREPROCESSOR_TEST_BINARY): $(PREPROCESSOR_TEST_OBJS) $(MICROLITE_LIB_PATH) +$(PREPROCESSOR_FLOAT_TEST_BINARY): $(PREPROCESSOR_FLOAT_TEST_OBJS) $(MICROLITE_LIB_PATH) @mkdir -p $(dir $@) $(CXX) $(CXXFLAGS) $(INCLUDES) \ - -o $(PREPROCESSOR_TEST_BINARY) $(PREPROCESSOR_TEST_OBJS) \ + -o $(PREPROCESSOR_FLOAT_TEST_BINARY) $(PREPROCESSOR_FLOAT_TEST_OBJS) \ $(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS) -preprocessor_test: $(PREPROCESSOR_TEST_BINARY) -preprocessor_test_bin: $(PREPROCESSOR_TEST_BINARY).bin +preprocessor_float_test: $(PREPROCESSOR_FLOAT_TEST_BINARY) +preprocessor_float_test_bin: $(PREPROCESSOR_FLOAT_TEST_BINARY).bin -test_preprocessor: $(PREPROCESSOR_TEST_BINARY) - $(TEST_SCRIPT) $(PREPROCESSOR_TEST_BINARY) '~~~ALL TESTS PASSED~~~' +test_preprocessor_float: $(PREPROCESSOR_FLOAT_TEST_BINARY) + $(TEST_SCRIPT) $(PREPROCESSOR_FLOAT_TEST_BINARY) '~~~ALL TESTS PASSED~~~' +$(PREPROCESSOR_FIXED_TEST_BINARY): $(PREPROCESSOR_FIXED_TEST_OBJS) $(MICROLITE_LIB_PATH) + @mkdir -p $(dir $@) + $(CXX) $(CXXFLAGS) $(INCLUDES) \ + -o $(PREPROCESSOR_FIXED_TEST_BINARY) $(PREPROCESSOR_FIXED_TEST_OBJS) \ + $(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS) + +preprocessor_fixed_test: $(PREPROCESSOR_FIXED_TEST_BINARY) +preprocessor_fixed_test_bin: $(PREPROCESSOR_FIXED_TEST_BINARY).bin + +test_preprocessor_fixed: $(PREPROCESSOR_FIXED_TEST_BINARY) + $(TEST_SCRIPT) 
$(PREPROCESSOR_FIXED_TEST_BINARY) '~~~ALL TESTS PASSED~~~' $(BINDIR)%_test : $(OBJDIR)%_test.o $(MICROLITE_LIB_PATH) @mkdir -p $(dir $@) From 3146bc0a2240ac829437009c6cdc614b3869cd0a Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Wed, 7 Nov 2018 14:59:59 -0800 Subject: [PATCH 263/540] In the short term, making Feature Column V2 produce RefVariables instead of ResourceVariables till we figure out the performance regression issues. PiperOrigin-RevId: 220535259 --- tensorflow/python/feature_column/feature_column_v2.py | 10 ++++++++-- .../python/feature_column/feature_column_v2_test.py | 8 ++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py index d97d41dd830..bd198ed53d3 100644 --- a/tensorflow/python/feature_column/feature_column_v2.py +++ b/tensorflow/python/feature_column/feature_column_v2.py @@ -184,6 +184,7 @@ class StateManager(object): shape, dtype=None, trainable=True, + use_resource=True, initializer=None): """Creates a new variable. @@ -193,12 +194,14 @@ class StateManager(object): shape: variable shape. dtype: The type of the variable. Defaults to `self.dtype` or `float32`. trainable: Whether this variable is trainable or not. + use_resource: If true, we use resource variables. Otherwise we use + RefVariable. initializer: initializer instance (callable). Returns: The created variable. 
""" - del feature_column, name, shape, dtype, trainable, initializer + del feature_column, name, shape, dtype, trainable, use_resource, initializer raise NotImplementedError('StateManager.create_variable') def add_variable(self, feature_column, var): @@ -270,6 +273,7 @@ class _StateManagerImpl(StateManager): shape, dtype=None, trainable=True, + use_resource=True, initializer=None): if name in self._cols_to_vars_map[feature_column]: raise ValueError('Variable already exists.') @@ -280,7 +284,7 @@ class _StateManagerImpl(StateManager): dtype=dtype, initializer=initializer, trainable=self._trainable and trainable, - use_resource=True, + use_resource=use_resource, # TODO(rohanj): Get rid of this hack once we have a mechanism for # specifying a default partitioner for an entire layer. In that case, # the default getter for Layers should work. @@ -2539,6 +2543,8 @@ class EmbeddingColumn( shape=embedding_shape, dtype=dtypes.float32, trainable=self.trainable, + # TODO(rohanj): Make this True when b/118500434 is fixed. 
+ use_resource=False, initializer=self.initializer) def _get_dense_tensor_internal_helper(self, sparse_tensors, diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py index ab727752b49..45317a7d4a0 100644 --- a/tensorflow/python/feature_column/feature_column_v2_test.py +++ b/tensorflow/python/feature_column/feature_column_v2_test.py @@ -1816,6 +1816,8 @@ class LinearModelTest(test.TestCase): 'sparse_feature': [['a'], ['x']], } model(features) + for var in model.variables: + self.assertTrue(isinstance(var, variables_lib.RefVariable)) variable_names = [var.name for var in model.variables] self.assertItemsEqual([ 'linear_model/dense_feature_bucketized/weights:0', @@ -5592,6 +5594,7 @@ class _TestStateManager(fc.StateManager): shape, dtype=None, trainable=True, + use_resource=True, initializer=None): if feature_column not in self._all_variables: self._all_variables[feature_column] = {} @@ -5604,6 +5607,7 @@ class _TestStateManager(fc.StateManager): shape=shape, dtype=dtype, trainable=self._trainable and trainable, + use_resource=use_resource, initializer=initializer) var_dict[name] = var return var @@ -6182,6 +6186,8 @@ class EmbeddingColumnTest(test.TestCase): global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) self.assertItemsEqual(('feature_layer/aaa_embedding/embedding_weights:0',), tuple([v.name for v in global_vars])) + for v in global_vars: + self.assertTrue(isinstance(v, variables_lib.RefVariable)) trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) self.assertItemsEqual(('feature_layer/aaa_embedding/embedding_weights:0',), tuple([v.name for v in trainable_vars])) @@ -6964,6 +6970,8 @@ class SharedEmbeddingColumnTest(test.TestCase): self.assertItemsEqual( ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], tuple([v.name for v in global_vars])) + for v in global_vars: + self.assertTrue(isinstance(v, variables_lib.RefVariable)) trainable_vars = 
ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) if trainable: self.assertItemsEqual( From 665bd7a2ce8ab64917cd47caa15e4c6d2b620e7b Mon Sep 17 00:00:00 2001 From: Katherine Wu Date: Wed, 7 Nov 2018 15:17:32 -0800 Subject: [PATCH 264/540] Fix flaky test -- when testing the metric value, make sure that the updated value is reflected by re-running the value instead of update_op PiperOrigin-RevId: 220538414 --- tensorflow/contrib/saved_model/BUILD | 4 ---- .../saved_model/keras_saved_model_test.py | 23 ++++++++++--------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD index 395a68c6446..d8982b7e519 100644 --- a/tensorflow/contrib/saved_model/BUILD +++ b/tensorflow/contrib/saved_model/BUILD @@ -104,11 +104,7 @@ py_test( srcs_version = "PY2AND3", tags = [ "no_windows", - # TODO(b/119022845): Re-enable this test in TAP. - "manual", - "notap", "notsan", - "no_oss", ], deps = [ ":keras_saved_model", diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py index 4970ebc3199..a65b2ce4661 100644 --- a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py +++ b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py @@ -345,21 +345,22 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase): inputs, outputs = load_model(sess, output_path, model_fn_lib.ModeKeys.EVAL) - sess.run(outputs['metrics/mae/update_op'], { - inputs[input_name]: input_arr, - inputs[target_name]: target_arr - }) + # First obtain the loss and predictions, and run the metric update op by + # feeding in the inputs and targets. 
+ loss, predictions, _ = sess.run( + (outputs['loss'], outputs['predictions/' + output_name], + outputs['metrics/mae/update_op']), + {inputs[input_name]: input_arr, inputs[target_name]: target_arr}) - eval_results = sess.run(outputs, {inputs[input_name]: input_arr, - inputs[target_name]: target_arr}) + # The metric value should be run after the update op, to ensure that it + # reflects the correct value. + metric_value = sess.run(outputs['metrics/mae/value']) self.assertEqual(int(train_before_export), sess.run(training_module.get_global_step())) - self.assertAllClose(ref_loss, eval_results['loss'], atol=1e-05) - self.assertAllClose( - ref_mae, eval_results['metrics/mae/value'], atol=1e-05) - self.assertAllClose( - ref_predict, eval_results['predictions/' + output_name], atol=1e-05) + self.assertAllClose(ref_loss, loss, atol=1e-05) + self.assertAllClose(ref_mae, metric_value, atol=1e-05) + self.assertAllClose(ref_predict, predictions, atol=1e-05) # Load train graph, and check for the train op, and prediction values with session.Session(graph=ops.Graph()) as sess: From 42476f730edfefa02c66f2c27e532929c84f3aaf Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 7 Nov 2018 15:59:13 -0800 Subject: [PATCH 265/540] MNIST TF 2.0 integration test PiperOrigin-RevId: 220545522 --- tensorflow/examples/tf2_showcase/BUILD | 32 +++ tensorflow/examples/tf2_showcase/README.md | 25 ++ tensorflow/examples/tf2_showcase/mnist.py | 262 +++++++++++++++++++++ 3 files changed, 319 insertions(+) create mode 100644 tensorflow/examples/tf2_showcase/BUILD create mode 100644 tensorflow/examples/tf2_showcase/README.md create mode 100644 tensorflow/examples/tf2_showcase/mnist.py diff --git a/tensorflow/examples/tf2_showcase/BUILD b/tensorflow/examples/tf2_showcase/BUILD new file mode 100644 index 00000000000..922bc96b25b --- /dev/null +++ b/tensorflow/examples/tf2_showcase/BUILD @@ -0,0 +1,32 @@ +licenses(["notice"]) # Apache 2.0 + +package( + default_visibility = ["//visibility:private"], +) + +test_suite( + name = "all_tests", + tags = [ + "manual", + "no_oss", + "notap", + ], + tests = [ + ":mnist", + ], +) + +py_test( + name = "mnist", + srcs = ["mnist.py"], + tags = [ + "manual", + "no_oss", + "notap", + ], + deps = [ + "//tensorflow:tensorflow_py", + "//third_party/py/absl:app", + "//third_party/py/absl/flags", + ], +) diff --git a/tensorflow/examples/tf2_showcase/README.md b/tensorflow/examples/tf2_showcase/README.md new file mode 100644 index 00000000000..8211fb1d30d --- /dev/null +++ b/tensorflow/examples/tf2_showcase/README.md @@ -0,0 +1,25 @@ +# TF 2.0 Showcase + +The code here shows idiomatic ways to write TensorFlow 2.0 code. It doubles as +an integration test. + +## General guidelines for showcase code: + +- Code should minimize dependencies and be self-contained in one file. A user + should be able to copy-paste the example code into their project and have it + just work. +- Code should emphasize simplicity over performance, as long as it performs + within a factor of 2-3x of the optimized implementation. +- Code should work on CPU and single GPU. +- Code should run in Python 3. 
+- Code should conform to the [Google Python Style Guide](https://github.com/google/styleguide/blob/gh-pages/pyguide.md) + + +- Code should follow these guidelines: + - Prefer Keras. + - Split code into separate input pipeline and model code segments. + - Don't use tf.cond or tf.while_loop; instead, make use of AutoGraph's + functionality to compile Python `for`, `while`, and `if` statements. + - Prefer a simple training loop over Estimator + - Save and restore a SavedModel. + - Write basic TensorBoard metrics - loss, accuracy, diff --git a/tensorflow/examples/tf2_showcase/mnist.py b/tensorflow/examples/tf2_showcase/mnist.py new file mode 100644 index 00000000000..a4bfe4e53a8 --- /dev/null +++ b/tensorflow/examples/tf2_showcase/mnist.py @@ -0,0 +1,262 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""MNIST model training with TensorFlow eager execution. + +See: +https://research.googleblog.com/2017/10/eager-execution-imperative-define-by.html + +This program demonstrates training, export, and inference of a convolutional +neural network model with eager execution enabled. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import time + +from absl import app +from absl import flags +import numpy as np +import tensorflow as tf + +tfe = tf.contrib.eager + +flags.DEFINE_integer( + name='log_interval', + default=10, + help='batches between logging training status') + +flags.DEFINE_float(name='learning_rate', default=0.01, help='Learning rate.') + +flags.DEFINE_float( + name='momentum', short_name='m', default=0.5, help='SGD momentum.') + +flags.DEFINE_integer( + name='batch_size', + default=100, + help='Batch size to use during training / eval') + +flags.DEFINE_integer( + name='train_epochs', default=10, help='Number of epochs to train') + +flags.DEFINE_string( + name='model_dir', + default='/tmp/tensorflow/mnist', + help='Where to save checkpoints, tensorboard summaries, etc.') + +flags.DEFINE_bool( + name='clean', + default=False, + help='Whether to clear model directory before training') + +FLAGS = flags.FLAGS + + +def create_model(): + """Model to recognize digits in the MNIST dataset. + + Network structure is equivalent to: + https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/examples/tutorials/mnist/mnist_deep.py + and + https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py + But uses the tf.keras API. + Returns: + A tf.keras.Model. + """ + # Assumes data_format == 'channel_last'. + # See https://www.tensorflow.org/performance/performance_guide#data_formats + + input_shape = [28, 28, 1] + + l = tf.keras.layers + max_pool = l.MaxPooling2D((2, 2), (2, 2), padding='same') + # The model consists of a sequential chain of layers, so tf.keras.Sequential + # (a subclass of tf.keras.Model) makes for a compact description. 
+ model = tf.keras.Sequential( + [ + l.Reshape( + target_shape=input_shape, + input_shape=(28 * 28,)), + l.Conv2D(2, 5, padding='same', activation=tf.nn.relu), + max_pool, + l.Conv2D(4, 5, padding='same', activation=tf.nn.relu), + max_pool, + l.Flatten(), + l.Dense(32, activation=tf.nn.relu), + l.Dropout(0.4), + l.Dense(10) + ]) + # TODO(brianklee): Remove when @kaftan makes this happen by default. + # TODO(brianklee): remove `autograph=True` when kwarg default is flipped. + model.call = tfe.function(model.call, autograph=True) + # Needs to have input_signature specified in order to be exported + # since model.predict() is never called before saved_model.export() + # TODO(brianklee): Update with input signature, depending on how the impl of + # saved_model.restore() pans out. + model.predict = tfe.function(model.predict, autograph=True) + # ,input_signature=(tensor_spec.TensorSpec(shape=[28, 28, None], dtype=tf.float32),) # pylint: disable=line-too-long + return model + + +def mnist_datasets(): + (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data() + # Numpy defaults to dtype=float64; TF defaults to float32. Stick with float32. + x_train, x_test = x_train / np.float32(255), x_test / np.float32(255) + y_train, y_test = y_train.astype(np.int64), y_test.astype(np.int64) + train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) + test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test)) + return train_dataset, test_dataset + + +def loss(logits, labels): + return tf.reduce_mean( + tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=logits, labels=labels)) + + +def compute_accuracy(logits, labels): + predictions = tf.argmax(logits, axis=1, output_type=tf.int64) + labels = tf.cast(labels, tf.int64) + return tf.reduce_mean( + tf.cast(tf.equal(predictions, labels), dtype=tf.float32)) + + +# TODO(brianklee): Enable @tf.function on the training loop when zip, enumerate +# are supported by autograph. 
+def train(model, optimizer, dataset, step_counter, log_interval=None, + num_steps=None): + """Trains model on `dataset` using `optimizer`.""" + start = time.time() + for (batch, (images, labels)) in enumerate(dataset): + if num_steps is not None and batch > num_steps: + break + with tf.contrib.summary.record_summaries_every_n_global_steps( + 10, global_step=step_counter): + # Record the operations used to compute the loss given the input, + # so that the gradient of the loss with respect to the variables + # can be computed. + with tf.GradientTape() as tape: + logits = model(images, training=True) + loss_value = loss(logits, labels) + tf.contrib.summary.scalar('loss', loss_value) + tf.contrib.summary.scalar('accuracy', compute_accuracy(logits, labels)) + grads = tape.gradient(loss_value, model.variables) + optimizer.apply_gradients( + zip(grads, model.variables), global_step=step_counter) + if log_interval and batch % log_interval == 0: + rate = log_interval / (time.time() - start) + print('Step #%d\tLoss: %.6f (%d steps/sec)' % (batch, loss_value, rate)) + start = time.time() + + +def test(model, dataset): + """Perform an evaluation of `model` on the examples from `dataset`.""" + avg_loss = tfe.metrics.Mean('loss', dtype=tf.float32) + accuracy = tfe.metrics.Accuracy('accuracy', dtype=tf.float32) + + for (images, labels) in dataset: + logits = model(images, training=False) + avg_loss(loss(logits, labels)) + accuracy( + tf.argmax(logits, axis=1, output_type=tf.int64), + tf.cast(labels, tf.int64)) + print('Test set: Average loss: %.4f, Accuracy: %4f%%\n' % + (avg_loss.result(), 100 * accuracy.result())) + with tf.contrib.summary.always_record_summaries(): + tf.contrib.summary.scalar('loss', avg_loss.result()) + tf.contrib.summary.scalar('accuracy', accuracy.result()) + + +def train_and_export(flags_obj): + """Run MNIST training and eval loop in eager mode. + + Args: + flags_obj: An object containing parsed flag values. 
+ """ + # Load the datasets + train_ds, test_ds = mnist_datasets() + train_ds = train_ds.shuffle(60000).batch(flags_obj.batch_size) + test_ds = test_ds.batch(flags_obj.batch_size) + + # Create the model and optimizer + model = create_model() + optimizer = tf.train.MomentumOptimizer( + flags_obj.learning_rate, flags_obj.momentum) + + # See summaries with `tensorboard --logdir=` + train_dir = os.path.join(flags_obj.model_dir, 'summaries', 'train') + test_dir = os.path.join(flags_obj.model_dir, 'summaries', 'eval') + summary_writer = tf.contrib.summary.create_file_writer( + train_dir, flush_millis=10000) + test_summary_writer = tf.contrib.summary.create_file_writer( + test_dir, flush_millis=10000, name='test') + + # Create and restore checkpoint (if one exists on the path) + checkpoint_dir = os.path.join(flags_obj.model_dir, 'checkpoints') + checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt') + step_counter = tf.train.get_or_create_global_step() + checkpoint = tf.train.Checkpoint( + model=model, optimizer=optimizer, step_counter=step_counter) + # Restore variables on creation if a checkpoint exists. + checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir)) + + # Train and evaluate for a set number of epochs. + for _ in range(flags_obj.train_epochs): + start = time.time() + with summary_writer.as_default(): + train(model, optimizer, train_ds, step_counter, + flags_obj.log_interval, num_steps=1) + end = time.time() + print('\nTrain time for epoch #%d (%d total steps): %f' % + (checkpoint.save_counter.numpy() + 1, + step_counter.numpy(), + end - start)) + with test_summary_writer.as_default(): + test(model, test_ds) + checkpoint.save(checkpoint_prefix) + + # TODO(brianklee): Enable this functionality after @allenl implements this. 
+ # export_path = os.path.join(flags_obj.model_dir, 'export') + # tf.saved_model.save(export_path, model) + + +def import_and_eval(flags_obj): + export_path = os.path.join(flags_obj.model_dir, 'export') + model = tf.saved_model.restore(export_path) + _, (x_test, y_test) = tf.keras.datasets.mnist.load_data() + x_test = x_test / np.float32(255) + y_predict = model(x_test) + accuracy = compute_accuracy(y_predict, y_test) + print('Model accuracy: {:0.2f}%'.format(accuracy.numpy() * 100)) + + +def apply_clean(flags_obj): + if flags_obj.clean and tf.gfile.Exists(flags_obj.model_dir): + tf.logging.info('--clean flag set. Removing existing model dir: {}'.format( + flags_obj.model_dir)) + tf.gfile.DeleteRecursively(flags_obj.model_dir) + + +def main(_): + apply_clean(flags.FLAGS) + train_and_export(flags.FLAGS) + # TODO(brianklee): Enable this functionality after @allenl implements this. + # import_and_eval(flags.FLAGS) + + +if __name__ == '__main__': + app.run(main) From 7a7b72855e7894b169ae78f4b46f247552bb62cb Mon Sep 17 00:00:00 2001 From: Alexandre Passos Date: Wed, 7 Nov 2018 16:15:06 -0800 Subject: [PATCH 266/540] Pulls out variable initialization in tf.function().get_concrete_function PiperOrigin-RevId: 220548234 --- tensorflow/python/eager/def_function.py | 64 ++++++++++------ tensorflow/python/eager/function_test.py | 98 ++++++++++++------------ tensorflow/python/eager/lift_to_graph.py | 2 - 3 files changed, 89 insertions(+), 75 deletions(-) diff --git a/tensorflow/python/eager/def_function.py b/tensorflow/python/eager/def_function.py index 8c43c38e8b1..f4436a25bc7 100644 --- a/tensorflow/python/eager/def_function.py +++ b/tensorflow/python/eager/def_function.py @@ -51,6 +51,7 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable): name=None, dtype=None, constraint=None, + add_initializers_to=None, **unused_kwargs): """Creates a variable. 
@@ -81,6 +82,9 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable): variable and return the Tensor for the projected value (which must have the same shape). Constraints are not safe to use when doing asynchronous distributed training. + add_initializers_to: if not None and not in legacy graph mode, the + initializer tensor will be added to this map instead of adding the + assignment to the function. Raises: ValueError: If the initial value is not specified, or does not have a @@ -166,21 +170,24 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable): self._graph_element = value ops.add_to_collection(ops.GraphKeys.GLOBAL_VARIABLES, self) else: - def assign_fn(): - with ops.name_scope("Assign") as n, ops.colocate_with(self._handle): - resource_variable_ops.assign_variable_op( - self._handle, - initial_value, - name=n) - # Returning values to keep tf.cond happy. - return ops.convert_to_tensor(1) - def not_assign_fn(): - return ops.convert_to_tensor(0) - # Note: this cond is always guaranteed to run because we're inside a - # defun which will insert automatic control dependencies. - control_flow_ops.cond( - resource_variable_ops.var_is_initialized_op(self._handle), - not_assign_fn, assign_fn) + if add_initializers_to is not None: + add_initializers_to[self] = initial_value + else: + def assign_fn(): + with ops.name_scope("Assign") as n, ops.colocate_with(self._handle): + resource_variable_ops.assign_variable_op( + self._handle, + initial_value, + name=n) + # Returning values to keep tf.cond happy. + return ops.convert_to_tensor(1) + def not_assign_fn(): + return ops.convert_to_tensor(0) + # Note: this cond is always guaranteed to run because we're inside a + # defun which will insert automatic control dependencies. + control_flow_ops.cond( + resource_variable_ops.var_is_initialized_op(self._handle), + not_assign_fn, assign_fn) # After the handle has been created, set up a way to clean it up when # executing eagerly. 
We'll hold the only reference to the deleter, so that @@ -252,14 +259,15 @@ class PolymorphicFunction(object): input_signature=self._input_signature, experimental_autograph=self._autograph) - def _initialize(self, args, kwds): + def _initialize(self, args, kwds, add_initializers_to=None): """Initializes, on the first call.""" self._created_variables = [] def variable_capturing_scope(unused_next_creator, **kwds): """Creates UnliftedInitializerVariables and saves references to them.""" - v = UnliftedInitializerVariable(**kwds) + v = UnliftedInitializerVariable( + add_initializers_to=add_initializers_to, **kwds) self._created_variables.append(weakref.ref(v)) return v @@ -405,14 +413,22 @@ class PolymorphicFunction(object): Raises: ValueError: if this object has not yet been called on concrete values. """ - # TODO(apassos) figure out how to handle this case (what should we return - # here?) + assert context.executing_eagerly() if self._stateful_fn is None: - raise ValueError( - "Call this function with concrete values before asking for a" - " concrete function. Calling the function will ensure that, in" - " case this function creates variables, that those are properly" - " initialized.") + # Here we trace the function, collect the initializers, and attempt to + # extract them and run them eagerly. Fail only if we cannot do so. + initializer_map = {} + self._initialize(args, kwargs, add_initializers_to=initializer_map) + if not self._created_variables: + + @function + def initialize_variables(): + for v, init in initializer_map.items(): + v.assign(lift_to_graph.lift_to_graph( + init, ops.get_default_graph())[init]) + + initialize_variables() + if self._created_variables: # In this case we have created variables on the first call, so we run the # defunned version which is guaranteed to never create variables. 
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py index 6c93f977291..8682173a1dc 100644 --- a/tensorflow/python/eager/function_test.py +++ b/tensorflow/python/eager/function_test.py @@ -190,7 +190,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase): def testBasicGraphFunction(self): matmul = def_function.function(math_ops.matmul) - @function.defun + @def_function.function def sq(a): return matmul(a, a) @@ -204,7 +204,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase): def testInputSpecGraphFunction(self): matmul = def_function.function(math_ops.matmul) - @function.defun + @def_function.function def sq(a): return matmul(a, a) @@ -223,7 +223,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase): def testNestedInputSpecGraphFunction(self): matmul = def_function.function(math_ops.matmul) - @function.defun + @def_function.function def sq(mats): ((a, b),) = mats return matmul(a, b) @@ -347,7 +347,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase): pair = collections.namedtuple('pair', ['a', 'b']) - @function.defun + @def_function.function def a_times_b(inputs): return matmul(inputs.a['a'], inputs.b['b']) @@ -362,7 +362,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase): def testNestedOutputGraphFunction(self): matmul = def_function.function(math_ops.matmul) - @function.defun + @def_function.function def sq(a): return (matmul(a, a), {'b': constant_op.constant(1.0)}) @@ -381,7 +381,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase): def testGraphFunctionWithGradients(self): v = resource_variable_ops.ResourceVariable(1.0, name='v') - @function.defun + @def_function.function def step(): def inner(): return v * v @@ -394,7 +394,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase): self.assertAllEqual(step_op(), 2.0) def testGraphFunctionNoneOutput(self): - @function.defun + @def_function.function def fn(unused_a, unused_b): return None @@ -968,7 +968,7 
@@ class FunctionTest(test.TestCase, parameterized.TestCase): v_gpu = resource_variable_ops.ResourceVariable( [0.0, 1.0, 2.0], name='gpu') - @function.defun + @def_function.function def resource_apply_adam(): training_ops.resource_apply_adam( v_cpu.handle, @@ -1040,11 +1040,11 @@ class FunctionTest(test.TestCase, parameterized.TestCase): @test_util.run_in_graph_and_eager_modes def testNestedDifferentiableFunction(self): - @function.defun + @def_function.function def inner_fn(a, b): return a * math_ops.add(a, b) - @function.defun + @def_function.function def outer_fn(x): return inner_fn(x, 1.0) @@ -1058,19 +1058,19 @@ class FunctionTest(test.TestCase, parameterized.TestCase): @test_util.run_in_graph_and_eager_modes def testDeeplyNestedDifferentiableFunction(self): - @function.defun + @def_function.function def inner_inner_fn(a, b): return math_ops.add(a, b) - @function.defun + @def_function.function def inner_fn(a, b): return inner_inner_fn(a, b) - @function.defun + @def_function.function def middle_fn(a, b): return a * inner_fn(a, b) - @function.defun + @def_function.function def outer_fn(x): return middle_fn(x, 1.0) @@ -1084,15 +1084,15 @@ class FunctionTest(test.TestCase, parameterized.TestCase): @test_util.run_in_graph_and_eager_modes def testDeeplyNestedDifferentiableFunctionWithMultipleGradCalls(self): - @function.defun + @def_function.function def inner_fn(a, b): return math_ops.add(a, b) - @function.defun + @def_function.function def middle_fn(a, b): return math_ops.mul(a, inner_fn(a, b)) - @function.defun + @def_function.function def outer_fn(x): return middle_fn(x, 3.0) @@ -1132,19 +1132,19 @@ class FunctionTest(test.TestCase, parameterized.TestCase): @test_util.run_in_graph_and_eager_modes def testDeeplyNestedDifferentiableFunctionGradientTapeInDefun(self): - @function.defun + @def_function.function def inner_inner_fn(a, b): return math_ops.add(a, b) - @function.defun + @def_function.function def inner_fn(a, b): return inner_inner_fn(a, b) - 
@function.defun + @def_function.function def middle_fn(a, b): return a * inner_fn(a, b) - @function.defun + @def_function.function def outer_fn(x): with backprop.GradientTape() as tp: tp.watch(x) @@ -1158,19 +1158,19 @@ class FunctionTest(test.TestCase, parameterized.TestCase): @test_util.run_in_graph_and_eager_modes def testDeeplyNestedDifferentiableFunctionGradientTapeInNestedDefun(self): - @function.defun + @def_function.function def inner_inner_fn(a, b): return math_ops.add(a, b) - @function.defun + @def_function.function def inner_fn(a, b): return inner_inner_fn(a, b) - @function.defun + @def_function.function def middle_fn(a, b): return a * inner_fn(a, b) - @function.defun + @def_function.function def almost_outer_fn(x): with backprop.GradientTape() as tp: tp.watch(x) @@ -1178,7 +1178,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase): grad = tp.gradient(result, x) return grad - @function.defun + @def_function.function def outer_fn(x): return almost_outer_fn(x) @@ -1188,19 +1188,19 @@ class FunctionTest(test.TestCase, parameterized.TestCase): @test_util.run_in_graph_and_eager_modes def testDeeplyNestedDifferentiableFunctionGradientTapeInMultNestedDefun(self): - @function.defun + @def_function.function def inner_inner_fn(a, b): return math_ops.add(a, b) - @function.defun + @def_function.function def inner_fn(a, b): return inner_inner_fn(a, b) - @function.defun + @def_function.function def middle_fn(a, b): return a * inner_fn(a, b) - @function.defun + @def_function.function def almost_outer_fn(x): with backprop.GradientTape() as tp: tp.watch(x) @@ -1208,11 +1208,11 @@ class FunctionTest(test.TestCase, parameterized.TestCase): grad = tp.gradient(result, x) return grad - @function.defun + @def_function.function def outer_fn(x): return almost_outer_fn(x) - @function.defun + @def_function.function def outer_outer_fn(x): return outer_fn(x) @@ -1222,19 +1222,19 @@ class FunctionTest(test.TestCase, parameterized.TestCase): 
@test_util.run_in_graph_and_eager_modes def testDeeplyNestedDifferentiableFunctionTFGradientInDefun(self): - @function.defun + @def_function.function def inner_inner_fn(a, b): return math_ops.add(a, b) - @function.defun + @def_function.function def inner_fn(a, b): return inner_inner_fn(a, b) - @function.defun + @def_function.function def middle_fn(a, b): return a * inner_fn(a, b) - @function.defun + @def_function.function def outer_fn(x): result = middle_fn(x, 1.0) return gradients_impl.gradients(result, [x])[0] @@ -1245,24 +1245,24 @@ class FunctionTest(test.TestCase, parameterized.TestCase): @test_util.run_in_graph_and_eager_modes def testDeeplyNestedDifferentiableFunctionTFGradientInNestedDefun(self): - @function.defun + @def_function.function def inner_inner_fn(a, b): return math_ops.add(a, b) - @function.defun + @def_function.function def inner_fn(a, b): return inner_inner_fn(a, b) - @function.defun + @def_function.function def middle_fn(a, b): return a * inner_fn(a, b) - @function.defun + @def_function.function def almost_outer_fn(x): result = middle_fn(x, 1.0) return gradients_impl.gradients(result, [x])[0] - @function.defun + @def_function.function def outer_fn(x): return almost_outer_fn(x) @@ -1272,28 +1272,28 @@ class FunctionTest(test.TestCase, parameterized.TestCase): @test_util.run_in_graph_and_eager_modes def testDeeplyNestedDifferentiableFunctionTFGradientInMultNestedDefun(self): - @function.defun + @def_function.function def inner_inner_fn(a, b): return math_ops.add(a, b) - @function.defun + @def_function.function def inner_fn(a, b): return inner_inner_fn(a, b) - @function.defun + @def_function.function def middle_fn(a, b): return a * inner_fn(a, b) - @function.defun + @def_function.function def almost_outer_fn(x): result = middle_fn(x, 1.0) return gradients_impl.gradients(result, [x])[0] - @function.defun + @def_function.function def outer_fn(x): return almost_outer_fn(x) - @function.defun + @def_function.function def outer_outer_fn(x): return 
outer_fn(x) @@ -1461,7 +1461,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase): def add(a, b): return math_ops.add(a, b) - @function.defun + @def_function.function def add_one(x): return add(x, 1) @@ -1675,7 +1675,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase): with ops.device('gpu:0'): y = constant_op.constant(1.0) - @function.defun + @def_function.function def foo(): return test_ops.device_placement_op() diff --git a/tensorflow/python/eager/lift_to_graph.py b/tensorflow/python/eager/lift_to_graph.py index 67eb60289fb..c231264047b 100644 --- a/tensorflow/python/eager/lift_to_graph.py +++ b/tensorflow/python/eager/lift_to_graph.py @@ -37,10 +37,8 @@ def lift_to_graph(init_tensor, graph, sources=None): visited_ops = set([x.op for x in sources]) ops_to_visit = [init_tensor.op] op_outputs = collections.defaultdict(set) - print("ops_to_visit", ops_to_visit) while ops_to_visit: op = ops_to_visit.pop() - print("visiting", op) if op in visited_ops: continue visited_ops.add(op) From 0e9c44642c0ad83c55af6a8268865a4672fdca78 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 7 Nov 2018 16:39:07 -0800 Subject: [PATCH 267/540] AsyncCheckpoint: call after_save after saving checkpoints, and call before_save only once per save. 
PiperOrigin-RevId: 220551918 --- tensorflow/contrib/tpu/python/tpu/async_checkpoint.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py b/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py index c32bd5997c1..1cf7f9fcf67 100644 --- a/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py +++ b/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py @@ -164,14 +164,15 @@ class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook): SessionLog( status=SessionLog.CHECKPOINT, checkpoint_path=self._save_path), step) + + for l in self._listeners: + l.after_save(session, step) + end_time = time.time() logging.info("Checkpoint actual writing time: (%.3f sec)", end_time - start_time) logging.info("Checkpoint finished for %d into %s.", step, self._save_path) - for l in self._listeners: - l.before_save(session, step) - if not asynchronous: _save_fn() return From 224a23d5dc022cc13b8c67705cd3d3142b5f3957 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Wed, 7 Nov 2018 16:42:58 -0800 Subject: [PATCH 268/540] [TF] keras_style_scope: Decided for layers at __init__. A layer finds out if it's in a keras style scope at __init__ and stores this information, then uses keras style (or not) consistently through its lifetime. 
PiperOrigin-RevId: 220552411 --- tensorflow/python/kernel_tests/rnn_test.py | 23 +++++++++++----------- tensorflow/python/layers/base.py | 9 ++++++--- tensorflow/python/layers/base_test.py | 10 +++++----- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py index 2fb49638ef5..d62ebacf5a4 100644 --- a/tensorflow/python/kernel_tests/rnn_test.py +++ b/tensorflow/python/kernel_tests/rnn_test.py @@ -664,24 +664,25 @@ class RNNTest(test.TestCase): kn1 = KerasNetworkTFRNNs(name="kn1") kn2 = KerasNetworkKerasRNNs(name="kn2") - z = array_ops.zeros((2, 3)) + z = array_ops.zeros((2, 3)) - kn1(z) - kn2(z) + kn1(z) + kn2(z) - # pylint: disable=protected-access - self.assertTrue(all("kn1" in v.name for v in kn1._cell.variables)) - self.assertTrue(all("kn2" in v.name for v in kn2._cell.variables)) + # pylint: disable=protected-access + self.assertTrue(all("kn1" in v.name for v in kn1._cell.variables)) + self.assertTrue(all("kn2" in v.name for v in kn2._cell.variables)) + with base_layers.keras_style_scope(): kn1_new = KerasNetworkTFRNNs(name="kn1_new") kn2_new = KerasNetworkKerasRNNs(name="kn2_new") - kn2_new(z) - # Most importantly, this doesn't fail due to variable scope reuse issues. - kn1_new(z) + kn2_new(z) + # Most importantly, this doesn't fail due to variable scope reuse issues. 
+ kn1_new(z) - self.assertTrue(all("kn1_new" in v.name for v in kn1_new._cell.variables)) - self.assertTrue(all("kn2_new" in v.name for v in kn2_new._cell.variables)) + self.assertTrue(all("kn1_new" in v.name for v in kn1_new._cell.variables)) + self.assertTrue(all("kn2_new" in v.name for v in kn2_new._cell.variables)) ######### Benchmarking RNN code diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py index 24f6a098b35..fccea484b0f 100644 --- a/tensorflow/python/layers/base.py +++ b/tensorflow/python/layers/base.py @@ -208,6 +208,9 @@ class Layer(base_layer.Layer): raise ValueError( 'reuse argument not allowed when keras style layers are enabled, ' 'but saw: {}'.format(self._reuse)) + self._keras_style = True + else: + self._keras_style = False self._graph = None self._call_has_scope_arg = 'scope' in self._call_fn_args @@ -275,7 +278,7 @@ class Layer(base_layer.Layer): def _name_scope(self): """Determines op naming for the Layer.""" - if _is_in_keras_style_scope(): + if self._keras_style: return super(Layer, self)._name_scope() return self._current_scope.original_name_scope @@ -349,7 +352,7 @@ class Layer(base_layer.Layer): ValueError: When trainable has been set to True with synchronization set as `ON_READ`. 
""" - if _is_in_keras_style_scope(): + if self._keras_style: return super(Layer, self).add_weight( name=name, shape=shape, @@ -477,7 +480,7 @@ class Layer(base_layer.Layer): """ scope = kwargs.pop('scope', None) - if _is_in_keras_style_scope(): + if self._keras_style: if scope is not None: raise ValueError( 'scope argument not allowed when keras style layers are enabled, ' diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py index 7a0ed63a491..90abf35e875 100644 --- a/tensorflow/python/layers/base_test.py +++ b/tensorflow/python/layers/base_test.py @@ -76,11 +76,11 @@ class BaseLayerTest(test.TestCase): self.assertEqual(variable.name, 'my_layer/my_var:0') with base_layers.keras_style_scope(): - with ops.name_scope('bar'): - layer = base_layers.Layer(name='my_layer') - # Test basic variable creation. - variable = layer.add_variable( - 'my_var', [2, 2], initializer=init_ops.zeros_initializer()) + layer = base_layers.Layer(name='my_layer') + # Test basic variable creation. 
+ with ops.name_scope('bar'): + variable = layer.add_variable( + 'my_var', [2, 2], initializer=init_ops.zeros_initializer()) self.assertEqual(variable.name, 'bar/my_var:0') @test_util.run_in_graph_and_eager_modes From 81a35aa53f7bf56d2e22ce506cc5ee77624502c3 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 7 Nov 2018 17:00:40 -0800 Subject: [PATCH 269/540] Fix the issue for MaxPool3DGrad --- tensorflow/python/ops/nn_grad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py index 902653befc4..e1e998a8d60 100644 --- a/tensorflow/python/ops/nn_grad.py +++ b/tensorflow/python/ops/nn_grad.py @@ -179,7 +179,7 @@ def _AvgPool3DGradGrad(op, grad): def _MaxPool3DGrad(op, grad): return gen_nn_ops.max_pool3d_grad( op.inputs[0], - op.outputs[0], + op.inputs[0], grad, ksize=op.get_attr("ksize"), strides=op.get_attr("strides"), From 2f999200830f267b25fcade4bb91460df6c634cd Mon Sep 17 00:00:00 2001 From: Katherine Wu Date: Wed, 7 Nov 2018 16:54:09 -0800 Subject: [PATCH 270/540] Fix Sequential model cloning to copy InputLayer when input_tensors are not specified. PiperOrigin-RevId: 220553954 --- tensorflow/python/keras/models.py | 11 +++- tensorflow/python/keras/models_test.py | 70 ++++++++++++++++++++------ 2 files changed, 65 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/keras/models.py b/tensorflow/python/keras/models.py index 0c9c066a852..225c6c6af8e 100644 --- a/tensorflow/python/keras/models.py +++ b/tensorflow/python/keras/models.py @@ -206,10 +206,17 @@ def _clone_sequential_model(model, input_tensors=None): def clone(layer): return layer.__class__.from_config(layer.get_config()) - layers = [clone(layer) for layer in model.layers] + # Use model._layers to ensure that all layers are cloned. The model's layers + # property will exclude the initial InputLayer (if it exists) in the model, + # resulting in a different Sequential model structure. 
+ layers = [clone(layer) for layer in model._layers] if input_tensors is None: return Sequential(layers=layers, name=model.name) else: + # If input tensors are provided, the original model's InputLayer is + # overwritten with a different InputLayer. + if isinstance(layers[0], InputLayer): + layers = layers[1:] if len(generic_utils.to_list(input_tensors)) != 1: raise ValueError('To clone a `Sequential` model, we expect ' ' at most one tensor ' @@ -452,7 +459,7 @@ def clone_and_build_model( if all([isinstance(clone, Sequential), not clone._is_graph_network, - model.built]): + getattr(model, '_build_input_shape', None) is not None]): # Set model inputs to build the model and add input/output properties. # TODO(kathywu): Add multiple placeholders to handle edge case where # sequential model has multiple inputs. diff --git a/tensorflow/python/keras/models_test.py b/tensorflow/python/keras/models_test.py index 36875cf984f..bf778f14971 100644 --- a/tensorflow/python/keras/models_test.py +++ b/tensorflow/python/keras/models_test.py @@ -50,6 +50,21 @@ class TestModel(keras.Model): return self.layer1(x) +def sequential_model(add_input_layer, include_input_shape=True): + model = keras.models.Sequential() + if add_input_layer: + model.add(keras.layers.InputLayer(input_shape=(4,))) + model.add(keras.layers.Dense(4)) + elif include_input_shape: + model.add(keras.layers.Dense(4, input_shape=(4,))) + else: + model.add(keras.layers.Dense(4)) + model.add(keras.layers.BatchNormalization()) + model.add(keras.layers.Dropout(0.5)) + model.add(keras.layers.Dense(4)) + return model + + class TestModelCloning(test.TestCase): def test_clone_sequential_model(self): @@ -57,11 +72,7 @@ class TestModelCloning(test.TestCase): val_a = np.random.random((10, 4)) val_out = np.random.random((10, 4)) - model = keras.models.Sequential() - model.add(keras.layers.Dense(4, input_shape=(4,))) - model.add(keras.layers.BatchNormalization()) - model.add(keras.layers.Dropout(0.5)) - 
model.add(keras.layers.Dense(4)) + model = sequential_model(False) # Everything should work in a new session. keras.backend.clear_session() @@ -76,20 +87,55 @@ class TestModelCloning(test.TestCase): # On top of new tensor input_a = keras.Input(shape=(4,)) - new_model = keras.models.clone_model( - model, input_tensors=input_a) + new_model = keras.models.clone_model(model, input_tensors=input_a) self.assertEquals(len(new_model.get_updates_for(new_model.inputs)), 2) new_model.compile('rmsprop', 'mse') new_model.train_on_batch(val_a, val_out) # On top of new, non-Keras tensor input_a = keras.backend.variable(val_a) - new_model = keras.models.clone_model( - model, input_tensors=input_a) + new_model = keras.models.clone_model(model, input_tensors=input_a) self.assertEquals(len(new_model.get_updates_for(new_model.inputs)), 2) new_model.compile('rmsprop', 'mse') new_model.train_on_batch(None, val_out) + def test_clone_sequential_model_input_layer(self): + def test_input_layer(include_inputs): + with self.cached_session(): + val_a = np.random.random((10, 4)) + model = sequential_model(include_inputs, include_inputs) + # Sanity check + self.assertEqual( + isinstance(model._layers[0], keras.layers.InputLayer), + include_inputs) + self.assertEqual(model._is_graph_network, include_inputs) + + keras.backend.clear_session() + with self.cached_session(): + # With placeholder creation -- clone model should have an InputLayer + # if the original model has one. + new_model = keras.models.clone_model(model) + self.assertEqual( + isinstance(new_model._layers[0], keras.layers.InputLayer), + include_inputs) + self.assertEqual(new_model._is_graph_network, model._is_graph_network) + + # On top of new tensor -- clone model should always have an InputLayer. 
+ input_a = keras.Input(shape=(4,)) + new_model = keras.models.clone_model(model, input_tensors=input_a) + self.assertIsInstance(new_model._layers[0], keras.layers.InputLayer) + self.assertTrue(new_model._is_graph_network) + + # On top of new, non-Keras tensor -- clone model should always have an + # InputLayer. + input_a = keras.backend.variable(val_a) + new_model = keras.models.clone_model(model, input_tensors=input_a) + self.assertIsInstance(new_model._layers[0], keras.layers.InputLayer) + self.assertTrue(new_model._is_graph_network) + + test_input_layer(True) + test_input_layer(False) + def test_clone_functional_model(self): with self.cached_session(): val_a = np.random.random((10, 4)) @@ -401,11 +447,7 @@ class TestCloneAndBuildModel(test.TestCase): def test_clone_and_build_sequential_model_without_inputs_defined(self): with self.cached_session(): - model = keras.models.Sequential() - model.add(keras.layers.Dense(4)) - model.add(keras.layers.BatchNormalization()) - model.add(keras.layers.Dropout(0.5)) - model.add(keras.layers.Dense(4)) + model = sequential_model(False, False) model.compile('rmsprop', 'mse', metrics=['acc', metrics.categorical_accuracy]) self._clone_and_build_test_helper(model, False) From 6c40bb5ebb7538c26c23a5114a18dc8909e3df2a Mon Sep 17 00:00:00 2001 From: Ian Langmore Date: Wed, 7 Nov 2018 16:54:13 -0800 Subject: [PATCH 271/540] tf.linalg.matvec: Use tf.squeeze and expand_dims rather than implicit [..., newaxis] and [..., 0]. I was mistaken in thinking the latter had better performance. There is no verified difference. 
PiperOrigin-RevId: 220553967 --- tensorflow/python/ops/math_ops.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index d247e7b2463..af50aec429b 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -2145,17 +2145,14 @@ def matvec(a, ValueError: If transpose_a and adjoint_a are both set to True. """ with ops.name_scope(name, "MatVec", [a, b]) as name: - # matvec is achieved by reshaping b into a matrix (appending a singleton), - # then squeezing out the trailing dim of the result. There are other ways - # to do this, e.g. using tf.expand_dims and tf.squeeze. What we have here - # has been found to be most memory efficient on TPU. - return matmul( + output = matmul( a, - b[..., array_ops.newaxis], + array_ops.expand_dims(b, axis=-1), transpose_a=transpose_a, adjoint_a=adjoint_a, a_is_sparse=a_is_sparse, - b_is_sparse=b_is_sparse)[..., 0] + b_is_sparse=b_is_sparse) + return array_ops.squeeze(output, axis=-1) _OverrideBinaryOperatorHelper(matmul, "matmul") From 9cc5cce6990a61cd23a65dda7093f93b69804433 Mon Sep 17 00:00:00 2001 From: Igor Ganichev Date: Wed, 7 Nov 2018 17:20:16 -0800 Subject: [PATCH 272/540] core_cpu_base should not depend on core_cpu_impl PiperOrigin-RevId: 220557603 --- tensorflow/core/BUILD | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index b6e818e0a70..afde912cb5f 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -2813,7 +2813,6 @@ tf_cuda_library( ":functional_ops_op_lib", "//tensorflow/core/kernels:bounds_check", "//tensorflow/core/kernels:required", - ":core_cpu_impl", ]), alwayslink = 1, ) From c6914229cb411517df1a12066ae81e891480df2f Mon Sep 17 00:00:00 2001 From: Asim Shankar Date: Wed, 7 Nov 2018 17:27:33 -0800 Subject: [PATCH 273/540] Fix contrib/cudnn_rnn tests. 
These tests started failing after https://github.com/tensorflow/tensorflow/commit/4cedc8b6e738b7a188c9c091cf667bacafae44b7 variables.Variable is the 2.0-style class - where the notion of global collections is being done away with (as per https://github.com/tensorflow/community/pull/17) and hence the call to sess.run(tf.global_variables_initializer()) is a no-op. For now, switch the tests to use variable.VariableV1. In a follow up, these tests need to be updated so that they do not rely on global collections (i.e., do not call tf.global_variables_initializer()). PiperOrigin-RevId: 220558354 --- .../python/kernel_tests/cudnn_rnn_ops_test.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py index c59d3682d40..4aa151dba55 100644 --- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py +++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py @@ -202,7 +202,7 @@ class CudnnRNNTestSaveRestore(TensorFlowTestCase): dtype=dtype) random_seed.set_random_seed(1234) params_size_t = model.params_size() - params = variables.Variable( + params = variables.VariableV1( random_ops.random_uniform([params_size_t], dtype=dtype), dtype=dtype, validate_shape=False) @@ -248,7 +248,7 @@ class CudnnRNNTestSaveRestore(TensorFlowTestCase): params_size_t = model.params_size() names = ["rnn_1", "rnn_2"] param_vars = [ - variables.Variable( + variables.VariableV1( random_ops.random_uniform([params_size_t], dtype=dtype), dtype=dtype, validate_shape=False) for name in names @@ -304,7 +304,7 @@ class CudnnRNNTestSaveRestore(TensorFlowTestCase): direction=direction, dtype=dtype) params_size_t = model.params_size() - params = variables.Variable( + params = variables.VariableV1( array_ops.ones([params_size_t], dtype=dtype), validate_shape=False, dtype=dtype) @@ -458,7 +458,7 @@ class 
CudnnRNNTestInference(TensorFlowTestCase): params_size_t = model.params_size() input_data = array_ops.ones([seq_length, batch_size, input_size]) input_h = array_ops.ones([num_layers * dir_count, batch_size, num_units]) - params = variables.Variable( + params = variables.VariableV1( array_ops.ones([params_size_t]), validate_shape=False) if has_input_c: input_c = array_ops.ones([num_layers * dir_count, batch_size, num_units]) @@ -584,20 +584,20 @@ class CudnnRNNTestTraining(TensorFlowTestCase): dtype=dtype, dropout=dropout) params_size_t = model.params_size() - input_data = variables.Variable( + input_data = variables.VariableV1( random_ops.random_uniform( [seq_length, batch_size, input_size], dtype=dtype), dtype=dtype) - input_h = variables.Variable( + input_h = variables.VariableV1( random_ops.random_uniform( [num_layers * dir_count, batch_size, num_units], dtype=dtype), dtype=dtype) - params = variables.Variable( + params = variables.VariableV1( random_ops.random_uniform([params_size_t], dtype=dtype), validate_shape=False, dtype=dtype) if has_input_c: - input_c = variables.Variable( + input_c = variables.VariableV1( random_ops.random_uniform( [num_layers * dir_count, batch_size, num_units], dtype=dtype), dtype=dtype) From c070362d981e3bbeb8b69fa9f9c701bec8b9c40b Mon Sep 17 00:00:00 2001 From: Igor Ganichev Date: Wed, 7 Nov 2018 17:30:11 -0800 Subject: [PATCH 274/540] Fix a bug in eager FingerprintCat128 The buggy version was ignoring the high 64 bits. 
PiperOrigin-RevId: 220558690 --- tensorflow/core/common_runtime/eager/attr_builder.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/common_runtime/eager/attr_builder.cc b/tensorflow/core/common_runtime/eager/attr_builder.cc index 29edc4e3b8f..201f06242f8 100644 --- a/tensorflow/core/common_runtime/eager/attr_builder.cc +++ b/tensorflow/core/common_runtime/eager/attr_builder.cc @@ -184,7 +184,7 @@ namespace { inline tensorflow::Fprint128 FingerprintCat128(const tensorflow::Fprint128& a, const tensorflow::Fprint128& b) { return {tensorflow::FingerprintCat64(a.low64, b.low64), - tensorflow::FingerprintCat64(a.low64, b.low64)}; + tensorflow::FingerprintCat64(a.high64, b.high64)}; } void CombineUnordered(const tensorflow::Fprint128& a, From b4c0a5edf2345dc9210ed6cc6804f7b89df9029e Mon Sep 17 00:00:00 2001 From: Yanhui Liang Date: Wed, 7 Nov 2018 17:32:09 -0800 Subject: [PATCH 275/540] Fix bugs in Mean metric. PiperOrigin-RevId: 220558930 --- tensorflow/python/keras/metrics.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py index 33e526352fa..2ea64055979 100644 --- a/tensorflow/python/keras/metrics.py +++ b/tensorflow/python/keras/metrics.py @@ -521,10 +521,12 @@ class Mean(Metric): values = math_ops.multiply(values, sample_weight) values = math_ops.reduce_sum(values) - # Update state variables + # Update state variables. Count should be updated only when total is + # updated. 
update_total_op = state_ops.assign_add(self.total, values) - update_count_op = state_ops.assign_add(self.count, num_values) - return control_flow_ops.group(update_total_op, update_count_op) + with ops.control_dependencies([update_total_op]): + update_count_op = state_ops.assign_add(self.count, num_values) + return ops.convert_to_tensor(update_count_op) def result(self): return safe_div(self.total, self.count) From 483d0aeca48e52c8690cfc6274561aa349a65df3 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Wed, 7 Nov 2018 17:39:04 -0800 Subject: [PATCH 276/540] Fix some signed unsigned comparisons. PiperOrigin-RevId: 220559797 --- tensorflow/lite/interpreter.cc | 9 +++++---- tensorflow/lite/optional_debug_tools.cc | 15 ++++++++------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/tensorflow/lite/interpreter.cc b/tensorflow/lite/interpreter.cc index bff7145de99..831fde6e2f6 100644 --- a/tensorflow/lite/interpreter.cc +++ b/tensorflow/lite/interpreter.cc @@ -154,7 +154,7 @@ Interpreter::~Interpreter() { node.builtin_data = nullptr; } - for (int i = 0; i < context_.tensors_size; i++) { + for (size_t i = 0; i < context_.tensors_size; i++) { TfLiteTensor* tensor = &context_.tensors[i]; if (tensor->buffer_handle != kTfLiteNullBufferHandle && tensor->delegate->FreeBufferHandle != nullptr) { @@ -729,10 +729,10 @@ void Interpreter::ReportError(TfLiteContext* context, const char* format, ...) 
{ TfLiteStatus Interpreter::AddTensors(int tensors_to_add, int* first_new_tensor_index) { - int base_index = tensors_.size(); + const size_t base_index = tensors_.size(); if (first_new_tensor_index) *first_new_tensor_index = base_index; tensors_.resize(tensors_.size() + tensors_to_add); - for (int i = base_index; i < tensors_.size(); i++) { + for (size_t i = base_index; i < tensors_.size(); i++) { memset(&tensors_[i], 0, sizeof(tensors_[i])); tensors_[i].buffer_handle = kTfLiteNullBufferHandle; } @@ -752,7 +752,8 @@ TfLiteStatus Interpreter::AddTensors(TfLiteContext* context, int tensors_to_add, TfLiteStatus Interpreter::GetNodeAndRegistration( int node_index, TfLiteNode** node, TfLiteRegistration** registration) { - TF_LITE_ENSURE(&context_, node_index < nodes_size() && node_index >= 0); + TF_LITE_ENSURE(&context_, node_index >= 0); + TF_LITE_ENSURE(&context_, static_cast(node_index) < nodes_size()); TF_LITE_ENSURE(&context_, node != nullptr && registration != nullptr); *node = &nodes_and_registration_[node_index].first; *registration = &nodes_and_registration_[node_index].second; diff --git a/tensorflow/lite/optional_debug_tools.cc b/tensorflow/lite/optional_debug_tools.cc index 020d1d8de5f..5ee1cf6d33d 100644 --- a/tensorflow/lite/optional_debug_tools.cc +++ b/tensorflow/lite/optional_debug_tools.cc @@ -83,26 +83,27 @@ void PrintInterpreterState(Interpreter* interpreter) { printf("Outputs:"); PrintIntVector(interpreter->outputs()); printf("\n"); - for (int tensor_index = 0; tensor_index < interpreter->tensors_size(); + for (size_t tensor_index = 0; tensor_index < interpreter->tensors_size(); tensor_index++) { - TfLiteTensor* tensor = interpreter->tensor(tensor_index); - printf("Tensor %3d %-20s %10s %15s %10zu bytes (%4.1f MB) ", tensor_index, + TfLiteTensor* tensor = interpreter->tensor(static_cast(tensor_index)); + printf("Tensor %3zu %-20s %10s %15s %10zu bytes (%4.1f MB) ", tensor_index, tensor->name, TensorTypeName(tensor->type), 
AllocTypeName(tensor->allocation_type), tensor->bytes, (static_cast(tensor->bytes) / (1 << 20))); PrintTfLiteIntVector(tensor->dims); } printf("\n"); - for (int node_index = 0; node_index < interpreter->nodes_size(); + for (size_t node_index = 0; node_index < interpreter->nodes_size(); node_index++) { const std::pair* node_and_reg = - interpreter->node_and_registration(node_index); + interpreter->node_and_registration(static_cast(node_index)); const TfLiteNode& node = node_and_reg->first; const TfLiteRegistration& reg = node_and_reg->second; if (reg.custom_name != nullptr) { - printf("Node %3d Operator Custom Name %s\n", node_index, reg.custom_name); + printf("Node %3zu Operator Custom Name %s\n", node_index, + reg.custom_name); } else { - printf("Node %3d Operator Builtin Code %3d\n", node_index, + printf("Node %3zu Operator Builtin Code %3d\n", node_index, reg.builtin_code); } printf(" Inputs:"); From ef6e793e21558dc49264483f5417215495ca27c9 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Wed, 7 Nov 2018 18:48:41 -0800 Subject: [PATCH 277/540] Avoid using non-OK Status for non-errors IncreaseDynamismForAutoJitPass used to use non-OK Status to signal non-errors like a Slice op not having a Const size. This was a bad idea for two reasons: - It mixes errors (that should be propagated out of ::Run) with non errors (that are just regular control flow within the pass). - It results in confusing LOG messages because we get (e.g.) TF_RET_CHECK failures even in successful runs. This CL changes the pass to use regular control flow instead of non-OK Statuses. 
PiperOrigin-RevId: 220566824 --- .../increase_dynamism_for_auto_jit_pass.cc | 88 ++++++++++++------- 1 file changed, 58 insertions(+), 30 deletions(-) diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc index bd8719b7f1a..d984ca15cb7 100644 --- a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc +++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc @@ -18,6 +18,7 @@ limitations under the License. #include "absl/container/inlined_vector.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_replace.h" +#include "absl/types/optional.h" #include "tensorflow/cc/framework/scope_internal.h" #include "tensorflow/cc/ops/array_ops.h" #include "tensorflow/cc/ops/const_op.h" @@ -34,14 +35,30 @@ limitations under the License. namespace tensorflow { namespace { -Status GetTensorFromConstOp(Node* n, Tensor* out_tensor) { - TF_RET_CHECK(n->type_string() == "Const"); + +// StatusOrOptional instances hold +// +// - A non-OK Status to indicate an error that needs to be propagated out of +// this pass (e.g. the Graph is malformed). +// +// - A nullopt to indicate the function that created the instance failed to do +// what it set out to do but this is not actually an error +// (e.g. TryToGetTensorFromConstOp was passed a non-Const node). +// +// - A T to indicate a successful operation. 
+template +using StatusOrOptional = xla::StatusOr>; + +StatusOrOptional TryToGetTensorFromConstOp(Node* n) { + if (n->type_string() != "Const") { + return {absl::nullopt}; + } + const TensorProto* proto = nullptr; TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "value", &proto)); Tensor tensor(proto->dtype()); TF_RET_CHECK(tensor.FromProto(*proto)); - *out_tensor = std::move(tensor); - return Status::OK(); + return {tensor}; } struct SliceInputs { @@ -70,7 +87,7 @@ std::vector IntTensorAsVector(const Tensor& t) { // Packages up the inputs to a Slice operation into an instance of // `SliceInputs`. -Status GetSliceInputs(Node* slice, SliceInputs* slice_inputs) { +StatusOrOptional GetSliceInputs(Node* slice) { const int kSliceInputIndex = 0; const int kSliceBeginIndex = 1; const int kSliceSizeIndex = 2; @@ -81,23 +98,27 @@ Status GetSliceInputs(Node* slice, SliceInputs* slice_inputs) { TF_RETURN_IF_ERROR(slice->input_edge(kSliceSizeIndex, &slice_size_edge)); const Edge* slice_begin_edge; TF_RETURN_IF_ERROR(slice->input_edge(kSliceBeginIndex, &slice_begin_edge)); - slice_inputs->input = + + SliceInputs slice_inputs; + slice_inputs.input = Output(slice_input_edge->src(), slice_input_edge->src_output()); - slice_inputs->begin = + slice_inputs.begin = Output(slice_begin_edge->src(), slice_begin_edge->src_output()); - slice_inputs->size = + slice_inputs.size = Output(slice_size_edge->src(), slice_size_edge->src_output()); - Tensor tf_slice_size; - TF_RETURN_IF_ERROR( - GetTensorFromConstOp(slice_inputs->size.node(), &tf_slice_size)); - - if (tf_slice_size.dims() != 1) { - return errors::Internal("Expected vector for the slice size input."); + TF_ASSIGN_OR_RETURN(absl::optional tf_slice_size, + TryToGetTensorFromConstOp(slice_inputs.size.node())); + if (!tf_slice_size.has_value()) { + return {absl::nullopt}; } - slice_inputs->size_as_vector = IntTensorAsVector(tf_slice_size); - return Status::OK(); + if (tf_slice_size->dims() != 1) { + return {absl::nullopt}; + } + + 
slice_inputs.size_as_vector = IntTensorAsVector(*tf_slice_size); + return {slice_inputs}; } // Casts `x` to a DT_INT64 if it isn't one already. @@ -263,36 +284,43 @@ Status RewriteSlice(Graph* g, Node* slice, const SliceInputs& slice_inputs, return Status::OK(); } -// Returns true if `n` is a slice we can rewrite to have a static shape -// (i.e. have the output shape only depend on the "size" input). Fills in -// `slice_inputs` in the process. -bool IsRewritableSlice(Node* n, SliceInputs* slice_inputs) { +// If `n` is a slice we can rewrite to have a static shape (i.e. have the output +// shape only depend on the "size" input) then returns the a SliceInputs +// representing the inputs to `n`. Otherwise returns nullopt. +StatusOrOptional IsRewritableSlice(Node* n) { if (n->type_string() != "Slice") { - return false; + return {absl::nullopt}; } if (!GetXlaClusterForNode(*n).has_value()) { // There is no need to change slice ops outside XLA clusters. - return false; + return {absl::nullopt}; } - if (!GetSliceInputs(n, slice_inputs).ok()) { - // Could not parse slice inputs. E.g. the sizes input was not a constant. - return false; + TF_ASSIGN_OR_RETURN(absl::optional slice_inputs, + GetSliceInputs(n)); + if (!slice_inputs.has_value()) { + return {absl::nullopt}; } // If slice_size[i] < -1 for any i then executing the slice will throw an // error, and we don't do anything here. 
- return absl::c_all_of(slice_inputs->size_as_vector, - [](int64 size_i) { return size_i >= -1; }); + bool slice_is_ok = absl::c_all_of(slice_inputs->size_as_vector, + [](int64 size_i) { return size_i >= -1; }); + if (!slice_is_ok) { + return {absl::nullopt}; + } + + return slice_inputs; } Status FindAndRewriteSlices(Graph* g, bool* changed) { std::vector> slices_to_rewrite; for (Node* n : g->nodes()) { - SliceInputs slice_inputs; - if (IsRewritableSlice(n, &slice_inputs)) { - slices_to_rewrite.push_back({n, std::move(slice_inputs)}); + TF_ASSIGN_OR_RETURN(absl::optional slice_inputs, + IsRewritableSlice(n)); + if (slice_inputs.has_value()) { + slices_to_rewrite.push_back({n, std::move(*slice_inputs)}); } } From bb878129e1204d07a93b6f54e1bdbb10c82b9acc Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Wed, 7 Nov 2018 18:52:04 -0800 Subject: [PATCH 278/540] BoostedTrees support float feature columns. PiperOrigin-RevId: 220567110 --- .../api_def_BoostedTreesBucketize.pbtxt | 4 +- ...ef_BoostedTreesMakeQuantileSummaries.pbtxt | 9 +- ...eesQuantileStreamResourceDeserialize.pbtxt | 26 ++++++ .../kernels/boosted_trees/quantile_ops.cc | 91 +++++++++++++++--- tensorflow/core/ops/boosted_trees_ops.cc | 21 +++-- .../boosted_trees/quantile_ops_test.py | 92 ++++++++++++++++--- tensorflow/python/ops/boosted_trees_ops.py | 66 +++++++++++++ ....estimator.-boosted-trees-classifier.pbtxt | 2 +- ...w.estimator.-boosted-trees-regressor.pbtxt | 2 +- ....estimator.-boosted-trees-classifier.pbtxt | 2 +- ...w.estimator.-boosted-trees-regressor.pbtxt | 2 +- 11 files changed, 274 insertions(+), 43 deletions(-) create mode 100644 tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceDeserialize.pbtxt diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesBucketize.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesBucketize.pbtxt index cdaeb5091c7..bfaf3d2ea59 100644 --- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesBucketize.pbtxt +++ 
b/tensorflow/core/api_def/base_api/api_def_BoostedTreesBucketize.pbtxt @@ -4,7 +4,7 @@ op { in_arg { name: "float_values" description: <