enable sparse_matmul and immutable_constant kernels for windows (#5290)

* enable sparse_matmul for windows.

* enable immutable_constant op on windows

* use char const* instead of char * to get rid of compiler warnings
This commit is contained in:
guschmue 2016-10-31 10:07:08 -07:00 committed by Vijay Vasudevan
parent 7cd00438e7
commit 1511dd4bad
6 changed files with 63 additions and 21 deletions

View File

@ -87,11 +87,7 @@ if(WIN32)
# not working on windows yet # not working on windows yet
"${tensorflow_source_dir}/tensorflow/core/kernels/depthwise_conv_op.cc" # Cannot find symbol: tensorflow::LaunchConv2DOp<struct Eigen::ThreadPoolDevice, double>::launch(...). "${tensorflow_source_dir}/tensorflow/core/kernels/depthwise_conv_op.cc" # Cannot find symbol: tensorflow::LaunchConv2DOp<struct Eigen::ThreadPoolDevice, double>::launch(...).
"${tensorflow_source_dir}/tensorflow/core/kernels/fact_op.cc" "${tensorflow_source_dir}/tensorflow/core/kernels/fact_op.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/immutable_constant_op.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/immutable_constant_op.h"
"${tensorflow_source_dir}/tensorflow/core/kernels/meta_support.*" "${tensorflow_source_dir}/tensorflow/core/kernels/meta_support.*"
"${tensorflow_source_dir}/tensorflow/core/kernels/sparse_matmul_op.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/sparse_matmul_op.h"
"${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.h" "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.h"
"${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.cc" "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/svd*.cc" "${tensorflow_source_dir}/tensorflow/core/kernels/svd*.cc"

View File

@ -96,9 +96,9 @@ void ImmutableConstantOp::Compute(OpKernelContext* ctx) {
} }
ImmutableConstantOp::~ImmutableConstantOp() {} ImmutableConstantOp::~ImmutableConstantOp() {}
constexpr char ImmutableConstantOp::kDTypeAttr[]; constexpr char const* ImmutableConstantOp::kDTypeAttr;
constexpr char ImmutableConstantOp::kShapeAttr[]; constexpr char const* ImmutableConstantOp::kShapeAttr;
constexpr char ImmutableConstantOp::kMemoryRegionNameAttr[]; constexpr char const* ImmutableConstantOp::kMemoryRegionNameAttr;
REGISTER_KERNEL_BUILDER(Name("ImmutableConst").Device(DEVICE_CPU), REGISTER_KERNEL_BUILDER(Name("ImmutableConst").Device(DEVICE_CPU),
ImmutableConstantOp); ImmutableConstantOp);

View File

@ -33,9 +33,9 @@ class ImmutableConstantOp : public OpKernel {
~ImmutableConstantOp() override; ~ImmutableConstantOp() override;
// Names of attributes that are used by this op // Names of attributes that are used by this op
static constexpr char kDTypeAttr[] = "dtype"; static constexpr char const* kDTypeAttr = "dtype";
static constexpr char kShapeAttr[] = "shape"; static constexpr char const* kShapeAttr = "shape";
static constexpr char kMemoryRegionNameAttr[] = "memory_region_name"; static constexpr char const* kMemoryRegionNameAttr = "memory_region_name";
private: private:
string region_name_; string region_name_;

View File

@ -34,6 +34,7 @@ limitations under the License.
#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h" #include "tensorflow/core/platform/types.h"
namespace tensorflow { namespace tensorflow {
namespace { namespace {
@ -134,7 +135,7 @@ struct SparseSlice {
template <typename T> template <typename T>
template <bool Transpose> template <bool Transpose>
void SparseSlice<T>::Initialize(const SparseSlice<T>::ConstMatrixMap& mat, void SparseSlice<T>::Initialize(const typename SparseSlice<T>::ConstMatrixMap& mat,
int col_offset) { int col_offset) {
const int mat_rows = Transpose ? mat.dimension(1) : mat.dimension(0); const int mat_rows = Transpose ? mat.dimension(1) : mat.dimension(0);
const int mat_cols = Transpose ? mat.dimension(0) : mat.dimension(1); const int mat_cols = Transpose ? mat.dimension(0) : mat.dimension(1);
@ -950,7 +951,7 @@ class SparseMatMulOp : public OpKernel {
template <typename TL, typename TR> template <typename TL, typename TR>
inline void SparseMatMul<TL, TR>::ComputeOutputBlock( inline void SparseMatMul<TL, TR>::ComputeOutputBlock(
const std::vector<SparseSlice<TL>*>& left, const std::vector<SparseSlice<TL>*>& left,
const SparseMatMul<TL, TR>::ConstMatrixMapR& right, int num_cols, const typename SparseMatMul<TL, TR>::ConstMatrixMapR& right, int num_cols,
int output_row_offset, int output_col_offset, bool assign, int output_row_offset, int output_col_offset, bool assign,
bool transpose_output, MatrixMap* output) { bool transpose_output, MatrixMap* output) {
static const Eigen::array<int, 2> perm({1, 0}); static const Eigen::array<int, 2> perm({1, 0});
@ -1000,7 +1001,7 @@ inline void SparseMatMul<TL, TR>::ComputeOutputBlock(
template <typename TL, typename TR> template <typename TL, typename TR>
inline BlockingCounter* SparseMatMul<TL, TR>::CreateSparseSlices( inline BlockingCounter* SparseMatMul<TL, TR>::CreateSparseSlices(
const SparseMatMul<TL, TR>::ConstMatrixMapL& mat, bool transpose, const typename SparseMatMul<TL, TR>::ConstMatrixMapL& mat, bool transpose,
int slice_num_rows, int slice_block_size, int slice_num_cols, int slice_num_rows, int slice_block_size, int slice_num_cols,
std::vector<std::vector<SparseSlice<TL>*>>* mat_slices, std::vector<std::vector<SparseSlice<TL>*>>* mat_slices,
const DeviceBase::CpuWorkerThreads* thread_pool) { const DeviceBase::CpuWorkerThreads* thread_pool) {
@ -1096,7 +1097,7 @@ ALWAYS_INLINE void CopyAndMayBeInterleave(void* dst, const void* src,
template <typename TL, typename TR> template <typename TL, typename TR>
inline BlockingCounter* SparseMatMul<TL, TR>::ShuffleMatrix( inline BlockingCounter* SparseMatMul<TL, TR>::ShuffleMatrix(
const SparseMatMul<TL, TR>::ConstMatrixMapR& mat, int slice_row_start, const typename SparseMatMul<TL, TR>::ConstMatrixMapR& mat, int slice_row_start,
int slice_num_rows, int slice_col_start, int slice_num_cols, const int N, int slice_num_rows, int slice_col_start, int slice_num_cols, const int N,
const DeviceBase::CpuWorkerThreads* thread_pool, MatrixR* buffer) { const DeviceBase::CpuWorkerThreads* thread_pool, MatrixR* buffer) {
DCHECK_EQ(N % 2, 0); DCHECK_EQ(N % 2, 0);
@ -1153,7 +1154,7 @@ inline BlockingCounter* SparseMatMul<TL, TR>::ShuffleMatrix(
template <typename TL, typename TR> template <typename TL, typename TR>
inline void SparseMatMul<TL, TR>::SliceMatrix( inline void SparseMatMul<TL, TR>::SliceMatrix(
const MatrixR& mat, const int num_rows, const int num_slices, const MatrixR& mat, const int num_rows, const int num_slices,
std::vector<SparseMatMul<TL, TR>::ConstMatrixMapR*>* slices) { std::vector<typename SparseMatMul<TL, TR>::ConstMatrixMapR*>* slices) {
slices->resize(num_slices); slices->resize(num_slices);
DSizes d(num_rows, mat.dimension(1)); DSizes d(num_rows, mat.dimension(1));
DCHECK_LE(num_rows * num_slices, mat.dimension(0)); DCHECK_LE(num_rows * num_slices, mat.dimension(0));
@ -1164,10 +1165,10 @@ inline void SparseMatMul<TL, TR>::SliceMatrix(
template <typename TL, typename TR> template <typename TL, typename TR>
inline BlockingCounter* SparseMatMul<TL, TR>::CreateDenseSlices( inline BlockingCounter* SparseMatMul<TL, TR>::CreateDenseSlices(
const SparseMatMul<TL, TR>::ConstMatrixMapR& mat, int row_start, const typename SparseMatMul<TL, TR>::ConstMatrixMapR& mat, int row_start,
int num_rows, int col_start, int num_cols, int num_rows, int col_start, int num_cols,
const DeviceBase::CpuWorkerThreads* thread_pool, MatrixR* buffer, const DeviceBase::CpuWorkerThreads* thread_pool, MatrixR* buffer,
std::vector<SparseMatMul<TL, TR>::ConstMatrixMapR*>* slices) { std::vector<typename SparseMatMul<TL, TR>::ConstMatrixMapR*>* slices) {
BlockingCounter* shuffle_counter = ShuffleMatrix( BlockingCounter* shuffle_counter = ShuffleMatrix(
mat, row_start, num_rows, col_start, num_cols, N, thread_pool, buffer); mat, row_start, num_rows, col_start, num_cols, N, thread_pool, buffer);
const int num_slices = (num_cols + N - 1) / N; const int num_slices = (num_cols + N - 1) / N;
@ -1177,8 +1178,8 @@ inline BlockingCounter* SparseMatMul<TL, TR>::CreateDenseSlices(
template <typename TL, typename TR> template <typename TL, typename TR>
inline void SparseMatMul<TL, TR>::ComputeBlockSizes( inline void SparseMatMul<TL, TR>::ComputeBlockSizes(
const SparseMatMul<TL, TR>::ConstMatrixMapL& left, const typename SparseMatMul<TL, TR>::ConstMatrixMapL& left,
const SparseMatMul<TL, TR>::ConstMatrixMapR& right, bool transpose_left, const typename SparseMatMul<TL, TR>::ConstMatrixMapR& right, bool transpose_left,
int num_threads, int* KR, int* NR, int* KL, int* JB, int* IB) { int num_threads, int* KR, int* NR, int* KL, int* JB, int* IB) {
// Heuristics for calculating block sizes // Heuristics for calculating block sizes
// Assume two hyperthreads per core. // Assume two hyperthreads per core.
@ -1248,8 +1249,8 @@ inline void SparseMatMul<TL, TR>::ComputeBlockSizes(
// {l_i} and JB elements from {r_j} and compute the IB * JB inner products. // {l_i} and JB elements from {r_j} and compute the IB * JB inner products.
template <typename TL, typename TR> template <typename TL, typename TR>
inline void SparseMatMul<TL, TR>::Compute( inline void SparseMatMul<TL, TR>::Compute(
const SparseMatMul<TL, TR>::ConstMatrixMapL& left, const typename SparseMatMul<TL, TR>::ConstMatrixMapL& left,
const SparseMatMul<TL, TR>::ConstMatrixMapR& right, bool transpose_left, const typename SparseMatMul<TL, TR>::ConstMatrixMapR& right, bool transpose_left,
const DeviceBase::CpuWorkerThreads* thread_pool, bool transpose_output, const DeviceBase::CpuWorkerThreads* thread_pool, bool transpose_output,
MatrixMap* output) { MatrixMap* output) {
const int num_threads = thread_pool->num_threads; const int num_threads = thread_pool->num_threads;

View File

@ -19,6 +19,10 @@ limitations under the License.
#include "third_party/eigen3/Eigen/Core" #include "third_party/eigen3/Eigen/Core"
#include "tensorflow/core/platform/types.h" #include "tensorflow/core/platform/types.h"
#if defined(PLATFORM_WINDOWS)
#include "tensorflow/core/platform/windows/intrinsics_port.h"
#endif
namespace Eigen { namespace Eigen {
namespace internal { namespace internal {

View File

@ -0,0 +1,41 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CORE_PLATFORM_WINDOWS_INTRINSICS_PORT_H_
#define TENSORFLOW_CORE_PLATFORM_WINDOWS_INTRINSICS_PORT_H_
#ifdef _MSC_VER
// the following avx intrinsics are not defined on windows
// in immintrin.h so we define them here.
//
#include "tensorflow/core/platform/types.h"
#define _mm_load_pd1 _mm_load1_pd
static inline int
_mm256_extract_epi32(__m256i a, const int i)
{
return a.m256i_i32[i & 7];
}
static inline __m256i
_mm256_insert_epi32(__m256i a, int b, const int i)
{
__m256i c = a;
c.m256i_i32[i & 7] = b;
return c;
}
#endif
#endif