enable sparse_matmul and immutable_constant kernels for windows (#5290)

* Enable sparse_matmul for Windows.
* Enable immutable_constant op on Windows.
* Use char const* instead of char * to get rid of compiler warnings.
2016-10-31 10:07:08 -07:00 · 2016-10-31 10:07:08 -07:00 · 1511dd4bad
commit 1511dd4bad
parent 7cd00438e7
6 changed files with 63 additions and 21 deletions
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@ -87,11 +87,7 @@ if(WIN32)
      # not working on windows yet
      "${tensorflow_source_dir}/tensorflow/core/kernels/depthwise_conv_op.cc"  # Cannot find symbol: tensorflow::LaunchConv2DOp<struct Eigen::ThreadPoolDevice, double>::launch(...).
      "${tensorflow_source_dir}/tensorflow/core/kernels/fact_op.cc"
-      "${tensorflow_source_dir}/tensorflow/core/kernels/immutable_constant_op.cc"
-      "${tensorflow_source_dir}/tensorflow/core/kernels/immutable_constant_op.h"
      "${tensorflow_source_dir}/tensorflow/core/kernels/meta_support.*"
-      "${tensorflow_source_dir}/tensorflow/core/kernels/sparse_matmul_op.cc"
-      "${tensorflow_source_dir}/tensorflow/core/kernels/sparse_matmul_op.h"
      "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.h"
      "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.cc"
      "${tensorflow_source_dir}/tensorflow/core/kernels/svd*.cc"
--- a/tensorflow/core/kernels/immutable_constant_op.cc
+++ b/tensorflow/core/kernels/immutable_constant_op.cc
@ -96,9 +96,9 @@ void ImmutableConstantOp::Compute(OpKernelContext* ctx) {
 }

 ImmutableConstantOp::~ImmutableConstantOp() {}
-constexpr char ImmutableConstantOp::kDTypeAttr[];
-constexpr char ImmutableConstantOp::kShapeAttr[];
-constexpr char ImmutableConstantOp::kMemoryRegionNameAttr[];
+constexpr char const* ImmutableConstantOp::kDTypeAttr;
+constexpr char const* ImmutableConstantOp::kShapeAttr;
+constexpr char const* ImmutableConstantOp::kMemoryRegionNameAttr;

 REGISTER_KERNEL_BUILDER(Name("ImmutableConst").Device(DEVICE_CPU),
                        ImmutableConstantOp);
--- a/tensorflow/core/kernels/immutable_constant_op.h
+++ b/tensorflow/core/kernels/immutable_constant_op.h
@ -33,9 +33,9 @@ class ImmutableConstantOp : public OpKernel {
  ~ImmutableConstantOp() override;

  // Names of attributes that are used by this op
-  static constexpr char kDTypeAttr[] = "dtype";
-  static constexpr char kShapeAttr[] = "shape";
-  static constexpr char kMemoryRegionNameAttr[] = "memory_region_name";
+  static constexpr char const* kDTypeAttr = "dtype";
+  static constexpr char const* kShapeAttr = "shape";
+  static constexpr char const* kMemoryRegionNameAttr = "memory_region_name";

 private:
  string region_name_;
--- a/tensorflow/core/kernels/sparse_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op.cc
@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"

+
 namespace tensorflow {

 namespace {
@ -134,7 +135,7 @@ struct SparseSlice {

 template <typename T>
 template <bool Transpose>
-void SparseSlice<T>::Initialize(const SparseSlice<T>::ConstMatrixMap& mat,
+void SparseSlice<T>::Initialize(const typename SparseSlice<T>::ConstMatrixMap& mat,
                                int col_offset) {
  const int mat_rows = Transpose ? mat.dimension(1) : mat.dimension(0);
  const int mat_cols = Transpose ? mat.dimension(0) : mat.dimension(1);
@ -950,7 +951,7 @@ class SparseMatMulOp : public OpKernel {
 template <typename TL, typename TR>
 inline void SparseMatMul<TL, TR>::ComputeOutputBlock(
    const std::vector<SparseSlice<TL>*>& left,
-    const SparseMatMul<TL, TR>::ConstMatrixMapR& right, int num_cols,
+    const typename SparseMatMul<TL, TR>::ConstMatrixMapR& right, int num_cols,
    int output_row_offset, int output_col_offset, bool assign,
    bool transpose_output, MatrixMap* output) {
  static const Eigen::array<int, 2> perm({1, 0});
@ -1000,7 +1001,7 @@ inline void SparseMatMul<TL, TR>::ComputeOutputBlock(

 template <typename TL, typename TR>
 inline BlockingCounter* SparseMatMul<TL, TR>::CreateSparseSlices(
-    const SparseMatMul<TL, TR>::ConstMatrixMapL& mat, bool transpose,
+    const typename SparseMatMul<TL, TR>::ConstMatrixMapL& mat, bool transpose,
    int slice_num_rows, int slice_block_size, int slice_num_cols,
    std::vector<std::vector<SparseSlice<TL>*>>* mat_slices,
    const DeviceBase::CpuWorkerThreads* thread_pool) {
@ -1096,7 +1097,7 @@ ALWAYS_INLINE void CopyAndMayBeInterleave(void* dst, const void* src,

 template <typename TL, typename TR>
 inline BlockingCounter* SparseMatMul<TL, TR>::ShuffleMatrix(
-    const SparseMatMul<TL, TR>::ConstMatrixMapR& mat, int slice_row_start,
+    const typename SparseMatMul<TL, TR>::ConstMatrixMapR& mat, int slice_row_start,
    int slice_num_rows, int slice_col_start, int slice_num_cols, const int N,
    const DeviceBase::CpuWorkerThreads* thread_pool, MatrixR* buffer) {
  DCHECK_EQ(N % 2, 0);
@ -1153,7 +1154,7 @@ inline BlockingCounter* SparseMatMul<TL, TR>::ShuffleMatrix(
 template <typename TL, typename TR>
 inline void SparseMatMul<TL, TR>::SliceMatrix(
    const MatrixR& mat, const int num_rows, const int num_slices,
-    std::vector<SparseMatMul<TL, TR>::ConstMatrixMapR*>* slices) {
+    std::vector<typename SparseMatMul<TL, TR>::ConstMatrixMapR*>* slices) {
  slices->resize(num_slices);
  DSizes d(num_rows, mat.dimension(1));
  DCHECK_LE(num_rows * num_slices, mat.dimension(0));
@ -1164,10 +1165,10 @@ inline void SparseMatMul<TL, TR>::SliceMatrix(

 template <typename TL, typename TR>
 inline BlockingCounter* SparseMatMul<TL, TR>::CreateDenseSlices(
-    const SparseMatMul<TL, TR>::ConstMatrixMapR& mat, int row_start,
+    const typename SparseMatMul<TL, TR>::ConstMatrixMapR& mat, int row_start,
    int num_rows, int col_start, int num_cols,
    const DeviceBase::CpuWorkerThreads* thread_pool, MatrixR* buffer,
-    std::vector<SparseMatMul<TL, TR>::ConstMatrixMapR*>* slices) {
+    std::vector<typename SparseMatMul<TL, TR>::ConstMatrixMapR*>* slices) {
  BlockingCounter* shuffle_counter = ShuffleMatrix(
      mat, row_start, num_rows, col_start, num_cols, N, thread_pool, buffer);
  const int num_slices = (num_cols + N - 1) / N;
@ -1177,8 +1178,8 @@ inline BlockingCounter* SparseMatMul<TL, TR>::CreateDenseSlices(

 template <typename TL, typename TR>
 inline void SparseMatMul<TL, TR>::ComputeBlockSizes(
-    const SparseMatMul<TL, TR>::ConstMatrixMapL& left,
-    const SparseMatMul<TL, TR>::ConstMatrixMapR& right, bool transpose_left,
+    const typename SparseMatMul<TL, TR>::ConstMatrixMapL& left,
+    const typename SparseMatMul<TL, TR>::ConstMatrixMapR& right, bool transpose_left,
    int num_threads, int* KR, int* NR, int* KL, int* JB, int* IB) {
  // Heuristics for calculating block sizes
  // Assume two hyperthreads per core.
@ -1248,8 +1249,8 @@ inline void SparseMatMul<TL, TR>::ComputeBlockSizes(
 //    {l_i} and JB elements from {r_j} and compute the IB * JB inner products.
 template <typename TL, typename TR>
 inline void SparseMatMul<TL, TR>::Compute(
-    const SparseMatMul<TL, TR>::ConstMatrixMapL& left,
-    const SparseMatMul<TL, TR>::ConstMatrixMapR& right, bool transpose_left,
+    const typename SparseMatMul<TL, TR>::ConstMatrixMapL& left,
+    const typename SparseMatMul<TL, TR>::ConstMatrixMapR& right, bool transpose_left,
    const DeviceBase::CpuWorkerThreads* thread_pool, bool transpose_output,
    MatrixMap* output) {
  const int num_threads = thread_pool->num_threads;
--- a/tensorflow/core/kernels/sparse_matmul_op.h
+++ b/tensorflow/core/kernels/sparse_matmul_op.h
@ -19,6 +19,10 @@ limitations under the License.
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/platform/types.h"

+#if defined(PLATFORM_WINDOWS)
+#include "tensorflow/core/platform/windows/intrinsics_port.h"
+#endif
+
 namespace Eigen {
 namespace internal {

--- a/tensorflow/core/platform/windows/intrinsics_port.h
+++ b/tensorflow/core/platform/windows/intrinsics_port.h
@ -0,0 +1,41 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_WINDOWS_INTRINSICS_PORT_H_
+#define TENSORFLOW_CORE_PLATFORM_WINDOWS_INTRINSICS_PORT_H_
+
+
+#ifdef _MSC_VER
+// The following AVX intrinsics are not defined in immintrin.h on Windows,
+// so we define them here.
+//
+#include "tensorflow/core/platform/types.h"
+
+#define _mm_load_pd1 _mm_load1_pd
+static inline int  // MSVC-only (guarded by _MSC_VER) stand-in for the _mm256_extract_epi32 intrinsic.
+_mm256_extract_epi32(__m256i a, const int i)  // Returns the 32-bit element of `a` selected by `i`.
+{
+  return a.m256i_i32[i & 7];  // `i & 7` wraps the index into the range 0..7.
+}
+
+static inline __m256i  // MSVC-only (guarded by _MSC_VER) stand-in for the _mm256_insert_epi32 intrinsic.
+_mm256_insert_epi32(__m256i a, int b, const int i)  // Returns a copy of `a` with the element selected by `i` set to `b`.
+{
+  __m256i c = a;  // Modify a copy; the result is handed back by value.
+  c.m256i_i32[i & 7] = b;  // `i & 7` wraps the index into the range 0..7.
+  return c;
+}
+#endif
+#endif