diff --git a/WORKSPACE b/WORKSPACE index 9d3622878c3..e0931512f4a 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -38,6 +38,13 @@ new_http_archive( sha256 = "b4c178fd6236dcf0a20d25d07c45eebe85281263978c6a6f1dfc49d75befc45f" ) +new_http_archive( + name = "stylize", + build_file = "models.BUILD", + url = "https://storage.googleapis.com/download.tensorflow.org/models/stylize_v1.zip", + sha256 = "3d374a730aef330424a356a8d4f04d8a54277c425e274ecb7d9c83aa912c6bfa" +) + # TENSORBOARD_BOWER_AUTOGENERATED_BELOW_THIS_LINE_DO_NOT_EDIT new_http_archive( diff --git a/configure b/configure index 64add33bd5d..1e4d786974d 100755 --- a/configure +++ b/configure @@ -57,9 +57,27 @@ done if is_windows; then TF_NEED_GCP=0 TF_NEED_HDFS=0 + TF_NEED_JEMALLOC=0 TF_NEED_OPENCL=0 fi +while [ "$TF_NEED_JEMALLOC" == "" ]; do + read -p "Do you wish to use jemalloc as the malloc implementation? "\ +"(Linux only) [Y/n] " INPUT + case $INPUT in + [Yy]* ) echo "jemalloc enabled on Linux"; TF_NEED_JEMALLOC=1;; + [Nn]* ) echo "jemalloc disabled on Linux"; TF_NEED_JEMALLOC=0;; + "" ) echo "jemalloc enabled on Linux"; TF_NEED_JEMALLOC=1;; + * ) echo "Invalid selection: " $INPUT;; + esac +done + +if [ "$TF_NEED_JEMALLOC" == "1" ]; then + sed -i -e "s/WITH_JEMALLOC = False/WITH_JEMALLOC = True/" tensorflow/core/platform/default/build_config.bzl +else + sed -i -e "s/WITH_JEMALLOC = True/WITH_JEMALLOC = False/" tensorflow/core/platform/default/build_config.bzl +fi + while [ "$TF_NEED_GCP" == "" ]; do read -p "Do you wish to build TensorFlow with "\ "Google Cloud Platform support? [y/N] " INPUT diff --git a/tensorflow/.clang-format b/tensorflow/.clang-format new file mode 100644 index 00000000000..e06cf478f46 --- /dev/null +++ b/tensorflow/.clang-format @@ -0,0 +1,4 @@ +# Run manually to reformat a file: +# clang-format -i --style=file +BasedOnStyle: Google +DerivePointerAlignment: false diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 9063af696fd..355e48d582f 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -190,6 +190,7 @@ filegroup( "//tensorflow/examples/image_retraining:all_files", "//tensorflow/examples/label_image:all_files", "//tensorflow/examples/learn:all_files", + "//tensorflow/examples/saved_model:all_files", "//tensorflow/examples/tutorials/estimators:all_files", "//tensorflow/examples/tutorials/mnist:all_files", "//tensorflow/examples/tutorials/word2vec:all_files", @@ -203,7 +204,6 @@ filegroup( "//tensorflow/python/debug:all_files", "//tensorflow/python/kernel_tests:all_files", "//tensorflow/python/saved_model:all_files", - "//tensorflow/python/saved_model/example:all_files", "//tensorflow/python/tools:all_files", "//tensorflow/tensorboard:all_files", "//tensorflow/tensorboard/app:all_files", diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 32696e83e4d..a6bc8fdc492 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -6,6 +6,7 @@ licenses(["notice"]) # Apache 2.0 load( "//tensorflow:tensorflow.bzl", "tf_cc_test", + "tf_copts", "tf_cuda_library", "tf_custom_op_library", ) @@ -23,13 +24,19 @@ tf_cuda_library( name = "c_api", srcs = ["c_api.cc"], hdrs = ["c_api.h"], + copts = tf_copts(), visibility = ["//visibility:public"], - deps = [ - "//tensorflow/cc/saved_model:loader", - "//tensorflow/core:core_cpu", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - ], + deps = select({ + "//tensorflow:android": [ + "//tensorflow/core:android_tensorflow_lib_lite", + ], + "//conditions:default": [ + "//tensorflow/cc/saved_model:loader", + "//tensorflow/core:core_cpu", + 
"//tensorflow/core:framework", + "//tensorflow/core:lib", + ], + }), ) tf_cuda_library( diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 14988fbc4d7..83ce3e25d46 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -20,7 +20,9 @@ limitations under the License. #include #include +#ifndef __ANDROID__ #include "tensorflow/cc/saved_model/loader.h" +#endif #include "tensorflow/core/common_runtime/shape_refiner.h" #include "tensorflow/core/framework/log_memory.h" #include "tensorflow/core/framework/node_def_util.h" @@ -37,6 +39,7 @@ limitations under the License. #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/mem.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/thread_annotations.h" @@ -159,11 +162,13 @@ Status MessageToBuffer(const tensorflow::protobuf::Message& in, return InvalidArgument("Passing non-empty TF_Buffer is invalid."); } const auto proto_size = in.ByteSize(); - void* buf = malloc(proto_size); + void* buf = tensorflow::port::Malloc(proto_size); in.SerializeToArray(buf, proto_size); out->data = buf; out->length = proto_size; - out->data_deallocator = [](void* data, size_t length) { free(data); }; + out->data_deallocator = [](void* data, size_t length) { + tensorflow::port::Free(data); + }; return Status::OK(); } @@ -287,13 +292,15 @@ void TF_SetConfig(TF_SessionOptions* options, const void* proto, TF_Buffer* TF_NewBuffer() { return new TF_Buffer{nullptr, 0, nullptr}; } TF_Buffer* TF_NewBufferFromString(const void* proto, size_t proto_len) { - void* copy = malloc(proto_len); + void* copy = tensorflow::port::Malloc(proto_len); memcpy(copy, proto, proto_len); TF_Buffer* buf = new TF_Buffer; buf->data = copy; buf->length = proto_len; - buf->data_deallocator = [](void* data, size_t length) { free(data); }; + buf->data_deallocator = [](void* data, size_t length) { + tensorflow::port::Free(data); + }; return buf; } @@ -694,7 +701,7 @@ TF_Library* TF_LoadLibrary(const char* library_filename, TF_Status* status) { TF_Buffer TF_GetOpList(TF_Library* lib_handle) { return lib_handle->op_list; } void TF_DeleteLibraryHandle(TF_Library* lib_handle) { - free(const_cast(lib_handle->op_list.data)); + tensorflow::port::Free(const_cast(lib_handle->op_list.data)); delete lib_handle; } @@ -1704,6 +1711,7 @@ TF_Session* TF_NewSession(TF_Graph* graph, const TF_SessionOptions* opt, } } +#ifndef __ANDROID__ TF_Session* TF_LoadSessionFromSavedModel( const TF_SessionOptions* session_options, const TF_Buffer* run_options, const char* export_dir, const char* const* tags, int tags_len, @@ -1757,6 +1765,7 @@ TF_Session* TF_LoadSessionFromSavedModel( session->last_num_graph_nodes = graph->graph.num_node_ids(); return session; } +#endif // __ANDROID__ void TF_CloseSession(TF_Session* s, TF_Status* status) { status->status = s->session->Close(); diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h index 3ea2d31699d..e625d656ade 100644 --- a/tensorflow/c/c_api.h +++ b/tensorflow/c/c_api.h @@ -835,6 +835,10 @@ typedef struct TF_Session TF_Session; extern TF_Session* TF_NewSession(TF_Graph* graph, const TF_SessionOptions* opts, TF_Status* status); +#ifndef __ANDROID__ +// TODO(ashankar): Remove the __ANDROID__ guard. This will require ensuring that +// the tensorflow/cc/saved_model:loader build target is Android friendly. 
+ // This function creates a new TF_Session (which is created on success) using // `session_options`, and then initializes state (restoring tensors and other // assets) using `run_options`. @@ -853,6 +857,7 @@ TF_Session* TF_LoadSessionFromSavedModel( const TF_SessionOptions* session_options, const TF_Buffer* run_options, const char* export_dir, const char* const* tags, int tags_len, TF_Graph* graph, TF_Buffer* meta_graph_def, TF_Status* status); +#endif // __ANDROID__ // Close a session. // diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc index 50e596786ab..00c07932aca 100644 --- a/tensorflow/compiler/aot/compile.cc +++ b/tensorflow/compiler/aot/compile.cc @@ -204,23 +204,23 @@ Status RewriteAndPruneGraph(Graph* graph, const Config& config, string feed_id; TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), kFeedIdAttr, &feed_id)); if (missing_feeds.erase(feed_id) == 0) { - return errors::Aborted(kArgOp, " node found with unknown feed id: ", - feed_id); + return errors::Aborted(kArgOp, + " node found with unknown feed id: ", feed_id); } } else if (n->type_string() == kRetvalOp) { string fetch_id; TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), kFetchIdAttr, &fetch_id)); if (missing_fetches.erase(fetch_id) == 0) { - return errors::Aborted(kRetvalOp, " node found with unknown fetch id: ", - fetch_id); + return errors::Aborted(kRetvalOp, + " node found with unknown fetch id: ", fetch_id); } } } if (!missing_feeds.empty() || !missing_fetches.empty()) { - return errors::Aborted("Post graph-pruning", ", missing feeds: ", - str_util::Join(missing_feeds, ", "), - ", missing fetches: ", - str_util::Join(missing_fetches, ", ")); + return errors::Aborted( + "Post graph-pruning", + ", missing feeds: ", str_util::Join(missing_feeds, ", "), + ", missing fetches: ", str_util::Join(missing_fetches, ", ")); } return Status::OK(); } @@ -351,16 +351,19 @@ Status CompileXla(xla::LocalClient* client, const xla::Computation& computation, for (int i = 0; i < pshape->parameters_size(); ++i) { arg_layouts.push_back(pshape->mutable_parameters(i)); } - xla::StatusOr> aot_or = - client->CompileAheadOfTime(computation, arg_layouts, pshape->result(), - aot_opts); + xla::LocalClient::AheadOfTimeComputationInstance instance; + instance.computation = &computation; + instance.argument_layouts = std::move(arg_layouts); + instance.result_layout = &pshape->result(); + xla::StatusOr>> + aot_or = client->CompileAheadOfTime({instance}, aot_opts); if (!aot_or.ok()) { return errors::Unknown("XLA compilation failed: ", aot_or.status().error_message()); } compile_result->aot = xla::unique_ptr_static_cast( - aot_or.ConsumeValueOrDie()); + std::move(aot_or.ValueOrDie().back())); compile_result->entry_point = aot_opts.entry_point_name(); compile_result->pointer_size = xla::LocalClient::PointerSizeForTriple(aot_opts.triple()); diff --git a/tensorflow/compiler/aot/tests/make_test_graphs.py b/tensorflow/compiler/aot/tests/make_test_graphs.py index 261dfcbdf8c..2a2d13dc498 100644 --- a/tensorflow/compiler/aot/tests/make_test_graphs.py +++ b/tensorflow/compiler/aot/tests/make_test_graphs.py @@ -18,6 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import argparse +import sys + from tensorflow.core.protobuf import saver_pb2 from tensorflow.python.client import session from tensorflow.python.framework import constant_op @@ -27,22 +30,18 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops 
import variables from tensorflow.python.platform import app -from tensorflow.python.platform import flags as flags_lib from tensorflow.python.training import saver as saver_lib -flags = flags_lib -FLAGS = flags.FLAGS -flags.DEFINE_string('out_dir', '', - 'Output directory for graphs, checkpoints and savers.') +FLAGS = None -def tfadd(): +def tfadd(_): x = constant_op.constant([1], name='x_const') y = constant_op.constant([2], name='y_const') math_ops.add(x, y, name='x_y_sum') -def tfadd_with_ckpt(): +def tfadd_with_ckpt(out_dir): x = array_ops.placeholder(dtypes.int32, name='x_hold') y = variables.Variable(constant_op.constant([0]), name='y_saved') math_ops.add(x, y, name='x_y_sum') @@ -53,11 +52,11 @@ def tfadd_with_ckpt(): sess.run(init_op) sess.run(y.assign(y + 42)) # Without the checkpoint, the variable won't be set to 42. - ckpt = '%s/test_graph_tfadd_with_ckpt.ckpt' % FLAGS.out_dir + ckpt = '%s/test_graph_tfadd_with_ckpt.ckpt' % out_dir saver.save(sess, ckpt) -def tfadd_with_ckpt_saver(): +def tfadd_with_ckpt_saver(out_dir): x = array_ops.placeholder(dtypes.int32, name='x_hold') y = variables.Variable(constant_op.constant([0]), name='y_saved') math_ops.add(x, y, name='x_y_sum') @@ -68,27 +67,27 @@ def tfadd_with_ckpt_saver(): sess.run(init_op) sess.run(y.assign(y + 42)) # Without the checkpoint, the variable won't be set to 42. - ckpt_file = '%s/test_graph_tfadd_with_ckpt_saver.ckpt' % FLAGS.out_dir + ckpt_file = '%s/test_graph_tfadd_with_ckpt_saver.ckpt' % out_dir saver.save(sess, ckpt_file) # Without the SaverDef, the restore op won't be named correctly. - saver_file = '%s/test_graph_tfadd_with_ckpt_saver.saver' % FLAGS.out_dir + saver_file = '%s/test_graph_tfadd_with_ckpt_saver.saver' % out_dir with open(saver_file, 'w') as f: f.write(saver.as_saver_def().SerializeToString()) -def tfgather(): +def tfgather(_): params = array_ops.placeholder(dtypes.float32, name='params') indices = array_ops.placeholder(dtypes.int32, name='indices') array_ops.gather(params, indices, name='gather_output') -def tfmatmul(): +def tfmatmul(_): x = array_ops.placeholder(dtypes.float32, name='x_hold') y = array_ops.placeholder(dtypes.float32, name='y_hold') math_ops.matmul(x, y, name='x_y_prod') -def tfmatmulandadd(): +def tfmatmulandadd(_): # This tests multiple outputs. 
x = array_ops.placeholder(dtypes.float32, name='x_hold') y = array_ops.placeholder(dtypes.float32, name='y_hold') @@ -96,24 +95,33 @@ def tfmatmulandadd(): math_ops.add(x, y, name='x_y_sum') -def write_graph(build_graph): +def write_graph(build_graph, out_dir): """Build a graph using build_graph and write it out.""" g = ops.Graph() with g.as_default(): - build_graph() - filename = '%s/test_graph_%s.pb' % (FLAGS.out_dir, build_graph.__name__) + build_graph(out_dir) + filename = '%s/test_graph_%s.pb' % (out_dir, build_graph.__name__) with open(filename, 'w') as f: f.write(g.as_graph_def().SerializeToString()) def main(_): - write_graph(tfadd) - write_graph(tfadd_with_ckpt) - write_graph(tfadd_with_ckpt_saver) - write_graph(tfgather) - write_graph(tfmatmul) - write_graph(tfmatmulandadd) + write_graph(tfadd, FLAGS.out_dir) + write_graph(tfadd_with_ckpt, FLAGS.out_dir) + write_graph(tfadd_with_ckpt_saver, FLAGS.out_dir) + write_graph(tfgather, FLAGS.out_dir) + write_graph(tfmatmul, FLAGS.out_dir) + write_graph(tfmatmulandadd, FLAGS.out_dir) if __name__ == '__main__': - app.run() + parser = argparse.ArgumentParser() + parser.register('type', 'bool', lambda v: v.lower() == 'true') + parser.add_argument( + '--out_dir', + type=str, + default='', + help='Output directory for graphs, checkpoints and savers.' + ) + FLAGS, unparsed = parser.parse_known_args() + app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 486725f1daa..318dc7fada9 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -41,12 +41,15 @@ const char* const kXlaClusterAttr = "_XlaCluster"; namespace { -bool HasXLAKernel(const NodeDef& node_def, DeviceType jit_device_type) { +bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) { + // _Send and _Recv should not be marked for compilation. + if (node.IsSend() || node.IsRecv()) return false; + // There is a SymbolicGradient kernel on the XLA_JIT device, but the gradient // is really a kind of function call and will be handled by // IsCompilableCall(). - if (node_def.op() == "SymbolicGradient") return false; - return FindKernelDef(jit_device_type, node_def, nullptr, nullptr).ok(); + if (node.type_string() == "SymbolicGradient") return false; + return FindKernelDef(jit_device_type, node.def(), nullptr, nullptr).ok(); } // Make sure we don't recurse infinitely on recursive functions. 
@@ -125,7 +128,7 @@ bool IsCompilableCall(const NodeDef& call_def, DeviceType jit_device_type,
       return IsCompilableWhile(node->def(), jit_device_type, depth + 1,
                                lib_runtime);
     }
-    if (!HasXLAKernel(node->def(), jit_device_type) &&
+    if (!HasXLAKernel(*node, jit_device_type) &&
         !IsCompilableCall(node->def(), jit_device_type, depth + 1,
                           lib_runtime)) {
       VLOG(2) << "Function marking failed: unsupported op " << node->name()
@@ -168,7 +171,7 @@ Status FindCompilationCandidates(
     CHECK(XlaOpRegistry::GetJitDevice(device_type.type(), &jit_device_name,
                                       /*requires_jit=*/nullptr));
     DeviceType jit_device_type(*jit_device_name);
-    if (!HasXLAKernel(node->def(), jit_device_type) &&
+    if (!HasXLAKernel(*node, jit_device_type) &&
         !IsCompilableCall(node->def(), jit_device_type, 0, lib_runtime.get())) {
       VLOG(2) << "Compilation rejected node: unsupported op " << node->name()
               << ": " << node->def().op();
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index 250960d3958..f329e83e14d 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/platform/mem.h"
 
 namespace tensorflow {
 
@@ -41,7 +42,7 @@ void* XlaDeviceAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   // Regardless of the size requested, always allocate a XlaGlobalData. Respect
   // the alignment request because there is alignment checking even for Tensors
   // whose data is never accessed.
-  void* p = port::aligned_malloc(sizeof(XlaGlobalData), alignment);
+  void* p = port::AlignedMalloc(sizeof(XlaGlobalData), alignment);
   VLOG(2) << "Allocated XLA device tensor " << p;
   return new (p) XlaGlobalData();
 }
@@ -50,7 +51,7 @@ void XlaDeviceAllocator::DeallocateRaw(void* ptr) {
   XlaGlobalData* global_data = reinterpret_cast<XlaGlobalData*>(ptr);
   VLOG(2) << "Deallocated XLA device tensor " << ptr;
   global_data->~XlaGlobalData();
-  port::aligned_free(ptr);
+  port::AlignedFree(ptr);
 }
 
 void XlaDeviceAllocator::GetStats(AllocatorStats* stats) { stats->Clear(); }
diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc
index 731ff7d673f..db4c86505cb 100644
--- a/tensorflow/compiler/jit/xla_gpu_device.cc
+++ b/tensorflow/compiler/jit/xla_gpu_device.cc
@@ -45,7 +45,7 @@ Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& options,
                             name_prefix, &device);
   if (!status.ok()) {
     // Treat failures as non-fatal; there might not be a GPU in the machine.
- LOG(WARNING) << "Failed to create XLA_GPU device: " << status; + VLOG(1) << "Failed to create XLA_GPU device: " << status; return Status::OK(); } devices->push_back(device.release()); diff --git a/tensorflow/compiler/tests/lstm_test.py b/tensorflow/compiler/tests/lstm_test.py index 9ffeb6c2a2f..31093c65713 100644 --- a/tensorflow/compiler/tests/lstm_test.py +++ b/tensorflow/compiler/tests/lstm_test.py @@ -18,7 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import argparse import os +import sys import numpy as np @@ -32,29 +34,8 @@ from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import variables -from tensorflow.python.platform import flags as flags_lib from tensorflow.python.platform import test -flags = flags_lib -FLAGS = flags.FLAGS - -flags.DEFINE_integer('batch_size', 128, - 'Inputs are fed in batches of this size, for both ' - 'inference and training. Larger values cause the matmul ' - 'in each LSTM cell to have higher dimensionality.') -flags.DEFINE_integer('seq_length', 60, - 'Length of the unrolled sequence of LSTM cells in a layer.' - 'Larger values cause more LSTM matmuls to be run.') -flags.DEFINE_integer('num_inputs', 1024, - 'Dimension of inputs that are fed into each LSTM cell.') -flags.DEFINE_integer('num_nodes', 1024, 'Number of nodes in each LSTM cell.') -flags.DEFINE_string('device', 'gpu', - 'TensorFlow device to assign ops to, e.g. "gpu", "cpu". ' - 'For details see documentation for tf.Graph.device.') - -flags.DEFINE_string('dump_graph_dir', '', 'If non-empty, dump graphs in ' - '*.pbtxt format to this directory.') - def _DumpGraph(graph, basename): if FLAGS.dump_graph_dir: @@ -290,4 +271,54 @@ class LSTMBenchmark(test.Benchmark): if __name__ == '__main__': - test.main() + parser = argparse.ArgumentParser() + parser.register('type', 'bool', lambda v: v.lower() == 'true') + parser.add_argument( + '--batch_size', + type=int, + default=128, + help="""\ + Inputs are fed in batches of this size, for both inference and training. + Larger values cause the matmul in each LSTM cell to have higher + dimensionality.\ + """ + ) + parser.add_argument( + '--seq_length', + type=int, + default=60, + help="""\ + Length of the unrolled sequence of LSTM cells in a layer.Larger values + cause more LSTM matmuls to be run.\ + """ + ) + parser.add_argument( + '--num_inputs', + type=int, + default=1024, + help='Dimension of inputs that are fed into each LSTM cell.' + ) + parser.add_argument( + '--num_nodes', + type=int, + default=1024, + help='Number of nodes in each LSTM cell.' + ) + parser.add_argument( + '--device', + type=str, + default='gpu', + help="""\ + TensorFlow device to assign ops to, e.g. "gpu", "cpu". For details see + documentation for tf.Graph.device.\ + """ + ) + parser.add_argument( + '--dump_graph_dir', + type=str, + default='', + help='If non-empty, dump graphs in *.pbtxt format to this directory.' + ) + global FLAGS # pylint:disable=global-at-module-level + FLAGS, unparsed = parser.parse_known_args() + test.main(argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 3de9958cd66..4d861c71c41 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -89,6 +89,27 @@ cc_library( # Internal targets below this point. 
+cc_test(
+    name = "xla_compiler_test",
+    srcs = ["xla_compiler_test.cc"],
+    deps = [
+        ":xla_compiler",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:function_ops",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:sendrecv_ops",
+        "//tensorflow/compiler/tf2xla/kernels:xla_ops",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla/client:client_library",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_test(
     name = "str_util_test",
     srcs = [
diff --git a/tensorflow/compiler/tf2xla/kernels/relu_op.cc b/tensorflow/compiler/tf2xla/kernels/relu_op.cc
index 3cddff9df40..8adac23eeec 100644
--- a/tensorflow/compiler/tf2xla/kernels/relu_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/relu_op.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h"
 #include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
-#include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
index 86a53c929ef..ad3c9217440 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
 namespace tensorflow {
 
@@ -47,7 +48,7 @@ class XlaCompilationAllocator : public Allocator {
     // XlaExpression. Respect the alignment request because there is
     // alignment checking even for Tensors whose data is never
     // accessed.
-    void* p = port::aligned_malloc(sizeof(XlaExpression), alignment);
+    void* p = port::AlignedMalloc(sizeof(XlaExpression), alignment);
     XlaExpression* expression = reinterpret_cast<XlaExpression*>(p);
     new (expression) XlaExpression();
     return expression;
@@ -56,7 +57,7 @@ class XlaCompilationAllocator : public Allocator {
   void DeallocateRaw(void* ptr) override {
     XlaExpression* expression = reinterpret_cast<XlaExpression*>(ptr);
     expression->~XlaExpression();
-    port::aligned_free(ptr);
+    port::AlignedFree(ptr);
   }
 
   // Make sure that even tensors with 0 elements have allocated
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index e46c2a31482..a0edbc5cbc3 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -318,7 +318,7 @@ Status XlaCompiler::CompileGraph(string const& name,
   }
 
   XlaContext* xla_context =
-      new XlaContext(client(), name, allow_cpu_custom_calls_);
+      new XlaContext(this, client(), name, allow_cpu_custom_calls_);
   core::ScopedUnref xla_context_unref(xla_context);
 
   TF_RETURN_IF_ERROR(xla_context->BuildArguments(args, use_tuple_arg));
@@ -402,4 +402,15 @@ Status XlaCompiler::CompileGraph(string const& name,
   return Status::OK();
 }
 
+Status XlaCompiler::GetChannelHandle(const string& key,
+                                     xla::ChannelHandle* channel) {
+  mutex_lock lock(mu_);
+  auto result = channels_.emplace(key, xla::ChannelHandle());
+  if (result.second) {
+    TF_ASSIGN_OR_RETURN(result.first->second, client_->CreateChannelHandle());
+  }
+  *channel = result.first->second;
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 0b882d60a1b..f21abae5f42 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -172,6 +172,12 @@ class XlaCompiler {
   XlaCompilationDevice* device() const { return device_; }
   const DeviceMgr* device_mgr() const { return &device_mgr_; }
 
+  // Retrieves the channel handle associated with `key`. Allocates
+  // a new channel handle if none exists. Channel handles can be used
+  // to communicate between different computations. Computations that
+  // communicate should be compiled with the same XlaCompiler.
+  Status GetChannelHandle(const string& key, xla::ChannelHandle* channel);
+
  private:
   // Does the real work of Compile() and CompileToComputation().
   Status CompileFunctionBody(FunctionLibraryRuntime* function_library,
@@ -195,6 +201,8 @@ class XlaCompiler {
   XlaCompilationDevice* device_;  // Owned by device_mgr_
   DeviceMgr device_mgr_;
 
+  std::unordered_map<string, xla::ChannelHandle> channels_ GUARDED_BY(mu_);
+
   TF_DISALLOW_COPY_AND_ASSIGN(XlaCompiler);
 };
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
new file mode 100644
index 00000000000..24efd3ed0b8
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -0,0 +1,107 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/xla_compiler.h" +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/ops/function_ops.h" +#include "tensorflow/cc/ops/sendrecv_ops.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" +#include "tensorflow/compiler/xla/client/client_library.h" +#include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/public/version.h" + +namespace tensorflow { +namespace { + +class XlaCompilerTest : public ::testing::Test { + protected: + void SetUp() override { + client_ = xla::ClientLibrary::LocalClientOrDie(); + + XlaCompiler::Options options; + options.device_type = DeviceType(DEVICE_CPU_XLA_JIT); + options.client = client_; + compiler_.reset(new XlaCompiler(options)); + + XlaOpRegistry::RegisterJitKernels(); + + FunctionDefLibrary flib; + flib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), flib)); + flr_.reset(NewFunctionLibraryRuntime( + compiler_->device_mgr(), /*env=*/nullptr, compiler_->device(), + TF_GRAPH_DEF_VERSION, flib_def_.get(), OptimizerOptions(), + /*custom_kernel_creator=*/nullptr)); + } + + xla::Client* client_; + std::unique_ptr compiler_; + std::unique_ptr flib_def_; + std::unique_ptr flr_; +}; + +TEST_F(XlaCompilerTest, Simple) { + // Builds a graph that adds two Tensors. + Scope scope = Scope::NewRootScope().ExitOnError(); + auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0); + auto b = ops::_Arg(scope.WithOpName("B"), DT_INT32, 1); + auto c = ops::Add(scope.WithOpName("C"), a, b); + auto d = ops::_Retval(scope.WithOpName("D"), c, 0); + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(scope.ToGraph(graph.get())); + + // Builds a description of the arguments. + std::vector args(2); + args[0].type = DT_INT32; + args[0].shape = TensorShape({2}); + args[0].parameter = 0; + args[1].type = DT_INT32; + args[1].shape = TensorShape({2}); + args[1].parameter = 1; + + // Compiles the graph. + XlaCompiler::CompilationResult result; + TF_ASSERT_OK(compiler_->CompileGraph("add", std::move(graph), flr_.get(), + args, /*use_tuple_arg=*/false, &result)); + + // Tests that the generated computation works. 
+ std::unique_ptr param0_literal = + xla::LiteralUtil::CreateR1({7, 42}); + std::unique_ptr param1_literal = + xla::LiteralUtil::CreateR1({-3, 101}); + std::unique_ptr param0_data = + client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); + std::unique_ptr param1_data = + client_->TransferToServer(*param1_literal).ConsumeValueOrDie(); + + std::unique_ptr actual = + client_ + ->Execute(result.computation, {param0_data.get(), param1_data.get()}) + .ConsumeValueOrDie(); + std::unique_ptr actual_literal = + client_->Transfer(*actual).ConsumeValueOrDie(); + + std::unique_ptr expected_literal = + xla::LiteralUtil::CreateR1({4, 143}); + xla::LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc index ad8fc3f2057..6c399366e5e 100644 --- a/tensorflow/compiler/tf2xla/xla_context.cc +++ b/tensorflow/compiler/tf2xla/xla_context.cc @@ -167,7 +167,7 @@ Status XlaContext::CollectResults( } } - if (handle.handle() > 0) { + if (handle.handle() > 0 || has_side_effects_) { // Build the full computation. The return value is the handle // constructed above. xla::StatusOr computation_status = builder().Build(); @@ -190,9 +190,11 @@ Status XlaContext::CollectResults( return Status::OK(); } -XlaContext::XlaContext(xla::Client* client, const string& computation_name, +XlaContext::XlaContext(XlaCompiler* compiler, xla::Client* client, + const string& computation_name, bool allow_cpu_custom_calls) - : xla_builder_(client, computation_name), + : compiler_(compiler), + xla_builder_(client, computation_name), allow_cpu_custom_calls_(allow_cpu_custom_calls) {} const xla::ComputationDataHandle& @@ -233,6 +235,11 @@ Status XlaContext::AddConstRetval(int retval_index, DataType dtype, return Status::OK(); } +void XlaContext::AddSideEffects() { + mutex_lock lock(mu_); + has_side_effects_ = true; +} + /* static */ const XlaExpression* XlaContext::CastExpressionFromTensor( const Tensor& tensor) { const XlaExpression* expression = diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h index b0464025f7d..f4c840dc0bd 100644 --- a/tensorflow/compiler/tf2xla/xla_context.h +++ b/tensorflow/compiler/tf2xla/xla_context.h @@ -68,7 +68,7 @@ class XlaExpression { TF_DISALLOW_COPY_AND_ASSIGN(XlaExpression); }; -// The XlaContext is the datastructure accessible from +// The XlaContext is the data structure accessible from // OpKernelContexts when evaluating a subgraph of Ops for JIT // compilation by XLA. When an Op is executed during JIT // compilation the input Tensors to the Op store handles to @@ -132,8 +132,8 @@ class XlaContext : public ResourceBase { } // Create a new XlaContext. - XlaContext(xla::Client* client, const string& computation_name, - bool allow_cpu_custom_calls); + XlaContext(XlaCompiler* compiler, xla::Client* client, + const string& computation_name, bool allow_cpu_custom_calls); // Builds XLA computations for each of the arguments. // Should only be called once to initialize the arguments. Not thread-safe. @@ -160,6 +160,9 @@ class XlaContext : public ResourceBase { Status AddConstRetval(int retval_index, DataType dtype, const xla::Literal& literal); + // Mark the computation as having side effects (i.e., Send operators). + void AddSideEffects(); + // Retrieves the ComputationDataHandle from an input Tensor to an Op. 
This
 // computation was constructed by an Op that executed previously and
 // created the output Tensor using CreateOutputTensorFromComputation
 //
 static const xla::ComputationDataHandle& GetComputationFromTensor(
     const Tensor& tensor);
 
+  XlaCompiler* compiler() const { return compiler_; }
+
   // Returns the ComputationBuilder that Ops use for compiling new
   // expressions.
   xla::ComputationBuilder& builder();
@@ -215,6 +220,8 @@ class XlaContext : public ResourceBase {
   // or CreateConstantOutputTensor.
   static const XlaExpression* GetExpressionFromTensor(const Tensor& tensor);
 
+  XlaCompiler* const compiler_;
+
   mutable mutex mu_;
 
   // The ComputationBuilder used to construct the subgraph's compiled
@@ -250,6 +257,9 @@ class XlaContext : public ResourceBase {
   // The non-data-dependent return values of the computation.
   std::vector compile_time_constant_ GUARDED_BY(mu_);
 
+  // Does the computation have side effects, i.e., Send() calls?
+  bool has_side_effects_ GUARDED_BY(mu_) = false;
+
   // Cache of prebuilt computations indexed by their type.
   using ComputationMap = std::map;
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index 3883b907b43..00cf1adc119 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -223,6 +223,10 @@ void XlaOpKernelContext::SetConstantOutput(int index, const Tensor& constant) {
   expression->set_constant_value(constant);
 }
 
+void XlaOpKernelContext::SetOpHasSideEffects() {
+  XlaContext::Get(context_).AddSideEffects();
+}
+
 void XlaOpKernelContext::CtxFailure(Status s) { context_->CtxFailure(s); }
 void XlaOpKernelContext::CtxFailureWithWarning(Status s) {
   context_->CtxFailureWithWarning(s);
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h
index 0c614005bec..5fbc0cb6ac3 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.h
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h
@@ -131,6 +131,9 @@ class XlaOpKernelContext {
   void SetStatus(const Status& status) { context_->SetStatus(status); }
   Status status() { return context_->status(); }
 
+  // Marks the op as having side effects (i.e., via Send).
+  void SetOpHasSideEffects();
+
   // Helper routines for the OP_REQUIRES macros
   void CtxFailure(Status s);
   void CtxFailureWithWarning(Status s);
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index 148c033eaa3..384aae867b1 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -314,12 +314,23 @@ tensorflow::Status LocalClient::ExecuteLocally(
                                  options, result);
 }
 
-StatusOr<std::unique_ptr<AotCompilationResult>> LocalClient::CompileAheadOfTime(
-    const Computation& computation,
-    const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
-    const Shape& result_layout, const AotCompilationOptions& options) {
-  return local_service_->CompileAheadOfTime(
-      computation.handle(), argument_layouts, result_layout, options);
+StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+LocalClient::CompileAheadOfTime(
+    const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
+        computations,
+    const AotCompilationOptions& options) {
+  std::vector<LocalService::AheadOfTimeComputationInstance> service_instances;
+  service_instances.reserve(computations.size());
+  for (const AheadOfTimeComputationInstance& instance : computations) {
+    service_instances.push_back({});
+    LocalService::AheadOfTimeComputationInstance& service_instance =
+        service_instances.back();
+    TF_RET_CHECK(instance.computation != nullptr);
+    service_instance.computation = instance.computation->handle();
+    service_instance.argument_layouts = instance.argument_layouts;
+    service_instance.result_layout = instance.result_layout;
+  }
+  return local_service_->CompileAheadOfTime(service_instances, options);
 }
 
 int64 LocalClient::PointerSizeForTriple(tensorflow::StringPiece target_triple) {
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index 1d6243a3b68..33366b97fd5 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -219,19 +219,26 @@ class LocalClient : public Client {
       const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
       const ExecutableBuildOptions& options);
 
-  // Compiles the computation for ahead-of-time execution. This is intended for
-  // use in static compilation. The |argument_layouts| parameter is used to
-  // inform the compiler of the expected layout for arguments while
-  // |result_layout| is used to signal the layout of the result. The |options|
-  // parameter is used to request which target the compiler should emit code
-  // for.
+  // A description of a computation to compile using CompileAheadOfTime.
+  struct AheadOfTimeComputationInstance {
+    const Computation* computation;
+    // Informs the compiler of the expected layout for arguments.
+    std::vector<const Shape*> argument_layouts;
+    // Specifies the expected result layout.
+    const Shape* result_layout;
+  };
+
+  // Compiles a list of computations for ahead-of-time execution. This is
+  // intended for use in static compilation. The |options| parameter describes
+  // the target for which the compiler should emit code.
   //
   // TODO(b/31222190): This doesn't really belong in LocalClient. Move it to
   // its own library.
-  StatusOr<std::unique_ptr<AotCompilationResult>> CompileAheadOfTime(
-      const Computation& computation,
-      const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
-      const Shape& result_layout, const AotCompilationOptions& options);
+  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+  CompileAheadOfTime(
+      const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
+          computations,
+      const AotCompilationOptions& options);
 
   // Returns the size of a pointer in bytes for a given triple.
static int64 PointerSizeForTriple(tensorflow::StringPiece triple); diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc index 81eb717821b..4d5f682156e 100644 --- a/tensorflow/compiler/xla/layout_util.cc +++ b/tensorflow/compiler/xla/layout_util.cc @@ -360,4 +360,20 @@ tensorflow::Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src, } } +/* static */ bool LayoutUtil::AreDimensionsConsecutive( + const Layout& layout, tensorflow::gtl::ArraySlice dims) { + std::vector positions_in_layout; + for (int64 dim : dims) { + positions_in_layout.push_back( + PositionInContainer(layout.minor_to_major(), dim)); + } + std::sort(positions_in_layout.begin(), positions_in_layout.end()); + for (size_t i = 1; i < positions_in_layout.size(); ++i) { + if (1 != positions_in_layout[i] - positions_in_layout[i - 1]) { + return false; + } + } + return true; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/layout_util.h b/tensorflow/compiler/xla/layout_util.h index 984bf402cdc..b6e8ecaa169 100644 --- a/tensorflow/compiler/xla/layout_util.h +++ b/tensorflow/compiler/xla/layout_util.h @@ -144,6 +144,11 @@ class LayoutUtil { // except that the element type is ignored. static bool LayoutsInShapesEqual(const Shape& lhs, const Shape& rhs); + // Returns whether the given dimensions are consecutive in the given layout, + // not necessarily in the order given. + static bool AreDimensionsConsecutive(const Layout& layout, + tensorflow::gtl::ArraySlice dims); + private: TF_DISALLOW_COPY_AND_ASSIGN(LayoutUtil); }; diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h index 78e9e3fb24f..f26116bf078 100644 --- a/tensorflow/compiler/xla/literal_util.h +++ b/tensorflow/compiler/xla/literal_util.h @@ -136,6 +136,12 @@ class LiteralUtil { const Literal& literal, tensorflow::gtl::ArraySlice start_indices, tensorflow::gtl::ArraySlice limit_indices); + // Creates a literal with a prepended dimension with bound "times"; e.g. a + // f32[3x2] with times=4 will produce a f32[4x3x2] with the 3x2 from the input + // literal replicated four times. + template + static std::unique_ptr Replicate(const Literal& input, int64 times); + // Create a literal by converting each element in an original literal to a new // type. 
template @@ -999,6 +1005,30 @@ LiteralUtil::CreateFullWithMonotonicDim0MajorLayout( return literal; } +template +/* static */ std::unique_ptr LiteralUtil::Replicate( + const Literal& input, int64 times) { + std::vector bounds = {times}; + bounds.insert(bounds.end(), input.shape().dimensions().begin(), + input.shape().dimensions().end()); + auto literal = MakeUnique(); + *literal->mutable_shape() = + ShapeUtil::MakeShape(input.shape().element_type(), bounds); + Reserve(ShapeUtil::ElementsIn(literal->shape()), literal.get()); + for (int64 index = 0; index < ShapeUtil::ElementsIn(input.shape()); ++index) { + const std::vector element_indices = + IndexUtil::LinearIndexToMultidimensionalIndex(input.shape(), index); + const auto element = Get(input, element_indices); + for (int64 sample = 0; sample < times; ++sample) { + std::vector output_indices = {sample}; + output_indices.insert(output_indices.end(), element_indices.begin(), + element_indices.end()); + Set(literal.get(), output_indices, element); + } + } + return literal; +} + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_LITERAL_UTIL_H_ diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index fe892e872fd..b08f859270a 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -749,11 +749,11 @@ Status AlgebraicSimplifierVisitor::HandleConvolution( TF_RET_CHECK(LayoutUtil::HasLayout(filter_shape)); TF_RET_CHECK(LayoutUtil::HasLayout(convolution_shape)); - // Require 1x1 filter in the spatial dimensions (so no need to extract image - // patches). - if (filter_shape.dimensions(dnums.kernel_spatial_dimensions(0)) != 1 || - filter_shape.dimensions(dnums.kernel_spatial_dimensions(1)) != 1) { - return Status::OK(); + // Require the spatial dimensions in the kernel to have a bound of one. + for (int64 i = 0; i < dnums.kernel_spatial_dimensions_size(); ++i) { + if (filter_shape.dimensions(dnums.kernel_spatial_dimensions(i)) != 1) { + return Status::OK(); + } } // Stride ignores part of the output, which matrix multiplication does not do, @@ -782,9 +782,9 @@ Status AlgebraicSimplifierVisitor::HandleConvolution( input_shape.layout().minor_to_major(0) != dnums.feature_dimension() || // The input feature dimension should come later in the minor-to-major // order. 
- (PositionInContainer(AsInt64Slice(filter_shape.layout().minor_to_major()), + (PositionInContainer(filter_shape.layout().minor_to_major(), dnums.kernel_input_feature_dimension()) < - PositionInContainer(AsInt64Slice(filter_shape.layout().minor_to_major()), + PositionInContainer(filter_shape.layout().minor_to_major(), dnums.kernel_output_feature_dimension()))) { return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc index 6e76c98c9f3..7452a7b6965 100644 --- a/tensorflow/compiler/xla/service/backend.cc +++ b/tensorflow/compiler/xla/service/backend.cc @@ -234,4 +234,8 @@ StatusOr Backend::devices_equivalent(int device_ordinal_a, executor_b->GetDeviceDescription().name()); } +Status Backend::ResetDevices() { + return transfer_manager_->ResetDevices(stream_executors_); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h index 17c53d299ed..db482c09ae2 100644 --- a/tensorflow/compiler/xla/service/backend.h +++ b/tensorflow/compiler/xla/service/backend.h @@ -149,6 +149,9 @@ class Backend { // used for scheduling work. For other platforms, returns NULL. const Eigen::ThreadPoolDevice* eigen_intra_op_thread_pool_device() const; + // Resets the devices associated with this backend. + Status ResetDevices(); + private: struct EigenThreadPoolWrapper; Backend(int64 replica_count, perftools::gputools::Platform* platform, diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h index 632081a747e..85c2d03e1bc 100644 --- a/tensorflow/compiler/xla/service/compiler.h +++ b/tensorflow/compiler/xla/service/compiler.h @@ -128,10 +128,11 @@ class Compiler { // Compiles the HLO module for ahead-of-time execution. This is intended for // use in static compilation. 
- virtual StatusOr> CompileAheadOfTime( - std::unique_ptr module, - std::unique_ptr module_config, HloDumper dump_hlo, - const AotCompilationOptions& options) = 0; + virtual StatusOr>> + CompileAheadOfTime( + std::vector> module, + std::vector> module_config, + HloDumper dump_hlo, const AotCompilationOptions& options) = 0; ///// // The Compiler class also serves as a point to register compiler objects diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index d566cfd8c8f..b9f4537b809 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -478,10 +478,13 @@ StatusOr>> CpuCompiler::Compile( "Compilation of multiple HLO modules is not yet supported on CPU."); } -StatusOr> CpuCompiler::CompileAheadOfTime( - std::unique_ptr hlo_module, - std::unique_ptr module_config, HloDumper dump_hlo, - const AotCompilationOptions& aot_options) { +StatusOr>> +CpuCompiler::CompileAheadOfTime( + std::vector> hlo_modules, + std::vector> module_configs, + HloDumper dump_hlo, const AotCompilationOptions& aot_options) { + TF_RET_CHECK(hlo_modules.size() == module_configs.size()); + if (aot_options.PlatformId() != se::host::kHostPlatformId) { return InvalidArgument("Incompatible AOT compilation platform"); } @@ -549,72 +552,78 @@ StatusOr> CpuCompiler::CompileAheadOfTime( const llvm::DataLayout& data_layout = llvm_module.getDataLayout(); int64 pointer_size = data_layout.getPointerSize(); - TF_RETURN_IF_ERROR( - RunHloPasses(hlo_module.get(), module_config.get(), dump_hlo)); + std::vector> results; + for (int i = 0; i < hlo_modules.size(); ++i) { + HloModule* hlo_module = hlo_modules[i].get(); + HloModuleConfig* module_config = module_configs[i].get(); - SequentialHloOrdering::HloModuleSequence module_sequence = - CreateModuleSequence(hlo_module.get()); - // Run buffer analysis on the HLO graph. This analysis figures out which - // temporary buffers are required to run the computation. - TF_ASSIGN_OR_RETURN( - std::unique_ptr assignment, - BufferAssigner::Run( - hlo_module.get(), - MakeUnique(hlo_module.get(), module_sequence), - pointer_size)); + TF_RETURN_IF_ERROR(RunHloPasses(hlo_module, module_config, dump_hlo)); - IrEmitter ir_emitter(*hlo_module, *module_config, *assignment, &llvm_module, - /*hlo_to_profile_idx=*/nullptr); - HloComputation* computation = hlo_module->entry_computation(); - for (auto embedded_computation : - computation->MakeEmbeddedComputationsList()) { - TF_RETURN_IF_ERROR( - ir_emitter - .EmitComputation(embedded_computation, embedded_computation->name(), - /*is_entry_computation=*/false, - &module_sequence.at(embedded_computation)) - .status()); - } - const string& entry_point_name = options.entry_point_name(); - TF_ASSIGN_OR_RETURN( - llvm::Function * entry_function, - ir_emitter.EmitComputation(computation, entry_point_name, - /*is_entry_computation=*/true)); + SequentialHloOrdering::HloModuleSequence module_sequence = + CreateModuleSequence(hlo_module); + // Run buffer analysis on the HLO graph. This analysis figures out which + // temporary buffers are required to run the computation. 
+ TF_ASSIGN_OR_RETURN( + std::unique_ptr assignment, + BufferAssigner::Run(hlo_module, MakeUnique( + hlo_module, module_sequence), + pointer_size)); - entry_function->setName(llvm_ir::AsStringRef(entry_point_name)); - - Disassembler disassembler(*target_machine); - CompilerFunctor compiler_functor(target_machine.get(), &disassembler, - opt_level, CompilerFunctor::AllIntrinsics()); - llvm::object::OwningBinary object_file = - compiler_functor(llvm_module); - llvm::StringRef object_file_data_ref = object_file.getBinary()->getData(); - ObjectFileData object_file_data(object_file_data_ref.begin(), - object_file_data_ref.end()); - - BufferSizes buffer_sizes; - for (const BufferAllocation& allocation : assignment->Allocations()) { - // Callers don't need to allocate temporary buffers for parameters. - if (allocation.is_entry_computation_parameter()) { - buffer_sizes.push_back(-1); - continue; + IrEmitter ir_emitter(*hlo_module, *module_config, *assignment, &llvm_module, + /*hlo_to_profile_idx=*/nullptr); + HloComputation* computation = hlo_module->entry_computation(); + for (auto embedded_computation : + computation->MakeEmbeddedComputationsList()) { + TF_RETURN_IF_ERROR( + ir_emitter + .EmitComputation(embedded_computation, + embedded_computation->name(), + /*is_entry_computation=*/false, + &module_sequence.at(embedded_computation)) + .status()); } - // Callers don't need to allocate anything for thread-local temporary - // buffers. They are lowered to allocas. - if (allocation.is_thread_local()) { - buffer_sizes.push_back(-1); - continue; + const string& entry_point_name = options.entry_point_name(); + TF_ASSIGN_OR_RETURN( + llvm::Function * entry_function, + ir_emitter.EmitComputation(computation, entry_point_name, + /*is_entry_computation=*/true)); + + entry_function->setName(llvm_ir::AsStringRef(entry_point_name)); + + Disassembler disassembler(*target_machine); + CompilerFunctor compiler_functor(target_machine.get(), &disassembler, + opt_level, + CompilerFunctor::AllIntrinsics()); + llvm::object::OwningBinary object_file = + compiler_functor(llvm_module); + llvm::StringRef object_file_data_ref = object_file.getBinary()->getData(); + ObjectFileData object_file_data(object_file_data_ref.begin(), + object_file_data_ref.end()); + + BufferSizes buffer_sizes; + for (const BufferAllocation& allocation : assignment->Allocations()) { + // Callers don't need to allocate temporary buffers for parameters. + if (allocation.is_entry_computation_parameter()) { + buffer_sizes.push_back(-1); + continue; + } + // Callers don't need to allocate anything for thread-local temporary + // buffers. They are lowered to allocas. 
+ if (allocation.is_thread_local()) { + buffer_sizes.push_back(-1); + continue; + } + buffer_sizes.push_back(allocation.size()); } - buffer_sizes.push_back(allocation.size()); + + TF_ASSIGN_OR_RETURN(const BufferAllocation* result_allocation, + assignment->GetUniqueTopLevelOutputAllocation()); + + results.emplace_back(MakeUnique( + std::move(object_file_data), std::move(buffer_sizes), + result_allocation->index())); } - - TF_ASSIGN_OR_RETURN(const BufferAllocation* result_allocation, - assignment->GetUniqueTopLevelOutputAllocation()); - - return std::unique_ptr( - MakeUnique(std::move(object_file_data), - std::move(buffer_sizes), - result_allocation->index())); + return std::move(results); } se::Platform::Id CpuCompiler::PlatformId() const { diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h index 349724d8406..d7d77ce58a6 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h @@ -123,10 +123,11 @@ class CpuCompiler : public Compiler { HloDumper dump_hlo, std::vector stream_exec) override; - StatusOr> CompileAheadOfTime( - std::unique_ptr module, - std::unique_ptr module_config, HloDumper dump_hlo, - const AotCompilationOptions& options) override; + StatusOr>> + CompileAheadOfTime( + std::vector> module, + std::vector> module_config, + HloDumper dump_hlo, const AotCompilationOptions& options) override; perftools::gputools::Platform::Id PlatformId() const override; diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc index 086306696d3..1a6a144bd63 100644 --- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc @@ -160,7 +160,9 @@ Status GenericTransferManager::TransferLiteralToInfeed( return Unimplemented("Infeed is not supported on GPU (b/30467474)"); } -Status GenericTransferManager::ResetDevice(se::StreamExecutor* executor) { +Status GenericTransferManager::ResetDevices( + tensorflow::gtl::ArraySlice + executors) { return Unimplemented( "Device reset is not yet supported on CPU and GPU (b/30481585)"); } diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.h b/tensorflow/compiler/xla/service/generic_transfer_manager.h index cfa02bf22f7..06819d65c70 100644 --- a/tensorflow/compiler/xla/service/generic_transfer_manager.h +++ b/tensorflow/compiler/xla/service/generic_transfer_manager.h @@ -55,7 +55,9 @@ class GenericTransferManager : public TransferManager { Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor, const Literal& literal) override; - Status ResetDevice(perftools::gputools::StreamExecutor* executor) override; + Status ResetDevices( + tensorflow::gtl::ArraySlice + executors) override; StatusOr> ShallowCopyTupleFromDevice( diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index a13279c6ff6..2f95446e6c4 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -312,10 +312,11 @@ StatusOr>> GpuCompiler::Compile( "Compilation of multiple HLO modules is not yet supported on GPU."); } -StatusOr> GpuCompiler::CompileAheadOfTime( - std::unique_ptr module, - std::unique_ptr module_config, HloDumper dump_hlo, - const AotCompilationOptions& options) { +StatusOr>> +GpuCompiler::CompileAheadOfTime( + std::vector> module, + std::vector> 
module_config, + HloDumper dump_hlo, const AotCompilationOptions& options) { return Unimplemented("not yet implemented: GpuCompiler::CompileAheadOfTime"); } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h index fefa4031041..a074607760f 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h @@ -52,10 +52,11 @@ class GpuCompiler : public Compiler { HloDumper dump_hlo, std::vector stream_exec) override; - StatusOr> CompileAheadOfTime( - std::unique_ptr module, - std::unique_ptr module_config, HloDumper dump_hlo, - AotCompilationOptions const& options) override; + StatusOr>> + CompileAheadOfTime( + std::vector> module, + std::vector> module_config, + HloDumper dump_hlo, AotCompilationOptions const& options) override; perftools::gputools::Platform::Id PlatformId() const override; diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc index 0821fb01abb..e141179ba17 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include +#include #include "external/llvm/include/llvm/IR/Module.h" #include "tensorflow/compiler/xla/layout_util.h" @@ -121,8 +122,22 @@ bool IsReductionToVector(const HloInstruction& reduce) { return false; } const HloInstruction* input = reduce.operand(0); - return ShapeUtil::Rank(input->shape()) > 1 && - ShapeUtil::Rank(reduce.shape()) == 1; + std::vector dims_to_keep; + for (int64 dim = 0; dim < input->shape().dimensions().size(); ++dim) { + if (!std::count(reduce.dimensions().begin(), reduce.dimensions().end(), + dim)) { + dims_to_keep.push_back(dim); + } + } + return LayoutUtil::AreDimensionsConsecutive(input->shape().layout(), + dims_to_keep) && + ShapeUtil::Equal(reduce.shape(), ShapeUtil::FilterDimensions( + [&dims_to_keep](int64 dim) { + return std::count( + dims_to_keep.begin(), + dims_to_keep.end(), dim); + }, + input->shape())); } // This emits a device-side call to diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 79a64433465..c107f9cbbe2 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1047,8 +1047,9 @@ Status IrEmitterUnnested::EmitRowReduction( // Figures out whether `reduce` is a row or column reduction, and which // dimensions to reduce, and calls either `EmitRowReduction` or // `EmitColumnReduction` as appropriate. -// Prerequisite: the shape of `reduce` has rank 1 and, if `reduce` is fused, the -// fused subgraph is pure elementwise. +// Prerequisite: all the dimensions to keep are contiguous in the input layout +// and, if `reduce` is fused, the fused subgraph is pure +// elementwise. Status IrEmitterUnnested::EmitReductionToVector( HloInstruction* reduce, const Shape& input_shape, const llvm_ir::ElementGenerator& input_gen, @@ -1063,25 +1064,39 @@ Status IrEmitterUnnested::EmitReductionToVector( << reduce->ToString(); // Specialize multi-dimensional-array-to-vector reduction. 
- // - // TODO(b/33239522): we could use the same algorithm for general reduction - as long as the input dimensions to keep are adjacent in the layout and - have the same relative layout as their corresponding output dimensions. - // For example, reducing shape [2,3,4,5] with minor_to_major={2,0,1,3} to - shape [2,4] with minor_to_major={1,0} can be implemented as a column - reduction from shape [15,8] to shape [8]. - int64 input_dim_to_keep = -1; + std::vector<int64> input_dims_to_keep; for (int64 input_dim = 0; input_dim < ShapeUtil::Rank(input_shape); ++input_dim) { if (std::find(dimensions_to_reduce.begin(), dimensions_to_reduce.end(), input_dim) == dimensions_to_reduce.end()) { - input_dim_to_keep = input_dim; - break; + input_dims_to_keep.push_back(input_dim); } } - CHECK_NE(-1, input_dim_to_keep); - if (LayoutUtil::Minor(input_shape.layout(), 0) == input_dim_to_keep) { + // Sort the dimensions to keep from minor to major, to facilitate checking + // whether another dimension is major or minor relative to them. + std::sort(input_dims_to_keep.begin(), input_dims_to_keep.end(), + [&input_shape](int64 dim_a, int64 dim_b) { + return PositionInContainer(input_shape.layout().minor_to_major(), + dim_a) < + PositionInContainer(input_shape.layout().minor_to_major(), + dim_b); + }); + // Now, if output rank is at least 1, `input_dims_to_keep.front()` is + // minormost and `input_dims_to_keep.back()` is majormost. + + // If the dimensions to keep are minormost, emit a column reduction. Because + // all the dimensions to keep are contiguous (a prerequisite of + // `EmitReductionToVector`), we only need to check whether the minormost + // dimension of the input is among the dimensions to keep. + // + // If the output is scalar, we could emit either a row or a column reduction. + // Some tests have shown that a scalar reduction is no more efficient as a + // row reduction than as a column reduction, and a column reduction is + // simpler to emit, so we emit a column reduction in this case. + if (input_dims_to_keep.empty() || + input_dims_to_keep.front() == + LayoutUtil::Minor(input_shape.layout(), 0)) { // Column reduction. Treat the result of "input" as a matrix whose width // is the most minor dimension and height the product of other dimensions, // and treat "reduce" as a column reduction of the input matrix.
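As a side note for readers tracing the new check: the decision described in the comments above can be sanity-checked with a few lines of standalone Python (illustrative names only, not XLA's API). A reduction is emitted as a column reduction exactly when the output is scalar or the minormost kept dimension coincides with the minormost dimension of the input layout:

    def is_column_reduction(minor_to_major, dims_to_keep):
        # minor_to_major lists dimension numbers from minor to major, mirroring
        # input_shape.layout().minor_to_major() in the C++ above.
        if not dims_to_keep:
            return True  # Scalar output: emitted as a column reduction.
        # Sort kept dimensions by layout position (minor first), like the
        # std::sort over PositionInContainer above.
        ordered = sorted(dims_to_keep, key=minor_to_major.index)
        return ordered[0] == minor_to_major[0]

    # The example from the deleted TODO: shape [2,3,4,5] with minor_to_major
    # {2,0,1,3}, keeping dimensions {0,2}. Dimension 2 is minormost, so this
    # is a column reduction (from shape [15,8] to shape [8]).
    assert is_column_reduction([2, 0, 1, 3], [0, 2])
    # Keeping only the majormost dimension 3 instead yields a row reduction.
    assert not is_column_reduction([2, 0, 1, 3], [3])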
@@ -1091,7 +1106,8 @@ Status IrEmitterUnnested::EmitReductionToVector( int64 height = 1; for (int64 input_dim = 0; input_dim < ShapeUtil::Rank(input_shape); ++input_dim) { - if (input_dim != input_dim_to_keep) { + if (!std::count(input_dims_to_keep.begin(), input_dims_to_keep.end(), + input_dim)) { height *= input_shape.dimensions(input_dim); } } @@ -1108,22 +1124,19 @@ Status IrEmitterUnnested::EmitReductionToVector( int64 width = 1; for (int64 input_dim = 0; input_dim < ShapeUtil::Rank(input_shape); ++input_dim) { - if (PositionInContainer( - AsInt64Slice(input_shape.layout().minor_to_major()), input_dim) > - PositionInContainer( - AsInt64Slice(input_shape.layout().minor_to_major()), - input_dim_to_keep)) { + if (PositionInContainer(input_shape.layout().minor_to_major(), + input_dim) > + PositionInContainer(input_shape.layout().minor_to_major(), + input_dims_to_keep.back())) { depth *= input_shape.dimensions(input_dim); - } else if (PositionInContainer( - AsInt64Slice(input_shape.layout().minor_to_major()), - input_dim) < - PositionInContainer( - AsInt64Slice(input_shape.layout().minor_to_major()), - input_dim_to_keep)) { + } else if (PositionInContainer(input_shape.layout().minor_to_major(), + input_dim) < + PositionInContainer(input_shape.layout().minor_to_major(), + input_dims_to_keep.front())) { width *= input_shape.dimensions(input_dim); } } - int64 height = input_shape.dimensions(input_dim_to_keep); + const int64 height = ShapeUtil::ElementsIn(reduce->shape()); return EmitRowReduction(depth, height, width, reduce, input_shape, input_gen, init_value_gen, reducer); } diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc index 38465e37e7b..7f86a3cbb57 100644 --- a/tensorflow/compiler/xla/service/local_service.cc +++ b/tensorflow/compiler/xla/service/local_service.cc @@ -206,42 +206,49 @@ tensorflow::Status LocalService::ExecuteLocally( return tensorflow::Status::OK(); } -StatusOr> +StatusOr>> LocalService::CompileAheadOfTime( - const ComputationHandle& computation, - const tensorflow::gtl::ArraySlice argument_layouts, - const Shape& result_layout, const AotCompilationOptions& options) { - TF_ASSIGN_OR_RETURN(UserComputation * user_computation, - computation_tracker_.Resolve(computation)); - VersionedComputationHandle versioned_handle = - user_computation->GetVersionedHandle(); + const tensorflow::gtl::ArraySlice + computations, + const AotCompilationOptions& options) { + std::vector> hlo_modules; + std::vector> module_configs; + for (const AheadOfTimeComputationInstance& instance : computations) { + TF_ASSIGN_OR_RETURN(UserComputation * user_computation, + computation_tracker_.Resolve(instance.computation)); + VersionedComputationHandle versioned_handle = + user_computation->GetVersionedHandle(); - TF_ASSIGN_OR_RETURN( - std::unique_ptr hlo_module, - computation_tracker_.BuildHloModule(versioned_handle, - /*include_unused_parameters=*/true)); + TF_ASSIGN_OR_RETURN(std::unique_ptr hlo_module, + computation_tracker_.BuildHloModule( + versioned_handle, + /*include_unused_parameters=*/true)); + hlo_modules.push_back(std::move(hlo_module)); - TF_ASSIGN_OR_RETURN( - std::shared_ptr program_shape, - user_computation->ComputeProgramShape(versioned_handle.version)); + TF_ASSIGN_OR_RETURN( + std::shared_ptr program_shape, + user_computation->ComputeProgramShape(versioned_handle.version)); - auto module_config = MakeUnique(*program_shape); - auto* computation_layout = module_config->mutable_entry_computation_layout(); - for (int i = 0; i 
< argument_layouts.size(); ++i) { - const Shape& argument_layout = *argument_layouts[i]; - if (ShapeUtil::IsTuple(argument_layout)) { - return Unimplemented("tuple arguments not supported yet"); + module_configs.push_back(MakeUnique(*program_shape)); + HloModuleConfig* module_config = module_configs.back().get(); + auto* computation_layout = + module_config->mutable_entry_computation_layout(); + for (int i = 0; i < instance.argument_layouts.size(); ++i) { + const Shape& argument_layout = *instance.argument_layouts[i]; + if (ShapeUtil::IsTuple(argument_layout)) { + return Unimplemented("tuple arguments not supported yet"); + } + TF_RETURN_IF_ERROR( + computation_layout->mutable_parameter_layout(i)->CopyLayoutFromShape( + argument_layout)); } TF_RETURN_IF_ERROR( - computation_layout->mutable_parameter_layout(i)->CopyLayoutFromShape( - argument_layout)); + computation_layout->mutable_result_layout()->CopyLayoutFromShape( + *instance.result_layout)); } - TF_RETURN_IF_ERROR( - computation_layout->mutable_result_layout()->CopyLayoutFromShape( - result_layout)); return execute_backend_->compiler() - ->CompileAheadOfTime(std::move(hlo_module), std::move(module_config), + ->CompileAheadOfTime(std::move(hlo_modules), std::move(module_configs), MakeHloDumper(), options) .ConsumeValueOrDie(); } @@ -426,8 +433,9 @@ StatusOr> LocalService::ExecuteLocallyInternal( } else { se::StreamExecutor* stream_executor; if (options.device_ordinal() >= 0) { - TF_ASSIGN_OR_RETURN(stream_executor, execute_backend_->stream_executor( - options.device_ordinal())); + TF_ASSIGN_OR_RETURN( + stream_executor, + execute_backend_->stream_executor(options.device_ordinal())); } else { stream_executor = execute_backend_->default_stream_executor(); } diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h index 3e160a0201e..9fe0d5993b3 100644 --- a/tensorflow/compiler/xla/service/local_service.h +++ b/tensorflow/compiler/xla/service/local_service.h @@ -139,13 +139,21 @@ class LocalService : public Service { tensorflow::gtl::ArraySlice arguments, const LocalExecuteOptions& options, ShapedBuffer* result_buffer); - // Compiles the computation for ahead-of-time execution. This is intended for - // use in static compilation. See |LocalClient::CompileAheadOfTime| for - // additional details. - StatusOr> CompileAheadOfTime( - const ComputationHandle& computation, - const tensorflow::gtl::ArraySlice argument_layouts, - const Shape& result_layout, const AotCompilationOptions& Options); + // A description of a computation to compile using CompileAheadOfTime. + struct AheadOfTimeComputationInstance { + ComputationHandle computation; + std::vector argument_layouts; + const Shape* result_layout = nullptr; + }; + + // Compiles a list of computations for ahead-of-time execution. This is + // intended for use in static compilation. See + // |LocalClient::CompileAheadOfTime| for additional details. + StatusOr>> + CompileAheadOfTime( + const tensorflow::gtl::ArraySlice + computations, + const AotCompilationOptions& Options); // Builds an Executable with the given argument layouts and options. 
If // result_layout is non-null, then the executable is compiled to produce a diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index 847aea78884..0b3900b3b20 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -1019,16 +1019,7 @@ tensorflow::Status Service::TransferToInfeed(const TransferToInfeedRequest* arg, tensorflow::Status Service::ResetDevice(const ResetDeviceRequest* arg, ResetDeviceResponse* result) { - int first_device_ordinal = arg->has_device_handle() - ? arg->device_handle().handle() - : execute_backend_->default_device_ordinal(); - TF_ASSIGN_OR_RETURN(auto executors, - execute_backend_->Replicas(first_device_ordinal)); - for (se::StreamExecutor* executor : executors) { - TF_RETURN_IF_ERROR( - execute_backend_->transfer_manager()->ResetDevice(executor)); - } - return tensorflow::Status::OK(); + return execute_backend_->ResetDevices(); } tensorflow::Status Service::TransferToClientInProcess( diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h index 1141e99fe32..e8ad61c9d0f 100644 --- a/tensorflow/compiler/xla/service/service.h +++ b/tensorflow/compiler/xla/service/service.h @@ -162,7 +162,15 @@ class Service : public ServiceInterface { const TransferToInfeedRequest* arg, TransferToInfeedResponse* result) override; - // Resets the device, clearing all existing state on the device. + // Resets devices, clearing all existing state on all the devices associated + // with this service (including memory allocated on the devices). + // + // ResetDevice may only be called when no state from a previous Execution on + // the device is used by the next Execution. + // + // ResetDevice should be called before an Execution that expects the device + // to be in the reset state, for example when the prior Execution modifies + // device state (e.g., architectural state) that the next Execution depends + // on. tensorflow::Status ResetDevice(const ResetDeviceRequest* arg, ResetDeviceResponse* result) override; diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index 11559ad7578..fbab2dfd4af 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -1319,9 +1319,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( // Permute(dimensions,input) computes output[dimensions[i]]=input[i]. However, // we need output[i]=input[dimensions[i]] which is // Permute(Inverse(dimensions),input).
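The permutation identity in the comment above is easy to verify numerically. A minimal plain-Python sketch, where permute and inverse stand in for xla::Permute and xla::InversePermutation (not the real implementations):

    def permute(dims, seq):
        # output[dims[i]] = seq[i], the convention described in the comment.
        out = [None] * len(seq)
        for i, d in enumerate(dims):
            out[d] = seq[i]
        return out

    def inverse(dims):
        # inv[dims[i]] = i, so permute(inverse(dims), seq)[i] == seq[dims[i]].
        inv = [0] * len(dims)
        for i, d in enumerate(dims):
            inv[d] = i
        return inv

    dims = [1, 2, 3, 0]   # the transpose exercised by the test change below
    shape = [2, 3, 4, 5]  # assumed value of a_shape in that test
    assert permute(inverse(dims), shape) == [shape[d] for d in dims] == [3, 4, 5, 2]

The result is consistent with the {3, 4, 5, 2} shape expected in shape_inference_test.cc below.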
- return ShapeUtil::MakeShape(operand.element_type(), - Permute(InversePermutation(dimensions), - AsInt64Slice(operand.dimensions()))); + return ShapeUtil::PermuteDimensions(InversePermutation(dimensions), operand); } /* static */ StatusOr ShapeInference::InferSelectShape( diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc index 10fd4e53c5c..5a1ae6b0024 100644 --- a/tensorflow/compiler/xla/service/shape_inference_test.cc +++ b/tensorflow/compiler/xla/service/shape_inference_test.cc @@ -1125,8 +1125,8 @@ TEST_F(ShapeInferenceTest, Transpose) { ShapeInference::InferTransposeShape(a_shape, {1, 2, 3, 0}); EXPECT_IS_OK(inferred_shape_and_status); Shape inferred_shape = inferred_shape_and_status.ValueOrDie(); - EXPECT_TRUE(ShapeUtil::Equal(inferred_shape, - ShapeUtil::MakeShape(F32, {3, 4, 5, 2}))); + EXPECT_TRUE(ShapeUtil::Compatible(inferred_shape, + ShapeUtil::MakeShape(F32, {3, 4, 5, 2}))); } } // namespace diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h index 90dc921b7de..7ffce452139 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.h +++ b/tensorflow/compiler/xla/service/transfer_manager.h @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/thread_annotations.h" @@ -63,8 +64,10 @@ class TransferManager { perftools::gputools::StreamExecutor* executor, const Literal& literal) = 0; - // Resets the device that the given executor runs on. - virtual Status ResetDevice(perftools::gputools::StreamExecutor* executor) = 0; + // Resets the devices associated with this transfer manager. + virtual Status ResetDevices( + tensorflow::gtl::ArraySlice + executors) = 0; // Shallow copy a tuple from the device and create a DeviceMemoryBase object // for each element in the tuple.
A DeviceMemoryBase object refers to the diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index 1e1e8c1b98a..ab2c43cd3dc 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -984,4 +984,38 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, check_input_unit_indices(output_shape, input_shape); } +/* static */ Shape ShapeUtil::DeleteDimension(int64 dim_to_delete, + Shape shape) { + shape.mutable_dimensions()->erase(shape.dimensions().begin() + dim_to_delete); + if (LayoutUtil::HasLayout(shape)) { + Layout* layout = shape.mutable_layout(); + for (size_t i = 0; i < layout->minor_to_major().size();) { + if (layout->minor_to_major(i) == dim_to_delete) { + layout->mutable_minor_to_major()->erase( + layout->minor_to_major().begin() + i); + continue; + } + if (layout->minor_to_major(i) > dim_to_delete) { + (*layout->mutable_minor_to_major())[i] -= 1; + } + ++i; + } + } + return shape; +} + +/* static */ Shape ShapeUtil::FilterDimensions( + const std::function<bool(int64)>& p, Shape shape) { + std::vector<int64> dims_to_delete; + for (int64 i = shape.dimensions().size() - 1; i >= 0; --i) { + if (!p(i)) { + dims_to_delete.push_back(i); + } + } + for (int64 dim : dims_to_delete) { + shape = DeleteDimension(dim, shape); + } + return shape; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index 35fd714b0bc..fa5fcc0224f 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -374,6 +374,19 @@ class ShapeUtil { static bool ReshapeIsBitcast(const Shape& input_shape, const Shape& output_shape); + // Returns a shape with the given dimension deleted. + // For example: + // • `DeleteDimension(1, T[m, n, k]) = T[m, k]` + static Shape DeleteDimension(int64 dim_to_delete, Shape shape); + + // Returns a shape with all the dimensions of the input shape for which `p` + // returns true. + // For example: + // • `FilterDimensions((< 2), T[m, n, k]) = T[m, n]` + // • `FilterDimensions(is_even_number, T[m, n, k]) = T[m, k]` + static Shape FilterDimensions(const std::function<bool(int64)>& p, + Shape shape); + private: // Recursive helper for comparing the equality of two shapes. Returns true if // the shapes are the same. If compare_layouts is true, then layouts must also diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc index 50e5dec0f62..50d9ee50835 100644 --- a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc +++ b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc @@ -25,6 +25,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" @@ -72,16 +73,19 @@ int main(int argc, char** argv) { llvm::Triple triple(xla::llvm_ir::AsStringRef(triple_string)); + xla::Computation computation = builder.Build().ConsumeValueOrDie(); + xla::LocalClient::AheadOfTimeComputationInstance instance{ + &computation, /*argument_layouts=*/{&opaque_shape}, &r0f32}; + xla::cpu::CpuAotCompilationOptions options( triple_string, /*cpu_name=*/"", /*features=*/"", "SumAndDouble", xla::cpu::CpuAotCompilationOptions::RelocationModel::Static); + + auto results = + client->CompileAheadOfTime({instance}, options).ConsumeValueOrDie(); auto result = xla::unique_ptr_static_cast( - client - ->CompileAheadOfTime(builder.Build().ValueOrDie(), - /*argument_layouts=*/{&opaque_shape}, r0f32, - options) - .ConsumeValueOrDie()); + std::move(results.front())); // We should have two buffers, one for the result and one temporary buffer, // and both should be float-sized. It's lame to hard-code this, but we need // local_client_aot_test.cc to be able to easily invoke the function. diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc index dac5dadf834..3ee5dfc9496 100644 --- a/tensorflow/compiler/xla/util.cc +++ b/tensorflow/compiler/xla/util.cc @@ -176,12 +176,6 @@ std::vector ComposePermutations(tensorflow::gtl::ArraySlice p1, return output; } -int64 PositionInContainer(tensorflow::gtl::ArraySlice container, - int64 value) { - return std::find(container.begin(), container.end(), value) - - container.begin(); -} - PaddingConfig MakeNoPaddingConfig(int64 rank) { PaddingConfig padding_config; for (int64 dnum = 0; dnum < rank; ++dnum) { diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h index 842b4f219a0..00f8d946f89 100644 --- a/tensorflow/compiler/xla/util.h +++ b/tensorflow/compiler/xla/util.h @@ -183,8 +183,11 @@ std::vector InversePermutation( std::vector ComposePermutations(tensorflow::gtl::ArraySlice p1, tensorflow::gtl::ArraySlice p2); -int64 PositionInContainer(tensorflow::gtl::ArraySlice container, - int64 value); +template +int64 PositionInContainer(const Container& container, int64 value) { + return std::distance(container.begin(), + std::find(container.begin(), container.end(), value)); +} // Returns a PaddingConfig object that represents no padding for the given rank. PaddingConfig MakeNoPaddingConfig(int64 rank); diff --git a/tensorflow/contrib/android/BUILD b/tensorflow/contrib/android/BUILD index be77f4985a0..4c7df9d8d63 100644 --- a/tensorflow/contrib/android/BUILD +++ b/tensorflow/contrib/android/BUILD @@ -33,6 +33,7 @@ cc_library( visibility = ["//visibility:public"], deps = [ "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/java/src/main/native", ], alwayslink = 1, ) diff --git a/tensorflow/contrib/android/cmake/README.md b/tensorflow/contrib/android/cmake/README.md index ad9e1720c74..915319da557 100644 --- a/tensorflow/contrib/android/cmake/README.md +++ b/tensorflow/contrib/android/cmake/README.md @@ -1,6 +1,10 @@ TensorFlow-Android-Inference ============================ -Android Java interface to the TensorFlow native APIs +This directory contains CMake support for building the Android Java Inference +interface to the TensorFlow native APIs. + +See [tensorflow/contrib/android](..) 
for more details about the library, and +instructions for building with Bazel. Usage ----- @@ -24,9 +28,9 @@ Note: this makes native code in the lib traceable from your app. Dependencies ------------ -TensorFlow-Android-Inference depends on the TensorFlow static libs already built in your -local TensorFlow repo directory. For Linux/Mac OS, build_all_android.sh is used -in build.gradle to build it. It DOES take time to build the core libs; +TensorFlow-Android-Inference depends on the TensorFlow static libs already built +in your local TensorFlow repo directory. For Linux/Mac OS, build_all_android.sh +is used in build.gradle to build it. It DOES take time to build the core libs; so, by default, it is commented out to avoid confusion (otherwise Android Studio would appear to hang during opening the project). To enable it, refer to the comment in diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py index 8d5ff341acd..24b726ac098 100644 --- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py +++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function import time + from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops from tensorflow.contrib.rnn.python.ops import core_rnn from tensorflow.contrib.rnn.python.ops import core_rnn_cell_impl @@ -31,12 +32,8 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import init_ops from tensorflow.python.ops import variables -from tensorflow.python.platform import flags from tensorflow.python.platform import test -flags.DEFINE_integer("batch_size", 64, "batch size.") -FLAGS = flags.FLAGS - class CudnnRNNBenchmark(test.Benchmark): """Benchmarks Cudnn LSTM and other related models. diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/include/soc_interface.h b/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/include/soc_interface.h new file mode 100644 index 00000000000..6d85e6ce487 --- /dev/null +++ b/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/include/soc_interface.h @@ -0,0 +1,98 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_PLATFORM_HEXAGON_SOC_INTERFACE_H_ +#define TENSORFLOW_PLATFORM_HEXAGON_SOC_INTERFACE_H_ + +#include + +// Declaration of the APIs provided by the hexagon shared library. This +// header is shared by both the hexagon library built with the Qualcomm SDK +// and TensorFlow. +// All functions defined here must have the prefix "soc_interface" to avoid +// naming conflicts. +#ifdef __cplusplus +extern "C" { +#else +#include +#endif // __cplusplus +// Returns the version of the loaded hexagon wrapper shared library.
+// You should assert that the version matches the expected version before +// calling APIs defined in this header. +int soc_interface_GetWrapperVersion(); +// Returns the version of hexagon binary. +// You should assert that the version matches the expected version before +// calling APIs defined in this header. +int soc_interface_GetSocControllerVersion(); +// Initialize SOC +bool soc_interface_Init(); +// Finalize SOC +bool soc_interface_Finalize(); +// Execute graph on SOC +bool soc_interface_ExecuteGraph(); +// Teardown graph setup +bool soc_interface_TeardownGraph(); +// Send input data to SOC +bool soc_interface_FillInputNodeFloat(int x, int y, int z, int d, + const uint8_t* const buf, + uint64_t buf_size); +// Load output data from SOC +bool soc_interface_ReadOutputNodeFloat(const char* const node_name, + uint8_t** buf, uint64_t* buf_size); +// Setup graph +// TODO(satok): Remove and use runtime version +bool soc_interface_setupDummyGraph(int version); + +// Allocate memory for params of node inputs and node outputs +bool soc_interface_AllocateNodeInputAndNodeOutputArray(int total_input_count, + int total_output_count); + +// Release memory for params of node inputs and node outputs +bool soc_interface_ReleaseNodeInputAndNodeOutputArray(); + +// Set one node's inputs and return pointer to that struct +void* soc_interface_SetOneNodeInputs(int input_count, const int* const node_id, + const int* const port); + +// Set one node's outputs and return pointer to that struct +void* soc_interface_SetOneNodeOutputs(int output_count, int* max_size); + +// Append const node to the graph +bool soc_interface_AppendConstNode(const char* const name, int node_id, + int batch, int height, int width, int depth, + const uint8_t* const data, int data_length); + +// Append node to the graph +bool soc_interface_AppendNode(const char* const name, int node_id, int op_id, + int padding_id, const void* const inputs, + int inputs_count, const void* const outputs, + int outputs_count); + +// Instantiate graph +bool soc_interface_InstantiateGraph(); + +// Construct graph +bool soc_interface_ConstructGraph(); + +// Set log level +void soc_interface_SetLogLevel(int log_level); + +// Set debug flag +void soc_interface_SetDebugFlag(uint64_t flag); + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // TENSORFLOW_PLATFORM_HEXAGON_SOC_INTERFACE_H_ diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/soc_interface.c b/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/soc_interface.c new file mode 100755 index 00000000000..ebcbb963e83 --- /dev/null +++ b/tensorflow/contrib/hvx/hexagon_controller/src_soc_interface/soc_interface.c @@ -0,0 +1,124 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "soc_interface.h" + +int soc_interface_GetWrapperVersion() { + // TODO(satok): implement + return -1; +} + +int soc_interface_GetSocControllerVersion() { + // TODO(satok): implement + return -1; +} + +bool soc_interface_Init() { + // TODO(satok): implement + return false; +} + +bool soc_interface_Finalize() { + // TODO(satok): implement + return false; +} + +bool soc_interface_ExecuteGraph() { + // TODO(satok): implement + return false; +} + +bool soc_interface_TeardownGraph() { + // TODO(satok): implement + return false; +} + +bool soc_interface_FillInputNodeFloat( + int x, int y, int z, int d, const uint8_t* const buf, uint64_t buf_size) { + // TODO(satok): implement + return false; +} + +// TODO(satok): Remove and use runtime version +bool soc_interface_ReadOutputNodeFloat( + const char* const node_name, uint8_t** buf, uint64_t *buf_size) { + // TODO(satok): implement + return false; +} + +bool soc_interface_setupDummyGraph(int version) { + // TODO(satok): implement + return false; +} + +bool soc_interface_AllocateNodeInputAndNodeOutputArray( + int total_input_count, int total_output_count) { + // TODO(satok): implement + return false; +} + +bool soc_interface_ReleaseNodeInputAndNodeOutputArray() { + // TODO(satok): implement + return false; +} + +void* soc_interface_SetOneNodeInputs( + int input_count, const int* const node_id, const int* const port) { + // TODO(satok): implement + return 0; +} + +void* soc_interface_SetOneNodeOutputs(int output_count, int* max_size) { + // TODO(satok): implement + return 0; +} + +// Append const node to the graph +bool soc_interface_AppendConstNode( + const char* const name, int node_id, int batch, int height, int width, + int depth, const uint8_t* const data, int data_length) { + // TODO(satok): implement + return false; +} + +// Append node to the graph +bool soc_interface_AppendNode( + const char* const name, int node_id, int op_id, int padding_id, + const void* const inputs, int inputs_count, const void* const outputs, + int outputs_count) { + // TODO(satok): implement + return false; +} + + +// Instantiate graph +bool soc_interface_InstantiateGraph() { + // TODO(satok): implement + return false; +} + +// Construct graph +bool soc_interface_ConstructGraph() { + // TODO(satok): implement + return false; +} + +void soc_interface_SetLogLevel(int log_level) { + // TODO(satok): implement +} + +void soc_interface_SetDebugFlag(uint64_t flag) { + // TODO(satok): implement +} diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py index b303a9d32b7..2673495b904 100644 --- a/tensorflow/contrib/layers/python/layers/layers.py +++ b/tensorflow/contrib/layers/python/layers/layers.py @@ -173,11 +173,12 @@ def _fused_batch_norm( `data_format` is `NHWC` and the second dimension if `data_format` is `NCHW`. decay: decay for the moving average. Reasonable values for `decay` are close - to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9, etc.
+ Lower `decay` value (recommend trying `decay`=0.9) if model experiences + reasonably good training performance but poor validation and/or test + performance. + center: If True, add offset of `beta` to normalized tensor. If False, + `beta` is ignored. scale: If True, multiply by `gamma`. If False, `gamma` is not used. When the next layer is linear (also e.g. `nn.relu`), this can be disabled since the scaling can be done by the next layer. @@ -632,16 +633,12 @@ def batch_norm( if need_moments: # Calculate the moments based on the individual batch. if batch_weights is None: - # Use a copy of moving_mean as a shift to compute more reliable moments. - shift = math_ops.add(moving_mean, 0) if data_format == DATA_FORMAT_NCHW: - shift = array_ops.reshape(shift, params_shape_broadcast) - mean, variance = nn.moments(inputs, moments_axes, shift=shift, - keep_dims=True) + mean, variance = nn.moments(inputs, moments_axes, keep_dims=True) mean = array_ops.reshape(mean, [-1]) variance = array_ops.reshape(variance, [-1]) else: - mean, variance = nn.moments(inputs, moments_axes, shift=shift) + mean, variance = nn.moments(inputs, moments_axes) else: if data_format == DATA_FORMAT_NCHW: mean, variance = nn.weighted_moments(inputs, moments_axes, @@ -1385,7 +1382,7 @@ def fully_connected(inputs, Raises: ValueError: if x has rank less than 2 or if its last dimension is not set. """ - if not (isinstance(num_outputs, six.integer_types)): + if not isinstance(num_outputs, six.integer_types): raise ValueError('num_outputs should be int or long, got %s.', num_outputs) layer_variable_getter = _build_variable_getter({'bias': 'biases'}) diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py index 1b0a8b12728..d1b35e33c26 100644 --- a/tensorflow/contrib/layers/python/layers/layers_test.py +++ b/tensorflow/contrib/layers/python/layers/layers_test.py @@ -2356,7 +2356,7 @@ class BatchNormTest(test.TestCase): else: image_shape = (batch_size, channels, height, width) axis = (0, 2, 3) - image_values = np.random.rand(*image_shape) + 2 + image_values = np.random.rand(*image_shape) + 256 expected_mean = np.mean(image_values, axis=axis) expected_var = np.var(image_values, axis=axis) if fused: @@ -2393,9 +2393,9 @@ class BatchNormTest(test.TestCase): # The outputs should be close to 0.0 mean and 1.0 variance self.assertAllClose( np.mean( - np_output, axis=axis), [0] * channels, rtol=0.1, atol=0.1) + np_output, axis=axis), [0] * channels, rtol=0.001, atol=0.001) self.assertAllClose( - np.var(np_output, axis=axis), [1] * channels, rtol=0.1, atol=0.1) + np.var(np_output, axis=axis), [1] * channels, rtol=0.01, atol=0.01) # The gradients should change slowly while updating moving_mean. max_diff = np.max(np.abs(images_gradients_value - new_images_gradients)) self.assertGreaterEqual(max_diff, 0.0) @@ -2558,25 +2558,29 @@ class LayerNormTest(test.TestCase): # output_train and output_eval should be the same. self.assertAllClose(sess.run([output_train]), sess.run([output_eval])) - def doOutputTest(self, input_shape): - with self.test_session() as sess: - input_values = np.random.rand(*input_shape) - inputs = constant_op.constant( - input_values, shape=input_shape, dtype=dtypes.float32) - output_op = _layers.layer_norm(inputs, scope='LN') - # Initialize all variables - sess.run(variables_lib.global_variables_initializer()) - # The mean and variance of the output should be close to 0 and 1 - # respectively. 
- moments_axis = tuple([i for i in range(1, len(input_shape))]) - outputs = sess.run(output_op) - expected_mean = np.zeros(input_shape[0]) - expected_var = np.ones(input_shape[0]) - mean = np.mean(outputs, axis=moments_axis) - var = np.var(outputs, axis=moments_axis) - tol = 1e-5 - self.assertAllClose(mean, expected_mean, rtol=tol, atol=tol) - self.assertAllClose(var, expected_var, rtol=tol, atol=tol) + def doOutputTest(self, input_shape, tol=1e-3): + for mu in [0.0, 1e2]: + for sigma in [1.0, 0.1]: + input_values = np.random.rand(*input_shape) * sigma + mu + expected_mean = np.zeros(input_shape[0]) + expected_var = np.ones(input_shape[0]) + with ops.Graph().as_default() as g: + with self.test_session(graph=g) as sess: + inputs = constant_op.constant(input_values, shape=input_shape, + dtype=dtypes.float32) + output_op = _layers.layer_norm(inputs, scope='LN') + # Initialize all variables + sess.run(variables_lib.global_variables_initializer()) + # The mean and variance of the output should be close to 0 and 1 + # respectively. + moments_axis = tuple([i for i in range(1, len(input_shape))]) + outputs = sess.run(output_op) + # Make sure that there are no NaNs + self.assertFalse(np.isnan(outputs).any()) + mean = np.mean(outputs, axis=moments_axis) + var = np.var(outputs, axis=moments_axis) + self.assertAllClose(mean, expected_mean, rtol=tol, atol=tol) + self.assertAllClose(var, expected_var, rtol=tol, atol=tol) def testOutput2DInput(self): self.doOutputTest((10, 300)) @@ -2584,6 +2588,12 @@ class LayerNormTest(test.TestCase): def testOutput4DInput(self): self.doOutputTest((100, 10, 10, 3)) + def testOutputSmallInput(self): + self.doOutputTest((10, 10, 10, 30)) + + def testOutputBigInput(self): + self.doOutputTest((1, 100, 100, 1)) + class MaxPool2DTest(test.TestCase): diff --git a/tensorflow/contrib/layers/python/layers/regularizers.py b/tensorflow/contrib/layers/python/layers/regularizers.py index 86d05167748..02eb2b390c6 100644 --- a/tensorflow/contrib/layers/python/layers/regularizers.py +++ b/tensorflow/contrib/layers/python/layers/regularizers.py @@ -65,7 +65,7 @@ def l1_regularizer(scale, scope=None): my_scale = ops.convert_to_tensor(scale, dtype=weights.dtype.base_dtype, name='scale') - return standard_ops.mul( + return standard_ops.multiply( my_scale, standard_ops.reduce_sum(standard_ops.abs(weights)), name=name) @@ -104,7 +104,7 @@ def l2_regularizer(scale, scope=None): my_scale = ops.convert_to_tensor(scale, dtype=weights.dtype.base_dtype, name='scale') - return standard_ops.mul(my_scale, nn.l2_loss(weights), name=name) + return standard_ops.multiply(my_scale, nn.l2_loss(weights), name=name) return l2 diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py index 467d31c3317..e3dc27e6460 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py @@ -407,14 +407,15 @@ class BaseEstimator( raise ValueError('Can not provide both steps and max_steps.') _verify_input_args(x, y, input_fn, None, batch_size) if x is not None: - return SKCompat(self).fit(x, y, batch_size, steps, max_steps, monitors) + SKCompat(self).fit(x, y, batch_size, steps, max_steps, monitors) + return self if max_steps is not None: try: start_step = load_variable(self._model_dir, ops.GraphKeys.GLOBAL_STEP) if max_steps <= start_step: logging.info('Skipping training since max_steps has already saved.') - return None + return self except: # pylint: 
disable=bare-except pass diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py index d8336e3cbd0..42da9969733 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py +++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py @@ -21,7 +21,6 @@ from __future__ import print_function import json import os -from tensorflow.contrib.framework import deprecated from tensorflow.core.protobuf import config_pb2 from tensorflow.python.training import server_lib @@ -256,79 +255,30 @@ class RunConfig(ClusterConfig): def tf_config(self): return self._tf_config - @tf_config.setter - @deprecated( - '2017-01-08', - 'RunConfig will be made immutable, please pass all args to constructor.') - def tf_config(self, value): - self._tf_config = value - @property def tf_random_seed(self): return self._tf_random_seed - @tf_random_seed.setter - @deprecated( - '2017-01-08', - 'RunConfig will be made immutable, please pass all args to constructor.') - def tf_random_seed(self, value): - self._tf_random_seed = value - @property def save_summary_steps(self): return self._save_summary_steps - @save_summary_steps.setter - @deprecated( - '2017-01-08', - 'RunConfig will be made immutable, please pass all args to constructor.') - def save_summary_steps(self, value): - self._save_summary_steps = value - @property def save_checkpoints_secs(self): return self._save_checkpoints_secs - @save_checkpoints_secs.setter - @deprecated( - '2017-01-08', - 'RunConfig will be made immutable, please pass all args to constructor.') - def save_checkpoints_secs(self, value): - self._save_checkpoints_secs = value - @property def save_checkpoints_steps(self): return self._save_checkpoints_steps - @save_checkpoints_steps.setter - @deprecated( - '2017-01-08', - 'RunConfig will be made immutable, please pass all args to constructor.') - def save_checkpoints_steps(self, value): - self._save_checkpoints_steps = value - @property def keep_checkpoint_max(self): return self._keep_checkpoint_max - @keep_checkpoint_max.setter - @deprecated( - '2017-01-08', - 'RunConfig will be made immutable, please pass all args to constructor.') - def keep_checkpoint_max(self, value): - self._keep_checkpoint_max = value - @property def keep_checkpoint_every_n_hours(self): return self._keep_checkpoint_every_n_hours - @keep_checkpoint_every_n_hours.setter - @deprecated( - '2017-01-08', - 'RunConfig will be made immutable, please pass all args to constructor.') - def keep_checkpoint_every_n_hours(self, value): - self._keep_checkpoint_every_n_hours = value - def _count_ps(cluster_spec): """Counts the number of parameter servers in cluster_spec.""" diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py index 4356e930fac..48d79ecbbff 100644 --- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py +++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Implementations of different data feeders to provide data for TF trainer.""" # TODO(ipolosukhin): Replace this module with feed-dict queue runners & queues. 
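The estimator change above makes fit return `self` on its early-return paths instead of `None` (or the `SKCompat` result), so scikit-learn-style call chaining keeps working. A minimal sketch of the pattern with a hypothetical class (not the TF API):

    class Trainer(object):
        # Hypothetical estimator illustrating the return-self convention.
        def __init__(self):
            self.steps_done = 0

        def fit(self, steps):
            if steps <= self.steps_done:
                return self  # Nothing left to do, but still chainable.
            self.steps_done = steps
            return self

    # Because every path returns self, calls can be chained:
    assert Trainer().fit(100).fit(50).steps_done == 100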
@@ -37,13 +36,13 @@ from tensorflow.python.platform import tf_logging as logging from .pandas_io import HAS_PANDAS, extract_pandas_data, extract_pandas_matrix, extract_pandas_labels from .dask_io import HAS_DASK, extract_dask_data, extract_dask_labels - # pylint: enable=g-multiple-import,g-bad-import-order def _get_in_out_shape(x_shape, y_shape, n_classes, batch_size=None): """Returns shape for input and output of the data feeder.""" - x_is_dict, y_is_dict = isinstance(x_shape, dict), y_shape is not None and isinstance(y_shape, dict) + x_is_dict, y_is_dict = isinstance( + x_shape, dict), y_shape is not None and isinstance(y_shape, dict) if y_is_dict and n_classes is not None: assert (isinstance(n_classes, dict)) @@ -76,8 +75,11 @@ def _get_in_out_shape(x_shape, y_shape, n_classes, batch_size=None): if not y_is_dict: output_shape = out_el_shape(y_shape, n_classes) else: - output_shape = dict([(k, out_el_shape(v, n_classes[k] if n_classes is not None and k in n_classes else None)) - for k, v in list(y_shape.items())]) + output_shape = dict([ + (k, out_el_shape(v, n_classes[k] + if n_classes is not None and k in n_classes else None)) + for k, v in list(y_shape.items()) + ]) return input_shape, output_shape, batch_size @@ -99,8 +101,12 @@ def _is_iterable(x): return hasattr(x, 'next') or hasattr(x, '__next__') -def setup_train_data_feeder( - x, y, n_classes, batch_size=None, shuffle=True, epochs=None): +def setup_train_data_feeder(x, + y, + n_classes, + batch_size=None, + shuffle=True, + epochs=None): """Create data feeder, to sample inputs from dataset. If `x` and `y` are iterators, use `StreamingDataFeeder`. @@ -108,10 +114,13 @@ def setup_train_data_feeder( Args: x: numpy, pandas or Dask matrix or dictionary of aforementioned. Also supports iterables. - y: numpy, pandas or Dask array or dictionary of aforementioned. Also supports + y: numpy, pandas or Dask array or dictionary of aforementioned. Also + supports iterables. - n_classes: number of classes. Must be None or same type as y. In case, `y` is `dict` - (or iterable which returns dict) such that `n_classes[key] = n_classes for y[key]` + n_classes: number of classes. Must be None or the same type as `y`: if `y` + is a `dict` (or an iterable which returns dicts), `n_classes` must be a + `dict` such that `n_classes[key]` is the number of classes for `y[key]`. batch_size: size to split data into parts. Must be >= 1. shuffle: Whether to shuffle the inputs. epochs: Number of epochs to run.
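To make the `n_classes` contract above concrete when `y` is a dict, here is a small illustrative setup (hypothetical feature and label names; the final call is commented out because it needs the contrib import):

    import numpy as np

    x = {'pixels': np.random.rand(100, 8), 'meta': np.random.rand(100, 2)}
    y = {'coarse': np.random.randint(0, 2, 100),  # 2 classes
         'fine': np.random.randint(0, 10, 100)}   # 10 classes
    # When y is a dict, n_classes must be a dict keyed the same way:
    n_classes = {'coarse': 2, 'fine': 10}
    # feeder = setup_train_data_feeder(x, y, n_classes, batch_size=32)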
@@ -127,7 +136,7 @@ def setup_train_data_feeder( # pylint: disable=g-import-not-at-top import dask.dataframe as dd if (isinstance(x, (dd.Series, dd.DataFrame)) and - (y is None or isinstance(y, (dd.Series, dd.DataFrame)))): + (y is None or isinstance(y, (dd.Series, dd.DataFrame)))): data_feeder_cls = DaskDataFeeder else: data_feeder_cls = DataFeeder @@ -140,7 +149,7 @@ def setup_train_data_feeder( 'streaming learning to work.') return StreamingDataFeeder(x, y, n_classes, batch_size) return data_feeder_cls( - x, y, n_classes, batch_size, shuffle=shuffle, epochs=epochs) + x, y, n_classes, batch_size, shuffle=shuffle, epochs=epochs) def _batch_data(x, batch_size=None): @@ -150,7 +159,8 @@ def _batch_data(x, batch_size=None): x_first_el = six.next(x) x = itertools.chain([x_first_el], x) - chunk = dict([(k, []) for k in list(x_first_el.keys())]) if isinstance(x_first_el, dict) else [] + chunk = dict([(k, []) for k in list(x_first_el.keys())]) if isinstance( + x_first_el, dict) else [] chunk_filled = False for data in x: if isinstance(data, dict): @@ -161,7 +171,8 @@ def _batch_data(x, batch_size=None): chunk_filled = True if chunk_filled: yield chunk - chunk = dict([(k, []) for k in list(x_first_el.keys())]) if isinstance(x_first_el, dict) else [] + chunk = dict([(k, []) for k in list(x_first_el.keys())]) if isinstance( + x_first_el, dict) else [] chunk_filled = False else: chunk.append(data) @@ -259,16 +270,21 @@ def _access(data, iloc): def _check_dtype(dtype): if dtypes.as_dtype(dtype) == dtypes.float64: logging.warn( - 'float64 is not supported by many models, consider casting to float32.') + 'float64 is not supported by many models, consider casting to float32.') return dtype class DataFeeder(object): """Data feeder is an example class to sample data for TF trainer.""" - def __init__( - self, x, y, n_classes, batch_size=None, shuffle=True, random_state=None, - epochs=None): + def __init__(self, + x, + y, + n_classes, + batch_size=None, + shuffle=True, + random_state=None, + epochs=None): """Initializes a DataFeeder instance. Args: @@ -299,29 +315,33 @@ class DataFeeder(object): input_dtype: DType of input (or dictionary of shapes). output_dtype: DType of output (or dictionary of shapes. """ - x_is_dict, y_is_dict = isinstance(x, dict), y is not None and isinstance(y, dict) + x_is_dict, y_is_dict = isinstance(x, dict), y is not None and isinstance( + y, dict) if isinstance(y, list): y = np.array(y) - self._x = dict([(k, check_array(v, v.dtype)) for k, v in list(x.items())]) if x_is_dict else check_array(x, x.dtype) + self._x = dict([(k, check_array(v, v.dtype)) for k, v in list(x.items()) + ]) if x_is_dict else check_array(x, x.dtype) self._y = None if y is None else \ dict([(k, check_array(v, v.dtype)) for k, v in list(y.items())]) if x_is_dict else check_array(y, y.dtype) # self.n_classes is not None means we're converting raw target indices to one-hot. 
if n_classes is not None: if not y_is_dict: - y_dtype = (np.int64 if n_classes is not None and n_classes > 1 else np.float32) + y_dtype = (np.int64 + if n_classes is not None and n_classes > 1 else np.float32) self._y = (None if y is None else check_array(y, dtype=y_dtype)) self.n_classes = n_classes self.max_epochs = epochs - x_shape = dict([(k, v.shape) for k, v in list(self._x.items())]) if x_is_dict else self._x.shape - y_shape = dict( - [(k, v.shape) for k, v in list(self._y.items())]) if y_is_dict else None if y is None else self._y.shape + x_shape = dict([(k, v.shape) for k, v in list(self._x.items()) + ]) if x_is_dict else self._x.shape + y_shape = dict([(k, v.shape) for k, v in list(self._y.items()) + ]) if y_is_dict else None if y is None else self._y.shape self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape( - x_shape, y_shape, n_classes, batch_size) + x_shape, y_shape, n_classes, batch_size) # Input dtype matches dtype of x. self._input_dtype = dict([(k, _check_dtype(v.dtype)) for k, v in list(self._x.items())]) if x_is_dict \ @@ -339,9 +359,10 @@ class DataFeeder(object): self._shuffle = shuffle self.random_state = np.random.RandomState( - 42) if random_state is None else random_state + 42) if random_state is None else random_state - num_samples = list(self._x.values())[0].shape[0] if x_is_dict else self._x.shape[0] + num_samples = list(self._x.values())[0].shape[ + 0] if x_is_dict else self._x.shape[0] if self._shuffle: self.indices = self.random_state.permutation(num_samples) else: @@ -380,8 +401,8 @@ class DataFeeder(object): Returns: The epoch placeholder. """ - self._epoch_placeholder = array_ops.placeholder(dtypes.int32, [1], - name='epoch') + self._epoch_placeholder = array_ops.placeholder( + dtypes.int32, [1], name='epoch') return self._epoch_placeholder def input_builder(self): @@ -398,19 +419,17 @@ class DataFeeder(object): placeholder = {} for key in list(shape.keys()): placeholder[key] = array_ops.placeholder( - dtypes.as_dtype(dtype[key]), - [None] + shape[key][1:], - name=name_prepend + '_' + key - ) + dtypes.as_dtype(dtype[key]), [None] + shape[key][1:], + name=name_prepend + '_' + key) else: placeholder = array_ops.placeholder( - dtypes.as_dtype(dtype), - [None] + shape[1:], - name=name_prepend) + dtypes.as_dtype(dtype), [None] + shape[1:], name=name_prepend) return placeholder - self._input_placeholder = get_placeholder(self.input_shape, self._input_dtype, 'input') - self._output_placeholder = get_placeholder(self.output_shape, self._output_dtype, 'output') + self._input_placeholder = get_placeholder(self.input_shape, + self._input_dtype, 'input') + self._output_placeholder = get_placeholder(self.output_shape, + self._output_dtype, 'output') return self._input_placeholder, self._output_placeholder def set_placeholders(self, input_placeholder, output_placeholder): @@ -432,9 +451,9 @@ class DataFeeder(object): A `dict` with data feed params while training. """ return { - 'epoch': self.epoch, - 'offset': self.offset, - 'batch_size': self._batch_size + 'epoch': self.epoch, + 'offset': self.offset, + 'batch_size': self._batch_size } def get_feed_dict_fn(self): @@ -444,12 +463,13 @@ class DataFeeder(object): A function that when called samples a random subset of batch size from `x` and `y`. """ - x_is_dict, y_is_dict = isinstance(self._x, dict), self._y is not None and isinstance(self._y, dict) + x_is_dict, y_is_dict = isinstance( + self._x, dict), self._y is not None and isinstance(self._y, dict) # Assign input features from random indices. 
def extract(data, indices): - return (np.array(_access(data, indices)).reshape((indices.shape[0], 1)) - if len(data.shape) == 1 else _access(data, indices)) + return (np.array(_access(data, indices)).reshape((indices.shape[0], 1)) if + len(data.shape) == 1 else _access(data, indices)) # assign labels from random indices def assign_label(data, shape, dtype, n_classes, indices): @@ -481,19 +501,22 @@ class DataFeeder(object): feed_dict[self._epoch_placeholder.name] = [self.epoch] # Take next batch of indices. - x_len = list(self._x.values())[0].shape[0] if x_is_dict else self._x.shape[0] + x_len = list(self._x.values())[0].shape[ + 0] if x_is_dict else self._x.shape[0] end = min(x_len, self.offset + self._batch_size) batch_indices = self.indices[self.offset:end] # adding input placeholder feed_dict.update( - dict([(self._input_placeholder[k].name, extract(v, batch_indices)) for k, v in list(self._x.items())]) - if x_is_dict else {self._input_placeholder.name: extract(self._x, batch_indices)}) + dict([(self._input_placeholder[k].name, extract(v, batch_indices)) + for k, v in list(self._x.items())]) if x_is_dict else + {self._input_placeholder.name: extract(self._x, batch_indices)}) # move offset and reset it if necessary self.offset += self._batch_size if self.offset >= x_len: - self.indices = self.random_state.permutation(x_len) if self._shuffle else np.array(range(x_len)) + self.indices = self.random_state.permutation( + x_len) if self._shuffle else np.array(range(x_len)) self.offset = 0 self.epoch += 1 @@ -504,15 +527,19 @@ class DataFeeder(object): # adding output placeholders if y_is_dict: for k, v in list(self._y.items()): - n_classes = ( - self.n_classes[k] if k in self.n_classes else None) if self.n_classes is not None else None + n_classes = (self.n_classes[k] if k in self.n_classes else + None) if self.n_classes is not None else None shape, dtype = self.output_shape[k], self._output_dtype[k] - feed_dict.update( - {self._output_placeholder[k].name: assign_label(v, shape, dtype, n_classes, batch_indices)}) + feed_dict.update({ + self._output_placeholder[k].name: + assign_label(v, shape, dtype, n_classes, batch_indices) + }) else: shape, dtype, n_classes = self.output_shape, self._output_dtype, self.n_classes - feed_dict.update( - {self._output_placeholder.name: assign_label(self._y, shape, dtype, n_classes, batch_indices)}) + feed_dict.update({ + self._output_placeholder.name: + assign_label(self._y, shape, dtype, n_classes, batch_indices) + }) return feed_dict @@ -566,41 +593,56 @@ class StreamingDataFeeder(DataFeeder): self._y = None self.n_classes = n_classes - x_is_dict, y_is_dict = isinstance(x_first_el, dict), y is not None and isinstance(y_first_el, dict) + x_is_dict = isinstance(x_first_el, dict) + y_is_dict = y is not None and isinstance(y_first_el, dict) if y_is_dict and n_classes is not None: - assert (isinstance(n_classes, dict)) + assert isinstance(n_classes, dict) # extract shapes for first_elements - x_first_el_shape = dict([(k, [1] + list(v.shape)) for k, v in list(x_first_el.items())]) if x_is_dict \ - else [1] + list(x_first_el.shape) + if x_is_dict: + x_first_el_shape = dict( + [(k, [1] + list(v.shape)) for k, v in list(x_first_el.items())]) + else: + x_first_el_shape = [1] + list(x_first_el.shape) - y_first_el_shape = dict([(k, [1] + list(v.shape)) for k, v in list(y_first_el.items())]) if y_is_dict \ - else ([1] + list(y_first_el[0].shape if isinstance(y_first_el, list) else y_first_el.shape) - if y is not None else None) + if y_is_dict: + y_first_el_shape = dict( 
+ [(k, [1] + list(v.shape)) for k, v in list(y_first_el.items())]) + elif y is None: + y_first_el_shape = None + else: + y_first_el_shape = ([1] + list(y_first_el[0].shape if isinstance( + y_first_el, list) else y_first_el.shape)) - self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape(x_first_el_shape, y_first_el_shape, - n_classes, batch_size) + self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape( + x_first_el_shape, y_first_el_shape, n_classes, batch_size) # Input dtype of x_first_el. - self._input_dtype = dict([(k, _check_dtype(v.dtype)) for k, v in list(x_first_el.items())]) if x_is_dict \ - else _check_dtype(x_first_el.dtype) + if x_is_dict: + self._input_dtype = dict( + [(k, _check_dtype(v.dtype)) for k, v in list(x_first_el.items())]) + else: + self._input_dtype = _check_dtype(x_first_el.dtype) # Output dtype of y_first_el. def check_y_dtype(el): - if isinstance(el, list) or isinstance(el, np.ndarray): - if isinstance(el, np.ndarray) and el.ndim == 0: - return el.dtype - else: - return _check_dtype(np.dtype(type(el[0]))) + if isinstance(el, np.ndarray): + return el.dtype + elif isinstance(el, list): + return check_y_dtype(el[0]) else: return _check_dtype(np.dtype(type(el))) # Output types are floats, due to both softmaxes and regression req. if n_classes is not None and (y is None or not y_is_dict) and n_classes > 0: self._output_dtype = np.float32 + elif y_is_dict: + self._output_dtype = dict( + [(k, check_y_dtype(v)) for k, v in list(y_first_el.items())]) + elif y is None: + self._output_dtype = None else: - self._output_dtype = dict([(k, check_y_dtype(v)) for k, v in list(y_first_el.items())]) if y_is_dict \ - else (check_y_dtype(y_first_el) if y is not None else None) + self._output_dtype = check_y_dtype(y_first_el) def get_feed_params(self): """Function returns a `dict` with data feed params while training. 
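The restructured constructor above derives placeholder shapes from the first element of each iterator by prepending a batch dimension of 1. The same rule as a standalone sketch (plain numpy, illustrative only):

    import numpy as np

    def first_el_shape(el):
        # Mirrors the dict/array branches above: prepend a batch dim of 1.
        if isinstance(el, dict):
            return dict([(k, [1] + list(v.shape)) for k, v in el.items()])
        return [1] + list(el.shape)

    assert first_el_shape(np.zeros((4,))) == [1, 4]
    assert first_el_shape({'a': np.zeros((5, 2)), 'b': np.zeros(3)}) == {
        'a': [1, 5, 2], 'b': [1, 3]}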
@@ -627,13 +669,17 @@ class StreamingDataFeeder(DataFeeder): """ def init_array(shape, dtype): + """Initialize array of given shape or dict of shapes and dtype.""" if shape is None: return None + elif isinstance(shape, dict): + return dict([(k, np.zeros(shape[k], dtype[k])) + for k in list(shape.keys())]) else: - return dict([(k, np.zeros(shape[k], dtype[k])) for k in list(shape.keys())]) if isinstance(shape, dict) else \ - np.zeros(shape, dtype=dtype) + return np.zeros(shape, dtype=dtype) def put_data_array(dest, index, source=None, n_classes=None): + """Puts data array into container.""" if source is None: dest = dest[:index] elif n_classes is not None and n_classes > 1: @@ -650,12 +696,13 @@ class StreamingDataFeeder(DataFeeder): return dest def put_data_array_or_dict(holder, index, data=None, n_classes=None): + """Puts data array or data dictionary into container.""" if holder is None: return None if isinstance(holder, dict): if data is None: data = {k: None for k in holder.keys()} - assert (isinstance(data, dict)) + assert isinstance(data, dict) for k in holder.keys(): num_classes = n_classes[k] if (n_classes is not None and k in n_classes) else None @@ -688,12 +735,18 @@ class StreamingDataFeeder(DataFeeder): out = put_data_array_or_dict(out, i, next_out, self.n_classes) # creating feed_dict - feed_dict = dict([(self._input_placeholder[k].name, inp[k]) for k in list(self._input_placeholder.keys())]) if \ - isinstance(inp, dict) else {self._input_placeholder.name: inp} + if isinstance(inp, dict): + feed_dict = dict([(self._input_placeholder[k].name, inp[k]) + for k in list(self._input_placeholder.keys())]) + else: + feed_dict = {self._input_placeholder.name: inp} if self._y is not None: - feed_dict.update( - dict([(self._output_placeholder[k].name, out[k]) for k in list(self._output_placeholder.keys())]) \ - if isinstance(out, dict) else {self._output_placeholder.name: out}) + if isinstance(out, dict): + feed_dict.update( + dict([(self._output_placeholder[k].name, out[k]) + for k in list(self._output_placeholder.keys())])) + else: + feed_dict.update({self._output_placeholder.name: out}) return feed_dict @@ -708,8 +761,14 @@ class DaskDataFeeder(object): memory and still do random seeks for sampling of batches. """ - def __init__(self, x, y, n_classes, batch_size, shuffle=True, - random_state=None, epochs=None): + def __init__(self, + x, + y, + n_classes, + batch_size, + shuffle=True, + random_state=None, + epochs=None): """Initializes a DaskDataFeeder instance. Args: @@ -732,10 +791,14 @@ class DaskDataFeeder(object): output_shape: shape of the output. input_dtype: dtype of input. output_dtype: dtype of output. + + Raises: + ValueError: if `x` or `y` are `dict`, as they are not supported currently. 
""" if isinstance(x, dict) or isinstance(y, dict): - raise ValueError("DaskDataFeeder does not support dictionaries at the moment.") + raise ValueError( + 'DaskDataFeeder does not support dictionaries at the moment.') # pylint: disable=invalid-name,super-init-not-called import dask.dataframe as dd # pylint: disable=g-import-not-at-top @@ -763,7 +826,7 @@ class DaskDataFeeder(object): self._shuffle = shuffle self.epochs = epochs self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape( - x_shape, y_shape, n_classes, batch_size) + x_shape, y_shape, n_classes, batch_size) self.sample_fraction = self._batch_size / float(x_count) self._input_dtype = _check_dtype(self._x.dtypes[0]) self._output_dtype = _check_dtype(self._y.dtypes[self._y_columns]) @@ -797,8 +860,8 @@ class DaskDataFeeder(object): # TODO(ipolosukhin): option for with/without replacement (dev version of # dask) sample = self.df.random_split( - [self.sample_fraction, 1 - self.sample_fraction], - random_state=self.random_state) + [self.sample_fraction, 1 - self.sample_fraction], + random_state=self.random_state) inp = extract_pandas_matrix(sample[0][self._x_columns].compute()).tolist() out = extract_pandas_matrix(sample[0][self._y_columns].compute()) # convert to correct dtype @@ -811,7 +874,6 @@ class DaskDataFeeder(object): out_max = self._y.max().compute().values[0] encoded_out = np.zeros((out.size, out_max + 1), dtype=self._output_dtype) encoded_out[np.arange(out.size), out] = 1 - return {input_placeholder.name: inp, - output_placeholder.name: encoded_out} + return {input_placeholder.name: inp, output_placeholder.name: encoded_out} return _feed_dict_fn diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py index f6e5a3973b0..7f5711ac1b5 100644 --- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py +++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py @@ -253,20 +253,20 @@ class DataFeederTest(test.TestCase): inp, out = df.input_builder() feed_dict_fn = df.get_feed_dict_fn() feed_dict = feed_dict_fn() - self._assertAllClose(inp, [[1, 2], [3, 4]], feed_dict, 'name') - self._assertAllClose(out, [1, 2], feed_dict, 'name') + self._assertAllClose(inp, [[[1, 2]], [[3, 4]]], feed_dict, 'name') + self._assertAllClose(out, [[[1], [2]], [[2], [2]]], feed_dict, 'name') def x_iter(wrap_dict=False): - yield np.array([1, 2]) if not wrap_dict else self._wrap_dict( - np.array([1, 2]), 'in') - yield np.array([3, 4]) if not wrap_dict else self._wrap_dict( - np.array([3, 4]), 'in') + yield np.array([[1, 2]]) if not wrap_dict else self._wrap_dict( + np.array([[1, 2]]), 'in') + yield np.array([[3, 4]]) if not wrap_dict else self._wrap_dict( + np.array([[3, 4]]), 'in') def y_iter(wrap_dict=False): - yield np.array([1]) if not wrap_dict else self._wrap_dict( - np.array([1]), 'out') - yield np.array([2]) if not wrap_dict else self._wrap_dict( - np.array([2]), 'out') + yield np.array([[1], [2]]) if not wrap_dict else self._wrap_dict( + np.array([[1], [2]]), 'out') + yield np.array([[2], [2]]) if not wrap_dict else self._wrap_dict( + np.array([[2], [2]]), 'out') func( data_feeder.StreamingDataFeeder( diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py b/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py index 9579b8fabd1..5de9bb5d775 100644 --- a/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py +++ 
b/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py @@ -25,6 +25,7 @@ import six from tensorflow.contrib.framework import tensor_util as contrib_tensor_util from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import random_seed from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops import linalg_ops @@ -139,10 +140,11 @@ class LinearOperatorDerivedClassTest(test.TestCase): def test_to_dense(self): self._maybe_skip("to_dense") - with self.test_session() as sess: - for use_placeholder in False, True: - for shape in self._shapes_to_test: - for dtype in self._dtypes_to_test: + for use_placeholder in False, True: + for shape in self._shapes_to_test: + for dtype in self._dtypes_to_test: + with self.test_session(graph=ops.Graph()) as sess: + sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED operator, mat, feed_dict = self._operator_and_mat_and_feed_dict( shape, dtype, use_placeholder=use_placeholder) op_dense = operator.to_dense() @@ -153,14 +155,15 @@ class LinearOperatorDerivedClassTest(test.TestCase): def test_det(self): self._maybe_skip("det") - with self.test_session() as sess: - for use_placeholder in False, True: - for shape in self._shapes_to_test: - for dtype in self._dtypes_to_test: - if dtype.is_complex: - self.skipTest( - "tf.matrix_determinant does not work with complex, so this " - "test is being skipped.") + for use_placeholder in False, True: + for shape in self._shapes_to_test: + for dtype in self._dtypes_to_test: + if dtype.is_complex: + self.skipTest( + "tf.matrix_determinant does not work with complex, so this " + "test is being skipped.") + with self.test_session(graph=ops.Graph()) as sess: + sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED operator, mat, feed_dict = self._operator_and_mat_and_feed_dict( shape, dtype, use_placeholder=use_placeholder) op_det = operator.determinant() @@ -173,11 +176,12 @@ class LinearOperatorDerivedClassTest(test.TestCase): def test_apply(self): self._maybe_skip("apply") - with self.test_session() as sess: - for use_placeholder in False, True: - for shape in self._shapes_to_test: - for dtype in self._dtypes_to_test: - for adjoint in False, True: + for use_placeholder in False, True: + for shape in self._shapes_to_test: + for dtype in self._dtypes_to_test: + for adjoint in False, True: + with self.test_session(graph=ops.Graph()) as sess: + sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED operator, mat, feed_dict = self._operator_and_mat_and_feed_dict( shape, dtype, use_placeholder=use_placeholder) x = self._make_x(operator, adjoint=adjoint) @@ -191,11 +195,12 @@ class LinearOperatorDerivedClassTest(test.TestCase): def test_solve(self): self._maybe_skip("solve") - with self.test_session() as sess: - for use_placeholder in False, True: - for shape in self._shapes_to_test: - for dtype in self._dtypes_to_test: - for adjoint in False, True: + for use_placeholder in False, True: + for shape in self._shapes_to_test: + for dtype in self._dtypes_to_test: + for adjoint in False, True: + with self.test_session(graph=ops.Graph()) as sess: + sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED operator, mat, feed_dict = self._operator_and_mat_and_feed_dict( shape, dtype, use_placeholder=use_placeholder) rhs = self._make_rhs(operator, adjoint=adjoint) @@ -209,10 +214,11 @@ class LinearOperatorDerivedClassTest(test.TestCase): def test_add_to_tensor(self): self._maybe_skip("add_to_tensor") - with 
self.test_session() as sess: - for use_placeholder in False, True: - for shape in self._shapes_to_test: - for dtype in self._dtypes_to_test: + for use_placeholder in False, True: + for shape in self._shapes_to_test: + for dtype in self._dtypes_to_test: + with self.test_session(graph=ops.Graph()) as sess: + sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED operator, mat, feed_dict = self._operator_and_mat_and_feed_dict( shape, dtype, use_placeholder=use_placeholder) op_plus_2mat = operator.add_to_tensor(2 * mat) diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile index 2bf246bdf91..97d963ede2d 100644 --- a/tensorflow/contrib/makefile/Makefile +++ b/tensorflow/contrib/makefile/Makefile @@ -21,7 +21,7 @@ echo "false") # Hexagon integration ifdef HEXAGON_LIBS - LIBGEMM_WRAPPER := $(HEXAGON_LIBS)/libgemm_wrapper.so + LIBGEMM_WRAPPER := $(HEXAGON_LIBS)/libhexagon_controller.so ifeq ($(shell test -f $(LIBGEMM_WRAPPER) 2> /dev/null; echo $$?), 0) $(info "Use hexagon libs at " $(LIBGEMM_WRAPPER)) else @@ -271,7 +271,7 @@ ifeq ($(TARGET),ANDROID) ifdef HEXAGON_LIBS INCLUDES += -I$(HEXAGON_INCLUDE) - LIBS += -lgemm_wrapper + LIBS += -lhexagon_controller LDFLAGS += -L$(HEXAGON_LIBS) CXXFLAGS += -DUSE_HEXAGON_LIBS endif diff --git a/tensorflow/contrib/makefile/build_all_android.sh b/tensorflow/contrib/makefile/build_all_android.sh index c826431dea8..3d80f0fd2dd 100755 --- a/tensorflow/contrib/makefile/build_all_android.sh +++ b/tensorflow/contrib/makefile/build_all_android.sh @@ -22,21 +22,32 @@ usage() { echo "-s [sub_makefiles] sub makefiles separated by white space" echo "-t [build_target] build target for Android makefile [default=all]" echo "-T only build tensorflow" - echo "-x use hexagon library located at ../hexagon/" + echo "-x use hexagon library located at tensorflow/contrib/makefile/downloads/hexagon" + echo "-X download hexagon deps and run hexagon_graph_execution" exit 1 } +download_and_push() { + URL="$1" + LOCAL_DEST="$2" + ANDROID_DEST="$3" + curl -Ls "${URL}" -o "${LOCAL_DEST}" + adb shell mkdir -p "${ANDROID_DEST}" + adb push "${LOCAL_DEST}" "${ANDROID_DEST}" +} + if [[ -z "${NDK_ROOT}" ]]; then echo "NDK_ROOT should be set as an environment variable" 1>&2 exit 1 fi -while getopts "s:t:Tx" opt_name; do +while getopts "s:t:TxX" opt_name; do case "$opt_name" in s) SUB_MAKEFILES="${OPTARG}";; t) BUILD_TARGET="${OPTARG}";; T) ONLY_MAKE_TENSORFLOW="true";; x) USE_HEXAGON="true";; + X) DOWNLOAD_AND_USE_HEXAGON="true";; *) usage;; esac done @@ -49,6 +60,8 @@ cd ${SCRIPT_DIR}/../../../ source "${SCRIPT_DIR}/build_helper.subr" JOB_COUNT="${JOB_COUNT:-$(get_job_count)}" +HEXAGON_DOWNLOAD_PATH="tensorflow/contrib/makefile/downloads/hexagon" + if [[ "${ONLY_MAKE_TENSORFLOW}" != "true" ]]; then # Remove any old files first. 
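The download_and_push helper added to build_all_android.sh above performs three steps: fetch the file, create the destination directory on the device, and push the file over adb. A Python rendering of the same flow, assuming adb is on PATH and a device is attached; the URL and paths in the commented example mirror one of the hexagon downloads driven by the new -X flag:

```python
import subprocess
import urllib.request

def download_and_push(url, local_dest, android_dest):
    """Fetch a file over HTTP(S) and stage it on an Android device via adb."""
    urllib.request.urlretrieve(url, local_dest)                       # curl -Ls
    subprocess.check_call(['adb', 'shell', 'mkdir', '-p', android_dest])
    subprocess.check_call(['adb', 'push', local_dest, android_dest])

# Mirrors one of the downloads performed when -X is given:
# download_and_push(
#     'https://storage.googleapis.com/download.tensorflow.org/deps/hexagon/'
#     'libhexagon_controller.so',
#     'libhexagon_controller.so',
#     '/data/local/tmp')
```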
make -f tensorflow/contrib/makefile/Makefile clean @@ -63,10 +76,30 @@ else make -f tensorflow/contrib/makefile/Makefile clean_except_protobuf_libs fi +if [[ "${DOWNLOAD_AND_USE_HEXAGON}" == "true" ]]; then + URL_BASE="https://storage.googleapis.com/download.tensorflow.org" + + rm -rf "${HEXAGON_DOWNLOAD_PATH}" + mkdir -p "${HEXAGON_DOWNLOAD_PATH}/libs" + + download_and_push "${URL_BASE}/deps/hexagon/libhexagon_controller.so" \ +"${HEXAGON_DOWNLOAD_PATH}/libs/libhexagon_controller.so" "/data/local/tmp" + + download_and_push "${URL_BASE}/deps/hexagon/libhexagon_nn_skel.so" \ +"${HEXAGON_DOWNLOAD_PATH}/libs/libhexagon_nn_skel.so" "/vendor/lib/rfsa/adsp" + + download_and_push "${URL_BASE}/example_images/img_299x299.jpg" \ +"${HEXAGON_DOWNLOAD_PATH}/img_299x299.jpg" "/data/local/tmp" + + USE_HEXAGON="true" + SUB_MAKEFILES="$(pwd)/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in" + BUILD_TARGET="hexagon_graph_execution" +fi + if [[ "${USE_HEXAGON}" == "true" ]]; then - HEXAGON_PARENT_DIR=$(cd ../hexagon && pwd) + HEXAGON_PARENT_DIR=$(cd "${HEXAGON_DOWNLOAD_PATH}" && pwd) HEXAGON_LIBS="${HEXAGON_PARENT_DIR}/libs" - HEXAGON_INCLUDE=$(cd tensorflow/core/platform/hexagon && pwd) + HEXAGON_INCLUDE=$(cd "tensorflow/core/platform/hexagon" && pwd) fi if [[ -z "${BUILD_TARGET}" ]]; then @@ -80,3 +113,14 @@ else HEXAGON_LIBS="${HEXAGON_LIBS}" HEXAGON_INCLUDE="${HEXAGON_INCLUDE}" \ SUB_MAKEFILES="${SUB_MAKEFILES}" "${BUILD_TARGET}" fi + +if [[ "${DOWNLOAD_AND_USE_HEXAGON}" == "true" ]]; then + ANDROID_EXEC_FILE_MODE=755 + echo "Run hexagon_graph_execution" + adb push -p "./tensorflow/contrib/makefile/gen/bin/hexagon_graph_execution" "/data/local/tmp/" + adb wait-for-device + adb shell chmod "${ANDROID_EXEC_FILE_MODE}" "/data/local/tmp/hexagon_graph_execution" + adb wait-for-device + adb shell 'LD_LIBRARY_PATH=/data/local/tmp:$LD_LIBRARY_PATH' \ + "/data/local/tmp/hexagon_graph_execution" +fi diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py index 35efaf14d1b..3e2e408e6f9 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py @@ -4486,7 +4486,7 @@ class StreamingMeanIOUTest(test.TestCase): num_classes) sess.run(variables.local_variables_initializer()) confusion_matrix = update_op.eval() - self.assertAllEqual([[3, 2], [0, 5]], confusion_matrix) + self.assertAllEqual([[3, 0], [2, 5]], confusion_matrix) desired_miou = np.mean([3. / 5., 5. / 7.]) self.assertAlmostEqual(desired_miou, miou.eval()) @@ -4509,7 +4509,7 @@ class StreamingMeanIOUTest(test.TestCase): miou, update_op = metrics.streaming_mean_iou(predictions, labels, num_classes) sess.run(variables.local_variables_initializer()) - self.assertAllEqual([[0, 40], [0, 0]], update_op.eval()) + self.assertAllEqual([[0, 0], [40, 0]], update_op.eval()) self.assertEqual(0., miou.eval()) def testResultsWithSomeMissing(self): @@ -4540,7 +4540,7 @@ class StreamingMeanIOUTest(test.TestCase): miou, update_op = metrics.streaming_mean_iou( predictions, labels, num_classes, weights=weights) sess.run(variables.local_variables_initializer()) - self.assertAllEqual([[2, 2], [0, 4]], update_op.eval()) + self.assertAllEqual([[2, 0], [2, 4]], update_op.eval()) desired_miou = np.mean([2. / 4., 4. 
/ 6.]) self.assertAlmostEqual(desired_miou, miou.eval()) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index e81d0349b60..d3ffd692b28 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -84,12 +84,14 @@ load( "//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library", "tf_proto_library_cc", + "tf_additional_core_deps", + "tf_additional_lib_defines", + "tf_additional_lib_deps", "tf_additional_lib_hdrs", "tf_additional_lib_srcs", "tf_additional_minimal_lib_srcs", "tf_additional_proto_hdrs", "tf_additional_proto_srcs", - "tf_additional_lib_deps", "tf_additional_stream_executor_srcs", "tf_additional_cupti_wrapper_deps", "tf_additional_libdevice_data", @@ -1127,12 +1129,13 @@ cc_library( "platform/tracing.h", ], copts = tf_copts(), + defines = tf_additional_lib_defines(), linkopts = ["-ldl"], - deps = [ + deps = tf_additional_lib_deps() + [ ":lib_proto_parsing", ":protos_all_cc", - "//tensorflow/core/platform/default/build_config:platformlib", "//third_party/eigen3", + "//tensorflow/core/platform/default/build_config:platformlib", "@zlib_archive//:zlib", ], ) @@ -1352,7 +1355,7 @@ tf_cuda_library( ":protos_all_cc", "//third_party/eigen3", "//tensorflow/core/kernels:required", - ] + tf_additional_lib_deps(), + ] + tf_additional_core_deps(), alwayslink = 1, ) diff --git a/tensorflow/core/common_runtime/gpu/gpu_tracer.cc b/tensorflow/core/common_runtime/gpu/gpu_tracer.cc index ee93b19d291..981a6549889 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_tracer.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_tracer.cc @@ -215,7 +215,7 @@ Status CUPTIManager::DisableTrace() { void CUPTIManager::InternalBufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords) { VLOG(2) << "BufferRequested"; - void *p = port::aligned_malloc(kBufferSize, kBufferAlignment); + void *p = port::AlignedMalloc(kBufferSize, kBufferAlignment); *size = kBufferSize; *buffer = reinterpret_cast(p); *maxNumRecords = 0; @@ -246,7 +246,7 @@ void CUPTIManager::InternalBufferCompleted(CUcontext ctx, uint32_t streamId, LOG(WARNING) << "Dropped " << dropped << " activity records"; } } - port::aligned_free(buffer); + port::AlignedFree(buffer); } CUPTIManager *GetCUPTIManager() { diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.h b/tensorflow/core/common_runtime/gpu/pool_allocator.h index 5842758f0e3..91ce830df85 100644 --- a/tensorflow/core/common_runtime/gpu/pool_allocator.h +++ b/tensorflow/core/common_runtime/gpu/pool_allocator.h @@ -171,9 +171,9 @@ class BasicCPUAllocator : public SubAllocator { ~BasicCPUAllocator() override {} void* Alloc(size_t alignment, size_t num_bytes) override { - return port::aligned_malloc(num_bytes, alignment); + return port::AlignedMalloc(num_bytes, alignment); } - void Free(void* ptr, size_t num_bytes) override { port::aligned_free(ptr); } + void Free(void* ptr, size_t num_bytes) override { port::AlignedFree(ptr); } }; // Allocator for pinned CPU RAM that is made known to CUDA for the diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD index 3e4ab5bc179..5a7e7bb7e56 100644 --- a/tensorflow/core/debug/BUILD +++ b/tensorflow/core/debug/BUILD @@ -1,8 +1,12 @@ # Description: # TensorFlow Debugger (tfdbg). # -# Public Android targets: -# filegroup ":android_srcs" - Debugger source files for Android. +# Public target(s): +# +# ":debug" - Depending on this target causes a concrete implementation of +# DebuggerState to be constructed at initialization time, enabling +# TensorFlow Debugger (tfdbg) support. 
For details, please see +# core/common_runtime/debugger_state_interface.h. package( default_visibility = ["//tensorflow:internal"], @@ -39,14 +43,12 @@ tf_proto_library_cc( protodeps = ["//tensorflow/core:protos_all"], ) -# Depending on this target causes a concrete DebuggerState implementation -# to be registered at initialization time. For details, please see -# core/common_runtime/debugger_state_interface.h. cc_library( name = "debug", srcs = ["debug.cc"], copts = tf_copts(), linkstatic = 1, + visibility = ["//visibility:public"], deps = [ ":debug_graph_utils", "//tensorflow/core:core_cpu_internal", diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD index 89710a4654c..8ab8712c8cc 100644 --- a/tensorflow/core/distributed_runtime/rpc/BUILD +++ b/tensorflow/core/distributed_runtime/rpc/BUILD @@ -275,6 +275,7 @@ cc_library( "//tensorflow/core/distributed_runtime:server_lib", "//tensorflow/core/distributed_runtime:worker_env", "@grpc//:grpc++_unsecure", + "@grpc//:grpc_unsecure", ], alwayslink = 1, ) diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc index addf09672ab..99309a98cab 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc @@ -21,6 +21,7 @@ limitations under the License. #include "grpc++/grpc++.h" #include "grpc++/security/credentials.h" #include "grpc++/server_builder.h" +#include "grpc/support/alloc.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/device_mgr.h" @@ -41,6 +42,7 @@ limitations under the License. #include "tensorflow/core/framework/op.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mem.h" #include "tensorflow/core/public/session_options.h" namespace tensorflow { @@ -304,6 +306,11 @@ class GrpcServerFactory : public ServerFactory { class GrpcServerRegistrar { public: GrpcServerRegistrar() { + gpr_allocation_functions alloc_fns; + alloc_fns.malloc_fn = port::Malloc; + alloc_fns.realloc_fn = port::Realloc; + alloc_fns.free_fn = port::Free; + gpr_set_allocation_functions(alloc_fns); ServerFactory::Register("GRPC_SERVER", new GrpcServerFactory()); } }; diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc index 601d87fa554..812ce4bfe7e 100644 --- a/tensorflow/core/framework/allocator.cc +++ b/tensorflow/core/framework/allocator.cc @@ -68,7 +68,7 @@ class CPUAllocator : public Allocator { string Name() override { return "cpu"; } void* AllocateRaw(size_t alignment, size_t num_bytes) override { - void* p = port::aligned_malloc(num_bytes, alignment); + void* p = port::AlignedMalloc(num_bytes, alignment); if (cpu_allocator_collect_stats) { const std::size_t alloc_size = port::MallocExtension_GetAllocatedSize(p); mutex_lock l(mu_); @@ -89,7 +89,7 @@ class CPUAllocator : public Allocator { mutex_lock l(mu_); stats_.bytes_in_use -= alloc_size; } - port::aligned_free(ptr); + port::AlignedFree(ptr); } void GetStats(AllocatorStats* stats) override { diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc index 9e5bfc4d6e9..5959bce9e76 100644 --- a/tensorflow/core/framework/function.cc +++ b/tensorflow/core/framework/function.cc @@ -211,43 +211,6 @@ Status AddRetName(NameInfoIndex* name_info, const string& ret, return Status::OK(); } -Status 
BuildNodeOutputIndex(const FunctionDef::Node& node, - const InstantiateAttrValueMap& attrs, - GetFunctionSignature get_function, - const int arg_index, NameInfoIndex* name_info) { - const OpDef* node_sig = nullptr; - TF_RETURN_IF_ERROR(get_function(node.op(), &node_sig)); - if (node_sig->output_arg_size() == 0) { - // This node produces no output. - if (node.ret_size() != 1) { - return errors::InvalidArgument("Expect one ret name."); - } - return AddRetName(name_info, node.ret(0), {false, arg_index, 0, false, {}}); - } - const int num_retval = node_sig->output_arg_size(); - if (num_retval != node.ret_size()) { - return errors::InvalidArgument("Malformed function node (#ret): ", - num_retval, " vs. ", node.ret_size()); - } - int start = 0; - bool is_type_list; - DataTypeVector dtypes; - for (int i = 0; i < num_retval; ++i) { - TF_RETURN_IF_ERROR( - ArgNumType(attrs, node_sig->output_arg(i), &is_type_list, &dtypes)); - TF_RETURN_IF_ERROR( - AddRetName(name_info, node.ret(i), - {false, arg_index, start, is_type_list, dtypes})); - for (int j = 0; j < static_cast(dtypes.size()); ++j) { - TF_RETURN_IF_ERROR( - AddRetName(name_info, strings::StrCat(node.ret(i), ":", j), - {false, arg_index, start + j, false, {dtypes[j]}})); - } - start += dtypes.size(); - } - return Status::OK(); -} - Status BuildNodeOutputIndex(const NodeDef& node, const InstantiateAttrValueMap& attrs, GetFunctionSignature get_function, @@ -280,85 +243,6 @@ Status BuildNodeOutputIndex(const NodeDef& node, return Status::OK(); } -Status InstantiateNode(const FunctionDef::Node& fnode, - const InstantiateAttrValueMap& attrs, - GetFunctionSignature get_function, - const NameInfoIndex& name_info, GraphDef* gdef) { - const OpDef* fnode_sig = nullptr; - TF_CHECK_OK(get_function(fnode.op(), &fnode_sig)); - NodeDef* gnode = gdef->add_node(); - gnode->set_name(Name(gdef->node_size() - 1)); - gnode->set_op(fnode.op()); - - // Input - const int num_args = fnode_sig->input_arg_size(); - bool is_type_list; - DataTypeVector dtypes; - int fnode_arg_index = 0; - for (int i = 0; i < num_args; ++i) { - TF_RETURN_IF_ERROR( - ArgNumType(attrs, fnode_sig->input_arg(i), &is_type_list, &dtypes)); - if (!is_type_list) { - const NameInfoItem* item = - gtl::FindOrNull(name_info, fnode.arg(fnode_arg_index)); - if (item == nullptr) { - return errors::InvalidArgument("arg[", i, "] is not found: ", - ProtoShortDebugString(fnode)); - } - if (dtypes != item->dtypes) { - return errors::InvalidArgument("Invalid arg(", i, - ") for function arg: ", - DataTypeSliceString(dtypes), " vs. ", - DataTypeSliceString(item->dtypes), "."); - } - for (size_t j = 0; j < dtypes.size(); ++j) { - if (item->is_func_arg) { - gnode->add_input(Name(item->nid + j)); - } else { - gnode->add_input(Name(item->nid, item->idx + j)); - } - } - ++fnode_arg_index; - } else { - for (size_t j = 0; j < dtypes.size(); ++j) { - const NameInfoItem* item = - gtl::FindOrNull(name_info, fnode.arg(fnode_arg_index + j)); - if (item == nullptr) { - return errors::InvalidArgument("arg[", i + j, "] is not found: ", - ProtoShortDebugString(fnode)); - } - if (item->dtypes.size() != 1 || (item->dtypes[0] != dtypes[j])) { - return errors::InvalidArgument( - "Invalid typelist arg(", i + j, ") for function arg: ", - DataTypeSliceString(dtypes), " vs. ", - DataTypeSliceString(item->dtypes), "."); - } - if (item->is_func_arg) { - gnode->add_input(Name(item->nid)); - } else { - gnode->add_input(Name(item->nid, item->idx)); - } - } - fnode_arg_index += dtypes.size(); - } - } - // Control deps. 
- for (int i = 0; i < fnode.dep_size(); ++i) { - const NameInfoItem* item = gtl::FindOrNull(name_info, fnode.dep(i)); - if (item == nullptr) { - return errors::InvalidArgument("dep[", i, "] is not found."); - } - gnode->add_input(Dep(item->nid)); - } - - // Attrs. - for (const auto& p : attrs) { - (*gnode->mutable_attr())[p.first] = p.second; - } - - return Status::OK(); -} - Status InstantiateNode(const NodeDef& fnode, const InstantiateAttrValueMap& attrs, GetFunctionSignature get_function, @@ -448,38 +332,6 @@ Status InstantiateNode(const NodeDef& fnode, return Status::OK(); } -// FunctionDef::Node version -Status AddReturnNode(const OpDef::ArgDef& ret_def, - const InstantiateAttrValueMap& attrs, - const NameInfoIndex& name_info, int* ret_index, - InstantiationResult* result) { - bool is_type_list; - DataTypeVector dtypes; - TF_RETURN_IF_ERROR(ArgNumType(attrs, ret_def, &is_type_list, &dtypes)); - CHECK_GE(dtypes.size(), size_t{1}); - const NameInfoItem* item = gtl::FindOrNull(name_info, ret_def.name()); - if (item == nullptr) { - return errors::InvalidArgument("ret is not found."); - } - if (dtypes != item->dtypes) { - return errors::InvalidArgument("Invalid ret types ", ret_def.name(), " : ", - DataTypeVectorString(dtypes), " vs. ", - DataTypeVectorString(item->dtypes)); - } - GraphDef* gdef = &result->gdef; - for (size_t i = 0; i < dtypes.size(); ++i) { - NodeDef* gnode = gdef->add_node(); - gnode->set_name(Name(gdef->node_size() - 1)); - gnode->set_op("_Retval"); - gnode->add_input(Name(item->nid, item->idx + i)); - AddAttr("T", dtypes[i], gnode); - AddAttr("index", (*ret_index)++, gnode); - result->ret_types.push_back(dtypes[i]); - } - return Status::OK(); -} - -// NodeDef version Status AddReturnNode(const OpDef::ArgDef& ret_def, const InstantiateAttrValueMap& attrs, const ::tensorflow::protobuf::Map& ret_map, @@ -561,38 +413,6 @@ string Print(const AttrValue& attr_value) { return SummarizeAttrValue(attr_value); } -string Print(const FunctionDef::Node& node) { - string out; - for (int i = 0; i < node.ret_size(); ++i) { - const auto& name = node.ret(i); - if (i > 0) strings::StrAppend(&out, ", "); - strings::StrAppend(&out, name); - } - strings::StrAppend(&out, " = ", node.op()); - if (node.attr_size() > 0) { - std::vector entries; - for (auto p : node.attr()) { - entries.push_back(strings::StrCat(p.first, "=", Print(p.second))); - } - sort(entries.begin(), entries.end()); - strings::StrAppend(&out, "[", str_util::Join(entries, ", "), "]"); - } - strings::StrAppend(&out, "("); - for (int i = 0; i < node.arg_size(); ++i) { - if (i > 0) strings::StrAppend(&out, ", "); - strings::StrAppend(&out, node.arg(i)); - } - strings::StrAppend(&out, ")"); - if (node.dep_size() > 0) { - strings::StrAppend(&out, " @ "); - for (int i = 0; i < node.dep_size(); ++i) { - if (i > 0) strings::StrAppend(&out, ", "); - strings::StrAppend(&out, node.dep(i)); - } - } - return out; -} - // TODO(josh11b): Merge this with SummarizeNodeDef(). string Print(const NodeDef& n) { string out; @@ -650,17 +470,11 @@ string Print(const FunctionDef& fdef) { strings::StrAppend(&out, Print(sig.output_arg(i))); } strings::StrAppend(&out, ") {\n"); - if (fdef.node_def_size() > 0 || fdef.ret_size() > 0) { - for (const auto& n : fdef.node_def()) { - strings::StrAppend(&out, " ", Print(n), "\n"); - } - for (const auto& r : fdef.ret()) { - strings::StrAppend(&out, " return ", r.first, " = ", r.second, "\n"); - } - } else { // TODO(josh11b): Eventually remove this case. 
- for (const auto& n : fdef.node()) { - strings::StrAppend(&out, " ", Print(n), "\n"); - } + for (const auto& n : fdef.node_def()) { + strings::StrAppend(&out, " ", Print(n), "\n"); + } + for (const auto& r : fdef.ret()) { + strings::StrAppend(&out, " return ", r.first, " = ", r.second, "\n"); } strings::StrAppend(&out, "}\n"); return out; @@ -772,92 +586,47 @@ Status InstantiateFunction(const FunctionDef& fdef, // Makes a copy of all attrs in fdef and substitutes placeholders. // After this step, every attr is bound to a concrete value. std::vector node_attrs; - if (fdef.node_def_size() > 0 || fdef.ret_size() > 0) { - node_attrs.resize(fdef.node_def_size()); - for (int i = 0; i < fdef.node_def_size(); ++i) { - for (auto attr : fdef.node_def(i).attr()) { - if (!SubstitutePlaceholders(substitute, &attr.second)) { - return errors::InvalidArgument("Failed to bind all placeholders in ", - SummarizeAttrValue(attr.second)); - } - if (!node_attrs[i].insert(attr).second) { - return errors::Internal("Somehow duplicated: ", attr.first); - } + node_attrs.resize(fdef.node_def_size()); + for (int i = 0; i < fdef.node_def_size(); ++i) { + for (auto attr : fdef.node_def(i).attr()) { + if (!SubstitutePlaceholders(substitute, &attr.second)) { + return errors::InvalidArgument("Failed to bind all placeholders in ", + SummarizeAttrValue(attr.second)); + } + if (!node_attrs[i].insert(attr).second) { + return errors::Internal("Somehow duplicated: ", attr.first); } - TF_RETURN_IF_ERROR( - AddDefaultAttrs(fdef.node_def(i).op(), get_function, &node_attrs[i])); } + TF_RETURN_IF_ERROR( + AddDefaultAttrs(fdef.node_def(i).op(), get_function, &node_attrs[i])); + } - for (int i = 0; i < fdef.node_def_size(); ++i) { - s = BuildNodeOutputIndex(fdef.node_def(i), node_attrs[i], get_function, - gdef->node_size() + i, &name_info); - if (!s.ok()) { - errors::AppendToMessage(&s, "In ", SummarizeNodeDef(fdef.node_def(i))); - return s; - } + for (int i = 0; i < fdef.node_def_size(); ++i) { + s = BuildNodeOutputIndex(fdef.node_def(i), node_attrs[i], get_function, + gdef->node_size() + i, &name_info); + if (!s.ok()) { + errors::AppendToMessage(&s, "In ", SummarizeNodeDef(fdef.node_def(i))); + return s; } - // Emits one gdef.node for each fdef.node_def. - for (int i = 0; i < fdef.node_def_size(); ++i) { - s = InstantiateNode(fdef.node_def(i), node_attrs[i], get_function, - name_info, gdef); - if (!s.ok()) { - errors::AppendToMessage(&s, "In ", SummarizeNodeDef(fdef.node_def(i))); - return s; - } + } + // Emits one gdef.node for each fdef.node_def. + for (int i = 0; i < fdef.node_def_size(); ++i) { + s = InstantiateNode(fdef.node_def(i), node_attrs[i], get_function, + name_info, gdef); + if (!s.ok()) { + errors::AppendToMessage(&s, "In ", SummarizeNodeDef(fdef.node_def(i))); + return s; } + } - // Emits nodes for the function's return values. - int ret_index = 0; - for (const OpDef::ArgDef& ret_def : sig.output_arg()) { - s = AddReturnNode(ret_def, attr_values, fdef.ret(), name_info, &ret_index, - result); - if (!s.ok()) { - errors::AppendToMessage(&s, "In function output ", Print(ret_def)); - return s; - } - } - } else { // TODO(josh11b): Eventually remove this case. 
- node_attrs.resize(fdef.node_size()); - for (int i = 0; i < fdef.node_size(); ++i) { - for (auto attr : fdef.node(i).attr()) { - if (!SubstitutePlaceholders(substitute, &attr.second)) { - return errors::InvalidArgument("Failed to bind all placeholders in ", - SummarizeAttrValue(attr.second)); - } - if (!node_attrs[i].insert(attr).second) { - return errors::Internal("Somehow duplicated: ", attr.first); - } - } - TF_RETURN_IF_ERROR( - AddDefaultAttrs(fdef.node(i).op(), get_function, &node_attrs[i])); - } - - for (int i = 0; i < fdef.node_size(); ++i) { - s = BuildNodeOutputIndex(fdef.node(i), node_attrs[i], get_function, - gdef->node_size() + i, &name_info); - if (!s.ok()) { - errors::AppendToMessage(&s, "In ", Print(fdef.node(i))); - return s; - } - } - // Emits one gdef.node for each fdef.node. - for (int i = 0; i < fdef.node_size(); ++i) { - s = InstantiateNode(fdef.node(i), node_attrs[i], get_function, name_info, - gdef); - if (!s.ok()) { - errors::AppendToMessage(&s, "In ", Print(fdef.node(i))); - return s; - } - } - - // Emits nodes for the function's return values. - int ret_index = 0; - for (const OpDef::ArgDef& ret_def : sig.output_arg()) { - s = AddReturnNode(ret_def, attr_values, name_info, &ret_index, result); - if (!s.ok()) { - errors::AppendToMessage(&s, "In function output ", Print(ret_def)); - return s; - } + // Emits nodes for the function's return values. + int ret_index = 0; + for (const OpDef::ArgDef& ret_def : sig.output_arg()) { + s = AddReturnNode(ret_def, attr_values, fdef.ret(), name_info, &ret_index, + result); + if (!s.ok()) { + errors::AppendToMessage(&s, "In function output ", Print(ret_def)); + return s; } } diff --git a/tensorflow/core/framework/function.proto b/tensorflow/core/framework/function.proto index 5a394d64809..bd01e86da3a 100644 --- a/tensorflow/core/framework/function.proto +++ b/tensorflow/core/framework/function.proto @@ -30,61 +30,7 @@ message FunctionDef { // Attributes specific to this function definition. map attr = 5; - // TO BE REPLACED - - // The body of the function. - repeated Node node = 2; // function.node.ret[*] are unique. - - // A node is a multi-value assignment: - // (ret[0], ret[1], ...) = func(arg[0], arg[1], ...) - // - // By convention, "func" is resolved by consulting with a user-defined - // library first. If not resolved, "func" is assumed to be a builtin op. - message Node { - // This node produces multiple outputs. They are named ret[0], - // ret[1], ..., etc. - // - // REQUIRES: function.node.ret[*] are unique across all nodes. - // REQUIRES: ret.size == func/op def's number of output args. - repeated string ret = 1; - - // The op/function name. - string op = 2; - - // Arguments passed to this func/op. - // - // arg[i] must be either one of - // function.signature.input_args[*].name or one of - // function.node[*].ret[*]. - // - // REQUIRES: arg.size == func/op def's number of input args. - repeated string arg = 3; - - // Control dependencies. - // - // dep[i] must be one of function.node[*].ret[*] or one of - // function.signature.input_args[*].name. - repeated string dep = 4; - - // Attrs. - // - // 'attr' maps names defined by 'func's attr defs to attr values. - // attr values may have placeholders which are substituted - // recursively by concrete values when this node is instantiated. - // These placeholders must name an attr listed in the FunctionDef's - // signature. 
- map attr = 5; - } - - // WILL REPLACE THE ABOVE - - // If node_def is present, and the consumer is at GraphDef version - // >= 12, then these fields are used and `node` is ignored. If the - // consumer's GraphDef version is < 12 or this field is empty, then - // `node` is used. This allows producers to fill both fields to - // remain compatible with old consumers. At some future GraphDef - // version, `node` will be ignored even if `node_def` is empty. - // TODO(josh11b): Finish this transition. + // NOTE: field id 2 deleted on Jan 11, 2016, GraphDef version 21. // In both of the following fields, there is the need to specify an // output that is used as either the input to another node (in @@ -120,6 +66,10 @@ message FunctionDef { // The body of the function. Unlike the NodeDefs in a GraphDef, attrs // may have values of type `placeholder` and the `input` field uses // the "output" format above. + + // By convention, "op" in node_def is resolved by consulting with a + // user-defined library first. If not resolved, "func" is assumed to + // be a builtin op. repeated NodeDef node_def = 3; // A mapping from the output arg names from `signature` to the diff --git a/tensorflow/core/framework/function_test.cc b/tensorflow/core/framework/function_test.cc index a768e18b014..8bce215a9af 100644 --- a/tensorflow/core/framework/function_test.cc +++ b/tensorflow/core/framework/function_test.cc @@ -48,52 +48,8 @@ y: A scalar in type T. static InstantiateAttrValueMap kNoAttrs; -TEST(TFunc, SquarePlusOneOld) { - auto fdef = FDH::Define( // Create a FunctionDef using Function::Nodes. - // Name - "SquarePlusOne", - // Args - {"x: T"}, - // Return values - {"y: T"}, - // Attrs - {"T: {float, double, int32, int64}"}, - // Nodes - {// a = Square(x) - {{"a"}, "Square", {"x"}, {{"T", "$T"}}}, - // o = One() - // NOTE: We can also have a Cast(x) instead. - {{"o"}, "One", {}, {{"T", "$T"}}}, - // y = Add(a, o) - {{"y"}, "Add", {"a", "o"}, {{"T", "$T"}}}}); - - const char* e = R"P( -SquarePlusOne[T:{float, double, int32, int64}](x:T) -> (y:T) { - a = Square[T=$T](x) - o = One[T=$T]() - y = Add[T=$T](a:y:0, o:y:0) - return y = y:z:0 -} -)P"; - EXPECT_EQ(DebugString(fdef), e); - - // Instantiate one with T=float - InstantiationResult result; - TF_ASSERT_OK(InstantiateFunction(fdef, {{"T", DT_FLOAT}}, GetOpSig, &result)); - const char* e2 = R"P( -(n0:float) -> (n3:float) { - n1 = Square[T=float](n0) - n2 = One[T=float]() - n3 = Add[T=float](n1, n2) -} -)P"; - EXPECT_EQ(result.arg_types, DataTypeVector({DT_FLOAT})); - EXPECT_EQ(result.ret_types, DataTypeVector({DT_FLOAT})); - EXPECT_EQ(DebugString(result.gdef), e2); -} - -TEST(TFunc, SquarePlusOneNodeDef) { - auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. +TEST(TFunc, SquarePlusOne) { + auto fdef = FDH::Create( // Name "SquarePlusOne", // Inputs @@ -138,8 +94,8 @@ SquarePlusOne[T:{float, double, int32, int64}](x:T) -> (y:T) { EXPECT_EQ(DebugString(result.gdef), e2); } -TEST(TFunc, ControlDepNodeDef) { - auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. +TEST(TFunc, ControlDep) { + auto fdef = FDH::Create( // Name "ControlDep", // Inputs @@ -190,44 +146,8 @@ REGISTER_OP("HasDefaultType") // This verifies that a function using an op before a type attr (with // a default) is added, still works. This is important for backwards // compatibilty. -TEST(TFunc, MissingTypeAttrOld) { - auto fdef = FDH::Define( // Create a FunctionDef using Function::Nodes. 
- // Name - "BackCompat", - // Args - {}, - // Return values - {"y: float"}, - // Attrs - {}, - // Nodes - {// y = HasDefaultType(x), T missing, defaults to float - {{"y"}, "HasDefaultType", {}, {}}}); - - const char* e = R"P( -BackCompat() -> (y:float) { - y = HasDefaultType() - return y = y:out:0 -} -)P"; - EXPECT_EQ(DebugString(fdef), e); - - InstantiationResult result; - TF_ASSERT_OK( - InstantiateFunction(fdef, InstantiateAttrValueMap{}, GetOpSig, &result)); - // Should get T=float from Op's default. - const char* e2 = R"P( -() -> (n0:float) { - n0 = HasDefaultType[T=float]() -} -)P"; - EXPECT_EQ(result.arg_types, DataTypeVector()); - EXPECT_EQ(result.ret_types, DataTypeVector({DT_FLOAT})); - EXPECT_EQ(DebugString(result.gdef), e2); -} - -TEST(TFunc, MissingTypeAttrNodeDef) { - auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. +TEST(TFunc, MissingTypeAttr) { + auto fdef = FDH::Create( // Name "BackCompat", // Args @@ -264,11 +184,8 @@ BackCompat() -> (y:float) { EXPECT_EQ(DebugString(result.gdef), e2); } -TEST(TFunc, NTimesTNodeDef) { - // Note that the equivalent FunctionDef using FunctionDef::Node requires - // using a _ListToArray to package up the two inputs to AddN as a single - // N*T edge. - auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. +TEST(TFunc, NTimesT) { + auto fdef = FDH::Create( // Name "NTimesT", // Inputs @@ -790,8 +707,8 @@ TEST(InstantiateErrors, TypeList_Missing_Arg) { "input unknown is not found"); } -TEST(InstantiateErrors, NodeDef_TooManyInputs) { - auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. +TEST(InstantiateErrors, TooManyInputs) { + auto fdef = FDH::Create( // Name "TooManyInputs", // Inputs @@ -811,8 +728,8 @@ TEST(InstantiateErrors, NodeDef_TooManyInputs) { "Expected input[2] == 'x' to be a control input."); } -TEST(InstantiateErrors, NodeDef_TooFewInputs) { - auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. +TEST(InstantiateErrors, TooFewInputs) { + auto fdef = FDH::Create( // Name "TooFewInputs", // Inputs @@ -832,8 +749,8 @@ TEST(InstantiateErrors, NodeDef_TooFewInputs) { "Attempt to access beyond input size: 2 >= 2"); } -TEST(InstantiateErrors, NodeDef_TooManyInputsFromArray1) { - auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. +TEST(InstantiateErrors, TooManyInputsFromArray1) { + auto fdef = FDH::Create( // Name "TooManyInputsFromArray", // Inputs @@ -860,8 +777,8 @@ TEST(InstantiateErrors, NodeDef_TooManyInputsFromArray1) { "Expected input[1] == 'y' to be a control input."); } -TEST(InstantiateErrors, NodeDef_TooManyInputsFromArray2) { - auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. +TEST(InstantiateErrors, TooManyInputsFromArray2) { + auto fdef = FDH::Create( // Name "TooManyInputsFromArray", // Inputs @@ -888,8 +805,8 @@ TEST(InstantiateErrors, NodeDef_TooManyInputsFromArray2) { "Input a:output too long for inputs"); } -TEST(InstantiateErrors, NodeDef_TypeMismatch) { - auto fdef = FDH::Create( // Create a FunctionDef using NodeDefs. 
+TEST(InstantiateErrors, TypeMismatch) { + auto fdef = FDH::Create( // Name "TypeMismatch", // Inputs diff --git a/tensorflow/core/framework/graph_def_util.cc b/tensorflow/core/framework/graph_def_util.cc index 58fb8cf611b..b76ab40b683 100644 --- a/tensorflow/core/framework/graph_def_util.cc +++ b/tensorflow/core/framework/graph_def_util.cc @@ -178,14 +178,8 @@ void OpsUsedByGraph(const GraphDef& graph_def, while (!functions_to_process.empty()) { const FunctionDef* fun = functions_to_process.back(); functions_to_process.pop_back(); - if (fun->node_def_size() > 0) { - for (const auto& node : fun->node_def()) { - mark_op_as_used(node.op()); - } - } else { // TODO(josh11b): Eventually drop support for this. - for (const auto& node : fun->node()) { - mark_op_as_used(node.op()); - } + for (const auto& node : fun->node_def()) { + mark_op_as_used(node.op()); } } diff --git a/tensorflow/core/framework/load_library.cc b/tensorflow/core/framework/load_library.cc index f56e5fae1bc..f8253353008 100644 --- a/tensorflow/core/framework/load_library.cc +++ b/tensorflow/core/framework/load_library.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mem.h" namespace tensorflow { @@ -91,7 +92,7 @@ Status LoadLibrary(const char* library_filename, void** result, } string str; library.op_list.SerializeToString(&str); - char* str_buf = reinterpret_cast(malloc(str.length())); + char* str_buf = reinterpret_cast(port::Malloc(str.length())); memcpy(str_buf, str.data(), str.length()); *buf = str_buf; *len = str.length(); diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc index 5ff8aea02c4..568346a71d3 100644 --- a/tensorflow/core/framework/node_def_util.cc +++ b/tensorflow/core/framework/node_def_util.cc @@ -185,6 +185,17 @@ Status GetNodeAttr(const AttrSlice& attrs, StringPiece attr_name, return Status::OK(); } +Status GetNodeAttr(const AttrSlice& attrs, StringPiece attr_name, + std::vector* value) { + const AttrValue* attr_value; + TF_RETURN_IF_ERROR(attrs.Find(attr_name, &attr_value)); + TF_RETURN_IF_ERROR(AttrValueHasType(*attr_value, "list(func)")); + for (const auto& v : attr_value->list().func()) { + value->emplace_back(v); + } + return Status::OK(); +} + namespace { // Helper for InOutTypesForNode(). Status AddArgToSig(const NodeDef& node_def, const OpDef::ArgDef& arg_def, diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h index 85b83c4d74b..5c4d2272682 100644 --- a/tensorflow/core/framework/node_def_util.h +++ b/tensorflow/core/framework/node_def_util.h @@ -150,6 +150,9 @@ Status GetNodeAttr(const AttrSlice& attrs, StringPiece attr_name, Status GetNodeAttr(const AttrSlice& attrs, StringPiece attr_name, const NameAttrList** value); // type: "func" +Status GetNodeAttr(const AttrSlice& attrs, StringPiece attr_name, + std::vector* value); // type: "list(func)" + // Computes the input and output types for a specific node. 
 // REQUIRES: ValidateOpDef(op_def).ok()
 Status InOutTypesForNode(const NodeDef& node_def, const OpDef& op_def,
diff --git a/tensorflow/core/framework/tracking_allocator_test.cc b/tensorflow/core/framework/tracking_allocator_test.cc
index 98134392ef7..850cdc39099 100644
--- a/tensorflow/core/framework/tracking_allocator_test.cc
+++ b/tensorflow/core/framework/tracking_allocator_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -27,7 +28,7 @@ class TestableSizeTrackingAllocator : public Allocator {
  public:
   string Name() override { return "test"; }
   void* AllocateRaw(size_t /*alignment*/, size_t num_bytes) override {
-    void* ptr = malloc(num_bytes);
+    void* ptr = port::Malloc(num_bytes);
     size_map_[ptr] = num_bytes;
     return ptr;
   }
@@ -35,7 +36,7 @@ class TestableSizeTrackingAllocator : public Allocator {
     const auto& iter = size_map_.find(ptr);
     EXPECT_NE(size_map_.end(), iter);
     size_map_.erase(iter);
-    free(ptr);
+    port::Free(ptr);
   }
   bool TracksAllocationSizes() override { return true; }
   size_t RequestedSize(void* ptr) override {
diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc
index 13c6a2146bc..ef4dd047875 100644
--- a/tensorflow/core/graph/testlib.cc
+++ b/tensorflow/core/graph/testlib.cc
@@ -254,6 +254,10 @@ Node* Identity(Graph* g, Node* input, int index) {
 
 Node* Add(Graph* g, Node* in0, Node* in1) { return Binary(g, "Add", in0, in1); }
 
+Node* Reverse(Graph* g, Node* tensor, Node* axis) {
+  return Binary(g, "ReverseV2", tensor, axis);
+}
+
 Node* Error(Graph* g, Node* input, const string& errmsg) {
   Node* ret;
   TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Error")
diff --git a/tensorflow/core/graph/testlib.h b/tensorflow/core/graph/testlib.h
index 7b4425bfeb9..7a23b20c2c8 100644
--- a/tensorflow/core/graph/testlib.h
+++ b/tensorflow/core/graph/testlib.h
@@ -100,6 +100,9 @@ Node* Multi(Graph* g, const string& func, gtl::ArraySlice<Node*> ins);
 // Adds a binary add node in "g" doing in0 + in1.
 Node* Add(Graph* g, Node* in0, Node* in1);
 
+// Reverses dimensions of <tensor> according to <axis>.
+Node* Reverse(Graph* g, Node* tensor, Node* axis);
+
 // Generates random unit uniform distribution of the input shape.
 Node* RandomUniform(Graph* g, Node* input, DataType dtype);
 
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index b117b84c6d8..1b5f5292d3a 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -256,6 +256,15 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "stage_op",
+    srcs = ["stage_op.cc"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "queue_base",
     srcs = ["queue_base.cc"],
@@ -1161,6 +1170,7 @@ cc_library(
         ":session_ops",
         ":sparse_conditional_accumulator_op",
         ":stack_ops",
+        ":stage_op",
         ":tensor_array_ops",
     ],
 )
@@ -3228,6 +3238,7 @@ tf_kernel_library(
     prefix = "training_ops",
     deps = [
         ":bounds_check",
+        ":variable_ops",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:training_ops_op_lib",
diff --git a/tensorflow/core/kernels/conv_ops.h b/tensorflow/core/kernels/conv_ops.h
index 897afe77966..60091fc27fd 100644
--- a/tensorflow/core/kernels/conv_ops.h
+++ b/tensorflow/core/kernels/conv_ops.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/util/tensor_format.h"
 
 #if GOOGLE_CUDA
@@ -44,9 +45,9 @@ class LaunchConv2DOp {
 template <typename T>
 struct Im2ColBufferResource : public ResourceBase {
   Im2ColBufferResource() {
-    data = static_cast<T*>(malloc(size * sizeof(T)));
+    data = static_cast<T*>(port::Malloc(size * sizeof(T)));
   }
-  ~Im2ColBufferResource() { free(data); }
+  ~Im2ColBufferResource() { port::Free(data); }
   // This mutex ensures that only a single operation at a time is able to use
   // the buffer memory held by this resource.
   mutex mu;
diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
index 1648a54f2fa..ecebd3c599c 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
@@ -26,9 +26,9 @@ limitations under the License.
 namespace tensorflow {
 
 const bool SHOW_DBG_IN_SOC = false;
-const bool DBG_DUMP_RESULT = false;
 const bool DBG_USE_DUMMY_INPUT = false;
 const bool DBG_USE_SAMPLE_INPUT = false;
+const bool DBG_SHOW_RESULT = false;
 const int64 FLAG_ENABLE_PANDA_BINARY_INPUT = 0x01;
 
 #ifdef USE_HEXAGON_LIBS
@@ -169,7 +169,7 @@ bool HexagonControlWrapper::SetupGraph(
   return soc_interface_ConstructGraph();
 
   // Keep following comment to use dummy graph construction
-  // return soc_interface_SetupGraphDummy(3 /* inception version */);
+  // return soc_interface_setupDummyGraph(3 /* inception version */);
 }
 
 bool HexagonControlWrapper::ExecuteGraph() {
@@ -213,7 +213,7 @@ bool HexagonControlWrapper::ReadOutputNode(
     // TODO: Accept all results
     std::get<2>(output) = DT_FLOAT;
     outputs->emplace_back(output);
-    if (DBG_DUMP_RESULT) {
+    if (DBG_SHOW_RESULT) {
       const int byte_size = std::get<1>(output);
       const int element_count = byte_size / sizeof(float);
       const float* float_array =
           reinterpret_cast<const float*>(std::get<0>(output));
diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc
index f644fa02ed3..7852499965c 100644
--- a/tensorflow/core/kernels/reverse_op.cc
+++ b/tensorflow/core/kernels/reverse_op.cc
@@ -27,23 +27,83 @@ limitations under the License.
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
+namespace {
+
+// Reverse rows (middle dimension) of a three dimensional tensor.
+// NUM_CHANNELS can be <= 0 to compute it dynamically from <input>.
+// Otherwise, it must equal input.dim_size(2) and is used as a compile-time
+// constant.
+template <int NUM_CHANNELS>
+void ReverseRows(OpKernelContext* context, const Tensor& input,
+                 Tensor* result) {
+  auto work = [&input, result](int64 start, int64 end) {
+    const int64 inner_size =
+        NUM_CHANNELS > 0 ? NUM_CHANNELS : input.dim_size(2);
+    const int64 middle_size = input.dim_size(1);
+    const int64 row_size = inner_size * middle_size;
+    DCHECK_EQ(input.dim_size(2), inner_size);
+
+    const int32* in_ptr = input.bit_casted_tensor<int32, 3>().data();
+    int32* out_ptr = result->bit_casted_tensor<int32, 3>().data();
+
+    in_ptr += start * row_size;
+    out_ptr += start * row_size;
+
+    for (int outer_dim = start; outer_dim < end; ++outer_dim) {
+      out_ptr += row_size;
+      int remaining = middle_size;
+      while (remaining > 0) {
+        out_ptr -= inner_size;
+        memcpy(out_ptr, in_ptr, inner_size * sizeof(float));
+        in_ptr += inner_size;
+        --remaining;
+      }
+
+      out_ptr += row_size;
+    }
+  };
+
+  // Shard across outer dimension.
+  const int64 N = input.dim_size(0);
+  const int64 cost_per_unit = input.NumElements() / N;
+  auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
+  Shard(worker_threads->num_threads, worker_threads->workers, N, cost_per_unit,
+        std::move(work));
+}
+
+}  // namespace
+
 template <typename Device, typename T, int NDIMS>
 void HandleReverseCase(OpKernelContext* context,
                        typename TTypes<bool, 1>::ConstTensor dims,
                        Tensor* result) {
+  const Tensor& input = context->input(0);
+
+  // Use optimized reverse if possible.
+  if (NDIMS == 3 && std::is_same<Device, CPUDevice>::value &&
+      std::is_same<T, float>::value && (!dims(0) && dims(1) && !dims(2))) {
+    if (input.dim_size(2) == 3) {
+      ReverseRows<3>(context, input, result);
+    } else {
+      ReverseRows<-1>(context, input, result);
+    }
+    return;
+  }
+
   typename Eigen::array<bool, NDIMS> axes_di;
   for (int i = 0; i < NDIMS; i++) {
     axes_di[i] = dims(i);
   }
   functor::Reverse<Device, T, NDIMS>()(context->eigen_device<Device>(),
-                                       context->input(0).tensor<T, NDIMS>(),
-                                       axes_di, result->tensor<T, NDIMS>());
+                                       input.tensor<T, NDIMS>(), axes_di,
+                                       result->tensor<T, NDIMS>());
 }
 
 template <typename Device, typename T>
@@ -105,13 +165,26 @@ class ReverseOp : public OpKernel {
 template <typename Device, typename T, int NDIMS>
 void HandleReverseV2Case(OpKernelContext* context,
                          const gtl::ArraySlice<bool>& axes, Tensor* result) {
+  const Tensor& input = context->input(0);
+
+  // Use optimized reverse if possible.
+  if (NDIMS == 3 && std::is_same<Device, CPUDevice>::value &&
+      std::is_same<T, float>::value && (!axes[0] && axes[1] && !axes[2])) {
+    if (input.dim_size(2) == 3) {
+      ReverseRows<3>(context, input, result);
+    } else {
+      ReverseRows<-1>(context, input, result);
+    }
+    return;
+  }
+
   typename Eigen::array<bool, NDIMS> axes_di;
   for (int i = 0; i < NDIMS; i++) {
     axes_di[i] = axes[i];
   }
   functor::Reverse<Device, T, NDIMS>()(context->eigen_device<Device>(),
-                                       context->input(0).tensor<T, NDIMS>(),
-                                       axes_di, result->tensor<T, NDIMS>());
+                                       input.tensor<T, NDIMS>(), axes_di,
+                                       result->tensor<T, NDIMS>());
 }
 
 template <typename Device, typename T>
@@ -158,6 +231,11 @@ class ReverseV2Op : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, input.shape(), &output));
 
+// TODO(cwhipkey): we can do dimension folding to reduce, e.g., a reverse of
+// a single dimension to the dims=3 or dims=2 case, regardless of the number
+// of dimensions in the tensor. This would let some ops use faster
+// lower-dimension code (and use optimized versions).
+
 #define HANDLE_REVERSE(NDIMS)                                           \
   case NDIMS:                                                           \
     HandleReverseV2Case<Device, T, NDIMS>(context, axes_dense, output); \
diff --git a/tensorflow/core/kernels/reverse_op_test.cc b/tensorflow/core/kernels/reverse_op_test.cc
index ee59adf9dab..19e25b887d7 100644
--- a/tensorflow/core/kernels/reverse_op_test.cc
+++ b/tensorflow/core/kernels/reverse_op_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/fake_input.h" #include "tensorflow/core/framework/graph.pb.h" @@ -31,6 +32,7 @@ limitations under the License. #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" namespace tensorflow { namespace { @@ -109,5 +111,104 @@ TEST_F(ReverseOpTest, Reverse_1234) { test::ExpectTensorEqual(expected, *params_tensor); } +static SessionOptions GetOptions(int intra_threads) { + SessionOptions opts; + opts.config.set_intra_op_parallelism_threads(intra_threads); + opts.config.set_inter_op_parallelism_threads(1); + return opts; +} + +// Creates a Graph which "reduce"s a 3D float tensor of "num" elements +// into a scalar. +static Graph* Reverse(TensorShape shape, int reverse_axis) { + Graph* g = new Graph(OpRegistry::Global()); + Tensor data(DT_FLOAT, shape); + data.flat().setRandom(); + Tensor axes(DT_INT32, TensorShape({1})); + axes.flat()(0) = reverse_axis; + test::graph::Reverse(g, test::graph::Constant(g, data), + test::graph::Constant(g, axes)); + return g; +} + +static void RunReverseRowsBenchmark(int iters, int outer_dim, int middle_dim, + int intra_threads, int channels) { + SessionOptions opts = GetOptions(intra_threads); + TensorShape shape{outer_dim, middle_dim, channels}; + const int64 num_items = static_cast(iters) * shape.num_elements(); + testing::ItemsProcessed(num_items); + testing::BytesProcessed(num_items * sizeof(float)); + testing::UseRealTime(); + test::Benchmark("cpu", Reverse(shape, 1), &opts).Run(iters); +} + +static void BM_ReverseRowsOf1Channel_1T(int iters, int outer_dim, + int middle_dim) { + RunReverseRowsBenchmark(iters, outer_dim, middle_dim, 1 /* intra_threads */, + 1 /* channels */); +} + +BENCHMARK(BM_ReverseRowsOf1Channel_1T) + ->ArgPair(288, 288) + ->ArgPair(1024, 1024) + ->ArgPair(10 * 1024, 1024); + +static void BM_ReverseRowsOf1Channel_4T(int iters, int outer_dim, + int middle_dim) { + RunReverseRowsBenchmark(iters, outer_dim, middle_dim, 4 /* intra_threads */, + 1 /* channels */); +} + +BENCHMARK(BM_ReverseRowsOf1Channel_4T) + ->ArgPair(288, 288) + ->ArgPair(1024, 1024) + ->ArgPair(10 * 1024, 1024); + +static void BM_ReverseRowsOf3Channels_1T(int iters, int outer_dim, + int middle_dim) { + RunReverseRowsBenchmark(iters, outer_dim, middle_dim, 1 /* intra_threads */, + 3 /* channels */); +} + +BENCHMARK(BM_ReverseRowsOf3Channels_1T) + ->ArgPair(288, 288) + ->ArgPair(224, 224) + ->ArgPair(1024, 1024) + ->ArgPair(10 * 1024, 1024); + +static void BM_ReverseRowsOf3Channels_4T(int iters, int outer_dim, + int middle_dim) { + RunReverseRowsBenchmark(iters, outer_dim, middle_dim, 4 /* intra_threads */, + 3 /* channels */); +} + +BENCHMARK(BM_ReverseRowsOf3Channels_4T) + ->ArgPair(288, 288) + ->ArgPair(224, 224) + ->ArgPair(1024, 1024) + ->ArgPair(10 * 1024, 1024); + +static void BM_ReverseRowsOf4Channels_1T(int iters, int outer_dim, + int middle_dim) { + RunReverseRowsBenchmark(iters, outer_dim, middle_dim, 1 /* intra_threads */, + 4 /* channels */); +} + +BENCHMARK(BM_ReverseRowsOf4Channels_1T) + ->ArgPair(288, 288) + ->ArgPair(1024, 1024) + ->ArgPair(10 * 1024, 1024); + +static void BM_ReverseRowsOf4Channels_4T(int iters, int outer_dim, + int middle_dim) { + 
RunReverseRowsBenchmark(iters, outer_dim, middle_dim, 4 /* intra_threads */, + 4 /* channels */); +} + +BENCHMARK(BM_ReverseRowsOf4Channels_4T) + ->ArgPair(288, 288) + ->ArgPair(1024, 1024) + ->ArgPair(10 * 1024, 1024); + } // namespace } // namespace tensorflow diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc new file mode 100644 index 00000000000..34db850013d --- /dev/null +++ b/tensorflow/core/kernels/stage_op.cc @@ -0,0 +1,130 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { + +namespace { + +class Buffer : public ResourceBase { + public: + explicit Buffer() {} + + typedef std::vector Tuple; + + // the Buffer takes ownership of the Tuple + void Put(Tuple* tuple) { + mutex_lock l(mu_); + buf_.push_back(std::move(*tuple)); + non_empty_cond_var_.notify_one(); // maybe possible to optimize by reducing + // how often this signal is sent + } + + void Get(Tuple* tuple) { // TODO(zhifengc): Support cancellation. + mutex_lock l(mu_); + while (buf_.empty()) { + non_empty_cond_var_.wait(l); + } + + *tuple = std::move(buf_.front()); + buf_.pop_front(); + } + + string DebugString() { + mutex_lock l(mu_); + return strings::StrCat("Staging size: ", buf_.size()); + } + + private: + mutex mu_; + condition_variable non_empty_cond_var_; + std::deque buf_ GUARDED_BY(mu_); +}; + +Status CreateBuffer(Buffer** ret) { + *ret = new Buffer; + return Status::OK(); +} + +Status GetBuffer(OpKernelContext* ctx, const NodeDef& ndef, Buffer** buf) { + auto rm = ctx->resource_manager(); + ContainerInfo cinfo; + TF_RETURN_IF_ERROR(cinfo.Init(rm, ndef, true /* use name() */)); + TF_RETURN_IF_ERROR(rm->LookupOrCreate(cinfo.container(), cinfo.name(), + buf, CreateBuffer)); + return Status::OK(); +} + +} // namespace + +class StageOp : public OpKernel { + public: + explicit StageOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + Buffer* buf = nullptr; + OP_REQUIRES_OK(ctx, GetBuffer(ctx, def(), &buf)); + core::ScopedUnref scope(buf); + Buffer::Tuple tuple; + for (int i = 0; i < ctx->num_inputs(); ++i) { + tuple.push_back(ctx->input(i)); + } + buf->Put(&tuple); + } +}; + +REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_CPU), StageOp); +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_GPU), StageOp); +#endif + +class UnstageOp : public OpKernel { + public: + explicit UnstageOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + // Using this op in such a way that it blocks forever + // is an error. 
+class UnstageOp : public OpKernel {
+ public:
+  explicit UnstageOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  // Using this op in such a way that it blocks forever is an error.
+  // As such, cancellation is not handled.
+  void Compute(OpKernelContext* ctx) override {
+    Buffer* buf = nullptr;
+    OP_REQUIRES_OK(ctx, GetBuffer(ctx, def(), &buf));
+    core::ScopedUnref scope(buf);
+    Buffer::Tuple tuple;
+    buf->Get(&tuple);
+    OP_REQUIRES(
+        ctx, tuple.size() == ctx->num_outputs(),
+        errors::InvalidArgument("Mismatch stage/unstage: ", tuple.size(),
+                                " vs. ", ctx->num_outputs()));
+    for (int i = 0; i < tuple.size(); ++i) {
+      ctx->set_output(i, tuple[i]);
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_CPU), UnstageOp);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_GPU), UnstageOp);
+#endif
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index 641c991a7e3..cbc44017dcf 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/variable_ops.h"
 
 namespace tensorflow {
 
@@ -292,10 +293,26 @@ struct ApplyCenteredRMSProp {
 
 }  // namespace functor
 
+mutex* GetMutex(OpKernelContext* ctx, int input) {
+  if (ctx->input_dtype(input) == DT_RESOURCE) {
+    Var* var;
+    if (LookupResource(ctx, HandleFromInput(ctx, input), &var).ok()) {
+      return var->mu();
+    } else {
+      ctx->CtxFailureWithWarning(
+          errors::Internal("Invalid variable reference."));
+      return nullptr;
+    }
+  }
+  return ctx->input_ref_mutex(input);
+}
+
 // MaybeLockMutexesInOrder is a helper function to acquire mutexes in address
-// order to mitigate deadlock. Returns a vector of acquired mutexes.
-// Safe to pass duplicates - will only lock each distinct mutex once.
-// If do_lock is false, returns immediately.
+// order to mitigate deadlock. Returns a vector of acquired mutexes. Safe to
+// pass duplicates - will only lock each distinct mutex once. If do_lock is
+// false, returns immediately. Note that this silently doesn't lock mutexes for
+// invalid variable references; in all usages this is followed by GetInputTensor
+// which will signal a failure.
 std::vector<mutex_lock> MaybeLockMutexesInOrder(
     OpKernelContext* ctx, bool do_lock, const std::vector<int>& input_ids) {
   std::vector<mutex_lock> locks;
@@ -305,7 +322,7 @@ std::vector<mutex_lock> MaybeLockMutexesInOrder(
   std::vector<mutex*> mutexes;
   std::vector<int> acquire_order;
   for (auto input : input_ids) {
-    auto* mutex = ctx->input_ref_mutex(input);
+    mutex* mutex = GetMutex(ctx, input);
     // Only lock each mutex once if duplicates exist (n^2 but n is 2 or 3).
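    // (Illustrative aside, not part of the patch.) Why sort by mutex address
    // before locking: if one op locks {a, b} while another locks {b, a}, each
    // can end up holding one mutex while waiting on the other. Sorting both
    // requests by address gives every op the same acquisition order, so that
    // cycle cannot form. Deduplicating first also makes passing the same
    // variable twice (hence the same mutex twice) safe.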
if (std::find(mutexes.begin(), mutexes.end(), mutex) == mutexes.end()) { acquire_order.push_back(input); @@ -316,11 +333,41 @@ std::vector MaybeLockMutexesInOrder( [&mutexes](int a, int b) { return mutexes[a] < mutexes[b]; }); for (auto input : acquire_order) { - locks.emplace_back(*ctx->input_ref_mutex(input)); + mutex* mu = GetMutex(ctx, input); + if (mu != nullptr) { + locks.emplace_back(*mu); + } } return locks; } +Status GetInputTensor(OpKernelContext* ctx, int input, bool lock_held, + Tensor* out) { + if (ctx->input_dtype(input) == DT_RESOURCE) { + Var* var; + if (LookupResource(ctx, HandleFromInput(ctx, input), &var).ok()) { + if (lock_held) { + *out = *var->tensor(); + } else { + mutex_lock ml(*var->mu()); + *out = *var->tensor(); + } + return Status::OK(); + } else { + return errors::Internal("Invalid variable reference."); + } + } + *out = ctx->mutable_input(input, lock_held); + return Status::OK(); +} + +void MaybeForwardRefInputToRefOutput(OpKernelContext* ctx, int input, + int output) { + if (ctx->input_dtype(input) != DT_RESOURCE) { + ctx->forward_ref_input_to_ref_output(input, output); + } +} + template class ApplyGradientDescentOp : public OpKernel { public: @@ -330,7 +377,8 @@ class ApplyGradientDescentOp : public OpKernel { void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); OP_REQUIRES( ctx, var.IsInitialized(), @@ -351,7 +399,7 @@ class ApplyGradientDescentOp : public OpKernel { functor::ApplyGradientDescent()( device, var.flat(), alpha.scalar(), delta.flat()); - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -361,7 +409,11 @@ class ApplyGradientDescentOp : public OpKernel { #define REGISTER_KERNELS(D, T) \ REGISTER_KERNEL_BUILDER( \ Name("ApplyGradientDescent").Device(DEVICE_##D).TypeConstraint("T"), \ - ApplyGradientDescentOp); + ApplyGradientDescentOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceApplyGradientDescent") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T"), \ + ApplyGradientDescentOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); TF_CALL_half(REGISTER_CPU_KERNELS); @@ -406,7 +458,7 @@ class ApplyAdadeltaOp : public OpKernel { void Compute(OpKernelContext* ctx) override { if (use_exclusive_lock_) { - mutex_lock l1(*ctx->input_ref_mutex(0)); + mutex_lock l1(*GetMutex(ctx, 0)); // Don't try to acquire a lock on the second ref as they share the same // mutex. 
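  // (Illustrative aside, not part of the patch.) GetInputTensor, defined
  // above, is the read-side counterpart of GetMutex: for a DT_RESOURCE input
  // it looks up the Var resource and copies out its Tensor, taking the
  // variable's own mutex unless lock_held says the caller already holds it;
  // for a legacy ref input it falls back to ctx->mutable_input(). The copy is
  // cheap because a Tensor is a reference-counted handle to its buffer.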
// @@ -419,16 +471,20 @@ class ApplyAdadeltaOp : public OpKernel { if (!ctx->status().ok()) return; DoCompute(ctx); } - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; void DoValidate(OpKernelContext* ctx) { - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); - Tensor accum_update = ctx->mutable_input(2, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum)); + Tensor accum_update; + OP_REQUIRES_OK(ctx, + GetInputTensor(ctx, 2, use_exclusive_lock_, &accum_update)); OP_REQUIRES( ctx, var.IsInitialized(), @@ -474,9 +530,13 @@ class ApplyAdadeltaOp : public OpKernel { void DoCompute(OpKernelContext* ctx) { const Device& device = ctx->template eigen_device(); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); - Tensor accum_update = ctx->mutable_input(2, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum)); + Tensor accum_update; + OP_REQUIRES_OK(ctx, + GetInputTensor(ctx, 2, use_exclusive_lock_, &accum_update)); const Tensor& lr = ctx->input(3); const Tensor& rho = ctx->input(4); @@ -492,9 +552,12 @@ class ApplyAdadeltaOp : public OpKernel { using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; -#define REGISTER_KERNELS(D, T) \ - REGISTER_KERNEL_BUILDER( \ - Name("ApplyAdadelta").Device(DEVICE_##D).TypeConstraint("T"), \ +#define REGISTER_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("ApplyAdadelta").Device(DEVICE_##D).TypeConstraint("T"), \ + ApplyAdadeltaOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("ResourceApplyAdadelta").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyAdadeltaOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); @@ -536,7 +599,7 @@ class SparseApplyAdadeltaOp : public OpKernel { } void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { - mutex* mu_var = ctx->input_ref_mutex(0); + mutex* mu_var = GetMutex(ctx, 0); // mu_accum is actually the same mutex as mu_var since currently we use a // global mutex. 
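    // (Illustrative aside, not part of the patch.) Note what GetMutex returns
    // in each case: ctx->input_ref_mutex(input) for a legacy ref input, and
    // the mutex owned by the looked-up Var resource for a DT_RESOURCE input.
    // That single dispatch point is what lets one kernel class back both the
    // "Apply*" (ref) and "ResourceApply*" (resource-handle) registrations
    // added in this change.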
// @@ -544,9 +607,14 @@ class SparseApplyAdadeltaOp : public OpKernel { if (use_exclusive_lock_) { mu_var->lock(); } - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor accum_grad = ctx->mutable_input(1, use_exclusive_lock_); - Tensor accum_update = ctx->mutable_input(2, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor accum_grad; + OP_REQUIRES_OK(ctx, + GetInputTensor(ctx, 1, use_exclusive_lock_, &accum_grad)); + Tensor accum_update; + OP_REQUIRES_OK(ctx, + GetInputTensor(ctx, 2, use_exclusive_lock_, &accum_update)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ -642,7 +710,7 @@ class SparseApplyAdadeltaOp : public OpKernel { mu_var->unlock(); } - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -654,6 +722,11 @@ class SparseApplyAdadeltaOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ + SparseApplyAdadeltaOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyAdadelta") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ SparseApplyAdadeltaOp); #define REGISTER_CPU_KERNELS(T) \ REGISTER_KERNELS(T, int32); \ @@ -677,7 +750,8 @@ class ApplyProximalGradientDescentOp : public OpKernel { void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); OP_REQUIRES( ctx, var.IsInitialized(), @@ -710,17 +784,21 @@ class ApplyProximalGradientDescentOp : public OpKernel { device, var.flat(), alpha.scalar(), l1.scalar(), l2.scalar(), delta.flat()); - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; -#define REGISTER_KERNELS(D, T) \ - REGISTER_KERNEL_BUILDER(Name("ApplyProximalGradientDescent") \ - .Device(DEVICE_##D) \ - .TypeConstraint("T"), \ +#define REGISTER_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER(Name("ApplyProximalGradientDescent") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T"), \ + ApplyProximalGradientDescentOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceApplyProximalGradientDescent") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T"), \ ApplyProximalGradientDescentOp); REGISTER_KERNELS(CPU, float); @@ -738,7 +816,8 @@ class SparseApplyProximalGradientDescentOp : public OpKernel { void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()), errors::InvalidArgument("var must be at least 1 dimensional")); @@ -846,18 +925,23 @@ class SparseApplyProximalGradientDescentOp : public OpKernel { } } - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; -#define REGISTER_KERNELS(T, Tindices) \ - REGISTER_KERNEL_BUILDER(Name("SparseApplyProximalGradientDescent") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .TypeConstraint("Tindices"), \ +#define REGISTER_KERNELS(T, Tindices) \ + REGISTER_KERNEL_BUILDER(Name("SparseApplyProximalGradientDescent") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + 
.TypeConstraint("Tindices"), \ + SparseApplyProximalGradientDescentOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyProximalGradientDescent") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ SparseApplyProximalGradientDescentOp); REGISTER_KERNELS(float, int32); @@ -875,8 +959,10 @@ class ApplyAdagradOp : public OpKernel { void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ -905,7 +991,7 @@ class ApplyAdagradOp : public OpKernel { functor::ApplyAdagrad()(device, var.flat(), accum.flat(), lr.scalar(), grad.flat()); - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -915,9 +1001,12 @@ class ApplyAdagradOp : public OpKernel { using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; -#define REGISTER_KERNELS(D, T) \ - REGISTER_KERNEL_BUILDER( \ - Name("ApplyAdagrad").Device(DEVICE_##D).TypeConstraint("T"), \ +#define REGISTER_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("ApplyAdagrad").Device(DEVICE_##D).TypeConstraint("T"), \ + ApplyAdagradOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("ResourceApplyAdagrad").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyAdagradOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); @@ -957,8 +1046,10 @@ class ApplyProximalAdagradOp : public OpKernel { void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ -1004,7 +1095,7 @@ class ApplyProximalAdagradOp : public OpKernel { device, var.flat(), accum.flat(), lr.scalar(), l1.scalar(), l2.scalar(), grad.flat()); - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -1017,7 +1108,11 @@ using GPUDevice = Eigen::GpuDevice; #define REGISTER_KERNELS(D, T) \ REGISTER_KERNEL_BUILDER( \ Name("ApplyProximalAdagrad").Device(DEVICE_##D).TypeConstraint("T"), \ - ApplyProximalAdagradOp); + ApplyProximalAdagradOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceApplyProximalAdagrad") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T"), \ + ApplyProximalAdagradOp); REGISTER_KERNELS(CPU, float); REGISTER_KERNELS(CPU, double); @@ -1053,8 +1148,10 @@ class SparseApplyAdagradOp : public OpKernel { void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ 
-1142,7 +1239,7 @@ class SparseApplyAdagradOp : public OpKernel { } } - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -1154,6 +1251,11 @@ class SparseApplyAdagradOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ + SparseApplyAdagradOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyAdagrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ SparseApplyAdagradOp); #define REGISTER_CPU_KERNELS(T) \ REGISTER_KERNELS(T, int32); \ @@ -1177,8 +1279,10 @@ class SparseApplyProximalAdagradOp : public OpKernel { void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ -1311,18 +1415,23 @@ class SparseApplyProximalAdagradOp : public OpKernel { } } - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; -#define REGISTER_KERNELS(T, Tindices) \ - REGISTER_KERNEL_BUILDER(Name("SparseApplyProximalAdagrad") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .TypeConstraint("Tindices"), \ +#define REGISTER_KERNELS(T, Tindices) \ + REGISTER_KERNEL_BUILDER(Name("SparseApplyProximalAdagrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + SparseApplyProximalAdagradOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyProximalAdagrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ SparseApplyProximalAdagradOp); REGISTER_KERNELS(float, int32); @@ -1340,9 +1449,14 @@ class ApplyAdagradDAOp : public OpKernel { void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor gradient_accum = ctx->mutable_input(1, use_exclusive_lock_); - Tensor gradient_squared_accum = ctx->mutable_input(2, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor gradient_accum; + OP_REQUIRES_OK( + ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &gradient_accum)); + Tensor gradient_squared_accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, + &gradient_squared_accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ -1399,7 +1513,7 @@ class ApplyAdagradDAOp : public OpKernel { global_step.scalar()(), l1.scalar(), l2.scalar(), grad.flat()); - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -1428,9 +1542,14 @@ class SparseApplyAdagradDAOp : public OpKernel { void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor gradient_accum = ctx->mutable_input(1, use_exclusive_lock_); - Tensor gradient_squared_accum = ctx->mutable_input(2, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor 
gradient_accum; + OP_REQUIRES_OK( + ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &gradient_accum)); + Tensor gradient_squared_accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, + &gradient_squared_accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ -1580,7 +1699,7 @@ class SparseApplyAdagradDAOp : public OpKernel { } } - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -1592,6 +1711,11 @@ class SparseApplyAdagradDAOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ + SparseApplyAdagradDAOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyAdagradDA") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ SparseApplyAdagradDAOp); REGISTER_KERNELS(float, int32); @@ -1610,9 +1734,12 @@ class ApplyFtrlOp : public OpKernel { void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); - Tensor linear = ctx->mutable_input(2, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum)); + Tensor linear; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &linear)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ -1677,7 +1804,7 @@ class ApplyFtrlOp : public OpKernel { lr.scalar(), l1.scalar(), l2.scalar(), lr_power.scalar()); - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -1687,9 +1814,12 @@ class ApplyFtrlOp : public OpKernel { using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; -#define REGISTER_KERNELS(D, T) \ - REGISTER_KERNEL_BUILDER( \ - Name("ApplyFtrl").Device(DEVICE_##D).TypeConstraint("T"), \ +#define REGISTER_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("ApplyFtrl").Device(DEVICE_##D).TypeConstraint("T"), \ + ApplyFtrlOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("ResourceApplyFtrl").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyFtrlOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); @@ -1710,9 +1840,12 @@ class SparseApplyFtrlOp : public OpKernel { void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); - Tensor linear = ctx->mutable_input(2, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum)); + Tensor linear; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &linear)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ -1874,18 +2007,23 @@ class SparseApplyFtrlOp : public OpKernel { } } - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; -#define REGISTER_KERNELS(T, Tindices) \ - REGISTER_KERNEL_BUILDER(Name("SparseApplyFtrl") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .TypeConstraint("Tindices"), \ +#define REGISTER_KERNELS(T, Tindices) \ + 
REGISTER_KERNEL_BUILDER(Name("SparseApplyFtrl") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + SparseApplyFtrlOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyFtrl") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ SparseApplyFtrlOp); #define REGISTER_CPU_KERNELS(T) \ REGISTER_KERNELS(T, int32); \ @@ -1909,8 +2047,10 @@ class ApplyMomentumOp : public OpKernel { void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ -1944,7 +2084,7 @@ class ApplyMomentumOp : public OpKernel { functor::ApplyMomentum()(device, var.flat(), accum.flat(), lr.scalar(), grad.flat(), momentum.scalar(), use_nesterov_); - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -1955,9 +2095,12 @@ class ApplyMomentumOp : public OpKernel { using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; -#define REGISTER_KERNELS(D, T) \ - REGISTER_KERNEL_BUILDER( \ - Name("ApplyMomentum").Device(DEVICE_##D).TypeConstraint("T"), \ +#define REGISTER_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("ApplyMomentum").Device(DEVICE_##D).TypeConstraint("T"), \ + ApplyMomentumOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("ResourceApplyMomentum").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyMomentumOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); @@ -2001,8 +2144,10 @@ class SparseApplyMomentumOp : public OpKernel { void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor accum; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ -2072,7 +2217,7 @@ class SparseApplyMomentumOp : public OpKernel { } } - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -2085,6 +2230,11 @@ class SparseApplyMomentumOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("Tindices"), \ + SparseApplyMomentumOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyMomentum") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ SparseApplyMomentumOp); #define REGISTER_CPU_KERNELS(T) \ REGISTER_KERNELS(T, int32); \ @@ -2107,9 +2257,12 @@ class ApplyAdamOp : public OpKernel { void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor m = ctx->mutable_input(1, use_exclusive_lock_); - Tensor v = ctx->mutable_input(2, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor m; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &m)); + Tensor v; + OP_REQUIRES_OK(ctx, 
GetInputTensor(ctx, 2, use_exclusive_lock_, &v)); OP_REQUIRES( ctx, var.IsInitialized(), errors::FailedPrecondition( @@ -2171,7 +2324,7 @@ class ApplyAdamOp : public OpKernel { beta1.scalar(), beta2.scalar(), epsilon.scalar(), grad.flat()); - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -2181,9 +2334,12 @@ class ApplyAdamOp : public OpKernel { using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; -#define REGISTER_KERNELS(D, T) \ - REGISTER_KERNEL_BUILDER( \ - Name("ApplyAdam").Device(DEVICE_##D).TypeConstraint("T"), \ +#define REGISTER_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("ApplyAdam").Device(DEVICE_##D).TypeConstraint("T"), \ + ApplyAdamOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("ResourceApplyAdam").Device(DEVICE_##D).TypeConstraint("T"), \ ApplyAdamOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); @@ -2236,9 +2392,12 @@ class ApplyRMSPropOp : public OpKernel { void Compute(OpKernelContext* ctx) override { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor ms = ctx->mutable_input(1, use_exclusive_lock_); - Tensor mom = ctx->mutable_input(2, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor ms; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &ms)); + Tensor mom; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &mom)); OP_REQUIRES( ctx, var.IsInitialized(), @@ -2294,7 +2453,7 @@ class ApplyRMSPropOp : public OpKernel { rho.scalar(), momentum.scalar(), epsilon.scalar(), grad.flat()); - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -2312,10 +2471,14 @@ class ApplyCenteredRMSPropOp : public OpKernel { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2, 3}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor mg = ctx->mutable_input(1, use_exclusive_lock_); - Tensor ms = ctx->mutable_input(2, use_exclusive_lock_); - Tensor mom = ctx->mutable_input(3, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor mg; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &mg)); + Tensor ms; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &ms)); + Tensor mom; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 3, use_exclusive_lock_, &mom)); OP_REQUIRES( ctx, var.IsInitialized(), @@ -2379,7 +2542,7 @@ class ApplyCenteredRMSPropOp : public OpKernel { device, var.flat(), mg.flat(), ms.flat(), mom.flat(), lr.scalar(), rho.scalar(), momentum.scalar(), epsilon.scalar(), grad.flat()); - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -2395,7 +2558,14 @@ using GPUDevice = Eigen::GpuDevice; ApplyRMSPropOp); \ REGISTER_KERNEL_BUILDER( \ Name("ApplyCenteredRMSProp").Device(DEVICE_##D).TypeConstraint("T"), \ - ApplyCenteredRMSPropOp); + ApplyCenteredRMSPropOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("ResourceApplyRMSProp").Device(DEVICE_##D).TypeConstraint("T"), \ + ApplyRMSPropOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceApplyCenteredRMSProp") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T"), \ + ApplyCenteredRMSPropOp); #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); TF_CALL_half(REGISTER_CPU_KERNELS); @@ -2449,9 +2619,12 @@ class SparseApplyRMSPropOp : public OpKernel { 
void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor ms = ctx->mutable_input(1, use_exclusive_lock_); - Tensor mom = ctx->mutable_input(2, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor ms; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &ms)); + Tensor mom; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &mom)); OP_REQUIRES( ctx, var.IsInitialized(), @@ -2552,7 +2725,7 @@ class SparseApplyRMSPropOp : public OpKernel { } } - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: @@ -2572,10 +2745,14 @@ class SparseApplyCenteredRMSPropOp : public OpKernel { auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2, 3}); - Tensor var = ctx->mutable_input(0, use_exclusive_lock_); - Tensor mg = ctx->mutable_input(1, use_exclusive_lock_); - Tensor ms = ctx->mutable_input(2, use_exclusive_lock_); - Tensor mom = ctx->mutable_input(3, use_exclusive_lock_); + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var)); + Tensor mg; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &mg)); + Tensor ms; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &ms)); + Tensor mom; + OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 3, use_exclusive_lock_, &mom)); OP_REQUIRES( ctx, var.IsInitialized(), @@ -2685,23 +2862,33 @@ class SparseApplyCenteredRMSPropOp : public OpKernel { } } - ctx->forward_ref_input_to_ref_output(0, 0); + MaybeForwardRefInputToRefOutput(ctx, 0, 0); } private: bool use_exclusive_lock_; }; -#define REGISTER_KERNELS(T, Tindices) \ - REGISTER_KERNEL_BUILDER(Name("SparseApplyRMSProp") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .TypeConstraint("Tindices"), \ - SparseApplyRMSPropOp); \ - REGISTER_KERNEL_BUILDER(Name("SparseApplyCenteredRMSProp") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .TypeConstraint("Tindices"), \ +#define REGISTER_KERNELS(T, Tindices) \ + REGISTER_KERNEL_BUILDER(Name("SparseApplyRMSProp") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + SparseApplyRMSPropOp); \ + REGISTER_KERNEL_BUILDER(Name("SparseApplyCenteredRMSProp") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + SparseApplyCenteredRMSPropOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyRMSProp") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + SparseApplyRMSPropOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyCenteredRMSProp") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ SparseApplyCenteredRMSPropOp); REGISTER_KERNELS(Eigen::half, int32); diff --git a/tensorflow/core/lib/core/arena.cc b/tensorflow/core/lib/core/arena.cc index a7148ed1c75..ef1b9a5468c 100644 --- a/tensorflow/core/lib/core/arena.cc +++ b/tensorflow/core/lib/core/arena.cc @@ -25,6 +25,7 @@ limitations under the License. 
#include +#include #include #include "tensorflow/core/platform/logging.h" @@ -48,7 +49,8 @@ Arena::Arena(const size_t block_size) overflow_blocks_(NULL) { assert(block_size > kDefaultAlignment); - first_blocks_[0].mem = reinterpret_cast(malloc(block_size_)); + first_blocks_[0].mem = + reinterpret_cast(port::AlignedMalloc(block_size_, sizeof(void*))); first_blocks_[0].size = block_size_; @@ -59,7 +61,9 @@ Arena::~Arena() { FreeBlocks(); assert(overflow_blocks_ == NULL); // FreeBlocks() should do that // The first X blocks stay allocated always by default. Delete them now. - for (size_t i = 0; i < blocks_alloced_; ++i) free(first_blocks_[i].mem); + for (size_t i = 0; i < blocks_alloced_; ++i) { + port::AlignedFree(first_blocks_[i].mem); + } } // Returns true iff it advances freestart_ to the first position @@ -162,8 +166,11 @@ Arena::AllocatedBlock* Arena::AllocNewBlock(const size_t block_size, // Must be a multiple of kDefaultAlignment, unless requested // alignment is 1, in which case we don't care at all. - const uint32 adjusted_alignment = + uint32 adjusted_alignment = (alignment > 1 ? LeastCommonMultiple(alignment, kDefaultAlignment) : 1); + // Required minimum alignment for port::AlignedMalloc(). + adjusted_alignment = + std::max(adjusted_alignment, static_cast(sizeof(void*))); CHECK_LE(adjusted_alignment, static_cast(1 << 20)) << "Alignment on boundaries greater than 1MB not supported."; @@ -171,16 +178,12 @@ Arena::AllocatedBlock* Arena::AllocNewBlock(const size_t block_size, // If block_size > alignment we force block_size to be a multiple // of alignment; if block_size < alignment we make no adjustment. size_t adjusted_block_size = block_size; - if (adjusted_alignment > 1) { - if (adjusted_block_size > adjusted_alignment) { - const uint32 excess = adjusted_block_size % adjusted_alignment; - adjusted_block_size += (excess > 0 ? adjusted_alignment - excess : 0); - } - block->mem = reinterpret_cast( - port::aligned_malloc(adjusted_block_size, adjusted_alignment)); - } else { - block->mem = reinterpret_cast(malloc(adjusted_block_size)); + if (adjusted_block_size > adjusted_alignment) { + const uint32 excess = adjusted_block_size % adjusted_alignment; + adjusted_block_size += (excess > 0 ? adjusted_alignment - excess : 0); } + block->mem = reinterpret_cast( + port::AlignedMalloc(adjusted_block_size, adjusted_alignment)); block->size = adjusted_block_size; CHECK(NULL != block->mem) << "block_size=" << block_size << " adjusted_block_size=" << adjusted_block_size @@ -242,7 +245,7 @@ void* Arena::GetMemoryFallback(const size_t size, const int alignment) { void Arena::FreeBlocks() { for (size_t i = 1; i < blocks_alloced_; ++i) { // keep first block alloced - free(first_blocks_[i].mem); + port::AlignedFree(first_blocks_[i].mem); first_blocks_[i].mem = NULL; first_blocks_[i].size = 0; } @@ -250,7 +253,7 @@ void Arena::FreeBlocks() { if (overflow_blocks_ != NULL) { std::vector::iterator it; for (it = overflow_blocks_->begin(); it != overflow_blocks_->end(); ++it) { - free(it->mem); + port::AlignedFree(it->mem); } delete overflow_blocks_; // These should be used very rarely overflow_blocks_ = NULL; diff --git a/tensorflow/core/lib/gtl/inlined_vector.h b/tensorflow/core/lib/gtl/inlined_vector.h index fc439f9eb66..d6e5d9effa7 100644 --- a/tensorflow/core/lib/gtl/inlined_vector.h +++ b/tensorflow/core/lib/gtl/inlined_vector.h @@ -45,6 +45,7 @@ limitations under the License. 
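One detail of the arena.cc change above that is easy to miss: port::AlignedMalloc-style allocators (posix_memalign, for instance) typically require the alignment to be a power of two no smaller than sizeof(void*), which is why AllocNewBlock now clamps adjusted_alignment. A standalone sketch of the size/alignment rounding it performs (illustrative; AdjustedBlockSize is not a name from the patch, and std::lcm stands in for the patch's LeastCommonMultiple helper):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <numeric>  // std::lcm (C++17)

// Mirrors Arena::AllocNewBlock: satisfy both the caller's alignment and the
// arena default, clamp to the allocator's minimum, then round the block size
// up to a multiple of the chosen alignment (only when the size already
// exceeds the alignment, matching the code above).
size_t AdjustedBlockSize(size_t block_size, uint32_t alignment,
                         uint32_t default_alignment) {
  uint32_t adjusted =
      alignment > 1 ? std::lcm(alignment, default_alignment) : 1;
  adjusted = std::max<uint32_t>(adjusted, sizeof(void*));
  size_t adjusted_size = block_size;
  if (adjusted_size > adjusted) {
    const size_t excess = adjusted_size % adjusted;
    adjusted_size += excess > 0 ? adjusted - excess : 0;
  }
  return adjusted_size;
}

int main() {
  // e.g. lcm(16, 4) = 16, and 1000 rounds up to the next multiple, 1008.
  return AdjustedBlockSize(1000, 16, 4) == 1008 ? 0 : 1;
}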
#include "tensorflow/core/lib/gtl/manual_constructor.h" #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/mem.h" #include "tensorflow/core/platform/types.h" #include // NOLINT(build/include_order) @@ -353,7 +354,7 @@ class InlinedVector { size_t n = size(); Destroy(base, n); if (!is_inline()) { - free(base); + port::Free(base); } } @@ -434,7 +435,7 @@ class InlinedVector { } T* src = data(); - T* dst = static_cast(malloc(target * sizeof(T))); + T* dst = static_cast(port::Malloc(target * sizeof(T))); // Need to copy elem before discarding src since it might alias src. InitType{}(dst + s, std::forward(args)...); diff --git a/tensorflow/core/lib/gtl/manual_constructor.h b/tensorflow/core/lib/gtl/manual_constructor.h index 8f041a13538..0a76e0962e6 100644 --- a/tensorflow/core/lib/gtl/manual_constructor.h +++ b/tensorflow/core/lib/gtl/manual_constructor.h @@ -30,7 +30,7 @@ limitations under the License. #include #include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/mem.h" // For aligned_malloc/aligned_free +#include "tensorflow/core/platform/mem.h" namespace tensorflow { namespace gtl { @@ -127,9 +127,9 @@ class ManualConstructor { // Support users creating arrays of ManualConstructor<>s. This ensures that // the array itself has the correct alignment. static void* operator new[](size_t size) { - return port::aligned_malloc(size, TF_LIB_GTL_ALIGN_OF(Type)); + return port::AlignedMalloc(size, TF_LIB_GTL_ALIGN_OF(Type)); } - static void operator delete[](void* mem) { port::aligned_free(mem); } + static void operator delete[](void* mem) { port::AlignedFree(mem); } inline Type* get() { return reinterpret_cast(space_); } inline const Type* get() const { diff --git a/tensorflow/core/ops/compat/ops_history.v0.pbtxt b/tensorflow/core/ops/compat/ops_history.v0.pbtxt index 1c4c52be975..b9589d1c6fa 100644 --- a/tensorflow/core/ops/compat/ops_history.v0.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v0.pbtxt @@ -24830,6 +24830,1318 @@ op { } } } +op { + name: "ResourceApplyAdadelta" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "accum" + type: DT_RESOURCE + } + input_arg { + name: "accum_update" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "rho" + type_attr: "T" + } + input_arg { + name: "epsilon" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceApplyAdagrad" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "accum" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: 
"ResourceApplyAdagradDA" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "gradient_accumulator" + type: DT_RESOURCE + } + input_arg { + name: "gradient_squared_accumulator" + type: DT_RESOURCE + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "l1" + type_attr: "T" + } + input_arg { + name: "l2" + type_attr: "T" + } + input_arg { + name: "global_step" + type: DT_INT64 + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceApplyAdam" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "m" + type: DT_RESOURCE + } + input_arg { + name: "v" + type: DT_RESOURCE + } + input_arg { + name: "beta1_power" + type_attr: "T" + } + input_arg { + name: "beta2_power" + type_attr: "T" + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "beta1" + type_attr: "T" + } + input_arg { + name: "beta2" + type_attr: "T" + } + input_arg { + name: "epsilon" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceApplyCenteredRMSProp" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "mg" + type: DT_RESOURCE + } + input_arg { + name: "ms" + type: DT_RESOURCE + } + input_arg { + name: "mom" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "rho" + type_attr: "T" + } + input_arg { + name: "momentum" + type_attr: "T" + } + input_arg { + name: "epsilon" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceApplyFtrl" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "accum" + type: DT_RESOURCE + } + input_arg { + name: "linear" + type: DT_RESOURCE + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "l1" + type_attr: "T" + } + input_arg { + name: "l2" + type_attr: "T" + } + input_arg { + name: "lr_power" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: 
"use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceApplyGradientDescent" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "alpha" + type_attr: "T" + } + input_arg { + name: "delta" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceApplyMomentum" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "accum" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "momentum" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } + attr { + name: "use_nesterov" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceApplyProximalAdagrad" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "accum" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "l1" + type_attr: "T" + } + input_arg { + name: "l2" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceApplyProximalGradientDescent" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "alpha" + type_attr: "T" + } + input_arg { + name: "l1" + type_attr: "T" + } + input_arg { + name: "l2" + type_attr: "T" + } + input_arg { + name: "delta" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceApplyRMSProp" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "ms" + type: DT_RESOURCE + } + input_arg { + name: "mom" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "rho" + type_attr: "T" + } + input_arg { + name: "momentum" + type_attr: "T" + } + input_arg { + name: "epsilon" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 
+ type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceSparseApplyAdadelta" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "accum" + type: DT_RESOURCE + } + input_arg { + name: "accum_update" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "rho" + type_attr: "T" + } + input_arg { + name: "epsilon" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceSparseApplyAdagrad" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "accum" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceSparseApplyAdagradDA" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "gradient_accumulator" + type: DT_RESOURCE + } + input_arg { + name: "gradient_squared_accumulator" + type: DT_RESOURCE + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "l1" + type_attr: "T" + } + input_arg { + name: "l2" + type_attr: "T" + } + input_arg { + name: "global_step" + type: DT_INT64 + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceSparseApplyCenteredRMSProp" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "mg" + type: DT_RESOURCE + } + input_arg { + name: "ms" + type: DT_RESOURCE + } + input_arg { + name: "mom" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "rho" + type_attr: "T" + } + input_arg { + name: "momentum" + 
type_attr: "T" + } + input_arg { + name: "epsilon" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceSparseApplyFtrl" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "accum" + type: DT_RESOURCE + } + input_arg { + name: "linear" + type: DT_RESOURCE + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "l1" + type_attr: "T" + } + input_arg { + name: "l2" + type_attr: "T" + } + input_arg { + name: "lr_power" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceSparseApplyMomentum" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "accum" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + input_arg { + name: "momentum" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } + attr { + name: "use_nesterov" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceSparseApplyProximalAdagrad" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "accum" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "l1" + type_attr: "T" + } + input_arg { + name: "l2" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: 
DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceSparseApplyProximalGradientDescent" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "alpha" + type_attr: "T" + } + input_arg { + name: "l1" + type_attr: "T" + } + input_arg { + name: "l2" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} +op { + name: "ResourceSparseApplyRMSProp" + input_arg { + name: "var" + type: DT_RESOURCE + } + input_arg { + name: "ms" + type: DT_RESOURCE + } + input_arg { + name: "mom" + type: DT_RESOURCE + } + input_arg { + name: "lr" + type_attr: "T" + } + input_arg { + name: "rho" + type_attr: "T" + } + input_arg { + name: "momentum" + type_attr: "T" + } + input_arg { + name: "epsilon" + type_attr: "T" + } + input_arg { + name: "grad" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + } +} op { name: "Restore" input_arg { @@ -32556,6 +33868,34 @@ op { } } } +op { + name: "Stage" + input_arg { + name: "values" + type_list_attr: "dtypes" + } + attr { + name: "dtypes" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "container" + type: "string" + default_value { + s: "" + } + } + attr { + name: "shared_name" + type: "string" + default_value { + s: "" + } + } + is_stateful: true +} op { name: "StopGradient" input_arg { @@ -37032,6 +38372,34 @@ op { } } } +op { + name: "Unstage" + output_arg { + name: "values" + type_list_attr: "dtypes" + } + attr { + name: "dtypes" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "container" + type: "string" + default_value { + s: "" + } + } + attr { + name: "shared_name" + type: "string" + default_value { + s: "" + } + } + is_stateful: true +} op { name: "Variable" output_arg { diff --git a/tensorflow/core/ops/data_flow_ops.cc b/tensorflow/core/ops/data_flow_ops.cc index ea24a0a16f2..54e766e8e9c 100644 --- a/tensorflow/core/ops/data_flow_ops.cc +++ b/tensorflow/core/ops/data_flow_ops.cc @@ -2180,4 +2180,35 @@ Delete the tensor specified by its handle in the session. handle: The handle for a tensor stored in the session state. 
 )doc");
 
+REGISTER_OP("Stage")
+    .Input("values: dtypes")
+    .Attr("dtypes: list(type)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetShapeFn(shape_inference::UnknownShape)
+    .SetIsStateful()
+    .Doc(R"doc(
+Stage values similar to a lightweight Enqueue. The basic functionality of this
+Op is similar to a queue with many fewer capabilities and options. This Op is
+optimized for performance.
+
+values: a list of tensors
+container: If non-empty, this queue is placed in the given container. Otherwise,
+  a default container is used.
+shared_name: It is necessary to match this name to the matching Unstage Op.
+)doc");
+
+REGISTER_OP("Unstage")
+    .Output("values: dtypes")
+    .Attr("dtypes: list(type)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetShapeFn(shape_inference::UnknownShape)
+    .SetIsStateful()
+    .Doc(R"doc(
+Op is similar to a lightweight Dequeue. The basic functionality is similar to
+dequeue with many fewer capabilities and options. This Op is optimized for
+performance.
+)doc");
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index dd26b30d32f..65ad47e7b73 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -15933,6 +15933,1513 @@ op {
     }
   }
   summary: "Computes the gradient of nearest neighbor interpolation."
 }
+op {
+  name: "ResourceApplyAdadelta"
+  input_arg {
+    name: "var"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum_update"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    description: "Decay factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    description: "Constant factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    description: "The gradient."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If True, updating of the var, accum and update_accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention."
+  }
+  summary: "Update \'*var\' according to the adadelta scheme."
+  description: "accum = rho() * accum + (1 - rho()) * grad.square();\nupdate = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;\nupdate_accum = rho() * update_accum + (1 - rho()) * update.square();\nvar -= update;"
+}
+op {
+  name: "ResourceApplyAdagrad"
+  input_arg {
+    name: "var"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    description: "The gradient."
+ type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention." + } + summary: "Update \'*var\' according to the adagrad scheme." + description: "accum += grad * grad\nvar -= lr * grad * (1 / sqrt(accum))" +} +op { + name: "ResourceApplyAdagradDA" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "gradient_accumulator" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "gradient_squared_accumulator" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "lr" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l1" + description: "L1 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l2" + description: "L2 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "global_step" + description: "Training step number. Must be a scalar." + type: DT_INT64 + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention." + } + summary: "Update \'*var\' according to the proximal adagrad scheme." +} +op { + name: "ResourceApplyAdam" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "m" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "v" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "beta1_power" + description: "Must be a scalar." + type_attr: "T" + } + input_arg { + name: "beta2_power" + description: "Must be a scalar." + type_attr: "T" + } + input_arg { + name: "lr" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "beta1" + description: "Momentum factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "beta2" + description: "Momentum factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "epsilon" + description: "Ridge term. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." 
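The ResourceApplyAdagrad entry a little earlier in this hunk documents the update as accum += grad * grad followed by var -= lr * grad * (1 / sqrt(accum)). A minimal NumPy sketch of that dense step (the helper name and in-place convention are mine; the real kernel operates on resource handles and honors use_locking):

```python
import numpy as np

def apply_adagrad(var, accum, lr, grad):
    # accum += grad * grad; var -= lr * grad * (1 / sqrt(accum))
    accum += grad * grad
    var -= lr * grad * (1.0 / np.sqrt(accum))
    return var, accum

var = np.array([1.0, 2.0])
accum = np.array([0.1, 0.1])
print(apply_adagrad(var, accum, lr=0.1, grad=np.array([0.5, -0.3])))
```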
+ type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, updating of the var, m, and v tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention." + } + summary: "Update \'*var\' according to the Adam algorithm." + description: "lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)\nm_t <- beta1 * m_{t-1} + (1 - beta1) * g_t\nv_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t\nvariable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)" +} +op { + name: "ResourceApplyCenteredRMSProp" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "mg" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "ms" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "mom" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "lr" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "rho" + description: "Decay rate. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "momentum" + type_attr: "T" + } + input_arg { + name: "epsilon" + description: "Ridge term. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, updating of the var, mg, ms, and mom tensors is\nprotected by a lock; otherwise the behavior is undefined, but may exhibit less\ncontention." + } + summary: "Update \'*var\' according to the centered RMSProp algorithm." + description: "The centered RMSProp algorithm uses an estimate of the centered second moment\n(i.e., the variance) for normalization, as opposed to regular RMSProp, which\nuses the (uncentered) second moment. This often helps with training, but is\nslightly more expensive in terms of computation and memory.\n\nNote that in dense implementation of this algorithm, mg, ms, and mom will\nupdate even if the grad is zero, but in this sparse implementation, mg, ms,\nand mom will not update in iterations during which the grad is zero.\n\nmean_square = decay * mean_square + (1-decay) * gradient ** 2\nmean_grad = decay * mean_grad + (1-decay) * gradient\n\nDelta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)\n\nmg <- rho * mg_{t-1} + (1-rho) * grad\nms <- rho * ms_{t-1} + (1-rho) * grad * grad\nmom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)\nvar <- var - mom" +} +op { + name: "ResourceApplyFtrl" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "accum" + description: "Should be from a Variable()." 
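The ResourceApplyAdam entry above carries the four Adam formulas, with beta1_power and beta2_power (the betas raised to the step count) supplied as scalar inputs rather than computed internally. A NumPy sketch that follows those formulas literally (illustrative only, no locking or resource handling):

```python
import numpy as np

def apply_adam(var, m, v, beta1_power, beta2_power, lr,
               beta1, beta2, epsilon, grad):
    # lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
    lr_t = lr * np.sqrt(1 - beta2_power) / (1 - beta1_power)
    m[:] = beta1 * m + (1 - beta1) * grad          # m_t
    v[:] = beta2 * v + (1 - beta2) * grad * grad   # v_t
    var -= lr_t * m / (np.sqrt(v) + epsilon)
    return var

var = np.ones(3); m = np.zeros(3); v = np.zeros(3)
apply_adam(var, m, v, beta1_power=0.9, beta2_power=0.999, lr=0.001,
           beta1=0.9, beta2=0.999, epsilon=1e-8,
           grad=np.array([0.1, -0.2, 0.3]))
```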
+ type: DT_RESOURCE + } + input_arg { + name: "linear" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "lr" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l1" + description: "L1 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l2" + description: "L2 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "lr_power" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention." + } + summary: "Update \'*var\' according to the Ftrl-proximal scheme." + description: "accum_new = accum + grad * grad\nlinear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new" +} +op { + name: "ResourceApplyGradientDescent" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "alpha" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "delta" + description: "The change." + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention." + } + summary: "Update \'*var\' by subtracting \'alpha\' * \'delta\' from it." +} +op { + name: "ResourceApplyMomentum" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "accum" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "lr" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "momentum" + description: "Momentum. Must be a scalar."
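The Ftrl-proximal entry above spells out its update in five lines of pseudo-code. A NumPy transcription of exactly those lines, assuming strictly positive accumulators and the usual negative lr_power (e.g. -0.5); this is a sketch of the documented math, not the kernel:

```python
import numpy as np

def apply_ftrl(var, accum, linear, grad, lr, l1, l2, lr_power):
    accum_new = accum + grad * grad
    linear += grad + (accum_new**(-lr_power) - accum**(-lr_power)) / lr * var
    quadratic = 1.0 / (accum_new**lr_power * lr) + 2 * l2
    # var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
    var[:] = np.where(np.abs(linear) > l1,
                      (np.sign(linear) * l1 - linear) / quadratic, 0.0)
    accum[:] = accum_new
    return var, accum, linear

var = np.zeros(2); accum = np.full(2, 0.1); linear = np.zeros(2)
apply_ftrl(var, accum, linear, grad=np.array([0.4, -0.6]),
           lr=0.1, l1=0.01, l2=0.1, lr_power=-0.5)
```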
+ type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention." + } + attr { + name: "use_nesterov" + type: "bool" + default_value { + b: false + } + description: "If `True`, the tensor passed to compute grad will be\nvar - lr * momentum * accum, so in the end, the var you get is actually\nvar - lr * momentum * accum." + } + summary: "Update \'*var\' according to the momentum scheme." + description: "Set use_nesterov = True if you want to use Nesterov momentum.\n\naccum = accum * momentum + grad\nvar -= lr * accum" +} +op { + name: "ResourceApplyProximalAdagrad" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "accum" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "lr" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l1" + description: "L1 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l2" + description: "L2 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention." + } + summary: "Update \'*var\' and \'*accum\' according to FOBOS with Adagrad learning rate." + description: "accum += grad * grad\nprox_v = var - lr * grad * (1 / sqrt(accum))\nvar = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}" +} +op { + name: "ResourceApplyProximalGradientDescent" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "alpha" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l1" + description: "L1 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l2" + description: "L2 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "delta" + description: "The change."
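The proximal-adagrad (FOBOS with Adagrad learning rate) entry above combines the adagrad accumulator with a soft-threshold step. A NumPy sketch of the documented formulas (helper name is mine; the max{., 0} clamp is what implements the L1 shrinkage):

```python
import numpy as np

def apply_proximal_adagrad(var, accum, lr, l1, l2, grad):
    accum += grad * grad
    prox_v = var - lr * grad * (1.0 / np.sqrt(accum))
    # var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1, 0}
    var[:] = (np.sign(prox_v) / (1 + lr * l2)
              * np.maximum(np.abs(prox_v) - lr * l1, 0.0))
    return var, accum
```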
+ type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If True, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention." + } + summary: "Update \'*var\' as FOBOS algorithm with fixed learning rate." + description: "prox_v = var - alpha * delta\nvar = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}" +} +op { + name: "ResourceApplyRMSProp" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "ms" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "mom" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "lr" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "rho" + description: "Decay rate. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "momentum" + type_attr: "T" + } + input_arg { + name: "epsilon" + description: "Ridge term. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, updating of the var, ms, and mom tensors is protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention." + } + summary: "Update \'*var\' according to the RMSProp algorithm." + description: "Note that in dense implementation of this algorithm, ms and mom will\nupdate even if the grad is zero, but in this sparse implementation, ms\nand mom will not update in iterations during which the grad is zero.\n\nmean_square = decay * mean_square + (1-decay) * gradient ** 2\nDelta = learning_rate * gradient / sqrt(mean_square + epsilon)\n\nms <- rho * ms_{t-1} + (1-rho) * grad * grad\nmom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)\nvar <- var - mom" +} +op { + name: "ResourceSparseApplyAdadelta" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "accum" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "accum_update" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "lr" + description: "Learning rate. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "rho" + description: "Decay factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "epsilon" + description: "Constant factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "indices" + description: "A vector of indices into the first dimension of var and accum."
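The RMSProp entry above gives the uncentered update in three lines. A NumPy sketch following those lines (sketch only; the kernel additionally supports locking and resource inputs):

```python
import numpy as np

def apply_rms_prop(var, ms, mom, lr, rho, momentum, epsilon, grad):
    # ms <- rho * ms_{t-1} + (1-rho) * grad * grad
    ms[:] = rho * ms + (1 - rho) * grad * grad
    # mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
    mom[:] = momentum * mom + lr * grad / np.sqrt(ms + epsilon)
    var -= mom  # var <- var - mom
    return var
```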
+ type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention." + } + summary: "Update relevant entries in \'*var\' and \'*accum\' according to the adadelta scheme." +} +op { + name: "ResourceSparseApplyAdagrad" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "accum" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "lr" + description: "Learning rate. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "indices" + description: "A vector of indices into the first dimension of var and accum." + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention." + } + summary: "Update relevant entries in \'*var\' and \'*accum\' according to the adagrad scheme." + description: "That is for rows we have grad for, we update var and accum as follows:\naccum += grad * grad\nvar -= lr * grad * (1 / sqrt(accum))" +} +op { + name: "ResourceSparseApplyAdagradDA" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "gradient_accumulator" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "gradient_squared_accumulator" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "indices" + description: "A vector of indices into the first dimension of var and accum." + type_attr: "Tindices" + } + input_arg { + name: "lr" + description: "Learning rate. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l1" + description: "L1 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l2" + description: "L2 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "global_step" + description: "Training step number. Must be a scalar."
+ type: DT_INT64 + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention." + } + summary: "Update entries in \'*var\' and \'*accum\' according to the proximal adagrad scheme." +} +op { + name: "ResourceSparseApplyCenteredRMSProp" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "mg" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "ms" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "mom" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "lr" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "rho" + description: "Decay rate. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "momentum" + type_attr: "T" + } + input_arg { + name: "epsilon" + description: "Ridge term. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "indices" + description: "A vector of indices into the first dimension of var, ms and mom." + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, updating of the var, mg, ms, and mom tensors is\nprotected by a lock; otherwise the behavior is undefined, but may exhibit less\ncontention." + } + summary: "Update \'*var\' according to the centered RMSProp algorithm." + description: "The centered RMSProp algorithm uses an estimate of the centered second moment\n(i.e., the variance) for normalization, as opposed to regular RMSProp, which\nuses the (uncentered) second moment. 
This often helps with training, but is\nslightly more expensive in terms of computation and memory.\n\nNote that in dense implementation of this algorithm, mg, ms, and mom will\nupdate even if the grad is zero, but in this sparse implementation, mg, ms,\nand mom will not update in iterations during which the grad is zero.\n\nmean_square = decay * mean_square + (1-decay) * gradient ** 2\nmean_grad = decay * mean_grad + (1-decay) * gradient\nDelta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)\n\nmg <- rho * mg_{t-1} + (1-rho) * grad\nms <- rho * ms_{t-1} + (1-rho) * grad * grad\nmom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)\nvar <- var - mom" +} +op { + name: "ResourceSparseApplyFtrl" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "accum" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "linear" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "indices" + description: "A vector of indices into the first dimension of var and accum." + type_attr: "Tindices" + } + input_arg { + name: "lr" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l1" + description: "L1 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l2" + description: "L2 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "lr_power" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention." + } + summary: "Update relevant entries in \'*var\' according to the Ftrl-proximal scheme." + description: "That is for rows we have grad for, we update var, accum and linear as follows:\naccum_new = accum + grad * grad\nlinear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new" +} +op { + name: "ResourceSparseApplyMomentum" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "accum" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "lr" + description: "Learning rate. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "indices" + description: "A vector of indices into the first dimension of var and accum." + type_attr: "Tindices" + } + input_arg { + name: "momentum" + description: "Momentum. Must be a scalar."
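All of the sparse variants in this hunk share one pattern, captured by the recurring phrase "for rows we have grad for": grad is a dense block of rows, and row k of grad applies only to row indices[k] of the variables; unindexed rows are left untouched. A NumPy sketch of that pattern, using the adagrad update as the per-row rule (the loop and names are illustrative, not the kernel):

```python
import numpy as np

def sparse_apply_adagrad(var, accum, lr, grad, indices):
    # grad has shape [len(indices), ...]; grad[k] updates var[indices[k]].
    for k, i in enumerate(indices):
        g = grad[k]
        accum[i] += g * g
        var[i] -= lr * g * (1.0 / np.sqrt(accum[i]))
    return var, accum

var = np.ones((4, 2)); accum = np.full((4, 2), 0.1)
sparse_apply_adagrad(var, accum, lr=0.1,
                     grad=np.array([[0.5, -0.3]]), indices=[2])
```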
+ type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention." + } + attr { + name: "use_nesterov" + type: "bool" + default_value { + b: false + } + description: "If `True`, the tensor passed to compute grad will be\nvar - lr * momentum * accum, so in the end, the var you get is actually\nvar - lr * momentum * accum." + } + summary: "Update relevant entries in \'*var\' and \'*accum\' according to the momentum scheme." + description: "Set use_nesterov = True if you want to use Nesterov momentum.\n\nThat is for rows we have grad for, we update var and accum as follows:\n\naccum = accum * momentum + grad\nvar -= lr * accum" +} +op { + name: "ResourceSparseApplyProximalAdagrad" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "accum" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "lr" + description: "Learning rate. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l1" + description: "L1 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l2" + description: "L2 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "indices" + description: "A vector of indices into the first dimension of var and accum." + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention." + } + summary: "Sparse update entries in \'*var\' and \'*accum\' according to FOBOS algorithm." + description: "That is for rows we have grad for, we update var and accum as follows:\naccum += grad * grad\nprox_v = var\nprox_v -= lr * grad * (1 / sqrt(accum))\nvar = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}" +} +op { + name: "ResourceSparseApplyProximalGradientDescent" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "alpha" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l1" + description: "L1 regularization. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "l2" + description: "L2 regularization. Must be a scalar." 
+ type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "indices" + description: "A vector of indices into the first dimension of var and accum." + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If True, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention." + } + summary: "Sparse update \'*var\' as FOBOS algorithm with fixed learning rate." + description: "That is for rows we have grad for, we update var as follows:\nprox_v = var - alpha * grad\nvar = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}" +} +op { + name: "ResourceSparseApplyRMSProp" + input_arg { + name: "var" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "ms" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "mom" + description: "Should be from a Variable()." + type: DT_RESOURCE + } + input_arg { + name: "lr" + description: "Scaling factor. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "rho" + description: "Decay rate. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "momentum" + type_attr: "T" + } + input_arg { + name: "epsilon" + description: "Ridge term. Must be a scalar." + type_attr: "T" + } + input_arg { + name: "grad" + description: "The gradient." + type_attr: "T" + } + input_arg { + name: "indices" + description: "A vector of indices into the first dimension of var, ms and mom." + type_attr: "Tindices" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "use_locking" + type: "bool" + default_value { + b: false + } + description: "If `True`, updating of the var, ms, and mom tensors is protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention." + } + summary: "Update \'*var\' according to the RMSProp algorithm." + description: "Note that in dense implementation of this algorithm, ms and mom will\nupdate even if the grad is zero, but in this sparse implementation, ms\nand mom will not update in iterations during which the grad is zero.\n\nmean_square = decay * mean_square + (1-decay) * gradient ** 2\nDelta = learning_rate * gradient / sqrt(mean_square + epsilon)\n\nms <- rho * ms_{t-1} + (1-rho) * grad * grad\nmom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)\nvar <- var - mom" +} op { name: "Restore" input_arg { @@ -21028,6 +22535,39 @@ op { } summary: "Push an element onto the stack." 
} +op { + name: "Stage" + input_arg { + name: "values" + description: "a list of tensors" + type_list_attr: "dtypes" + } + attr { + name: "dtypes" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "container" + type: "string" + default_value { + s: "" + } + description: "If non-empty, this queue is placed in the given container. Otherwise,\na default container is used." + } + attr { + name: "shared_name" + type: "string" + default_value { + s: "" + } + description: "It is necessary to match this name to the matching Unstage Op." + } + summary: "Stage values similar to a lightweight Enqueue." + description: "The basic functionality of this Op is similar to a queue with many fewer\ncapabilities and options. This Op is optimized for performance." + is_stateful: true +} op { name: "StopGradient" input_arg { @@ -23604,6 +25144,36 @@ op { summary: "Computes the sum along segments of a tensor." description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n`(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such\nthat `segment_ids[j...] == i`. Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\nrange of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n
<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/UnsortedSegmentSum.png\" alt>\n</div>
" } +op { + name: "Unstage" + output_arg { + name: "values" + type_list_attr: "dtypes" + } + attr { + name: "dtypes" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "container" + type: "string" + default_value { + s: "" + } + } + attr { + name: "shared_name" + type: "string" + default_value { + s: "" + } + } + summary: "Op is similar to a lightweight Dequeue. The basic funtionality is similar to" + description: "dequeue with many fewer capabilities and options. This Op is optimized for\nperformance." + is_stateful: true +} op { name: "Variable" output_arg { diff --git a/tensorflow/core/ops/set_ops.cc b/tensorflow/core/ops/set_ops.cc index 3da83ddae24..fad70072071 100644 --- a/tensorflow/core/ops/set_ops.cc +++ b/tensorflow/core/ops/set_ops.cc @@ -64,24 +64,20 @@ REGISTER_OP("DenseToDenseSetOperation") } // The following should stay in sync with `ComputeDenseToDense` shape // assertions in kernels/set_kernels.cc. - // Dimension n contains the set values to be compared, so ranks and the - // first n-1 dimensions of inputs and output must match. + // Dimension n contains the set values to be compared, so ranks must be + // >= 2, and the first n-1 dimensions of inputs and output must be + // compatible. DimensionHandle output_rank; ShapeHandle input0_shape = c->input(0); + TF_RETURN_IF_ERROR(c->WithRankAtLeast(input0_shape, 2, &input0_shape)); if (c->RankKnown(input0_shape)) { const int32 input0_rank = c->Rank(input0_shape); - if (input0_rank < 2) { - return errors::InvalidArgument("Input 0, expected rank >= 2, got ", - input0_rank, "."); - } ShapeHandle input1_shape = c->input(1); + TF_RETURN_IF_ERROR( + c->WithRank(input1_shape, input0_rank, &input1_shape)); if (c->RankKnown(input1_shape)) { + // If both ranks are specified, the first n-1 dims must be compatible. const int32 rank = c->Rank(input1_shape); - if (input0_rank != rank) { - return errors::InvalidArgument("Ranks do not match: input 0 ", - input0_rank, ", input 1 ", rank, - "."); - } ShapeHandle group0_shape; TF_RETURN_IF_ERROR( c->Subshape(input0_shape, 0, rank - 1, &group0_shape)); @@ -95,28 +91,16 @@ REGISTER_OP("DenseToDenseSetOperation") output_rank = c->MakeDim(input0_rank); } else { ShapeHandle input1_shape = c->input(1); + TF_RETURN_IF_ERROR(c->WithRankAtLeast(input1_shape, 2, &input1_shape)); if (c->RankKnown(input1_shape)) { - const int32 input1_rank = c->Rank(input1_shape); - if (input1_rank < 2) { - return errors::InvalidArgument("Input 0, expected rank >= 2, got ", - input1_rank, "."); - } - output_rank = c->MakeDim(input1_rank); + output_rank = c->MakeDim(c->Rank(input1_shape)); } else { output_rank = c->UnknownDim(); } } - DimensionHandle output_num_elements = c->Dim(input0_shape, 0); - if (!c->ValueKnown(output_num_elements)) { - ShapeHandle input1_shape = c->input(1); - output_num_elements = c->Dim(input1_shape, 0); - if (!c->ValueKnown(output_num_elements)) { - output_num_elements = c->UnknownDim(); - } - } - c->set_output(0, c->Matrix(output_num_elements, output_rank)); - c->set_output(1, c->Vector(output_num_elements)); + c->set_output(0, c->Matrix(c->UnknownDim(), output_rank)); + c->set_output(1, c->Vector(c->UnknownDim())); c->set_output(2, c->Vector(output_rank)); return Status::OK(); }) @@ -159,30 +143,30 @@ REGISTER_OP("DenseToSparseSetOperation") } // The following should stay in sync with `ComputeDenseToSparse` shape // assertions in kernels/set_kernels.cc. 
- // Dimension n contains the set values to be compared, so ranks and the - // first n-1 dimensions of inputs and output must match. - DimensionHandle output_rank; + // Ranks must be compatible, and be >= 2. + ShapeHandle input1_shape_shape = c->input(3); + TF_RETURN_IF_ERROR(shape_inference::ValidateSparseTensor( + c, c->input(1), c->input(2), input1_shape_shape)); + + DimensionHandle input1_rank_dim = c->Dim(input1_shape_shape, 0); + + DimensionHandle output_rank_dim; ShapeHandle input0_shape = c->input(0); + TF_RETURN_IF_ERROR(c->WithRankAtLeast(input0_shape, 2, &input0_shape)); if (c->RankKnown(input0_shape)) { const int32 input0_rank = c->Rank(input0_shape); - if (input0_rank < 2) { - return errors::InvalidArgument("Input 0, expected rank >= 2, got ", - input0_rank, "."); - } - output_rank = c->MakeDim(input0_rank); + TF_RETURN_IF_ERROR( + c->WithValue(input1_rank_dim, input0_rank, &input1_rank_dim)); + output_rank_dim = c->MakeDim(input0_rank); + } else if (c->ValueKnown(input1_rank_dim)) { + output_rank_dim = input1_rank_dim; } else { - output_rank = c->UnknownDim(); - } - TF_RETURN_IF_ERROR(shape_inference::ValidateSparseTensor( - c, c->input(1), c->input(2), c->input(3))); - DimensionHandle output_num_elements = c->Dim(input0_shape, 0); - if (!c->ValueKnown(output_num_elements)) { - output_num_elements = c->UnknownDim(); + output_rank_dim = c->UnknownDim(); } - c->set_output(0, c->Matrix(output_num_elements, output_rank)); - c->set_output(1, c->Vector(output_num_elements)); - c->set_output(2, c->Vector(output_rank)); + c->set_output(0, c->Matrix(c->UnknownDim(), output_rank_dim)); + c->set_output(1, c->Vector(c->UnknownDim())); + c->set_output(2, c->Vector(output_rank_dim)); return Status::OK(); }) .Doc(R"doc( @@ -239,13 +223,40 @@ REGISTER_OP("SparseToSparseSetOperation") } // The following should stay in sync with `ComputeSparseToSparse` shape // assertions in kernels/set_kernels.cc. + // Ranks must be compatible, and be >= 2. 
+ ShapeHandle input0_shape_shape = c->input(2); + ShapeHandle input1_shape_shape = c->input(5); TF_RETURN_IF_ERROR(shape_inference::ValidateSparseTensor( - c, c->input(0), c->input(1), c->input(2))); + c, c->input(0), c->input(1), input0_shape_shape)); TF_RETURN_IF_ERROR(shape_inference::ValidateSparseTensor( - c, c->input(3), c->input(4), c->input(5))); - c->set_output(0, c->Matrix(c->UnknownDim(), c->UnknownDim())); + c, c->input(3), c->input(4), input1_shape_shape)); + + DimensionHandle input0_rank_dim = c->Dim(input0_shape_shape, 0); + DimensionHandle input1_rank_dim = c->Dim(input1_shape_shape, 0); + DimensionHandle output_rank_dim; + if (c->ValueKnown(input0_rank_dim)) { + const int32 input0_rank = c->Value(input0_rank_dim); + if (input0_rank < 2) { + return errors::InvalidArgument("Input 0, expected rank >= 2, got ", + input0_rank, "."); + } + TF_RETURN_IF_ERROR( + c->WithValue(input1_rank_dim, input0_rank, &input1_rank_dim)); + output_rank_dim = input0_rank_dim; + } else if (c->ValueKnown(input1_rank_dim)) { + const int32 input1_rank = c->Value(input1_rank_dim); + if (input1_rank < 2) { + return errors::InvalidArgument("Input 1, expected rank >= 2, got ", + input1_rank, "."); + } + output_rank_dim = input1_rank_dim; + } else { + output_rank_dim = c->UnknownDim(); + } + + c->set_output(0, c->Matrix(c->UnknownDim(), output_rank_dim)); c->set_output(1, c->Vector(c->UnknownDim())); - c->set_output(2, c->Vector(c->UnknownDim())); + c->set_output(2, c->Vector(output_rank_dim)); return Status::OK(); }) .Doc(R"doc( diff --git a/tensorflow/core/ops/set_ops_test.cc b/tensorflow/core/ops/set_ops_test.cc index 75da599767e..3a58eb974ff 100644 --- a/tensorflow/core/ops/set_ops_test.cc +++ b/tensorflow/core/ops/set_ops_test.cc @@ -34,16 +34,16 @@ TEST(SetOpsTest, DenseToDenseShape) { INFER_OK(op, "?;?", "[?,?];[?];[?]"); // Invalid rank. - INFER_ERROR("expected rank >= 2", op, "[?];?"); - INFER_ERROR("expected rank >= 2", op, "?;[?]"); - INFER_ERROR("expected rank >= 2", op, "[2];?"); - INFER_ERROR("expected rank >= 2", op, "?;[2]"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[?];?"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "?;[?]"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[2];?"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "?;[2]"); // Mismatched ranks. - INFER_ERROR("Ranks do not match", op, "[?,?];[?,?,?]"); - INFER_ERROR("Ranks do not match", op, "[?,?,?];[?,?]"); - INFER_ERROR("Ranks do not match", op, "[2,1];[2,1,2]"); - INFER_ERROR("Ranks do not match", op, "[2,1,2];[2,1]"); + INFER_ERROR("Shape must be rank 2 but is rank 3", op, "[?,?];[?,?,?]"); + INFER_ERROR("Shape must be rank 3 but is rank 2", op, "[?,?,?];[?,?]"); + INFER_ERROR("Shape must be rank 2 but is rank 3", op, "[2,1];[2,1,2]"); + INFER_ERROR("Shape must be rank 3 but is rank 2", op, "[2,1,2];[2,1]"); // Rank 2, unknown dims. INFER_OK(op, "[?,?];?", "[?,2];[?];[2]"); @@ -55,26 +55,26 @@ TEST(SetOpsTest, DenseToDenseShape) { INFER_OK(op, "?;[?,?,?,?]", "[?,4];[?];[4]"); INFER_OK(op, "[?,?,?,?];[?,?,?,?]", "[?,4];[?];[4]"); - // Known dimension 0. - INFER_OK(op, "[4,?,?,?];?", "[d0_0,4];[d0_0];[4]"); - INFER_OK(op, "?;[4,?,?,?]", "[d1_0,4];[d1_0];[4]"); - INFER_OK(op, "[4,?,?,?];[?,?,?,?]", "[d0_0,4];[d0_0];[4]"); - INFER_OK(op, "[?,?,?,?];[4,?,?,?]", "[d1_0,4];[d1_0];[4]"); - INFER_OK(op, "[4,?,?,?];[4,?,?,?]", "[d0_0,4];[d0_0];[4]"); + // Known rank for 1 input. 
+ INFER_OK(op, "[5,3,2,1];?", "[?,4];[?];[4]"); + INFER_OK(op, "?;[5,3,2,1]", "[?,4];[?];[4]"); + INFER_OK(op, "[5,3,2,1];[?,?,?,?]", "[?,4];[?];[4]"); + INFER_OK(op, "[?,?,?,?];[5,3,2,1]", "[?,4];[?];[4]"); + INFER_OK(op, "[5,3,2,1];[?,?,?,?]", "[?,4];[?];[4]"); - // Mismatched known n-1 dims. + // Mismatched n-1 dims. INFER_ERROR("Dimension 0 in both shapes must be equal", op, "[4,?,2,?];[3,1,?,5]"); INFER_ERROR("Dimension 2 in both shapes must be equal", op, "[4,3,2,1];[4,3,3,1]"); - // Matched known n-1 dims. - INFER_OK(op, "[4,5,6,7];[?,?,?,?]", "[d0_0,4];[d0_0];[4]"); - INFER_OK(op, "[4,5,6,7];[?,?,?,4]", "[d0_0,4];[d0_0];[4]"); - INFER_OK(op, "[?,?,?,?];[4,5,6,7]", "[d1_0,4];[d1_0];[4]"); - INFER_OK(op, "[4,?,2,?];[?,1,?,5]", "[d0_0,4];[d0_0];[4]"); - INFER_OK(op, "[4,5,6,7];[4,?,6,?]", "[d0_0,4];[d0_0];[4]"); - INFER_OK(op, "[4,5,6,7];[4,5,6,4]", "[d0_0,4];[d0_0];[4]"); + // Matched n-1 dims. + INFER_OK(op, "[4,5,6,7];[?,?,?,?]", "[?,4];[?];[4]"); + INFER_OK(op, "[4,5,6,7];[?,?,?,4]", "[?,4];[?];[4]"); + INFER_OK(op, "[?,?,?,?];[4,5,6,7]", "[?,4];[?];[4]"); + INFER_OK(op, "[4,?,2,?];[?,1,?,5]", "[?,4];[?];[4]"); + INFER_OK(op, "[4,5,6,7];[4,?,6,?]", "[?,4];[?];[4]"); + INFER_OK(op, "[4,5,6,7];[4,5,6,4]", "[?,4];[?];[4]"); } TEST(SetOpsTest, DenseToSparseShape_InvalidNumberOfInputs) { @@ -89,35 +89,37 @@ TEST(SetOpsTest, DenseToSparseShape) { // Unknown shapes. INFER_OK(op, "?;?;?;?", "[?,?];[?];[?]"); + INFER_OK(op, "?;[?,?];[?];[?]", "[?,?];[?];[?]"); // Invalid rank. - INFER_ERROR("expected rank >= 2", op, "[?];?;?;?"); - INFER_ERROR("expected rank >= 2", op, "[?];[?,?];[?];[?]"); - INFER_ERROR("expected rank >= 2", op, "[?];[5,3];[5];[3]"); - INFER_ERROR("expected rank >= 2", op, "[2];?;?;?"); - INFER_ERROR("expected rank >= 2", op, "[2];[?,?];[?];[?]"); - INFER_ERROR("expected rank >= 2", op, "[2];[5,3];[5];[3]"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[?];?;?;?"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, + "[?];[?,?];[?];[?]"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, + "[?];[5,3];[5];[3]"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[2];?;?;?"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, + "[2];[?,?];[?];[?]"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, + "[2];[5,3];[5];[3]"); - // Rank 2, unknown dims. + // Unknown sparse rank. INFER_OK(op, "[?,?];?;?;?", "[?,2];[?];[2]"); INFER_OK(op, "[?,?];[?,?];[?];[?]", "[?,2];[?];[2]"); - INFER_OK(op, "[?,?];[5,3];[5];[3]", "[?,2];[?];[2]"); - // Rank 4, unknown dims. - INFER_OK(op, "[?,?,?,?];?;?;?", "[?,4];[?];[4]"); - INFER_OK(op, "[?,?,?,?];[?,?];[?];[?]", "[?,4];[?];[4]"); - INFER_OK(op, "[?,?,?,?];[5,3];[5];[3]", "[?,4];[?];[4]"); + // Unknown dense rank. + INFER_OK(op, "?;[?,2];[?];[2]", "[?,d3_0];[?];[d3_0]"); + INFER_OK(op, "?;[5,2];[5];[2]", "[?,d3_0];[?];[d3_0]"); - // Known dimension 0. - INFER_OK(op, "[4,?,?,?];?;?;?", "[d0_0,4];[d0_0];[4]"); - INFER_OK(op, "[4,?,?,?];[?,?];[?];[?]", "[d0_0,4];[d0_0];[4]"); - INFER_OK(op, "[4,?,?,?];[5,3];[5];[3]", "[d0_0,4];[d0_0];[4]"); + // Known both ranks. + INFER_OK(op, "[?,?];[5,2];[5];[2]", "[?,2];[?];[2]"); + INFER_OK(op, "[4,3];[5,2];[5];[2]", "[?,2];[?];[2]"); // Invalid input sparse tensor. 
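The updated expectations in these tests can be summarized compactly: once either operand's rank r is known, the op's outputs are a [?, r] indices matrix, a [?] values vector, and an [r] shape vector; the row count is no longer propagated from dimension 0. A toy Python model of the relaxed DenseToDense inference, assuming each shape is either fully known (a list) or fully unknown (None); the real shape functions also track partially known dimensions:

```python
def dense_to_dense_shapes(shape0, shape1):
    rank = None
    for s in (shape0, shape1):
        if s is not None:
            if len(s) < 2:
                raise ValueError("expected rank >= 2, got %d" % len(s))
            if rank is not None and len(s) != rank:
                raise ValueError("ranks %d and %d do not match" % (rank, len(s)))
            rank = len(s)
    r = rank if rank is not None else "?"
    return [["?", r], ["?"], [r]]  # result_indices, result_values, result_shape

print(dense_to_dense_shapes([5, 3, 2, 1], None))  # [['?', 4], ['?'], [4]]
```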
INFER_ERROR("elements in index (5) and values (6) do not match", op, - "[?,?];[5,3];[6];[3]"); + "?;[5,3];[6];[3]"); INFER_ERROR("rank (3) and shape rank (4) do not match", op, - "[?,?];[5,3];[5];[4]"); + "?;[5,3];[5];[4]"); } TEST(SetOpsTest, SparseToSparseShape_InvalidNumberOfInputs) { @@ -128,7 +130,21 @@ TEST(SetOpsTest, SparseToSparseShape_InvalidNumberOfInputs) { TEST(SetOpsTest, SparseToSparseShape) { ShapeInferenceTestOp op("SparseToSparseSetOperation"); + + // Unknown. INFER_OK(op, "?;?;?;?;?;?", "[?,?];[?];[?]"); + INFER_OK(op, "[?,?];[?];[?];[?,?];[?];[?]", "[?,?];[?];[?]"); + INFER_OK(op, "?;?;?;[?,?];[?];[?]", "[?,?];[?];[?]"); + INFER_OK(op, "[?,?];[?];[?];?;?;?", "[?,?];[?];[?]"); + + // Known rank for 1 input. + INFER_OK(op, "[?,2];[?];[2];?;?;?", "[?,d2_0];[?];[d2_0]"); + INFER_OK(op, "?;?;?;[?,2];[?];[2]", "[?,d5_0];[?];[d5_0]"); + INFER_OK(op, "[?,2];[?];[2];[?,?];[?];[?]", "[?,d2_0];[?];[d2_0]"); + INFER_OK(op, "[?,?];[?];[?];[?,2];[?];[2]", "[?,d5_0];[?];[d5_0]"); + + // Known rank for both inputs. + INFER_OK(op, "[?,2];[?];[2];[?,2];[?];[2]", "[?,d2_0];[?];[d2_0]"); } } // end namespace tensorflow diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc index a2615b9f9f2..2027bf4603d 100644 --- a/tensorflow/core/ops/training_ops.cc +++ b/tensorflow/core/ops/training_ops.cc @@ -22,12 +22,20 @@ using shape_inference::DimensionHandle; using shape_inference::InferenceContext; using shape_inference::ShapeHandle; +static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) { + auto h_dtype = c->input_handle_dtype(input); + if (h_dtype == DT_INVALID) { + return c->input(input); + } + return c->input_handle_shape(input); +} + // Handle the gradient and, if , indices inputs. // is an input+output parameter, containing the current known input shape to // the gradient. static Status HandleGradAndIndicesInputs(InferenceContext* c, bool sparse, int grad_idx, ShapeHandle* s) { - ShapeHandle grad = c->input(grad_idx); + ShapeHandle grad = ShapeOrHandleShape(c, grad_idx); if (!sparse) { TF_RETURN_IF_ERROR(c->Merge(*s, grad, s)); return Status::OK(); @@ -49,10 +57,12 @@ static Status HandleGradAndIndicesInputs(InferenceContext* c, bool sparse, static Status ApplyGradientDescentShapeFn(InferenceContext* c) { ShapeHandle unused; - ShapeHandle s = c->input(0); // var + ShapeHandle s = ShapeOrHandleShape(c, 0); // var TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); // alpha TF_RETURN_IF_ERROR(c->Merge(s, c->input(2), &s)); // delta - c->set_output(0, s); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } return Status::OK(); } @@ -75,16 +85,35 @@ use_locking: If `True`, the subtraction will be protected by a lock; otherwise the behavior is undefined, but may exhibit less contention. )doc"); +REGISTER_OP("ResourceApplyGradientDescent") + .Input("var: resource") + .Input("alpha: T") + .Input("delta: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn(ApplyGradientDescentShapeFn) + .Doc(R"doc( +Update '*var' by subtracting 'alpha' * 'delta' from it. + +var: Should be from a Variable(). +alpha: Scaling factor. Must be a scalar. +delta: The change. +use_locking: If `True`, the subtraction will be protected by a lock; + otherwise the behavior is undefined, but may exhibit less contention. 
+)doc"); + static Status ApplyProximalGradientDescentShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; - ShapeHandle s = c->input(0); // var + ShapeHandle s = ShapeOrHandleShape(c, 0); // var TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); // alpha TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // l1 TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // l2 TF_RETURN_IF_ERROR( HandleGradAndIndicesInputs(c, sparse, 4 /* grad_idx */, &s)); - c->set_output(0, s); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } return Status::OK(); } @@ -146,17 +175,76 @@ out: Same as "var". use_locking: If True, the subtraction will be protected by a lock; otherwise the behavior is undefined, but may exhibit less contention. )doc"); + +REGISTER_OP("ResourceApplyProximalGradientDescent") + .Input("var: resource") + .Input("alpha: T") + .Input("l1: T") + .Input("l2: T") + .Input("delta: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalGradientDescentShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' as FOBOS algorithm with fixed learning rate. +prox_v = var - alpha * delta +var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} + +var: Should be from a Variable(). +alpha: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +delta: The change. +use_locking: If True, the subtraction will be protected by a lock; + otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyProximalGradientDescent") + .Input("var: resource") + .Input("alpha: T") + .Input("l1: T") + .Input("l2: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalGradientDescentShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Sparse update '*var' as FOBOS algorithm with fixed learning rate. + +That is for rows we have grad for, we update var as follows: +prox_v = var - alpha * grad +var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} + +var: Should be from a Variable(). +alpha: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +use_locking: If True, the subtraction will be protected by a lock; + otherwise the behavior is undefined, but may exhibit less contention. 
+)doc"); + static Status ApplyAdadeltaShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; - ShapeHandle s = c->input(0); // var - TF_RETURN_IF_ERROR(c->Merge(s, c->input(1), &s)); // accum - TF_RETURN_IF_ERROR(c->Merge(s, c->input(2), &s)); // accum update + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum + TF_RETURN_IF_ERROR( + c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // accum update TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // lr TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // rho TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // epsilon TF_RETURN_IF_ERROR( HandleGradAndIndicesInputs(c, sparse, 6 /* grad_idx */, &s)); - c->set_output(0, s); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } return Status::OK(); } @@ -224,14 +312,76 @@ use_locking: If True, updating of the var and accum tensors will be protected by a lock; otherwise the behavior is undefined, but may exhibit less contention. )doc"); +REGISTER_OP("ResourceApplyAdadelta") + .Input("var: resource") + .Input("accum: resource") + .Input("accum_update: resource") + .Input("lr: T") + .Input("rho: T") + .Input("epsilon: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdadeltaShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the adadelta scheme. + +accum = rho() * accum + (1 - rho()) * grad.square(); +update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad; +update_accum = rho() * update_accum + (1 - rho()) * update.square(); +var -= update; + +var: Should be from a Variable(). +accum: Should be from a Variable(). +accum_update: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +rho: Decay factor. Must be a scalar. +epsilon: Constant factor. Must be a scalar. +grad: The gradient. +use_locking: If True, updating of the var, accum and update_accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyAdadelta") + .Input("var: resource") + .Input("accum: resource") + .Input("accum_update: resource") + .Input("lr: T") + .Input("rho: T") + .Input("epsilon: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdadeltaShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +var: Should be from a Variable(). +accum: Should be from a Variable(). +accum_update:: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +rho: Decay factor. Must be a scalar. +epsilon: Constant factor. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. 
+)doc"); + static Status ApplyAdagradShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; - ShapeHandle s = c->input(0); // var - TF_RETURN_IF_ERROR(c->Merge(s, c->input(1), &s)); // accum + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // lr TF_RETURN_IF_ERROR( HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); - c->set_output(0, s); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } return Status::OK(); } @@ -261,16 +411,44 @@ use_locking: If `True`, updating of the var and accum tensors will be protected by a lock; otherwise the behavior is undefined, but may exhibit less contention. )doc"); + +REGISTER_OP("ResourceApplyAdagrad") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the adagrad scheme. + +accum += grad * grad +var -= lr * grad * (1 / sqrt(accum)) + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +grad: The gradient. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + static Status ApplyProximalAdagradShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; - ShapeHandle s = c->input(0); // var - TF_RETURN_IF_ERROR(c->Merge(s, c->input(1), &s)); // accum + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // lr TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // l1 TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // l2 TF_RETURN_IF_ERROR( HandleGradAndIndicesInputs(c, sparse, 5 /* grad_idx */, &s)); - c->set_output(0, s); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } return Status::OK(); } @@ -304,6 +482,34 @@ use_locking: If True, updating of the var and accum tensors will be protected by a lock; otherwise the behavior is undefined, but may exhibit less contention. )doc"); +REGISTER_OP("ResourceApplyProximalAdagrad") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalAdagradShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' and '*accum' according to FOBOS with Adagrad learning rate. +accum += grad * grad +prox_v = var - lr * grad * (1 / sqrt(accum)) +var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} + +var: Should be from a Variable(). +accum: Should be from a Variable(). +grad: The gradient. +lr: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + REGISTER_OP("SparseApplyAdagrad") .Input("var: Ref(T)") .Input("accum: Ref(T)") @@ -335,12 +541,42 @@ use_locking: If `True`, updating of the var and accum tensors will be protected contention. 
)doc"); +REGISTER_OP("ResourceSparseApplyAdagrad") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update relevant entries in '*var' and '*accum' according to the adagrad scheme. + +That is for rows we have grad for, we update var and accum as follows: +accum += grad * grad +var -= lr * grad * (1 / sqrt(accum)) + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + static Status ApplyAdagradDAShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; - ShapeHandle s = c->input(0); // var - TF_RETURN_IF_ERROR(c->Merge(s, c->input(1), &s)); // grad_accumulator + ShapeHandle s = ShapeOrHandleShape(c, 0); // var TF_RETURN_IF_ERROR( - c->Merge(s, c->input(2), &s)); // gradient_squared_accumulator + c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // grad_accumulator + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), + &s)); // gradient_squared_accumulator TF_RETURN_IF_ERROR( HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); int idx = sparse ? 5 : 4; @@ -348,7 +584,9 @@ static Status ApplyAdagradDAShapeFn(InferenceContext* c, bool sparse) { TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l1 TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l2 TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // global step - c->set_output(0, s); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } return Status::OK(); } @@ -453,11 +691,106 @@ use_locking: If True, updating of the var and accum tensors will be protected by a lock; otherwise the behavior is undefined, but may exhibit less contention. )doc"); +REGISTER_OP("ResourceApplyAdagradDA") + .Input("var: resource") + .Input("gradient_accumulator: resource") + .Input("gradient_squared_accumulator: resource") + .Input("grad: T") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("global_step: int64") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradDAShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the proximal adagrad scheme. + +var: Should be from a Variable(). +gradient_accumulator: Should be from a Variable(). +gradient_squared_accumulator: Should be from a Variable(). +grad: The gradient. +lr: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +global_step: Training step number. Must be a scalar. +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. 
+)doc"); + +REGISTER_OP("ResourceSparseApplyAdagradDA") + .Input("var: resource") + .Input("gradient_accumulator: resource") + .Input("gradient_squared_accumulator: resource") + .Input("grad: T") + .Input("indices: Tindices") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("global_step: int64") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdagradDAShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update entries in '*var' and '*accum' according to the proximal adagrad scheme. + +var: Should be from a Variable(). +gradient_accumulator: Should be from a Variable(). +gradient_squared_accumulator: Should be from a Variable(). +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +lr: Learning rate. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +global_step: Training step number. Must be a scalar. +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyProximalAdagrad") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyProximalAdagradShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Sparse update entries in '*var' and '*accum' according to FOBOS algorithm. + +That is for rows we have grad for, we update var and accum as follows: +accum += grad * grad +prox_v = var +prox_v -= lr * grad * (1 / sqrt(accum)) +var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +use_locking: If True, updating of the var and accum tensors will be protected by +a lock; otherwise the behavior is undefined, but may exhibit less contention. +)doc"); + static Status ApplyFtrlShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; - ShapeHandle s = c->input(0); // var - TF_RETURN_IF_ERROR(c->Merge(s, c->input(1), &s)); // accum - TF_RETURN_IF_ERROR(c->Merge(s, c->input(2), &s)); // linear + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // linear TF_RETURN_IF_ERROR( HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); int idx = sparse ? 5 : 4; @@ -465,7 +798,9 @@ static Status ApplyFtrlShapeFn(InferenceContext* c, bool sparse) { TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l1 TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l2 TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // lr_power - c->set_output(0, s); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } return Status::OK(); } @@ -549,16 +884,94 @@ use_locking: If `True`, updating of the var and accum tensors will be protected contention. 
)doc"); +REGISTER_OP("ResourceApplyFtrl") + .Input("var: resource") + .Input("accum: resource") + .Input("linear: resource") + .Input("grad: T") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("lr_power: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyFtrlShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the Ftrl-proximal scheme. + +accum_new = accum + grad * grad +linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +accum = accum_new + +var: Should be from a Variable(). +accum: Should be from a Variable(). +linear: Should be from a Variable(). +grad: The gradient. +lr: Scaling factor. Must be a scalar. +l1: L1 regulariation. Must be a scalar. +l2: L2 regulariation. Must be a scalar. +lr_power: Scaling factor. Must be a scalar. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyFtrl") + .Input("var: resource") + .Input("accum: resource") + .Input("linear: resource") + .Input("grad: T") + .Input("indices: Tindices") + .Input("lr: T") + .Input("l1: T") + .Input("l2: T") + .Input("lr_power: T") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyFtrlShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update relevant entries in '*var' according to the Ftrl-proximal scheme. + +That is for rows we have grad for, we update var, accum and linear as follows: +accum_new = accum + grad * grad +linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +accum = accum_new + +var: Should be from a Variable(). +accum: Should be from a Variable(). +linear: Should be from a Variable(). +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +lr: Scaling factor. Must be a scalar. +l1: L1 regularization. Must be a scalar. +l2: L2 regularization. Must be a scalar. +lr_power: Scaling factor. Must be a scalar. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + static Status ApplyMomentumShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; - ShapeHandle s = c->input(0); // var - TF_RETURN_IF_ERROR(c->Merge(s, c->input(1), &s)); // accum + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // lr TF_RETURN_IF_ERROR( HandleGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); int idx = sparse ? 5 : 4; TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // momentum - c->set_output(0, s); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } return Status::OK(); } @@ -635,11 +1048,80 @@ var - lr * momentum * accum, so in the end, the var you get is actually var - lr * momentum * accum. 
)doc"); +REGISTER_OP("ResourceApplyMomentum") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("grad: T") + .Input("momentum: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .Attr("use_nesterov: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyMomentumShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the momentum scheme. Set use_nesterov = True if you +want to use Nesterov momentum. + +accum = accum * momentum + grad +var -= lr * accum + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +grad: The gradient. +momentum: Momentum. Must be a scalar. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +use_nesterov: If `True`, the tensor passed to compute grad will be +var - lr * momentum * accum, so in the end, the var you get is actually +var - lr * momentum * accum. +)doc"); + +REGISTER_OP("ResourceSparseApplyMomentum") + .Input("var: resource") + .Input("accum: resource") + .Input("lr: T") + .Input("grad: T") + .Input("indices: Tindices") + .Input("momentum: T") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .Attr("use_nesterov: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyMomentumShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update relevant entries in '*var' and '*accum' according to the momentum scheme. +Set use_nesterov = True if you want to use Nesterov momentum. + +That is for rows we have grad for, we update var and accum as follows: + +accum = accum * momentum + grad +var -= lr * accum + +var: Should be from a Variable(). +accum: Should be from a Variable(). +lr: Learning rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var and accum. +momentum: Momentum. Must be a scalar. +use_locking: If `True`, updating of the var and accum tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +use_nesterov: If `True`, the tensor passed to compute grad will be +var - lr * momentum * accum, so in the end, the var you get is actually +var - lr * momentum * accum. +)doc"); + static Status ApplyAdamShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; - ShapeHandle s = c->input(0); // var - TF_RETURN_IF_ERROR(c->Merge(s, c->input(1), &s)); // m - TF_RETURN_IF_ERROR(c->Merge(s, c->input(2), &s)); // v + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // m + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // v TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // beta1_power TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // beta2_power TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // lr @@ -648,7 +1130,9 @@ static Status ApplyAdamShapeFn(InferenceContext* c, bool sparse) { TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused)); // epsilon TF_RETURN_IF_ERROR( HandleGradAndIndicesInputs(c, sparse, 9 /* grad_idx */, &s)); - c->set_output(0, s); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } return Status::OK(); } @@ -693,34 +1177,77 @@ use_locking: If `True`, updating of the var, m, and v tensors will be protected contention. 
)doc"); +REGISTER_OP("ResourceApplyAdam") + .Input("var: resource") + .Input("m: resource") + .Input("v: resource") + .Input("beta1_power: T") + .Input("beta2_power: T") + .Input("lr: T") + .Input("beta1: T") + .Input("beta2: T") + .Input("epsilon: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdamShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the Adam algorithm. + +lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t) +m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t +v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t +variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon) + +var: Should be from a Variable(). +m: Should be from a Variable(). +v: Should be from a Variable(). +beta1_power: Must be a scalar. +beta2_power: Must be a scalar. +lr: Scaling factor. Must be a scalar. +beta1: Momentum factor. Must be a scalar. +beta2: Momentum factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +grad: The gradient. +use_locking: If `True`, updating of the var, m, and v tensors will be protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + static Status ApplyRMSPropShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; - ShapeHandle s = c->input(0); // var - TF_RETURN_IF_ERROR(c->Merge(s, c->input(1), &s)); // ms - TF_RETURN_IF_ERROR(c->Merge(s, c->input(2), &s)); // mom + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // ms + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // mom TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // lr TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // rho TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // momentum TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // epsilon TF_RETURN_IF_ERROR( HandleGradAndIndicesInputs(c, sparse, 7 /* grad_idx */, &s)); - c->set_output(0, s); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } return Status::OK(); } static Status ApplyCenteredRMSPropShapeFn(InferenceContext* c, bool sparse) { ShapeHandle unused; - ShapeHandle s = c->input(0); // var - TF_RETURN_IF_ERROR(c->Merge(s, c->input(1), &s)); // ms - TF_RETURN_IF_ERROR(c->Merge(s, c->input(2), &s)); // mg - TF_RETURN_IF_ERROR(c->Merge(s, c->input(3), &s)); // mom + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // ms + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // mg + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 3), &s)); // mom TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // lr TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // rho TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // momentum TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused)); // epsilon TF_RETURN_IF_ERROR( HandleGradAndIndicesInputs(c, sparse, 8 /* grad_idx */, &s)); - c->set_output(0, s); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } return Status::OK(); } @@ -912,4 +1439,184 @@ use_locking: If `True`, updating of the var, mg, ms, and mom tensors is contention. 
)doc"); +REGISTER_OP("ResourceApplyRMSProp") + .Input("var: resource") + .Input("ms: resource") + .Input("mom: resource") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyRMSPropShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the RMSProp algorithm. +Note that in dense implementation of this algorithm, ms and mom will +update even if the grad is zero, but in this sparse implementation, ms +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +Delta = learning_rate * gradient / sqrt(mean_square + epsilon) + +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +var <- var - mom + +var: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +use_locking: If `True`, updating of the var, ms, and mom tensors is protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ResourceApplyCenteredRMSProp") + .Input("var: resource") + .Input("mg: resource") + .Input("ms: resource") + .Input("mom: resource") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyCenteredRMSPropShapeFn(c, false /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the centered RMSProp algorithm. +The centered RMSProp algorithm uses an estimate of the centered second moment +(i.e., the variance) for normalization, as opposed to regular RMSProp, which +uses the (uncentered) second moment. This often helps with training, but is +slightly more expensive in terms of computation and memory. + +Note that in dense implementation of this algorithm, mg, ms, and mom will +update even if the grad is zero, but in this sparse implementation, mg, ms, +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +mean_grad = decay * mean_grad + (1-decay) * gradient + +Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) + +mg <- rho * mg_{t-1} + (1-rho) * grad +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon) +var <- var - mom + +var: Should be from a Variable(). +mg: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +use_locking: If `True`, updating of the var, mg, ms, and mom tensors is + protected by a lock; otherwise the behavior is undefined, but may exhibit less + contention. 
+)doc"); + +REGISTER_OP("ResourceSparseApplyRMSProp") + .Input("var: resource") + .Input("ms: resource") + .Input("mom: resource") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyRMSPropShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the RMSProp algorithm. +Note that in dense implementation of this algorithm, ms and mom will +update even if the grad is zero, but in this sparse implementation, ms +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +Delta = learning_rate * gradient / sqrt(mean_square + epsilon) + +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +var <- var - mom + +var: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var, ms and mom. +use_locking: If `True`, updating of the var, ms, and mom tensors is protected + by a lock; otherwise the behavior is undefined, but may exhibit less + contention. +)doc"); + +REGISTER_OP("ResourceSparseApplyCenteredRMSProp") + .Input("var: resource") + .Input("mg: resource") + .Input("ms: resource") + .Input("mom: resource") + .Input("lr: T") + .Input("rho: T") + .Input("momentum: T") + .Input("epsilon: T") + .Input("grad: T") + .Input("indices: Tindices") + .Attr("T: numbertype") + .Attr("Tindices: {int32, int64}") + .Attr("use_locking: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyCenteredRMSPropShapeFn(c, true /* sparse */); + }) + .Doc(R"doc( +Update '*var' according to the centered RMSProp algorithm. +The centered RMSProp algorithm uses an estimate of the centered second moment +(i.e., the variance) for normalization, as opposed to regular RMSProp, which +uses the (uncentered) second moment. This often helps with training, but is +slightly more expensive in terms of computation and memory. + +Note that in dense implementation of this algorithm, mg, ms, and mom will +update even if the grad is zero, but in this sparse implementation, mg, ms, +and mom will not update in iterations during which the grad is zero. + +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +mean_grad = decay * mean_grad + (1-decay) * gradient +Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2) + +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +var <- var - mom + +var: Should be from a Variable(). +mg: Should be from a Variable(). +ms: Should be from a Variable(). +mom: Should be from a Variable(). +lr: Scaling factor. Must be a scalar. +epsilon: Ridge term. Must be a scalar. +rho: Decay rate. Must be a scalar. +grad: The gradient. +indices: A vector of indices into the first dimension of var, ms and mom. +use_locking: If `True`, updating of the var, mg, ms, and mom tensors is + protected by a lock; otherwise the behavior is undefined, but may exhibit less + contention. 
+)doc"); + } // namespace tensorflow diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc index 3998324047c..ab56ad09d11 100644 --- a/tensorflow/core/platform/cloud/gcs_file_system.cc +++ b/tensorflow/core/platform/cloud/gcs_file_system.cc @@ -307,6 +307,7 @@ class GcsWritableFile : public WritableFile { object_(object), auth_provider_(auth_provider), http_request_factory_(http_request_factory), + sync_needed_(true), max_upload_attempts_(max_upload_attempts) { if (GetTmpFilename(&tmp_content_filename_).ok()) { outfile_.open(tmp_content_filename_, @@ -328,6 +329,7 @@ class GcsWritableFile : public WritableFile { object_(object), auth_provider_(auth_provider), http_request_factory_(http_request_factory), + sync_needed_(true), max_upload_attempts_(max_upload_attempts) { tmp_content_filename_ = tmp_content_filename; outfile_.open(tmp_content_filename_, @@ -338,6 +340,7 @@ class GcsWritableFile : public WritableFile { Status Append(const StringPiece& data) override { TF_RETURN_IF_ERROR(CheckWritable()); + sync_needed_ = true; outfile_ << data; if (!outfile_.good()) { return errors::Internal( @@ -357,14 +360,26 @@ class GcsWritableFile : public WritableFile { Status Flush() override { return Sync(); } + Status Sync() override { + TF_RETURN_IF_ERROR(CheckWritable()); + if (!sync_needed_) { + return Status::OK(); + } + Status status = SyncImpl(); + if (status.ok()) { + sync_needed_ = false; + } + return status; + } + + private: /// Copies the current version of the file to GCS. /// - /// This Sync() uploads the object to GCS. + /// This SyncImpl() uploads the object to GCS. /// In case of a failure, it resumes failed uploads as recommended by the GCS /// resumable API documentation. When the whole upload needs to be /// restarted, Sync() returns UNAVAILABLE and relies on RetryingFileSystem. - Status Sync() override { - TF_RETURN_IF_ERROR(CheckWritable()); + Status SyncImpl() { outfile_.flush(); if (!outfile_.good()) { return errors::Internal( @@ -410,7 +425,6 @@ class GcsWritableFile : public WritableFile { return errors::Aborted("Upload gs://", bucket_, "/", object_, " failed."); } - private: Status CheckWritable() const { if (!outfile_.is_open()) { return errors::FailedPrecondition( @@ -556,6 +570,7 @@ class GcsWritableFile : public WritableFile { string tmp_content_filename_; std::ofstream outfile_; HttpRequest::Factory* http_request_factory_; + bool sync_needed_; // whether there is buffered data that needs to be synced int32 max_upload_attempts_; }; diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc index 5f5f868a5c7..84f219616a9 100644 --- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc +++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc @@ -228,6 +228,11 @@ TEST(GcsFileSystemTest, NewWritableFile) { TF_EXPECT_OK(file->Append("content1,")); TF_EXPECT_OK(file->Append("content2")); + TF_EXPECT_OK(file->Flush()); + // The calls to flush, sync, and close below should not cause uploads because + // the file is not dirty. + TF_EXPECT_OK(file->Flush()); + TF_EXPECT_OK(file->Sync()); TF_EXPECT_OK(file->Close()); } diff --git a/tensorflow/core/platform/cloud/http_request_test.cc b/tensorflow/core/platform/cloud/http_request_test.cc index 93c4ec51d95..31ba3e337f9 100644 --- a/tensorflow/core/platform/cloud/http_request_test.cc +++ b/tensorflow/core/platform/cloud/http_request_test.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/mem.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { @@ -172,7 +173,8 @@ class FakeLibCurl : public LibCurl { temp_str.replace(n, victim.size(), encoded); n += encoded.size(); } - char* out_char_str = (char*)malloc(sizeof(char) * temp_str.size() + 1); + char* out_char_str = + (char*)port::Malloc(sizeof(char) * temp_str.size() + 1); std::copy(temp_str.begin(), temp_str.end(), out_char_str); out_char_str[temp_str.size()] = '\0'; return out_char_str; @@ -180,7 +182,7 @@ class FakeLibCurl : public LibCurl { void curl_slist_free_all(curl_slist* list) override { delete reinterpret_cast*>(list); } - void curl_free(void* p) override { free(p); } + void curl_free(void* p) override { port::Free(p); } // Variables defining the behavior of this fake. string response_content; diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index 80c23b1df15..168f9df2e84 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -3,10 +3,11 @@ load("@protobuf//:protobuf.bzl", "cc_proto_library") load("@protobuf//:protobuf.bzl", "py_proto_library") -# configure may change the following lines to True +# configure may change the following lines WITH_GCP_SUPPORT = False WITH_HDFS_SUPPORT = False WITH_XLA_SUPPORT = False +WITH_JEMALLOC = True # Appends a suffix to a list of deps. def tf_deps(deps, suffix): @@ -176,7 +177,29 @@ def tf_additional_test_srcs(): def tf_kernel_tests_linkstatic(): return 0 +# jemalloc only enabled on Linux for now. +# TODO(jhseu): Enable on other platforms. +def tf_additional_lib_defines(): + defines = [] + if WITH_JEMALLOC: + defines += select({ + "//tensorflow:linux_x86_64": [ + "TENSORFLOW_USE_JEMALLOC" + ], + "//conditions:default": [], + }) + return defines + def tf_additional_lib_deps(): + deps = [] + if WITH_JEMALLOC: + deps += select({ + "//tensorflow:linux_x86_64": ["@jemalloc"], + "//conditions:default": [], + }) + return deps + +def tf_additional_core_deps(): deps = [] if WITH_GCP_SUPPORT: deps.append("//tensorflow/core/platform/cloud:gcs_file_system") diff --git a/tensorflow/core/platform/hexagon/soc_interface.h b/tensorflow/core/platform/hexagon/soc_interface.h index 61567de3276..f4a3cdf4bda 100644 --- a/tensorflow/core/platform/hexagon/soc_interface.h +++ b/tensorflow/core/platform/hexagon/soc_interface.h @@ -48,7 +48,7 @@ bool soc_interface_ReadOutputNodeFloat(const char* const node_name, uint8_t** buf, uint64_t* buf_size); // Setup graph // TODO(satok): Remove and use runtime version -bool soc_interface_SetupGraphDummy(int version); +bool soc_interface_setupDummyGraph(int version); // Allocate memory for params of node inputs and node outputs bool soc_interface_AllocateNodeInputAndNodeOutputArray(int total_input_count, diff --git a/tensorflow/core/platform/mem.h b/tensorflow/core/platform/mem.h index 6618145c3d1..dc389a87415 100644 --- a/tensorflow/core/platform/mem.h +++ b/tensorflow/core/platform/mem.h @@ -24,9 +24,14 @@ limitations under the License. namespace tensorflow { namespace port { -// Aligned allocation/deallocation -void* aligned_malloc(size_t size, int minimum_alignment); -void aligned_free(void* aligned_memory); +// Aligned allocation/deallocation. `minimum_alignment` must be a power of 2 +// and a multiple of sizeof(void*). 
+void* AlignedMalloc(size_t size, int minimum_alignment); +void AlignedFree(void* aligned_memory); + +void* Malloc(size_t size); +void* Realloc(void* ptr, size_t size); +void Free(void* ptr); // Tries to release num_bytes of free memory back to the operating // system for reuse. Use this routine with caution -- to get this diff --git a/tensorflow/core/platform/port_test.cc b/tensorflow/core/platform/port_test.cc index 8d98eb25a20..8930e49ff84 100644 --- a/tensorflow/core/platform/port_test.cc +++ b/tensorflow/core/platform/port_test.cc @@ -25,11 +25,11 @@ namespace port { TEST(Port, AlignedMalloc) { for (size_t alignment = 1; alignment <= 1 << 20; alignment <<= 1) { - void* p = aligned_malloc(1, alignment); - ASSERT_TRUE(p != NULL) << "aligned_malloc(1, " << alignment << ")"; + void* p = AlignedMalloc(1, alignment); + ASSERT_TRUE(p != NULL) << "AlignedMalloc(1, " << alignment << ")"; uintptr_t pval = reinterpret_cast(p); EXPECT_EQ(pval % alignment, 0); - aligned_free(p); + AlignedFree(p); } } diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc index 84bc9492b57..91d612f2339 100644 --- a/tensorflow/core/platform/posix/port.cc +++ b/tensorflow/core/platform/posix/port.cc @@ -13,8 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#ifdef TENSORFLOW_USE_JEMALLOC +#include "jemalloc/jemalloc.h" +#endif + #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/mem.h" #include "tensorflow/core/platform/types.h" #if defined(__linux__) && !defined(__ANDROID__) #include @@ -33,7 +38,7 @@ limitations under the License. namespace tensorflow { namespace port { -void InitMain(const char *usage, int *argc, char ***argv) {} +void InitMain(const char* usage, int* argc, char*** argv) {} string Hostname() { char hostname[1024]; @@ -60,36 +65,66 @@ int NumSchedulableCPUs() { return kDefaultCores; } -void *aligned_malloc(size_t size, int minimum_alignment) { +void* AlignedMalloc(size_t size, int minimum_alignment) { #if defined(__ANDROID__) return memalign(minimum_alignment, size); #else // !defined(__ANDROID__) - void *ptr = NULL; + void* ptr = NULL; // posix_memalign requires that the requested alignment be at least // sizeof(void*). In this case, fall back on malloc which should return // memory aligned to at least the size of a pointer. 
- const int required_alignment = sizeof(void *); - if (minimum_alignment < required_alignment) return malloc(size); - if (posix_memalign(&ptr, minimum_alignment, size) != 0) + const int required_alignment = sizeof(void*); + if (minimum_alignment < required_alignment) return Malloc(size); +#ifdef TENSORFLOW_USE_JEMALLOC + int err = jemalloc_posix_memalign(&ptr, minimum_alignment, size); +#else + int err = posix_memalign(&ptr, minimum_alignment, size); +#endif + if (err != 0) { return NULL; - else + } else { return ptr; + } #endif } -void aligned_free(void *aligned_memory) { free(aligned_memory); } +void AlignedFree(void* aligned_memory) { Free(aligned_memory); } + +void* Malloc(size_t size) { +#ifdef TENSORFLOW_USE_JEMALLOC + return jemalloc_malloc(size); +#else + return malloc(size); +#endif +} + +void* Realloc(void* ptr, size_t size) { +#ifdef TENSORFLOW_USE_JEMALLOC + return jemalloc_realloc(ptr, size); +#else + return realloc(ptr, size); +#endif +} + +void Free(void* ptr) { +#ifdef TENSORFLOW_USE_JEMALLOC + jemalloc_free(ptr); +#else + free(ptr); +#endif +} void MallocExtension_ReleaseToSystem(std::size_t num_bytes) { // No-op. } -std::size_t MallocExtension_GetAllocatedSize(const void *p) { return 0; } +std::size_t MallocExtension_GetAllocatedSize(const void* p) { return 0; } -void AdjustFilenameForLogging(string *filename) { +void AdjustFilenameForLogging(string* filename) { // Nothing to do } -bool Snappy_Compress(const char *input, size_t length, string *output) { +bool Snappy_Compress(const char* input, size_t length, string* output) { #ifdef SNAPPY output->resize(snappy::MaxCompressedLength(length)); size_t outlen; @@ -101,8 +136,8 @@ bool Snappy_Compress(const char *input, size_t length, string *output) { #endif } -bool Snappy_GetUncompressedLength(const char *input, size_t length, - size_t *result) { +bool Snappy_GetUncompressedLength(const char* input, size_t length, + size_t* result) { #ifdef SNAPPY return snappy::GetUncompressedLength(input, length, result); #else @@ -110,7 +145,7 @@ bool Snappy_GetUncompressedLength(const char *input, size_t length, #endif } -bool Snappy_Uncompress(const char *input, size_t length, char *output) { +bool Snappy_Uncompress(const char* input, size_t length, char* output) { #ifdef SNAPPY return snappy::RawUncompress(input, length, output); #else @@ -118,7 +153,7 @@ bool Snappy_Uncompress(const char *input, size_t length, char *output) { #endif } -string Demangle(const char *mangled) { return mangled; } +string Demangle(const char* mangled) { return mangled; } } // namespace port } // namespace tensorflow diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc index ee5be221cd6..b2167081a69 100644 --- a/tensorflow/core/platform/windows/port.cc +++ b/tensorflow/core/platform/windows/port.cc @@ -52,11 +52,17 @@ int NumSchedulableCPUs() { return system_info.dwNumberOfProcessors; } -void* aligned_malloc(size_t size, int minimum_alignment) { +void* AlignedMalloc(size_t size, int minimum_alignment) { return _aligned_malloc(size, minimum_alignment); } -void aligned_free(void* aligned_memory) { _aligned_free(aligned_memory); } +void AlignedFree(void* aligned_memory) { _aligned_free(aligned_memory); } + +void* Malloc(size_t size) { return ::malloc(size); } + +void* Realloc(void* ptr, size_t size) { return ::realloc(ptr, size); } + +void Free(void* ptr) { ::free(ptr); } void MallocExtension_ReleaseToSystem(std::size_t num_bytes) { // No-op. 
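With the jemalloc plumbing above, tensorflow::port::Malloc/Realloc/Free (plus the renamed AlignedMalloc/AlignedFree) become the single choke point for memory that may cross the allocator boundary, which is why c_api.cc's buffers were switched over earlier in this patch: memory obtained from jemalloc must not be handed to libc's free. A small usage sketch:

```c++
#include "tensorflow/core/platform/mem.h"

// Allocate, grow, and release a buffer through the port:: layer. When
// configure enables jemalloc on linux_x86_64 these route to jemalloc_*;
// elsewhere they fall through to the platform allocator.
void PortAllocDemo() {
  char* buf = static_cast<char*>(tensorflow::port::Malloc(64));
  buf = static_cast<char*>(tensorflow::port::Realloc(buf, 128));
  tensorflow::port::Free(buf);

  // AlignedMalloc's contract is now documented: the alignment must be a
  // power of two and a multiple of sizeof(void*).
  void* aligned = tensorflow::port::AlignedMalloc(/*size=*/256,
                                                  /*minimum_alignment=*/64);
  tensorflow::port::AlignedFree(aligned);
}
```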
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index d960b8dd42f..a2b333aad1b 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -79,10 +79,12 @@ limitations under the License.
 //     used for tf.split, ReverseV2 is now used by tf.reverse, ConcatV2 is
 //     now used by tf.concat_v2 (and soon tf.concat). Graphs use flooring
 //     division and mod semantics. TensorArrayV3. (12dec2016)
+// 21. Dropped FunctionDef.Node support, switched to node_def introduced
+//     in version 12. (11jan2017)
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 20
+#define TF_GRAPH_DEF_VERSION 21
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //
diff --git a/tensorflow/examples/android/AndroidManifest.xml b/tensorflow/examples/android/AndroidManifest.xml
index e388734564b..9f229d8b9d4 100644
--- a/tensorflow/examples/android/AndroidManifest.xml
+++ b/tensorflow/examples/android/AndroidManifest.xml
@@ -41,7 +41,7 @@
-
+
@@ -50,6 +50,15 @@
+
+
+
+
+
+
+
diff --git a/tensorflow/examples/android/BUILD b/tensorflow/examples/android/BUILD
index 3ba3a494aba..0c1cea5fc35 100644
--- a/tensorflow/examples/android/BUILD
+++ b/tensorflow/examples/android/BUILD
@@ -66,6 +66,7 @@ android_binary(
         "//tensorflow/examples/android/assets:asset_files",
         "@inception5h//:model_files",
         "@mobile_multibox//:model_files",
+        "@stylize//:model_files",
     ],
     assets_dir = "",
     custom_package = "org.tensorflow.demo",
diff --git a/tensorflow/examples/android/README.md b/tensorflow/examples/android/README.md
index 79f543fb748..fbbe9f276bc 100644
--- a/tensorflow/examples/android/README.md
+++ b/tensorflow/examples/android/README.md
@@ -22,6 +22,10 @@ existing application.
    Demonstrates a model based on [Scalable Object Detection
    using Deep Neural Networks](https://arxiv.org/abs/1312.2249) to
    localize and track people in the camera preview in real-time.
+3. [TF Stylize](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java):
+   Uses a model based on [A Learned Representation For Artistic
+   Style](https://arxiv.org/abs/1610.07629) to restyle the camera preview
+   image in the style of a number of different artists.
## Prebuilt APK: diff --git a/tensorflow/examples/android/bin/AndroidManifest.xml b/tensorflow/examples/android/bin/AndroidManifest.xml new file mode 100644 index 00000000000..d4792bc4823 --- /dev/null +++ b/tensorflow/examples/android/bin/AndroidManifest.xml @@ -0,0 +1,64 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tensorflow/examples/android/res/layout/camera_connection_fragment_stylize.xml b/tensorflow/examples/android/res/layout/camera_connection_fragment_stylize.xml new file mode 100644 index 00000000000..1cdb24cab03 --- /dev/null +++ b/tensorflow/examples/android/res/layout/camera_connection_fragment_stylize.xml @@ -0,0 +1,51 @@ + + + + + + + + + + + + diff --git a/tensorflow/examples/android/res/values/base-strings.xml b/tensorflow/examples/android/res/values/base-strings.xml index f6c57d5030b..56edb55def7 100644 --- a/tensorflow/examples/android/res/values/base-strings.xml +++ b/tensorflow/examples/android/res/values/base-strings.xml @@ -19,4 +19,5 @@ TensorFlow Demo TF Classify TF Detect + TF Stylize diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java index 853dae4b83d..387bd3f8faa 100644 --- a/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java +++ b/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java @@ -73,7 +73,7 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab private static final boolean MAINTAIN_ASPECT = true; - private TensorFlowImageClassifier classifier; + private Classifier classifier; private Integer sensorOrientation; @@ -88,7 +88,6 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab private boolean computing = false; - private Matrix frameToCropTransform; private Matrix cropToFrameTransform; @@ -112,17 +111,15 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab @Override public void onPreviewSizeChosen(final Size size, final int rotation) { - final float textSizePx = TypedValue.applyDimension( - TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, - getResources().getDisplayMetrics()); + final float textSizePx = + TypedValue.applyDimension( + TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, getResources().getDisplayMetrics()); borderedText = new BorderedText(textSizePx); borderedText.setTypeface(Typeface.MONOSPACE); - classifier = new TensorFlowImageClassifier(); - try { - final int initStatus = - classifier.initializeTensorFlow( + classifier = + TensorFlowImageClassifier.create( getAssets(), MODEL_FILE, LABEL_FILE, @@ -132,10 +129,6 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab IMAGE_STD, INPUT_NAME, OUTPUT_NAME); - if (initStatus != 0) { - LOGGER.e("TF init status != 0: %d", initStatus); - throw new RuntimeException(); - } } catch (final Exception e) { throw new RuntimeException("Error initializing TensorFlow!", e); } @@ -147,8 +140,7 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab final Display display = getWindowManager().getDefaultDisplay(); final int screenOrientation = display.getRotation(); - LOGGER.i("Sensor orientation: %d, Screen orientation: %d", - rotation, screenOrientation); + LOGGER.i("Sensor orientation: %d, Screen orientation: %d", rotation, screenOrientation); sensorOrientation = rotation + screenOrientation; @@ -157,22 +149,24 @@ public class ClassifierActivity extends CameraActivity 
implements OnImageAvailab rgbFrameBitmap = Bitmap.createBitmap(previewWidth, previewHeight, Config.ARGB_8888); croppedBitmap = Bitmap.createBitmap(INPUT_SIZE, INPUT_SIZE, Config.ARGB_8888); - frameToCropTransform = ImageUtils.getTransformationMatrix( - previewWidth, previewHeight, - INPUT_SIZE, INPUT_SIZE, - sensorOrientation, MAINTAIN_ASPECT); + frameToCropTransform = + ImageUtils.getTransformationMatrix( + previewWidth, previewHeight, + INPUT_SIZE, INPUT_SIZE, + sensorOrientation, MAINTAIN_ASPECT); cropToFrameTransform = new Matrix(); frameToCropTransform.invert(cropToFrameTransform); yuvBytes = new byte[3][]; - addCallback(new DrawCallback() { - @Override - public void drawCallback(final Canvas canvas) { - renderDebug(canvas); - } - }); + addCallback( + new DrawCallback() { + @Override + public void drawCallback(final Canvas canvas) { + renderDebug(canvas); + } + }); } @Override diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java index c8aeb8ae25c..9ab5a7108ab 100644 --- a/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java +++ b/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java @@ -124,30 +124,19 @@ public class DetectorActivity extends CameraActivity implements OnImageAvailable tracker = new MultiBoxTracker(getResources().getDisplayMetrics()); - if (USE_YOLO) { - final TensorFlowYoloDetector yoloDetector = new TensorFlowYoloDetector(); - try { - final int initStatus = - yoloDetector.initializeTensorFlow( + try { + if (USE_YOLO) { + detector = + TensorFlowYoloDetector.create( getAssets(), YOLO_MODEL_FILE, YOLO_INPUT_SIZE, YOLO_INPUT_NAME, YOLO_OUTPUT_NAMES, YOLO_BLOCK_SIZE); - if (initStatus != 0) { - LOGGER.e("TF init status != 0: %d", initStatus); - throw new RuntimeException(); - } - } catch (final Exception e) { - throw new RuntimeException("Error initializing TensorFlow!", e); - } - detector = yoloDetector; - } else { - final TensorFlowMultiBoxDetector multiBoxDetector = new TensorFlowMultiBoxDetector(); - try { - final int initStatus = - multiBoxDetector.initializeTensorFlow( + } else { + detector = + TensorFlowMultiBoxDetector.create( getAssets(), MB_MODEL_FILE, MB_LOCATION_FILE, @@ -157,14 +146,9 @@ public class DetectorActivity extends CameraActivity implements OnImageAvailable MB_IMAGE_STD, MB_INPUT_NAME, MB_OUTPUT_NAMES); - if (initStatus != 0) { - LOGGER.e("TF init status != 0: %d", initStatus); - throw new RuntimeException(); - } - } catch (final Exception e) { - throw new RuntimeException("Error initializing TensorFlow!", e); } - detector = multiBoxDetector; + } catch (final Exception e) { + throw new RuntimeException("Error initializing TensorFlow!", e); } previewWidth = size.getWidth(); @@ -249,6 +233,7 @@ public class DetectorActivity extends CameraActivity implements OnImageAvailable } OverlayView trackingOverlay; + @Override public void onImageAvailable(final ImageReader reader) { Image image = null; diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java new file mode 100644 index 00000000000..8a3c7a4ef92 --- /dev/null +++ b/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java @@ -0,0 +1,662 @@ +/* + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.tensorflow.demo; + +import android.content.Context; +import android.content.res.AssetManager; +import android.graphics.Bitmap; +import android.graphics.Bitmap.Config; +import android.graphics.BitmapFactory; +import android.graphics.Canvas; +import android.graphics.Color; +import android.graphics.Matrix; +import android.graphics.Paint; +import android.graphics.Paint.Style; +import android.graphics.Rect; +import android.graphics.Typeface; +import android.media.Image; +import android.media.Image.Plane; +import android.media.ImageReader; +import android.media.ImageReader.OnImageAvailableListener; +import android.os.Bundle; +import android.os.SystemClock; +import android.os.Trace; +import android.util.Size; +import android.util.TypedValue; +import android.view.Display; +import android.view.MotionEvent; +import android.view.View; +import android.view.View.OnClickListener; +import android.view.View.OnTouchListener; +import android.view.ViewGroup; +import android.widget.BaseAdapter; +import android.widget.Button; +import android.widget.GridView; +import android.widget.ImageView; +import android.widget.Toast; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Vector; +import org.tensorflow.contrib.android.TensorFlowInferenceInterface; +import org.tensorflow.demo.OverlayView.DrawCallback; +import org.tensorflow.demo.env.BorderedText; +import org.tensorflow.demo.env.ImageUtils; +import org.tensorflow.demo.env.Logger; +import org.tensorflow.demo.R; + +/** + * Sample activity that stylizes the camera preview according to "A Learned Representation For + * Artistic Style" (https://arxiv.org/abs/1610.07629) + */ +public class StylizeActivity extends CameraActivity implements OnImageAvailableListener { + static { + System.loadLibrary("tensorflow_demo"); + } + + private static final Logger LOGGER = new Logger(); + + private static final String MODEL_FILE = "file:///android_asset/stylize_quantized.pb"; + private static final String INPUT_NODE = "input:0"; + private static final String STYLE_NODE = "style_num:0"; + private static final String OUTPUT_NODE = "transformer/expand/conv3/conv/Sigmoid"; + private static final int NUM_STYLES = 26; + + private static final boolean SAVE_PREVIEW_BITMAP = false; + + // Whether to actively manipulate non-selected sliders so that sum of activations always appears + // to be 1.0. The actual style input tensor will be normalized to sum to 1.0 regardless. + private static final boolean NORMALIZE_SLIDERS = true; + + private static final float TEXT_SIZE_DIP = 12; + + private static final boolean DEBUG_MODEL = false; + + private static final int[] SIZES = {32, 48, 64, 96, 128, 192, 256, 384, 512, 768, 1024}; + + // Start at a medium size, but let the user step up through smaller sizes so they don't get + // immediately stuck processing a large image. 
+ private int desiredSizeIndex = -1; + private int desiredSize = 256; + private int initializedSize = 0; + + private Integer sensorOrientation; + + private int previewWidth = 0; + private int previewHeight = 0; + private byte[][] yuvBytes; + private int[] rgbBytes = null; + private Bitmap rgbFrameBitmap = null; + private Bitmap croppedBitmap = null; + + private final float[] styleVals = new float[NUM_STYLES]; + private int[] intValues; + private float[] floatValues; + + private int frameNum = 0; + + private Bitmap cropCopyBitmap; + private Bitmap textureCopyBitmap; + + private boolean computing = false; + + private Matrix frameToCropTransform; + private Matrix cropToFrameTransform; + + private BorderedText borderedText; + + private long lastProcessingTimeMs; + + private TensorFlowInferenceInterface inferenceInterface; + + private int lastOtherStyle = 1; + + private boolean allZero = false; + + private ImageGridAdapter adapter; + private GridView grid; + + private final OnTouchListener gridTouchAdapter = + new OnTouchListener() { + ImageSlider slider = null; + + @Override + public boolean onTouch(final View v, final MotionEvent event) { + switch (event.getActionMasked()) { + case MotionEvent.ACTION_DOWN: + for (int i = 0; i < NUM_STYLES; ++i) { + final ImageSlider child = adapter.items[i]; + final Rect rect = new Rect(); + child.getHitRect(rect); + if (rect.contains((int) event.getX(), (int) event.getY())) { + slider = child; + slider.setHilighted(true); + } + } + break; + + case MotionEvent.ACTION_MOVE: + if (slider != null) { + final Rect rect = new Rect(); + slider.getHitRect(rect); + + final float newSliderVal = + (float) + Math.min( + 1.0, + Math.max( + 0.0, 1.0 - (event.getY() - slider.getTop()) / slider.getHeight())); + + setStyle(slider, newSliderVal); + } + break; + + case MotionEvent.ACTION_UP: + if (slider != null) { + slider.setHilighted(false); + slider = null; + } + break; + } + return true; + } + }; + + @Override + public void onCreate(final Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + } + + @Override + protected int getLayoutId() { + return R.layout.camera_connection_fragment_stylize; + } + + @Override + protected int getDesiredPreviewFrameSize() { + return SIZES[SIZES.length - 1]; + } + + public static Bitmap getBitmapFromAsset(final Context context, final String filePath) { + final AssetManager assetManager = context.getAssets(); + + Bitmap bitmap = null; + try { + final InputStream inputStream = assetManager.open(filePath); + bitmap = BitmapFactory.decodeStream(inputStream); + } catch (final IOException e) { + LOGGER.e("Error opening bitmap!", e); + } + + return bitmap; + } + + private class ImageSlider extends ImageView { + private float value = 0.0f; + private boolean hilighted = false; + + private final Paint boxPaint; + private final Paint linePaint; + + public ImageSlider(final Context context) { + super(context); + value = 0.0f; + + boxPaint = new Paint(); + boxPaint.setColor(Color.BLACK); + boxPaint.setAlpha(128); + + linePaint = new Paint(); + linePaint.setColor(Color.WHITE); + linePaint.setStrokeWidth(10.0f); + linePaint.setStyle(Style.STROKE); + } + + @Override + public void onDraw(final Canvas canvas) { + super.onDraw(canvas); + final float y = (1.0f - value) * canvas.getHeight(); + + // If all sliders are zero, don't bother shading anything. 
+ if (!allZero) { + canvas.drawRect(0, 0, canvas.getWidth(), y, boxPaint); + } + + if (value > 0.0f) { + canvas.drawLine(0, y, canvas.getWidth(), y, linePaint); + } + + if (hilighted) { + canvas.drawRect(0, 0, getWidth(), getHeight(), linePaint); + } + } + + @Override + protected void onMeasure(final int widthMeasureSpec, final int heightMeasureSpec) { + super.onMeasure(widthMeasureSpec, heightMeasureSpec); + setMeasuredDimension(getMeasuredWidth(), getMeasuredWidth()); + } + + public void setValue(final float value) { + this.value = value; + postInvalidate(); + } + + public void setHilighted(final boolean highlighted) { + this.hilighted = highlighted; + this.postInvalidate(); + } + } + + private class ImageGridAdapter extends BaseAdapter { + final ImageSlider[] items = new ImageSlider[NUM_STYLES]; + final ArrayList