diff --git a/configure.py b/configure.py index 186fdc9ddce..fcf359d061d 100644 --- a/configure.py +++ b/configure.py @@ -685,10 +685,12 @@ def set_tf_cunn_version(environ_cp): ldconfig_bin = which('ldconfig') or '/sbin/ldconfig' cudnn_path_from_ldconfig = run_shell([ldconfig_bin, '-p']) cudnn_path_from_ldconfig = re.search('.*libcudnn.so .* => (.*)', - cudnn_path_from_ldconfig).group(1) - if os.path.exists('%s.%s' % (cudnn_path_from_ldconfig, tf_cudnn_version)): - cudnn_install_path = os.path.dirname(cudnn_path_from_ldconfig) - break + cudnn_path_from_ldconfig) + if cudnn_path_from_ldconfig: + cudnn_path_from_ldconfig = cudnn_path_from_ldconfig.group(1) + if os.path.exists('%s.%s' % (cudnn_path_from_ldconfig, tf_cudnn_version)): + cudnn_install_path = os.path.dirname(cudnn_path_from_ldconfig) + break # Reset and Retry print( diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 20e5c573c6b..5b6a18b6a69 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -296,6 +296,7 @@ filegroup( "//tensorflow/contrib/ffmpeg/default:all_files", "//tensorflow/contrib/framework:all_files", "//tensorflow/contrib/fused_conv:all_files", + "//tensorflow/contrib/gan:all_files", "//tensorflow/contrib/graph_editor:all_files", "//tensorflow/contrib/grid_rnn:all_files", "//tensorflow/contrib/hooks:all_files", @@ -323,6 +324,7 @@ filegroup( "//tensorflow/contrib/nn:all_files", "//tensorflow/contrib/opt:all_files", "//tensorflow/contrib/predictor:all_files", + "//tensorflow/contrib/receptive_field:all_files", "//tensorflow/contrib/reduce_slice_ops:all_files", "//tensorflow/contrib/remote_fused_graph/pylib:all_files", "//tensorflow/contrib/resampler:all_files", @@ -342,6 +344,7 @@ filegroup( "//tensorflow/contrib/staging:all_files", "//tensorflow/contrib/stat_summarizer:all_files", "//tensorflow/contrib/stateless:all_files", + "//tensorflow/contrib/summary:all_files", "//tensorflow/contrib/tensor_forest:all_files", "//tensorflow/contrib/tensor_forest/hybrid:all_files", "//tensorflow/contrib/tensor_forest/kernels/v4:all_files", diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 604dfab148b..1822e235eba 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -45,8 +45,13 @@ tf_cuda_library( tf_cuda_library( name = "c_api", - srcs = ["c_api.cc"], - hdrs = ["c_api.h"], + srcs = [ + "c_api.cc", + "c_api_function.cc", + ], + hdrs = [ + "c_api.h", + ], copts = tf_copts(), visibility = ["//visibility:public"], deps = select({ @@ -157,6 +162,21 @@ tf_cc_test( ], ) +tf_cc_test( + name = "c_api_function_test", + size = "small", + srcs = ["c_api_function_test.cc"], + deps = [ + ":c_api", + ":c_test_util", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + tf_cc_test( name = "while_loop_test", size = "small", diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 07c8277a6f2..c454c94249b 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -165,22 +165,6 @@ void deallocate_buffer(void* data, size_t len, void* arg) { tensorflow::cpu_allocator()->DeallocateRaw(data); } -Status MessageToBuffer(const tensorflow::protobuf::Message& in, - TF_Buffer* out) { - if (out->data != nullptr) { - return InvalidArgument("Passing non-empty TF_Buffer is invalid."); - } - const auto proto_size = in.ByteSizeLong(); - void* buf = tensorflow::port::Malloc(proto_size); - in.SerializeToArray(buf, proto_size); - out->data = buf; - out->length = proto_size; - out->data_deallocator = [](void* data, 
size_t length) { - tensorflow::port::Free(data); - }; - return Status::OK(); -} - } // namespace TF_Tensor::~TF_Tensor() { buffer->Unref(); } @@ -559,6 +543,27 @@ TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src, dimvec.size(), base, size, DeleteArray, base); } +Status MessageToBuffer(const tensorflow::protobuf::Message& in, + TF_Buffer* out) { + if (out->data != nullptr) { + return InvalidArgument("Passing non-empty TF_Buffer is invalid."); + } + const size_t proto_size = in.ByteSizeLong(); + void* buf = tensorflow::port::Malloc(proto_size); + if (buf == nullptr) { + return tensorflow::errors::ResourceExhausted( + "Failed to allocate memory to serialize message of type '", + in.GetTypeName(), "' and size ", proto_size); + } + in.SerializeToArray(buf, proto_size); + out->data = buf; + out->length = proto_size; + out->data_deallocator = [](void* data, size_t length) { + tensorflow::port::Free(data); + }; + return Status::OK(); +} + // Helpers for loading a TensorFlow plugin (a .so file). Status LoadLibrary(const char* library_filename, void** result, const void** buf, size_t* len); diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h index 43b50780137..ee110d88cea 100644 --- a/tensorflow/c/c_api.h +++ b/tensorflow/c/c_api.h @@ -357,6 +357,14 @@ typedef struct TF_Output { int index; // The index of the output within oper. } TF_Output; +// TF_Function is a grouping of operations with defined inputs and outputs. +// Once created and added to graphs, functions can be invoked by creating an +// operation whose operation type matches the function name. +typedef struct TF_Function TF_Function; + +// Function definition options. TODO(iga): Define and implement +typedef struct TF_FunctionOptions TF_FunctionOptions; + // Sets the shape of the Tensor referenced by `output` in `graph` to // the shape described by `dims` and `num_dims`. // @@ -914,6 +922,15 @@ TF_CAPI_EXPORT extern void TF_GraphImportGraphDef( TF_Graph* graph, const TF_Buffer* graph_def, const TF_ImportGraphDefOptions* options, TF_Status* status); +// Add `function` to graph `g`. Once `function` is added to `g`, +// it can be called by creating an operation using the function's name. +// +// If successful, status is set to OK and function is added to g +// Otherwise, status is set to the encountered error and g is unmodified +TF_CAPI_EXPORT extern void TF_GraphAddFunction(TF_Graph* g, + const TF_Function* function, + TF_Status* status); + // Note: The following function may fail on very large protos in the future. TF_CAPI_EXPORT extern void TF_OperationToNodeDef(TF_Operation* oper, @@ -1001,6 +1018,105 @@ TF_CAPI_EXPORT void TF_AddGradients(TF_Graph* g, TF_Output* y, int ny, TF_Output* x, int nx, TF_Output* dx, TF_Status* status, TF_Output* dy); +// Create a TF_Function from a TF_Graph +// +// Params: +// fn_body - the graph whose operations (or subset of whose operations) will be +// converted to TF_Function. +// fn_name - the name of the new TF_Function. Should match the operation +// name (OpDef.name) regexp [A-Z][A-Za-z0-9_.\\-/]* and be distinct +// from other operation names (at least those registered in graphs +// where this function will be used). +// TODO(iga): Allow null in here and have C API come up with +// a unique name with high probability (similarly to +// _create_hash_str in function.py) +// num_opers - `num_opers` contains the number of elements in the `opers` array +// or a special value of -1 meaning that no array is given. 
+// The distinction between an empty array of operations and no +// array of operations is necessary to distinguish the case of +// creating a function with no body (e.g. identity or permutation) +// and the case of creating a function whose body contains all +// the nodes in the graph (except for the automatic skipping, see +// below). +// opers - Array of operations to become the body of the function or null. +// - If no array is given (`num_opers` = -1), all the +// operations in `fn_body` will become part of the function +// except operations referenced in `inputs`. These operations +// must have a single output (these operations are typically +// placeholders created for the sole purpose of representing +// an input. We can relax this constraint if there are +// compelling use cases). +// - If an array is given (`num_opers` >= 0), all operations +// in it will become part of the function. In particular, no +// automatic skipping of dummy input operations is performed. +// ninputs - number of elements in `inputs` array +// inputs - array of TF_Outputs that specify the inputs to the function. +// If `ninputs` is zero (the function takes no inputs), `inputs` +// can be null. The names used for function inputs are normalized +// names of the operations (usually placeholders) pointed to by +// `inputs`. These operation names should start with a letter. +// Normalization will convert all letters to lowercase and +// non-alphanumeric characters to '_' to make resulting names match +// the "[a-z][a-z0-9_]*" pattern for operation argument names. +// `inputs` cannot contain the same tensor twice. +// noutputs - number of elements in `outputs` array +// outputs - array of TF_Outputs that specify the outputs of the function. +// If `noutputs` is zero (the function returns no outputs), `outputs` +// can be null. `outputs` can contain the same tensor more than once. +// output_names - The names of the function's outputs. `output_names` array +// must either have the same length as `outputs` +// (i.e. `noutputs`) or be null. In the former case, +// the names should match the regular expression for ArgDef +// names - "[a-z][a-z0-9_]*". In the latter case, +// names for outputs will be generated automatically. +// opts - various options for the function, e.g. XLA's inlining control. +// status - Set to OK on success and an appropriate error on failure. +// +// Note that when the same TF_Output is listed as both an input and an output, +// the corresponding function's output will equal to this input, +// instead of the original node's output. +// +// Callers must also satisfy the following constraints: +// - `inputs` cannot refer to TF_Outputs within a control flow context. For +// example, one cannot use the output of "switch" node as input. +// - No TF_Output of a function (inside any of `inputs`, `outputs`, `fn_body`) +// is allowed to have a reference type. Reference types are not exposed +// through C API and are being deprecated. +// - Every node in the function's body must have all of its inputs (including +// control inputs). In other words, for every node in the body, each input +// must be either listed in `inputs` or must come from another node in +// the body. In particular, it is an error to have a control edge going from +// a node outside of the body into a node in the body. This applies to control +// edges going from nodes referenced in `inputs` to nodes in the body when +// the former nodes are not in the body (automatically skipped or not +// included in explicitly specified body). 
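+//
+// To illustrate how these parameters fit together, a rough sketch (not part
+// of the API contract; `g`, `host_graph`, `ph_op`, `neg_op` and `status` are
+// assumed to already exist, and error checking is elided) might look like:
+//
+//   TF_Output inputs[] = {{ph_op, 0}};
+//   TF_Output outputs[] = {{neg_op, 0}};
+//   TF_Function* fn =
+//       TF_GraphToFunction(g, "NegFn", -1, nullptr, 1, inputs, 1, outputs,
+//                          nullptr, /*opts=*/nullptr, status);
+//   TF_GraphAddFunction(host_graph, fn, status);
+//   TF_DeleteFunction(fn);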
+// +// Returns: +// On successful, a newly created TF_Function instance. It must be deleted by +// calling TF_DeleteFunction. +// +// On failure, null. +// +// TODO(iga): Add input_names argument and get output_names working (they are +// currently ignored) +TF_CAPI_EXPORT extern TF_Function* TF_GraphToFunction( + const TF_Graph* fn_body, const char* fn_name, int num_opers, + const TF_Operation* const* opers, int ninputs, const TF_Output* inputs, + int noutputs, const TF_Output* outputs, const char* const* output_names, + const TF_FunctionOptions* opts, TF_Status* status); + +// Write out a serialized representation of `func` (as a FunctionDef protocol +// message) to `output_func_def` (allocated by TF_NewBuffer()). +// `output_func_def`'s underlying buffer will be freed when TF_DeleteBuffer() +// is called. +// +// May fail on very large graphs in the future. +TF_CAPI_EXPORT extern void TF_FunctionToFunctionDef(TF_Function* func, + TF_Buffer* output_func_def, + TF_Status* status); + +TF_CAPI_EXPORT extern void TF_DeleteFunction(TF_Function*); + // TODO(josh11b): Register OpDef, available to all operations added // to this graph. diff --git a/tensorflow/c/c_api_function.cc b/tensorflow/c/c_api_function.cc new file mode 100644 index 00000000000..b4c6397d0b4 --- /dev/null +++ b/tensorflow/c/c_api_function.cc @@ -0,0 +1,496 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/c_api_internal.h" + +#include +#include +#include + +#include "tensorflow/core/framework/attr_value_util.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/strings/strcat.h" + +namespace tensorflow { +namespace { + +// Class that maintains a one-to-one original node name -> new node name +// mapping. We normalize the names used as input and output arguments to match +// regexp "[a-z][a-z0-9_]*" specified in definition of ArgDef.name. +// Once we rename them, we risk creating a name collision with the other +// node names, so if necessary we add a suffix to make +// names unique. If we have an input named "A" and a node in the function +// body named "a", they will be renamed to "a" and "a_0". +class NodeNameMapping { + public: + NodeNameMapping() = default; + + // Normalize the input/output name and make it unique. + string GetIOName(const string& name); + + // Make the node name unique. + string Uniquify(const string& name); + + // Look up how a node name was previously normalized/uniquified. + // Returns empty if name was never seen. 
+ string Lookup(const string& name) const; + + private: + string UniquifyHelper(const string& name) const; + static string Normalize(string name); + + // The normalized/uniquified names already used as + // input names (in signature), output names (in signature), and node names + // (in node_def). + // This is a superset of values in name_mapping_. + std::unordered_set used_names_; + // Mapping from original node name from the graph to the normalized + // and uniqified version of it. + std::unordered_map name_mapping_; +}; + +string NodeNameMapping::Normalize(string name) { + // Convert letters to lowercase and non-alphanumeric characters to '_'. + if (name.empty()) return "unknown"; + const int n = name.size(); + for (int i = 0; i < n; ++i) { + char c = name[i]; + if (isalnum(c)) { + if (isupper(c)) { + name[i] = tolower(c); + } + } else { + name[i] = '_'; + } + } + + // Find the first letter and start with it. + int i = 0; + for (; i < n; ++i) { + if (isalpha(name[i])) break; + } + + // Return "unknown" if none of the name's chars were letters. + return i == n ? "unknown" : name.substr(i); +} + +string NodeNameMapping::UniquifyHelper(const string& name) const { + // If the name hasn't been used yet, use it as-is. + if (used_names_.find(name) == used_names_.end()) return name; + // Add a suffix to name to make it unique. + for (int i = 0;; ++i) { + const string candidate = strings::StrCat(name, "_", i); + if (used_names_.find(candidate) == used_names_.end()) return candidate; + } +} + +string NodeNameMapping::GetIOName(const string& name) { + const string& input_name = UniquifyHelper(Normalize(name)); + // Record that we used this name, but don't add it to name_mapping_ + // since this name is not for a node. + used_names_.insert(input_name); + return input_name; +} + +string NodeNameMapping::Uniquify(const string& name) { + const string uniqued = UniquifyHelper(name); + name_mapping_[name] = uniqued; + used_names_.insert(uniqued); + return uniqued; +} + +string NodeNameMapping::Lookup(const string& name) const { + const auto iter = name_mapping_.find(name); + if (iter == name_mapping_.end()) return string(); + return iter->second; +} + +Status ValidateNoRefOutputs(const Node* node) { + for (int i = 0; i < node->num_outputs(); ++i) { + const DataType& dt = node->output_type(i); + if (IsRefType(dt)) { + return errors::InvalidArgument("Output ", i, " of node '", node->name(), + "' has a reference " + "type ", + DataTypeString(dt)); + } + } + return Status::OK(); +} + +Status FillFunctionBody( + const string& fn_name, const NodeNameMapping& node_names, + const std::vector& body_nodes, + const std::unordered_map& tensor_renaming, + FunctionDef* fdef) { + std::vector in_edges; + std::vector control_edges; + for (const Node* node : body_nodes) { + NodeDef* node_def = fdef->add_node_def(); + // First, copy the node_def as is. We will patch it next. + *node_def = node->def(); + if (!node->assigned_device_name().empty()) { + node_def->set_device(node->assigned_device_name()); + } + node_def->set_name(node_names.Lookup(node->name())); + + // Input names must be set based on nested names in tensor_renaming. + // Clear the flat input names we got from the original node_def + // from the graph. + node_def->clear_input(); + + // Collect regular and control inputs. Regular inputs are indexed + // by the index at which they come into the `node`. Control inputs + // don't follow any order. 
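+ // For example (illustrative): a node consuming a:0 and b:1, with a control
+ // edge from c, ends up with in_edges = [edge(a:0), edge(b:1)] and
+ // control_edges = [edge(c)].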
+ in_edges.clear(); + in_edges.resize(node->num_inputs(), nullptr); + control_edges.clear(); + for (const Edge* edge : node->in_edges()) { + if (edge->src()->IsSource()) continue; + if (edge->IsControlEdge()) { + control_edges.push_back(edge); + } else { + in_edges[edge->dst_input()] = edge; + } + } + + // Add regular inputs. + for (size_t i = 0; i < in_edges.size(); ++i) { + const Edge* edge = in_edges[i]; + string original_input_name; + if (edge == nullptr) { + // A backedge might not appear as a regular Edge, but be only present + // in the node_def. Such edges are referred to as requested_inputs(). + if (i >= node->requested_inputs().size()) { + return errors::InvalidArgument( + "Graph to be converted to function appears to be malformed. ", + "Node ", node->name(), " is missing input edge ", i); + } + original_input_name = + ParseTensorName(node->requested_inputs()[i]).ToString(); + } else { + original_input_name = + strings::StrCat(edge->src()->name(), ":", edge->src_output()); + } + + const auto iter = tensor_renaming.find(original_input_name); + if (iter == tensor_renaming.end()) { + return errors::InvalidArgument( + "Input ", i, ", '", original_input_name, "', of node '", + node->name(), "' in function '", fn_name, + "' is not available. You might need to include it in inputs " + "or include its source node in the body"); + } + node_def->add_input(iter->second); + } + + // Add control inputs. + for (const Edge* edge : control_edges) { + // Add this control input only if the src node is in the body. + const string normalized = node_names.Lookup(edge->src()->name()); + // If we did not find a name for the source of control edge, this + // source must be outside of the body. Raise an error. + if (normalized.empty()) { + return errors::InvalidArgument( + "The source of control edge ", edge->DebugString(), + " is not in the body. Encountered while creating function '", + fn_name, "'"); + } + node_def->add_input(strings::StrCat("^", normalized)); + } + } + return Status::OK(); +} + +// Graph to FunctionDef conversion. This code is closely modeled on the Python +// code in third_party/tensorflow/python/framework/function.py. +Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name, + const std::vector& body_nodes, + const std::vector& inputs, + const std::vector& outputs, + const std::vector& output_names, + FunctionDef* fdef) { + fdef->mutable_signature()->set_name(fn_name); + + // Keep track of names we used and how we normalized them. + NodeNameMapping node_names; + + // Mapping from original names of tensors (i.e. ":") to the + // name we used in the function: + // - For input tensors: + // {flat_tensor_name -> normalized_name_of_src_node} + // e.g. {In:3 -> in} + // - For tensors produced by nodes in function's body: + // {flat_tensor_name -> nested_tensor_name} + // e.g. {Add:3 -> add_0:z:1} + std::unordered_map tensor_renaming; + + // Fill inputs in function's signature. + for (size_t i = 0; i < inputs.size(); ++i) { + const Node* node = inputs[i].node; + int idx = inputs[i].index; + OpDef::ArgDef* argdef = fdef->mutable_signature()->add_input_arg(); + argdef->set_type(node->output_type(idx)); + const string& input_name = node_names.GetIOName(node->name()); + argdef->set_name(input_name); + tensor_renaming[strings::StrCat(node->name(), ":", idx)] = input_name; + } + + // Fill outputs in function's signature. 
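+ // Output names are normalized and uniquified the same way as input names,
+ // so (illustratively) listing the same output of a node named "add" twice
+ // produces output args named "add" and "add_0".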
+ for (size_t i = 0; i < outputs.size(); ++i) { + const Node* node = outputs[i].node; + int idx = outputs[i].index; + OpDef::ArgDef* argdef = fdef->mutable_signature()->add_output_arg(); + argdef->set_type(node->output_type(idx)); + argdef->set_name(node_names.GetIOName(node->name())); + } + + // Populate tensor_renaming and node_names. + // Generate the new output names for every node in the function. + // The NodeDefs in FunctionDefs use a different naming scheme for + // their inputs than the NodeDefs in a graph (see the comment for + // FunctionDef.node_def in function.proto). We do the + // graph tensor name -> function tensor name conversion for every + // possible input (i.e. every node's outputs) and store the result + // in tensor_renaming. + for (const Node* node : body_nodes) { + // Make sure node_name does not collide with an input or output name. + const string& node_name = node_names.Uniquify(node->name()); + // For each output_arg in the op_def, the output_ranges + // map will have [start, end] range of indices that this arg produces + // among all the output tensors of this op. + NameRangeMap output_ranges; + TF_RETURN_IF_ERROR( + NameRangesForNode(*node, node->op_def(), nullptr, &output_ranges)); + for (const auto& output : output_ranges) { + const string& output_name = output.first; + int index_start = output.second.first; + int index_end = output.second.second; + for (int i = index_start; i < index_end; ++i) { + const string& original_name = strings::StrCat(node->name(), ":", i); + const string& new_name = + strings::StrCat(node_name, ":", output_name, ":", i - index_start); + // Record the mapping if this tensor is not already mapped. + // Tensor can be already mapped if it is used as an input. + if (tensor_renaming.find(original_name) == tensor_renaming.end()) { + tensor_renaming[original_name] = new_name; + } + } + } + } + + TF_RETURN_IF_ERROR( + FillFunctionBody(fn_name, node_names, body_nodes, tensor_renaming, fdef)); + + // Remap return values. + for (int r = 0; r < fdef->signature().output_arg_size(); ++r) { + const string& ret_name = fdef->signature().output_arg(r).name(); + + // We convert this flat tensor name to the nested value + // (e.g. `add:z:1`) that we stored in tensor_renaming. + const string& return_value = + strings::StrCat(outputs[r].node->name(), ":", outputs[r].index); + const auto iter = tensor_renaming.find(return_value); + if (iter == tensor_renaming.end()) { + return errors::InvalidArgument( + "TF_Output ", return_value, " is neither in the function body ", + "nor among function inputs. Encountered while creating function '", + fn_name, "'"); + } + (*fdef->mutable_ret())[ret_name] = iter->second; + } + + return Status::OK(); +} + +// Converts `ninputs` and `inputs` into `inputs_tensors` and `input_nodes` and +// does various checks while doing so. `input_nodes` will contain the same +// information as input_tensors just in a different structure to make +// following processing easier. TODO(iga): Simplify this nested structure. 
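+// For example (illustrative): inputs [{n, 0}, {m, 1}, {n, 2}] yield
+// input_tensors = [(n, 0), (m, 1), (n, 2)] and
+// input_nodes = {n: [0, 2], m: [1]}.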
+Status ProcessInputs( + const TF_Graph* fn_body, const char* fn_name, int ninputs, + const TF_Output* inputs, std::vector* input_tensors, + std::unordered_map>* input_nodes) + EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) { + input_tensors->reserve(ninputs); + for (int i = 0; i < ninputs; ++i) { + const Node& node = inputs[i].oper->node; + int idx = inputs[i].index; + + TF_RETURN_WITH_CONTEXT_IF_ERROR( + fn_body->graph.IsValidOutputTensor(&node, idx), + "Encountered while processing input ", i, " into function '", fn_name, + "'"); + TF_RETURN_WITH_CONTEXT_IF_ERROR(ValidateNoRefOutputs(&node), + "Encountered while processing input ", i, + " into function '", fn_name, "'"); + + input_tensors->emplace_back(&node, idx); + + const auto& iter = input_nodes->find(&node); + if (iter == input_nodes->end()) { + input_nodes->insert({&node, {idx}}); + } else { + auto& indices = iter->second; + if (std::find(indices.begin(), indices.end(), idx) != indices.end()) { + return errors::InvalidArgument( + "TF_Output ", node.name(), ":", idx, + " appears more than once in the input list"); + } + indices.push_back(idx); + } + } + return Status::OK(); +} + +// Converts `noutputs` and `outputs` into `outputs_tensors` and does various +// checks while doing so. +Status ProcessOutputs(const TF_Graph* fn_body, const char* fn_name, + int noutputs, const TF_Output* outputs, + std::vector* output_tensors) + EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) { + output_tensors->reserve(noutputs); + for (int i = 0; i < noutputs; ++i) { + const Node& node = outputs[i].oper->node; + int idx = outputs[i].index; + TF_RETURN_WITH_CONTEXT_IF_ERROR( + fn_body->graph.IsValidOutputTensor(&node, idx), + "Encountered while processing output ", i, " from function '", fn_name, + "'"); + output_tensors->emplace_back(&node, idx); + } + return Status::OK(); +} + +// Populates `body_nodes` with the nodes that will become function's body. +// Performs various checks. +Status ComputeBodyNodes( + const TF_Graph* fn_body, const char* fn_name, int num_opers, + const TF_Operation* const* opers, + const std::unordered_map>& input_nodes, + std::vector* body_nodes) + EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) { + if (num_opers == -1) { + for (const Node* node : fn_body->graph.op_nodes()) { + const auto& iter = input_nodes.find(node); + if (iter == input_nodes.end()) { + // This node is not referenced in inputs. Add it to the body. + TF_RETURN_WITH_CONTEXT_IF_ERROR(ValidateNoRefOutputs(node), + "Encountered while creating function '", + fn_name, "'"); + body_nodes->push_back(node); + } else { + // This node is referenced in inputs. Currently, we place an + // artificial restriction and require that when num_opers=-1, such + // nodes must have a single output. + if (node->num_outputs() != 1) { + return errors::InvalidArgument( + "When `num_opers` is set to -1, nodes referenced in `inputs` " + "must have a single output. Node ", + node->name(), " has ", node->num_outputs(), + " outputs. 
Encountered while creating function '", fn_name, "'"); + } + } + } + } else { + body_nodes->reserve(num_opers); + for (int i = 0; i < num_opers; ++i) { + const Node* node = &opers[i]->node; + TF_RETURN_WITH_CONTEXT_IF_ERROR(ValidateNoRefOutputs(node), + "Encountered while creating function '", + fn_name, "'"); + body_nodes->push_back(node); + } + } + return Status::OK(); +} + +} // anonymous namespace +} // namespace tensorflow + +using tensorflow::Node; +using tensorflow::string; + +TF_Function* TF_GraphToFunction(const TF_Graph* fn_body, const char* fn_name, + int num_opers, const TF_Operation* const* opers, + int ninputs, const TF_Output* inputs, + int noutputs, const TF_Output* outputs, + const char* const* output_names, + const TF_FunctionOptions* opts, + TF_Status* status) { + tensorflow::mutex_lock l(*const_cast(&fn_body->mu)); + + // Process inputs. + std::vector input_tensors; + std::unordered_map> input_nodes; + status->status = tensorflow::ProcessInputs(fn_body, fn_name, ninputs, inputs, + &input_tensors, &input_nodes); + if (!status->status.ok()) return nullptr; + + // Process outputs. + std::vector output_tensors; + status->status = tensorflow::ProcessOutputs(fn_body, fn_name, noutputs, + outputs, &output_tensors); + if (!status->status.ok()) return nullptr; + + // Process output names. + std::vector output_names_vec; + if (output_names) { + output_names_vec.reserve(noutputs); + for (int i = 0; i < noutputs; ++i) { + output_names_vec.push_back(string(output_names[i])); + } + } + + // Compute body nodes. + std::vector body_nodes; + status->status = tensorflow::ComputeBodyNodes( + fn_body, fn_name, num_opers, opers, input_nodes, &body_nodes); + if (!status->status.ok()) return nullptr; + + // Do the actual function creation. + TF_Function* tf_function = new TF_Function(); + status->status = tensorflow::GraphToFunctionDef( + fn_body->graph, fn_name, body_nodes, input_tensors, output_tensors, + output_names_vec, tf_function->fdef_lib.add_function()); + if (!status->status.ok()) { + TF_DeleteFunction(tf_function); + return nullptr; + } + return tf_function; +} + +void TF_GraphAddFunction(TF_Graph* g, const TF_Function* function, + TF_Status* status) { + tensorflow::mutex_lock l(g->mu); + + // At the moment, we have only one function and no gradients in fdef_lib. + // This makes the following operation atomic. + // TODO(iga): Add an atomic version of AddFunctionLibrary when we support + // gradients + status->status = g->graph.AddFunctionLibrary(function->fdef_lib); +} + +void TF_FunctionToFunctionDef(TF_Function* func, TF_Buffer* output_func_def, + TF_Status* status) { + DCHECK_EQ(1, func->fdef_lib.function_size()); + status->status = MessageToBuffer(func->fdef_lib.function(0), output_func_def); +} + +void TF_DeleteFunction(TF_Function* function) { delete function; } diff --git a/tensorflow/c/c_api_function_test.cc b/tensorflow/c/c_api_function_test.cc new file mode 100644 index 00000000000..c9dd38ea15f --- /dev/null +++ b/tensorflow/c/c_api_function_test.cc @@ -0,0 +1,1039 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/c_api.h" + +#include "tensorflow/c/c_test_util.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +// Specification for expected input/output and its type. +// DataType value of DT_INVALID signifies that we don't want to +// check the data type. +typedef std::pair IOSpec; + +std::vector M(const std::initializer_list& names) { + std::vector v; + for (const string& name : names) { + v.push_back(IOSpec(name, DT_INVALID)); + } + return v; +} + +// Specification for an expected edge. +// src is either: +// - input name (as it appears in FunctionDef) +// - name of output tensor (in nested "add:z:0" format) +// dst is either: +// - output name (as it appears in FunctionDef) +// - : (this looks the same as +// output tensor naming, but it the index is actually an input index) +struct EdgeSpec : public std::pair { + typedef std::pair Base; + + // Inherit the set of constructors + using Base::pair; + + string ToString() const { return strings::StrCat(first, "->", second); } +}; + +class CApiFunctionTest : public ::testing::Test { + protected: + CApiFunctionTest() + : s_(TF_NewStatus()), + func_graph_(TF_NewGraph()), + host_graph_(TF_NewGraph()), + func_(nullptr) {} + + void SetUp() override {} + + ~CApiFunctionTest() override { + TF_DeleteFunction(func_); + TF_DeleteGraph(host_graph_); + TF_DeleteGraph(func_graph_); + TF_DeleteStatus(s_); + } + + void Run(const std::vector>& inputs, + TF_Operation* output, int32_t expected_result) { + Run(inputs, {{output, 0}}, {expected_result}); + } + + // Run the host graph, which now contains a function and check that + // outputs are as expected. + // 'T' stands for 'tensor' since the outputs are tensors, not scalars. + void RunT(const std::vector>& inputs, + std::initializer_list outputs, + const std::vector>& expected_results) { + // Create a session for this graph + CSession csession(host_graph_, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + // Run + csession.SetInputs(inputs); + csession.SetOutputs(outputs); + csession.Run(s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + // Check results + for (int i = 0; i < expected_results.size(); ++i) { + TF_Tensor* out = csession.output_tensor(i); + ASSERT_TRUE(out != nullptr); + EXPECT_EQ(TF_INT32, TF_TensorType(out)); + EXPECT_EQ(1, TF_NumDims(out)); + CompareInt32Tensor(expected_results[i], out); + } + } + + // Run the host graph, which now contains a function and check that + // outputs are as expected. + void Run(const std::vector>& inputs, + std::initializer_list outputs, + const std::vector& expected_results) { + // Create a session for this graph. 
+ CSession csession(host_graph_, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + csession.SetInputs(inputs); + csession.SetOutputs(outputs); + csession.Run(s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + for (int i = 0; i < expected_results.size(); ++i) { + TF_Tensor* out = csession.output_tensor(i); + ASSERT_TRUE(out != nullptr); + EXPECT_EQ(TF_INT32, TF_TensorType(out)); + EXPECT_EQ(0, TF_NumDims(out)); // scalar + ASSERT_EQ(sizeof(int32_t), TF_TensorByteSize(out)); + int32_t* output_contents = static_cast(TF_TensorData(out)); + EXPECT_EQ(expected_results[i], *output_contents); + } + } + + void CompareInt32Tensor(const std::vector& expected, TF_Tensor* t) { + int32_t* data = static_cast(TF_TensorData(t)); + size_t size = TF_TensorByteSize(t); + ASSERT_EQ(expected.size() * sizeof(int32_t), size); + for (int i = 0; i < expected.size(); ++i) { + ASSERT_EQ(expected[i], data[i]) << "Different data at index " << i; + } + } + + std::vector ToOutput(const std::vector ops) { + std::vector out; + for (auto op : ops) { + out.push_back({op, 0}); + } + return out; + } + + void Define(int num_opers, const std::vector& opers, + const std::vector& inputs, + const std::vector& outputs, + const char** output_names, bool expect_failure = false) { + DefineT(num_opers, opers, ToOutput(inputs), ToOutput(outputs), output_names, + expect_failure); + } + + // An explicit `num_opers` is needed so that we can distinguish between the + // case of no operations specified (-1) and the case of an empty set of + // operations specified (0). + void DefineT(int num_opers, const std::vector& opers, + const std::vector& inputs, + const std::vector& outputs, const char** output_names, + bool expect_failure = false) { + ASSERT_EQ(func_, nullptr); + func_ = TF_GraphToFunction(func_graph_, func_name_, num_opers, + num_opers == -1 ? nullptr : opers.data(), + inputs.size(), inputs.data(), outputs.size(), + outputs.data(), output_names, + /*opts=*/nullptr, s_); + if (expect_failure) { + ASSERT_EQ(func_, nullptr); + return; + } + + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + ASSERT_NE(func_, nullptr); + TF_GraphAddFunction(host_graph_, func_, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + } + + TF_Operation* Use(const std::vector& inputs) { + return UseT(ToOutput(inputs)); + } + + TF_Operation* UseT(const std::vector& inputs) { + TF_Operation* op; + UseHelper(inputs, &op); + return op; + } + + // All the *Helper methods are used as a workaround for the restrictions that + // one cannot call ASSERT_* methods in non-void-returning functions (when + // exceptions are disabled during compilation) + void UseHelper(const std::vector& inputs, TF_Operation** op) { + TF_OperationDescription* desc = + TF_NewOperation(host_graph_, func_name_, func_node_name_); + for (auto input : inputs) { + TF_AddInput(desc, input); + } + // Set device to CPU because some ops inside the function might not be + // available on GPU. 
+ TF_SetDevice(desc, "/cpu:0"); + *op = TF_FinishOperation(desc, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + ASSERT_NE(*op, nullptr); + } + + FunctionDef fdef() { + tensorflow::FunctionDef fdef; + EXPECT_TRUE(GetFunctionDef(func_, &fdef)); + return fdef; + } + + // logging utility + template + string ToString(const Container& v) { + std::stringstream ss; + ss << "{"; + size_t i = 0; + for (const auto& e : v) { + if (i != 0) { + ss << ", "; + } + ss << e.ToString(); + ++i; + } + ss << "}"; + return ss.str(); + } + + void VerifyFDefNodes(const tensorflow::FunctionDef& fdef, + const std::unordered_set& nodes) { + ASSERT_EQ(nodes.size(), fdef.node_def_size()) + << "Got unexpected number of nodes. Expected: [" + << str_util::Join(nodes, ", ") + << "] Actual nodes in fdef: " << fdef.DebugString(); + for (const NodeDef& node_def : fdef.node_def()) { + ASSERT_TRUE(nodes.find(node_def.name()) != nodes.end()) + << "Got unexpected node: " << node_def.name() + << " in fdef: " << fdef.DebugString(); + } + } + + void VerifyFDefInputs(const tensorflow::FunctionDef& fdef, + const std::vector& inputs) { + const OpDef& signature = fdef.signature(); + ASSERT_EQ(inputs.size(), signature.input_arg_size()); + for (int i = 0; i < inputs.size(); ++i) { + const OpDef::ArgDef& arg = signature.input_arg(i); + const IOSpec& in = inputs[i]; + if (in.second != DT_INVALID) { + ASSERT_EQ(arg.type(), in.second) + << "Got unexpected type for input " << i + << ". fdef: " << fdef.DebugString(); + } + ASSERT_EQ(arg.name(), in.first) << "Got unexpected name for input " << i + << ". fdef: " << fdef.DebugString(); + } + } + + void VerifyFDefOutputs(const tensorflow::FunctionDef& fdef, + const std::vector& outputs) { + const OpDef& signature = fdef.signature(); + ASSERT_EQ(outputs.size(), signature.output_arg_size()); + for (int i = 0; i < outputs.size(); ++i) { + const OpDef::ArgDef& arg = signature.output_arg(i); + const IOSpec& out = outputs[i]; + if (out.second != DT_INVALID) { + ASSERT_EQ(arg.type(), out.second) + << "Got unexpected type for output " << i + << ". fdef: " << fdef.DebugString(); + } + ASSERT_EQ(arg.name(), out.first) << "Got unexpected name for output " << i + << ". fdef: " << fdef.DebugString(); + } + } + + void VerifyFDefEdges( + const tensorflow::FunctionDef& fdef, + const std::vector& e_edges, // expected edges + const std::vector& c_edges, // expected ctrl edges + bool is_exact_edges = true) { + // Build a set of edges from fdef + std::set a_edges; // actual edges + // Get edges from inputs to body nodes and between body nodes + for (const NodeDef& node_def : fdef.node_def()) { + for (int i = 0; i < node_def.input_size(); ++i) { + const string& in = node_def.input(i); + const auto& v = + a_edges.insert({in, strings::StrCat(node_def.name(), ":", i)}); + ASSERT_TRUE(v.second) << "Duplicate edge " << in << " -> " + << strings::StrCat(node_def.name(), ":", i) + << ". fdef: " << fdef.DebugString(); + } + } + // Get edges from body nodes to outputs and from inputs to outputs + for (const OpDef::ArgDef& arg : fdef.signature().output_arg()) { + const auto& iter = fdef.ret().find(arg.name()); + if (iter != fdef.ret().end()) { + const auto& v = a_edges.insert({iter->second, arg.name()}); + ASSERT_TRUE(v.second) << "Duplicate edge " << iter->second << " -> " + << arg.name() << ". fdef: " << fdef.DebugString(); + } else { + const auto& v = a_edges.insert({arg.name(), arg.name()}); + ASSERT_TRUE(v.second) << "Duplicate edge " << arg.name() << " -> " + << arg.name() << ". 
fdef: " << fdef.DebugString(); + } + } + + // Verify edges + for (const EdgeSpec& e : e_edges) { + ASSERT_TRUE(a_edges.find(e) != a_edges.end()) + << "Failed to find expected edge " << e.ToString() + << " in fdef: " << fdef.DebugString(); + } + + // If caller specified all edges, check that we have seen all + if (is_exact_edges) { + ASSERT_EQ(e_edges.size() + c_edges.size(), a_edges.size()) + << "Expected edges: " << ToString(e_edges) + << " Expected Control edges: " << ToString(c_edges) + << " Actual edges: " << ToString(a_edges) + << " in fdef: " << fdef.DebugString(); + } + } + + void VerifyFDef(const std::unordered_set& nodes, + const std::vector& inputs, + const std::vector& outputs, + const std::vector& e_edges, // expected edges + const std::vector& c_edges, // expected ctrl edges + bool is_exact_edges = true) { + tensorflow::FunctionDef fdef; + ASSERT_TRUE(GetFunctionDef(func_, &fdef)); + VerifyFDefNodes(fdef, nodes); + VerifyFDefInputs(fdef, inputs); + VerifyFDefOutputs(fdef, outputs); + VerifyFDefEdges(fdef, e_edges, c_edges, is_exact_edges); + } + + const char* func_name_ = "MyFunc"; + const char* func_node_name_ = "MyFunc_0"; + TF_Status* s_; + TF_Graph* func_graph_; + TF_Graph* host_graph_; + TF_Function* func_; + + // Workaround for not being able to initialize empty map using {} + std::unordered_set empty_; +}; + +TEST_F(CApiFunctionTest, OneOp_ZeroInputs_OneOutput) { + /* + * constant + * | + * v + */ + // Define + TF_Operation* c = ScalarConst(10, func_graph_, s_, "scalar10"); + Define(-1, {}, {}, {c}, nullptr); + + // Use, run, and verify + TF_Operation* func_op = Use({}); + Run({}, func_op, 10); + VerifyFDef({"scalar10_0"}, {}, {{"scalar10", DT_INT32}}, + {{"scalar10_0:output:0", "scalar10"}}, {}); +} + +TEST_F(CApiFunctionTest, OneOp_OneInput_OneOutput) { + /* + * | + * v + * negate + * | + * v + */ + // Define + TF_Operation* feed = Placeholder(func_graph_, s_); + TF_Operation* neg = Neg(feed, func_graph_, s_); + Define(-1, {}, {feed}, {neg}, nullptr); + + // Use, run, and verify + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({func_feed}); + Run({{func_feed, Int32Tensor(3)}}, func_op, -3); + VerifyFDef({"neg_0"}, {{"feed", DT_INT32}}, {{"neg", DT_INT32}}, + {{"feed", "neg_0:0"}, {"neg_0:y:0", "neg"}}, {}); +} + +TEST_F(CApiFunctionTest, ZeroOps_Identity) { + /* + * | + * | + * | + * v + */ + // Define + TF_Operation* feed = Placeholder(func_graph_, s_); + Define(-1, {}, {feed}, {feed}, nullptr); + + // Use, run, and verify + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({func_feed}); + Run({{func_feed, Int32Tensor(3)}}, func_op, 3); + VerifyFDef(empty_, {{"feed", DT_INT32}}, {{"feed_0", DT_INT32}}, + {{"feed", "feed_0"}}, {}); +} + +TEST_F(CApiFunctionTest, ZeroOps_Permutation) { + /* + * | | + * \ / + * \/ + * x + * /\ + * / \ + * | | + * v v + */ + // Define + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + Define(-1, {}, {feed1, feed2}, {feed2, feed1}, nullptr); + + // Use, run, and verify + TF_Operation* two = ScalarConst(2, host_graph_, s_); + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({two, func_feed}); + Run({{func_feed, Int32Tensor(3)}}, {{func_op, 0}, {func_op, 1}}, {3, 2}); + VerifyFDef(empty_, M({{"feed1"}, {"feed2"}}), M({{"feed2_0"}, {"feed1_0"}}), + {{"feed1", "feed1_0"}, {"feed2", "feed2_0"}}, {}); +} + +TEST_F(CApiFunctionTest, OneOp_TwoInputs_OneOutput) 
{ + /* + * | | + * v v + * add + * | + * v + */ + // Define + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* add = Add(feed1, feed2, func_graph_, s_); + Define(-1, {}, {feed1, feed2}, {add}, nullptr); + + // Use, run, and verify + TF_Operation* two = ScalarConst(2, host_graph_, s_); + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({two, func_feed}); + Run({{func_feed, Int32Tensor(3)}}, func_op, 2 + 3); + VerifyFDef( + {"add_0"}, M({{"feed1"}, {"feed2"}}), M({{"add"}}), + {{"feed1", "add_0:0"}, {"feed2", "add_0:1"}, {"add_0:sum:0", "add"}}, {}); +} + +TEST_F(CApiFunctionTest, OneOp_TwoInputs_ZeroOutputs) { + /* + * | | + * v v + * add + * + * (output ignored) + */ + // Define + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + Add(feed1, feed2, func_graph_, s_); + Define(-1, {}, {feed1, feed2}, {}, nullptr); + + // Use, run, and verify + TF_Operation* two = ScalarConst(2, host_graph_, s_); + TF_Operation* func_feed = Placeholder(host_graph_, s_); + Use({two, func_feed}); + VerifyFDef({"add"}, M({{"feed1"}, {"feed2"}}), {}, + {{"feed1", "add:0"}, {"feed2", "add:1"}}, {}); +} + +TEST_F(CApiFunctionTest, TwoOps_ThreeInputs_OneOutput) { + /* + * | | | + * v v / + * add1 / + * | | + * v v + * add2 + * | + * v + */ + // Define + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* feed3 = Placeholder(func_graph_, s_, "feed3"); + TF_Operation* add1 = Add(feed1, feed2, func_graph_, s_, "add1"); + TF_Operation* add2 = Add(add1, feed3, func_graph_, s_, "add2"); + Define(-1, {}, {feed1, feed2, feed3}, {add2}, nullptr); + + // Use, run, and verify + TF_Operation* two = ScalarConst(2, host_graph_, s_, "two"); + TF_Operation* ten = ScalarConst(10, host_graph_, s_, "ten"); + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({two, ten, func_feed}); + Run({{func_feed, Int32Tensor(3)}}, func_op, 2 + 10 + 3); + VerifyFDef({"add1", "add2_0"}, M({{"feed1"}, {"feed2"}, {"feed3"}}), + M({{"add2"}}), + {{"feed1", "add1:0"}, + {"feed2", "add1:1"}, + {"add1:sum:0", "add2_0:0"}, + {"feed3", "add2_0:1"}, + {"add2_0:sum:0", "add2"}}, + {}); +} + +TEST_F(CApiFunctionTest, OneOp_TwoInputs_TwoDuplicateOutputs) { + /* + * | | + * v v + * add + * | + * +-+-+ + * | | + * v v + */ + // Define + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* add = Add(feed1, feed2, func_graph_, s_); + Define(-1, {}, {feed1, feed2}, {add, add}, nullptr); + + // Use, run, and verify + TF_Operation* two = ScalarConst(2, host_graph_, s_); + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({two, func_feed}); + Run({{func_feed, Int32Tensor(3)}}, {{func_op, 0}, {func_op, 1}}, {5, 5}); + VerifyFDef({"add_1"}, M({{"feed1"}, {"feed2"}}), M({{"add"}, {"add_0"}}), + {{"feed1", "add_1:0"}, + {"feed2", "add_1:1"}, + {"add_1:sum:0", "add"}, + {"add_1:sum:0", "add_0"}}, + {}); +} + +TEST_F(CApiFunctionTest, TwoOps_ThreeInputs_TwoOutputs) { + /* + * | | | + * v v / + * add / + * | | + * +-+ | + * | | | + * | v v + * | add + * | | + * v v + */ + // Define + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* feed3 = 
Placeholder(func_graph_, s_, "feed3"); + TF_Operation* add1 = Add(feed1, feed2, func_graph_, s_, "add1"); + TF_Operation* add2 = Add(add1, feed3, func_graph_, s_, "add2"); + Define(-1, {}, {feed1, feed2, feed3}, {add1, add2}, nullptr); + + // Use, run, and verify + TF_Operation* two = ScalarConst(2, host_graph_, s_, "two"); + TF_Operation* ten = ScalarConst(10, host_graph_, s_, "ten"); + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({two, ten, func_feed}); + Run({{func_feed, Int32Tensor(3)}}, {{func_op, 0}, {func_op, 1}}, {12, 15}); + VerifyFDef({"add1_0", "add2_0"}, M({{"feed1"}, {"feed2"}, {"feed3"}}), + M({{"add1"}, {"add2"}}), + {{"feed1", "add1_0:0"}, + {"feed2", "add1_0:1"}, + {"add1_0:sum:0", "add2_0:0"}, + {"feed3", "add2_0:1"}, + {"add1_0:sum:0", "add1"}, + {"add2_0:sum:0", "add2"}}, + {}); +} + +TEST_F(CApiFunctionTest, FromSubsetOfOps) { + /* + * | | | + * v v / + * add / + * | | + * +---+--+---+ + * Ops used | | | | + * for func | v v | + * | | add | + * +-------> | | | + * | v | + * | | + * +----------+ + */ + // Define + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* feed3 = Placeholder(func_graph_, s_, "feed3"); + TF_Operation* add1 = Add(feed1, feed2, func_graph_, s_, "add1"); + TF_Operation* add2 = Add(add1, feed3, func_graph_, s_, "add2"); + Define(1, {add2}, {add1, feed3}, {add2}, nullptr); + + // Use, run, and verify + TF_Operation* two = ScalarConst(2, host_graph_, s_, "two"); + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({two, func_feed}); + Run({{func_feed, Int32Tensor(3)}}, func_op, 2 + 3); + VerifyFDef( + {"add2_0"}, M({{"add1"}, {"feed3"}}), M({{"add2"}}), + {{"add1", "add2_0:0"}, {"feed3", "add2_0:1"}, {"add2_0:sum:0", "add2"}}, + {}); +} + +TEST_F(CApiFunctionTest, UsingOneOutputOfSplit) { + /* + * feed + * | + * +---------+---+ + * | const0 | | + * | | | | + * | v / | + * | split | + * | | | | | + * | v | v | + * | | | + * +------+------+ + * | + * v + * + * Only the second output from split is used as function output + */ + // Define + TF_Operation* feed = Placeholder(func_graph_, s_); + TF_Operation* split = Split3(feed, func_graph_, s_); + DefineT(-1, {}, {{feed, 0}}, {{split, 1}}, nullptr); + + // Use, run, and verify + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({func_feed}); + RunT({{func_feed, Int32Tensor({1, 2, 3, 4, 5, 6})}}, {{func_op, 0}}, + {{3, 4}}); + VerifyFDef({"split3_const0", "split3_0"}, M({{"feed"}}), M({{"split3"}}), + {{"split3_const0:output:0", "split3_0:0"}, + {"feed", "split3_0:1"}, + {"split3_0:output:1", "split3"}}, + {}); +} + +TEST_F(CApiFunctionTest, UsingTwoOutputsOfSplit) { + /* + * feed + * | + * +---------+---+ + * | const0 | | + * | | | | + * | v / | + * | split | + * | | | | | + * | | v | | + * | | | | + * +---+-----+---+ + * | | + * v v + * + * Second output from split is not used as function output + */ + // Define + TF_Operation* feed = Placeholder(func_graph_, s_); + TF_Operation* split = Split3(feed, func_graph_, s_); + DefineT(-1, {}, {{feed, 0}}, {{split, 0}, {split, 2}}, nullptr); + + // Use, run, and verify + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({func_feed}); + RunT({{func_feed, Int32Tensor({1, 2, 3, 4, 5, 6})}}, + {{func_op, 0}, {func_op, 1}}, {{1, 2}, {5, 6}}); + VerifyFDef({"split3_const0", "split3_1"}, M({{"feed"}}), + M({{"split3"}, {"split3_0"}}), + 
{{"split3_const0:output:0", "split3_1:0"}, + {"feed", "split3_1:1"}, + {"split3_1:output:0", "split3"}, + {"split3_1:output:2", "split3_0"}}, + {}); +} + +TEST_F(CApiFunctionTest, UsingTwoOutputsOfSplitAsInputs) { + /* + * | + * v + * split + * | | | + * | v | + * | | + * +---+-----+---+ + * | | | | + * | v v | + * | add | + * | | | + * | | | + * +------+------+ + * | + * v + */ + // Define + TF_Operation* feed = Placeholder(func_graph_, s_); + TF_Operation* split = Split3(feed, func_graph_, s_); + TF_Operation* add = Add({split, 0}, {split, 2}, func_graph_, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + DefineT(1, {add}, {{split, 0}, {split, 2}}, {{add, 0}}, nullptr); + + // Use, run, and verify + TF_Operation* two = ScalarConst(2, host_graph_, s_, "two"); + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({two, func_feed}); + Run({{func_feed, Int32Tensor(3)}}, func_op, 2 + 3); + VerifyFDef( + {"add_0"}, M({{"split3"}, {"split3_0"}}), M({{"add"}}), + {{"split3", "add_0:0"}, {"split3_0", "add_0:1"}, {"add_0:sum:0", "add"}}, + {}); +} + +TEST_F(CApiFunctionTest, NodesUsedInInputsMustHaveSingleOutput) { + /* + * | + * v + * split + * | | | + * | v | + * | | + * input --->| |<--- input + * | | + * v v + * add + * | + * | + * v + */ + // Define + TF_Tensor* tensor_123 = Int32Tensor({1, 2, 3}); + TF_Operation* c = Const(tensor_123, func_graph_, s_, "const_array"); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + TF_Operation* split = Split3(c, func_graph_, s_); + TF_Operation* add = Add({split, 0}, {split, 2}, func_graph_, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + DefineT(-1, {}, {{split, 0}, {split, 2}}, {{add, 0}}, nullptr, true); + EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); + EXPECT_EQ(string("When `num_opers` is set to -1, nodes referenced in " + "`inputs` must have a single output. Node split3 has " + "3 outputs. 
Encountered while creating function 'MyFunc'"), + string(TF_Message(s_))); + + TF_DeleteTensor(tensor_123); +} + +TEST_F(CApiFunctionTest, FunctionWithWhileLoop) { + // Inputs to the while loop and the function as a whole + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + + // Outputs of the while loop corresponding to the two inputs above + // The first one will the function's output + std::vector outputs; + + // Add while loop to func_graph_ + { + // The inputs to the while loop + std::vector inputs = {{feed1, 0}, {feed2, 0}}; + std::unique_ptr params(new TF_WhileParams( + TF_NewWhile(func_graph_, &inputs[0], inputs.size(), s_))); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + params->name = "test_loop"; + + // Initialize outputs so we can easily detect errors/bugs + outputs.resize(2, {nullptr, -1}); + + // Create loop: while (input1 < input2) input1 += input2 + 1 + TF_Operation* less_than = LessThan( + params->cond_inputs[0], params->cond_inputs[1], params->cond_graph, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + params->cond_output = {less_than, 0}; + + TF_Operation* add1 = Add(params->body_inputs[0], params->body_inputs[1], + params->body_graph, s_, "add1"); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + TF_Operation* one = ScalarConst(1, params->body_graph, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + TF_Operation* add2 = Add(add1, one, params->body_graph, s_, "add2"); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + params->body_outputs[0] = {add2, 0}; + params->body_outputs[1] = params->body_inputs[1]; + + // Finalize while loop + TF_FinishWhile(params.get(), s_, &outputs[0]); + EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + } + + // Define function, use it in graph, and run + DefineT(-1, {}, {{feed1, 0}, {feed2, 0}}, {outputs[0]}, nullptr); + TF_Operation* five = ScalarConst(5, host_graph_, s_, "five"); + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({func_feed, five}); + Run({{func_feed, Int32Tensor(2)}}, func_op, 2 /*+=*/ + 5 + 1); + + // Verify input, output, and subset of edges in fdef. + // The subset of edges we verify is a chain between feed1 and output to + // make sure that the correct output is picked. + tensorflow::FunctionDef fdef; + ASSERT_TRUE(GetFunctionDef(func_, &fdef)); + VerifyFDefInputs(fdef, M({{"feed1"}, {"feed2"}})); + VerifyFDefOutputs(fdef, M({{"test_loop_exit"}})); + VerifyFDefEdges(fdef, + {{"feed1", "test_loop/Enter:0"}, + {"test_loop/Enter:output:0", "test_loop/Merge:0"}, + {"test_loop/Merge:output:0", "test_loop/Switch:0"}, + {"test_loop/Switch:output_false:0", "test_loop/Exit:0"}, + {"test_loop/Exit:output:0", "test_loop_exit"}}, + {}, false); +} + +TEST_F(CApiFunctionTest, ControlDependency) { + /* + * | | scalar + * | | . + * v v . 
<---- control dependency + * add < - + * | + * v + */ + // Define + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* five = ScalarConst(5, func_graph_, s_); + TF_Operation* add = + AddWithCtrlDependency(feed1, feed2, func_graph_, five, s_); + EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + Define(-1, {}, {feed1, feed2}, {add}, nullptr); + + // Use, run, and verify + TF_Operation* two = ScalarConst(2, host_graph_, s_); + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({two, func_feed}); + Run({{func_feed, Int32Tensor(3)}}, func_op, 2 + 3); + VerifyFDef( + {"add_0", "scalar"}, M({{"feed1"}, {"feed2"}}), M({{"add"}}), + {{"feed1", "add_0:0"}, {"feed2", "add_0:1"}, {"add_0:sum:0", "add"}}, + {{"scalar", "add_0"}}); +} + +TEST_F(CApiFunctionTest, ControlDependencyOutsideOfBody) { + /* + * | | scalar + * | | . + * v v . <---- control dependency + * add < - + * | + * v + */ + // Define + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* five = ScalarConst(5, func_graph_, s_); + TF_Operation* add = + AddWithCtrlDependency(feed1, feed2, func_graph_, five, s_); + EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + Define(1, {add}, {feed1, feed2}, {add}, nullptr, true); + EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); + EXPECT_EQ(string("The source of control edge [id=3 scalar:-1 -> add:-1] " + "is not in the body. Encountered while creating " + "function 'MyFunc'"), + string(TF_Message(s_))); +} + +TEST_F(CApiFunctionTest, ControlDependencyOutsideOfBody_FromInputNode) { + /* + * | |. + * | | . + * | | . + * v v . <---- control dependency + * add < - + * | + * v + */ + // Define + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* add = + AddWithCtrlDependency(feed1, feed2, func_graph_, feed1, s_); + EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + Define(-1, {}, {feed1, feed2}, {add}, nullptr, true); + EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); + EXPECT_EQ(string("The source of control edge [id=3 feed1:-1 -> add:-1] " + "is not in the body. 
Encountered while creating " + "function 'MyFunc'"), + string(TF_Message(s_))); +} + +TEST_F(CApiFunctionTest, DuplicateInputsAreNotAllowed) { + /* + * feed + * | + * +++ + * | | + * +---+-+---+ + * | | | | + * | v v | + * | add | + * | | | + * | | | + * +----+----+ + * | + * v + */ + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* add = Add(feed1, feed1, func_graph_, s_); + Define(-1, {}, {feed1, feed1}, {add}, nullptr, true); + EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); + EXPECT_EQ( + string("TF_Output feed1:0 appears more than once in the input list"), + string(TF_Message(s_))); +} + +TEST_F(CApiFunctionTest, InvalidInputTensor_HighIndex) { + /* + * | | + * v v + * add + * | + * v + */ + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* add = Add(feed1, feed2, func_graph_, s_); + DefineT(-1, {}, {{feed1, 0}, {feed2, 2}}, {{add, 0}}, nullptr, true); + EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); + EXPECT_EQ(string("Node 'feed2' (type: 'Placeholder', num of outputs: 1) does " + "not have output 2\n\tEncountered while processing " + "input 1 into function 'MyFunc'"), + string(TF_Message(s_))); +} + +TEST_F(CApiFunctionTest, InvalidInputTensor_BadNodePtr) { + /* + * | | + * v v + * add + * | + * v + */ + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* add = Add(feed1, feed2, func_graph_, s_); + DefineT(-1, {}, {{feed1, 0}, {nullptr, 0}}, {{add, 0}}, nullptr, true); + EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); + EXPECT_EQ(string("Node is null\n\tEncountered while processing input 1 " + "into function 'MyFunc'"), + string(TF_Message(s_))); +} + +TEST_F(CApiFunctionTest, InvalidOutputTensor_HighIndex) { + /* + * | | + * v v + * add + * | + * v + */ + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* add = Add(feed1, feed2, func_graph_, s_); + DefineT(-1, {}, {{feed1, 0}, {feed2, 0}}, {{add, 3}}, nullptr, true); + EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); + EXPECT_EQ(string("Node 'add' (type: 'AddN', num of outputs: 1) does " + "not have output 3\n\tEncountered while processing " + "output 0 from function 'MyFunc'"), + string(TF_Message(s_))); +} + +TEST_F(CApiFunctionTest, InvalidOutputTensor_BadNodePtr) { + /* + * | | + * v v + * add + * | + * v + */ + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + Add(feed1, feed2, func_graph_, s_); + DefineT(-1, {}, {{feed1, 0}, {feed2, 0}}, {{nullptr, 3}}, nullptr, true); + EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); + EXPECT_EQ(string("Node is null\n\tEncountered while processing output 0 " + "from function 'MyFunc'"), + string(TF_Message(s_))); +} + +TEST_F(CApiFunctionTest, NodeMissingInput) { + /* + * input---> | | <----missing input + * v v + * body----> add + * | + * v + */ + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* add = Add(feed1, feed2, func_graph_, s_); + DefineT(1, {add}, {{feed1, 0}}, {{add, 0}}, nullptr, true); + EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); + EXPECT_EQ(string("Input 1, 'feed2:0', of node 'add' in function 'MyFunc' " + "is not available. 
You might need to include it in inputs " + "or include its source node in the body"), + string(TF_Message(s_))); +} + +TEST_F(CApiFunctionTest, OutputOpNotInBody) { + /* + * | | + * v v + * add scalar (scalar not included in body) + * | | + * v v (function has two outputs) + */ + // Define + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* scalar = ScalarConst(2, func_graph_, s_); + TF_Operation* add = Add(feed1, feed2, func_graph_, s_); + Define(1, {add}, {feed1, feed2}, {add, scalar}, nullptr, true); + EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); + EXPECT_EQ(string("TF_Output scalar:0 is neither in the function body nor " + "among function inputs. Encountered while creating " + "function 'MyFunc'"), + string(TF_Message(s_))); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h index f7d25dce8f5..68c324f2b99 100644 --- a/tensorflow/c/c_api_internal.h +++ b/tensorflow/c/c_api_internal.h @@ -130,6 +130,11 @@ struct TF_DeviceList { std::vector response; }; +struct TF_Function { + // Currently contains a single function and no gradients + tensorflow::FunctionDefLibrary fdef_lib; +}; + namespace tensorflow { class TensorCApi { @@ -141,7 +146,12 @@ class TensorCApi { } }; +Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst); + TF_Tensor* TF_TensorFromTensor(const Tensor& src, TF_Status* status); + +Status MessageToBuffer(const tensorflow::protobuf::Message& in, TF_Buffer* out); + } // end namespace tensorflow #endif // TENSORFLOW_C_C_API_INTERNAL_H_ diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc index 0aa60fb45dd..c4420290099 100644 --- a/tensorflow/c/c_api_test.cc +++ b/tensorflow/c/c_api_test.cc @@ -829,7 +829,7 @@ TEST(CAPI, ShapeInferenceError) { TF_Operation* vec3 = Const(vec3_tensor.get(), graph, status, "vec3"); ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TF_Operation* add = Add(vec2, vec3, graph, status); + TF_Operation* add = AddNoCheck(vec2, vec3, graph, status); ASSERT_NE(TF_OK, TF_GetCode(status)); ASSERT_TRUE(add == nullptr); diff --git a/tensorflow/c/c_test_util.cc b/tensorflow/c/c_test_util.cc index 21603c1a07c..9cd978c97ea 100644 --- a/tensorflow/c/c_test_util.cc +++ b/tensorflow/c/c_test_util.cc @@ -15,7 +15,9 @@ limitations under the License. 
#include "tensorflow/c/c_test_util.h" +#include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" using tensorflow::GraphDef; @@ -36,6 +38,23 @@ TF_Tensor* Int8Tensor(const int64_t* dims, int num_dims, const char* values) { return t; } +TF_Tensor* Int32Tensor(const int64_t* dims, int num_dims, + const int32_t* values) { + int64_t num_values = 1; + for (int i = 0; i < num_dims; ++i) { + num_values *= dims[i]; + } + TF_Tensor* t = + TF_AllocateTensor(TF_INT32, dims, num_dims, sizeof(int32_t) * num_values); + memcpy(TF_TensorData(t), values, sizeof(int32_t) * num_values); + return t; +} + +TF_Tensor* Int32Tensor(const std::vector& values) { + int64_t dims = values.size(); + return Int32Tensor(&dims, 1, values.data()); +} + TF_Tensor* Int32Tensor(int32_t v) { const int num_bytes = sizeof(int32_t); int32_t* values = new int32_t[1]; @@ -44,19 +63,40 @@ TF_Tensor* Int32Tensor(int32_t v) { &Int32Deallocator, nullptr); } -TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s, const char* name) { +// All the *Helper methods are used as a workaround for the restrictions that +// one cannot call ASSERT_* methods in non-void-returning functions (when +// exceptions are disabled during compilation) +void PlaceholderHelper(TF_Graph* graph, TF_Status* s, const char* name, + TF_Operation** op) { TF_OperationDescription* desc = TF_NewOperation(graph, "Placeholder", name); TF_SetAttrType(desc, "dtype", TF_INT32); - return TF_FinishOperation(desc, s); + *op = TF_FinishOperation(desc, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + ASSERT_NE(*op, nullptr); +} + +TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s, const char* name) { + TF_Operation* op; + PlaceholderHelper(graph, s, name, &op); + return op; +} + +void ConstHelper(TF_Tensor* t, TF_Graph* graph, TF_Status* s, const char* name, + TF_Operation** op) { + TF_OperationDescription* desc = TF_NewOperation(graph, "Const", name); + TF_SetAttrTensor(desc, "value", t, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_SetAttrType(desc, "dtype", TF_TensorType(t)); + *op = TF_FinishOperation(desc, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + ASSERT_NE(*op, nullptr); } TF_Operation* Const(TF_Tensor* t, TF_Graph* graph, TF_Status* s, const char* name) { - TF_OperationDescription* desc = TF_NewOperation(graph, "Const", name); - TF_SetAttrTensor(desc, "value", t, s); - if (TF_GetCode(s) != TF_OK) return nullptr; - TF_SetAttrType(desc, "dtype", TF_TensorType(t)); - return TF_FinishOperation(desc, s); + TF_Operation* op; + ConstHelper(t, graph, s, name, &op); + return op; } TF_Operation* ScalarConst(int32_t v, TF_Graph* graph, TF_Status* s, @@ -65,11 +105,39 @@ TF_Operation* ScalarConst(int32_t v, TF_Graph* graph, TF_Status* s, return Const(tensor.get(), graph, s, name); } -TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph, - TF_Status* s, const char* name) { +void AddHelper(TF_Operation* l, TF_Operation* r, TF_Graph* graph, TF_Status* s, + const char* name, TF_Operation** op, bool check) { TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name); TF_Output add_inputs[2] = {{l, 0}, {r, 0}}; TF_AddInputList(desc, add_inputs, 2); + *op = TF_FinishOperation(desc, s); + if (check) { + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + ASSERT_NE(*op, nullptr); + } +} + +TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph, + TF_Status* s, const char* 
name) { + TF_Operation* op; + AddHelper(l, r, graph, s, name, &op, true); + return op; +} + +TF_Operation* AddNoCheck(TF_Operation* l, TF_Operation* r, TF_Graph* graph, + TF_Status* s, const char* name) { + TF_Operation* op; + AddHelper(l, r, graph, s, name, &op, false); + return op; +} + +TF_Operation* AddWithCtrlDependency(TF_Operation* l, TF_Operation* r, + TF_Graph* graph, TF_Operation* ctrl_op, + TF_Status* s, const char* name) { + TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name); + TF_Output add_inputs[2] = {{l, 0}, {r, 0}}; + TF_AddInputList(desc, add_inputs, 2); + TF_AddControlInput(desc, ctrl_op); return TF_FinishOperation(desc, s); } @@ -81,11 +149,20 @@ TF_Operation* Add(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s, return TF_FinishOperation(desc, s); } -TF_Operation* Neg(TF_Operation* n, TF_Graph* graph, TF_Status* s) { +void NegHelper(TF_Operation* n, TF_Graph* graph, TF_Status* s, + TF_Operation** op) { TF_OperationDescription* desc = TF_NewOperation(graph, "Neg", "neg"); TF_Output neg_input = {n, 0}; TF_AddInput(desc, neg_input); - return TF_FinishOperation(desc, s); + *op = TF_FinishOperation(desc, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + ASSERT_NE(*op, nullptr); +} + +TF_Operation* Neg(TF_Operation* n, TF_Graph* graph, TF_Status* s) { + TF_Operation* op; + NegHelper(n, graph, s, &op); + return op; } TF_Operation* LessThan(TF_Output l, TF_Output r, TF_Graph* graph, @@ -96,6 +173,32 @@ TF_Operation* LessThan(TF_Output l, TF_Output r, TF_Graph* graph, return TF_FinishOperation(desc, s); } +void Split3Helper(TF_Operation* input, TF_Graph* graph, TF_Status* s, + const char* name, TF_Operation** op) { + TF_Operation* zero = ScalarConst( + 0, graph, s, ::tensorflow::strings::StrCat(name, "_const0").c_str()); + TF_OperationDescription* desc = TF_NewOperation(graph, "Split", name); + TF_AddInput(desc, {zero, 0}); + TF_AddInput(desc, {input, 0}); + TF_SetAttrInt(desc, "num_split", 3); + TF_SetAttrType(desc, "T", TF_INT32); + // Set device to CPU since there is no version of split for int32 on GPU + // TODO(iga): Convert all these helpers and tests to use floats because + // they are usually available on GPUs. 
After doing this, remove TF_SetDevice
+ // call in c_api_function_test.cc
+ TF_SetDevice(desc, "/cpu:0");
+ *op = TF_FinishOperation(desc, s);
+ ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+ ASSERT_NE(*op, nullptr);
+}
+
+TF_Operation* Split3(TF_Operation* input, TF_Graph* graph, TF_Status* s,
+ const char* name) {
+ TF_Operation* op;
+ Split3Helper(input, graph, s, name, &op);
+ return op;
+}
+
bool IsPlaceholder(const tensorflow::NodeDef& node_def) {
if (node_def.op() != "Placeholder" || node_def.name() != "feed") {
return false;
@@ -196,6 +299,18 @@ bool GetNodeDef(TF_Operation* oper, tensorflow::NodeDef* node_def) {
return ret;
}
+bool GetFunctionDef(TF_Function* func, tensorflow::FunctionDef* func_def) {
+ TF_Status* s = TF_NewStatus();
+ TF_Buffer* buffer = TF_NewBuffer();
+ TF_FunctionToFunctionDef(func, buffer, s);
+ bool ret = TF_GetCode(s) == TF_OK;
+ EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+ if (ret) ret = func_def->ParseFromArray(buffer->data, buffer->length);
+ TF_DeleteBuffer(buffer);
+ TF_DeleteStatus(s);
+ return ret;
+}
+
bool GetAttrValue(TF_Operation* oper, const char* attr_name,
tensorflow::AttrValue* attr_value, TF_Status* s) {
TF_Buffer* buffer = TF_NewBuffer();
diff --git a/tensorflow/c/c_test_util.h b/tensorflow/c/c_test_util.h
index 0c0ba667bd0..a927739d462 100644
--- a/tensorflow/c/c_test_util.h
+++ b/tensorflow/c/c_test_util.h
@@ -33,6 +33,13 @@ typedef std::unique_ptr
// Create a tensor with values of type TF_INT8 provided by `values`.
TF_Tensor* Int8Tensor(const int64_t* dims, int num_dims, const char* values);
+// Create a tensor with values of type TF_INT32 provided by `values`.
+TF_Tensor* Int32Tensor(const int64_t* dims, int num_dims,
+ const int32_t* values);
+
+// Create a 1-dimensional tensor with values from `values`
+TF_Tensor* Int32Tensor(const std::vector<int32_t>& values);
+
TF_Tensor* Int32Tensor(int32_t v);
TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s,
@@ -47,6 +54,13 @@ TF_Operation* ScalarConst(int32_t v, TF_Graph* graph, TF_Status* s,
TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
TF_Status* s, const char* name = "add");
+TF_Operation* AddNoCheck(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
+ TF_Status* s, const char* name = "add");
+
+TF_Operation* AddWithCtrlDependency(TF_Operation* l, TF_Operation* r,
+ TF_Graph* graph, TF_Operation* ctrl_op,
+ TF_Status* s, const char* name = "add");
+
TF_Operation* Add(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s,
const char* name = "add");
@@ -54,6 +68,10 @@ TF_Operation* Neg(TF_Operation* n, TF_Graph* graph, TF_Status* s);
TF_Operation* LessThan(TF_Output l, TF_Output r, TF_Graph* graph,
TF_Status* s);
+// Split `input` along the first dimension into 3 tensors
+TF_Operation* Split3(TF_Operation* input, TF_Graph* graph, TF_Status* s,
+ const char* name = "split3");
+
bool IsPlaceholder(const tensorflow::NodeDef& node_def);
bool IsScalarConst(const tensorflow::NodeDef& node_def, int v);
@@ -66,6 +84,8 @@ bool GetGraphDef(TF_Graph* graph, tensorflow::GraphDef* graph_def);
bool GetNodeDef(TF_Operation* oper, tensorflow::NodeDef* node_def);
+bool GetFunctionDef(TF_Function* func, tensorflow::FunctionDef* func_def);
+
bool GetAttrValue(TF_Operation* oper, const char* attr_name,
tensorflow::AttrValue* attr_value, TF_Status* s);
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 01e251a1ac2..e70539ceefa 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -151,10 +151,11 @@ TF_DeviceList* 
TFE_ContextListDevices(TFE_Context* ctx, TF_Status* status) {
return TF_SessionListDevices(ctx->session, status);
}
-TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t) {
- return new TFE_TensorHandle(
- tensorflow::TensorCApi::MakeTensor(t->dtype, t->shape, t->buffer),
- nullptr);
+TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, TF_Status* status) {
+ tensorflow::Tensor tensor;
+ status->status = tensorflow::TF_TensorToTensor(t, &tensor);
+ if (!status->status.ok()) return nullptr;
+ return new TFE_TensorHandle(tensor, nullptr);
}
void TFE_DeleteTensorHandle(TFE_TensorHandle* h) { delete h; }
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 476c9288f89..a54d206a307 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -20,6 +20,25 @@ limitations under the License.
#include "tensorflow/c/c_api.h"
+// Macro to control visibility of exported symbols in the shared library (.so,
+// .dylib, .dll).
+// This duplicates the TF_EXPORT macro definition in
+// tensorflow/core/platform/macros.h in order to keep this .h file independent
+// of any other includes.
+#ifdef SWIG
+#define TF_CAPI_EXPORT
+#else
+#if defined(COMPILER_MSVC)
+#ifdef TF_COMPILE_LIBRARY
+#define TF_CAPI_EXPORT __declspec(dllexport)
+#else
+#define TF_CAPI_EXPORT __declspec(dllimport)
+#endif // TF_COMPILE_LIBRARY
+#else
+#define TF_CAPI_EXPORT __attribute__((visibility("default")))
+#endif // COMPILER_MSVC
+#endif // SWIG
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -30,11 +49,11 @@ extern "C" {
// TODO(ashankar): Merge with TF_Session?
typedef struct TFE_Context TFE_Context;
-extern TFE_Context* TFE_NewContext(const TF_SessionOptions* opts,
- TF_Status* status);
-extern void TFE_DeleteContext(TFE_Context* ctx, TF_Status* status);
-extern TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx,
- TF_Status* status);
+TF_CAPI_EXPORT extern TFE_Context* TFE_NewContext(const TF_SessionOptions* opts,
+ TF_Status* status);
+TF_CAPI_EXPORT extern void TFE_DeleteContext(TFE_Context* ctx, TF_Status* status);
+TF_CAPI_EXPORT extern TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx,
+ TF_Status* status);
// A handle to a tensor on a device.
//
//
@@ -43,14 +62,15 @@ extern TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx,
// placed in memory of different devices or remote address spaces. 
typedef struct TFE_TensorHandle TFE_TensorHandle; -extern TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t); -extern void TFE_DeleteTensorHandle(TFE_TensorHandle* h); -extern TF_DataType TFE_TensorHandleDataType(TFE_TensorHandle* h); -extern int TFE_TensorHandleNumDims(TFE_TensorHandle* h); -extern int64_t TFE_TensorHandleDim(TFE_TensorHandle* h, int dim_index); -extern const char* TFE_TensorHandleDeviceName(TFE_TensorHandle* h); -extern TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, - TF_Status* status); +TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, + TF_Status* status); +TF_CAPI_EXPORT extern void TFE_DeleteTensorHandle(TFE_TensorHandle* h); +TF_CAPI_EXPORT extern TF_DataType TFE_TensorHandleDataType(TFE_TensorHandle* h); +TF_CAPI_EXPORT extern int TFE_TensorHandleNumDims(TFE_TensorHandle* h); +TF_CAPI_EXPORT extern int64_t TFE_TensorHandleDim(TFE_TensorHandle* h, int dim_index); +TF_CAPI_EXPORT extern const char* TFE_TensorHandleDeviceName(TFE_TensorHandle* h); +TF_CAPI_EXPORT extern TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, + TF_Status* status); // Create a new TFE_TensorHandle with the same contents as 'h' but placed // in the memory of the device name 'device_name'. @@ -58,10 +78,10 @@ extern TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, // that shares the underlying buffer. Otherwise, it currently requires at least // one of the source or destination devices to be CPU (i.e., for the source or // destination tensor to be placed in host memory). -extern TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h, - TFE_Context* ctx, - const char* device_name, - TF_Status* status); +TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h, + TFE_Context* ctx, + const char* device_name, + TF_Status* status); // Description of the TensorFlow op to execute. // @@ -76,49 +96,49 @@ extern TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h, // the additional sanity checks there seem unnecessary; typedef struct TFE_Op TFE_Op; -extern TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name, - TF_Status* status); -extern void TFE_DeleteOp(TFE_Op* op); +TF_CAPI_EXPORT extern TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name, + TF_Status* status); +TF_CAPI_EXPORT extern void TFE_DeleteOp(TFE_Op* op); // TODO(ashankar): TFE_OpSetDevice and TFE_Execute should not have a TFE_Context // parameter. Instead, the TFE_Context should be captured when creating the // TFE_Op. 
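For orientation, a minimal, hypothetical caller-side sketch (not taken from this patch) of how the exported eager API above composes once TFE_NewTensorHandle reports conversion failures through a TF_Status; the "MatMul" op name and the explicit "T" attribute are assumptions based on the standard TensorFlow op set:

#include <stdio.h>
#include <string.h>
#include "tensorflow/c/eager/c_api.h"

int main(void) {
  TF_Status* status = TF_NewStatus();
  TF_SessionOptions* opts = TF_NewSessionOptions();
  TFE_Context* ctx = TFE_NewContext(opts, status);
  TF_DeleteSessionOptions(opts);
  if (TF_GetCode(status) != TF_OK) { fprintf(stderr, "%s\n", TF_Message(status)); return 1; }

  /* Wrap a host 2x2 float tensor in a handle; the new signature lets this fail cleanly. */
  float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  int64_t dims[2] = {2, 2};
  TF_Tensor* t = TF_AllocateTensor(TF_FLOAT, dims, 2, sizeof(data));
  memcpy(TF_TensorData(t), data, sizeof(data));
  TFE_TensorHandle* h = TFE_NewTensorHandle(t, status);
  TF_DeleteTensor(t);
  if (TF_GetCode(status) != TF_OK) { fprintf(stderr, "%s\n", TF_Message(status)); return 1; }

  /* result = MatMul(h, h), executed eagerly. */
  TFE_Op* matmul = TFE_NewOp(ctx, "MatMul", status);
  TFE_OpAddInput(matmul, h, status);
  TFE_OpAddInput(matmul, h, status);
  TFE_OpSetAttrType(matmul, "T", TF_FLOAT);
  TFE_TensorHandle* retvals[1];
  int num_retvals = 1;
  TFE_Execute(matmul, retvals, &num_retvals, status);
  TFE_DeleteOp(matmul);
  if (TF_GetCode(status) != TF_OK) { fprintf(stderr, "%s\n", TF_Message(status)); return 1; }

  /* Copy the result back to host memory and read one element. */
  TF_Tensor* out = TFE_TensorHandleResolve(retvals[0], status);
  if (TF_GetCode(status) == TF_OK) {
    printf("result[0][0] = %f\n", ((float*)TF_TensorData(out))[0]);
    TF_DeleteTensor(out);
  }

  TFE_DeleteTensorHandle(retvals[0]);
  TFE_DeleteTensorHandle(h);
  TFE_DeleteContext(ctx, status);
  TF_DeleteStatus(status);
  return 0;
}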
-extern void TFE_OpSetDevice(TFE_Op* op, TFE_Context* ctx, - const char* device_name, TF_Status* status); +TF_CAPI_EXPORT extern void TFE_OpSetDevice(TFE_Op* op, TFE_Context* ctx, + const char* device_name, TF_Status* status); -extern void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status); +TF_CAPI_EXPORT extern void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status); -extern TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name, - unsigned char* is_list, TF_Status* status); +TF_CAPI_EXPORT extern TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name, + unsigned char* is_list, TF_Status* status); -extern void TFE_OpSetAttrString(TFE_Op* op, const char* attr_name, - const char* value); -extern void TFE_OpSetAttrInt(TFE_Op* op, const char* attr_name, int64_t value); -extern void TFE_OpSetAttrFloat(TFE_Op* op, const char* attr_name, float value); -extern void TFE_OpSetAttrBool(TFE_Op* op, const char* attr_name, - unsigned char value); -extern void TFE_OpSetAttrType(TFE_Op* op, const char* attr_name, - TF_DataType value); +TF_CAPI_EXPORT extern void TFE_OpSetAttrString(TFE_Op* op, const char* attr_name, + const char* value); +TF_CAPI_EXPORT extern void TFE_OpSetAttrInt(TFE_Op* op, const char* attr_name, int64_t value); +TF_CAPI_EXPORT extern void TFE_OpSetAttrFloat(TFE_Op* op, const char* attr_name, float value); +TF_CAPI_EXPORT extern void TFE_OpSetAttrBool(TFE_Op* op, const char* attr_name, + unsigned char value); +TF_CAPI_EXPORT extern void TFE_OpSetAttrType(TFE_Op* op, const char* attr_name, + TF_DataType value); // If the number of dimensions is unknown, `num_dims` must be set to // -1 and `dims` can be null. If a dimension is unknown, the // corresponding entry in the `dims` array must be -1. -extern void TFE_OpSetAttrShape(TFE_Op* op, const char* attr_name, - const int64_t* dims, const int num_dims, - TF_Status* out_status); +TF_CAPI_EXPORT extern void TFE_OpSetAttrShape(TFE_Op* op, const char* attr_name, + const int64_t* dims, const int num_dims, + TF_Status* out_status); -extern void TFE_OpSetAttrStringList(TFE_Op* op, const char* attr_name, - const char** value, int num_values); -extern void TFE_OpSetAttrIntList(TFE_Op* op, const char* attr_name, - const int64_t* values, int num_values); -extern void TFE_OpSetAttrFloatList(TFE_Op* op, const char* attr_name, - const float* values, int num_values); -extern void TFE_OpSetAttrBoolList(TFE_Op* op, const char* attr_name, - const unsigned char* values, int num_values); -extern void TFE_OpSetAttrTypeList(TFE_Op* op, const char* attr_name, - const TF_DataType* values, int num_values); -extern void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name, - const int64_t** dims, const int* num_dims, - int num_values, TF_Status* out_status); +TF_CAPI_EXPORT extern void TFE_OpSetAttrStringList(TFE_Op* op, const char* attr_name, + const char** value, int num_values); +TF_CAPI_EXPORT extern void TFE_OpSetAttrIntList(TFE_Op* op, const char* attr_name, + const int64_t* values, int num_values); +TF_CAPI_EXPORT extern void TFE_OpSetAttrFloatList(TFE_Op* op, const char* attr_name, + const float* values, int num_values); +TF_CAPI_EXPORT extern void TFE_OpSetAttrBoolList(TFE_Op* op, const char* attr_name, + const unsigned char* values, int num_values); +TF_CAPI_EXPORT extern void TFE_OpSetAttrTypeList(TFE_Op* op, const char* attr_name, + const TF_DataType* values, int num_values); +TF_CAPI_EXPORT extern void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name, + const int64_t** dims, const int* 
num_dims, + int num_values, TF_Status* out_status); // Execute the operation defined by 'op' and return handles to computed // tensors in 'retvals'. @@ -128,14 +148,14 @@ extern void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name, // // On return, 'num_retvals' will be set to the actual number of outputs // returned by the operation. -extern void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, - int* num_retvals, TF_Status* status); +TF_CAPI_EXPORT extern void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, + int* num_retvals, TF_Status* status); // Add a function (serialized FunctionDef protocol buffer) to ctx so // that it can be invoked using TFE_Execute. -extern void TFE_ContextAddFunctionDef(TFE_Context* ctx, - const char* serialized_function_def, - size_t size, TF_Status* status); +TF_CAPI_EXPORT extern void TFE_ContextAddFunctionDef(TFE_Context* ctx, + const char* serialized_function_def, + size_t size, TF_Status* status); #ifdef __cplusplus } /* end extern "C" */ diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index 6f5c21c9472..d19583a3abe 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -34,7 +34,9 @@ TFE_TensorHandle* TestMatrixTensorHandle() { TF_Tensor* t = TF_AllocateTensor( TF_FLOAT, &dims[0], sizeof(dims) / sizeof(int64_t), sizeof(data)); memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); - TFE_TensorHandle* th = TFE_NewTensorHandle(t); + TF_Status* status = TF_NewStatus(); + TFE_TensorHandle* th = TFE_NewTensorHandle(t, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TF_DeleteTensor(t); return th; } @@ -383,7 +385,8 @@ TFE_TensorHandle* CreateVariable(TFE_Context* ctx, float value, memcpy(TF_TensorData(t.get()), &value, TF_TensorByteSize(t.get())); std::unique_ptr - value_handle(TFE_NewTensorHandle(t.get()), TFE_DeleteTensorHandle); + value_handle(TFE_NewTensorHandle(t.get(), status), TFE_DeleteTensorHandle); + if (TF_GetCode(status) != TF_OK) return nullptr; TFE_OpAddInput(op, value_handle.get(), status); if (TF_GetCode(status) != TF_OK) return nullptr; diff --git a/tensorflow/c/version_script.lds b/tensorflow/c/version_script.lds index 9bdc6dcc2e0..c352a1440d1 100644 --- a/tensorflow/c/version_script.lds +++ b/tensorflow/c/version_script.lds @@ -2,6 +2,7 @@ VERS_1.0 { # Export symbols in c_api.h. global: *TF_*; + *TFE_*; # Hide everything else. local: diff --git a/tensorflow/cc/framework/gradients.cc b/tensorflow/cc/framework/gradients.cc index 66a943410e2..1868207148d 100644 --- a/tensorflow/cc/framework/gradients.cc +++ b/tensorflow/cc/framework/gradients.cc @@ -77,6 +77,10 @@ class SymbolicGradientBuilder { Status CallGradFunction(const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs); + + // Returns a list mapping whether each node in the graph is reachable + // from outputs_. Keyed by node id. 
+ std::vector GetReachableNodes(); const Scope& scope_; const ops::GradOpRegistry* registry_; @@ -143,11 +147,36 @@ Status SymbolicGradientBuilder::BackpropAlongEdge(const Output& dst_grad, return Status::OK(); } +std::vector SymbolicGradientBuilder::GetReachableNodes() { + std::vector reachable_nodes(scope_.graph()->num_node_ids(), false); + std::deque queue; + for (const Output& out : outputs_) { + if (!reachable_nodes[out.node()->id()]) { + queue.push_back(out.node()); + reachable_nodes[out.node()->id()] = true; + } + } + + while (!queue.empty()) { + Node* n = queue.front(); + queue.pop_front(); + for (const Edge* e : n->in_edges()) { + if (e->IsControlEdge()) continue; + queue.push_back(e->src()); + reachable_nodes[e->src()->id()] = true; + } + } + return reachable_nodes; +} + Status SymbolicGradientBuilder::Initialize() { if (outputs_.size() != grad_inputs_.size()) { return errors::InvalidArgument( "Must specify a gradient input for each output."); } + std::vector reachable_nodes = GetReachableNodes(); + // TODO(theflofly) Check that inputs_ are reachable from + // outputs_ using reachable_nodes grad_outputs_->clear(); grad_outputs_->resize(inputs_.size()); // Populate `output_nodes_` from node ids in `outputs_`. @@ -188,12 +217,15 @@ Status SymbolicGradientBuilder::Initialize() { if (output_nodes_.find(n->id()) == output_nodes_.end()) { // Internal node: continue BFS along connected outputs. for (const Edge* e : n->out_edges()) { - if (e->IsControlEdge()) continue; - ++num_expected_backprops; + // If a node is not reachable from outputs_, + // we don't expect it to receive a backpropagated gradient. + // It will not be counted in num_expected_backprops. + if (e->IsControlEdge() || !reachable_nodes[e->dst()->id()]) continue; if (visited.find(e->dst()) == visited.end()) { queue.push_back(e->dst()); visited.insert(e->dst()); } + ++num_expected_backprops; } } else { // Output node: stop BFS and update `num_expected_backprops` for diff --git a/tensorflow/cc/framework/gradients_test.cc b/tensorflow/cc/framework/gradients_test.cc index 24af7d567b2..032ab936235 100644 --- a/tensorflow/cc/framework/gradients_test.cc +++ b/tensorflow/cc/framework/gradients_test.cc @@ -364,6 +364,73 @@ TEST_F(GradientsTest, MultipleNodeOutputGrads) { test::AsTensor({60, 61, 62, 63, 66, 66, 66, 67}, {4, 2})); } +TEST_F(GradientsTest, UnreachableEdgeGradOneOutput) { + auto x = Variable(scope_test_, {2, 3}, DT_DOUBLE); + auto x_const = Const(scope_test_, {{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}); + auto x_assign = Assign(scope_test_, x, x_const); + + auto y = Variable(scope_test_, {3, 1}, DT_DOUBLE); + auto y_const = Const(scope_test_, {{1.0}, {2.0}, {3.0}}); + auto y_assign = Assign(scope_test_, y, y_const); + + auto m1 = MatMul(scope_test_, x, y); + + auto z = Variable(scope_test_, {1, 3}, DT_DOUBLE); + auto z_const = Const(scope_test_, {{9.0, 10.0, 11.0}}); + auto z_assign = Assign(scope_test_, z, z_const); + + auto m2 = MatMul(scope_test_, y, z); + + auto dm1 = Const(scope_test_, {{0.5}, {0.5}}); + + std::vector grad_outputs; + TF_ASSERT_OK( + AddSymbolicGradients(scope_test_, {m1}, {y}, {dm1}, &grad_outputs)); + + std::vector outputs; + test::GetTensors(scope_test_, {x_assign, y_assign, z_assign}, + {grad_outputs[0]}, &outputs); + // dz/dy = xT * dm1 + test::ExpectTensorNear( + outputs[0], test::AsTensor({2.5, 3.5, 4.5}, {3, 1}), 1e-5); +} + +TEST_F(GradientsTest, UnreachableEdgeGradTwoOutputs) { + auto x = Variable(scope_test_, {2, 3}, DT_DOUBLE); + auto x_const = Const(scope_test_, {{1.0, 2.0, 3.0}, {4.0, 5.0, 
6.0}}); + auto x_assign = Assign(scope_test_, x, x_const); + + auto y = Variable(scope_test_, {3, 1}, DT_DOUBLE); + auto y_const = Const(scope_test_, {{1.0}, {2.0}, {3.0}}); + auto y_assign = Assign(scope_test_, y, y_const); + + auto m1 = MatMul(scope_test_, x, y); + + auto z = Variable(scope_test_, {1, 3}, DT_DOUBLE); + auto z_const = Const(scope_test_, {{9.0, 10.0, 11.0}}); + auto z_assign = Assign(scope_test_, z, z_const); + + auto m2 = MatMul(scope_test_, y, z); + + auto dm1 = Const(scope_test_, {{0.5}, {0.5}}); + auto dm2 = + Const(scope_test_, {{0.5, 0.5, 0.5}, {0.6, 0.7, 0.8}, {0.6, 0.7, 0.9}}); + + std::vector grad_outputs; + TF_ASSERT_OK(AddSymbolicGradients(scope_test_, {m1, m2}, {y}, {dm1, dm2}, + &grad_outputs)); + + std::vector outputs; + test::GetTensors(scope_test_, {x_assign, y_assign, z_assign}, + {grad_outputs[0]}, &outputs); + + // the gradients from m1 and m2 will be summed to compute the gradient + // w.r.t y + // dz/dy = xT * dm1 + dm2 * zT + test::ExpectTensorNear( + outputs[0], test::AsTensor({17.5, 24.7, 26.8}, {3, 1}), 1e-5); +} + // StopGradientSingleOutputMultiEdgeTest tests combinations of valid and // 'NoGradient' (induced by StopGradient op) returned along multiple edges from // a single nodes output. diff --git a/tensorflow/cc/framework/testutil.cc b/tensorflow/cc/framework/testutil.cc index ca78f31db51..25ee08f6762 100644 --- a/tensorflow/cc/framework/testutil.cc +++ b/tensorflow/cc/framework/testutil.cc @@ -36,5 +36,19 @@ void GetTensor(const Scope& scope, Output tensor, Tensor* out) { *out = outputs[0]; } +void GetTensors(const Scope& scope, const std::vector& assign_vars, + OutputList tensors, std::vector* out) { + ClientSession session(scope); + TF_CHECK_OK(session.Run(assign_vars, nullptr)); + TF_CHECK_OK(session.Run(tensors, out)); +} + +void GetTensor(const Scope& scope, const std::vector& assign_vars, + Output tensor, Tensor* out) { + std::vector outputs; + GetTensors(scope, assign_vars, {std::move(tensor)}, &outputs); + *out = outputs[0]; +} + } // end namespace test } // end namespace tensorflow diff --git a/tensorflow/cc/framework/testutil.h b/tensorflow/cc/framework/testutil.h index d027ad3744d..ca57c0f0a40 100644 --- a/tensorflow/cc/framework/testutil.h +++ b/tensorflow/cc/framework/testutil.h @@ -26,9 +26,21 @@ namespace test { void GetTensors(const Scope& scope, OutputList tensors, std::vector* out); +// Computes the outputs listed in 'tensors', returns the tensors in 'out'. +// assign_vars are extra outputs that should be run +// e.g. to assign values to variables. +void GetTensors(const Scope& scope, const std::vector& assign_vars, + OutputList tensors, std::vector* out); + /// Computes the output 'tensor', returning the resulting tensor in 'out'. void GetTensor(const Scope& scope, Output tensor, Tensor* out); +// Computes the output 'tensor', returning the resulting tensor in 'out'. +// assign_vars are extra outputs that should be run +// e.g. to assign values to variables. 
+void GetTensor(const Scope& scope, const std::vector& assign_vars, + Output tensor, Tensor* out); + } // namespace test } // namespace tensorflow diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc index 09a15fbe5f1..d90654f2e9a 100644 --- a/tensorflow/cc/gradients/math_grad.cc +++ b/tensorflow/cc/gradients/math_grad.cc @@ -687,6 +687,72 @@ Status MeanGrad(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("Mean", MeanGrad); +Status MinOrMaxGrad(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + // The partial derivative for any input along a "reduced" dimension + // is 1 when it is the min (or max) and 0 everywhere else. So the + // gradient calculation is identical for both operators. + // + // There's a special case for propagating gradients when there are + // multiple minima (or maxima) - we choose to divide the gradient + // equally among all matching inputs. + // + // Please note this comment + // https://github.com/tensorflow/tensorflow/issues/4886#issuecomment-256836063 + // for details. + + // Running example: + // input: [[5, 5, 5], + // [1, 2, -3]] + // reduction_indices: [1] + auto input = op.input(0); + auto reduction_indices = op.input(1); + + // [2, 3] + auto input_shape = Shape(scope, input); + + // [2, 1] + auto output_shape_kept_dims = + ReducedShapeHelper(scope, input_shape, reduction_indices); + + // for op=min (say) + // output = [5, -3] + // y = [[5], + // [-3]] + auto y = Reshape(scope, op.output(0), output_shape_kept_dims); + + // reshape([g1, g2], [2, 1]) = [[g1], + // [g2]] + auto grad = Reshape(scope, grad_inputs[0], output_shape_kept_dims); + + // indicators = equal(y, input) + // = equal([[5], [[5, 5, 5], + // [-3]], [1, 2, -3]]) + // = [[1, 1, 1], + // [0, 0, 1]] + auto indicators = Cast(scope, Equal(scope, y, input), grad_inputs[0].type()); + + // [[3], + // [1]] + auto num_selected = Reshape(scope, Sum(scope, indicators, reduction_indices), + output_shape_kept_dims); + + // [[1/3, 1/3, 1/3], + // [0, 0, 1]] + auto scale = Div(scope, indicators, num_selected); + + // [[g1/3, g1/3, g1/3], + // [0, 0, g2]] + grad_outputs->push_back(Mul(scope, scale, grad)); + + // Stop propagation along reduction_indices + grad_outputs->push_back(NoGradient()); + return scope.status(); +} +REGISTER_GRADIENT_OP("Min", MinOrMaxGrad); +REGISTER_GRADIENT_OP("Max", MinOrMaxGrad); + // MatMulGrad helper function used to compute two MatMul operations // based on input matrix transposition combinations. 
Status MatMulGradHelper(const Scope& scope, const bool is_batch, diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc index 62b59b25c7a..5b1558dd820 100644 --- a/tensorflow/cc/gradients/math_grad_test.cc +++ b/tensorflow/cc/gradients/math_grad_test.cc @@ -955,6 +955,55 @@ TEST_F(NaryGradTest, Mean) { RunTest({x}, {x_shape}, {y}, {y_shape}); } +TEST_F(NaryGradTest, Min) { + TensorShape x_shape({2, 3}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + auto y = Min(scope_, x, {-1}); + // y's shape is the result of reducing x along axes -1 (= 1) + TensorShape y_shape({2}); + Tensor x_init_value = + test::AsTensor({0.5f, 0.7f, 0.2f, 1.0f, 1.5f, -2.8f}, x_shape); + RunTest(x, x_init_value, y, y_shape); +} + +TEST_F(NaryGradTest, Max) { + TensorShape x_shape({2, 3}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + auto y = Max(scope_, x, {-1}); + // y's shape is the result of reducing x along axes -1 (= 1) + TensorShape y_shape({2}); + Tensor x_init_value = + test::AsTensor({0.5f, 0.7f, 0.2f, 1.0f, 1.5f, -2.8f}, x_shape); + RunTest(x, x_init_value, y, y_shape); +} + +TEST_F(NaryGradTest, MinMulti) { + // Test gradient when there are multiple minima. + // Note that we cannot directly use a test Tensor with multiple + // minima, as the numeric estimator will calculate incorrect + // gradients when perturbing each entry in the Tensor (which then + // changes how many minima exist.) + // Instead, we use a single input that broadcast-multiplies a larger + // tensor with equal values, and apply reduce_min to the multiplied + // result. + TensorShape x_shape({1}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + auto all_same = Mul(scope_, Const(scope_, {1.f, 1.f, 1.f}), x); + auto y = Min(scope_, all_same, {0}); + // y is a [3] shaped tensor reduced along dimension 0, so it is [1] shaped + TensorShape y_shape({1}); + RunTest({x}, {x_shape}, {y}, {y_shape}); +} + +TEST_F(NaryGradTest, MaxMulti) { + TensorShape x_shape({1}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + auto all_same = Mul(scope_, Const(scope_, {1.f, 1.f, 1.f}), x); + auto y = Max(scope_, all_same, {0}); + TensorShape y_shape({1}); + RunTest({x}, {x_shape}, {y}, {y_shape}); +} + TEST_F(NaryGradTest, AddN) { TensorShape shape({3, 2, 5}); std::vector xs; diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index e349aefd4cb..e6862f0d9dd 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -52,6 +52,12 @@ class BinaryOpsTest(XLATestCase): def testFloatOps(self): for dtype in self.float_types: + self._testBinary( + lambda x, y: math_ops.approximate_equal(x, y, tolerance=0.0001), + np.array([[[[-1, 2.00009999], [-3, 4.01]]]], dtype=dtype), + np.array([[[[-1.001, 2], [-3.00009, 4]]]], dtype=dtype), + expected=np.array([[[[False, True], [True, False]]]], dtype=dtype)) + self._testBinary( gen_math_ops._real_div, np.array([3, 3, -1.5, -8, 44], dtype=dtype), @@ -82,6 +88,12 @@ class BinaryOpsTest(XLATestCase): dtype(4), expected=np.array([[16], [81]], dtype=dtype)) + self._testBinary( + gen_math_ops._reciprocal_grad, + np.array([4, -3, -2, 1], dtype=dtype), + np.array([5, -6, 7, -8], dtype=dtype), + expected=np.array([-80, 54, -28, 8], dtype=dtype)) + self._testBinary( gen_math_ops._sigmoid_grad, np.array([4, 3, 2, 1], dtype=dtype), @@ -107,6 +119,13 @@ class BinaryOpsTest(XLATestCase): 
expected=np.array( [3.97322869, 2.99258232, 1.99817801, 0.99966466], dtype=dtype)) + self._testBinary( + gen_nn_ops._softsign_grad, + np.array([4, 3, 2, 1], dtype=dtype), + np.array([5, 6, 7, 8], dtype=dtype), + expected=np.array( + [0.11111111, 0.06122449, 0.03125, 0.01234568], dtype=dtype)) + self._testBinary( gen_math_ops._tanh_grad, np.array([4, 3, 2, 1], dtype=dtype), diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc index a342e37e0ee..49c1699b6ed 100644 --- a/tensorflow/compiler/tests/randomized_tests.cc +++ b/tensorflow/compiler/tests/randomized_tests.cc @@ -888,6 +888,16 @@ TEST_F(OpTest, Any) { }); } +TEST_F(OpTest, ApproximateEqual) { + Repeatedly([this]() { + auto dims = RandomDims(); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("ApproximateEqual") + .RandomInput(DT_FLOAT, dims) + .RandomInput(DT_FLOAT, dims) + .Attr("T", DT_FLOAT)); + }); +} + TEST_F(OpTest, Asinh) { Repeatedly([this]() { return ExpectTfAndXlaOutputsAreClose( @@ -1662,11 +1672,9 @@ TEST_F(OpTest, GreaterEqual) { TEST_F(OpTest, L2Loss) { Repeatedly([this]() { - DataType type = Choose({DT_INT32, DT_FLOAT}); - // TODO(b/31644876): scalars currently crash. - return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("L2Loss") - .RandomInput(type, RandomDims(1)) - .Attr("T", type)); + DataType type = DT_FLOAT; + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("L2Loss").RandomInput(type).Attr("T", type)); }); } @@ -2165,6 +2173,15 @@ TEST_F(OpTest, Reciprocal) { }); } +TEST_F(OpTest, ReciprocalGrad) { + Repeatedly([this]() { + std::vector dims = RandomDims(); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("ReciprocalGrad") + .RandomInput(DT_FLOAT, dims) + .RandomInput(DT_FLOAT, dims) + .Attr("T", DT_FLOAT)); + }); +} TEST_F(OpTest, Relu) { Repeatedly([this]() { return ExpectTfAndXlaOutputsAreClose( @@ -2250,6 +2267,13 @@ TEST_F(OpTest, ReverseV2) { }); } +TEST_F(OpTest, Rint) { + Repeatedly([this]() { + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Rint").RandomInput(DT_FLOAT).Attr("T", DT_FLOAT)); + }); +} + TEST_F(OpTest, Round) { Repeatedly([this]() { return ExpectTfAndXlaOutputsAreClose( @@ -2402,6 +2426,23 @@ TEST_F(OpTest, SoftplusGrad) { }); } +TEST_F(OpTest, Softsign) { + Repeatedly([this]() { + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Softsign").RandomInput(DT_FLOAT).Attr("T", DT_FLOAT)); + }); +} + +TEST_F(OpTest, SoftsignGrad) { + Repeatedly([this]() { + std::vector dims = RandomDims(); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("SoftsignGrad") + .RandomInput(DT_FLOAT, dims) + .RandomInput(DT_FLOAT, dims) + .Attr("T", DT_FLOAT)); + }); +} + TEST_F(OpTest, SpaceToBatch) { Repeatedly([this]() { std::vector block_dims = RandomDims(4, 4, 0, 5); diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index ca2a438005f..b21f1998a5d 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -18,6 +18,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import unittest + import numpy as np from six.moves import xrange # pylint: disable=redefined-builtin @@ -161,12 +163,17 @@ class UnaryOpsTest(XLATestCase): np.array([[-1.7, 1.2]], dtype=dtype), expected=np.array([[-2, 1]], dtype=dtype)) + self._assertOpOutputMatchesExpected( + math_ops.is_finite, + np.array([[np.NINF, -2, -1, 0, 0.5, 1, 2, np.inf, np.nan]], + dtype=dtype), + expected=np.array([[0, 1, 1, 1, 
1, 1, 1, 0, 0]], dtype=np.bool)) + # Tests for tf.nn ops. self._assertOpOutputMatchesExpected( nn_ops.l2_loss, np.array([[[]]], dtype=dtype), expected=dtype(0)) - # TODO(b/31644876): enable this test case when fixed. - # self._assertOpOutputMatchesExpected(tf.nn.l2_loss, dtype(4), dtype(10)) + self._assertOpOutputMatchesExpected(nn_ops.l2_loss, dtype(4), dtype(8)) self._assertOpOutputMatchesExpected( nn_ops.l2_loss, np.array([[-2, 4]], dtype=dtype), expected=dtype(10)) @@ -198,6 +205,12 @@ class UnaryOpsTest(XLATestCase): np.array([[1e-14, 1e-15, 0.6]], dtype=dtype), expected=np.log1p(np.array([[1e-14, 1e-15, 0.6]], dtype=dtype))) + self._assertOpOutputMatchesExpected( + math_ops.rint, + np.array([[-1.7, 1.2, 4.0, 0.0], [-3.5, -2.5, -1.5, -0.5], + [0.5, 1.5, 2.5, 3.5]], dtype=dtype), + expected=np.array([[-2, 1, 4, 0], [-4, -2, -2, 0], [0, 2, 2, 4]], + dtype=dtype)) self._assertOpOutputMatchesExpected( math_ops.round, np.array([[-1.7, 1.2, 4.0, 0.0], [-3.5, -2.5, -1.5, -0.5], @@ -301,6 +314,12 @@ class UnaryOpsTest(XLATestCase): np.array([[-2, 0, 8]], dtype=dtype), expected=np.array([[0.126928, 0.6931472, 8.0003354]], dtype=dtype)) + self._assertOpOutputMatchesExpected( + nn_ops.softsign, + np.array([[-2, -1, 0, 1, 2]], dtype=dtype), + expected=np.array([[-0.66666669, -0.5, 0, 0.5, 0.66666669]], + dtype=dtype)) + self._assertOpOutputMatchesExpected( math_ops.is_finite, np.array( @@ -335,6 +354,23 @@ class UnaryOpsTest(XLATestCase): np.array([[4, 3], [2, 1]], dtype=dtype), expected=np.array([[1, 1], [1, 1]], dtype=dtype)) + # TODO(phawkins): these tests fail unless fastmath optimizations + # are disabled. Use more robust IsInf/IsNaN detection and enable these + # tests. + @unittest.skip("test case fails in fast-math mode") + def testIsInfAndIsNan(self): + for dtype in self.float_types: + self._assertOpOutputMatchesExpected( + math_ops.is_inf, + np.array([[np.NINF, -2, -1, 0, 0.5, 1, 2, np.inf, np.nan]], + dtype=dtype), + expected=np.array([[1, 0, 0, 0, 0, 0, 0, 1, 0]], dtype=np.bool)) + self._assertOpOutputMatchesExpected( + math_ops.is_nan, + np.array([[np.NINF, -2, -1, 0, 0.5, 1, 2, np.inf, np.nan]], + dtype=dtype), + expected=np.array([[0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=np.bool)) + def testLogicalOps(self): self._assertOpOutputMatchesExpected( math_ops.logical_not, diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index d09e721c936..6e6c5dc17f5 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -31,7 +31,6 @@ tf_kernel_library( "function_ops.cc", "gather_op.cc", "identity_op.cc", - "is_finite_op.cc", "l2loss_op.cc", "lrn_ops.cc", "matmul_op.cc", diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc index f9bb1e2fb1d..58538b45137 100644 --- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc @@ -102,6 +102,7 @@ XLA_MAKE_BINARY(Mod, b->Rem(lhs, rhs, extend_dimensions)); XLA_MAKE_BINARY(Maximum, b->Max(lhs, rhs, extend_dimensions)); XLA_MAKE_BINARY(Minimum, b->Min(lhs, rhs, extend_dimensions)); XLA_MAKE_BINARY(RealDiv, b->Div(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(ReciprocalGrad, b->Neg(b->Mul(rhs, b->Mul(lhs, lhs)))); XLA_MAKE_BINARY( RsqrtGrad, b->Mul(b->Pow(lhs, XlaHelpers::IntegerLiteral(b, input_type(0), 3)), @@ -140,6 +141,11 @@ XLA_MAKE_BINARY(SoftplusGrad, b->Div(lhs, b->Add(b->Exp(b->Neg(rhs)), XlaHelpers::One(b, input_type(1))))); +// softsigngrad(gradients, 
features) = gradients / (1 + abs(features)) ** 2
+XLA_MAKE_BINARY(SoftsignGrad,
+ b->Div(lhs, Square(b, b->Add(XlaHelpers::One(b, input_type(0)),
+ b->Abs(rhs)))));
+
XLA_MAKE_BINARY(TanhGrad, b->Mul(rhs, b->Sub(XlaHelpers::One(b, input_type(0)),
b->Mul(lhs, lhs))));
@@ -147,5 +153,24 @@ XLA_MAKE_BINARY(Pow, b->Pow(lhs, rhs, extend_dimensions));
#undef XLA_MAKE_BINARY
+class ApproximateEqualOp : public XlaOpKernel {
+ public:
+ explicit ApproximateEqualOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("tolerance", &tolerance_));
+ }
+
+ // Returns true element-wise where |x - y| < tolerance.
+ void Compile(XlaOpKernelContext* ctx) override {
+ xla::ComputationBuilder* b = ctx->builder();
+ auto result = b->Lt(b->Abs(b->Sub(ctx->Input(0), ctx->Input(1))),
+ XlaHelpers::FloatLiteral(b, input_type(0), tolerance_));
+ ctx->SetOutput(0, result);
+ }
+
+ private:
+ float tolerance_;
+};
+REGISTER_XLA_OP(Name("ApproximateEqual"), ApproximateEqualOp);
+
} // namespace
} // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/is_finite_op.cc b/tensorflow/compiler/tf2xla/kernels/is_finite_op.cc
deleted file mode 100644
index 788dcee5443..00000000000
--- a/tensorflow/compiler/tf2xla/kernels/is_finite_op.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/tf2xla/xla_helpers.h"
-#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
-#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/bcast.h"
-
-namespace tensorflow {
-namespace {
-
-class IsFiniteOp : public XlaOpKernel {
- public:
- explicit IsFiniteOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
-
- void Compile(XlaOpKernelContext* ctx) override {
- xla::ComputationDataHandle input = ctx->Input(0);
- ctx->SetOutput(0, ctx->builder()->IsFinite(input));
- }
-
- private:
- TF_DISALLOW_COPY_AND_ASSIGN(IsFiniteOp);
-};
-
-REGISTER_XLA_OP(Name("IsFinite"), IsFiniteOp);
-
-} // anonymous namespace
-} // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
index 7b39f0533b4..6b8f5ec7b33 100644
--- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
@@ -73,8 +73,12 @@ XLAJIT_MAKE_UNARY(Exp, b->Exp(x));
XLAJIT_MAKE_UNARY(Expm1, b->Sub(b->Exp(x), XlaHelpers::One(b, input_type(0))));
XLAJIT_MAKE_UNARY(Floor, b->Floor(x));
-// Returns 0 if x is 0, -1 if x < 0 and 1 if x > 0. 
-XLAJIT_MAKE_UNARY(Sign, b->Sign(x)); +XLAJIT_MAKE_UNARY(IsFinite, b->IsFinite(x)); +XLAJIT_MAKE_UNARY(IsInf, b->Eq(b->Abs(x), + XlaHelpers::FloatLiteral( + b, input_type(0), + std::numeric_limits::infinity()))); +XLAJIT_MAKE_UNARY(IsNan, b->Ne(x, x)); // Return 1/x XLAJIT_MAKE_UNARY(Inv, b->Div(XlaHelpers::One(b, input_type(0)), x)); XLAJIT_MAKE_UNARY(Reciprocal, b->Div(XlaHelpers::One(b, input_type(0)), x)); @@ -105,6 +109,12 @@ static xla::ComputationDataHandle Round(xla::ComputationBuilder* b, b->Add(round_val, one), round_val); } +XLAJIT_MAKE_UNARY(Rint, Round(b, input_type(0), x)); +XLAJIT_MAKE_UNARY(Round, Round(b, input_type(0), x)); + +XLAJIT_MAKE_UNARY(Rsqrt, + b->Pow(x, XlaHelpers::FloatLiteral(b, input_type(0), -0.5))); + // Expresses sigmoid as a rescaled tanh: sigmoid(x) == (tanh(x/2) + 1) / 2. static xla::ComputationDataHandle Sigmoid(xla::ComputationBuilder* b, DataType dtype, @@ -112,16 +122,19 @@ static xla::ComputationDataHandle Sigmoid(xla::ComputationBuilder* b, auto half = XlaHelpers::FloatLiteral(b, dtype, 0.5); return b->Add(half, b->Mul(half, b->Tanh(b->Mul(half, x)))); } - -XLAJIT_MAKE_UNARY(Round, Round(b, input_type(0), x)); -XLAJIT_MAKE_UNARY(Rsqrt, - b->Pow(x, XlaHelpers::FloatLiteral(b, input_type(0), -0.5))); XLAJIT_MAKE_UNARY(Sigmoid, Sigmoid(b, input_type(0), x)); + +// Returns 0 if x is 0, -1 if x < 0 and 1 if x > 0. +XLAJIT_MAKE_UNARY(Sign, b->Sign(x)); XLAJIT_MAKE_UNARY(Sinh, b->Mul(b->Sub(b->Exp(x), b->Exp(b->Neg(x))), XlaHelpers::FloatLiteral(b, input_type(0), 0.5))); XLAJIT_MAKE_UNARY(Softplus, b->Log(b->Add(b->Exp(x), XlaHelpers::One(b, input_type(0))))); +// softsign(x) = x / (abs(x) + 1) +XLAJIT_MAKE_UNARY(Softsign, + b->Div(x, + b->Add(b->Abs(x), XlaHelpers::One(b, input_type(0))))); XLAJIT_MAKE_UNARY(Sqrt, b->Pow(x, XlaHelpers::FloatLiteral(b, input_type(0), 0.5))); XLAJIT_MAKE_UNARY(Square, b->Mul(x, x)); diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 7e59b70abe0..98cc3401c14 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -847,6 +847,7 @@ cc_test( srcs = ["hlo_ordering_test.cc"], deps = [ ":hlo", + ":hlo_dataflow_analysis", ":hlo_ordering", ":hlo_scheduling", "//tensorflow/compiler/xla:shape_util", diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc index bef4ecd480d..40fa3a67bde 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc @@ -241,7 +241,7 @@ Status Executor::Run() { completion_queue_.pop_front(); break; } - } while (1); + } while (true); TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, assignment_->GetUniqueTopLevelSlice(instruction)); void* result_buffer = diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc b/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc index 669ebb55bec..6efd0bcee58 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc @@ -24,16 +24,14 @@ limitations under the License. 
namespace xla { -Status DfsHloVisitor::HandleElementwiseUnary(HloInstruction* hlo, - HloOpcode opcode) { +Status DfsHloVisitor::HandleElementwiseUnary(HloInstruction* hlo) { return Unimplemented("DfsHloVisitor::HandleElementwiseUnary: %s", - HloOpcodeString(opcode).c_str()); + HloOpcodeString(hlo->opcode()).c_str()); } -Status DfsHloVisitor::HandleElementwiseBinary(HloInstruction* hlo, - HloOpcode opcode) { +Status DfsHloVisitor::HandleElementwiseBinary(HloInstruction* hlo) { return Unimplemented("DfsHloVisitor::HandleElementwiseBinary: %s", - HloOpcodeString(opcode).c_str()); + HloOpcodeString(hlo->opcode()).c_str()); } DfsHloVisitor::VisitState DfsHloVisitor::GetVisitState( diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h index a1a3a882c7a..2f21043a1d3 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h @@ -63,37 +63,37 @@ class DfsHloVisitor { // These routines are self-descriptive, see class comment for usage // information. - virtual Status HandleElementwiseUnary(HloInstruction* hlo, HloOpcode opcode); - virtual Status HandleElementwiseBinary(HloInstruction* hlo, HloOpcode opcode); + virtual Status HandleElementwiseUnary(HloInstruction* hlo); + virtual Status HandleElementwiseBinary(HloInstruction* hlo); virtual Status HandleClamp(HloInstruction* clamp, HloInstruction* min, HloInstruction* arg, HloInstruction* max) = 0; virtual Status HandleSelect(HloInstruction* select, HloInstruction* pred, HloInstruction* on_true, HloInstruction* on_false) = 0; virtual Status HandleMaximum(HloInstruction* maximum) { - return HandleElementwiseBinary(maximum, HloOpcode::kMaximum); + return HandleElementwiseBinary(maximum); } virtual Status HandleMinimum(HloInstruction* minimum) { - return HandleElementwiseBinary(minimum, HloOpcode::kMinimum); + return HandleElementwiseBinary(minimum); } virtual Status HandleConcatenate( HloInstruction* concatenate, tensorflow::gtl::ArraySlice operands) = 0; virtual Status HandleConvert(HloInstruction* convert) { - return HandleElementwiseUnary(convert, HloOpcode::kConvert); + return HandleElementwiseUnary(convert); } virtual Status HandleCopy(HloInstruction* copy) { - return HandleElementwiseUnary(copy, HloOpcode::kCopy); + return HandleElementwiseUnary(copy); } virtual Status HandleMultiply(HloInstruction* multiply, HloInstruction* lhs, HloInstruction* rhs) { - return HandleElementwiseBinary(multiply, HloOpcode::kMultiply); + return HandleElementwiseBinary(multiply); } virtual Status HandleDot(HloInstruction* dot, HloInstruction* lhs, HloInstruction* rhs) = 0; virtual Status HandlePower(HloInstruction* power, HloInstruction* lhs, HloInstruction* rhs) { - return HandleElementwiseBinary(power, HloOpcode::kPower); + return HandleElementwiseBinary(power); } virtual Status HandleConvolution(HloInstruction* convolution, HloInstruction* lhs, HloInstruction* rhs, @@ -101,73 +101,72 @@ class DfsHloVisitor { virtual Status HandleCrossReplicaSum(HloInstruction* crs) = 0; virtual Status HandleCompare(HloInstruction* compare, HloOpcode opcode, HloInstruction* lhs, HloInstruction* rhs) { - return HandleElementwiseBinary(compare, opcode); + return HandleElementwiseBinary(compare); } virtual Status HandleAdd(HloInstruction* add, HloInstruction* lhs, HloInstruction* rhs) { - return HandleElementwiseBinary(add, HloOpcode::kAdd); + return HandleElementwiseBinary(add); } virtual Status HandleDivide(HloInstruction* divide, HloInstruction* lhs, 
HloInstruction* rhs) { - return HandleElementwiseBinary(divide, HloOpcode::kDivide); + return HandleElementwiseBinary(divide); } virtual Status HandleRemainder(HloInstruction* remainder, HloInstruction* lhs, HloInstruction* rhs) { - return HandleElementwiseBinary(remainder, HloOpcode::kRemainder); + return HandleElementwiseBinary(remainder); } virtual Status HandleSubtract(HloInstruction* subtract, HloInstruction* lhs, HloInstruction* rhs) { - return HandleElementwiseBinary(subtract, HloOpcode::kSubtract); + return HandleElementwiseBinary(subtract); } virtual Status HandleAbs(HloInstruction* abs, HloInstruction* operand) { - return HandleElementwiseUnary(abs, HloOpcode::kAbs); + return HandleElementwiseUnary(abs); } virtual Status HandleSign(HloInstruction* sign, HloInstruction* operand) { - return HandleElementwiseUnary(sign, HloOpcode::kSign); + return HandleElementwiseUnary(sign); } virtual Status HandleNegate(HloInstruction* negate, HloInstruction* operand) { - return HandleElementwiseUnary(negate, HloOpcode::kNegate); + return HandleElementwiseUnary(negate); } virtual Status HandleExp(HloInstruction* exp, HloInstruction* operand) { - return HandleElementwiseUnary(exp, HloOpcode::kExp); + return HandleElementwiseUnary(exp); } virtual Status HandleFloor(HloInstruction* floor, HloInstruction* operand) { - return HandleElementwiseUnary(floor, HloOpcode::kFloor); + return HandleElementwiseUnary(floor); } virtual Status HandleCeil(HloInstruction* ceil, HloInstruction* operand) { - return HandleElementwiseUnary(ceil, HloOpcode::kCeil); + return HandleElementwiseUnary(ceil); } virtual Status HandleLog(HloInstruction* log, HloInstruction* operand) { - return HandleElementwiseUnary(log, HloOpcode::kLog); + return HandleElementwiseUnary(log); } virtual Status HandleCos(HloInstruction* cos, HloInstruction* operand) { - return HandleElementwiseUnary(cos, HloOpcode::kCos); + return HandleElementwiseUnary(cos); } virtual Status HandleSin(HloInstruction* sin, HloInstruction* operand) { - return HandleElementwiseUnary(sin, HloOpcode::kSin); + return HandleElementwiseUnary(sin); } virtual Status HandleTanh(HloInstruction* tanh, HloInstruction* operand) { - return HandleElementwiseUnary(tanh, HloOpcode::kTanh); + return HandleElementwiseUnary(tanh); } virtual Status HandleIsFinite(HloInstruction* is_finite, HloInstruction* operand) { - return HandleElementwiseUnary(is_finite, HloOpcode::kIsFinite); + return HandleElementwiseUnary(is_finite); } virtual Status HandleLogicalAnd(HloInstruction* logical_and, HloInstruction* lhs, HloInstruction* rhs) { - return HandleElementwiseBinary(logical_and, HloOpcode::kLogicalAnd); + return HandleElementwiseBinary(logical_and); } virtual Status HandleLogicalNot(HloInstruction* logical_not, HloInstruction* operand) { - return HandleElementwiseUnary(logical_not, HloOpcode::kLogicalNot); + return HandleElementwiseUnary(logical_not); } virtual Status HandleLogicalOr(HloInstruction* logical_or, HloInstruction* lhs, HloInstruction* rhs) { - return HandleElementwiseBinary(logical_or, HloOpcode::kLogicalOr); + return HandleElementwiseBinary(logical_or); } virtual Status HandleReducePrecision(HloInstruction* reduce_precision) { - return HandleElementwiseUnary(reduce_precision, - HloOpcode::kReducePrecision); + return HandleElementwiseUnary(reduce_precision); } virtual Status HandleInfeed(HloInstruction* infeed) = 0; diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h index 
10f8ae9b044..a5fe1205984 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h @@ -41,12 +41,10 @@ class DfsHloVisitorWithDefault : public DfsHloVisitor { // Default action performed on HloInstruction. virtual Status DefaultAction(HloInstruction* hlo_instruction) = 0; - Status HandleElementwiseUnary(HloInstruction* hlo, - HloOpcode opcode) override { + Status HandleElementwiseUnary(HloInstruction* hlo) override { return DefaultAction(hlo); } - Status HandleElementwiseBinary(HloInstruction* hlo, - HloOpcode opcode) override { + Status HandleElementwiseBinary(HloInstruction* hlo) override { return DefaultAction(hlo); } diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index b02138325ed..350dbc321fb 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -709,7 +709,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeRngElementGenerator( } else { auto r = ir_builder_->CreateSub(q, p); auto leading_zeros = llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::ctlz, {r, ir_builder_->getInt1(1)}, + llvm::Intrinsic::ctlz, {r, ir_builder_->getInt1(true)}, {param_ir_type}, ir_builder_); auto in_block = ir_builder_->GetInsertBlock(); diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc index d044462f9a7..5edaaba3ebe 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc @@ -334,7 +334,7 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator( SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), ir_builder_); IrArray::Index input_index(index.size()); - llvm::Value* in_bounds = ir_builder_->getInt1(1); + llvm::Value* in_bounds = ir_builder_->getInt1(true); for (size_t i = 0; i < index.size(); ++i) { llvm::Value* stridden_index = ir_builder_->CreateNSWMul( index[i], ir_builder_->getInt64(window.dimensions(i).stride())); diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc index 2a999f52f01..2e7765c4c61 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc @@ -389,7 +389,7 @@ StatusOr CompileModuleToPtx(llvm::Module* module, // Loop unrolling exposes more opportunities for SROA. Therefore, we run SROA // again after the standard optimization passes [http://b/13329423]. - // TODO(jingyue): SROA may further expose more optimization opportunities, such + // TODO(jingyue): SROA may further expose more optimization opportunities such // as more precise alias analysis and more function inlining (SROA may change // the inlining cost of a function). For now, running SROA already emits good // enough code for the evaluated benchmarks. 
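A note on the visitor-interface change above: HandleElementwiseUnary/HandleElementwiseBinary no longer take an HloOpcode argument because the opcode is always recoverable from the instruction itself via hlo->opcode(). Below is a minimal stand-alone sketch of what a derived visitor looks like after this change; the types are simplified stand-ins, not the real XLA classes.

#include <cstdio>

// Simplified stand-ins for the XLA types (illustrative only).
enum class HloOpcode { kAdd, kMultiply, kNegate };
struct HloInstruction {
  HloOpcode op;
  HloOpcode opcode() const { return op; }
};
using Status = int;  // 0 means OK in this sketch.

class DfsHloVisitor {
 public:
  virtual ~DfsHloVisitor() = default;
  // Opcode parameter dropped: handlers read hlo->opcode() directly.
  virtual Status HandleElementwiseBinary(HloInstruction* hlo) {
    std::printf("unimplemented handler for opcode %d\n",
                static_cast<int>(hlo->opcode()));
    return 1;
  }
  virtual Status HandleAdd(HloInstruction* add) {
    return HandleElementwiseBinary(add);  // no HloOpcode::kAdd forwarded
  }
};

class CountingVisitor : public DfsHloVisitor {
 public:
  Status HandleElementwiseBinary(HloInstruction* hlo) override {
    ++binary_ops;
    return 0;
  }
  int binary_ops = 0;
};

int main() {
  HloInstruction add{HloOpcode::kAdd};
  CountingVisitor visitor;
  visitor.HandleAdd(&add);
  std::printf("binary ops visited: %d\n", visitor.binary_ops);
  return 0;
}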
We may want to run more diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc index 0beea423798..3dd8ac6dc5f 100644 --- a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc @@ -37,6 +37,230 @@ namespace xla { using ::tensorflow::strings::StrAppend; using ::tensorflow::strings::StrCat; +// Data structure used to construct the alias analysis. Thrown away after alias +// analysis is complete. This data structure keeps track of which sets of +// HloValues must be in the same HloBuffer. This is maintained as a map from a +// buffer identifier (BufferNumber) to set of HLoValues. +// +// Initially each value is its own buffer. In MergeAliasedBuffers, sets of +// values which must share the same buffer are merged together. The end result +// is a partitioning of all HloValues into sets where each set needs its own +// HloBuffer. By performing this analysis without constructing HloBuffers on the +// fly, we can after-the-fact construct a vector of contiguously numbered +// HloBuffers after the buffer requirement has been determined. +class BufferValueMap { + public: + // A unique identifier for a set of colocated values which must share the same + // buffer. This is not necessarily the same as the HloBuffer::Id which will + // ultimately contain the values. The reason is that HloBuffer::Id's are + // contiguous, while BufferNumbers may not be. BufferNumbers may not be + // dense because buffers may be created and destroyed during the analysis + // construction process. + using BufferNumber = int64; + + explicit BufferValueMap(const HloDataflowAnalysis& dataflow) + : dataflow_(dataflow) { + buffers_.reserve(dataflow_.values().size()); + value_to_buffer_number_.reserve(dataflow_.values().size()); + for (const HloValue* value : dataflow_.values()) { + BufferNumber buffer_number = next_buffer_number_++; + buffers_[buffer_number].insert(value); + value_to_buffer_number_[value] = buffer_number; + } + } + + // Merge together sets of HloValues which must be in the same HloBuffer + // because of aliasing rules (eg, in-place kWhile instruction). + void MergeAliasedBuffers() { + for (const HloValue* value : dataflow_.values()) { + VLOG(3) << "Merging colocated values, value: " << value->ToShortString(); + + // Gather the set of buffers with aliasing rules (eg, kWhile) which this + // value must be contained in. + std::vector aliased_buffers = ComputeAliasedBuffers(*value); + + BufferNumber current_buffer = value_to_buffer_number_.at(value); + if (aliased_buffers.empty()) { + // The buffer containing 'value' aliases no other buffers. If the buffer + // containing 'value' already only contains 'value', then no change is + // necessary. If the buffer containing 'value' does contain other + // values, then remove 'value' from the buffer and create a new buffer + // containing only 'value' + if (buffers_.at(current_buffer).size() == 1) { + CHECK_EQ(*buffers_.at(current_buffer).begin(), value); + } else { + MoveValueToNewBuffer(*value); + } + } else { + // If multiple buffers are aliased merge these buffers together into a + // single buffer (arbitrarily chosen as the first buffer in the vector). 
+ if (aliased_buffers.size() > 1) { + for (int64 i = 1; i < aliased_buffers.size(); ++i) { + MergeBuffers(/*from=*/aliased_buffers[i], + /*to=*/aliased_buffers[0]); + } + } + BufferNumber new_buffer = aliased_buffers[0]; + if (current_buffer != new_buffer) { + MoveValueToBuffer(*value, new_buffer); + } + } + } + } + + // Compute and return a sorted vector of all BufferNumbers. Can be used to + // iterate through all buffers stabily. + std::vector ComputeSortedBufferNumbers() const { + std::vector buffer_numbers; + for (const auto& pair : buffers_) { + buffer_numbers.push_back(pair.first); + } + std::sort(buffer_numbers.begin(), buffer_numbers.end()); + return buffer_numbers; + } + + // Return a set of all the values in the given buffer. + const tensorflow::gtl::FlatSet& GetValuesInBuffer( + BufferNumber buffer_number) const { + return buffers_.at(buffer_number); + } + + private: + // Create a new buffer. + void NewBuffer(const HloValue& value) { + BufferNumber buffer_number = next_buffer_number_++; + buffers_[buffer_number].insert(&value); + value_to_buffer_number_[&value] = buffer_number; + } + + // Move the given value into a new buffer containing only the value. + void MoveValueToNewBuffer(const HloValue& value) { + BufferNumber new_buffer_number = next_buffer_number_++; + buffers_[new_buffer_number]; + MoveValueToBuffer(value, new_buffer_number); + } + + // Move the given value into the given buffer. + void MoveValueToBuffer(const HloValue& value, BufferNumber buffer_number) { + BufferNumber old_buffer_number = value_to_buffer_number_.at(&value); + buffers_.at(old_buffer_number).erase(&value); + if (buffers_.at(old_buffer_number).empty()) { + buffers_.erase(old_buffer_number); + } + + buffers_.at(buffer_number).insert(&value); + value_to_buffer_number_.at(&value) = buffer_number; + } + + // Merge the buffer 'from' into the buffer 'to'. + void MergeBuffers(BufferNumber from, BufferNumber to) { + auto& from_value_set = buffers_.at(from); + buffers_.at(to).insert(from_value_set.begin(), from_value_set.end()); + // NOTE: using a union-find algorithm to hold the colocated values might be + // faster. + for (const HloValue* value : from_value_set) { + value_to_buffer_number_.at(value) = to; + } + buffers_.erase(from); + } + + BufferNumber GetBufferForValue(const HloValue& value) { + return value_to_buffer_number_.at(&value); + } + + // Compute and return a vector of buffers that the given value must be + // contained in due to HLO aliasing rules. + std::vector ComputeAliasedBuffers(const HloValue& value) { + // Value is init of a while (use is while). + std::vector aliased_buffers; + for (const HloUse& use : value.uses()) { + VLOG(1) << "use of value " << value.ToShortString() << ": " << use; + if (use.instruction->opcode() == HloOpcode::kWhile) { + // Determine the while value that this shares a buffer with. + const HloValue& while_value = + dataflow_.GetUniqueValueAt(use.instruction, use.operand_index); + aliased_buffers.push_back(GetBufferForValue(while_value)); + VLOG(3) << " value is init value to a while; must share buffer with " + "while value " + << while_value.ToShortString(); + } + } + + // Value is a parameter of a while body/condition. 
+ if (value.defining_instruction()->opcode() == HloOpcode::kParameter) { + const HloComputation* computation = + value.defining_instruction()->parent(); + const CallGraphNode& call_graph_node = + dataflow_.call_graph().GetNode(computation); + for (const CallSite& callsite : call_graph_node.caller_callsites()) { + if (callsite.instruction()->opcode() == HloOpcode::kWhile) { + // Call graph must have been flattened. + CHECK_EQ(call_graph_node.caller_callsites().size(), 1); + + const HloValue& while_value = dataflow_.GetUniqueValueAt( + callsite.instruction(), value.defining_index()); + VLOG(3) << " value is parameter value of the body or condition of a " + "while; must share buffer with while value " + << while_value.ToShortString(); + aliased_buffers.push_back(GetBufferForValue(while_value)); + } + } + } + + // Value is the root of a while body. + for (const HloPosition& position : value.positions()) { + const HloComputation* computation = position.instruction->parent(); + const CallGraphNode& call_graph_node = + dataflow_.call_graph().GetNode(computation); + if (position.instruction == computation->root_instruction()) { + for (const CallSite& callsite : call_graph_node.caller_callsites()) { + if (callsite.instruction()->opcode() == HloOpcode::kWhile && + callsite.instruction()->while_body() == computation) { + // Call graph must have been flattened. + CHECK_EQ(call_graph_node.caller_callsites().size(), 1); + + const HloValue& while_value = dataflow_.GetUniqueValueAt( + callsite.instruction(), position.index); + VLOG(3) << " value is root the body computation of a while; must " + "share buffer with while value " + << while_value.ToShortString(); + aliased_buffers.push_back(GetBufferForValue(while_value)); + } + } + } + } + + // Value is the output of the while instruction itself. + if (value.defining_instruction()->opcode() == HloOpcode::kWhile) { + VLOG(3) << " value is output of a while instruction"; + aliased_buffers.push_back(GetBufferForValue(value)); + } + + // Uniquify aliased buffers. + std::sort(aliased_buffers.begin(), aliased_buffers.end()); + aliased_buffers.erase( + std::unique(aliased_buffers.begin(), aliased_buffers.end()), + aliased_buffers.end()); + + return aliased_buffers; + } + + // Dataflow analysis used to construct the buffer map. + const HloDataflowAnalysis& dataflow_; + + // A map containing the set of values contained in each buffer. + tensorflow::gtl::FlatMap> + buffers_; + + // A map indicating which buffer each value is contained in. + tensorflow::gtl::FlatMap + value_to_buffer_number_; + + // The buffer number of the next buffer to be created. + BufferNumber next_buffer_number_ = 0; +}; + HloAliasAnalysis::HloAliasAnalysis(HloModule* module) : module_(module) {} const HloBuffer& HloAliasAnalysis::GetUniqueBufferAt( @@ -99,10 +323,11 @@ bool HloAliasAnalysis::InstructionBuffersAreDistinct( } } else { // It's possible for multiple values at this index to have the same - // HloBuffer. This does not result in non-distictness. To account for this - // case, add all of the buffers at this index after checking whether each - // buffer exists at an earlier index. This is a corner case, however, as - // the number of values at an index is almost always one. + // HloBuffer. This does not result in non-distictness. To account for + // this case, add all of the buffers at this index after checking + // whether each buffer exists at an earlier index. This is a corner + // case, however, as the number of values at an index is almost always + // one. 
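On the BufferValueMap introduced above: MergeBuffers merges by copying one set of values into another, and its NOTE suggests a union-find structure might be faster. The sketch below shows that alternative over plain integer value ids, with path compression and union by size; it is an illustration of the suggestion only, not code from this patch.

#include <cstdint>
#include <cstdio>
#include <numeric>
#include <utility>
#include <vector>

// Hypothetical union-find over value ids; each root identifies one buffer.
class ValueBufferUnionFind {
 public:
  explicit ValueBufferUnionFind(int64_t num_values)
      : parent_(num_values), size_(num_values, 1) {
    std::iota(parent_.begin(), parent_.end(), 0);  // each value is its own buffer
  }

  // Find the buffer representative for a value, with path halving.
  int64_t Find(int64_t v) {
    while (parent_[v] != v) {
      parent_[v] = parent_[parent_[v]];
      v = parent_[v];
    }
    return v;
  }

  // Merge the buffers containing values a and b (union by size).
  void Merge(int64_t a, int64_t b) {
    a = Find(a);
    b = Find(b);
    if (a == b) return;
    if (size_[a] < size_[b]) std::swap(a, b);
    parent_[b] = a;
    size_[a] += size_[b];
  }

 private:
  std::vector<int64_t> parent_;
  std::vector<int64_t> size_;
};

int main() {
  ValueBufferUnionFind uf(5);
  uf.Merge(0, 3);  // e.g. a while init value aliasing the while output value
  uf.Merge(3, 4);  // ... which aliases the body parameter value
  std::printf("0 and 4 share a buffer: %d\n", uf.Find(0) == uf.Find(4));
  std::printf("1 and 4 share a buffer: %d\n", uf.Find(1) == uf.Find(4));
  return 0;
}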
std::vector buffers_at_this_index; for (const HloValue* value : value_set.values()) { const HloBuffer* buffer = &GetBufferContainingValue(*value); @@ -118,15 +343,6 @@ bool HloAliasAnalysis::InstructionBuffersAreDistinct( return true; } -void HloAliasAnalysis::InitializeBufferSets() { - // Initially define a buffer for every HloValue in the module. - for (const HloValue& value : dataflow_analysis_->values()) { - HloBuffer& buffer = NewHloBuffer(); - buffer.AddValue(value); - value_to_buffer_[&value] = &buffer; - } -} - Status HloAliasAnalysis::Verify() const { // Verify consistency between the value_to_buffer_ map and // HloBuffer::values(). @@ -137,9 +353,8 @@ Status HloAliasAnalysis::Verify() const { value) != buffer.values().end()); } - for (const auto& pair : buffers_) { - const HloBuffer::Id id = pair.first; - const HloBuffer& buffer = pair.second; + for (HloBuffer::Id id = 0; id < buffers_.size(); ++id) { + const HloBuffer& buffer = buffers_[id]; TF_RET_CHECK(buffer.id() == id); HloValue::Id last_value_id = -1; @@ -152,116 +367,9 @@ Status HloAliasAnalysis::Verify() const { } } - if (!buffers_vector_.empty()) { - // buffers_vector_ should be a vector of all HloBuffers sorted by id. - std::vector buffers; - for (const auto& id_buffer : buffers_) { - buffers.push_back(&id_buffer.second); - } - std::sort(buffers.begin(), buffers.end(), HloBuffer::IdLessThan); - TF_RET_CHECK(buffers_vector_ == buffers); - } - return Status::OK(); } -Status HloAliasAnalysis::VerifyAgainstReference() const { - TF_RETURN_IF_ERROR(Verify()); - - TF_ASSIGN_OR_RETURN(std::unique_ptr reference, - Run(module_)); - TF_RETURN_IF_ERROR(reference->Verify()); - - VLOG(2) << "This analysis:"; - XLA_VLOG_LINES(2, ToString()); - VLOG(2) << "Reference:"; - XLA_VLOG_LINES(2, reference->ToString()); - - // Create map from HloValue in the reference analysis to HloValue in this - // analysis and vice versa. - tensorflow::gtl::FlatMap reference_to_this; - tensorflow::gtl::FlatMap this_to_reference; - for (const HloValue& value : dataflow_analysis().values()) { - const HloValue& reference_value = - reference->dataflow_analysis().GetValueDefinedAt( - value.defining_instruction(), value.defining_index()); - reference_to_this[&reference_value] = &value; - this_to_reference[&value] = &reference_value; - } - - TF_RET_CHECK(buffers_.size() == reference->buffers_.size()) - << "Different number of buffers (" << buffers_.size() - << " != " << reference->buffers_.size() << ")"; - for (const auto& pair : reference->buffers_) { - const HloBuffer& reference_buffer = pair.second; - - // Find the corresponding buffer in the reference by taking the first value - // in the buffer, finding the corresponding value in the reference, and then - // finding the buffer holding that value. - TF_RET_CHECK(!reference_buffer.values().empty()); - const HloValue* reference_value = reference_buffer.values()[0]; - const HloValue* value = reference_to_this.at(reference_value); - const HloBuffer& buffer = GetBufferContainingValue(*value); - - // The buffer and the reference should have the exact same values. To make - // comparison easy, sort the values in the reference buffer identically to - // the values in the non-reference buffer (ie, by the corresponding id of - // the non-reference value). 
- std::vector reference_values = reference_buffer.values(); - std::sort(reference_values.begin(), reference_values.end(), - [&reference_to_this](const HloValue* a, const HloValue* b) { - return reference_to_this.at(a)->id() < - reference_to_this.at(b)->id(); - }); - TF_RET_CHECK(reference_values.size() == buffer.values().size()); - for (int i = 0; i < buffer.values().size(); ++i) { - TF_RET_CHECK(*reference_values[i] == *buffer.values()[i]) - << "Buffer:\n " << buffer - << "\ndoes not have the same values as reference buffer:\n " - << reference_buffer; - } - } - - return Status::OK(); -} - -HloBuffer& HloAliasAnalysis::NewHloBuffer() { - HloBuffer::Id buffer_id = next_buffer_id_++; - auto emplaced = buffers_.emplace(std::piecewise_construct, - std::forward_as_tuple(buffer_id), - std::forward_as_tuple(buffer_id)); - CHECK(emplaced.second); - - buffers_vector_.clear(); - - return emplaced.first->second; -} - -void HloAliasAnalysis::MoveValueToNewBuffer(const HloValue& value) { - HloBuffer& new_buffer = NewHloBuffer(); - MoveValueToBuffer(value, &new_buffer); - - VLOG(3) << "Moved value " << value.ToShortString() << " into new buffer " - << new_buffer.id(); -} - -void HloAliasAnalysis::MoveValueToBuffer(const HloValue& value, - HloBuffer* buffer) { - HloBuffer& old_buffer = GetBufferContainingValue(value); - CHECK_NE(buffer, &old_buffer); - VLOG(3) << "Moved value " << value.ToShortString() << " from buffer " - << old_buffer.id() << " into buffer " << buffer->id(); - old_buffer.RemoveValue(value); - if (old_buffer.values().empty()) { - VLOG(3) << "Buffer " << old_buffer.id() << " now empty. Removing."; - buffers_.erase(old_buffer.id()); - buffers_vector_.clear(); - } - - buffer->AddValue(value); - value_to_buffer_[&value] = buffer; -} - string HloAliasAnalysis::ToString() const { string out = StrCat("HloAliasAnalysis, module ", module_->name(), "\n"); StrAppend(&out, " Buffers at each position:\n"); @@ -290,10 +398,10 @@ string HloAliasAnalysis::ToString() const { } StrAppend(&out, " Buffers:\n"); - for (const HloBuffer* buffer : buffers()) { - StrAppend(&out, " ", buffer->ToString(), "\n"); + for (const HloBuffer& buffer : buffers()) { + StrAppend(&out, " ", buffer.ToString(), "\n"); StrAppend(&out, " positions:\n"); - for (const HloPosition& position : buffer->ComputePositions()) { + for (const HloPosition& position : buffer.ComputePositions()) { StrAppend(&out, " ", position.ToString(), "\n"); } } @@ -301,217 +409,6 @@ string HloAliasAnalysis::ToString() const { return out; } -const std::vector& HloAliasAnalysis::buffers() const { - if (buffers_vector_.empty()) { - // Lazily construct vector of buffers. 
- buffers_vector_.reserve(buffers_.size()); - for (auto& pair : buffers_) { - buffers_vector_.push_back(&pair.second); - } - std::sort(buffers_vector_.begin(), buffers_vector_.end(), - HloBuffer::IdLessThan); - } else { - CHECK_EQ(buffers_vector_.size(), buffers_.size()); - for (const HloBuffer* buffer : buffers_vector_) { - DCHECK(ContainsKey(buffers_, buffer->id())); - DCHECK(&GetBuffer(buffer->id()) == buffer); - } - } - return buffers_vector_; -} - -void HloAliasAnalysis::UpdateAtInstructions( - tensorflow::gtl::ArraySlice instructions) { - VLOG(4) << "Updated HLO module:"; - XLA_VLOG_LINES(4, module_->ToString()); - - VLOG(3) << "Before update:"; - XLA_VLOG_LINES(3, ToString()); - - std::vector values_to_update; - for (const HloInstruction* instruction : instructions) { - for (auto& pair : dataflow_analysis().GetInstructionValueSet(instruction)) { - for (const HloValue* value : pair.second.values()) { - values_to_update.push_back(value); - } - } - } - - UpdateBuffersForValues(values_to_update); - - VLOG(3) << "After update:"; - XLA_VLOG_LINES(3, ToString()); -} - -void HloAliasAnalysis::UpdateAfterChangingOperand(HloInstruction* instruction, - HloInstruction* old_operand, - HloInstruction* new_operand) { - VLOG(1) << "UpdateAfterChangingOperand(" << instruction->name() << ", " - << old_operand->name() << " => " << new_operand->name() << ")"; - - dataflow_analysis_->UpdateAfterChangingOperand(instruction, old_operand, - new_operand); - TF_DCHECK_OK(dataflow_analysis_->VerifyAgainstReference()); - - VLOG(4) << "Updated dataflow:"; - XLA_VLOG_LINES(4, dataflow_analysis_->ToString()); - - UpdateAtInstructions({instruction, old_operand, new_operand}); -} - -void HloAliasAnalysis::UpdateAfterChangingRoot(HloInstruction* old_root, - HloInstruction* new_root) { - VLOG(1) << "UpdateAfterChangingRoot(" << old_root->name() << " => " - << new_root->name() << ")"; - - dataflow_analysis_->UpdateAfterChangingRoot(old_root, new_root); - TF_DCHECK_OK(dataflow_analysis_->VerifyAgainstReference()); - - VLOG(4) << "Updated dataflow:"; - XLA_VLOG_LINES(4, dataflow_analysis_->ToString()); - - UpdateAtInstructions({old_root, new_root}); -} - -std::vector HloAliasAnalysis::ComputeAliasedBuffers( - const HloValue& value) { - std::vector aliased_buffers; - - // Value is init of a while (use is while). - for (const HloUse& use : value.uses()) { - VLOG(1) << "use of value " << value.ToShortString() << ": " << use; - if (use.instruction->opcode() == HloOpcode::kWhile) { - // Determine the while value that this shares a buffer with. - const HloValue& while_value = dataflow_analysis().GetUniqueValueAt( - use.instruction, use.operand_index); - aliased_buffers.push_back(&GetBufferContainingValue(while_value)); - VLOG(3) << " value is init value to a while; must share buffer with " - "while value " - << while_value.ToShortString(); - } - } - - // Value is a parameter of a while body/condition. - if (value.defining_instruction()->opcode() == HloOpcode::kParameter) { - const HloComputation* computation = value.defining_instruction()->parent(); - const CallGraphNode& call_graph_node = - dataflow_analysis().call_graph().GetNode(computation); - for (const CallSite& callsite : call_graph_node.caller_callsites()) { - if (callsite.instruction()->opcode() == HloOpcode::kWhile) { - // Call graph must have been flattened. 
- CHECK_EQ(call_graph_node.caller_callsites().size(), 1); - - const HloValue& while_value = dataflow_analysis().GetUniqueValueAt( - callsite.instruction(), value.defining_index()); - VLOG(3) << " value is parameter value of the body or condition of a " - "while; must share buffer with while value " - << while_value.ToShortString(); - aliased_buffers.push_back(&GetBufferContainingValue(while_value)); - } - } - } - - // Value is the root of a while body. - for (const HloPosition& position : value.positions()) { - const HloComputation* computation = position.instruction->parent(); - const CallGraphNode& call_graph_node = - dataflow_analysis().call_graph().GetNode(computation); - if (position.instruction == computation->root_instruction()) { - for (const CallSite& callsite : call_graph_node.caller_callsites()) { - if (callsite.instruction()->opcode() == HloOpcode::kWhile && - callsite.instruction()->while_body() == computation) { - // Call graph must have been flattened. - CHECK_EQ(call_graph_node.caller_callsites().size(), 1); - - // If the value appears in the root of a while body, then - // necessarily the value is defined in the body as well. - CHECK_EQ(value.defining_instruction()->parent(), computation); - - const HloValue& while_value = dataflow_analysis().GetUniqueValueAt( - callsite.instruction(), position.index); - VLOG(3) << " value is root the body computation of a while; must " - "share buffer with while value " - << while_value.ToShortString(); - aliased_buffers.push_back(&GetBufferContainingValue(while_value)); - } - } - } - } - - // Value is in the while instruction itself. - if (value.defining_instruction()->opcode() == HloOpcode::kWhile) { - VLOG(3) << " value is output of a while instruction"; - aliased_buffers.push_back(&GetUniqueBufferAt(value.defining_instruction(), - value.defining_index())); - } - - // Uniquify aliased buffers. - std::sort(aliased_buffers.begin(), aliased_buffers.end(), - HloBuffer::IdLessThan); - aliased_buffers.erase( - std::unique(aliased_buffers.begin(), aliased_buffers.end()), - aliased_buffers.end()); - - return aliased_buffers; -} - -// This method recomputes the HloBuffer for each of the given HloValues. The -// method does not necessarily update the HloBuffer of values which share a -// buffer with the given values, but are not explicitly passed in -// 'values'. Therefore, the caller must pass in all values which may require an -// update according to the kind of HLO graph change which occurred: operand -// changed (UpdateAfterChangingOperand), or root of computation changed -// (UpdateAfterChangingRoot). -void HloAliasAnalysis::UpdateBuffersForValues( - tensorflow::gtl::ArraySlice values) { - for (const HloValue* value : values) { - VLOG(3) << "Updating buffer for value: " << value->ToShortString(); - - // Gather the set of buffer with aliasing rules (eg, kWhile) which this - // value must be contained in due. - std::vector aliased_buffers = ComputeAliasedBuffers(*value); - - HloBuffer& current_buffer = GetBufferContainingValue(*value); - if (aliased_buffers.empty()) { - // The buffer containing 'value' aliases no other buffers. If the buffer - // containing 'value' already only contains 'value', then no change is - // necessary. 
If the buffer containing 'value' does contain other values, - // then remove 'value' from the buffer and create a new buffer containing - // only 'value' - if (current_buffer.values().size() == 1) { - CHECK_EQ(current_buffer.values()[0], value); - } else { - MoveValueToNewBuffer(*value); - } - } else { - // If multiple buffers are aliased merge these buffers together into a - // single buffer (arbitrarily chosen as the first buffer in the vector). - if (aliased_buffers.size() > 1) { - for (int64 i = 1; i < aliased_buffers.size(); ++i) { - // Make copy of values vector because MoveValueToBuffer invalidates - // the values iterator. The could be done more efficiently by moving - // all values and once. - std::vector values = aliased_buffers[i]->values(); - for (const HloValue* value : values) { - MoveValueToBuffer(*value, aliased_buffers[0]); - } - } - aliased_buffers.resize(1); - } - - CHECK_EQ(aliased_buffers.size(), 1); - HloBuffer* new_buffer = aliased_buffers[0]; - - if (¤t_buffer != new_buffer) { - MoveValueToBuffer(*value, new_buffer); - } - } - - VLOG(4) << "Analysis after update:"; - XLA_VLOG_LINES(4, ToString()); - } -} - /* static */ StatusOr> HloAliasAnalysis::Run( HloModule* module) { @@ -524,18 +421,28 @@ StatusOr> HloAliasAnalysis::Run( HloDataflowAnalysis::Run(module, /*ssa_form=*/true, /*bitcast_defines_value=*/false)); - alias_analysis->InitializeBufferSets(); + BufferValueMap buffer_map(alias_analysis->dataflow_analysis()); + buffer_map.MergeAliasedBuffers(); - VLOG(3) << "After initialization:"; - XLA_VLOG_LINES(3, alias_analysis->ToString()); - - std::vector all_values; - for (const HloValue& value : alias_analysis->dataflow_analysis().values()) { - all_values.push_back(&value); + // Create a vector of HloBuffers, one for each set of values in the + // BufferValueMap. Create the HloBuffers as a vector of contiguously numbered + // buffers. + std::vector sorted_buffer_numbers = + buffer_map.ComputeSortedBufferNumbers(); + alias_analysis->buffers_.reserve(sorted_buffer_numbers.size()); + HloBuffer::Id next_id = 0; + for (BufferValueMap::BufferNumber buffer_number : sorted_buffer_numbers) { + auto& value_set = buffer_map.GetValuesInBuffer(buffer_number); + std::vector sorted_values(value_set.begin(), + value_set.end()); + std::sort(sorted_values.begin(), sorted_values.end(), HloValue::IdLessThan); + alias_analysis->buffers_.emplace_back(next_id++, sorted_values); + for (const HloValue* value : sorted_values) { + alias_analysis->value_to_buffer_[value] = + &alias_analysis->buffers_.back(); + } } - alias_analysis->UpdateBuffersForValues(all_values); - TF_DCHECK_OK(alias_analysis->Verify()); XLA_VLOG_LINES(1, alias_analysis->ToString()); diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.h b/tensorflow/compiler/xla/service/hlo_alias_analysis.h index 1b538f6d1cf..39554e46648 100644 --- a/tensorflow/compiler/xla/service/hlo_alias_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.h @@ -74,7 +74,7 @@ class HloAliasAnalysis { // Return a vector of all HloBuffers stabily sorted by HloBuffer::Id. This // vector is lazily computed. Mutating operations on HloAliasAnalysis may // invalidate the underlying vector requiring recomputation. - const std::vector& buffers() const; + const std::vector& buffers() const { return buffers_; } // Returns the underlying dataflow analysis used by this alias analysis. const HloDataflowAnalysis& dataflow_analysis() const { @@ -90,50 +90,13 @@ class HloAliasAnalysis { // output of the given instruction. 
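A short illustration of the rewritten Run() above: after MergeAliasedBuffers finishes, the surviving buffer numbers (which may be sparse) are sorted and renumbered into dense, contiguous HloBuffer ids, and each value is pointed at the buffer that now owns it. The stand-alone sketch below imitates that renumbering with ordinary containers and made-up value names; it is not the real XLA code.

#include <cstdint>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  // Sparse buffer numbers -> values, as a BufferValueMap-style partition
  // might leave them after merging (numbers in between were consumed).
  std::map<int64_t, std::vector<std::string>> partition = {
      {0, {"constant1", "while{0}"}},
      {3, {"constant2"}},
      {7, {"add"}},
  };

  // Renumber into dense, contiguous buffer ids in sorted-number order.
  struct Buffer {
    int64_t id;
    std::vector<std::string> values;
  };
  std::vector<Buffer> buffers;
  std::map<std::string, int64_t> value_to_buffer;
  int64_t next_id = 0;
  for (const auto& [number, values] : partition) {  // std::map iterates sorted
    buffers.push_back({next_id, values});
    for (const auto& v : values) value_to_buffer[v] = next_id;
    ++next_id;
  }

  for (const auto& b : buffers) {
    std::printf("buffer %lld holds %zu value(s)\n",
                static_cast<long long>(b.id), b.values.size());
  }
  std::printf("'constant2' lives in buffer %lld\n",
              static_cast<long long>(value_to_buffer["constant2"]));
  return 0;
}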
bool InstructionBuffersAreDistinct(const HloInstruction* instruction) const; - // Updates the analysis after the operands of 'instruction' have changed or if - // 'instruction' has been made the root of a computation. Analysis update is - // not possible if instructions have been added or removed from the graph. - void UpdateAfterChangingOperand(HloInstruction* instruction, - HloInstruction* old_operand, - HloInstruction* new_operand); - void UpdateAfterChangingRoot(HloInstruction* old_root, - HloInstruction* new_root); - // Compare the dataflow analysis against a clean recomputation of the // analysis. Returns an error status if there is a mismatch. Useful for // verifying the correctness after updates to the analysis. Status VerifyAgainstReference() const; protected: - HloAliasAnalysis(HloModule* module); - - // Create a new empty HloBuffer. - HloBuffer& NewHloBuffer(); - - // Move the given value to the given buffer. The value is removed from it's - // current buffer. - void MoveValueToBuffer(const HloValue& value, HloBuffer* buffer); - - // Move the given value to a newly created buffer. The value is removed from - // it's current buffer. - void MoveValueToNewBuffer(const HloValue& value); - - // Construct the initial set of buffer sets where an HloBuffer is created for - // each HloValue in the module. - void InitializeBufferSets(); - - // Compute and return the buffers with aliasing rules (eg, kWhile) which the - // given value must be contained in. - std::vector ComputeAliasedBuffers(const HloValue& value); - - // Recompute the HloBuffers for the given values. - void UpdateBuffersForValues( - tensorflow::gtl::ArraySlice values); - - // Recompute the HloBuffers for all the values which appear in the output of - // the given instructions. - void UpdateAtInstructions( - tensorflow::gtl::ArraySlice instructions); + explicit HloAliasAnalysis(HloModule* module); // Verify various invariants of the alias analysis. Status Verify() const; @@ -143,20 +106,12 @@ class HloAliasAnalysis { // The underlying dataflow analysis used by this alias analysis. std::unique_ptr dataflow_analysis_; - // The map of all HloBuffers in the module. We pass around pointers to the - // mapped HloBuffers, so the underlying container must keep them valid despite - // mutations touching other map entries. - std::unordered_map buffers_; - // A map indicating which buffer a value is contained in. tensorflow::gtl::FlatMap value_to_buffer_; // A lazily constructed vector containing all HloBuffers sorted by // HloBuffer::Id. - mutable std::vector buffers_vector_; - - // The Id to use for the next HloBuffer. - int64 next_buffer_id_ = 0; + std::vector buffers_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc index e2815d6e648..6e311e25fb9 100644 --- a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc @@ -87,14 +87,13 @@ class HloAliasAnalysisTest : public HloTestBase { // constructed. 
bool AnyValuesInSameBufferInterfere() { DependencyHloOrdering ordering(module_.get()); - for (const HloBuffer* buffer : analysis_->buffers()) { - for (const HloValue* value_a : buffer->values()) { - for (const HloValue* value_b : buffer->values()) { + for (const HloBuffer& buffer : analysis_->buffers()) { + for (const HloValue* value_a : buffer.values()) { + for (const HloValue* value_b : buffer.values()) { if (*value_a != *value_b && - analysis_->dataflow_analysis().MayInterfere(*value_a, *value_b, - ordering)) { + ordering.MayInterfere(*value_a, *value_b)) { VLOG(1) << *value_a << " interferes with " << *value_b - << " in buffer: " << *buffer; + << " in buffer: " << buffer; return true; } } @@ -384,10 +383,7 @@ TEST_F(HloAliasAnalysisTest, SingleWhile) { EXPECT_THAT( GetValuesInBuffer(analysis.GetUniqueBufferAt(xla_while, /*index=*/{0})), - UnorderedElementsAre(GetValueDefinedAt(xla_while, /*index=*/{0}), - GetValueDefinedAt(body_param, /*index=*/{0}), - GetValueDefinedAt(cond_param, /*index=*/{0}), - GetValueDefinedAt(constant1))); + UnorderedElementsAre(GetValueDefinedAt(constant1))); EXPECT_THAT( GetValuesInBuffer(analysis.GetUniqueBufferAt(xla_while, /*index=*/{1})), UnorderedElementsAre(GetValueDefinedAt(constant2), @@ -631,9 +627,9 @@ TEST_F(HloAliasAnalysisTest, SwizzlingWhile) { // HloBuffers. EXPECT_THAT( analysis.buffers(), - UnorderedElementsAre(&analysis.GetUniqueBufferAt(constant1), - &analysis.GetUniqueBufferAt(tuple, /*index=*/{}), - &analysis.GetUniqueBufferAt(cond_constant))); + UnorderedElementsAre(analysis.GetUniqueBufferAt(constant1), + analysis.GetUniqueBufferAt(tuple, /*index=*/{}), + analysis.GetUniqueBufferAt(cond_constant))); // The tuple elements of the while and the three constant inputs should all be // smooshed into the same buffer. @@ -820,127 +816,5 @@ TEST_F(HloAliasAnalysisTest, Bitcast) { analysis.GetUniqueBufferAt(bitcast)); } -TEST_F(HloAliasAnalysisTest, UpdateAnalysisForWhile) { - // Test updating alias analysis after modifying a module with an array shaped - // while: - // - // body(F32[] %param): - // %negate = Negate(%param) - // - // condition(F32[] %param): - // return Constant(false) - // - // entry: - // %constant = Constant(1.0) - // %exp = Exp(%constant) - // return While(%exp, body, condition) - // - auto body_builder = HloComputation::Builder("body"); - auto body_param = body_builder.AddInstruction( - HloInstruction::CreateParameter(0, scalar_shape_, "param")); - auto negate = body_builder.AddInstruction(HloInstruction::CreateUnary( - scalar_shape_, HloOpcode::kNegate, body_param)); - HloComputation* body = module_->AddEmbeddedComputation(body_builder.Build()); - - // Condition computation trivially returns a constant "false". 
- auto cond_builder = HloComputation::Builder("condition"); - auto cond_param = cond_builder.AddInstruction( - HloInstruction::CreateParameter(0, scalar_shape_, "param")); - cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); - HloComputation* condition = - module_->AddEmbeddedComputation(cond_builder.Build()); - - auto builder = HloComputation::Builder(TestName()); - auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); - auto exp = builder.AddInstruction( - HloInstruction::CreateUnary(scalar_shape_, HloOpcode::kExp, constant)); - auto xla_while = builder.AddInstruction( - HloInstruction::CreateWhile(scalar_shape_, condition, body, exp)); - module_->AddEntryComputation(builder.Build()); - - HloAliasAnalysis& analysis = RunAnalysis(); - - // Sanity check some alias information. - EXPECT_EQ(analysis.GetUniqueBufferAt(exp), - analysis.GetUniqueBufferAt(body_param)); - EXPECT_EQ(analysis.GetUniqueBufferAt(exp), - analysis.GetUniqueBufferAt(cond_param)); - EXPECT_EQ(analysis.GetUniqueBufferAt(exp), - analysis.GetUniqueBufferAt(negate)); - EXPECT_EQ(analysis.GetUniqueBufferAt(exp), - analysis.GetUniqueBufferAt(xla_while)); - - // Set the body root to the body_param. Previously it was Negate(body_param). - body->set_root_instruction(body_param); - - // Prior to updating, verify that the analysis is no longer valid. - Status verify_status = analysis.VerifyAgainstReference(); - EXPECT_FALSE(verify_status.ok()); - - analysis.UpdateAfterChangingRoot(/*old_root=*/negate, - /*new_root*/ body_param); - - // Analysis should be valid after the update. - TF_ASSERT_OK(analysis.VerifyAgainstReference()); - - // The exponential should now pass through the body transparently. - EXPECT_EQ(analysis.GetUniqueBufferAt(exp), - analysis.GetUniqueBufferAt(body_param)); - EXPECT_EQ(analysis.GetUniqueBufferAt(exp), - analysis.GetUniqueBufferAt(cond_param)); - EXPECT_NE(analysis.GetUniqueBufferAt(exp), - analysis.GetUniqueBufferAt(negate)); - EXPECT_EQ(analysis.GetUniqueBufferAt(exp), - analysis.GetUniqueBufferAt(xla_while)); - - // Now replace the operand of the while with %constant (was %exp). - TF_ASSERT_OK(exp->ReplaceUseWith(xla_while, constant)); - analysis.UpdateAfterChangingOperand(xla_while, /*old_operand=*/exp, - /*new_operand=*/constant); - - // Analysis should be valid after the update. - TF_ASSERT_OK(analysis.VerifyAgainstReference()); - - EXPECT_EQ(analysis.GetUniqueBufferAt(constant), - analysis.GetUniqueBufferAt(body_param)); - EXPECT_EQ(analysis.GetUniqueBufferAt(constant), - analysis.GetUniqueBufferAt(cond_param)); - EXPECT_EQ(analysis.GetUniqueBufferAt(constant), - analysis.GetUniqueBufferAt(xla_while)); - EXPECT_NE(analysis.GetUniqueBufferAt(constant), - analysis.GetUniqueBufferAt(exp)); - EXPECT_NE(analysis.GetUniqueBufferAt(constant), - analysis.GetUniqueBufferAt(negate)); - - // And finally make the negate the root of the body again. - body->set_root_instruction(negate); - analysis.UpdateAfterChangingRoot(/*old_root=*/body_param, - /*new_root*/ negate); - - // Analysis should be valid after the update. 
- TF_ASSERT_OK(analysis.VerifyAgainstReference()); - - EXPECT_EQ(analysis.GetUniqueBufferAt(negate), - analysis.GetUniqueBufferAt(body_param)); - EXPECT_EQ(analysis.GetUniqueBufferAt(negate), - analysis.GetUniqueBufferAt(cond_param)); - EXPECT_EQ(analysis.GetUniqueBufferAt(negate), - analysis.GetUniqueBufferAt(xla_while)); - EXPECT_EQ(analysis.GetUniqueBufferAt(constant), - analysis.GetUniqueBufferAt(negate)); - - auto value_of = [&analysis](const HloInstruction* instruction) { - return &analysis.dataflow_analysis().GetValueDefinedAt(instruction); - }; - EXPECT_THAT(analysis.GetUniqueBufferAt(negate).values(), - UnorderedElementsAre(value_of(body_param), value_of(cond_param), - value_of(negate), value_of(constant), - value_of(xla_while))); -} - -// Test update tuple element. - } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_buffer.cc b/tensorflow/compiler/xla/service/hlo_buffer.cc index 2bfdd9156ad..e16413f361f 100644 --- a/tensorflow/compiler/xla/service/hlo_buffer.cc +++ b/tensorflow/compiler/xla/service/hlo_buffer.cc @@ -36,22 +36,6 @@ namespace xla { using ::tensorflow::str_util::Join; using ::tensorflow::strings::StrCat; -void HloBuffer::AddValue(const HloValue& value) { - values_.push_back(&value); - // Sort vector and remove duplicates. - std::sort(values_.begin(), values_.end(), HloValue::IdLessThan); - values_.erase(std::unique(values_.begin(), values_.end(), HloValue::IdEqual), - values_.end()); -} - -void HloBuffer::RemoveValue(const HloValue& value) { - // The values are sorted, so finding the value could be done in log(n) time - // with a binary search. - auto it = std::find(values_.begin(), values_.end(), &value); - CHECK(it != values_.end()); - values_.erase(it); -} - bool HloBuffer::operator==(const HloBuffer& other) const { bool equal = id() == other.id(); if (equal) { diff --git a/tensorflow/compiler/xla/service/hlo_buffer.h b/tensorflow/compiler/xla/service/hlo_buffer.h index cb961e1601c..4873463b2ea 100644 --- a/tensorflow/compiler/xla/service/hlo_buffer.h +++ b/tensorflow/compiler/xla/service/hlo_buffer.h @@ -84,22 +84,15 @@ class HloBuffer { return a->id() == b->id(); } - HloBuffer(Id id) : id_(id) {} + HloBuffer(Id id, tensorflow::gtl::ArraySlice values) + : id_(id), values_(values.begin(), values.end()) {} // Return the unique identifier for this HloBuffer. Id id() const { return id_; } - // Add a value to the set of values held by this buffer. Also adds the - // HloPositions of the value to the positions vector of the buffer. If the - // buffer already contains this value, then this method is a nop. - void AddValue(const HloValue& value); - void RemoveValue(const HloValue& value); - // Return all values contained in this buffer. const std::vector& values() const { return values_; } - std::vector ComputePositions() const; - // Return the unique HLO value in the buffer. CHECK fails if the buffer does // not contain exactly one value. const HloValue& GetUniqueValue() const { @@ -107,6 +100,8 @@ class HloBuffer { return *values_[0]; } + std::vector ComputePositions() const; + string ToString() const; bool operator==(const HloBuffer& other) const; @@ -118,7 +113,7 @@ class HloBuffer { // The set of values contained in this buffer. Vector contains no duplicates // and is sorted stably by HloValue::Id. 
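On the hlo_buffer.h change above: AddValue/RemoveValue disappear and an HloBuffer is now built exactly once from an already-sorted list of values, which is what allows the values_ member to become const. A rough stand-alone sketch of that shape, using simplified stand-in types rather than HloValue and ArraySlice.

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

struct FakeValue {  // stand-in for HloValue
  int64_t id;
};

class Buffer {  // stand-in for the reworked, immutable HloBuffer
 public:
  using Id = int64_t;

  // Values arrive fully formed and sorted by id; no mutation afterwards.
  Buffer(Id id, std::vector<const FakeValue*> values)
      : id_(id), values_(std::move(values)) {}

  Id id() const { return id_; }
  const std::vector<const FakeValue*>& values() const { return values_; }

 private:
  const Id id_;
  const std::vector<const FakeValue*> values_;  // fixed at construction
};

int main() {
  FakeValue a{1}, b{4};
  Buffer buffer(/*id=*/0, {&a, &b});
  std::printf("buffer %lld holds %zu value(s)\n",
              static_cast<long long>(buffer.id()), buffer.values().size());
  return 0;
}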
- std::vector values_; + const std::vector values_; }; std::ostream& operator<<(std::ostream& out, const HloBuffer& buffer); diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index 9dbde0ec243..f6b764732b4 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -118,13 +118,11 @@ Status HloCostAnalysis::HandleElementwiseOp(HloInstruction* hlo_instruction) { } } -Status HloCostAnalysis::HandleElementwiseUnary(HloInstruction* hlo, - HloOpcode opcode) { +Status HloCostAnalysis::HandleElementwiseUnary(HloInstruction* hlo) { return HandleElementwiseOp(hlo); } -Status HloCostAnalysis::HandleElementwiseBinary(HloInstruction* hlo, - HloOpcode opcode) { +Status HloCostAnalysis::HandleElementwiseBinary(HloInstruction* hlo) { return HandleElementwiseOp(hlo); } diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h index 6d8fdfa64b5..eeb3d4edd1b 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h @@ -49,9 +49,8 @@ class HloCostAnalysis : public DfsHloVisitor { using ShapeSizeFunction = std::function; explicit HloCostAnalysis(const ShapeSizeFunction& shape_size); - Status HandleElementwiseUnary(HloInstruction* hlo, HloOpcode opcode) override; - Status HandleElementwiseBinary(HloInstruction* hlo, - HloOpcode opcode) override; + Status HandleElementwiseUnary(HloInstruction* hlo) override; + Status HandleElementwiseBinary(HloInstruction* hlo) override; Status HandleConstant(HloInstruction* constant, const Literal& literal) override; Status HandleGetTupleElement(HloInstruction* get_tuple_element, diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc index ea8b239e100..2be1645f1b0 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc @@ -67,6 +67,22 @@ HloValue& HloDataflowAnalysis::GetValueDefinedAt( return GetUniqueValueAt(instruction, index); } +HloValue* HloDataflowAnalysis::NewHloValue(HloInstruction* instruction, + const ShapeIndex& index, + bool is_phi) { + const int64 value_id = next_value_id_++; + auto emplaced = values_.emplace( + std::piecewise_construct, std::forward_as_tuple(value_id), + std::forward_as_tuple(value_id, instruction, index, is_phi)); + CHECK(emplaced.second); + + return &emplaced.first->second; +} + +void HloDataflowAnalysis::DeleteHloValue(HloValue::Id value_id) { + values_.erase(value_id); +} + string HloDataflowAnalysis::ToString() const { string out = StrCat("HloDataflowAnalysis, module ", module_->name(), "\n"); StrAppend(&out, " Instruction value sets:\n"); @@ -99,22 +115,98 @@ string HloDataflowAnalysis::ToString() const { } } StrAppend(&out, " HloValues:\n"); - for (const HloValue& value : values()) { - StrAppend(&out, value.ToString(/*indent=*/4)); - } - StrAppend(&out, " Phi resolutions:\n"); - for (const HloValue& value : values()) { - if (value.is_phi()) { - const HloValue* resolved_value = ResolvePhi(value); - StrAppend(&out, " ", value.ToShortString(), " => ", - resolved_value == nullptr ? 
"UNKNOWN" - : resolved_value->ToShortString(), - "\n"); - } + for (const HloValue* value : values()) { + StrAppend(&out, value->ToString(/*indent=*/4)); } return out; } +bool HloDataflowAnalysis::Phi( + HloInstruction* instruction, + tensorflow::gtl::ArraySlice inputs) { + CHECK(ssa_form_); + + for (const InstructionValueSet* input : inputs) { + DCHECK(ShapeUtil::Compatible(instruction->shape(), input->shape())); + } + + bool changed = false; + for (auto& pair : GetInstructionValueSet(instruction)) { + const ShapeIndex& index = pair.first; + HloValueSet& value_set = pair.second; + + // Positions with phi values should never have more than one value in the + // value set. + CHECK_LE(value_set.values().size(), 1); + const HloValue* current_value = + value_set.values().size() == 1 ? value_set.values()[0] : nullptr; + + // Construct a vector of unique value IDs of the inputs. + std::vector input_value_ids; + for (const InstructionValueSet* input : inputs) { + for (const HloValue* value : input->element(index).values()) { + input_value_ids.push_back(value->id()); + } + } + std::sort(input_value_ids.begin(), input_value_ids.end()); + input_value_ids.erase( + std::unique(input_value_ids.begin(), input_value_ids.end()), + input_value_ids.end()); + + // Remove the existing phi value (if it exists). The phi can be its own + // input, for example, in while body parameters where the body passes + // through the parameter value. + bool current_value_defined_here = + (current_value != nullptr && + current_value->defining_instruction() == instruction && + current_value->defining_index() == index); + if (current_value_defined_here) { + CHECK(current_value->is_phi()); + auto it = std::find(input_value_ids.begin(), input_value_ids.end(), + current_value->id()); + if (it != input_value_ids.end()) { + input_value_ids.erase(it); + } + } + + if (input_value_ids.empty()) { + // A value set which has at least one element should never have its value + // set reduced to zero elements. During dataflow value sets only can go + // from empty to non-empty, not the reverse. + CHECK_EQ(value_set.values().size(), 0) + << "Instruction " << instruction->name() << " at index " << index + << " previously had non-empty value set. Value set: " << value_set; + } else if (input_value_ids.size() == 1) { + // Only a single value reaches this point. There should be no phi, and + // this value set should contain this single value. + const HloValue& new_value = GetValue(input_value_ids[0]); + if (current_value == nullptr) { + value_set.Clear(); + value_set.AddValue(&new_value); + changed = true; + } else if (current_value != &new_value) { + if (current_value_defined_here) { + // Remove the existing phi. + DeleteHloValue(current_value->id()); + } + value_set.Clear(); + value_set.AddValue(&new_value); + changed = true; + } + } else { + // Multiple distinct values reach this point. A phi value is + // necessary. 
+ CHECK_GT(input_value_ids.size(), 1); + if (current_value == nullptr || !current_value->is_phi()) { + value_set.Clear(); + value_set.AddValue(NewHloValue(instruction, index, /*is_phi=*/true)); + changed = true; + } + } + } + return changed; +} + const HloValue& HloDataflowAnalysis::GetValue(HloValue::Id value_id) const { return values_.at(value_id); } @@ -142,129 +234,6 @@ HloValueSet& HloDataflowAnalysis::GetValueSet(const HloPosition& position) { return GetValueSet(position.instruction, position.index); } -void HloDataflowAnalysis::UpdateAfterChangingOperand( - HloInstruction* instruction, HloInstruction* old_operand, - HloInstruction* new_operand) { - CHECK(std::find(instruction->operands().begin(), - instruction->operands().end(), - new_operand) != instruction->operands().end()); - VLOG(1) << "UpdateAfterChangingOperand(" << instruction->name() << ", " - << old_operand->name() << " => " << new_operand->name() << ")"; - - std::vector to_update = {instruction}; - - // If the instruction calls any computations then add the parameters of called - // computation to capture any changes to the dataflow into the subcomputation - // introduced by the new operand. - for (HloComputation* computation : instruction->called_computations()) { - to_update.insert(to_update.end(), - computation->parameter_instructions().begin(), - computation->parameter_instructions().end()); - } - - UpdateInstructionsAndPropagate(to_update); - - // The uses of the values in the old and new operand may have changed. Uses of - // other HloValues are updated in UpdateInstructionsAndPropagate. - for (auto& pair : GetInstructionValueSet(old_operand)) { - for (const HloValue* value : pair.second.values()) { - GetValue(value->id()).RecomputeUses(); - } - } - for (auto& pair : GetInstructionValueSet(new_operand)) { - for (const HloValue* value : pair.second.values()) { - GetValue(value->id()).RecomputeUses(); - } - } - - TF_DCHECK_OK(VerifyAgainstReference()); -} - -void HloDataflowAnalysis::UpdateAfterChangingRoot(HloInstruction* old_root, - HloInstruction* new_root) { - VLOG(1) << "UpdateAfterChangingRoot(" << old_root->name() << " => " - << new_root->name() << ")"; - - CHECK_EQ(new_root, new_root->parent()->root_instruction()); - CHECK_EQ(new_root->parent(), old_root->parent()); - - std::vector to_update = {old_root, new_root}; - - const CallGraphNode& call_graph_node = - call_graph_->GetNode(new_root->parent()); - for (const CallSite& callsite : call_graph_node.caller_callsites()) { - if (callsite.instruction()->opcode() == HloOpcode::kCall) { - to_update.push_back(callsite.instruction()); - } else if (callsite.instruction()->opcode() == HloOpcode::kWhile) { - // Add the while itself, and the body and condition parameters. - to_update.push_back(callsite.instruction()); - to_update.push_back( - callsite.instruction()->while_body()->parameter_instruction(0)); - to_update.push_back( - callsite.instruction()->while_condition()->parameter_instruction(0)); - } - } - - UpdateInstructionsAndPropagate(to_update); - - TF_DCHECK_OK(VerifyAgainstReference()); -} - -const HloValue* HloDataflowAnalysis::ResolvePhi(const HloValue& phi) const { - CHECK(phi.is_phi()); - - tensorflow::gtl::FlatSet visited; - std::queue worklist; - auto add_to_worklist = [&worklist, &visited](const HloValue* v) { - if (visited.insert(v).second) { - // 'v' was not previously in visited. 
- worklist.push(v); - } - }; - add_to_worklist(&phi); - - const HloValue* resolved_value = nullptr; - while (!worklist.empty()) { - const HloValue* value = worklist.front(); - worklist.pop(); - - if (!value->is_phi()) { - if (resolved_value == nullptr) { - resolved_value = value; - } else if (resolved_value != value) { - return nullptr; - } - } else { - for (const HloValue* input : phi_inputs_.at(value)) { - add_to_worklist(input); - } - } - } - return resolved_value; -} - -void HloDataflowAnalysis::UpdatePhiInputs( - const HloInstruction* instruction, - tensorflow::gtl::ArraySlice inputs) { - CHECK(ssa_form_); - for (auto& pair : GetInstructionValueSet(instruction)) { - const ShapeIndex& index = pair.first; - const HloValue& phi_value = GetUniqueValueAt(instruction, index); - auto& phi_inputs = phi_inputs_.at(&phi_value); - phi_inputs.clear(); - for (const InstructionValueSet* input : inputs) { - for (const HloValue* value : input->element(index).values()) { - // The number of phi inputs is typically 2, and virtually always very - // small. - if (std::find(phi_inputs.begin(), phi_inputs.end(), value) == - phi_inputs.end()) { - phi_inputs.push_back(value); - } - } - } - } -} - bool HloDataflowAnalysis::UpdateBitcastValueSet(HloInstruction* bitcast) { CHECK_EQ(bitcast->opcode(), HloOpcode::kBitcast); const InstructionValueSet& operand_set = @@ -380,8 +349,7 @@ bool HloDataflowAnalysis::UpdateParameterValueSet(HloInstruction* parameter) { } if (ssa_form_ && called_from_while) { - UpdatePhiInputs(parameter, inputs); - return false; + return Phi(parameter, inputs); } else { return GetInstructionValueSet(parameter).AssignUnionOf(inputs); } @@ -439,8 +407,7 @@ bool HloDataflowAnalysis::UpdateWhileValueSet(HloInstruction* xla_while) { &GetInstructionValueSet(xla_while->while_body()->root_instruction()), &GetInstructionValueSet(xla_while->operand(0))}; if (ssa_form_) { - UpdatePhiInputs(xla_while, inputs); - return false; + return Phi(xla_while, inputs); } else { return GetInstructionValueSet(xla_while).AssignUnionOf(inputs); } @@ -487,38 +454,7 @@ void HloDataflowAnalysis::UpdateInstructionsAndPropagate( VLOG(3) << "Worklist top: " << instruction->name(); VLOG(3) << ToString(); - // The updating of the instruction value set below in - // UpdateInstructionValueSet does not update HloValue::positions(). To - // perform the positions() update remove all positions in 'instruction' from - // the HloValues in 'instruction's value set prior to the update, then after - // the update add the new positions back in. There is likely a more - // efficient way of doing this. - for (auto& pair : GetInstructionValueSet(instruction)) { - const ShapeIndex& index = pair.first; - HloValueSet& value_set = pair.second; - for (const HloValue* value : value_set.values()) { - if (value->defining_instruction() != instruction) { - // Use GetValue for a non-const HloValue reference. - GetValue(value->id()).RemovePosition(instruction, index); - } - } - } - - bool changed = UpdateInstructionValueSet(instruction); - - // Add the positions back in. - for (auto& pair : GetInstructionValueSet(instruction)) { - const ShapeIndex& index = pair.first; - HloValueSet& value_set = pair.second; - for (const HloValue* value : value_set.values()) { - if (value->defining_instruction() != instruction) { - // Use GetValue for a non-const HloValue reference. - GetValue(value->id()).AddPosition(instruction, index); - } - } - } - - if (!changed) { + if (!UpdateInstructionValueSet(instruction)) { // No change to the instruction's value set. 
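An aside on the Phi() routine above, which UpdateParameterValueSet and UpdateWhileValueSet now call directly instead of UpdatePhiInputs: for each shape index it deduplicates the ids of the values flowing in, removes the position's own phi from that input list (a phi can legitimately feed itself, e.g. a while body that passes its parameter straight through), and then either leaves the set empty, forwards the single remaining value, or installs a phi when more than one distinct value arrives. The toy function below exercises just that decision over integer value ids; the names are hypothetical.

#include <algorithm>
#include <cstdio>
#include <optional>
#include <vector>

enum class PhiDecision { kKeepEmpty, kForwardSingleValue, kNeedPhi };

// Decide what a phi position should hold, given the value ids reaching it.
PhiDecision DecidePhi(std::vector<int> input_value_ids,
                      std::optional<int> existing_phi_id) {
  std::sort(input_value_ids.begin(), input_value_ids.end());
  input_value_ids.erase(
      std::unique(input_value_ids.begin(), input_value_ids.end()),
      input_value_ids.end());
  // The position's own phi does not count as a distinct input.
  if (existing_phi_id) {
    auto it = std::find(input_value_ids.begin(), input_value_ids.end(),
                        *existing_phi_id);
    if (it != input_value_ids.end()) input_value_ids.erase(it);
  }
  if (input_value_ids.empty()) return PhiDecision::kKeepEmpty;
  if (input_value_ids.size() == 1) return PhiDecision::kForwardSingleValue;
  return PhiDecision::kNeedPhi;
}

int main() {
  std::printf("{7, 9} with phi 9    -> %d\n",
              static_cast<int>(DecidePhi({7, 9}, 9)));   // forwards value 7
  std::printf("{7, 8} without a phi -> %d\n",
              static_cast<int>(DecidePhi({7, 8}, {})));  // phi needed
  return 0;
}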
VLOG(4) << "No change."; continue; @@ -531,12 +467,16 @@ void HloDataflowAnalysis::UpdateInstructionsAndPropagate( for (HloInstruction* user : instruction->users()) { worklist.push(user); - // If user calls a computation, then the respective parameter(s) of the - // computation need to be updated. + // If user sequentially calls a computation, then the respective + // parameter(s) of the computation need to be updated. for (HloComputation* called_computation : user->called_computations()) { - for (int64 operand_number : user->OperandIndices(instruction)) { - worklist.push( - called_computation->parameter_instruction(operand_number)); + const CallGraphNode& call_graph_node = + call_graph_->GetNode(called_computation); + if (call_graph_node.context() == CallContext::kSequential) { + for (int64 operand_number : user->OperandIndices(instruction)) { + worklist.push( + called_computation->parameter_instruction(operand_number)); + } } } } @@ -574,25 +514,10 @@ InstructionValueSet& HloDataflowAnalysis::GetInstructionValueSet( } Status HloDataflowAnalysis::InitializeInstructionValueSets() { - // Gather the values to create before creating them. This is done because we - // want to allocate the vector of values only once so references to elements - // are stable. - struct ValueToCreate { - HloInstruction* instruction; - ShapeIndex index; - bool is_phi; - }; - std::vector values_to_create; - for (const std::unique_ptr& computation : module_->computations()) { const CallGraphNode& call_graph_node = call_graph_->GetNode(computation.get()); - bool called_from_while = std::any_of( - call_graph_node.caller_callsites().begin(), - call_graph_node.caller_callsites().end(), [](const CallSite& cs) { - return cs.instruction()->opcode() == HloOpcode::kWhile; - }); for (const std::unique_ptr& instruction : computation->instructions()) { @@ -603,20 +528,22 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() { // Lambda to set the value set to define all values in the output of the // instruction. - auto define_all_values = [this, &instruction, - &values_to_create](bool is_phi = false) { + auto define_all_values = [this, &instruction](bool is_phi = false) { for (auto& pair : GetInstructionValueSet(instruction.get())) { const ShapeIndex& index = pair.first; - values_to_create.push_back({instruction.get(), index, is_phi}); + HloValue* value = + NewHloValue(instruction.get(), index, /*is_phi=*/false); + GetValueSet(instruction.get(), index).AddValue(value); } }; // Lambda to set the value set to define only the top-level buffer in the // output of the instruction. Any other values flow from the operands of // the instruction (or from cross-computation dataflow). - auto define_top_level_only = [this, &instruction, &values_to_create]() { - values_to_create.push_back( - {instruction.get(), /*index=*/{}, /*is_phi=*/false}); + auto define_top_level_only = [this, &instruction]() { + HloValue* value = + NewHloValue(instruction.get(), /*index=*/{}, /*is_phi=*/false); + GetValueSet(instruction.get(), /*index=*/{}).AddValue(value); }; switch (instruction->opcode()) { @@ -626,10 +553,6 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() { } break; case HloOpcode::kWhile: - if (ssa_form_) { - define_all_values(/*is_phi=*/true); - } - break; case HloOpcode::kCall: case HloOpcode::kGetTupleElement: // These instructions define no values. The values in their output @@ -654,10 +577,6 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() { // values in their output. 
Otherwise the values of the parameter // come from the caller (eg, operands to the kCall instruction). define_all_values(); - } else if (call_graph_node.context() == CallContext::kSequential && - called_from_while && ssa_form_) { - // Parameters of while bodies and conditions are phis. - define_all_values(/*is_phi=*/true); } break; case HloOpcode::kCopy: @@ -674,164 +593,9 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() { } } - // Reserve the vector ahead of time so references to elements are stable. - values_.reserve(values_to_create.size()); - for (int64 i = 0; i < values_to_create.size(); ++i) { - const ValueToCreate& to_create = values_to_create[i]; - values_.emplace_back(/*id=*/i, to_create.instruction, to_create.index, - to_create.is_phi); - const HloValue& value = values_.back(); - GetValueSet(to_create.instruction, to_create.index).AddValue(&value); - if (value.is_phi()) { - phi_inputs_[&value] = {}; - } - } return Status::OK(); } -bool HloDataflowAnalysis::IsDefinedBefore(const HloValue& a, const HloValue& b, - const HloOrdering& ordering) const { - // If 'b' is an entry param then 'a' cannot be defined before 'b' because 'b' - // is live into the module. - if (b.defining_instruction()->parent() == module_->entry_computation() && - b.defining_instruction()->opcode() == HloOpcode::kParameter) { - return false; - } - - // Phi values require special handling. Because XLA does not have a phi - // instruction, the definition instruction of the phis values are - // placeholders: either the subcomputation parameter (body or condition) or - // the while instruction. However, the program point where these values are - // logically defined does not necessarily coincide exactly with program point - // of these place-holder instructions. So we explicitly define the following - // order for phi values: - // - // body/condition parameter phi: - // Defined before all values defined in its computation excepting other - // phis. - // - // while phi: - // defined after all values defined in the condition or body. - // - auto is_body_or_condition_phi = [](const HloValue& v) { - return v.is_phi() && - v.defining_instruction()->opcode() == HloOpcode::kParameter; - }; - if (is_body_or_condition_phi(a) && !is_body_or_condition_phi(b) && - call_graph_->InstructionIsNestedIn(b.defining_instruction(), - a.defining_instruction()->parent())) { - return true; - } - if (is_body_or_condition_phi(b) && - call_graph_->InstructionIsNestedIn(a.defining_instruction(), - b.defining_instruction()->parent())) { - return false; - } - - // If 'b' is a while phi and 'a' is in the body or condition, then 'a' - // executes before 'b'. - if (b.is_phi() && b.defining_instruction()->opcode() == HloOpcode::kWhile && - (call_graph_->InstructionIsNestedIn( - a.defining_instruction(), b.defining_instruction()->while_body()) || - call_graph_->InstructionIsNestedIn( - a.defining_instruction(), - b.defining_instruction()->while_condition()))) { - return true; - } - - return ordering.ExecutesBefore(a.defining_instruction(), - b.defining_instruction()); -} - -bool HloDataflowAnalysis::UseIsBeforeValueDefinition( - const HloUse& use, const HloValue& value, - const HloOrdering& ordering) const { - if (ordering.ExecutesBefore(use.instruction, value.defining_instruction())) { - return true; - } - - // If the use is at the instruction where the value is defined, then the use - // is before the def if the instruction allows buffer sharing (in place - // computation). 
- if (use.instruction == value.defining_instruction() && - CanShareOperandBufferWithUser( - use.instruction->mutable_operand(use.operand_number), - use.operand_index, value.defining_instruction(), - value.defining_index())) { - return true; - } - - // The use at a while is an input to a phi, and logically occurs before values - // are defined in the body or condition computations. - if (use.instruction->opcode() == HloOpcode::kWhile) { - const HloInstruction* xla_while = use.instruction; - if (call_graph_->InstructionIsNestedIn(value.defining_instruction(), - xla_while->while_body()) || - call_graph_->InstructionIsNestedIn(value.defining_instruction(), - xla_while->while_condition())) { - return true; - } - } - - // Similarly if the value is defined at a while, it logically occurs after any - // uses in the body or condition computations. - if (value.defining_instruction()->opcode() == HloOpcode::kWhile) { - CHECK(ssa_form_); - const HloInstruction* xla_while = value.defining_instruction(); - if (call_graph_->InstructionIsNestedIn(use.instruction, - xla_while->while_body()) || - call_graph_->InstructionIsNestedIn(use.instruction, - xla_while->while_condition())) { - return true; - } - } - return false; -} - -bool HloDataflowAnalysis::LiveRangeStrictlyBefore( - const HloValue& a, const HloValue& b, const HloOrdering& ordering) const { - VLOG(4) << "LiveRangeStrictlyBefore(a = " << a.ToShortString() - << ", b = " << b.ToShortString() << ")"; - if (!IsDefinedBefore(a, b, ordering)) { - VLOG(4) << "a not defined before b"; - return false; - } - - // Live-out values from the module can never have ranges strictly before any - // other value. - if (a.live_out_of_module()) { - VLOG(4) << "a is live out of module"; - return false; - } - - // Live-out values of computations can never have ranges strictly before any - // other value in the computation (including values nested in - // subcomputations). - if (a.live_out_of_computation() && - call_graph_->InstructionIsNestedIn(b.defining_instruction(), - a.defining_instruction()->parent())) { - VLOG(4) << "a is live out of computation containing b"; - return false; - } - - // All uses of 'a' must be before 'b' is defined. - for (const HloUse& use : a.uses()) { - if (!UseIsBeforeValueDefinition(use, b, ordering)) { - VLOG(4) << "use of a (" << use << ") not before b is defined"; - return false; - } - } - - return true; -} - -bool HloDataflowAnalysis::MayInterfere(const HloValue& a, const HloValue& b, - const HloOrdering& ordering) const { - // Buffers without disjoint liveness may interfere. - return !LiveRangeStrictlyBefore(a, b, ordering) && - !LiveRangeStrictlyBefore(b, a, ordering); -} - /* static */ StatusOr> HloDataflowAnalysis::Run( HloModule* module, bool ssa_form, bool bitcast_defines_value) { @@ -855,6 +619,33 @@ StatusOr> HloDataflowAnalysis::Run( } dataflow_analysis->UpdateInstructionsAndPropagate(all_instructions); + // Add in positions to all values. + for (const std::unique_ptr& computation : + module->computations()) { + for (const std::unique_ptr& instruction : + computation->instructions()) { + for (const auto& pair : + dataflow_analysis->GetInstructionValueSet(instruction.get())) { + const ShapeIndex& index = pair.first; + const HloValueSet& value_set = pair.second; + for (const HloValue* value : value_set.values()) { + if (value->defining_instruction() != instruction.get()) { + dataflow_analysis->GetValue(value->id()) + .AddPosition(instruction.get(), index); + } + } + } + } + } + + // Construct vector of values. 
+ dataflow_analysis->values_vector_.reserve(dataflow_analysis->values_.size()); + for (auto& pair : dataflow_analysis->values_) { + dataflow_analysis->values_vector_.push_back(&pair.second); + } + std::sort(dataflow_analysis->values_vector_.begin(), + dataflow_analysis->values_vector_.end(), HloValue::IdLessThan); + TF_DCHECK_OK(dataflow_analysis->Verify()); XLA_VLOG_LINES(1, dataflow_analysis->ToString()); @@ -865,14 +656,14 @@ StatusOr> HloDataflowAnalysis::Run( Status HloDataflowAnalysis::Verify() const { // Verify each HloValue appears in the value sets that the value's positions() // indicate. - for (const HloValue& value : values()) { - for (const HloPosition& position : value.positions()) { + for (const HloValue* value : values()) { + for (const HloPosition& position : value->positions()) { const HloValueSet& value_set = GetValueSet(position); TF_RET_CHECK(std::find(value_set.values().begin(), value_set.values().end(), - &value) != value_set.values().end()) + value) != value_set.values().end()) << "Value set at position " << position << " does not contain value " - << value.ToShortString(); + << value->ToShortString(); } } @@ -898,75 +689,4 @@ Status HloDataflowAnalysis::Verify() const { return Status::OK(); } -Status HloDataflowAnalysis::VerifyAgainstReference() const { - TF_RETURN_IF_ERROR(Verify()); - - TF_ASSIGN_OR_RETURN(std::unique_ptr reference, - Run(module_, ssa_form_, bitcast_defines_value_)); - TF_RETURN_IF_ERROR(reference->Verify()); - - VLOG(2) << "This analysis:"; - XLA_VLOG_LINES(2, ToString()); - VLOG(2) << "Reference:"; - XLA_VLOG_LINES(2, reference->ToString()); - - // Verify value sets in each position are identical. - for (const auto& computation : module_->computations()) { - for (const auto& instruction : computation->instructions()) { - for (const auto& pair : GetInstructionValueSet(instruction.get())) { - const ShapeIndex& index = pair.first; - const HloValueSet& value_set = pair.second; - const HloValueSet& reference_value_set = - reference->GetValueSet(instruction.get(), index); - - auto value_in_set = [](const HloValue& v, const HloValueSet& vset) { - return std::find_if(vset.values().begin(), vset.values().end(), - [&v](const HloValue* w) { return *w == v; }) != - vset.values().end(); - }; - - for (const HloValue* value : value_set.values()) { - TF_RET_CHECK(value_in_set(*value, reference_value_set)) - << "Value " << value->ToShortString() - << " does not exist in reference"; - } - for (const HloValue* reference_value : reference_value_set.values()) { - TF_RET_CHECK(value_in_set(*reference_value, value_set)) - << "Value " << reference_value->ToShortString() - << " only exists in reference"; - } - } - } - } - - // Verify all phis resolve identically and uses are identical. 
- for (const HloValue& value : values()) { - const HloValue& reference_value = reference->GetValueDefinedAt( - value.defining_instruction(), value.defining_index()); - TF_RET_CHECK(value.is_phi() == reference_value.is_phi()); - if (value.is_phi()) { - const HloValue* resolved_value = ResolvePhi(value); - const HloValue* reference_resolved_value = - reference->ResolvePhi(reference_value); - if (resolved_value == nullptr) { - TF_RET_CHECK(reference_resolved_value == nullptr); - } else { - TF_RET_CHECK(reference_resolved_value != nullptr); - TF_RET_CHECK(*reference_resolved_value == *resolved_value); - } - } - - for (const HloUse& use : value.uses()) { - TF_RET_CHECK(std::find(reference_value.uses().begin(), - reference_value.uses().end(), - use) != reference_value.uses().end()); - } - for (const HloUse& reference_use : reference_value.uses()) { - TF_RET_CHECK(std::find(value.uses().begin(), value.uses().end(), - reference_use) != value.uses().end()); - } - } - return Status::OK(); -} - } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h index 7781cc58a3a..aae257dd09e 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h @@ -88,10 +88,10 @@ class HloDataflowAnalysis { // given position. const HloValueSet& GetValueSet(const HloInstruction* instruction, const ShapeIndex& index = {}) const; - HloValueSet& GetValueSet(const HloInstruction* instruction, - const ShapeIndex& index = {}); const HloValueSet& GetValueSet(const HloPosition& position) const; HloValueSet& GetValueSet(const HloPosition& position); + HloValueSet& GetValueSet(const HloInstruction* instruction, + const ShapeIndex& index = {}); // Return the unique value in the HloValueSet at the given instruction and // shape index. CHECKs if the value set does not contain a exactly one value. @@ -108,49 +108,11 @@ class HloDataflowAnalysis { const HloValue& GetValue(HloValue::Id value_id) const; HloValue& GetValue(HloValue::Id value_id); - // Returns whether the given values interfere assuming the given HLO - // ordering. Two values interfere if they may both be simultaneously live. - bool MayInterfere(const HloValue& a, const HloValue& b, - const HloOrdering& ordering) const; - - // Overload which takes HloValue:Ids. - bool MayInterfere(HloValue::Id a, HloValue::Id b, - const HloOrdering& ordering) const { - return MayInterfere(GetValue(a), GetValue(b), ordering); - } - // Return the total number of HloValues. int64 value_count() const { return values_.size(); } - // Return a vector of all HloValues. - const std::vector& values() const { return values_; } - - // Updates the dataflow after the changing an operand of - // 'instruction'. Dataflow update is not possible if instructions have been - // added or removed from the graph. - void UpdateAfterChangingOperand(HloInstruction* instruction, - HloInstruction* old_operand, - HloInstruction* new_operand); - - // Updates the dataflow after the changing the root of a computation from - // 'old_root' to 'new_root'. - void UpdateAfterChangingRoot(HloInstruction* old_root, - HloInstruction* new_root); - - // Returns the non-phi HloValue that is the unique (transitive) input to the - // given phi. If no such HloValue exists (there are multiple inputs to the - // phi) then nullptr is returned. This is computed by all walking the inputs - // of the given phi value until non-phi HloValue(s) are encountered. 
- const HloValue* ResolvePhi(const HloValue& phi) const; - const HloValue* ResolvePhi(const HloInstruction* instruction, - const ShapeIndex& index = {}) const { - return ResolvePhi(GetValueDefinedAt(instruction, index)); - } - - // Compare the dataflow analysis against a clean recomputation of the - // analysis. Returns an error status if there is a mismatch. Useful for - // verifying the correctness after updates to the analysis. - Status VerifyAgainstReference() const; + // Return a vector of all HloValues stabily sorted by HloValue::Id. + const std::vector& values() const { return values_vector_; } // Return the call graph used for computing the dataflow. const CallGraph& call_graph() const { return *call_graph_; } @@ -161,6 +123,13 @@ class HloDataflowAnalysis { HloDataflowAnalysis(HloModule* module, bool ssa_form, bool bitcast_defines_value = false); + // Returns a new HloValue defined at the given instruction and shape index. + HloValue* NewHloValue(HloInstruction* instruction, const ShapeIndex& index, + bool is_phi = false); + + // Delete the HloValue with the given ID. + void DeleteHloValue(HloValue::Id value_id); + // Constructs and initializes the InstructionValueSets of all instructions to // contain exactly the HloValues defined by each instruction. These values can // then propagated throughout the HLO graph by calling @@ -187,10 +156,11 @@ class HloDataflowAnalysis { void UpdateInstructionsAndPropagate( tensorflow::gtl::ArraySlice instructions); - // Sets the inputs of the given phi to given value(s). - void UpdatePhiInputs( - const HloInstruction* instruction, - tensorflow::gtl::ArraySlice inputs); + // Return the result of the SSA Phi function applied to the given inputs at + // the given instruction. If skip_top_level is true, then the top level of the + // value set of 'instruction' is not modified. + bool Phi(HloInstruction* instruction, + tensorflow::gtl::ArraySlice inputs); // Updates the positions of the HloValues in the output of the given // instruction. This should be called after the instruction value set of @@ -203,20 +173,6 @@ class HloDataflowAnalysis { HloInstruction* instruction, const InstructionValueSet& new_value_set, const InstructionValueSet* prev_value_set = nullptr); - // Returns true if the live range of the given value 'a' is strictly before - // the live range of value 'b' using the given HLO ordering. - bool LiveRangeStrictlyBefore(const HloValue& a, const HloValue& b, - const HloOrdering& ordering) const; - - // Returns whether the value 'a' is defined before the value 'b' under the - // given ordering. - bool IsDefinedBefore(const HloValue& a, const HloValue& b, - const HloOrdering& ordering) const; - - // Returns whether the given use is before the given value definition. - bool UseIsBeforeValueDefinition(const HloUse& use, const HloValue& value, - const HloOrdering& ordering) const; - // Verify various invariants of the dataflow analysis. Status Verify() const; @@ -226,19 +182,19 @@ class HloDataflowAnalysis { std::unique_ptr call_graph_; - // Array of all values in the module. This is allocated once at analysis - // construction time so HloValue references are stable. Updates to the - // analysis via UpdateAfterChangingOperand and UpdateAfterChangingRoot do not - // result in the creation or destruction of any HloValues. - std::vector values_; - - // Map hold the inputs to each phi value in the module. Used by ResolvePhi. - tensorflow::gtl::FlatMap> - phi_inputs_; + // The map of all HloValues in the module. 
We pass around pointers to the + // mapped HloValues, so the underlying container must keep them valid despite + // mutations touching other map entries. + std::unordered_map values_; // A map from instruction to InstructionValueSet. std::unordered_map value_sets_; + + // A vector containing all HloValues sorted by HloValue::Id. + std::vector values_vector_; + + // The Id to use for the next HloValue. + HloValue::Id next_value_id_ = 0; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc index 9f3dd539efe..ef0fa1d745a 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc @@ -26,7 +26,6 @@ limitations under the License. #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test.h" @@ -44,8 +43,8 @@ class HloDataflowAnalysisTest : public HloTestBase, // Run dataflow analysis on the member module. For convenience returns a // reference to the generated analysis stored in analysis_. - HloDataflowAnalysis& RunAnalysis(bool ssa_form, - bool bitcast_defines_value = false) { + const HloDataflowAnalysis& RunAnalysis(bool ssa_form, + bool bitcast_defines_value = false) { analysis_ = HloDataflowAnalysis::Run(module_.get(), ssa_form, bitcast_defines_value) .ConsumeValueOrDie(); @@ -71,8 +70,8 @@ class HloDataflowAnalysisTest : public HloTestBase, const HloInstruction* b) { EXPECT_FALSE(ShapeUtil::IsTuple(a->shape())); EXPECT_FALSE(ShapeUtil::IsTuple(b->shape())); - return analysis_->MayInterfere(analysis_->GetValueDefinedAt(a), - analysis_->GetValueDefinedAt(b), ordering); + return ordering.MayInterfere(analysis_->GetValueDefinedAt(a), + analysis_->GetValueDefinedAt(b)); } std::unique_ptr module_; @@ -499,37 +498,26 @@ TEST_P(HloDataflowAnalysisTest, SingleWhile) { EXPECT_FALSE(analysis.GetValueDefinedAt(cond_constant).live_out_of_module()); if (ssa_form) { - // While instruction should define phi values. The value at index {0} is a - // degenerate phi with a single input 'constant1'. - EXPECT_TRUE(analysis.ValueIsDefinedAt(xla_while, /*index=*/{0})); - EXPECT_TRUE(analysis.GetValueDefinedAt(xla_while, /*index=*/{0}).is_phi()); - EXPECT_EQ(analysis.ResolvePhi(xla_while, /*index=*/{0}), - &analysis.GetValueDefinedAt(constant1)); - EXPECT_TRUE(analysis.ValueIsDefinedAt(body_param, /*index=*/{0})); - EXPECT_TRUE(analysis.GetValueDefinedAt(body_param, /*index=*/{0}).is_phi()); - EXPECT_EQ(analysis.ResolvePhi(body_param, /*index=*/{0}), - &analysis.GetValueDefinedAt(constant1)); - EXPECT_TRUE(analysis.ValueIsDefinedAt(cond_param, /*index=*/{0})); - EXPECT_TRUE(analysis.GetValueDefinedAt(cond_param, /*index=*/{0}).is_phi()); - EXPECT_EQ(analysis.ResolvePhi(cond_param, /*index=*/{0}), - &analysis.GetValueDefinedAt(constant1)); + // Element 0 of the tuple passed through the body so no phi value is + // defined. + EXPECT_FALSE(analysis.ValueIsDefinedAt(xla_while, /*index=*/{0})); + EXPECT_FALSE(analysis.ValueIsDefinedAt(body_param, /*index=*/{0})); + EXPECT_FALSE(analysis.ValueIsDefinedAt(cond_param, /*index=*/{0})); + // Element 1 of the tuple should be a phi value. 
EXPECT_TRUE(analysis.ValueIsDefinedAt(xla_while, /*index=*/{1})); EXPECT_TRUE(analysis.GetValueDefinedAt(xla_while, /*index=*/{1}).is_phi()); - EXPECT_EQ(analysis.ResolvePhi(xla_while, /*index=*/{1}), nullptr); EXPECT_TRUE(analysis.ValueIsDefinedAt(body_param, /*index=*/{1})); EXPECT_TRUE(analysis.GetValueDefinedAt(body_param, /*index=*/{1}).is_phi()); - EXPECT_EQ(analysis.ResolvePhi(body_param, /*index=*/{1}), nullptr); EXPECT_TRUE(analysis.ValueIsDefinedAt(cond_param, /*index=*/{1})); EXPECT_TRUE(analysis.GetValueDefinedAt(cond_param, /*index=*/{1}).is_phi()); - EXPECT_EQ(analysis.ResolvePhi(cond_param, /*index=*/{1}), nullptr); - EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(), - UnorderedElementsAre(HloUse{xla_while, 0, {0}})); + EXPECT_THAT( + analysis.GetValueDefinedAt(constant1).uses(), + UnorderedElementsAre(HloUse{add, 0, {}}, HloUse{xla_while, 0, {0}})); - EXPECT_FALSE(analysis.GetValueDefinedAt(constant1).live_out_of_module()); - EXPECT_TRUE(analysis.GetValueDefinedAt(xla_while, /*index=*/{0}) - .live_out_of_module()); + // Constant1 passes through the body and out of the module. + EXPECT_TRUE(analysis.GetValueDefinedAt(constant1).live_out_of_module()); EXPECT_TRUE(analysis.GetValueDefinedAt(xla_while, /*index=*/{1}) .live_out_of_module()); @@ -613,20 +601,15 @@ TEST_P(HloDataflowAnalysisTest, SequentialWhiles) { bool ssa_form = GetParam(); const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form); - if (ssa_form) { - EXPECT_TRUE(analysis.GetValueDefinedAt(xla_while2).live_out_of_module()); - EXPECT_FALSE(analysis.GetValueDefinedAt(constant1).live_out_of_module()); - } else { - // Element 0 is passed through all the while instructions and out of the - // module. - EXPECT_EQ(analysis.GetUniqueValueAt(xla_while0, /*index=*/{0}), - analysis.GetValueDefinedAt(constant1)); - EXPECT_EQ(analysis.GetUniqueValueAt(xla_while1, /*index=*/{0}), - analysis.GetValueDefinedAt(constant1)); - EXPECT_EQ(analysis.GetUniqueValueAt(xla_while2, /*index=*/{0}), - analysis.GetValueDefinedAt(constant1)); - EXPECT_TRUE(analysis.GetValueDefinedAt(constant1).live_out_of_module()); - } + // Element 0 is passed through all the while instructions and out of the + // module.. + EXPECT_EQ(analysis.GetUniqueValueAt(xla_while0, /*index=*/{0}), + analysis.GetValueDefinedAt(constant1)); + EXPECT_EQ(analysis.GetUniqueValueAt(xla_while1, /*index=*/{0}), + analysis.GetValueDefinedAt(constant1)); + EXPECT_EQ(analysis.GetUniqueValueAt(xla_while2, /*index=*/{0}), + analysis.GetValueDefinedAt(constant1)); + EXPECT_TRUE(analysis.GetValueDefinedAt(constant1).live_out_of_module()); } TEST_P(HloDataflowAnalysisTest, NestedWhiles) { @@ -705,13 +688,18 @@ TEST_P(HloDataflowAnalysisTest, NestedWhiles) { bool ssa_form = GetParam(); const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form); + EXPECT_THAT(HloValuesAt(inner_param, /*index=*/{0}), + UnorderedElementsAre(analysis.GetValueDefinedAt(negate))); if (ssa_form) { EXPECT_TRUE(analysis.ValueIsDefinedAt(inner_param, /*index=*/{1})); EXPECT_TRUE( analysis.GetValueDefinedAt(inner_param, /*index=*/{1}).is_phi()); - EXPECT_TRUE(analysis.ValueIsDefinedAt(nested_while, /*index=*/{0})); - EXPECT_TRUE( - analysis.GetValueDefinedAt(inner_param, /*index=*/{1}).is_phi()); + + // Element 0 of the nested while is %negate. + EXPECT_FALSE(analysis.ValueIsDefinedAt(nested_while, /*index=*/{0})); + EXPECT_THAT(HloValuesAt(inner_param, /*index=*/{0}), + UnorderedElementsAre(analysis.GetValueDefinedAt(negate))); + // Element 1 is a phi value (join of %add and %constant2). 
EXPECT_TRUE(analysis.ValueIsDefinedAt(nested_while, /*index=*/{1})); EXPECT_TRUE( analysis.GetValueDefinedAt(nested_while, /*index=*/{1}).is_phi()); @@ -724,8 +712,6 @@ TEST_P(HloDataflowAnalysisTest, NestedWhiles) { EXPECT_TRUE( analysis.GetValueDefinedAt(entry_while, /*index=*/{1}).is_phi()); } else { - EXPECT_THAT(HloValuesAt(inner_param, /*index=*/{0}), - UnorderedElementsAre(analysis.GetValueDefinedAt(negate))); EXPECT_THAT(HloValuesAt(inner_param, /*index=*/{1}), UnorderedElementsAre(analysis.GetValueDefinedAt(add), analysis.GetValueDefinedAt(constant2))); @@ -1496,256 +1482,6 @@ TEST_P(HloDataflowAnalysisTest, EmbeddedComputationInterference) { EXPECT_TRUE(InstructionsMayInterfere(ordering, negate, embedded_log)); } -TEST_P(HloDataflowAnalysisTest, UpdateAnalysisForWhile) { - // Test updating dataflow after modifying a module with an array shaped while: - // - // body(F32[] %param): - // %negate = Negate(%param) - // - // condition(F32[] %param): - // return Constant(false) - // - // entry: - // %constant = Constant(1.0) - // %exp = Exp(%constant) - // return While(%exp, body, condition) - // - auto body_builder = HloComputation::Builder("body"); - auto body_param = body_builder.AddInstruction( - HloInstruction::CreateParameter(0, scalar_shape_, "param")); - auto negate = body_builder.AddInstruction(HloInstruction::CreateUnary( - scalar_shape_, HloOpcode::kNegate, body_param)); - HloComputation* body = module_->AddEmbeddedComputation(body_builder.Build()); - - // Condition computation trivially returns a constant "false". - auto cond_builder = HloComputation::Builder("condition"); - auto cond_param = cond_builder.AddInstruction( - HloInstruction::CreateParameter(0, scalar_shape_, "param")); - cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); - HloComputation* condition = - module_->AddEmbeddedComputation(cond_builder.Build()); - - auto builder = HloComputation::Builder(TestName()); - auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); - auto exp = builder.AddInstruction( - HloInstruction::CreateUnary(scalar_shape_, HloOpcode::kExp, constant)); - auto xla_while = builder.AddInstruction( - HloInstruction::CreateWhile(scalar_shape_, condition, body, exp)); - module_->AddEntryComputation(builder.Build()); - - bool ssa_form = GetParam(); - HloDataflowAnalysis& analysis = RunAnalysis(ssa_form); - - // Sanity check the initial dataflow analysis before transforming the HLO - // graph. 
- if (ssa_form) { - EXPECT_TRUE(analysis.ValueIsDefinedAt(body_param)); - EXPECT_TRUE(analysis.GetValueDefinedAt(body_param).is_phi()); - EXPECT_EQ(analysis.ResolvePhi(body_param), nullptr); - - EXPECT_TRUE(analysis.ValueIsDefinedAt(cond_param)); - EXPECT_TRUE(analysis.GetValueDefinedAt(cond_param).is_phi()); - EXPECT_EQ(analysis.ResolvePhi(cond_param), nullptr); - - EXPECT_FALSE(analysis.GetValueDefinedAt(exp).live_out_of_module()); - EXPECT_FALSE(analysis.GetValueDefinedAt(negate).live_out_of_module()); - } else { - EXPECT_THAT(HloValuesAt(body_param), - UnorderedElementsAre(analysis.GetValueDefinedAt(exp), - analysis.GetValueDefinedAt(negate))); - EXPECT_THAT(HloValuesAt(cond_param), - UnorderedElementsAre(analysis.GetValueDefinedAt(exp), - analysis.GetValueDefinedAt(negate))); - EXPECT_THAT(HloValuesAt(xla_while), - UnorderedElementsAre(analysis.GetValueDefinedAt(exp), - analysis.GetValueDefinedAt(negate))); - - EXPECT_TRUE(analysis.GetValueDefinedAt(negate).live_out_of_module()); - EXPECT_TRUE(analysis.GetValueDefinedAt(exp).live_out_of_module()); - } - - // Set the body root to the body_param. Previously it was Negate(body_param). - body->set_root_instruction(body_param); - - // Prior to updating, verify that the dataflow analysis is no longer valid. - Status verify_status = analysis.VerifyAgainstReference(); - EXPECT_FALSE(verify_status.ok()); - - analysis.UpdateAfterChangingRoot(/*old_root=*/negate, - /*new_root=*/body_param); - - // Analysis should be valid after the update. - TF_EXPECT_OK(analysis.VerifyAgainstReference()); - - if (ssa_form) { - // The phis should now be resolvable as 'exp' is passed through the body - // transparently. - EXPECT_EQ(analysis.ResolvePhi(body_param), - &analysis.GetValueDefinedAt(exp)); - EXPECT_EQ(analysis.ResolvePhi(cond_param), - &analysis.GetValueDefinedAt(exp)); - EXPECT_EQ(analysis.ResolvePhi(xla_while), &analysis.GetValueDefinedAt(exp)); - EXPECT_FALSE(analysis.GetValueDefinedAt(exp).live_out_of_module()); - } else { - EXPECT_THAT(HloValuesAt(body_param), - UnorderedElementsAre(analysis.GetValueDefinedAt(exp))); - EXPECT_THAT(HloValuesAt(cond_param), - UnorderedElementsAre(analysis.GetValueDefinedAt(exp))); - EXPECT_THAT(HloValuesAt(xla_while), - UnorderedElementsAre(analysis.GetValueDefinedAt(exp))); - EXPECT_TRUE(analysis.GetValueDefinedAt(exp).live_out_of_module()); - } - EXPECT_FALSE(analysis.GetValueDefinedAt(negate).live_out_of_module()); - - // Now replace the operand of the while with %constant (was %exp). - TF_ASSERT_OK(exp->ReplaceUseWith(xla_while, constant)); - analysis.UpdateAfterChangingOperand(xla_while, /*old_operand=*/exp, - /*new_operand=*/constant); - - // Verify that the dataflow is correct. - TF_ASSERT_OK(analysis.VerifyAgainstReference()); - - if (ssa_form) { - // The phis now resolve to 'constant'. - EXPECT_EQ(analysis.ResolvePhi(body_param), - &analysis.GetValueDefinedAt(constant)); - EXPECT_EQ(analysis.ResolvePhi(cond_param), - &analysis.GetValueDefinedAt(constant)); - EXPECT_EQ(analysis.ResolvePhi(xla_while), - &analysis.GetValueDefinedAt(constant)); - } else { - EXPECT_THAT(HloValuesAt(body_param), - UnorderedElementsAre(analysis.GetValueDefinedAt(constant))); - EXPECT_THAT(HloValuesAt(cond_param), - UnorderedElementsAre(analysis.GetValueDefinedAt(constant))); - EXPECT_THAT(HloValuesAt(xla_while), - UnorderedElementsAre(analysis.GetValueDefinedAt(constant))); - EXPECT_TRUE(analysis.GetValueDefinedAt(constant).live_out_of_module()); - } - - // And finally make the negate the root of the body again. 
- body->set_root_instruction(negate); - analysis.UpdateAfterChangingRoot(/*old_root=*/body_param, - /*new_root=*/negate); - - // Verify that the dataflow is correct. - TF_ASSERT_OK(analysis.VerifyAgainstReference()); - - if (ssa_form) { - // Phis should no longer be resolvable. - EXPECT_EQ(analysis.ResolvePhi(body_param), nullptr); - EXPECT_EQ(analysis.ResolvePhi(cond_param), nullptr); - EXPECT_EQ(analysis.ResolvePhi(xla_while), nullptr); - } else { - EXPECT_THAT(HloValuesAt(body_param), - UnorderedElementsAre(analysis.GetValueDefinedAt(constant), - analysis.GetValueDefinedAt(negate))); - EXPECT_THAT(HloValuesAt(cond_param), - UnorderedElementsAre(analysis.GetValueDefinedAt(constant), - analysis.GetValueDefinedAt(negate))); - EXPECT_THAT(HloValuesAt(xla_while), - UnorderedElementsAre(analysis.GetValueDefinedAt(constant), - analysis.GetValueDefinedAt(negate))); - - EXPECT_FALSE(analysis.GetValueDefinedAt(exp).live_out_of_module()); - EXPECT_TRUE(analysis.GetValueDefinedAt(negate).live_out_of_module()); - EXPECT_TRUE(analysis.GetValueDefinedAt(constant).live_out_of_module()); - } - - // After the updates, verify that the dataflow is correct. - TF_ASSERT_OK(analysis.VerifyAgainstReference()); -} - -TEST_P(HloDataflowAnalysisTest, UpdateOfATupleSelect) { - // Test changing the operands of kSelects of a tuple value and updating the - // dataflow. - auto builder = HloComputation::Builder(TestName()); - auto pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); - auto a = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); - auto b = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); - auto c = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(3.0))); - auto d = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(4.0))); - auto tuple_a = builder.AddInstruction(HloInstruction::CreateTuple({a})); - auto tuple_b = builder.AddInstruction(HloInstruction::CreateTuple({b})); - auto tuple_c = builder.AddInstruction(HloInstruction::CreateTuple({c})); - auto tuple_d = builder.AddInstruction(HloInstruction::CreateTuple({d})); - const Shape tuple_shape = tuple_a->shape(); - auto select_aa = builder.AddInstruction(HloInstruction::CreateTernary( - tuple_shape, HloOpcode::kSelect, pred, tuple_a, tuple_a)); - auto select_ab = builder.AddInstruction(HloInstruction::CreateTernary( - tuple_shape, HloOpcode::kSelect, pred, tuple_a, tuple_b)); - auto select_cd = builder.AddInstruction(HloInstruction::CreateTernary( - tuple_shape, HloOpcode::kSelect, pred, tuple_c, tuple_d)); - auto select_abcd = builder.AddInstruction(HloInstruction::CreateTernary( - tuple_shape, HloOpcode::kSelect, pred, select_ab, select_cd)); - - module_->AddEntryComputation(builder.Build()); - - bool ssa_form = GetParam(); - HloDataflowAnalysis& analysis = RunAnalysis(ssa_form); - - // Sanity check dataflow before changing the graph and updating. 
- EXPECT_THAT(HloValuesAt(select_aa, /*index=*/{0}), - UnorderedElementsAre(analysis.GetValueDefinedAt(a))); - EXPECT_THAT(HloValuesAt(select_ab, /*index=*/{0}), - UnorderedElementsAre(analysis.GetValueDefinedAt(a), - analysis.GetValueDefinedAt(b))); - EXPECT_THAT(HloValuesAt(select_cd, /*index=*/{0}), - UnorderedElementsAre(analysis.GetValueDefinedAt(c), - analysis.GetValueDefinedAt(d))); - EXPECT_THAT(HloValuesAt(select_abcd, /*index=*/{0}), - UnorderedElementsAre(analysis.GetValueDefinedAt(a), - analysis.GetValueDefinedAt(b), - analysis.GetValueDefinedAt(c), - analysis.GetValueDefinedAt(d))); - EXPECT_TRUE(analysis.GetValueDefinedAt(a).live_out_of_module()); - EXPECT_TRUE(analysis.GetValueDefinedAt(b).live_out_of_module()); - EXPECT_TRUE(analysis.GetValueDefinedAt(c).live_out_of_module()); - EXPECT_TRUE(analysis.GetValueDefinedAt(d).live_out_of_module()); - - // Set the rhs of 'select_aa' to be 'd'. - TF_ASSERT_OK(select_aa->ReplaceOperandWith(2, tuple_d)); - analysis.UpdateAfterChangingOperand(select_aa, /*old_operand=*/tuple_a, - /*new_operand=*/tuple_d); - - // Verify that the dataflow is correct. - TF_ASSERT_OK(analysis.VerifyAgainstReference()); - - EXPECT_THAT(HloValuesAt(select_aa, /*index=*/{0}), - UnorderedElementsAre(analysis.GetValueDefinedAt(a), - analysis.GetValueDefinedAt(d))); - - // Set the lhs of 'select_cd' to be 'a'. - TF_ASSERT_OK(select_cd->ReplaceOperandWith(1, tuple_a)); - analysis.UpdateAfterChangingOperand(select_cd, /*old_operand=*/tuple_c, - /*new_operand=*/tuple_a); - - // Verify that the dataflow is correct. - TF_ASSERT_OK(analysis.VerifyAgainstReference()); - - EXPECT_THAT(HloValuesAt(select_cd, /*index=*/{0}), - UnorderedElementsAre(analysis.GetValueDefinedAt(a), - analysis.GetValueDefinedAt(d))); - EXPECT_THAT(HloValuesAt(select_abcd, /*index=*/{0}), - UnorderedElementsAre(analysis.GetValueDefinedAt(a), - analysis.GetValueDefinedAt(b), - analysis.GetValueDefinedAt(d))); - EXPECT_TRUE(analysis.GetValueDefinedAt(a).live_out_of_module()); - EXPECT_TRUE(analysis.GetValueDefinedAt(b).live_out_of_module()); - EXPECT_FALSE(analysis.GetValueDefinedAt(c).live_out_of_module()); - EXPECT_TRUE(analysis.GetValueDefinedAt(d).live_out_of_module()); - - // After the updates, verify that the dataflow is correct. - TF_ASSERT_OK(analysis.VerifyAgainstReference()); -} - INSTANTIATE_TEST_CASE_P(HloDataflowAnalysisInstantiation, HloDataflowAnalysisTest, ::testing::Values(false, true)); diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index dfb111d1d0b..07b3369d5c1 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -561,13 +561,21 @@ tooltip = " "; } string comp_body = DumpComputation(subcomp); - string computation = - Printf(computation_fmt, id, style, subcomp_label, comp_body, id); - // Add an edge from the subcomputation to its parent node. If subcomp - // belongs to a fusion node, it's drawn in place of the fusion instruction, so - // there's no need to link those. - if (parent_instr->opcode() != HloOpcode::kFusion) { + if (parent_instr->opcode() == HloOpcode::kFusion) { + // Dump any nested fusion nodes. + for (const auto& subcomp_instr : subcomp->instructions()) { + if (subcomp_instr->opcode() == HloOpcode::kFusion) { + StrAppend( + &comp_body, + DumpSubcomputation(subcomp_instr->fused_instructions_computation(), + subcomp_instr.get())); + } + } + } else { + // Add an edge from the subcomputation to its parent node. 
If subcomp + // belongs to a fusion node, it's drawn in place of the fusion instruction, + // so there's no need to link those. edge_ids_.insert( {{subcomp->root_instruction(), parent_instr}, next_edge_id_++}); const char* edge_fmt = @@ -578,6 +586,9 @@ tooltip = " "; subcomp->name(), parent_instr->name())); } + string computation = + Printf(computation_fmt, id, style, subcomp_label, comp_body, id); + return computation; } diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 3bdb67ba923..24ef4e09e7c 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -793,13 +793,6 @@ HloInstruction* HloInstruction::CloneAndFuseInternal( } } - for (HloComputation* computation : - instruction_to_fuse->called_computations()) { - if (std::find(called_computations_.begin(), called_computations_.end(), - computation) == called_computations_.end()) { - called_computations_.push_back(computation); - } - } VLOG(2) << "New clone:\n" << clone->ToString(); return clone; } @@ -1669,6 +1662,21 @@ string HloInstruction::ExtendedOpcodeStr() const { string HloInstruction::ToString(bool compact_operands, bool include_metadata) const { + string result = + StrCat(name(), " = ", ShapeUtil::HumanStringWithLayout(shape()), " ", + ExtendedOpcodeStr(), "(", OperandsToString(compact_operands), ")"); + for (const string& extra : ExtraAttributesToString()) { + StrAppend(&result, ", ", extra); + } + if (include_metadata && + (!metadata_.op_type().empty() || !metadata_.op_name().empty() || + !metadata_.source_file().empty())) { + StrAppend(&result, " # metadata=", metadata_.ShortDebugString()); + } + return result; +} + +string HloInstruction::OperandsToString(bool compact) const { string operands; if (opcode() == HloOpcode::kConstant) { // For constants, show the actual value in place of an empty operand list. 
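As a rough usage sketch, not part of this patch: with OperandsToString() and ExtraAttributesToString() factored out of ToString(), a caller can render the pieces independently. `LabelForGraphNode` below is a hypothetical helper name, shown only to illustrate the new accessors.

#include <string>
#include <vector>

#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/lib/strings/strcat.h"

namespace xla {

string LabelForGraphNode(const HloInstruction& instr) {
  // A compact operand list keeps the label short; each extra attribute
  // (dimensions, window, padding, calls, ...) goes on its own line.
  string label = tensorflow::strings::StrCat(
      instr.name(), "(", instr.OperandsToString(/*compact=*/true), ")");
  for (const string& attr : instr.ExtraAttributesToString()) {
    tensorflow::strings::StrAppend(&label, "\n", attr);
  }
  return label;
}

}  // namespace xla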
@@ -1697,12 +1705,12 @@ string HloInstruction::ToString(bool compact_operands, } else { tensorflow::gtl::ArraySlice slice(operands_); const int64 kMaxOperandsToShowIfCompact = 4; - if (compact_operands && slice.size() > kMaxOperandsToShowIfCompact) { + if (compact && slice.size() > kMaxOperandsToShowIfCompact) { slice.remove_suffix(slice.size() - kMaxOperandsToShowIfCompact); } operands = Join(slice, ", ", [&](string* out, HloInstruction* operand) { *out += ShapeUtil::HumanStringWithLayout(operand->shape()); - if (!compact_operands) { + if (!compact) { StrAppend(out, " ", operand->name()); } }); @@ -1711,15 +1719,19 @@ string HloInstruction::ToString(bool compact_operands, StrAppend(&operands, ", ...(+", remaining, ")"); } } - string extra; + return operands; +} + +std::vector HloInstruction::ExtraAttributesToString() const { + std::vector extra; if (CanHaveDimensionsField()) { - StrAppend(&extra, ", dimensions={", Join(dimensions(), ","), "}"); + extra.push_back(StrCat("dimensions={", Join(dimensions(), ","), "}")); } if (window_ != nullptr) { - StrAppend(&extra, ", ", window_util::ToString(*window_)); + extra.push_back(window_util::ToString(*window_)); } if (padding_config_ != nullptr) { - StrAppend(&extra, ", padding=", padding_config_->ShortDebugString()); + extra.push_back(StrCat("padding=", padding_config_->ShortDebugString())); } if (!slice_starts_.empty() && !slice_limits_.empty()) { std::vector bounds; @@ -1728,45 +1740,38 @@ string HloInstruction::ToString(bool compact_operands, bounds.push_back( StrCat("[", slice_starts_[i], ":", slice_limits_[i], "]")); } - StrAppend(&extra, ", slice={", Join(bounds, ", "), "}"); + extra.push_back(StrCat("slice={", Join(bounds, ", "), "}")); } if (convolution_dimension_numbers_ != nullptr) { - StrAppend(&extra, ", ", ConvolutionDimensionNumbersToString()); + extra.push_back(ConvolutionDimensionNumbersToString()); } if (opcode() == HloOpcode::kWhile) { - StrAppend(&extra, ", condition=", while_condition()->name()); - StrAppend(&extra, ", body=", while_body()->name()); + extra.push_back(StrCat("condition=", while_condition()->name())); + extra.push_back(StrCat("body=", while_body()->name())); } else if (opcode() == HloOpcode::kSelectAndScatter) { - StrAppend(&extra, ", select=", select()->name()); - StrAppend(&extra, ", scatter=", scatter()->name()); + extra.push_back(StrCat("select=", select()->name())); + extra.push_back(StrCat("scatter=", scatter()->name())); } else if (!called_computations().empty()) { - StrAppend(&extra, ", calls=", - Join(called_computations(), ", ", - [](string* out, const HloComputation* computation) { - StrAppend(out, computation->name()); - })); + extra.push_back(StrCat( + "calls=", Join(called_computations(), ", ", + [](string* out, const HloComputation* computation) { + StrAppend(out, computation->name()); + }))); } if (opcode() == HloOpcode::kGetTupleElement) { - StrAppend(&extra, ", index=", tuple_index()); + extra.push_back(StrCat("index=", tuple_index())); } if (!control_successors_.empty()) { - StrAppend( - &extra, ", control-successors=", + extra.push_back(StrCat( + "control-successors=", Join(control_successors_, ", ", [](string* out, HloInstruction* succ) { StrAppend(out, succ->name()); - })); + }))); } - if (include_metadata && - (!metadata_.op_type().empty() || !metadata_.op_name().empty() || - !metadata_.source_file().empty())) { - StrAppend(&extra, " # metadata=", metadata_.ShortDebugString()); - } - - return StrCat(name(), " = ", ShapeUtil::HumanStringWithLayout(shape()), " ", - ExtendedOpcodeStr(), "(", 
operands, ")", extra); + return extra; } string HloInstruction::ToShortString() const { diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 5688fcc4255..ca6f27bd40e 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -548,6 +548,14 @@ class HloInstruction { string ToString(bool compact_operands = false, bool include_metadata = true) const; + // Components of the ToString() representation: + + // Returns a string representation of the operand list. + string OperandsToString(bool compact) const; + + // Returns string representation of op-specific attributes. + std::vector ExtraAttributesToString() const; + string ToStringNoMetadata() const { return ToString(false, false); } // As ToString, but returns a shorter string. @@ -797,8 +805,7 @@ class HloInstruction { const Shape& shape, tensorflow::gtl::ArraySlice operands); - // Returns the computations this instruction calls (if any). This includes - // computations called by fused instructions inside of a fusion instruction. + // Returns the computations this instruction directly calls (if any). const std::vector& called_computations() const { return called_computations_; } diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc index ea5749581b5..2e1eeee36b5 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc @@ -758,16 +758,13 @@ TEST_F(HloInstructionTest, FusionOpWithCalledComputations) { auto* fusion = computation->CreateFusionInstruction( {map_3_y}, HloInstruction::FusionKind::kLoop); auto* fused_computation = fusion->fused_instructions_computation(); - EXPECT_THAT(fusion->called_computations(), - ElementsAre(fused_computation, computation_y)); + EXPECT_THAT(fusion->called_computations(), ElementsAre(fused_computation)); fusion->FuseInstruction(map_2_x); - EXPECT_THAT(fusion->called_computations(), - ElementsAre(fused_computation, computation_y, computation_x)); + EXPECT_THAT(fusion->called_computations(), ElementsAre(fused_computation)); fusion->FuseInstruction(map_1_x); - EXPECT_THAT(fusion->called_computations(), - ElementsAre(fused_computation, computation_y, computation_x)); + EXPECT_THAT(fusion->called_computations(), ElementsAre(fused_computation)); } TEST_F(HloInstructionTest, ComplexFusionOp) { diff --git a/tensorflow/compiler/xla/service/hlo_ordering_test.cc b/tensorflow/compiler/xla/service/hlo_ordering_test.cc index ad6070a9c1b..c95e44bd5d9 100644 --- a/tensorflow/compiler/xla/service/hlo_ordering_test.cc +++ b/tensorflow/compiler/xla/service/hlo_ordering_test.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_scheduling.h" @@ -218,6 +219,94 @@ TEST_F(HloOrderingTest, InstructionsInWhileComputations) { EXPECT_FALSE(ordering.ExecutesBefore(body_param, cond_param)); } +TEST_F(HloOrderingTest, ValuesInWhileComputations) { + // Tests the ordering of values (defined by dataflow analysis) in the body and + // condition of a while instruction. 
HLO code: + // + // body(F32[]) %param): + // %negate = Negate(%param) + // + // condition(F32[] %param): + // %convert = Convert(%param) + // + // entry: + // %constant = Constant(1.0) + // %while = While(%constant, body, condition) + // %add = Add(%constant, %while) + // + auto module = CreateNewModule(); + const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); + + auto body_builder = HloComputation::Builder("body"); + auto body_param = body_builder.AddInstruction( + HloInstruction::CreateParameter(0, scalar_shape, "body_param")); + auto negate = body_builder.AddInstruction(HloInstruction::CreateUnary( + scalar_shape, HloOpcode::kNegate, body_param)); + HloComputation* body = module->AddEmbeddedComputation(body_builder.Build()); + + auto cond_builder = HloComputation::Builder("condition"); + auto cond_param = cond_builder.AddInstruction( + HloInstruction::CreateParameter(0, scalar_shape, "cond_param")); + auto convert = cond_builder.AddInstruction(HloInstruction::CreateConvert( + ShapeUtil::MakeShape(xla::PRED, {}), cond_param)); + HloComputation* condition = + module->AddEmbeddedComputation(cond_builder.Build()); + + auto builder = HloComputation::Builder(TestName()); + auto constant = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + auto xla_while = builder.AddInstruction( + HloInstruction::CreateWhile(scalar_shape, condition, body, constant)); + auto add = builder.AddInstruction(HloInstruction::CreateBinary( + scalar_shape, HloOpcode::kAdd, constant, xla_while)); + module->AddEntryComputation(builder.Build()); + + TF_ASSERT_OK_AND_ASSIGN( + auto dataflow, HloDataflowAnalysis::Run(module.get(), /*ssa_form=*/true)); + DependencyHloOrdering ordering(module.get()); + + // Init value is defined before the while, but live range is not before the + // while because of the use of the init value in the add. + EXPECT_TRUE(ordering.IsDefinedBefore(dataflow->GetValueDefinedAt(constant), + dataflow->GetValueDefinedAt(xla_while))); + EXPECT_FALSE( + ordering.LiveRangeStrictlyBefore(dataflow->GetValueDefinedAt(constant), + dataflow->GetValueDefinedAt(xla_while))); + EXPECT_TRUE(ordering.MayInterfere(dataflow->GetValueDefinedAt(constant), + dataflow->GetValueDefinedAt(xla_while))); + + // Any value defined in the body or condition is defined before the while, and + // has a live range strictly before the while. + EXPECT_TRUE(ordering.IsDefinedBefore(dataflow->GetValueDefinedAt(negate), + dataflow->GetValueDefinedAt(xla_while))); + EXPECT_TRUE( + ordering.LiveRangeStrictlyBefore(dataflow->GetValueDefinedAt(negate), + dataflow->GetValueDefinedAt(xla_while))); + EXPECT_FALSE(ordering.MayInterfere(dataflow->GetValueDefinedAt(negate), + dataflow->GetValueDefinedAt(xla_while))); + + EXPECT_TRUE(ordering.IsDefinedBefore(dataflow->GetValueDefinedAt(convert), + dataflow->GetValueDefinedAt(xla_while))); + EXPECT_TRUE( + ordering.LiveRangeStrictlyBefore(dataflow->GetValueDefinedAt(convert), + dataflow->GetValueDefinedAt(xla_while))); + EXPECT_FALSE(ordering.MayInterfere(dataflow->GetValueDefinedAt(convert), + dataflow->GetValueDefinedAt(xla_while))); + + // The live range of the while should be before the add. 
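The assertions that follow exercise the ordering queries directly. Outside a test, a client pass might combine the dataflow analysis with an ordering in the same way; here is a minimal sketch under that assumption, where `ValuesMayInterfere` is a hypothetical helper and `a`/`b` are the instructions of interest.

#include <memory>

#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
#include "tensorflow/compiler/xla/service/hlo_ordering.h"

namespace xla {

StatusOr<bool> ValuesMayInterfere(HloModule* module, const HloInstruction* a,
                                  const HloInstruction* b) {
  // SSA-form dataflow gives one HloValue per (instruction, index) definition.
  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloDataflowAnalysis> dataflow,
                      HloDataflowAnalysis::Run(module, /*ssa_form=*/true));
  DependencyHloOrdering ordering(module);
  // Two values interfere unless one live range is strictly before the other.
  return ordering.MayInterfere(dataflow->GetValueDefinedAt(a),
                               dataflow->GetValueDefinedAt(b));
}

}  // namespace xla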
+ EXPECT_TRUE(ordering.IsDefinedBefore(dataflow->GetValueDefinedAt(xla_while), + dataflow->GetValueDefinedAt(add))); + ASSERT_EQ(dataflow->GetValueDefinedAt(xla_while).uses().size(), 1); + + const HloUse& while_use = dataflow->GetValueDefinedAt(xla_while).uses()[0]; + EXPECT_EQ(while_use.instruction, add); + EXPECT_TRUE(ordering.UseIsBeforeValueDefinition( + while_use, dataflow->GetValueDefinedAt(add))); + EXPECT_TRUE( + ordering.LiveRangeStrictlyBefore(dataflow->GetValueDefinedAt(xla_while), + dataflow->GetValueDefinedAt(add))); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc index eb3da111a24..7ad33c8947c 100644 --- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc +++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" using ::tensorflow::strings::StrAppend; +using ::tensorflow::strings::StrCat; namespace xla { @@ -54,11 +55,18 @@ StatusOr HloPassPipeline::Run(HloModule* module) { << tensorflow::str_util::Join(disabled_passes, ", "); } - auto run_invariant_checkers = [this, module]() -> Status { + auto run_invariant_checkers = [this, + module](const string& message) -> Status { for (auto& invariant_checker : invariant_checkers_) { VLOG(1) << " Invariant checker " << invariant_checker->name(); - TF_ASSIGN_OR_RETURN(bool changed, invariant_checker->Run(module)); - TF_RET_CHECK(!changed) << "invariant checkers must not change the graph"; + StatusOr changed_status = invariant_checker->Run(module); + if (!changed_status.ok()) { + return Status(changed_status.status().code(), + StrCat(changed_status.status().error_message(), + "\n\nFailed ", message)); + } + TF_RET_CHECK(!changed_status.ValueOrDie()) + << "invariant checkers must not change the graph"; } return Status::OK(); }; @@ -66,6 +74,8 @@ StatusOr HloPassPipeline::Run(HloModule* module) { string prefix = name().ToString() + ": pipeline start"; bool changed = false; string message; + TF_RETURN_IF_ERROR( + run_invariant_checkers(StrCat("before running pipeline: ", name()))); for (auto& pass : passes_) { if (disabled_passes.count(pass->name().ToString()) > 0) { VLOG(1) << " Skipping HLO pass " << pass->name() @@ -80,14 +90,14 @@ StatusOr HloPassPipeline::Run(HloModule* module) { StrAppend(&message, prefix, ", before ", pass->name()); DumpModule(*module, message); - TF_RETURN_IF_ERROR(run_invariant_checkers()); TF_ASSIGN_OR_RETURN(bool changed_this_pass, pass->Run(module)); + TF_RETURN_IF_ERROR( + run_invariant_checkers(StrCat("after running pass: ", pass->name()))); changed |= changed_this_pass; prefix.clear(); StrAppend(&prefix, name(), ": after ", pass->name()); } - TF_RETURN_IF_ERROR(run_invariant_checkers()); DumpModule(*module, prefix + ", pipeline end"); return changed; } diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc index 278a1d7efad..6e5d7bca75c 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc @@ -1202,7 +1202,7 @@ StatusOr HloRematerialization::RematerializeComputation( StatusOr HloRematerialization::Run( HloModule* module, SequentialHloOrdering::HloModuleSequence* sequence, - int64 memory_limit_bytes) { + int64 memory_limit_bytes, RematerializationSizes* sizes) { // The sequence is constructed entirely by this method. 
TF_RET_CHECK(sequence->empty()); @@ -1248,7 +1248,8 @@ StatusOr HloRematerialization::Run( sequence->at(node.computation()))); } return Status::OK(); - })); + }, + /*visit_unreachable_nodes=*/false)); // The peak memory usage of the module equals the peak memory use of the entry // computation plus the output size of the computation. This is because the @@ -1318,13 +1319,20 @@ StatusOr HloRematerialization::Run( << HumanReadableNumBytes(reduced_peak_memory) << " (" << reduced_peak_memory << " bytes)"; + if (sizes != nullptr) { + sizes->before_bytes = before_peak_memory; + sizes->after_bytes = current_peak_memory; + } + XLA_VLOG_LINES(3, "After HloRematerialization:\n" + module->ToString()); if (current_peak_memory > memory_limit_bytes) { - LOG(WARNING) << "Can't reduce memory use below " - << HumanReadableNumBytes(memory_limit_bytes) - << " by rematerialization (only reduced to " - << HumanReadableNumBytes(current_peak_memory) << ")"; + LOG(WARNING) << tensorflow::strings::Printf( + "Can't reduce memory use below %s (%lld bytes) by rematerialization; " + "only reduced to %s (%lld bytes)", + HumanReadableNumBytes(memory_limit_bytes).c_str(), memory_limit_bytes, + HumanReadableNumBytes(current_peak_memory).c_str(), + current_peak_memory); } return changed; @@ -1333,9 +1341,10 @@ StatusOr HloRematerialization::Run( /* static */ StatusOr HloRematerialization::RematerializeAndSchedule( const HloRematerialization::ShapeSizeFunction& size_function, int64 memory_limit_bytes, HloModule* hlo_module, - SequentialHloOrdering::HloModuleSequence* sequence) { + SequentialHloOrdering::HloModuleSequence* sequence, + RematerializationSizes* sizes) { HloRematerialization remat(size_function); - return remat.Run(hlo_module, sequence, memory_limit_bytes); + return remat.Run(hlo_module, sequence, memory_limit_bytes, sizes); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h index 42c279d440b..11f79a6d415 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.h +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h @@ -28,6 +28,13 @@ class HloRematerialization { public: using ShapeSizeFunction = std::function; + // Helper struct that communicates the before / after sizes for the + // rematerialization process. + struct RematerializationSizes { + int64 before_bytes; + int64 after_bytes; + }; + // Rematerialize HLO instructions in the given module to reduce peak memory // use below memory_limit_bytes where memory use is defined as the total size // of all live HLO instruction values. Parameters and constants are included @@ -46,6 +53,9 @@ class HloRematerialization { // rematerialization. This is the order in which HLO instructions should // be emitted to minimize memory use. // + // sizes: Optional outparam that indicates the peak memory usage of the HLO + // module before/after rematerialization. + // // Returns whether any instructions were rematerialized. If memory use is // already below the given limit then no instructions are rematerialized and // false is returned. @@ -55,8 +65,8 @@ class HloRematerialization { // code generation. 
static StatusOr RematerializeAndSchedule( const ShapeSizeFunction& size_function, int64 memory_limit_bytes, - HloModule* hlo_module, - SequentialHloOrdering::HloModuleSequence* sequence); + HloModule* hlo_module, SequentialHloOrdering::HloModuleSequence* sequence, + RematerializationSizes* sizes = nullptr); protected: HloRematerialization(const ShapeSizeFunction& size_function) @@ -69,7 +79,7 @@ class HloRematerialization { // contains the memory-minimizing order in which to emit the HLO instructions. StatusOr Run(HloModule* module, SequentialHloOrdering::HloModuleSequence* sequence, - int64 memory_limit); + int64 memory_limit, RematerializationSizes* sizes); // Rematerializes instructions within the given computation. 'order' is the // order in which the computation's instructions will be emitted in the diff --git a/tensorflow/compiler/xla/service/hlo_value.cc b/tensorflow/compiler/xla/service/hlo_value.cc index f85d8ec50de..e6cf0d37b8a 100644 --- a/tensorflow/compiler/xla/service/hlo_value.cc +++ b/tensorflow/compiler/xla/service/hlo_value.cc @@ -159,12 +159,6 @@ void HloValue::AddPosition(HloInstruction* instruction, for (const HloPosition& position : positions_) { DCHECK_NE(position, new_position); } - // The shape of the new position must match existing positions. - if (!positions_.empty()) { - CHECK( - ShapeUtil::Compatible(positions_.front().shape(), new_position.shape())) - << "front: " << positions_.front() << " new: " << new_position; - } positions_.push_back(std::move(new_position)); diff --git a/tensorflow/compiler/xla/service/hlo_value.h b/tensorflow/compiler/xla/service/hlo_value.h index 63ecc25020b..6872bc76a82 100644 --- a/tensorflow/compiler/xla/service/hlo_value.h +++ b/tensorflow/compiler/xla/service/hlo_value.h @@ -225,6 +225,9 @@ class HloValueSet { // already exist in the set. bool AddValue(const HloValue* value); + // Clear all values from the set. + void Clear() { values_.clear(); } + // Return the unique HLO value in the set. CHECKs if the set does not contain // exactly one value. 
const HloValue& GetUniqueValue() const { diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 9ba2d54d024..c44be716cdf 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -32,13 +32,11 @@ class ShapeVerifier : public DfsHloVisitor { const std::function& shape_size_fn) : shape_size_fn_(shape_size_fn) {} - Status HandleElementwiseUnary(HloInstruction* hlo, - HloOpcode opcode) override { + Status HandleElementwiseUnary(HloInstruction* hlo) override { return CheckUnaryShape(hlo); } - Status HandleElementwiseBinary(HloInstruction* hlo, - HloOpcode opcode) override { + Status HandleElementwiseBinary(HloInstruction* hlo) override { return CheckBinaryShape(hlo); } @@ -282,6 +280,14 @@ class ShapeVerifier : public DfsHloVisitor { const std::function shape_size_fn_; }; +string ComputationsToString( + tensorflow::gtl::ArraySlice computations) { + return tensorflow::str_util::Join( + computations, ",", [](string* s, const HloComputation* computation) { + s->append(computation->name()); + }); +} + } // namespace StatusOr HloVerifier::Run(HloModule* module) { @@ -292,6 +298,17 @@ StatusOr HloVerifier::Run(HloModule* module) { for (const auto& instruction : computation->instructions()) { TF_RET_CHECK(instruction->parent() == computation.get()); if (instruction->opcode() == HloOpcode::kFusion) { + TF_RET_CHECK( + ContainersEqual(instruction->called_computations(), + {instruction->fused_instructions_computation()})) + << "Fusion HLO calls computations other than the " + "fused_instructions_computation: " + << instruction->ToString() + << " instruction->fused_instructions_computation(): " + << instruction->fused_instructions_computation()->ToString() + << " instruction->called_computations(): " + << ComputationsToString(instruction->called_computations()); + for (const auto& fused : instruction->fused_instructions()) { TF_RET_CHECK(fused->parent() == instruction->fused_instructions_computation()) diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc index 01dbb7e8663..8275531111c 100644 --- a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc +++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc @@ -122,7 +122,9 @@ StatusOr ReducePrecisionInsertion::insert_on_inputs( continue; } - if (instruction->opcode() == HloOpcode::kFusion) { + if (instruction->opcode() == HloOpcode::kFusion && + (instruction->fusion_kind() == HloInstruction::FusionKind::kLoop || + instruction->fusion_kind() == HloInstruction::FusionKind::kInput)) { // Insert the reduce-precision operation inside the fusion computation, // after the corresponding parameter instruction. TF_ASSIGN_OR_RETURN( @@ -171,7 +173,9 @@ StatusOr ReducePrecisionInsertion::insert_on_outputs( continue; } - if (instruction->opcode() == HloOpcode::kFusion) { + if (instruction->opcode() == HloOpcode::kFusion && + (instruction->fusion_kind() == HloInstruction::FusionKind::kLoop || + instruction->fusion_kind() == HloInstruction::FusionKind::kOutput)) { // Insert the reduce-precision operation as the last operation inside // the fusion computation. 
HloInstruction* fusion_root = instruction->fused_expression_root(); diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index 52b2027aece..9f7ae4ae873 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -215,6 +215,7 @@ cc_library( ], deps = [ "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", "//tensorflow/core:test", diff --git a/tensorflow/compiler/xla/tests/filecheck.cc b/tensorflow/compiler/xla/tests/filecheck.cc index 407b5f4ada5..b61544466a1 100644 --- a/tensorflow/compiler/xla/tests/filecheck.cc +++ b/tensorflow/compiler/xla/tests/filecheck.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/io/path.h" diff --git a/tensorflow/compiler/xla/tests/filecheck.h b/tensorflow/compiler/xla/tests/filecheck.h index 599bf57ad32..493ff7414bd 100644 --- a/tensorflow/compiler/xla/tests/filecheck.h +++ b/tensorflow/compiler/xla/tests/filecheck.h @@ -19,6 +19,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" namespace xla { diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD index a946d335ca6..da39ba3ffc3 100644 --- a/tensorflow/compiler/xla/tools/BUILD +++ b/tensorflow/compiler/xla/tools/BUILD @@ -111,6 +111,11 @@ cc_binary( deps = [ ":replay_computation_library", "//tensorflow/compiler/plugin/executor:plugin_lib", + # TODO: This dependency is a workaround for a linking error with clang. + # Without it, the linker complains about missing symbols from + # 'xla_device_launch_op'. This dependency should be propagated from + # plugin_lib instead, but no targets other than this break without it.
+ "//tensorflow/compiler/jit", ], ) diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc index bd93e114b73..89b26b8916b 100644 --- a/tensorflow/compiler/xla/tools/replay_computation.cc +++ b/tensorflow/compiler/xla/tools/replay_computation.cc @@ -144,7 +144,7 @@ int RealMain(tensorflow::gtl::ArraySlice args, int main(int argc, char** argv) { // Flags - string fake_infeed_shape; + xla::string fake_infeed_shape; bool use_fake_data = false; const std::vector flag_list = { tensorflow::Flag("use_fake_data", &use_fake_data, diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD index 47a0f54a023..84fcc0d0149 100644 --- a/tensorflow/contrib/BUILD +++ b/tensorflow/contrib/BUILD @@ -28,6 +28,7 @@ py_library( "//tensorflow/contrib/ffmpeg:ffmpeg_ops_py", "//tensorflow/contrib/framework:framework_py", "//tensorflow/contrib/fused_conv:fused_conv_py", + "//tensorflow/contrib/gan", "//tensorflow/contrib/graph_editor:graph_editor_py", "//tensorflow/contrib/grid_rnn:grid_rnn_py", "//tensorflow/contrib/hooks", @@ -72,6 +73,7 @@ py_library( "//tensorflow/contrib/staging", "//tensorflow/contrib/stat_summarizer:stat_summarizer_py", "//tensorflow/contrib/stateless", + "//tensorflow/contrib/summary:summary_ops", "//tensorflow/contrib/tensor_forest:init_py", "//tensorflow/contrib/tensorboard", "//tensorflow/contrib/testing:testing_py", diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py index 315ea943cf3..d1d0e2823ad 100644 --- a/tensorflow/contrib/__init__.py +++ b/tensorflow/contrib/__init__.py @@ -31,6 +31,7 @@ from tensorflow.contrib import deprecated from tensorflow.contrib import distributions from tensorflow.contrib import factorization from tensorflow.contrib import framework +from tensorflow.contrib import gan from tensorflow.contrib import graph_editor from tensorflow.contrib import grid_rnn from tensorflow.contrib import image diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py index c377c50e9fe..a8b60460c8f 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py +++ b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py @@ -18,6 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import collections +import os + from tensorflow.contrib.boosted_trees.proto import tree_config_pb2 from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch from tensorflow.contrib.decision_trees.proto import generic_tree_model_extensions_pb2 @@ -26,18 +29,21 @@ from tensorflow.contrib.learn.python.learn import export_strategy from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils from tensorflow.python.client import session as tf_session from tensorflow.python.framework import ops +from tensorflow.python.platform import gfile from tensorflow.python.saved_model import loader as saved_model_loader from tensorflow.python.saved_model import tag_constants -def make_custom_export_strategy(name, convert_fn, feature_columns, +def make_custom_export_strategy(name, + convert_fn, + feature_columns, export_input_fn): """Makes custom exporter of GTFlow tree format. Args: name: A string, for the name of the export strategy. convert_fn: A function that converts the tree proto to desired format and - saves it to the desired location. 
+ saves it to the desired location. Can be None to skip conversion. feature_columns: A list of feature columns. export_input_fn: A function that takes no arguments and returns an `InputFnOps`. @@ -68,9 +74,22 @@ def make_custom_export_strategy(name, convert_fn, feature_columns, dtec = tree_config_pb2.DecisionTreeEnsembleConfig() dtec.ParseFromString(dfec_str) # Export the result in the same folder as the saved model. - convert_fn(dtec, sorted_feature_names, len(dense_floats), - len(sparse_float_indices), len(sparse_int_indices), - result_dir, eval_result) + if convert_fn: + convert_fn(dtec, sorted_feature_names, + len(dense_floats), + len(sparse_float_indices), + len(sparse_int_indices), result_dir, eval_result) + feature_importances = _get_feature_importances( + dtec, sorted_feature_names, + len(dense_floats), + len(sparse_float_indices), len(sparse_int_indices)) + sorted_by_importance = sorted( + feature_importances.items(), key=lambda x: -x[1]) + assets_dir = os.path.join(result_dir, "assets.extra") + gfile.MakeDirs(assets_dir) + with gfile.GFile(os.path.join(assets_dir, "feature_importances"), + "w") as f: + f.write("\n".join("%s, %f" % (k, v) for k, v in sorted_by_importance)) return result_dir return export_strategy.ExportStrategy(name, export_fn) @@ -157,3 +176,41 @@ def convert_to_universal_format(dtec, sorted_feature_names, node.left_child_id.value = split.left_id node.right_child_id.value = split.right_id return model_and_features + + +def _get_feature_importances(dtec, feature_names, num_dense_floats, + num_sparse_float, num_sparse_int): + """Export the feature importance per feature column.""" + del num_sparse_int # Unused. + sums = collections.defaultdict(lambda: 0) + for tree_idx in range(len(dtec.trees)): + tree = dtec.trees[tree_idx] + for tree_node in tree.nodes: + node_type = tree_node.WhichOneof("node") + if node_type == "dense_float_binary_split": + split = tree_node.dense_float_binary_split + split_column = feature_names[split.feature_column] + elif node_type == "sparse_float_binary_split_default_left": + split = tree_node.sparse_float_binary_split_default_left.split + split_column = feature_names[split.feature_column + num_dense_floats] + elif node_type == "sparse_float_binary_split_default_right": + split = tree_node.sparse_float_binary_split_default_right.split + split_column = feature_names[split.feature_column + num_dense_floats] + elif node_type == "categorical_id_binary_split": + split = tree_node.categorical_id_binary_split + split_column = feature_names[split.feature_column + num_dense_floats + + num_sparse_float] + elif node_type == "categorical_id_set_membership_binary_split": + split = tree_node.categorical_id_set_membership_binary_split + split_column = feature_names[split.feature_column + num_dense_floats + + num_sparse_float] + elif node_type == "leaf": + assert tree_node.node_metadata.gain == 0 + continue + else: + raise ValueError("Unexpected split type %s", node_type) + # Apply shrinkage factor. It is important since it is not always uniform + # across different trees. 
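The comment above folds the per-tree shrinkage weight into the importance sum because shrinkage is not uniform across trees. As a toy illustration of that weighting (hypothetical numbers and column names, not taken from the library), two identical splits contribute different amounts once their trees' weights are applied:

    # Toy sketch: importance = sum over splits of (gain * weight of the tree
    # containing the split), mirroring the accumulation in
    # _get_feature_importances.
    import collections

    tree_weights = [1.0, 0.1]              # the second tree was shrunk more
    split_gains = [[("feature_a", 50.0)],  # (column, gain) pairs per tree
                   [("feature_a", 50.0)]]

    importances = collections.defaultdict(float)
    for tree_idx, splits in enumerate(split_gains):
      for column, gain in splits:
        importances[column] += gain * tree_weights[tree_idx]

    print(dict(importances))  # {'feature_a': 55.0}, not 100.0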
+ sums[split_column] += ( + tree_node.node_metadata.gain * dtec.tree_weights[tree_idx]) + return dict(sums) diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py index 8d801fa1f38..4ed18b2d34c 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py +++ b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py @@ -27,7 +27,7 @@ from tensorflow.python.platform import googletest class ConvertModelTest(test_util.TensorFlowTestCase): - def testConvertModel(self): + def _make_trees(self): dtec_str = """ trees { nodes { @@ -108,8 +108,12 @@ class ConvertModelTest(test_util.TensorFlowTestCase): """ dtec = tree_config_pb2.DecisionTreeEnsembleConfig() text_format.Merge(dtec_str, dtec) - # The feature columns in the order they were added. feature_columns = ["feature_b", "feature_a", "feature_d"] + return dtec, feature_columns + + def testConvertModel(self): + dtec, feature_columns = self._make_trees() + # The feature columns in the order they were added. out = custom_export_strategy.convert_to_universal_format( dtec, feature_columns, 1, 1, 1) @@ -273,6 +277,16 @@ class ConvertModelTest(test_util.TensorFlowTestCase): }""" self.assertProtoEquals(expected_tree, out) + def testFeatureImportance(self): + dtec, feature_columns = self._make_trees() + feature_importances = custom_export_strategy._get_feature_importances( + dtec, feature_columns, 1, 1, 1) + self.assertItemsEqual(["feature_b", "feature_a", "feature_d"], + feature_importances.keys()) + self.assertAlmostEqual(50.0, feature_importances["feature_b"], places=4) + self.assertAlmostEqual(50.0, feature_importances["feature_a"], places=4) + self.assertAlmostEqual(50.0, feature_importances["feature_d"], places=4) + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py index e28adad53ec..f8028acbdb0 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py +++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py @@ -61,11 +61,19 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator): logits_modifier_function: A modifier function for the logits. center_bias: Whether a separate tree should be created for first fitting the bias. + + Raises: + ValueError: If learner_config is not valid. """ head = head_lib.multi_class_head( n_classes=n_classes, weight_column_name=weight_column_name, enable_centered_bias=False) + if learner_config.num_classes == 0: + learner_config.num_classes = n_classes + elif learner_config.num_classes != n_classes: + raise ValueError("n_classes (%d) doesn't match learner_config (%d)." 
% + (learner_config.num_classes, n_classes)) super(GradientBoostedDecisionTreeClassifier, self).__init__( model_fn=model.model_builder, params={ @@ -129,6 +137,10 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator): label_dimension=label_dimension, weight_column_name=weight_column_name, enable_centered_bias=False) + if label_dimension == 1: + learner_config.num_classes = 2 + else: + learner_config.num_classes = label_dimension super(GradientBoostedDecisionTreeRegressor, self).__init__( model_fn=model.model_builder, params={ diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/model.py b/tensorflow/contrib/boosted_trees/estimator_batch/model.py index 2d517f78111..8cda5c8f2b1 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/model.py +++ b/tensorflow/contrib/boosted_trees/estimator_batch/model.py @@ -92,6 +92,7 @@ def model_builder(features, labels, mode, params, config): examples_per_layer=examples_per_layer, learner_config=learner_config, feature_columns=feature_columns, + logits_dimension=head.logits_dimension, features=features) with ops.name_scope("gbdt", "gbdt_optimizer"): predictions_dict = gbdt_model.predict(mode) diff --git a/tensorflow/contrib/boosted_trees/kernels/model_ops.cc b/tensorflow/contrib/boosted_trees/kernels/model_ops.cc index 42112c586a5..f4ad99f779e 100644 --- a/tensorflow/contrib/boosted_trees/kernels/model_ops.cc +++ b/tensorflow/contrib/boosted_trees/kernels/model_ops.cc @@ -74,7 +74,7 @@ class TreeEnsembleStampTokenOp : public OpKernel { decision_tree_ensemble_resource; OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0), &decision_tree_ensemble_resource)); - mutex_lock l(*decision_tree_ensemble_resource->get_mutex()); + tf_shared_lock l(*decision_tree_ensemble_resource->get_mutex()); core::ScopedUnref unref_me(decision_tree_ensemble_resource); Tensor* output_stamp_token_t = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape(), @@ -95,7 +95,7 @@ class TreeEnsembleSerializeOp : public OpKernel { decision_tree_ensemble_resource; OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0), &decision_tree_ensemble_resource)); - mutex_lock l(*decision_tree_ensemble_resource->get_mutex()); + tf_shared_lock l(*decision_tree_ensemble_resource->get_mutex()); core::ScopedUnref unref_me(decision_tree_ensemble_resource); Tensor* output_stamp_token_t = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape(), diff --git a/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc b/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc index daca0495481..8ffd7f120b4 100644 --- a/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc +++ b/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc @@ -143,7 +143,7 @@ class GradientTreesPredictionOp : public OpKernel { // Release the reference to the resource once we're done using it. core::ScopedUnref unref_me(decision_tree_ensemble_resource); if (use_locking_) { - mutex_lock l(*decision_tree_ensemble_resource->get_mutex()); + tf_shared_lock l(*decision_tree_ensemble_resource->get_mutex()); DoCompute(context, decision_tree_ensemble_resource); } else { DoCompute(context, decision_tree_ensemble_resource); @@ -334,7 +334,7 @@ class GradientTreesPartitionExamplesOp : public OpKernel { // Release the reference to the resource once we're done using it. 
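The estimator change above also tightens the contract between the head and the learner config: GradientBoostedDecisionTreeClassifier now fills in learner_config.num_classes from n_classes when it is left at 0 and rejects a conflicting value. A standalone paraphrase of that check (a sketch only; _FakeConfig is a stand-in for a learner_pb2.LearnerConfig):

    def reconcile_num_classes(learner_config, n_classes):
      """Fill in or validate learner_config.num_classes against n_classes."""
      if learner_config.num_classes == 0:
        learner_config.num_classes = n_classes
      elif learner_config.num_classes != n_classes:
        raise ValueError("n_classes (%d) doesn't match learner_config (%d)." %
                         (learner_config.num_classes, n_classes))
      return learner_config

    class _FakeConfig(object):  # stand-in with just the field we touch
      num_classes = 0

    cfg = reconcile_num_classes(_FakeConfig(), n_classes=3)
    print(cfg.num_classes)  # 3
    # The regressor takes the other route in the diff: num_classes is derived
    # from label_dimension (2 when label_dimension == 1, else label_dimension).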
core::ScopedUnref unref_me(decision_tree_ensemble_resource); if (use_locking_) { - mutex_lock l(*decision_tree_ensemble_resource->get_mutex()); + tf_shared_lock l(*decision_tree_ensemble_resource->get_mutex()); DoCompute(context, decision_tree_ensemble_resource); } else { DoCompute(context, decision_tree_ensemble_resource); diff --git a/tensorflow/contrib/boosted_trees/kernels/training_ops.cc b/tensorflow/contrib/boosted_trees/kernels/training_ops.cc index 9e9ef1738cd..d528757cf99 100644 --- a/tensorflow/contrib/boosted_trees/kernels/training_ops.cc +++ b/tensorflow/contrib/boosted_trees/kernels/training_ops.cc @@ -656,7 +656,8 @@ class GrowTreeEnsembleOp : public OpKernel { CHECK(split->split_info.split_node().node_case() != TreeNode::NODE_NOT_SET); CHECK(tree_config->nodes(node_id).node_case() == TreeNode::kLeaf) << "Unexpected node type to split " - << tree_config->nodes(node_id).node_case(); + << tree_config->nodes(node_id).node_case() << " for node_id " << node_id + << ". Tree config: " << tree_config->DebugString(); // Add left leaf. int32 left_id = tree_config->nodes_size(); @@ -767,7 +768,7 @@ class TreeEnsembleStatsOp : public OpKernel { OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0), &decision_tree_ensemble_resource)); core::ScopedUnref unref_me(decision_tree_ensemble_resource); - mutex_lock l(*decision_tree_ensemble_resource->get_mutex()); + tf_shared_lock l(*decision_tree_ensemble_resource->get_mutex()); // Get the stamp token. const Tensor* stamp_token_t; diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler_test.cc index 82664aed72d..f4c7df7fabd 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler_test.cc +++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler_test.cc @@ -42,6 +42,7 @@ class BiasFeatureColumnHandlerTest : public ::testing::Test { example_partitions_({0, 0, 1, 3}) { // Set L2 regularization. learner_config_.mutable_regularization()->set_l2(2.0f); + learner_config_.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS); // Create handler. handler_.reset(new BiasFeatureColumnHandler(kClassId, kSlotId, kBatchSize)); diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler_test.cc index abd72384648..ea82b3f086d 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler_test.cc +++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler_test.cc @@ -51,7 +51,7 @@ class CategoricalFeatureColumnHandlerTest : public ::testing::Test { values_(test::AsTensor({1, 2, 2, 0}, {4})) { // Set L2 regularization. learner_config_.mutable_regularization()->set_l2(2.0f); - + learner_config_.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS); // Create handler. 
handler_.reset(new CategoricalFeatureColumnHandler( kClassId, kSlotId, kBatchSize, kFeatureColumn, indices_.matrix(), diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler_test.cc index 396f48e5321..1bc9d733ad3 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler_test.cc +++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler_test.cc @@ -51,7 +51,7 @@ class DenseQuantizedFeatureColumnHandlerTest : public ::testing::Test { dense_quantized_values_(test::AsTensor({1, 1, 0, 1}, {4})) { // Set L2 regularization. learner_config_.mutable_regularization()->set_l2(2.0f); - + learner_config_.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS); // Create handler. handler_.reset(new DenseQuantizedFeatureColumnHandler( kClassId, kSlotId, kBatchSize, kFeatureColumn, diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler_test.cc index db8c64a617f..643d936ad23 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler_test.cc +++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler_test.cc @@ -53,7 +53,7 @@ class SparseQuantizedFeatureColumnHandlerTest : public ::testing::Test { sparse_quantized_values_(test::AsTensor({1, 0, 1}, {3})) { // Set L2 regularization. learner_config_.mutable_regularization()->set_l2(2.0f); - + learner_config_.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS); // Create handler. 
handler_.reset(new SparseQuantizedFeatureColumnHandler( kClassId, kSlotId, kBatchSize, kFeatureColumn, diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats_test.cc index f99b6826a78..ecb7a04efb9 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats_test.cc +++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats_test.cc @@ -30,6 +30,7 @@ const double kDelta = 1e-5; TEST(NodeStatsTest, AlmostZero) { LearnerConfig learner_config; + learner_config.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS); NodeStats node_stats(learner_config, GradientStats(1e-8f, 1e-8f)); EXPECT_EQ(0, node_stats.weight_contribution[0]); EXPECT_EQ(0, node_stats.gain); @@ -37,6 +38,7 @@ TEST(NodeStatsTest, AlmostZero) { TEST(NodeStatsTest, LessThanMinWeightConstraint) { LearnerConfig learner_config; + learner_config.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS); learner_config.mutable_constraints()->set_min_node_weight(3.2f); NodeStats node_stats(learner_config, GradientStats(7.32f, 1.63f)); EXPECT_EQ(0, node_stats.weight_contribution[0]); @@ -45,6 +47,7 @@ TEST(NodeStatsTest, LessThanMinWeightConstraint) { TEST(NodeStatsTest, L1RegSquashed) { LearnerConfig learner_config; + learner_config.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS); learner_config.mutable_regularization()->set_l1(10.0f); NodeStats node_stats(learner_config, GradientStats(7.32f, 1.63f)); EXPECT_EQ(0, node_stats.weight_contribution[0]); @@ -53,6 +56,7 @@ TEST(NodeStatsTest, L1RegSquashed) { TEST(NodeStatsTest, L1RegPos) { LearnerConfig learner_config; + learner_config.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS); learner_config.mutable_regularization()->set_l1(5.0f); NodeStats node_stats(learner_config, GradientStats(7.32f, 1.63f)); const float expected_clipped_grad = 7.32f - 5.0f; @@ -66,6 +70,7 @@ TEST(NodeStatsTest, L1RegPos) { TEST(NodeStatsTest, L1RegNeg) { LearnerConfig learner_config; + learner_config.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS); learner_config.mutable_regularization()->set_l1(5.0f); NodeStats node_stats(learner_config, GradientStats(-7.32f, 1.63f)); const float expected_clipped_grad = -7.32f + 5.0f; @@ -79,6 +84,7 @@ TEST(NodeStatsTest, L1RegNeg) { TEST(NodeStatsTest, L2Reg) { LearnerConfig learner_config; + learner_config.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS); learner_config.mutable_regularization()->set_l2(8.0f); NodeStats node_stats(learner_config, GradientStats(7.32f, 1.63f)); const float expected_denom = 1.63f + 8.0f; @@ -91,6 +97,7 @@ TEST(NodeStatsTest, L2Reg) { TEST(NodeStatsTest, L1L2Reg) { LearnerConfig learner_config; + learner_config.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS); learner_config.mutable_regularization()->set_l1(5.0f); learner_config.mutable_regularization()->set_l2(8.0f); NodeStats node_stats(learner_config, GradientStats(7.32f, 1.63f)); diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h index 314c44fddc5..dad3b4e10de 100644 --- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h +++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h @@ -15,6 +15,7 @@ #ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_SUMMARY_H_ #define 
THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_SUMMARY_H_ +#include #include #include "tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h" @@ -34,10 +35,27 @@ class WeightedQuantilesSummary { struct SummaryEntry { SummaryEntry(const ValueType& v, const WeightType& w, const WeightType& min, - const WeightType& max) - : value(v), weight(w), min_rank(min), max_rank(max) {} + const WeightType& max) { + // Explicitely initialize all of memory (including padding from memory + // alignment) to allow the struct to be msan-resistant "plain old data". + // + // POD = http://en.cppreference.com/w/cpp/concept/PODType + memset(this, 0, sizeof(*this)); - SummaryEntry() : value(0), weight(0), min_rank(0), max_rank(0) {} + value = v; + weight = w; + min_rank = min; + max_rank = max; + } + + SummaryEntry() { + memset(this, 0, sizeof(*this)); + + value = 0; + weight = 0; + min_rank = 0; + max_rank = 0; + } bool operator==(const SummaryEntry& other) const { return value == other.value && weight == other.weight && diff --git a/tensorflow/contrib/boosted_trees/proto/learner.proto b/tensorflow/contrib/boosted_trees/proto/learner.proto index 06ee223467b..919e7cd8142 100644 --- a/tensorflow/contrib/boosted_trees/proto/learner.proto +++ b/tensorflow/contrib/boosted_trees/proto/learner.proto @@ -17,7 +17,7 @@ message TreeRegularizationConfig { // Tree constraints config. message TreeConstraintsConfig { - // Maximum depth of the trees. + // Maximum depth of the trees. The default value is 6 if not specified. uint32 max_tree_depth = 1; // Min hessian weight per node. @@ -86,20 +86,22 @@ message LearningRateDropoutDrivenConfig { message LearnerConfig { enum PruningMode { - PRE_PRUNE = 0; - POST_PRUNE = 1; + PRUNING_MODE_UNSPECIFIED = 0; + PRE_PRUNE = 1; + POST_PRUNE = 2; } enum GrowingMode { - WHOLE_TREE = 0; - // Layer by layer is only supported by the batch learner. - LAYER_BY_LAYER = 1; + GROWING_MODE_UNSPECIFIED = 0; + WHOLE_TREE = 1; + LAYER_BY_LAYER = 2; } enum MultiClassStrategy { - TREE_PER_CLASS = 0; - FULL_HESSIAN = 1; - DIAGONAL_HESSIAN = 2; + MULTI_CLASS_STRATEGY_UNSPECIFIED = 0; + TREE_PER_CLASS = 1; + FULL_HESSIAN = 2; + DIAGONAL_HESSIAN = 3; } // Number of classes. @@ -118,16 +120,18 @@ message LearnerConfig { // Constraints. TreeConstraintsConfig constraints = 5; - // Pruning. + // Pruning. POST_PRUNE is the default pruning mode. PruningMode pruning_mode = 8; - // Growing Mode. + // Growing Mode. LAYER_BY_LAYER is the default growing mode. GrowingMode growing_mode = 9; - // Learning rate. + // Learning rate. By default we use fixed learning rate of 0.1. LearningRateConfig learning_rate_tuner = 6; - // Multi-class strategy. + // Multi-class strategy. By default we use TREE_PER_CLASS for binary + // classification and linear regression. For other cases, we use + // DIAGONAL_HESSIAN as the default. MultiClassStrategy multi_class_strategy = 10; // If you want to average the ensembles (for regularization), provide the diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py index 51e084b79c6..37595f1c75d 100644 --- a/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py +++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py @@ -344,6 +344,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase): # Prepare learner config. 
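Because the enums above were renumbered so that 0 now means *_UNSPECIFIED rather than PRE_PRUNE / WHOLE_TREE / TREE_PER_CLASS, callers that relied on the old zero defaults (like the test below) now set these fields explicitly. A minimal sketch of building a config under the new scheme, assuming the usual contrib import path for learner_pb2; the values are illustrative:

    from tensorflow.contrib.boosted_trees.proto import learner_pb2

    learner_config = learner_pb2.LearnerConfig()
    learner_config.num_classes = 2
    # 0 is now *_UNSPECIFIED, so name the old defaults explicitly if you want them.
    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
    learner_config.pruning_mode = learner_pb2.LearnerConfig.POST_PRUNE
    learner_config.multi_class_strategy = learner_pb2.LearnerConfig.TREE_PER_CLASS
    # Fields left unset fall back to the documented defaults, e.g.
    # constraints.max_tree_depth == 6 and a fixed learning rate of 0.1.
    print(learner_config)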
learner_config = learner_pb2.LearnerConfig() learner_config.num_classes = 2 + learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE result, result_no_dropout, dropout_info = ( prediction_ops.gradient_trees_prediction( diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py index 6f85874a33a..2d28e0a9f16 100644 --- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py +++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py @@ -261,6 +261,7 @@ class GradientBoostedDecisionTreeModel(object): examples_per_layer, learner_config, features, + logits_dimension, feature_columns=None): """Construct a new GradientBoostedDecisionTreeModel function. @@ -273,8 +274,8 @@ class GradientBoostedDecisionTreeModel(object): a tree layer. It can also be a function that computes the number of examples based on the depth of the layer that's being built. learner_config: A learner config. - print split, sorted_feature_names[split.feature_column] features: `dict` of `Tensor` objects. + logits_dimension: An int, the dimension of logits. feature_columns: A list of feature columns. Raises: @@ -289,11 +290,39 @@ class GradientBoostedDecisionTreeModel(object): if learner_config.num_classes < 2: raise ValueError("Number of classes must be >=2") + self._logits_dimension = logits_dimension self._is_chief = is_chief self._num_ps_replicas = num_ps_replicas self._ensemble_handle = ensemble_handle self._center_bias = center_bias self._examples_per_layer = examples_per_layer + + # Fill in the defaults. + if (learner_config.multi_class_strategy == + learner_pb2.LearnerConfig.MULTI_CLASS_STRATEGY_UNSPECIFIED): + if logits_dimension == 1: + learner_config.multi_class_strategy = ( + learner_pb2.LearnerConfig.TREE_PER_CLASS) + else: + learner_config.multi_class_strategy = ( + learner_pb2.LearnerConfig.DIAGONAL_HESSIAN) + + if (learner_config.growing_mode == + learner_pb2.LearnerConfig.GROWING_MODE_UNSPECIFIED): + learner_config.growing_mode = learner_pb2.LearnerConfig.LAYER_BY_LAYER + + if (learner_config.pruning_mode == + learner_pb2.LearnerConfig.PRUNING_MODE_UNSPECIFIED): + learner_config.pruning_mode = learner_pb2.LearnerConfig.POST_PRUNE + + if learner_config.constraints.max_tree_depth == 0: + # Use 6 as the default maximum depth. + learner_config.constraints.max_tree_depth = 6 + + tuner = learner_config.learning_rate_tuner.WhichOneof("tuner") + if not tuner: + learner_config.learning_rate_tuner.fixed.learning_rate = 0.1 + self._learner_config = learner_config self._feature_columns = feature_columns self._learner_config_serialized = learner_config.SerializeToString() @@ -378,75 +407,81 @@ class GradientBoostedDecisionTreeModel(object): local_stamp), _refresh_local_ensemble_fn, lambda: (control_flow_ops.no_op(), ensemble_stamp)) - # Once updated, Use the the local model for prediction. + # Once updated, use the local model for prediction. with ops.control_dependencies([refresh_local_ensemble]): ensemble_stats = training_ops.tree_ensemble_stats( local_ensemble_handle, ensemble_stamp) - apply_dropout, seed = _dropout_params(mode, ensemble_stats) # We don't need dropout info - we can always restore it based on the # seed. 
- predictions, predictions_no_dropout, _ = ( - prediction_ops.gradient_trees_prediction( - local_ensemble_handle, - seed, - self._dense_floats, - self._sparse_float_indices, - self._sparse_float_values, - self._sparse_float_shapes, - self._sparse_int_indices, - self._sparse_int_values, - self._sparse_int_shapes, - learner_config=self._learner_config_serialized, - apply_dropout=apply_dropout, - apply_averaging=apply_averaging, - use_locking=False, - center_bias=self._center_bias, - reduce_dim=self._reduce_dim)) - partition_ids = prediction_ops.gradient_trees_partition_examples( - local_ensemble_handle, - self._dense_floats, - self._sparse_float_indices, - self._sparse_float_values, - self._sparse_float_shapes, - self._sparse_int_indices, - self._sparse_int_values, - self._sparse_int_shapes, - use_locking=False) + apply_dropout, seed = _dropout_params(mode, ensemble_stats) + # Make sure ensemble stats run. This will check that the ensemble has + # the right stamp. + with ops.control_dependencies(ensemble_stats): + predictions, predictions_no_dropout, _ = ( + prediction_ops.gradient_trees_prediction( + local_ensemble_handle, + seed, + self._dense_floats, + self._sparse_float_indices, + self._sparse_float_values, + self._sparse_float_shapes, + self._sparse_int_indices, + self._sparse_int_values, + self._sparse_int_shapes, + learner_config=self._learner_config_serialized, + apply_dropout=apply_dropout, + apply_averaging=apply_averaging, + use_locking=True, + center_bias=self._center_bias, + reduce_dim=self._reduce_dim)) + partition_ids = prediction_ops.gradient_trees_partition_examples( + local_ensemble_handle, + self._dense_floats, + self._sparse_float_indices, + self._sparse_float_values, + self._sparse_float_shapes, + self._sparse_int_indices, + self._sparse_int_values, + self._sparse_int_shapes, + use_locking=True) else: with ops.device(self._ensemble_handle.device): ensemble_stats = training_ops.tree_ensemble_stats( self._ensemble_handle, ensemble_stamp) - apply_dropout, seed = _dropout_params(mode, ensemble_stats) # We don't need dropout info - we can always restore it based on the # seed. - predictions, predictions_no_dropout, _ = ( - prediction_ops.gradient_trees_prediction( - self._ensemble_handle, - seed, - self._dense_floats, - self._sparse_float_indices, - self._sparse_float_values, - self._sparse_float_shapes, - self._sparse_int_indices, - self._sparse_int_values, - self._sparse_int_shapes, - learner_config=self._learner_config_serialized, - apply_dropout=apply_dropout, - apply_averaging=apply_averaging, - use_locking=False, - center_bias=self._center_bias, - reduce_dim=self._reduce_dim)) - partition_ids = prediction_ops.gradient_trees_partition_examples( - self._ensemble_handle, - self._dense_floats, - self._sparse_float_indices, - self._sparse_float_values, - self._sparse_float_shapes, - self._sparse_int_indices, - self._sparse_int_values, - self._sparse_int_shapes, - use_locking=False) + apply_dropout, seed = _dropout_params(mode, ensemble_stats) + # Make sure ensemble stats run. This will check that the ensemble has + # the right stamp. 
+ with ops.control_dependencies(ensemble_stats): + predictions, predictions_no_dropout, _ = ( + prediction_ops.gradient_trees_prediction( + self._ensemble_handle, + seed, + self._dense_floats, + self._sparse_float_indices, + self._sparse_float_values, + self._sparse_float_shapes, + self._sparse_int_indices, + self._sparse_int_values, + self._sparse_int_shapes, + learner_config=self._learner_config_serialized, + apply_dropout=apply_dropout, + apply_averaging=apply_averaging, + use_locking=True, + center_bias=self._center_bias, + reduce_dim=self._reduce_dim)) + partition_ids = prediction_ops.gradient_trees_partition_examples( + self._ensemble_handle, + self._dense_floats, + self._sparse_float_indices, + self._sparse_float_values, + self._sparse_float_shapes, + self._sparse_int_indices, + self._sparse_int_values, + self._sparse_int_shapes, + use_locking=True) return _make_predictions_dict(ensemble_stamp, predictions, predictions_no_dropout, partition_ids, diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py index 9ce434edf8b..16e24d97dde 100644 --- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py +++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py @@ -164,7 +164,7 @@ class GbdtTest(test_util.TensorFlowTestCase): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - features=features) + logits_dimension=1, features=features) predictions = array_ops.constant( [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32) @@ -268,7 +268,7 @@ class GbdtTest(test_util.TensorFlowTestCase): ensemble_handle=ensemble_handle, examples_per_layer=num_examples_fn, learner_config=learner_config, - features=features) + logits_dimension=1, features=features) predictions = array_ops.constant( [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32) @@ -371,7 +371,7 @@ class GbdtTest(test_util.TensorFlowTestCase): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - features=features) + logits_dimension=1, features=features) predictions = array_ops.constant( [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32) @@ -442,7 +442,7 @@ class GbdtTest(test_util.TensorFlowTestCase): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - features=features) + logits_dimension=1, features=features) predictions = array_ops.constant( [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32) @@ -505,7 +505,7 @@ class GbdtTest(test_util.TensorFlowTestCase): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - features=features) + logits_dimension=1, features=features) predictions = array_ops.constant( [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32) @@ -588,7 +588,7 @@ class GbdtTest(test_util.TensorFlowTestCase): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - features=features) + logits_dimension=1, features=features) # Create predict op. 
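The new logits_dimension argument threaded through the tests above is what lets GradientBoostedDecisionTreeModel choose a multi-class strategy when the config leaves it unspecified. A compact paraphrase of the default-filling block added to gbdt_batch.py (a reference sketch collecting the scattered "+" lines, not a separate API):

    from tensorflow.contrib.boosted_trees.proto import learner_pb2

    def fill_learner_config_defaults(learner_config, logits_dimension):
      """Mirrors the defaults GradientBoostedDecisionTreeModel now fills in."""
      if (learner_config.multi_class_strategy ==
          learner_pb2.LearnerConfig.MULTI_CLASS_STRATEGY_UNSPECIFIED):
        # One logit keeps the classic tree-per-class behavior; wider logits
        # default to the diagonal-Hessian strategy.
        if logits_dimension == 1:
          learner_config.multi_class_strategy = (
              learner_pb2.LearnerConfig.TREE_PER_CLASS)
        else:
          learner_config.multi_class_strategy = (
              learner_pb2.LearnerConfig.DIAGONAL_HESSIAN)
      if (learner_config.growing_mode ==
          learner_pb2.LearnerConfig.GROWING_MODE_UNSPECIFIED):
        learner_config.growing_mode = learner_pb2.LearnerConfig.LAYER_BY_LAYER
      if (learner_config.pruning_mode ==
          learner_pb2.LearnerConfig.PRUNING_MODE_UNSPECIFIED):
        learner_config.pruning_mode = learner_pb2.LearnerConfig.POST_PRUNE
      if learner_config.constraints.max_tree_depth == 0:
        learner_config.constraints.max_tree_depth = 6
      if not learner_config.learning_rate_tuner.WhichOneof("tuner"):
        learner_config.learning_rate_tuner.fixed.learning_rate = 0.1
      return learner_config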
mode = model_fn.ModeKeys.EVAL @@ -627,7 +627,7 @@ class GbdtTest(test_util.TensorFlowTestCase): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - features=features) + logits_dimension=5, features=features) predictions = array_ops.constant( [[0.0, -1.0, 0.5, 1.2, 3.1], [1.0, 0.0, 0.8, 0.3, 1.0], @@ -730,7 +730,7 @@ class GbdtTest(test_util.TensorFlowTestCase): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - features=features) + logits_dimension=5, features=features) predictions = array_ops.constant( [[0.0, -1.0, 0.5, 1.2, 3.1], [1.0, 0.0, 0.8, 0.3, 1.0], @@ -833,7 +833,7 @@ class GbdtTest(test_util.TensorFlowTestCase): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - features=features) + logits_dimension=5, features=features) batch_size = 3 predictions = array_ops.constant( diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt index 422df3063ee..c249a285562 100644 --- a/tensorflow/contrib/cmake/CMakeLists.txt +++ b/tensorflow/contrib/cmake/CMakeLists.txt @@ -33,6 +33,7 @@ option(tensorflow_BUILD_MORE_PYTHON_TESTS "Build more python unit tests for cont option(tensorflow_BUILD_SHARED_LIB "Build TensorFlow as a shared library" OFF) option(tensorflow_OPTIMIZE_FOR_NATIVE_ARCH "Enable compiler optimizations for the native processor architecture (if available)" ON) option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions") +option(tensorflow_ENABLE_SNAPPY_SUPPORT "Enable SNAPPY compression support" ON) if (NOT WIN32) # Threads: defines CMAKE_THREAD_LIBS_INIT and adds -pthread compile option @@ -204,6 +205,12 @@ if(tensorflow_ENABLE_JEMALLOC_SUPPORT) list(APPEND tensorflow_EXTERNAL_DEPENDENCIES jemalloc) include_directories(${jemalloc_INCLUDE_DIRS}) endif() +if(tensorflow_ENABLE_SNAPPY_SUPPORT) + include(snappy) + list(APPEND tensorflow_EXTERNAL_LIBRARIES ${snappy_STATIC_LIBRARIES}) + list(APPEND tensorflow_EXTERNAL_DEPENDENCIES snappy) + include_directories(${snappy_INCLUDE_DIR}) +endif() if(WIN32) list(APPEND tensorflow_EXTERNAL_LIBRARIES wsock32 ws2_32 shlwapi) endif() diff --git a/tensorflow/contrib/cmake/external/boringssl.cmake b/tensorflow/contrib/cmake/external/boringssl.cmake index 04a9664701c..dc27eadaca1 100644 --- a/tensorflow/contrib/cmake/external/boringssl.cmake +++ b/tensorflow/contrib/cmake/external/boringssl.cmake @@ -17,7 +17,7 @@ include (ExternalProject) set(boringssl_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/boringssl/src/boringssl/include) #set(boringssl_EXTRA_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/boringssl/src) set(boringssl_URL https://boringssl.googlesource.com/boringssl) -set(boringssl_TAG 17cf2cb1d226b0ba2401304242df7ddd3b6f1ff2) +set(boringssl_TAG ee7aa02) set(boringssl_BUILD ${CMAKE_BINARY_DIR}/boringssl/src/boringssl-build) #set(boringssl_LIBRARIES ${boringssl_BUILD}/obj/so/libboringssl.so) set(boringssl_STATIC_LIBRARIES diff --git a/tensorflow/contrib/cmake/external/cub.cmake b/tensorflow/contrib/cmake/external/cub.cmake index 477572d5881..d98579d2077 100644 --- a/tensorflow/contrib/cmake/external/cub.cmake +++ b/tensorflow/contrib/cmake/external/cub.cmake @@ -14,8 +14,8 @@ # ============================================================================== include (ExternalProject) -set(cub_URL http://mirror.bazel.build/github.com/NVlabs/cub/archive/69ceda618313df8e9cac6659d607b08949455d14.tar.gz) -set(cub_HASH SHA256=87e856522c283b8ea887c3b61d7d5b252d2dd74abac4f1d756d776e721223e82) +set(cub_URL 
http://mirror.bazel.build/github.com/NVlabs/cub/archive/1.7.3.zip) +set(cub_HASH SHA256=b7ead9e291d34ffa8074243541c1380d63be63f88de23de8ee548db573b72ebe) set(cub_BUILD ${CMAKE_CURRENT_BINARY_DIR}/cub/src/cub) set(cub_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cub/src/cub) set(cub_ARCHIVE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/cub_archive) diff --git a/tensorflow/contrib/cmake/external/snappy.cmake b/tensorflow/contrib/cmake/external/snappy.cmake new file mode 100644 index 00000000000..a35d8654fb6 --- /dev/null +++ b/tensorflow/contrib/cmake/external/snappy.cmake @@ -0,0 +1,50 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +include (ExternalProject) + +set(snappy_URL https://github.com/google/snappy.git) +set(snappy_TAG "55924d11095df25ab25c405fadfe93d0a46f82eb") +set(snappy_BUILD ${CMAKE_CURRENT_BINARY_DIR}/snappy/src/snappy) +set(snappy_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/snappy/src/snappy) + +if(WIN32) + set(snappy_STATIC_LIBRARIES ${snappy_BUILD}/$(Configuration)/snappy.lib) +else() + set(snappy_STATIC_LIBRARIES ${snappy_BUILD}/libsnappy.a) +endif() + +set(snappy_HEADERS + "${snappy_INCLUDE_DIR}/snappy.h" +) + +ExternalProject_Add(snappy + PREFIX snappy + GIT_REPOSITORY ${snappy_URL} + GIT_TAG ${snappy_TAG} + DOWNLOAD_DIR "${DOWNLOAD_LOCATION}" + BUILD_IN_SOURCE 1 + INSTALL_COMMAND "" + LOG_DOWNLOAD ON + LOG_CONFIGURE ON + LOG_BUILD ON + CMAKE_CACHE_ARGS + -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF + -DSNAPPY_BUILD_TESTS:BOOL=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON +) + +# actually enables snappy in the source code +add_definitions(-DSNAPPY) \ No newline at end of file diff --git a/tensorflow/contrib/cmake/tf_c.cmake b/tensorflow/contrib/cmake/tf_c.cmake index 87d946c3462..c5a10181271 100644 --- a/tensorflow/contrib/cmake/tf_c.cmake +++ b/tensorflow/contrib/cmake/tf_c.cmake @@ -18,6 +18,7 @@ set(tf_c_srcs "${tensorflow_source_dir}/tensorflow/c/c_api.cc" "${tensorflow_source_dir}/tensorflow/c/c_api.h" + "${tensorflow_source_dir}/tensorflow/c/c_api_function.cc" "${tensorflow_source_dir}/tensorflow/c/eager/c_api.cc" "${tensorflow_source_dir}/tensorflow/c/eager/c_api.h" "${tensorflow_source_dir}/tensorflow/c/eager/runtime.cc" diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake index 48023099379..1b706159a3d 100755 --- a/tensorflow/contrib/cmake/tf_python.cmake +++ b/tensorflow/contrib/cmake/tf_python.cmake @@ -315,6 +315,7 @@ add_python_module("tensorflow/contrib/framework/ops") add_python_module("tensorflow/contrib/framework/python") add_python_module("tensorflow/contrib/framework/python/framework") add_python_module("tensorflow/contrib/framework/python/ops") +add_python_module("tensorflow/contrib/gan") add_python_module("tensorflow/contrib/graph_editor") add_python_module("tensorflow/contrib/graph_editor/examples") 
add_python_module("tensorflow/contrib/graph_editor/tests") diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake index 76531add312..eb02f20457e 100644 --- a/tensorflow/contrib/cmake/tf_tests.cmake +++ b/tensorflow/contrib/cmake/tf_tests.cmake @@ -240,6 +240,8 @@ if (tensorflow_BUILD_PYTHON_TESTS) "${tensorflow_source_dir}/tensorflow/python/training/quantize_training_test.py" # Needs quantization ops to be included in windows. "${tensorflow_source_dir}/tensorflow/python/training/supervisor_test.py" # Flaky I/O error on rename. "${tensorflow_source_dir}/tensorflow/python/training/sync_replicas_optimizer_test.py" # Needs portpicker. + "${tensorflow_source_dir}/tensorflow/python/training/server_lib_test.py" # Test occasionally deadlocks. + "${tensorflow_source_dir}/tensorflow/python/kernel_tests/array_ops_test.py" # depends on python/framework/test_ops # Broken tensorboard test due to cmake issues. "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py" @@ -291,6 +293,8 @@ if (tensorflow_BUILD_PYTHON_TESTS) # Failing with TF 1.3 (TODO) "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/estimator_test.py" "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_test.py" + # Test should only be run manually + "${tensorflow_source_dir}/tensorflow/python/kernel_tests/reduction_ops_test_big.py" ) endif() list(REMOVE_ITEM tf_test_src_py ${tf_test_src_py_exclude}) diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py index 694bd507d97..bc4fd10cac6 100644 --- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py +++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py @@ -716,6 +716,482 @@ _cudnn_rnn_common_doc_string = """ """ +def _check_direction(direction): + if direction not in (CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION): + raise ValueError("Invalid direction: %s, expect %s or %s" % + (direction, CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION)) + + +def _check_rnn_mode(rnn_mode): + if rnn_mode not in (CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_TANH, CUDNN_RNN_RELU): + raise ValueError("Invalid rnn_mode: %s, expect one of (%s, %s, %s, %s)" % + (rnn_mode, CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_TANH, + CUDNN_RNN_RELU)) + + +def _get_seed(seed): + seed, seed2 = random_seed.get_seed(seed) + if seed is None and seed2 is None: + seed, seed2 = 0, 0 + return seed, seed2 + + +def _get_num_params(rnn_mode, num_layers, direction): + """Return num params for given Cudnn config.""" + if rnn_mode == CUDNN_LSTM: + num_params_per_layer = 8 + elif rnn_mode == CUDNN_GRU: + num_params_per_layer = 6 + elif rnn_mode in (CUDNN_RNN_RELU, CUDNN_RNN_TANH): + num_params_per_layer = 2 + else: + raise ValueError("Invalid \'rnn_mode\': %s", rnn_mode) + num_params = num_layers * num_params_per_layer + if direction != CUDNN_RNN_UNIDIRECTION: + num_params *= 2 + return num_params + + +def _cudnn_rnn(inputs, + input_h, + input_c, + params, + is_training, + rnn_mode, + input_mode=CUDNN_INPUT_LINEAR_MODE, + direction=CUDNN_RNN_UNIDIRECTION, + dropout=0., + seed=0, + name=None): + """Cudnn RNN. + + Args: + inputs: the input sequence to the RNN model. A Tensor of shape [?, + batch_size, input_size]. + input_h: the initial hidden state for h. A Tensor of shape [num_layers, + batch_size, num_units]. + input_c: the initial hidden state for c. This is only relevant for LSTM. 
+ A Tensor of the same shape as input_h. + params: the parameter buffer created for this model. + is_training: whether this operation will be used in training or inference + rnn_mode: one of ('lstm', 'gru', 'rnn_relu', 'rnn_tanh'). + input_mode: indicate whether there is a linear projection between the + input and the actual computation before the first layer. It could be + 'linear_input', 'skip_input' or 'auto_select'. + 'linear_input' (default) always applies a linear projection of input + onto RNN hidden state. (standard RNN behavior). + 'skip_input' is only allowed when input_size == num_units; + 'auto_select' implies 'skip_input' when input_size == num_units; + otherwise, it implies 'linear_input'. + direction: the direction model that the model operates. Could be either + 'unidirectional' or 'bidirectional' + dropout: whether to enable dropout. With it is 0, dropout is disabled. + seed: the op seed used for initializing dropout. See @{tf.set_random_seed} + for behavior. + name: name of the operation. + Returns: + outputs, output_h, output_c + """ + _check_rnn_mode(rnn_mode) + _check_direction(direction) + seed, seed2 = random_seed.get_seed(seed) + outputs, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn( + input=inputs, + input_h=input_h, + input_c=input_c, + params=params, + is_training=is_training, + rnn_mode=rnn_mode, + input_mode=input_mode, + direction=direction, + dropout=dropout, + seed=seed, + seed2=seed2, + name=name) + return (outputs, output_h, output_c) + + +def cudnn_lstm(inputs, + input_h, + input_c, + params, + is_training, + input_mode=CUDNN_INPUT_LINEAR_MODE, + direction=CUDNN_RNN_UNIDIRECTION, + dropout=0., + seed=0, + name=None): + """Cudnn LSTM. + + Args: + inputs: the input sequence to the RNN model. A Tensor of shape [?, + batch_size, input_size]. + input_h: the initial hidden state for h. A Tensor of shape [num_layers, + batch_size, num_units]. + input_c: the initial hidden state for c. This is only relevant for LSTM. + A Tensor of the same shape as input_h. + params: the parameter buffer created for this model. + is_training: whether this operation will be used in training or inference + input_mode: indicate whether there is a linear projection between the + input and the actual computation before the first layer. It could be + 'linear_input', 'skip_input' or 'auto_select'. + 'linear_input' (default) always applies a linear projection of input + onto RNN hidden state. (standard RNN behavior). + 'skip_input' is only allowed when input_size == num_units; + 'auto_select' implies 'skip_input' when input_size == num_units; + otherwise, it implies 'linear_input'. + direction: the direction model that the model operates. Could be either + 'unidirectional' or 'bidirectional' + dropout: whether to enable dropout. With it is 0, dropout is disabled. + seed: the op seed used for initializing dropout. See @{tf.set_random_seed} + for behavior. + name: name of the operation. + Returns: + outputs, output_h, output_c + """ + return _cudnn_rnn(inputs, input_h, input_c, params, is_training, CUDNN_LSTM, + input_mode, direction, dropout, seed, name) + + +def _cudnn_rnn_no_input_c(inputs, + input_h, + params, + is_training, + rnn_mode, + input_mode=CUDNN_INPUT_LINEAR_MODE, + direction=CUDNN_RNN_UNIDIRECTION, + dropout=0., + seed=0, + name=None): + """Cudnn RNN w/o input_c. + + Args: + inputs: the input sequence to the RNN model. A Tensor of shape [?, + batch_size, input_size]. + input_h: the initial hidden state for h. A Tensor of shape [num_layers, + batch_size, num_units]. 
+ params: the parameter buffer created for this model. + is_training: whether this operation will be used in training or inference + rnn_mode: one of ('lstm', 'gru', 'rnn_relu', 'rnn_tanh'). + input_mode: indicate whether there is a linear projection between the + input and the actual computation before the first layer. It could be + 'linear_input', 'skip_input' or 'auto_select'. + 'linear_input' (default) always applies a linear projection of input + onto RNN hidden state. (standard RNN behavior). + 'skip_input' is only allowed when input_size == num_units; + 'auto_select' implies 'skip_input' when input_size == num_units; + otherwise, it implies 'linear_input'. + direction: the direction model that the model operates. Could be either + 'unidirectional' or 'bidirectional' + dropout: whether to enable dropout. With it is 0, dropout is disabled. + seed: the op seed used for initializing dropout. See @{tf.set_random_seed} + for behavior. + name: name of the operation. + Returns: + outputs, output_h + """ + input_c = array_ops.constant([], dtype=input_h.dtype) + outputs, output_h, _ = _cudnn_rnn(inputs, input_h, input_c, params, + is_training, rnn_mode, input_mode, + direction, dropout, seed, name) + return outputs, output_h + + +def cudnn_gru(inputs, + input_h, + params, + is_training, + input_mode=CUDNN_INPUT_LINEAR_MODE, + direction=CUDNN_RNN_UNIDIRECTION, + dropout=0., + seed=0, + name=None): + """Cudnn GRU. + + Args: + inputs: the input sequence to the RNN model. A Tensor of shape [?, + batch_size, input_size]. + input_h: the initial hidden state for h. A Tensor of shape [num_layers, + batch_size, num_units]. + params: the parameter buffer created for this model. + is_training: whether this operation will be used in training or inference + input_mode: indicate whether there is a linear projection between the + input and the actual computation before the first layer. It could be + 'linear_input', 'skip_input' or 'auto_select'. + 'linear_input' (default) always applies a linear projection of input + onto RNN hidden state. (standard RNN behavior). + 'skip_input' is only allowed when input_size == num_units; + 'auto_select' implies 'skip_input' when input_size == num_units; + otherwise, it implies 'linear_input'. + direction: the direction model that the model operates. Could be either + 'unidirectional' or 'bidirectional' + dropout: whether to enable dropout. With it is 0, dropout is disabled. + seed: the op seed used for initializing dropout. See @{tf.set_random_seed} + for behavior. + name: name of the operation. + Returns: + outputs, output_h + """ + return _cudnn_rnn_no_input_c(inputs, input_h, params, is_training, CUDNN_GRU, + input_mode, direction, dropout, seed, name) + + +def cudnn_rnn_relu(inputs, + input_h, + params, + is_training, + input_mode=CUDNN_INPUT_LINEAR_MODE, + direction=CUDNN_RNN_UNIDIRECTION, + dropout=0., + seed=0, + name=None): + """Cudnn RNN Relu. + + Args: + inputs: the input sequence to the RNN model. A Tensor of shape [?, + batch_size, input_size]. + input_h: the initial hidden state for h. A Tensor of shape [num_layers, + batch_size, num_units]. + params: the parameter buffer created for this model. + is_training: whether this operation will be used in training or inference + input_mode: indicate whether there is a linear projection between the + input and the actual computation before the first layer. It could be + 'linear_input', 'skip_input' or 'auto_select'. + 'linear_input' (default) always applies a linear projection of input + onto RNN hidden state. 
+      'skip_input' is only allowed when input_size == num_units;
+      'auto_select' implies 'skip_input' when input_size == num_units;
+      otherwise, it implies 'linear_input'.
+    direction: the direction in which the model operates. Can be either
+      'unidirectional' or 'bidirectional'.
+    dropout: the dropout probability. When it is 0, dropout is disabled.
+    seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+      for behavior.
+    name: name of the operation.
+  Returns:
+    outputs, output_h
+  """
+  return _cudnn_rnn_no_input_c(inputs, input_h, params, is_training,
+                               CUDNN_RNN_RELU, input_mode, direction, dropout,
+                               seed, name)
+
+
+def cudnn_rnn_tanh(inputs,
+                   input_h,
+                   params,
+                   is_training,
+                   input_mode=CUDNN_INPUT_LINEAR_MODE,
+                   direction=CUDNN_RNN_UNIDIRECTION,
+                   dropout=0.,
+                   seed=0,
+                   name=None):
+  """Cudnn RNN Tanh.
+
+  Args:
+    inputs: the input sequence to the RNN model. A Tensor of shape [?,
+      batch_size, input_size].
+    input_h: the initial hidden state for h. A Tensor of shape [num_layers,
+      batch_size, num_units].
+    params: the parameter buffer created for this model.
+    is_training: whether this operation will be used in training or inference.
+    input_mode: indicates whether there is a linear projection between the
+      input and the actual computation before the first layer. It can be
+      'linear_input', 'skip_input' or 'auto_select'.
+      'linear_input' (default) always applies a linear projection of the input
+      onto the RNN hidden state (standard RNN behavior).
+      'skip_input' is only allowed when input_size == num_units;
+      'auto_select' implies 'skip_input' when input_size == num_units;
+      otherwise, it implies 'linear_input'.
+    direction: the direction in which the model operates. Can be either
+      'unidirectional' or 'bidirectional'.
+    dropout: the dropout probability. When it is 0, dropout is disabled.
+    seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+      for behavior.
+    name: name of the operation.
+  Returns:
+    outputs, output_h
+  """
+  return _cudnn_rnn_no_input_c(inputs, input_h, params, is_training,
+                               CUDNN_RNN_TANH, input_mode, direction, dropout,
+                               seed, name)
+
+
+def cudnn_rnn_params_to_canonical(rnn_mode,
+                                  num_layers,
+                                  num_units,
+                                  input_size,
+                                  params,
+                                  input_mode=CUDNN_INPUT_LINEAR_MODE,
+                                  direction=CUDNN_RNN_UNIDIRECTION,
+                                  dropout=0,
+                                  seed=0,
+                                  name=None):
+  """Convert cudnn opaque params to canonical.
+
+  Args:
+    rnn_mode: a string that specifies the mode under which this RNN model runs.
+      Can be either 'lstm', 'gru', 'rnn_tanh' or 'rnn_relu'.
+    num_layers: the number of layers for the RNN model.
+    num_units: the number of units within the RNN model.
+    input_size: the size of the input; it can be different from num_units.
+    params: opaque cudnn params var.
+    input_mode: indicates whether there is a linear projection between the
+      input and the actual computation before the first layer. It can be
+      'linear_input', 'skip_input' or 'auto_select'.
+      'linear_input' (default) always applies a linear projection of the input
+      onto the RNN hidden state (standard RNN behavior).
+      'skip_input' is only allowed when input_size == num_units;
+      'auto_select' implies 'skip_input' when input_size == num_units;
+      otherwise, it implies 'linear_input'.
+    direction: the direction in which the model operates. Can be either
+      'unidirectional' or 'bidirectional'.
+    dropout: the dropout probability. When it is 0, dropout is disabled.
+    seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+      for behavior.
+    name: name of the operation.
+  Returns:
+    weights list and bias list
+  Raises:
+    ValueError: if rnn_mode or direction is invalid.
+  """
+
+  _check_rnn_mode(rnn_mode)
+  _check_direction(direction)
+  num_params = _get_num_params(rnn_mode, num_layers, direction)
+  seed, seed2 = random_seed.get_seed(seed)
+  weights, biases = gen_cudnn_rnn_ops.cudnn_rnn_params_to_canonical(
+      rnn_mode=rnn_mode,
+      num_layers=num_layers,
+      num_units=num_units,
+      input_size=input_size,
+      params=params,
+      input_mode=input_mode,
+      direction=direction,
+      dropout=dropout,
+      seed=seed,
+      seed2=seed2,
+      num_params=num_params,
+      name=name)
+  return weights, biases
+
+
+def cudnn_rnn_canonical_to_params(rnn_mode,
+                                  num_layers,
+                                  num_units,
+                                  input_size,
+                                  weights,
+                                  biases,
+                                  input_mode=CUDNN_INPUT_LINEAR_MODE,
+                                  direction=CUDNN_RNN_UNIDIRECTION,
+                                  dropout=0,
+                                  seed=0,
+                                  name=None):
+  """Converts params from the canonical format to a specific format of cuDNN.
+
+  Args:
+    rnn_mode: a string that specifies the mode under which this RNN model runs.
+      Can be either 'lstm', 'gru', 'rnn_tanh' or 'rnn_relu'.
+    num_layers: the number of layers for the RNN model.
+    num_units: the number of units within the RNN model.
+    input_size: the size of the input; it can be different from num_units.
+    weights: a Tensor for weight parameters.
+    biases: a Tensor for bias parameters.
+    input_mode: indicates whether there is a linear projection between the
+      input and the actual computation before the first layer. It can be
+      'linear_input', 'skip_input' or 'auto_select'.
+      'linear_input' (default) always applies a linear projection of the input
+      onto the RNN hidden state (standard RNN behavior).
+      'skip_input' is only allowed when input_size == num_units;
+      'auto_select' implies 'skip_input' when input_size == num_units;
+      otherwise, it implies 'linear_input'.
+    direction: the direction in which the model operates. Can be either
+      'unidirectional' or 'bidirectional'.
+    dropout: the dropout probability. When it is 0, dropout is disabled.
+    seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+      for behavior.
+    name: name of the operation.
+  Returns:
+    an opaque Cudnn param.
+  Raises:
+    ValueError: if rnn_mode or direction is invalid.
+  """
+  _check_rnn_mode(rnn_mode)
+  _check_direction(direction)
+  seed, seed2 = random_seed.get_seed(seed)
+  return gen_cudnn_rnn_ops.cudnn_rnn_canonical_to_params(
+      rnn_mode=rnn_mode,
+      num_layers=num_layers,
+      num_units=num_units,
+      input_size=input_size,
+      weights=weights,
+      biases=biases,
+      input_mode=input_mode,
+      direction=direction,
+      dropout=dropout,
+      seed=seed,
+      seed2=seed2,
+      name=name)
+
+
+def cudnn_opaque_params_size(rnn_mode,
+                             num_layers,
+                             num_units,
+                             input_size,
+                             input_mode=CUDNN_INPUT_LINEAR_MODE,
+                             direction=CUDNN_RNN_UNIDIRECTION,
+                             dtype=dtypes.float32,
+                             dropout=0,
+                             seed=0,
+                             name=None):
+  """Returns opaque params size for specific Cudnn config.
+
+  Args:
+    rnn_mode: a string that specifies the mode under which this RNN model runs.
+      Can be either 'lstm', 'gru', 'rnn_tanh' or 'rnn_relu'.
+    num_layers: the number of layers for the RNN model.
+    num_units: the number of units within the RNN model.
+    input_size: the size of the input; it can be different from num_units.
+    input_mode: indicates whether there is a linear projection between the
+      input and the actual computation before the first layer. It can be
+      'linear_input', 'skip_input' or 'auto_select'.
+      'linear_input' (default) always applies a linear projection of the input
+      onto the RNN hidden state (standard RNN behavior).
+      'skip_input' is only allowed when input_size == num_units;
+      'auto_select' implies 'skip_input' when input_size == num_units;
+      otherwise, it implies 'linear_input'.
+    direction: the direction in which the model operates. Can be either
+      'unidirectional' or 'bidirectional'.
+    dtype: one of tf.float32 or tf.float64.
+    dropout: the dropout probability. When it is 0, dropout is disabled.
+    seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+      for behavior.
+    name: name of the operation.
+  Returns:
+    an int, the size of the Cudnn opaque params.
+  Raises:
+    ValueError: if rnn_mode or direction is invalid.
+  """
+  _check_rnn_mode(rnn_mode)
+  _check_direction(direction)
+  seed, seed2 = random_seed.get_seed(seed)
+  return gen_cudnn_rnn_ops.cudnn_rnn_params_size(
+      rnn_mode=rnn_mode,
+      num_layers=num_layers,
+      num_units=num_units,
+      input_size=input_size,
+      T=dtype,
+      S=dtypes.int32,
+      dropout=dropout,
+      seed=seed,
+      seed2=seed2,
+      input_mode=input_mode,
+      direction=direction,
+      name=name)[0]
+
+
 class _CudnnRNN(object):
   """Creates an RNN model using the underlying Cudnn implementation.
@@ -761,9 +1237,6 @@ class _CudnnRNN(object):
     Raises:
       ValueError: if direction is invalid.
     """
-    if direction not in (CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION):
-      raise ValueError("Invalid direction: %s, expect %s or %s",
-                       direction, CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION)
     self._num_layers = num_layers
     self._num_units = num_units
     self._input_size = input_size
@@ -772,10 +1245,7 @@ class _CudnnRNN(object):
     self._direction = direction
     self._dtype = dtype
     self._dropout = dropout
-    # get graph and op seed.
-    self._seed, self._seed2 = random_seed.get_seed(seed)
-    if self._seed is None and self._seed2 is None:
-      self._seed, self._seed2 = 0, 0
+    self._seed = seed
 
   @property
   def input_mode(self):
@@ -807,18 +1277,16 @@ class _CudnnRNN(object):
     Returns:
       The calculated parameter buffer size.
     """
-    return gen_cudnn_rnn_ops.cudnn_rnn_params_size(
+    return cudnn_opaque_params_size(
+        rnn_mode=self._rnn_mode,
         num_layers=self._num_layers,
         num_units=self._num_units,
         input_size=self._input_size,
-        T=self._dtype,
-        S=dtypes.int32,
+        dtype=self._dtype,
         dropout=self._dropout,
         seed=self._seed,
-        seed2=self._seed2,
-        rnn_mode=self._rnn_mode,
         input_mode=self._input_mode,
-        direction=self._direction)[0]
+        direction=self._direction)
 
   def __call__(self, input_data, input_h, input_c, params, is_training=True):
     """Runs the forward step for the RNN model.
@@ -837,22 +1305,17 @@ class _CudnnRNN(object):
       output_h: the final state for h.
       output_c: the final state for c. This is only relevant for LSTM.
     """
-    if self._rnn_mode != CUDNN_LSTM:
-      # For model that doesn't take input_c, replace with a dummy tensor.
-      input_c = array_ops.constant([], dtype=self._dtype)
-    output, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn(
-        input=input_data,
-        input_h=input_h,
-        input_c=input_c,
-        params=params,
-        rnn_mode=self._rnn_mode,
+    return _cudnn_rnn(
+        input_data,
+        input_h,
+        input_c,
+        params,
+        is_training,
+        self._rnn_mode,
         input_mode=self._input_mode,
         direction=self._direction,
         dropout=self._dropout,
-        seed=self._seed,
-        seed2=self._seed2,
-        is_training=is_training)
-    return (output, output_h, output_c)
+        seed=self._seed)
 
   def params_to_canonical(self, params):
     """Converts params from a specific format of cuDNN to the canonical format.
@@ -863,22 +1326,16 @@ class _CudnnRNN(object):
 
     Returns:
       A function for the specific-to-canonical conversion.
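+
+    For example (a minimal sketch; `model` is an illustrative `CudnnLSTM`
+    instance and `params` an opaque parameter buffer of size
+    `model.params_size()`):
+
+      weights, biases = model.params_to_canonical(params)
+      params_restored = model.canonical_to_params(weights, biases)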
""" - num_params = self._num_layers * self._NUM_PARAMS_PER_LAYER - if self._direction != CUDNN_RNN_UNIDIRECTION: - num_params *= 2 - weights, biases = gen_cudnn_rnn_ops.cudnn_rnn_params_to_canonical( + return cudnn_rnn_params_to_canonical( + rnn_mode=self._rnn_mode, num_layers=self._num_layers, num_units=self._num_units, input_size=self._input_size, params=params, - dropout=self._dropout, - seed=self._seed, - seed2=self._seed2, - num_params=num_params, - rnn_mode=self._rnn_mode, input_mode=self._input_mode, - direction=self._direction) - return weights, biases + direction=self._direction, + dropout=self._dropout, + seed=self._seed) def canonical_to_params(self, weights, biases): """Converts params from the canonical format to a specific format of cuDNN. @@ -890,18 +1347,17 @@ class _CudnnRNN(object): Returns: A function for the canonical-to-params-to-specific conversion.. """ - return gen_cudnn_rnn_ops.cudnn_rnn_canonical_to_params( + return cudnn_rnn_canonical_to_params( + rnn_mode=self._rnn_mode, num_layers=self._num_layers, num_units=self._num_units, input_size=self._input_size, weights=weights, biases=biases, - dropout=self._dropout, - seed=self._seed, - seed2=self._seed2, - rnn_mode=self._rnn_mode, input_mode=self._input_mode, - direction=self._direction) + direction=self._direction, + dropout=self._dropout, + seed=self._seed) class CudnnLSTM(_CudnnRNN): @@ -1036,9 +1492,16 @@ class _CudnnRNNNoInputC(_CudnnRNN): output: the output sequuence. output_h: the final state for h. """ - output, output_h, _ = super(_CudnnRNNNoInputC, self).__call__( - input_data, input_h, None, params, is_training=is_training) - return (output, output_h) + return _cudnn_rnn_no_input_c( + input_data, + input_h, + params, + is_training, + self._rnn_mode, + input_mode=self._input_mode, + direction=self._direction, + dropout=self._dropout, + seed=self._seed) class CudnnGRU(_CudnnRNNNoInputC): diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py index 5308ab64ace..1c0a5288f7e 100644 --- a/tensorflow/contrib/data/__init__.py +++ b/tensorflow/contrib/data/__init__.py @@ -22,6 +22,7 @@ @@read_batch_features @@rejection_resample +@@group_by_window """ from __future__ import absolute_import @@ -31,6 +32,7 @@ from __future__ import print_function # pylint: disable=unused-import from tensorflow.contrib.data.python.ops.dataset_ops import Dataset from tensorflow.contrib.data.python.ops.dataset_ops import FixedLengthRecordDataset +from tensorflow.contrib.data.python.ops.dataset_ops import group_by_window from tensorflow.contrib.data.python.ops.dataset_ops import Iterator from tensorflow.contrib.data.python.ops.dataset_ops import read_batch_features from tensorflow.contrib.data.python.ops.dataset_ops import rejection_resample diff --git a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py index 71df1ee0a50..0111aae1035 100644 --- a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py @@ -37,7 +37,9 @@ class GroupByWindowTest(test.TestCase): components = np.random.randint(100, size=(200,)).astype(np.int64) iterator = dataset_ops.Iterator.from_dataset( dataset_ops.Dataset.from_tensor_slices(components).map(lambda x: x * x) - .group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4), 4)) + .apply( + dataset_ops.group_by_window, + args=(lambda x: x % 2, lambda _, xs: xs.batch(4), 4))) init_op = iterator.initializer get_next = 
iterator.get_next() @@ -61,8 +63,9 @@ class GroupByWindowTest(test.TestCase): components = np.array( [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 2, 2, 0, 0], dtype=np.int64) iterator = dataset_ops.Iterator.from_dataset( - dataset_ops.Dataset.from_tensor_slices(components).repeat(-1) - .group_by_window(lambda x: x % 3, lambda _, xs: xs.batch(4), 4)) + dataset_ops.Dataset.from_tensor_slices(components).repeat(-1).apply( + dataset_ops.group_by_window, + args=(lambda x: x % 3, lambda _, xs: xs.batch(4), 4))) init_op = iterator.initializer get_next = iterator.get_next() @@ -81,8 +84,9 @@ class GroupByWindowTest(test.TestCase): def testSmallGroups(self): components = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0], dtype=np.int64) iterator = dataset_ops.Iterator.from_dataset( - dataset_ops.Dataset.from_tensor_slices(components) - .group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4), 4)) + dataset_ops.Dataset.from_tensor_slices(components).apply( + dataset_ops.group_by_window, + args=(lambda x: x % 2, lambda _, xs: xs.batch(4), 4))) init_op = iterator.initializer get_next = iterator.get_next() @@ -108,8 +112,9 @@ class GroupByWindowTest(test.TestCase): iterator = dataset_ops.Iterator.from_dataset( dataset_ops.Dataset.from_tensor_slices(components) - .map(lambda x: (x, ops.convert_to_tensor([x * x]))) - .group_by_window(lambda x, _: x % 2, reduce_func, 32)) + .map(lambda x: (x, ops.convert_to_tensor([x * x]))).apply( + dataset_ops.group_by_window, + args=(lambda x, _: x % 2, reduce_func, 32))) init_op = iterator.initializer get_next = iterator.get_next() @@ -124,17 +129,20 @@ class GroupByWindowTest(test.TestCase): def reduce_func(key, window): # Apply two different kinds of padding to the input: tight # padding, and quantized (to a multiple of 10) padding. - return dataset_ops.Dataset.zip((window.padded_batch( - 4, - padded_shapes=tensor_shape.TensorShape([None])), window.padded_batch( + return dataset_ops.Dataset.zip(( + window.padded_batch( + 4, padded_shapes=tensor_shape.TensorShape([None])), + window.padded_batch( 4, padded_shapes=ops.convert_to_tensor([(key + 1) * 10])),)) iterator = dataset_ops.Iterator.from_dataset( dataset_ops.Dataset.from_tensor_slices(components) .map(lambda x: array_ops.fill([math_ops.cast(x, dtypes.int32)], x)) - .group_by_window( - lambda x: math_ops.cast(array_ops.shape(x)[0] // 10, dtypes.int64), - reduce_func, 4)) + .apply( + dataset_ops.group_by_window, + args= + (lambda x: math_ops.cast(array_ops.shape(x)[0] // 10, dtypes.int64), + reduce_func, 4))) init_op = iterator.initializer get_next = iterator.get_next() @@ -151,10 +159,9 @@ class GroupByWindowTest(test.TestCase): self.assertEqual(len(components), sum(counts)) -# NOTE(mrry): These tests are based on the tests in -# bucket_ops_test.py. Currently, different batch sizes for each key -# are not supported, although this would be possible to add to -# `Dataset.group_by_window()`. +# NOTE(mrry): These tests are based on the tests in bucket_ops_test.py. +# Currently, they use a constant batch size, though should be made to use a +# different batch size per key. 
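+#
+# For example, the windowed "group-by" used throughout this file now reads
+# (a minimal sketch; `components` is an illustrative NumPy array):
+#
+#   dataset = dataset_ops.Dataset.from_tensor_slices(components).apply(
+#       dataset_ops.group_by_window,
+#       args=(lambda x: x % 2, lambda _, xs: xs.batch(4), 4))
+#
+# A per-key window size can be used instead of the constant `4` by passing
+# `None` for `window_size` and a `window_size_func` as a fourth element of
+# `args` (see `testDynamicWindowSize` below).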
class BucketTest(test.TestCase): def _dynamicPad(self, bucket, window, window_size): @@ -168,6 +175,7 @@ class BucketTest(test.TestCase): tensor_shape.TensorShape([3]))))) def testSingleBucket(self): + def _map_fn(v): return (v, array_ops.fill([v], v), array_ops.fill([3], string_ops.as_string(v))) @@ -175,9 +183,10 @@ class BucketTest(test.TestCase): input_dataset = ( dataset_ops.Dataset.from_tensor_slices(math_ops.range(32)).map(_map_fn)) - bucketed_dataset = input_dataset.group_by_window( - lambda x, y, z: 0, lambda k, bucket: self._dynamicPad(k, bucket, 32), - 32) + bucketed_dataset = input_dataset.apply( + dataset_ops.group_by_window, + args=(lambda x, y, z: 0, + lambda k, bucket: self._dynamicPad(k, bucket, 32), 32)) iterator = dataset_ops.Iterator.from_dataset(bucketed_dataset) init_op = iterator.initializer @@ -201,6 +210,7 @@ class BucketTest(test.TestCase): self.assertAllEqual(expected_vec3_str, bucketed_values[2]) def testEvenOddBuckets(self): + def _map_fn(v): return (v, array_ops.fill([v], v), array_ops.fill([3], string_ops.as_string(v))) @@ -208,9 +218,10 @@ class BucketTest(test.TestCase): input_dataset = ( dataset_ops.Dataset.from_tensor_slices(math_ops.range(64)).map(_map_fn)) - bucketed_dataset = input_dataset.group_by_window( - lambda x, y, z: math_ops.cast(x % 2, dtypes.int64), - lambda k, bucket: self._dynamicPad(k, bucket, 32), 32) + bucketed_dataset = input_dataset.apply( + dataset_ops.group_by_window, + args=(lambda x, y, z: math_ops.cast(x % 2, dtypes.int64), + lambda k, bucket: self._dynamicPad(k, bucket, 32), 32)) iterator = dataset_ops.Iterator.from_dataset(bucketed_dataset) init_op = iterator.initializer @@ -256,25 +267,31 @@ class BucketTest(test.TestCase): self.assertAllEqual(expected_vec3_str, bucketed_values_odd[2]) def testEvenOddBucketsFilterOutAllOdd(self): + def _map_fn(v): - return {"x": v, - "y": array_ops.fill([v], v), - "z": array_ops.fill([3], string_ops.as_string(v))} + return { + "x": v, + "y": array_ops.fill([v], v), + "z": array_ops.fill([3], string_ops.as_string(v)) + } def _dynamic_pad_fn(bucket, window, _): return dataset_ops.Dataset.zip( (dataset_ops.Dataset.from_tensors(bucket), window.padded_batch( - 32, {"x": tensor_shape.TensorShape([]), - "y": tensor_shape.TensorShape([None]), - "z": tensor_shape.TensorShape([3])}))) + 32, { + "x": tensor_shape.TensorShape([]), + "y": tensor_shape.TensorShape([None]), + "z": tensor_shape.TensorShape([3]) + }))) input_dataset = ( dataset_ops.Dataset.from_tensor_slices(math_ops.range(128)).map(_map_fn) .filter(lambda d: math_ops.equal(d["x"] % 2, 0))) - bucketed_dataset = input_dataset.group_by_window( - lambda d: math_ops.cast(d["x"] % 2, dtypes.int64), - lambda k, bucket: _dynamic_pad_fn(k, bucket, 32), 32) + bucketed_dataset = input_dataset.apply( + dataset_ops.group_by_window, + args=(lambda d: math_ops.cast(d["x"] % 2, dtypes.int64), + lambda k, bucket: _dynamic_pad_fn(k, bucket, 32), 32)) iterator = dataset_ops.Iterator.from_dataset(bucketed_dataset) init_op = iterator.initializer @@ -295,6 +312,40 @@ class BucketTest(test.TestCase): self.assertAllEqual( np.arange(64, 128, 2, dtype=np.int64), bucketed_values_even1["x"]) + def testDynamicWindowSize(self): + components = np.arange(100).astype(np.int64) + + # Key fn: even/odd + # Reduce fn: batches of 5 + # Window size fn: even=5, odd=10 + + def window_size_func(key): + window_sizes = constant_op.constant([5, 10], dtype=dtypes.int64) + return window_sizes[key] + + dataset = dataset_ops.Dataset.from_tensor_slices(components).apply( + 
dataset_ops.group_by_window, + args=(lambda x: x % 2, lambda _, xs: xs.batch(20), None, + window_size_func)) + iterator = dataset_ops.Iterator.from_dataset(dataset) + init_op = iterator.initializer + get_next = iterator.get_next() + + with self.test_session() as sess: + sess.run(init_op) + with self.assertRaises(errors.OutOfRangeError): + batches = 0 + while True: + result = sess.run(get_next) + is_even = all(x % 2 == 0 for x in result) + is_odd = all(x % 2 == 1 for x in result) + self.assertTrue(is_even or is_odd) + expected_batch_size = 5 if is_even else 10 + self.assertEqual(expected_batch_size, result.shape[0]) + batches += 1 + + self.assertEqual(batches, 15) + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py index 97b4ec44fc1..d05fbb7d285 100644 --- a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py @@ -19,6 +19,7 @@ from __future__ import print_function import os import threading +from collections import namedtuple import numpy as np @@ -481,6 +482,40 @@ class MapDatasetTest(test.TestCase): with self.assertRaises(errors.OutOfRangeError): sess.run(get_next) + def testMapNamedtuple(self, count=10): + # construct dataset of tuples + labels = dataset_ops.Dataset.range(count) + images = labels.map(lambda l: -l) + dataset_tuple = dataset_ops.Dataset.zip((labels, images)) + + # convert dataset of tuples to dataset of namedtuples + Example = namedtuple("Example", ["label", "image"]) + dataset_namedtuple = dataset_tuple.map(Example) + + def preprocess_tuple(label, image): + image = 2 * image + return label, image + + def preprocess_namedtuple(example): + return example._replace(image=2 * example.image) + + # preprocess both datasets + dataset_tuple = dataset_tuple.map(preprocess_tuple) + dataset_namedtuple = dataset_namedtuple.map(preprocess_namedtuple) + + next_tuple = dataset_tuple.make_one_shot_iterator().get_next() + next_namedtuple = dataset_namedtuple.make_one_shot_iterator().get_next() + + # make sure both datasets contain the same data + with self.test_session() as sess: + for i in range(count): + tuple_, namedtuple_ = sess.run([next_tuple, next_namedtuple]) + self.assertEqual(tuple_, namedtuple_) + self.assertEqual(tuple_, (i, -2 * i)) + + with self.assertRaises(errors.OutOfRangeError): + sess.run(next_namedtuple) + def testUseStepContainerInMap(self): row = np.arange(6) iterator = ( diff --git a/tensorflow/contrib/data/python/ops/dataset_ops.py b/tensorflow/contrib/data/python/ops/dataset_ops.py index 46af2b19494..0ee9acfc97f 100644 --- a/tensorflow/contrib/data/python/ops/dataset_ops.py +++ b/tensorflow/contrib/data/python/ops/dataset_ops.py @@ -1199,28 +1199,9 @@ class Dataset(object): return DenseToSparseBatchDataset(self, batch_size, row_shape) def group_by_window(self, key_func, reduce_func, window_size): - """Performs a windowed "group-by" operation on this dataset. - - This method maps each consecutive element in this dataset to a key - using `key_func` and groups the elements by key. It then applies - `reduce_func` to at most `window_size` elements matching the same - key. All execpt the final window for each key will contain - `window_size` elements; the final window may be smaller. 
- - Args: - key_func: A function mapping a nested structure of tensors - (having shapes and types defined by `self.output_shapes` and - `self.output_types`) to a scalar `tf.int64` tensor. - reduce_func: A function mapping a key and a dataset of up to `batch_size` - consecutive elements matching that key to another dataset. - window_size: A `tf.int64` scalar `tf.Tensor`, representing the number of - consecutive elements matching the same key to combine in a single - batch, which will be passed to `reduce_func`. - - Returns: - A `Dataset`. - """ - return GroupByWindowDataset(self, key_func, reduce_func, window_size) + """See group_by_window().""" + return self.apply( + group_by_window, args=(key_func, reduce_func, window_size)) def map(self, map_func, @@ -1370,6 +1351,43 @@ class Dataset(object): """ return FilterDataset(self, predicate) + def apply(self, fn, args=(), kwargs={}): # pylint: disable=dangerous-default-value + """Apply a function to this dataset. + + `apply` enables chaining of custom `Dataset` transformations. + + For example: + + ``` + dataset.map( + lambda x: x**2 + ).apply( + group_by_window, args=(key_func, reduce_func, window_size) + ).map( + lambda x: x**3 + ) + ``` + + Args: + fn: A function that takes a `Dataset`, `args`, and `kwargs`, and + returns a `Dataset`. + args: A `tuple` or `list` of arguments to be passed to `fn`. + kwargs: A `dict` of keyword arguments to be passed to `fn`. + + Returns: + The `Dataset` returned by `fn`. + """ + if not (isinstance(args, tuple) or isinstance(args, list)): + raise TypeError("args must be a tuple or list.") + if not isinstance(kwargs, dict): + raise TypeError("kwargs must be a dict.") + + dataset = fn(self, *args, **kwargs) + + if not isinstance(dataset, Dataset): + raise TypeError("fn must return a Dataset.") + return dataset + class TensorDataset(Dataset): """A `Dataset` with a single element, viz. a nested structure of tensors.""" @@ -1903,7 +1921,7 @@ class DenseToSparseBatchDataset(Dataset): def _should_unpack_args(args): """Returns `True` if `args` should be `*args` when passed to a callable.""" - return nest.is_sequence(args) and not isinstance(args, dict) + return type(args) is tuple # pylint: disable=unidiomatic-typecheck class _ResourceDataset(Dataset): @@ -1927,71 +1945,6 @@ class _ResourceDataset(Dataset): return self._output_types -class GroupByWindowDataset(Dataset): - """A `Dataset` that groups its input and performs a windowed reduction.""" - - def __init__(self, input_dataset, key_func, reduce_func, window_size): - """See `Dataset.group_by_window()` for details.""" - super(GroupByWindowDataset, self).__init__() - self._input_dataset = input_dataset - self._window_size = window_size - - @function.Defun(*nest.flatten(input_dataset.output_types)) - def tf_key_func(*args): - """A wrapper for Defun that facilitates shape inference.""" - # Pass in shape information from the input_dataset. 
- for arg, shape in zip(args, nest.flatten(input_dataset.output_shapes)): - arg.set_shape(shape) - nested_args = nest.pack_sequence_as(input_dataset.output_types, args) - if _should_unpack_args(nested_args): - ret = key_func(*nested_args) - else: - ret = key_func(nested_args) - ret = ops.convert_to_tensor(ret, dtype=dtypes.int64) - if ret.dtype != dtypes.int64: - raise ValueError("`key_func` must return a single tf.int64 tensor.") - return ret - - self._key_func = tf_key_func - self._key_func.add_to_graph(ops.get_default_graph()) - - @function.Defun(dtypes.int64, dtypes.resource) - def tf_reduce_func(key, window_dataset_resource): - """A wrapper for Defun that facilitates shape inference.""" - key.set_shape([]) - window_dataset = _ResourceDataset(window_dataset_resource, - input_dataset.output_types, - input_dataset.output_shapes) - output_dataset = reduce_func(key, window_dataset) - if not isinstance(output_dataset, Dataset): - raise TypeError("`reduce_func` must return a `Dataset` object.") - self._output_types = output_dataset.output_types - self._output_shapes = output_dataset.output_shapes - return output_dataset.make_dataset_resource() - - self._reduce_func = tf_reduce_func - self._reduce_func.add_to_graph(ops.get_default_graph()) - - def make_dataset_resource(self): - return gen_dataset_ops.group_by_window_dataset( - self._input_dataset.make_dataset_resource(), - self._key_func.captured_inputs, - self._reduce_func.captured_inputs, - self._window_size, - key_func=self._key_func, - reduce_func=self._reduce_func, - output_types=nest.flatten(self.output_types), - output_shapes=nest.flatten(self.output_shapes)) - - @property - def output_shapes(self): - return self._output_shapes - - @property - def output_types(self): - return self._output_types - - class MapDataset(Dataset): """A `Dataset` that maps a function over elements in its input.""" @@ -2151,7 +2104,7 @@ class InterleaveDataset(Dataset): nested_args = nest.pack_sequence_as(input_dataset.output_types, args) - if nest.is_sequence(nested_args): + if _should_unpack_args(nested_args): dataset = map_func(*nested_args) else: dataset = map_func(nested_args) @@ -2460,7 +2413,7 @@ def rejection_resample(dataset, shapes and types defined by `dataset.output_shapes` and `dataset.output_types`) to a scalar `tf.int32` tensor. Values should be in `[0, num_classes)`. - target_dist: A floating point type tensor, shaped `[num_classes]. + target_dist: A floating point type tensor, shaped `[num_classes]`. initial_dist: (Optional.) A floating point type tensor, shaped `[num_classes]`. If not provided, the true class distribution is estimated live in a streaming fashion. 
@@ -2660,3 +2613,149 @@ def _get_file_names(file_pattern, randomize_input):
   if not randomize_input:
     file_names = sorted(file_names)
   return file_names
+
+
+class GroupByWindowDataset(Dataset):
+  """A `Dataset` that groups its input and performs a windowed reduction."""
+
+  def __init__(self, input_dataset, key_func, reduce_func, window_size_func):
+    """See `group_by_window()` for details."""
+    super(GroupByWindowDataset, self).__init__()
+
+    self._input_dataset = input_dataset
+
+    self._make_key_func(key_func, input_dataset)
+    self._make_reduce_func(reduce_func, input_dataset)
+    self._make_window_size_func(window_size_func)
+
+  def _make_window_size_func(self, window_size_func):
+    """Make wrapping Defun for window_size_func."""
+
+    @function.Defun(dtypes.int64)
+    def tf_window_size_func(key):
+      key.set_shape([])
+      window_size = ops.convert_to_tensor(
+          window_size_func(key), dtype=dtypes.int64)
+      if window_size.dtype != dtypes.int64:
+        raise ValueError(
+            "`window_size_func` must return a single tf.int64 tensor.")
+      return window_size
+
+    self._window_size_func = tf_window_size_func
+    self._window_size_func.add_to_graph(ops.get_default_graph())
+
+  def _make_key_func(self, key_func, input_dataset):
+    """Make wrapping Defun for key_func."""
+
+    @function.Defun(*nest.flatten(input_dataset.output_types))
+    def tf_key_func(*args):
+      """A wrapper for Defun that facilitates shape inference."""
+      # Pass in shape information from the input_dataset.
+      for arg, shape in zip(args, nest.flatten(input_dataset.output_shapes)):
+        arg.set_shape(shape)
+      nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
+      if _should_unpack_args(nested_args):
+        ret = key_func(*nested_args)
+      else:
+        ret = key_func(nested_args)
+      ret = ops.convert_to_tensor(ret, dtype=dtypes.int64)
+      if ret.dtype != dtypes.int64:
+        raise ValueError("`key_func` must return a single tf.int64 tensor.")
+      return ret
+
+    self._key_func = tf_key_func
+    self._key_func.add_to_graph(ops.get_default_graph())
+
+  def _make_reduce_func(self, reduce_func, input_dataset):
+    """Make wrapping Defun for reduce_func."""
+
+    @function.Defun(dtypes.int64, dtypes.resource)
+    def tf_reduce_func(key, window_dataset_resource):
+      """A wrapper for Defun that facilitates shape inference."""
+      key.set_shape([])
+      window_dataset = _ResourceDataset(window_dataset_resource,
+                                        input_dataset.output_types,
+                                        input_dataset.output_shapes)
+      output_dataset = reduce_func(key, window_dataset)
+      if not isinstance(output_dataset, Dataset):
+        raise TypeError("`reduce_func` must return a `Dataset` object.")
+      self._output_types = output_dataset.output_types
+      self._output_shapes = output_dataset.output_shapes
+      return output_dataset.make_dataset_resource()
+
+    self._reduce_func = tf_reduce_func
+    self._reduce_func.add_to_graph(ops.get_default_graph())
+
+  @property
+  def output_shapes(self):
+    return self._output_shapes
+
+  @property
+  def output_types(self):
+    return self._output_types
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.group_by_window_dataset(
+        self._input_dataset.make_dataset_resource(),
+        self._key_func.captured_inputs,
+        self._reduce_func.captured_inputs,
+        self._window_size_func.captured_inputs,
+        key_func=self._key_func,
+        reduce_func=self._reduce_func,
+        window_size_func=self._window_size_func,
+        output_types=nest.flatten(self.output_types),
+        output_shapes=nest.flatten(self.output_shapes))
+
+
+def group_by_window(dataset,
+                    key_func,
+                    reduce_func,
+                    window_size=None,
+                    window_size_func=None):
+  """Performs a windowed "group-by" operation on this dataset.
+
+  This method maps each consecutive element in this dataset to a key
+  using `key_func` and groups the elements by key. It then applies
+  `reduce_func` to at most `window_size_func(key)` elements matching the same
+  key. All except the final window for each key will contain
+  `window_size_func(key)` elements; the final window may be smaller.
+
+  You may provide either a constant `window_size` or a window size determined
+  by the key through `window_size_func`.
+
+  Args:
+    dataset: A `Dataset`.
+    key_func: A function mapping a nested structure of tensors
+      (having shapes and types defined by `self.output_shapes` and
+      `self.output_types`) to a scalar `tf.int64` tensor.
+    reduce_func: A function mapping a key and a dataset of up to
+      `window_size_func(key)` consecutive elements matching that key to
+      another dataset.
+    window_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
+      consecutive elements matching the same key to combine in a single
+      batch, which will be passed to `reduce_func`. Mutually exclusive with
+      `window_size_func`.
+    window_size_func: A function mapping a key to a `tf.int64` scalar
+      `tf.Tensor`, representing the number of consecutive elements matching
+      the same key to combine in a single batch, which will be passed to
+      `reduce_func`. Mutually exclusive with `window_size`.
+
+  Returns:
+    A `Dataset`.
+
+  Raises:
+    ValueError: if neither or both of {`window_size`, `window_size_func`} are
+      passed.
+  """
+  if (window_size is not None and window_size_func or
+      not (window_size is not None or window_size_func)):
+    raise ValueError(
+        "Must pass exactly one of window_size or window_size_func.")
+
+  if window_size is not None:
+
+    def constant_window_func(unused_key):
+      return ops.convert_to_tensor(window_size, dtype=dtypes.int64)
+
+    window_size_func = constant_window_func
+
+  assert window_size_func is not None
+  return GroupByWindowDataset(dataset, key_func, reduce_func, window_size_func)
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index c78b064b4fd..c2b99d67c7f 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -341,7 +341,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "sample_stats_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/kernel_tests/sample_stats_test.py"],
     additional_deps = [
         ":distributions_py",
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
index da1cd72a6f1..699cf45a738 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
@@ -150,7 +150,7 @@ class ExpRelaxedOneHotCategorical(distribution.Distribution):
        `N - 1` dimensions index into a batch of independent distributions and
        the last dimension represents a vector of probabilities for each class.
        Only one of `logits` or `probs` should be passed in.
-      dtype: The type of the event samples (default: int32).
+      dtype: The type of the event samples (default: float32).
      validate_args: Python `bool`, default `False`. When `True` distribution
        parameters are checked for validity despite possibly degrading runtime
        performance. When `False` invalid inputs may silently render incorrect
@@ -388,7 +388,7 @@ class RelaxedOneHotCategorical(
      dimensions index into a batch of independent distributions and the last
      dimension represents a vector of probabilities for each class.
Only one of `logits` or `probs` should be passed in. - dtype: The type of the event samples (default: int32). + dtype: The type of the event samples (default: float32). validate_args: Unused in this distribution. allow_nan_stats: Python `bool`, default `True`. If `False`, raise an exception if a statistic (e.g. mean/mode/etc...) is undefined for any diff --git a/tensorflow/contrib/framework/python/ops/checkpoint_ops.py b/tensorflow/contrib/framework/python/ops/checkpoint_ops.py index 848e26ab966..26146790b65 100644 --- a/tensorflow/contrib/framework/python/ops/checkpoint_ops.py +++ b/tensorflow/contrib/framework/python/ops/checkpoint_ops.py @@ -17,440 +17,16 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import math - from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import gen_checkpoint_ops from tensorflow.python.ops import init_ops -from tensorflow.python.ops import math_ops - -ops.NotDifferentiable("GenerateVocabRemapping") -ops.NotDifferentiable("LoadAndRemapMatrix") +from tensorflow.python.training import checkpoint_ops -def _load_and_remap_matrix(ckpt_path, - old_tensor_name, - new_row_vocab_offset, - num_rows_to_load, - new_col_vocab_size, - initializer, - old_row_vocab_file=None, - new_row_vocab_file=None, - old_col_vocab_file=None, - new_col_vocab_file=None, - num_row_oov_buckets=0, - num_col_oov_buckets=0, - max_rows_in_memory=-1): - """Loads a 2-D (matrix) `Tensor` from checkpoint. - - Generates 1D-remappings for rows and columns using the - `GenerateVocabRemapping` op, and initializes any anticipated values with the - provided initializer. Then, uses the `LoadAndRemapMatrix` op to create a - matrix that loads existing values from the checkpoint, while filling out - "missing" values with the newly initialized values. See - contrib/framework/ops/checkpoint_ops.cc for more information on the wrapped - functionality (LoadAndRemapMatrix). This wrapper can be used to perform only - row remapping or only col remapping. If only row remapping is desired, - {new,old}_col_vocab_file should be `None`, and vice versa for column - remapping. - - NOTE: This only supports div-partitioning the vocabulary on the 1st dimension - (row axis) via `new_row_vocab_offset`. - - Args: - ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) - from which the old matrix `Tensor` will be loaded. - old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint. - new_row_vocab_offset: A 0-indexed integer representing what line to - start reading at in the new row vocabulary. Used for partitioned - variables. - num_rows_to_load: Number of rows to load for the new vocabulary (note: to - support variable partitioning and partial loading, this does not need to - be the same as the number of entries in `new_row_vocab_file`). - new_col_vocab_size: Number of columns to load - should be the same as the - number of entries in `new_col_vocab_file`, since we don't support - partitioning along the column axis. - initializer: Callable initializer function that accepts a 1-D tensor as the - arg to specify the shape of the returned tensor. Used to initialize - missing values. - old_row_vocab_file: A scalar `Tensor` of type `string` containing the - path to the old row vocabulary file. Can be None, which represents no - remapping on the row axis. 
- new_row_vocab_file: A scalar `Tensor` of type `string` containing the path - to the new row vocabulary file. Can be None, which represents no remapping - on the row axis - in which case, `new_row_vocab_offset` and - `num_rows_to_load` work under the assumption that the new row vocab is the - same as the old row vocab. - old_col_vocab_file: A scalar `Tensor` of type `string` containing the - path to the old column vocabulary file. Can be None, which represents no - remapping on the column axis. - new_col_vocab_file: A scalar `Tensor` of type `string` containing the path - to the new column vocabulary file. Can be None, which represents no - remapping on the column axis - in which case, `new_col_vocab_size` works - under the assumption that the new col vocab is the same as the old col - vocab. - num_row_oov_buckets: `int` specifying the number of out-of-vocabulary rows - to append. Must be >= 0. - num_col_oov_buckets: `int` specifying the number of out-of-vocabulary - columns to append. Must be >= 0. - max_rows_in_memory: `int` specifying the maximum number of rows to load from - the checkpoint at once. If less than or equal to 0, the entire matrix will - be loaded into memory. Setting this arg trades increased disk reads for - lower memory usage. - - Returns: - A Tensor of shape `[num_rows_to_load + num_row_oov_buckets, - new_col_vocab_size + num_col_oov_buckets]`, with values loaded from the - specified tensor in the checkpoint, and any missing or OOV values - initialized with the given `initializer`. - - Raises: - ValueError: If `num_row_oov_buckets` or `num_col_oov_buckets` < 0. - ValueError: If either `old_row_vocab_file` or `new_row_vocab_file` is - provided, while the other is not. Same for `old_col_vocab_file` and - `new_col_vocab_file`. - ValueError: If neither row vocabs or col vocabs are provided. - """ - if num_row_oov_buckets < 0: - raise ValueError("num_row_oov_buckets must be >= 0, but received %d" % - num_row_oov_buckets) - if num_col_oov_buckets < 0: - raise ValueError("num_col_oov_buckets must be >= 0, but received %d" % - num_col_oov_buckets) - - if bool(old_row_vocab_file) != bool(new_row_vocab_file): - raise ValueError( - "old_row_vocab_file and new_row_vocab_file must both be specified or " - "left unspecified. old_row_vocab_file='{}', new_row_vocab_file='{}'". - format(old_row_vocab_file, new_row_vocab_file)) - if bool(old_col_vocab_file) != bool(new_col_vocab_file): - raise ValueError( - "old_col_vocab_file and new_col_vocab_file must both be specified or " - "left unspecified. old_col_vocab_file='{}', new_col_vocab_file='{}'". - format(old_col_vocab_file, new_col_vocab_file)) - - remap_rows = new_row_vocab_file and old_row_vocab_file - remap_cols = new_col_vocab_file and old_col_vocab_file - if not (remap_rows or remap_cols): - raise ValueError( - "Must provide either row or column vocab files. If no remapping is " - "necessary, consider using `tf.contrib.framework.init_from_checkpoint` " - "instead.") - - num_rows_present = num_rows_to_load - if remap_rows: - row_remapping, num_rows_present = ( - gen_checkpoint_ops._generate_vocab_remapping( # pylint: disable=protected-access - new_vocab_file=new_row_vocab_file, - old_vocab_file=old_row_vocab_file, - new_vocab_offset=new_row_vocab_offset, - num_new_vocab=num_rows_to_load)) - else: - # Even when the rows are not being reordered, we still need to generate a - # remapping to account for initializing partitioned Variables (when - # new_row_vocab_offset is non-zero). 
- row_remapping = math_ops.range( - new_row_vocab_offset, - new_row_vocab_offset + num_rows_to_load, - dtype=dtypes.int64) - - col_remapping = [] - num_cols_present = new_col_vocab_size - if remap_cols: - col_remapping, num_cols_present = ( - gen_checkpoint_ops._generate_vocab_remapping( # pylint: disable=protected-access - new_vocab_file=new_col_vocab_file, - old_vocab_file=old_col_vocab_file, - new_vocab_offset=0, # Offset is unused for cols (no partitioning). - num_new_vocab=new_col_vocab_size)) - - init_vals = initializer([ - num_rows_to_load * new_col_vocab_size - - num_rows_present * num_cols_present, 1 - ]) - return_tensor = gen_checkpoint_ops._load_and_remap_matrix( # pylint: disable=protected-access - ckpt_path=ckpt_path, - old_tensor_name=old_tensor_name, - row_remapping=row_remapping, - col_remapping=col_remapping, - initializing_values=init_vals, - num_rows=num_rows_to_load, - num_cols=new_col_vocab_size, - max_rows_in_memory=max_rows_in_memory) - - # Add OOV row(s) and column(s). - if num_row_oov_buckets > 0: - init_row_oov_val = initializer([num_row_oov_buckets, new_col_vocab_size]) - init_row_oov_val = ops.convert_to_tensor(init_row_oov_val) - return_tensor = array_ops.concat([return_tensor, init_row_oov_val], 0) - if num_col_oov_buckets > 0: - # We need to add any row OOV to the new column shape. - init_col_oov_val = initializer( - [num_rows_to_load + num_row_oov_buckets, num_col_oov_buckets]) - init_col_oov_val = ops.convert_to_tensor(init_col_oov_val) - return_tensor = array_ops.concat([return_tensor, init_col_oov_val], 1) - - return return_tensor - - -def load_and_remap_matrix_initializer(ckpt_path, - old_tensor_name, - new_row_vocab_size, - new_col_vocab_size, - old_row_vocab_file=None, - new_row_vocab_file=None, - old_col_vocab_file=None, - new_col_vocab_file=None, - num_row_oov_buckets=0, - num_col_oov_buckets=0, - initializer=None, - max_rows_in_memory=-1): - r"""Returns a var initializer for loading and remapping a 2-D (matrix) tensor. - - The returned initializer loads a 2-D (matrix) `Tensor` with name - `old_tensor_name` from the checkpoint at `ckpt_path`. It will reorder the - rows/columns according to the specified vocab files and append additional - out-of-vocabulary rows/columns according to the number of OOV buckets. - - The format of the file at the `{old,new}_{row,col}_vocab_file` path should be - a text file, with each line containing a single entity within the vocabulary. - Let the function `line_of(f, "x")` return the 0-indexed line number of the - entity "x" in file f, and the function `entity_at(f, i)` return the entity at - line i of file f. Then, row i of the new output matrix will be taken from row - `line_of(old_row_vocab_file, entity_at(new_row_vocab_file, i))` of the old - matrix. If any entity in `new_row_vocab_file` is not found in - `old_row_vocab_file`, that row is considered a "missing" row, and its values - will be initialized using the `initializer` arg. The same logic also applies - for the columns. 
- - For example, assuming that: - - * `old_row_vocab_file` contains "mercury\nvenus\nmars" - * `new_row_vocab_file` contains "venus\njupiter\nmercury" - * `old_col_vocab_file` contains "good\nbetter\nbest" - * `new_col_vocab_file` contains "good\nbest\nfantastic" - * `initializer` returns the natural numbers `[1, 2, 3, 4, ...]` - * `w(i, j)` represents the value from row i, column j of the old matrix - - Then the new output matrix will look like: - - `[[w(1, 0), w(1, 2), 1], - [2, 3, 4], - [w(0, 0), w(0, 2), 5]]` - - If we further specify that: - - * `num_row_oov_buckets` == 2 - * `num_col_oov_buckets` == 1 - - Then the new output matrix will look like: - - `[[w(1, 0), w(1, 2), 1, 12], - [2, 3, 4, 13], - [w(0, 0), w(0, 2), 5, 14], - [6, 7, 8, 15], - [9, 10, 11, 16]]` - - If `{old,new}_row_vocab_file` are None, we assume that the old and new row - vocab files are the same, and no row remapping is done. If - `{old,new}_col_vocab_file` are None, we assume that the old and new column - vocab files are the same, and no column remapping is done. - - The returned initializer only supports div-partitioning along the row axis. It - does not support partitioning along the column axis or mod-partitioning. - - NOTE: When this is used to warm-start variables, client code should use - `tf.lookup.index_table_from_tensor()` like - contrib/layers/python/layers/feature_column.py does, as opposed to - `tf.feature_to_id()` - in order to ensure the underlying lookup tables are the - same. - - Args: - ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) - from which the old matrix `Tensor` will be loaded. - old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint. - new_row_vocab_size: `int` specifying the number of entries in - `new_row_vocab_file`. If no row remapping is needed (no row vocab - provided), this should be equal to the number of rows to load from the old - matrix (which can theoretically be smaller than the number of rows in the - old matrix). - new_col_vocab_size: `int` specifying the number of entries in - `new_col_vocab_file`. If no column remapping is needed (no column vocab - provided), this should be equal to the number of columns in the old - matrix. - old_row_vocab_file: A scalar `Tensor` of type `string` containing the - path to the old row vocabulary file. Can be None, which represents no - remapping on the row axis. - new_row_vocab_file: A scalar `Tensor` of type `string` containing the path - to the new row vocabulary file. Can be None, which represents no remapping - on the row axis. - old_col_vocab_file: A scalar `Tensor` of type `string` containing the - path to the old column vocabulary file. Can be None, which represents no - remapping on the column axis. - new_col_vocab_file: A scalar `Tensor` of type `string` containing the path - to the new column vocabulary file. Can be None, which represents no - remapping on the column axis. - num_row_oov_buckets: `int` specifying the number of out-of-vocabulary rows - to append. Must be >= 0. - num_col_oov_buckets: `int` specifying the number of out-of-vocabulary - columns to append. Must be >= 0. - initializer: Initializer function to initialize missing values. Accepts a - 1-D tensor as the arg to specify the shape of the returned tensor. If - `None`, defaults to using `zeros_initializer()`. - max_rows_in_memory: `int` specifying the maximum number of rows to load from - the checkpoint at once. If less than or equal to 0, the entire matrix will - be loaded into memory. 
Setting this arg trades increased disk reads for - lower memory usage. - - Returns: - A variable initializer function that should be used to initialize a - (potentially partitioned) `Variable` whose complete shape is - `[new_row_vocab_size + num_row_oov_buckets, new_col_vocab_size + - num_col_oov_buckets]`. - - Raises: - TypeError: If `initializer` is specified but not callable. - """ - if initializer is None: - # TODO(b/25671353): Consider using sqrt(6/(fan_in + fan_out)) instead, from - # Glorot and Bengio, 2010. - initializer = init_ops.zeros_initializer() - - if not callable(initializer): - raise TypeError( - "initializer must be callable, instead of being {} of type {}.".format( - initializer, type(initializer))) - - def _initializer(shape, dtype=dtypes.float32, partition_info=None): - """Variable initializer. - - Args: - shape: Shape of `Tensor` to return. Should include OOV on both axes. - dtype: Must be float32. - partition_info: variable_scope._PartitionInfo. - - Returns: - `Tensor` of shape `shape`. - - Raises: - TypeError: If `dtype` is anything other than float32. - ValueError: For shape mismatch upon invocation. - """ - # Sanity checks. - if dtype != dtypes.float32: - raise TypeError( - "Currently, only float32 is supported. Received dtype: {}".format( - dtype)) - if len(shape) != 2: - raise ValueError("Expected 2-dim shape, but received: {}".format(shape)) - if shape[0] <= 0: - raise ValueError( - "Expected 1st dim of shape to be > 0, but received shape: {}".format( - shape)) - if shape[1] != (new_col_vocab_size + num_col_oov_buckets): - raise ValueError( - "Expected 2nd dim of shape to be new_col_vocab_size ({}) + " - "num_col_oov_buckets ({}) = {}, but received shape: {}".format( - new_col_vocab_size, num_col_oov_buckets, - new_col_vocab_size + num_col_oov_buckets, shape)) - - offset = 0 - if partition_info is not None: - offset = partition_info.single_offset(shape) - - if offset + shape[0] > new_row_vocab_size + num_row_oov_buckets: - raise ValueError( - "Trying to initialize {} additional rows after {} rows have already " - "been initialized, which would exceed expected total row count of " - "new_row_vocab_size ({}) + num_row_oov_buckets ({}) = {}.".format( - shape[0], offset, new_row_vocab_size, num_row_oov_buckets, - new_row_vocab_size + num_row_oov_buckets)) - - row_oov_buckets_to_use = min(shape[0], - max(0, offset + shape[0] - new_row_vocab_size)) - num_rows_to_load = shape[0] - row_oov_buckets_to_use - - return _load_and_remap_matrix( - ckpt_path=ckpt_path, - old_tensor_name=old_tensor_name, - new_row_vocab_offset=offset, - num_rows_to_load=num_rows_to_load, - new_col_vocab_size=new_col_vocab_size, - initializer=initializer, - old_row_vocab_file=old_row_vocab_file, - new_row_vocab_file=new_row_vocab_file, - old_col_vocab_file=old_col_vocab_file, - new_col_vocab_file=new_col_vocab_file, - num_row_oov_buckets=row_oov_buckets_to_use, - num_col_oov_buckets=num_col_oov_buckets, - max_rows_in_memory=max_rows_in_memory) - - return _initializer - - -def load_embedding_initializer(ckpt_path, - embedding_tensor_name, - new_vocab_size, - embedding_dim, - old_vocab_file, - new_vocab_file, - num_oov_buckets=0, - initializer=None, - max_rows_in_memory=-1): - """Returns a variable initializer for loading pre-trained embeddings. - - Wrapper around `load_and_remap_matrix_initializer()` specialized for loading - embedding weights and remapping according to the provided vocab files. See - docs for `load_and_remap_matrix_initializer()` for more details. 
- - NOTE: Only for use with div-partitioned variables / vocabularies. - - Args: - ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) - from which the old matrix `Tensor` will be loaded. - embedding_tensor_name: Name of the 2-D `Tensor` to load from checkpoint. - new_vocab_size: Number of entries in the new vocab. - embedding_dim: `int` specifying the dimension of the embedding vectors from - the checkpoint. Must match the number of columns in the old embedding - matrix. - old_vocab_file: A scalar `Tensor` of type `string` containing the - path to the old vocabulary file. - new_vocab_file: A scalar `Tensor` of type `string` containing the - path to the new vocabulary file. - num_oov_buckets: `int` specifying the number of out-of-vocabulary - buckets to use. Must be >= 0. - initializer: Initializer function that accepts a 1-D tensor as the arg to - specify the shape of the returned tensor. If `None`, defaults to using - `truncated_normal_initializer()`. - max_rows_in_memory: `int` specifying the maximum number of rows to load from - the checkpoint at once. If less than or equal to 0, the entire matrix will - be loaded into memory. Setting this arg trades increased disk reads for - lower memory usage. - - Returns: - A variable initializer function. - """ - if initializer is None: - # TODO(b/25671353): This should be kept in sync with the stddev used by - # feature_column.py's _EmbeddingColumn. - initializer = init_ops.truncated_normal_initializer( - stddev=1.0 / math.sqrt(embedding_dim)) - - return load_and_remap_matrix_initializer( - ckpt_path=ckpt_path, - old_tensor_name=embedding_tensor_name, - new_row_vocab_size=new_vocab_size, - new_col_vocab_size=embedding_dim, - old_row_vocab_file=old_vocab_file, - new_row_vocab_file=new_vocab_file, - old_col_vocab_file=None, - new_col_vocab_file=None, - num_row_oov_buckets=num_oov_buckets, - num_col_oov_buckets=0, - initializer=initializer, - max_rows_in_memory=max_rows_in_memory) +# pylint: disable=protected-access,line-too-long +load_and_remap_matrix_initializer = checkpoint_ops._load_and_remap_matrix_initializer +# pylint: enable=line-too-long +load_embedding_initializer = checkpoint_ops._load_embedding_initializer +# pylint: enable=protected-access def load_linear_multiclass_bias_initializer(ckpt_path, diff --git a/tensorflow/contrib/framework/python/ops/checkpoint_ops_test.py b/tensorflow/contrib/framework/python/ops/checkpoint_ops_test.py index a11d373244d..b7b9f5c59e1 100644 --- a/tensorflow/contrib/framework/python/ops/checkpoint_ops_test.py +++ b/tensorflow/contrib/framework/python/ops/checkpoint_ops_test.py @@ -21,7 +21,6 @@ import os import numpy as np from tensorflow.contrib import framework as contrib_framework -from tensorflow.contrib.framework.python.ops import checkpoint_ops from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -38,250 +37,6 @@ FLAGS = flags.FLAGS _TESTDATA_PATH = 'contrib/framework/testdata' -class LoadAndRemapWrappersTest(test.TestCase): - """Tests for the functionality of the Python wrappers.""" - - def setUp(self): - self.bundle_file = os.path.join( - test.test_src_dir_path(_TESTDATA_PATH), 'bundle_checkpoint') - self.new_feature_vocab_file = os.path.join( - test.test_src_dir_path(_TESTDATA_PATH), 'bundle_checkpoint_vocab.txt') - self.old_feature_vocab_file = os.path.join( - test.test_src_dir_path(_TESTDATA_PATH), - 'bundle_checkpoint_vocab_with_oov.txt') - self.new_class_vocab_file = os.path.join( - 
test.test_src_dir_path(_TESTDATA_PATH), 'keyword_new.txt') - self.old_class_vocab_file = os.path.join( - test.test_src_dir_path(_TESTDATA_PATH), 'keyword.txt') - self.init_val = 42 - - def _init_val_initializer(shape, dtype=None, partition_info=None): - del dtype, partition_info # Unused by this unit-testing initializer. - return array_ops.tile( - constant_op.constant([[self.init_val]], dtype=dtypes.float32), shape) - - self.initializer = _init_val_initializer - - def test_load_and_remap_matrix(self): - """Tests the end-to-end loading / remapping of weights.""" - # _load_and_remap_matrix() is the generalized wrapper that takes in row and - # column vocabulary files, calls the relevant remappings, and returns the - # weight matrix. Take this example to be linear multi-class by providing - # both row and column vocabularies. - remapped_matrix = checkpoint_ops._load_and_remap_matrix( - new_row_vocab_file=self.new_feature_vocab_file, - old_row_vocab_file=self.old_feature_vocab_file, - num_rows_to_load=4, - new_col_vocab_file=self.new_class_vocab_file, - old_col_vocab_file=self.old_class_vocab_file, - new_col_vocab_size=4, - old_tensor_name='some_scope/embeddings', - ckpt_path=[self.bundle_file], - new_row_vocab_offset=1, - initializer=self.initializer, - num_row_oov_buckets=1, - num_col_oov_buckets=1) - - # [4 in vocab + 1 oov features, 4 in vocab + 1 oov classes]. The offset - # means we read - expected_remapped_matrix = np.concatenate( - [ - np.reshape([18, 34, 50, self.init_val, self.init_val], [5, 1]), - np.reshape([16, 32, 48, self.init_val, self.init_val], [5, 1]), - np.reshape([self.init_val] * 5, [5, 1]), - np.reshape([17, 33, 49, self.init_val, self.init_val], [5, 1]), - np.reshape([self.init_val] * 5, [5, 1]) - ], - axis=1) - - with self.test_session(): - self.assertAllClose(expected_remapped_matrix, remapped_matrix.eval()) - - def test_load_and_remap_output_layer_weight_initializer_linear(self): - """Tests for the output layer initializer in the linear multi-class case.""" - loading_initializer = (contrib_framework.load_and_remap_matrix_initializer( - new_row_vocab_size=5, - new_col_vocab_file=self.new_class_vocab_file, - old_col_vocab_file=self.old_class_vocab_file, - new_col_vocab_size=4, - old_tensor_name='some_scope/embeddings', - ckpt_path=[self.bundle_file], - new_row_vocab_file=self.new_feature_vocab_file, - old_row_vocab_file=self.old_feature_vocab_file, - num_row_oov_buckets=1, - num_col_oov_buckets=1, - initializer=self.initializer)) - - expected_remapped_matrix = np.concatenate( - [ - np.reshape([2, 18, 34, 50, self.init_val, self.init_val], [6, 1]), - np.reshape([0, 16, 32, 48, self.init_val, self.init_val], [6, 1]), - np.reshape([self.init_val] * 6, [6, 1]), - np.reshape([1, 17, 33, 49, self.init_val, self.init_val], [6, 1]), - np.reshape([self.init_val] * 6, [6, 1]) - ], - axis=1) - - # The new weight matrix is of size - # [5 feature vocab + 1 feature OOV, 4 class vocab + 1 class OOV]. Use a - # partitioned variable to confirm that the offset logic works. 
- remapped_matrix = variable_scope.get_variable( - name='linear/obtained_weight_matrix', - shape=[6, 5], - initializer=loading_initializer, - partitioner=partitioned_variables.fixed_size_partitioner(2)) - - with self.test_session(): - variables.global_variables_initializer().run() - self.assertAllClose(expected_remapped_matrix, - remapped_matrix.as_tensor().eval()) - - def test_load_and_remap_output_layer_weight_initializer_dnn_output(self): - """Tests for the output layer initializer in the DNN output case.""" - loading_initializer = (contrib_framework.load_and_remap_matrix_initializer( - new_row_vocab_size=5, - new_col_vocab_file=self.new_class_vocab_file, - old_col_vocab_file=self.old_class_vocab_file, - new_col_vocab_size=4, - old_tensor_name='some_scope/embeddings', - ckpt_path=[self.bundle_file], - num_col_oov_buckets=1, - initializer=self.initializer)) - - expected_remapped_matrix = np.concatenate( - [ - np.reshape([2, 18, 34, 50, 66], [5, 1]), - np.reshape([0, 16, 32, 48, 64], [5, 1]), - np.reshape([self.init_val] * 5, [5, 1]), - np.reshape([1, 17, 33, 49, 65], [5, 1]), - np.reshape([self.init_val] * 5, [5, 1]) - ], - axis=1) - - # The new weight matrix is of size - # [5-sized input layer, 4 class vocab + 1 class OOV]. - remapped_matrix = variable_scope.get_variable( - name='dnn_output/obtained_weight_matrix', - shape=[5, 5], - initializer=loading_initializer, - partitioner=partitioned_variables.fixed_size_partitioner(2)) - - with self.test_session(): - variables.global_variables_initializer().run() - self.assertAllClose(expected_remapped_matrix, - remapped_matrix.as_tensor().eval()) - - def test_initializer_with_oov_only_partition(self): - """Tests for the output layer initializer where one partition is all OOV.""" - loading_initializer = (contrib_framework.load_and_remap_matrix_initializer( - new_row_vocab_size=5, - new_col_vocab_file=self.new_class_vocab_file, - old_col_vocab_file=self.old_class_vocab_file, - new_col_vocab_size=4, - old_tensor_name='some_scope/embeddings', - ckpt_path=[self.bundle_file], - new_row_vocab_file=self.new_feature_vocab_file, - old_row_vocab_file=self.old_feature_vocab_file, - num_row_oov_buckets=5, - num_col_oov_buckets=1, - initializer=self.initializer)) - - expected_remapped_matrix = np.concatenate( - [ - np.reshape([2, 18, 34, 50] + [self.init_val] * 6, [10, 1]), - np.reshape([0, 16, 32, 48] + [self.init_val] * 6, [10, 1]), - np.reshape([self.init_val] * 10, [10, 1]), - np.reshape([1, 17, 33, 49] + [self.init_val] * 6, [10, 1]), - np.reshape([self.init_val] * 10, [10, 1]), - ], - axis=1) - - # The new weight matrix is of size - # [5 feature vocab + 5 feature OOV, 4 class vocab + 1 class OOV]. The - # second partition has only OOV. 
- remapped_matrix = variable_scope.get_variable( - name='linear_all_oov/obtained_weight_matrix', - shape=[10, 5], - initializer=loading_initializer, - partitioner=partitioned_variables.fixed_size_partitioner(2)) - - with self.test_session(): - variables.global_variables_initializer().run() - self.assertAllClose(expected_remapped_matrix, - remapped_matrix.as_tensor().eval()) - - def test_load_and_remap_linear_multiclass_initializer_default_init(self): - """Tests where the zeros_initializer default is used for linear.""" - loading_initializer = (contrib_framework.load_and_remap_matrix_initializer( - new_row_vocab_size=5, - new_col_vocab_file=self.new_class_vocab_file, - old_col_vocab_file=self.old_class_vocab_file, - new_col_vocab_size=4, - old_tensor_name='some_scope/embeddings', - ckpt_path=[self.bundle_file], - new_row_vocab_file=self.new_feature_vocab_file, - old_row_vocab_file=self.old_feature_vocab_file, - num_row_oov_buckets=1, - num_col_oov_buckets=1)) - - expected_remapped_matrix = np.concatenate( - [ - np.reshape([2, 18, 34, 50, 0, 0], [6, 1]), - np.reshape([0, 16, 32, 48, 0, 0], [6, 1]), - np.reshape([0] * 6, [6, 1]), - np.reshape([1, 17, 33, 49, 0, 0], [6, 1]), - np.reshape([0] * 6, [6, 1]) - ], - axis=1) - - remapped_matrix = variable_scope.get_variable( - name='linear_init_fallback/obtained_weight_matrix', - shape=[6, 5], - initializer=loading_initializer, - partitioner=partitioned_variables.fixed_size_partitioner(2)) - - with self.test_session(): - variables.global_variables_initializer().run() - self.assertAllClose(expected_remapped_matrix, - remapped_matrix.as_tensor().eval()) - - def test_load_embedding_initializer(self): - """Tests for the load_embedding_initializer wrapper.""" - embedding_loading_initializer = ( - contrib_framework.load_embedding_initializer( - new_vocab_file=self.new_feature_vocab_file, - old_vocab_file=self.old_feature_vocab_file, - new_vocab_size=5, - embedding_dim=16, - embedding_tensor_name='some_scope/embeddings', - ckpt_path=[self.bundle_file], - num_oov_buckets=1, - initializer=self.initializer)) - - expected_remapped_embeddings = np.concatenate( - [ - np.reshape(range(64), [4, 16]), - np.reshape([self.init_val] * 32, [2, 16]), - ], - axis=0) - - # The new weight matrix is of size - # [5 feature vocab + 1 feature OOV, 16 (embedding dimension)], where the - # last vocab row (2nd last row) is newly initialized (wasn't found in - # previous vocab) and the actual last row is OOV and also newly initialized. - # Use a partitioned variable to confirm that the offset logic works. 
- remapped_embeddings = variable_scope.get_variable( - name='embedding/obtained_embedding_matrix', - shape=[6, 16], - initializer=embedding_loading_initializer, - partitioner=partitioned_variables.fixed_size_partitioner(2)) - - with self.test_session(): - variables.global_variables_initializer().run() - self.assertAllClose(expected_remapped_embeddings, - remapped_embeddings.as_tensor().eval()) - - class LoadMulticlassBiasTest(test.TestCase): """Tests for the load_linear_multiclass_bias_initializer functionality.""" diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD new file mode 100644 index 00000000000..b2de2823563 --- /dev/null +++ b/tensorflow/contrib/gan/BUILD @@ -0,0 +1,27 @@ +package(default_visibility = ["//tensorflow:__subpackages__"]) + +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +py_library( + name = "gan", + srcs = [ + "__init__.py", + ], + srcs_version = "PY2AND3", + deps = [ + ], +) + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) diff --git a/tensorflow/contrib/gan/README.md b/tensorflow/contrib/gan/README.md new file mode 100644 index 00000000000..586e5ac331c --- /dev/null +++ b/tensorflow/contrib/gan/README.md @@ -0,0 +1,4 @@ +This directory contains the TFGAN project. + +This file will have more details as code is added. + diff --git a/tensorflow/contrib/gan/__init__.py b/tensorflow/contrib/gan/__init__.py new file mode 100644 index 00000000000..a46b0e8d5de --- /dev/null +++ b/tensorflow/contrib/gan/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""TFGAN grouped API.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function diff --git a/tensorflow/contrib/gdr/BUILD b/tensorflow/contrib/gdr/BUILD index 645e364d191..bebcf079ba4 100644 --- a/tensorflow/contrib/gdr/BUILD +++ b/tensorflow/contrib/gdr/BUILD @@ -62,6 +62,7 @@ tf_cuda_library( }), deps = [ ":gdr_proto_cc", + "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:gpu_runtime", "//tensorflow/core:lib", diff --git a/tensorflow/contrib/image/BUILD b/tensorflow/contrib/image/BUILD index e631c243c3c..a27bec48010 100755 --- a/tensorflow/contrib/image/BUILD +++ b/tensorflow/contrib/image/BUILD @@ -121,12 +121,9 @@ tf_gen_op_wrapper_py( cc_library( name = "image_ops_cc", - srcs = [ - "ops/image_ops.cc", - ], + srcs = ["ops/image_ops.cc"], deps = [ ":image_ops_kernels", - "//tensorflow/core", "//tensorflow/core:framework", ], alwayslink = 1, diff --git a/tensorflow/contrib/keras/BUILD b/tensorflow/contrib/keras/BUILD index a09045d7fda..26f0e415180 100644 --- a/tensorflow/contrib/keras/BUILD +++ b/tensorflow/contrib/keras/BUILD @@ -551,6 +551,7 @@ py_test( size = "small", srcs = ["python/keras/utils/io_utils_test.py"], srcs_version = "PY2AND3", + tags = ["notsan"], deps = [ ":keras", "//tensorflow/python:client_testlib", diff --git a/tensorflow/contrib/keras/python/keras/utils/io_utils_test.py b/tensorflow/contrib/keras/python/keras/utils/io_utils_test.py index baa9781e71f..f6820ee0394 100644 --- a/tensorflow/contrib/keras/python/keras/utils/io_utils_test.py +++ b/tensorflow/contrib/keras/python/keras/utils/io_utils_test.py @@ -57,43 +57,44 @@ class TestIOUtils(test.TestCase): h5_path = os.path.join(temp_dir, 'test.h5') create_dataset(h5_path) - # Instantiating HDF5Matrix for the training set, - # which is a slice of the first 150 elements - x_train = keras.utils.io_utils.HDF5Matrix( - h5_path, 'my_data', start=0, end=150) - y_train = keras.utils.io_utils.HDF5Matrix( - h5_path, 'my_labels', start=0, end=150) + with self.test_session(): + # Instantiating HDF5Matrix for the training set, + # which is a slice of the first 150 elements + x_train = keras.utils.io_utils.HDF5Matrix( + h5_path, 'my_data', start=0, end=150) + y_train = keras.utils.io_utils.HDF5Matrix( + h5_path, 'my_labels', start=0, end=150) - # Likewise for the test set - x_test = keras.utils.io_utils.HDF5Matrix( - h5_path, 'my_data', start=150, end=200) - y_test = keras.utils.io_utils.HDF5Matrix( - h5_path, 'my_labels', start=150, end=200) + # Likewise for the test set + x_test = keras.utils.io_utils.HDF5Matrix( + h5_path, 'my_data', start=150, end=200) + y_test = keras.utils.io_utils.HDF5Matrix( + h5_path, 'my_labels', start=150, end=200) - # HDF5Matrix behave more or less like Numpy matrices - # with regard to indexing - self.assertEqual(y_train.shape, (150, 1)) - # But they do not support negative indices, so don't try print(x_train[-1]) + # HDF5Matrix behave more or less like Numpy matrices + # with regard to indexing + self.assertEqual(y_train.shape, (150, 1)) + # But they don't support negative indices, so don't try print(x_train[-1]) - self.assertEqual(y_train.dtype, np.dtype('i')) - self.assertEqual(y_train.ndim, 2) - self.assertEqual(y_train.size, 150) + self.assertEqual(y_train.dtype, np.dtype('i')) + self.assertEqual(y_train.ndim, 2) + self.assertEqual(y_train.size, 150) - model = keras.models.Sequential() - 
model.add(keras.layers.Dense(64, input_shape=(10,), activation='relu')) - model.add(keras.layers.Dense(1, activation='sigmoid')) - model.compile(loss='binary_crossentropy', optimizer='sgd') + model = keras.models.Sequential() + model.add(keras.layers.Dense(64, input_shape=(10,), activation='relu')) + model.add(keras.layers.Dense(1, activation='sigmoid')) + model.compile(loss='binary_crossentropy', optimizer='sgd') - # Note: you have to use shuffle='batch' or False with HDF5Matrix - model.fit(x_train, y_train, batch_size=32, shuffle='batch', verbose=False) - # test that evalutation and prediction - # don't crash and return reasonable results - out_pred = model.predict(x_test, batch_size=32, verbose=False) - out_eval = model.evaluate(x_test, y_test, batch_size=32, verbose=False) + # Note: you have to use shuffle='batch' or False with HDF5Matrix + model.fit(x_train, y_train, batch_size=32, shuffle='batch', verbose=False) + # test that evalutation and prediction + # don't crash and return reasonable results + out_pred = model.predict(x_test, batch_size=32, verbose=False) + out_eval = model.evaluate(x_test, y_test, batch_size=32, verbose=False) - self.assertEqual(out_pred.shape, (50, 1)) - self.assertEqual(out_eval.shape, ()) - self.assertGreater(out_eval, 0) + self.assertEqual(out_pred.shape, (50, 1)) + self.assertEqual(out_eval.shape, ()) + self.assertGreater(out_eval, 0) if __name__ == '__main__': diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py index 598d9aee02a..da16bf6ce64 100644 --- a/tensorflow/contrib/layers/python/layers/feature_column.py +++ b/tensorflow/contrib/layers/python/layers/feature_column.py @@ -2559,10 +2559,10 @@ def _create_sequence_feature_spec_for_parsing(sequence_feature_columns, feature_spec = create_feature_spec_for_parsing(sequence_feature_columns) sequence_feature_spec = {} for key, feature in feature_spec.items(): - if (isinstance(feature, parsing_ops.VarLenFeature) or - isinstance(feature, parsing_ops.FixedLenSequenceFeature)): + if isinstance(feature, parsing_ops.VarLenFeature): sequence_feature = feature - elif isinstance(feature, parsing_ops.FixedLenFeature): + elif (isinstance(feature, parsing_ops.FixedLenFeature) or + isinstance(feature, parsing_ops.FixedLenSequenceFeature)): default_is_set = feature.default_value is not None if default_is_set: logging.warning( diff --git a/tensorflow/contrib/layers/python/layers/feature_column_test.py b/tensorflow/contrib/layers/python/layers/feature_column_test.py index 21ab9867102..ab65e47af88 100644 --- a/tensorflow/contrib/layers/python/layers/feature_column_test.py +++ b/tensorflow/contrib/layers/python/layers/feature_column_test.py @@ -912,8 +912,7 @@ class FeatureColumnTest(test.TestCase): parsing_ops.VarLenFeature(dtype=dtypes.float32), "real_valued_var_len_dense_column": parsing_ops.FixedLenSequenceFeature( - shape=[], dtype=dtypes.float32, allow_missing=True, - default_value=4.0), + shape=[], dtype=dtypes.float32, allow_missing=True), } self.assertDictEqual(expected_feature_spec, feature_spec) diff --git a/tensorflow/contrib/layers/python/layers/optimizers.py b/tensorflow/contrib/layers/python/layers/optimizers.py index 7eb410b4c72..33db93b9704 100644 --- a/tensorflow/contrib/layers/python/layers/optimizers.py +++ b/tensorflow/contrib/layers/python/layers/optimizers.py @@ -156,9 +156,9 @@ def optimize_loss(loss, loss = ops.convert_to_tensor(loss) contrib_framework.assert_scalar(loss) if global_step is None: - global_step = 
contrib_framework.get_global_step() + global_step = train.get_global_step() else: - contrib_framework.assert_global_step(global_step) + train.assert_global_step(global_step) with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]): # Update ops take UPDATE_OPS collection if not provided. if update_ops is None: diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD index 6fbe204ec6d..db3be9a991b 100644 --- a/tensorflow/contrib/learn/BUILD +++ b/tensorflow/contrib/learn/BUILD @@ -36,6 +36,7 @@ py_library( "//tensorflow/contrib/rnn:rnn_py", "//tensorflow/contrib/session_bundle:exporter", "//tensorflow/contrib/session_bundle:gc", + "//tensorflow/contrib/tpu:tpu_estimator", "//tensorflow/contrib/training:training_py", "//tensorflow/core:protos_all_py", "//tensorflow/python:array_ops", diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py index c31d5d2d47d..225d8796785 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/head.py +++ b/tensorflow/contrib/learn/python/learn/estimators/head.py @@ -24,7 +24,6 @@ import six from tensorflow.contrib import framework as framework_lib from tensorflow.contrib import layers as layers_lib -from tensorflow.contrib import lookup as lookup_lib from tensorflow.contrib.learn.python.learn.estimators import constants from tensorflow.contrib.learn.python.learn.estimators import model_fn from tensorflow.contrib.learn.python.learn.estimators import prediction_key @@ -35,6 +34,7 @@ from tensorflow.python.framework import sparse_tensor from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import logging_ops +from tensorflow.python.ops import lookup_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import metrics as metrics_lib from tensorflow.python.ops import nn @@ -1070,9 +1070,8 @@ class _MultiClassHead(_SingleHead): labels_tensor = _to_labels_tensor(labels, self._label_name) _check_no_sparse_tensor(labels_tensor) if self._label_keys: - table = lookup_lib.string_to_index_table_from_tensor( - mapping=self._label_keys, - name="label_id_lookup") + table = lookup_ops.index_table_from_tensor(self._label_keys, + name="label_id_lookup") return { "labels": labels_tensor, "label_ids": table.lookup(labels_tensor), @@ -1106,9 +1105,8 @@ class _MultiClassHead(_SingleHead): class_ids = math_ops.argmax( logits, 1, name=prediction_key.PredictionKey.CLASSES) if self._label_keys: - table = lookup_lib.index_to_string_table_from_tensor( - mapping=self._label_keys, - name="class_string_lookup") + table = lookup_ops.index_to_string_table_from_tensor( + self._label_keys, name="class_string_lookup") classes = table.lookup(class_ids) else: classes = class_ids diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py index c35a493086e..627d4991f03 100644 --- a/tensorflow/contrib/learn/python/learn/experiment.py +++ b/tensorflow/contrib/learn/python/learn/experiment.py @@ -33,6 +33,7 @@ from tensorflow.contrib.learn.python.learn import export_strategy from tensorflow.contrib.learn.python.learn import monitors from tensorflow.contrib.learn.python.learn import trainable from tensorflow.contrib.learn.python.learn.estimators import run_config +from tensorflow.contrib.tpu.python.tpu import tpu_estimator from tensorflow.python.estimator import estimator as core_estimator from tensorflow.python.framework import ops from 
tensorflow.python.platform import tf_logging as logging @@ -221,6 +222,14 @@ class Experiment(object): "`estimator` must implement `tf.contrib.learn.Trainable`" "or `tf.estimator.`Estimator`.") + if isinstance(estimator, tpu_estimator.TPUEstimator): + raise ValueError( + "`Experiment` class cannot work with `tf.contrib.tpu.TPUEstimator`. " + "Please call `TPUEstimator` train/evaluate directly. \n" + "Details: `Experiment` class is designed for between-graph " + "distributed training, while `TPUEstimator` is working in in-graph " + "distributed mode.") + super(Experiment, self).__init__() # Immutable fields. self._estimator = estimator diff --git a/tensorflow/contrib/learn/python/learn/experiment_test.py b/tensorflow/contrib/learn/python/learn/experiment_test.py index fe40d27c445..2c68edbb34b 100644 --- a/tensorflow/contrib/learn/python/learn/experiment_test.py +++ b/tensorflow/contrib/learn/python/learn/experiment_test.py @@ -32,6 +32,8 @@ from tensorflow.contrib.learn.python.learn.estimators import dnn from tensorflow.contrib.learn.python.learn.estimators import run_config as run_config_lib from tensorflow.contrib.learn.python.learn.estimators import test_data from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils +from tensorflow.contrib.tpu.python.tpu import tpu_config +from tensorflow.contrib.tpu.python.tpu import tpu_estimator from tensorflow.core.protobuf import config_pb2 from tensorflow.python.client import session from tensorflow.python.estimator import estimator as core_estimator @@ -935,6 +937,20 @@ class ExperimentTest(test.TestCase): self.assertEqual(ex._maybe_export.call_count, 4) self.assertEqual(ex._call_evaluate.call_count, 4) + def test_fail_with_tpu_estimator(self): + def dummy_model_fn(features, labels): + del features, labels # unused + + with self.assertRaisesRegexp( + ValueError, + '`Experiment` class cannot work with `tf.contrib.tpu.TPUEstimator`'): + experiment.Experiment( + tpu_estimator.TPUEstimator(model_fn=dummy_model_fn, + config=tpu_config.RunConfig(), + train_batch_size=256), + train_input_fn='train_input', + eval_input_fn='eval_input') + if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py index bafde464afb..4c50d40aaa9 100644 --- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py +++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py @@ -28,7 +28,6 @@ import six from six.moves import xrange # pylint: disable=redefined-builtin from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.platform import tf_logging as logging @@ -44,7 +43,7 @@ def _get_in_out_shape(x_shape, y_shape, n_classes, batch_size=None): x_is_dict, y_is_dict = isinstance( x_shape, dict), y_shape is not None and isinstance(y_shape, dict) if y_is_dict and n_classes is not None: - assert (isinstance(n_classes, dict)) + assert isinstance(n_classes, dict) if batch_size is None: batch_size = list(x_shape.values())[0][0] if x_is_dict else x_shape[0] @@ -322,10 +321,12 @@ class DataFeeder(object): self._x = dict([(k, check_array(v, v.dtype)) for k, v in list(x.items()) ]) if x_is_dict else check_array(x, x.dtype) - self._y = None if y is None else \ - dict([(k, check_array(v, v.dtype)) for k, v in list(y.items())]) if y_is_dict else check_array(y, y.dtype) + self._y = None if y is None else ( + dict([(k, check_array(v, 
v.dtype)) for k, v in list(y.items())]) + if y_is_dict else check_array(y, y.dtype)) - # self.n_classes is not None means we're converting raw target indices to one-hot. + # self.n_classes is not None means we're converting raw target indices + # to one-hot. if n_classes is not None: if not y_is_dict: y_dtype = (np.int64 @@ -344,12 +345,15 @@ class DataFeeder(object): x_shape, y_shape, n_classes, batch_size) # Input dtype matches dtype of x. - self._input_dtype = dict([(k, _check_dtype(v.dtype)) for k, v in list(self._x.items())]) if x_is_dict \ - else _check_dtype(self._x.dtype) + self._input_dtype = ( + dict([(k, _check_dtype(v.dtype)) for k, v in list(self._x.items())]) + if x_is_dict else _check_dtype(self._x.dtype)) - # note: self._output_dtype = np.float32 when y is None - self._output_dtype = dict([(k, _check_dtype(v.dtype)) for k, v in list(self._y.items())]) if y_is_dict \ - else _check_dtype(self._y.dtype) if y is not None else np.float32 + # self._output_dtype == np.float32 when y is None + self._output_dtype = ( + dict([(k, _check_dtype(v.dtype)) for k, v in list(self._y.items())]) + if y_is_dict else ( + _check_dtype(self._y.dtype) if y is not None else np.float32)) # self.n_classes is None means we're passing in raw target indices if n_classes is not None and y_is_dict: diff --git a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py index 1e68a3ef660..676e1f2b51c 100644 --- a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py +++ b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Utilities supporting export to SavedModel. Some contents of this file are moved to tensorflow/python/estimator/export.py: @@ -39,6 +38,7 @@ import time from tensorflow.contrib.layers.python.layers import feature_column from tensorflow.contrib.learn.python.learn import export_strategy from tensorflow.contrib.learn.python.learn.estimators import constants +from tensorflow.contrib.learn.python.learn.estimators import metric_key from tensorflow.contrib.learn.python.learn.estimators import prediction_key from tensorflow.contrib.learn.python.learn.utils import gc from tensorflow.contrib.learn.python.learn.utils import input_fn_utils @@ -75,8 +75,8 @@ FEATURES_INPUT_ALTERNATIVE_KEY = 'features_input_alternative' _FALLBACK_DEFAULT_OUTPUT_ALTERNATIVE_KEY = 'default_output_alternative' -def build_standardized_signature_def( - input_tensors, output_tensors, problem_type): +def build_standardized_signature_def(input_tensors, output_tensors, + problem_type): """Build a SignatureDef using problem type and input and output Tensors. 
Note that this delegates the actual creation of the signatures to methods in @@ -116,8 +116,8 @@ def build_standardized_signature_def( (_, predictions), = output_tensors.items() return signature_def_utils.regression_signature_def(examples, predictions) else: - return signature_def_utils.predict_signature_def( - input_tensors, output_tensors) + return signature_def_utils.predict_signature_def(input_tensors, + output_tensors) def _get_classification_scores(output_tensors): @@ -139,17 +139,15 @@ def _is_classification_problem(problem_type, input_tensors, output_tensors): classes = _get_classification_classes(output_tensors) scores = _get_classification_scores(output_tensors) return ((problem_type == constants.ProblemType.CLASSIFICATION or - problem_type == constants.ProblemType.LOGISTIC_REGRESSION) - and len(input_tensors) == 1 - and (classes is not None or - scores is not None or - len(output_tensors) == 1)) + problem_type == constants.ProblemType.LOGISTIC_REGRESSION) and + len(input_tensors) == 1 and + (classes is not None or scores is not None or + len(output_tensors) == 1)) def _is_regression_problem(problem_type, input_tensors, output_tensors): - return (problem_type == constants.ProblemType.LINEAR_REGRESSION - and len(input_tensors) == 1 - and len(output_tensors) == 1) + return (problem_type == constants.ProblemType.LINEAR_REGRESSION and + len(input_tensors) == 1 and len(output_tensors) == 1) def get_input_alternatives(input_ops): @@ -177,9 +175,7 @@ def get_input_alternatives(input_ops): return input_alternatives, features -def get_output_alternatives( - model_fn_ops, - default_output_alternative_key=None): +def get_output_alternatives(model_fn_ops, default_output_alternative_key=None): """Obtain all output alternatives using the model_fn output and heuristics. 
Args: @@ -218,8 +214,10 @@ def get_output_alternatives( default_outputs = {prediction_key.PredictionKey.GENERIC: default_outputs} actual_default_output_alternative_key = ( _FALLBACK_DEFAULT_OUTPUT_ALTERNATIVE_KEY) - output_alternatives = {actual_default_output_alternative_key: - (default_problem_type, default_outputs)} + output_alternatives = { + actual_default_output_alternative_key: (default_problem_type, + default_outputs) + } return output_alternatives, actual_default_output_alternative_key if default_output_alternative_key: @@ -246,13 +244,12 @@ def build_all_signature_defs(input_alternatives, output_alternatives, actual_default_output_alternative_key): """Build `SignatureDef`s from all pairs of input and output alternatives.""" - signature_def_map = { - ('%s:%s' % (input_key, output_key or 'None')): - build_standardized_signature_def( - inputs, outputs, problem_type) - for input_key, inputs in input_alternatives.items() - for output_key, (problem_type, outputs) - in output_alternatives.items()} + signature_def_map = {('%s:%s' % (input_key, output_key or 'None')): + build_standardized_signature_def(inputs, outputs, + problem_type) + for input_key, inputs in input_alternatives.items() + for output_key, (problem_type, + outputs) in output_alternatives.items()} # Add the default SignatureDef default_inputs = input_alternatives.get(DEFAULT_INPUT_ALTERNATIVE_KEY) @@ -263,8 +260,8 @@ def build_all_signature_defs(input_alternatives, output_alternatives, (default_problem_type, default_outputs) = ( output_alternatives[actual_default_output_alternative_key]) signature_def_map[signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] = ( - build_standardized_signature_def( - default_inputs, default_outputs, default_problem_type)) + build_standardized_signature_def(default_inputs, default_outputs, + default_problem_type)) return signature_def_map @@ -308,9 +305,8 @@ def get_timestamped_export_dir(export_dir_base): return export_dir time.sleep(1) attempts += 1 - logging.warn( - 'Export directory {} already exists; retrying (attempt {}/{})'.format( - export_dir, attempts, MAX_DIRECTORY_CREATION_ATTEMPTS)) + logging.warn('Export directory {} already exists; retrying (attempt {}/{})'. + format(export_dir, attempts, MAX_DIRECTORY_CREATION_ATTEMPTS)) raise RuntimeError('Failed to obtain a unique export directory name after ' '{} attempts.'.format(MAX_DIRECTORY_CREATION_ATTEMPTS)) @@ -330,8 +326,7 @@ def get_temp_export_dir(timestamped_export_dir): """ (dirname, basename) = os.path.split(timestamped_export_dir) temp_export_dir = os.path.join( - compat.as_bytes(dirname), - compat.as_bytes('temp-{}'.format(basename))) + compat.as_bytes(dirname), compat.as_bytes('temp-{}'.format(basename))) return temp_export_dir @@ -357,8 +352,8 @@ def get_most_recent_export(export_dir_base): A gc.Path, with is just a namedtuple of (path, export_version). 
""" select_filter = gc.largest_export_versions(1) - results = select_filter(gc.get_paths(export_dir_base, - parser=_export_version_parser)) + results = select_filter( + gc.get_paths(export_dir_base, parser=_export_version_parser)) return next(iter(results or []), None) @@ -378,8 +373,8 @@ def garbage_collect_exports(export_dir_base, exports_to_keep): keep_filter = gc.largest_export_versions(exports_to_keep) delete_filter = gc.negation(keep_filter) - for p in delete_filter(gc.get_paths(export_dir_base, - parser=_export_version_parser)): + for p in delete_filter( + gc.get_paths(export_dir_base, parser=_export_version_parser)): try: gfile.DeleteRecursively(p.path) except errors_impl.NotFoundError as e: @@ -416,10 +411,7 @@ def make_export_strategy(serving_input_fn, An ExportStrategy that can be passed to the Experiment constructor. """ - def export_fn(estimator, - export_dir_base, - checkpoint_path=None - ): + def export_fn(estimator, export_dir_base, checkpoint_path=None): """Exports the given Estimator as a SavedModel. Args: @@ -512,3 +504,128 @@ def make_parsing_export_strategy(feature_columns, assets_extra=assets_extra, as_text=as_text, exports_to_keep=exports_to_keep) + + +def _default_compare_fn(curr_best_eval_result, cand_eval_result): + """Compares two evaluation results and returns true if the 2nd one is better. + + Both evaluation results should have the values for MetricKey.LOSS, which are + used for comparison. + + Args: + curr_best_eval_result: current best eval metrics. + cand_eval_result: candidate eval metrics. + + Returns: + True if cand_eval_result is better. + + Raises: + ValueError: If input eval result is None or no loss is available. + """ + default_key = metric_key.MetricKey.LOSS + if not curr_best_eval_result or default_key not in curr_best_eval_result: + raise ValueError( + 'curr_best_eval_result cannot be empty or no loss is found in it.') + + if not cand_eval_result or default_key not in cand_eval_result: + raise ValueError( + 'cand_eval_result cannot be empty or no loss is found in it.') + + return curr_best_eval_result[default_key] > cand_eval_result[default_key] + + +class BestModelSelector(object): + """A helper that keeps track of export selection candidates.""" + + def __init__(self, compare_fn=None): + """Constructor of this class. + + Args: + compare_fn: a function that returns true if the candidate is better than + the current best model. + """ + self._best_eval_result = None + self._compare_fn = compare_fn or _default_compare_fn + + def update(self, checkpoint_path, eval_result): + """Records a given checkpoint and exports if this is the best model. + + Args: + checkpoint_path: the checkpoint path to export. + eval_result: a dictionary which is usually generated in evaluation runs. + By default, eval_results contains 'loss' field. + + Returns: + A string representing the path to the checkpoint to be exported. + A dictionary of the same type of eval_result. + + Raises: + ValueError: if checkpoint path is empty. + ValueError: if eval_results is None object. 
+ """ + if not checkpoint_path: + raise ValueError('Checkpoint path is empty.') + if eval_result is None: + raise ValueError('%s has empty evaluation results.', checkpoint_path) + + if (self._best_eval_result is None or + self._compare_fn(self._best_eval_result, eval_result)): + self._best_eval_result = eval_result + return checkpoint_path, eval_result + else: + return '', None + + +def make_best_model_export_strategy(serving_input_fn, + exports_to_keep=1, + compare_fn=None, + default_output_alternative_key=None): + """Creates an custom ExportStrategy for use with tf.contrib.learn.Experiment. + + Args: + serving_input_fn: a function that takes no arguments and returns an + `InputFnOps`. + exports_to_keep: an integer indicating how many historical best models need + to be preserved. + compare_fn: a function that select the 'best' candidate from a dictionary + of evaluation result keyed by corresponding checkpoint path. + default_output_alternative_key: the key for default serving signature for + multi-headed inference graphs. + + Returns: + An ExportStrategy that can be passed to the Experiment constructor. + """ + best_model_export_strategy = make_export_strategy( + serving_input_fn, + exports_to_keep=exports_to_keep, + default_output_alternative_key=default_output_alternative_key) + + best_model_selector = BestModelSelector(compare_fn) + + def export_fn(estimator, export_dir_base, checkpoint_path, eval_result=None): + """Exports the given Estimator as a SavedModel. + + Args: + estimator: the Estimator to export. + export_dir_base: A string containing a directory to write the exported + graph and checkpoints. + checkpoint_path: The checkpoint path to export. If None (the default), + the most recent checkpoint found within the model directory is chosen. + eval_result: placehold args matching the call signature of ExportStrategy. + + Returns: + The string path to the exported directory. 
+ """ + + export_checkpoint_path, export_eval_result = best_model_selector.update( + checkpoint_path, eval_result) + + if export_checkpoint_path and export_eval_result is not None: + checkpoint_base = os.path.basename(export_checkpoint_path) + export_dir = os.path.join(export_dir_base, checkpoint_base) + return best_model_export_strategy.export( + estimator, export_dir, export_checkpoint_path, export_eval_result) + else: + return '' + + return export_strategy.ExportStrategy('best_model', export_fn) diff --git a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py index 9e778ab72ad..66bca9c0f53 100644 --- a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py +++ b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py @@ -24,6 +24,7 @@ import time from tensorflow.contrib.layers.python.layers import feature_column as fc from tensorflow.contrib.learn.python.learn import export_strategy as export_strategy_lib from tensorflow.contrib.learn.python.learn.estimators import constants +from tensorflow.contrib.learn.python.learn.estimators import estimator as core_estimator from tensorflow.contrib.learn.python.learn.estimators import model_fn from tensorflow.contrib.learn.python.learn.utils import input_fn_utils from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils @@ -40,18 +41,43 @@ from tensorflow.python.saved_model import signature_def_utils from tensorflow.python.util import compat +class TestEstimator(core_estimator.Estimator): + + def __init__(self, *args, **kwargs): + super(TestEstimator, self).__init__(*args, **kwargs) + self.last_exported_checkpoint = "" + self.last_exported_dir = "" + + # @Override + def export_savedmodel(self, + export_dir, + serving_input_fn, + default_output_alternative_key=None, + assets_extra=None, + as_text=False, + checkpoint_path=None): + + if not os.path.exists(export_dir): + os.makedirs(export_dir) + + open(os.path.join(export_dir, "placeholder.txt"), "a").close() + + self.last_exported_checkpoint = checkpoint_path + self.last_exported_dir = export_dir + + return export_dir + + class SavedModelExportUtilsTest(test.TestCase): def test_build_standardized_signature_def_regression(self): input_tensors = { "input-1": - array_ops.placeholder( - dtypes.float32, 1, name="input-tensor-1") + array_ops.placeholder(dtypes.float32, 1, name="input-tensor-1") } output_tensors = { "output-1": - array_ops.placeholder( - dtypes.float32, 1, name="output-tensor-1") + array_ops.placeholder(dtypes.float32, 1, name="output-tensor-1") } problem_type = constants.ProblemType.LINEAR_REGRESSION actual_signature_def = ( @@ -61,10 +87,9 @@ class SavedModelExportUtilsTest(test.TestCase): shape = tensor_shape_pb2.TensorShapeProto( dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)]) dtype = types_pb2.DataType.Value("DT_FLOAT") - expected_signature_def.inputs[ - signature_constants.REGRESS_INPUTS].CopyFrom( - meta_graph_pb2.TensorInfo( - name="input-tensor-1:0", dtype=dtype, tensor_shape=shape)) + expected_signature_def.inputs[signature_constants.REGRESS_INPUTS].CopyFrom( + meta_graph_pb2.TensorInfo( + name="input-tensor-1:0", dtype=dtype, tensor_shape=shape)) expected_signature_def.outputs[ signature_constants.REGRESS_OUTPUTS].CopyFrom( meta_graph_pb2.TensorInfo( @@ -77,13 +102,11 @@ class SavedModelExportUtilsTest(test.TestCase): """Tests classification with one output tensor.""" input_tensors = { "input-1": - 
array_ops.placeholder( - dtypes.float32, 1, name="input-tensor-1") + array_ops.placeholder(dtypes.float32, 1, name="input-tensor-1") } output_tensors = { "output-1": - array_ops.placeholder( - dtypes.string, 1, name="output-tensor-1") + array_ops.placeholder(dtypes.string, 1, name="output-tensor-1") } problem_type = constants.ProblemType.CLASSIFICATION actual_signature_def = ( @@ -94,14 +117,14 @@ class SavedModelExportUtilsTest(test.TestCase): dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)]) dtype_float = types_pb2.DataType.Value("DT_FLOAT") dtype_string = types_pb2.DataType.Value("DT_STRING") - expected_signature_def.inputs[ - signature_constants.CLASSIFY_INPUTS].CopyFrom( - meta_graph_pb2.TensorInfo( - name="input-tensor-1:0", dtype=dtype_float, tensor_shape=shape)) + expected_signature_def.inputs[signature_constants.CLASSIFY_INPUTS].CopyFrom( + meta_graph_pb2.TensorInfo( + name="input-tensor-1:0", dtype=dtype_float, tensor_shape=shape)) expected_signature_def.outputs[ signature_constants.CLASSIFY_OUTPUT_CLASSES].CopyFrom( meta_graph_pb2.TensorInfo( - name="output-tensor-1:0", dtype=dtype_string, + name="output-tensor-1:0", + dtype=dtype_string, tensor_shape=shape)) expected_signature_def.method_name = ( @@ -112,8 +135,7 @@ class SavedModelExportUtilsTest(test.TestCase): """Tests multiple output tensors that include classes and probabilities.""" input_tensors = { "input-1": - array_ops.placeholder( - dtypes.float32, 1, name="input-tensor-1") + array_ops.placeholder(dtypes.float32, 1, name="input-tensor-1") } output_tensors = { "classes": @@ -136,19 +158,20 @@ class SavedModelExportUtilsTest(test.TestCase): dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)]) dtype_float = types_pb2.DataType.Value("DT_FLOAT") dtype_string = types_pb2.DataType.Value("DT_STRING") - expected_signature_def.inputs[ - signature_constants.CLASSIFY_INPUTS].CopyFrom( - meta_graph_pb2.TensorInfo( - name="input-tensor-1:0", dtype=dtype_float, tensor_shape=shape)) + expected_signature_def.inputs[signature_constants.CLASSIFY_INPUTS].CopyFrom( + meta_graph_pb2.TensorInfo( + name="input-tensor-1:0", dtype=dtype_float, tensor_shape=shape)) expected_signature_def.outputs[ signature_constants.CLASSIFY_OUTPUT_CLASSES].CopyFrom( meta_graph_pb2.TensorInfo( - name="output-tensor-classes:0", dtype=dtype_string, + name="output-tensor-classes:0", + dtype=dtype_string, tensor_shape=shape)) expected_signature_def.outputs[ signature_constants.CLASSIFY_OUTPUT_SCORES].CopyFrom( meta_graph_pb2.TensorInfo( - name="output-tensor-proba:0", dtype=dtype_float, + name="output-tensor-proba:0", + dtype=dtype_float, tensor_shape=shape)) expected_signature_def.method_name = ( @@ -159,8 +182,7 @@ class SavedModelExportUtilsTest(test.TestCase): """Tests multiple output tensors that include classes and scores.""" input_tensors = { "input-1": - array_ops.placeholder( - dtypes.float32, 1, name="input-tensor-1") + array_ops.placeholder(dtypes.float32, 1, name="input-tensor-1") } output_tensors = { "classes": @@ -182,19 +204,20 @@ class SavedModelExportUtilsTest(test.TestCase): dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)]) dtype_float = types_pb2.DataType.Value("DT_FLOAT") dtype_string = types_pb2.DataType.Value("DT_STRING") - expected_signature_def.inputs[ - signature_constants.CLASSIFY_INPUTS].CopyFrom( - meta_graph_pb2.TensorInfo( - name="input-tensor-1:0", dtype=dtype_float, tensor_shape=shape)) + expected_signature_def.inputs[signature_constants.CLASSIFY_INPUTS].CopyFrom( + meta_graph_pb2.TensorInfo( + name="input-tensor-1:0", 
dtype=dtype_float, tensor_shape=shape)) expected_signature_def.outputs[ signature_constants.CLASSIFY_OUTPUT_CLASSES].CopyFrom( meta_graph_pb2.TensorInfo( - name="output-tensor-classes:0", dtype=dtype_string, + name="output-tensor-classes:0", + dtype=dtype_string, tensor_shape=shape)) expected_signature_def.outputs[ signature_constants.CLASSIFY_OUTPUT_SCORES].CopyFrom( meta_graph_pb2.TensorInfo( - name="output-tensor-scores:0", dtype=dtype_float, + name="output-tensor-scores:0", + dtype=dtype_float, tensor_shape=shape)) expected_signature_def.method_name = ( @@ -205,8 +228,7 @@ class SavedModelExportUtilsTest(test.TestCase): """Tests classification without classes tensor.""" input_tensors = { "input-1": - array_ops.placeholder( - dtypes.float32, 1, name="input-tensor-1") + array_ops.placeholder(dtypes.float32, 1, name="input-tensor-1") } output_tensors = { "probabilities": @@ -224,14 +246,14 @@ class SavedModelExportUtilsTest(test.TestCase): shape = tensor_shape_pb2.TensorShapeProto( dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)]) dtype_float = types_pb2.DataType.Value("DT_FLOAT") - expected_signature_def.inputs[ - signature_constants.CLASSIFY_INPUTS].CopyFrom( - meta_graph_pb2.TensorInfo( - name="input-tensor-1:0", dtype=dtype_float, tensor_shape=shape)) + expected_signature_def.inputs[signature_constants.CLASSIFY_INPUTS].CopyFrom( + meta_graph_pb2.TensorInfo( + name="input-tensor-1:0", dtype=dtype_float, tensor_shape=shape)) expected_signature_def.outputs[ signature_constants.CLASSIFY_OUTPUT_SCORES].CopyFrom( meta_graph_pb2.TensorInfo( - name="output-tensor-proba:0", dtype=dtype_float, + name="output-tensor-proba:0", + dtype=dtype_float, tensor_shape=shape)) expected_signature_def.method_name = ( @@ -246,8 +268,7 @@ class SavedModelExportUtilsTest(test.TestCase): """ input_tensors = { "input-1": - array_ops.placeholder( - dtypes.float32, 1, name="input-tensor-1") + array_ops.placeholder(dtypes.float32, 1, name="input-tensor-1") } output_tensors = { "classes": @@ -268,14 +289,14 @@ class SavedModelExportUtilsTest(test.TestCase): shape = tensor_shape_pb2.TensorShapeProto( dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)]) dtype_float = types_pb2.DataType.Value("DT_FLOAT") - expected_signature_def.inputs[ - signature_constants.CLASSIFY_INPUTS].CopyFrom( - meta_graph_pb2.TensorInfo( - name="input-tensor-1:0", dtype=dtype_float, tensor_shape=shape)) + expected_signature_def.inputs[signature_constants.CLASSIFY_INPUTS].CopyFrom( + meta_graph_pb2.TensorInfo( + name="input-tensor-1:0", dtype=dtype_float, tensor_shape=shape)) expected_signature_def.outputs[ signature_constants.CLASSIFY_OUTPUT_SCORES].CopyFrom( meta_graph_pb2.TensorInfo( - name="output-tensor-scores:0", dtype=dtype_float, + name="output-tensor-scores:0", + dtype=dtype_float, tensor_shape=shape)) expected_signature_def.method_name = ( @@ -290,8 +311,7 @@ class SavedModelExportUtilsTest(test.TestCase): """ input_tensors = { "input-1": - array_ops.placeholder( - dtypes.float32, 1, name="input-tensor-1") + array_ops.placeholder(dtypes.float32, 1, name="input-tensor-1") } output_tensors = { "classes": @@ -310,17 +330,18 @@ class SavedModelExportUtilsTest(test.TestCase): dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)]) dtype_int64 = types_pb2.DataType.Value("DT_INT64") dtype_float = types_pb2.DataType.Value("DT_FLOAT") - expected_signature_def.inputs[ - "input-1"].CopyFrom( - meta_graph_pb2.TensorInfo( - name="input-tensor-1:0", dtype=dtype_float, tensor_shape=shape)) + expected_signature_def.inputs["input-1"].CopyFrom( + 
meta_graph_pb2.TensorInfo( + name="input-tensor-1:0", dtype=dtype_float, tensor_shape=shape)) expected_signature_def.outputs["classes"].CopyFrom( meta_graph_pb2.TensorInfo( - name="output-tensor-classes:0", dtype=dtype_int64, + name="output-tensor-classes:0", + dtype=dtype_int64, tensor_shape=shape)) expected_signature_def.outputs["logits"].CopyFrom( meta_graph_pb2.TensorInfo( - name="output-tensor-logits:0", dtype=dtype_float, + name="output-tensor-logits:0", + dtype=dtype_float, tensor_shape=shape)) expected_signature_def.method_name = ( @@ -379,8 +400,9 @@ class SavedModelExportUtilsTest(test.TestCase): def test_get_output_alternatives_single_no_default(self): prediction_tensor = constant_op.constant(["bogus"]) provided_output_alternatives = { - "head-1": (constants.ProblemType.LINEAR_REGRESSION, - {"output": prediction_tensor}), + "head-1": (constants.ProblemType.LINEAR_REGRESSION, { + "output": prediction_tensor + }), } model_fn_ops = model_fn.ModelFnOps( model_fn.ModeKeys.INFER, @@ -390,10 +412,11 @@ class SavedModelExportUtilsTest(test.TestCase): output_alternatives, _ = saved_model_export_utils.get_output_alternatives( model_fn_ops) - self.assertEqual({"head-1": - (constants.ProblemType.LINEAR_REGRESSION, - {"output": prediction_tensor})}, - output_alternatives) + self.assertEqual({ + "head-1": (constants.ProblemType.LINEAR_REGRESSION, { + "output": prediction_tensor + }) + }, output_alternatives) def test_get_output_alternatives_multi_no_default(self): provided_output_alternatives = { @@ -424,10 +447,11 @@ class SavedModelExportUtilsTest(test.TestCase): output_alternatives, _ = saved_model_export_utils.get_output_alternatives( model_fn_ops) - self.assertEqual( - {"default_output_alternative": (constants.ProblemType.UNSPECIFIED, { - "some_output": prediction_tensor})}, - output_alternatives) + self.assertEqual({ + "default_output_alternative": (constants.ProblemType.UNSPECIFIED, { + "some_output": prediction_tensor + }) + }, output_alternatives) def test_get_output_alternatives_empty_provided_with_default(self): prediction_tensor = constant_op.constant(["bogus"]) @@ -452,10 +476,11 @@ class SavedModelExportUtilsTest(test.TestCase): output_alternatives, _ = saved_model_export_utils.get_output_alternatives( model_fn_ops) - self.assertEqual( - {"default_output_alternative": (constants.ProblemType.UNSPECIFIED, { - "some_output": prediction_tensor})}, - output_alternatives) + self.assertEqual({ + "default_output_alternative": (constants.ProblemType.UNSPECIFIED, { + "some_output": prediction_tensor + }) + }, output_alternatives) def test_get_output_alternatives_implicit_single(self): prediction_tensor = constant_op.constant(["bogus"]) @@ -506,14 +531,14 @@ class SavedModelExportUtilsTest(test.TestCase): expected_signature_defs = { "serving_default": - signature_def_utils.regression_signature_def(input_example, - output_1), + signature_def_utils.regression_signature_def( + input_example, output_1), "default_input_alternative:head-1": - signature_def_utils.regression_signature_def(input_example, - output_1), + signature_def_utils.regression_signature_def( + input_example, output_1), "default_input_alternative:head-2": - signature_def_utils.classification_signature_def(input_example, - output_2, None), + signature_def_utils.classification_signature_def( + input_example, output_2, None), "default_input_alternative:head-3": signature_def_utils.predict_signature_def({ "default input": input_example @@ -624,17 +649,20 @@ class SavedModelExportUtilsTest(test.TestCase): (most_recent_export_dir, 
most_recent_export_version) = ( saved_model_export_utils.get_most_recent_export(export_dir_base)) - self.assertEqual(compat.as_bytes(export_dir_4), - compat.as_bytes(most_recent_export_dir)) - self.assertEqual(compat.as_bytes(export_dir_4), - os.path.join(compat.as_bytes(export_dir_base), - compat.as_bytes( - str(most_recent_export_version)))) + self.assertEqual( + compat.as_bytes(export_dir_4), compat.as_bytes(most_recent_export_dir)) + self.assertEqual( + compat.as_bytes(export_dir_4), + os.path.join( + compat.as_bytes(export_dir_base), + compat.as_bytes(str(most_recent_export_version)))) def test_make_export_strategy(self): """Only tests that an ExportStrategy instance is created.""" + def _serving_input_fn(): return array_ops.constant([1]), None + export_strategy = saved_model_export_utils.make_export_strategy( serving_input_fn=_serving_input_fn, default_output_alternative_key="default", @@ -655,14 +683,61 @@ class SavedModelExportUtilsTest(test.TestCase): real_valued_col1 = fc.real_valued_column("real_valued_column1") bucketized_col1 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4]) - feature_columns = [sparse_col, embedding_col, real_valued_col1, - bucketized_col1] + feature_columns = [ + sparse_col, embedding_col, real_valued_col1, bucketized_col1 + ] export_strategy = saved_model_export_utils.make_parsing_export_strategy( feature_columns=feature_columns) self.assertTrue( isinstance(export_strategy, export_strategy_lib.ExportStrategy)) + def test_make_best_model_export_strategy(self): + export_dir_base = tempfile.mkdtemp() + "export/" + gfile.MkDir(export_dir_base) + + test_estimator = TestEstimator() + export_strategy = saved_model_export_utils.make_best_model_export_strategy( + serving_input_fn=None, exports_to_keep=3, compare_fn=None) + + self.assertNotEqual("", + export_strategy.export(test_estimator, export_dir_base, + "fake_ckpt_0", {"loss": 100})) + self.assertNotEqual("", test_estimator.last_exported_dir) + self.assertNotEqual("", test_estimator.last_exported_checkpoint) + + self.assertEqual("", + export_strategy.export(test_estimator, export_dir_base, + "fake_ckpt_1", {"loss": 101})) + self.assertEqual(test_estimator.last_exported_dir, + os.path.join(export_dir_base, "fake_ckpt_0")) + + self.assertNotEqual("", + export_strategy.export(test_estimator, export_dir_base, + "fake_ckpt_2", {"loss": 10})) + self.assertEqual(test_estimator.last_exported_dir, + os.path.join(export_dir_base, "fake_ckpt_2")) + + self.assertEqual("", + export_strategy.export(test_estimator, export_dir_base, + "fake_ckpt_3", {"loss": 20})) + self.assertEqual(test_estimator.last_exported_dir, + os.path.join(export_dir_base, "fake_ckpt_2")) + + def test_make_best_model_export_strategy_exceptions(self): + export_dir_base = tempfile.mkdtemp() + "export/" + + test_estimator = TestEstimator() + export_strategy = saved_model_export_utils.make_best_model_export_strategy( + serving_input_fn=None, exports_to_keep=3, compare_fn=None) + + with self.assertRaises(ValueError): + export_strategy.export(test_estimator, export_dir_base, "", {"loss": 200}) + + with self.assertRaises(ValueError): + export_strategy.export(test_estimator, export_dir_base, "fake_ckpt_1", + None) + def _create_test_export_dir(export_dir_base): export_dir = saved_model_export_utils.get_timestamped_export_dir( diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py index 00cde08bff1..9b959b43a9d 100644 --- 
a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py @@ -1496,6 +1496,15 @@ class StreamingAUCTest(test.TestCase): for _ in range(10): self.assertAlmostEqual(initial_auc, auc.eval(), 5) + def testPredictionsOutOfRange(self): + with self.test_session() as sess: + predictions = constant_op.constant( + [1, -1, 1, -1], shape=(1, 4), dtype=dtypes_lib.float32) + labels = constant_op.constant([0, 1, 1, 0], shape=(1, 4)) + _, update_op = metrics.streaming_auc(predictions, labels) + sess.run(variables.local_variables_initializer()) + self.assertRaises(errors_impl.InvalidArgumentError, update_op.eval) + def testAllCorrect(self): self.allCorrectAsExpected('ROC') diff --git a/tensorflow/contrib/nccl/BUILD b/tensorflow/contrib/nccl/BUILD index 338181e4cac..d6508362b8b 100644 --- a/tensorflow/contrib/nccl/BUILD +++ b/tensorflow/contrib/nccl/BUILD @@ -48,6 +48,8 @@ tf_cuda_cc_test( # Disabled on jenkins until errors finding nvmlShutdown are found. tags = [ "manual", + "no_oss", + "noguitar", # note: is run manually there "notap", ], deps = if_cuda( @@ -112,25 +114,26 @@ tf_custom_op_py_library( ], ) -# http://b/62064807 -# cuda_py_test( -# name = "nccl_ops_test", -# size = "small", -# srcs = ["python/ops/nccl_ops_test.py"], -# additional_deps = [ -# ":nccl_py", -# "//tensorflow/python:array_ops", -# "//tensorflow/python:client_testlib", -# "//tensorflow/python:framework_for_generated_wrappers", -# "//tensorflow/python:framework_test_lib", -# "//tensorflow/python:platform_test", -# ], -# # Disabled on jenkins until errors finding nvmlShutdown are found. -# tags = [ -# "manual", -# "notap", -# ], -# ) +cuda_py_test( + name = "nccl_ops_test", + size = "small", + srcs = ["python/ops/nccl_ops_test.py"], + additional_deps = [ + ":nccl_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform_test", + ], + # Disabled on jenkins until errors finding nvmlShutdown are found. + tags = [ + "manual", + "no_oss", + "noguitar", # note: is run manually there + "notap", + ], +) filegroup( name = "all_files", diff --git a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py index ae658e73227..1621e9f28e3 100644 --- a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py +++ b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py @@ -43,7 +43,8 @@ class AllReduceTest(test.TestCase): self._testSingleAllReduce(sess, dtype, nccl.all_max, np.maximum) def _testSingleAllReduce(self, sess, np_type, nccl_fn, numpy_accumulation_fn): - for devices in [['/device:GPU:0', '/device:GPU:0', '/device:GPU:0'], ['/device:GPU:0', '/device:GPU:0']]: + for devices in [['/device:GPU:1', '/device:GPU:2', '/device:GPU:0'], + ['/device:GPU:1', '/device:GPU:0']]: shape = (3, 4) np_ans = None tensors = [] @@ -84,7 +85,8 @@ class BroadcastTest(test.TestCase): # Create session inside outer loop to test use of # same communicator across multiple sessions. 
with self.test_session(use_gpu=True) as sess: - for devices in [['/device:GPU:0', '/device:GPU:0', '/device:GPU:0'], ['/device:GPU:0', '/device:GPU:0']]: + for devices in [['/device:GPU:1', '/device:GPU:0', '/device:GPU:2'], + ['/device:GPU:1', '/device:GPU:0']]: shape = (3, 4) sender = np.random.randint(0, len(devices) - 1) with ops.device(devices[sender]): @@ -115,7 +117,8 @@ class CombinedTest(test.TestCase): # Create session inside outer loop to test use of # same communicator across multiple sessions. with self.test_session(use_gpu=True) as sess: - for devices in [['/device:GPU:0', '/device:GPU:0', '/device:GPU:0'], ['/device:GPU:0', '/device:GPU:0']]: + for devices in [['/device:GPU:1', '/device:GPU:2', '/device:GPU:0'], + ['/device:GPU:0', '/device:GPU:1']]: shape = (3, 4) # all-reduce diff --git a/tensorflow/contrib/receptive_field/BUILD b/tensorflow/contrib/receptive_field/BUILD new file mode 100644 index 00000000000..ed2f3af08cb --- /dev/null +++ b/tensorflow/contrib/receptive_field/BUILD @@ -0,0 +1,71 @@ +# Description: +# Contains modules to compute receptive field parameters for CNN models. + +package( + default_visibility = ["//visibility:public"], +) + +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +load("//tensorflow:tensorflow.bzl", "py_test") + +# Transitive dependencies of this target will be included in the pip package. +py_library( + name = "receptive_field_pip", + deps = [ + ":graph_compute_order_py", + ":receptive_field_py", + ], +) + +py_library( + name = "graph_compute_order_py", + srcs = [ + "__init__.py", + "python/util/graph_compute_order.py", + ], + srcs_version = "PY2AND3", +) + +py_library( + name = "receptive_field_py", + srcs = [ + "__init__.py", + "python/util/receptive_field.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":graph_compute_order_py", + "//tensorflow/contrib/util:util_py", + "//tensorflow/python:platform", + ], +) + +py_test( + name = "receptive_field_test", + srcs = ["python/util/receptive_field_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":receptive_field_py", + "//tensorflow/contrib/framework:framework_py", + "//tensorflow/contrib/slim", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:dtypes", + "//tensorflow/python:nn", + ], +) + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) diff --git a/tensorflow/contrib/receptive_field/README.md b/tensorflow/contrib/receptive_field/README.md new file mode 100644 index 00000000000..b150b903b23 --- /dev/null +++ b/tensorflow/contrib/receptive_field/README.md @@ -0,0 +1,165 @@ +# Receptive field computation for convnets + +This library enables you to easily compute the receptive field parameters of +your favorite convnet. You can use it to understand how big of an input image +region your output features depend on. Better yet, using the parameters computed +by the library, you can easily find the exact image region which is used to +compute each convnet feature. + +## Basic usage + +The main function to be called is `compute_receptive_field_from_graph_def`, +which will return the receptive field, effective stride and effective padding +for both horizontal and vertical directions. 
+ +For example, if your model is constructed using the function +`my_model_construction()`, you can use the library as follows: + +```python +import tensorflow as tf +from tensorflow.contrib import receptive_field + +# Construct graph. +g = tf.Graph() +with g.as_default(): + images = tf.placeholder(tf.float32, shape=(1, None, None, 3), name='input_image') + my_model_construction(images) + +# Compute receptive field parameters. +rf_x, rf_y, eff_stride_x, eff_stride_y, eff_pad_x, eff_pad_y = \ + receptive_field.compute_receptive_field_from_graph_def( \ + g.as_graph_def(), 'input_image', 'my_output_endpoint') +``` + +Here's a simple example of computing the receptive field parameters for +Inception-Resnet-v2. To get this to work, be sure to checkout +[tensorflow/models](https://github.com/tensorflow/models), so that the Inception +models are available to you. This can be done in three simple commands: + +```sh +git clone https://github.com/tensorflow/models +cd models/slim +sudo python setup.py install_lib +``` + +You can then compute the receptive field parameters for Inception-Resnet-v2 as: + +```python +from nets import inception +import tensorflow as tf +from tensorflow.contrib import receptive_field + +# Construct graph. +g = tf.Graph() +with g.as_default(): + images = tf.placeholder(tf.float32, shape=(1, None, None, 3), name='input_image') + inception.inception_resnet_v2_base(images) + +# Compute receptive field parameters. +rf_x, rf_y, eff_stride_x, eff_stride_y, eff_pad_x, eff_pad_y = \ + receptive_field.compute_receptive_field_from_graph_def( \ + g.as_graph_def(), 'input_image', 'InceptionResnetV2/Conv2d_7b_1x1/Relu') +``` + +This will give you `rf_x = rf_y = 3039`, `eff_stride_x = eff_stride_y = 32`, and +`eff_pad_x = eff_pad_y = 1482`. This means that each feature that is output at +the node `'InceptionResnetV2/Conv2d_7b_1x1/Relu'` is computed from a region +which is of size `3039x3039`. Further, by using the expressions + +```python +center_x = -eff_pad_x + feature_x*eff_stride_x + (rf_x - 1)/2 +center_y = -eff_pad_y + feature_y*eff_stride_y + (rf_y - 1)/2 +``` + +one can compute the center of the region in the input image that is used to +compute the output feature at position `[feature_x, feature_y]`. For example, +the feature at position `[0, 2]` at the output of the layer +`'InceptionResnetV2/Conv2d_7b_1x1/Relu'` is centered in the original image in +the position `[37, 101]`. + +TODO: include link to derivations and definitions of different parameters. + +## Receptive field benchmark + +As you might expect, it is straightforward to run this library on the popular +convnets, and gather their receptive fields. We provide a python script which +does exactly that, available under `python/util/examples/rf_benchmark.py`. + +To get this to work, be sure to checkout +[tensorflow/models](https://github.com/tensorflow/models) (see the 3-command +instructions for this above). Then, simply: + +```sh +cd python/util/examples +python rf_benchmark.py --csv_path /tmp/rf_benchmark_results.csv +``` + +The script will write to stdout the receptive field parameters for many variants +of several popular convnets: AlexNet, VGG, ResNet, Inception, Mobilenet. They +are also written to the file `/tmp/rf_benchmark_results.csv`. + +TODO: include here a plot for receptive field sizes of different convnets. + +TODO: include table/link to pre-computed RF parameters. 
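+
+If you want to post-process the benchmark results programmatically, the CSV
+file can be read back with the standard `csv` module. A minimal sketch
+(assuming the benchmark was run with `--csv_path /tmp/rf_benchmark_results.csv`
+as above; the column names are the ones written by `rf_benchmark.py`):
+
+```python
+import csv
+
+# Print the receptive field size of each end point in the benchmark output.
+with open('/tmp/rf_benchmark_results.csv') as f:
+  for row in csv.DictReader(f):
+    print('%s %s: RF = %s x %s' % (row['CNN'], row['end_point'],
+                                   row['RF size hor'], row['RF size ver']))
+```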
+ +## Compute RF parameters from a graph pbtxt + +We also provide a utility to compute the receptive field parameters directly +from a graph protobuf file. + +Have a `graph.pbtxt` file and want to compute its receptive field parameters? We +got you covered. The only prerequisite is to install +[google/protobuf](https://github.com/google/protobuf), which you probably +already have if you're using tensorflow (otherwise, follow installation +instructions [here](https://github.com/google/protobuf/tree/master/python)). + +This should work: + +```sh +cd python/util/examples +python compute_rf.py \ + --graph_path /path/to/graph.pbtxt \ + --output_path /path/to/output/rf_info.txt \ + --input_node my_input_node \ + --output_node my_output_node +``` + +Don't know how to generate a graph protobuf file? Take a look at the +`write_inception_resnet_v2_graph.py` script, which shows how to save it for the +Inception-Resnet-v2 model: + +```sh +cd python/util/examples +python write_inception_resnet_v2_graph.py --graph_dir /tmp --graph_filename graph.pbtxt +``` + +This will write the Inception-Resnet-v2 graph protobuf to `/tmp/graph.pbtxt`. + +For completeness, here's how you would use this file to get the receptive field +parameters of the Inception-Resnet-v2 model: + +```sh +cd python/util/examples +python compute_rf.py \ + --graph_path /tmp/graph.pbtxt \ + --output_path /tmp/rf_info.txt \ + --input_node input_image \ + --output_node InceptionResnetV2/Conv2d_7b_1x1/Relu +``` + +This will write the receptive field parameters of the model to +`/tmp/rf_info.txt`, which will look like: + +```sh +Receptive field size (horizontal) = 3039 +Receptive field size (vertical) = 3039 +Effective stride (horizontal) = 32 +Effective stride (vertical) = 32 +Effective padding (horizontal) = 1482 +Effective padding (vertical) = 1482 +``` + +## Authors + +André Araujo (github id: andrefaraujo) and Mark Sandler (github id: +marksandler) diff --git a/tensorflow/contrib/receptive_field/__init__.py b/tensorflow/contrib/receptive_field/__init__.py new file mode 100644 index 00000000000..10745a6a53d --- /dev/null +++ b/tensorflow/contrib/receptive_field/__init__.py @@ -0,0 +1,23 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Module to compute receptive field parameters for CNN tensorflow models.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# pylint: disable=unused-import +from tensorflow.contrib.receptive_field.python.util.graph_compute_order import get_compute_order +from tensorflow.contrib.receptive_field.python.util.receptive_field import compute_receptive_field_from_graph_def +# pylint: enable=unused-import diff --git a/tensorflow/contrib/receptive_field/python/__init__.py b/tensorflow/contrib/receptive_field/python/__init__.py new file mode 100644 index 00000000000..217047f92d3 --- /dev/null +++ b/tensorflow/contrib/receptive_field/python/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Module to compute receptive field parameters for CNN tensorflow models.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function diff --git a/tensorflow/contrib/receptive_field/python/util/examples/compute_rf.py b/tensorflow/contrib/receptive_field/python/util/examples/compute_rf.py new file mode 100644 index 00000000000..1cf978b90a3 --- /dev/null +++ b/tensorflow/contrib/receptive_field/python/util/examples/compute_rf.py @@ -0,0 +1,94 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Computes Receptive Field (RF) information given a graph protobuf. + +For an example of usage, see accompanying file compute_rf.sh +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import sys + +from google.protobuf import text_format + +from tensorflow.contrib import receptive_field +from tensorflow.core.framework import graph_pb2 +from tensorflow.python.platform import app +from tensorflow.python.platform import gfile +from tensorflow.python.platform import tf_logging as logging + +cmd_args = None + + +def _load_graphdef(path): + """Helper function to load GraphDef from file. + + Args: + path: Path to pbtxt file. + + Returns: + graph_def: A GraphDef object. 
+ """ + graph_def = graph_pb2.GraphDef() + pbstr = gfile.Open(path).read() + text_format.Parse(pbstr, graph_def) + return graph_def + + +def main(unused_argv): + + graph_def = _load_graphdef(cmd_args.graph_path) + + (receptive_field_x, receptive_field_y, effective_stride_x, effective_stride_y, + effective_padding_x, effective_padding_y + ) = receptive_field.compute_receptive_field_from_graph_def( + graph_def, cmd_args.input_node, cmd_args.output_node) + + logging.info('Receptive field size (horizontal) = %s', receptive_field_x) + logging.info('Receptive field size (vertical) = %s', receptive_field_y) + logging.info('Effective stride (horizontal) = %s', effective_stride_x) + logging.info('Effective stride (vertical) = %s', effective_stride_y) + logging.info('Effective padding (horizontal) = %s', effective_padding_x) + logging.info('Effective padding (vertical) = %s', effective_padding_y) + + f = gfile.GFile('%s' % cmd_args.output_path, 'w') + f.write('Receptive field size (horizontal) = %s\n' % receptive_field_x) + f.write('Receptive field size (vertical) = %s\n' % receptive_field_y) + f.write('Effective stride (horizontal) = %s\n' % effective_stride_x) + f.write('Effective stride (vertical) = %s\n' % effective_stride_y) + f.write('Effective padding (horizontal) = %s\n' % effective_padding_x) + f.write('Effective padding (vertical) = %s\n' % effective_padding_y) + f.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.register('type', 'bool', lambda v: v.lower() == 'true') + parser.add_argument( + '--graph_path', type=str, default='', help='Graph path (pbtxt format).') + parser.add_argument( + '--output_path', + type=str, + default='', + help='Path to output text file where RF information will be written to.') + parser.add_argument( + '--input_node', type=str, default='', help='Name of input node.') + parser.add_argument( + '--output_node', type=str, default='', help='Name of output node.') + cmd_args, unparsed = parser.parse_known_args() + app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/contrib/receptive_field/python/util/examples/rf_benchmark.py b/tensorflow/contrib/receptive_field/python/util/examples/rf_benchmark.py new file mode 100644 index 00000000000..94228dfa61b --- /dev/null +++ b/tensorflow/contrib/receptive_field/python/util/examples/rf_benchmark.py @@ -0,0 +1,460 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Computes Receptive Field (RF) information for different models. + +The receptive field (and related parameters) for the different models are +printed to stdout, and may also optionally be written to a CSV file. 
+ +For an example of usage, see rf_benchmark.sh +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import csv +import sys + +from nets import alexnet +from nets import inception +from nets import mobilenet_v1 +from nets import resnet_v1 +from nets import resnet_v2 +from nets import vgg +from tensorflow.contrib import framework +from tensorflow.contrib import receptive_field +from tensorflow.contrib import slim +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import app + +cmd_args = None + +# Input node name for all architectures. +_INPUT_NODE = 'input_image' + +# Variants of different network architectures. + +# - resnet: different versions and sizes. +_SUPPORTED_RESNET_VARIANTS = [ + 'resnet_v1_50', 'resnet_v1_101', 'resnet_v1_152', 'resnet_v1_200', + 'resnet_v2_50', 'resnet_v2_101', 'resnet_v2_152', 'resnet_v2_200' +] + +# - inception_resnet_v2: default, and version with SAME padding. +_SUPPORTED_INCEPTIONRESNETV2_VARIANTS = [ + 'inception_resnet_v2', 'inception_resnet_v2-same' +] + +# - inception_v2: default, and version with no separable conv. +_SUPPORTED_INCEPTIONV2_VARIANTS = [ + 'inception_v2', 'inception_v2-no-separable-conv' +] + +# - inception_v3: default version. +_SUPPORTED_INCEPTIONV3_VARIANTS = ['inception_v3'] + +# - inception_v4: default version. +_SUPPORTED_INCEPTIONV4_VARIANTS = ['inception_v4'] + +# - alexnet_v2: default version. +_SUPPORTED_ALEXNETV2_VARIANTS = ['alexnet_v2'] + +# - vgg: vgg_a (with 11 layers) and vgg_16 (version D). +_SUPPORTED_VGG_VARIANTS = ['vgg_a', 'vgg_16'] + +# - mobilenet_v1: 100% and 75%. +_SUPPORTED_MOBILENETV1_VARIANTS = ['mobilenet_v1', 'mobilenet_v1_075'] + + +def _construct_model(model_type='resnet_v1_50'): + """Constructs model for the desired type of CNN. + + Args: + model_type: Type of model to be used. + + Returns: + end_points: A dictionary from components of the network to the corresponding + activations. + + Raises: + ValueError: If the model_type is not supported. + """ + # Placeholder input. + images = array_ops.placeholder( + dtypes.float32, shape=(1, None, None, 3), name=_INPUT_NODE) + + # Construct model. 
+ if model_type == 'inception_resnet_v2': + _, end_points = inception.inception_resnet_v2_base(images) + elif model_type == 'inception_resnet_v2-same': + _, end_points = inception.inception_resnet_v2_base( + images, align_feature_maps=True) + elif model_type == 'inception_v2': + _, end_points = inception.inception_v2_base(images) + elif model_type == 'inception_v2-no-separable-conv': + _, end_points = inception.inception_v2_base( + images, use_separable_conv=False) + elif model_type == 'inception_v3': + _, end_points = inception.inception_v3_base(images) + elif model_type == 'inception_v4': + _, end_points = inception.inception_v4_base(images) + elif model_type == 'alexnet_v2': + _, end_points = alexnet.alexnet_v2(images) + elif model_type == 'vgg_a': + _, end_points = vgg.vgg_a(images) + elif model_type == 'vgg_16': + _, end_points = vgg.vgg_16(images) + elif model_type == 'mobilenet_v1': + _, end_points = mobilenet_v1.mobilenet_v1_base(images) + elif model_type == 'mobilenet_v1_075': + _, end_points = mobilenet_v1.mobilenet_v1_base( + images, depth_multiplier=0.75) + elif model_type == 'resnet_v1_50': + _, end_points = resnet_v1.resnet_v1_50( + images, num_classes=None, is_training=False, global_pool=False) + elif model_type == 'resnet_v1_101': + _, end_points = resnet_v1.resnet_v1_101( + images, num_classes=None, is_training=False, global_pool=False) + elif model_type == 'resnet_v1_152': + _, end_points = resnet_v1.resnet_v1_152( + images, num_classes=None, is_training=False, global_pool=False) + elif model_type == 'resnet_v1_200': + _, end_points = resnet_v1.resnet_v1_200( + images, num_classes=None, is_training=False, global_pool=False) + elif model_type == 'resnet_v2_50': + _, end_points = resnet_v2.resnet_v2_50( + images, num_classes=None, is_training=False, global_pool=False) + elif model_type == 'resnet_v2_101': + _, end_points = resnet_v2.resnet_v2_101( + images, num_classes=None, is_training=False, global_pool=False) + elif model_type == 'resnet_v2_152': + _, end_points = resnet_v2.resnet_v2_152( + images, num_classes=None, is_training=False, global_pool=False) + elif model_type == 'resnet_v2_200': + _, end_points = resnet_v2.resnet_v2_200( + images, num_classes=None, is_training=False, global_pool=False) + else: + raise ValueError('Unsupported model_type %s.' % model_type) + + return end_points + + +def _get_desired_end_point_keys(model_type='resnet_v1_50'): + """Gets list of desired end point keys for a type of CNN. + + Args: + model_type: Type of model to be used. + + Returns: + desired_end_point_types: A list containing the desired end-points. + + Raises: + ValueError: If the model_type is not supported. 
+ """ + if model_type in _SUPPORTED_RESNET_VARIANTS: + blocks = ['block1', 'block2', 'block3', 'block4'] + desired_end_point_keys = ['%s/%s' % (model_type, i) for i in blocks] + elif model_type in _SUPPORTED_INCEPTIONRESNETV2_VARIANTS: + desired_end_point_keys = [ + 'Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3', 'MaxPool_3a_3x3', + 'Conv2d_3b_1x1', 'Conv2d_4a_3x3', 'MaxPool_5a_3x3', 'Mixed_5b', + 'Mixed_6a', 'PreAuxLogits', 'Mixed_7a', 'Conv2d_7b_1x1' + ] + elif model_type in _SUPPORTED_INCEPTIONV2_VARIANTS: + desired_end_point_keys = [ + 'Conv2d_1a_7x7', 'MaxPool_2a_3x3', 'Conv2d_2b_1x1', 'Conv2d_2c_3x3', + 'MaxPool_3a_3x3', 'Mixed_3b', 'Mixed_3c', 'Mixed_4a', 'Mixed_4b', + 'Mixed_4c', 'Mixed_4d', 'Mixed_4e', 'Mixed_5a', 'Mixed_5b', 'Mixed_5c' + ] + elif model_type in _SUPPORTED_INCEPTIONV3_VARIANTS: + desired_end_point_keys = [ + 'Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3', 'MaxPool_3a_3x3', + 'Conv2d_3b_1x1', 'Conv2d_4a_3x3', 'MaxPool_5a_3x3', 'Mixed_5b', + 'Mixed_5c', 'Mixed_5d', 'Mixed_6a', 'Mixed_6b', 'Mixed_6c', 'Mixed_6d', + 'Mixed_6e', 'Mixed_7a', 'Mixed_7b', 'Mixed_7c' + ] + elif model_type in _SUPPORTED_INCEPTIONV4_VARIANTS: + desired_end_point_keys = [ + 'Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3', 'Mixed_3a', + 'Mixed_4a', 'Mixed_5a', 'Mixed_5b', 'Mixed_5c', 'Mixed_5d', 'Mixed_5e', + 'Mixed_6a', 'Mixed_6b', 'Mixed_6c', 'Mixed_6d', 'Mixed_6e', 'Mixed_6f', + 'Mixed_6g', 'Mixed_6h', 'Mixed_7a', 'Mixed_7b', 'Mixed_7c', 'Mixed_7d' + ] + elif model_type in _SUPPORTED_ALEXNETV2_VARIANTS: + ep = ['conv1', 'pool1', 'conv2', 'conv3', 'conv4', 'conv5', 'pool5'] + desired_end_point_keys = ['%s/%s' % (model_type, i) for i in ep] + elif model_type in _SUPPORTED_VGG_VARIANTS: + ep = [ + 'conv1/conv1_1', 'pool1', 'conv2/conv2_1', 'pool2', 'conv3/conv3_1', + 'conv3/conv3_2', 'pool3', 'conv4/conv4_1', 'conv4/conv4_2', 'pool4', + 'conv5/conv5_1', 'conv5/conv5_2', 'pool5' + ] + desired_end_point_keys = ['%s/%s' % (model_type, i) for i in ep] + elif model_type in _SUPPORTED_MOBILENETV1_VARIANTS: + desired_end_point_keys = [ + 'Conv2d_0', 'Conv2d_1_pointwise', 'Conv2d_2_pointwise', + 'Conv2d_3_pointwise', 'Conv2d_4_pointwise', 'Conv2d_5_pointwise', + 'Conv2d_6_pointwise', 'Conv2d_7_pointwise', 'Conv2d_8_pointwise', + 'Conv2d_9_pointwise', 'Conv2d_10_pointwise', 'Conv2d_11_pointwise', + 'Conv2d_12_pointwise', 'Conv2d_13_pointwise' + ] + else: + raise ValueError('Unsupported model_type %s.' % model_type) + + return desired_end_point_keys + + +def _model_graph_def(model_type='resnet_v1_50', arg_sc=None): + """Constructs a model graph, returning GraphDef and end-points. + + Args: + model_type: Type of model to be used. + arg_sc: Optional arg scope to use in constructing the graph. + + Returns: + graph_def: GraphDef of constructed graph. + end_points: A dictionary from components of the network to the corresponding + activations. + """ + if arg_sc is None: + arg_sc = {} + g = ops.Graph() + with g.as_default(): + with framework.arg_scope(arg_sc): + end_points = _construct_model(model_type) + + return g.as_graph_def(), end_points + + +def _model_rf(graphdef, + end_points, + desired_end_point_keys, + model_type='resnet_v1_50', + csv_writer=None): + """Computes receptive field information for a given CNN model. + + The information will be printed to stdout. If the RF parameters are the same + for the horizontal and vertical directions, it will be printed only once. + Otherwise, they are printed once for the horizontal and once for the vertical + directions. 
+ + Args: + graphdef: GraphDef of given model. + end_points: A dictionary from components of the model to the corresponding + activations. + desired_end_point_keys: List of desired end points for which receptive field + information will be computed. + model_type: Type of model to be used, used only for printing purposes. + csv_writer: A CSV writer for RF parameters, which is used if it is not None. + """ + for desired_end_point_key in desired_end_point_keys: + print('- %s:' % desired_end_point_key) + output_node_with_colon = end_points[desired_end_point_key].name + pos = output_node_with_colon.rfind(':') + output_node = output_node_with_colon[:pos] + (receptive_field_x, receptive_field_y, effective_stride_x, + effective_stride_y, effective_padding_x, effective_padding_y + ) = receptive_field.compute_receptive_field_from_graph_def( + graphdef, _INPUT_NODE, output_node) + # If values are the same in horizontal/vertical directions, just report one + # of them. Otherwise, report both. + if (receptive_field_x == receptive_field_y) and ( + effective_stride_x == effective_stride_y) and ( + effective_padding_x == effective_padding_y): + print('Receptive field size = %5s, effective stride = %5s, effective ' + 'padding = %5s' % (str(receptive_field_x), str(effective_stride_x), + str(effective_padding_x))) + else: + print('Receptive field size: horizontal = %5s, vertical = %5s. ' + 'Effective stride: horizontal = %5s, vertical = %5s. Effective ' + 'padding: horizontal = %5s, vertical = %5s' % + (str(receptive_field_x), str(receptive_field_y), + str(effective_stride_x), str(effective_stride_y), + str(effective_padding_x), str(effective_padding_y))) + if csv_writer is not None: + csv_writer.writerow({ + 'CNN': model_type, + 'end_point': desired_end_point_key, + 'RF size hor': str(receptive_field_x), + 'RF size ver': str(receptive_field_y), + 'effective stride hor': str(effective_stride_x), + 'effective stride ver': str(effective_stride_y), + 'effective padding hor': str(effective_padding_x), + 'effective padding ver': str(effective_padding_y) + }) + + +def _process_model_rf(model_type='resnet_v1_50', csv_writer=None, arg_sc=None): + """Contructs model graph and desired end-points, and compute RF. + + The computed RF parameters are printed to stdout by the _model_rf function. + + Args: + model_type: Type of model to be used. + csv_writer: A CSV writer for RF parameters, which is used if it is not None. + arg_sc: Optional arg scope to use in constructing the graph. + + """ + print('********************%s' % model_type) + graphdef, end_points = _model_graph_def(model_type, arg_sc) + desired_end_point_keys = _get_desired_end_point_keys(model_type) + _model_rf(graphdef, end_points, desired_end_point_keys, model_type, + csv_writer) + + +def _resnet_rf(csv_writer=None): + """Computes RF and associated parameters for resnet models. + + The computed values are written to stdout. + + Args: + csv_writer: A CSV writer for RF parameters, which is used if it is not None. + """ + for model_type in _SUPPORTED_RESNET_VARIANTS: + arg_sc = resnet_v1.resnet_arg_scope() + _process_model_rf(model_type, csv_writer, arg_sc) + + +def _inception_resnet_v2_rf(csv_writer=None): + """Computes RF and associated parameters for the inception_resnet_v2 model. + + The computed values are written to stdout. + + Args: + csv_writer: A CSV writer for RF parameters, which is used if it is not None. 
+ """ + for model_type in _SUPPORTED_INCEPTIONRESNETV2_VARIANTS: + _process_model_rf(model_type, csv_writer) + + +def _inception_v2_rf(csv_writer=None): + """Computes RF and associated parameters for the inception_v2 model. + + The computed values are written to stdout. + + Args: + csv_writer: A CSV writer for RF parameters, which is used if it is not None. + """ + for model_type in _SUPPORTED_INCEPTIONV2_VARIANTS: + _process_model_rf(model_type, csv_writer) + + +def _inception_v3_rf(csv_writer=None): + """Computes RF and associated parameters for the inception_v3 model. + + The computed values are written to stdout. + + Args: + csv_writer: A CSV writer for RF parameters, which is used if it is not None. + """ + for model_type in _SUPPORTED_INCEPTIONV3_VARIANTS: + _process_model_rf(model_type, csv_writer) + + +def _inception_v4_rf(csv_writer=None): + """Computes RF and associated parameters for the inception_v4 model. + + The computed values are written to stdout. + + Args: + csv_writer: A CSV writer for RF parameters, which is used if it is not None. + """ + for model_type in _SUPPORTED_INCEPTIONV4_VARIANTS: + _process_model_rf(model_type, csv_writer) + + +def _alexnet_v2_rf(csv_writer=None): + """Computes RF and associated parameters for the alexnet_v2 model. + + The computed values are written to stdout. + + Args: + csv_writer: A CSV writer for RF parameters, which is used if it is not None. + """ + for model_type in _SUPPORTED_ALEXNETV2_VARIANTS: + _process_model_rf(model_type, csv_writer) + + +def _vgg_rf(csv_writer=None): + """Computes RF and associated parameters for the vgg model. + + The computed values are written to stdout. + + Args: + csv_writer: A CSV writer for RF parameters, which is used if it is not None. + """ + for model_type in _SUPPORTED_VGG_VARIANTS: + _process_model_rf(model_type, csv_writer) + + +def _mobilenet_v1_rf(csv_writer=None): + """Computes RF and associated parameters for the mobilenet_v1 model. + + The computed values are written to stdout. + + Args: + csv_writer: A CSV writer for RF parameters, which is used if it is not None. + """ + for model_type in _SUPPORTED_MOBILENETV1_VARIANTS: + with slim.arg_scope( + [slim.batch_norm, slim.dropout], is_training=False) as arg_sc: + _process_model_rf(model_type, csv_writer, arg_sc) + + +def main(unused_argv): + # Configure CSV file which will be written, if desired. + if cmd_args.csv_path: + csv_file = open(cmd_args.csv_path, 'w') + field_names = [ + 'CNN', 'end_point', 'RF size hor', 'RF size ver', + 'effective stride hor', 'effective stride ver', 'effective padding hor', + 'effective padding ver' + ] + rf_writer = csv.DictWriter(csv_file, fieldnames=field_names) + rf_writer.writeheader() + else: + rf_writer = None + + # Compute RF parameters for each network architecture. + _alexnet_v2_rf(rf_writer) + _vgg_rf(rf_writer) + _inception_v2_rf(rf_writer) + _inception_v3_rf(rf_writer) + _inception_v4_rf(rf_writer) + _inception_resnet_v2_rf(rf_writer) + _mobilenet_v1_rf(rf_writer) + _resnet_rf(rf_writer) + + # Close CSV file, if it was opened. 
+ if cmd_args.csv_path: + csv_file.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.register('type', 'bool', lambda v: v.lower() == 'true') + parser.add_argument( + '--csv_path', + type=str, + default='', + help="""\ + Path to CSV file that will be written with RF parameters.If empty, no + file will be written.\ + """) + cmd_args, unparsed = parser.parse_known_args() + app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/contrib/receptive_field/python/util/examples/write_inception_resnet_v2_graph.py b/tensorflow/contrib/receptive_field/python/util/examples/write_inception_resnet_v2_graph.py new file mode 100644 index 00000000000..793ae163d80 --- /dev/null +++ b/tensorflow/contrib/receptive_field/python/util/examples/write_inception_resnet_v2_graph.py @@ -0,0 +1,61 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Simple script to write Inception-ResNet-v2 model to graph file. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import sys + +from nets import inception +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import graph_io +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import app + +cmd_args = None + + +def main(unused_argv): + # Model definition. + g = ops.Graph() + with g.as_default(): + images = array_ops.placeholder( + dtypes.float32, shape=(1, None, None, 3), name='input_image') + inception.inception_resnet_v2_base(images) + + graph_io.write_graph(g.as_graph_def(), cmd_args.graph_dir, + cmd_args.graph_filename) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.register('type', 'bool', lambda v: v.lower() == 'true') + parser.add_argument( + '--graph_dir', + type=str, + default='/tmp', + help='Directory where graph will be saved.') + parser.add_argument( + '--graph_filename', + type=str, + default='graph.pbtxt', + help='Filename of graph that will be saved.') + cmd_args, unparsed = parser.parse_known_args() + app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/contrib/receptive_field/python/util/graph_compute_order.py b/tensorflow/contrib/receptive_field/python/util/graph_compute_order.py new file mode 100644 index 00000000000..8af4be16d6c --- /dev/null +++ b/tensorflow/contrib/receptive_field/python/util/graph_compute_order.py @@ -0,0 +1,88 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Library to compute order of computations in a graph. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + + +class GraphDefHelper(object): + """Helper class to collect node names and definitions. + + Example: + b = GraphDefHelper(graph_def) + # Prints node that produces given output. + print b.output_of['conv/foo/bar'] + """ + + def __init__(self, gd): + self.output_of = {} + for each in gd.node: + self.output_of[each.name] = each + + +# pylint: disable=invalid-name +_NodeEntry = collections.namedtuple('NodeEntry', field_names=['order', 'node']) + + +def _get_computed_nodes(g, output, seen): + """Traverses the graph in topological order. + + Args: + g: GraphDefHelper object. + output: current node. + seen: map of nodes we've already traversed. + Returns: + order in topological sort for 'output'. + """ + if output in seen: + return seen[output].order + node_def = g.output_of.get(output, None) + if node_def is None: + seen[output] = _NodeEntry(0, None) + return 0 + + r = 0 + for each in node_def.input: + # Parses name of input node. + if each.startswith('^'): + each = each[1:] + each = each.split(':')[0] + # Recursively computes ordering. + new_v = _get_computed_nodes(g, each, seen) + r = max(r, new_v + 1) + + seen[output] = _NodeEntry(r, node_def) + + return seen[output].order + + +def get_compute_order(graph_def): + """Computes order of computation for a given graph. + + Args: + graph_def: GraphDef object. + Returns: + map: name -> {order, node} + """ + helper = GraphDefHelper(graph_def) + seen = collections.defaultdict(_NodeEntry) + for each in graph_def.node: + _get_computed_nodes(helper, each.name, seen) + return seen diff --git a/tensorflow/contrib/receptive_field/python/util/receptive_field.py b/tensorflow/contrib/receptive_field/python/util/receptive_field.py new file mode 100644 index 00000000000..db190a1a416 --- /dev/null +++ b/tensorflow/contrib/receptive_field/python/util/receptive_field.py @@ -0,0 +1,485 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Functions to compute receptive field of a fully-convolutional network. 
+ +Please refer to the following g3doc for detailed explanation on how this +computation is performed, and why it is important: +g3doc/photos/vision/features/delf/g3doc/rf_computation.md +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +from tensorflow.contrib.receptive_field.python.util import graph_compute_order +from tensorflow.contrib.util import make_ndarray +from tensorflow.python.platform import tf_logging as logging + +# White-listed layer operations, which do not affect the receptive field +# computation. +_UNCHANGED_RF_LAYER_OPS = [ + "Softplus", "Relu", "BiasAdd", "Mul", "Add", "Const", "Identity", + "VariableV2", "Sub", "Rsqrt", "ConcatV2" +] + +# Different ways in which padding modes may be spelled. +_VALID_PADDING = ["VALID", b"VALID"] +_SAME_PADDING = ["SAME", b"SAME"] + + +def _stride_size(node): + """Computes stride size given a TF node. + + Args: + node: Tensorflow node (NodeDef proto). + + Returns: + stride_x: Stride size for horizontal direction (integer). + stride_y: Stride size for vertical direction (integer). + """ + strides_attr = node.attr["strides"] + logging.vlog(4, "strides_attr = %s", strides_attr) + stride_y = strides_attr.list.i[1] + stride_x = strides_attr.list.i[2] + return stride_x, stride_y + + +def _conv_kernel_size(node, name_to_order_node): + """Computes kernel size given a TF convolution or pooling node. + + Args: + node: Tensorflow node (NodeDef proto). + name_to_order_node: Map from name to {order, node}. Output of + graph_compute_order.get_compute_order(). + + Returns: + kernel_size_x: Kernel size for horizontal direction (integer). + kernel_size_y: Kernel size for vertical direction (integer). + + Raises: + ValueError: If the weight layer node is invalid. + """ + weights_layer_read_name = node.input[1] + if not weights_layer_read_name.endswith("/read"): + raise ValueError( + "Weight layer's name input to conv layer does not end with '/read'") + weights_layer_param_name = weights_layer_read_name[:-5] + weights_node = name_to_order_node[weights_layer_param_name].node + if weights_node.op != "VariableV2": + raise ValueError("Weight layer is not of type VariableV2") + shape = weights_node.attr["shape"] + logging.vlog(4, "weight shape = %s", shape) + kernel_size_y = shape.shape.dim[0].size + kernel_size_x = shape.shape.dim[1].size + return kernel_size_x, kernel_size_y + + +def _padding_size_conv_pool(node, kernel_size, stride): + """Computes padding size given a TF convolution or pooling node. + + Args: + node: Tensorflow node (NodeDef proto). + kernel_size: Kernel size of node (integer). + stride: Stride size of node (integer). + + Returns: + padding: Padding size (integer). + + Raises: + ValueError: If padding is invalid. + """ + # In this case, we need to carefully consider the different TF padding modes. + # The padding depends on kernel size, and may depend on input size. If it + # depends on input size, we raise an exception. 
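+  # Cases handled below: 'VALID' -> 0. 'SAME' with kernel size 1 -> 0; with
+  # stride 1, or with stride 2 and an even kernel size -> floor((kernel - 1) / 2).
+  # For any other 'SAME' combination the padding depends on the input size, so
+  # it is returned as None (and a warning is logged) rather than a fixed value.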
+ padding_attr = node.attr["padding"] + logging.vlog(4, "padding_attr = %s", padding_attr) + if padding_attr.s in _VALID_PADDING: + padding = 0 + elif padding_attr.s in _SAME_PADDING: + if kernel_size == 1: + padding = 0 + elif stride == 1: + padding = int(math.floor((float(kernel_size) - 1) / 2)) + elif stride == 2 and kernel_size % 2 == 0: + padding = int(math.floor((float(kernel_size) - 1) / 2)) + else: + padding = None + logging.warning( + "Padding depends on input size, which means that the effective " + "padding may be different depending on the input image " + "dimensionality. In this case, alignment check will be skipped.") + else: + raise ValueError("Invalid padding operation %s" % padding_attr.s) + return padding + + +def _pool_kernel_size(node): + """Computes kernel size given a TF pooling node. + + Args: + node: Tensorflow node (NodeDef proto). + + Returns: + kernel_size_x: Kernel size for horizontal direction (integer). + kernel_size_y: Kernel size for vertical direction (integer). + + Raises: + ValueError: If pooling is invalid. + """ + ksize = node.attr["ksize"] + kernel_size_y = ksize.list.i[1] + kernel_size_x = ksize.list.i[2] + if ksize.list.i[0] != 1: + raise ValueError("pool ksize for first dim is not 1") + if ksize.list.i[3] != 1: + raise ValueError("pool ksize for last dim is not 1") + return kernel_size_x, kernel_size_y + + +def _padding_size_pad_layer(node, name_to_order_node): + """Computes padding size given a TF padding node. + + Args: + node: Tensorflow node (NodeDef proto). + name_to_order_node: Map from name to {order, node}. Output of + graph_compute_order.get_compute_order(). + + Returns: + padding_x: Padding size for horizontal direction (integer). + padding_y: Padding size for vertical direction (integer). + + Raises: + ValueError: If padding layer is invalid. + """ + paddings_layer_name = node.input[1] + if not paddings_layer_name.endswith("/paddings"): + raise ValueError("Padding layer name does not end with '/paddings'") + paddings_node = name_to_order_node[paddings_layer_name].node + if paddings_node.op != "Const": + raise ValueError("Padding op is not Const") + value = paddings_node.attr["value"] + t = make_ndarray(value.tensor) + padding_y = t[1][0] + padding_x = t[2][0] + if t[0][0] != 0: + raise ValueError("padding is not zero for first tensor dim") + if t[3][0] != 0: + raise ValueError("padding is not zero for last tensor dim") + return padding_x, padding_y + + +def _get_layer_params(node, name_to_order_node): + """Gets layer parameters relevant for RF computation. + + Currently, only these nodes are supported: + - Conv2D + - DepthwiseConv2dNative + - Pad + - MaxPool + - AvgPool + - all nodes listed in _UNCHANGED_RF_LAYER_OPS + + Args: + node: Tensorflow node (NodeDef proto). + name_to_order_node: Map from name to {order, node}. Output of + graph_compute_order.get_compute_order(). + + Returns: + kernel_size_x: Kernel size for horizontal direction (integer). + kernel_size_y: Kernel size for vertical direction (integer). + stride_x: Stride size for horizontal direction (integer). + stride_y: Stride size for vertical direction (integer). + padding_x: Padding size for horizontal direction (integer). + padding_y: Padding size for vertical direction (integer). + + Raises: + ValueError: If layer op is unknown. 
+ """ + logging.vlog(3, "node.op = %s", node.op) + logging.vlog(4, "node = %s", node) + if node.op == "Conv2D" or node.op == "DepthwiseConv2dNative": + stride_x, stride_y = _stride_size(node) + kernel_size_x, kernel_size_y = _conv_kernel_size(node, name_to_order_node) + # Compute the padding for this node separately for each direction. + padding_x = _padding_size_conv_pool(node, kernel_size_x, stride_x) + padding_y = _padding_size_conv_pool(node, kernel_size_y, stride_y) + elif node.op == "Pad": + # Kernel and stride are simply 1 in this case. + kernel_size_x = 1 + kernel_size_y = 1 + stride_x = 1 + stride_y = 1 + padding_x, padding_y = _padding_size_pad_layer(node, name_to_order_node) + elif node.op == "MaxPool" or node.op == "AvgPool": + stride_x, stride_y = _stride_size(node) + kernel_size_x, kernel_size_y = _pool_kernel_size(node) + # Compute the padding for this node separately for each direction. + padding_x = _padding_size_conv_pool(node, kernel_size_x, stride_x) + padding_y = _padding_size_conv_pool(node, kernel_size_y, stride_y) + elif node.op in _UNCHANGED_RF_LAYER_OPS: + # These nodes do not modify the RF parameters. + kernel_size_x = 1 + kernel_size_y = 1 + stride_x = 1 + stride_y = 1 + padding_x = 0 + padding_y = 0 + else: + raise ValueError("Unknown layer op: %s" % node.op) + return kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x, padding_y + + +def _reverse_sort_by_order(name_to_order_node): + """Sorts map of name_to_order_node nodes in reverse order. + + The output is such that the nodes in name_to_order_node are sorted in + descending order of the "order" field. + + Args: + name_to_order_node: Map from name to {order, node}. Output of + graph_compute_order.get_compute_order(). + + Returns: + sorted_name_to_order_node: Sorted version of the input, in descending order. + """ + return sorted(name_to_order_node.items(), key=lambda x: -x[1].order) + + +def _get_rf_size_node_input(stride, kernel_size, rf_size_output): + """Computes RF size at the input of a given layer. + + Args: + stride: Stride of given layer (integer). + kernel_size: Kernel size of given layer (integer). + rf_size_output: RF size at output of given layer (integer). + + Returns: + rf_size_input: RF size at input of given layer (integer). + """ + return stride * rf_size_output + kernel_size - stride + + +def _get_effective_stride_node_input(stride, effective_stride_output): + """Computes effective stride at the input of a given layer. + + Args: + stride: Stride of given layer (integer). + effective_stride_output: Effective stride at output of given layer + (integer). + + Returns: + effective_stride_input: Effective stride at input of given layer + (integer). + """ + return stride * effective_stride_output + + +def _get_effective_padding_node_input(stride, padding, + effective_padding_output): + """Computes effective padding at the input of a given layer. + + Args: + stride: Stride of given layer (integer). + padding: Padding of given layer (integer). + effective_padding_output: Effective padding at output of given layer + (integer). + + Returns: + effective_padding_input: Effective padding at input of given layer + (integer). + """ + return stride * effective_padding_output + padding + + +def compute_receptive_field_from_graph_def(graph_def, input_node, output_node): + """Computes receptive field (RF) parameters from a GraphDef object. + + Args: + graph_def: GraphDef object. + input_node: Name of the input node from graph. + output_node: Name of the output node from graph. 
+ + Returns: + rf_size_x: Receptive field size of network in the horizontal direction, with + respect to specified input and output. + rf_size_y: Receptive field size of network in the vertical direction, with + respect to specified input and output. + effective_stride_x: Effective stride of network in the horizontal direction, + with respect to specified input and output. + effective_stride_y: Effective stride of network in the vertical direction, + with respect to specified input and output. + effective_padding_x: Effective padding of network in the horizontal + direction, with respect to specified input and output. + effective_padding_y: Effective padding of network in the vertical + direction, with respect to specified input and output. + + Raises: + ValueError: If network is not aligned or if either input or output nodes + cannot be found. For network criterion alignment, see + photos/vision/features/delf/g3doc/rf_computation.md + """ + # Computes order of computation for a given graph. + name_to_order_node = graph_compute_order.get_compute_order( + graph_def=graph_def) + + # Sort in reverse topological order. + order = _reverse_sort_by_order(name_to_order_node) + + # Dictionaries to keep track of receptive field, effective stride and + # effective padding of different nodes. + rf_sizes_x = {} + rf_sizes_y = {} + effective_strides_x = {} + effective_strides_y = {} + effective_paddings_x = {} + effective_paddings_y = {} + + # Initialize dicts for output_node. + rf_sizes_x[output_node] = 1 + rf_sizes_y[output_node] = 1 + effective_strides_x[output_node] = 1 + effective_strides_y[output_node] = 1 + effective_paddings_x[output_node] = 0 + effective_paddings_y[output_node] = 0 + + # Flag to denote if we found output node yet. If we have not, we skip nodes + # until the output node is found. + found_output_node = False + + # Flag to denote if padding is undefined. This happens when SAME padding mode + # is used in conjunction with stride and kernel sizes which make it such that + # the padding to be applied would depend on the input size. In this case, + # alignment checks are skipped, and the effective padding is None. + undefined_padding = False + + for _, (o, node) in order: + if node: + logging.vlog(3, "%10d %-100s %-20s" % (o, node.name[:90], node.op)) + else: + continue + + # When we find input node, we can stop. + if node.name == input_node: + break + + # Loop until we find the output node. All nodes before finding the output + # one are irrelevant, so they can be skipped. + if not found_output_node: + if node.name == output_node: + found_output_node = True + + if found_output_node: + if node.name not in rf_sizes_x: + assert node.name not in rf_sizes_y, ("Node %s is in rf_sizes_y, but " + "not in rf_sizes_x" % node.name) + # In this case, node is not relevant since it's not part of the + # computation we're interested in. + logging.vlog(3, "Irrelevant node %s, skipping it...", node.name) + continue + + # Get params for this layer. + kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x, padding_y = ( + _get_layer_params(node, name_to_order_node)) + logging.vlog(3, "kernel_size_x = %s, kernel_size_y = %s, " + "stride_x = %s, stride_y = %s, " + "padding_x = %s, padding_y = %s" % + (kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x, + padding_y)) + if padding_x is None or padding_y is None: + undefined_padding = True + + # Get parameters at input of this layer which may or may not be propagated + # to the input layers. 
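+      # The helpers below implement the standard per-layer recurrences, where
+      # s, k and p are this layer's stride, kernel size and padding:
+      #   rf_size_input         = s * rf_size_output + (k - s)
+      #   effective_stride_in   = s * effective_stride_out
+      #   effective_padding_in  = s * effective_padding_out + p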
+ rf_size_input_x = _get_rf_size_node_input(stride_x, kernel_size_x, + rf_sizes_x[node.name]) + rf_size_input_y = _get_rf_size_node_input(stride_y, kernel_size_y, + rf_sizes_y[node.name]) + effective_stride_input_x = _get_effective_stride_node_input( + stride_x, effective_strides_x[node.name]) + effective_stride_input_y = _get_effective_stride_node_input( + stride_y, effective_strides_y[node.name]) + if not undefined_padding: + effective_padding_input_x = _get_effective_padding_node_input( + stride_x, padding_x, effective_paddings_x[node.name]) + effective_padding_input_y = _get_effective_padding_node_input( + stride_y, padding_y, effective_paddings_y[node.name]) + else: + effective_padding_input_x = None + effective_padding_input_y = None + + # Loop over this node's inputs and potentially propagate information down. + for inp_name in node.input: + logging.vlog(4, "inp_name = %s", inp_name) + inp_node = name_to_order_node[inp_name].node + logging.vlog(4, "inp_node = \n%s", inp_node) + if inp_node.name in rf_sizes_x: + assert inp_node.name in rf_sizes_y, ( + "Node %s is in rf_sizes_x, but " + "not in rf_sizes_y" % inp_node.name) + # This node was already discovered through a previous path, so we need + # to make sure that graph is aligned. This alignment check is skipped + # if the padding is not defined, since in this case alignment cannot + # be checked. + if not undefined_padding: + if effective_strides_x[inp_node.name] != effective_stride_input_x: + raise ValueError( + "Graph is not aligned since effective stride from different " + "paths is different in horizontal direction") + if effective_strides_y[inp_node.name] != effective_stride_input_y: + raise ValueError( + "Graph is not aligned since effective stride from different " + "paths is different in vertical direction") + if (rf_sizes_x[inp_node.name] - 1 + ) / 2 - effective_paddings_x[inp_node.name] != ( + rf_size_input_x - 1) / 2 - effective_padding_input_x: + raise ValueError( + "Graph is not aligned since center shift from different " + "paths is different in horizontal direction") + if (rf_sizes_y[inp_node.name] - 1 + ) / 2 - effective_paddings_y[inp_node.name] != ( + rf_size_input_y - 1) / 2 - effective_padding_input_y: + raise ValueError( + "Graph is not aligned since center shift from different " + "paths is different in vertical direction") + # Keep track of path with largest RF, for both directions. + if rf_sizes_x[inp_node.name] < rf_size_input_x: + rf_sizes_x[inp_node.name] = rf_size_input_x + effective_strides_x[inp_node.name] = effective_stride_input_x + effective_paddings_x[inp_node.name] = effective_padding_input_x + if rf_sizes_y[inp_node.name] < rf_size_input_y: + rf_sizes_y[inp_node.name] = rf_size_input_y + effective_strides_y[inp_node.name] = effective_stride_input_y + effective_paddings_y[inp_node.name] = effective_padding_input_y + else: + assert inp_node.name not in rf_sizes_y, ( + "Node %s is in rf_sizes_y, but " + "not in rf_sizes_x" % inp_node.name) + # In this case, it is the first time we encounter this node. So we + # propagate the RF parameters. 
+ rf_sizes_x[inp_node.name] = rf_size_input_x + rf_sizes_y[inp_node.name] = rf_size_input_y + effective_strides_x[inp_node.name] = effective_stride_input_x + effective_strides_y[inp_node.name] = effective_stride_input_y + effective_paddings_x[inp_node.name] = effective_padding_input_x + effective_paddings_y[inp_node.name] = effective_padding_input_y + + if not found_output_node: + raise ValueError("Output node was not found") + if input_node not in rf_sizes_x: + raise ValueError("Input node was not found") + return (rf_sizes_x[input_node], rf_sizes_y[input_node], + effective_strides_x[input_node], effective_strides_y[input_node], + effective_paddings_x[input_node], effective_paddings_y[input_node]) diff --git a/tensorflow/contrib/receptive_field/python/util/receptive_field_test.py b/tensorflow/contrib/receptive_field/python/util/receptive_field_test.py new file mode 100644 index 00000000000..2771389250b --- /dev/null +++ b/tensorflow/contrib/receptive_field/python/util/receptive_field_test.py @@ -0,0 +1,225 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for receptive_fields module.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib import slim +from tensorflow.contrib.receptive_field.python.util import receptive_field +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import nn +from tensorflow.python.platform import test + + +def create_test_network_1(): + """Aligned network for test. + + The graph corresponds to the example from the second figure in + go/cnn-rf-computation#arbitrary-computation-graphs + + Returns: + g: Tensorflow graph object (Graph proto). + """ + g = ops.Graph() + with g.as_default(): + # An 8x8 test image. + x = array_ops.placeholder(dtypes.float32, (1, 8, 8, 1), name='input_image') + # Left branch. + l1 = slim.conv2d(x, 1, [1, 1], stride=4, scope='L1', padding='VALID') + # Right branch. + l2_pad = array_ops.pad(x, [[0, 0], [1, 0], [1, 0], [0, 0]]) + l2 = slim.conv2d(l2_pad, 1, [3, 3], stride=2, scope='L2', padding='VALID') + l3 = slim.conv2d(l2, 1, [1, 1], stride=2, scope='L3', padding='VALID') + # Addition. + nn.relu(l1 + l3, name='output') + return g + + +def create_test_network_2(): + """Aligned network for test. + + The graph corresponds to a variation to the example from the second figure in + go/cnn-rf-computation#arbitrary-computation-graphs. Layers 2 and 3 are changed + to max-pooling operations. Since the functionality is the same as convolution, + the network is aligned and the receptive field size is the same as from the + network created using create_test_network_1(). + + Returns: + g: Tensorflow graph object (Graph proto). + """ + g = ops.Graph() + with g.as_default(): + # An 8x8 test image. 
+ x = array_ops.placeholder(dtypes.float32, (1, 8, 8, 1), name='input_image') + # Left branch. + l1 = slim.conv2d(x, 1, [1, 1], stride=4, scope='L1', padding='VALID') + # Right branch. + l2_pad = array_ops.pad(x, [[0, 0], [1, 0], [1, 0], [0, 0]]) + l2 = slim.max_pool2d(l2_pad, [3, 3], stride=2, scope='L2', padding='VALID') + l3 = slim.max_pool2d(l2, [1, 1], stride=2, scope='L3', padding='VALID') + # Addition. + nn.relu(l1 + l3, name='output') + return g + + +def create_test_network_3(): + """Misaligned network for test. + + The graph corresponds to the example from the first figure in + go/cnn-rf-computation#arbitrary-computation-graphs + + Returns: + g: Tensorflow graph object (Graph proto). + """ + g = ops.Graph() + with g.as_default(): + # An 8x8 test image. + x = array_ops.placeholder(dtypes.float32, (1, 8, 8, 1), name='input_image') + # Left branch. + l1_pad = array_ops.pad(x, [[0, 0], [2, 1], [2, 1], [0, 0]]) + l1 = slim.conv2d(l1_pad, 1, [5, 5], stride=2, scope='L1', padding='VALID') + # Right branch. + l2 = slim.conv2d(x, 1, [3, 3], stride=1, scope='L2', padding='VALID') + l3 = slim.conv2d(l2, 1, [3, 3], stride=1, scope='L3', padding='VALID') + # Addition. + nn.relu(l1 + l3, name='output') + return g + + +def create_test_network_4(): + """Misaligned network for test. + + The graph corresponds to a variation from the example from the second figure + in go/cnn-rf-computation#arbitrary-computation-graphs. Layer 2 uses 'SAME' + padding, which makes its padding dependent on the input image dimensionality. + In this case, the effective padding will be undetermined, and the utility is + not able to check the network alignment. + + Returns: + g: Tensorflow graph object (Graph proto). + """ + g = ops.Graph() + with g.as_default(): + # An 8x8 test image. + x = array_ops.placeholder(dtypes.float32, (1, 8, 8, 1), name='input_image') + # Left branch. + l1 = slim.conv2d(x, 1, [1, 1], stride=4, scope='L1', padding='VALID') + # Right branch. + l2 = slim.conv2d(x, 1, [3, 3], stride=2, scope='L2', padding='SAME') + l3 = slim.conv2d(l2, 1, [1, 1], stride=2, scope='L3', padding='VALID') + # Addition. + nn.relu(l1 + l3, name='output') + return g + + +def create_test_network_5(): + """Single-path network for testing non-square kernels. + + The graph is similar to the right branch of the graph from + create_test_network_1(), except that the kernel sizes are changed to be + non-square. + + Returns: + g: Tensorflow graph object (Graph proto). + """ + g = ops.Graph() + with g.as_default(): + # An 8x8 test image. + x = array_ops.placeholder(dtypes.float32, (1, 8, 8, 1), name='input_image') + # Two convolutional layers, where the first one has non-square kernel. + l1 = slim.conv2d(x, 1, [3, 5], stride=2, scope='L1', padding='VALID') + l2 = slim.conv2d(l1, 1, [3, 1], stride=2, scope='L2', padding='VALID') + # ReLU. 
+ nn.relu(l2, name='output') + return g + + +class RfUtilsTest(test.TestCase): + + def testComputeRFFromGraphDefAligned(self): + graph_def = create_test_network_1().as_graph_def() + input_node = 'input_image' + output_node = 'output' + (receptive_field_x, receptive_field_y, effective_stride_x, + effective_stride_y, effective_padding_x, effective_padding_y) = ( + receptive_field.compute_receptive_field_from_graph_def( + graph_def, input_node, output_node)) + self.assertEqual(receptive_field_x, 3) + self.assertEqual(receptive_field_y, 3) + self.assertEqual(effective_stride_x, 4) + self.assertEqual(effective_stride_y, 4) + self.assertEqual(effective_padding_x, 1) + self.assertEqual(effective_padding_y, 1) + + def testComputeRFFromGraphDefAligned2(self): + graph_def = create_test_network_2().as_graph_def() + input_node = 'input_image' + output_node = 'output' + (receptive_field_x, receptive_field_y, effective_stride_x, + effective_stride_y, effective_padding_x, effective_padding_y) = ( + receptive_field.compute_receptive_field_from_graph_def( + graph_def, input_node, output_node)) + self.assertEqual(receptive_field_x, 3) + self.assertEqual(receptive_field_y, 3) + self.assertEqual(effective_stride_x, 4) + self.assertEqual(effective_stride_y, 4) + self.assertEqual(effective_padding_x, 1) + self.assertEqual(effective_padding_y, 1) + + def testComputeRFFromGraphDefUnaligned(self): + graph_def = create_test_network_3().as_graph_def() + input_node = 'input_image' + output_node = 'output' + with self.assertRaises(ValueError): + receptive_field.compute_receptive_field_from_graph_def( + graph_def, input_node, output_node) + + def testComputeRFFromGraphDefUnaligned2(self): + graph_def = create_test_network_4().as_graph_def() + input_node = 'input_image' + output_node = 'output' + (receptive_field_x, receptive_field_y, effective_stride_x, + effective_stride_y, effective_padding_x, effective_padding_y) = ( + receptive_field.compute_receptive_field_from_graph_def( + graph_def, input_node, output_node)) + self.assertEqual(receptive_field_x, 3) + self.assertEqual(receptive_field_y, 3) + self.assertEqual(effective_stride_x, 4) + self.assertEqual(effective_stride_y, 4) + self.assertEqual(effective_padding_x, None) + self.assertEqual(effective_padding_y, None) + + def testComputeRFFromGraphDefNonSquareRF(self): + graph_def = create_test_network_5().as_graph_def() + input_node = 'input_image' + output_node = 'output' + (receptive_field_x, receptive_field_y, effective_stride_x, + effective_stride_y, effective_padding_x, effective_padding_y) = ( + receptive_field.compute_receptive_field_from_graph_def( + graph_def, input_node, output_node)) + self.assertEqual(receptive_field_x, 5) + self.assertEqual(receptive_field_y, 7) + self.assertEqual(effective_stride_x, 4) + self.assertEqual(effective_stride_y, 4) + self.assertEqual(effective_padding_x, 0) + self.assertEqual(effective_padding_y, 0) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/stateless/BUILD b/tensorflow/contrib/stateless/BUILD index 598e6513aeb..865fb72a55b 100644 --- a/tensorflow/contrib/stateless/BUILD +++ b/tensorflow/contrib/stateless/BUILD @@ -21,6 +21,7 @@ py_library( srcs_version = "PY2AND3", deps = [ ":stateless_random_ops", + "//tensorflow/python:framework", "//tensorflow/python:util", ], ) diff --git a/tensorflow/contrib/stateless/__init__.py b/tensorflow/contrib/stateless/__init__.py index 82e5d36ce44..ca937546f50 100644 --- a/tensorflow/contrib/stateless/__init__.py +++ b/tensorflow/contrib/stateless/__init__.py 
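The non-square receptive field asserted in testComputeRFFromGraphDefNonSquareRF above follows directly from composing the two layers of create_test_network_5. A hand computation for reference (illustrative only, not library code):

```
# L1: [3, 5] kernel, stride 2; L2: [3, 1] kernel, stride 2; both 'VALID'.
k1_h, k1_w, s1 = 3, 5, 2
k2_h, k2_w, s2 = 3, 1, 2
rf_h = k1_h + (k2_h - 1) * s1   # 3 + 2*2 = 7 -> receptive_field_y
rf_w = k1_w + (k2_w - 1) * s1   # 5 + 0*2 = 5 -> receptive_field_x
stride = s1 * s2                # 4 -> effective_stride_x and effective_stride_y
```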
@@ -34,5 +34,11 @@ from __future__ import print_function # pylint: disable=wildcard-import from tensorflow.contrib.stateless.gen_stateless_random_ops import * +from tensorflow.python.framework import ops from tensorflow.python.util.all_util import remove_undocumented + +ops.NotDifferentiable("StatelessRandomNormal") +ops.NotDifferentiable("StatelessRandomUniform") +ops.NotDifferentiable("StatelessTruncatedNormal") + remove_undocumented(__name__) diff --git a/tensorflow/contrib/summary/BUILD b/tensorflow/contrib/summary/BUILD new file mode 100644 index 00000000000..bc305022642 --- /dev/null +++ b/tensorflow/contrib/summary/BUILD @@ -0,0 +1,59 @@ +licenses(["notice"]) # Apache 2.0 + +exports_files([ + "LICENSE", +]) + +load( + "//tensorflow:tensorflow.bzl", + "py_test", + "tf_gen_op_wrapper_py", +) + +tf_gen_op_wrapper_py( + name = "gen_summary_ops", + out = "gen_summary_ops.py", + deps = ["//tensorflow/core:summary_ops_op_lib"], +) + +py_test( + name = "summary_ops_test", + srcs = ["summary_ops_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":summary_ops", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform", + "//tensorflow/python:training", + "//tensorflow/python/eager:context", + "//tensorflow/python/eager:test", + ], +) + +py_library( + name = "summary_ops", + srcs = ["summary_ops.py"], + srcs_version = "PY2AND3", + visibility = ["//tensorflow:internal"], + deps = [ + ":gen_summary_ops", + "//tensorflow/python:constant_op", + "//tensorflow/python:dtypes", + "//tensorflow/python:framework_ops", + "//tensorflow/python:summary_op_util", + "//tensorflow/python:training", + "//tensorflow/python/eager:context", + ], +) + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) diff --git a/tensorflow/contrib/summary/summary_ops.py b/tensorflow/contrib/summary/summary_ops.py new file mode 100644 index 00000000000..05e627adf1c --- /dev/null +++ b/tensorflow/contrib/summary/summary_ops.py @@ -0,0 +1,159 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Operations to emit summaries.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.summary import gen_summary_ops +from tensorflow.python.eager import context +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import summary_op_util +from tensorflow.python.training import training_util + + +# Name for a collection which is expected to have at most a single boolean +# Tensor. If this tensor is True the summary ops will record summaries. 
+_SHOULD_RECORD_SUMMARIES_NAME = "ShouldRecordSummaries" + + +def should_record_summaries(): + """Returns boolean Tensor which is true if summaries should be recorded.""" + should_record_collection = ops.get_collection(_SHOULD_RECORD_SUMMARIES_NAME) + if not should_record_collection: + return constant_op.constant(False) + if len(should_record_collection) != 1: + raise ValueError( + "More than one tensor specified for whether summaries " + "should be recorded: %s" % should_record_collection) + return should_record_collection[0] + + +# TODO(apassos) consider how to handle local step here. +def record_summaries_every_n_global_steps(n): + """Sets the should_record_summaries Tensor to true if global_step % n == 0.""" + collection_ref = ops.get_collection_ref(_SHOULD_RECORD_SUMMARIES_NAME) + collection_ref[:] = [training_util.get_global_step() % n == 0] + + +def always_record_summaries(): + """Sets the should_record_summaries Tensor to always true.""" + collection_ref = ops.get_collection_ref(_SHOULD_RECORD_SUMMARIES_NAME) + collection_ref[:] = [constant_op.constant(True)] + + +def never_record_summaries(): + """Sets the should_record_summaries Tensor to always false.""" + collection_ref = ops.get_collection_ref(_SHOULD_RECORD_SUMMARIES_NAME) + collection_ref[:] = [constant_op.constant(False)] + + +def create_summary_file_writer(logdir, + max_queue=None, + flush_secs=None, + filename_suffix=None): + """Creates a summary file writer in the current context.""" + if max_queue is None: + max_queue = constant_op.constant(10) + if flush_secs is None: + flush_secs = constant_op.constant(120) + if filename_suffix is None: + filename_suffix = constant_op.constant("") + resource = gen_summary_ops.summary_writer() + gen_summary_ops.create_summary_file_writer(resource, logdir, max_queue, + flush_secs, filename_suffix) + context.context().summary_writer_resource = resource + + +def _nothing(): + """Convenient else branch for when summaries do not record.""" + return + + +def generic(name, tensor, metadata, family=None): + """Writes a tensor summary if possible.""" + + def record(): + with summary_op_util.summary_scope( + name, family, values=[tensor]) as (tag, scope): + gen_summary_ops.write_summary(context.context().summary_writer_resource, + training_util.get_global_step(), tensor, + tag, metadata, name=scope) + return control_flow_ops.cond(should_record_summaries(), record, _nothing) + + +def scalar(name, tensor, family=None): + """Writes a scalar summary if possible.""" + + def record(): + with summary_op_util.summary_scope( + name, family, values=[tensor]) as (tag, scope): + gen_summary_ops.write_scalar_summary( + context.context().summary_writer_resource, + training_util.get_global_step(), tag, tensor, name=scope) + + return control_flow_ops.cond(should_record_summaries(), record, _nothing) + + +def histogram(name, tensor, family=None): + """Writes a histogram summary if possible.""" + + def record(): + with summary_op_util.summary_scope( + name, family, values=[tensor]) as (tag, scope): + gen_summary_ops.write_histogram_summary( + context.context().summary_writer_resource, + training_util.get_global_step(), tag, tensor, name=scope) + + return control_flow_ops.cond(should_record_summaries(), record, _nothing) + + +def image(name, tensor, bad_color=None, max_images=3, family=None): + """Writes an image summary if possible.""" + + def record(): + if bad_color is None: + bad_color_ = constant_op.constant([255, 0, 0, 255], dtype=dtypes.uint8) + with summary_op_util.summary_scope( + name, family, 
values=[tensor]) as (tag, scope): + gen_summary_ops.write_image_summary( + context.context().summary_writer_resource, + training_util.get_global_step(), tag, tensor, bad_color_, max_images, + name=scope) + + return control_flow_ops.cond(should_record_summaries(), record, _nothing) + + +def audio(name, tensor, sample_rate, max_outputs, family=None): + """Writes an audio summary if possible.""" + + def record(): + with summary_op_util.summary_scope( + name, family, values=[tensor]) as (tag, scope): + gen_summary_ops.write_audio_summary( + context.context().summary_writer_resource, + training_util.get_global_step(), + tag, + tensor, + sample_rate=sample_rate, + max_outputs=max_outputs, + name=scope) + + return control_flow_ops.cond(should_record_summaries(), record, _nothing) diff --git a/tensorflow/contrib/summary/summary_ops_test.py b/tensorflow/contrib/summary/summary_ops_test.py new file mode 100644 index 00000000000..56c1a16f7f0 --- /dev/null +++ b/tensorflow/contrib/summary/summary_ops_test.py @@ -0,0 +1,52 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tempfile + +from tensorflow.contrib.summary import summary_ops +from tensorflow.python.eager import test +from tensorflow.python.framework import test_util +from tensorflow.python.platform import gfile +from tensorflow.python.training import training_util + + +class TargetTest(test_util.TensorFlowTestCase): + + def testShouldRecordSummary(self): + self.assertFalse(summary_ops.should_record_summaries().numpy()) + summary_ops.always_record_summaries() + self.assertTrue(summary_ops.should_record_summaries().numpy()) + + def testSummaryOps(self): + training_util.get_or_create_global_step() + logdir = tempfile.mkdtemp() + summary_ops.create_summary_file_writer(logdir, max_queue=0) + summary_ops.always_record_summaries() + summary_ops.generic('tensor', 1, '') + summary_ops.scalar('scalar', 2.0) + summary_ops.histogram('histogram', [1.0]) + summary_ops.image('image', [[[[1.0]]]]) + summary_ops.audio('audio', [[1.0]], 1.0, 1) + # The working condition of the ops is tested in the C++ test so we just + # test here that we're calling them correctly. 
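Besides the always-record path exercised in this test, the module above also supports gating writes on the global step. A minimal sketch of that pattern (assuming eager execution and using only the functions defined above; not an official recipe):

```
import tempfile
from tensorflow.contrib.summary import summary_ops
from tensorflow.python.training import training_util

training_util.get_or_create_global_step()
summary_ops.create_summary_file_writer(tempfile.mkdtemp(), max_queue=0)
summary_ops.record_summaries_every_n_global_steps(100)
# scalar() wraps the write in cond(should_record_summaries(), ...), so the
# write op only runs on steps where global_step % 100 == 0.
summary_ops.scalar('loss', 0.5)
```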
+ self.assertTrue(gfile.Exists(logdir)) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD index 7d1325e0466..c952288704a 100644 --- a/tensorflow/contrib/tpu/BUILD +++ b/tensorflow/contrib/tpu/BUILD @@ -39,7 +39,6 @@ py_library( deps = [ ":tpu_lib", ":tpu_py", - "//tensorflow/contrib/learn", "//tensorflow/python:array_ops", "//tensorflow/python:control_flow_ops", "//tensorflow/python:framework_for_generated_wrappers", diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD index 9157c79a907..a567d1bbb08 100644 --- a/tensorflow/contrib/tpu/profiler/BUILD +++ b/tensorflow/contrib/tpu/profiler/BUILD @@ -14,19 +14,30 @@ tf_proto_library_cc( visibility = ["//visibility:public"], ) -cc_binary( - name = "capture_tpu_profile", - srcs = ["capture_tpu_profile.cc"], - visibility = ["//tensorflow/contrib/tpu/profiler:__subpackages__"], +cc_library( + name = "dump_tpu_profile", + srcs = ["dump_tpu_profile.cc"], + hdrs = ["dump_tpu_profile.h"], deps = [ ":op_profile_proto_cc", ":tpu_profiler_proto_cc", ":trace_events_proto_cc", ":trace_events_to_json", "//tensorflow/core:framework", - "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + ], +) + +cc_binary( + name = "capture_tpu_profile", + srcs = ["capture_tpu_profile.cc"], + visibility = ["//tensorflow/contrib/tpu/profiler:__subpackages__"], + deps = [ + ":dump_tpu_profile", + ":tpu_profiler_proto_cc", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", "//tensorflow/core/distributed_runtime/rpc:grpc_util", "//tensorflow/core/platform/cloud:gcs_file_system", "@grpc//:grpc++_unsecure", diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc index a0dc15249f7..5b51a72ece8 100644 --- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc +++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc @@ -24,22 +24,12 @@ limitations under the License. 
#include #include -#include "tensorflow/contrib/tpu/profiler/op_profile.pb.h" +#include "tensorflow/contrib/tpu/profiler/dump_tpu_profile.h" #include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h" -#include "tensorflow/contrib/tpu/profiler/trace_events.pb.h" -#include "tensorflow/contrib/tpu/profiler/trace_events_to_json.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h" -#include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/io/compression.h" -#include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/init_main.h" -#include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/util/command_line_flags.h" -#include "tensorflow/core/util/events_writer.h" namespace tensorflow { namespace tpu { @@ -47,16 +37,6 @@ namespace { using ::tensorflow::TPUProfiler; -using ::grpc::ClientContext; -using ::tensorflow::io::JoinPath; -using ::tensorflow::protobuf::util::JsonOptions; -using ::tensorflow::protobuf::util::MessageToJsonString; - -constexpr char kProfilePluginDirectory[] = "plugins/profile/"; -constexpr char kJsonOpProfileFileName[] = "op_profile.json"; -constexpr char kProtoTraceFileName[] = "trace"; -constexpr char kJsonTraceFileName[] = "trace.json.gz"; -constexpr char kGraphRunPrefix[] = "tpu_profiler.hlo_graph."; constexpr uint64 kMaxEvents = 1000000; string GetCurrentTimeStampAsString() { @@ -66,65 +46,13 @@ string GetCurrentTimeStampAsString() { return s; } -Status WriteGzippedDataToFile(const string& filename, const string& data) { - std::unique_ptr file; - TF_RETURN_IF_ERROR(Env::Default()->NewWritableFile(filename, &file)); - io::ZlibCompressionOptions options = io::ZlibCompressionOptions::GZIP(); - io::ZlibOutputBuffer buffer(file.get(), options.input_buffer_size, - options.output_buffer_size, options); - TF_RETURN_IF_ERROR(buffer.Init()); - TF_RETURN_IF_ERROR(buffer.Append(data)); - TF_RETURN_IF_ERROR(buffer.Close()); - TF_RETURN_IF_ERROR(file->Close()); - return Status::OK(); -} - -// Dumps profile data to /plugins/profile//. -inline string CreateProfileRunDirectory(const string& logdir, - const string& run) { - string run_dir = JoinPath(logdir, kProfilePluginDirectory, run); - TF_CHECK_OK(Env::Default()->RecursivelyCreateDir(run_dir)); - return run_dir; -} - -void DumpTraceToLogDirectory(StringPiece run_dir, const string& encoded_trace) { - string proto_path = JoinPath(run_dir, kProtoTraceFileName); - TF_CHECK_OK(WriteStringToFile(Env::Default(), proto_path, encoded_trace)); - LOG(INFO) << "Dumped raw-proto trace data to " << proto_path; - - string json_path = JoinPath(run_dir, kJsonTraceFileName); - Trace trace; - trace.ParseFromString(encoded_trace); - std::cout << "Trace contains " << trace.trace_events_size() << " events." - << std::endl; - TF_CHECK_OK(WriteGzippedDataToFile(json_path, TraceEventsToJson(trace))); - std::cout << "Dumped JSON trace data to " << json_path << std::endl; -} - -void DumpOpProfileToLogDirectory(StringPiece run_dir, - const tpu::op_profile::Profile& profile) { - string path = JoinPath(run_dir, kJsonOpProfileFileName); - string json; - JsonOptions options; - options.always_print_primitive_fields = true; - auto status = MessageToJsonString(profile, &json, options); - if (!status.ok()) { - std::cerr << "Failed to convert op profile to json. Skipping... 
" - << status.error_message() << std::endl; - return; - } - TF_CHECK_OK(WriteStringToFile(Env::Default(), path, json)); - std::cout << "Dumped json op profile data to " << path << std::endl; -} - ProfileResponse Profile(const string& service_addr, int duration_ms) { ProfileRequest request; request.set_duration_ms(duration_ms); request.set_max_events(kMaxEvents); std::cout << "Limiting the number of trace events to " << kMaxEvents << std::endl; - ProfileResponse response; - ClientContext context; + ::grpc::ClientContext context; ::grpc::ChannelArguments channel_args; // TODO(ioeric): use `SetMaxReceiveMessageSize` instead once it's available. channel_args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, @@ -132,20 +60,11 @@ ProfileResponse Profile(const string& service_addr, int duration_ms) { std::unique_ptr stub = TPUProfiler::NewStub(::grpc::CreateCustomChannel( service_addr, ::grpc::InsecureChannelCredentials(), channel_args)); + ProfileResponse response; TF_QCHECK_OK(FromGrpcStatus(stub->Profile(&context, request, &response))); return response; } -void DumpGraph(StringPiece logdir, StringPiece run, const string& graph_def) { - // The graph plugin expects the graph in //. - string run_dir = JoinPath(logdir, strings::StrCat(kGraphRunPrefix, run)); - TF_CHECK_OK(Env::Default()->RecursivelyCreateDir(run_dir)); - EventsWriter event_writer(JoinPath(run_dir, "events")); - Event event; - event.set_graph_def(graph_def); - event_writer.WriteEvent(event); -} - } // namespace } // namespace tpu } // namespace tensorflow @@ -176,35 +95,8 @@ int main(int argc, char** argv) { tensorflow::tpu::Profile(FLAGS_service_addr, duration_ms); // Use the current timestamp as the run name. tensorflow::string run = tensorflow::tpu::GetCurrentTimeStampAsString(); - tensorflow::string run_dir = - tensorflow::tpu::CreateProfileRunDirectory(FLAGS_logdir, run); - // Ignore computation_graph for now. - if (response.encoded_trace().empty()) { - std::cout << "No trace event is collected during the " << duration_ms - << "ms interval." << std::endl; - } else { - LOG(INFO) << "Converting trace events to TraceViewer JSON."; - tensorflow::tpu::DumpTraceToLogDirectory(run_dir, response.encoded_trace()); - } - int num_graphs = response.computation_graph_size(); - if (num_graphs > 0) { - // The server might generates multiple graphs for one program; we simply - // pick the first one. - if (num_graphs > 1) { - std::cout << num_graphs - << " TPU program variants observed over the profiling period. " - << "One computation graph will be chosen arbitrarily." - << std::endl; - } - tensorflow::tpu::DumpGraph( - FLAGS_logdir, run, response.computation_graph(0).SerializeAsString()); - } - if (response.has_op_profile() && - (response.op_profile().has_by_program_structure() || - response.op_profile().has_by_category())) { - tensorflow::tpu::DumpOpProfileToLogDirectory(run_dir, - response.op_profile()); - } + TF_CHECK_OK(tensorflow::tpu::WriteTensorboardTPUProfile( + FLAGS_logdir, run, response, &std::cout)); // Print this at the end so that it's not buried in irrelevant LOG messages. std::cout << "NOTE: using the trace duration " << duration_ms << "ms." << std::endl diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc new file mode 100644 index 00000000000..7541a5291d1 --- /dev/null +++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc @@ -0,0 +1,164 @@ +/* Copyright 2017 The TensorFlow Authors All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/contrib/tpu/profiler/dump_tpu_profile.h" + +#include +#include +#include + +#include "tensorflow/contrib/tpu/profiler/op_profile.pb.h" +#include "tensorflow/contrib/tpu/profiler/trace_events.pb.h" +#include "tensorflow/contrib/tpu/profiler/trace_events_to_json.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/io/compression.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/util/event.pb.h" +#include "tensorflow/core/util/events_writer.h" + +namespace tensorflow { +namespace tpu { +namespace { + +using ::tensorflow::io::JoinPath; +using ::tensorflow::protobuf::util::JsonOptions; +using ::tensorflow::protobuf::util::MessageToJsonString; + +constexpr char kProfilePluginDirectory[] = "plugins/profile/"; +constexpr char kJsonOpProfileFileName[] = "op_profile.json"; +constexpr char kProtoTraceFileName[] = "trace"; +constexpr char kJsonTraceFileName[] = "trace.json.gz"; +constexpr char kGraphRunPrefix[] = "tpu_profiler.hlo_graph."; + +Status WriteGzippedDataToFile(const string& filename, const string& data) { + std::unique_ptr file; + TF_RETURN_IF_ERROR(Env::Default()->NewWritableFile(filename, &file)); + io::ZlibCompressionOptions options = io::ZlibCompressionOptions::GZIP(); + io::ZlibOutputBuffer buffer(file.get(), options.input_buffer_size, + options.output_buffer_size, options); + TF_RETURN_IF_ERROR(buffer.Init()); + TF_RETURN_IF_ERROR(buffer.Append(data)); + TF_RETURN_IF_ERROR(buffer.Close()); + TF_RETURN_IF_ERROR(file->Close()); + return Status::OK(); +} + +Status DumpTraceToLogDirectory(StringPiece run_dir, const string& encoded_trace, + std::ostream* os) { + string proto_path = JoinPath(run_dir, kProtoTraceFileName); + TF_RETURN_IF_ERROR( + WriteStringToFile(Env::Default(), proto_path, encoded_trace)); + LOG(INFO) << "Dumped raw-proto trace data to " << proto_path; + + string json_path = JoinPath(run_dir, kJsonTraceFileName); + Trace trace; + trace.ParseFromString(encoded_trace); + *os << "Trace contains " << trace.trace_events_size() << " events." 
+ << std::endl; + TF_RETURN_IF_ERROR( + WriteGzippedDataToFile(json_path, TraceEventsToJson(trace))); + *os << "Dumped JSON trace data to " << json_path << std::endl; + return Status::OK(); +} + +Status DumpOpProfileToLogDirectory(StringPiece run_dir, + const tpu::op_profile::Profile& profile, + std::ostream* os) { + string path = JoinPath(run_dir, kJsonOpProfileFileName); + string json; + JsonOptions options; + options.always_print_primitive_fields = true; + auto status = MessageToJsonString(profile, &json, options); + if (!status.ok()) { + return errors::Internal( + "Failed to convert op profile to json. Skipping... ", + string(status.error_message())); + } + TF_RETURN_IF_ERROR(WriteStringToFile(Env::Default(), path, json)); + *os << "Dumped json op profile data to " << path << std::endl; + return Status::OK(); +} + +Status DumpGraphEvents(const string& logdir, const string& run, + const ProfileResponse& response, std::ostream* os) { + int num_graphs = response.computation_graph_size(); + if (response.computation_graph_size() == 0) return Status::OK(); + // The server might generates multiple graphs for one program; we simply + // pick the first one. + if (num_graphs > 1) { + *os << num_graphs + << " TPU program variants observed over the profiling period. " + << "One computation graph will be chosen arbitrarily." << std::endl; + } + // The graph plugin expects the graph in //. + string run_dir = JoinPath(logdir, strings::StrCat(kGraphRunPrefix, run)); + TF_RETURN_IF_ERROR(Env::Default()->RecursivelyCreateDir(run_dir)); + EventsWriter event_writer(JoinPath(run_dir, "events")); + Event event; + // Add the computation graph. + event.set_graph_def(response.computation_graph(0).SerializeAsString()); + event_writer.WriteEvent(event); + *os << "Wrote a HLO graph to " << event_writer.FileName() << std::endl; + + if (response.has_hlo_metadata()) { + tensorflow::TaggedRunMetadata tagged_run_metadata; + tagged_run_metadata.set_tag(run); + tagged_run_metadata.set_run_metadata( + response.hlo_metadata().SerializeAsString()); + tensorflow::Event meta_event; + *meta_event.mutable_tagged_run_metadata() = tagged_run_metadata; + event_writer.WriteEvent(meta_event); + *os << "Wrote HLO ops run metadata to " << event_writer.FileName() + << std::endl; + } + return Status::OK(); +} + +} // namespace + +Status WriteTensorboardTPUProfile(const string& logdir, const string& run, + const ProfileResponse& response, + std::ostream* os) { + // Dumps profile data to /plugins/profile//. + string profile_run_dir = JoinPath(logdir, kProfilePluginDirectory, run); + TF_RETURN_IF_ERROR(Env::Default()->RecursivelyCreateDir(profile_run_dir)); + // Ignore computation_graph for now. + if (response.encoded_trace().empty()) { + *os << "No trace event is collected." 
<< std::endl; + } else { + LOG(INFO) << "Converting trace events to TraceViewer JSON."; + TF_RETURN_IF_ERROR( + DumpTraceToLogDirectory(profile_run_dir, response.encoded_trace(), os)); + } + if (response.has_op_profile() && + (response.op_profile().has_by_program_structure() || + response.op_profile().has_by_category())) { + TF_RETURN_IF_ERROR(DumpOpProfileToLogDirectory(profile_run_dir, + response.op_profile(), os)); + } + + TF_RETURN_IF_ERROR(DumpGraphEvents(logdir, run, response, os)); + + return Status::OK(); +} + +} // namespace tpu +} // namespace tensorflow diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h new file mode 100644 index 00000000000..65b92aa4186 --- /dev/null +++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h @@ -0,0 +1,38 @@ +/* Copyright 2017 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_ +#define THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_ + +#include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace tpu { + +// Dumps all profiling tool data in a TPU profile to a TensorBoard log directory +// with the given run name. This writes user-facing log messages to `os`. +// The following tools are supported: +// - Trace viewer +// - Op profile +// - HLO computation graph +Status WriteTensorboardTPUProfile(const string& logdir, const string& run, + const ProfileResponse& response, + std::ostream* os); + +} // namespace tpu +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_ diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto index d0a27f1a3d5..88e86eca3b6 100644 --- a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto +++ b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto @@ -2,6 +2,7 @@ syntax = "proto3"; package tensorflow; import "tensorflow/core/framework/graph.proto"; +import "tensorflow/core/protobuf/config.proto"; import "tensorflow/contrib/tpu/profiler/op_profile.proto"; // The TPUProfiler service retrieves performance information about @@ -31,6 +32,10 @@ message ProfileResponse { // Graphs of programs executed on TPUs during the profiling period. repeated GraphDef computation_graph = 2; + // Performance profile that can be used to annotate HLO operations in the + // computation graph. + RunMetadata hlo_metadata = 5; + // Encoded Trace proto message that contains metadata about the trace captured // during the profiling period. Describes the devices and resources that // 'trace_events' refers to. @@ -40,4 +45,5 @@ message ProfileResponse { // If the trace covers multiple programs, the longest-running one is analyzed. // See op_profile.proto for the detailed semantics of the returned profile. 
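For reference, a successful WriteTensorboardTPUProfile call with run name <run> (see dump_tpu_profile.cc above) leaves the collected data laid out as follows, per the path constants in that file; the events file name below is abbreviated, since EventsWriter appends its own suffix:

```
<logdir>/plugins/profile/<run>/trace            raw Trace proto
<logdir>/plugins/profile/<run>/trace.json.gz    gzipped TraceViewer JSON
<logdir>/plugins/profile/<run>/op_profile.json  op profile JSON
<logdir>/tpu_profiler.hlo_graph.<run>/events*   HLO GraphDef and run-metadata events
```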
tpu.op_profile.Profile op_profile = 4; + // next-field: 6 } diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py index 7c883ec9266..6748a765623 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py @@ -102,10 +102,12 @@ def _increase_eval_step_op(iterations_per_loop): use_locking=True) -def _tpu_job(run_config): +def _tpu_job(run_config, mode): # The tpu job is determined by the run_config. Right now, this method is # required as tpu_config is not part of the RunConfig. - return None if run_config.master in ['', 'local'] else 'tpu_worker' + master = (run_config.evaluation_master if mode == model_fn_lib.ModeKeys.EVAL + else run_config.master) + return None if master in ['', 'local'] else 'tpu_worker' def _is_running_on_cpu(use_tpu, mode, eval_batch_size): @@ -265,9 +267,9 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): dequeue. """ - def __init__(self, run_config, enqueue_fn, dequeue_ops=None): + def __init__(self, run_config, mode, enqueue_fn, dequeue_ops=None): self._iterations = run_config.tpu_config.iterations_per_loop - self._tpu_job = _tpu_job(run_config) + self._tpu_job = _tpu_job(run_config, mode) self._enqueue_fn = enqueue_fn self._dequeue_ops = dequeue_ops @@ -899,7 +901,7 @@ class _EvalMetrics(object): """ num_shards = run_config.tpu_config.num_shards - job = _tpu_job(run_config) + job = _tpu_job(run_config, model_fn_lib.ModeKeys.EVAL) job_device = '' if job is None else ('/job:%s' % job) # For each i, dequeue_ops[i] is a list containing the tensors from all @@ -978,18 +980,20 @@ class TPUEstimator(estimator_lib.Estimator): Example (MNIST): ``` + # The metric Fn which runs on CPU. + def metric_fn(labels, logits): + predictions = tf.argmax(logits, 1) + return { + 'accuracy': tf.metrics.precision( + labels=labels, predictions=predictions), + } + + # Your model Fn which runs on TPU. def model_fn(features, labels, mode, config, params): ... logits = ... if mode = tf.estimator.ModeKeys.EVAL: - def metric_fn(labels, logits): - predictions = tf.argmax(logits, 1) - return { - 'precision': tf.metrics.precision( - labels=labels, predictions=predictions), - } - return tpu_estimator.TPUEstimatorSpec( mode=mode, loss=loss, @@ -1162,7 +1166,7 @@ class TPUEstimator(estimator_lib.Estimator): with ops.device('/device:CPU:0'): return input_fn(**kwargs) - job = _tpu_job(config) + job = _tpu_job(config, mode) def placement_function(index): if job is None: return '/replica:0/task:0/device:CPU:0' @@ -1190,13 +1194,14 @@ class TPUEstimator(estimator_lib.Estimator): # TODO(b/64607814): Ensure batch_axis works with nested structures. def _create_infeed_enqueue_ops_and_dequeue_fn(inputs_holder, run_config, - batch_axis): + batch_axis, mode): """Utility to convert input_fn to enqueue and dequeue fns for TPU. Args: inputs_holder: An `_InputsHolder` holding features and labels. run_config: A `RunConfig` instance. batch_axis: A python list of batch dimensions. 
+ mode: ModeKeys Returns: A tuple of (dequeue_fn, enqueue_fn) @@ -1239,7 +1244,7 @@ def _create_infeed_enqueue_ops_and_dequeue_fn(inputs_holder, run_config, return infeed_queue.generate_enqueue_ops( sharded_inputs, tpu_ordinal_function=tpu_ordinal_function) else: - job = _tpu_job(run_config) + job = _tpu_job(run_config, mode) def placement_function(index): if job is None: return '/replica:0/task:0/device:CPU:0' @@ -1271,12 +1276,12 @@ def _augment_model_fn(model_fn, train_batch_size, eval_batch_size, use_tpu, num_shards=config.tpu_config.num_shards) dequeue_fn, enqueue_fn = _create_infeed_enqueue_ops_and_dequeue_fn( - inputs, config, batch_axis) + inputs, config, batch_axis, mode) if mode == model_fn_lib.ModeKeys.TRAIN: loss = _train_on_tpu_system(model_fn_wrapper, dequeue_fn) hooks = [ - TPUInfeedOutfeedSessionHook(config, enqueue_fn), + TPUInfeedOutfeedSessionHook(config, mode, enqueue_fn), training.LoggingTensorHook( {'loss': array_ops.identity(loss), 'step': training.get_global_step()}, @@ -1318,7 +1323,7 @@ def _augment_model_fn(model_fn, train_batch_size, eval_batch_size, use_tpu, eval_metric_ops.to_metric_metric_ops_for_tpu( config, dummy_update_op)) hooks = [ - TPUInfeedOutfeedSessionHook(config, enqueue_fn, eval_update_ops), + TPUInfeedOutfeedSessionHook(config, mode, enqueue_fn, eval_update_ops), ] return model_fn_lib.EstimatorSpec( diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index de9eb057e48..9db2ed830f4 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -122,6 +122,7 @@ load( "tf_additional_gpu_tracer_cuda_deps", "tf_pyclif_proto_library", "tf_jspb_proto_library", + "tf_nano_proto_library", ) load( "//tensorflow/core:platform/default/build_config_root.bzl", @@ -212,6 +213,15 @@ tf_jspb_proto_library( deps = [":protos_all_cc"], ) +tf_nano_proto_library( + name = "protos_all_nano_proto", + field_style = "accessors", + generate_equals = 1, + generate_intdefs = 1, + visibility = ["//visibility:public"], + deps = [":protos_all_cc"], +) + exports_files([ "framework/types.proto", ]) @@ -556,6 +566,7 @@ tf_gen_op_libs( "state_ops", "stateless_random_ops", "string_ops", + "summary_ops", "training_ops", ], ) @@ -766,6 +777,7 @@ cc_library( "//tensorflow/core/kernels:state", "//tensorflow/core/kernels:stateless_random_ops", "//tensorflow/core/kernels:string", + "//tensorflow/core/kernels:summary_kernels", "//tensorflow/core/kernels:training_ops", "//tensorflow/core/kernels:word2vec_kernels", ] + tf_additional_cloud_kernel_deps() + if_not_windows([ @@ -2144,8 +2156,6 @@ tf_cc_tests( "platform/port_test.cc", "platform/profile_utils/cpu_utils_test.cc", "platform/subprocess_test.cc", - "platform/vmodule_benchmark_test.cc", - "platform/vmodule_test.cc", ], deps = [ ":lib", @@ -3079,7 +3089,6 @@ cc_test( srcs = ["example/example_parser_configuration_test.cc"], data = [":example_parser_configuration_testdata"], deps = [ - ":core", ":core_cpu", ":core_cpu_internal", ":direct_session_internal", diff --git a/tensorflow/core/example/feature_util.cc b/tensorflow/core/example/feature_util.cc index 6f3cc6c6c5d..f0593ede82f 100644 --- a/tensorflow/core/example/feature_util.cc +++ b/tensorflow/core/example/feature_util.cc @@ -18,77 +18,129 @@ limitations under the License. 
namespace tensorflow { namespace internal { - -::tensorflow::Feature& ExampleFeature(const string& name, - ::tensorflow::Example* example) { - ::tensorflow::Features* features = example->mutable_features(); - return (*features->mutable_feature())[name]; +Feature& ExampleFeature(const string& name, Example* example) { + return *GetFeature(name, example); } -} // namespace internal +} // namespace internal template <> -bool ExampleHasFeature(const string& name, - const Example& example) { - auto it = example.features().feature().find(name); - return (it != example.features().feature().end()) && +bool HasFeature<>(const string& key, const Features& features) { + return (features.feature().find(key) != features.feature().end()); +} + +template <> +bool HasFeature(const string& key, const Features& features) { + auto it = features.feature().find(key); + return (it != features.feature().end()) && (it->second.kind_case() == Feature::KindCase::kInt64List); } template <> -bool ExampleHasFeature(const string& name, const Example& example) { - auto it = example.features().feature().find(name); - return (it != example.features().feature().end()) && +bool HasFeature(const string& key, const Features& features) { + auto it = features.feature().find(key); + return (it != features.feature().end()) && (it->second.kind_case() == Feature::KindCase::kFloatList); } template <> -bool ExampleHasFeature(const string& name, const Example& example) { - auto it = example.features().feature().find(name); - return (it != example.features().feature().end()) && +bool HasFeature(const string& key, const Features& features) { + auto it = features.feature().find(key); + return (it != features.feature().end()) && (it->second.kind_case() == Feature::KindCase::kBytesList); } +bool HasFeatureList(const string& key, + const SequenceExample& sequence_example) { + auto& feature_list = sequence_example.feature_lists().feature_list(); + return (feature_list.find(key) != feature_list.end()); +} + template <> const protobuf::RepeatedField& GetFeatureValues( - const string& name, const Example& example) { - return example.features().feature().at(name).int64_list().value(); + const Feature& feature) { + return feature.int64_list().value(); } template <> protobuf::RepeatedField* GetFeatureValues( - const string& name, Example* example) { - return internal::ExampleFeature(name, example) - .mutable_int64_list() - ->mutable_value(); + Feature* feature) { + return feature->mutable_int64_list()->mutable_value(); } template <> const protobuf::RepeatedField& GetFeatureValues( - const string& name, const Example& example) { - return example.features().feature().at(name).float_list().value(); + const Feature& feature) { + return feature.float_list().value(); } template <> -protobuf::RepeatedField* GetFeatureValues(const string& name, - Example* example) { - return internal::ExampleFeature(name, example) - .mutable_float_list() - ->mutable_value(); +protobuf::RepeatedField* GetFeatureValues(Feature* feature) { + return feature->mutable_float_list()->mutable_value(); } template <> const protobuf::RepeatedPtrField& GetFeatureValues( - const string& name, const Example& example) { - return example.features().feature().at(name).bytes_list().value(); + const Feature& feature) { + return feature.bytes_list().value(); } template <> -protobuf::RepeatedPtrField* GetFeatureValues(const string& name, - Example* example) { - return internal::ExampleFeature(name, example) - .mutable_bytes_list() - ->mutable_value(); +protobuf::RepeatedPtrField* 
GetFeatureValues(Feature* feature) { + return feature->mutable_bytes_list()->mutable_value(); } +const protobuf::RepeatedPtrField& GetFeatureList( + const string& key, const SequenceExample& sequence_example) { + return sequence_example.feature_lists().feature_list().at(key).feature(); +} + +protobuf::RepeatedPtrField* GetFeatureList( + const string& feature_list_key, SequenceExample* sequence_example) { + return (*sequence_example->mutable_feature_lists() + ->mutable_feature_list())[feature_list_key] + .mutable_feature(); +} + +template <> +Features* GetFeatures(Features* proto) { + return proto; +} + +template <> +Features* GetFeatures(Example* proto) { + return proto->mutable_features(); +} + +template <> +const Features& GetFeatures(const Features& proto) { + return proto; +} + +template <> +const Features& GetFeatures(const Example& proto) { + return proto.features(); +} + +template <> +const protobuf::RepeatedField& GetFeatureValues( + const Feature& feature); + +template <> +protobuf::RepeatedField* GetFeatureValues( + Feature* feature); + +template <> +const protobuf::RepeatedField& GetFeatureValues( + const Feature& feature); + +template <> +protobuf::RepeatedField* GetFeatureValues(Feature* feature); + +template <> +const protobuf::RepeatedPtrField& GetFeatureValues( + const Feature& feature); + +template <> +protobuf::RepeatedPtrField* GetFeatureValues(Feature* feature); } // namespace tensorflow diff --git a/tensorflow/core/example/feature_util.h b/tensorflow/core/example/feature_util.h index 4004411cb17..a87c2c9a57c 100644 --- a/tensorflow/core/example/feature_util.h +++ b/tensorflow/core/example/feature_util.h @@ -13,9 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -// A set of lightweight wrappers which simplify access to Example features. +// A set of lightweight wrappers which simplify access to Feature protos. // // TensorFlow Example proto uses associative maps on top of oneof fields. +// SequenceExample proto uses associative map of FeatureList. // So accessing feature values is not very convenient. // // For example, to read a first value of integer feature "tag": @@ -42,9 +43,59 @@ limitations under the License. // (RepeatedPtrField for byte list). So refer to its documentation of // RepeatedField for full list of supported methods. // -// NOTE: It is also important to mention that due to the nature of oneof proto -// fields setting a feature of one type automatically clears all values stored -// as another type with the same feature name. +// NOTE: Due to the nature of oneof proto fields setting a feature of one type +// automatically clears all values stored as another type with the same feature +// key. +// +// This library also has tools to work with SequenceExample protos. 
+// +// To get a value from SequenceExample.context: +// int id = GetFeatureValues("tag", se.context()).Get(0); +// To add a value to the context: +// GetFeatureValues("tag", se.mutable_context())->Add(42); +// +// To add values to feature_lists: +// AppendFeatureValues({4.0}, +// GetFeatureList("movie_ratings", &se)->Add()); +// AppendFeatureValues({5.0, 3.0}, +// GetFeatureList("movie_ratings", &se)->Add()); +// This will create a feature list keyed as "images" with two features: +// feature_lists { +// feature_list { +// key: "images" +// value { +// feature { float_list { value: [4.0] } } +// feature { float_list { value: [5.0, 3.0] } } +// } +// } } +// +// Functions exposed by this library: +// HasFeature<[FeatureType]>(key, proto) -> bool +// Returns true if a feature with the specified key, and optionally +// FeatureType, belongs to the Features or Example proto. +// HasFeatureList(key, sequence_example) -> bool +// Returns true if SequenceExample has a feature_list with the key. +// GetFeatureValues(key, proto) -> RepeatedField +// Returns values for the specified key and the FeatureType. +// Supported types for the proto: Example, Features. +// GetFeatureList(key, sequence_example) -> RepeatedPtrField +// Returns Feature protos associated with a key. +// AppendFeatureValues(begin, end, feature) +// AppendFeatureValues(container or initializer_list, feature) +// Copies values into a Feature. +// AppendFeatureValues(begin, end, key, proto) +// AppendFeatureValues(container or initializer_list, key, proto) +// Copies values into Features and Example protos with the specified key. +// +// Auxiliary functions, it is unlikely you'll need to use them directly: +// GetFeatures(proto) -> Features +// A convenience function to get Features proto. +// Supported types for the proto: Example, Features. +// GetFeature(key, proto) -> Feature* +// Returns a Feature proto for the specified key, creates a new if +// necessary. Supported types for the proto: Example, Features. +// GetFeatureValues(feature) -> RepeatedField +// Returns values of the feature for the FeatureType. #ifndef TENSORFLOW_EXAMPLE_FEATURE_H_ #define TENSORFLOW_EXAMPLE_FEATURE_H_ @@ -62,10 +113,11 @@ namespace tensorflow { namespace internal { +// DEPRECATED: Use GetFeature instead. +// TODO(gorban): Update all clients in a followup CL. // Returns a reference to a feature corresponding to the name. // Note: it will create a new Feature if it is missing in the example. -::tensorflow::Feature& ExampleFeature(const string& name, - ::tensorflow::Example* example); +Feature& ExampleFeature(const string& name, Example* example); // Specializations of RepeatedFieldTrait define a type of RepeatedField // corresponding to a selected feature type. @@ -127,89 +179,135 @@ struct FeatureTrait< } // namespace internal -// Returns true if feature with the specified name belongs to the example proto. -// Doesn't check feature type. Note that specialized versions return false if -// the feature has a wrong type. -template -bool ExampleHasFeature(const string& name, const Example& example) { - return example.features().feature().find(name) != - example.features().feature().end(); -} +// Returns true if sequence_example has a feature_list with the specified key. +bool HasFeatureList(const string& key, const SequenceExample& sequence_example); + +// A family of template functions to return mutable Features proto from a +// container proto. Supported ProtoTypes: Example, Features. 
+template +Features* GetFeatures(ProtoType* proto); + +template +const Features& GetFeatures(const ProtoType& proto); // Base declaration of a family of template functions to return a read only -// repeated field corresponding to a feature with the specified name. +// repeated field of feature values. template const typename internal::RepeatedFieldTrait::Type& -GetFeatureValues(const string& name, const Example& example); +GetFeatureValues(const Feature& feature); -// Base declaration of a family of template functions to return a mutable -// repeated field corresponding to a feature with the specified name. +// Returns a read only repeated field corresponding to a feature with the +// specified name and FeatureType. Supported ProtoTypes: Example, Features. +template +const typename internal::RepeatedFieldTrait::Type& +GetFeatureValues(const string& key, const ProtoType& proto) { + return GetFeatureValues(GetFeatures(proto).feature().at(key)); +} + +// Returns a mutable repeated field of a feature values. template typename internal::RepeatedFieldTrait::Type* GetFeatureValues( - const string& name, Example* example); + Feature* feature); + +// Returns a mutable repeated field corresponding to a feature with the +// specified name and FeatureType. Supported ProtoTypes: Example, Features. +template +typename internal::RepeatedFieldTrait::Type* GetFeatureValues( + const string& key, ProtoType* proto) { + ::tensorflow::Feature& feature = + (*GetFeatures(proto)->mutable_feature())[key]; + return GetFeatureValues(&feature); +} + +// Returns a Feature proto for the specified key, creates a new if necessary. +// Supported types for the proto: Example, Features. +template +Feature* GetFeature(const string& key, ProtoType* proto) { + return &(*GetFeatures(proto)->mutable_feature())[key]; +} + +// Returns a repeated field with features corresponding to a feature_list key. +const protobuf::RepeatedPtrField& GetFeatureList( + const string& key, const SequenceExample& sequence_example); + +// Returns a mutable repeated field with features corresponding to a +// feature_list key. It will create a new FeatureList if necessary. +protobuf::RepeatedPtrField* GetFeatureList( + const string& feature_list_key, SequenceExample* sequence_example); -// Copies elements from the range, defined by [first, last) into a feature. template void AppendFeatureValues(IteratorType first, IteratorType last, - const string& name, Example* example) { + Feature* feature) { using FeatureType = typename internal::FeatureTrait< typename std::iterator_traits::value_type>::Type; - std::copy(first, last, protobuf::RepeatedFieldBackInserter( - GetFeatureValues(name, example))); + std::copy(first, last, + protobuf::RepeatedFieldBackInserter( + GetFeatureValues(feature))); +} + +template +void AppendFeatureValues(std::initializer_list container, + Feature* feature) { + AppendFeatureValues(container.begin(), container.end(), feature); +} + +template +void AppendFeatureValues(const ContainerType& container, Feature* feature) { + using IteratorType = typename ContainerType::const_iterator; + AppendFeatureValues(container.begin(), container.end(), + feature); +} + +// Copies elements from the range, defined by [first, last) into the feature +// obtainable from the (proto, key) combination. +template +void AppendFeatureValues(IteratorType first, IteratorType last, + const string& key, ProtoType* proto) { + AppendFeatureValues(first, last, GetFeature(key, GetFeatures(proto))); } // Copies all elements from the container into a feature. 
-template -void AppendFeatureValues(const ContainerType& container, const string& name, - Example* example) { +template +void AppendFeatureValues(const ContainerType& container, const string& key, + ProtoType* proto) { using IteratorType = typename ContainerType::const_iterator; - AppendFeatureValues(container.begin(), container.end(), name, - example); + AppendFeatureValues(container.begin(), container.end(), key, + proto); } -// Copies all elements from the initializer list into a feature. -template +// Copies all elements from the initializer list into a Feature contained by +// Features or Example proto. +template void AppendFeatureValues(std::initializer_list container, - const string& name, Example* example) { + const string& key, ProtoType* proto) { using IteratorType = typename std::initializer_list::const_iterator; - AppendFeatureValues(container.begin(), container.end(), name, - example); + AppendFeatureValues(container.begin(), container.end(), key, + proto); } -template <> -bool ExampleHasFeature(const string& name, - const Example& example); +// Returns true if a feature with the specified key belongs to the Features. +// The template parameter pack accepts zero or one template argument - which +// is FeatureType. If the FeatureType not specified (zero template arguments) +// the function will not check the feature type. Otherwise it will return false +// if the feature has a wrong type. +template +bool HasFeature(const string& key, const Features& features); -template <> -bool ExampleHasFeature(const string& name, const Example& example); +// Returns true if a feature with the specified key belongs to the Example. +// Doesn't check feature type if used without FeatureType, otherwise the +// specialized versions return false if the feature has a wrong type. +template +bool HasFeature(const string& key, const Example& example) { + return HasFeature(key, GetFeatures(example)); +}; -template <> -bool ExampleHasFeature(const string& name, const Example& example); - -template <> -const protobuf::RepeatedField& GetFeatureValues( - const string& name, const Example& example); - -template <> -protobuf::RepeatedField* GetFeatureValues( - const string& name, Example* example); - -template <> -const protobuf::RepeatedField& GetFeatureValues( - const string& name, const Example& example); - -template <> -protobuf::RepeatedField* GetFeatureValues(const string& name, - Example* example); - -template <> -const protobuf::RepeatedPtrField& GetFeatureValues( - const string& name, const Example& example); - -template <> -protobuf::RepeatedPtrField* GetFeatureValues(const string& name, - Example* example); +// DEPRECATED: use HasFeature instead. +// TODO(gorban): update all clients in a followup CL. +template +bool ExampleHasFeature(const string& key, const Example& example) { + return HasFeature(key, example); +} } // namespace tensorflow #endif // TENSORFLOW_EXAMPLE_FEATURE_H_ diff --git a/tensorflow/core/example/feature_util_test.cc b/tensorflow/core/example/feature_util_test.cc index eb7b90af1b2..cd32dee306d 100644 --- a/tensorflow/core/example/feature_util_test.cc +++ b/tensorflow/core/example/feature_util_test.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ - #include "tensorflow/core/example/feature_util.h" #include @@ -38,6 +37,16 @@ TEST(GetFeatureValuesInt64Test, ReadsASingleValue) { EXPECT_EQ(42, tag.Get(0)); } +TEST(GetFeatureValuesInt64Test, ReadsASingleValueFromFeature) { + Feature feature; + feature.mutable_int64_list()->add_value(42); + + auto values = GetFeatureValues(feature); + + ASSERT_EQ(1, values.size()); + EXPECT_EQ(42, values.Get(0)); +} + TEST(GetFeatureValuesInt64Test, WritesASingleValue) { Example example; @@ -48,25 +57,33 @@ TEST(GetFeatureValuesInt64Test, WritesASingleValue) { EXPECT_EQ(42, example.features().feature().at("tag").int64_list().value(0)); } +TEST(GetFeatureValuesInt64Test, WritesASingleValueToFeature) { + Feature feature; + + GetFeatureValues(&feature)->Add(42); + + ASSERT_EQ(1, feature.int64_list().value_size()); + EXPECT_EQ(42, feature.int64_list().value(0)); +} + TEST(GetFeatureValuesInt64Test, CheckUntypedFieldExistence) { Example example; - - EXPECT_FALSE(ExampleHasFeature("tag", example)); + ASSERT_FALSE(HasFeature("tag", example)); GetFeatureValues("tag", &example)->Add(0); - EXPECT_TRUE(ExampleHasFeature("tag", example)); + EXPECT_TRUE(HasFeature("tag", example)); } TEST(GetFeatureValuesInt64Test, CheckTypedFieldExistence) { Example example; GetFeatureValues("tag", &example)->Add(3.14); - ASSERT_FALSE(ExampleHasFeature("tag", example)); + ASSERT_FALSE(HasFeature("tag", example)); GetFeatureValues("tag", &example)->Add(42); - EXPECT_TRUE(ExampleHasFeature("tag", example)); + EXPECT_TRUE(HasFeature("tag", example)); auto tag_ro = GetFeatureValues("tag", example); ASSERT_EQ(1, tag_ro.size()); EXPECT_EQ(42, tag_ro.Get(0)); @@ -87,6 +104,16 @@ TEST(GetFeatureValuesInt64Test, CopyIterableToAField) { EXPECT_EQ(3, tag_ro.Get(2)); } +TEST(GetFeatureValuesFloatTest, ReadsASingleValueFromFeature) { + Feature feature; + feature.mutable_float_list()->add_value(3.14); + + auto values = GetFeatureValues(feature); + + ASSERT_EQ(1, values.size()); + EXPECT_NEAR(3.14, values.Get(0), kTolerance); +} + TEST(GetFeatureValuesFloatTest, ReadsASingleValue) { Example example; (*example.mutable_features()->mutable_feature())["tag"] @@ -99,6 +126,15 @@ TEST(GetFeatureValuesFloatTest, ReadsASingleValue) { EXPECT_NEAR(3.14, tag.Get(0), kTolerance); } +TEST(GetFeatureValuesFloatTest, WritesASingleValueToFeature) { + Feature feature; + + GetFeatureValues(&feature)->Add(3.14); + + ASSERT_EQ(1, feature.float_list().value_size()); + EXPECT_NEAR(3.14, feature.float_list().value(0), kTolerance); +} + TEST(GetFeatureValuesFloatTest, WritesASingleValue) { Example example; @@ -114,6 +150,20 @@ TEST(GetFeatureValuesFloatTest, WritesASingleValue) { TEST(GetFeatureValuesFloatTest, CheckTypedFieldExistence) { Example example; + GetFeatureValues("tag", &example)->Add(42); + ASSERT_FALSE(HasFeature("tag", example)); + + GetFeatureValues("tag", &example)->Add(3.14); + + EXPECT_TRUE(HasFeature("tag", example)); + auto tag_ro = GetFeatureValues("tag", example); + ASSERT_EQ(1, tag_ro.size()); + EXPECT_NEAR(3.14, tag_ro.Get(0), kTolerance); +} + +TEST(GetFeatureValuesFloatTest, CheckTypedFieldExistenceForDeprecatedMethod) { + Example example; + GetFeatureValues("tag", &example)->Add(42); ASSERT_FALSE(ExampleHasFeature("tag", example)); @@ -125,6 +175,16 @@ TEST(GetFeatureValuesFloatTest, CheckTypedFieldExistence) { EXPECT_NEAR(3.14, tag_ro.Get(0), kTolerance); } +TEST(GetFeatureValuesStringTest, ReadsASingleValueFromFeature) { + Feature feature; + 
feature.mutable_bytes_list()->add_value("FOO"); + + auto values = GetFeatureValues(feature); + + ASSERT_EQ(1, values.size()); + EXPECT_EQ("FOO", values.Get(0)); +} + TEST(GetFeatureValuesStringTest, ReadsASingleValue) { Example example; (*example.mutable_features()->mutable_feature())["tag"] @@ -137,6 +197,15 @@ TEST(GetFeatureValuesStringTest, ReadsASingleValue) { EXPECT_EQ("FOO", tag.Get(0)); } +TEST(GetFeatureValuesStringTest, WritesASingleValueToFeature) { + Feature feature; + + *GetFeatureValues(&feature)->Add() = "FOO"; + + ASSERT_EQ(1, feature.bytes_list().value_size()); + EXPECT_EQ("FOO", feature.bytes_list().value(0)); +} + TEST(GetFeatureValuesStringTest, WritesASingleValue) { Example example; @@ -148,15 +217,15 @@ TEST(GetFeatureValuesStringTest, WritesASingleValue) { example.features().feature().at("tag").bytes_list().value(0)); } -TEST(GetFeatureValuesBytesTest, CheckTypedFieldExistence) { +TEST(GetFeatureValuesStringTest, CheckTypedFieldExistence) { Example example; GetFeatureValues("tag", &example)->Add(42); - ASSERT_FALSE(ExampleHasFeature("tag", example)); + ASSERT_FALSE(HasFeature("tag", example)); *GetFeatureValues("tag", &example)->Add() = "FOO"; - EXPECT_TRUE(ExampleHasFeature("tag", example)); + EXPECT_TRUE(HasFeature("tag", example)); auto tag_ro = GetFeatureValues("tag", example); ASSERT_EQ(1, tag_ro.size()); EXPECT_EQ("FOO", tag_ro.Get(0)); @@ -228,5 +297,146 @@ TEST(AppendFeatureValuesTest, StringVariablesUsingInitializerList) { EXPECT_EQ("BAZ", tag_ro.Get(2)); } +TEST(SequenceExampleTest, ReadsASingleValueFromContext) { + SequenceExample se; + (*se.mutable_context()->mutable_feature())["tag"] + .mutable_int64_list() + ->add_value(42); + + auto values = GetFeatureValues("tag", se.context()); + + ASSERT_EQ(1, values.size()); + EXPECT_EQ(42, values.Get(0)); +} + +TEST(SequenceExampleTest, WritesASingleValueToContext) { + SequenceExample se; + + GetFeatureValues("tag", se.mutable_context())->Add(42); + + ASSERT_EQ(1, se.context().feature().at("tag").int64_list().value_size()); + EXPECT_EQ(42, se.context().feature().at("tag").int64_list().value(0)); +} + +TEST(SequenceExampleTest, AppendFeatureValuesToContextSingleArg) { + SequenceExample se; + + AppendFeatureValues({1.1, 2.2, 3.3}, "tag", se.mutable_context()); + + auto tag_ro = GetFeatureValues("tag", se.context()); + ASSERT_EQ(3, tag_ro.size()); + EXPECT_NEAR(1.1, tag_ro.Get(0), kTolerance); + EXPECT_NEAR(2.2, tag_ro.Get(1), kTolerance); + EXPECT_NEAR(3.3, tag_ro.Get(2), kTolerance); +} + +TEST(SequenceExampleTest, CheckTypedFieldExistence) { + SequenceExample se; + + GetFeatureValues("tag", se.mutable_context())->Add(3.14); + ASSERT_FALSE(HasFeature("tag", se.context())); + + GetFeatureValues("tag", se.mutable_context())->Add(42); + + EXPECT_TRUE(HasFeature("tag", se.context())); + auto tag_ro = GetFeatureValues("tag", se.context()); + ASSERT_EQ(1, tag_ro.size()); + EXPECT_EQ(42, tag_ro.Get(0)); +} + +TEST(SequenceExampleTest, ReturnsExistingFeatureLists) { + SequenceExample se; + (*se.mutable_feature_lists()->mutable_feature_list())["tag"] + .mutable_feature() + ->Add(); + + auto feature = GetFeatureList("tag", se); + + ASSERT_EQ(1, feature.size()); +} + +TEST(SequenceExampleTest, CreatesNewFeatureLists) { + SequenceExample se; + + GetFeatureList("tag", &se)->Add(); + + EXPECT_EQ(1, se.feature_lists().feature_list().at("tag").feature_size()); +} + +TEST(SequenceExampleTest, CheckFeatureListExistence) { + SequenceExample se; + ASSERT_FALSE(HasFeatureList("tag", se)); + + GetFeatureList("tag", &se)->Add(); + + 
ASSERT_TRUE(HasFeatureList("tag", se)); +} + +TEST(SequenceExampleTest, AppendFeatureValuesWithInitializerList) { + SequenceExample se; + + AppendFeatureValues({1, 2, 3}, "ids", se.mutable_context()); + AppendFeatureValues({"cam1-0", "cam2-0"}, + GetFeatureList("images", &se)->Add()); + AppendFeatureValues({"cam1-1", "cam2-2"}, + GetFeatureList("images", &se)->Add()); + + EXPECT_EQ(se.DebugString(), + "context {\n" + " feature {\n" + " key: \"ids\"\n" + " value {\n" + " int64_list {\n" + " value: 1\n" + " value: 2\n" + " value: 3\n" + " }\n" + " }\n" + " }\n" + "}\n" + "feature_lists {\n" + " feature_list {\n" + " key: \"images\"\n" + " value {\n" + " feature {\n" + " bytes_list {\n" + " value: \"cam1-0\"\n" + " value: \"cam2-0\"\n" + " }\n" + " }\n" + " feature {\n" + " bytes_list {\n" + " value: \"cam1-1\"\n" + " value: \"cam2-2\"\n" + " }\n" + " }\n" + " }\n" + " }\n" + "}\n"); +} + +TEST(SequenceExampleTest, AppendFeatureValuesWithVectors) { + SequenceExample se; + + std::vector readings{1.0, 2.5, 5.0}; + AppendFeatureValues(readings, GetFeatureList("movie_ratings", &se)->Add()); + + EXPECT_EQ(se.DebugString(), + "feature_lists {\n" + " feature_list {\n" + " key: \"movie_ratings\"\n" + " value {\n" + " feature {\n" + " float_list {\n" + " value: 1\n" + " value: 2.5\n" + " value: 5\n" + " }\n" + " }\n" + " }\n" + " }\n" + "}\n"); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc index e7092f549b2..f5dadf76daf 100644 --- a/tensorflow/core/framework/allocator.cc +++ b/tensorflow/core/framework/allocator.cc @@ -117,16 +117,6 @@ class CPUAllocator : public Allocator { TF_DISALLOW_COPY_AND_ASSIGN(CPUAllocator); }; -namespace { -Allocator* MakeCpuAllocator() { - Allocator* allocator = new CPUAllocator; - if (cpu_allocator_collect_full_stats || LogMemory::IsEnabled()) { - allocator = new TrackingAllocator(allocator, true); - } - return allocator; -} -} // namespace - Allocator* cpu_allocator() { static Allocator* cpu_alloc = AllocatorRegistry::Global()->GetAllocator(); if (cpu_allocator_collect_full_stats && !cpu_alloc->TracksAllocationSizes()) { diff --git a/tensorflow/core/framework/cancellation.cc b/tensorflow/core/framework/cancellation.cc index 1cbed62939f..9da4828bbad 100644 --- a/tensorflow/core/framework/cancellation.cc +++ b/tensorflow/core/framework/cancellation.cc @@ -23,7 +23,9 @@ namespace tensorflow { const CancellationToken CancellationManager::kInvalidToken = -1; CancellationManager::CancellationManager() - : is_cancelling_(false), is_cancelled_(0), next_cancellation_token_(0) {} + : is_cancelling_(false), + is_cancelled_(false), + next_cancellation_token_(0) {} void CancellationManager::StartCancel() { gtl::FlatMap callbacks_to_run; diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc index c2d3f37ab30..b788d6b7778 100644 --- a/tensorflow/core/framework/function.cc +++ b/tensorflow/core/framework/function.cc @@ -15,6 +15,7 @@ limitations under the License. 
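The framework/function.cc hunk that follows replaces the linear scan over index_ with a lookup that relies on std::map ordering: every key equal to node_name or starting with node_name + ":" lies in the contiguous range [node_name, node_name + ";"), because ';' is the character immediately after ':'. A standalone sketch of that range scan, with hypothetical names (FindNodeId, index), not taken from the patch:

#include <iostream>
#include <map>
#include <string>

// Returns the id mapped to `name` or to any key of the form "name:...",
// scanning only the contiguous range [name, name + ";") of the sorted map.
int FindNodeId(const std::map<std::string, int>& index,
               const std::string& name) {
  const std::string colon = name + ":";
  const std::string bound = name + ";";
  for (auto it = index.lower_bound(name);
       it != index.end() && it->first < bound; ++it) {
    if (it->first == name ||
        it->first.compare(0, colon.size(), colon) == 0) {
      return it->second;
    }
  }
  return -1;  // Not found.
}

int main() {
  const std::map<std::string, int> index = {
      {"add", 1}, {"add:z:0", 2}, {"mul", 3}};
  std::cout << FindNodeId(index, "add") << "\n";  // 1
  std::cout << FindNodeId(index, "sub") << "\n";  // -1
  return 0;
}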
#include "tensorflow/core/framework/function.h" +#include #include #include #include @@ -271,12 +272,17 @@ class FunctionInstantiationHelper { int nid = -1; const string node_name = input.substr(1); const string node_colon = node_name + ":"; - for (const auto& p : index_) { - if (p.first == node_name || - tensorflow::StringPiece(p.first).starts_with(node_colon)) { - nid = p.second.nid; + const string node_colon_bound = node_name + ";"; + // index_ is a map sorted lexicographically, so the key we are looking for + // must lie in the range [node_name, node_colon_bound). + auto it = index_.lower_bound(node_name); + while (it != index_.end() && it->first <= node_colon_bound) { + if (it->first == node_name || + tensorflow::StringPiece(it->first).starts_with(node_colon)) { + nid = it->second.nid; break; } + ++it; } if (nid == -1) { return errors::InvalidArgument("input[", i, "] == '", input, @@ -421,7 +427,7 @@ class FunctionInstantiationHelper { GetFunctionSignature get_function_; InstantiationResult& result_; // A small index for all names that can be used as a node's input arguments. - std::unordered_map index_; + std::map index_; // This contains information about a node in the new graph including the node // names and input nodes' indexes. struct NodeInfo { diff --git a/tensorflow/core/framework/tensor_testutil.h b/tensorflow/core/framework/tensor_testutil.h index ab224aa7188..4c216a84f04 100644 --- a/tensorflow/core/framework/tensor_testutil.h +++ b/tensorflow/core/framework/tensor_testutil.h @@ -166,10 +166,11 @@ struct Expector { static void Equal(const Tensor& x, const Tensor& y) { ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); AssertSameTypeDims(x, y); - auto a = x.flat(); - auto b = y.flat(); - for (int i = 0; i < a.size(); ++i) { - ExpectEqual(a(i), b(i)); + const auto size = x.NumElements(); + const T* a = x.flat().data(); + const T* b = y.flat().data(); + for (int i = 0; i < size; ++i) { + ExpectEqual(a[i], b[i]); } } }; @@ -182,10 +183,11 @@ struct Expector { static void Equal(const Tensor& x, const Tensor& y) { ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); AssertSameTypeDims(x, y); - auto a = x.flat(); - auto b = y.flat(); - for (int i = 0; i < a.size(); ++i) { - ExpectEqual(a(i), b(i)); + const auto size = x.NumElements(); + const T* a = x.flat().data(); + const T* b = y.flat().data(); + for (int i = 0; i < size; ++i) { + ExpectEqual(a[i], b[i]); } } @@ -199,10 +201,11 @@ struct Expector { static void Near(const Tensor& x, const Tensor& y, const double abs_err) { ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); AssertSameTypeDims(x, y); - auto a = x.flat(); - auto b = y.flat(); - for (int i = 0; i < a.size(); ++i) { - Near(a(i), b(i), abs_err, i); + const auto size = x.NumElements(); + const T* a = x.flat().data(); + const T* b = y.flat().data(); + for (int i = 0; i < size; ++i) { + Near(a[i], b[i], abs_err, i); } } }; diff --git a/tensorflow/core/framework/variant_op_registry.cc b/tensorflow/core/framework/variant_op_registry.cc index 11756c356aa..9cc7530459e 100644 --- a/tensorflow/core/framework/variant_op_registry.cc +++ b/tensorflow/core/framework/variant_op_registry.cc @@ -88,7 +88,17 @@ bool DecodeUnaryVariant(Variant* variant) { if (decode_fn == nullptr) { return false; } - return (*decode_fn)(variant); + const string type_name = variant->TypeName(); + bool decoded = (*decode_fn)(variant); + if (!decoded) return false; + if (variant->TypeName() != type_name) { + LOG(ERROR) << "DecodeUnaryVariant: Variant type_name before decoding was: " + << type_name + << " but after decoding was: " 
<< variant->TypeName() + << ". Treating this as a failure."; + return false; + } + return true; } // Add some basic registrations for use by others, e.g., for testing. @@ -101,15 +111,59 @@ string MaybeRemoveTFPrefix(const StringPiece& str) { } // namespace #define REGISTER_VARIANT_DECODE_TYPE(T) \ - REGISTER_UNARY_VARIANT_DECODE_FUNCTION(T, MaybeRemoveTFPrefix(TF_STR(T))); + REGISTER_UNARY_VARIANT_DECODE_FUNCTION(T, TF_STR(T)); // No encode/decode registered for std::complex<> and Eigen::half // objects yet. -TF_CALL_INTEGRAL_TYPES(REGISTER_VARIANT_DECODE_TYPE); -TF_CALL_float(REGISTER_VARIANT_DECODE_TYPE); -TF_CALL_double(REGISTER_VARIANT_DECODE_TYPE); -TF_CALL_bool(REGISTER_VARIANT_DECODE_TYPE); +REGISTER_VARIANT_DECODE_TYPE(int); +REGISTER_VARIANT_DECODE_TYPE(float); +REGISTER_VARIANT_DECODE_TYPE(bool); +REGISTER_VARIANT_DECODE_TYPE(double); #undef REGISTER_VARIANT_DECODE_TYPE +// Special casing ZerosLikeFn per device. +UnaryVariantOpRegistry::VariantZerosLikeFn* +UnaryVariantOpRegistry::GetZerosLikeFn(const string& device, + const string& type_name) { + auto found = zeros_like_fns.find(std::make_pair(device, type_name)); + if (found == zeros_like_fns.end()) return nullptr; + return &found->second; +} + +void UnaryVariantOpRegistry::RegisterZerosLikeFn( + const string& device, const string& type_name, + const VariantZerosLikeFn& zeros_like_fn) { + CHECK(!type_name.empty()) << "Need a valid name for UnaryVariantZerosLike"; + VariantZerosLikeFn* existing = GetZerosLikeFn(device, type_name); + CHECK_EQ(existing, nullptr) + << "Unary VariantZerosLikeFn for type_name: " << type_name + << " already registered for device type: " << device; + zeros_like_fns.insert( + std::pair, VariantZerosLikeFn>( + std::make_pair(device, type_name), zeros_like_fn)); +} + +namespace { + +template +Status ZerosLikeVariantPrimitiveType(OpKernelContext* ctx, const T& t, + T* t_out) { + *t_out = T(0); + return Status::OK(); +} +} // namespace + +#define REGISTER_VARIANT_ZEROS_LIKE_TYPE(T) \ + REGISTER_UNARY_VARIANT_ZEROS_LIKE_FUNCTION( \ + DEVICE_CPU, T, TF_STR(T), ZerosLikeVariantPrimitiveType); + +// No zeros_like registered for std::complex<> or Eigen::half objects yet. +REGISTER_VARIANT_ZEROS_LIKE_TYPE(int); +REGISTER_VARIANT_ZEROS_LIKE_TYPE(float); +REGISTER_VARIANT_ZEROS_LIKE_TYPE(double); +REGISTER_VARIANT_ZEROS_LIKE_TYPE(bool); + +#undef REGISTER_VARIANT_ZEROS_LIKE_TYPE + } // namespace tensorflow diff --git a/tensorflow/core/framework/variant_op_registry.h b/tensorflow/core/framework/variant_op_registry.h index 389b049fa01..37e54f82c0f 100644 --- a/tensorflow/core/framework/variant_op_registry.h +++ b/tensorflow/core/framework/variant_op_registry.h @@ -19,11 +19,13 @@ limitations under the License. #include #include +#include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/variant.h" #include "tensorflow/core/framework/variant_encode_decode.h" namespace tensorflow { +class OpKernelContext; // A global UnaryVariantOpRegistry is used to hold callback functions // for different variant types. To be used by ShapeOp, RankOp, and // SizeOp, decoding, etc. @@ -32,6 +34,8 @@ class UnaryVariantOpRegistry { public: typedef std::function VariantShapeFn; typedef std::function VariantDecodeFn; + typedef std::function + VariantZerosLikeFn; // Add a shape lookup function to the registry. void RegisterShapeFn(const string& type_name, const VariantShapeFn& shape_fn); @@ -46,11 +50,29 @@ class UnaryVariantOpRegistry { // Returns nullptr if no decode function was found for the given TypeName. 
VariantDecodeFn* GetDecodeFn(const string& type_name); + // Add a zeros-like function to the registry. + void RegisterZerosLikeFn(const string& device, const string& type_name, + const VariantZerosLikeFn& zeros_like_fn); + + // Returns nullptr if no zeros-like function was found for the given + // device and TypeName. + VariantZerosLikeFn* GetZerosLikeFn(const string& device, + const string& type_name); + static UnaryVariantOpRegistry* Global(); private: std::unordered_map shape_fns; std::unordered_map decode_fns; + // Map std::pair to function. + struct PairHash { + template + std::size_t operator()(const std::pair& x) const { + return std::hash()(x.first) ^ std::hash()(x.second); + } + }; + std::unordered_map, VariantZerosLikeFn, PairHash> + zeros_like_fns; }; // Gets a TensorShape from a Tensor containing a scalar Variant. @@ -72,6 +94,28 @@ Status GetUnaryVariantShape(const Tensor& variant_tensor, TensorShape* shape); // bool DecodeUnaryVariant(Variant* variant); +// Sets *z_out = zeros_like(v). The variant v must have a registered +// ZerosLike function for the given Device. Returns an Internal error +// if v does not have a registered zeros_like function for this device, or if +// ZerosLike fails. +// +// REQUIRES: +// v_out is not null. +// +template +Status CreateZerosLikeVariant(OpKernelContext* ctx, const Variant& v, + Variant* v_out) { + const string& device = DeviceName::value; + UnaryVariantOpRegistry::VariantZerosLikeFn* zeros_like_fn = + UnaryVariantOpRegistry::Global()->GetZerosLikeFn(device, v.TypeName()); + if (zeros_like_fn == nullptr) { + return errors::Internal( + "No unary variant zeros_like function found for Variant type_name: ", + v.TypeName(), " for device type: ", device); + } + return (*zeros_like_fn)(ctx, v, v_out); +} + namespace variant_op_registry_fn_registration { template @@ -120,6 +164,34 @@ class UnaryVariantDecodeRegistration { } }; +template +class UnaryVariantZerosLikeRegistration { + typedef std::function + LocalVariantZerosLikeFn; + + public: + UnaryVariantZerosLikeRegistration( + const string& device, const string& type_name, + const LocalVariantZerosLikeFn& zeros_like_fn) { + auto wrapped_fn = [type_name, zeros_like_fn](OpKernelContext* ctx, + const Variant& v, + Variant* v_out) -> Status { + CHECK_NOTNULL(v_out); + *v_out = T(); + if (v.get() == nullptr) { + return errors::Internal( + "VariantZerosLikeFn: Could not access object, type_name: ", + type_name); + } + const T& t = *v.get(); + T* t_out = v_out->get(); + return zeros_like_fn(ctx, t, t_out); + }; + UnaryVariantOpRegistry::Global()->RegisterZerosLikeFn(device, type_name, + wrapped_fn); + } +}; + }; // namespace variant_op_registry_fn_registration // Register a unary shape variant function with the signature: @@ -151,6 +223,26 @@ class UnaryVariantDecodeRegistration { T> \ register_unary_variant_op_decoder_fn_##ctr(type_name) +// Register a unary zeros_like variant function with the signature: +// Status ZerosLikeFn(OpKernelContext* ctx, const T& t, T* t_out); +// to Variants having TypeName type_name, for device string device. 
+#define REGISTER_UNARY_VARIANT_ZEROS_LIKE_FUNCTION(device, T, type_name, \ + zeros_like_function) \ + REGISTER_UNARY_VARIANT_ZEROS_LIKE_FUNCTION_UNIQ_HELPER( \ + __COUNTER__, device, T, type_name, zeros_like_function) + +#define REGISTER_UNARY_VARIANT_ZEROS_LIKE_FUNCTION_UNIQ_HELPER( \ + ctr, device, T, type_name, zeros_like_function) \ + REGISTER_UNARY_VARIANT_ZEROS_LIKE_FUNCTION_UNIQ(ctr, device, T, type_name, \ + zeros_like_function) + +#define REGISTER_UNARY_VARIANT_ZEROS_LIKE_FUNCTION_UNIQ( \ + ctr, device, T, type_name, zeros_like_function) \ + static variant_op_registry_fn_registration:: \ + UnaryVariantZerosLikeRegistration \ + register_unary_variant_op_decoder_fn_##ctr(device, type_name, \ + zeros_like_function) + } // end namespace tensorflow #endif // TENSORFLOW_FRAMEWORK_VARIANT_OP_REGISTRY_H_ diff --git a/tensorflow/core/framework/variant_op_registry_test.cc b/tensorflow/core/framework/variant_op_registry_test.cc index 86fef53dbe6..4e79180217a 100644 --- a/tensorflow/core/framework/variant_op_registry_test.cc +++ b/tensorflow/core/framework/variant_op_registry_test.cc @@ -15,13 +15,25 @@ limitations under the License. #include +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif + #include "tensorflow/core/framework/variant_op_registry.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + namespace { struct VariantValue { @@ -33,7 +45,24 @@ struct VariantValue { *s = TensorShape({-0xdeadbeef}); return Status::OK(); } + static Status CPUZerosLikeFn(OpKernelContext* ctx, const VariantValue& v, + VariantValue* v_out) { + if (v.early_exit) { + return errors::InvalidArgument("early exit zeros_like!"); + } + v_out->zeros_like_set = 1; // CPU + return Status::OK(); + } + static Status GPUZerosLikeFn(OpKernelContext* ctx, const VariantValue& v, + VariantValue* v_out) { + if (v.early_exit) { + return errors::InvalidArgument("early exit zeros_like!"); + } + v_out->zeros_like_set = 2; // GPU + return Status::OK(); + } bool early_exit; + int zeros_like_set; }; REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(VariantValue, "TEST VariantValue", @@ -41,6 +70,14 @@ REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(VariantValue, "TEST VariantValue", REGISTER_UNARY_VARIANT_DECODE_FUNCTION(VariantValue, "TEST VariantValue"); +REGISTER_UNARY_VARIANT_ZEROS_LIKE_FUNCTION(DEVICE_CPU, VariantValue, + "TEST VariantValue", + VariantValue::CPUZerosLikeFn); + +REGISTER_UNARY_VARIANT_ZEROS_LIKE_FUNCTION(DEVICE_GPU, VariantValue, + "TEST VariantValue", + VariantValue::GPUZerosLikeFn); + } // namespace TEST(VariantOpShapeRegistryTest, TestBasic) { @@ -101,4 +138,67 @@ TEST(VariantOpDecodeRegistryTest, TestDuplicate) { "fjfjfj already registered"); } +TEST(VariantOpZerosLikeRegistryTest, TestBasicCPU) { + EXPECT_EQ(UnaryVariantOpRegistry::Global()->GetZerosLikeFn( + DEVICE_CPU, "YOU SHALL NOT PASS"), + nullptr); + + VariantValue vv_early_exit{true /* early_exit */, 0 /* zeros_like_set */}; + Variant v = vv_early_exit; + Variant v_out = VariantValue(); + + OpKernelContext* null_context_pointer = nullptr; + Status s0 = + CreateZerosLikeVariant(null_context_pointer, v, &v_out); + EXPECT_FALSE(s0.ok()); + EXPECT_TRUE( + StringPiece(s0.error_message()).contains("early exit zeros_like")); + + VariantValue 
vv_ok{false /* early_exit */, 0 /* zeros_like_set */}; + v = vv_ok; + TF_EXPECT_OK( + CreateZerosLikeVariant(null_context_pointer, v, &v_out)); + VariantValue* vv_out = CHECK_NOTNULL(v_out.get()); + EXPECT_EQ(vv_out->zeros_like_set, 1); // CPU +} + +#if GOOGLE_CUDA +TEST(VariantOpZerosLikeRegistryTest, TestBasicGPU) { + EXPECT_EQ(UnaryVariantOpRegistry::Global()->GetZerosLikeFn( + DEVICE_GPU, "YOU SHALL NOT PASS"), + nullptr); + + VariantValue vv_early_exit{true /* early_exit */, 0 /* zeros_like_set */}; + Variant v = vv_early_exit; + Variant v_out = VariantValue(); + + OpKernelContext* null_context_pointer = nullptr; + Status s0 = + CreateZerosLikeVariant(null_context_pointer, v, &v_out); + EXPECT_FALSE(s0.ok()); + EXPECT_TRUE( + StringPiece(s0.error_message()).contains("early exit zeros_like")); + + VariantValue vv_ok{false /* early_exit */, 0 /* zeros_like_set */}; + v = vv_ok; + TF_EXPECT_OK( + CreateZerosLikeVariant(null_context_pointer, v, &v_out)); + VariantValue* vv_out = CHECK_NOTNULL(v_out.get()); + EXPECT_EQ(vv_out->zeros_like_set, 2); // GPU +} +#endif // GOOGLE_CUDA + +TEST(VariantOpZerosLikeRegistryTest, TestDuplicate) { + UnaryVariantOpRegistry registry; + UnaryVariantOpRegistry::VariantZerosLikeFn f; + + registry.RegisterZerosLikeFn(DEVICE_CPU, "fjfjfj", f); + EXPECT_DEATH(registry.RegisterZerosLikeFn(DEVICE_CPU, "fjfjfj", f), + "fjfjfj already registered"); + + registry.RegisterZerosLikeFn(DEVICE_GPU, "fjfjfj", f); + EXPECT_DEATH(registry.RegisterZerosLikeFn(DEVICE_GPU, "fjfjfj", f), + "fjfjfj already registered"); +} + } // namespace tensorflow diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc index 7d938365c5a..a274c799704 100644 --- a/tensorflow/core/graph/graph.cc +++ b/tensorflow/core/graph/graph.cc @@ -523,6 +523,17 @@ Status Graph::IsValidNode(const Node* node) const { return Status::OK(); } +Status Graph::IsValidOutputTensor(const Node* node, int idx) const { + TF_RETURN_IF_ERROR(IsValidNode(node)); + if (idx >= node->num_outputs()) { + return errors::InvalidArgument("Node '", node->name(), "' (type: '", + node->op_def().name(), + "', num of outputs: ", node->num_outputs(), + ") does not have ", "output ", idx); + } + return Status::OK(); +} + Node* Graph::AllocateNode(std::shared_ptr props, const Node* cost_node) { Node* node = nullptr; @@ -572,7 +583,7 @@ int Graph::InternDeviceName(const string& device_name) { } string Edge::DebugString() const { - return strings::Printf("Edge %d %s:%d -> %s:%d", id_, src_->name().c_str(), + return strings::Printf("[id=%d %s:%d -> %s:%d]", id_, src_->name().c_str(), src_output_, dst_->name().c_str(), dst_input_); } diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h index 51ede642d27..25875185e47 100644 --- a/tensorflow/core/graph/graph.h +++ b/tensorflow/core/graph/graph.h @@ -519,6 +519,10 @@ class Graph { // Returns OK if `node` is non-null and belongs to this graph Status IsValidNode(const Node* node) const; + // Returns OK if IsValidNode(`node`) and `idx` is less than + // node->num_outputs() + Status IsValidOutputTensor(const Node* node, int idx) const; + // TODO(josh11b): uint64 hash() const; private: diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index 2f9ceaa3bd0..4c793231974 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -1099,6 +1099,44 @@ int MklLayoutRewritePass::SetUpContiguousInputs( CHECK_NOTNULL(workspace_tensors); CHECK_EQ(kTensorOrdering, 
MklTfTensorOrdering::TENSORS_CONTIGUOUS); + // TODO(nhasabni): Temporary solution to connect filter input of + // BackpropInput with the converted filter from Conv2D. + bool do_connect_conv2d_backprop_input_filter = false; + Node* conv2d_node = nullptr; + // Filter node is 2nd input (slot index 1) of Conv2D. + int kConv2DFilterInputSlotIdx = 1; + int kConv2DBackpropInputFilterInputSlotIdx = 1; + int kConv2DFilterOutputSlotIdx = 1; + if (old_node->type_string() == csinfo_.conv2d_grad_input) { + // We need to find Conv2D node from Conv2DBackpropInput. + // For that let's first find filter node that is 2nd input (slot 1) + // of BackpropInput. + Node* filter_node = nullptr; + old_node->input_node(kConv2DBackpropInputFilterInputSlotIdx, &filter_node); + CHECK_NOTNULL(filter_node); + + // Now check which nodes receive from filter_node. Filter feeds as + // 2nd input (slot 1) of _MklConv2D and _MklConv2DWithBias. + for (const Edge* e : filter_node->out_edges()) { + if (e->dst()->type_string() == csinfo_.mkl_conv2d && + e->dst_input() == kConv2DFilterInputSlotIdx + /* filter is 2nd input of Conv2D and _MklConv2D. */) { + if (conv2d_node != nullptr) { + VLOG(1) << "MklLayoutRewritePass: unusual case of same filter" + << " feeding multiple Conv2D nodes: " + << filter_node->DebugString(); + // We will not connect filter input of Conv2DBackpropInput + // to be safe here. + do_connect_conv2d_backprop_input_filter = false; + break; + } else { + conv2d_node = e->dst(); + do_connect_conv2d_backprop_input_filter = true; + } + } + } + } + // Number of input slots to original op // Input slots are represented by .Input() calls in REGISTER_OP. int old_node_input_slots = old_node->op_def().input_arg_size(); @@ -1122,7 +1160,13 @@ int MklLayoutRewritePass::SetUpContiguousInputs( nb->Input(new_node_inputs); nn_slot_idx++; } else { - nb->Input(old_node_inputs[iidx].first, old_node_inputs[iidx].second); + // Special case for connecting filter input of Conv2DBackpropInput + if (do_connect_conv2d_backprop_input_filter && + iidx == kConv2DBackpropInputFilterInputSlotIdx) { + nb->Input(conv2d_node, kConv2DFilterOutputSlotIdx); + } else { + nb->Input(old_node_inputs[iidx].first, old_node_inputs[iidx].second); + } iidx++; nn_slot_idx++; } @@ -1157,9 +1201,17 @@ int MklLayoutRewritePass::SetUpContiguousInputs( } else { Node* mkl_node = nullptr; int mkl_node_output_slot = 0; - GetNodeProducingMklTensor(g, old_node, old_node_inputs[iidx].first, - old_node_inputs[iidx].second, - &mkl_node, &mkl_node_output_slot); + // Special case for connecting filter input of Conv2DBackpropInput + if (do_connect_conv2d_backprop_input_filter && + iidx == kConv2DBackpropInputFilterInputSlotIdx) { + GetNodeProducingMklTensor(g, old_node, conv2d_node, + kConv2DFilterOutputSlotIdx, + &mkl_node, &mkl_node_output_slot); + } else { + GetNodeProducingMklTensor(g, old_node, old_node_inputs[iidx].first, + old_node_inputs[iidx].second, + &mkl_node, &mkl_node_output_slot); + } nb->Input(mkl_node, mkl_node_output_slot); iidx++; nn_slot_idx++; diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index 482e339802f..bd1d74368e5 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -788,7 +788,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) { "DMT/_1(Const);DMT/_2(Const);E(Mul)|A->C;A->D;" "A:control->DMT/_0:control;A:control->DMT/_1:control;" "A:control->DMT/_2:control;B->C:1;C->D:1;C->E;" - 
"C:1->D:3;D->E:1;DMT/_0->C:2;DMT/_1->C:3;DMT/_2->D:2"); + "C:2->D:3;D->E:1;DMT/_0->C:2;DMT/_1->C:3;DMT/_2->D:2"); } // Conv2D with INT32 which is not supported by Mkl @@ -917,7 +917,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_Mkl) { "A:control->DMT/_2:control;A:control->DMT/_3:control;" "B->E:1;C->F;C:control->DMT/_0:control;C:control->DMT/_1:control;" "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;" - "DMT/_4->H:3;E->H:1;E:1->H:4;F->H:2;F:1->H:5;G->H;" + "DMT/_4->H:3;E->H:1;E:2->H:4;F->H:2;F:2->H:5;G->H;" "G:control->DMT/_4:control;H->I:1"); } @@ -953,7 +953,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_MixedMkl) { "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Mul);G(Const);" "H(_MklConcat);I(Mul)|A->E;A->I;A:control->DMT/_0:control;" "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;" - "DMT/_1->E:3;DMT/_2->H:3;DMT/_3->H:5;E->H:1;E:1->H:4;F->H:2;" + "DMT/_1->E:3;DMT/_2->H:3;DMT/_3->H:5;E->H:1;E:2->H:4;F->H:2;" "G->H;G:control->DMT/_2:control;G:control->DMT/_3:control;H->I:1"); } @@ -1023,8 +1023,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_Mkl) { "A:control->DMT/_2:control;A:control->DMT/_3:control;B->E:1;C->F;" "C:control->DMT/_0:control;C:control->DMT/_1:control;" "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;" - "DMT/_4->H:5;E->H;E:1->H:3;E:control->DMT/_4:control;F->H:1;" - "F:1->H:4;G->H:2;H->I:1"); + "DMT/_4->H:5;E->H;E:2->H:3;E:control->DMT/_4:control;F->H:1;" + "F:2->H:4;G->H:2;H->I:1"); } // ConcatV2 with 1 Mkl and 1 non-Mkl layer feeding it @@ -1060,7 +1060,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_MixedMkl) { "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Mul);G(Const);" "H(_MklConcatV2);I(Mul)|A->E;A->I;A:control->DMT/_0:control;" "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;" - "DMT/_1->E:3;DMT/_2->H:4;DMT/_3->H:5;E->H;E:1->H:3;" + "DMT/_1->E:3;DMT/_2->H:4;DMT/_3->H:5;E->H;E:2->H:3;" "E:control->DMT/_2:control;E:control->DMT/_3:control;F->H:1;" "G->H:2;H->I:1"); } diff --git a/tensorflow/core/graph/mkl_tfconversion_pass_test.cc b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc index 90bef111648..b01818f7461 100644 --- a/tensorflow/core/graph/mkl_tfconversion_pass_test.cc +++ b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc @@ -173,13 +173,13 @@ TEST_F(MklToTfConversionPass, Positive) { EXPECT_EQ(DoRunMklToTfConversionPass(), "A(Input);B(Input);C(_MklConv2D);D(Input);E(Sub);M(_MklInput);" "Mkl2Tf/_0(_MklToTf);N(_MklInput)|A->C;B->C:1;C->Mkl2Tf/_0;" - "C:1->Mkl2Tf/_0:1;D->E:1;M->C:2;Mkl2Tf/_0->E;N->C:3"); + "C:2->Mkl2Tf/_0:1;D->E:1;M->C:2;Mkl2Tf/_0->E;N->C:3"); } } // MklConv2D followed by MklToTf op followed by Non-Mkl layer. // C=MklConv2D(A,M,B,N); D=MklToTf(C:0, C:1) F=Sub(D,E) (for interleaved) -// C=MklConv2D(A,B,M,N); D=MklToTf(C:0, C:1) F=Sub(D,E) (for contiguous) +// C=MklConv2D(A,B,M,N); D=MklToTf(C:0, C:2) F=Sub(D,E) (for contiguous) // MklToTf node should not be inserted again. 
TEST_F(MklToTfConversionPass, Negative_DoubleInsert) { if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) { @@ -226,7 +226,7 @@ TEST_F(MklToTfConversionPass, Negative_DoubleInsert) { "node { name: 'D' op: '_MklToTf'" " attr { key: 'T' value { type: DT_FLOAT } }" " attr { key: 'data_format' value { s: 'NCHW' } }" - " input: ['C:0', 'C:1']}" + " input: ['C:0', 'C:2']}" "node { name: 'E' op: 'Input'}" "node { name: 'F' op: 'Sub'" " attr {key: 'T' value { type: DT_FLOAT } }" @@ -234,7 +234,7 @@ TEST_F(MklToTfConversionPass, Negative_DoubleInsert) { EXPECT_EQ(DoRunMklToTfConversionPass(), "A(Input);B(Input);C(_MklConv2D);D(_MklToTf);E(Input);" "F(Sub);M(_MklInput);N(_MklInput)|" - "A->C;B->C:1;C->D;C:1->D:1;D->F;E->F:1;M->C:2;N->C:3"); + "A->C;B->C:1;C->D;C:2->D:1;D->F;E->F:1;M->C:2;N->C:3"); } } diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc index 0ab6aff250b..1b1c88f2df4 100644 --- a/tensorflow/core/grappler/costs/graph_properties.cc +++ b/tensorflow/core/grappler/costs/graph_properties.cc @@ -396,6 +396,18 @@ Status GraphProperties::InferStatically() { } input_properties.push_back(properties); } + for (const auto& edge : node->in_edges()) { + if (!edge->src()->IsConstant()) { + continue; + } + const int input_id = edge->dst_input(); + if (input_id >= input_properties.size()) { + continue; + } + const NodeDef& node = edge->src()->def(); + const TensorProto& raw_val = node.attr().at("value").tensor(); + *input_properties[input_id].mutable_value() = raw_val; + } input_properties_[node->name()] = input_properties; // TODO(bsteiner): share this code with the input processing above. diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc index 954c5ead8fc..461e58cf736 100644 --- a/tensorflow/core/grappler/costs/graph_properties_test.cc +++ b/tensorflow/core/grappler/costs/graph_properties_test.cc @@ -345,6 +345,15 @@ TEST_F(GraphPropertiesTest, MergeWithoutLoops) { EXPECT_EQ(DT_FLOAT, prop.dtype()); EXPECT_EQ(expected_outputs[i], PropToString(prop)); } + + // The "Less" node should be fed by 2 int32 scalar constant values. 
+ const auto props = properties.GetInputProperties("Less"); + EXPECT_EQ(2, props.size()); + for (int i = 0; i < props.size(); ++i) { + EXPECT_EQ(DT_INT32, props[i].dtype()); + EXPECT_TRUE(props[i].has_value()); + EXPECT_EQ("int32: []", PropToString(props[i])); + } } TEST_F(GraphPropertiesTest, WhileLoop) { diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 0893a012047..efc5d7c553a 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -259,19 +259,13 @@ cc_library( cc_library( name = "conv_ops_gpu_hdrs", hdrs = ["conv_ops_gpu.h"], - deps = [ - ":eigen_helpers", - "//third_party/eigen3", - ], + deps = ["//third_party/eigen3"], ) cc_library( name = "gpu_util_hdrs", hdrs = ["gpu_utils.h"], - deps = [ - ":eigen_helpers", - "//third_party/eigen3", - ], + deps = ["//third_party/eigen3"], ) tf_cc_test( @@ -2587,14 +2581,17 @@ tf_kernel_library( tf_kernel_library( name = "reduction_ops", + srcs = ["reduction_ops_gpu_kernels.h"], prefix = "reduction_ops", - deps = MATH_DEPS, + deps = MATH_DEPS + if_cuda(["@cub_archive//:cub"]), ) tf_kernel_library( name = "segment_reduction_ops", prefix = "segment_reduction_ops", - deps = MATH_DEPS, + deps = MATH_DEPS + if_cuda([ + ":cuda_solvers", + ]), ) tf_kernel_library( @@ -3070,14 +3067,16 @@ tf_kernel_library( tf_kernel_library( name = "l2loss_op", prefix = "l2loss_op", + #srcs = ["reduction_ops_gpu_kernels.h"], deps = [ + ":reduction_ops", + "//third_party/eigen3", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:nn_grad", "//tensorflow/core:nn_ops_op_lib", - "//third_party/eigen3", - ], + ] + if_cuda(["@cub_archive//:cub"]), ) tf_cuda_cc_test( @@ -3346,6 +3345,20 @@ tf_kernel_library( deps = PARSING_DEPS, ) +tf_cc_test( + name = "parse_tensor_test", + srcs = ["parse_tensor_test.cc"], + deps = [ + ":ops_testutil", + ":ops_util", + ":parse_tensor_op", + "//tensorflow/core:framework", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + tf_kernel_library( name = "string_to_number_op", prefix = "string_to_number_op", @@ -4668,6 +4681,8 @@ filegroup( "whole_file_read_ops.*", "sample_distorted_bounding_box_op.*", "ctc_loss_op.*", + "summary_interface.*", + "summary_kernels.*", "spectrogram_convert_test_data.cc", "sql_dataset_ops.cc", # Excluded due to experimental status: @@ -5957,6 +5972,43 @@ tf_kernel_library( ], ) +cc_library( + name = "summary_interface", + srcs = ["summary_interface.cc"], + hdrs = ["summary_interface.h"], + deps = [ + "//tensorflow/compiler/xla:util", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:proto_text", + "//tensorflow/core:protos_all_cc", + ], +) + +cc_test( + name = "summary_interface_test", + srcs = ["summary_interface_test.cc"], + deps = [ + ":summary_interface", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +tf_kernel_library( + name = "summary_kernels", + srcs = ["summary_kernels.cc"], + deps = [ + ":summary_interface", + "//tensorflow/core:framework", + "//tensorflow/core:summary_ops_op_lib", + ], +) + # ----------------------------------------------------------------------------- # Google-internal targets. These must be at the end for syncrepo. 
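Tying the variant registry changes above to the kernel changes below: a registered zeros-like callback is looked up by its (device, type_name) pair when ZerosLikeOp runs on a DT_VARIANT input. A minimal registration sketch, assuming only the macro and callback signature added in variant_op_registry.h; MyCounter and MyCounterZerosLike are hypothetical names, not part of the patch, and the Variant Encode/Decode hooks are omitted for brevity:

#include "tensorflow/core/framework/variant_op_registry.h"

namespace tensorflow {

// A user-defined value stored inside a Variant. TypeName() must return the
// same string that is passed to the registration macro below.
struct MyCounter {
  int64 count = 0;
  string TypeName() const { return "MyCounter"; }
};

// Callback signature required by the registry:
// Status(OpKernelContext*, const T&, T*).
Status MyCounterZerosLike(OpKernelContext* ctx, const MyCounter& c,
                          MyCounter* out) {
  out->count = 0;  // "Zeros" for this type is simply a reset counter.
  return Status::OK();
}

REGISTER_UNARY_VARIANT_ZEROS_LIKE_FUNCTION(DEVICE_CPU, MyCounter, "MyCounter",
                                           MyCounterZerosLike);

}  // namespace tensorflow

With such a registration in place, the DT_VARIANT branch added to ZerosLikeOp in constant_op.cc further below reaches the callback through CreateZerosLikeVariant<Device>, and returns an Internal error when nothing is registered for the (device, type_name) pair.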
diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc index e07ca5e0c4c..ddc2d457b0e 100644 --- a/tensorflow/core/kernels/bias_op_gpu.cu.cc +++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc @@ -142,9 +142,9 @@ __global__ void BiasGradNCHW_SharedAtomics(const T* output_backprop, int group_size) { // Initialize the shared memory. typedef typename AccumulatorType::type AccT; - __shared__ AccT s_data[32]; - int32 s_data_size = sizeof(s_data) / sizeof(T); - for (int32 index = threadIdx.x; index < s_data_size; index += blockDim.x) { + const int32 kSDataSize = 32; + __shared__ AccT s_data[kSDataSize]; + for (int32 index = threadIdx.x; index < kSDataSize; index += blockDim.x) { s_data[index] = AccT(0); } __syncthreads(); diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc index b4b37dd4b8e..cdc11452827 100644 --- a/tensorflow/core/kernels/constant_op.cc +++ b/tensorflow/core/kernels/constant_op.cc @@ -17,6 +17,10 @@ limitations under the License. #define EIGEN_USE_THREADS +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif + #include "tensorflow/core/kernels/constant_op.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" @@ -26,13 +30,14 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/variant_op_registry.h" #include "tensorflow/core/kernels/bounds_check.h" #include "tensorflow/core/kernels/fill_functor.h" #include "tensorflow/core/platform/macros.h" #ifdef TENSORFLOW_USE_SYCL #include "tensorflow/core/common_runtime/sycl/sycl_util.h" -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL namespace tensorflow { @@ -40,9 +45,8 @@ ConstantOp::ConstantOp(OpKernelConstruction* ctx) : OpKernel(ctx), tensor_(ctx->output_type(0)) { const TensorProto* proto = nullptr; OP_REQUIRES_OK(ctx, ctx->GetAttr("value", &proto)); - OP_REQUIRES_OK(ctx, - ctx->device()->MakeTensorFromProto( - *proto, AllocatorAttributes(), &tensor_)); + OP_REQUIRES_OK(ctx, ctx->device()->MakeTensorFromProto( + *proto, AllocatorAttributes(), &tensor_)); OP_REQUIRES( ctx, ctx->output_type(0) == tensor_.dtype(), errors::InvalidArgument("Type mismatch between value (", @@ -85,9 +89,9 @@ REGISTER_KERNEL(GPU, bool); #endif #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(D, TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Const").Device(DEVICE_##D).TypeConstraint("dtype"), \ +#define REGISTER_SYCL_KERNEL(D, TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("Const").Device(DEVICE_##D).TypeConstraint("dtype"), \ ConstantOp); REGISTER_SYCL_KERNEL(SYCL, float); REGISTER_SYCL_KERNEL(SYCL, double); @@ -194,18 +198,18 @@ struct FillFunctor { void operator()(const SYCLDevice& d, typename TTypes::Flat out, typename TTypes::ConstScalar in) { #if !defined(EIGEN_HAS_INDEX_LIST) - Eigen::array rank1{1}; + Eigen::array rank1{1}; #else - Eigen::IndexList> rank1; + Eigen::IndexList > rank1; #endif - const int size = out.dimension(0); - Eigen::array broadcast_dims{size}; + const int size = out.dimension(0); + Eigen::array broadcast_dims{size}; - To32Bit(out).device(d) = in.reshape(rank1).broadcast(broadcast_dims); + To32Bit(out).device(d) = in.reshape(rank1).broadcast(broadcast_dims); } }; -} -#endif // TENSORFLOW_USE_SYCL +} // namespace functor +#endif // TENSORFLOW_USE_SYCL #define REGISTER_KERNEL(D, TYPE) \ REGISTER_KERNEL_BUILDER(Name("Fill") \ @@ -273,11 +277,23 @@ class ZerosLikeOp : 
public OpKernel { void Compute(OpKernelContext* ctx) override { const Tensor& input = ctx->input(0); - Tensor* out = nullptr; - OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output( - {0}, 0, input.shape(), &out)); - functor::SetZeroFunctor f; - f(ctx->eigen_device(), out->flat()); + const Device& d = ctx->eigen_device(); + if (std::is_same::value) { + OP_REQUIRES(ctx, input.dims() == 0, + errors::InvalidArgument( + "ZerosLike of non-unary Variant not supported.")); + const Variant& v = input.scalar()(); + Tensor out(cpu_allocator(), DT_VARIANT, TensorShape({})); + Variant* out_v = &(out.scalar()()); + OP_REQUIRES_OK(ctx, CreateZerosLikeVariant(ctx, v, out_v)); + ctx->set_output(0, out); + } else { + Tensor* out = nullptr; + OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output( + {0}, 0, input.shape(), &out)); + functor::SetZeroFunctor f; + f(d, out->flat()); + } } }; @@ -288,6 +304,7 @@ class ZerosLikeOp : public OpKernel { #define REGISTER_CPU(type) REGISTER_KERNEL(type, CPU) TF_CALL_POD_STRING_TYPES(REGISTER_CPU); +REGISTER_CPU(Variant); #undef REGISTER_CPU #ifdef TENSORFLOW_USE_SYCL @@ -315,6 +332,14 @@ REGISTER_KERNEL_BUILDER(Name("ZerosLike") .TypeConstraint("T") .HostMemory("y"), ZerosLikeOp); +// TODO(ebrevdo): Once rendezvous has been properly set up for +// Variants, we'll no longer need a HostMemory attribute for this case. +REGISTER_KERNEL_BUILDER(Name("ZerosLike") + .Device(DEVICE_GPU) + .TypeConstraint("T") + .HostMemory("x") + .HostMemory("y"), + ZerosLikeOp); #endif // GOOGLE_CUDA #undef REGISTER_KERNEL diff --git a/tensorflow/core/kernels/conv_2d.h b/tensorflow/core/kernels/conv_2d.h index 4bb0b7f3b41..8de8f1b2650 100644 --- a/tensorflow/core/kernels/conv_2d.h +++ b/tensorflow/core/kernels/conv_2d.h @@ -225,13 +225,13 @@ struct PadInput { const std::array& padding_right, typename TTypes::Tensor out, TensorFormat format) { - Eigen::array, NDIMS> padding; - padding[GetTensorDimIndex(format, 'N')] = std::make_pair(0, 0); + Eigen::array, NDIMS> padding; + padding[GetTensorDimIndex(format, 'N')] = {0, 0}; for (int i = 0; i < NDIMS - 2; ++i) { - padding[GetTensorDimIndex(format, '0' + i)] = - std::make_pair(padding_left[i], padding_right[i]); + padding[GetTensorDimIndex(format, '0' + i)] = { + padding_left[i], padding_right[i]}; } - padding[GetTensorDimIndex(format, 'C')] = std::make_pair(0, 0); + padding[GetTensorDimIndex(format, 'C')] = {0, 0}; out.device(d) = in.pad(padding); } }; diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc index 65514937f4e..8eb705b2e5f 100644 --- a/tensorflow/core/kernels/conv_grad_filter_ops.cc +++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc @@ -91,6 +91,20 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; +template +struct LaunchConv2DBackpropInputOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& out_backprop, const Tensor& input, + int row_stride, int col_stride, const Padding& padding, + Tensor* filter_backprop, TensorFormat data_format) { + const CPUDevice& d = ctx->eigen_device(); + functor::SpatialConvolutionBackwardInput()( + d, filter_backprop->tensor(), input.tensor(), + out_backprop.tensor(), filter_backprop->dim_size(0), + filter_backprop->dim_size(1), row_stride, col_stride); + } +}; + #ifdef TENSORFLOW_USE_LIBXSMM template struct LaunchXsmmBackwardFilter { @@ -237,11 +251,9 @@ class Conv2DFastBackpropFilterOp : public OpKernel { } #endif - 
functor::SpatialConvolutionBackwardKernel()( - context->eigen_device(), filter_backprop->tensor(), - input.tensor(), out_backprop.tensor(), - dims.spatial_dims[0].filter_size, dims.spatial_dims[1].filter_size, - dims.spatial_dims[0].stride, dims.spatial_dims[1].stride); + LaunchConv2DBackpropInputOp()( + context, false, false, out_backprop, input, dims.spatial_dims[0].stride, + dims.spatial_dims[1].stride, padding_, filter_backprop, data_format_); } private: @@ -495,15 +507,10 @@ class Conv2DSlowBackpropFilterOp : public OpKernel { OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_)); use_cudnn_ &= CanUseCudnn(); cudnn_use_autotune_ = CudnnUseAutotune(); - cudnn_disable_conv_1x1_optimization_ = CudnnDisableConv1x1Optimization(); OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); } void Compute(OpKernelContext* context) override { - using perftools::gputools::dnn::AlgorithmConfig; - using perftools::gputools::dnn::AlgorithmType; - using perftools::gputools::dnn::ProfileResult; - using perftools::gputools::dnn::kDefaultAlgorithm; const Tensor& input = context->input(0); const Tensor& filter_sizes = context->input(1); const Tensor& out_backprop = context->input(2); @@ -512,340 +519,22 @@ class Conv2DSlowBackpropFilterOp : public OpKernel { errors::InvalidArgument( "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ", filter_sizes.dims())); - const TensorShape& input_shape = input.shape(); TensorShape filter_shape; OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape( filter_sizes.vec(), &filter_shape)); - ConvBackpropDimensions dims; - OP_REQUIRES_OK(context, - ConvBackpropComputeDimensions( - "Conv2DSlowBackpropFilter", /*num_spatial_dims=*/2, - input.shape(), filter_shape, out_backprop.shape(), - strides_, padding_, data_format_, &dims)); - Tensor* filter_backprop = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, filter_shape, &filter_backprop)); - const int padding_rows = - (padding_ == VALID) - ? 0 - : std::max(0, (dims.spatial_dims[0].output_size - 1) * - dims.spatial_dims[0].stride + - dims.spatial_dims[0].filter_size - - dims.spatial_dims[0].input_size); - const int padding_cols = - (padding_ == VALID) - ? 0 - : std::max(0, (dims.spatial_dims[1].output_size - 1) * - dims.spatial_dims[1].stride + - dims.spatial_dims[1].filter_size - - dims.spatial_dims[1].input_size); + // For now we take the stride from the second and third dimensions only (we + // do not support striding on the batch or depth dimension). + const int stride_rows = GetTensorDim(strides_, data_format_, 'H'); + const int stride_cols = GetTensorDim(strides_, data_format_, 'W'); - // TODO(zhengxq): cuDNN only supports equal padding on both sides, so only - // calling it when that is true. Remove this check when (if?) cuDNN starts - // supporting different padding. 
- bool rows_odd = (padding_rows % 2 != 0); - bool cols_odd = (padding_cols % 2 != 0); - - auto* stream = context->op_device_context()->stream(); - OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); - - if (!use_cudnn_) { - context->SetStatus(errors::Unimplemented( - "Conv2DBackprop for GPU is not currently supported " - "without cudnn")); - return; - } - - if (!cudnn_disable_conv_1x1_optimization_ && - dims.spatial_dims[0].filter_size == 1 && - dims.spatial_dims[1].filter_size == 1 && - dims.spatial_dims[0].stride == 1 && dims.spatial_dims[1].stride == 1 && - data_format_ == FORMAT_NHWC) { - const uint64 m = dims.in_depth; - const uint64 k = dims.batch_size * dims.spatial_dims[0].input_size * - dims.spatial_dims[1].input_size; - const uint64 n = dims.out_depth; - - // The shape of output backprop is - // [batch, out_rows, out_cols, out_depth] - // From cublas's perspective, it is: n x k - auto a_ptr = AsDeviceMemory(out_backprop.template flat().data(), - out_backprop.template flat().size()); - - // The shape of input is - // [batch, in_rows, in_cols, in_depth], - // From cublas's perspective, it is: m x k - auto b_ptr = AsDeviceMemory(input.template flat().data(), - input.template flat().size()); - - // the shape of the filter backprop from the conv_2d should be - // [1, 1, in_depth, out_depth] - // From cublas's perspective, it is: n x m - auto c_ptr = AsDeviceMemory(filter_backprop->template flat().data(), - filter_backprop->template flat().size()); - - bool blas_launch_status = - stream - ->ThenBlasGemm(perftools::gputools::blas::Transpose::kNoTranspose, - perftools::gputools::blas::Transpose::kTranspose, - n, m, k, 1.0f, a_ptr, n, b_ptr, m, 0.0f, &c_ptr, n) - .ok(); - if (!blas_launch_status) { - context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m, - ", n=", n, ", k=", k)); - } - return; - } else if (dims.spatial_dims[0].filter_size == - dims.spatial_dims[0].input_size && - dims.spatial_dims[1].filter_size == - dims.spatial_dims[1].input_size && - padding_ == VALID && data_format_ == FORMAT_NHWC) { - // The input data and filter have the same height/width, so call cublas - // directly. - const uint64 m = dims.spatial_dims[0].input_size * - dims.spatial_dims[1].input_size * dims.in_depth; - const uint64 k = dims.batch_size; - const uint64 n = dims.out_depth; - - auto a_ptr = AsDeviceMemory(input.template flat().data(), - input.template flat().size()); - auto b_ptr = AsDeviceMemory(out_backprop.template flat().data(), - out_backprop.template flat().size()); - auto c_ptr = AsDeviceMemory(filter_backprop->template flat().data(), - filter_backprop->template flat().size()); - - bool blas_launch_status = - stream - ->ThenBlasGemm(perftools::gputools::blas::Transpose::kNoTranspose, - perftools::gputools::blas::Transpose::kTranspose, - n, m, k, 1.0f, b_ptr, n, a_ptr, m, 0.0f, &c_ptr, n) - .ok(); - if (!blas_launch_status) { - context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m, - ", n=", n, ", k=", k)); - } - return; - } - - Tensor compatible_input; - if (rows_odd || cols_odd) { - // If a padding dimension is odd, we have one more element on the right - // side or the bottom side. This is unsupported in cudnn. Therefore, - // we pad that extra element and make it compatible. 
- OP_REQUIRES_OK( - context, - context->allocate_temp( - DataTypeToEnum::value, - ShapeFromFormat(data_format_, dims.batch_size, - dims.spatial_dims[0].input_size + rows_odd, - dims.spatial_dims[1].input_size + cols_odd, - dims.in_depth), - &compatible_input)); - - functor::PadInput()( - context->template eigen_device(), - To32Bit(input.tensor()), {{0, 0}}, {{rows_odd, cols_odd}}, - To32Bit(compatible_input.tensor()), data_format_); - } else { - compatible_input = input; - } - - CHECK(padding_rows >= 0 && padding_cols >= 0) - << "Negative row or col paddings: (" << padding_rows << ", " - << padding_cols << ")"; - perftools::gputools::dnn::BatchDescriptor input_desc; - input_desc.set_count(dims.batch_size) - .set_height(GetTensorDim(compatible_input, data_format_, 'H')) - .set_width(GetTensorDim(compatible_input, data_format_, 'W')) - .set_feature_map_count(dims.in_depth) - .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); - perftools::gputools::dnn::BatchDescriptor output_desc; - output_desc.set_count(dims.batch_size) - .set_height(dims.spatial_dims[0].output_size) - .set_width(dims.spatial_dims[1].output_size) - .set_feature_map_count(dims.out_depth) - .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); - perftools::gputools::dnn::FilterDescriptor filter_desc; - filter_desc.set_input_filter_height(dims.spatial_dims[0].filter_size) - .set_input_filter_width(dims.spatial_dims[1].filter_size) - .set_input_feature_map_count(dims.in_depth) - .set_output_feature_map_count(dims.out_depth); - perftools::gputools::dnn::ConvolutionDescriptor conv_desc; - conv_desc.set_vertical_filter_stride(dims.spatial_dims[0].stride) - .set_horizontal_filter_stride(dims.spatial_dims[1].stride) - .set_zero_padding_height(padding_rows / 2) - .set_zero_padding_width(padding_cols / 2); - - // NOTE(zhengxq): - // cuDNN only supports the following layouts : - // Input : B x D x R x C - // Filter : OD x ID x R x C - // Whereas, we have - // Input : B x R x C x D - // Filter : R x C x ID x OD - // TransformFilter performs (R x C x ID x OD) => (OD x ID x R x C) - // The first TransformDepth performs - // (B x R x C x D) => (B x D x R x C). - // Since the tensor returned from cuDNN is B x D x R x C also, - // the second TransformDepth performs - // (B x D x R x C) => (B x R x C x D). - - Tensor pre_transformed_filter_backprop; - OP_REQUIRES_OK(context, context->allocate_temp( - DataTypeToEnum::value, - TensorShape({dims.out_depth, dims.in_depth, - dims.spatial_dims[0].filter_size, - dims.spatial_dims[1].filter_size}), - &pre_transformed_filter_backprop)); - - Tensor transformed_out_backprop; - if (data_format_ == FORMAT_NHWC) { - TensorShape nchw_shape = ShapeFromFormat( - FORMAT_NCHW, dims.batch_size, dims.spatial_dims[0].output_size, - dims.spatial_dims[1].output_size, dims.out_depth); - if (dims.out_depth > 1) { - OP_REQUIRES_OK(context, context->allocate_temp( - DataTypeToEnum::value, nchw_shape, - &transformed_out_backprop)); - functor::NHWCToNCHW()( - context->eigen_device(), out_backprop.tensor(), - transformed_out_backprop.tensor()); - } else { - // If depth <= 1, just reshape. 
- CHECK(transformed_out_backprop.CopyFrom(out_backprop, nchw_shape)); - } - } else { - transformed_out_backprop = out_backprop; - } - - Tensor transformed_input; - if (data_format_ == FORMAT_NHWC) { - TensorShape nchw_shape = ShapeFromFormat( - FORMAT_NCHW, GetTensorDim(compatible_input, data_format_, 'N'), - GetTensorDim(compatible_input, data_format_, 'H'), - GetTensorDim(compatible_input, data_format_, 'W'), - GetTensorDim(compatible_input, data_format_, 'C')); - if (nchw_shape.dim_size(1) > 1) { - OP_REQUIRES_OK(context, - context->allocate_temp(DataTypeToEnum::value, - nchw_shape, &transformed_input)); - functor::NHWCToNCHW()( - context->eigen_device(), - const_cast(compatible_input).tensor(), - transformed_input.tensor()); - } else { - // If depth <= 1, just reshape. - CHECK(transformed_input.CopyFrom(compatible_input, nchw_shape)); - } - } else { - transformed_input = compatible_input; - } - - auto out_backprop_ptr = - AsDeviceMemory(transformed_out_backprop.template flat().data(), - transformed_out_backprop.template flat().size()); - auto filter_backprop_ptr = AsDeviceMemory( - pre_transformed_filter_backprop.template flat().data(), - pre_transformed_filter_backprop.template flat().size()); - auto input_ptr = - AsDeviceMemory(transformed_input.template flat().data(), - transformed_input.template flat().size()); - - static int64 ConvolveBackwardFilterScratchSize = GetCudnnWorkspaceLimit( - "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB by default - ); - int device_id = stream->parent()->device_ordinal(); - DataType dtype = input.dtype(); - ConvParameters conv_parameters = { - dims.batch_size, // batch - dims.in_depth, // in_depths - {{input_desc.height(), // in_rows - input_desc.width()}}, // in_cols - dims.out_depth, // out_depths - {{dims.spatial_dims[0].filter_size, // filter_rows - dims.spatial_dims[1].filter_size}}, // filter_cols - {{dims.spatial_dims[0].stride, // stride_rows - dims.spatial_dims[1].stride}}, // stride_cols - {{padding_rows, // padding_rows - padding_cols}}, // padding_cols - dtype, // tensor datatype - device_id, // device_id - }; - AlgorithmConfig algorithm_config; - if (cudnn_use_autotune_ && !AutoTuneConvBwdFilter::GetInstance()->Find( - conv_parameters, &algorithm_config)) { - std::vector algorithms; - CHECK(stream->parent()->GetConvolveBackwardFilterAlgorithms( - conv_parameters.ShouldIncludeWinogradNonfusedAlgo(), &algorithms)); - ProfileResult best_result; - ProfileResult best_result_no_scratch; - for (auto profile_algorithm : algorithms) { - // TODO(zhengxq): profile each algorithm multiple times to better - // accuracy. 
- CudnnScratchAllocator scratch_allocator( - ConvolveBackwardFilterScratchSize, context); - ProfileResult profile_result; - bool cudnn_launch_status = - stream - ->ThenConvolveBackwardFilterWithAlgorithm( - input_desc, input_ptr, output_desc, out_backprop_ptr, - conv_desc, filter_desc, &filter_backprop_ptr, - &scratch_allocator, AlgorithmConfig(profile_algorithm), - &profile_result) - .ok(); - if (cudnn_launch_status) { - if (profile_result.is_valid()) { - if (profile_result.elapsed_time_in_ms() < - best_result.elapsed_time_in_ms()) { - best_result = profile_result; - } - if (scratch_allocator.TotalByteSize() == 0 && - profile_result.elapsed_time_in_ms() < - best_result_no_scratch.elapsed_time_in_ms()) { - best_result_no_scratch = profile_result; - } - } - } - } - OP_REQUIRES(context, - best_result.is_valid() || best_result_no_scratch.is_valid(), - errors::NotFound("No algorithm worked!")); - if (best_result.is_valid()) { - algorithm_config.set_algorithm(best_result.algorithm()); - } - if (best_result_no_scratch.is_valid()) { - algorithm_config.set_algorithm_no_scratch( - best_result_no_scratch.algorithm()); - } - AutoTuneConvBwdFilter::GetInstance()->Insert(conv_parameters, - algorithm_config); - } - CudnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize, - context); - bool cudnn_launch_status = - stream - ->ThenConvolveBackwardFilterWithAlgorithm( - input_desc, input_ptr, output_desc, out_backprop_ptr, conv_desc, - filter_desc, &filter_backprop_ptr, &scratch_allocator, - algorithm_config, nullptr) - .ok(); - - if (!cudnn_launch_status) { - context->SetStatus(errors::Internal( - "cuDNN Backward Filter function launch failure : input shape(", - input_shape.DebugString(), ") filter shape(", - filter_shape.DebugString(), ")")); - return; - } - - auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; }; - functor::ReverseTransformFilter()( - context->eigen_device(), - toConstTensor(pre_transformed_filter_backprop).template tensor(), - filter_backprop->tensor()); + launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop, input, + stride_rows, stride_cols, padding_, filter_backprop, + data_format_); } private: @@ -853,12 +542,351 @@ class Conv2DSlowBackpropFilterOp : public OpKernel { Padding padding_; bool use_cudnn_; TensorFormat data_format_; + LaunchConv2DBackpropFilterOp launcher_; bool cudnn_use_autotune_; - bool cudnn_disable_conv_1x1_optimization_; TF_DISALLOW_COPY_AND_ASSIGN(Conv2DSlowBackpropFilterOp); }; +template +void LaunchConv2DBackpropFilterOp::operator()( + OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& out_backprop, const Tensor& input, int row_stride, + int col_stride, const Padding& padding, Tensor* filter_backprop, + TensorFormat data_format) { + using perftools::gputools::dnn::AlgorithmConfig; + using perftools::gputools::dnn::AlgorithmType; + using perftools::gputools::dnn::ProfileResult; + + std::vector strides(4, 1); + strides[GetTensorDimIndex(data_format, 'H')] = row_stride; + strides[GetTensorDimIndex(data_format, 'W')] = col_stride; + TensorShape filter_shape = filter_backprop->shape(); + + ConvBackpropDimensions dims; + OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensions( + "Conv2DSlowBackpropFilter", /*num_spatial_dims=*/2, + input.shape(), filter_shape, out_backprop.shape(), + strides, padding, data_format, &dims)); + + const int padding_rows = + (padding == VALID) + ? 
0 + : std::max(0, (dims.spatial_dims[0].output_size - 1) * + dims.spatial_dims[0].stride + + dims.spatial_dims[0].filter_size - + dims.spatial_dims[0].input_size); + const int padding_cols = + (padding == VALID) + ? 0 + : std::max(0, (dims.spatial_dims[1].output_size - 1) * + dims.spatial_dims[1].stride + + dims.spatial_dims[1].filter_size - + dims.spatial_dims[1].input_size); + + // TODO(zhengxq): cuDNN only supports equal padding on both sides, so only + // calling it when that is true. Remove this check when (if?) cuDNN starts + // supporting different padding. + bool rows_odd = (padding_rows % 2 != 0); + bool cols_odd = (padding_cols % 2 != 0); + + auto* stream = ctx->op_device_context()->stream(); + OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available.")); + + if (!use_cudnn) { + ctx->SetStatus(errors::Unimplemented( + "Conv2DBackprop for GPU is not currently supported " + "without cudnn")); + return; + } + + bool cudnn_disable_conv_1x1_optimization_ = CudnnDisableConv1x1Optimization(); + if (!cudnn_disable_conv_1x1_optimization_ && + dims.spatial_dims[0].filter_size == 1 && + dims.spatial_dims[1].filter_size == 1 && + dims.spatial_dims[0].stride == 1 && dims.spatial_dims[1].stride == 1 && + data_format == FORMAT_NHWC) { + const uint64 m = dims.in_depth; + const uint64 k = dims.batch_size * dims.spatial_dims[0].input_size * + dims.spatial_dims[1].input_size; + const uint64 n = dims.out_depth; + + // The shape of output backprop is + // [batch, out_rows, out_cols, out_depth] + // From cublas's perspective, it is: n x k + auto a_ptr = AsDeviceMemory(out_backprop.template flat().data(), + out_backprop.template flat().size()); + + // The shape of input is + // [batch, in_rows, in_cols, in_depth], + // From cublas's perspective, it is: m x k + auto b_ptr = AsDeviceMemory(input.template flat().data(), + input.template flat().size()); + + // the shape of the filter backprop from the conv_2d should be + // [1, 1, in_depth, out_depth] + // From cublas's perspective, it is: n x m + auto c_ptr = AsDeviceMemory(filter_backprop->template flat().data(), + filter_backprop->template flat().size()); + + bool blas_launch_status = + stream + ->ThenBlasGemm(perftools::gputools::blas::Transpose::kNoTranspose, + perftools::gputools::blas::Transpose::kTranspose, n, + m, k, 1.0f, a_ptr, n, b_ptr, m, 0.0f, &c_ptr, n) + .ok(); + if (!blas_launch_status) { + ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m, + ", n=", n, ", k=", k)); + } + return; + } else if (dims.spatial_dims[0].filter_size == + dims.spatial_dims[0].input_size && + dims.spatial_dims[1].filter_size == + dims.spatial_dims[1].input_size && + padding == VALID && data_format == FORMAT_NHWC) { + // The input data and filter have the same height/width, so call cublas + // directly. 
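// [Sketch, not part of the patch] How the SAME padding computed above is
// derived for one spatial dimension, and why the odd remainder matters:
// cuDNN accepts only one symmetric padding value per dimension, so an odd
// total is handled by first padding the input with an extra row/col (the
// rows_odd/cols_odd flags above). The helper name below is hypothetical.
#include <algorithm>
#include <cstdint>

int64_t SamePaddingTotal(int64_t input, int64_t filter, int64_t stride) {
  const int64_t output = (input + stride - 1) / stride;  // SAME output size
  return std::max<int64_t>(0, (output - 1) * stride + filter - input);
}
// Example: input=7, filter=3, stride=2 gives output=4 and a total padding of
// (4-1)*2 + 3 - 7 = 2, i.e. one zero row on each side. A total of 3 would set
// rows_odd, pad the input by one extra row, and pass 3/2 = 1 to cuDNN.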
+ const uint64 m = dims.spatial_dims[0].input_size * + dims.spatial_dims[1].input_size * dims.in_depth; + const uint64 k = dims.batch_size; + const uint64 n = dims.out_depth; + + auto a_ptr = AsDeviceMemory(input.template flat().data(), + input.template flat().size()); + auto b_ptr = AsDeviceMemory(out_backprop.template flat().data(), + out_backprop.template flat().size()); + auto c_ptr = AsDeviceMemory(filter_backprop->template flat().data(), + filter_backprop->template flat().size()); + + bool blas_launch_status = + stream + ->ThenBlasGemm(perftools::gputools::blas::Transpose::kNoTranspose, + perftools::gputools::blas::Transpose::kTranspose, n, + m, k, 1.0f, b_ptr, n, a_ptr, m, 0.0f, &c_ptr, n) + .ok(); + if (!blas_launch_status) { + ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m, + ", n=", n, ", k=", k)); + } + return; + } + + Tensor compatible_input; + if (rows_odd || cols_odd) { + // If a padding dimension is odd, we have one more element on the right + // side or the bottom side. This is unsupported in cudnn. Therefore, + // we pad that extra element and make it compatible. + OP_REQUIRES_OK( + ctx, ctx->allocate_temp( + DataTypeToEnum::value, + ShapeFromFormat(data_format, dims.batch_size, + dims.spatial_dims[0].input_size + rows_odd, + dims.spatial_dims[1].input_size + cols_odd, + dims.in_depth), + &compatible_input)); + + functor::PadInput()( + ctx->template eigen_device(), To32Bit(input.tensor()), + {{0, 0}}, {{rows_odd, cols_odd}}, + To32Bit(compatible_input.tensor()), data_format); + } else { + compatible_input = input; + } + + CHECK(padding_rows >= 0 && padding_cols >= 0) + << "Negative row or col paddings: (" << padding_rows << ", " + << padding_cols << ")"; + perftools::gputools::dnn::BatchDescriptor input_desc; + input_desc.set_count(dims.batch_size) + .set_height(GetTensorDim(compatible_input, data_format, 'H')) + .set_width(GetTensorDim(compatible_input, data_format, 'W')) + .set_feature_map_count(dims.in_depth) + .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); + perftools::gputools::dnn::BatchDescriptor output_desc; + output_desc.set_count(dims.batch_size) + .set_height(dims.spatial_dims[0].output_size) + .set_width(dims.spatial_dims[1].output_size) + .set_feature_map_count(dims.out_depth) + .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); + perftools::gputools::dnn::FilterDescriptor filter_desc; + filter_desc.set_input_filter_height(dims.spatial_dims[0].filter_size) + .set_input_filter_width(dims.spatial_dims[1].filter_size) + .set_input_feature_map_count(dims.in_depth) + .set_output_feature_map_count(dims.out_depth); + perftools::gputools::dnn::ConvolutionDescriptor conv_desc; + conv_desc.set_vertical_filter_stride(dims.spatial_dims[0].stride) + .set_horizontal_filter_stride(dims.spatial_dims[1].stride) + .set_zero_padding_height(padding_rows / 2) + .set_zero_padding_width(padding_cols / 2); + + // NOTE(zhengxq): + // cuDNN only supports the following layouts : + // Input : B x D x R x C + // Filter : OD x ID x R x C + // Whereas, we have + // Input : B x R x C x D + // Filter : R x C x ID x OD + // TransformFilter performs (R x C x ID x OD) => (OD x ID x R x C) + // The first TransformDepth performs + // (B x R x C x D) => (B x D x R x C). + // Since the tensor returned from cuDNN is B x D x R x C also, + // the second TransformDepth performs + // (B x D x R x C) => (B x R x C x D). 
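// [Sketch, not part of the patch] What the NHWC-to-NCHW transform used below
// accomplishes, written as a naive CPU copy so the index mapping
// (b, h, w, c) -> (b, c, h, w) is explicit. The real functor runs on the GPU
// device; this loop and its name are only illustrative.
#include <cstdint>

void NhwcToNchw(const float* src, float* dst, int64_t n, int64_t h, int64_t w,
                int64_t c) {
  for (int64_t b = 0; b < n; ++b)
    for (int64_t y = 0; y < h; ++y)
      for (int64_t x = 0; x < w; ++x)
        for (int64_t d = 0; d < c; ++d)
          dst[((b * c + d) * h + y) * w + x] =
              src[((b * h + y) * w + x) * c + d];
}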
+ + Tensor pre_transformed_filter_backprop; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp(DataTypeToEnum::value, + TensorShape({dims.out_depth, dims.in_depth, + dims.spatial_dims[0].filter_size, + dims.spatial_dims[1].filter_size}), + &pre_transformed_filter_backprop)); + + Tensor transformed_out_backprop; + if (data_format == FORMAT_NHWC) { + TensorShape nchw_shape = ShapeFromFormat( + FORMAT_NCHW, dims.batch_size, dims.spatial_dims[0].output_size, + dims.spatial_dims[1].output_size, dims.out_depth); + if (dims.out_depth > 1) { + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum::value, nchw_shape, + &transformed_out_backprop)); + functor::NHWCToNCHW()( + ctx->eigen_device(), out_backprop.tensor(), + transformed_out_backprop.tensor()); + } else { + // If depth <= 1, just reshape. + CHECK(transformed_out_backprop.CopyFrom(out_backprop, nchw_shape)); + } + } else { + transformed_out_backprop = out_backprop; + } + + Tensor transformed_input; + if (data_format == FORMAT_NHWC) { + TensorShape nchw_shape = ShapeFromFormat( + FORMAT_NCHW, GetTensorDim(compatible_input, data_format, 'N'), + GetTensorDim(compatible_input, data_format, 'H'), + GetTensorDim(compatible_input, data_format, 'W'), + GetTensorDim(compatible_input, data_format, 'C')); + if (nchw_shape.dim_size(1) > 1) { + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::value, + nchw_shape, &transformed_input)); + functor::NHWCToNCHW()( + ctx->eigen_device(), + const_cast(compatible_input).tensor(), + transformed_input.tensor()); + } else { + // If depth <= 1, just reshape. + CHECK(transformed_input.CopyFrom(compatible_input, nchw_shape)); + } + } else { + transformed_input = compatible_input; + } + + auto out_backprop_ptr = + AsDeviceMemory(transformed_out_backprop.template flat().data(), + transformed_out_backprop.template flat().size()); + auto filter_backprop_ptr = + AsDeviceMemory(pre_transformed_filter_backprop.template flat().data(), + pre_transformed_filter_backprop.template flat().size()); + auto input_ptr = AsDeviceMemory(transformed_input.template flat().data(), + transformed_input.template flat().size()); + + static int64 ConvolveBackwardFilterScratchSize = GetCudnnWorkspaceLimit( + "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB by default + ); + int device_id = stream->parent()->device_ordinal(); + DataType dtype = input.dtype(); + ConvParameters conv_parameters = { + dims.batch_size, // batch + dims.in_depth, // in_depths + {{input_desc.height(), // in_rows + input_desc.width()}}, // in_cols + dims.out_depth, // out_depths + {{dims.spatial_dims[0].filter_size, // filter_rows + dims.spatial_dims[1].filter_size}}, // filter_cols + {{dims.spatial_dims[0].stride, // stride_rows + dims.spatial_dims[1].stride}}, // stride_cols + {{padding_rows, // padding_rows + padding_cols}}, // padding_cols + dtype, // tensor datatype + device_id, // device_id + }; + AlgorithmConfig algorithm_config; + if (cudnn_use_autotune && !AutoTuneConvBwdFilter::GetInstance()->Find( + conv_parameters, &algorithm_config)) { + std::vector algorithms; + CHECK(stream->parent()->GetConvolveBackwardFilterAlgorithms( + conv_parameters.ShouldIncludeWinogradNonfusedAlgo(), &algorithms)); + ProfileResult best_result; + ProfileResult best_result_no_scratch; + for (auto profile_algorithm : algorithms) { + // TODO(zhengxq): profile each algorithm multiple times to better + // accuracy. 
+ CudnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize, + ctx); + ProfileResult profile_result; + bool cudnn_launch_status = + stream + ->ThenConvolveBackwardFilterWithAlgorithm( + input_desc, input_ptr, output_desc, out_backprop_ptr, + conv_desc, filter_desc, &filter_backprop_ptr, + &scratch_allocator, AlgorithmConfig(profile_algorithm), + &profile_result) + .ok(); + if (cudnn_launch_status) { + if (profile_result.is_valid()) { + if (profile_result.elapsed_time_in_ms() < + best_result.elapsed_time_in_ms()) { + best_result = profile_result; + } + if (scratch_allocator.TotalByteSize() == 0 && + profile_result.elapsed_time_in_ms() < + best_result_no_scratch.elapsed_time_in_ms()) { + best_result_no_scratch = profile_result; + } + } + } + } + OP_REQUIRES(ctx, + best_result.is_valid() || best_result_no_scratch.is_valid(), + errors::NotFound("No algorithm worked!")); + if (best_result.is_valid()) { + algorithm_config.set_algorithm(best_result.algorithm()); + } + if (best_result_no_scratch.is_valid()) { + algorithm_config.set_algorithm_no_scratch( + best_result_no_scratch.algorithm()); + } + AutoTuneConvBwdFilter::GetInstance()->Insert(conv_parameters, + algorithm_config); + } + CudnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize, + ctx); + bool cudnn_launch_status = + stream + ->ThenConvolveBackwardFilterWithAlgorithm( + input_desc, input_ptr, output_desc, out_backprop_ptr, conv_desc, + filter_desc, &filter_backprop_ptr, &scratch_allocator, + algorithm_config, nullptr) + .ok(); + + if (!cudnn_launch_status) { + ctx->SetStatus(errors::Internal( + "cuDNN Backward Filter function launch failure : input shape(", + input.shape().DebugString(), ") filter shape(", + filter_shape.DebugString(), ")")); + return; + } + + auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; }; + functor::ReverseTransformFilter()( + ctx->eigen_device(), + toConstTensor(pre_transformed_filter_backprop).template tensor(), + filter_backprop->tensor()); +} + // Forward declarations of the functor specializations for GPU. namespace functor { #define DECLARE_GPU_SPEC(T) \ diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc index a5a9549a2f9..ce561aa99c2 100644 --- a/tensorflow/core/kernels/conv_grad_input_ops.cc +++ b/tensorflow/core/kernels/conv_grad_input_ops.cc @@ -97,29 +97,17 @@ typedef Eigen::GpuDevice GPUDevice; // for CPU for now since nvcc times out when trying to compile them. // TODO(yangke): enable them for GPUs when we have a faster compiler. 
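Both the filter-backprop launcher that ends above and the input-backprop launcher later in this diff follow the same autotuning scheme: on the first call with a given set of convolution parameters every available cuDNN algorithm is profiled once, the fastest result (with and without scratch space) is remembered, and later calls reuse the cached choice. A minimal sketch of that pattern, with hypothetical names and a plain map standing in for the AutoTuneSingleton used here:

#include <map>
#include <string>
#include <vector>

// Assumed helpers: list the candidate algorithms and time one run of each.
std::vector<int> AvailableAlgorithms();
double ProfileOnceMs(int algorithm);

int PickAlgorithm(const std::string& conv_key) {
  static std::map<std::string, int> cache;  // AutoTuneConvBwdFilter analogue
  auto it = cache.find(conv_key);
  if (it != cache.end()) return it->second;
  int best = -1;
  double best_ms = 1e30;
  for (int algo : AvailableAlgorithms()) {
    const double ms = ProfileOnceMs(algo);  // one profiling run per algorithm
    if (ms < best_ms) {
      best_ms = ms;
      best = algo;
    }
  }
  cache[conv_key] = best;
  return best;
}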
-template -struct LaunchBackwardInputConvolution { - bool operator()(OpKernelContext* context, const Device&, - typename TTypes::Tensor, - typename TTypes::ConstTensor, - typename TTypes::ConstTensor, int, int, int, int, - TensorFormat) const { - return false; - } -}; - -template <> -struct LaunchBackwardInputConvolution { - bool operator()(OpKernelContext* context, const CPUDevice& d, - typename TTypes::Tensor input_backward, - typename TTypes::ConstTensor kernel, - typename TTypes::ConstTensor output_backward, - int input_rows, int input_cols, int row_stride, - int col_stride, TensorFormat data_format) const { - functor::SpatialConvolutionBackwardInput()( - d, input_backward, kernel, output_backward, input_rows, input_cols, - row_stride, col_stride); - return true; +template +struct LaunchConv2DBackpropInputOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& out_backprop, const Tensor& filter, + int row_stride, int col_stride, const Padding& padding, + Tensor* in_backprop, TensorFormat data_format) { + const CPUDevice& d = ctx->eigen_device(); + functor::SpatialConvolutionBackwardInput()( + d, in_backprop->tensor(), filter.tensor(), + out_backprop.tensor(), in_backprop->dim_size(1), + in_backprop->dim_size(2), row_stride, col_stride); } }; @@ -268,11 +256,10 @@ class Conv2DFastBackpropInputOp : public OpKernel { } #endif - LaunchBackwardInputConvolution()( - context, context->eigen_device(), in_backprop->tensor(), - filter.tensor(), out_backprop.tensor(), - dims.spatial_dims[0].input_size, dims.spatial_dims[1].input_size, - dims.spatial_dims[0].stride, dims.spatial_dims[1].stride, data_format_); + LaunchConv2DBackpropInputOp()( + context, false, false, out_backprop, filter, + dims.spatial_dims[0].stride, dims.spatial_dims[1].stride, padding_, + in_backprop, data_format_); } private: @@ -600,10 +587,6 @@ class Conv2DSlowBackpropInputOp : public OpKernel { } void Compute(OpKernelContext* context) override { - using perftools::gputools::dnn::AlgorithmConfig; - using perftools::gputools::dnn::AlgorithmType; - using perftools::gputools::dnn::ProfileResult; - using perftools::gputools::dnn::kDefaultAlgorithm; const Tensor& input_sizes = context->input(0); const Tensor& filter = context->input(1); const Tensor& out_backprop = context->input(2); @@ -615,340 +598,18 @@ class Conv2DSlowBackpropInputOp : public OpKernel { TensorShape input_shape; OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape( input_sizes.vec(), &input_shape)); - const TensorShape& filter_shape = filter.shape(); - - ConvBackpropDimensions dims; - OP_REQUIRES_OK( - context, ConvBackpropComputeDimensions( - "Conv2DSlowBackpropInput", /*num_spatial_dims=*/2, - input_shape, filter_shape, out_backprop.shape(), strides_, - padding_, data_format_, &dims)); Tensor* in_backprop = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, input_shape, &in_backprop)); - const int padding_rows = - (padding_ == VALID) - ? 0 - : std::max(0, (dims.spatial_dims[0].output_size - 1) * - dims.spatial_dims[0].stride + - dims.spatial_dims[0].filter_size - - dims.spatial_dims[0].input_size); - const int padding_cols = - (padding_ == VALID) - ? 0 - : std::max(0, (dims.spatial_dims[1].output_size - 1) * - dims.spatial_dims[1].stride + - dims.spatial_dims[1].filter_size - - dims.spatial_dims[1].input_size); + // For now we take the stride from the second and third dimensions only (we + // do not support striding on the batch or depth dimension). 
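// [Sketch, not part of the patch] A scalar reference for what the CPU
// SpatialConvolutionBackwardInput call above computes: the input gradient is
// the output gradient scattered back through every filter tap. VALID padding,
// NHWC tensors and an HWIO filter are assumed; the function is hypothetical.
#include <cstdint>

void Conv2DBackpropInputReference(const float* dy, const float* filter,
                                  float* dx, int64_t n, int64_t out_h,
                                  int64_t out_w, int64_t out_d, int64_t f_h,
                                  int64_t f_w, int64_t in_d, int64_t in_h,
                                  int64_t in_w, int64_t stride_r,
                                  int64_t stride_c) {
  for (int64_t i = 0; i < n * in_h * in_w * in_d; ++i) dx[i] = 0.0f;
  for (int64_t b = 0; b < n; ++b)
    for (int64_t r = 0; r < out_h; ++r)
      for (int64_t c = 0; c < out_w; ++c)
        for (int64_t o = 0; o < out_d; ++o) {
          const float g = dy[((b * out_h + r) * out_w + c) * out_d + o];
          for (int64_t fr = 0; fr < f_h; ++fr)
            for (int64_t fc = 0; fc < f_w; ++fc)
              for (int64_t d = 0; d < in_d; ++d)
                dx[((b * in_h + (r * stride_r + fr)) * in_w +
                    (c * stride_c + fc)) * in_d + d] +=
                    g * filter[((fr * f_w + fc) * in_d + d) * out_d + o];
        }
}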
+ const int stride_rows = GetTensorDim(strides_, data_format_, 'H'); + const int stride_cols = GetTensorDim(strides_, data_format_, 'W'); - // TODO(keveman): cuDNN only supports equal padding on both sides, so only - // calling it when that is true. Remove this check when (if?) cuDNN starts - // supporting different padding. - bool rows_odd = (padding_rows % 2 != 0); - bool cols_odd = (padding_cols % 2 != 0); - - auto* stream = context->op_device_context()->stream(); - OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); - - if (!use_cudnn_) { - context->SetStatus(errors::Unimplemented( - "Conv2DBackpropInput for GPU is not currently supported " - "without cudnn")); - return; - } - - if (dims.spatial_dims[0].filter_size == 1 && - dims.spatial_dims[1].filter_size == 1 && - dims.spatial_dims[0].stride == 1 && dims.spatial_dims[1].stride == 1 && - data_format_ == FORMAT_NHWC) { - // 1x1 filter, so call cublas directly. - const uint64 m = dims.batch_size * dims.spatial_dims[0].input_size * - dims.spatial_dims[1].input_size; - const uint64 k = dims.out_depth; - const uint64 n = dims.in_depth; - - auto a_ptr = AsDeviceMemory(out_backprop.template flat().data(), - out_backprop.template flat().size()); - auto b_ptr = AsDeviceMemory(filter.template flat().data(), - filter.template flat().size()); - auto c_ptr = AsDeviceMemory(in_backprop->template flat().data(), - in_backprop->template flat().size()); - - auto transpose = perftools::gputools::blas::Transpose::kTranspose; - auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose; - - bool blas_launch_status = - stream - ->ThenBlasGemm(transpose, no_transpose, n, m, k, 1.0f, b_ptr, k, - a_ptr, k, 0.0f, &c_ptr, n) - .ok(); - if (!blas_launch_status) { - context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m, - ", n=", n, ", k=", k)); - } - return; - } else if (dims.spatial_dims[0].filter_size == - dims.spatial_dims[0].input_size && - dims.spatial_dims[1].filter_size == - dims.spatial_dims[1].input_size && - padding_ == VALID && data_format_ == FORMAT_NHWC) { - // The input data and filter have the same height/width, so call cublas - // directly. - const uint64 m = dims.batch_size; - const uint64 k = dims.out_depth; - const uint64 n = dims.spatial_dims[0].input_size * - dims.spatial_dims[1].input_size * dims.in_depth; - - auto a_ptr = AsDeviceMemory(out_backprop.template flat().data(), - out_backprop.template flat().size()); - auto b_ptr = AsDeviceMemory(filter.template flat().data(), - filter.template flat().size()); - auto c_ptr = AsDeviceMemory(in_backprop->template flat().data(), - in_backprop->template flat().size()); - - auto transpose = perftools::gputools::blas::Transpose::kTranspose; - auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose; - - bool blas_launch_status = - stream - ->ThenBlasGemm(transpose, no_transpose, n, m, k, 1.0f, b_ptr, k, - a_ptr, k, 0.0f, &c_ptr, n) - .ok(); - if (!blas_launch_status) { - context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m, - ", n=", n, ", k=", k)); - } - return; - } - - TensorShape compatible_input_shape; - if (rows_odd || cols_odd) { - // If a padding dimension is odd, we have one more element on the right - // side or the bottom side. This is unsupported in cudnn. Therefore, - // we pad that extra element and make it compatible. 
- compatible_input_shape = ShapeFromFormat( - data_format_, dims.batch_size, - dims.spatial_dims[0].input_size + rows_odd, - dims.spatial_dims[1].input_size + cols_odd, dims.in_depth); - } else { - compatible_input_shape = input_shape; - } - - CHECK(padding_rows >= 0 && padding_cols >= 0) - << "Negative row or col paddings: (" << padding_rows << ", " - << padding_cols << ")"; - perftools::gputools::dnn::BatchDescriptor input_desc; - input_desc.set_count(dims.batch_size) - .set_height(GetTensorDim(compatible_input_shape, data_format_, 'H')) - .set_width(GetTensorDim(compatible_input_shape, data_format_, 'W')) - .set_feature_map_count(dims.in_depth) - .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); - perftools::gputools::dnn::BatchDescriptor output_desc; - output_desc.set_count(dims.batch_size) - .set_height(dims.spatial_dims[0].output_size) - .set_width(dims.spatial_dims[1].output_size) - .set_feature_map_count(dims.out_depth) - .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); - perftools::gputools::dnn::FilterDescriptor filter_desc; - filter_desc.set_input_filter_height(dims.spatial_dims[0].filter_size) - .set_input_filter_width(dims.spatial_dims[1].filter_size) - .set_input_feature_map_count(dims.in_depth) - .set_output_feature_map_count(dims.out_depth); - perftools::gputools::dnn::ConvolutionDescriptor conv_desc; - conv_desc.set_vertical_filter_stride(dims.spatial_dims[0].stride) - .set_horizontal_filter_stride(dims.spatial_dims[1].stride) - .set_zero_padding_height(padding_rows / 2) - .set_zero_padding_width(padding_cols / 2); - - // NOTE(keveman): - // cuDNN only supports the following layouts : - // Input : B x D x R x C - // Filter : OD x ID x R x C - // Whereas, we have - // Input : B x R x C x D - // Filter : R x C x ID x OD - // TransformFilter performs (R x C x ID x OD) => (OD x ID x R x C) - // The first TransformDepth performs - // (B x R x C x D) => (B x D x R x C). - // Since the tensor returned from cuDNN is B x D x R x C also, - // the second TransformDepth performs - // (B x D x R x C) => (B x R x C x D). - Tensor transformed_filter; - OP_REQUIRES_OK(context, context->allocate_temp( - DataTypeToEnum::value, - TensorShape({dims.out_depth, dims.in_depth, - dims.spatial_dims[0].filter_size, - dims.spatial_dims[1].filter_size}), - &transformed_filter)); - - functor::TransformFilter()( - context->eigen_device(), To32Bit(filter.tensor()), - To32Bit(transformed_filter.tensor())); - - Tensor transformed_out_backprop; - if (data_format_ == FORMAT_NHWC) { - TensorShape nchw_shape = ShapeFromFormat( - FORMAT_NCHW, dims.batch_size, dims.spatial_dims[0].output_size, - dims.spatial_dims[1].output_size, dims.out_depth); - if (dims.out_depth > 1) { - OP_REQUIRES_OK(context, context->allocate_temp( - DataTypeToEnum::value, nchw_shape, - &transformed_out_backprop)); - functor::NHWCToNCHW()( - context->eigen_device(), out_backprop.tensor(), - transformed_out_backprop.tensor()); - } else { - // If depth <= 1, then just reshape. 
- CHECK(transformed_out_backprop.CopyFrom(out_backprop, nchw_shape)); - } - } else { - transformed_out_backprop = out_backprop; - } - - Tensor pre_transformed_in_backprop; - OP_REQUIRES_OK( - context, - context->allocate_temp( - DataTypeToEnum::value, - ShapeFromFormat( - FORMAT_NCHW, - GetTensorDim(compatible_input_shape, data_format_, 'N'), - GetTensorDim(compatible_input_shape, data_format_, 'H'), - GetTensorDim(compatible_input_shape, data_format_, 'W'), - GetTensorDim(compatible_input_shape, data_format_, 'C')), - &pre_transformed_in_backprop)); - - auto out_backprop_ptr = - AsDeviceMemory(transformed_out_backprop.template flat().data(), - transformed_out_backprop.template flat().size()); - auto filter_ptr = - AsDeviceMemory(transformed_filter.template flat().data(), - transformed_filter.template flat().size()); - auto in_backprop_ptr = - AsDeviceMemory(pre_transformed_in_backprop.template flat().data(), - pre_transformed_in_backprop.template flat().size()); - - static int64 ConvolveBackwardDataScratchSize = GetCudnnWorkspaceLimit( - "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB by default - ); - CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, - context); - int device_id = stream->parent()->device_ordinal(); - DataType dtype = out_backprop.dtype(); - ConvParameters conv_parameters = { - dims.batch_size, // batch - dims.in_depth, // in_depths - {{input_desc.height(), // in_rows - input_desc.width()}}, // in_cols - dims.out_depth, // out_depths - {{dims.spatial_dims[0].filter_size, // filter_rows - dims.spatial_dims[1].filter_size}}, // filter_cols - {{dims.spatial_dims[0].stride, // stride_rows - dims.spatial_dims[1].stride}}, // stride_cols - {{padding_rows, // padding_rows - padding_cols}}, // padding_cols - dtype, // tensor data type - device_id, // device_id - }; - AlgorithmConfig algorithm_config; - if (cudnn_use_autotune_ && !AutoTuneConvBwdData::GetInstance()->Find( - conv_parameters, &algorithm_config)) { - std::vector algorithms; - CHECK(stream->parent()->GetConvolveBackwardDataAlgorithms( - conv_parameters.ShouldIncludeWinogradNonfusedAlgo(), &algorithms)); - ProfileResult best_result; - ProfileResult best_result_no_scratch; - for (auto profile_algorithm : algorithms) { - // TODO(zhengxq): profile each algorithm multiple times to better - // accuracy. 
- CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, - context); - ProfileResult profile_result; - bool cudnn_launch_status = - stream - ->ThenConvolveBackwardDataWithAlgorithm( - filter_desc, filter_ptr, output_desc, out_backprop_ptr, - conv_desc, input_desc, &in_backprop_ptr, &scratch_allocator, - AlgorithmConfig(profile_algorithm), &profile_result) - .ok(); - if (cudnn_launch_status) { - if (profile_result.is_valid()) { - if (profile_result.elapsed_time_in_ms() < - best_result.elapsed_time_in_ms()) { - best_result = profile_result; - } - if (scratch_allocator.TotalByteSize() == 0 && - profile_result.elapsed_time_in_ms() < - best_result_no_scratch.elapsed_time_in_ms()) { - best_result_no_scratch = profile_result; - } - } - } - } - OP_REQUIRES(context, - best_result.is_valid() || best_result_no_scratch.is_valid(), - errors::NotFound("No algorithm worked!")); - if (best_result.is_valid()) { - algorithm_config.set_algorithm(best_result.algorithm()); - } - if (best_result_no_scratch.is_valid()) { - algorithm_config.set_algorithm_no_scratch( - best_result_no_scratch.algorithm()); - } - AutoTuneConvBwdData::GetInstance()->Insert(conv_parameters, - algorithm_config); - } - bool cudnn_launch_status = - stream - ->ThenConvolveBackwardDataWithAlgorithm( - filter_desc, filter_ptr, output_desc, out_backprop_ptr, - conv_desc, input_desc, &in_backprop_ptr, &scratch_allocator, - algorithm_config, nullptr) - .ok(); - - if (!cudnn_launch_status) { - context->SetStatus(errors::Internal( - "cuDNN Backward Data function launch failure : input shape(", - input_shape.DebugString(), ") filter shape(", - filter_shape.DebugString(), ")")); - return; - } - - if (rows_odd || cols_odd) { - Tensor in_backprop_remove_padding; - OP_REQUIRES_OK( - context, - context->allocate_temp( - DataTypeToEnum::value, - ShapeFromFormat(FORMAT_NCHW, - GetTensorDim(input_shape, data_format_, 'N'), - GetTensorDim(input_shape, data_format_, 'H'), - GetTensorDim(input_shape, data_format_, 'W'), - GetTensorDim(input_shape, data_format_, 'C')), - &in_backprop_remove_padding)); - - // Remove the padding for odd rows or cols. 
- functor::PadInput()( - context->template eigen_device(), - To32Bit(const_cast(pre_transformed_in_backprop) - .tensor()), - {{0, 0}}, {{-rows_odd, -cols_odd}}, - To32Bit(in_backprop_remove_padding.tensor()), FORMAT_NCHW); - - pre_transformed_in_backprop = in_backprop_remove_padding; - } - - if (data_format_ == FORMAT_NHWC) { - auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; }; - functor::NCHWToNHWC()( - context->eigen_device(), - toConstTensor(pre_transformed_in_backprop).template tensor(), - in_backprop->tensor()); - } else { - *in_backprop = pre_transformed_in_backprop; - } + launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop, filter, + stride_rows, stride_cols, padding_, in_backprop, data_format_); } private: @@ -956,11 +617,354 @@ class Conv2DSlowBackpropInputOp : public OpKernel { Padding padding_; bool use_cudnn_; TensorFormat data_format_; + LaunchConv2DBackpropInputOp launcher_; bool cudnn_use_autotune_; TF_DISALLOW_COPY_AND_ASSIGN(Conv2DSlowBackpropInputOp); }; +template +void LaunchConv2DBackpropInputOp::operator()( + OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& out_backprop, const Tensor& filter, int row_stride, + int col_stride, const Padding& padding, Tensor* in_backprop, + TensorFormat data_format) { + using perftools::gputools::dnn::AlgorithmConfig; + using perftools::gputools::dnn::AlgorithmType; + using perftools::gputools::dnn::ProfileResult; + + std::vector strides(4, 1); + strides[GetTensorDimIndex(data_format, 'H')] = row_stride; + strides[GetTensorDimIndex(data_format, 'W')] = col_stride; + TensorShape input_shape = in_backprop->shape(); + + const TensorShape& filter_shape = filter.shape(); + ConvBackpropDimensions dims; + OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensions( + "Conv2DSlowBackpropInput", /*num_spatial_dims=*/2, + input_shape, filter_shape, out_backprop.shape(), + strides, padding, data_format, &dims)); + + const int padding_rows = + (padding == VALID) + ? 0 + : std::max(0, (dims.spatial_dims[0].output_size - 1) * + dims.spatial_dims[0].stride + + dims.spatial_dims[0].filter_size - + dims.spatial_dims[0].input_size); + const int padding_cols = + (padding == VALID) + ? 0 + : std::max(0, (dims.spatial_dims[1].output_size - 1) * + dims.spatial_dims[1].stride + + dims.spatial_dims[1].filter_size - + dims.spatial_dims[1].input_size); + + // TODO(keveman): cuDNN only supports equal padding on both sides, so only + // calling it when that is true. Remove this check when (if?) cuDNN starts + // supporting different padding. + bool rows_odd = (padding_rows % 2 != 0); + bool cols_odd = (padding_cols % 2 != 0); + + auto* stream = ctx->op_device_context()->stream(); + OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available.")); + + if (!use_cudnn) { + ctx->SetStatus(errors::Unimplemented( + "Conv2DBackpropInput for GPU is not currently supported " + "without cudnn")); + return; + } + + if (dims.spatial_dims[0].filter_size == 1 && + dims.spatial_dims[1].filter_size == 1 && + dims.spatial_dims[0].stride == 1 && dims.spatial_dims[1].stride == 1 && + data_format == FORMAT_NHWC) { + // 1x1 filter, so call cublas directly. 
+ const uint64 m = dims.batch_size * dims.spatial_dims[0].input_size * + dims.spatial_dims[1].input_size; + const uint64 k = dims.out_depth; + const uint64 n = dims.in_depth; + + auto a_ptr = AsDeviceMemory(out_backprop.template flat().data(), + out_backprop.template flat().size()); + auto b_ptr = AsDeviceMemory(filter.template flat().data(), + filter.template flat().size()); + auto c_ptr = AsDeviceMemory(in_backprop->template flat().data(), + in_backprop->template flat().size()); + + auto transpose = perftools::gputools::blas::Transpose::kTranspose; + auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose; + + bool blas_launch_status = + stream + ->ThenBlasGemm(transpose, no_transpose, n, m, k, 1.0f, b_ptr, k, + a_ptr, k, 0.0f, &c_ptr, n) + .ok(); + if (!blas_launch_status) { + ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m, + ", n=", n, ", k=", k)); + } + return; + } else if (dims.spatial_dims[0].filter_size == + dims.spatial_dims[0].input_size && + dims.spatial_dims[1].filter_size == + dims.spatial_dims[1].input_size && + padding == VALID && data_format == FORMAT_NHWC) { + // The input data and filter have the same height/width, so call cublas + // directly. + const uint64 m = dims.batch_size; + const uint64 k = dims.out_depth; + const uint64 n = dims.spatial_dims[0].input_size * + dims.spatial_dims[1].input_size * dims.in_depth; + + auto a_ptr = AsDeviceMemory(out_backprop.template flat().data(), + out_backprop.template flat().size()); + auto b_ptr = AsDeviceMemory(filter.template flat().data(), + filter.template flat().size()); + auto c_ptr = AsDeviceMemory(in_backprop->template flat().data(), + in_backprop->template flat().size()); + + auto transpose = perftools::gputools::blas::Transpose::kTranspose; + auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose; + + bool blas_launch_status = + stream + ->ThenBlasGemm(transpose, no_transpose, n, m, k, 1.0f, b_ptr, k, + a_ptr, k, 0.0f, &c_ptr, n) + .ok(); + if (!blas_launch_status) { + ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m, + ", n=", n, ", k=", k)); + } + return; + } + + TensorShape compatible_input_shape; + if (rows_odd || cols_odd) { + // If a padding dimension is odd, we have one more element on the right + // side or the bottom side. This is unsupported in cudnn. Therefore, + // we pad that extra element and make it compatible. 
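// [Sketch, not part of the patch] Why the 1x1 / stride-1 NHWC case above is a
// single GEMM: with a [1, 1, in_depth, out_depth] filter the forward pass is
// Y = X * W on (N*H*W, in_depth) x (in_depth, out_depth) matrices, so the
// input gradient is dX = dY * W^T. A host-side Eigen analogue of the cuBLAS
// call (shapes and name are hypothetical):
#include <Eigen/Dense>

Eigen::MatrixXf Conv1x1BackpropInput(const Eigen::MatrixXf& dy,   // (N*H*W, out_depth)
                                     const Eigen::MatrixXf& w) {  // (in_depth, out_depth)
  return dy * w.transpose();                                      // (N*H*W, in_depth)
}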
+ compatible_input_shape = ShapeFromFormat( + data_format, dims.batch_size, + dims.spatial_dims[0].input_size + rows_odd, + dims.spatial_dims[1].input_size + cols_odd, dims.in_depth); + } else { + compatible_input_shape = input_shape; + } + + CHECK(padding_rows >= 0 && padding_cols >= 0) + << "Negative row or col paddings: (" << padding_rows << ", " + << padding_cols << ")"; + perftools::gputools::dnn::BatchDescriptor input_desc; + input_desc.set_count(dims.batch_size) + .set_height(GetTensorDim(compatible_input_shape, data_format, 'H')) + .set_width(GetTensorDim(compatible_input_shape, data_format, 'W')) + .set_feature_map_count(dims.in_depth) + .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); + perftools::gputools::dnn::BatchDescriptor output_desc; + output_desc.set_count(dims.batch_size) + .set_height(dims.spatial_dims[0].output_size) + .set_width(dims.spatial_dims[1].output_size) + .set_feature_map_count(dims.out_depth) + .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); + perftools::gputools::dnn::FilterDescriptor filter_desc; + filter_desc.set_input_filter_height(dims.spatial_dims[0].filter_size) + .set_input_filter_width(dims.spatial_dims[1].filter_size) + .set_input_feature_map_count(dims.in_depth) + .set_output_feature_map_count(dims.out_depth); + perftools::gputools::dnn::ConvolutionDescriptor conv_desc; + conv_desc.set_vertical_filter_stride(dims.spatial_dims[0].stride) + .set_horizontal_filter_stride(dims.spatial_dims[1].stride) + .set_zero_padding_height(padding_rows / 2) + .set_zero_padding_width(padding_cols / 2); + + // NOTE(keveman): + // cuDNN only supports the following layouts : + // Input : B x D x R x C + // Filter : OD x ID x R x C + // Whereas, we have + // Input : B x R x C x D + // Filter : R x C x ID x OD + // TransformFilter performs (R x C x ID x OD) => (OD x ID x R x C) + // The first TransformDepth performs + // (B x R x C x D) => (B x D x R x C). + // Since the tensor returned from cuDNN is B x D x R x C also, + // the second TransformDepth performs + // (B x D x R x C) => (B x R x C x D). + Tensor transformed_filter; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp(DataTypeToEnum::value, + TensorShape({dims.out_depth, dims.in_depth, + dims.spatial_dims[0].filter_size, + dims.spatial_dims[1].filter_size}), + &transformed_filter)); + + functor::TransformFilter()( + ctx->eigen_device(), To32Bit(filter.tensor()), + To32Bit(transformed_filter.tensor())); + + Tensor transformed_out_backprop; + if (data_format == FORMAT_NHWC) { + TensorShape nchw_shape = ShapeFromFormat( + FORMAT_NCHW, dims.batch_size, dims.spatial_dims[0].output_size, + dims.spatial_dims[1].output_size, dims.out_depth); + if (dims.out_depth > 1) { + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum::value, nchw_shape, + &transformed_out_backprop)); + functor::NHWCToNCHW()( + ctx->eigen_device(), out_backprop.tensor(), + transformed_out_backprop.tensor()); + } else { + // If depth <= 1, then just reshape. 
+ CHECK(transformed_out_backprop.CopyFrom(out_backprop, nchw_shape)); + } + } else { + transformed_out_backprop = out_backprop; + } + + Tensor pre_transformed_in_backprop; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp( + DataTypeToEnum::value, + ShapeFromFormat( + FORMAT_NCHW, + GetTensorDim(compatible_input_shape, data_format, 'N'), + GetTensorDim(compatible_input_shape, data_format, 'H'), + GetTensorDim(compatible_input_shape, data_format, 'W'), + GetTensorDim(compatible_input_shape, data_format, 'C')), + &pre_transformed_in_backprop)); + + auto out_backprop_ptr = + AsDeviceMemory(transformed_out_backprop.template flat().data(), + transformed_out_backprop.template flat().size()); + auto filter_ptr = + AsDeviceMemory(transformed_filter.template flat().data(), + transformed_filter.template flat().size()); + auto in_backprop_ptr = + AsDeviceMemory(pre_transformed_in_backprop.template flat().data(), + pre_transformed_in_backprop.template flat().size()); + + static int64 ConvolveBackwardDataScratchSize = GetCudnnWorkspaceLimit( + "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB by default + ); + CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, ctx); + int device_id = stream->parent()->device_ordinal(); + DataType dtype = out_backprop.dtype(); + ConvParameters conv_parameters = { + dims.batch_size, // batch + dims.in_depth, // in_depths + {{input_desc.height(), // in_rows + input_desc.width()}}, // in_cols + dims.out_depth, // out_depths + {{dims.spatial_dims[0].filter_size, // filter_rows + dims.spatial_dims[1].filter_size}}, // filter_cols + {{dims.spatial_dims[0].stride, // stride_rows + dims.spatial_dims[1].stride}}, // stride_cols + {{padding_rows, // padding_rows + padding_cols}}, // padding_cols + dtype, // tensor data type + device_id, // device_id + }; + AlgorithmConfig algorithm_config; + if (cudnn_use_autotune && !AutoTuneConvBwdData::GetInstance()->Find( + conv_parameters, &algorithm_config)) { + std::vector algorithms; + CHECK(stream->parent()->GetConvolveBackwardDataAlgorithms( + conv_parameters.ShouldIncludeWinogradNonfusedAlgo(), &algorithms)); + ProfileResult best_result; + ProfileResult best_result_no_scratch; + for (auto profile_algorithm : algorithms) { + // TODO(zhengxq): profile each algorithm multiple times to better + // accuracy. 
+ CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, + ctx); + ProfileResult profile_result; + bool cudnn_launch_status = + stream + ->ThenConvolveBackwardDataWithAlgorithm( + filter_desc, filter_ptr, output_desc, out_backprop_ptr, + conv_desc, input_desc, &in_backprop_ptr, &scratch_allocator, + AlgorithmConfig(profile_algorithm), &profile_result) + .ok(); + if (cudnn_launch_status) { + if (profile_result.is_valid()) { + if (profile_result.elapsed_time_in_ms() < + best_result.elapsed_time_in_ms()) { + best_result = profile_result; + } + if (scratch_allocator.TotalByteSize() == 0 && + profile_result.elapsed_time_in_ms() < + best_result_no_scratch.elapsed_time_in_ms()) { + best_result_no_scratch = profile_result; + } + } + } + } + OP_REQUIRES(ctx, + best_result.is_valid() || best_result_no_scratch.is_valid(), + errors::NotFound("No algorithm worked!")); + if (best_result.is_valid()) { + algorithm_config.set_algorithm(best_result.algorithm()); + } + if (best_result_no_scratch.is_valid()) { + algorithm_config.set_algorithm_no_scratch( + best_result_no_scratch.algorithm()); + } + AutoTuneConvBwdData::GetInstance()->Insert(conv_parameters, + algorithm_config); + } + bool cudnn_launch_status = + stream + ->ThenConvolveBackwardDataWithAlgorithm( + filter_desc, filter_ptr, output_desc, out_backprop_ptr, conv_desc, + input_desc, &in_backprop_ptr, &scratch_allocator, + algorithm_config, nullptr) + .ok(); + + if (!cudnn_launch_status) { + ctx->SetStatus(errors::Internal( + "cuDNN Backward Data function launch failure : input shape(", + input_shape.DebugString(), ") filter shape(", + filter_shape.DebugString(), ")")); + return; + } + + if (rows_odd || cols_odd) { + Tensor in_backprop_remove_padding; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp( + DataTypeToEnum::value, + ShapeFromFormat(FORMAT_NCHW, + GetTensorDim(input_shape, data_format, 'N'), + GetTensorDim(input_shape, data_format, 'H'), + GetTensorDim(input_shape, data_format, 'W'), + GetTensorDim(input_shape, data_format, 'C')), + &in_backprop_remove_padding)); + + // Remove the padding for odd rows or cols. + functor::PadInput()( + ctx->template eigen_device(), + To32Bit(const_cast(pre_transformed_in_backprop) + .tensor()), + {{0, 0}}, {{-rows_odd, -cols_odd}}, + To32Bit(in_backprop_remove_padding.tensor()), FORMAT_NCHW); + + pre_transformed_in_backprop = in_backprop_remove_padding; + } + + if (data_format == FORMAT_NHWC) { + auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; }; + functor::NCHWToNHWC()( + ctx->eigen_device(), + toConstTensor(pre_transformed_in_backprop).template tensor(), + in_backprop->tensor()); + } else { + *in_backprop = pre_transformed_in_backprop; + } +} + // Forward declarations of the functor specializations for GPU. namespace functor { #define DECLARE_GPU_SPEC(T) \ diff --git a/tensorflow/core/kernels/conv_grad_ops.h b/tensorflow/core/kernels/conv_grad_ops.h index 3ea9510afba..2926bb3a867 100644 --- a/tensorflow/core/kernels/conv_grad_ops.h +++ b/tensorflow/core/kernels/conv_grad_ops.h @@ -168,6 +168,43 @@ limitations under the License. namespace tensorflow { +// Forward declaration. 
+class OpKernelContext; + +template +struct LaunchConv2DBackpropInputOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& out_backprop, const Tensor& filter, + int row_stride, int col_stride, const Padding& padding, + Tensor* in_backprop, TensorFormat data_format); +}; + +template +struct LaunchConv2DBackpropFilterOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& out_backprop, const Tensor& input, + int row_stride, int col_stride, const Padding& padding, + Tensor* filter_backprop, TensorFormat data_format); +}; + +#ifdef GOOGLE_CUDA +template +struct LaunchConv2DBackpropInputOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& input, const Tensor& filter, int row_stride, + int col_stride, const Padding& padding, Tensor* output, + TensorFormat data_format); +}; + +template +struct LaunchConv2DBackpropFilterOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& out_backprop, const Tensor& input, + int row_stride, int col_stride, const Padding& padding, + Tensor* filter_backprop, TensorFormat data_format); +}; +#endif // GOOGLE_CUDA + // Information about a single spatial dimension for a convolution // backpropagation. struct ConvBackpropSpatialDimension { diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc index 2c77a389527..bbb9e36fc9d 100644 --- a/tensorflow/core/kernels/conv_ops.cc +++ b/tensorflow/core/kernels/conv_ops.cc @@ -58,10 +58,10 @@ typedef Eigen::GpuDevice GPUDevice; namespace { template struct LaunchGeneric { - static void launch(OpKernelContext* ctx, const Tensor& input, - const Tensor& filter, int row_stride, int col_stride, - const Eigen::PaddingType& padding, Tensor* output, - TensorFormat data_format) { + void operator()(OpKernelContext* ctx, const Tensor& input, + const Tensor& filter, int row_stride, int col_stride, + const Padding& padding, Tensor* output, + TensorFormat data_format) { CHECK(data_format == FORMAT_NHWC) << "Generic conv implementation only " "supports NHWC tensor format for now."; if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 && row_stride == 1 && @@ -86,8 +86,7 @@ struct LaunchGeneric { filter.shaped({filter.dim_size(2), filter.dim_size(3)}), dim_pair); } else if (filter.dim_size(0) == input.dim_size(1) && - filter.dim_size(1) == input.dim_size(2) && - padding == Eigen::PADDING_VALID) { + filter.dim_size(1) == input.dim_size(2) && padding == VALID) { // If the input data and filter have the same height/width, // the 2D convolution is reduced to matrix multiplication. const int k = // Length of reduction dimension. 
@@ -104,28 +103,26 @@ struct LaunchGeneric { functor::SpatialConvolution()( ctx->eigen_device(), output->tensor(), input.tensor(), filter.tensor(), row_stride, col_stride, - padding); + BrainPadding2EigenPadding(padding)); } } }; } // namespace template -class LaunchConv2DOp { - public: - void launch(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, - const Tensor& input, const Tensor& filter, int row_stride, - int col_stride, const Eigen::PaddingType& padding, Tensor* output, - TensorFormat data_format) { +struct LaunchConv2DOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& input, const Tensor& filter, int row_stride, + int col_stride, const Padding& padding, Tensor* output, + TensorFormat data_format) { if (data_format != FORMAT_NHWC) { ctx->SetStatus( errors::Unimplemented("Generic conv implementation only supports " "NHWC tensor format for now.")); return; } - LaunchGeneric::launch(ctx, input, filter, row_stride, - col_stride, padding, output, - data_format); + LaunchGeneric()(ctx, input, filter, row_stride, col_stride, + padding, output, data_format); } }; @@ -387,9 +384,8 @@ class Conv2DOp : public BinaryOp { return; } - launcher_.launch(context, use_cudnn_, cudnn_use_autotune_, input, filter, - stride_rows, stride_cols, - BrainPadding2EigenPadding(padding_), output, data_format_); + launcher_(context, use_cudnn_, cudnn_use_autotune_, input, filter, + stride_rows, stride_cols, padding_, output, data_format_); } private: @@ -445,10 +441,10 @@ typedef AutoTuneSingleton -void LaunchConv2DOp::launch( +void LaunchConv2DOp::operator()( OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, const Tensor& input_param, const Tensor& filter, int row_stride, - int col_stride, const Eigen::PaddingType& padding, Tensor* output, + int col_stride, const Padding& padding, Tensor* output, TensorFormat data_format) { using perftools::gputools::dnn::AlgorithmConfig; using perftools::gputools::dnn::AlgorithmType; @@ -492,8 +488,8 @@ void LaunchConv2DOp::launch( } return; } else if (filter.dim_size(0) == input.dim_size(1) && - filter.dim_size(1) == input.dim_size(2) && - padding == Eigen::PADDING_VALID && data_format == FORMAT_NHWC) { + filter.dim_size(1) == input.dim_size(2) && padding == VALID && + data_format == FORMAT_NHWC) { // The input data and filter have the same height/width, so call cublas // directly. 
const uint64 m = input.dim_size(0); @@ -533,7 +529,7 @@ void LaunchConv2DOp::launch( const int64 out_depths = GetTensorDim(*output, data_format, 'C'); const int64 patch_rows = filter.dim_size(0); const int64 patch_cols = filter.dim_size(1); - if (padding == Eigen::PADDING_SAME) { + if (padding == SAME) { // Total padding on rows and cols is // Pr = (R' - 1) * S + Kr - R // Pc = (C' - 1) * S + Kc - C diff --git a/tensorflow/core/kernels/conv_ops.h b/tensorflow/core/kernels/conv_ops.h index 60091fc27fd..e29271dff27 100644 --- a/tensorflow/core/kernels/conv_ops.h +++ b/tensorflow/core/kernels/conv_ops.h @@ -32,14 +32,23 @@ namespace tensorflow { class OpKernelContext; template -class LaunchConv2DOp { - public: - void launch(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, - const Tensor& input, const Tensor& filter, int row_stride, - int col_stride, const Eigen::PaddingType& padding, Tensor* output, - TensorFormat data_format); +struct LaunchConv2DOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& input, const Tensor& filter, int row_stride, + int col_stride, const Padding& padding, Tensor* output, + TensorFormat data_format); }; +#ifdef GOOGLE_CUDA +template +struct LaunchConv2DOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& input, const Tensor& filter, int row_stride, + int col_stride, const Padding& padding, Tensor* output, + TensorFormat data_format); +}; +#endif // GOOGLE_CUDA + // Used to keep track of persistent memory buffers used within the op. // It uses malloc and free to avoid the time cost of initializing the memory. template @@ -55,17 +64,6 @@ struct Im2ColBufferResource : public ResourceBase { string DebugString() { return "Im2ColBufferResource"; } }; -#ifdef GOOGLE_CUDA -template -class LaunchConv2DOp { - public: - void launch(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, - const Tensor& input, const Tensor& filter, int row_stride, - int col_stride, const Eigen::PaddingType& padding, Tensor* output, - TensorFormat data_format); -}; -#endif // GOOGLE_CUDA - } // namespace tensorflow #endif // TENSORFLOW_KERNELS_CONV_OPS_H diff --git a/tensorflow/core/kernels/ctc_loss_op.cc b/tensorflow/core/kernels/ctc_loss_op.cc index a1f60019141..fb03adb7a53 100644 --- a/tensorflow/core/kernels/ctc_loss_op.cc +++ b/tensorflow/core/kernels/ctc_loss_op.cc @@ -91,7 +91,14 @@ class CTCLossOp : public OpKernel { OP_REQUIRES(ctx, batch_size != 0, errors::InvalidArgument("batch_size must not be 0")); - TensorShape labels_shape({batch_size, max_time}); + // Figure out the maximum label length to use as sparse tensor dimension. 
+ auto labels_indices_t = labels_indices->matrix(); + int64 max_label_len = 0; + for (int i = 0; i < labels_indices->dim_size(0); i++) { + max_label_len = std::max(max_label_len, labels_indices_t(i, 1) + 1); + } + + TensorShape labels_shape({batch_size, max_label_len}); std::vector order{0, 1}; sparse::SparseTensor labels_sp(*labels_indices, *labels_values, labels_shape, order); diff --git a/tensorflow/core/kernels/cuda_solvers.cc b/tensorflow/core/kernels/cuda_solvers.cc index 3a8ccfe6b74..5c6b5eec829 100644 --- a/tensorflow/core/kernels/cuda_solvers.cc +++ b/tensorflow/core/kernels/cuda_solvers.cc @@ -30,10 +30,13 @@ #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/cuda.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/platform/types.h" +using ::perftools::gputools::cuda::ScopedActivateExecutorContext; + namespace tensorflow { namespace { @@ -148,7 +151,12 @@ Status CudaSolver::CopyLapackInfoToHostAsync( // This callback checks that all batch items in all calls were processed // successfully and passes status to the info_checker_callback accordingly. auto wrapped_info_checker_callback = - [info_checker_callback](std::vector host_lapack_infos) { + [](OpKernelContext* context, + std::function&)> + info_checker_callback, + std::vector host_lapack_infos) { + auto stream = context->op_device_context()->stream(); + ScopedActivateExecutorContext scoped_activation{stream->parent()}; Status status; for (const auto& host_lapack_info : host_lapack_infos) { for (int i = 0; i < host_lapack_info.size() && status.ok(); ++i) { @@ -166,8 +174,10 @@ Status CudaSolver::CopyLapackInfoToHostAsync( } info_checker_callback(status, host_lapack_infos); }; + auto cb = - std::bind(wrapped_info_checker_callback, std::move(host_lapack_infos)); + std::bind(wrapped_info_checker_callback, context_, + std::move(info_checker_callback), std::move(host_lapack_infos)); auto stream = context_->op_device_context()->stream(); context_->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute( stream, std::move(cb)); diff --git a/tensorflow/core/kernels/cwise_op_sub.cc b/tensorflow/core/kernels/cwise_op_sub.cc index eb173c7040d..6adaecba04b 100644 --- a/tensorflow/core/kernels/cwise_op_sub.cc +++ b/tensorflow/core/kernels/cwise_op_sub.cc @@ -18,7 +18,10 @@ limitations under the License. namespace tensorflow { REGISTER7(BinaryOp, CPU, "Sub", functor::sub, float, Eigen::half, double, int32, int64, complex64, complex128); -#if defined(__ANDROID_TYPES_SLIM__) +#if !defined(__ANDROID_TYPES_SLIM__) +// Sub op for int8, uint8, int16, uint16 +REGISTER4(BinaryOp, CPU, "Sub", functor::sub, int8, uint8, int16, uint16); +#else // We only register the first type when we have multi-argument calls in the // case where we're trying to reduce executable size, but it turns out that the // int32 version of this op is needed, so explicitly include it. 
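The ctc_loss_op.cc change above sizes the dense label tensor from the sparse indices instead of assuming max_time: the second column of labels_indices is the position of a label within its sequence, so the longest sequence is the largest such position plus one. A stand-alone illustration of that scan (plain vectors in place of a Tensor; names are hypothetical):

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Each entry is (batch index, position of the label within that sequence).
int64_t MaxLabelLength(const std::vector<std::pair<int64_t, int64_t>>& indices) {
  int64_t max_len = 0;
  for (const auto& ix : indices) {
    max_len = std::max(max_len, ix.second + 1);
  }
  return max_len;
}
// e.g. {{0,0},{0,1},{1,0},{1,1},{1,2}} yields 3, so labels_shape becomes
// {batch_size, 3} rather than {batch_size, max_time}.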
diff --git a/tensorflow/core/kernels/debug_ops_test.cc b/tensorflow/core/kernels/debug_ops_test.cc index 89bcbc9c373..37c94865942 100644 --- a/tensorflow/core/kernels/debug_ops_test.cc +++ b/tensorflow/core/kernels/debug_ops_test.cc @@ -573,7 +573,8 @@ TEST_F(DebugNumericSummaryOpTest, UInt8Success) { TEST_F(DebugNumericSummaryOpTest, BoolSuccess) { TF_ASSERT_OK(Init(DT_BOOL)); - AddInputFromArray(TensorShape({2, 3}), {0, 0, 1, 1, 1, 0}); + AddInputFromArray(TensorShape({2, 3}), + {false, false, true, true, true, false}); TF_ASSERT_OK(RunOpKernel()); Tensor expected(allocator(), DT_DOUBLE, TensorShape({16})); diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc index 00d7f564082..9804d7d38e1 100644 --- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc +++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc @@ -361,19 +361,15 @@ static void ComputeBackpropInput(const DepthwiseArgs& args, } } -// Kernels to compute the input backprop for depthwise convolution. -template -struct LaunchDepthwiseConvBackpropInputOp; - // Computes the depthwise conv2d backprop input of 'out_backprop' by // 'depthwise_filter' and stores the result in 'in_backprop'. template struct LaunchDepthwiseConvBackpropInputOp { typedef typename Eigen::internal::packet_traits::type Packet; - static void launch(OpKernelContext* ctx, const DepthwiseArgs& args, - const T* out_backprop, const T* depthwise_filter, - T* in_backprop, TensorFormat data_format) { + void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, + const T* out_backprop, const T* depthwise_filter, + T* in_backprop, TensorFormat data_format) { OP_REQUIRES( ctx, data_format == FORMAT_NHWC, errors::Unimplemented( @@ -514,27 +510,8 @@ static void DepthwiseConvBackpropInputReference(const DepthwiseArgs& args, #if GOOGLE_CUDA -template -struct DepthwiseConv2dBackpropInputGPULaunch { - static void Run(const GPUDevice& d, const DepthwiseArgs args, - const T* out_backprop, const T* filter, T* in_backprop, - TensorFormat data_format); -}; - -template -struct LaunchDepthwiseConvBackpropInputOp { - static void launch(OpKernelContext* ctx, const DepthwiseArgs args, - const T* out_backprop, const T* filter, T* in_backprop, - TensorFormat data_format) { - const GPUDevice& d = ctx->eigen_device(); - DepthwiseConv2dBackpropInputGPULaunch().Run( - d, args, out_backprop, filter, in_backprop, data_format); - auto stream = ctx->op_device_context()->stream(); - OP_REQUIRES(ctx, stream->ok(), errors::Internal("Launch of gpu kernel for " - "DepthwiseConv2dBackpropInp" - "utGPULaunch failed")); - } -}; +extern template struct LaunchDepthwiseConvBackpropInputOp; +extern template struct LaunchDepthwiseConvBackpropInputOp; #endif // GOOGLE_CUDA @@ -598,7 +575,7 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel { if (input_shape.num_elements() == 0) { return; } - LaunchDepthwiseConvBackpropInputOp::launch( + LaunchDepthwiseConvBackpropInputOp()( context, args, out_backprop_ptr, filter_ptr, in_backprop_ptr, data_format_); } @@ -744,9 +721,9 @@ template struct LaunchDepthwiseConvBackpropFilterOp { typedef typename Eigen::internal::packet_traits::type Packet; - static void launch(OpKernelContext* ctx, const DepthwiseArgs& args, - const T* out_backprop, const T* input, T* filter_backprop, - TensorFormat data_format) { + void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, + const T* out_backprop, const T* input, T* filter_backprop, + TensorFormat data_format) { OP_REQUIRES( ctx, data_format == 
FORMAT_NHWC, errors::Unimplemented( @@ -907,35 +884,8 @@ static void DepthwiseConvBackpropFilterReference(const DepthwiseArgs& args, #if GOOGLE_CUDA -template -struct DepthwiseConv2dBackpropFilterGPULaunch { - static void Run(const GPUDevice& d, const DepthwiseArgs args, - const T* out_backprop, const T* input, T* filter_backprop, - TensorFormat data_format); -}; - -template -struct LaunchDepthwiseConvBackpropFilterOp { - static void launch(OpKernelContext* ctx, const DepthwiseArgs args, - const T* out_backprop, const T* input, T* filter_backprop, - TensorFormat data_format) { - const GPUDevice& d = ctx->eigen_device(); - auto stream = ctx->op_device_context()->stream(); - - // Initialize the results to 0. - int num_filter_backprop = - args.filter_rows * args.filter_cols * args.out_depth; - perftools::gputools::DeviceMemoryBase filter_bp_ptr(filter_backprop, - num_filter_backprop); - stream->ThenMemset32(&filter_bp_ptr, 0, num_filter_backprop * sizeof(T)); - - DepthwiseConv2dBackpropFilterGPULaunch().Run( - d, args, out_backprop, input, filter_backprop, data_format); - OP_REQUIRES(ctx, stream->ok(), errors::Internal("Launch of gpu kernel for " - "DepthwiseConv2dBackpropFil" - "terGPULaunch failed")); - } -}; +extern template struct LaunchDepthwiseConvBackpropFilterOp; +extern template struct LaunchDepthwiseConvBackpropFilterOp; #endif // GOOGLE_CUDA @@ -1001,7 +951,7 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel { if (filter_shape.num_elements() == 0) { return; } - LaunchDepthwiseConvBackpropFilterOp::launch( + LaunchDepthwiseConvBackpropFilterOp()( context, args, out_backprop_ptr, input_ptr, filter_backprop_ptr, data_format_); } diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc index 3c01546d8d7..bbeeaf78954 100644 --- a/tensorflow/core/kernels/depthwise_conv_op.cc +++ b/tensorflow/core/kernels/depthwise_conv_op.cc @@ -54,9 +54,6 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; -template -struct LaunchDepthwiseConvOp; - // Computes the vectorized product of 'input_buffer' and 'filter' and stores // result in 'output' at location specified by 'out_r' and 'out_c'. // @@ -156,9 +153,9 @@ template struct LaunchDepthwiseConvOp { typedef typename Eigen::internal::packet_traits::type Packet; - static void launch(OpKernelContext* ctx, const DepthwiseArgs& args, - const T* input, const T* depthwise_filter, T* output, - TensorFormat data_format) { + void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, + const T* input, const T* depthwise_filter, T* output, + TensorFormat data_format) { OP_REQUIRES( ctx, data_format == FORMAT_NHWC, errors::Unimplemented( @@ -248,27 +245,9 @@ extern template class LaunchConv2DOp; #if GOOGLE_CUDA -template -struct DepthwiseConv2dGPULaunch { - static void Run(const GPUDevice& d, const DepthwiseArgs args, const T* input, - const T* filter, T* output, TensorFormat data_format); -}; - -template -struct LaunchDepthwiseConvOp { - static void launch(OpKernelContext* ctx, const DepthwiseArgs args, - const T* input, const T* filter, T* output, - TensorFormat data_format) { - const GPUDevice& d = ctx->eigen_device(); - DepthwiseConv2dGPULaunch().Run(d, args, input, filter, output, - data_format); - auto stream = ctx->op_device_context()->stream(); - OP_REQUIRES( - ctx, stream->ok(), - errors::Internal( - "Launch of gpu kernel for DepthwiseConv2dGPULaunch failed")); - } -}; +// Extern template instantiated in depthwise_conv_op_gpu.cc. 
+extern template struct LaunchDepthwiseConvOp; +extern template struct LaunchDepthwiseConvOp; // Extern template instantiated in conv_ops.cc. extern template class LaunchConv2DOp; @@ -393,9 +372,8 @@ class DepthwiseConv2dNativeOp : public BinaryOp { // If in_depth==1, this operation is just a standard convolution, so // invoke that op. if (std::is_same::value && in_depth == 1) { - launcher_.launch(context, use_cudnn_, cudnn_use_autotune_, input, filter, - stride_, stride_, BrainPadding2EigenPadding(padding_), - output, data_format_); + launcher_(context, use_cudnn_, cudnn_use_autotune_, input, filter, + stride_, stride_, padding_, output, data_format_); return; } @@ -417,8 +395,8 @@ class DepthwiseConv2dNativeOp : public BinaryOp { auto input_ptr = input.template flat().data(); auto filter_ptr = filter.template flat().data(); auto output_ptr = output->template flat().data(); - LaunchDepthwiseConvOp::launch( - context, args, input_ptr, filter_ptr, output_ptr, data_format_); + LaunchDepthwiseConvOp()(context, args, input_ptr, filter_ptr, + output_ptr, data_format_); } private: diff --git a/tensorflow/core/kernels/depthwise_conv_op.h b/tensorflow/core/kernels/depthwise_conv_op.h index 1960b02bbea..aa5b5c76f6a 100644 --- a/tensorflow/core/kernels/depthwise_conv_op.h +++ b/tensorflow/core/kernels/depthwise_conv_op.h @@ -56,6 +56,53 @@ struct DepthwiseArgs { out_depth(0) {} }; +// Forward declaration. +class OpKernelContext; + +template +struct LaunchDepthwiseConvOp { + void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, + const T* input, const T* filter, T* output, + TensorFormat data_format); +}; + +template +struct LaunchDepthwiseConvBackpropInputOp { + void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, + const T* out_backprop, const T* filter, T* in_backprop, + TensorFormat data_format); +}; + +template +struct LaunchDepthwiseConvBackpropFilterOp { + void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, + const T* out_backprop, const T* input, T* filter_backprop, + TensorFormat data_format); +}; + +#if GOOGLE_CUDA +template +struct LaunchDepthwiseConvOp { + void operator()(OpKernelContext* ctx, const DepthwiseArgs args, + const T* input, const T* filter, T* output, + TensorFormat data_format); +}; + +template +struct LaunchDepthwiseConvBackpropInputOp { + void operator()(class OpKernelContext* ctx, const DepthwiseArgs& args, + const T* out_backprop, const T* filter, T* in_backprop, + TensorFormat data_format); +}; + +template +struct LaunchDepthwiseConvBackpropFilterOp { + void operator()(class OpKernelContext* ctx, const DepthwiseArgs& args, + const T* out_backprop, const T* input, T* filter_backprop, + TensorFormat data_format); +}; +#endif + } // namespace tensorflow namespace tensorflow { diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc index f63a99a7308..fcfcd188d2d 100644 --- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc +++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc @@ -17,6 +17,7 @@ limitations under the License. 
#define EIGEN_USE_GPU #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/kernels/depthwise_conv_op.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/cuda_kernel_helper.h" @@ -689,21 +690,27 @@ void LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args, // A simple launch pad to launch the Cuda kernel for depthwise convolution. template -struct DepthwiseConv2dGPULaunch { - static void Run(const GpuDevice& d, const DepthwiseArgs args, const T* input, - const T* filter, T* output, TensorFormat data_format) { - if (args.filter_rows == 3 && args.filter_cols == 3) { - LaunchDepthwiseConv2dGPU(d, args, input, filter, output, +void LaunchDepthwiseConvOp::operator()(OpKernelContext* ctx, + const DepthwiseArgs args, + const T* input, + const T* filter, T* output, + TensorFormat data_format) { + const GPUDevice& d = ctx->eigen_device(); + if (args.filter_rows == 3 && args.filter_cols == 3) { + LaunchDepthwiseConv2dGPU(d, args, input, filter, output, + data_format); + } else { + LaunchDepthwiseConv2dGPU(d, args, input, filter, output, data_format); - } else { - LaunchDepthwiseConv2dGPU(d, args, input, filter, output, - data_format); - } } -}; + auto stream = ctx->op_device_context()->stream(); + OP_REQUIRES(ctx, stream->ok(), + errors::Internal( + "Launch of gpu kernel for DepthwiseConv2dGPULaunch failed")); +} -template struct DepthwiseConv2dGPULaunch; -template struct DepthwiseConv2dGPULaunch; +template struct LaunchDepthwiseConvOp; +template struct LaunchDepthwiseConvOp; // A Cuda kernel to compute the depthwise convolution backprop w.r.t. input. template -struct DepthwiseConv2dBackpropInputGPULaunch { - static void Run(const GpuDevice& d, const DepthwiseArgs args, - const T* out_backprop, const T* filter, T* in_backprop, - TensorFormat data_format) { - if (args.filter_rows == 3 && args.filter_cols == 3) { - LaunchDepthwiseConv2dBackpropInputGPU( - d, args, out_backprop, filter, in_backprop, data_format); - } else { - LaunchDepthwiseConv2dBackpropInputGPU( - d, args, out_backprop, filter, in_backprop, data_format); - } +void LaunchDepthwiseConvBackpropInputOp::operator()( + OpKernelContext* ctx, const DepthwiseArgs& args, const T* out_backprop, + const T* filter, T* in_backprop, TensorFormat data_format) { + const GPUDevice& d = ctx->eigen_device(); + if (args.filter_rows == 3 && args.filter_cols == 3) { + LaunchDepthwiseConv2dBackpropInputGPU( + d, args, out_backprop, filter, in_backprop, data_format); + } else { + LaunchDepthwiseConv2dBackpropInputGPU( + d, args, out_backprop, filter, in_backprop, data_format); } -}; + auto stream = ctx->op_device_context()->stream(); + OP_REQUIRES(ctx, stream->ok(), + errors::Internal("Launch of gpu kernel for " + "DepthwiseConv2dBackpropInp" + "utGPULaunch failed")); +} -template struct DepthwiseConv2dBackpropInputGPULaunch; -template struct DepthwiseConv2dBackpropInputGPULaunch; +template struct LaunchDepthwiseConvBackpropInputOp; +template struct LaunchDepthwiseConvBackpropInputOp; // A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. 
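Illustrative sketch, not part of the patch: the depthwise convolution changes above replace per-device launcher structs with static launch() methods by a single templated functor; its operator() is defined and explicitly instantiated in depthwise_conv_op_gpu.cu.cc, while depthwise_conv_op.cc and depthwise_conv_grad_op.cc only see extern template declarations. A self-contained sketch of that pattern (hypothetical names, everything collapsed into one file so it compiles alone):

#include <iostream>

// The header would declare only the primary template.
template <typename T>
struct Launch {
  void operator()(const T* in, T* out, int n);
};

// The .cu.cc file would own the definition ...
template <typename T>
void Launch<T>::operator()(const T* in, T* out, int n) {
  for (int i = 0; i < n; ++i) out[i] = in[i] * in[i];
}

// ... while other translation units only see the declaration, which avoids
// duplicate instantiations of the same launcher.
extern template struct Launch<float>;  // what the op .cc files would contain
template struct Launch<float>;         // what the .cu.cc file would contain

int main() {
  float in[3] = {1.f, 2.f, 3.f};
  float out[3];
  Launch<float>()(in, out, 3);  // invoked as a functor, like the ops above
  std::cout << out[2] << "\n";  // 9
  return 0;
}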
template -struct DepthwiseConv2dBackpropFilterGPULaunch { - static void Run(const GpuDevice& d, const DepthwiseArgs args, - const T* out_backprop, const T* input, T* filter_backprop, - TensorFormat data_format) { - if (args.filter_rows == 3 && args.filter_cols == 3) { - LaunchDepthwiseConv2dBackpropFilterGPU( - d, args, out_backprop, input, filter_backprop, data_format); - } else { - LaunchDepthwiseConv2dBackpropFilterGPU( - d, args, out_backprop, input, filter_backprop, data_format); - } - } -}; +void LaunchDepthwiseConvBackpropFilterOp::operator()( + OpKernelContext* ctx, const DepthwiseArgs& args, const T* out_backprop, + const T* input, T* filter_backprop, TensorFormat data_format) { + const GPUDevice& d = ctx->eigen_device(); + auto stream = ctx->op_device_context()->stream(); -template struct DepthwiseConv2dBackpropFilterGPULaunch; -template struct DepthwiseConv2dBackpropFilterGPULaunch; + // Initialize the results to 0. + int num_filter_backprop = + args.filter_rows * args.filter_cols * args.out_depth; + perftools::gputools::DeviceMemoryBase filter_bp_ptr(filter_backprop, + num_filter_backprop); + stream->ThenMemset32(&filter_bp_ptr, 0, num_filter_backprop * sizeof(T)); + + if (args.filter_rows == 3 && args.filter_cols == 3) { + LaunchDepthwiseConv2dBackpropFilterGPU( + d, args, out_backprop, input, filter_backprop, data_format); + } else { + LaunchDepthwiseConv2dBackpropFilterGPU( + d, args, out_backprop, input, filter_backprop, data_format); + } + OP_REQUIRES(ctx, stream->ok(), + errors::Internal("Launch of gpu kernel for " + "DepthwiseConv2dBackpropFil" + "terGPULaunch failed")); +} + +template struct LaunchDepthwiseConvBackpropFilterOp; +template struct LaunchDepthwiseConvBackpropFilterOp; } // namespace tensorflow #endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/group_by_window_dataset_op.cc b/tensorflow/core/kernels/group_by_window_dataset_op.cc index a53e9456ad2..a4f9608b1fa 100644 --- a/tensorflow/core/kernels/group_by_window_dataset_op.cc +++ b/tensorflow/core/kernels/group_by_window_dataset_op.cc @@ -36,20 +36,14 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { graph_def_version_(ctx->graph_def_version()) { OP_REQUIRES_OK(ctx, ctx->GetAttr("key_func", &key_func_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("reduce_func", &reduce_func_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("window_size_func", &window_size_func_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_)); } void MakeDataset(OpKernelContext* ctx, DatasetBase* input, DatasetBase** output) override { - int64 window_size = 0; - OP_REQUIRES_OK( - ctx, ParseScalarArgument(ctx, "window_size", &window_size)); - OP_REQUIRES( - ctx, window_size > 0, - errors::InvalidArgument("Window size must be greater than zero.")); - - // Get captured inputs for the key and reduce functions. + // Get captured inputs for the key, reduce, and window_size functions. 
OpInputList key_func_other_argument_inputs; OP_REQUIRES_OK(ctx, ctx->input_list("key_func_other_arguments", &key_func_other_argument_inputs)); @@ -67,6 +61,16 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { for (const Tensor& t : reduce_func_other_argument_inputs) { reduce_func_other_arguments.push_back(t); } + OpInputList window_size_func_other_argument_inputs; + OP_REQUIRES_OK(ctx, + ctx->input_list("window_size_func_other_arguments", + &window_size_func_other_argument_inputs)); + std::vector window_size_func_other_arguments; + window_size_func_other_arguments.reserve( + window_size_func_other_argument_inputs.size()); + for (const Tensor& t : window_size_func_other_argument_inputs) { + window_size_func_other_arguments.push_back(t); + } // TODO(mrry): Refactor CapturedFunction to share the runtime // state between multiple functions? std::unique_ptr captured_key_func; @@ -79,24 +83,30 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { ctx, CapturedFunction::Create(ctx, reduce_func_, graph_def_version_, std::move(reduce_func_other_arguments), &captured_reduce_func)); + std::unique_ptr captured_window_size_func; + OP_REQUIRES_OK(ctx, CapturedFunction::Create( + ctx, window_size_func_, graph_def_version_, + std::move(window_size_func_other_arguments), + &captured_window_size_func)); - *output = new Dataset(input, window_size, std::move(captured_key_func), - std::move(captured_reduce_func), output_types_, - output_shapes_); + *output = new Dataset( + input, std::move(captured_key_func), std::move(captured_reduce_func), + std::move(captured_window_size_func), output_types_, output_shapes_); } private: class Dataset : public DatasetBase { public: - Dataset(const DatasetBase* input, int64 window_size, + Dataset(const DatasetBase* input, std::unique_ptr captured_key_func, std::unique_ptr captured_reduce_func, + std::unique_ptr captured_window_size_func, const DataTypeVector& output_types, const std::vector& output_shapes) : input_(input), - window_size_(window_size), captured_key_func_(std::move(captured_key_func)), captured_reduce_func_(std::move(captured_reduce_func)), + captured_window_size_func_(std::move(captured_window_size_func)), output_types_(output_types), output_shapes_(output_shapes) { input_->Ref(); @@ -182,10 +192,44 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { } const int64 key = key_func_output[0].scalar()(); + if (window_sizes_.find(key) == window_sizes_.end()) { + // Run window_size function + FunctionLibraryRuntime::Options opts2; + opts2.step_id = CapturedFunction::generate_step_id(); + opts2.runner = ctx->runner(); + ScopedStepContainer step_container2( + opts2.step_id, [this, ctx](const string& name) { + dataset() + ->captured_window_size_func_->resource_manager() + ->Cleanup(name) + .IgnoreError(); + }); + opts2.step_container = &step_container2; + + // Run the window size function on the key to identify its + // window size. + std::vector window_size_func_output; + TF_RETURN_IF_ERROR(dataset()->captured_window_size_func_->Run( + opts2, key_func_output, &window_size_func_output)); + + if (window_size_func_output.size() != 1 || + window_size_func_output[0].dtype() != DT_INT64 || + window_size_func_output[0].NumElements() != 1) { + // TODO(mrry): Support non-int64 window sizes. 
+ return errors::InvalidArgument( + "`window_size_func` must return a scalar int64."); + } + const int64 window_size = + window_size_func_output[0].scalar()(); + window_sizes_[key] = window_size; + } + + const int64 window_size = window_sizes_[key]; + std::vector>& group = groups_[key]; group.push_back(std::move(next_input_element)); - if (group.size() == dataset()->window_size_) { + if (group.size() == window_size) { TF_RETURN_IF_ERROR(StartFlushingGroup(ctx, key)); break; } @@ -297,6 +341,7 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { bool end_of_input_ GUARDED_BY(mu_) = false; std::map>> groups_ GUARDED_BY(mu_); std::unique_ptr current_group_iterator_ GUARDED_BY(mu_); + std::map window_sizes_ GUARDED_BY(mu_); }; // A resource name for the temporary window dataset that is @@ -304,9 +349,9 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { static constexpr const char* kWindowResourceName = "__window_dataset"; const DatasetBase* const input_; - const int64 window_size_; const std::unique_ptr captured_key_func_; const std::unique_ptr captured_reduce_func_; + const std::unique_ptr captured_window_size_func_; const DataTypeVector output_types_; const std::vector output_shapes_; }; @@ -316,6 +361,7 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { std::vector output_shapes_; const NameAttrList* key_func_; const NameAttrList* reduce_func_; + const NameAttrList* window_size_func_; }; REGISTER_KERNEL_BUILDER(Name("GroupByWindowDataset").Device(DEVICE_CPU), diff --git a/tensorflow/core/kernels/l2loss_op.cc b/tensorflow/core/kernels/l2loss_op.cc index 9875cd027d5..f8ed9351579 100644 --- a/tensorflow/core/kernels/l2loss_op.cc +++ b/tensorflow/core/kernels/l2loss_op.cc @@ -27,10 +27,9 @@ limitations under the License. namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -typedef Eigen::GpuDevice GPUDevice; -template -class L2LossOp : public OpKernel { +template +class L2LossOp : public OpKernel { public: explicit L2LossOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -42,8 +41,9 @@ class L2LossOp : public OpKernel { Tensor* output = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({}), &output)); - functor::L2Loss()(context->eigen_device(), - input.flat(), output->scalar()); + const CPUDevice& d = context->eigen_device(); + output->scalar().device(d) = + (input.flat().square() * static_cast(0.5)).sum(); } }; @@ -57,33 +57,4 @@ REGISTER_KERNEL(double); REGISTER_KERNEL(Eigen::half); #undef REGISTER_KERNEL -#if GOOGLE_CUDA -// Forward declarations of the functor specializations for GPU. -namespace functor { -#define DECLARE_GPU_SPEC(T) \ - template <> \ - void L2Loss::operator()(const GPUDevice& d, \ - typename TTypes::ConstTensor input, \ - typename TTypes::Scalar output); \ - extern template struct L2Loss; - -DECLARE_GPU_SPEC(float); -DECLARE_GPU_SPEC(double); -DECLARE_GPU_SPEC(Eigen::half); -#undef DECLARE_GPU_SPEC -} // namespace functor - -// Registration of the GPU implementations. 
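Illustrative sketch, not part of the patch: in the GroupByWindowDataset change above, the iterator evaluates window_size_func at most once per key and caches the result in window_sizes_. A standalone sketch of that per-key memoization (names made up for illustration):

#include <cstdint>
#include <functional>
#include <iostream>
#include <map>

// Returns the cached window size for `key`, computing it on first use.
int64_t WindowSizeForKey(std::map<int64_t, int64_t>* cache, int64_t key,
                         const std::function<int64_t(int64_t)>& window_size_fn) {
  auto it = cache->find(key);
  if (it == cache->end()) {
    it = cache->emplace(key, window_size_fn(key)).first;  // run the func once
  }
  return it->second;
}

int main() {
  std::map<int64_t, int64_t> cache;
  auto fn = [](int64_t key) { return key % 2 == 0 ? int64_t{10} : int64_t{5}; };
  std::cout << WindowSizeForKey(&cache, 4, fn) << "\n";  // 10 (computed)
  std::cout << WindowSizeForKey(&cache, 4, fn) << "\n";  // 10 (cached)
  return 0;
}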
-#define REGISTER_GPU_KERNEL(T) \ - REGISTER_KERNEL_BUILDER( \ - Name("L2Loss").Device(DEVICE_GPU).TypeConstraint("T"), \ - L2LossOp); - -REGISTER_GPU_KERNEL(float); -REGISTER_GPU_KERNEL(double); -REGISTER_GPU_KERNEL(Eigen::half); -#undef REGISTER_GPU_KERNEL - -#endif // GOOGLE_CUDA - } // namespace tensorflow diff --git a/tensorflow/core/kernels/l2loss_op.h b/tensorflow/core/kernels/l2loss_op.h index f7204cefdd4..4953aa237cd 100644 --- a/tensorflow/core/kernels/l2loss_op.h +++ b/tensorflow/core/kernels/l2loss_op.h @@ -15,25 +15,19 @@ limitations under the License. #ifndef TENSORFLOW_KERNELS_L2LOSS_OP_H_ #define TENSORFLOW_KERNELS_L2LOSS_OP_H_ -// Functor definition for L2LossOp, must be compilable by nvcc. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_types.h" namespace tensorflow { -namespace functor { -// Functor used by L2LossOp to do the computations. template -struct L2Loss { - void operator()(const Device& d, typename TTypes::ConstTensor input, - typename TTypes::Scalar output) { - // We flatten the input tensor and reduce on dimension 0, producing - // a single number which is Mul(Sum(x^2), 0.5). - output.device(d) = (input.square() * static_cast(0.5)).sum(); - } +struct L2LossOp : public OpKernel { + explicit L2LossOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) {} }; -} // namespace functor } // namespace tensorflow #endif // TENSORFLOW_KERNELS_L2LOSS_OP_H_ diff --git a/tensorflow/core/kernels/l2loss_op_gpu.cu.cc b/tensorflow/core/kernels/l2loss_op_gpu.cu.cc index 420df370865..73b6472254c 100644 --- a/tensorflow/core/kernels/l2loss_op_gpu.cu.cc +++ b/tensorflow/core/kernels/l2loss_op_gpu.cu.cc @@ -21,12 +21,55 @@ limitations under the License. #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/reduction_ops_common.h" +#include "tensorflow/core/kernels/reduction_ops_gpu_kernels.h" + namespace tensorflow { typedef Eigen::GpuDevice GPUDevice; -template struct functor::L2Loss; -template struct functor::L2Loss; -template struct functor::L2Loss; + +// TODO(eriche): can add specialization for half2 +template +struct squareHalf { + __host__ __device__ T operator()(const T& x) const { + return static_cast(0.5) * x * x; + } +}; + +template +class L2LossOp : public OpKernel { + public: + explicit L2LossOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // The input tensor can be of any number of dimensions, even though it's + // 2D in most typical applications. + const Tensor& input = context->input(0); + // The output is a single number. + Tensor* output = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, TensorShape({}), &output)); + typedef cub::TransformInputIterator, T*> inputIterType; + inputIterType input_itr((T*)input.flat().data(), squareHalf()); + typedef const Eigen::array::Tensor::Index, 1>& ReductionAxes; + + Constants constants; + functor::ReduceImpl( + context, (T*)output->flat().data(), input_itr, 1, + input.flat().size(), 1, 1, 0, constants.kZero, cub::Sum(), T(0)); + } +}; + +// Registration of the GPU implementations. 
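Illustrative sketch, not part of the patch: the L2Loss rewrite above has the CPU kernel evaluate 0.5 * sum(x^2) directly as an Eigen tensor expression, while the GPU kernel feeds a squaring transform iterator into a cub sum reduction. A standalone Eigen analogue of the CPU expression:

#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 1> input(4);
  input.setValues({1.f, 2.f, 3.f, 4.f});

  // Same shape of expression as the kernel: square, scale by 0.5, reduce.
  Eigen::Tensor<float, 0> loss = (input.square() * 0.5f).sum();
  std::cout << loss() << "\n";  // 0.5 * (1 + 4 + 9 + 16) = 15
  return 0;
}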
+#define REGISTER_GPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("L2Loss").Device(DEVICE_GPU).TypeConstraint("T"), \ + L2LossOp); + +REGISTER_GPU_KERNEL(float); +REGISTER_GPU_KERNEL(double); +REGISTER_GPU_KERNEL(Eigen::half); +#undef REGISTER_GPU_KERNEL } // namespace tensorflow diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc index ef7338e0e0d..50700c8bc8a 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc @@ -97,8 +97,12 @@ class MklConv2DCustomBackpropInputOp : public OpKernel { errors::InvalidArgument( "Conv2DCustomBackpropInput: size must be 4-dim")); - MklSizesToTFSizes(context, data_format, mkl_context.filter_shape, - &filter_shape); + const int64* filter_sizes = + (const int64*) mkl_context.filter_shape.GetSizes(); + const int64 filter_dims = mkl_context.filter_shape.GetDimension(); + + OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(filter_sizes, + filter_dims, &filter_shape)); } else { filter_shape = filter.shape(); } diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index 203e6946314..b50a6343ba9 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -265,6 +265,28 @@ class MklConv2DOp : public OpKernel { sizeof(T)); AllocateOutputSetMklShape(context, 0, &output, mkl_output_tf_shape, mkl_output_mkl_shape); + // Filter output to be used in the backprop_input + TensorShape mkl_filter_output_tf_shape; + MklShape mkl_filter_output_mkl_shape; + mkl_filter_output_mkl_shape.SetMklTensor(true); + mkl_filter_output_mkl_shape.SetMklLayout(mkl_context.prim_fwd, + dnnResourceFilter); + + size_t filter_sizes[4] = {filter.dim_size(0), filter.dim_size(1), + filter.dim_size(2), filter.dim_size(3)}; + mkl_filter_output_mkl_shape.SetTfLayout(filter.dims(), filter_sizes, + mkl_context.filter_strides); + + mkl_filter_output_mkl_shape.SetTfDimOrder(mkl_context.filter_dims, + data_format_); + mkl_filter_output_tf_shape.AddDim( + dnnLayoutGetMemorySize_F32( + static_cast( + mkl_filter_output_mkl_shape.GetMklLayout())) / + sizeof(T)); + AllocateOutputSetMklShape(context, 1, &mkl_context.output_filter, + mkl_filter_output_tf_shape, mkl_filter_output_mkl_shape); + mkl_context.conv_res[dnnResourceDst] = static_cast(output->flat().data()); @@ -303,6 +325,7 @@ class MklConv2DOp : public OpKernel { dnnPrimitive_t prim_fwd; void* conv_res[dnnResourceNumber]; dnnLayout_t lt_filter, lt_bias, lt_input; + Tensor* output_filter = nullptr; // Create MKL dnnLayout_t objects for tensors coming into the layer void MklCreateInputLayouts(OpKernelContext* context) { @@ -383,8 +406,8 @@ class MklConv2DOp : public OpKernel { CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_filter, lt_filter, mkl_lt_internal_filter), E_SUCCESS); - AllocTmpBuffer(context, mkl_tmp_filter_buf_tensor, - mkl_lt_internal_filter, &mkl_buf_convert_filter); + mkl_buf_convert_filter = const_cast(static_cast( + output_filter->flat().data())); CHECK_EQ( dnnConversionExecute_F32(mkl_prim_convert_filter, mkl_buf_filter, mkl_buf_convert_filter), diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc index b3763f17bc1..03c3fb09a1d 100644 --- a/tensorflow/core/kernels/mkl_reshape_op.cc +++ b/tensorflow/core/kernels/mkl_reshape_op.cc @@ -43,30 +43,26 @@ class MklReshapeOp : public OpKernel { OP_REQUIRES(context, IsLegacyVector(sizes.shape()), errors::InvalidArgument("sizes input 
must be 1-D, not shape ", sizes.shape().DebugString())); - const int64 num_dims = sizes.NumElements(); // Compute the output shape. Determine product of specified // dimensions, and find the index of the unspecified one. TensorShape shape; int64 product = 1; int unknown_index = -1; - auto vec_size = sizes.flat(); - for (int d = 0; d < num_dims; ++d) { - const int32 size = vec_size(d); - if (size == -1) { - OP_REQUIRES( - context, unknown_index == -1, - errors::InvalidArgument("only one input size may be -1, not both ", - unknown_index, " and ", d)); - unknown_index = d; - shape.AddDim(1); - } else { - OP_REQUIRES(context, size >= 0, - errors::InvalidArgument( - "size ", d, " must be non-negative, not ", size)); - shape.AddDim(size); - product *= size; - } + switch (sizes.dtype()) { + case DT_INT32: + OP_REQUIRES_OK(context, ValidateSizes(sizes, &product, + &unknown_index, &shape)); + break; + case DT_INT64: + OP_REQUIRES_OK(context, ValidateSizes(sizes, &product, + &unknown_index, &shape)); + break; + default: + context->CtxFailure(errors::InvalidArgument( + "desired shape must be a DT_INT32 or DT_INT64 vector, not a ", + DataTypeString(sizes.dtype()))); + return; } if (unknown_index != -1) { OP_REQUIRES( @@ -132,6 +128,34 @@ class MklReshapeOp : public OpKernel { CopyTfTensorInToOutWithShape(context, 0, 0, shape); } } + private: + template + Status ValidateSizes(const Tensor& sizes, int64* product, int* unknown_index, + TensorShape* shape) { + *product = 1; + *unknown_index = -1; + const int64 num_dims = sizes.NumElements(); + auto Svec = sizes.flat(); + for (int d = 0; d < num_dims; ++d) { + const Tshape size = Svec(d); + if (size == -1) { + if (*unknown_index != -1) { + return errors::InvalidArgument( + "Only one input size may be -1, not both ", *unknown_index, + " and ", d); + } + *unknown_index = d; + shape->AddDim(1); + } else if (size < 0) { + return errors::InvalidArgument("Size ", d, + " must be non-negative, not ", size); + } else { + shape->AddDim(size); + (*product) *= size; + } + } + return Status::OK(); + } }; #define REGISTER_MKL_CPU(T) \ @@ -141,6 +165,13 @@ class MklReshapeOp : public OpKernel { .TypeConstraint("T") \ .TypeConstraint("Tshape") \ .Label(mkl_op_registry::kMklOpLabel), \ + MklReshapeOp); \ + REGISTER_KERNEL_BUILDER(Name("_MklReshape") \ + .Device(DEVICE_CPU) \ + .HostMemory("shape") \ + .TypeConstraint("T") \ + .TypeConstraint("Tshape") \ + .Label(mkl_op_registry::kMklOpLabel), \ MklReshapeOp); TF_CALL_float(REGISTER_MKL_CPU); #undef REGISTER_MKL_CPU diff --git a/tensorflow/core/kernels/ops_util.cc b/tensorflow/core/kernels/ops_util.cc index 130939263be..efacd05dd39 100644 --- a/tensorflow/core/kernels/ops_util.cc +++ b/tensorflow/core/kernels/ops_util.cc @@ -37,11 +37,6 @@ Eigen::PaddingType BrainPadding2EigenPadding(Padding padding) { Status GetBroadcastSize(const int index, const int in_size, const int ksize, const int stride, const int pad_size, int* bindex, int* bsize) { - // Cannot have strides larger than the patch size. - if (stride > ksize) { - return errors::InvalidArgument( - "stride must be less than or equal to kernel size"); - } // Cannot have index beyond the input size. 
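Illustrative sketch, not part of the patch: the MklReshapeOp change above dispatches ValidateSizes<int32> or ValidateSizes<int64> based on the dtype of the shape input, allowing at most one -1 wildcard and accumulating the product of the known dimensions. A standalone sketch of that validation (names made up for illustration):

#include <cstdint>
#include <iostream>
#include <vector>

// Returns false on invalid sizes; otherwise fills the product of the known
// dimensions and the index of the single -1 wildcard (or -1 if none).
template <typename Tshape>
bool ValidateSizes(const std::vector<Tshape>& sizes, int64_t* product,
                   int* unknown_index) {
  *product = 1;
  *unknown_index = -1;
  for (size_t d = 0; d < sizes.size(); ++d) {
    const Tshape size = sizes[d];
    if (size == -1) {
      if (*unknown_index != -1) return false;  // only one -1 may appear
      *unknown_index = static_cast<int>(d);
    } else if (size < 0) {
      return false;  // any other negative size is invalid
    } else {
      *product *= size;
    }
  }
  return true;
}

int main() {
  int64_t product = 0;
  int unknown = 0;
  std::vector<int64_t> sizes = {2, -1, 3};
  std::cout << ValidateSizes(sizes, &product, &unknown) << " " << product
            << " " << unknown << "\n";  // 1 6 1
  return 0;
}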
if (index * stride > in_size) { return errors::InvalidArgument( diff --git a/tensorflow/core/kernels/ops_util_test.cc b/tensorflow/core/kernels/ops_util_test.cc index 42ffef6735b..9d53882deef 100644 --- a/tensorflow/core/kernels/ops_util_test.cc +++ b/tensorflow/core/kernels/ops_util_test.cc @@ -173,12 +173,6 @@ TEST_F(OpsUtilTest, Get2dOutputSizeVerbose) { VerifyGet2dOutputVerboseSizeValues(pad_struct2, error::OK); } -// Test stride > ksize fails with INVALID_ARGUMENT. -TEST_F(OpsUtilTest, GetBroadcastTest3_1_2_0) { - bcast_struct bcast = {{0, 3, 1, 2, 0}, {0, 3}}; - VerifyBoundaries(bcast, error::INVALID_ARGUMENT); -} - // Test index * stride > in_size fails with INVALID_ARGUMENT. TEST_F(OpsUtilTest, GetBroadcastTestBadIndex) { bcast_struct bcast = {{2, 3, 1, 2, 0}, {0, 3}}; @@ -281,6 +275,38 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_2) { } } +// in_size = 3, ksize = 1, stride = 2, pad_size = 0 +TEST_F(OpsUtilTest, GetBroadcastTest3_1_2_0) { + bcast_struct bcast[] = { + {{0, 3, 1, 2, 0}, {0, 1}}, + {{1, 3, 1, 2, 0}, {2, 1}}, + }; + for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { + VerifyBcastValues(bcast[i]); + } +} + +// in_size = 3, ksize = 2, stride = 3, pad_size = 0 +TEST_F(OpsUtilTest, GetBroadcastTest3_2_3_0) { + bcast_struct bcast[] = { + {{0, 3, 2, 3, 0}, {0, 2}}, + }; + for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { + VerifyBcastValues(bcast[i]); + } +} + +// in_size = 3, ksize = 2, stride = 3, pad_size = 1 +TEST_F(OpsUtilTest, GetBroadcastTest3_2_3_1) { + bcast_struct bcast[] = { + {{0, 3, 2, 3, 1}, {0, 1}}, + {{1, 3, 2, 3, 1}, {2, 1}}, + }; + for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { + VerifyBcastValues(bcast[i]); + } +} + TEST_F(OpsUtilTest, SanitizeThreadSuffix) { EXPECT_EQ("_aBc123_-___", SanitizeThreadSuffix("/aBc123_- /")); } diff --git a/tensorflow/core/kernels/pad_op.cc b/tensorflow/core/kernels/pad_op.cc index 6e8b09d0500..6196c5ed93e 100644 --- a/tensorflow/core/kernels/pad_op.cc +++ b/tensorflow/core/kernels/pad_op.cc @@ -146,9 +146,9 @@ class PadOp : public OpKernel { Tensor* output) { CHECK_EQ(Dims, paddings.dimension(0)); CHECK_EQ(2, paddings.dimension(1)); - Eigen::array, Dims> paddings_array; + Eigen::array, Dims> paddings_array; for (int i = 0; i < Dims; ++i) { - paddings_array[i] = std::make_pair(paddings(i, 0), paddings(i, 1)); + paddings_array[i] = {paddings(i, 0), paddings(i, 1)}; } functor::Pad functor; functor(context->eigen_device(), output->tensor(), input, @@ -180,7 +180,7 @@ namespace functor { void Pad::operator()( \ const GPUDevice& d, typename TTypes::Tensor output, \ typename TTypes::ConstTensor input, \ - Eigen::array, Dims> paddings, T pad_value); \ + Eigen::array, Dims> paddings, T pad_value); \ extern template struct Pad; #define DECLARE_GPU_SPECS(T) \ diff --git a/tensorflow/core/kernels/pad_op.h b/tensorflow/core/kernels/pad_op.h index 6a973833e2d..95a7c9a3ae5 100644 --- a/tensorflow/core/kernels/pad_op.h +++ b/tensorflow/core/kernels/pad_op.h @@ -31,7 +31,7 @@ struct Pad { // See pad_op.cc for details. void operator()(const Device& d, typename TTypes::Tensor output, typename TTypes::ConstTensor input, - Eigen::array, Dims> paddings, + Eigen::array, Dims> paddings, T pad_value) { if (Eigen::internal::is_same::value && (output.size() <= std::numeric_limits::max())) { @@ -47,7 +47,7 @@ struct Pad { // In the scalar case we simply copy the input. 
void operator()(const Device& d, typename TTypes::Tensor output, typename TTypes::ConstTensor input, - Eigen::array, 0>, T) { + Eigen::array, 0>, T) { output.device(d) = input; } }; diff --git a/tensorflow/core/kernels/parse_tensor_op.cc b/tensorflow/core/kernels/parse_tensor_op.cc index 79199ff5c3f..dd645262d2e 100644 --- a/tensorflow/core/kernels/parse_tensor_op.cc +++ b/tensorflow/core/kernels/parse_tensor_op.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/framework/register_types.h" namespace tensorflow { @@ -65,4 +66,32 @@ class ParseTensorOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("ParseTensor").Device(DEVICE_CPU), ParseTensorOp); + +template +class SerializeTensorOp : public OpKernel { + public: + using OpKernel::OpKernel; + + void Compute(OpKernelContext* context) override { + const Tensor& tensor = context->input(0); + TensorProto proto; + if (tensor.dtype() == DT_STRING) { + tensor.AsProtoField(&proto); + } else { + tensor.AsProtoTensorContent(&proto); + } + Tensor* proto_string = nullptr; + OP_REQUIRES_OK( + context, context->allocate_output(0, TensorShape({}), &proto_string)); + CHECK(proto.SerializeToString(&proto_string->scalar()())); + } +}; + +#define REGISTER(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("SerializeTensor").Device(DEVICE_CPU).TypeConstraint("T"), \ + SerializeTensorOp); +TF_CALL_ALL_TYPES(REGISTER) +#undef REGISTER + } // namespace tensorflow diff --git a/tensorflow/core/kernels/parse_tensor_test.cc b/tensorflow/core/kernels/parse_tensor_test.cc new file mode 100644 index 00000000000..f6f60fee71c --- /dev/null +++ b/tensorflow/core/kernels/parse_tensor_test.cc @@ -0,0 +1,213 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/ops_testutil.h" + +namespace tensorflow { +namespace { + +class SerializeTensorOpTest : public OpsTestBase { + protected: + template + void MakeOp(const TensorShape& input_shape, + std::function functor) { + TF_ASSERT_OK( + NodeDefBuilder("myop", "SerializeTensor") + .Input(FakeInput(DataTypeToEnum::value)) + .Finalize(node_def())); + TF_ASSERT_OK(InitOp()); + AddInput(input_shape, functor); + } + void ParseSerializedWithNodeDef(const NodeDef& parse_node_def, + Tensor* serialized, + Tensor* parse_output) { + std::unique_ptr device( + DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); + gtl::InlinedVector inputs; + inputs.push_back({nullptr, serialized}); + Status status; + std::unique_ptr op( + CreateOpKernel(DEVICE_CPU, device.get(), + cpu_allocator(), parse_node_def, + TF_GRAPH_DEF_VERSION, &status)); + TF_EXPECT_OK(status); + OpKernelContext::Params params; + params.device = device.get(); + params.inputs = &inputs; + params.frame_iter = FrameAndIter(0, 0); + params.op_kernel = op.get(); + std::vector attrs; + test::SetOutputAttrs(¶ms, &attrs); + OpKernelContext ctx(¶ms); + op->Compute(&ctx); + TF_EXPECT_OK(status); + *parse_output = *ctx.mutable_output(0); + } + template + void ParseSerializedOutput(Tensor* serialized, Tensor* parse_output) { + NodeDef parse; + TF_ASSERT_OK(NodeDefBuilder("parse", "ParseTensor") + .Input(FakeInput(DT_STRING)) + .Attr("out_type", DataTypeToEnum::value) + .Finalize(&parse)); + ParseSerializedWithNodeDef(parse, serialized, parse_output); + } +}; + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_half) { + MakeOp(TensorShape({10}), [](int x) -> Eigen::half { + return static_cast(x / 10.); + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_float) { + MakeOp(TensorShape({1, 10}), [](int x) -> float { + return static_cast(x / 10.); + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_double) { + MakeOp(TensorShape({5, 5}), [](int x) -> double { + return static_cast(x / 10.); + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_int64) { + MakeOp(TensorShape({2, 3, 4}), [](int x) -> int64 { + return static_cast(x - 10); + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_int32) { + MakeOp(TensorShape({4, 2}), [](int x) -> int32 { + return static_cast(x + 7); + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor 
parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_int16) { + MakeOp(TensorShape({8}), [](int x) -> int16 { + return static_cast(x + 18); + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_int8) { + MakeOp(TensorShape({2}), [](int x) -> int8 { + return static_cast(x + 8); + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_uint16) { + MakeOp(TensorShape({1, 3}), [](int x) -> uint16 { + return static_cast(x + 2); + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_uint8) { + MakeOp(TensorShape({2, 1, 1}), [](int x) -> uint8 { + return static_cast(x + 1); + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_complex64) { + MakeOp(TensorShape({}), [](int x) -> complex64 { + return complex64{ static_cast(x / 8.), + static_cast(x / 2.) }; + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_complex128) { + MakeOp(TensorShape({3}), [](int x) -> complex128 { + return complex128{ x / 3., x / 2. }; + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_bool) { + MakeOp(TensorShape({1}), [](int x) -> bool { + return static_cast(x % 2); + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_string) { + MakeOp(TensorShape({10}), [](int x) -> std::string { + return std::to_string(x / 10.); + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops.h b/tensorflow/core/kernels/reduction_ops.h index 5db9e6032e0..e43d2828f30 100644 --- a/tensorflow/core/kernels/reduction_ops.h +++ b/tensorflow/core/kernels/reduction_ops.h @@ -20,6 +20,7 @@ limitations under the License. 
#include #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_types.h" namespace tensorflow { @@ -67,7 +68,7 @@ void FillIdentityEigenImpl(const Device& d, OUT_T out, const Reducer& reducer) { template struct ReduceFunctor { template - static void Reduce(const Device& d, OUT_T out, IN_T in, + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, const ReductionAxes& reduction_axes, const Reducer& reducer); diff --git a/tensorflow/core/kernels/reduction_ops_common.h b/tensorflow/core/kernels/reduction_ops_common.h index 553f8895232..71af9d88dc1 100644 --- a/tensorflow/core/kernels/reduction_ops_common.h +++ b/tensorflow/core/kernels/reduction_ops_common.h @@ -190,24 +190,24 @@ class ReductionOp : public OpKernel { Functor::FillIdentity(d, tmp_out.flat(), reducer); } else if ((helper.ndims() == 1) && helper.reduce_first_axis()) { // Reduce to a scalar. - Functor::Reduce(d, helper.out(&tmp_out), helper.in(data), + Functor::Reduce(ctx, helper.out(&tmp_out), helper.in(data), constants.kZero, reducer); } else if ((helper.ndims() == 2) && helper.reduce_first_axis()) { // Can be viewed as a reduction of a matrix along 1st dimension. - Functor::Reduce(d, helper.out(&tmp_out), helper.in(data), + Functor::Reduce(ctx, helper.out(&tmp_out), helper.in(data), constants.kZero, reducer); } else if ((helper.ndims() == 2) && !helper.reduce_first_axis()) { // Can be viewed as a reduction of a matrix along 2nd dimension. - Functor::Reduce(d, helper.out(&tmp_out), helper.in(data), + Functor::Reduce(ctx, helper.out(&tmp_out), helper.in(data), constants.kOne, reducer); } else if ((helper.ndims() == 3) && helper.reduce_first_axis()) { // Can be viewed as a reduction of a 3D tensor along 1st and 3rd // dimensions. - Functor::Reduce(d, helper.out(&tmp_out), helper.in(data), + Functor::Reduce(ctx, helper.out(&tmp_out), helper.in(data), constants.kZeroTwo, reducer); } else if ((helper.ndims() == 3) && !helper.reduce_first_axis()) { // Can be viewed as a reduction of a 3D tensor along 2nd dimension. - Functor::Reduce(d, helper.out(&tmp_out), helper.in(data), + Functor::Reduce(ctx, helper.out(&tmp_out), helper.in(data), constants.kOne, reducer); } else { // If we don't hit one of the cases above, transpose the data so that @@ -223,7 +223,7 @@ class ReductionOp : public OpKernel { const int64 unreduced = tmp_out.NumElements(); const int64 reduced = shuffled.NumElements() / unreduced; const Tensor& const_shuffled = shuffled; - Functor::Reduce(d, tmp_out.flat(), + Functor::Reduce(ctx, tmp_out.flat(), const_shuffled.shaped({unreduced, reduced}), constants.kOne, reducer); } @@ -258,9 +258,10 @@ namespace functor { template struct ReduceFunctorBase { template - static void Reduce(const Device& d, OUT_T out, IN_T in, + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, const ReductionAxes& reduction_axes, const Reducer& reducer) { + const Device& d = ctx->eigen_device(); ReduceEigenImpl(d, out, in, reduction_axes, reducer); } diff --git a/tensorflow/core/kernels/reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu.cu.cc index ec4490db83f..8fd9165eb9f 100644 --- a/tensorflow/core/kernels/reduction_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/reduction_ops_gpu.cu.cc @@ -17,8 +17,7 @@ limitations under the License. 
#define EIGEN_USE_GPU -#include "tensorflow/core/framework/numeric_types.h" -#include "tensorflow/core/kernels/reduction_ops.h" +#include "tensorflow/core/kernels/reduction_ops_gpu_kernels.h" namespace tensorflow { namespace functor { @@ -33,15 +32,27 @@ typedef TTypes::Tensor::Index Index; template struct ReduceFunctor { template - static void Reduce(const GPUDevice& d, OUT_T out, IN_T in, + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, const ReductionAxes& reduction_axes, - const Reducer& reducer) { - ReduceEigenImpl(d, To32Bit(out), To32Bit(in), reduction_axes, reducer); + const Reducer& reducer); +}; + +template +struct ReduceFunctor> { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Eigen::internal::SumReducer& reducer) { + ReduceImpl( + ctx, (T*)out.data(), (T*)in.data(), in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), reduction_axes, + cub::Sum(), T(0)); } template static void FillIdentity(const GPUDevice& d, OUT_T out, - const Reducer& reducer) { + const Eigen::internal::SumReducer& reducer) { FillIdentityEigenImpl(d, To32Bit(out), reducer); } }; @@ -49,19 +60,30 @@ struct ReduceFunctor { template struct ReduceFunctor> { template - static void Reduce(const GPUDevice& d, OUT_T out, IN_T in, + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, const ReductionAxes& reduction_axes, const Eigen::internal::MeanReducer& reducer) { - typedef typename IN_T::Index Index; - // Eigen sum reductions are much faster on GPU than mean reductions: - // Simply trigger them by computing the sum of the weighted inputs. - Index num_coeffs_to_reduce = 1; - for (int i = 0; i < Eigen::internal::array_size::value; - ++i) { - num_coeffs_to_reduce *= in.dimension(reduction_axes[i]); - } - T scale = T(1.0 / num_coeffs_to_reduce); - out.device(d) = (in * scale).sum(reduction_axes); + int divisor = 1; + if (out.rank() == 0) + divisor = in.size(); + else if (out.rank() == 1 && in.rank() == 2 && reduction_axes[0] == 0) + divisor = in.dimension(0); + else if (out.rank() == 1 && in.rank() == 2 && reduction_axes[0] == 1) + divisor = in.dimension(1); + else if (out.rank() == 1 && in.rank() == 3 && reduction_axes[0] == 0 && + reduction_axes[1] == 2) + divisor = in.dimension(0) * in.dimension(2); + else if (out.rank() == 2 && in.rank() == 3 && reduction_axes[0] == 1) + divisor = in.dimension(1); + + DividesBy div_op(static_cast(divisor)); + TransformOutputIterator> itr((T*)out.data(), div_op); + ReduceImpl>, T*, + ReductionAxes>(ctx, itr, (T*)in.data(), in.rank(), + in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? 
in.dimension(2) : 1, out.rank(), + reduction_axes, cub::Sum(), T(0)); } template @@ -71,15 +93,159 @@ struct ReduceFunctor> { } }; +template <> +struct ReduceFunctor> { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Eigen::internal::MeanReducer& reducer) { + float divisor = 1.f; + if (out.rank() == 0) + divisor = in.size(); + else if (out.rank() == 1 && in.rank() == 2 && reduction_axes[0] == 0) + divisor = in.dimension(0); + else if (out.rank() == 1 && in.rank() == 2 && reduction_axes[0] == 1) + divisor = in.dimension(1); + else if (out.rank() == 1 && in.rank() == 3 && reduction_axes[0] == 0 && + reduction_axes[1] == 2) + divisor = in.dimension(0) * in.dimension(2); + else if (out.rank() == 2 && in.rank() == 3 && reduction_axes[0] == 1) + divisor = in.dimension(1); + DividesBy div_op(divisor); + + typedef cub::TransformInputIterator + inputIterType; + inputIterType input_itr((Eigen::half*)in.data(), HalfToFloat()); + + typedef TransformOutputIterator> + outputIterType; + outputIterType itr((Eigen::half*)out.data(), div_op); + + ReduceImpl( + ctx, itr, input_itr, in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), reduction_axes, + cub::Sum(), 0.f); + } + + template + static void FillIdentity( + const GPUDevice& d, OUT_T out, + const Eigen::internal::MeanReducer& reducer) { + FillIdentityEigenImpl(d, To32Bit(out), reducer); + } +}; + +template +struct ReduceFunctor> { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Eigen::internal::MaxReducer& reducer) { + ReduceImpl( + ctx, (T*)out.data(), (T*)in.data(), in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), reduction_axes, + cub::Max(), std::numeric_limits::lowest()); + } + + template + static void FillIdentity(const GPUDevice& d, OUT_T out, + const Eigen::internal::MaxReducer& reducer) { + FillIdentityEigenImpl(d, To32Bit(out), reducer); + } +}; + +template +struct ReduceFunctor> { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Eigen::internal::MinReducer& reducer) { + ReduceImpl( + ctx, (T*)out.data(), (T*)in.data(), in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), reduction_axes, + cub::Min(), std::numeric_limits::max()); + } + + template + static void FillIdentity(const GPUDevice& d, OUT_T out, + const Eigen::internal::MinReducer& reducer) { + FillIdentityEigenImpl(d, To32Bit(out), reducer); + } +}; + +template +struct ReduceFunctor> { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Eigen::internal::ProdReducer& reducer) { + ReduceImpl, T*, T*, ReductionAxes>( + ctx, (T*)out.data(), (T*)in.data(), in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? 
in.dimension(2) : 1, out.rank(), reduction_axes, + Prod(), T(1)); + } + + template + static void FillIdentity(const GPUDevice& d, OUT_T out, + const Eigen::internal::ProdReducer& reducer) { + FillIdentityEigenImpl(d, To32Bit(out), reducer); + } +}; + +template <> +struct ReduceFunctor { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Eigen::internal::AndReducer& reducer) { + ReduceImpl( + ctx, (bool*)out.data(), (bool*)in.data(), in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), reduction_axes, And(), + true); + } + + template + static void FillIdentity(const GPUDevice& d, OUT_T out, + const Eigen::internal::AndReducer& reducer) { + FillIdentityEigenImpl(d, To32Bit(out), reducer); + } +}; + +template <> +struct ReduceFunctor { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Eigen::internal::OrReducer& reducer) { + ReduceImpl( + ctx, (bool*)out.data(), (bool*)in.data(), in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), reduction_axes, Or(), + false); + } + + template + static void FillIdentity(const GPUDevice& d, OUT_T out, + const Eigen::internal::OrReducer& reducer) { + FillIdentityEigenImpl(d, To32Bit(out), reducer); + } +}; + // T: the data type // REDUCER: the reducer functor // NUM_AXES: the number of axes to reduce // IN_DIMS: the number of dimensions of the input tensor -#define DEFINE(T, REDUCER, IN_DIMS, NUM_AXES) \ - template void ReduceFunctor::Reduce( \ - const GPUDevice& d, TTypes::Tensor out, \ - TTypes::ConstTensor in, \ - const Eigen::array& reduction_axes, \ +#define DEFINE(T, REDUCER, IN_DIMS, NUM_AXES) \ + template void ReduceFunctor::Reduce( \ + OpKernelContext* ctx, TTypes::Tensor out, \ + TTypes::ConstTensor in, \ + const Eigen::array& reduction_axes, \ const REDUCER& reducer); #define DEFINE_IDENTITY(T, REDUCER) \ diff --git a/tensorflow/core/kernels/reduction_ops_gpu_kernels.h b/tensorflow/core/kernels/reduction_ops_gpu_kernels.h new file mode 100644 index 00000000000..ce471c672c7 --- /dev/null +++ b/tensorflow/core/kernels/reduction_ops_gpu_kernels.h @@ -0,0 +1,713 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "external/cub_archive/cub/device/device_reduce.cuh" +#include "external/cub_archive/cub/device/device_segmented_reduce.cuh" +#include "external/cub_archive/cub/iterator/counting_input_iterator.cuh" +#include "external/cub_archive/cub/iterator/transform_input_iterator.cuh" +#include "external/cub_archive/cub/warp/warp_reduce.cuh" +#include "cuda/include/cuComplex.h" +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/reduction_ops.h" +#include "tensorflow/core/lib/core/bits.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" +#include "tensorflow/core/util/permutation_input_iterator.h" +#include "tensorflow/core/util/transform_output_iterator.h" + +#include + +namespace tensorflow { +namespace functor { + +typedef Eigen::GpuDevice GPUDevice; + +template +struct Prod { + __host__ __device__ T operator()(const T& a, const T& b) const { + return a * b; + } +}; + +// needed to work around a compiler bug in nvcc - it doesn't seem to like +// the overloaded multiply op for std::complex +template <> +struct Prod> { + __host__ __device__ std::complex operator()( + const std::complex& a, const std::complex& b) const { + auto result = cuCmulf(make_cuComplex(a.real(), a.imag()), + make_cuComplex(b.real(), b.imag())); + return std::complex(result.x, result.y); + } +}; + +template <> +struct Prod> { + __host__ __device__ std::complex operator()( + const std::complex& a, const std::complex& b) const { + auto result = cuCmul(make_cuDoubleComplex(a.real(), a.imag()), + make_cuDoubleComplex(b.real(), b.imag())); + return std::complex(result.x, result.y); + } +}; + +template +struct DividesBy { + T divisor; + + __host__ __device__ explicit DividesBy(T divisor) : divisor(divisor) {} + + __host__ __device__ outT operator()(const T& x) const { return x / divisor; } +}; + +// needed to work around a compiler bug in nvcc - it doesn't seem to like +// the overloaded ops for std::complex +template <> +struct DividesBy> { + cuFloatComplex divisor; + + __host__ __device__ explicit DividesBy(std::complex divisor) + : divisor(make_cuComplex(divisor.real(), divisor.imag())) {} + + // implements + __host__ __device__ std::complex operator()( + const std::complex& x) const { + auto result = cuCdivf(make_cuComplex(x.real(), x.imag()), divisor); + return std::complex(result.x, result.y); + } +}; + +template <> +struct DividesBy> { + cuDoubleComplex divisor; + + __host__ __device__ explicit DividesBy(std::complex divisor) + : divisor(make_cuDoubleComplex(divisor.real(), divisor.imag())) {} + + // implements + __host__ __device__ std::complex operator()( + const std::complex& x) const { + auto result = cuCdiv(make_cuDoubleComplex(x.real(), x.imag()), divisor); + return std::complex(result.x, result.y); + } +}; + +template <> +struct DividesBy { + float divisor; + + __host__ __device__ explicit DividesBy(float divisor) : divisor(divisor) {} + + __host__ __device__ Eigen::half operator()(const float& x) const { + return Eigen::half(x / divisor); + } +}; + +struct HalfToFloat { + __host__ __device__ float operator()(const Eigen::half& x) const { + return Eigen::half_impl::half_to_float(x); + } +}; + +struct FloatToHalf { + __host__ __device__ 
Eigen::half operator()(const float& x) const { + return Eigen::half_impl::float_to_half_rtne(x); + } +}; + +struct And { + __host__ __device__ bool operator()(const bool& a, const bool& b) const { + return a && b; + } +}; + +struct Or { + __host__ __device__ bool operator()(const bool& a, const bool& b) const { + return a || b; + } +}; + +// each block does a grid strided loop and reduces its values locally +// the case of one block is used for low latency small reductions to scalars +template +__global__ void BlockReduceKernel( + T in, outT out, int num_elems, Op op, + typename std::iterator_traits::value_type initVal) { + const int bid = blockIdx.x; + const int tid = threadIdx.x; + + const int gid = bid * blockDim.x + tid; + const int stride = blockDim.x * gridDim.x; + + typedef typename std::iterator_traits::value_type value_type; + + value_type sum = initVal; + if (gid < num_elems) { + sum = in[gid]; + for (int pos = gid + stride; pos < num_elems; pos += stride) { + sum = op(sum, in[pos]); + } + } + + typedef cub::BlockReduce BlockReduce; + + __shared__ typename BlockReduce::TempStorage temp_storage; + + // only include input values in the reduction + // + // elements: ----------------- + // grid: |====|====|====|====|====| + const int num_elements_to_reduce = + max(min(num_elems - bid * blockDim.x, num_threads), 0); + + sum = BlockReduce(temp_storage) + .template Reduce(sum, op, num_elements_to_reduce); + + if (tid == 0) out[bid] = sum; +} + +// maps a warp to each row +template +__global__ void RowReduceKernel( + T in, outT out, int num_rows, int num_cols, Op op, + typename std::iterator_traits::value_type initVal) { + typedef typename std::iterator_traits::value_type value_type; + const int row = (blockIdx.x * blockDim.x + threadIdx.x) / 32; + const int lane = threadIdx.x % 32; + + if (num_cols == 1) { + int gid = threadIdx.x + blockIdx.x * blockDim.x; + if (gid < num_rows) out[gid] = in[gid]; + return; + } + + value_type sum = initVal; + int col = lane; + + if (row < num_rows && col < num_cols) { + sum = in[row * num_cols + col]; + col += 32; + for (; col < num_cols; col += 32) { + sum = op(sum, in[row * num_cols + col]); + } + } + + typedef cub::WarpReduce WarpReduce; + + __shared__ typename WarpReduce::TempStorage temp_storage; + + sum = WarpReduce(temp_storage).template Reduce(sum, op, min(num_cols, 32)); + + if (row < num_rows && lane == 0) out[row] = sum; +} + +// Works only if there are <= 16 columns +// each warps sums over multiple rows at once +template +__global__ void ColumnReduceMax16ColumnsKernel( + T in, outT out, int num_rows, int num_cols, Op op, + typename std::iterator_traits::value_type initVal) { + typedef typename std::iterator_traits::value_type value_type; + int rows_per_warp = 32 / num_cols; + + const int lane = threadIdx.x % 32; + const int lane_row = lane / num_cols; + + const int start_row_warp = + rows_per_warp * (blockIdx.y * blockDim.y + threadIdx.y); + const int start_row_lane = start_row_warp + lane_row; + int row = start_row_lane; + int col = lane % num_cols; + + value_type sum = initVal; + if (row * num_cols + col < num_rows * num_cols) + sum = in[row * num_cols + col]; + + __shared__ value_type partial_sums[32][33]; + + row += rows_per_warp * gridDim.y * blockDim.y; + for (; row < num_rows; row += rows_per_warp * gridDim.y * blockDim.y) { + int global_pos = row * num_cols + col; + if (global_pos < (num_rows * num_cols)) + sum = op(sum, in[row * num_cols + col]); + } + + const int rows_in_this_warp = min(rows_per_warp, num_rows - start_row_warp); + 
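The BlockReduceKernel and RowReduceKernel above combine a grid-stride (or column-stride) accumulation loop with a CUB collective. The same structure can be exercised in isolation; the following is a minimal sketch, not part of this change, assuming CUB is available via <cub/cub.cuh> and using a plain float sum (the kernel name SketchBlockReduce and the host-side follow-up it mentions are hypothetical):

#include <cub/cub.cuh>

// Each block accumulates a grid-strided slice of `in` locally, then performs
// one cub::BlockReduce and writes a single partial result per block; a
// follow-up pass over block_out finishes the reduction.
template <int kBlockThreads>
__global__ void SketchBlockReduce(const float* in, float* block_out, int n) {
  typedef cub::BlockReduce<float, kBlockThreads> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  float sum = 0.0f;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    sum += in[i];
  }
  sum = BlockReduce(temp_storage).Reduce(sum, cub::Sum());
  if (threadIdx.x == 0) block_out[blockIdx.x] = sum;
}

The kernels in this file additionally pass an explicit valid-element count to the CUB collective so that lanes past the end of the input never mix the initial value into the result.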
// not the most efficient way to do this sum + for (int i = 1; i < rows_in_this_warp; ++i) { + value_type tmp = + cub::ShuffleIndex(sum, threadIdx.x + i * num_cols, 32, 0xffffffff); + if (lane < num_cols) sum = op(sum, tmp); + } + + if (lane < num_cols) partial_sums[lane][threadIdx.y] = sum; + + __syncthreads(); + + if (threadIdx.y == 0 && threadIdx.x < num_cols) { + value_type s = partial_sums[threadIdx.x][0]; + + if (blockDim.y > 1) { + for (int row = 1; row < blockDim.y; ++row) { + s = op(s, partial_sums[threadIdx.x][row]); + } + } + + out[col * gridDim.y + blockIdx.y] = s; + } +} + +// Maps each block to a column range 32 wide +template +__global__ void ColumnReduceKernel( + T in, outT out, int num_rows, int num_cols, Op op, + typename std::iterator_traits::value_type initVal) { + typedef typename std::iterator_traits::value_type value_type; + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * 32 + threadIdx.x; + + value_type sum = initVal; + if (row < num_rows && col < num_cols) + sum = in[row * num_cols + col]; + + __shared__ value_type partial_sums[32][33]; + + row += gridDim.y * blockDim.y; + + if (col < num_cols) { + for (; row < num_rows; row += gridDim.y * blockDim.y) { + sum = op(sum, in[row * num_cols + col]); + } + } + + partial_sums[threadIdx.x][threadIdx.y] = sum; + + __syncthreads(); + + if (threadIdx.y == 0 && col < num_cols) { + value_type s = partial_sums[threadIdx.x][0]; + + // only include input values in the reduction + // elem block_rows + // - = + // - = + // # # block boundary + // - = + // - = + // # # block boundary + // - = + // = + const int numRowsThisBlock = + min(blockDim.y, num_rows - blockIdx.y * blockDim.y); + + for (int row = 1; row < numRowsThisBlock; ++row) { + s = op(s, partial_sums[threadIdx.x][row]); + } + + out[col * gridDim.y + blockIdx.y] = s; + } +} + +// does multiple warp size segmented reductions in parallel +// segments cannot cross warp boundaries (mainly used for reducing the segments +// that come from the Max16Columns column reduction kernel) +template +__global__ void CleanupSegments( + T partial_sums, outT out, int num_rows, int num_cols, int segment_size, + Op op, typename std::iterator_traits::value_type initVal) { + typedef typename std::iterator_traits::value_type value_type; + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + value_type val = initVal; + if (tid < segment_size * num_cols) + val = partial_sums[tid]; + + typedef cub::WarpReduce WarpReduce; + + __shared__ typename WarpReduce::TempStorage temp_storage; + + const bool head_flag = (threadIdx.x % segment_size) == 0; + value_type sum = + WarpReduce(temp_storage).HeadSegmentedReduce(val, head_flag, op); + + if (head_flag && tid < segment_size * num_cols) { + out[tid / segment_size] = sum; + } +} + +// assigns one thread to a column +template +__global__ void ColumnReduceSimpleKernel(T in, outT out, int num_planes, + int num_rows, int num_cols, Op op) { + typedef typename std::iterator_traits::value_type value_type; + const int gid = threadIdx.x + blockIdx.x * blockDim.x; + const int elems_per_plane = num_rows * num_cols; + + const int plane = gid / num_cols; + const int col = gid % num_cols; + + if (plane >= num_planes) return; + + if (num_rows == 1) { + out[plane * elems_per_plane + col] = in[plane * elems_per_plane + col]; + return; + } + + value_type sum = op(in[plane * elems_per_plane + col], + in[plane * elems_per_plane + num_cols + col]); + for (int row = 2; row < num_rows; ++row) { + sum = op(sum, in[plane * elems_per_plane + row * 
num_cols + col]); + } + + out[plane * num_cols + col] = sum; +} + +struct RowOffset { + __host__ __device__ explicit RowOffset(const int& cols) : cols_(cols) {} + + __host__ __device__ int operator()(const int& x) const { return cols_ * x; } + + int cols_; +}; + +struct GatherOp { + __host__ __device__ GatherOp(const int& extent_x, const int& extent_y, + const int& extent_z, bool kOne) + : extent_x_(extent_x), + extent_y_(extent_y), + extent_z_(extent_z), + kOne_(kOne) { + if (kOne_) + group_size_ = extent_y_; + else + group_size_ = extent_x_ * extent_z_; + } + + __host__ __device__ int operator()(const int& ind) const { + const int group = kOne_ ? ind / group_size_ : ind % group_size_; + const int offset = kOne_ ? ind % group_size_ : ind / group_size_; + + const int x = group / extent_z_; + const int z = group % extent_z_; + + return x * extent_y_ * extent_z_ + z + offset * extent_z_; + } + + int extent_x_; + int extent_y_; + int extent_z_; + bool kOne_; + int group_size_; +}; + +template +void LaunchScalarReduction(OpKernelContext* ctx, OUT_T out, IN_T in, + int in_size, Op op, T init, + const cudaStream_t& cu_stream) { + // handle situations where low latency is important better than CUB + if (in_size <= 4096) { + const int num_blocks = 1; + const int num_threads = 256; + BlockReduceKernel + <<>>(in, out, in_size, op, init); + return; + } else if (in_size <= 1 << 19) { + const int num_threads = 256; + const int num_blocks = min(32, Eigen::divup(in_size, num_threads)); + // it seems like tailoring this to the GPU + // would be more effective, but all attempts + // at making this a multiple of the number of + // multiprocessors have lead to lower perf + // in general + // TODO(eriche) investigate this more + + Tensor temp_storage; + OP_REQUIRES_OK( + ctx, + ctx->allocate_temp( + DT_INT8, TensorShape({static_cast(num_blocks * sizeof(T))}), + &temp_storage)); + + BlockReduceKernel + <<>>( + in, (T*)temp_storage.flat().data(), in_size, op, init); + + // take care that we only reduce blocks that had some valid elements in them + // TODO(eriche): CUB currently has a bug in HeadSegmentedReduce that + // requires it to be used with a full warp. Can reduce 32 -> num_blocks + // when this is fixed. + CleanupSegments<<<1, 32, 0, cu_stream>>>( + (T*)temp_storage.flat().data(), out, 1, 1, num_blocks, op, + init); + return; + } + std::size_t temp_storage_bytes = 0; + + Tensor temp_storage; + // written as a loop because it reduces clutter + // first pass allocates memory, second launches kernel(s) + for (int i = 0; i < 2; ++i) { + auto success = cub::DeviceReduce::Reduce( + i == 0 ? 
nullptr : temp_storage.flat().data(), + temp_storage_bytes, in, out, in_size, op, init, cu_stream); + + OP_REQUIRES( + ctx, success == 0, + errors::Internal("CUB reduce error", cudaGetErrorString(success))); + + if (i == 0) + OP_REQUIRES_OK( + ctx, + ctx->allocate_temp( + DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), + &temp_storage)); + } +} + +template +void LaunchRowReduction(OpKernelContext* ctx, OUT_T out, IN_T in, int num_rows, + int num_cols, Op op, T init, + const cudaStream_t& cu_stream) { + if (num_cols < 1024) { + const int threads_per_block = 128; + const int warps_per_block = threads_per_block / 32; + int num_blocks = (num_rows + warps_per_block - 1) / warps_per_block; + + RowReduceKernel<<>>( + in, out, num_rows, num_cols, op, init); + return; + } + + // setup segment offsets with counting and transform iterator + RowOffset row_offset_op(num_cols); + cub::CountingInputIterator counting_iter(0); + cub::TransformInputIterator> + transform_iter(counting_iter, row_offset_op); + + std::size_t temp_storage_bytes = 0; + Tensor temp_storage; + for (int i = 0; i < 2; ++i) { + auto success = cub::DeviceSegmentedReduce::Reduce( + i == 0 ? nullptr : temp_storage.flat().data(), + temp_storage_bytes, in, out, num_rows, transform_iter, + transform_iter + 1, op, init, cu_stream); + + OP_REQUIRES(ctx, success == 0, + errors::Internal("CUB segmented reduce error", + cudaGetErrorString(success))); + + if (i == 0) + OP_REQUIRES_OK( + ctx, + ctx->allocate_temp( + DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), + &temp_storage)); + } +} + +template +void LaunchColumnReduction_LTE16Cols(OpKernelContext* ctx, OUT_T out, IN_T in, + int extent_x, int extent_y, Op op, T init, + const cudaStream_t& cu_stream) { + int rows_per_warp = 32 / extent_y; + dim3 block_dim(32, min(Eigen::divup(extent_x, rows_per_warp), 32), 1); + dim3 grid_dim(1, + Eigen::divup(static_cast(extent_x), + rows_per_warp * block_dim.y), + 1); + + grid_dim.y = min((int)grid_dim.y, 32); + + if (grid_dim.y > 2 && grid_dim.y < 32) { + int log2 = Log2Floor(grid_dim.y); + grid_dim.y = 1 << log2; + } + + if (grid_dim.y == 1) { + ColumnReduceMax16ColumnsKernel<<>>( + in, out, extent_x, extent_y, op, init); + } else { + Tensor temp_storage; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DT_INT8, + TensorShape({static_cast( + sizeof(T) * extent_y * grid_dim.y)}), + &temp_storage)); + ColumnReduceMax16ColumnsKernel<<>>( + in, (T*)temp_storage.flat().data(), extent_x, extent_y, op, + init); + + dim3 new_grid_dim((grid_dim.y * extent_y + 31) / 32, 1, 1); + dim3 num_threads(128, 1, 1); + CleanupSegments<<>>( + (T*)temp_storage.flat().data(), out, extent_x, extent_y, + grid_dim.y, op, init); + } +} + +template +void LaunchColumnReduction_LTE4096Cols(OpKernelContext* ctx, OUT_T out, IN_T in, + int extent_x, int extent_y, Op op, + T init, const cudaStream_t& cu_stream) { + dim3 block_dim(32, min(extent_x, 32), 1); + dim3 grid_dim((extent_y + 31) / 32, 1, 1); + + if (grid_dim.x < 16) grid_dim.y = min((extent_x + 31) / 32, 32); + + if (grid_dim.y > 2 && grid_dim.y < 32) { + int log2 = Log2Floor(grid_dim.y); + grid_dim.y = 1 << log2; + } + + if (grid_dim.y == 1) { + ColumnReduceKernel<<>>( + in, out, extent_x, extent_y, op, init); + } else { + Tensor temp_storage; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DT_INT8, + TensorShape({static_cast( + sizeof(T) * extent_y * grid_dim.y)}), + &temp_storage)); + + ColumnReduceKernel<<>>( + in, (T*)temp_storage.flat().data(), extent_x, extent_y, op, + init); + + dim3 new_grid_dim((grid_dim.y * 
extent_y + 31) / 32, 1, 1); + dim3 num_threads(128, 1, 1); + CleanupSegments<<>>( + (T*)temp_storage.flat().data(), out, extent_x, extent_y, + grid_dim.y, op, init); + } +} + +template +void LaunchColumnReduction(OpKernelContext* ctx, OUT_T out, IN_T in, + int extent_x, int extent_y, Op op, T init, + const cudaStream_t& cu_stream) { + if (extent_y <= 16) { + LaunchColumnReduction_LTE16Cols(ctx, out, in, extent_x, extent_y, op, init, + cu_stream); + } else if (extent_y <= 4096) { + LaunchColumnReduction_LTE4096Cols(ctx, out, in, extent_x, extent_y, op, + init, cu_stream); + } else { + int threads_per_block = 128; + int num_blocks = Eigen::divup(extent_y, threads_per_block); + + ColumnReduceSimpleKernel<<>>( + in, out, 1, extent_x, extent_y, op); + } +} + +template +void Launch3DYReduction(OpKernelContext* ctx, OUT_T out, IN_T in, int extent_x, + int extent_y, int extent_z, Op op, T init, + const cudaStream_t& cu_stream) { + int threads_per_block = 128; + int num_blocks = + (extent_x * extent_z + threads_per_block - 1) / threads_per_block; + + // TODO(eriche): this won't be very good in the case of small x + // small z and large y. + ColumnReduceSimpleKernel<<>>( + in, out, extent_x, extent_y, extent_z, op); +} + +template +void Launch3DXZReduction(OpKernelContext* ctx, OUT_T out, IN_T in, int extent_x, + int extent_y, int extent_z, Op op, T init, + const cudaStream_t& cu_stream) { + // setup segment offsets with counting and transform iterator + RowOffset row_offset_op(extent_x * extent_z); + cub::CountingInputIterator counting_iter(0); + cub::TransformInputIterator> + transform_iter(counting_iter, row_offset_op); + + GatherOp gather_op(extent_x, extent_y, extent_z, false); + typedef cub::TransformInputIterator> + gatherIterType; + gatherIterType gather_iter(counting_iter, gather_op); + + PermutationInputIterator permute_iter(in, + gather_iter); + + std::size_t temp_storage_bytes = 0; + Tensor temp_storage; + + for (int i = 0; i < 2; ++i) { + auto success = cub::DeviceSegmentedReduce::Reduce( + i == 0 ? 
nullptr : temp_storage.flat().data(), + temp_storage_bytes, permute_iter, out, extent_y, transform_iter, + transform_iter + 1, op, init, cu_stream); + + OP_REQUIRES(ctx, success == 0, + errors::Internal("CUB segmented reduce error", + cudaGetErrorString(success))); + + if (i == 0) + OP_REQUIRES_OK( + ctx, + ctx->allocate_temp( + DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), + &temp_storage)); + } +} + +template +void ReduceImpl(OpKernelContext* ctx, OUT_T out, IN_T in, int in_rank, + int in_dim0, int in_dim1, int in_dim2, int out_rank, + const ReductionAxes& reduction_axes, Op op, T init) { + const cudaStream_t& cu_stream = GetCudaStream(ctx); + if (out_rank == 0) { + const int in_size = in_dim0 * in_dim1 * in_dim2; + LaunchScalarReduction(ctx, out, in, in_size, op, init, cu_stream); + } else if (in_rank == 2 && out_rank == 1 && + reduction_axes[0] == 1) { // row reduction + LaunchRowReduction(ctx, out, in, in_dim0, in_dim1, op, init, cu_stream); + } else if (in_rank == 2 && out_rank == 1 && + reduction_axes[0] == 0) { // column reduction + LaunchColumnReduction(ctx, out, in, in_dim0, in_dim1, op, init, cu_stream); + } else if (in_rank == 3 && out_rank == 2 && reduction_axes[0] == 1) { + Launch3DYReduction(ctx, out, in, in_dim0, in_dim1, in_dim2, op, init, + cu_stream); + } else if (in_rank == 3 && out_rank == 1 && reduction_axes[0] == 0 && + reduction_axes[1] == 2) { + Launch3DXZReduction(ctx, out, in, in_dim0, in_dim1, in_dim2, op, init, + cu_stream); + } else { + std::stringstream ss; + ss << "Invalid reduction requested: in_rank, out_rank, axes " << in_rank + << " " << out_rank; + if (out_rank == 1) ss << " " << reduction_axes[0]; + if (out_rank == 2) ss << " " << reduction_axes[1]; + LOG(FATAL) << ss.str(); + } +} + +} // namespace functor +} // namespace tensorflow + +#endif diff --git a/tensorflow/core/kernels/reduction_ops_test.cc b/tensorflow/core/kernels/reduction_ops_test.cc index 9cdebdd4f23..9bbe993a2f9 100644 --- a/tensorflow/core/kernels/reduction_ops_test.cc +++ b/tensorflow/core/kernels/reduction_ops_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" @@ -22,14 +23,59 @@ namespace tensorflow { // Creates a Graph which "reduce"s a 3D float tensor of "num" elements // into a scalar. 
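Stepping back to the launch helpers above: outside the custom-kernel paths they rely on CUB's two-pass calling convention, where a first call with a null workspace pointer only reports the required temporary-storage size (the patch folds the two calls into a two-iteration loop). A minimal host-side sketch of the same idiom, with cudaMalloc standing in for ctx->allocate_temp and a plain sum standing in for the templated reducer:

#include <cuda_runtime.h>
#include <cub/cub.cuh>

// Pass 1: d_temp == nullptr, CUB fills in temp_bytes. Pass 2: run the reduce.
cudaError_t SumWithCub(const float* d_in, float* d_out, int n,
                       cudaStream_t stream) {
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  cudaError_t err =
      cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n, stream);
  if (err != cudaSuccess) return err;
  err = cudaMalloc(&d_temp, temp_bytes);
  if (err != cudaSuccess) return err;
  err = cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n, stream);
  cudaFree(d_temp);
  return err;
}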
-static Graph* ToScalar(const string& reduce, int num) { - Graph* g = new Graph(OpRegistry::Global()); - Tensor data(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)})); - data.flat().setRandom(); - Tensor axes(DT_INT32, TensorShape({3})); +template +static Graph* ToScalar(const string& reduce, int num_x, int num_y) { + auto* g = new Graph(OpRegistry::Global()); + Tensor data(DataTypeToEnum::value, TensorShape({num_x, num_y})); + data.flat().setRandom(); + Tensor axes(DT_INT32, TensorShape({2})); axes.flat()(0) = 0; axes.flat()(1) = 1; - axes.flat()(2) = 2; + test::graph::Reduce(g, reduce, test::graph::Constant(g, data), + test::graph::Constant(g, axes)); + return g; +} + +static Graph* ColReduce(const string& reduce, int num_x, int num_y) { + auto* g = new Graph(OpRegistry::Global()); + Tensor data(DT_FLOAT, TensorShape({num_x, num_y})); + data.flat().setRandom(); + Tensor axes(DT_INT32, TensorShape({1})); + axes.flat()(0) = 0; + test::graph::Reduce(g, reduce, test::graph::Constant(g, data), + test::graph::Constant(g, axes)); + return g; +} + +static Graph* RowReduce(const string& reduce, int num_x, int num_y) { + auto* g = new Graph(OpRegistry::Global()); + Tensor data(DT_FLOAT, TensorShape({num_x, num_y})); + data.flat().setRandom(); + Tensor axes(DT_INT32, TensorShape({1})); + axes.flat()(0) = 1; + test::graph::Reduce(g, reduce, test::graph::Constant(g, data), + test::graph::Constant(g, axes)); + return g; +} + +static Graph* ThreeDYReduce(const string& reduce, int num_y, int num_z) { + auto* g = new Graph(OpRegistry::Global()); + Tensor data(DT_FLOAT, TensorShape({4, num_y, num_z})); + data.flat().setRandom(); + Tensor axes(DT_INT32, TensorShape({1})); + axes.flat()(0) = 1; + test::graph::Reduce(g, reduce, test::graph::Constant(g, data), + test::graph::Constant(g, axes)); + return g; +} + +static Graph* ThreeDXZReduce(const string& reduce, int num_y, int num_z) { + auto* g = new Graph(OpRegistry::Global()); + Tensor data(DT_FLOAT, TensorShape({4, num_y, num_z})); + data.flat().setRandom(); + Tensor axes(DT_INT32, TensorShape({2})); + axes.flat()(0) = 0; + axes.flat()(1) = 2; test::graph::Reduce(g, reduce, test::graph::Constant(g, data), test::graph::Constant(g, axes)); return g; @@ -37,51 +83,100 @@ static Graph* ToScalar(const string& reduce, int num) { // Creates a bench which reduces a 3D tensor with total "num" floats // into a scalar on a "device". Runs the bench for "iters" times. 
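The benchmark graph builders above all follow the same recipe: a random constant tensor, a constant axes vector, and a Reduce node. A hypothetical companion builder that reduces a 3-D tensor over its outermost axis only (not part of this change, shown purely to illustrate the pattern) would look like:

static Graph* ThreeDXReduce(const string& reduce, int num_y, int num_z) {
  auto* g = new Graph(OpRegistry::Global());
  Tensor data(DT_FLOAT, TensorShape({4, num_y, num_z}));
  data.flat<float>().setRandom();
  Tensor axes(DT_INT32, TensorShape({1}));
  axes.flat<int32>()(0) = 0;
  test::graph::Reduce(g, reduce, test::graph::Constant(g, data),
                      test::graph::Constant(g, axes));
  return g;
}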
+template static void ReduceToScalar(int iters, const string& device, - const string& reduce, int num) { - testing::ItemsProcessed(static_cast(iters) * num); - testing::BytesProcessed(static_cast(iters) * num * sizeof(float)); - test::Benchmark(device, ToScalar(reduce, num)).Run(iters); + const string& reduce, int num_x, int num_y) { + testing::ItemsProcessed(static_cast(iters) * num_x * num_y); + testing::BytesProcessed(static_cast(iters) * num_x * num_y * + sizeof(T)); + test::Benchmark(device, ToScalar(reduce, num_x, num_y)).Run(iters); } -static void BM_Sum3DToScalarCPU(int iters, int num) { - ReduceToScalar(iters, "cpu", "Sum", num); +static void DoRowReduce(int iters, const string& device, const string& reduce, + int num_x, int num_y) { + testing::ItemsProcessed(static_cast(iters) * num_x * num_y); + testing::BytesProcessed(static_cast(iters) * num_x * num_y * + sizeof(float)); + test::Benchmark(device, RowReduce(reduce, num_x, num_y)).Run(iters); } -BENCHMARK(BM_Sum3DToScalarCPU)->Range(1 << 13, 1 << 20); -static void BM_Max3DToScalarCPU(int iters, int num) { - ReduceToScalar(iters, "cpu", "Max", num); +static void DoColReduce(int iters, const string& device, const string& reduce, + int num_x, int num_y) { + testing::ItemsProcessed(static_cast(iters) * num_x * num_y); + testing::BytesProcessed(static_cast(iters) * num_x * num_y * + sizeof(float)); + test::Benchmark(device, ColReduce(reduce, num_x, num_y)).Run(iters); } -BENCHMARK(BM_Max3DToScalarCPU)->Range(1 << 13, 1 << 20); -static void BM_Prod3DToScalarCPU(int iters, int num) { - ReduceToScalar(iters, "cpu", "Prod", num); +static void Do3DYReduce(int iters, const string& device, const string& reduce, + int num_x, int num_y) { + testing::ItemsProcessed(static_cast(iters) * num_x * num_y); + testing::BytesProcessed(static_cast(iters) * num_x * num_y * + sizeof(float)); + test::Benchmark(device, ThreeDYReduce(reduce, num_x, num_y)).Run(iters); } -BENCHMARK(BM_Prod3DToScalarCPU)->Range(1 << 13, 1 << 20); -static void BM_Mean3DToScalarCPU(int iters, int num) { - ReduceToScalar(iters, "cpu", "Mean", num); +static void Do3DXZReduce(int iters, const string& device, const string& reduce, + int num_x, int num_y) { + testing::ItemsProcessed(static_cast(iters) * num_x * num_y); + testing::BytesProcessed(static_cast(iters) * num_x * num_y * + sizeof(float)); + test::Benchmark(device, ThreeDXZReduce(reduce, num_x, num_y)).Run(iters); } -BENCHMARK(BM_Mean3DToScalarCPU)->Range(1 << 13, 1 << 20); -static void BM_Sum3DToScalarGPU(int iters, int num) { - ReduceToScalar(iters, "gpu", "Sum", num); +static void BM_Sum2DToScalarGPU(int iters, int num_x, int num_y) { + ReduceToScalar(iters, "gpu", "Sum", num_x, num_y); } -BENCHMARK(BM_Sum3DToScalarGPU)->Range(1 << 13, 1 << 20); +BENCHMARK(BM_Sum2DToScalarGPU)->RangePair(1, 8192, 1, 8192); -static void BM_Max3DToScalarGPU(int iters, int num) { - ReduceToScalar(iters, "gpu", "Max", num); +static void BM_Sum2DToScalarGPUComplex(int iters, int num_x, int num_y) { + ReduceToScalar>(iters, "gpu", "Sum", num_x, num_y); } -BENCHMARK(BM_Max3DToScalarGPU)->Range(1 << 13, 1 << 20); +BENCHMARK(BM_Sum2DToScalarGPUComplex)->RangePair(1, 8192, 1, 8192); -static void BM_Prod3DToScalarGPU(int iters, int num) { - ReduceToScalar(iters, "gpu", "Prod", num); +static void BM_Sum2DToScalarGPUHalf(int iters, int num_x, int num_y) { + ReduceToScalar(iters, "gpu", "Sum", num_x, num_y); } -BENCHMARK(BM_Prod3DToScalarGPU)->Range(1 << 13, 1 << 20); +BENCHMARK(BM_Sum2DToScalarGPUHalf)->RangePair(1, 8192, 1, 8192); -static void 
BM_Mean3DToScalarGPU(int iters, int num) { - ReduceToScalar(iters, "gpu", "Mean", num); +static void BM_Sum2DRowReduceGPU(int iters, int num_x, int num_y) { + DoRowReduce(iters, "gpu", "Sum", num_x, num_y); } -BENCHMARK(BM_Mean3DToScalarGPU)->Range(1 << 13, 1 << 20); +BENCHMARK(BM_Sum2DRowReduceGPU)->RangePair(1, 8192, 1, 8192); + +static void BM_Sum2DColumnReduceGPU(int iters, int num_x, int num_y) { + DoColReduce(iters, "gpu", "Sum", num_x, num_y); +} +BENCHMARK(BM_Sum2DColumnReduceGPU)->RangePair(1, 8192, 1, 8192); + +static void BM_Sum3DYReduceGPU(int iters, int num_x, int num_y) { + Do3DYReduce(iters, "gpu", "Sum", num_x, num_y); +} +BENCHMARK(BM_Sum3DYReduceGPU)->RangePair(64, 4096, 64, 4096); + +static void BM_Sum3DXZReduceGPU(int iters, int num_x, int num_y) { + Do3DXZReduce(iters, "gpu", "Sum", num_x, num_y); +} +BENCHMARK(BM_Sum3DXZReduceGPU)->RangePair(64, 4096, 64, 4096); + +static void BM_Mean2DToScalarGPU(int iters, int num_x, int num_y) { + ReduceToScalar(iters, "gpu", "Mean", num_x, num_y); +} +BENCHMARK(BM_Mean2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192); + +static void BM_Max2DToScalarGPU(int iters, int num_x, int num_y) { + ReduceToScalar(iters, "gpu", "Max", num_x, num_y); +} +BENCHMARK(BM_Max2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192); + +static void BM_Min2DToScalarGPU(int iters, int num_x, int num_y) { + ReduceToScalar(iters, "gpu", "Min", num_x, num_y); +} +BENCHMARK(BM_Min2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192); + +static void BM_Bool2DToScalarGPU(int iters, int num_x, int num_y) { + ReduceToScalar(iters, "gpu", "All", num_x, num_y); +} +BENCHMARK(BM_Bool2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192); } // end namespace tensorflow diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc index 12b10d61da8..98f3718c128 100644 --- a/tensorflow/core/kernels/resource_variable_ops.cc +++ b/tensorflow/core/kernels/resource_variable_ops.cc @@ -431,7 +431,16 @@ class ResourceGatherOp : public OpKernel { TF_CALL_ALL_TYPES(REGISTER_GATHER_CPU); TF_CALL_QUANTIZED_TYPES(REGISTER_GATHER_CPU); +// Registers GPU kernels. +#if GOOGLE_CUDA +#define REGISTER_GATHER_GPU(type) REGISTER_GATHER_ALL_INDICES(GPU, type) + +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_GATHER_GPU); + +#endif // GOOGLE_CUDA + #undef REGISTER_GATHER_CPU +#undef REGISTER_GATHER_GPU #undef REGISTER_GATHER_ALL_INDICES #undef REGISTER_GATHER_FULL diff --git a/tensorflow/core/kernels/save_restore_tensor.cc b/tensorflow/core/kernels/save_restore_tensor.cc index 80d49017406..6b06cf650a8 100644 --- a/tensorflow/core/kernels/save_restore_tensor.cc +++ b/tensorflow/core/kernels/save_restore_tensor.cc @@ -216,9 +216,12 @@ void RestoreTensor(OpKernelContext* context, if (output_shape.num_elements() == 0) return; -#define READER_COPY(T) \ - case DataTypeToEnum::value: \ - reader->CopySliceData(tensor_name, slice_to_load, t->flat().data()); \ +#define READER_COPY(T) \ + case DataTypeToEnum::value: \ + OP_REQUIRES(context, \ + reader->CopySliceData(tensor_name, slice_to_load, \ + t->flat().data()), \ + errors::InvalidArgument("Error copying slice data")); \ break; switch (type) { diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc index 9cdbe89457c..8f7eff113cd 100644 --- a/tensorflow/core/kernels/segment_reduction_ops.cc +++ b/tensorflow/core/kernels/segment_reduction_ops.cc @@ -16,6 +16,9 @@ limitations under the License. // See docs in ../ops/math_ops.cc. 
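For reference, the READER_COPY change in save_restore_tensor.cc above means the boolean result of CopySliceData is no longer ignored; one instantiation of the macro now expands, roughly, to the following switch case (expansion shown for illustration only):

case DataTypeToEnum<float>::value:
  OP_REQUIRES(context,
              reader->CopySliceData(tensor_name, slice_to_load,
                                    t->flat<float>().data()),
              errors::InvalidArgument("Error copying slice data"));
  break;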
#define EIGEN_USE_THREADS +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA #include "tensorflow/core/kernels/segment_reduction_ops.h" #include @@ -32,6 +35,15 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/util/util.h" + +#if GOOGLE_CUDA +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#include "tensorflow/core/kernels/cuda_solvers.h" +#include "tensorflow/core/platform/cuda.h" + +using ::perftools::gputools::cuda::ScopedActivateExecutorContext; +#endif // GOOGLE_CUDA + namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; @@ -183,6 +195,105 @@ class SegmentReductionOp : public OpKernel { } }; +#ifdef GOOGLE_CUDA +// SegmentSumGPUOp is a segment sum operator implemented for GPU only. +// TODO: This implementation of SegmentSumGPUOp is sometimes slower than +// its unsorted counterpart (mostly when problem size is small). +// This is due to the following two main reasons and a cost-effective way +// to resolve these problems is desirable. +// 1. Sorted segment sum requires a memory transfer from device to host in +// order to know the size of the output dimension whereas unsorted segment +// sum receives the size of the output dimension as an input parameter. +// 2. Sorted segment sum is essentially a tiled version of unsorted segment +// sum and therefore such optimization comes at an inherent cost. However +// such cost may not be justified when the problem size is small. When to +// use the tiled version or the untiled version depends on many factors +// including data alignments, ratio of calculation to memory traffic and +// obviously, the problem sizes. +template +class SegmentSumGPUOp : public AsyncOpKernel { + public: + explicit SegmentSumGPUOp(OpKernelConstruction* context) + : AsyncOpKernel(context) {} + + void ComputeAsync(OpKernelContext* context, DoneCallback done) override { + const Tensor& input = context->input(0); + const Tensor& segment_ids = context->input(1); + + OP_REQUIRES_ASYNC( + context, TensorShapeUtils::IsVector(segment_ids.shape()), + errors::InvalidArgument("segment_ids should be a vector."), done); + + const int64 num_indices = segment_ids.NumElements(); + OP_REQUIRES_ASYNC( + context, num_indices == input.dim_size(0), + errors::InvalidArgument( + "segment_ids should be the same size as dimension 0 of" + " input."), + done); + + if (num_indices == 0) { + TensorShape output_shape = input.shape(); + output_shape.set_dim(0, 0); + + Tensor* output = nullptr; + OP_REQUIRES_OK_ASYNC( + context, context->allocate_output(0, output_shape, &output), done); + done(); + return; + } + + perftools::gputools::DeviceMemoryBase output_rows_device( + (void*)(segment_ids.template flat().data() + (num_indices - 1))); + ScratchSpace output_rows_host(context, 1, /* on_host */ true); + + auto stream = context->op_device_context()->stream(); + OP_REQUIRES_ASYNC( + context, stream + ->ThenMemcpy(output_rows_host.mutable_data(), + output_rows_device, sizeof(Index)) + .ok(), + errors::Internal( + "SegmentSumGPUOp: failed to copy output_rows from device"), + done); + + functor::SegmentSumFunctor functor_; + auto create_and_check_output = [context, output_rows_host, &input, + &segment_ids, &functor_, done]() { + // Ensure that within the callback, the proper GPU settings are + // configured. 
+ auto stream = context->op_device_context()->stream(); + ScopedActivateExecutorContext scoped_activation{stream->parent()}; + + Index output_rows = *output_rows_host.data(); + output_rows++; + OP_REQUIRES_ASYNC(context, output_rows > 0, + errors::InvalidArgument("segment ids must be >= 0"), + done); + + TensorShape output_shape = input.shape(); + output_shape.set_dim(0, output_rows); + + Tensor* output = nullptr; + OP_REQUIRES_OK_ASYNC( + context, context->allocate_output(0, output_shape, &output), done); + + auto output_flat = output->flat_outer_dims(); + auto data_ptr = input.template flat().data(); + auto segment_flat = segment_ids.flat(); + functor_(context, context->eigen_device(), output_rows, + segment_ids.shape(), segment_flat, input.NumElements(), data_ptr, + output_flat); + + done(); + }; + + context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute( + stream, create_and_check_output); + } +}; +#endif // GOOGLE_CUDA + #define REGISTER_CPU_KERNEL_SEGMENT(name, functor, type, index_type, \ default_value) \ REGISTER_KERNEL_BUILDER( \ @@ -227,6 +338,23 @@ REGISTER_COMPLEX_CPU_KERNELS_ALL(complex128); #undef REGISTER_REAL_CPU_KERNELS_ALL #undef REGISTER_COMPLEX_CPU_KERNELS_ALL +#if GOOGLE_CUDA +#define REGISTER_GPU_SORTED_KERNELS(type, index_type) \ + REGISTER_KERNEL_BUILDER(Name("SegmentSum") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + SegmentSumGPUOp) + +#define REGISTER_GPU_SORTED_KERNELS_ALL(type) \ + REGISTER_GPU_SORTED_KERNELS(type, int32); \ + REGISTER_GPU_SORTED_KERNELS(type, int64); + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_SORTED_KERNELS_ALL); +#undef REGISTER_GPU_SORTED_KERNELS +#undef REGISTER_GPU_SORTED_KERNELS_ALL +#endif // GOOGLE_CUDA + namespace functor { // UnsortedSegmentSumFunctor implementation for CPUDevice. diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h index ee09c213b7c..412c1d601d3 100644 --- a/tensorflow/core/kernels/segment_reduction_ops.h +++ b/tensorflow/core/kernels/segment_reduction_ops.h @@ -26,6 +26,28 @@ namespace tensorflow { class OpKernelContext; namespace functor { + +#ifdef GOOGLE_CUDA +typedef Eigen::GpuDevice GPUDevice; +// Functor for SegmentSumGPUOp. +// 'output_rows': the number of output segments (unique segment ids in +// 'segment_ids'). +// 'segment_ids_shape': shape of 'segment_ids' tensor. +// 'segment_ids': unsorted map from input to output segment ids at which to +// perform segment sum operation. +// 'data_size': size of input data tensor. +// 'data': input data tensor. +// 'output': output reshaped to {output_rows, output.size/output_rows} +template +struct SegmentSumFunctor { + void operator()(OpKernelContext* ctx, const GPUDevice& d, + const Index output_rows, const TensorShape& segment_ids_shape, + typename TTypes::ConstFlat segment_ids, + const Index data_size, const T* data, + typename TTypes::Tensor output); +}; +#endif + // BaseFunctor for definition of UnsorteSegmentReductionOp // for usage without templates. 
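SegmentSumGPUOp above cannot size its output until it has read the last (largest) segment id back from the device, so it issues an asynchronous copy and finishes the computation in a callback scheduled on the GPU event manager. A minimal sketch of that device-to-host handoff in plain CUDA (the real op goes through StreamExecutor's ThenMemcpy and EventMgr::ThenExecute; ReadNumSegmentsThen and its blocking synchronize are simplifications):

#include <cuda_runtime.h>

// Copy the last sorted segment id to the host, then hand the segment count
// (last id + 1) to a continuation. Pinned host memory and a real callback
// would avoid the blocking synchronize used here for brevity.
template <typename Continuation>
cudaError_t ReadNumSegmentsThen(const int* d_segment_ids, int num_indices,
                                cudaStream_t stream, Continuation done) {
  int last_id = 0;
  cudaError_t err =
      cudaMemcpyAsync(&last_id, d_segment_ids + num_indices - 1, sizeof(int),
                      cudaMemcpyDeviceToHost, stream);
  if (err != cudaSuccess) return err;
  err = cudaStreamSynchronize(stream);
  if (err != cudaSuccess) return err;
  done(last_id + 1);
  return cudaSuccess;
}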
template diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc index b132b1e8f8b..26fcafee34a 100644 --- a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc @@ -54,6 +54,77 @@ __device__ __forceinline__ void AccumulateInto( CudaAtomicAdd(dest_scalar + 1, value.imag()); } +// SortedSegmentSumFunctor kernel reduces input data just as +// UnsortedSegmentSumCustomKernel does except that input data +// is partitioned along the outer reduction dimension. This is +// because consecutive rows (elements in a row share the same +// outer dimension index) in the flattened 2D input data likely +// belong to the same segment in sorted segment sum operation. +// Therefore such partitioning strategy has two advantages over +// the UnsortedSegmentSumFunctor kernel: +// 1. Each thread reduces across multiple rows before writing +// answers to the global memory, we can therefore +// write reduction results to global memory less often. +// 2. We may know that the current thread is the only contributor +// to an output element because of the increasing nature of segment +// ids. In such cases, we do not need to use atomic operations +// to write results to global memory. +// In the flattened view of input data (with only outer and inner +// dimension), every thread processes a strip of input data of +// size OuterDimTileSize x 1. This strip runs across multiple +// rows of input data and all reduction elements share one inner +// dimension index. +template +__global__ void SortedSegmentSumCustomKernel(const Index input_outer_dim_size, + const Index inner_dim_size, + const Index output_outer_dim_size, + const Index* segment_ids, + const T* input, T* output, + const Index total_stripe_count) { + CUDA_1D_KERNEL_LOOP(stripe_index, total_stripe_count) { + const Index segment_offset = stripe_index % inner_dim_size; + const Index input_outer_dim_index_base = + stripe_index / inner_dim_size * Index(OuterDimTileSize); + + T sum = T(0); + Index first_segment_id = segment_ids[input_outer_dim_index_base]; + Index last_output_segment_id = output_outer_dim_size; + + const Index actual_stripe_height = + min(Index(OuterDimTileSize), + input_outer_dim_size - input_outer_dim_index_base); + for (Index j = 0; j < actual_stripe_height; j++) { + Index current_output_segment_id = + segment_ids[input_outer_dim_index_base + j]; + // Decide whether to write result to global memory. + // Result is only written to global memory if we move + // to another segment. Otherwise we can keep accumulating + // locally. + if (current_output_segment_id > last_output_segment_id) { + const Index output_index = + last_output_segment_id * inner_dim_size + segment_offset; + // decide whether to write result to global memory using atomic + // operations + if (last_output_segment_id == first_segment_id) { + AccumulateInto(output + output_index, sum); + } else { + *(output + output_index) = sum; + } + sum = T(0); + } + sum += ldg(input + (input_outer_dim_index_base + j) * inner_dim_size + + segment_offset); + last_output_segment_id = current_output_segment_id; + } + // For the last result in a strip, always write using atomic operations + // due to possible race conditions with threads computing + // the following strip. 
+ const Index output_index = + last_output_segment_id * inner_dim_size + segment_offset; + AccumulateInto(output + output_index, sum); + } +} + // UnsortedSegmentSumFunctor kernel processes 'input_total_size' elements. // Each element is mapped from input to output by a combination of its // 'segment_ids' mapping and 'inner_dim_size'. @@ -80,6 +151,47 @@ __global__ void UnsortedSegmentSumCustomKernel( namespace functor { +template +void SegmentSumFunctor::operator()( + OpKernelContext* ctx, const GPUDevice& d, const Index output_rows, + const TensorShape& segment_ids_shape, + typename TTypes::ConstFlat segment_ids, const Index data_size, + const T* data, typename TTypes::Tensor output) { + if (output.size() == 0) { + return; + } + // Set 'output' to zeros. + CudaLaunchConfig config = GetCudaLaunchConfig(output.size(), d); + SetZero<<>>( + output.size(), output.data()); + if (data_size == 0 || segment_ids_shape.num_elements() == 0) { + return; + } + + // Launch kernel to compute sorted segment sum. + // Notes: + // *) 'input_total_size' is the total number of elements to process. + // *) 'segment_ids.shape' is a prefix of data's shape. + // *) 'input_outer_dim_size' is the total number of segments to process. + const Index input_total_size = data_size; + const Index input_outer_dim_size = segment_ids.dimension(0); + const Index input_inner_dim_size = input_total_size / input_outer_dim_size; + + const int OuterDimTileSize = 8; + + const Index input_outer_dim_num_stripe = + Eigen::divup(input_outer_dim_size, Index(OuterDimTileSize)); + + const Index total_stripe_count = + input_inner_dim_size * input_outer_dim_num_stripe; + + config = GetCudaLaunchConfig(total_stripe_count, d); + SortedSegmentSumCustomKernel<<< + config.block_count, config.thread_per_block, 0, d.stream()>>>( + input_outer_dim_size, input_inner_dim_size, output_rows, + segment_ids.data(), data, output.data(), total_stripe_count); +}; + // UnsortedSegmentSumFunctor implementation for GPUDevice. 
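The sorted kernel above uses atomics only at tile boundaries; within a tile it accumulates locally and writes once per segment change. Its intended semantics are easiest to see in a plain CPU reference (illustrative only, not part of this change): sorted, non-negative segment ids, one output row per id, and empty segments left at zero.

#include <vector>

std::vector<float> SortedSegmentSumReference(const std::vector<float>& data,
                                             const std::vector<int>& ids,
                                             int inner_dim) {
  const int num_rows = static_cast<int>(ids.size());
  const int num_segments = num_rows > 0 ? ids.back() + 1 : 0;
  std::vector<float> out(num_segments * inner_dim, 0.0f);
  for (int r = 0; r < num_rows; ++r) {
    for (int c = 0; c < inner_dim; ++c) {
      out[ids[r] * inner_dim + c] += data[r * inner_dim + c];
    }
  }
  return out;
}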
template struct UnsortedSegmentSumFunctor: UnsortedSegmentBaseFunctor { @@ -117,6 +229,15 @@ struct UnsortedSegmentSumFunctor: UnsortedSegmentBaseFuncto } }; +#define DEFINE_SORTED_GPU_SPECS_INDEX(T, Index) \ + template struct SegmentSumFunctor + +#define DEFINE_SORTED_GPU_SPECS(T) \ + DEFINE_SORTED_GPU_SPECS_INDEX(T, int32); \ + DEFINE_SORTED_GPU_SPECS_INDEX(T, int64); + +TF_CALL_GPU_NUMBER_TYPES(DEFINE_SORTED_GPU_SPECS); + #define DEFINE_GPU_SPECS_INDEX(T, Index) \ template struct UnsortedSegmentSumFunctor diff --git a/tensorflow/core/kernels/shape_op_test.cc b/tensorflow/core/kernels/shape_op_test.cc index a305598fe2b..96eaa4ac75b 100644 --- a/tensorflow/core/kernels/shape_op_test.cc +++ b/tensorflow/core/kernels/shape_op_test.cc @@ -101,7 +101,7 @@ TEST_F(ShapeOpTest, Simple) { Tensor variant_tensor(DT_VARIANT, TensorShape({1})); Status s = session.Run({{input, variant_tensor}}, {shape_output}, &outputs); EXPECT_FALSE(s.ok()); - ExpectHasError(s, "Shape of non-scalar Variant not supported."); + ExpectHasError(s, "Shape of non-unary Variant not supported."); } { diff --git a/tensorflow/core/kernels/shape_ops.h b/tensorflow/core/kernels/shape_ops.h index 0c39d46aeaf..ac607f4e8b8 100644 --- a/tensorflow/core/kernels/shape_ops.h +++ b/tensorflow/core/kernels/shape_ops.h @@ -35,7 +35,7 @@ inline Status GetRegularOrVariantShape(OpKernelContext* ctx, int input_index, if (ctx->input_dtype(0) == DT_VARIANT) { if (inp.dims() != 0) { return errors::InvalidArgument( - "Shape of non-scalar Variant not supported."); + "Shape of non-unary Variant not supported."); } TF_RETURN_IF_ERROR(GetUnaryVariantShape(inp, shape)); } else { diff --git a/tensorflow/core/kernels/summary_interface.cc b/tensorflow/core/kernels/summary_interface.cc new file mode 100644 index 00000000000..19e0f702f9f --- /dev/null +++ b/tensorflow/core/kernels/summary_interface.cc @@ -0,0 +1,432 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/summary.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/summary_interface.h" +#include "tensorflow/core/lib/histogram/histogram.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/png/png_io.h" +#include "tensorflow/core/lib/wav/wav_io.h" +#include "tensorflow/core/util/event.pb.h" +#include "tensorflow/core/util/events_writer.h" + +namespace tensorflow { +namespace { +template +Status TensorValueAt(Tensor t, int index, T* out) { + switch (t.dtype()) { + case DT_FLOAT: + *out = t.flat()(index); + break; + case DT_DOUBLE: + *out = t.flat()(index); + break; + case DT_HALF: + *out = T(t.flat()(index)); + break; + case DT_INT32: + *out = t.flat()(index); + break; + case DT_UINT8: + *out = t.flat()(index); + break; + case DT_INT16: + *out = t.flat()(index); + break; + case DT_INT8: + *out = t.flat()(index); + break; + case DT_BOOL: + *out = t.flat()(index); + break; + case DT_INT64: + *out = t.flat()(index); + break; + default: + return errors::Unimplemented("Scalar summary for dtype ", + DataTypeString(t.dtype()), + " is not supported."); + } + return Status::OK(); +} + +typedef Eigen::Tensor Uint8Image; + +// Add the sequence of images specified by ith_image to the summary. +// +// Factoring this loop out into a helper function lets ith_image behave +// differently in the float and uint8 cases: the float case needs a temporary +// buffer which can be shared across calls to ith_image, but the uint8 case +// does not. +Status AddImages(const string& tag, int max_images, int batch_size, int w, + int h, int depth, + const std::function& ith_image, Summary* s) { + const int N = std::min(max_images, batch_size); + for (int i = 0; i < N; ++i) { + Summary::Value* v = s->add_value(); + // The tag depends on the number of requested images (not the number + // produced.) + // + // Note that later on avisu uses "/" to figure out a consistent naming + // convention for display, so we append "/image" to guarantee that the + // image(s) won't be displayed in the global scope with no name. + if (max_images > 1) { + v->set_tag(strings::StrCat(tag, "/image/", i)); + } else { + v->set_tag(strings::StrCat(tag, "/image")); + } + + auto image = ith_image(i); + Summary::Image* si = v->mutable_image(); + si->set_height(h); + si->set_width(w); + si->set_colorspace(depth); + const int channel_bits = 8; + const int compression = -1; // Use zlib default + if (!png::WriteImageToBuffer(image.data(), w, h, w * depth, depth, + channel_bits, compression, + si->mutable_encoded_image_string(), nullptr)) { + return errors::Internal("PNG encoding failed"); + } + } + return Status::OK(); +} + +template +void NormalizeFloatImage(int hw, int depth, + typename TTypes::ConstMatrix values, + typename TTypes::ConstVec bad_color, + Uint8Image* image) { + if (!image->size()) return; // Nothing to do for empty images + + // Rescale the image to uint8 range. + // + // We are trying to generate an RGB image from a float/half tensor. We do + // not have any info about the expected range of values in the tensor + // but the generated image needs to have all RGB values within [0, 255]. + // + // We use two different algorithms to generate these values. 
If the + // tensor has only positive values we scale them all by 255/max(values). + // If the tensor has both negative and positive values we scale them by + // the max of their absolute values and center them around 127. + // + // This works for most cases, but does not respect the relative dynamic + // range across different instances of the tensor. + + // Compute min and max ignoring nonfinite pixels + float image_min = std::numeric_limits::infinity(); + float image_max = -image_min; + for (int i = 0; i < hw; i++) { + bool finite = true; + for (int j = 0; j < depth; j++) { + if (!Eigen::numext::isfinite(values(i, j))) { + finite = false; + break; + } + } + if (finite) { + for (int j = 0; j < depth; j++) { + float value(values(i, j)); + image_min = std::min(image_min, value); + image_max = std::max(image_max, value); + } + } + } + + // Pick an affine transform into uint8 + const float kZeroThreshold = 1e-6; + T scale, offset; + if (image_min < 0) { + float max_val = std::max(std::abs(image_min), std::abs(image_max)); + scale = T(max_val < kZeroThreshold ? 0.0f : 127.0f / max_val); + offset = T(128.0f); + } else { + scale = T(image_max < kZeroThreshold ? 0.0f : 255.0f / image_max); + offset = T(0.0f); + } + + // Transform image, turning nonfinite values to bad_color + for (int i = 0; i < hw; i++) { + bool finite = true; + for (int j = 0; j < depth; j++) { + if (!Eigen::numext::isfinite(values(i, j))) { + finite = false; + break; + } + } + if (finite) { + image->chip<0>(i) = + (values.template chip<0>(i) * scale + offset).template cast(); + } else { + image->chip<0>(i) = bad_color; + } + } +} + +template +Status NormalizeAndAddImages(const Tensor& tensor, int max_images, int h, int w, + int hw, int depth, int batch_size, + const string& base_tag, Tensor bad_color_tensor, + Summary* s) { + // For float and half images, nans and infs are replaced with bad_color. + if (bad_color_tensor.dim_size(0) < depth) { + return errors::InvalidArgument( + "expected depth <= bad_color.size, got depth = ", depth, + ", bad_color.size = ", bad_color_tensor.dim_size(0)); + } + auto bad_color_full = bad_color_tensor.vec(); + typename TTypes::ConstVec bad_color(bad_color_full.data(), depth); + + // Float images must be scaled and translated. 
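The scale/offset selection described above can be restated as a small standalone helper (an illustrative restatement, not code from the patch): all-positive images are mapped onto [0, 255], mixed-sign images are scaled by the larger absolute extreme and centered at 128, and near-zero ranges collapse to a zero scale.

#include <algorithm>
#include <cmath>

void PickAffineTransform(float image_min, float image_max, float* scale,
                         float* offset) {
  const float kZeroThreshold = 1e-6f;
  if (image_min < 0.0f) {
    const float max_val = std::max(std::abs(image_min), std::abs(image_max));
    *scale = max_val < kZeroThreshold ? 0.0f : 127.0f / max_val;
    *offset = 128.0f;
  } else {
    *scale = image_max < kZeroThreshold ? 0.0f : 255.0f / image_max;
    *offset = 0.0f;
  }
}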
+ Uint8Image image(hw, depth); + auto ith_image = [&tensor, &image, bad_color, batch_size, hw, depth](int i) { + auto tensor_eigen = tensor.template shaped({batch_size, hw, depth}); + typename TTypes::ConstMatrix values( + &tensor_eigen(i, 0, 0), Eigen::DSizes(hw, depth)); + NormalizeFloatImage(hw, depth, values, bad_color, &image); + return image; + }; + return AddImages(base_tag, max_images, batch_size, w, h, depth, ith_image, s); +} + +} // namespace + +class SummaryWriterImpl : public SummaryWriterInterface { + public: + SummaryWriterImpl(int max_queue, int flush_millis) + : SummaryWriterInterface(), + max_queue_(max_queue), + flush_millis_(flush_millis) {} + + Status Initialize(const string& logdir, const string& filename_suffix, + Env* env) { + Status is_dir = env->IsDirectory(logdir); + if (!is_dir.ok()) { + if (is_dir.code() != tensorflow::error::NOT_FOUND) { + return is_dir; + } + TF_RETURN_IF_ERROR(env->CreateDir(logdir)); + } + mutex_lock ml(mu_); + events_writer_ = + xla::MakeUnique(io::JoinPath(logdir, "events")); + if (!events_writer_->InitWithSuffix(filename_suffix)) { + return errors::Unknown("Could not initialize events writer."); + } + last_flush_ = Env::Default()->NowMicros(); + return Status::OK(); + } + + Status Flush() override { + mutex_lock ml(mu_); + return InternalFlush(); + } + + ~SummaryWriterImpl() override { + (void)Flush(); // Ignore errors. + } + + Status WriteTensor(int64 global_step, Tensor t, const string& tag, + const string& serialized_metadata) override { + Summary s; + Summary::Value* v = s.add_value(); + t.AsProtoTensorContent(v->mutable_tensor()); + v->set_tag(tag); + v->mutable_metadata()->ParseFromString(serialized_metadata); + return Enqueue(global_step, s); + } + + Status WriteScalar(int64 global_step, Tensor t, const string& tag) override { + Summary s; + Summary::Value* v = s.add_value(); + v->set_tag(tag); + float value; + TF_RETURN_IF_ERROR(TensorValueAt(t, 0, &value)); + v->set_simple_value(value); + return Enqueue(global_step, s); + } + + Status WriteHistogram(int64 global_step, Tensor t, + const string& tag) override { + Summary s; + Summary::Value* v = s.add_value(); + v->set_tag(tag); + histogram::Histogram histo; + for (int64 i = 0; i < t.NumElements(); i++) { + double double_val; + TF_RETURN_IF_ERROR(TensorValueAt(t, i, &double_val)); + if (Eigen::numext::isnan(double_val)) { + return errors::InvalidArgument("Nan in summary histogram for: ", tag); + } else if (Eigen::numext::isinf(double_val)) { + return errors::InvalidArgument("Infinity in summary histogram for: ", + tag); + } + histo.Add(double_val); + } + + histo.EncodeToProto(v->mutable_histo(), false /* Drop zero buckets */); + return Enqueue(global_step, s); + } + + Status WriteImage(int64 global_step, Tensor tensor, const string& tag, + int max_images, Tensor bad_color) override { + if (!(tensor.dims() == 4 && + (tensor.dim_size(3) == 1 || tensor.dim_size(3) == 3 || + tensor.dim_size(3) == 4))) { + return errors::InvalidArgument( + "Tensor must be 4-D with last dim 1, 3, or 4, not ", + tensor.shape().DebugString()); + } + if (!(tensor.dim_size(0) < (1LL << 31) && + tensor.dim_size(1) < (1LL << 31) && + tensor.dim_size(2) < (1LL << 31) && + (tensor.dim_size(1) * tensor.dim_size(2)) < (1LL << 29))) { + return errors::InvalidArgument("Tensor too large for summary ", + tensor.shape().DebugString()); + } + Summary s; + // The casts and h * w cannot overflow because of the limits above. 
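Callers reach this writer through CreateSummaryWriter; a typical sequence, mirroring summary_interface_test.cc later in this change, is sketched below (the log directory, filename suffix, and tag are placeholder values):

Status WriteOneScalar() {
  SummaryWriterInterface* writer = nullptr;
  TF_RETURN_IF_ERROR(CreateSummaryWriter(/*max_queue=*/1, /*flush_millis=*/1,
                                         "/tmp/logdir", ".demo",
                                         Env::Default(), &writer));
  core::ScopedUnref unref(writer);

  Tensor one(DT_FLOAT, TensorShape({}));
  one.scalar<float>()() = 1.0;
  TF_RETURN_IF_ERROR(writer->WriteScalar(/*global_step=*/2, one, "loss"));
  return writer->Flush();
}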
+ const int batch_size = static_cast(tensor.dim_size(0)); + const int h = static_cast(tensor.dim_size(1)); + const int w = static_cast(tensor.dim_size(2)); + const int hw = h * w; // Compact these two dims for simplicity + const int depth = static_cast(tensor.dim_size(3)); + if (tensor.dtype() == DT_UINT8) { + // For uint8 input, no normalization is necessary + auto ith_image = [&tensor, batch_size, hw, depth](int i) { + auto values = tensor.shaped({batch_size, hw, depth}); + return typename TTypes::ConstMatrix( + &values(i, 0, 0), Eigen::DSizes(hw, depth)); + }; + TF_RETURN_IF_ERROR( + AddImages(tag, max_images, batch_size, w, h, depth, ith_image, &s)); + } else if (tensor.dtype() == DT_HALF) { + TF_RETURN_IF_ERROR(NormalizeAndAddImages( + tensor, max_images, h, w, hw, depth, batch_size, tag, bad_color, &s)); + } else if (tensor.dtype() == DT_FLOAT) { + TF_RETURN_IF_ERROR(NormalizeAndAddImages( + tensor, max_images, h, w, hw, depth, batch_size, tag, bad_color, &s)); + } else { + return errors::InvalidArgument( + "Only DT_INT8, DT_HALF, and DT_FLOAT images are supported. Got ", + DataTypeString(tensor.dtype())); + } + + return Enqueue(global_step, s); + } + + Status WriteAudio(int64 global_step, Tensor tensor, const string& tag, + int max_outputs, float sample_rate) override { + if (sample_rate <= 0.0f) { + return errors::InvalidArgument("sample_rate must be > 0"); + } + const int batch_size = tensor.dim_size(0); + const int64 length_frames = tensor.dim_size(1); + const int64 num_channels = + tensor.dims() == 2 ? 1 : tensor.dim_size(tensor.dims() - 1); + Summary s; + const int N = std::min(max_outputs, batch_size); + for (int i = 0; i < N; ++i) { + Summary::Value* v = s.add_value(); + if (max_outputs > 1) { + v->set_tag(strings::StrCat(tag, "/audio/", i)); + } else { + v->set_tag(strings::StrCat(tag, "/audio")); + } + + Summary::Audio* sa = v->mutable_audio(); + sa->set_sample_rate(sample_rate); + sa->set_num_channels(num_channels); + sa->set_length_frames(length_frames); + sa->set_content_type("audio/wav"); + + auto values = + tensor.shaped({batch_size, length_frames, num_channels}); + auto channels_by_frames = typename TTypes::ConstMatrix( + &values(i, 0, 0), + Eigen::DSizes(length_frames, num_channels)); + size_t sample_rate_truncated = lrintf(sample_rate); + if (sample_rate_truncated == 0) { + sample_rate_truncated = 1; + } + TF_RETURN_IF_ERROR(wav::EncodeAudioAsS16LEWav( + channels_by_frames.data(), sample_rate_truncated, num_channels, + length_frames, sa->mutable_encoded_audio_string())); + } + + return Enqueue(global_step, s); + } + + string DebugString() override { return "SummaryWriterImpl"; } + + private: + Status Enqueue(int64 global_step, const Summary& summary) { + mutex_lock ml(mu_); + queue_.emplace_back(global_step, summary, Env::Default()->NowMicros()); + if (queue_.size() >= max_queue_ || + Env::Default()->NowMicros() - last_flush_ > 1000 * flush_millis_) { + return InternalFlush(); + } + return Status::OK(); + } + + Status InternalFlush() EXCLUSIVE_LOCKS_REQUIRED(mu_) { + for (const EventInfo& e : queue_) { + Event event; + event.set_step(std::get<0>(e)); + *event.mutable_summary() = std::get<1>(e); + event.set_wall_time(std::get<2>(e)); + events_writer_->WriteEvent(event); + } + queue_.clear(); + if (!events_writer_->Flush()) { + return errors::InvalidArgument("Could not flush events file."); + } + last_flush_ = Env::Default()->NowMicros(); + return Status::OK(); + } + + const int max_queue_; + const int flush_millis_; + uint64 last_flush_; + using EventInfo = 
std::tuple; + mutex mu_; + std::vector queue_ GUARDED_BY(mu_); + // A pointer to allow deferred construction. + std::unique_ptr events_writer_ GUARDED_BY(mu_); + std::vector> registered_summaries_ + GUARDED_BY(mu_); +}; + +Status CreateSummaryWriter(int max_queue, int flush_millis, + const string& logdir, const string& filename_suffix, + Env* env, SummaryWriterInterface** result) { + SummaryWriterImpl* w = new SummaryWriterImpl(max_queue, flush_millis); + Status s = w->Initialize(logdir, filename_suffix, env); + if (!s.ok()) { + w->Unref(); + *result = nullptr; + return s; + } + *result = w; + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/summary_interface.h b/tensorflow/core/kernels/summary_interface.h new file mode 100644 index 00000000000..ae2fbb70fe3 --- /dev/null +++ b/tensorflow/core/kernels/summary_interface.h @@ -0,0 +1,59 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_SUMMARY_INTERFACE_H_ +#define TENSORFLOW_CORE_KERNELS_SUMMARY_INTERFACE_H_ + + +#include "tensorflow/core/framework/resource_mgr.h" + +namespace tensorflow { + +// Main interface for the summary writer resource. +class SummaryWriterInterface : public ResourceBase { + public: + virtual ~SummaryWriterInterface() override {} + + // Flushes all unwritten messages in the queue. + virtual Status Flush() = 0; + + // These are called in the OpKernel::Compute methods for the summary ops. + virtual Status WriteTensor(int64 global_step, Tensor t, const string& tag, + const string& serialized_metadata) = 0; + + virtual Status WriteScalar(int64 global_step, Tensor t, + const string& tag) = 0; + + virtual Status WriteHistogram(int64 global_step, Tensor t, + const string& tag) = 0; + + virtual Status WriteImage(int64 global_step, Tensor t, const string& tag, + int max_images, Tensor bad_color) = 0; + + virtual Status WriteAudio(int64 global_step, Tensor t, const string& tag, + int max_outputs_, float sample_rate) = 0; +}; + +// Creates a SummaryWriterInterface instance which writes to a file. It will +// enqueue up to max_queue summaries, and flush at least every flush_millis +// milliseconds. The summaries will be written to the directory specified by +// logdir and with the filename suffixed by filename_suffix. The caller owns a +// reference to result if the returned status is ok. +Status CreateSummaryWriter(int max_queue, int flush_millis, + const string& logdir, const string& filename_suffix, + Env* env, SummaryWriterInterface** result); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SUMMARY_INTERFACE_H_ diff --git a/tensorflow/core/kernels/summary_interface_test.cc b/tensorflow/core/kernels/summary_interface_test.cc new file mode 100644 index 00000000000..0e24e8122a0 --- /dev/null +++ b/tensorflow/core/kernels/summary_interface_test.cc @@ -0,0 +1,170 @@ +/* Copyright 2017 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/core/framework/summary.pb.h" +#include "tensorflow/core/kernels/summary_interface.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/io/record_reader.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/util/event.pb.h" + +namespace tensorflow { +namespace { + +Status SummaryTestHelper( + const string& test_name, + std::function writer_fn, + std::function test_fn) { + static std::set* tests = new std::set(); + CHECK(tests->insert(test_name).second) << ": " << test_name; + + SummaryWriterInterface* writer; + Env* env = Env::Default(); + TF_CHECK_OK( + CreateSummaryWriter(1, 1, testing::TmpDir(), test_name, env, &writer)); + core::ScopedUnref deleter(writer); + + TF_CHECK_OK(writer_fn(writer)); + TF_CHECK_OK(writer->Flush()); + + std::vector files; + TF_CHECK_OK(env->GetChildren(testing::TmpDir(), &files)); + bool found = false; + for (const string& f : files) { + if (StringPiece(f).contains(test_name)) { + if (found) { + return errors::Unknown("Found more than one file for ", test_name); + } + found = true; + std::unique_ptr read_file; + TF_CHECK_OK(env->NewRandomAccessFile(io::JoinPath(testing::TmpDir(), f), + &read_file)); + io::RecordReader reader(read_file.get(), io::RecordReaderOptions()); + string record; + uint64 offset = 0; + TF_CHECK_OK(reader.ReadRecord(&offset, + &record)); // The first event is irrelevant + TF_CHECK_OK(reader.ReadRecord(&offset, &record)); + Event e; + e.ParseFromString(record); + test_fn(e); + } + } + if (!found) { + return errors::Unknown("Found no file for ", test_name); + } + return Status::OK(); +} + +TEST(SummaryInterfaceTest, WriteTensor) { + TF_CHECK_OK(SummaryTestHelper("tensor_test", + [](SummaryWriterInterface* writer) { + Tensor one(DT_FLOAT, TensorShape({})); + one.scalar()() = 1.0; + TF_RETURN_IF_ERROR(writer->WriteTensor( + 2, one, "name", + SummaryMetadata().SerializeAsString())); + TF_RETURN_IF_ERROR(writer->Flush()); + return Status::OK(); + }, + [](const Event& e) { + EXPECT_EQ(e.step(), 2); + CHECK_EQ(e.summary().value_size(), 1); + EXPECT_EQ(e.summary().value(0).tag(), "name"); + })); +} + +TEST(SummaryInterfaceTest, WriteScalar) { + TF_CHECK_OK(SummaryTestHelper( + "scalar_test", + [](SummaryWriterInterface* writer) { + Tensor one(DT_FLOAT, TensorShape({})); + one.scalar()() = 1.0; + TF_RETURN_IF_ERROR(writer->WriteScalar(2, one, "name")); + TF_RETURN_IF_ERROR(writer->Flush()); + return Status::OK(); + }, + [](const Event& e) { + EXPECT_EQ(e.step(), 2); + CHECK_EQ(e.summary().value_size(), 1); + EXPECT_EQ(e.summary().value(0).tag(), "name"); + EXPECT_EQ(e.summary().value(0).simple_value(), 1.0); + })); +} + +TEST(SummaryInterfaceTest, WriteHistogram) { + TF_CHECK_OK(SummaryTestHelper("hist_test", + 
[](SummaryWriterInterface* writer) { + Tensor one(DT_FLOAT, TensorShape({})); + one.scalar()() = 1.0; + TF_RETURN_IF_ERROR( + writer->WriteHistogram(2, one, "name")); + TF_RETURN_IF_ERROR(writer->Flush()); + return Status::OK(); + }, + [](const Event& e) { + EXPECT_EQ(e.step(), 2); + CHECK_EQ(e.summary().value_size(), 1); + EXPECT_EQ(e.summary().value(0).tag(), "name"); + EXPECT_TRUE(e.summary().value(0).has_histo()); + })); +} + +TEST(SummaryInterfaceTest, WriteImage) { + TF_CHECK_OK(SummaryTestHelper( + "image_test", + [](SummaryWriterInterface* writer) { + Tensor one(DT_UINT8, TensorShape({1, 1, 1, 1})); + one.scalar()() = 1; + TF_RETURN_IF_ERROR(writer->WriteImage(2, one, "name", 1, Tensor())); + TF_RETURN_IF_ERROR(writer->Flush()); + return Status::OK(); + }, + [](const Event& e) { + EXPECT_EQ(e.step(), 2); + CHECK_EQ(e.summary().value_size(), 1); + EXPECT_EQ(e.summary().value(0).tag(), "name/image"); + CHECK(e.summary().value(0).has_image()); + EXPECT_EQ(e.summary().value(0).image().height(), 1); + EXPECT_EQ(e.summary().value(0).image().width(), 1); + EXPECT_EQ(e.summary().value(0).image().colorspace(), 1); + })); +} + +TEST(SummaryInterfaceTest, WriteAudio) { + TF_CHECK_OK(SummaryTestHelper( + "audio_test", + [](SummaryWriterInterface* writer) { + Tensor one(DT_FLOAT, TensorShape({1, 1})); + one.scalar()() = 1.0; + TF_RETURN_IF_ERROR(writer->WriteAudio(2, one, "name", 1, 1)); + TF_RETURN_IF_ERROR(writer->Flush()); + return Status::OK(); + }, + [](const Event& e) { + EXPECT_EQ(e.step(), 2); + CHECK_EQ(e.summary().value_size(), 1); + EXPECT_EQ(e.summary().value(0).tag(), "name/audio"); + CHECK(e.summary().value(0).has_audio()); + })); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/summary_kernels.cc b/tensorflow/core/kernels/summary_kernels.cc new file mode 100644 index 00000000000..d0eca0f1e7f --- /dev/null +++ b/tensorflow/core/kernels/summary_kernels.cc @@ -0,0 +1,226 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
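The tests above drive the new writer through `SummaryTestHelper`; outside the test harness the same interface is used directly. A minimal sketch, assuming only the declarations in `summary_interface.h` above (the `WriteOneScalar` helper, the queue/flush values, and the `".v2"` suffix are invented for the example):

```cpp
// Illustrative only: create a file-backed writer, write one scalar event at
// step 1, and flush. Error handling is reduced to TF_CHECK_OK for brevity.
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/kernels/summary_interface.h"
#include "tensorflow/core/lib/core/refcount.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/env.h"

namespace tensorflow {

void WriteOneScalar(const string& logdir) {
  SummaryWriterInterface* writer = nullptr;
  // Queue at most 8 pending events and flush at least every 1000 ms.
  TF_CHECK_OK(CreateSummaryWriter(/*max_queue=*/8, /*flush_millis=*/1000,
                                  logdir, /*filename_suffix=*/".v2",
                                  Env::Default(), &writer));
  core::ScopedUnref unref(writer);  // the caller owns one reference

  Tensor value(DT_FLOAT, TensorShape({}));
  value.scalar<float>()() = 0.5f;
  TF_CHECK_OK(writer->WriteScalar(/*global_step=*/1, value, "loss"));
  TF_CHECK_OK(writer->Flush());
}

}  // namespace tensorflow
```

As in the tests, `core::ScopedUnref` releases the reference that `CreateSummaryWriter` hands to the caller once the writer is no longer needed.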
+==============================================================================*/ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/kernels/summary_interface.h" + +namespace tensorflow { + +REGISTER_KERNEL_BUILDER(Name("SummaryWriter").Device(DEVICE_CPU), + ResourceHandleOp); + +class CreateSummaryFileWriterOp : public OpKernel { + public: + explicit CreateSummaryFileWriterOp(OpKernelConstruction* ctx) + : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor* tmp; + OP_REQUIRES_OK(ctx, ctx->input("logdir", &tmp)); + const string logdir = tmp->scalar()(); + OP_REQUIRES_OK(ctx, ctx->input("max_queue", &tmp)); + const int32 max_queue = tmp->scalar()(); + OP_REQUIRES_OK(ctx, ctx->input("flush_millis", &tmp)); + const int32 flush_millis = tmp->scalar()(); + OP_REQUIRES_OK(ctx, ctx->input("filename_suffix", &tmp)); + const string filename_suffix = tmp->scalar()(); + SummaryWriterInterface* s; + OP_REQUIRES_OK(ctx, CreateSummaryWriter(max_queue, flush_millis, logdir, + filename_suffix, ctx->env(), &s)); + Status status = CreateResource(ctx, HandleFromInput(ctx, 0), s); + if (!status.ok()) { + s->Unref(); + ctx->SetStatus(status); + return; + } + } +}; +REGISTER_KERNEL_BUILDER(Name("CreateSummaryFileWriter").Device(DEVICE_CPU), + CreateSummaryFileWriterOp); + +class FlushSummaryWriterOp : public OpKernel { + public: + explicit FlushSummaryWriterOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + SummaryWriterInterface* s; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s)); + core::ScopedUnref unref(s); + OP_REQUIRES_OK(ctx, s->Flush()); + } +}; +REGISTER_KERNEL_BUILDER(Name("FlushSummaryWriter").Device(DEVICE_CPU), + FlushSummaryWriterOp); + +class CloseSummaryWriterOp : public OpKernel { + public: + explicit CloseSummaryWriterOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + OP_REQUIRES_OK(ctx, DeleteResource( + ctx, HandleFromInput(ctx, 0))); + } +}; +REGISTER_KERNEL_BUILDER(Name("CloseSummaryWriter").Device(DEVICE_CPU), + CloseSummaryWriterOp); + +class WriteSummaryOp : public OpKernel { + public: + explicit WriteSummaryOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + SummaryWriterInterface* s; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s)); + core::ScopedUnref unref(s); + const Tensor* tmp; + OP_REQUIRES_OK(ctx, ctx->input("global_step", &tmp)); + const int64 global_step = tmp->scalar()(); + OP_REQUIRES_OK(ctx, ctx->input("tag", &tmp)); + const string& tag = tmp->scalar()(); + OP_REQUIRES_OK(ctx, ctx->input("summary_metadata", &tmp)); + const string& serialized_metadata = tmp->scalar()(); + + const Tensor* t; + OP_REQUIRES_OK(ctx, ctx->input("tensor", &t)); + + OP_REQUIRES_OK(ctx, + s->WriteTensor(global_step, *t, tag, serialized_metadata)); + } +}; +REGISTER_KERNEL_BUILDER(Name("WriteSummary").Device(DEVICE_CPU), + WriteSummaryOp); + +class WriteScalarSummaryOp : public OpKernel { + public: + explicit WriteScalarSummaryOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + SummaryWriterInterface* s; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s)); + core::ScopedUnref unref(s); + const Tensor* tmp; + OP_REQUIRES_OK(ctx, ctx->input("global_step", &tmp)); + const int64 global_step = tmp->scalar()(); + OP_REQUIRES_OK(ctx, 
ctx->input("tag", &tmp)); + const string& tag = tmp->scalar()(); + + const Tensor* t; + OP_REQUIRES_OK(ctx, ctx->input("value", &t)); + + OP_REQUIRES_OK(ctx, s->WriteScalar(global_step, *t, tag)); + } +}; +REGISTER_KERNEL_BUILDER(Name("WriteScalarSummary").Device(DEVICE_CPU), + WriteScalarSummaryOp); + +class WriteHistogramSummaryOp : public OpKernel { + public: + explicit WriteHistogramSummaryOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + SummaryWriterInterface* s; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s)); + core::ScopedUnref unref(s); + const Tensor* tmp; + OP_REQUIRES_OK(ctx, ctx->input("global_step", &tmp)); + const int64 global_step = tmp->scalar()(); + OP_REQUIRES_OK(ctx, ctx->input("tag", &tmp)); + const string& tag = tmp->scalar()(); + + const Tensor* t; + OP_REQUIRES_OK(ctx, ctx->input("values", &t)); + + OP_REQUIRES_OK(ctx, s->WriteHistogram(global_step, *t, tag)); + } +}; +REGISTER_KERNEL_BUILDER(Name("WriteHistogramSummary").Device(DEVICE_CPU), + WriteHistogramSummaryOp); + +class WriteImageSummaryOp : public OpKernel { + public: + explicit WriteImageSummaryOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + int64 max_images_tmp; + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_images", &max_images_tmp)); + OP_REQUIRES(ctx, max_images_tmp < (1LL << 31), + errors::InvalidArgument("max_images must be < 2^31")); + max_images_ = static_cast(max_images_tmp); + } + + void Compute(OpKernelContext* ctx) override { + SummaryWriterInterface* s; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s)); + core::ScopedUnref unref(s); + const Tensor* tmp; + OP_REQUIRES_OK(ctx, ctx->input("global_step", &tmp)); + const int64 global_step = tmp->scalar()(); + OP_REQUIRES_OK(ctx, ctx->input("tag", &tmp)); + const string& tag = tmp->scalar()(); + const Tensor* bad_color; + OP_REQUIRES_OK(ctx, ctx->input("bad_color", &bad_color)); + OP_REQUIRES( + ctx, TensorShapeUtils::IsVector(bad_color->shape()), + errors::InvalidArgument("bad_color must be a vector, got shape ", + bad_color->shape().DebugString())); + + const Tensor* t; + OP_REQUIRES_OK(ctx, ctx->input("tensor", &t)); + + OP_REQUIRES_OK( + ctx, s->WriteImage(global_step, *t, tag, max_images_, *bad_color)); + } + + private: + int32 max_images_; +}; +REGISTER_KERNEL_BUILDER(Name("WriteImageSummary").Device(DEVICE_CPU), + WriteImageSummaryOp); + +class WriteAudioSummaryOp : public OpKernel { + public: + explicit WriteAudioSummaryOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_outputs", &max_outputs_)); + OP_REQUIRES(ctx, max_outputs_ > 0, + errors::InvalidArgument("max_outputs must be > 0")); + } + + void Compute(OpKernelContext* ctx) override { + SummaryWriterInterface* s; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s)); + core::ScopedUnref unref(s); + const Tensor* tmp; + OP_REQUIRES_OK(ctx, ctx->input("global_step", &tmp)); + const int64 global_step = tmp->scalar()(); + OP_REQUIRES_OK(ctx, ctx->input("tag", &tmp)); + const string& tag = tmp->scalar()(); + OP_REQUIRES_OK(ctx, ctx->input("sample_rate", &tmp)); + const float sample_rate = tmp->scalar()(); + + const Tensor* t; + OP_REQUIRES_OK(ctx, ctx->input("tensor", &t)); + + OP_REQUIRES_OK( + ctx, s->WriteAudio(global_step, *t, tag, max_outputs_, sample_rate)); + } + + private: + int max_outputs_; + bool has_sample_rate_attr_; + float sample_rate_attr_; +}; +REGISTER_KERNEL_BUILDER(Name("WriteAudioSummary").Device(DEVICE_CPU), + 
WriteAudioSummaryOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc index b14e5551039..36b8ff09d73 100644 --- a/tensorflow/core/kernels/variable_ops.cc +++ b/tensorflow/core/kernels/variable_ops.cc @@ -83,7 +83,6 @@ TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNEL); IsVariableInitializedOp); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); -TF_CALL_bool(REGISTER_GPU_KERNELS) #undef REGISTER_GPU_KERNELS #endif // GOOGLE_CUDA diff --git a/tensorflow/core/lib/jpeg/jpeg_mem.cc b/tensorflow/core/lib/jpeg/jpeg_mem.cc index 258793aa1e6..3c7e5ca696d 100644 --- a/tensorflow/core/lib/jpeg/jpeg_mem.cc +++ b/tensorflow/core/lib/jpeg/jpeg_mem.cc @@ -70,13 +70,24 @@ class FewerArgsForCompiler { int stride_; }; +// Check whether the crop window is valid, assuming crop is true. +bool IsCropWindowValid(const UncompressFlags& flags, int input_image_width, + int input_image_height) { + // Crop window is valid only if it is non zero and all the window region is + // within the original image. + return flags.crop_width > 0 && flags.crop_height > 0 && flags.crop_x >= 0 && + flags.crop_y >= 0 && + flags.crop_y + flags.crop_height <= input_image_height && + flags.crop_x + flags.crop_width <= input_image_width; +} + uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) { // unpack the argball const int datasize = argball->datasize_; const auto& flags = argball->flags_; const int ratio = flags.ratio; int components = flags.components; - int stride = flags.stride; // may be 0 + int stride = flags.stride; // may be 0 int64* const nwarn = argball->pnwarn_; // may be NULL // Can't decode if the ratio is not recognized by libjpeg @@ -159,8 +170,43 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) { return nullptr; } + JDIMENSION target_output_width = cinfo.output_width; + JDIMENSION target_output_height = cinfo.output_height; + JDIMENSION skipped_scanlines = 0; +#if !defined(WIN32) + if (flags.crop) { + // Update target output height and width based on crop window. + target_output_height = flags.crop_height; + target_output_width = flags.crop_width; + + // So far, cinfo holds the original input image information. + if (!IsCropWindowValid(flags, cinfo.output_width, cinfo.output_height)) { + LOG(ERROR) << "Invalid crop window: x=" << flags.crop_x + << ", y=" << flags.crop_y << ", w=" << target_output_width + << ", h=" << target_output_height + << " for image_width: " << cinfo.output_width + << " and image_height: " << cinfo.output_height; + jpeg_destroy_decompress(&cinfo); + return nullptr; + } + + // Update cinfo.output_width. It is tricky that cinfo.output_width must + // fall on an Minimum Coded Unit (MCU) boundary; if it doesn't, then it will + // be moved left to the nearest MCU boundary, and width will be increased + // accordingly. Therefore, the final cinfo.crop_width might differ from the + // given flags.crop_width. Please see libjpeg library for details. + JDIMENSION crop_width = flags.crop_width; + JDIMENSION crop_x = flags.crop_x; + jpeg_crop_scanline(&cinfo, &crop_x, &crop_width); + + // Update cinfo.output_scanline. 
+ skipped_scanlines = jpeg_skip_scanlines(&cinfo, flags.crop_y); + CHECK_EQ(skipped_scanlines, flags.crop_y); + } +#endif + // check for compatible stride - const int min_stride = cinfo.output_width * components * sizeof(JSAMPLE); + const int min_stride = target_output_width * components * sizeof(JSAMPLE); if (stride == 0) { stride = min_stride; } else if (stride < min_stride) { @@ -170,47 +216,88 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) { } // Remember stride and height for use in Uncompress - argball->height_ = cinfo.output_height; + argball->height_ = target_output_height; argball->stride_ = stride; - uint8* const dstdata = argball->allocate_output_( - cinfo.output_width, cinfo.output_height, components); +#if defined(WIN32) + uint8* dstdata = nullptr; + if (flags.crop) { + dstdata = new JSAMPLE[stride * target_output_height]; + } else { + dstdata = argball->allocate_output_(target_output_width, + target_output_height, components); + } +#else + uint8* dstdata = argball->allocate_output_(target_output_width, + target_output_height, components); +#endif if (dstdata == nullptr) { jpeg_destroy_decompress(&cinfo); return nullptr; } JSAMPLE* output_line = static_cast(dstdata); - // Temporary buffer used for CMYK -> RGB conversion. + // jpeg_read_scanlines requires the buffers to be allocated based on + // cinfo.output_width, but the target image width might be different if crop + // is enabled and crop_width is not MCU aligned. In this case, we need to + // realign the scanline output to achieve the exact cropping. Notably, only + // cinfo.output_width needs to fall on MCU boundary, while cinfo.output_height + // has no such constraint. + const bool need_realign_cropped_scanline = + (target_output_width != cinfo.output_width); const bool use_cmyk = (cinfo.out_color_space == JCS_CMYK); - tempdata = use_cmyk ? new JSAMPLE[cinfo.output_width * 4] : nullptr; + + if (use_cmyk) { + // Temporary buffer used for CMYK -> RGB conversion. + tempdata = new JSAMPLE[cinfo.output_width * 4]; + } else if (need_realign_cropped_scanline) { + // Temporary buffer used for MCU-aligned scanline data. + tempdata = new JSAMPLE[cinfo.output_width * components]; + } // If there is an error reading a line, this aborts the reading. // Save the fraction of the image that has been read. - argball->height_read_ = cinfo.output_height; - while (cinfo.output_scanline < cinfo.output_height) { + argball->height_read_ = target_output_height; + + // These variables are just to avoid repeated computation in the loop. + const int max_scanlines_to_read = skipped_scanlines + target_output_height; + const int mcu_align_offset = + (cinfo.output_width - target_output_width) * (use_cmyk ? 4 : components); + while (cinfo.output_scanline < max_scanlines_to_read) { int num_lines_read = 0; - if (cinfo.out_color_space == JCS_CMYK) { + if (use_cmyk) { num_lines_read = jpeg_read_scanlines(&cinfo, &tempdata, 1); - // Convert CMYK to RGB - for (size_t i = 0; i < cinfo.output_width; ++i) { - int c = tempdata[4 * i + 0]; - int m = tempdata[4 * i + 1]; - int y = tempdata[4 * i + 2]; - int k = tempdata[4 * i + 3]; - int r, g, b; - if (cinfo.saw_Adobe_marker) { - r = (k * c) / 255; - g = (k * m) / 255; - b = (k * y) / 255; - } else { - r = (255 - k) * (255 - c) / 255; - g = (255 - k) * (255 - m) / 255; - b = (255 - k) * (255 - y) / 255; + if (num_lines_read > 0) { + // Convert CMYK to RGB if scanline read succeeded. 
+ for (size_t i = 0; i < target_output_width; ++i) { + int offset = 4 * i; + if (need_realign_cropped_scanline) { + // Align the offset for MCU boundary. + offset += mcu_align_offset; + } + const int c = tempdata[offset + 0]; + const int m = tempdata[offset + 1]; + const int y = tempdata[offset + 2]; + const int k = tempdata[offset + 3]; + int r, g, b; + if (cinfo.saw_Adobe_marker) { + r = (k * c) / 255; + g = (k * m) / 255; + b = (k * y) / 255; + } else { + r = (255 - k) * (255 - c) / 255; + g = (255 - k) * (255 - m) / 255; + b = (255 - k) * (255 - y) / 255; + } + output_line[3 * i + 0] = r; + output_line[3 * i + 1] = g; + output_line[3 * i + 2] = b; } - output_line[3 * i + 0] = r; - output_line[3 * i + 1] = g; - output_line[3 * i + 2] = b; + } + } else if (need_realign_cropped_scanline) { + num_lines_read = jpeg_read_scanlines(&cinfo, &tempdata, 1); + if (num_lines_read > 0) { + memcpy(output_line, tempdata + mcu_align_offset, min_stride); } } else { num_lines_read = jpeg_read_scanlines(&cinfo, &output_line, 1); @@ -218,12 +305,13 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) { // Handle error cases if (num_lines_read == 0) { LOG(ERROR) << "Premature end of JPEG data. Stopped at line " - << cinfo.output_scanline << "/" << cinfo.output_height; + << cinfo.output_scanline - skipped_scanlines << "/" + << target_output_height; if (!flags.try_recover_truncated_jpeg) { - argball->height_read_ = cinfo.output_scanline; + argball->height_read_ = cinfo.output_scanline - skipped_scanlines; error = JPEGERRORS_UNEXPECTED_END_OF_DATA; } else { - for (size_t line = cinfo.output_scanline; line < cinfo.output_height; + for (size_t line = cinfo.output_scanline; line < max_scanlines_to_read; ++line) { if (line == 0) { // If even the first line is missing, fill with black color @@ -235,9 +323,9 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) { output_line += stride; } argball->height_read_ = - cinfo.output_height; // consider all lines as read + target_output_height; // consider all lines as read // prevent error-on-exit in libjpeg: - cinfo.output_scanline = cinfo.output_height; + cinfo.output_scanline = max_scanlines_to_read; } break; } @@ -248,23 +336,33 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) { delete[] tempdata; tempdata = nullptr; +#if !defined(WIN32) + if (flags.crop && cinfo.output_scanline < cinfo.output_height) { + // Skip the rest of scanlines, required by jpeg_destroy_decompress. + jpeg_skip_scanlines(&cinfo, + cinfo.output_height - flags.crop_y - flags.crop_height); + // After this, cinfo.output_height must be equal to cinfo.output_height; + // otherwise, jpeg_destroy_decompress would fail. + } +#endif + // Convert the RGB data to RGBA, with alpha set to 0xFF to indicate // opacity. // RGBRGBRGB... --> RGBARGBARGBA... if (components == 4) { // Start on the last line. JSAMPLE* scanlineptr = static_cast( - dstdata + static_cast(cinfo.output_height - 1) * stride); + dstdata + static_cast(target_output_height - 1) * stride); const JSAMPLE kOpaque = -1; // All ones appropriate for JSAMPLE. - const int right_rgb = (cinfo.output_width - 1) * 3; - const int right_rgba = (cinfo.output_width - 1) * 4; + const int right_rgb = (target_output_width - 1) * 3; + const int right_rgba = (target_output_width - 1) * 4; - for (int y = cinfo.output_height; y-- > 0;) { + for (int y = target_output_height; y-- > 0;) { // We do all the transformations in place, going backwards for each row. 
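The `mcu_align_offset` bookkeeping above is the subtle part of the cropped decode. A worked example with made-up numbers (a full-resolution 4:2:0 image, whose iMCU width is 16 pixels) may help; it is a standalone illustration, not code from this change:

```cpp
// jpeg_crop_scanline() rounds the requested x-offset down to the iMCU
// boundary and widens the window by the same amount, keeping the right edge
// fixed, so each decoded scanline carries a few extra leading pixels.
#include <cstdio>

int main() {
  const int imcu_width = 16;  // 8-pixel DCT blocks * max sampling factor of 2
  const int requested_crop_x = 21, requested_crop_width = 40, components = 3;

  const int aligned_crop_x = (requested_crop_x / imcu_width) * imcu_width;  // 16
  const int decoded_width =
      requested_crop_width + (requested_crop_x - aligned_crop_x);          // 45

  // Bytes to skip at the start of every decoded scanline when copying the
  // requested window into the output row (the role of mcu_align_offset).
  const int skip_bytes = (decoded_width - requested_crop_width) * components;  // 15
  std::printf("aligned_x=%d decoded_width=%d skip=%d bytes per row\n",
              aligned_crop_x, decoded_width, skip_bytes);
  return 0;
}
```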
const JSAMPLE* rgb_pixel = scanlineptr + right_rgb; JSAMPLE* rgba_pixel = scanlineptr + right_rgba; scanlineptr -= stride; - for (int x = cinfo.output_width; x-- > 0; + for (int x = target_output_width; x-- > 0; rgba_pixel -= 4, rgb_pixel -= 3) { // We copy the 3 bytes at rgb_pixel into the 4 bytes at rgba_pixel // The "a" channel is set to be opaque. @@ -319,8 +417,61 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) { LOG(ERROR) << "Unhandled case " << error; break; } - jpeg_destroy_decompress(&cinfo); +#if defined(WIN32) + // TODO(tanmingxing): delete all these code after migrating to libjpeg_turbo + // for Windows. + if (flags.crop) { + // Update target output height and width based on crop window. + target_output_height = flags.crop_height; + target_output_width = flags.crop_width; + + // cinfo holds the original input image information. + if (!IsCropWindowValid(flags, cinfo.output_width, cinfo.output_height)) { + LOG(ERROR) << "Invalid crop window: x=" << flags.crop_x + << ", y=" << flags.crop_y << ", w=" << target_output_width + << ", h=" << target_output_height + << " for image_width: " << cinfo.output_width + << " and image_height: " << cinfo.output_height; + delete[] dstdata; + jpeg_destroy_decompress(&cinfo); + return nullptr; + } + + const uint8* full_image = dstdata; + dstdata = argball->allocate_output_(target_output_width, + target_output_height, components); + if (dstdata == nullptr) { + delete[] full_image; + jpeg_destroy_decompress(&cinfo); + return nullptr; + } + + const int full_image_stride = stride; + // Update stride and hight for crop window. + const int min_stride = target_output_width * components * sizeof(JSAMPLE); + if (flags.stride == 0) { + stride = min_stride; + } + argball->height_ = target_output_height; + argball->stride_ = stride; + + if (argball->height_read_ > target_output_height) { + argball->height_read_ = target_output_height; + } + const int crop_offset = flags.crop_x * components * sizeof(JSAMPLE); + const uint8* full_image_ptr = full_image + flags.crop_y * full_image_stride; + uint8* crop_image_ptr = dstdata; + for (int i = 0; i < argball->height_read_; i++) { + memcpy(crop_image_ptr, full_image_ptr + crop_offset, min_stride); + crop_image_ptr += stride; + full_image_ptr += full_image_stride; + } + delete[] full_image; + } +#endif + + jpeg_destroy_decompress(&cinfo); return dstdata; } diff --git a/tensorflow/core/lib/jpeg/jpeg_mem.h b/tensorflow/core/lib/jpeg/jpeg_mem.h index ac34f29f221..59342d28c0f 100644 --- a/tensorflow/core/lib/jpeg/jpeg_mem.h +++ b/tensorflow/core/lib/jpeg/jpeg_mem.h @@ -61,6 +61,17 @@ struct UncompressFlags { // // Setting this has a quality/speed trade-off implication. J_DCT_METHOD dct_method = JDCT_DEFAULT; + + // Settings of crop window before decompression. + bool crop = false; + // Vertical coordinate of the top-left corner of the result in the input. + int crop_x = 0; + // Horizontal coordinate of the top-left corner of the result in the input. + int crop_y = 0; + // Width of the output image. + int crop_width = 0; + // Height of the output image. 
+ int crop_height = 0; }; // Uncompress some raw JPEG data given by the pointer srcdata and the length diff --git a/tensorflow/core/lib/jpeg/jpeg_mem_unittest.cc b/tensorflow/core/lib/jpeg/jpeg_mem_unittest.cc index cc8646750e1..15266af1dbd 100644 --- a/tensorflow/core/lib/jpeg/jpeg_mem_unittest.cc +++ b/tensorflow/core/lib/jpeg/jpeg_mem_unittest.cc @@ -57,7 +57,7 @@ void ReadFileToStringOrDie(Env* env, const string& filename, string* output) { void TestJPEG(Env* env, const string& jpegfile) { // Read the data from the jpeg file into memory string jpeg; - ReadFileToStringOrDie(Env::Default(), jpegfile, &jpeg); + ReadFileToStringOrDie(env, jpegfile, &jpeg); const int fsize = jpeg.size(); const uint8* const temp = bit_cast(jpeg.data()); @@ -95,6 +95,194 @@ TEST(JpegMemTest, Jpeg) { TestJPEG(env, data_path + "jpeg_merge_test1_cmyk.jpg"); } +void TestCropAndDecodeJpeg(Env* env, const string& jpegfile, + const UncompressFlags& default_flags) { + // Read the data from the jpeg file into memory + string jpeg; + ReadFileToStringOrDie(env, jpegfile, &jpeg); + const int fsize = jpeg.size(); + auto temp = bit_cast(jpeg.data()); + + // Decode the whole image. + std::unique_ptr imgdata1; + int w1, h1, c1; + { + UncompressFlags flags = default_flags; + if (flags.stride == 0) { + imgdata1.reset(Uncompress(temp, fsize, flags, &w1, &h1, &c1, nullptr)); + } else { + // If stride is not zero, the default allocator would fail because it + // allocate w*h*c bytes, but the actual required bytes should be stride*h. + // Therefore, we provide a specialized allocator here. + uint8* buffer = nullptr; + imgdata1.reset(Uncompress(temp, fsize, flags, nullptr, + [&](int width, int height, int components) { + w1 = width; + h1 = height; + c1 = components; + buffer = new uint8[flags.stride * height]; + return buffer; + })); + } + ASSERT_NE(imgdata1, nullptr); + } + + auto check_crop_and_decode_func = [&](int crop_x, int crop_y, int crop_width, + int crop_height) { + std::unique_ptr imgdata2; + int w, h, c; + UncompressFlags flags = default_flags; + flags.crop = true; + flags.crop_x = crop_x; + flags.crop_y = crop_y; + flags.crop_width = crop_width; + flags.crop_height = crop_height; + if (flags.stride == 0) { + imgdata2.reset(Uncompress(temp, fsize, flags, &w, &h, &c, nullptr)); + } else { + uint8* buffer = nullptr; + imgdata2.reset(Uncompress(temp, fsize, flags, nullptr, + [&](int width, int height, int components) { + w = width; + h = height; + c = components; + buffer = new uint8[flags.stride * height]; + return buffer; + })); + } + ASSERT_NE(imgdata2, nullptr); + + ASSERT_EQ(w, crop_width); + ASSERT_EQ(h, crop_height); + ASSERT_EQ(c, c1); + + const int stride1 = (flags.stride != 0) ? flags.stride : w1 * c; + const int stride2 = (flags.stride != 0) ? flags.stride : w * c; + for (int i = 0; i < crop_height; i++) { + const uint8* p1 = &imgdata1[(i + crop_y) * stride1 + crop_x * c]; + const uint8* p2 = &imgdata2[i * stride2]; + + for (int j = 0; j < c * w; j++) { + ASSERT_EQ(p1[j], p2[j]) + << "p1 != p2 in [" << i << "][" << j / 3 << "][" << j % 3 << "]"; + } + } + }; + + // Check different crop windows. 
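The new header fields and the test helper above show the intended call pattern; condensed into a minimal sketch (the function name and the fixed 100×80 window are invented for the example, and the `tensorflow::jpeg` namespace is assumed from the rest of this library):

```cpp
// Decode only a crop window of a JPEG held in memory. Uncompress() returns
// nullptr if the window is invalid or decoding fails.
#include <memory>
#include "tensorflow/core/lib/jpeg/jpeg_mem.h"

namespace tensorflow {

std::unique_ptr<uint8[]> DecodeTopLeftCrop(const string& jpeg_bytes, int* w,
                                           int* h, int* c) {
  jpeg::UncompressFlags flags;
  flags.components = 3;    // decode to RGB
  flags.crop = true;       // enable cropped decoding
  flags.crop_x = 0;        // top-left corner of the window in the source image
  flags.crop_y = 0;
  flags.crop_width = 100;  // the window must lie fully inside the source image
  flags.crop_height = 80;

  std::unique_ptr<uint8[]> out(jpeg::Uncompress(
      jpeg_bytes.data(), jpeg_bytes.size(), flags, w, h, c,
      /*nwarn=*/nullptr));
  // On success, *w == flags.crop_width and *h == flags.crop_height.
  return out;
}

}  // namespace tensorflow
```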
+ check_crop_and_decode_func(0, 0, 5, 5); + check_crop_and_decode_func(0, 0, w1, 5); + check_crop_and_decode_func(0, 0, 5, h1); + check_crop_and_decode_func(0, 0, w1, h1); + check_crop_and_decode_func(w1 - 5, h1 - 6, 5, 6); + check_crop_and_decode_func(5, 6, 10, 15); +} + +TEST(JpegMemTest, CropAndDecodeJpeg) { + Env* env = Env::Default(); + const string data_path = kTestData; + UncompressFlags flags; + + // Test basic flags for jpeg and cmyk jpeg. + TestCropAndDecodeJpeg(env, data_path + "jpeg_merge_test1.jpg", flags); + TestCropAndDecodeJpeg(env, data_path + "jpeg_merge_test1_cmyk.jpg", flags); +} + +TEST(JpegMemTest, CropAndDecodeJpegWithRatio) { + Env* env = Env::Default(); + const string data_path = kTestData; + UncompressFlags flags; + for (int ratio : {1, 2, 4, 8}) { + flags.ratio = ratio; + TestCropAndDecodeJpeg(env, data_path + "jpeg_merge_test1.jpg", flags); + } +} + +TEST(JpegMemTest, CropAndDecodeJpegWithComponents) { + Env* env = Env::Default(); + const string data_path = kTestData; + UncompressFlags flags; + for (const int components : {0, 1, 3}) { + flags.components = components; + TestCropAndDecodeJpeg(env, data_path + "jpeg_merge_test1.jpg", flags); + } +} + +TEST(JpegMemTest, CropAndDecodeJpegWithUpScaling) { + Env* env = Env::Default(); + const string data_path = kTestData; + UncompressFlags flags; + flags.fancy_upscaling = true; + TestCropAndDecodeJpeg(env, data_path + "jpeg_merge_test1.jpg", flags); +} + +TEST(JpegMemTest, CropAndDecodeJpegWithStride) { + Env* env = Env::Default(); + const string data_path = kTestData; + + // Read the data from the jpeg file into memory + string jpeg; + ReadFileToStringOrDie(env, data_path + "jpeg_merge_test1.jpg", &jpeg); + const int fsize = jpeg.size(); + auto temp = bit_cast(jpeg.data()); + + int w, h, c; + ASSERT_TRUE(GetImageInfo(temp, fsize, &w, &h, &c)); + + // stride must be either 0 or > w*c; otherwise, uncompress would fail. + UncompressFlags flags; + flags.stride = w * c; + TestCropAndDecodeJpeg(env, data_path + "jpeg_merge_test1.jpg", flags); + flags.stride = w * c * 3; + TestCropAndDecodeJpeg(env, data_path + "jpeg_merge_test1.jpg", flags); + flags.stride = w * c + 100; + TestCropAndDecodeJpeg(env, data_path + "jpeg_merge_test1.jpg", flags); +} + +void CheckInvalidCropWindowFailed(const uint8* const temp, int fsize, int x, + int y, int w, int h) { + std::unique_ptr imgdata; + int ww, hh, cc; + UncompressFlags flags; + flags.components = 3; + flags.crop = true; + flags.crop_x = x; + flags.crop_y = y; + flags.crop_width = w; + flags.crop_height = h; + imgdata.reset(Uncompress(temp, fsize, flags, &ww, &hh, &cc, nullptr)); + CHECK(imgdata == nullptr); +} + +TEST(JpegMemTest, CropAndDecodeJpegWithInvalidCropWindow) { + Env* env = Env::Default(); + const string data_path = kTestData; + + // Read the data from the jpeg file into memory + string jpeg; + ReadFileToStringOrDie(env, data_path + "jpeg_merge_test1.jpg", &jpeg); + const int fsize = jpeg.size(); + auto temp = bit_cast(jpeg.data()); + + int w, h, c; + ASSERT_TRUE(GetImageInfo(temp, fsize, &w, &h, &c)); + + // Width and height for the crop window must be non zero. + CheckInvalidCropWindowFailed(temp, fsize, 11, 11, /*w=*/0, 11); + CheckInvalidCropWindowFailed(temp, fsize, 11, 11, 11, /*h=*/0); + + // Crop window must be non negative. 
+ CheckInvalidCropWindowFailed(temp, fsize, /*x=*/-1, 11, 11, 11); + CheckInvalidCropWindowFailed(temp, fsize, 11, /*y=*/-1, 11, 11); + CheckInvalidCropWindowFailed(temp, fsize, 11, 11, /*w=*/-1, 11); + CheckInvalidCropWindowFailed(temp, fsize, 11, 11, 11, /*h=*/-1); + + // Invalid crop window width: x + crop_width = w + 1 > w + CheckInvalidCropWindowFailed(temp, fsize, /*x=*/w - 10, 11, 11, 11); + // Invalid crop window height: y + crop_height= h + 1 > h + CheckInvalidCropWindowFailed(temp, fsize, 11, /*y=*/h - 10, 11, 11); +} + TEST(JpegMemTest, Jpeg2) { // create known data, for size in_w x in_h const int in_w = 256; diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt index ad290d123e5..22d4a0056f8 100644 --- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt @@ -10467,8 +10467,8 @@ op { type_list_attr: "Treduce_func_other_arguments" } input_arg { - name: "window_size" - type: DT_INT64 + name: "window_size_func_other_arguments" + type_list_attr: "Twindow_size_func_other_arguments" } output_arg { name: "handle" @@ -10482,6 +10482,10 @@ op { name: "reduce_func" type: "func" } + attr { + name: "window_size_func" + type: "func" + } attr { name: "Tkey_func_other_arguments" type: "list(type)" @@ -10492,6 +10496,11 @@ op { type: "list(type)" has_minimum: true } + attr { + name: "Twindow_size_func_other_arguments" + type: "list(type)" + has_minimum: true + } attr { name: "output_types" type: "list(type)" diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc index f6bd5768d7c..37d9a737e29 100644 --- a/tensorflow/core/ops/dataset_ops.cc +++ b/tensorflow/core/ops/dataset_ops.cc @@ -237,12 +237,15 @@ REGISTER_OP("GroupByWindowDataset") .Input("input_dataset: resource") .Input("key_func_other_arguments: Tkey_func_other_arguments") .Input("reduce_func_other_arguments: Treduce_func_other_arguments") - .Input("window_size: int64") + .Input( + "window_size_func_other_arguments: Twindow_size_func_other_arguments") .Output("handle: resource") .Attr("key_func: func") .Attr("reduce_func: func") + .Attr("window_size_func: func") .Attr("Tkey_func_other_arguments: list(type) >= 0") .Attr("Treduce_func_other_arguments: list(type) >= 0") + .Attr("Twindow_size_func_other_arguments: list(type) >= 0") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") .SetShapeFn(shape_inference::ScalarShape) diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index 6ff05bd2a62..6eb05874aa5 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -499,7 +499,7 @@ Returns x + y element-wise. )doc"); REGISTER_OP("Sub") - .BINARY_FEWER() + .BINARY_MORE() .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn) .Doc(R"doc( Returns x - y element-wise. diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index 0a96258dd1f..8a2d5e8c05a 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -1945,7 +1945,7 @@ Computes softsign gradients for a softsign operation. gradients: The backpropagated gradients to the corresponding softsign operation. features: The features passed as input to the corresponding softsign operation. -backprops: The gradients: `gradients / (1 + abs(-features)) ** 2`. +backprops: The gradients: `gradients / (1 + abs(features)) ** 2`. 
)doc"); // -------------------------------------------------------------------------- @@ -2791,7 +2791,9 @@ REGISTER_OP("_MklConv2D") .Input("mkl_input: uint8") .Input("mkl_filter: uint8") .Output("output: T") + .Output("filter_output: T") .Output("mkl_output: uint8") + .Output("mkl_filter_output: uint8") .Attr("T: {half, float, double}") .Attr("strides: list(int)") .Attr("use_cudnn_on_gpu: bool = true") @@ -2813,7 +2815,9 @@ REGISTER_OP("_MklConv2DWithBias") .Input("mkl_filter: uint8") .Input("mkl_bias: uint8") .Output("output: T") + .Output("filter_output: T") .Output("mkl_output: uint8") + .Output("mkl_filter_output: uint8") .Attr("T: {half, float, double}") .Attr("strides: list(int)") .Attr("use_cudnn_on_gpu: bool = true") diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 13356e1d8a6..35c31c6cb81 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -9611,8 +9611,8 @@ op { type_list_attr: "Treduce_func_other_arguments" } input_arg { - name: "window_size" - type: DT_INT64 + name: "window_size_func_other_arguments" + type_list_attr: "Twindow_size_func_other_arguments" } output_arg { name: "handle" @@ -9627,6 +9627,10 @@ op { name: "reduce_func" type: "func" } + attr { + name: "window_size_func" + type: "func" + } attr { name: "Tkey_func_other_arguments" type: "list(type)" @@ -9637,6 +9641,11 @@ op { type: "list(type)" has_minimum: true } + attr { + name: "Twindow_size_func_other_arguments" + type: "list(type)" + has_minimum: true + } attr { name: "output_types" type: "list(type)" @@ -15795,6 +15804,25 @@ op { } summary: "Transforms a serialized tensorflow.TensorProto proto into a Tensor." } +op { + name: "SerializeTensor" + input_arg { + name: "tensor" + description: "A Tensor of type `T`." + type: "T" + } + output_arg { + name: "serialized" + description: "A serialized TensorProto proto of the input tensor." + type_attr: DT_STRING + } + attr { + name: "T" + type: "type" + description: "The type of the input tensor." + } + summary: "Transforms a Tensor into a serialized TensorProto proto." +} op { name: "Placeholder" output_arg { @@ -24677,7 +24705,7 @@ op { } output_arg { name: "backprops" - description: "The gradients: `gradients / (1 + abs(-features)) ** 2`." + description: "The gradients: `gradients / (1 + abs(features)) ** 2`." type_attr: "T" } attr { diff --git a/tensorflow/core/ops/parsing_ops.cc b/tensorflow/core/ops/parsing_ops.cc index 2e605fdffcf..1f7ebe91cf0 100644 --- a/tensorflow/core/ops/parsing_ops.cc +++ b/tensorflow/core/ops/parsing_ops.cc @@ -292,6 +292,19 @@ out_type: The type of the serialized tensor. The provided type must match the output: A Tensor of type `out_type`. )doc"); +REGISTER_OP("SerializeTensor") + .Input("tensor: T") + .Output("serialized: string") + .Attr("T: type") + .SetShapeFn(shape_inference::ScalarShape) + .Doc(R"doc( +Transforms a Tensor into a serialized TensorProto proto. + +tensor: A Tensor of type `T`. +T: The type of the input tensor. +serialized: A serialized TensorProto proto of the input tensor. +)doc"); + REGISTER_OP("DecodeJSONExample") .Input("json_examples: string") .Output("binary_examples: string") diff --git a/tensorflow/core/ops/summary_ops.cc b/tensorflow/core/ops/summary_ops.cc new file mode 100644 index 00000000000..f778b487972 --- /dev/null +++ b/tensorflow/core/ops/summary_ops.cc @@ -0,0 +1,218 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); + +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +REGISTER_OP("SummaryWriter") + .Output("writer: resource") + .Attr("shared_name: string = ''") + .Attr("container: string = ''") + .SetShapeFn(shape_inference::ScalarShape) + .Doc(R"doc( +Returns a handle to be used to access a summary writer. + +The summary writer is an in-graph resource which can be used by ops to write +summaries to event files. + +writer: the summary writer resource. Scalar handle. +)doc"); + +REGISTER_OP("CreateSummaryFileWriter") + .Input("writer: resource") + .Input("logdir: string") + .Input("max_queue: int32") + .Input("flush_millis: int32") + .Input("filename_suffix: string") + .Doc(R"doc( +Creates a summary file writer accessible by the given resource handle. + +writer: A handle to the summary writer resource +logdir: Directory where the event file will be written. +max_queue: Size of the queue of pending events and summaries. +flush_millis: How often, in milliseconds, to flush the pending events and + summaries to disk. +filename_suffix: Every event file's name is suffixed with this suffix. +)doc"); + +REGISTER_OP("FlushSummaryWriter") + .Input("writer: resource") + .SetShapeFn(shape_inference::NoOutputs) + .Doc(R"( +Flushes the writer's unwritten events. + +writer: A handle to the summary writer resource. +)"); + +REGISTER_OP("CloseSummaryWriter") + .Input("writer: resource") + .SetShapeFn(shape_inference::NoOutputs) + .Doc(R"( +Flushes and closes the summary writer. + +Also removes it from the resource manager. To reopen, use another +CreateSummaryFileWriter op. + +writer: A handle to the summary writer resource. +)"); + +REGISTER_OP("WriteSummary") + .Input("writer: resource") + .Input("global_step: int64") + .Input("tensor: T") + .Input("tag: string") + .Input("summary_metadata: string") + .Attr("T: type") + .SetShapeFn(shape_inference::NoOutputs) + .Doc(R"doc( +Outputs a `Summary` protocol buffer with a tensor. + +writer: A handle to a summary writer. +global_step: The step to write the summary for. +tensor: A tensor to serialize. +tag: The summary's tag. +summary_metadata: Serialized SummaryMetadata protocol buffer containing + plugin-related metadata for this summary. +)doc"); + +REGISTER_OP("WriteScalarSummary") + .Input("writer: resource") + .Input("global_step: int64") + .Input("tag: string") + .Input("value: T") + .Attr("T: realnumbertype") + .SetShapeFn(shape_inference::NoOutputs) + .Doc(R"doc( +Writes a `Summary` protocol buffer with scalar values. + +The input `tag` and `value` must have the scalars. + +writer: A handle to a summary writer. +global_step: The step to write the summary for. +tag: Tag for the summary. +value: Value for the summary. 
+)doc"); + +REGISTER_OP("WriteHistogramSummary") + .Input("writer: resource") + .Input("global_step: int64") + .Input("tag: string") + .Input("values: T") + .Attr("T: realnumbertype = DT_FLOAT") + .SetShapeFn(shape_inference::NoOutputs) + .Doc(R"doc( +Writes a `Summary` protocol buffer with a histogram. + +The generated +[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) +has one summary value containing a histogram for `values`. + +This op reports an `InvalidArgument` error if any value is not finite. + +writer: A handle to a summary writer. +global_step: The step to write the summary for. +tag: Scalar. Tag to use for the `Summary.Value`. +values: Any shape. Values to use to build the histogram. +)doc"); + +REGISTER_OP("WriteImageSummary") + .Input("writer: resource") + .Input("global_step: int64") + .Input("tag: string") + .Input("tensor: T") + .Input("bad_color: uint8") + .Attr("max_images: int >= 1 = 3") + .Attr("T: {uint8, float, half} = DT_FLOAT") + .SetShapeFn(shape_inference::NoOutputs) + .Doc(R"doc( +Writes a `Summary` protocol buffer with images. + +The summary has up to `max_images` summary values containing images. The +images are built from `tensor` which must be 4-D with shape `[batch_size, +height, width, channels]` and where `channels` can be: + +* 1: `tensor` is interpreted as Grayscale. +* 3: `tensor` is interpreted as RGB. +* 4: `tensor` is interpreted as RGBA. + +The images have the same number of channels as the input tensor. For float +input, the values are normalized one image at a time to fit in the range +`[0, 255]`. `uint8` values are unchanged. The op uses two different +normalization algorithms: + +* If the input values are all positive, they are rescaled so the largest one + is 255. + +* If any input value is negative, the values are shifted so input value 0.0 + is at 127. They are then rescaled so that either the smallest value is 0, + or the largest one is 255. + +The `tag` argument is a scalar `Tensor` of type `string`. It is used to +build the `tag` of the summary values: + +* If `max_images` is 1, the summary value tag is '*tag*/image'. +* If `max_images` is greater than 1, the summary value tags are + generated sequentially as '*tag*/image/0', '*tag*/image/1', etc. + +The `bad_color` argument is the color to use in the generated images for +non-finite input values. It is a `unit8` 1-D tensor of length `channels`. +Each element must be in the range `[0, 255]` (It represents the value of a +pixel in the output image). Non-finite values in the input tensor are +replaced by this tensor in the output image. The default value is the color +red. + +writer: A handle to a summary writer. +global_step: The step to write the summary for. +tag: Scalar. Used to build the `tag` attribute of the summary values. +tensor: 4-D of shape `[batch_size, height, width, channels]` where + `channels` is 1, 3, or 4. +max_images: Max number of batch elements to generate images for. +bad_color: Color to use for pixels with non-finite values. +)doc"); + +REGISTER_OP("WriteAudioSummary") + .Input("writer: resource") + .Input("global_step: int64") + .Input("tag: string") + .Input("tensor: float") + .Input("sample_rate: float") + .Attr("max_outputs: int >= 1 = 3") + .SetShapeFn(shape_inference::NoOutputs) + .Doc(R"doc( +Writes a `Summary` protocol buffer with audio. + +The summary has up to `max_outputs` summary values containing audio. 
The +audio is built from `tensor` which must be 3-D with shape `[batch_size, +frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are +assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`. + +The `tag` argument is a scalar `Tensor` of type `string`. It is used to +build the `tag` of the summary values: + +* If `max_outputs` is 1, the summary value tag is '*tag*/audio'. +* If `max_outputs` is greater than 1, the summary value tags are + generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc. + +writer: A handle to a summary writer. +global_step: The step to write the summary for. +tag: Scalar. Used to build the `tag` attribute of the summary values. +tensor: 2-D of shape `[batch_size, frames]`. +sample_rate: The sample rate of the signal in hertz. +max_outputs: Max number of batch elements to generate audio for. +)doc"); + +} // namespace tensorflow diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index 126558cac38..e1ad66c387a 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -75,6 +75,9 @@ def tf_proto_library_py(name, srcs=[], protodeps=[], deps=[], visibility=[], def tf_jspb_proto_library(**kwargs): pass +def tf_nano_proto_library(**kwargs): + pass + def tf_proto_library(name, srcs = [], has_services = None, protodeps = [], visibility = [], testonly = 0, cc_libs = [], diff --git a/tensorflow/core/platform/default/logging.cc b/tensorflow/core/platform/default/logging.cc index ac0988e7047..ebdd4b624aa 100644 --- a/tensorflow/core/platform/default/logging.cc +++ b/tensorflow/core/platform/default/logging.cc @@ -14,7 +14,6 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/platform/default/logging.h" -#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/platform/env_time.h" #include "tensorflow/core/platform/macros.h" @@ -25,12 +24,8 @@ limitations under the License. #endif #include -#include #include -#include -#include - namespace tensorflow { namespace internal { @@ -88,11 +83,11 @@ void LogMessage::GenerateLogMessage() { const size_t time_buffer_size = 30; char time_buffer[time_buffer_size]; strftime(time_buffer, time_buffer_size, "%Y-%m-%d %H:%M:%S", - localtime(&now_seconds)); + localtime(&now_seconds)); // TODO(jeff,sanjay): Replace this with something that logs through the env. fprintf(stderr, "%s.%06d: %c %s:%d] %s\n", time_buffer, micros_remainder, - "IWEF"[severity_], fname_, line_, str().c_str()); + "IWEF"[severity_], fname_, line_, str().c_str()); } #endif @@ -129,48 +124,6 @@ int64 MinVLogLevelFromEnv() { return LogLevelStrToInt(tf_env_var_val); } -using VmoduleMap = std::unordered_map; - -// Returns a mapping from module name to VLOG level, derived from the -// TF_CPP_VMOUDLE environment variable; ownership is transferred to the caller. -VmoduleMap* VmoduleRecordsFromEnv() { - // The value of the env var is supposed to be of the form: - // "foo=1,bar=2,baz=3" - const char* tf_env_var_val = getenv("TF_CPP_VMODULE"); - auto* result = new VmoduleMap(); - if (tf_env_var_val == nullptr) return result; - while (true) { - const char* eq = strchr(tf_env_var_val, '='); - if (eq == nullptr) break; - const char* after_eq = eq + 1; - - // Comma either points at the next comma delimiter, or at a null terminator. - // We check that the integer we parse ends at this delimiter. 
- const char* comma = strchr(after_eq, ','); - const char* new_tf_env_var_val; - if (comma == nullptr) { - comma = strchr(after_eq, '\0'); - new_tf_env_var_val = comma; - } else { - new_tf_env_var_val = comma + 1; - } - - char* endptr = nullptr; - int level = strtol(after_eq, &endptr, 10); - if (endptr != comma) { - fprintf(stderr, - "warning: could not parse integer in vmodule specification in " - "\"%s\".\n", - after_eq); - break; - } - StringPiece module(tf_env_var_val, eq - tf_env_var_val); - tf_env_var_val = new_tf_env_var_val; - (*result)[module] = level; - } - return result; -} - } // namespace LogMessage::~LogMessage() { @@ -184,19 +137,6 @@ int64 LogMessage::MinVLogLevel() { return min_vlog_level; } -bool LogMessage::VmoduleActivated(const char* fname, int lvl) { - static VmoduleMap* vmodule_records = VmoduleRecordsFromEnv(); - const char* last_slash = strrchr(fname, '/'); - const char* module_start = last_slash == nullptr ? fname : last_slash + 1; - const char* dot_after = strchr(module_start, '.'); - const char* module_limit = - dot_after == nullptr ? strchr(fname, '\0') : dot_after; - StringPiece module(module_start, module_limit - module_start); - auto it = vmodule_records->find(module); - if (it == vmodule_records->end()) return false; - return it->second >= lvl; -} - LogMessageFatal::LogMessageFatal(const char* file, int line) : LogMessage(file, line, FATAL) {} LogMessageFatal::~LogMessageFatal() { diff --git a/tensorflow/core/platform/default/logging.h b/tensorflow/core/platform/default/logging.h index c8c9b2da11a..04ff9e12b6f 100644 --- a/tensorflow/core/platform/default/logging.h +++ b/tensorflow/core/platform/default/logging.h @@ -46,16 +46,6 @@ class LogMessage : public std::basic_ostringstream { // but VLOG(3) will not. Defaults to 0. static int64 MinVLogLevel(); - // Returns whether VLOG level lvl is activated for the file fname. - // - // E.g. if the environment variable TF_CPP_VMODULE contains foo=3 and fname is - // foo.cc and lvl is <= 3, this will return true. - // - // It is expected that the result of this query will be cached in the VLOG-ing - // call site to avoid repeated lookups. This routine performs a hash-map - // access against the VLOG-ing specification provided by the env var. - static bool VmoduleActivated(const char* fname, int lvl); - protected: void GenerateLogMessage(); @@ -86,38 +76,18 @@ class LogMessageFatal : public LogMessage { #define LOG(severity) _TF_LOG_##severity -#if defined(IS_MOBILE_PLATFORM) - +#ifdef IS_MOBILE_PLATFORM // Turn VLOG off when under mobile devices for considerations of binary size. -#define _VLOG_IS_ON(lvl, file) ((lvl) <= 0) - -#elif defined(PLATFORM_WINDOWS) - -// TODO(b/64279502) The _VLOG_IS_ON definition below appears to cause MSVC to -// fatal error, so we fall back to the vmodule-less implementation for now. -#define _VLOG_IS_ON(lvl, file) \ - ((lvl) <= ::tensorflow::internal::LogMessage::MinVLogLevel()) - +#define VLOG_IS_ON(lvl) ((lvl) <= 0) #else - -// Otherwise, set TF_CPP_MIN_VLOG_LEVEL environment to update minimum log level -// of VLOG, or TF_CPP_VMODULE to set the minimum log level for individual -// translation units. 
-#define _VLOG_IS_ON(lvl, file) \ - (([](int level, const char* fname) { \ - if (level <= ::tensorflow::internal::LogMessage::MinVLogLevel()) \ - return true; \ - static bool vmodule_activated = \ - ::tensorflow::internal::LogMessage::VmoduleActivated(fname, level); \ - return vmodule_activated; \ - })(lvl, file)) - +// Otherwise, Set TF_CPP_MIN_VLOG_LEVEL environment to update minimum log level +// of VLOG +#define VLOG_IS_ON(lvl) \ + ((lvl) <= ::tensorflow::internal::LogMessage::MinVLogLevel()) #endif -#define VLOG_IS_ON(lvl) _VLOG_IS_ON(lvl, __FILE__) - -#define VLOG(lvl) \ - if (TF_PREDICT_FALSE(_VLOG_IS_ON(lvl, __FILE__))) \ +#define VLOG(lvl) \ + if (TF_PREDICT_FALSE(VLOG_IS_ON(lvl))) \ ::tensorflow::internal::LogMessage(__FILE__, __LINE__, tensorflow::INFO) // CHECK dies with a fatal error if condition is not true. It is *not* diff --git a/tensorflow/core/platform/vmodule_benchmark_test.cc b/tensorflow/core/platform/vmodule_benchmark_test.cc deleted file mode 100644 index 0f9e75bf9cd..00000000000 --- a/tensorflow/core/platform/vmodule_benchmark_test.cc +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/test_benchmark.h" - -namespace tensorflow { - -static void BM_DisabledVlog(int iters) { - for (int i = 0; i < iters; ++i) { - VLOG(1) << "Testing VLOG(1)!"; - } -} -BENCHMARK(BM_DisabledVlog); - -} // namespace tensorflow diff --git a/tensorflow/core/platform/vmodule_test.cc b/tensorflow/core/platform/vmodule_test.cc deleted file mode 100644 index 47b4b2e0e78..00000000000 --- a/tensorflow/core/platform/vmodule_test.cc +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Test that popens a child process with the VLOG-ing environment variable set -// for the logging framework, and observes VLOG_IS_ON and VLOG macro output. 
- -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/platform.h" -#include "tensorflow/core/platform/test.h" - -#include - -namespace tensorflow { -namespace { - -int RealMain(const char* argv0, bool do_vlog) { - if (do_vlog) { -#if !defined(PLATFORM_GOOGLE) - // Note, we only test this when !defined(PLATFORM_GOOGLE) because - // VmoduleActivated doesn't exist in that implementation. - // - // Also, we call this internal API to simulate what would happen if - // differently-named translation units attempted to VLOG, so we don't need - // to create dummy translation unit files. - bool ok = internal::LogMessage::VmoduleActivated("vmodule_test.cc", 7) && - internal::LogMessage::VmoduleActivated("shoobadooba.h", 3); - if (!ok) { - fprintf(stderr, "vmodule activated levels not as expected.\n"); - return EXIT_FAILURE; - } -#endif - - // Print info on which VLOG levels are activated. - fprintf(stderr, "VLOG_IS_ON(8)? %d\n", VLOG_IS_ON(8)); - fprintf(stderr, "VLOG_IS_ON(7)? %d\n", VLOG_IS_ON(7)); - fprintf(stderr, "VLOG_IS_ON(6)? %d\n", VLOG_IS_ON(6)); - // Do some VLOG-ing. - VLOG(8) << "VLOG(8)"; - VLOG(7) << "VLOG(7)"; - VLOG(6) << "VLOG(6)"; - LOG(INFO) << "INFO"; - return EXIT_SUCCESS; - } - - // Popen the child process. - std::string command = std::string(argv0); -#if defined(PLATFORM_GOOGLE) - command = command + " do_vlog --vmodule=vmodule_test=7 --alsologtostderr"; -#else - command = - "TF_CPP_VMODULE=vmodule_test=7,shoobadooba=3 " + command + " do_vlog"; -#endif - command += " 2>&1"; - fprintf(stderr, "Running: \"%s\"\n", command.c_str()); - FILE* f = popen(command.c_str(), "r"); - if (f == nullptr) { - fprintf(stderr, "Failed to popen child: %s\n", strerror(errno)); - return EXIT_FAILURE; - } - - // Read data from the child's stdout. - constexpr int kBufferSizeBytes = 4096; - char buffer[kBufferSizeBytes]; - size_t result = fread(buffer, sizeof(buffer[0]), kBufferSizeBytes - 1, f); - if (result == 0) { - fprintf(stderr, "Failed to read from child stdout: %zu %s\n", result, - strerror(errno)); - return EXIT_FAILURE; - } - buffer[result] = '\0'; - int status = pclose(f); - if (status == -1) { - fprintf(stderr, "Failed to close popen child: %s\n", strerror(errno)); - return EXIT_FAILURE; - } - - // Check output is as expected. - const char kExpected[] = - "VLOG_IS_ON(8)? 0\nVLOG_IS_ON(7)? 1\nVLOG_IS_ON(6)? 1\n"; - if (strstr(buffer, kExpected) == nullptr) { - fprintf(stderr, "error: unexpected output from child: \"%.*s\"\n", - kBufferSizeBytes, buffer); - return EXIT_FAILURE; - } - bool ok = strstr(buffer, "VLOG(7)\n") != nullptr && - strstr(buffer, "VLOG(6)\n") != nullptr && - strstr(buffer, "VLOG(8)\n") == nullptr; - if (!ok) { - fprintf(stderr, "error: VLOG output not as expected: \"%.*s\"\n", - kBufferSizeBytes, buffer); - return EXIT_FAILURE; - } - - // Success! - return EXIT_SUCCESS; -} - -} // namespace -} // namespace tensorflow - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - bool do_vlog = argc >= 2 && strcmp(argv[1], "do_vlog") == 0; - return tensorflow::RealMain(argv[0], do_vlog); -} diff --git a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md index 5c50a86c88f..f0d4dafd3ea 100644 --- a/tensorflow/core/profiler/README.md +++ b/tensorflow/core/profiler/README.md @@ -56,7 +56,7 @@ with tf.contrib.tfprof.ProfileContext() as pctx: ```shell # Profiling from Python API is not interactive. -# Dump the profiles to files and profile with interactive command line. 
+# Dump the profiles to files and profile with interactive command line or web UI. with tf.contrib.tfprof.ProfileContext() as pctx: pctx.add_auto_profile_dump('/tmp/profiles', [100]) train_loop() @@ -66,7 +66,15 @@ bazel-bin/tensorflow/core/profiler/profiler \ --run_meta_path=/tmp/profiles/run_meta \ --op_log_path=/tmp/profiles/tfprof_log \ tfprof> op -select micros,bytes,occurrence -order_by micros + + +# To be open sourced... +bazel-bin/third_party/tensorflow/python/profiler/profiler_ui \ + --graph_path=/tmp/profiles/graph.pbtxt \ + --run_meta_path=/tmp/profiles/run_meta \ + --op_log_path=/tmp/profiles/tfprof_log \ ``` +![ProfilerUI](g3doc/profiler_ui.jpg) Detail Tutorials @@ -239,5 +247,6 @@ bug fix. `OpLogProto` is a good plus if it is used. #### Teams * Xin Pan (xpan@google.com, github: panyx0718) +* Chris Antaki * Yao Zhang * Jon Shlens diff --git a/tensorflow/core/profiler/g3doc/advise.md b/tensorflow/core/profiler/g3doc/advise.md index d87b0d8603d..d0de8317f69 100644 --- a/tensorflow/core/profiler/g3doc/advise.md +++ b/tensorflow/core/profiler/g3doc/advise.md @@ -86,7 +86,7 @@ For example: * Checks RecvTensor RPC latency and bandwidth. * Checks CPU/Memory utilization of the job. -####AcceleratorUtilization Checker +#### AcceleratorUtilization Checker * Checks what percentage of time the accelerator spends on computation. #### OperationChecker @@ -100,7 +100,7 @@ For example: * Checks the most expensive graph nodes. * Checks the most expensive graph-building Python codes. -####Contribute Your Checker +#### Contribute Your Checker Follow examples of accelerator_utilization_checker.h diff --git a/tensorflow/core/profiler/g3doc/command_line.md b/tensorflow/core/profiler/g3doc/command_line.md index 857b5e64590..fb4207c7841 100644 --- a/tensorflow/core/profiler/g3doc/command_line.md +++ b/tensorflow/core/profiler/g3doc/command_line.md @@ -51,13 +51,13 @@ It defines _checkpoint_variable op type. It also provides checkpointed tensors' Note: this feature is not well maintained now. -###Start `tfprof` +### Start `tfprof` #### Build `tfprof` ```shell # Build the tool. -bazel build --config opt third_party/tensorflow/core/profiler/... +bazel build --config opt tensorflow/core/profiler:profiler # Help information, including detail 'option' instructions. bazel-bin/tensorflow/core/profiler/profiler help @@ -140,9 +140,9 @@ tfprof> -output ``` -###Examples +### Examples -####Profile Python Time +#### Profile Python Time ```shell # Requires --graph_path --op_log_path tfprof> code -max_depth 1000 -show_name_regexes .*model_analyzer.*py.* -select micros -account_type_regexes .* -order_by micros diff --git a/tensorflow/core/profiler/g3doc/options.md b/tensorflow/core/profiler/g3doc/options.md index 15712d04c25..ddee63ad42a 100644 --- a/tensorflow/core/profiler/g3doc/options.md +++ b/tensorflow/core/profiler/g3doc/options.md @@ -1,6 +1,6 @@ -##Options +## Options -###Overview +### Overview For all tfprof views, the profiles are processed with the following procedures @@ -35,7 +35,7 @@ For all tfprof views, the profiles are processed with the following procedures 4) Finally, the filtered data structure is output in a format depending on the `-output` option. -####Option Semantics In Different View +#### Option Semantics In Different View options usually have the same semantics in different views. However, some can vary. For example `-max_depth` in scope view means the depth of name scope tree. In op view, it means the length of operation list. 
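To make the view-dependent semantics concrete, here is a rough editorial sketch (not part of this change) of exercising `max_depth` from the tfprof Python API; the `ProfileOptionBuilder` helper and the exact option keys are assumptions about the Python interface, not something this patch adds:

```python
import tensorflow as tf

g = tf.Graph()
with g.as_default():
  x = tf.placeholder(tf.float32, [None, 784], name='x')
  w = tf.get_variable('w', [784, 10])
  logits = tf.matmul(x, w, name='logits')

# Start from a stock options dict and trim it; 'max_depth' and 'select' follow
# the option names documented on this page.
opts = tf.profiler.ProfileOptionBuilder.trainable_variables_parameter()
opts['select'] = ['params']
opts['max_depth'] = 3

# Same option, different meaning per view: in scope view it bounds the depth of
# the name-scope tree, in op view it bounds the length of the operation list.
tf.profiler.profile(g, cmd='scope', options=opts)
tf.profiler.profile(g, cmd='op', options=opts)
```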
@@ -68,7 +68,7 @@ output_bytes: The memory output by the operation. It's not necessarily requested by the current operation. For example, it can be a tensor forwarded from input to output, with in-place mutation. -###Docs +### Docs `-max_depth`: Show nodes that are at most this number of hops from starting node in the data structure. diff --git a/tensorflow/core/profiler/g3doc/profile_memory.md b/tensorflow/core/profiler/g3doc/profile_memory.md index a00683d0626..6eda5abdd97 100644 --- a/tensorflow/core/profiler/g3doc/profile_memory.md +++ b/tensorflow/core/profiler/g3doc/profile_memory.md @@ -1,4 +1,4 @@ -##Profile Memory +## Profile Memory It is generally a good idea to visualize the memory usage in timeline. It allows you to see the memory consumption of each GPU over time. diff --git a/tensorflow/core/profiler/g3doc/profile_model_architecture.md b/tensorflow/core/profiler/g3doc/profile_model_architecture.md index a42b2e918da..61bb66bd21b 100644 --- a/tensorflow/core/profiler/g3doc/profile_model_architecture.md +++ b/tensorflow/core/profiler/g3doc/profile_model_architecture.md @@ -1,9 +1,9 @@ -##Profile Model Architecture +## Profile Model Architecture * [Profile Model Parameters](#profile-model-parameters) * [Profile Model Float Operations](#profile-model-float-operations) -###Profile Model Parameters +### Profile Model Parameters Notes: `VariableV2` operation type might contain variables created by TensorFlow @@ -39,9 +39,9 @@ param_stats = tf.profiler.profile( sys.stdout.write('total_params: %d\n' % param_stats.total_parameters) ``` -###Profile Model Float Operations +### Profile Model Float Operations -####Caveats +#### Caveats For an operation to have float operation statistics: diff --git a/tensorflow/core/profiler/g3doc/profile_time.md b/tensorflow/core/profiler/g3doc/profile_time.md index e11a75553b2..4aafc697a9b 100644 --- a/tensorflow/core/profiler/g3doc/profile_time.md +++ b/tensorflow/core/profiler/g3doc/profile_time.md @@ -1,4 +1,4 @@ -##Profile Time +## Profile Time * [Times in TensorFlow and tfprof](#times-in-tensorflow-and-tfprof) * [Profile by Python Code](#profile-by-python-code) @@ -7,7 +7,7 @@ * [Profile by Name Scope](#profile-by-name-scope) -###Times in TensorFlow and tfprof +### Times in TensorFlow and tfprof When we run a model, Tensorflow schedules and runs the nodes (operations) in the graph. An operation can be placed on an accelerator or on CPU. @@ -37,7 +37,7 @@ When an operation is placed on CPU, it will completely run on CPU. Hence, should be 0. -###Profile by Python Code +### Profile by Python Code ```python # In code view, the time of each line of Python code is the aggregated # times of all operations created by that line. @@ -112,7 +112,7 @@ Set ```-output timeline:outfile=``` to generate timeline instead of st -###Profile by Operation Type +### Profile by Operation Type ```python # In op view, you can view the aggregated time of each operation type. tfprof> op -select micros,occurrence -order_by micros @@ -138,7 +138,7 @@ MatMul 618.97ms (63.56%, 16.51%), |/job:worker/replica:0/ ``` -###Profile by Graph +### Profile by Graph Usually, use graph view to generate a timeline to visualize the result. @@ -163,7 +163,7 @@ Open a Chrome browser, enter URL chrome://tracing and load the timeline file. ****************************************************** ``` -###Profile by Name Scope +### Profile by Name Scope Usually scope view allows you to pin point the problematic places if you have properly named your operations with tf.name_scope or tf.variable_scope. 
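As a small editorial sketch (not taken from this patch) of why naming matters here, the snippet below groups operations under name scopes so that scope view can aggregate time per logical block; the scope and tensor names are illustrative only:

```python
import tensorflow as tf

with tf.name_scope('input'):
  x = tf.placeholder(tf.float32, [None, 784], name='x')

with tf.variable_scope('hidden'):
  w = tf.get_variable('w', [784, 256])
  b = tf.get_variable('b', [256])
  h = tf.nn.relu(tf.matmul(x, w) + b, name='activation')

with tf.name_scope('output'):
  logits = tf.identity(h, name='logits')

# In scope view these operations show up grouped as input/..., hidden/... and
# output/..., so an expensive block can be pinned down by its scope name.
```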
diff --git a/tensorflow/core/profiler/g3doc/profiler_ui.jpg b/tensorflow/core/profiler/g3doc/profiler_ui.jpg new file mode 100644 index 00000000000..36aa94502a8 Binary files /dev/null and b/tensorflow/core/profiler/g3doc/profiler_ui.jpg differ diff --git a/tensorflow/core/util/permutation_input_iterator.h b/tensorflow/core/util/permutation_input_iterator.h new file mode 100644 index 00000000000..f6375b25157 --- /dev/null +++ b/tensorflow/core/util/permutation_input_iterator.h @@ -0,0 +1,134 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_UTIL_PERMUTATION_INPUT_ITERATOR_H_ +#define TENSORFLOW_UTIL_PERMUTATION_INPUT_ITERATOR_H_ + +#include +#include + +namespace tensorflow { + +template +class PermutationInputIterator { + public: + // Required iterator traits + typedef PermutationInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of + ///< subtracting one iterator from another + typedef ValueType + value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the + ///< iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the + ///< iterator can point to + + typedef std::random_access_iterator_tag + iterator_category; ///< The iterator category + + private: + InputIteratorT input_itr; + IndexIteratorT index_itr; + + public: + /// Constructor + __host__ __device__ __forceinline__ PermutationInputIterator( + InputIteratorT input_itr, ///< Input iterator to wrap + IndexIteratorT index_itr) ///< Conversion functor to wrap + : input_itr(input_itr), index_itr(index_itr) {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) { + self_type retval = *this; + index_itr++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() { + index_itr++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const { + return input_itr[*index_itr]; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const { + self_type retval(input_itr, index_itr + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) { + index_itr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const { + self_type retval(input_itr, index_itr - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) { + index_itr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type + operator-(self_type other) const { + return index_itr - other.index_itr; + } + + /// Array subscript 
+ template + __host__ __device__ __forceinline__ reference operator[](Distance n) const { + return input_itr[index_itr[n]]; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() { + return input_itr + *index_itr; + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) { + return (index_itr == rhs.index_itr && input_itr == rhs.input_itr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) { + return !(*this == rhs); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) { + return os; + } +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_UTIL_PERMUTATION_INPUT_ITERATOR_H_ diff --git a/tensorflow/core/util/tensor_slice_reader.h b/tensorflow/core/util/tensor_slice_reader.h index eeb31295737..5932d59a159 100644 --- a/tensorflow/core/util/tensor_slice_reader.h +++ b/tensorflow/core/util/tensor_slice_reader.h @@ -165,13 +165,18 @@ bool TensorSliceReader::CopySliceData(const string& name, CHECK_GE(idx, 0) << "Failed to find the index for filename " << fname; // We read a record in the corresponding sstable const string key = EncodeTensorNameSlice(name, slice_s); - CHECK(sss_[idx]->Get(key, &value)) - << "Failed to seek to the record for tensor " << name << ", slice " - << slice_s.DebugString() << ": computed key = " << key; + if (!sss_[idx]->Get(key, &value)) { + VLOG(1) << "Failed to seek to the record for tensor " << name + << ", slice " << slice_s.DebugString() + << ": computed key = " << key; + return false; + } SavedTensorSlices sts; - CHECK(ParseProtoUnlimited(&sts, value)) - << "Failed to parse the record for tensor " << name << ", slice " - << slice_s.DebugString() << ": computed key = " << key; + if (!ParseProtoUnlimited(&sts, value)) { + VLOG(1) << "Failed to parse the record for tensor " << name << ", slice " + << slice_s.DebugString() << ": computed key = " << key; + return false; + } CopyDataFromTensorSliceToTensorSlice( tss->shape(), slice_s, slice, checkpoint::TensorProtoData(sts.data().data()), data); diff --git a/tensorflow/core/util/transform_output_iterator.h b/tensorflow/core/util/transform_output_iterator.h new file mode 100644 index 00000000000..1640791ad17 --- /dev/null +++ b/tensorflow/core/util/transform_output_iterator.h @@ -0,0 +1,149 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_UTIL_TRANSFORM_OUTPUT_ITERATOR_H_ +#define TENSORFLOW_UTIL_TRANSFORM_OUTPUT_ITERATOR_H_ + +#include +#include + +namespace tensorflow { + +template +class TransformOutputIterator { + private: + // Proxy object + struct Reference { + StoreType* ptr; + ConversionOp conversion_op; + + /// Constructor + __host__ __device__ __forceinline__ Reference(StoreType* ptr, + ConversionOp conversion_op) + : ptr(ptr), conversion_op(conversion_op) {} + + /// Assignment + __host__ __device__ __forceinline__ InputType operator=(InputType val) { + *ptr = conversion_op(val); + return val; + } + }; + + public: + // Required iterator traits + typedef TransformOutputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of + ///< subtracting one iterator from another + typedef void + value_type; ///< The type of the element the iterator can point to + typedef void pointer; ///< The type of a pointer to an element the iterator + ///< can point to + typedef Reference reference; ///< The type of a reference to an element the + ///< iterator can point to + + typedef std::random_access_iterator_tag + iterator_category; ///< The iterator category + + /*private:*/ + + StoreType* ptr; + ConversionOp conversion_op; + + public: + /// Constructor + template + __host__ __device__ __forceinline__ TransformOutputIterator( + QualifiedStoreType* ptr, + ConversionOp conversionOp) ///< Native pointer to wrap + : ptr(ptr), conversion_op(conversionOp) {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) { + self_type retval = *this; + ptr++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() { + ptr++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const { + return Reference(ptr, conversion_op); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const { + self_type retval(ptr + n, conversion_op); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) { + ptr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const { + self_type retval(ptr - n, conversion_op); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) { + ptr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type + operator-(self_type other) const { + return ptr - other.ptr; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const { + return Reference(ptr + n, conversion_op); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) { + return (ptr == rhs.ptr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) { + return (ptr != rhs.ptr); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) { + return os; + } +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_UTIL_TRANSFORM_OUTPUT_ITERATOR_H_ diff --git a/tensorflow/docs_src/community/welcome.md b/tensorflow/docs_src/community/welcome.md index 194649a304d..4991783a53a 100644 --- 
a/tensorflow/docs_src/community/welcome.md +++ b/tensorflow/docs_src/community/welcome.md @@ -37,6 +37,7 @@ Asia: * [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) _(Korean language)_ * [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) _(Japanese Language)_ * [Soleil Data Dojo](https://soleildatadojo.connpass.com/) _(Japanese language)_ +* [TensorFlow User Group Utsunomiya](https://tfug-utsunomiya.connpass.com/) Europe: diff --git a/tensorflow/docs_src/get_started/estimator.md b/tensorflow/docs_src/get_started/estimator.md index a55454f8af3..4f3a438d17d 100644 --- a/tensorflow/docs_src/get_started/estimator.md +++ b/tensorflow/docs_src/get_started/estimator.md @@ -273,9 +273,7 @@ Then, the code creates a `DNNClassifier` model using the following arguments: containing 10, 20, and 10 neurons, respectively. * `n_classes=3`. Three target classes, representing the three Iris species. * `model_dir=/tmp/iris_model`. The directory in which TensorFlow will save - checkpoint data during model training. For more on logging and monitoring - with TensorFlow, see - @{$monitors$Logging and Monitoring Basics with tf.estimator}. + checkpoint data and TensorBoard summaries during model training. ## Describe the training input pipeline {#train-input} @@ -315,9 +313,7 @@ classifier.train(input_fn=train_input_fn, steps=1000) However, if you're looking to track the model while it trains, you'll likely want to instead use a TensorFlow @{tf.train.SessionRunHook$`SessionRunHook`} -to perform logging operations. See the tutorial -@{$monitors$Logging and Monitoring Basics with tf.estimator} -for more on this topic. +to perform logging operations. ## Evaluate Model Accuracy {#evaluate-accuracy} diff --git a/tensorflow/docs_src/get_started/index.md b/tensorflow/docs_src/get_started/index.md index 3e700daa304..003fac1a287 100644 --- a/tensorflow/docs_src/get_started/index.md +++ b/tensorflow/docs_src/get_started/index.md @@ -24,8 +24,6 @@ To learn about the high-level API, read the following guides: API. * @{$get_started/input_fn$Building Input Functions}, which takes you into a somewhat more sophisticated use of this API. - * @{$get_started/monitors$Logging and Monitoring Basics with tf.contrib.learn}, - which explains how to audit the progress of model training. TensorBoard is a utility to visualize different aspects of machine learning. 
The following guides explain how to use TensorBoard: diff --git a/tensorflow/docs_src/get_started/input_fn.md b/tensorflow/docs_src/get_started/input_fn.md index 422f45c586a..7706c07b1d9 100644 --- a/tensorflow/docs_src/get_started/input_fn.md +++ b/tensorflow/docs_src/get_started/input_fn.md @@ -249,7 +249,7 @@ here](https://www.tensorflow.org/code/tensorflow/examples/tutorials/input_fn/bos ### Importing the Housing Data -To start, set up your imports (including `pandas` and `tensorflow`) and @{$monitors#enabling-logging-with-tensorflow$set logging verbosity} to +To start, set up your imports (including `pandas` and `tensorflow`) and set logging verbosity to `INFO` for more detailed log output: ```python diff --git a/tensorflow/docs_src/get_started/leftnav_files b/tensorflow/docs_src/get_started/leftnav_files index b656033f7e8..bb67eaddda3 100644 --- a/tensorflow/docs_src/get_started/leftnav_files +++ b/tensorflow/docs_src/get_started/leftnav_files @@ -5,7 +5,6 @@ mnist/pros.md mnist/mechanics.md estimator.md input_fn.md -monitors.md summaries_and_tensorboard.md graph_viz.md tensorboard_histograms.md diff --git a/tensorflow/docs_src/get_started/monitors.md b/tensorflow/docs_src/get_started/monitors.md deleted file mode 100644 index 5606e953658..00000000000 --- a/tensorflow/docs_src/get_started/monitors.md +++ /dev/null @@ -1,406 +0,0 @@ -# Logging and Monitoring Basics with tf.contrib.learn - -When training a model, it’s often valuable to track and evaluate progress in -real time. In this tutorial, you’ll learn how to use TensorFlow’s logging -capabilities and the `Monitor` API to audit the in-progress training of a neural -network classifier for categorizing irises. This tutorial builds on the code -developed in @{$estimator$tf.estimator Quickstart} so if you -haven't yet completed that tutorial, you may want to explore it first, -especially if you're looking for an intro/refresher on tf.contrib.learn basics. - -## Setup {#setup} - -For this tutorial, you'll be building upon the following code from -@{$estimator$tf.estimator Quickstart}: - -```python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os - -import numpy as np -import tensorflow as tf - -# Data sets -IRIS_TRAINING = os.path.join(os.path.dirname(__file__), "iris_training.csv") -IRIS_TEST = os.path.join(os.path.dirname(__file__), "iris_test.csv") - -def main(unused_argv): - # Load datasets. - training_set = tf.contrib.learn.datasets.base.load_csv_with_header( - filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float32) - test_set = tf.contrib.learn.datasets.base.load_csv_with_header( - filename=IRIS_TEST, target_dtype=np.int, features_dtype=np.float32) - - # Specify that all features have real-value data - feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)] - - # Build 3 layer DNN with 10, 20, 10 units respectively. - classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns, - hidden_units=[10, 20, 10], - n_classes=3, - model_dir="/tmp/iris_model") - - # Fit model. - classifier.fit(x=training_set.data, - y=training_set.target, - steps=2000) - - # Evaluate accuracy. - accuracy_score = classifier.evaluate(x=test_set.data, - y=test_set.target)["accuracy"] - print('Accuracy: {0:f}'.format(accuracy_score)) - - # Classify two new flower samples. 
- new_samples = np.array( - [[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=float) - y = list(classifier.predict(new_samples, as_iterable=True)) - print('Predictions: {}'.format(str(y))) - -if __name__ == "__main__": - tf.app.run() -``` - -Copy the above code into a file, and download the corresponding -[training](http://download.tensorflow.org/data/iris_training.csv) and -[test](http://download.tensorflow.org/data/iris_test.csv) data sets to the same -directory. - -In the following sections, you'll progressively make updates to the above code -to add logging and monitoring capabilities. Final code incorporating all updates -is [available for download -here](https://www.tensorflow.org/code/tensorflow/examples/tutorials/monitors/iris_monitors.py). - -## Overview - -The @{$estimator$tf.estimator Quickstart tutorial} walked through -how to implement a neural net classifier to categorize iris examples into one of -three species. - -But when [the code](#setup) from this tutorial is run, the output contains no -logging tracking how model training is progressing—only the results of the -`print` statements that were included: - -```none -Accuracy: 0.933333 -Predictions: [1 2] -``` - -Without any logging, model training feels like a bit of a black box; you can't -see what's happening as TensorFlow steps through gradient descent, get a sense -of whether the model is converging appropriately, or audit to determine whether -[early stopping](https://en.wikipedia.org/wiki/Early_stopping) might be -appropriate. - -One way to address this problem would be to split model training into multiple -`fit` calls with smaller numbers of steps in order to evaluate accuracy more -progressively. However, this is not recommended practice, as it greatly slows -down model training. Fortunately, tf.contrib.learn offers another solution: a -@{tf.contrib.learn.monitors$Monitor API} designed to help -you log metrics and evaluate your model while training is in progress. In the -following sections, you'll learn how to enable logging in TensorFlow, set up a -ValidationMonitor to do streaming evaluations, and visualize your metrics using -TensorBoard. - -## Enabling Logging with TensorFlow - -TensorFlow uses five different levels for log messages. In order of ascending -severity, they are `DEBUG`, `INFO`, `WARN`, `ERROR`, and `FATAL`. When you -configure logging at any of these levels, TensorFlow will output all log -messages corresponding to that level and all levels of higher severity. For -example, if you set a logging level of `ERROR`, you'll get log output containing -`ERROR` and `FATAL` messages, and if you set a level of `DEBUG`, you'll get log -messages from all five levels. - -By default, TensorFlow is configured at a logging level of `WARN`, but when -tracking model training, you'll want to adjust the level to `INFO`, which will -provide additional feedback as `fit` operations are in progress. - -Add the following line to the beginning of your code (right after your -`import`s): - -```python -tf.logging.set_verbosity(tf.logging.INFO) -``` - -Now when you run the code, you'll see additional log output like the following: - -```none -INFO:tensorflow:loss = 1.18812, step = 1 -INFO:tensorflow:loss = 0.210323, step = 101 -INFO:tensorflow:loss = 0.109025, step = 201 -``` - -With `INFO`-level logging, tf.contrib.learn automatically outputs [training-loss -metrics](https://en.wikipedia.org/wiki/Loss_function) to stderr after every 100 -steps. 
- -## Configuring a ValidationMonitor for Streaming Evaluation - -Logging training loss is helpful to get a sense whether your model is -converging, but what if you want further insight into what's happening during -training? tf.contrib.learn provides several high-level `Monitor`s you can attach -to your `fit` operations to further track metrics and/or debug lower-level -TensorFlow operations during model training, including: - -Monitor | Description -------------------- | ----------- -`CaptureVariable` | Saves a specified variable's values into a collection at every _n_ steps of training -`PrintTensor` | Logs a specified tensor's values at every _n_ steps of training -`SummarySaver` | Saves @{tf.Summary} [protocol buffers](https://developers.google.com/protocol-buffers/) for a given tensor using a @{tf.summary.FileWriter} at every _n_ steps of training -`ValidationMonitor` | Logs a specified set of evaluation metrics at every _n_ steps of training, and, if desired, implements early stopping under certain conditions - -### Evaluating Every *N* Steps - -For the iris neural network classifier, while logging training loss, you might -also want to simultaneously evaluate against test data to see how well the model -is generalizing. You can accomplish this by configuring a `ValidationMonitor` -with the test data (`test_set.data` and `test_set.target`), and setting how -often to evaluate with `every_n_steps`. The default value of `every_n_steps` is -`100`; here, set `every_n_steps` to `50` to evaluate after every 50 steps of -model training: - -```python -validation_monitor = tf.contrib.learn.monitors.ValidationMonitor( - test_set.data, - test_set.target, - every_n_steps=50) -``` - -Place this code right before the line instantiating the `classifier`. - -`ValidationMonitor`s rely on saved checkpoints to perform evaluation operations, -so you'll want to modify instantiation of the `classifier` to add a -@{tf.contrib.learn.RunConfig} that includes -`save_checkpoints_secs`, which specifies how many seconds should elapse between -checkpoint saves during training. Because the iris data set is quite small, and -thus trains quickly, it makes sense to set `save_checkpoints_secs` to 1 (saving -a checkpoint every second) to ensure a sufficient number of checkpoints: - -```python -classifier = tf.contrib.learn.DNNClassifier( - feature_columns=feature_columns, - hidden_units=[10, 20, 10], - n_classes=3, - model_dir="/tmp/iris_model", - config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1)) -``` - -NOTE: The `model_dir` parameter specifies an explicit directory -(`/tmp/iris_model`) for model data to be stored; this directory path will be -easier to reference later on than an autogenerated one. Each time you run the -code, any existing data in `/tmp/iris_model` will be loaded, and model training -will continue where it left off in the last run (e.g., running the script twice -in succession will execute 4000 steps during training—2000 during each -`fit` operation). To start over model training from scratch, delete -`/tmp/iris_model` before running the code. 
- -Finally, to attach your `validation_monitor`, update the `fit` call to include a -`monitors` param, which takes a list of all monitors to run during model -training: - -```python -classifier.fit(x=training_set.data, - y=training_set.target, - steps=2000, - monitors=[validation_monitor]) -``` - -Now, when you rerun the code, you should see validation metrics in your log -output, e.g.: - -```none -INFO:tensorflow:Validation (step 50): loss = 1.71139, global_step = 0, accuracy = 0.266667 -... -INFO:tensorflow:Validation (step 300): loss = 0.0714158, global_step = 268, accuracy = 0.966667 -... -INFO:tensorflow:Validation (step 1750): loss = 0.0574449, global_step = 1729, accuracy = 0.966667 -``` - -### Customizing the Evaluation Metrics with MetricSpec - -By default, if no evaluation metrics are specified, `ValidationMonitor` will log -both [loss](https://en.wikipedia.org/wiki/Loss_function) and accuracy, but you -can customize the list of metrics that will be run every 50 steps. To specify -the exact metrics you'd like to run in each evaluation pass, you can add a -`metrics` param to the `ValidationMonitor` constructor. `metrics` takes a dict -of key/value pairs, where each key is the name you'd like logged for the metric, -and the corresponding value is a -[`MetricSpec`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/metric_spec.py) -object. - -The `MetricSpec` constructor accepts four parameters: - -* `metric_fn`. The function that calculates and returns the value of a metric. - This can be a predefined function available in the - @{tf.contrib.metrics} module, such as - @{tf.contrib.metrics.streaming_precision} or - @{tf.contrib.metrics.streaming_recall}. - - Alternatively, you can define your own custom metric function, which must - take `predictions` and `labels` tensors as arguments (a `weights` argument - can also optionally be supplied). The function must return the value of the - metric in one of two formats: - - * A single tensor - * A pair of ops `(value_op, update_op)`, where `value_op` returns the - metric value and `update_op` performs a corresponding operation to - update internal model state. - -* `prediction_key`. The key of the tensor containing the predictions returned - by the model. This argument may be omitted if the model returns either a - single tensor or a dict with a single entry. For a `DNNClassifier` model, - class predictions will be returned in a tensor with the key - @{tf.contrib.learn.PredictionKey.CLASSES}. - -* `label_key`. The key of the tensor containing the labels returned by the - model, as specified by the model's @{$input_fn$`input_fn`}. As - with `prediction_key`, this argument may be omitted if the `input_fn` - returns either a single tensor or a dict with a single entry. In the iris - example in this tutorial, the `DNNClassifier` does not have an `input_fn` - (`x`,`y` data is passed directly to `fit`), so it's not necessary to provide - a `label_key`. - -* `weights_key`. *Optional*. The key of the tensor (returned by the - @{$input_fn$`input_fn`}) containing weights inputs for the - `metric_fn`. 
- -The following code creates a `validation_metrics` dict that defines three -metrics to log during model evaluation: - -* `"accuracy"`, using @{tf.contrib.metrics.streaming_accuracy} - as the `metric_fn` -* `"precision"`, using @{tf.contrib.metrics.streaming_precision} - as the `metric_fn` -* `"recall"`, using @{tf.contrib.metrics.streaming_recall} - as the `metric_fn` - -```python -validation_metrics = { - "accuracy": - tf.contrib.learn.MetricSpec( - metric_fn=tf.contrib.metrics.streaming_accuracy, - prediction_key=tf.contrib.learn.PredictionKey.CLASSES), - "precision": - tf.contrib.learn.MetricSpec( - metric_fn=tf.contrib.metrics.streaming_precision, - prediction_key=tf.contrib.learn.PredictionKey.CLASSES), - "recall": - tf.contrib.learn.MetricSpec( - metric_fn=tf.contrib.metrics.streaming_recall, - prediction_key=tf.contrib.learn.PredictionKey.CLASSES) -} -``` - -Add the above code before the `ValidationMonitor` constructor. Then revise the -`ValidationMonitor` constructor as follows to add a `metrics` parameter to log -the accuracy, precision, and recall metrics specified in `validation_metrics` -(loss is always logged, and doesn't need to be explicitly specified): - -```python -validation_monitor = tf.contrib.learn.monitors.ValidationMonitor( - test_set.data, - test_set.target, - every_n_steps=50, - metrics=validation_metrics) -``` - -Rerun the code, and you should see precision and recall included in your log -output, e.g.: - -```none -INFO:tensorflow:Validation (step 50): recall = 0.0, loss = 1.20626, global_step = 1, precision = 0.0, accuracy = 0.266667 -... -INFO:tensorflow:Validation (step 600): recall = 1.0, loss = 0.0530696, global_step = 571, precision = 1.0, accuracy = 0.966667 -... -INFO:tensorflow:Validation (step 1500): recall = 1.0, loss = 0.0617403, global_step = 1452, precision = 1.0, accuracy = 0.966667 -``` - -### Early Stopping with ValidationMonitor - -Note that in the above log output, by step 600, the model has already achieved -precision and recall rates of 1.0. This raises the question as to whether model -training could benefit from -[early stopping](https://en.wikipedia.org/wiki/Early_stopping). - -In addition to logging eval metrics, `ValidationMonitor`s make it easy to -implement early stopping when specified conditions are met, via three params: - -| Param | Description | -| -------------------------------- | ----------------------------------------- | -| `early_stopping_metric` | Metric that triggers early stopping | -: : (e.g., loss or accuracy) under conditions : -: : specified in `early_stopping_rounds` and : -: : `early_stopping_metric_minimize`. Default : -: : is `"loss"`. : -| `early_stopping_metric_minimize` | `True` if desired model behavior is to | -: : minimize the value of : -: : `early_stopping_metric`; `False` if : -: : desired model behavior is to maximize the : -: : value of `early_stopping_metric`. Default : -: : is `True`. : -| `early_stopping_rounds` | Sets a number of steps during which if | -: : the `early_stopping_metric` does not : -: : decrease (if : -: : `early_stopping_metric_minimize` is : -: : `True`) or increase (if : -: : `early_stopping_metric_minimize` is : -: : `False`), training will be stopped. : -: : Default is `None`, which means early : -: : stopping will never occur. 
: - -Make the following revision to the `ValidationMonitor` constructor, which -specifies that if loss (`early_stopping_metric="loss"`) does not decrease -(`early_stopping_metric_minimize=True`) over a period of 200 steps -(`early_stopping_rounds=200`), model training will stop immediately at that -point, and not complete the full 2000 steps specified in `fit`: - -```python -validation_monitor = tf.contrib.learn.monitors.ValidationMonitor( - test_set.data, - test_set.target, - every_n_steps=50, - metrics=validation_metrics, - early_stopping_metric="loss", - early_stopping_metric_minimize=True, - early_stopping_rounds=200) -``` - -Rerun the code to see if model training stops early: - -```none -... -INFO:tensorflow:Validation (step 1150): recall = 1.0, loss = 0.056436, global_step = 1119, precision = 1.0, accuracy = 0.966667 -INFO:tensorflow:Stopping. Best step: 800 with loss = 0.048313818872. -``` - -Indeed, here training stops at step 1150, indicating that for the past 200 -steps, loss did not decrease, and that overall, step 800 produced the smallest -loss value against the test data set. This suggests that additional calibration -of hyperparameters by decreasing the step count might further improve the model. - -## Visualizing Log Data with TensorBoard - -Reading through the log produced by `ValidationMonitor` provides plenty of raw -data on model performance during training, but it may also be helpful to see -visualizations of this data to get further insight into trends—for -example, how accuracy is changing over step count. You can use TensorBoard (a -separate program packaged with TensorFlow) to plot graphs like this by setting -the `logdir` command-line argument to the directory where you saved your model -training data (here, `/tmp/iris_model`). Run the following on your command line: - -
-$ tensorboard --logdir=/tmp/iris_model/
-Starting TensorBoard 39 on port 6006
- -Then navigate to `http://0.0.0.0:`*``* in your browser, where -*``* is the port specified in the command-line output (here, -`6006`). - -If you click on the accuracy field, you'll see an image like the following, -which shows accuracy plotted against step count: - -![Accuracy over step count in TensorBoard](https://www.tensorflow.org/images/validation_monitor_tensorboard_accuracy.png "Accuracy over step count in TensorBoard") - -For more on using TensorBoard, see @{$summaries_and_tensorboard$TensorBoard: Visualizing Learning} and @{$graph_viz$TensorBoard: Graph Visualization}. diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md index 43e09906f73..d5e481520c4 100644 --- a/tensorflow/docs_src/install/install_linux.md +++ b/tensorflow/docs_src/install/install_linux.md @@ -151,10 +151,10 @@ Take the following steps to install TensorFlow with Virtualenv: (tensorflow)$ pip install --upgrade tensorflow-gpu # for Python 2.7 and GPU (tensorflow)$ pip3 install --upgrade tensorflow-gpu # for Python 3.n and GPU - If the preceding command succeeds, skip Step 5. If the preceding - command fails, perform Step 5. + If the preceding command succeeds, skip Step 6. If the preceding + command fails, perform Step 6. - 5. (Optional) If Step 4 failed (typically because you invoked a pip version + 6. (Optional) If Step 5 failed (typically because you invoked a pip version lower than 8.1), install TensorFlow in the active virtualenv environment by issuing a command of the following format: diff --git a/tensorflow/docs_src/programmers_guide/datasets.md b/tensorflow/docs_src/programmers_guide/datasets.md index bf3cb5bf196..ba26bd5e941 100644 --- a/tensorflow/docs_src/programmers_guide/datasets.md +++ b/tensorflow/docs_src/programmers_guide/datasets.md @@ -1,4 +1,4 @@ -# Using the `Dataset` API for TensorFlow Input Pipelines +# Importing Data The `Dataset` API enables you to build complex input pipelines from simple, reusable pieces. For example, the pipeline for an image model might @@ -735,7 +735,7 @@ def dataset_input_fn(): return {"image_data": image, "date_time": parsed["date_time"]}, label - # Use `Dataset.map()` to build a pair of a feature dictionary and a label + # Use `Dataset.map()` to build a pair of a feature dictionary and a label # tensor for each example. dataset = dataset.map(parser) dataset = dataset.shuffle(buffer_size=10000) diff --git a/tensorflow/docs_src/programmers_guide/estimators.md b/tensorflow/docs_src/programmers_guide/estimators.md index a5724ea294e..755bb049c99 100644 --- a/tensorflow/docs_src/programmers_guide/estimators.md +++ b/tensorflow/docs_src/programmers_guide/estimators.md @@ -134,7 +134,7 @@ The heart of every Estimator--whether pre-made or custom--is its evaluation, and prediction. When you are using a pre-made Estimator, someone else has already implemented the model function. When relying on a custom Estimator, you must write the model function yourself. A -${$extend/estimators$companion document) +@{$extend/estimators$companion document} explains how to write the model function. diff --git a/tensorflow/docs_src/programmers_guide/index.md b/tensorflow/docs_src/programmers_guide/index.md index 22fe229422a..eef35d6dcc7 100644 --- a/tensorflow/docs_src/programmers_guide/index.md +++ b/tensorflow/docs_src/programmers_guide/index.md @@ -4,6 +4,8 @@ The documents in this unit dive into the details of writing TensorFlow code. For TensorFlow 1.3, we revised this document extensively. 
The units are now as follows: + * @{$programmers_guide/estimators$Estimators}, which introduces a high-level + TensorFlow API that greatly simplifies ML programming. * @{$programmers_guide/tensors$Tensors}, which explains how to create, manipulate, and access Tensors--the fundamental object in TensorFlow. * @{$programmers_guide/variables$Variables}, which details how @@ -18,8 +20,6 @@ The units are now as follows: such as Estimators or Keras, the high-level API creates and manages graphs and sessions for you, but understanding graphs and sessions can still be helpful. - * @{$programmers_guide/estimators$Estimators}, which introduces a high-level - TensorFlow API that greatly simplifies ML programming. * @{$programmers_guide/saved_model$Saving and Restoring}, which explains how to save and restore variables and models. * @{$programmers_guide/datasets$Input Pipelines}, which explains how to diff --git a/tensorflow/docs_src/programmers_guide/leftnav_files b/tensorflow/docs_src/programmers_guide/leftnav_files index 5082e7f36c8..0c42f119c95 100644 --- a/tensorflow/docs_src/programmers_guide/leftnav_files +++ b/tensorflow/docs_src/programmers_guide/leftnav_files @@ -1,8 +1,8 @@ index.md +estimators.md tensors.md variables.md graphs.md -estimators.md saved_model.md datasets.md threading_and_queues.md diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 0781347fd6e..dda707aea26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -209,6 +209,95 @@ func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...Va return op.Output(0) } +// Writes a `Summary` protocol buffer with scalar values. +// +// The input `tag` and `value` must have the scalars. +// +// Arguments: +// writer: A handle to a summary writer. +// global_step: The step to write the summary for. +// tag: Tag for the summary. +// value: Value for the summary. +// +// Returns the created operation. +func WriteScalarSummary(scope *Scope, writer tf.Output, global_step tf.Output, tag tf.Output, value tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "WriteScalarSummary", + Input: []tf.Input{ + writer, global_step, tag, value, + }, + } + return scope.AddOperation(opspec) +} + +// Outputs a `Summary` protocol buffer with a tensor. +// +// Arguments: +// writer: A handle to a summary writer. +// global_step: The step to write the summary for. +// tensor: A tensor to serialize. +// tag: The summary's tag. +// summary_metadata: Serialized SummaryMetadata protocol buffer containing +// plugin-related metadata for this summary. +// +// Returns the created operation. +func WriteSummary(scope *Scope, writer tf.Output, global_step tf.Output, tensor tf.Output, tag tf.Output, summary_metadata tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "WriteSummary", + Input: []tf.Input{ + writer, global_step, tensor, tag, summary_metadata, + }, + } + return scope.AddOperation(opspec) +} + +// Flushes and closes the summary writer. +// +// Also removes it from the resource manager. To reopen, use another +// CreateSummaryFileWriter op. +// +// Arguments: +// writer: A handle to the summary writer resource. +// +// Returns the created operation. 
+func CloseSummaryWriter(scope *Scope, writer tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "CloseSummaryWriter", + Input: []tf.Input{ + writer, + }, + } + return scope.AddOperation(opspec) +} + +// Flushes the writer's unwritten events. +// +// Arguments: +// writer: A handle to the summary writer resource. +// +// Returns the created operation. +func FlushSummaryWriter(scope *Scope, writer tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "FlushSummaryWriter", + Input: []tf.Input{ + writer, + }, + } + return scope.AddOperation(opspec) +} + // FakeQuantWithMinMaxVarsPerChannelGradientAttr is an optional argument to FakeQuantWithMinMaxVarsPerChannelGradient. type FakeQuantWithMinMaxVarsPerChannelGradientAttr func(optionalAttr) @@ -2149,6 +2238,34 @@ func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset return offset } +// Writes a `Summary` protocol buffer with a histogram. +// +// The generated +// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) +// has one summary value containing a histogram for `values`. +// +// This op reports an `InvalidArgument` error if any value is not finite. +// +// Arguments: +// writer: A handle to a summary writer. +// global_step: The step to write the summary for. +// tag: Scalar. Tag to use for the `Summary.Value`. +// values: Any shape. Values to use to build the histogram. +// +// Returns the created operation. +func WriteHistogramSummary(scope *Scope, writer tf.Output, global_step tf.Output, tag tf.Output, values tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "WriteHistogramSummary", + Input: []tf.Input{ + writer, global_step, tag, values, + }, + } + return scope.AddOperation(opspec) +} + // Concatenates tensors along one dimension. // // Arguments: @@ -7087,6 +7204,48 @@ func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optio return op.Output(0) } +// SummaryWriterAttr is an optional argument to SummaryWriter. +type SummaryWriterAttr func(optionalAttr) + +// SummaryWriterSharedName sets the optional shared_name attribute to value. +// If not specified, defaults to "" +func SummaryWriterSharedName(value string) SummaryWriterAttr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// SummaryWriterContainer sets the optional container attribute to value. +// If not specified, defaults to "" +func SummaryWriterContainer(value string) SummaryWriterAttr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// Returns a handle to be used to access a summary writer. +// +// The summary writer is an in-graph resource which can be used by ops to write +// summaries to event files. +// +// Returns the summary writer resource. Scalar handle. +func SummaryWriter(scope *Scope, optional ...SummaryWriterAttr) (writer tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "SummaryWriter", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // Returns the set of files matching one or more glob patterns. // // Note that this routine only supports wildcard characters in the @@ -10570,6 +10729,61 @@ func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf. 
return op.Output(0) } +// WriteAudioSummaryAttr is an optional argument to WriteAudioSummary. +type WriteAudioSummaryAttr func(optionalAttr) + +// WriteAudioSummaryMaxOutputs sets the optional max_outputs attribute to value. +// +// value: Max number of batch elements to generate audio for. +// If not specified, defaults to 3 +// +// REQUIRES: value >= 1 +func WriteAudioSummaryMaxOutputs(value int64) WriteAudioSummaryAttr { + return func(m optionalAttr) { + m["max_outputs"] = value + } +} + +// Writes a `Summary` protocol buffer with audio. +// +// The summary has up to `max_outputs` summary values containing audio. The +// audio is built from `tensor` which must be 3-D with shape `[batch_size, +// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are +// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`. +// +// The `tag` argument is a scalar `Tensor` of type `string`. It is used to +// build the `tag` of the summary values: +// +// * If `max_outputs` is 1, the summary value tag is '*tag*/audio'. +// * If `max_outputs` is greater than 1, the summary value tags are +// generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc. +// +// Arguments: +// writer: A handle to a summary writer. +// global_step: The step to write the summary for. +// tag: Scalar. Used to build the `tag` attribute of the summary values. +// tensor: 2-D of shape `[batch_size, frames]`. +// sample_rate: The sample rate of the signal in hertz. +// +// Returns the created operation. +func WriteAudioSummary(scope *Scope, writer tf.Output, global_step tf.Output, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...WriteAudioSummaryAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "WriteAudioSummary", + Input: []tf.Input{ + writer, global_step, tag, tensor, sample_rate, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + // FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D. type FusedResizeAndPadConv2DAttr func(optionalAttr) @@ -15797,6 +16011,30 @@ func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64 return op.Output(0) } +// Creates a summary file writer accessible by the given resource handle. +// +// Arguments: +// writer: A handle to the summary writer resource +// logdir: Directory where the event file will be written. +// max_queue: Size of the queue of pending events and summaries. +// flush_millis: How often, in milliseconds, to flush the pending events and +// summaries to disk. +// filename_suffix: Every event file's name is suffixed with this suffix. +// +// Returns the created operation. +func CreateSummaryFileWriter(scope *Scope, writer tf.Output, logdir tf.Output, max_queue tf.Output, flush_millis tf.Output, filename_suffix tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "CreateSummaryFileWriter", + Input: []tf.Input{ + writer, logdir, max_queue, flush_millis, filename_suffix, + }, + } + return scope.AddOperation(opspec) +} + // EncodeBase64Attr is an optional argument to EncodeBase64. type EncodeBase64Attr func(optionalAttr) @@ -17172,6 +17410,84 @@ func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) ( return op.Output(0) } +// WriteImageSummaryAttr is an optional argument to WriteImageSummary. 
+type WriteImageSummaryAttr func(optionalAttr) + +// WriteImageSummaryMaxImages sets the optional max_images attribute to value. +// +// value: Max number of batch elements to generate images for. +// If not specified, defaults to 3 +// +// REQUIRES: value >= 1 +func WriteImageSummaryMaxImages(value int64) WriteImageSummaryAttr { + return func(m optionalAttr) { + m["max_images"] = value + } +} + +// Writes a `Summary` protocol buffer with images. +// +// The summary has up to `max_images` summary values containing images. The +// images are built from `tensor` which must be 4-D with shape `[batch_size, +// height, width, channels]` and where `channels` can be: +// +// * 1: `tensor` is interpreted as Grayscale. +// * 3: `tensor` is interpreted as RGB. +// * 4: `tensor` is interpreted as RGBA. +// +// The images have the same number of channels as the input tensor. For float +// input, the values are normalized one image at a time to fit in the range +// `[0, 255]`. `uint8` values are unchanged. The op uses two different +// normalization algorithms: +// +// * If the input values are all positive, they are rescaled so the largest one +// is 255. +// +// * If any input value is negative, the values are shifted so input value 0.0 +// is at 127. They are then rescaled so that either the smallest value is 0, +// or the largest one is 255. +// +// The `tag` argument is a scalar `Tensor` of type `string`. It is used to +// build the `tag` of the summary values: +// +// * If `max_images` is 1, the summary value tag is '*tag*/image'. +// * If `max_images` is greater than 1, the summary value tags are +// generated sequentially as '*tag*/image/0', '*tag*/image/1', etc. +// +// The `bad_color` argument is the color to use in the generated images for +// non-finite input values. It is a `unit8` 1-D tensor of length `channels`. +// Each element must be in the range `[0, 255]` (It represents the value of a +// pixel in the output image). Non-finite values in the input tensor are +// replaced by this tensor in the output image. The default value is the color +// red. +// +// Arguments: +// writer: A handle to a summary writer. +// global_step: The step to write the summary for. +// tag: Scalar. Used to build the `tag` attribute of the summary values. +// tensor: 4-D of shape `[batch_size, height, width, channels]` where +// `channels` is 1, 3, or 4. +// bad_color: Color to use for pixels with non-finite values. +// +// Returns the created operation. +func WriteImageSummary(scope *Scope, writer tf.Output, global_step tf.Output, tag tf.Output, tensor tf.Output, bad_color tf.Output, optional ...WriteImageSummaryAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "WriteImageSummary", + Input: []tf.Input{ + writer, global_step, tag, tensor, bad_color, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + // Pads a tensor with zeros. // // This operation pads a `input` with zeros according to the `paddings` you @@ -20306,7 +20622,7 @@ func Prod(scope *Scope, input tf.Output, reduction_indices tf.Output, optional . // gradients: The backpropagated gradients to the corresponding softsign operation. // features: The features passed as input to the corresponding softsign operation. // -// Returns The gradients: `gradients / (1 + abs(-features)) ** 2`. +// Returns The gradients: `gradients / (1 + abs(features)) ** 2`. 
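+//
+// Softsign is defined as softsign(x) = x / (1 + |x|), whose derivative is
+// 1 / (1 + |x|)^2; note that |-x| == |x|, so `abs(-features)` and
+// `abs(features)` describe the same quantity.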
func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) { if scope.Err() != nil { return diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD index 64b37677357..ee07fc48132 100644 --- a/tensorflow/java/BUILD +++ b/tensorflow/java/BUILD @@ -5,7 +5,9 @@ package(default_visibility = ["//visibility:private"]) licenses(["notice"]) # Apache 2.0 -load("build_defs", "JAVACOPTS") +load(":build_defs.bzl", "JAVACOPTS") +load(":src/gen/gen_ops.bzl", "tf_java_op_gen_srcjar") +load("//tensorflow:tensorflow.bzl", "tf_copts") java_library( name = "tensorflow", @@ -34,12 +36,58 @@ filegroup( filegroup( name = "java_op_sources", - srcs = glob(["src/main/java/org/tensorflow/op/**/*.java"]), + srcs = glob(["src/main/java/org/tensorflow/op/**/*.java"]) + [ + ":java_op_gen_sources", + ], visibility = [ "//tensorflow/java:__pkg__", ], ) +tf_java_op_gen_srcjar( + name = "java_op_gen_sources", + gen_base_package = "org.tensorflow.op", + gen_tool = "java_op_gen_tool", + ops_libs = [ + "array_ops", + "candidate_sampling_ops", + "control_flow_ops", + "data_flow_ops", + "image_ops", + "io_ops", + "linalg_ops", + "logging_ops", + "math_ops", + "nn_ops", + "no_op", + "parsing_ops", + "random_ops", + "sparse_ops", + "state_ops", + "string_ops", + "training_ops", + "user_ops", + ], +) + +# Build the gen tool as a library, as it will be linked to a core/ops binary +# file before making it an executable. See tf_java_op_gen_srcjar(). +cc_library( + name = "java_op_gen_tool", + srcs = glob([ + "src/gen/cc/*.h", + "src/gen/cc/*.cc", + ]), + copts = tf_copts(), + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:proto_text", + "//tensorflow/core:protos_all_cc", + ], +) + java_library( name = "testutil", testonly = 1, diff --git a/tensorflow/java/maven/README.md b/tensorflow/java/maven/README.md index 17bb799961d..62277753618 100644 --- a/tensorflow/java/maven/README.md +++ b/tensorflow/java/maven/README.md @@ -1,11 +1,13 @@ # TensorFlow for Java using Maven -The [TensorFlow Java -API](https://www.tensorflow.org/api_docs/java/reference/org/tensorflow/package-summary) -is available through artifacts uploaded to [Maven -Central](https://oss.sonatype.org/content/repositories/snapshots/org/tensorflow/). -This document describes the process of updating the release artifacts. It does -_not_ describe how to use the artifacts, for which the reader is referred to the +The +[TensorFlow Java API](https://www.tensorflow.org/api_docs/java/reference/org/tensorflow/package-summary) +is available on Maven Central and JCenter through artifacts uploaded to +[OSS Sonatype](https://oss.sonatype.org/content/repositories/releases/org/tensorflow/) and +[Bintray](https://bintray.com/google/tensorflow/tensorflow) respectively. This +document describes the process of updating the release artifacts. It does _not_ +describe how to use the artifacts, for which the reader is referred to +the [TensorFlow for Java installation instructions](https://www.tensorflow.org/code/tensorflow/java/README.md). ## Background @@ -20,7 +22,7 @@ Hence, the process for building and uploading release artifacts is not a single ## Artifact Structure -There are five artifacts and thus `pom.xml`s involved in this release: +There are six artifacts and thus `pom.xml`s involved in this release: 1. `tensorflow`: The single dependency for projects requiring TensorFlow for Java. 
This convenience package depends on the two below, and is the one that @@ -37,16 +39,22 @@ There are five artifacts and thus `pom.xml`s involved in this release: 4. `proto`: Generated Java code for TensorFlow protocol buffers (e.g., `MetaGraphDef`, `ConfigProto` etc.) -5. [`parentpom`](https://maven.apache.org/pom/index.html): Common settings +5. `tensorflow-android`: A package geared towards + supporting [TensorFlow on Android](../../contrib/android/README.md), and is + a self-contained Android AAR library containing all necessary native and + Java code. + +6. [`parentpom`](https://maven.apache.org/pom/index.html): Common settings shared by all of the above. + ## Updating the release -The TensorFlow artifacts at Maven Central are created from files built as part -of the TensorFlow release process (which uses `bazel`). The author's lack of -familiarity with Maven best practices combined with the use of a different build -system means that this process is possibly not ideal, but it's what we've got. -Suggestions are welcome. +The Maven artifacts are created from files built as part of the TensorFlow +release process (which uses `bazel`). The author's lack of familiarity with +Maven best practices combined with the use of a different build system means +that this process is possibly not ideal, but it's what we've got. Suggestions +are welcome. In order to isolate the environment used for building, all release processes are conducted in a [Docker](https://www.docker.com) container. @@ -59,16 +67,28 @@ conducted in a [Docker](https://www.docker.com) container. account does not have permissions, then you'll need to ask someone who does to [file a ticket](https://issues.sonatype.org/) to add to the permissions ([sample ticket](https://issues.sonatype.org/browse/MVNCENTRAL-1637)). -- A GPG signing key, required [to sign the release artifacts](http://central.sonatype.org/pages/apache-maven.html#gpg-signed-components). +- An account at [bintray.com](https://bintray.com) that has permissions to + update the [tensorflow repository](https://bintray.com/google/tensorflow). + If your account does not have permissions, then you'll need to ask one of + the [organization administrators](https://bintray.com/google) to give you + permissions to update the `tensorflow` repository. Please keep the + [repository option](https://bintray.com/google/tensorflow/edit?tab=general) + to *"GPG sign uploaded files using Bintray's public/private key pair"* + **unchecked**, otherwise it will conflict with locally signed artifacts. +- A GPG signing key, required + [to sign the release artifacts](http://central.sonatype.org/pages/apache-maven.html#gpg-signed-components). -### Deploying to Maven Central +### Deploying to Sonatype and Bintray -1. Create a file with your OSSRH credentials (or perhaps you use `mvn` and have - it in `~/.m2/settings.xml`): +1. Create a file with your OSSRH credentials and + [Bintray API key](https://bintray.com/docs/usermanual/interacting/interacting_interacting.html#anchorAPIKEY) + (or perhaps you use `mvn` and have it in `~/.m2/settings.xml`): ```sh SONATYPE_USERNAME="your_sonatype.org_username_here" SONATYPE_PASSWORD="your_sonatype.org_password_here" + BINTRAY_USERNAME="your_bintray_username_here" + BINTRAY_API_KEY="your_bintray_api_key_here" GPG_PASSPHRASE="your_gpg_passphrase_here" cat >/tmp/settings.xml < @@ -78,19 +98,16 @@ conducted in a [Docker](https://www.docker.com) container. 
${SONATYPE_USERNAME} ${SONATYPE_PASSWORD} + + bintray + ${BINTRAY_USERNAME} + ${BINTRAY_API_KEY} + - - - ossrh - - true - - - gpg2 - ${GPG_PASSPHRASE} - - - + + gpg2 + ${GPG_PASSPHRASE} + EOF ``` @@ -98,30 +115,83 @@ conducted in a [Docker](https://www.docker.com) container. 2. Run the `release.sh` script. 3. If the script above succeeds then the artifacts would have been uploaded to - the private staging repository. After verifying the release, visit - https://oss.sonatype.org/#stagingRepositories, find the `org.tensorflow` - release and click on either `Release` to finalize the release, or `Drop` to - abort. Some things of note: + the private staging repository in Sonatype, and as unpublished artifacts in + Bintray. After verifying the release, you should finalize or abort the + release on both sites. +4. Visit https://oss.sonatype.org/#stagingRepositories, find the `org.tensorflow` + release and click on either `Release` to finalize the release, or `Drop` to + abort. + +5. Visit https://bintray.com/google/tensorflow/tensorflow, and select the + version you just uploaded. Notice there's a message about unpublished + artifacts. Click on either `Publish` to finalize the release, or `Discard` + to abort. + +6. Some things of note: - For details, look at the [Sonatype guide](http://central.sonatype.org/pages/releasing-the-deployment.html). - Syncing with [Maven Central](http://repo1.maven.org/maven2/org/tensorflow/) can take 10 minutes to 2 hours (as per the [OSSRH guide](http://central.sonatype.org/pages/ossrh-guide.html#releasing-to-central)). + - For Bintray details, refer to their guide on + [managing uploaded content](https://bintray.com/docs/usermanual/uploads/uploads_managinguploadedcontent.html#_publishing). -4. Upon successful release, commit changes to all the `pom.xml` files +7. Upon successful release, commit changes to all the `pom.xml` files (which should have the updated version number). ### Snapshots If the `TF_VERSION` provided to the `release.sh` script ends in `-SNAPSHOT`, then instead of using official release files, the nightly build artifacts from -https://ci.tensorflow.org/view/Nightly/job/nightly-libtensorflow/ and -https://ci.tensorflow.org/view/Nightly/job/nightly-libtensorflow-windows/ will -be used to upload to the Maven Central snapshots repository. +https://ci.tensorflow.org/view/Nightly/job/nightly-libtensorflow/, +https://ci.tensorflow.org/view/Nightly/job/nightly-libtensorflow-windows/ and +https://ci.tensorflow.org/view/Nightly/job/nightly-android +will be used to upload to the Maven Central snapshots repository. (Note that +snapshots are only uploaded to Maven Central, not Bintray.) +### Skip deploying to a repository + +Should you need, setting environment variables `DEPLOY_OSSRH=0` or +`DEPLOY_BINTRAY=0` when calling `release.sh` will skip deploying to OSSRH or +Bintray respectively. Note that snapshots are only uploaded to OSSRH, so you +cannot skip deploying to OSSRH for a `-SNAPSHOT` version. + +## The overall flow + +This section provides some pointers around how artifacts are currently +assembled. + +All native and java code is first built and tested on +a [Tensorflow Jenkins server](https://ci.tensorflow.org/) which run various +scripts under the [`tools/ci_build`](../../tools/ci_build/) directory. Of +particular interest may be `tools/ci_build/builds/libtensorflow.sh` which +bundles Java-related build sources and outputs into archives, and +`tools/ci_build/builds/android_full.sh` which produces an Android AAR package. 
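As noted in the *Skip deploying to a repository* section above, the `DEPLOY_OSSRH` and `DEPLOY_BINTRAY` switches compose with a normal invocation of `release.sh`. A minimal sketch, run from `tensorflow/java/maven` (the version number and settings path below are placeholders):

```sh
# Stage a release to OSSRH only, skipping the Bintray upload.
DEPLOY_BINTRAY=0 ./release.sh 1.4.0 /tmp/settings.xml
```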
+ +Maven artifacts however are not created in Jenkins. Instead, artifacts are +created and deployed externally on-demand, when a maintainer runs the +`release.sh` script. + +This script spins up a Docker instance which downloads the archives created by +successful runs of various `tools/ci_build` scripts on the Tensorflow Jenkins +server. + +It organizes these archives locally into a maven-friendly layout, and runs `mvn +deploy` to create maven artifacts within the container. Native libraries built +in Jenkins are used as-is, but srcjars for java code are used to compile class +files and generate javadocs.) It also downloads the Android AAR from the Jenkins +server and directly deploys it via `mvn gpg:sign-and-deploy-file`. + +`release.sh` then stages these artifacts to OSSRH and Bintray, and if all goes +well a maintainer can log into both sites to promote them as a new release. + +There is a small change to the flow for a standard (rather than a `-SNAPSHOT`) +release. Rather than downloading archives directly from jobs on the Jenkins +server, the script uses a static repository of QA-blessed archives. ## References - [Sonatype guide](http://central.sonatype.org/pages/ossrh-guide.html) for hosting releases. - [Ticket that created the `org/tensorflow` configuration](https://issues.sonatype.org/browse/OSSRH-28072) on OSSRH. +- The [Bintray User Manual](https://bintray.com/docs/usermanual/index.html) diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml index cc4fbc4a750..0a3552d7563 100644 --- a/tensorflow/java/maven/pom.xml +++ b/tensorflow/java/maven/pom.xml @@ -33,18 +33,35 @@ proto - - - - + + + ossrh - https://oss.sonatype.org/content/repositories/snapshots - - - ossrh - https://oss.sonatype.org/service/local/staging/deploy/maven2/ - - + + + + ossrh + https://oss.sonatype.org/content/repositories/snapshots + + + ossrh + https://oss.sonatype.org/service/local/staging/deploy/maven2/ + + + + + bintray + + + + bintray + https://api.bintray.com/maven/google/tensorflow/tensorflow/;publish=0 + + + + @@ -55,19 +72,6 @@ - - - org.sonatype.plugins - nexus-staging-maven-plugin - 1.6.7 - true - - ossrh - https://oss.sonatype.org/ - - false - - org.apache.maven.plugins diff --git a/tensorflow/java/maven/release.sh b/tensorflow/java/maven/release.sh index b95a4d4674e..9012ea14ea6 100755 --- a/tensorflow/java/maven/release.sh +++ b/tensorflow/java/maven/release.sh @@ -49,6 +49,8 @@ fi set -ex docker run \ -e TF_VERSION="${TF_VERSION}" \ + -e DEPLOY_OSSRH="${DEPLOY_OSSRH:-true}" \ + -e DEPLOY_BINTRAY="${DEPLOY_BINTRAY:-true}" \ -v ${PWD}:/tensorflow \ -v "${SETTINGS_XML}":/root/.m2/settings.xml \ -v ${HOME}/.gnupg:/root/.gnupg \ diff --git a/tensorflow/java/maven/run_inside_container.sh b/tensorflow/java/maven/run_inside_container.sh index 6b4d5d70327..a2ce0971954 100644 --- a/tensorflow/java/maven/run_inside_container.sh +++ b/tensorflow/java/maven/run_inside_container.sh @@ -19,11 +19,23 @@ RELEASE_URL_PREFIX="https://storage.googleapis.com/tensorflow/libtensorflow" + +# By default we deploy to both ossrh and bintray. These two +# environment variables can be set to skip either repository. +DEPLOY_BINTRAY="${DEPLOY_BINTRAY:-true}" +DEPLOY_OSSRH="${DEPLOY_OSSRH:-true}" + IS_SNAPSHOT="false" if [[ "${TF_VERSION}" == *"-SNAPSHOT" ]]; then IS_SNAPSHOT="true" + # Bintray does not allow snapshots. 
+ DEPLOY_BINTRAY="false" fi PROTOC_RELEASE_URL="https://github.com/google/protobuf/releases/download/v3.3.0/protoc-3.3.0-linux-x86_64.zip" +if [[ "${DEPLOY_BINTRAY}" != "true" && "${DEPLOY_OSSRH}" != "true" ]]; then + echo "Must deploy to at least one of Bintray or OSSRH" >&2 + exit 2 +fi set -ex @@ -39,6 +51,20 @@ update_version_in_pom() { mvn versions:set -DnewVersion="${TF_VERSION}" } +# Fetch a property from pom files for a given profile. +# Arguments: +# profile - name of the selected profile. +# property - name of the property to be retrieved. +# Output: +# Echo property value to stdout +mvn_property() { + local profile="$1" + local prop="$2" + mvn -q --non-recursive exec:exec -P "${profile}" \ + -Dexec.executable='echo' \ + -Dexec.args="\${${prop}}" +} + download_libtensorflow() { if [[ "${IS_SNAPSHOT}" == "true" ]]; then URL="http://ci.tensorflow.org/view/Nightly/job/nightly-libtensorflow/TYPE=cpu-slave/lastSuccessfulBuild/artifact/lib_package/libtensorflow-src.jar" @@ -137,29 +163,50 @@ generate_java_protos() { rm -rf "${DIR}/proto/tmp" } +# Deploy artifacts using a specific profile. +# Arguments: +# profile - name of selected profile. +# Outputs: +# n/a +deploy_profile() { + local profile="$1" + # Deploy the non-android pieces. + mvn deploy -P"${profile}" + # Determine the correct pom file property to use + # for the repository url. + local rtype + if [[ "${IS_SNAPSHOT}" == "true" ]]; then + rtype='snapshotRepository' + else + rtype='repository' + fi + local url=$(mvn_property "${profile}" "project.distributionManagement.${rtype}.url") + local repositoryId=$(mvn_property "${profile}" "project.distributionManagement.${rtype}.id") + mvn gpg:sign-and-deploy-file \ + -Dfile="${DIR}/tensorflow-android/target/tensorflow.aar" \ + -DpomFile="${DIR}/tensorflow-android/target/pom-android.xml" \ + -Durl="${url}" \ + -DrepositoryId="${repositoryId}" +} + # If successfully built, try to deploy. # If successfully deployed, clean. # If deployment fails, debug with # ./release.sh ${TF_VERSION} ${SETTINGS_XML} bash # To get a shell to poke around the maven artifacts with. deploy_artifacts() { - # This deploys the non-android pieces - mvn deploy - - # Sign and deploy the previously downloaded aar file as a single - # maven artifact. - if [[ "${IS_SNAPSHOT}" == "true" ]]; then - REPO="https://oss.sonatype.org/content/repositories/snapshots" - else - REPO="https://oss.sonatype.org/service/local/staging/deploy/maven2/" + # Deploy artifacts to ossrh if requested. + if [[ "${DEPLOY_OSSRH}" == "true" ]]; then + deploy_profile 'ossrh' + fi + # Deploy artifacts to bintray if requested. + if [[ "${DEPLOY_BINTRAY}" == "true" ]]; then + deploy_profile 'bintray' fi - mvn gpg:sign-and-deploy-file -Dfile="${DIR}/tensorflow-android/target/tensorflow.aar" -DpomFile="${DIR}/tensorflow-android/target/pom-android.xml" -Durl=${REPO} -DrepositoryId=ossrh - # Clean up when everything works clean } - if [ -z "${TF_VERSION}" ] then echo "Must set the TF_VERSION environment variable" @@ -189,8 +236,14 @@ set +ex if [[ "${IS_SNAPSHOT}" == "false" ]]; then echo "Uploaded to the staging repository" echo "After validating the release: " - echo "1. Login to https://oss.sonatype.org/#stagingRepositories" - echo "2. 
Find the 'org.tensorflow' staging release and click either 'Release' to release or 'Drop' to abort" + if [[ "${DEPLOY_OSSRH}" == "true" ]]; then + echo "* Login to https://oss.sonatype.org/#stagingRepositories" + echo "* Find the 'org.tensorflow' staging release and click either 'Release' to release or 'Drop' to abort" + fi + if [[ "${DEPLOY_BINTRAY}" == "true" ]]; then + echo "* Login to https://bintray.com/google/tensorflow/tensorflow" + echo "* Either 'Publish' unpublished items to release, or 'Discard' to abort" + fi else echo "Uploaded to the snapshot repository" fi diff --git a/tensorflow/java/src/gen/cc/op_gen_main.cc b/tensorflow/java/src/gen/cc/op_gen_main.cc new file mode 100644 index 00000000000..bc698124bf9 --- /dev/null +++ b/tensorflow/java/src/gen/cc/op_gen_main.cc @@ -0,0 +1,82 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ + +#include +#include + +#include "tensorflow/core/platform/init_main.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/util/command_line_flags.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/java/src/gen/cc/op_generator.h" + +namespace tensorflow { +namespace op_gen { + +const char kUsageHeader[] = + "\n\nGenerator of operation wrappers in Java.\n\n" + "This executable generates wrappers for all registered operations it has " + "been compiled with. A wrapper exposes an intuitive and strongly-typed\n" + "interface for building its underlying operation and linking it into a " + "graph.\n\n" + "Operation wrappers are generated under the path specified by the " + "'--output_dir' argument. This path can be absolute or relative to the\n" + "current working directory and will be created if it does not exists.\n\n" + "The '--lib_name' argument is used to classify the set of operations. If " + "the chosen name contains more than one word, it must be provided in \n" + "snake_case. This value is declined into other meaningful names, such as " + "the group and package of the generated operations. For example,\n" + "'--lib_name=my_lib' generates the operations under the " + "'org.tensorflow.op.mylib' package and add them to the 'myLib()' operator\n" + "group.\n\n" + "Note that the operator group assigned to the generated wrappers is just " + "an annotation tag at this stage. 
Operations will not be available through\n" + "the 'org.tensorflow.op.Ops' API as a group until the generated classes " + "are compiled using an appropriate annotation processor.\n\n" + "Finally, the '--base_package' overrides the default parent package " + "under which the generated subpackage and classes are to be located.\n\n"; + +} // namespace op_gen +} // namespace tensorflow + +int main(int argc, char* argv[]) { + tensorflow::string lib_name; + tensorflow::string output_dir; + tensorflow::string base_package = "org.tensorflow.op"; + std::vector flag_list = { + tensorflow::Flag("output_dir", &output_dir, + "Root directory into which output files are generated"), + tensorflow::Flag("lib_name", &lib_name, + "A name, in snake_case, used to classify this set of operations"), + tensorflow::Flag("base_package", &base_package, + "Package parent to the generated subpackage and classes") + }; + tensorflow::string usage = tensorflow::op_gen::kUsageHeader; + usage += tensorflow::Flags::Usage(argv[0], flag_list); + bool parsed_flags_ok = tensorflow::Flags::Parse(&argc, argv, flag_list); + tensorflow::port::InitMain(usage.c_str(), &argc, &argv); + QCHECK(parsed_flags_ok && !lib_name.empty() && !output_dir.empty()) << usage; + + tensorflow::OpGenerator generator; + tensorflow::OpList ops; + tensorflow::OpRegistry::Global()->Export(true, &ops); + tensorflow::Status status = + generator.Run(ops, lib_name, base_package, output_dir); + TF_QCHECK_OK(status); + + return 0; +} diff --git a/tensorflow/java/src/gen/cc/op_generator.cc b/tensorflow/java/src/gen/cc/op_generator.cc new file mode 100644 index 00000000000..814a08c6ccf --- /dev/null +++ b/tensorflow/java/src/gen/cc/op_generator.cc @@ -0,0 +1,68 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/java/src/gen/cc/op_generator.h" + +namespace tensorflow { +namespace { + +string CamelCase(const string& str, char delimiter, bool upper) { + string result; + bool cap = upper; + for (string::const_iterator it = str.begin(); it != str.end(); ++it) { + const char c = *it; + if (c == delimiter) { + cap = true; + } else if (cap) { + result += toupper(c); + cap = false; + } else { + result += c; + } + } + return result; +} + +} // namespace + +OpGenerator::OpGenerator() + : env(Env::Default()) { +} + +OpGenerator::~OpGenerator() {} + +Status OpGenerator::Run(const OpList& ops, const string& lib_name, + const string& base_package, const string& output_dir) { + const string package = + base_package + '.' 
+ str_util::StringReplace(lib_name, "_", "", true); + const string package_path = + output_dir + '/' + str_util::StringReplace(package, ".", "/", true); + const string group = CamelCase(lib_name, '_', false); + + if (!env->FileExists(package_path).ok()) { + TF_CHECK_OK(env->RecursivelyCreateDir(package_path)); + } + + LOG(INFO) << "Generating Java wrappers for '" << lib_name << "' operations"; + // TODO(karllessard) generate wrappers from list of ops + + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/java/src/gen/cc/op_generator.h b/tensorflow/java/src/gen/cc/op_generator.h new file mode 100644 index 00000000000..98a1f8d5346 --- /dev/null +++ b/tensorflow/java/src/gen/cc/op_generator.h @@ -0,0 +1,51 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_JAVA_SRC_GEN_CC_OP_GENERATOR_H_ +#define TENSORFLOW_JAVA_SRC_GEN_CC_OP_GENERATOR_H_ + +#include <string> + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +/// \brief A generator of Java operation wrappers. +/// +/// Such a generator is normally run only once per executable, outputting +/// wrappers for all the registered operations it has been compiled with. +/// Nonetheless, it is designed to support multiple runs, giving a different +/// list of operations on each cycle. +class OpGenerator { + public: + OpGenerator(); + virtual ~OpGenerator(); + + /// \brief Generates wrappers for the given list of 'ops'. + /// + /// Output files are generated under 'output_dir'/'base_package'/'lib_package', + /// where 'lib_package' is derived from 'lib_name'. + Status Run(const OpList& ops, const string& lib_name, + const string& base_package, const string& output_dir); + + private: + Env* env; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_JAVA_SRC_GEN_CC_OP_GENERATOR_H_ diff --git a/tensorflow/java/src/gen/gen_ops.bzl b/tensorflow/java/src/gen/gen_ops.bzl new file mode 100644 index 00000000000..e0d5556122b --- /dev/null +++ b/tensorflow/java/src/gen/gen_ops.bzl @@ -0,0 +1,59 @@ +# -*- Python -*- + +load("//tensorflow:tensorflow.bzl", "tf_copts") + +# Given a list of "ops_libs" (a list of files in the core/ops directory +# without their .cc extensions), generate Java wrapper code for all operations +# found in the ops files. +# Then, combine all those source files into a single archive (.srcjar).
+# +# For example: +# tf_java_op_gen_srcjar("gen_sources", "gen_tool", "my.package", [ "array_ops", "math_ops" ]) +# +# will create a genrule named "gen_sources" that first generate source files: +# ops/src/main/java/my/package/array/*.java +# ops/src/main/java/my/package/math/*.java +# +# and then archive those source files in: +# ops/gen_sources.srcjar +# +def tf_java_op_gen_srcjar(name, + gen_tool, + gen_base_package, + ops_libs=[], + ops_libs_pkg="//tensorflow/core", + out_dir="ops/", + out_src_dir="src/main/java/", + visibility=["//tensorflow/java:__pkg__"]): + + gen_tools = [] + gen_cmds = ["rm -rf $(@D)"] # Always start from fresh when generating source files + + # Construct an op generator binary for each ops library. + for ops_lib in ops_libs: + gen_lib = ops_lib[:ops_lib.rfind('_')] + out_gen_tool = out_dir + ops_lib + "_gen_tool" + + native.cc_binary( + name=out_gen_tool, + copts=tf_copts(), + linkopts=["-lm"], + linkstatic=1, # Faster to link this one-time-use binary dynamically + deps=[gen_tool, ops_libs_pkg + ":" + ops_lib + "_op_lib"]) + + gen_tools += [":" + out_gen_tool] + gen_cmds += ["$(location :" + out_gen_tool + ")" + + " --output_dir=$(@D)/" + out_src_dir + + " --lib_name=" + gen_lib + + " --base_package=" + gen_base_package] + + # Generate a source archive containing generated code for these ops. + gen_srcjar = out_dir + name + ".srcjar" + gen_cmds += ["$(location @local_jdk//:jar) cMf $(location :" + gen_srcjar + ") -C $(@D) ."] + + native.genrule( + name=name, + srcs=["@local_jdk//:jar"] + ["@local_jdk//:jdk"], + outs=[gen_srcjar], + tools=gen_tools, + cmd='&&'.join(gen_cmds)) diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 6597889fbcb..26e0f86c37b 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -1766,6 +1766,8 @@ py_library( srcs_version = "PY2AND3", deps = [ ":array_ops", + ":array_ops_gen", + ":dtypes", ":framework_ops", ":resource_variable_ops_gen", ":tensor_shape", @@ -1775,7 +1777,7 @@ py_library( "//tensorflow/python/eager:context", "//tensorflow/python/eager:custom_gradient", "//tensorflow/python/eager:tape", - "//tensorflow/python/eager:tensor", + "//tensorflow/python/eager:tensor_node", ], ) @@ -2347,7 +2349,7 @@ cuda_py_test( cuda_py_test( name = "gradients_test", - size = "small", + size = "medium", srcs = ["ops/gradients_test.py"], additional_deps = [ ":array_grad", @@ -2550,6 +2552,7 @@ py_library( srcs_version = "PY2AND3", deps = [ ":array_ops", + ":checkpoint_ops_gen", ":client", ":control_flow_ops", ":data_flow_ops", @@ -3573,6 +3576,28 @@ py_test( ], ) +py_test( + name = "checkpoint_ops_test", + size = "small", + srcs = ["training/checkpoint_ops_test.py"], + srcs_version = "PY2AND3", + tags = ["no_windows"], + deps = [ + ":checkpoint_ops_gen", + ":client", + ":client_testlib", + ":framework_for_generated_wrappers", + ":io_ops", + ":partitioned_variables", + ":platform", + ":pywrap_tensorflow", + ":state_ops", + ":training", + ":variable_scope", + ":variables", + ], +) + py_test( name = "monitored_session_test", size = "small", diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i index 08dd3922dbe..fa49e66e87b 100644 --- a/tensorflow/python/client/tf_session.i +++ b/tensorflow/python/client/tf_session.i @@ -373,6 +373,33 @@ def TF_Reset(target, containers=None, config=None): TF_DeleteSessionOptions(opts) %} +// We use TF_GraphToFunction_wrapper instead of TF_GraphToFunction +%ignore TF_GraphToFunction; +// TF_GraphToFunction_wrapper does not use any Python methods and 
+// does not require GIL to be held. +%unignore TF_GraphToFunction_wrapper; + +// $input is a Python list of wrapped TF_Operations +%typemap(in) (const std::vector<TF_Operation*>* opers) + (std::vector<TF_Operation*> opers) { + if ($input != Py_None) { + if (!PyList_Check($input)) { + SWIG_exception_fail(SWIG_TypeError, "$symname: expected list"); + } + size_t size = PyList_Size($input); + for (int i = 0; i < size; ++i) { + PyObject* item = PyList_GetItem($input, i); + TF_Operation* oper_ptr; + SWIG_ConvertPtr(item, reinterpret_cast<void**>(&oper_ptr), + $descriptor(TF_Operation*), 0); + opers.push_back(oper_ptr); + } + $1 = &opers; + } else { + $1 = nullptr; + } +} + %include "tensorflow/python/client/tf_session_helper.h" %unignoreall diff --git a/tensorflow/python/client/tf_session_helper.cc b/tensorflow/python/client/tf_session_helper.cc index 60a589fa8bb..72f560fa878 100644 --- a/tensorflow/python/client/tf_session_helper.cc +++ b/tensorflow/python/client/tf_session_helper.cc @@ -337,4 +337,38 @@ std::vector<TF_Operation*> TF_OperationGetControlInputs_wrapper( return control_inputs; } +TF_Function* TF_GraphToFunction_wrapper(const TF_Graph* fn_body, + const char* fn_name, + const std::vector<TF_Operation*>* opers, + const std::vector<TF_Output>& inputs, + const std::vector<TF_Output>& outputs, + const NameVector& output_names, + const TF_FunctionOptions* opts, + TF_Status* out_status) { + if (!output_names.empty() && output_names.size() != outputs.size()) { + Set_TF_Status_from_Status( + out_status, + errors::InvalidArgument( + "output names must be either empty or equal in size to outputs. ", + "output names size = ", output_names.size(), + " outputs size = ", outputs.size())); + return nullptr; + } + + int nopers = -1; + const TF_Operation* const* opers_array = nullptr; + if (opers != nullptr) { + nopers = opers->size(); + opers_array = opers->data(); + } + + const char** output_names_ptr = + output_names.empty() ? nullptr + : const_cast<const char**>(output_names.data()); + + return TF_GraphToFunction(fn_body, fn_name, nopers, opers_array, + inputs.size(), inputs.data(), outputs.size(), + outputs.data(), output_names_ptr, opts, out_status); + } + } // namespace tensorflow diff --git a/tensorflow/python/client/tf_session_helper.h b/tensorflow/python/client/tf_session_helper.h index 3bc63f822fe..8fae6206c07 100644 --- a/tensorflow/python/client/tf_session_helper.h +++ b/tensorflow/python/client/tf_session_helper.h @@ -148,6 +148,16 @@ void TF_SessionPRun_wrapper(TF_Session* session, const char* handle, std::vector<TF_Operation*> TF_OperationGetControlInputs_wrapper( TF_Operation* oper); +// `opers` equaling NULL are converted to `nopers = -1`. +// `output_names` must be empty or have the same length as `outputs`.
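+// If `output_names` is non-empty but its size does not match `outputs`, NULL is returned and `out_status` is set to an InvalidArgument error.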
+TF_Function* TF_GraphToFunction_wrapper(const TF_Graph* fn_body, + const char* fn_name, + const std::vector<TF_Operation*>* opers, + const std::vector<TF_Output>& inputs, + const std::vector<TF_Output>& outputs, + const NameVector& output_names, + const TF_FunctionOptions* opts, + TF_Status* out_status); } // namespace tensorflow #endif // TENSORFLOW_PYTHON_CLIENT_TF_SESSION_HELPER_H_ diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py index ca3ad1a2c33..326f56ebf9b 100644 --- a/tensorflow/python/eager/backprop.py +++ b/tensorflow/python/eager/backprop.py @@ -169,10 +169,6 @@ def _record_gradient(op_name, inputs, attrs, results, name): execute.record_gradient = _record_gradient -def _ones(shape, dtype): - return array_ops.fill(shape, tensor.Tensor(1, dtype=dtype)) - - def _aggregate_grads(gradients): """Aggregate gradients of the same tensor.""" grad_lists = dict() @@ -225,7 +221,7 @@ def implicit_val_and_grad(f): (end_node.progenitors, repr(start_node))) output_gradients = kwds.get("output_gradients", None) if output_gradients is None: - output_gradients = _ones(end_node.shape, end_node.dtype) + output_gradients = array_ops.ones_like(end_node.value) grad = ag_core.backward_pass(output_gradients, end_node, start_node) return end_node.value, _aggregate_grads(grad.gradients) diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py index 010124ed56a..b4379055096 100644 --- a/tensorflow/python/eager/backprop_test.py +++ b/tensorflow/python/eager/backprop_test.py @@ -85,7 +85,7 @@ class BackpropTest(test.TestCase): initial_value=tensor.Tensor(1.0), name='x') def fn(): - tape.watch(x.handle) + tape.watch_variable(x) b = tensor.Tensor(2.0) c = math_ops.add(x.value(), b) return math_ops.add(c, tensor.Tensor(3.0)) @@ -307,6 +307,20 @@ class BackpropTest(test.TestCase): [tensor_shape.TensorShape(s).as_proto() for s in shape_list], backprop.make_attr([pywrap_tensorflow.TF_ATTR_SHAPE], shape_list)) + def testMultiValueConvertToTensor(self): + x = resource_variable_ops.ResourceVariable( + initial_value=array_ops.constant([1.0]), name='x') + + def fn(): + tape.watch_variable(x) + a = math_ops.add(x.value(), 1.0) + # Make sure convert_to_tensor works correctly with list of TensorNodes.
+ b = array_ops.stack([a, a], axis=0) + return math_ops.reduce_mean(b) + + grad = backprop.implicit_grad(fn)()[0][1] + self.assertAllEqual([1.0], grad.numpy()) + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index 27ffdd98105..a5a93b7bbe0 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -171,16 +171,6 @@ class Context(object): """Sets summary writer resource.""" self._summary_writer_resource = resource - @property - def recording_summaries(self): - """Returns True if recording summaries is enabled in current thread..""" - return self._eager_context.recording_summaries - - @recording_summaries.setter - def recording_summaries(self, val): - """Enables recording summaries is enabled in current thread..""" - self._eager_context.recording_summaries = val - @property def device_name(self): """Returns the device name for the current thread.""" @@ -360,24 +350,6 @@ def device(name): return context().device(name) -@contextlib.contextmanager -def record_summaries(): - """Context-manager to enable recording of summaries.""" - ctx = context() - old = ctx.recording_summaries - ctx.recording_summaries = True - try: - yield - finally: - ctx.recording_summaries = old - - -def should_record_summary(): - """True if a summary should be recorded now.""" - c = context() - return c.recording_summaries and c.summary_writer_resource is not None - - def run(main=None, argv=None): """Runs the program with an optional 'main' function and 'argv' list. diff --git a/tensorflow/python/eager/core_test.py b/tensorflow/python/eager/core_test.py index 7ae80aa156a..5de396f62c3 100644 --- a/tensorflow/python/eager/core_test.py +++ b/tensorflow/python/eager/core_test.py @@ -55,10 +55,6 @@ class TFETest(test_util.TensorFlowTestCase): ctx.summary_writer_resource = 'mock' self.assertEqual('mock', ctx.summary_writer_resource) - self.assertFalse(ctx.recording_summaries) - ctx.recording_summaries = True - self.assertTrue(ctx.recording_summaries) - self.assertEqual('', ctx.device_name) self.assertEqual(ctx.device_name, ctx.device_spec.to_string()) with ctx.device('GPU:0'): @@ -95,8 +91,7 @@ class TFETest(test_util.TensorFlowTestCase): return [ ctx.in_graph_mode(), ctx.in_eager_mode(), ctx.scope_name, ctx.summary_writer_resource, - ctx.recording_summaries, ctx.device_name, - ctx.num_gpus() + ctx.device_name, ctx.num_gpus() ] def get_values(ctx, values): diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py index 18b722e7923..c15dde9e487 100644 --- a/tensorflow/python/eager/function_test.py +++ b/tensorflow/python/eager/function_test.py @@ -29,6 +29,7 @@ from tensorflow.python.framework import function as tf_function from tensorflow.python.ops import array_ops from tensorflow.python.ops import clip_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import resource_variable_ops class FunctionTest(test.TestCase): @@ -52,6 +53,19 @@ class FunctionTest(test.TestCase): out = sq(t) self.assertAllEqual(out.numpy(), math_ops.matmul(t, t).numpy()) + def testGraphModeWithGradients(self): + v = resource_variable_ops.ResourceVariable(1.0) + + @function.defun + def step(): + def inner(): + tape.watch(v.handle) + return v * v + + return backprop.implicit_grad(inner)()[0][1] + + self.assertAllEqual(step().numpy(), 2.0) + def testTensorConversionWithDefun(self): @function.defun diff --git a/tensorflow/python/eager/python_eager_op_gen.cc 
b/tensorflow/python/eager/python_eager_op_gen.cc index c46a3d8db37..62579bd23ae 100644 --- a/tensorflow/python/eager/python_eager_op_gen.cc +++ b/tensorflow/python/eager/python_eager_op_gen.cc @@ -659,14 +659,26 @@ void GenEagerPythonOp::AddEagerExecute(const string& num_outputs_expr) { string GetEagerPythonOps(const OpList& ops, const std::vector& hidden_ops, - bool require_shapes) { + bool require_shapes, + const string& source_file_name = "") { + string result; // Header // TODO(josh11b): Mention the library for which wrappers are being generated. - strings::StrAppend(&result, R"("""Python wrappers for TensorFlow ops. + strings::StrAppend(&result, R"("""Python wrappers around TensorFlow ops. This file is MACHINE GENERATED! Do not edit. -""" +)"); + + // Mention the original source file so someone tracing back through generated + // Python code will know where to look next. + if (!source_file_name.empty()) { + strings::StrAppend(&result, "Original C++ source file: "); + strings::StrAppend(&result, source_file_name); + strings::StrAppend(&result, "\n"); + } + + strings::StrAppend(&result, R"(""" import collections as _collections @@ -747,8 +759,11 @@ from tensorflow.python.framework import op_def_library as _op_def_library void PrintEagerPythonOps(const OpList& ops, const std::vector& hidden_ops, - bool require_shapes) { - printf("%s", GetEagerPythonOps(ops, hidden_ops, require_shapes).c_str()); + bool require_shapes, + const string& source_file_name) +{ + printf("%s", GetEagerPythonOps(ops, hidden_ops, require_shapes, + source_file_name).c_str()); } string GetEagerPythonWrappers(const char* op_list_buf, size_t op_list_len) { diff --git a/tensorflow/python/eager/python_eager_op_gen.h b/tensorflow/python/eager/python_eager_op_gen.h index 9a7ed28cf94..250623850f2 100644 --- a/tensorflow/python/eager/python_eager_op_gen.h +++ b/tensorflow/python/eager/python_eager_op_gen.h @@ -24,9 +24,12 @@ namespace tensorflow { // hidden_ops should be a list of Op names that should get a leading _ // in the output. Prints the output to stdout. +// Optional fourth argument is the name of the original C++ source file +// where the ops' REGISTER_OP() calls reside. void PrintEagerPythonOps(const OpList& ops, const std::vector& hidden_ops, - bool require_shapes); + bool require_shapes, + const string& source_file_name = ""); // Get the python wrappers for a list of ops in a OpList. // `op_list_buf` should be a pointer to a buffer containing diff --git a/tensorflow/python/eager/tape.py b/tensorflow/python/eager/tape.py index 4d09db73c97..9cd29f630df 100644 --- a/tensorflow/python/eager/tape.py +++ b/tensorflow/python/eager/tape.py @@ -151,6 +151,15 @@ def watch(tensor): return tensor +def watch_variable(resource_variable): + """Marks this ResourceVariable to be watched by all tapes in the stack. + + Args: + resource_variable: A ResourceVariable to be watched. 
+ """ + watch(resource_variable.handle) # py-lint: disable=protected-access + + def pop_tape(): """Pops the top tape in the stack, if any.""" if _tape_stack.stack: diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py index 8d0f639ddcb..bd8e653b976 100644 --- a/tensorflow/python/eager/tensor_test.py +++ b/tensorflow/python/eager/tensor_test.py @@ -77,8 +77,8 @@ class TFETensorTest(test_util.TensorFlowTestCase): def testMultiLineTensorStr(self): t = tensor.Tensor(np.eye(3)) tensor_str = str(t) - self.assertIn("shape=%s, dtype=%s, " % (t.shape, t.dtype.name), tensor_str) - self.assertIn("numpy=\n%s" % t.numpy(), tensor_str) + self.assertIn("shape=%s, dtype=%s" % (t.shape, t.dtype.name), tensor_str) + self.assertIn(str(t.numpy()), tensor_str) def testMultiLineTensorRepr(self): t = tensor.Tensor(np.eye(3)) @@ -95,7 +95,7 @@ class TFETensorTest(test_util.TensorFlowTestCase): np.set_printoptions(threshold=2, edgeitems=1) t = tensor.Tensor(np.arange(10, dtype=np.int32)) - self.assertIn("numpy=[0 ..., 9]", str(t)) + self.assertIn("[0 ..., 9]", str(t)) self.assertIn("[0, ..., 9]", repr(t)) # Clean up: reset to previous printoptions. @@ -103,7 +103,7 @@ class TFETensorTest(test_util.TensorFlowTestCase): def testZeroDimTensorStr(self): t = tensor.Tensor(42) - self.assertIn("shape=(), dtype=int32, numpy=42", str(t)) + self.assertIn("42, shape=(), dtype=int32", str(t)) def testZeroDimTensorRepr(self): t = tensor.Tensor(42) @@ -113,7 +113,7 @@ class TFETensorTest(test_util.TensorFlowTestCase): def testZeroSizeTensorStr(self): t = tensor.Tensor(np.zeros(0, dtype=np.float32)) - self.assertIn("shape=(0,), dtype=float32, numpy=[]", str(t)) + self.assertIn("[], shape=(0,), dtype=float32", str(t)) def testZeroSizeTensorRepr(self): t = tensor.Tensor(np.zeros(0, dtype=np.float32)) @@ -127,8 +127,8 @@ class TFETensorTest(test_util.TensorFlowTestCase): t = tensor.Tensor(42) # Force change dtype to a numpy-unprintable type. 
t._dtype = dtypes.resource - self.assertIn("numpy=", str(t)) - self.assertIn("numpy=", repr(t)) + self.assertIn("", str(t)) + self.assertIn("", repr(t)) def testStringTensor(self): t_np_orig = np.array([[b"a", b"ab"], [b"abc", b"abcd"]]) diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD index 83eeeb35b67..167f9b10543 100644 --- a/tensorflow/python/estimator/BUILD +++ b/tensorflow/python/estimator/BUILD @@ -148,6 +148,7 @@ py_test( name = "dnn_test", size = "medium", srcs = ["canned/dnn_test.py"], + shard_count = 4, srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ @@ -201,7 +202,7 @@ py_test( name = "dnn_linear_combined_test", size = "medium", srcs = ["canned/dnn_linear_combined_test.py"], - shard_count = 4, + shard_count = 8, srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ @@ -552,11 +553,9 @@ py_test( name = "linear_test", size = "medium", srcs = ["canned/linear_test.py"], + shard_count = 4, srcs_version = "PY2AND3", - tags = [ - "no_pip", - "noasan", # times out b/63680444 - ], + tags = ["no_pip"], deps = [ ":linear", ":linear_testing_utils", diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py index d2c5772483b..80d109d927a 100644 --- a/tensorflow/python/estimator/canned/head.py +++ b/tensorflow/python/estimator/canned/head.py @@ -200,8 +200,11 @@ def _check_labels(labels, expected_labels_dimension): dim1 = static_shape[1] if (dim1 is not None) and (dim1 != expected_labels_dimension): raise ValueError( - 'labels shape must be [batch_size, labels_dimension], got %s.' % - (static_shape,)) + 'Mismatched label shape. ' + 'Classifier configured with n_classes=%s. Received %s. ' + 'Suggested Fix: check your n_classes argument to the estimator ' + 'and/or the shape of your label.' % + (expected_labels_dimension, dim1)) assert_dimension = check_ops.assert_equal( expected_labels_dimension, labels_shape[1], message=err_msg) with ops.control_dependencies([assert_dimension]): diff --git a/tensorflow/python/estimator/canned/head_test.py b/tensorflow/python/estimator/canned/head_test.py index 23678013c66..fa3d5b44eb6 100644 --- a/tensorflow/python/estimator/canned/head_test.py +++ b/tensorflow/python/estimator/canned/head_test.py @@ -139,7 +139,7 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase): features = {'x': np.array(((42.,),))} # Static shape. - with self.assertRaisesRegexp(ValueError, 'labels shape'): + with self.assertRaisesRegexp(ValueError, 'Mismatched label shape'): head.create_loss( features=features, mode=model_fn.ModeKeys.EVAL, @@ -889,7 +889,7 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase): logits_2x1 = np.array(((45.,), (41.,),)) # Static shape. - with self.assertRaisesRegexp(ValueError, 'labels shape'): + with self.assertRaisesRegexp(ValueError, 'Mismatched label shape'): head.create_loss( features={'x': np.array(((42.,),))}, mode=model_fn.ModeKeys.EVAL, @@ -1692,7 +1692,7 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase): values_1d = np.array(((43.,), (44.,),)) # Static shape. - with self.assertRaisesRegexp(ValueError, 'labels shape'): + with self.assertRaisesRegexp(ValueError, 'Mismatched label shape'): head.create_loss( features={'x': values_1d}, mode=model_fn.ModeKeys.EVAL, @@ -1737,7 +1737,7 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase): values_1d = np.array(((43.,), (44.,),)) # Static shape. 
- with self.assertRaisesRegexp(ValueError, 'labels shape'): + with self.assertRaisesRegexp(ValueError, 'Mismatched label shape'): head.create_loss( features={'x': values_1d}, mode=model_fn.ModeKeys.TRAIN, diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py index a8434d0c991..f64235d70b0 100644 --- a/tensorflow/python/feature_column/feature_column.py +++ b/tensorflow/python/feature_column/feature_column.py @@ -2474,6 +2474,11 @@ class _IndicatorColumn(_DenseColumn, sp_ids=id_tensor, sp_values=weight_tensor, vocab_size=int(self._variable_shape[-1])) + # Remove (?, -1) index + weighted_column = sparse_ops.sparse_slice( + weighted_column, + [0, 0], + weighted_column.dense_shape) return sparse_ops.sparse_tensor_to_dense(weighted_column) dense_id_tensor = sparse_ops.sparse_tensor_to_dense( diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py index 5138f31e981..e707770f8a3 100644 --- a/tensorflow/python/feature_column/feature_column_test.py +++ b/tensorflow/python/feature_column/feature_column_test.py @@ -3213,13 +3213,39 @@ class IndicatorColumnTest(test.TestCase): weights = fc.weighted_categorical_column(ids, 'weights') indicator = fc.indicator_column(weights) features = { - 'ids': constant_op.constant(['c', 'b', 'a'], shape=(1, 3)), - 'weights': constant_op.constant([2., 4., 6.], shape=(1, 3)) + 'ids': constant_op.constant([['c', 'b', 'a']]), + 'weights': constant_op.constant([[2., 4., 6.]]) } indicator_tensor = _transform_features(features, [indicator])[indicator] with _initialized_session(): self.assertAllEqual([[6., 4., 2.]], indicator_tensor.eval()) + def test_transform_with_missing_value_in_weighted_column(self): + # Github issue 12583 + ids = fc.categorical_column_with_vocabulary_list( + key='ids', vocabulary_list=('a', 'b', 'c')) + weights = fc.weighted_categorical_column(ids, 'weights') + indicator = fc.indicator_column(weights) + features = { + 'ids': constant_op.constant([['c', 'b', 'unknown']]), + 'weights': constant_op.constant([[2., 4., 6.]]) + } + indicator_tensor = _transform_features(features, [indicator])[indicator] + with _initialized_session(): + self.assertAllEqual([[0., 4., 2.]], indicator_tensor.eval()) + + def test_transform_with_missing_value_in_categorical_column(self): + # Github issue 12583 + ids = fc.categorical_column_with_vocabulary_list( + key='ids', vocabulary_list=('a', 'b', 'c')) + indicator = fc.indicator_column(ids) + features = { + 'ids': constant_op.constant([['c', 'b', 'unknown']]), + } + indicator_tensor = _transform_features(features, [indicator])[indicator] + with _initialized_session(): + self.assertAllEqual([[0., 1., 1.]], indicator_tensor.eval()) + def test_linear_model(self): animal = fc.indicator_column( fc.categorical_column_with_identity('animal', num_buckets=4)) diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py index 2f35f0e04b6..7a866ee6e8a 100644 --- a/tensorflow/python/framework/function.py +++ b/tensorflow/python/framework/function.py @@ -26,7 +26,9 @@ import hashlib from tensorflow.core.framework import attr_value_pb2 from tensorflow.core.framework import op_def_pb2 +from tensorflow.python import pywrap_tensorflow as c_api from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import graph_to_function_def from tensorflow.python.framework import ops from tensorflow.python.ops import 
array_ops @@ -290,6 +292,7 @@ class _DefinedFunction(object): self._shape_func = shape_func self._extra_kwargs = kwargs self._definition = None # Constructed lazily. + self._c_func = None # Constructed with definition. self._sub_functions = dict() # Constructed with definition. self._args = [] @@ -396,6 +399,22 @@ class _DefinedFunction(object): if self._func.__doc__: self._definition.signature.description = self._func.__doc__ + # pylint: disable=protected-access + if temp_graph._c_graph: + with errors.raise_exception_on_not_ok_status() as status: + output_names = ([compat.as_bytes(x) for x in self._out_names] + if self._out_names else []) + self._c_func = c_api.TF_GraphToFunction_wrapper( + temp_graph._c_graph, + self._func_name, + None, # opers + [t._as_tf_output() for t in inputs], + [t._as_tf_output() for t in outputs], + output_names, + None, # opts + status) + # pylint: enable=protected-access + def _create_hash_str(self, input_arg, output_arg, node_def): """Creates an 8-character string unique to this input. diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py index 589db9ef4dc..40205ddf053 100644 --- a/tensorflow/python/framework/function_test.py +++ b/tensorflow/python/framework/function_test.py @@ -33,6 +33,7 @@ from tensorflow.python.framework import function from tensorflow.python.framework import graph_to_function_def from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import clip_ops from tensorflow.python.ops import control_flow_ops @@ -63,7 +64,51 @@ def _OptimizerOptions(): do_constant_folding=cfold))) -class FunctionTest(test.TestCase): +class FunctionTestMethods(object): + """Test methods for verifying Function support. + + These test methods are used as mix-ins in two test cases: with + and without C API support. 
+ """ + + def testIdentity(self): + + @function.Defun(dtypes.float32, func_name="MyIdentity") + def MyIdentityFunc(a): + return a + + with ops.Graph().as_default(): + call = MyIdentityFunc([18.0]) + self.assertEqual("MyIdentity", call.op.name) + with session.Session() as sess: + self.assertAllEqual([18.0], sess.run(call)) + + def testIdentityOutputName(self): + + @function.Defun( + dtypes.float32, func_name="MyIdentity", out_names=["my_result_name"]) + def MyIdentityFunc(a): + return a + + with ops.Graph().as_default(): + call = MyIdentityFunc([18.0]) + self.assertEqual("MyIdentity", call.op.name) + with session.Session() as sess: + self.assertAllEqual([18.0], sess.run(call)) + + def testTooManyOutputNames(self): + + @function.Defun( + dtypes.float32, func_name="MyIdentity", + out_names=["my_result1", "my_result2"]) + def MyIdentityFunc(a): + return a + + with ops.Graph().as_default(): + with self.assertRaisesRegexp( + ValueError, (r"Length of out_names \(2\) does not match number of " + r"outputs \(1\): my_result1, my_result2")): + MyIdentityFunc([18.0]) def testDefineFunction2Args(self): @@ -77,6 +122,35 @@ class FunctionTest(test.TestCase): with session.Session() as sess: self.assertAllEqual([5.0], sess.run(call)) + def testValueErrorOnFunctionWithNoOutput(self): + # TODO(iga): Remove this restriction and this test + + @function.Defun(dtypes.float32, dtypes.float32) + def APlus2B(a, b): + print(a + b * 2) # Create some ops to have nodes in the body + # Using 'print' to make lint happy + + with ops.Graph().as_default(): + with self.assertRaisesRegexp(ValueError, + "Function can not return None"): + APlus2B([1.0], [2.0]) + + def testDefineFunction2ArgsOutputName(self): + + @function.Defun( + dtypes.float32, + dtypes.float32, + func_name="APlus2B", + out_names=["my_result_name"]) + def APlus2B(a, b): + return a + b * 2 + + with ops.Graph().as_default(): + call = APlus2B([1.0], [2.0]) + self.assertEqual("APlus2B", call.op.name) + with session.Session() as sess: + self.assertAllEqual([5.0], sess.run(call)) + def testDefineFunctionDuplicateOutputs(self): @function.Defun(dtypes.float32, func_name="Duplicate") @@ -137,6 +211,7 @@ class FunctionTest(test.TestCase): out, = sess.run(dx, feed) self.assertAllClose(1 - np.square(np.tanh(inp)), out) + @test_util.disable_c_api # Function gradients don't work with C API def testCustomGradient(self): dtype = dtypes.float32 @@ -169,6 +244,7 @@ class FunctionTest(test.TestCase): out, = sess.run(dlogits, {logits: x, labels: y}) self.assertAllClose(out, np.exp(prob - y)) + @test_util.disable_c_api # Function gradients don't work with C API def testCustomGradientError(self): dtype = dtypes.float32 @@ -194,6 +270,7 @@ class FunctionTest(test.TestCase): "SymGrad expects to return 1.*but get 2.*instead"): _ = sess.run(dinp, {inp: x}) + @test_util.disable_c_api # Function gradients don't work with C API def testSymGradShape(self): g = ops.Graph() with g.as_default(): @@ -209,6 +286,7 @@ class FunctionTest(test.TestCase): self.assertEqual(x.get_shape(), dx.get_shape()) self.assertEqual(y.get_shape(), dy.get_shape()) + @test_util.disable_c_api # Function gradients don't work with C API def testSymGradAttr(self): @function.Defun(noinline=True) @@ -312,6 +390,7 @@ class FunctionTest(test.TestCase): "assertion failed.*-3"): self.assertAllEqual(Foo(constant_op.constant(-3.0)).eval(), 6.0) + @test_util.disable_c_api # Op._add_control_inputs doesn't work with C API def testAssertWrapper(self): @function.Defun(dtypes.float32) @@ -326,6 +405,7 @@ class 
FunctionTest(test.TestCase): "assertion"): _ = MyFn(100.0).eval() + @test_util.disable_c_api # Op._add_control_inputs doesn't work with C API def testWhileLoopCallsFunc(self): with self.test_session(use_gpu=True) as sess: @@ -345,6 +425,7 @@ class FunctionTest(test.TestCase): ans = sess.run(loop) self.assertAllClose(ans, 131072.) + @test_util.disable_c_api # Op._add_control_inputs doesn't work with C API def testControlFlowStrictness(self): """Inlined functions must not execute in a untaken control flow branch.""" @@ -607,6 +688,7 @@ class FunctionTest(test.TestCase): self.assertAllClose(vals[0], vals[1]) self.assertAllClose(vals[2], vals[3]) + @test_util.disable_c_api # Function Declaration doesn't work with C API def testDeclare(self): foo = function.Declare("Foo", [("x", dtypes.float32)], [("y", dtypes.float32)]) @@ -626,6 +708,7 @@ class FunctionTest(test.TestCase): expected = rand * rand + 1.0 self.assertAllClose(expected, y.eval(feed_dict={x: rand})) + @test_util.disable_c_api # Function Declaration doesn't work with C API def testDeclareUsedInDefun(self): foo = function.Declare("Foo", [("x", dtypes.float32)], [("y", dtypes.float32)]) @@ -649,6 +732,7 @@ class FunctionTest(test.TestCase): expected = rand * rand + 1.0 self.assertAllClose(expected, y.eval(feed_dict={x: rand})) + @test_util.disable_c_api # Function Declaration doesn't work with C API def testDeclareTypeMistake(self): foo = function.Declare("Foo", [("x", dtypes.float32)], [("y", dtypes.float32)]) @@ -861,6 +945,32 @@ class FunctionTest(test.TestCase): self.assertEqual(len(f.signature.input_arg), 3) +class FunctionTest(FunctionTestMethods, test.TestCase): + """Test case that invokes test methods with _USE_C_API=False.""" + + def setUp(self): + self.prev_use_c_api = ops._USE_C_API + ops._USE_C_API = False + super(FunctionTest, self).setUp() + + def tearDown(self): + ops._USE_C_API = self.prev_use_c_api + super(FunctionTest, self).tearDown() + + +class FunctionWithCApiTest(FunctionTestMethods, test.TestCase): + """Test case that invokes test methods with _USE_C_API=True.""" + + def setUp(self): + self.prev_use_c_api = ops._USE_C_API + ops._USE_C_API = True + super(FunctionWithCApiTest, self).setUp() + + def tearDown(self): + ops._USE_C_API = self.prev_use_c_api + super(FunctionWithCApiTest, self).tearDown() + + class FunctionsFromProtos(test.TestCase): def expectFunctionsEqual(self, func, grad_func=None, new_func=None): diff --git a/tensorflow/python/framework/op_def_library.py b/tensorflow/python/framework/op_def_library.py index aa373600669..76424ef579b 100644 --- a/tensorflow/python/framework/op_def_library.py +++ b/tensorflow/python/framework/op_def_library.py @@ -784,6 +784,7 @@ class OpDefLibrary(object): if arg.is_ref] with _MaybeColocateWith(must_colocate_inputs): # Add Op to graph + inputs = [ag_core.getval(x) for x in inputs] op = g.create_op(op_type_name, inputs, output_types, name=scope, input_types=input_types, attrs=attr_protos, op_def=op_def) diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 5a0c323ce47..b197e96886e 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -49,6 +49,7 @@ from tensorflow.python.framework import versions from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import compat from tensorflow.python.util import decorator_utils +from tensorflow.python.util import nest from tensorflow.python.util import tf_contextlib # Temporary global switch determining if we should enable the 
work-in-progress @@ -604,6 +605,13 @@ def _maybe_modify_numpy_dtype_determination(np_array): return np_array +def _has_string(value): + if isinstance(value, compat.bytes_or_text_types): return True + if isinstance(value, collections.Sequence) and value: + return _has_string(value[0]) + return False + + # TODO(agarwal): rename to TensorHandle. class EagerTensor(Tensor): """A TensorFlow Eager Tensor.""" @@ -625,6 +633,8 @@ class EagerTensor(Tensor): # https://www.tensorflow.org/code/tensorflow/python/framework/constant_op.py self._id = uid() if not isinstance(value, np.ndarray): + if dtype is None and _has_string(value): + dtype = dtypes.string npt = None if dtype is None else dtype.as_numpy_dtype try: value = np.array(value, dtype=npt) @@ -712,12 +722,12 @@ class EagerTensor(Tensor): return numpy_text def __str__(self): - return "tfe.Tensor(shape=%s, dtype=%s, numpy=%s)" % (self.shape, - self.dtype.name, - self._numpy_text()) + return "tf.Tensor(%s, shape=%s, dtype=%s)" % (self._numpy_text(), + self.shape, + self.dtype.name) def __repr__(self): - return "" % ( + return "" % ( self._id, self.shape, self.dtype.name, self._numpy_text(is_repr=True)) @staticmethod @@ -1027,12 +1037,19 @@ def internal_convert_to_tensor(value, # tracing gradients, to ensure the same behavior happens with and without # tracing. unwrapped = ag_core.getval(value) - # Fast path for EagerTensors that don't need any conversion. - if isinstance(unwrapped, EagerTensor) and context.in_eager_mode(): - # Note that we don't check that value's dtype matches the dtype - # argument. We exepct that the C runtime will do that checking - # when we execute the kernel. - return value + + if context.in_eager_mode(): + # Fast path for EagerTensors that don't need any conversion. + if isinstance(unwrapped, EagerTensor): + # Note that we don't check that value's dtype matches the dtype + # argument. We exepct that the C runtime will do that checking + # when we execute the kernel. + return value + values = nest.flatten(value) + if (len(values) > 1 and + any(isinstance(ag_core.getval(v), EagerTensor) for v in values)): + raise TypeError("Cannot convert to a eager tensor.") + if dtype is not None: dtype = dtypes.as_dtype(dtype) unwrapped_type = type(unwrapped) @@ -2939,6 +2956,14 @@ class Graph(object): if self._graph_def_versions.min_consumer < 12: self._graph_def_versions.min_consumer = 12 self._functions[name] = function + if self._c_graph: + # pylint: disable=protected-access + assert function._c_func, ( + "Cannot add function created without C API support to graph " + "created with C API support") + with errors.raise_exception_on_not_ok_status() as status: + c_api.TF_GraphAddFunction(self._c_graph, function._c_func, status) + # pylint: enable=protected-access @property def building_function(self): diff --git a/tensorflow/python/framework/python_op_gen_main.cc b/tensorflow/python/framework/python_op_gen_main.cc index 83665422885..3cf56330e0d 100644 --- a/tensorflow/python/framework/python_op_gen_main.cc +++ b/tensorflow/python/framework/python_op_gen_main.cc @@ -24,6 +24,7 @@ limitations under the License. 
#include "tensorflow/core/framework/op_def.pb.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/io/inputbuffer.h" +#include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/scanner.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/init_main.h" @@ -80,7 +81,31 @@ Status ParseOpListCommandLine(const char* arg, std::vector* op_list) { return Status::OK(); } -void PrintAllPythonOps(const std::vector& op_list, bool require_shapes, + +// Use the name of the current executable to infer the C++ source file +// where the REGISTER_OP() call for the operator can be found. +// Returns the name of the file. +// Returns an empty string if the current executable's name does not +// follow a known pattern. +string InferSourceFileName(const char* argv_zero) { + StringPiece command_str = io::Basename(argv_zero); + + // For built-in ops, the Bazel build creates a separate executable + // with the name gen__ops_py_wrappers_cc containing the + // operators defined in _ops.cc + const char* kExecPrefix = "gen_"; + const char* kExecSuffix = "_py_wrappers_cc"; + if (command_str.Consume(kExecPrefix) && command_str.ends_with(kExecSuffix)) { + command_str.remove_suffix(strlen(kExecSuffix)); + return strings::StrCat(command_str, ".cc"); + } else { + return string(""); + } +} + +void PrintAllPythonOps(const std::vector& op_list, + const string& source_file_name, + bool require_shapes, bool op_list_is_whitelist) { OpList ops; OpRegistry::Global()->Export(false, &ops); @@ -93,9 +118,9 @@ void PrintAllPythonOps(const std::vector& op_list, bool require_shapes, *pruned_ops.mutable_op()->Add() = op_def; } } - PrintEagerPythonOps(pruned_ops, {}, require_shapes); + PrintEagerPythonOps(pruned_ops, {}, require_shapes, source_file_name); } else { - PrintEagerPythonOps(ops, op_list, require_shapes); + PrintEagerPythonOps(ops, op_list, require_shapes, source_file_name); } } @@ -105,20 +130,26 @@ void PrintAllPythonOps(const std::vector& op_list, bool require_shapes, int main(int argc, char* argv[]) { tensorflow::port::InitMain(argv[0], &argc, &argv); + tensorflow::string source_file_name = + tensorflow::InferSourceFileName(argv[0]); + // Usage: // gen_main [ @FILENAME | OpName[,OpName]* ] (0 | 1) [0 | 1] if (argc == 2) { - tensorflow::PrintAllPythonOps({}, {}, tensorflow::string(argv[1]) == "1"); + tensorflow::PrintAllPythonOps({}, source_file_name, + tensorflow::string(argv[1]) == "1", + false /* op_list_is_whitelist */); } else if (argc == 3) { std::vector hidden_ops; TF_CHECK_OK(tensorflow::ParseOpListCommandLine(argv[1], &hidden_ops)); - tensorflow::PrintAllPythonOps(hidden_ops, + tensorflow::PrintAllPythonOps(hidden_ops, source_file_name, tensorflow::string(argv[2]) == "1", false /* op_list_is_whitelist */); } else if (argc == 4) { std::vector op_list; TF_CHECK_OK(tensorflow::ParseOpListCommandLine(argv[1], &op_list)); - tensorflow::PrintAllPythonOps(op_list, tensorflow::string(argv[2]) == "1", + tensorflow::PrintAllPythonOps(op_list, source_file_name, + tensorflow::string(argv[2]) == "1", tensorflow::string(argv[3]) == "1"); } else { return -1; diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py index eea3d28a7e4..8c0975b11b3 100644 --- a/tensorflow/python/framework/tensor_util.py +++ b/tensorflow/python/framework/tensor_util.py @@ -236,7 +236,9 @@ def _FilterTuple(v): def _FilterInt(v): if isinstance(v, (list, tuple)): return _FirstNotNone([_FilterInt(x) for x in v]) - return None if isinstance(v, 
compat.integral_types) else _NotNone(v) + return None if isinstance( + v, + (compat.integral_types, tensor_shape.Dimension)) else _NotNone(v) def _FilterFloat(v): diff --git a/tensorflow/python/framework/tensor_util_test.py b/tensorflow/python/framework/tensor_util_test.py index 2760f98a6bc..ca47274e9a4 100644 --- a/tensorflow/python/framework/tensor_util_test.py +++ b/tensorflow/python/framework/tensor_util_test.py @@ -314,6 +314,17 @@ class TensorUtilTest(test.TestCase): shape=[3, 4], dtype=dtype))) + def testIntMixedWithDimension(self): + # Github issue: 11974 + dtype = dtypes.int32 + nptype = np.int32 + t = tensor_util.make_tensor_proto([10, tensor_shape.Dimension(20), 30], + dtype=dtype) + self.assertEquals(dtype, t.dtype) + a = tensor_util.MakeNdarray(t) + self.assertEquals(nptype, a.dtype) + self.assertAllClose(np.array([10, 20, 30], dtype=nptype), a) + def testLong(self): t = tensor_util.make_tensor_proto(10, dtype=dtypes.int64) self.assertProtoEquals(""" diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index c65816a5436..04c7554a580 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -65,7 +65,7 @@ def gpu_device_name(): """Returns the name of a GPU device if available or the empty string.""" for x in device_lib.list_local_devices(): if x.device_type == "GPU" or x.device_type == "SYCL": - return x.name + return compat.as_str(x.name) return "" @@ -392,6 +392,7 @@ class TensorFlowTestCase(googletest.TestCase): self._cached_session = None def setUp(self): + logging.info("SET UP: %s" % str(self)) self._ClearCachedSession() random.seed(random_seed.DEFAULT_GRAPH_SEED) np.random.seed(random_seed.DEFAULT_GRAPH_SEED) @@ -406,6 +407,7 @@ class TensorFlowTestCase(googletest.TestCase): ops.get_default_graph().seed = random_seed.DEFAULT_GRAPH_SEED def tearDown(self): + logging.info("TEAR DOWN: %s" % str(self)) for thread in self._threads: self.assertFalse(thread.is_alive(), "A checkedThread did not terminate") diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index 4fa1e1fee80..d9c5f3bce99 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -518,7 +518,7 @@ tf_py_test( tf_py_test( name = "matrix_solve_ls_op_test", - size = "small", + size = "medium", srcs = ["matrix_solve_ls_op_test.py"], additional_deps = [ "//third_party/py/numpy", @@ -528,6 +528,7 @@ tf_py_test( "//tensorflow/python:linalg_ops", "//tensorflow/python:math_ops", ], + tags = ["nomsan"], # fails in msan from numpy calls ) tf_py_test( @@ -683,13 +684,15 @@ cuda_py_test( tf_py_test( name = "segment_reduction_ops_test", - size = "small", + size = "medium", srcs = ["segment_reduction_ops_test.py"], additional_deps = [ "//third_party/py/numpy", + "//tensorflow/python:client", "//tensorflow/python:client_testlib", "//tensorflow/python:framework_for_generated_wrappers", "//tensorflow/python:math_ops", + "//tensorflow/python:variables", "//tensorflow/python:nn_grad", ], ) @@ -1708,6 +1711,26 @@ cuda_py_test( tags = ["no_windows_gpu"], ) +cuda_py_test( + name = "reduction_ops_test_big", + size = "medium", + srcs = ["reduction_ops_test_big.py"], + additional_deps = [ + "//third_party/py/numpy", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:math_ops", + ], + tags = [ + "manual", + "no_gpu", + "nogpu", + "noguitar", + "notap", + ], +) + cuda_py_test( 
name = "relu_op_test", size = "small", @@ -2154,7 +2177,7 @@ cuda_py_test( "//tensorflow/python:nn_grad", "//tensorflow/python:nn_ops", ], - tags = ["noasan"], # times out b/63680444 + shard_count = 2, ) cuda_py_test( diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py index 392639fa179..77c5bb6d400 100644 --- a/tensorflow/python/kernel_tests/array_ops_test.py +++ b/tensorflow/python/kernel_tests/array_ops_test.py @@ -981,15 +981,15 @@ class SequenceMaskTest(test_util.TensorFlowTestCase): class ConcatSliceResourceTest(test_util.TensorFlowTestCase): + @test_util.run_in_graph_and_eager_modes() def testConcatSlice(self): - with self.test_session(): - r1 = test_ops.stub_resource_handle_op(container="a", shared_name="b") - r2 = test_ops.stub_resource_handle_op(container="a", shared_name="c") - c = array_ops.stack([r1, r2]) - s = array_ops.strided_slice(c, [1], [2]) - test_ops.resource_create_op(s).run() - with self.assertRaises(errors.AlreadyExistsError): - test_ops.resource_create_op(r2).run() + r1 = test_ops.stub_resource_handle_op(container="a", shared_name="b") + r2 = test_ops.stub_resource_handle_op(container="a", shared_name="c") + c = array_ops.stack([r1, r2]) + s = array_ops.strided_slice(c, [1], [2]) + self.evaluate(test_ops.resource_create_op(s)) + with self.assertRaises(errors.AlreadyExistsError): + self.evaluate(test_ops.resource_create_op(r2)) class IdentityTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/python/kernel_tests/cholesky_op_test.py b/tensorflow/python/kernel_tests/cholesky_op_test.py index eb06e067a7f..de80fb30554 100644 --- a/tensorflow/python/kernel_tests/cholesky_op_test.py +++ b/tensorflow/python/kernel_tests/cholesky_op_test.py @@ -183,14 +183,11 @@ class CholeskyGradTest(test.TestCase): self.runFiniteDifferences( shapes, dtypes=(dtypes_lib.float32, dtypes_lib.float64)) - # TODO(eriche): investigate why this test fails only in opensource - # ubuntu gpu python3 - - # def testSmallMatricesComplex(self): - # np.random.seed(0) - # shapes = self.getShapes([1, 2, 10]) - # self.runFiniteDifferences( - # shapes, dtypes=(dtypes_lib.complex64, dtypes_lib.complex128)) + def testSmallMatricesComplex(self): + np.random.seed(0) + shapes = self.getShapes([1, 2, 10]) + self.runFiniteDifferences( + shapes, dtypes=(dtypes_lib.complex64, dtypes_lib.complex128)) def testOneBlockMatrices(self): np.random.seed(0) diff --git a/tensorflow/python/kernel_tests/constant_op_eager_test.py b/tensorflow/python/kernel_tests/constant_op_eager_test.py index 0e98afbe6e4..0b4fa60d81b 100644 --- a/tensorflow/python/kernel_tests/constant_op_eager_test.py +++ b/tensorflow/python/kernel_tests/constant_op_eager_test.py @@ -26,27 +26,33 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes as dtypes_lib from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops +from tensorflow.python.util import compat -# TODO(josh11b): add tests with string types, lists/tuples, Shape. +# TODO(josh11b): add tests with lists/tuples, Shape. 
class ConstantTest(test.TestCase): def _testCpu(self, x): np_ans = np.array(x) - tf_ans = ops.convert_to_tensor(x).numpy() + with context.device("/device:CPU:0"): + tf_ans = ops.convert_to_tensor(x).numpy() if np_ans.dtype in [np.float32, np.float64, np.complex64, np.complex128]: self.assertAllClose(np_ans, tf_ans) else: self.assertAllEqual(np_ans, tf_ans) def _testGpu(self, x): - np_ans = np.array(x) - tf_ans = ops.convert_to_tensor(x).numpy() - if np_ans.dtype in [np.float32, np.float64, np.complex64, np.complex128]: - self.assertAllClose(np_ans, tf_ans) - else: - self.assertAllEqual(np_ans, tf_ans) + device = test_util.gpu_device_name() + if device: + np_ans = np.array(x) + with context.device(device): + tf_ans = ops.convert_to_tensor(x).numpy() + if np_ans.dtype in [np.float32, np.float64, np.complex64, np.complex128]: + self.assertAllClose(np_ans, tf_ans) + else: + self.assertAllEqual(np_ans, tf_ans) def _testAll(self, x): self._testCpu(x) @@ -78,11 +84,11 @@ class ConstantTest(test.TestCase): def testComplex64(self): self._testAll( - np.complex(1, 2) * np.arange(-15, 15).reshape([2, 3, 5 - ]).astype(np.complex64)) + np.complex(1, 2) * + np.arange(-15, 15).reshape([2, 3, 5]).astype(np.complex64)) self._testAll( - np.complex(1, 2) * np.random.normal(size=30).reshape( - [2, 3, 5]).astype(np.complex64)) + np.complex(1, 2) * + np.random.normal(size=30).reshape([2, 3, 5]).astype(np.complex64)) self._testAll(np.empty((2, 0, 5)).astype(np.complex64)) def testComplex128(self): @@ -94,6 +100,26 @@ class ConstantTest(test.TestCase): [2, 3, 5]).astype(np.complex128)) self._testAll(np.empty((2, 0, 5)).astype(np.complex128)) + def testString(self): + val = [compat.as_bytes(str(x)) for x in np.arange(-15, 15)] + self._testCpu(np.array(val).reshape([2, 3, 5])) + self._testCpu(np.empty((2, 0, 5)).astype(np.str_)) + + def testStringWithNulls(self): + val = ops.convert_to_tensor(b"\0\0\0\0").numpy() + self.assertEqual(len(val), 4) + self.assertEqual(val, b"\0\0\0\0") + + val = ops.convert_to_tensor(b"xx\0xx").numpy() + self.assertEqual(len(val), 5) + self.assertAllEqual(val, b"xx\0xx") + + nested = [[b"\0\0\0\0", b"xx\0xx"], [b"\0_\0_\0_\0", b"\0"]] + val = ops.convert_to_tensor(nested).numpy() + # NOTE(mrry): Do not use assertAllEqual, because it converts nested to a + # numpy array, which loses the null terminators. 
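(Aside, not part of the test: a numpy-only demonstration of the NOTE above — fixed-width bytes dtypes strip trailing NULs, which is why the comparison is done on the raw Python list.)

import numpy as np

nested = [[b"\0\0\0\0", b"xx\0xx"], [b"\0_\0_\0_\0", b"\0"]]
arr = np.array(nested)         # stored as fixed-width dtype |S7
print(arr[0, 0])               # b'' -- the four NUL bytes are gone
print(arr.tolist() == nested)  # False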
+ self.assertEqual(val.tolist(), nested) + def testExplicitShapeNumPy(self): c = constant_op.constant( np.arange(-15, 15).reshape([2, 3, 5]).astype(np.float32), diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py index df413939c76..6167cb9999b 100644 --- a/tensorflow/python/kernel_tests/constant_op_test.py +++ b/tensorflow/python/kernel_tests/constant_op_test.py @@ -32,6 +32,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradient_checker +from tensorflow.python.ops import logging_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import test from tensorflow.python.util import compat @@ -119,11 +120,11 @@ class ConstantTest(test.TestCase): variant_val=[ tensor_pb2.VariantTensorDataProto( # Match registration in variant_op_registry.cc - type_name=b"int32", + type_name=b"int", metadata=np.array(1, dtype=np.int32).tobytes()) ]) - const_op = constant_op.constant(variant_tensor).op - const_value = const_op.get_attr("value") + const = constant_op.constant(variant_tensor) + const_value = const.op.get_attr("value") # Ensure we stored the tensor proto properly. self.assertProtoEquals(variant_tensor, const_value) @@ -134,7 +135,10 @@ class ConstantTest(test.TestCase): # native numpy types cannot be passed to ops.convert_to_tensor. # TODO(ebrevdo): Add registration mechanism for # ops.convert_to_tensor and for session.run output. - const_op.run() + logging_const_op = logging_ops.Print( + const, [const], + message="Variant storing an int, decoded const value:").op + logging_const_op.run() def testStringWithNulls(self): with self.test_session(): @@ -469,6 +473,35 @@ class ZerosLikeTest(test.TestCase): self.assertEqual(y.shape, shape) self.assertAllEqual(y, np.zeros(shape, dtype=out_type)) + def testZerosLikeVariant(self): + # TODO(ebrevdo): Re-enable use_gpu=True once non-DMA Variant + # copying between CPU and GPU is supported AND we register a + # ZerosLike callback for GPU for Variant storing primitive types + # in variant_op_registry.cc. + with self.test_session(use_gpu=False): + variant_tensor = tensor_pb2.TensorProto( + dtype=dtypes_lib.variant.as_datatype_enum, + tensor_shape=tensor_shape.TensorShape([]).as_proto(), + variant_val=[ + tensor_pb2.VariantTensorDataProto( + # Match registration in variant_op_registry.cc + type_name=b"int", + metadata=np.array(1, dtype=np.int32).tobytes()) + ]) + const_variant = constant_op.constant(variant_tensor) + zeros_like = array_ops.zeros_like(const_variant) + zeros_like_op = logging_ops.Print( + zeros_like, [const_variant, zeros_like], + message="Variant storing an int, input and output of zeros_like:").op + + # Smoke test -- ensure this executes without trouble. + # Right now, non-numpy-compatible objects cannot be returned from a + # session.run call; similarly, objects that can't be converted to + # native numpy types cannot be passed to ops.convert_to_tensor. + # TODO(ebrevdo): Add registration mechanism for + # ops.convert_to_tensor and for session.run output. 
+ zeros_like_op.run() + class OnesTest(test.TestCase): diff --git a/tensorflow/python/kernel_tests/fifo_queue_test.py b/tensorflow/python/kernel_tests/fifo_queue_test.py index 85e7b635d80..748135440ec 100644 --- a/tensorflow/python/kernel_tests/fifo_queue_test.py +++ b/tensorflow/python/kernel_tests/fifo_queue_test.py @@ -1078,6 +1078,9 @@ class FIFOQueueTest(test.TestCase): self.assertEqual([50.0], dequeued_t.eval()) self.assertEqual([60.0], dequeued_t.eval()) + # Make sure the thread finishes before exiting. + thread.join() + def testBlockingEnqueueBeforeClose(self): with self.test_session() as sess: q = data_flow_ops.FIFOQueue(4, dtypes_lib.float32) diff --git a/tensorflow/python/kernel_tests/padding_fifo_queue_test.py b/tensorflow/python/kernel_tests/padding_fifo_queue_test.py index 53b1897f488..d8c3f9823c3 100644 --- a/tensorflow/python/kernel_tests/padding_fifo_queue_test.py +++ b/tensorflow/python/kernel_tests/padding_fifo_queue_test.py @@ -1191,6 +1191,9 @@ class PaddingFIFOQueueTest(test.TestCase): self.assertEqual([50.0], dequeued_t.eval()) self.assertEqual([60.0], dequeued_t.eval()) + # Make sure the thread finishes before exiting. + thread.join() + def testBlockingEnqueueBeforeClose(self): with self.test_session() as sess: q = data_flow_ops.PaddingFIFOQueue(4, dtypes_lib.float32, ((),)) diff --git a/tensorflow/python/kernel_tests/pooling_ops_3d_test.py b/tensorflow/python/kernel_tests/pooling_ops_3d_test.py index fa1553a3f6b..b01fc129538 100644 --- a/tensorflow/python/kernel_tests/pooling_ops_3d_test.py +++ b/tensorflow/python/kernel_tests/pooling_ops_3d_test.py @@ -321,6 +321,15 @@ class PoolingTest(test.TestCase): strides=(1, 1, 1), padding="VALID") + def testMaxPoolGradValidPadding1_2_3d(self): + self._ConstructAndTestGradient( + nn_ops.max_pool3d, + input_sizes=[1, 3, 3, 3, 1], + output_sizes=[1, 2, 2, 2, 1], + window=(1, 1, 1), + strides=(2, 2, 2), + padding="VALID") + def testMaxPoolGradValidPadding2_2_3d(self): self._ConstructAndTestGradient( nn_ops.max_pool3d, @@ -339,6 +348,15 @@ class PoolingTest(test.TestCase): strides=(1, 1, 1), padding="SAME") + def testMaxPoolGradSamePadding1_2_3d(self): + self._ConstructAndTestGradient( + nn_ops.max_pool3d, + input_sizes=[1, 3, 2, 4, 1], + output_sizes=[1, 2, 1, 2, 1], + window=(1, 1, 1), + strides=(2, 2, 2), + padding="SAME") + def testMaxPoolGradSamePadding2_1_3d(self): self._ConstructAndTestGradient( nn_ops.max_pool3d, @@ -375,6 +393,15 @@ class PoolingTest(test.TestCase): strides=(1, 1, 1), padding="VALID") + def testAvgPoolGradValidPadding1_2_3d(self): + self._ConstructAndTestGradient( + nn_ops.avg_pool3d, + input_sizes=[1, 3, 3, 3, 1], + output_sizes=[1, 2, 2, 2, 1], + window=(1, 1, 1), + strides=(2, 2, 2), + padding="VALID") + def testAvgPoolGradValidPadding2_1_3d(self): self._ConstructAndTestGradient( nn_ops.avg_pool3d, @@ -402,6 +429,15 @@ class PoolingTest(test.TestCase): strides=(1, 1, 1), padding="SAME") + def testAvgPoolGradSamePadding1_2_3d(self): + self._ConstructAndTestGradient( + nn_ops.avg_pool3d, + input_sizes=[1, 3, 2, 4, 2], + output_sizes=[1, 2, 1, 2, 2], + window=(1, 1, 1), + strides=(2, 2, 2), + padding="SAME") + def testAvgPoolGradSamePadding2_1_3d(self): self._ConstructAndTestGradient( nn_ops.avg_pool3d, diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py index da14871c872..9eb1fea8037 100644 --- a/tensorflow/python/kernel_tests/pooling_ops_test.py +++ b/tensorflow/python/kernel_tests/pooling_ops_test.py @@ -998,6 +998,20 @@ class 
PoolingTest(test.TestCase): data_format=data_format, use_gpu=use_gpu) + def _testMaxPoolGradValidPadding1_2(self, data_format, use_gpu): + for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]: + self._ConstructAndTestGradient( + pool_func, + input_sizes=[1, 3, 3, 1], + output_sizes=[1, 2, 2, 1], + window_rows=1, + window_cols=1, + row_stride=2, + col_stride=2, + padding="VALID", + data_format=data_format, + use_gpu=use_gpu) + def _testMaxPoolGradValidPadding2_2(self, data_format, use_gpu): for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]: self._ConstructAndTestGradient( @@ -1026,6 +1040,20 @@ class PoolingTest(test.TestCase): data_format=data_format, use_gpu=use_gpu) + def _testMaxPoolGradSamePadding1_2(self, data_format, use_gpu): + for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]: + self._ConstructAndTestGradient( + pool_func, + input_sizes=[2, 2, 4, 3], + output_sizes=[2, 1, 2, 3], + window_rows=1, + window_cols=1, + row_stride=2, + col_stride=2, + padding="SAME", + data_format=data_format, + use_gpu=use_gpu) + def _testMaxPoolGradSamePadding2_1(self, data_format, use_gpu): for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]: self._ConstructAndTestGradient( @@ -1071,10 +1099,12 @@ class PoolingTest(test.TestCase): def testMaxPoolGrad(self): for (data_format, use_gpu) in GetTestConfigs(): self._testMaxPoolGradValidPadding1_1(data_format, use_gpu) + self._testMaxPoolGradValidPadding1_2(data_format, use_gpu) self._testMaxPoolGradValidPadding2_1_6(data_format, use_gpu) self._testMaxPoolGradValidPadding2_1_7(data_format, use_gpu) self._testMaxPoolGradValidPadding2_2(data_format, use_gpu) self._testMaxPoolGradSamePadding1_1(data_format, use_gpu) + self._testMaxPoolGradSamePadding1_2(data_format, use_gpu) self._testMaxPoolGradSamePadding2_1(data_format, use_gpu) self._testMaxPoolGradSamePadding2_2(data_format, use_gpu) self._testMaxPoolGradSamePadding3_1(data_format, use_gpu) @@ -1497,9 +1527,11 @@ class PoolingTest(test.TestCase): def testAvgPoolGrad(self): for (data_format, use_gpu) in GetTestConfigs(): self._testAvgPoolGradValidPadding1_1(data_format, use_gpu) + self._testAvgPoolGradValidPadding1_2(data_format, use_gpu) self._testAvgPoolGradValidPadding2_1(data_format, use_gpu) self._testAvgPoolGradValidPadding2_2(data_format, use_gpu) self._testAvgPoolGradSamePadding1_1(data_format, use_gpu) + self._testAvgPoolGradSamePadding1_2(data_format, use_gpu) self._testAvgPoolGradSamePadding2_1(data_format, use_gpu) self._testAvgPoolGradSamePadding2_2(data_format, use_gpu) self._testAvgPoolGradSamePadding3_1(data_format, use_gpu) @@ -1517,6 +1549,19 @@ class PoolingTest(test.TestCase): data_format=data_format, use_gpu=use_gpu) + def _testAvgPoolGradValidPadding1_2(self, data_format, use_gpu): + self._ConstructAndTestGradient( + nn_ops.avg_pool, + input_sizes=[2, 3, 3, 3], + output_sizes=[2, 2, 2, 3], + window_rows=1, + window_cols=1, + row_stride=2, + col_stride=2, + padding="VALID", + data_format=data_format, + use_gpu=use_gpu) + def _testAvgPoolGradValidPadding2_1(self, data_format, use_gpu): self._ConstructAndTestGradient( nn_ops.avg_pool, @@ -1556,6 +1601,19 @@ class PoolingTest(test.TestCase): data_format=data_format, use_gpu=use_gpu) + def _testAvgPoolGradSamePadding1_2(self, data_format, use_gpu): + self._ConstructAndTestGradient( + nn_ops.avg_pool, + input_sizes=[2, 2, 4, 3], + output_sizes=[2, 1, 2, 3], + window_rows=1, + window_cols=1, + row_stride=2, + col_stride=2, + padding="SAME", + data_format=data_format, + use_gpu=use_gpu) + def 
_testAvgPoolGradSamePadding2_1(self, data_format, use_gpu): self._ConstructAndTestGradient( nn_ops.avg_pool, diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py index 04ce99a4a63..8d6b7925e45 100644 --- a/tensorflow/python/kernel_tests/reduction_ops_test.py +++ b/tensorflow/python/kernel_tests/reduction_ops_test.py @@ -175,6 +175,24 @@ class SumReductionTest(BaseReductionTest): np_arr = self._makeIncremental((2,) * rank, dtypes.int32) self._compareAllAxes(np_arr) + def testFloat16(self): + for rank in range(1, _MAX_RANK + 1): + np_arr = self._makeIncremental((2,) * rank, dtypes.float16) + self._compareAllAxes(np_arr) + + # test that mean doesn't overflow + # only on GPU, since it has the more accurate implementation + if not test.is_gpu_available(): + return + + arr = np.ones([68000], dtype=np.float16) + + with self.test_session(graph=ops.Graph(), use_gpu=True) as sess: + tf_arr = array_ops.constant(arr) + tf_mean = math_ops.reduce_mean(tf_arr, 0, False) + tf_out_mean = sess.run(tf_mean) + self.assertAllClose(tf_out_mean, 1.) + def testFloat32(self): for rank in range(1, _MAX_RANK + 1): np_arr = self._makeIncremental((2,) * rank, dtypes.float32) @@ -523,7 +541,7 @@ class MinReductionTest(test.TestCase): def testFloatReduce3D(self): # Create a 3D array of floats and reduce across all possible # dimensions - np_arr = np.arange(0, 30).reshape([2, 3, 5]).astype(np.float32) + np_arr = np.arange(1, 31).reshape([2, 3, 5]).astype(np.float32) self._compareAll(np_arr, None) self._compareAll(np_arr, []) self._compareAll(np_arr, [0]) @@ -537,7 +555,7 @@ class MinReductionTest(test.TestCase): def testDoubleReduce3D(self): # Create a 3D array of doubles and reduce across all possible # dimensions - np_arr = np.arange(0, 30).reshape([2, 3, 5]).astype(np.float64) + np_arr = np.arange(1, 31).reshape([2, 3, 5]).astype(np.float64) self._compareAll(np_arr, None) self._compareAll(np_arr, []) self._compareAll(np_arr, [0]) @@ -629,7 +647,7 @@ class MaxReductionTest(test.TestCase): def testFloatReduce3D(self): # Create a 3D array of floats and reduce across all possible # dimensions - np_arr = np.arange(0, 30).reshape([2, 3, 5]).astype(np.float32) + np_arr = np.arange(-31, -1).reshape([2, 3, 5]).astype(np.float32) self._compareAll(np_arr, None) self._compareAll(np_arr, []) self._compareAll(np_arr, [0]) @@ -643,7 +661,7 @@ class MaxReductionTest(test.TestCase): def testDoubleReduce3D(self): # Create a 3D array of doubles and reduce across all possible # dimensions - np_arr = np.arange(0, 30).reshape([2, 3, 5]).astype(np.float64) + np_arr = np.arange(-31, -1).reshape([2, 3, 5]).astype(np.float64) self._compareAll(np_arr, None) self._compareAll(np_arr, []) self._compareAll(np_arr, [0]) @@ -656,7 +674,7 @@ class MaxReductionTest(test.TestCase): def testGradient(self): s = [2, 3, 4, 2] - x = np.arange(1.0, 49.0).reshape(s).astype(np.float64) + x = np.arange(-49.0, -1.0).reshape(s).astype(np.float64) with self.test_session(): t = ops.convert_to_tensor(x) su = math_ops.reduce_max(t, [1, 2]) @@ -666,7 +684,7 @@ class MaxReductionTest(test.TestCase): def testGradient2(self): s = [2, 3, 4, 2] - x = np.arange(1.0, 49.0).reshape(s).astype(np.float64) + x = np.arange(-49.0, -1.0).reshape(s).astype(np.float64) with self.test_session(): t = ops.convert_to_tensor(x) su = math_ops.reduce_max(t, [1]) @@ -676,7 +694,7 @@ class MaxReductionTest(test.TestCase): def testGradient3(self): s = [2, 3, 4, 2] - x = np.arange(1.0, 49.0).reshape(s).astype(np.float64) + x = 
np.arange(-49.0, -1.0).reshape(s).astype(np.float64) with self.test_session(): t = ops.convert_to_tensor(x) su = math_ops.reduce_max(t, [2]) @@ -686,7 +704,7 @@ class MaxReductionTest(test.TestCase): def testGradient4(self): s = [2, 3, 4, 2] - x = np.arange(1.0, 49.0).reshape(s).astype(np.float64) + x = np.arange(-49.0, -1.0).reshape(s).astype(np.float64) with self.test_session(): t = ops.convert_to_tensor(x) su = math_ops.reduce_max(t) diff --git a/tensorflow/python/kernel_tests/reduction_ops_test_big.py b/tensorflow/python/kernel_tests/reduction_ops_test_big.py new file mode 100644 index 00000000000..0959adb026e --- /dev/null +++ b/tensorflow/python/kernel_tests/reduction_ops_test_big.py @@ -0,0 +1,179 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Functional tests for reduction ops.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python.framework import ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import test + + +class BaseReductionTest(test.TestCase): + + def _tf_reduce(self, x, reduction_axes, keep_dims): + raise NotImplementedError() + + +class BigReductionTest(BaseReductionTest): + """Test reductions for sum and boolean all over a wide range of shapes.""" + + def _tf_reduce_max(self, x, reduction_axes, keep_dims): + return math_ops.reduce_max(x, reduction_axes, keep_dims) + + def _tf_reduce_all(self, x, reduction_axes, keep_dims): + return math_ops.reduce_all(x, reduction_axes, keep_dims) + + def _tf_reduce_mean(self, x, reduction_axes, keep_dims): + return math_ops.reduce_mean(x, reduction_axes, keep_dims) + + def _tf_reduce_sum(self, x, reduction_axes, keep_dims): + return math_ops.reduce_sum(x, reduction_axes, keep_dims) + + def testFloat32Sum(self): + # make sure we test all possible kernel invocations + # logic is the same for all ops, test just float32 for brevity + arr_ = np.ones([4097, 4097], dtype=np.float32) + for size_x in [ + 1, 2, 3, 4, 16, 17, 32, 33, 64, 65, 128, 131, 256, 263, 1024, 1025, + 4096, 4097 + ]: + for size_y in [ + 1, 2, 3, 4, 16, 17, 32, 33, 64, 65, 128, 131, 256, 263, 1024, 1025, + 4096, 4097 + ]: + arr = arr_[0:size_x, 0:size_y] + col_sum = np.ones([size_y], dtype=np.float32) * size_x + row_sum = np.ones([size_x], dtype=np.float32) * size_y + full_sum = np.ones([], dtype=np.float32) * size_x * size_y + + with self.test_session(graph=ops.Graph(), use_gpu=True) as sess: + tf_row_sum = self._tf_reduce_sum(arr, 1, False) + tf_col_sum = self._tf_reduce_sum(arr, 0, False) + tf_full_sum = self._tf_reduce_sum(arr, [0, 1], False) + tf_out_row, tf_out_col, tf_out_full = sess.run( + [tf_row_sum, tf_col_sum, tf_full_sum]) + self.assertAllClose(col_sum, tf_out_col) + self.assertAllClose(row_sum, tf_out_row) + self.assertAllClose(full_sum, tf_out_full) + + arr_ = np.ones([130, 
130, 130], dtype=np.float32) + for size_x in range(1, 130, 13): + for size_y in range(1, 130, 13): + for size_z in range(1, 130, 13): + arr = arr_[0:size_x, 0:size_y, 0:size_z] + sum_y = np.ones([size_x, size_z], dtype=np.float32) + sum_xz = np.ones([size_y], dtype=np.float32) + + with self.test_session(graph=ops.Graph(), use_gpu=True) as sess: + tf_sum_xz = self._tf_reduce_mean(arr, [0, 2], False) + tf_sum_y = self._tf_reduce_mean(arr, 1, False) + tf_out_sum_xz, tf_out_sum_y = sess.run([tf_sum_xz, tf_sum_y]) + self.assertAllClose(sum_y, tf_out_sum_y) + self.assertAllClose(sum_xz, tf_out_sum_xz) + + def testFloat32Max(self): + # make sure we test all possible kernel invocations + # logic is the same for all ops, test just float32 for brevity + arr_ = np.random.uniform( + low=-3, high=-1, size=[4105, 4105]).astype(np.float32) + for size_x in [ + 1, 2, 3, 4, 16, 17, 32, 33, 64, 65, 128, 131, 256, 263, 1024, 1025, + 4096, 4097 + ]: + for size_y in [ + 1, 2, 3, 4, 16, 17, 32, 33, 64, 65, 128, 131, 256, 263, 1024, 1025, + 4096, 4097 + ]: + arr = arr_[0:size_x, 0:size_y] + col_max = np.max(arr, axis=0) + row_max = np.max(arr, axis=1) + full_max = np.max(col_max) + + with self.test_session(graph=ops.Graph(), use_gpu=True) as sess: + tf_row_max = self._tf_reduce_max(arr, 1, False) + tf_col_max = self._tf_reduce_max(arr, 0, False) + tf_full_max = self._tf_reduce_max(arr, [0, 1], False) + tf_out_row, tf_out_col, tf_out_full = sess.run( + [tf_row_max, tf_col_max, tf_full_max]) + self.assertAllClose(col_max, tf_out_col) + self.assertAllClose(row_max, tf_out_row) + self.assertAllClose(full_max, tf_out_full) + + arr_ = np.random.uniform( + low=-3, high=-1, size=[130, 130, 130]).astype(np.float32) + for size_x in range(1, 130, 13): + for size_y in range(1, 130, 13): + for size_z in range(1, 130, 13): + arr = arr_[0:size_x, 0:size_y, 0:size_z] + sum_y = np.max(arr, axis=1) + sum_xz = np.max(arr, axis=(0, 2)) + + with self.test_session(graph=ops.Graph(), use_gpu=True) as sess: + tf_sum_xz = self._tf_reduce_max(arr, [0, 2], False) + tf_sum_y = self._tf_reduce_max(arr, 1, False) + tf_out_sum_xz, tf_out_sum_y = sess.run([tf_sum_xz, tf_sum_y]) + self.assertAllClose(sum_y, tf_out_sum_y) + self.assertAllClose(sum_xz, tf_out_sum_xz) + + def testBooleanAll(self): + # make sure we test all possible kernel invocations + # test operation where T(0) is not the identity + arr_ = np.ones([4097, 4097], dtype=np.bool) + for size_x in [ + 1, 2, 3, 4, 16, 17, 32, 33, 64, 65, 128, 131, 256, 263, 1024, 1025, + 4096, 4097 + ]: + for size_y in [ + 1, 2, 3, 4, 16, 17, 32, 33, 64, 65, 128, 131, 256, 263, 1024, 1025, + 4096, 4097 + ]: + arr = arr_[0:size_x, 0:size_y] + col_sum = np.ones([size_y], dtype=np.bool) + row_sum = np.ones([size_x], dtype=np.bool) + full_sum = np.ones([1], dtype=np.bool).reshape([]) + + with self.test_session(graph=ops.Graph(), use_gpu=True) as sess: + tf_row_sum = self._tf_reduce_all(arr, 1, False) + tf_col_sum = self._tf_reduce_all(arr, 0, False) + tf_full_sum = self._tf_reduce_all(arr, [0, 1], False) + tf_out_row, tf_out_col, tf_out_full = sess.run( + [tf_row_sum, tf_col_sum, tf_full_sum]) + self.assertAllClose(col_sum, tf_out_col) + self.assertAllClose(row_sum, tf_out_row) + self.assertAllClose(full_sum, tf_out_full) + + arr_ = np.ones([130, 130, 130], dtype=np.bool) + for size_x in range(1, 130, 13): + for size_y in range(1, 130, 13): + for size_z in range(1, 130, 13): + arr = arr_[0:size_x, 0:size_y, 0:size_z] + sum_y = np.ones([size_x, size_z], dtype=np.bool) + sum_xz = np.ones([size_y], 
dtype=np.bool) + + with self.test_session(graph=ops.Graph(), use_gpu=True) as sess: + tf_sum_xz = self._tf_reduce_all(arr, [0, 2], False) + tf_sum_y = self._tf_reduce_all(arr, 1, False) + tf_out_sum_xz, tf_out_sum_y = sess.run([tf_sum_xz, tf_sum_y]) + self.assertAllClose(sum_y, tf_out_sum_y) + self.assertAllClose(sum_xz, tf_out_sum_xz) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py index 33269c91234..5e426fc61a7 100644 --- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py +++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py @@ -18,12 +18,17 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import itertools + import numpy as np +from tensorflow.python.client import session +from tensorflow.python.framework import ops from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes as dtypes_lib from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import math_ops +from tensorflow.python.ops import variables import tensorflow.python.ops.nn_grad # pylint: disable=unused-import from tensorflow.python.platform import test @@ -107,19 +112,19 @@ class SegmentReductionOpTest(SegmentReductionHelper): curr_ops_list = complex_ops_list else: curr_ops_list = ops_list - - with self.test_session(use_gpu=False): - tf_x, np_x = self._input(shape, dtype=dtype) - for np_op1, np_op2, tf_op in curr_ops_list: - np_ans = self._segmentReduce(indices, np_x, np_op1, np_op2) - s = tf_op(data=tf_x, segment_ids=indices) - tf_ans = s.eval() - self.assertAllClose(np_ans, tf_ans) - # NOTE(mrry): The static shape inference that computes - # `tf_ans.shape` can only infer that sizes from dimension 1 - # onwards, because the size of dimension 0 is data-dependent - # and may therefore vary dynamically. - self.assertAllEqual(np_ans.shape[1:], tf_ans.shape[1:]) + for use_gpu in [True, False]: + with self.test_session(use_gpu=use_gpu): + tf_x, np_x = self._input(shape, dtype=dtype) + for np_op1, np_op2, tf_op in curr_ops_list: + np_ans = self._segmentReduce(indices, np_x, np_op1, np_op2) + s = tf_op(data=tf_x, segment_ids=indices) + tf_ans = s.eval() + self.assertAllClose(np_ans, tf_ans) + # NOTE(mrry): The static shape inference that computes + # `tf_ans.shape` can only infer that sizes from dimension 1 + # onwards, because the size of dimension 0 is data-dependent + # and may therefore vary dynamically. + self.assertAllEqual(np_ans.shape[1:], tf_ans.shape[1:]) def testSegmentIdsShape(self): shape = [4, 4] @@ -130,41 +135,45 @@ class SegmentReductionOpTest(SegmentReductionHelper): def testSegmentIdsSize(self): shape = [4, 4] - with self.test_session(): - tf_x, _ = self._input(shape) - indices = [0, 1] - s = math_ops.segment_sum(data=tf_x, segment_ids=indices) - with self.assertRaisesOpError("segment_ids should be the same size"): - s.eval() + for use_gpu in [True, False]: + with self.test_session(use_gpu=use_gpu): + tf_x, _ = self._input(shape) + indices = [0, 1] + s = math_ops.segment_sum(data=tf_x, segment_ids=indices) + with self.assertRaisesOpError("segment_ids should be the same size"): + s.eval() def testSegmentIdsValid(self): # This is a baseline for the following SegmentIdsInvalid* tests. 
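(Aside: a plain-numpy reference for what segment_sum computes in the tests above and below; the helper name is invented for illustration and segment ids are assumed sorted.)

import numpy as np


def np_segment_sum(data, segment_ids):
  # Rows that share a segment id are summed into one output row.
  num_segments = segment_ids[-1] + 1
  out = np.zeros((num_segments,) + data.shape[1:], dtype=data.dtype)
  for row, sid in zip(data, segment_ids):
    out[sid] += row
  return out


x = np.arange(1, 17, dtype=np.float32).reshape(4, 4)
print(np_segment_sum(x, [0, 0, 0, 1]))
# [[15. 18. 21. 24.]
#  [13. 14. 15. 16.]]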
shape = [4, 4] - with self.test_session(): - tf_x, _ = self._input(shape) - indices = [0, 0, 0, 1] - result = math_ops.segment_sum(data=tf_x, segment_ids=indices).eval() - self.assertAllEqual([[15, 18, 21, 24], [13, 14, 15, 16]], result) + for use_gpu in [True, False]: + with self.test_session(use_gpu=use_gpu): + tf_x, _ = self._input(shape, dtype=dtypes_lib.float32) + indices = [0, 0, 0, 1] + result = math_ops.segment_sum(data=tf_x, segment_ids=indices).eval() + self.assertAllEqual([[15, 18, 21, 24], [13, 14, 15, 16]], result) def testSegmentIdsGreaterThanZero(self): shape = [4, 4] - with self.test_session(): - tf_x, np_x = self._input(shape) - indices = [1, 1, 2, 2] - np_ans = self._segmentReduce(indices, np_x, np.add) - s = math_ops.segment_sum(data=tf_x, segment_ids=indices) - tf_ans = s.eval() - self.assertAllClose(np_ans, tf_ans) + for use_gpu in [True, False]: + with self.test_session(use_gpu=use_gpu): + tf_x, np_x = self._input(shape, dtype=dtypes_lib.float32) + indices = [1, 1, 2, 2] + np_ans = self._segmentReduce(indices, np_x, np.add) + s = math_ops.segment_sum(data=tf_x, segment_ids=indices) + tf_ans = s.eval() + self.assertAllClose(np_ans, tf_ans) def testSegmentIdsHole(self): shape = [4, 4] - with self.test_session(): - tf_x, np_x = self._input(shape) - indices = [0, 0, 3, 3] - np_ans = self._segmentReduce(indices, np_x, np.add) - s = math_ops.segment_sum(data=tf_x, segment_ids=indices) - tf_ans = s.eval() - self.assertAllClose(np_ans, tf_ans) + for use_gpu in [True, False]: + with self.test_session(use_gpu=use_gpu): + tf_x, np_x = self._input(shape, dtype=dtypes_lib.float32) + indices = [0, 0, 3, 3] + np_ans = self._segmentReduce(indices, np_x, np.add) + s = math_ops.segment_sum(data=tf_x, segment_ids=indices) + tf_ans = s.eval() + self.assertAllClose(np_ans, tf_ans) def testSegmentIdsInvalid1(self): shape = [4, 4] @@ -199,21 +208,23 @@ class SegmentReductionOpTest(SegmentReductionHelper): def testSegmentIdsInvalid4(self): shape = [4, 4] - with self.test_session(): - tf_x, _ = self._input(shape) - indices = [0, 0, 0, -1] - s = math_ops.segment_sum(data=tf_x, segment_ids=indices) - with self.assertRaisesOpError("segment ids must be >= 0"): - s.eval() + for use_gpu in [True, False]: + with self.test_session(use_gpu=use_gpu): + tf_x, _ = self._input(shape, dtype=dtypes_lib.float32) + indices = [0, 0, 0, -1] + s = math_ops.segment_sum(data=tf_x, segment_ids=indices) + with self.assertRaisesOpError("segment ids must be >= 0"): + s.eval() def testSegmentIdsInvalid5(self): shape = [4, 4] - with self.test_session(): - tf_x, _ = self._input(shape) - indices = [0, 0, 0, -2] - s = math_ops.segment_sum(data=tf_x, segment_ids=indices) - with self.assertRaisesOpError("segment ids must be >= 0"): - s.eval() + for use_gpu in [True, False]: + with self.test_session(use_gpu=use_gpu): + tf_x, _ = self._input(shape, dtype=dtypes_lib.float32) + indices = [0, 0, 0, -2] + s = math_ops.segment_sum(data=tf_x, segment_ids=indices) + with self.assertRaisesOpError("segment ids must be >= 0"): + s.eval() def testGradient(self): shape = [4, 4] @@ -341,7 +352,7 @@ class UnsortedSegmentSumTest(SegmentReductionHelper): with self.test_session(use_gpu=True): tf_x, np_x = self._input(shape, dtype=dtypes_lib.float64) s = math_ops.unsorted_segment_max(data=tf_x, segment_ids=indices, - num_segments=num_segments) + num_segments=num_segments) jacob_t, jacob_n = gradient_checker.compute_gradient( tf_x, shape, @@ -635,6 +646,64 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper): with 
self.assertRaisesOpError(r"Segment id 0 out of range \[0, 0\)"): s.eval() +class SegmentReductionOpBenchmark(test.Benchmark): + outer_dim_options = [2**x for x in range(9, 14, 2)] + ratio_options = [2**x for x in range(1, 6, 2)] + inner_dim_options = [2**x for x in range(9, 14, 2)] + #randomly generated sizes with less alignments + inner_dim_options += [1120, 1215, 1856, 1302, 1329, 1531, 1313, 1672, 1851, 1584] + dtype_options = [np.float32, np.float64] + options = (outer_dim_options, + ratio_options, inner_dim_options, dtype_options) + op_functors = [lambda vc, vs, seg_ids: + ("sorted", math_ops.segment_sum(vc, vs)), + lambda vc, vs, seg_ids: + ("unsorted", math_ops.unsorted_segment_sum(vc, vs, seg_ids[-1]+1))] + repeat = 10 + + def _npTypeToStr(self, t): + if t == np.float32: + return "fp32" + if t == np.float64: + return "fp64" + + def _runGraph(self, op_functor, outer_dim, ratio, inner_dim, dtype): + output_outer_dim = int(outer_dim/ratio) + const = np.random.randint(5, size=(outer_dim, inner_dim)) + seg_ids = np.sort(np.random.randint( + output_outer_dim, size=outer_dim)) + vs = variables.Variable(seg_ids.astype(np.int32)) + with ops.device("/gpu:0"): + vc = variables.Variable(const.astype(dtype)) + name, op = op_functor(vc, vs, seg_ids) + with session.Session() as sess: + variables.global_variables_initializer().run() + r = self.run_op_benchmark(sess, op, min_iters=self.repeat, + name="_".join(map(str, + [name, + outer_dim, + ratio, + inner_dim, + self._npTypeToStr(dtype)]))) + return name, r["wall_time"] + + def benchmarkSegmentSumGPU(self): + if not test.is_gpu_available(cuda_only=True): + return + for outer_dim, ratio, inner_dim, dtype in itertools.product(*self.options): + output_outer_dim = int(outer_dim/ratio) + op_functor = self.op_functors[0] + with ops.Graph().as_default(): + self._runGraph(op_functor, outer_dim, ratio, inner_dim, dtype) + + def benchmarkUnsortedSegmentSumGPU(self): + if not test.is_gpu_available(cuda_only=True): + return + for outer_dim, ratio, inner_dim, dtype in itertools.product(*self.options): + output_outer_dim = int(outer_dim/ratio) + op_functor = self.op_functors[1] + with ops.Graph().as_default(): + self._runGraph(op_functor, outer_dim, ratio, inner_dim, dtype) if __name__ == "__main__": test.main() diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py index 68293aa5fe5..41c67743b6d 100644 --- a/tensorflow/python/layers/convolutional.py +++ b/tensorflow/python/layers/convolutional.py @@ -24,6 +24,7 @@ import six from six.moves import xrange # pylint: disable=redefined-builtin import numpy as np +from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops @@ -171,7 +172,7 @@ class _Conv(base.Layer): padding=self.padding.upper(), data_format=utils.convert_data_format(self.data_format, self.rank + 2)) - if self.bias is not None: + if self.use_bias: if self.data_format == 'channels_first': if self.rank == 1: # nn.bias_add does not accept a 1D input tensor. 
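(Aside: the Conv2DTranspose and Conv3DTranspose hunks below keep the static output-shape inference for graph mode only. For reference, a sketch of the length arithmetic, assumed to match the utils.deconv_output_length helper those hunks call:)

def deconv_output_length(input_length, filter_size, padding, stride):
  # Transposed-convolution output length for one spatial dimension.
  if input_length is None:
    return None
  input_length *= stride
  if padding == 'valid':
    input_length += max(filter_size - stride, 0)
  elif padding == 'full':
    input_length -= (stride + filter_size - 2)
  return input_length  # 'same' keeps input_length * stride


print(deconv_output_length(7, filter_size=3, padding='valid', stride=2))  # 15
print(deconv_output_length(7, filter_size=3, padding='same', stride=2))   # 14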
@@ -988,7 +989,7 @@ class SeparableConv2D(Conv2D): rate=self.dilation_rate, data_format=utils.convert_data_format(self.data_format, ndim=4)) - if self.bias is not None: + if self.use_bias: outputs = nn.bias_add( outputs, self.bias, @@ -1293,20 +1294,21 @@ class Conv2DTranspose(Conv2D): padding=self.padding.upper(), data_format=utils.convert_data_format(self.data_format, ndim=4)) - # Infer the static output shape: - out_shape = inputs.get_shape().as_list() - out_shape[c_axis] = self.filters - out_shape[h_axis] = utils.deconv_output_length(out_shape[h_axis], - kernel_h, - self.padding, - stride_h) - out_shape[w_axis] = utils.deconv_output_length(out_shape[w_axis], - kernel_w, - self.padding, - stride_w) - outputs.set_shape(out_shape) + if context.in_graph_mode(): + # Infer the static output shape: + out_shape = inputs.get_shape().as_list() + out_shape[c_axis] = self.filters + out_shape[h_axis] = utils.deconv_output_length(out_shape[h_axis], + kernel_h, + self.padding, + stride_h) + out_shape[w_axis] = utils.deconv_output_length(out_shape[w_axis], + kernel_w, + self.padding, + stride_w) + outputs.set_shape(out_shape) - if self.bias: + if self.use_bias: outputs = nn.bias_add( outputs, self.bias, @@ -1591,24 +1593,25 @@ class Conv3DTranspose(Conv3D): data_format=utils.convert_data_format(self.data_format, ndim=5), padding=self.padding.upper()) - # Infer the static output shape: - out_shape = inputs.get_shape().as_list() - out_shape[c_axis] = self.filters - out_shape[d_axis] = utils.deconv_output_length(out_shape[d_axis], - kernel_d, - self.padding, - stride_d) - out_shape[h_axis] = utils.deconv_output_length(out_shape[h_axis], - kernel_h, - self.padding, - stride_h) - out_shape[w_axis] = utils.deconv_output_length(out_shape[w_axis], - kernel_w, - self.padding, - stride_w) - outputs.set_shape(out_shape) + if context.in_graph_mode(): + # Infer the static output shape: + out_shape = inputs.get_shape().as_list() + out_shape[c_axis] = self.filters + out_shape[d_axis] = utils.deconv_output_length(out_shape[d_axis], + kernel_d, + self.padding, + stride_d) + out_shape[h_axis] = utils.deconv_output_length(out_shape[h_axis], + kernel_h, + self.padding, + stride_h) + out_shape[w_axis] = utils.deconv_output_length(out_shape[w_axis], + kernel_w, + self.padding, + stride_w) + outputs.set_shape(out_shape) - if self.bias: + if self.use_bias: outputs_shape = outputs.shape.as_list() if self.data_format == 'channels_first': outputs_4d = array_ops.reshape(outputs, [ diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py index 62f5881f164..1fc2d70f9ca 100644 --- a/tensorflow/python/layers/normalization.py +++ b/tensorflow/python/layers/normalization.py @@ -25,6 +25,7 @@ import six from six.moves import xrange # pylint: disable=redefined-builtin import numpy as np +from tensorflow.python.eager import context from tensorflow.python.framework import dtypes from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import ops @@ -242,15 +243,20 @@ class BatchNormalization(base.Layer): initializer=init_ops.zeros_initializer(), trainable=False) return var + with ops.device(None): - with ops.device(lambda _: self.moving_mean.device): + device = ((lambda _: self.moving_mean.device) + if context.in_graph_mode() else self.moving_mean.device) + with ops.device(device): self.renorm_mean = _renorm_variable('renorm_mean', (param_dim,)) self.renorm_mean_weight = _renorm_variable('renorm_mean_weight', ()) # We initialize renorm_stddev to 0, and maintain the 
(0-initialized) # renorm_stddev_weight. This allows us to (1) mix the average # stddev with the minibatch stddev early in training, and (2) compute # the unbiased average stddev by dividing renorm_stddev by the weight. - with ops.device(lambda _: self.moving_variance.device): + device = ((lambda _: self.moving_variance.device) + if context.in_graph_mode() else self.moving_variance.device) + with ops.device(device): self.renorm_stddev = _renorm_variable('renorm_stddev', (param_dim,)) self.renorm_stddev_weight = _renorm_variable( 'renorm_stddev_weight', ()) @@ -301,8 +307,12 @@ class BatchNormalization(base.Layer): self.moving_mean, mean, decay, zero_debias=False) variance_update = moving_averages.assign_moving_average( self.moving_variance, variance, decay, zero_debias=False) - self.add_update(mean_update, inputs=inputs) - self.add_update(variance_update, inputs=inputs) + if context.in_graph_mode(): + # Note that in Eager mode, the updates are already executed when running + # assign_moving_averages. So we do not need to put them into + # collections. + self.add_update(mean_update, inputs=inputs) + self.add_update(variance_update, inputs=inputs) return output @@ -335,6 +345,7 @@ class BatchNormalization(base.Layer): r = _smart_select(training, lambda: r, lambda: array_ops.ones_like(r)) d = _smart_select(training, lambda: d, lambda: array_ops.zeros_like(d)) decay = _smart_select(training, lambda: self.renorm_momentum, lambda: 1.) + def _update_renorm_variable(var, weight, value): """Updates a moving average and weight, returns the unbiased value.""" # Update the variables without zero debiasing. The debiasing will be @@ -418,9 +429,9 @@ class BatchNormalization(base.Layer): self.moving_mean, new_mean, decay, zero_debias=False) variance_update = moving_averages.assign_moving_average( self.moving_variance, new_variance, decay, zero_debias=False) - - self.add_update(mean_update, inputs=inputs) - self.add_update(variance_update, inputs=inputs) + if context.in_graph_mode(): + self.add_update(mean_update, inputs=inputs) + self.add_update(variance_update, inputs=inputs) else: mean, variance = self.moving_mean, self.moving_variance @@ -566,7 +577,6 @@ def batch_normalization(inputs, BatchNorm = BatchNormalization batch_norm = batch_normalization - # Helper function diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 2b9306e8748..33ba5df7a6e 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -1466,12 +1466,15 @@ def zeros_like(tensor, dtype=None, name=None, optimize=True): with ops.name_scope(name, "zeros_like", [tensor]) as name: tensor = ops.convert_to_tensor(tensor, name="tensor") - if tensor.shape.is_fully_defined(): + # For now, variant types must be created via zeros_like; as we need to + # pass the input variant object to the proper zeros callback. + + if tensor.shape.is_fully_defined() and tensor.dtype != dtypes.variant: # We can produce a zeros tensor independent of the value of 'tensor', # since the shape is known statically. 
return zeros(tensor.shape, dtype=dtype or tensor.dtype, name=name) - if dtype is not None and dtype != tensor.dtype: + if dtype is not None and dtype != tensor.dtype and dtype != dtypes.variant: return zeros( shape_internal(tensor, optimize=optimize), dtype=dtype, name=name) else: diff --git a/tensorflow/python/ops/distributions/util.py b/tensorflow/python/ops/distributions/util.py index 59add19a581..0c6096a0755 100644 --- a/tensorflow/python/ops/distributions/util.py +++ b/tensorflow/python/ops/distributions/util.py @@ -770,7 +770,7 @@ def fill_lower_triangular(x, validate_args=False, name="fill_lower_triangular"): else: d = math_ops.cast(array_ops.shape(x)[-1], dtype=dtypes.float32) # d = n(n+1)/2 implies n is: - n = math_ops.cast(0.5 * (dtypes.sqrt(1. + 8. * d) - 1.), + n = math_ops.cast(0.5 * (math_ops.sqrt(1. + 8. * d) - 1.), dtype=dtypes.int32) if validate_args: is_valid_input_shape = check_ops.assert_equal( diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py index 64987f93ddb..cb7d409f3bc 100644 --- a/tensorflow/python/ops/gradients_impl.py +++ b/tensorflow/python/ops/gradients_impl.py @@ -278,7 +278,7 @@ def _VerifyGeneratedGradients(grads, op): "inputs %d" % (len(grads), op.node_def, len(op.inputs))) -def _StopOps(from_ops, pending_count): +def _StopOps(from_ops, stop_gradient_ops, pending_count): """The set of ops that terminate the gradient computation. This computes the frontier of the forward graph *before* which backprop @@ -288,8 +288,11 @@ def _StopOps(from_ops, pending_count): `_PendingCount(g, xs, from_ops)`. An 'op' has predecessors in `from_ops` iff pending_count[op._id] > 0. + In addition, none of `stop_gradient_ops` will be differentiated. + Args: from_ops: list of Operations. + stop_gradient_ops: list of Operations never to backprop through. pending_count: List of integers, indexed by operation id. Returns: @@ -304,6 +307,7 @@ def _StopOps(from_ops, pending_count): break if is_stop_op: stop_ops.add(op._id) + stop_ops.update(op._id for op in stop_gradient_ops) # pylint: disable=protected-access return stop_ops @@ -374,17 +378,17 @@ def gradients(ys, name="gradients", colocate_gradients_with_ops=False, gate_gradients=False, - aggregation_method=None): - """Constructs symbolic partial derivatives of sum of `ys` w.r.t. x in `xs`. + aggregation_method=None, + stop_gradients=None): + """Constructs symbolic derivatives of sum of `ys` w.r.t. x in `xs`. `ys` and `xs` are each a `Tensor` or a list of tensors. `grad_ys` is a list of `Tensor`, holding the gradients received by the `ys`. The list must be the same length as `ys`. - `gradients()` adds ops to the graph to output the partial - derivatives of `ys` with respect to `xs`. It returns a list of - `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)` - for y in `ys`. + `gradients()` adds ops to the graph to output the derivatives of `ys` with + respect to `xs`. It returns a list of `Tensor` of length `len(xs)` where + each tensor is the `sum(dy/dx)` for y in `ys`. `grad_ys` is a list of tensors of the same length as `ys` that holds the initial gradients for each y in `ys`. When `grad_ys` is None, @@ -394,6 +398,31 @@ def gradients(ys, one wanted to weight the gradient differently for each value in each y). + `stop_gradients` is a `Tensor` or a list of tensors to be considered constant + with respect to all `xs`. These tensors will not be backpropagated through, + as though they had been explicitly disconnected using `stop_gradient`. 
Among + other things, this allows computation of partial derivatives as opposed to + total derivatives. For example: + + a = tf.constant(0.) + b = 2 * a + g = tf.gradients(a + b, [a, b], stop_gradients=[a, b]) + + Here the partial derivatives `g` evaluate to `[1.0, 1.0]`, compared to the + total derivatives `tf.gradients(a + b, [a, b])`, which take into account the + influence of `a` on `b` and evaluate to `[3.0, 1.0]`. Note that the above is + equivalent to: + + a = tf.stop_gradient(tf.constant(0.)) + b = tf.stop_gradient(2 * a) + g = tf.gradients(a + b, [a, b]) + + `stop_gradients` provides a way of stopping gradient after the graph has + already been constructed, as compared to `tf.stop_gradient` which is used + during graph construction. When the two approaches are combined, + backpropagation stops at both `tf.stop_gradient` nodes and nodes in + `stop_gradients`, whichever is encountered first. + Args: ys: A `Tensor` or list of tensors to be differentiated. xs: A `Tensor` or list of tensors to be used for differentiation. @@ -407,6 +436,8 @@ def gradients(ys, for an operations. This avoids some race conditions. aggregation_method: Specifies the method used to combine gradient terms. Accepted values are constants defined in the class `AggregationMethod`. + stop_gradients: Optional. A `Tensor` or list of tensors not to differentiate + through. Returns: A list of `sum(dy/dx)` for each x in `xs`. @@ -423,12 +454,15 @@ def gradients(ys, "functions in tf.contrib.eager.backprop instead.") ys = _AsList(ys) xs = _AsList(xs) + stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients) if grad_ys is None: grad_ys = [None] * len(ys) else: grad_ys = _AsList(grad_ys) - with ops.name_scope(name, "gradients", ys + xs + grad_ys) as grad_scope: + with ops.name_scope( + name, "gradients", + list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope: ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y") xs = [x.handle if isinstance(x, resource_variable_ops.ResourceVariable) else x @@ -450,6 +484,7 @@ def gradients(ys, ys = [array_ops.identity(y) if y.consumers() else y for y in ys] to_ops = [t.op for t in ys] from_ops = [t.op for t in xs] + stop_gradient_ops = [t.op for t in stop_gradients] pending_count, loop_state = _PendingCount(ops.get_default_graph(), to_ops, from_ops, colocate_gradients_with_ops) @@ -488,8 +523,7 @@ def gradients(ys, _SetGrad(grads, y, loop_state.ZerosLikeForExit(y)) queue.append(y.op) - # The set of 'from_ops'. - stop_ops = _StopOps(from_ops, pending_count) + stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count) while queue: # generate gradient subgraph for op. op = queue.popleft() diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py index 11c204b5b7f..7a561d046a8 100644 --- a/tensorflow/python/ops/gradients_test.py +++ b/tensorflow/python/ops/gradients_test.py @@ -349,6 +349,64 @@ class GradientsTest(test_util.TensorFlowTestCase): g = gradients.gradients([z, z2], x) self.assertAllClose(17502.0, g[0].eval()) + def testPartialDerivatives(self): + with self.test_session(): + x = constant_op.constant(1.) 
+ y = 2 * x + z = x + y + totalg = gradients.gradients(z, [x, y]) + self.assertEqual([3.0, 1.0], [g.eval() for g in totalg]) + partialg = gradients.gradients(z, [x, y], stop_gradients=[x, y]) + self.assertEqual([1.0, 1.0], [g.eval() for g in partialg]) + + def testStopGradients(self): + def _MakeGraph(rng, stop_gradients=()): + def _FunctionOf(xs, k=3): + return ops.convert_to_tensor( + sum(math_ops.matmul(rng.rand(k, k), x) for x in xs) + + rng.rand(k, k)) + + a = _FunctionOf([]) + if "a" in stop_gradients: a = array_ops.stop_gradient(a) + b = _FunctionOf([a]) + if "b" in stop_gradients: b = array_ops.stop_gradient(b) + c = _FunctionOf([a, b]) + if "c" in stop_gradients: c = array_ops.stop_gradient(c) + d = _FunctionOf([b, c]) + if "d" in stop_gradients: d = array_ops.stop_gradient(d) + return dict(a=a, b=b, c=c, d=d) + + def _Gradients(ys, xs, **kwargs): + dydxs = gradients.gradients(ys, xs, **kwargs) + dydxs = [0. * x if dydx is None else dydx + for x, dydx in zip(xs, dydxs)] + return dydxs + + seed = np.random.randint(1000) + cases = [] + subsets = [""] + "a b c d ab ac ad bc bd cd abc abd acd bcd abcd".split() + graph = _MakeGraph(np.random.RandomState(seed)) + for constants in subsets: + graph_with_stops = _MakeGraph(np.random.RandomState(seed), constants) + for variables_ in subsets: + # compute the gradient when stopped using tf.stop_gradients + grad1 = _Gradients([graph_with_stops["d"]], + [graph_with_stops[v] for v in variables_]) + # compute the gradient when stopped using the stop_gradients kwarg + grad2 = _Gradients([graph["d"]], + [graph[v] for v in variables_], + stop_gradients=[graph[v] for v in constants]) + cases.append(dict(grad1=grad1, grad2=grad2, + constants=constants, variables=variables_)) + + # evaluate all tensors in one call to session.run for speed + with self.test_session() as session: + results = session.run([(case["grad1"], case["grad2"]) for case in cases]) + + for (npgrad1, npgrad2), case in zip(results, cases): + for a, b in zip(npgrad1, npgrad2): + np.testing.assert_allclose(a, b) + class FunctionGradientsTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/python/ops/io_ops.py b/tensorflow/python/ops/io_ops.py index 5cd5d7ba2f3..bd879ac4238 100644 --- a/tensorflow/python/ops/io_ops.py +++ b/tensorflow/python/ops/io_ops.py @@ -37,6 +37,7 @@ See the @{$python/io_ops} guide. @@parse_example @@parse_single_example @@parse_tensor +@@serialize_tensor @@decode_json_example @@QueueBase @@FIFOQueue diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py index 3b0a357b164..16320f75849 100644 --- a/tensorflow/python/ops/metrics_impl.py +++ b/tensorflow/python/ops/metrics_impl.py @@ -463,10 +463,16 @@ def _confusion_matrix_at_thresholds( if include not in all_includes: raise ValueError('Invaild key: %s.' 
% include) - predictions, labels, weights = _remove_squeezable_dimensions( - predictions=math_ops.to_float(predictions), - labels=math_ops.cast(labels, dtype=dtypes.bool), - weights=weights) + with ops.control_dependencies([ + check_ops.assert_greater_equal( + predictions, 0.0, message='predictions must be in [0, 1]'), + check_ops.assert_less_equal( + predictions, 1.0, message='predictions must be in [0, 1]') + ]): + predictions, labels, weights = _remove_squeezable_dimensions( + predictions=math_ops.to_float(predictions), + labels=math_ops.cast(labels, dtype=dtypes.bool), + weights=weights) num_thresholds = len(thresholds) diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py index e0e3d08e7ce..bf7c9fac8ed 100644 --- a/tensorflow/python/ops/parsing_ops.py +++ b/tensorflow/python/ops/parsing_ops.py @@ -40,6 +40,7 @@ from tensorflow.python.platform import tf_logging ops.NotDifferentiable("DecodeRaw") ops.NotDifferentiable("ParseTensor") +ops.NotDifferentiable("SerializeTensor") ops.NotDifferentiable("StringToNumber") diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py index 1d747f84008..2cae16f44cc 100644 --- a/tensorflow/python/ops/resource_variable_ops.py +++ b/tensorflow/python/ops/resource_variable_ops.py @@ -19,11 +19,14 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from autograd import core as ag_core + from tensorflow.core.framework import attr_value_pb2 from tensorflow.core.framework import variable_pb2 from tensorflow.python.eager import context from tensorflow.python.eager import custom_gradient from tensorflow.python.eager import tape +from tensorflow.python.eager import tensor_node from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape @@ -502,6 +505,8 @@ class ResourceVariable(variables.Variable): def sparse_read(self, indices, name=None): """Reads the value of this variable sparsely, using `gather`.""" with ops.name_scope("Gather" if name is None else name) as name: + if self._trainable: + tape.watch(self._handle) value = resource_gather( self._handle, indices, dtype=self._dtype, name=name) return array_ops.identity(value) @@ -574,7 +579,14 @@ class ResourceVariable(variables.Variable): def _run_op(a, *args): # pylint: disable=protected-access - return getattr(ops.Tensor, operator)(a._AsTensor(), *args) + value = a._AsTensor() + if ag_core.isnode(value): + # This avoids autograd trying to wrap a ResourceVariable. 
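+        # Convert the value and the remaining operands to tensors before
+        # dispatching to the TensorNode implementation of the operator.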
+ value = ops.convert_to_tensor(value) + args = [ops.convert_to_tensor(x) for x in args] + return getattr(tensor_node.TensorNode, operator)(value, *args) + else: + return getattr(ops.Tensor, operator)(value, *args) # Propagate __doc__ to wrapper try: diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py index b1626feb27a..25a0ad0a37e 100644 --- a/tensorflow/python/ops/rnn_cell_impl.py +++ b/tensorflow/python/ops/rnn_cell_impl.py @@ -28,6 +28,7 @@ import collections import hashlib import numbers +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -124,9 +125,10 @@ def _zero_state_tensors(state_size, batch_size, dtype): def get_state_shape(s): """Combine s with batch_size to get a proper tensor shape.""" c = _concat(batch_size, s) - c_static = _concat(batch_size, s, static=True) size = array_ops.zeros(c, dtype=dtype) - size.set_shape(c_static) + if context.in_graph_mode(): + c_static = _concat(batch_size, s, static=True) + size.set_shape(c_static) return size return nest.map_structure(get_state_shape, state_size) diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py index 9093c12968f..645775239fd 100644 --- a/tensorflow/python/ops/variable_scope.py +++ b/tensorflow/python/ops/variable_scope.py @@ -1698,7 +1698,7 @@ def variable_scope(name_or_scope, use when doing asynchronous distributed training. Returns: - A scope that can be to captured and reused. + A scope that can be captured and reused. Raises: ValueError: when trying to reuse within a create scope, or create within diff --git a/tensorflow/python/profiler/model_analyzer.py b/tensorflow/python/profiler/model_analyzer.py index 53459496646..a1fe47982f0 100644 --- a/tensorflow/python/profiler/model_analyzer.py +++ b/tensorflow/python/profiler/model_analyzer.py @@ -117,7 +117,7 @@ class Profiler(object): ```python Typical use case: # Currently we are only allowed to create 1 profiler per process. - profiler = Profile(sess.graph) + profiler = Profiler(sess.graph) for i in xrange(total_steps): if i % 10000 == 0: @@ -174,7 +174,7 @@ class Profiler(object): """Add statistics of a step. Args: - step: A step uint64 used to identify the RunMetadata. Must be different + step: int, A step used to identify the RunMetadata. Must be different across different AddStep() calls. run_meta: RunMetadata proto that contains statistics of a session run. """ diff --git a/tensorflow/python/tools/import_pb_to_tensorboard.py b/tensorflow/python/tools/import_pb_to_tensorboard.py index a8712fc37e6..00de044505f 100644 --- a/tensorflow/python/tools/import_pb_to_tensorboard.py +++ b/tensorflow/python/tools/import_pb_to_tensorboard.py @@ -51,7 +51,7 @@ def import_to_tensorboard(model_dir, log_dir): pb_visual_writer = summary.FileWriter(log_dir) pb_visual_writer.add_graph(sess.graph) print("Model Imported. Visualize by running: " - "> tensorboard --logdir={}".format(log_dir)) + "tensorboard --logdir={}".format(log_dir)) def main(unused_args): diff --git a/tensorflow/python/training/checkpoint_ops.py b/tensorflow/python/training/checkpoint_ops.py new file mode 100644 index 00000000000..70460ceb480 --- /dev/null +++ b/tensorflow/python/training/checkpoint_ops.py @@ -0,0 +1,453 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Operations for generating and loading vocab remappings.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_checkpoint_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import math_ops + +ops.NotDifferentiable("GenerateVocabRemapping") +ops.NotDifferentiable("LoadAndRemapMatrix") + + +def _load_and_remap_matrix(ckpt_path, + old_tensor_name, + new_row_vocab_offset, + num_rows_to_load, + new_col_vocab_size, + initializer, + old_row_vocab_file=None, + new_row_vocab_file=None, + old_col_vocab_file=None, + new_col_vocab_file=None, + num_row_oov_buckets=0, + num_col_oov_buckets=0, + max_rows_in_memory=-1): + """Loads a 2-D (matrix) `Tensor` from checkpoint. + + Generates 1D-remappings for rows and columns using the + `GenerateVocabRemapping` op, and initializes any anticipated values with the + provided initializer. Then, uses the `LoadAndRemapMatrix` op to create a + matrix that loads existing values from the checkpoint, while filling out + "missing" values with the newly initialized values. See + contrib/framework/ops/checkpoint_ops.cc for more information on the wrapped + functionality (LoadAndRemapMatrix). This wrapper can be used to perform only + row remapping or only col remapping. If only row remapping is desired, + {new,old}_col_vocab_file should be `None`, and vice versa for column + remapping. + + NOTE: This only supports div-partitioning the vocabulary on the 1st dimension + (row axis) via `new_row_vocab_offset`. + + Args: + ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) + from which the old matrix `Tensor` will be loaded. + old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint. + new_row_vocab_offset: A 0-indexed integer representing what line to + start reading at in the new row vocabulary. Used for partitioned + variables. + num_rows_to_load: Number of rows to load for the new vocabulary (note: to + support variable partitioning and partial loading, this does not need to + be the same as the number of entries in `new_row_vocab_file`). + new_col_vocab_size: Number of columns to load - should be the same as the + number of entries in `new_col_vocab_file`, since we don't support + partitioning along the column axis. + initializer: Callable initializer function that accepts a 1-D tensor as the + arg to specify the shape of the returned tensor. Used to initialize + missing values. + old_row_vocab_file: A scalar `Tensor` of type `string` containing the + path to the old row vocabulary file. Can be None, which represents no + remapping on the row axis. 
+ new_row_vocab_file: A scalar `Tensor` of type `string` containing the path + to the new row vocabulary file. Can be None, which represents no remapping + on the row axis - in which case, `new_row_vocab_offset` and + `num_rows_to_load` work under the assumption that the new row vocab is the + same as the old row vocab. + old_col_vocab_file: A scalar `Tensor` of type `string` containing the + path to the old column vocabulary file. Can be None, which represents no + remapping on the column axis. + new_col_vocab_file: A scalar `Tensor` of type `string` containing the path + to the new column vocabulary file. Can be None, which represents no + remapping on the column axis - in which case, `new_col_vocab_size` works + under the assumption that the new col vocab is the same as the old col + vocab. + num_row_oov_buckets: `int` specifying the number of out-of-vocabulary rows + to append. Must be >= 0. + num_col_oov_buckets: `int` specifying the number of out-of-vocabulary + columns to append. Must be >= 0. + max_rows_in_memory: `int` specifying the maximum number of rows to load from + the checkpoint at once. If less than or equal to 0, the entire matrix will + be loaded into memory. Setting this arg trades increased disk reads for + lower memory usage. + + Returns: + A Tensor of shape `[num_rows_to_load + num_row_oov_buckets, + new_col_vocab_size + num_col_oov_buckets]`, with values loaded from the + specified tensor in the checkpoint, and any missing or OOV values + initialized with the given `initializer`. + + Raises: + ValueError: If `num_row_oov_buckets` or `num_col_oov_buckets` < 0. + ValueError: If either `old_row_vocab_file` or `new_row_vocab_file` is + provided, while the other is not. Same for `old_col_vocab_file` and + `new_col_vocab_file`. + ValueError: If neither row vocabs or col vocabs are provided. + """ + if num_row_oov_buckets < 0: + raise ValueError("num_row_oov_buckets must be >= 0, but received %d" % + num_row_oov_buckets) + if num_col_oov_buckets < 0: + raise ValueError("num_col_oov_buckets must be >= 0, but received %d" % + num_col_oov_buckets) + + if bool(old_row_vocab_file) != bool(new_row_vocab_file): + raise ValueError( + "old_row_vocab_file and new_row_vocab_file must both be specified or " + "left unspecified. old_row_vocab_file='{}', new_row_vocab_file='{}'". + format(old_row_vocab_file, new_row_vocab_file)) + if bool(old_col_vocab_file) != bool(new_col_vocab_file): + raise ValueError( + "old_col_vocab_file and new_col_vocab_file must both be specified or " + "left unspecified. old_col_vocab_file='{}', new_col_vocab_file='{}'". + format(old_col_vocab_file, new_col_vocab_file)) + + remap_rows = new_row_vocab_file and old_row_vocab_file + remap_cols = new_col_vocab_file and old_col_vocab_file + if not (remap_rows or remap_cols): + raise ValueError( + "Must provide either row or column vocab files. If no remapping is " + "necessary, consider using `tf.contrib.framework.init_from_checkpoint` " + "instead.") + + num_rows_present = num_rows_to_load + if remap_rows: + row_remapping, num_rows_present = ( + gen_checkpoint_ops._generate_vocab_remapping( # pylint: disable=protected-access + new_vocab_file=new_row_vocab_file, + old_vocab_file=old_row_vocab_file, + new_vocab_offset=new_row_vocab_offset, + num_new_vocab=num_rows_to_load)) + else: + # Even when the rows are not being reordered, we still need to generate a + # remapping to account for initializing partitioned Variables (when + # new_row_vocab_offset is non-zero). 
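+      # This identity remapping simply selects rows [new_row_vocab_offset,
+      # new_row_vocab_offset + num_rows_to_load) of the old matrix in order.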
+ row_remapping = math_ops.range( + new_row_vocab_offset, + new_row_vocab_offset + num_rows_to_load, + dtype=dtypes.int64) + + col_remapping = [] + num_cols_present = new_col_vocab_size + if remap_cols: + col_remapping, num_cols_present = ( + gen_checkpoint_ops._generate_vocab_remapping( # pylint: disable=protected-access + new_vocab_file=new_col_vocab_file, + old_vocab_file=old_col_vocab_file, + new_vocab_offset=0, # Offset is unused for cols (no partitioning). + num_new_vocab=new_col_vocab_size)) + + init_vals = initializer([ + num_rows_to_load * new_col_vocab_size - + num_rows_present * num_cols_present, 1 + ]) + return_tensor = gen_checkpoint_ops._load_and_remap_matrix( # pylint: disable=protected-access + ckpt_path=ckpt_path, + old_tensor_name=old_tensor_name, + row_remapping=row_remapping, + col_remapping=col_remapping, + initializing_values=init_vals, + num_rows=num_rows_to_load, + num_cols=new_col_vocab_size, + max_rows_in_memory=max_rows_in_memory) + + # Add OOV row(s) and column(s). + if num_row_oov_buckets > 0: + init_row_oov_val = initializer([num_row_oov_buckets, new_col_vocab_size]) + init_row_oov_val = ops.convert_to_tensor(init_row_oov_val) + return_tensor = array_ops.concat([return_tensor, init_row_oov_val], 0) + if num_col_oov_buckets > 0: + # We need to add any row OOV to the new column shape. + init_col_oov_val = initializer( + [num_rows_to_load + num_row_oov_buckets, num_col_oov_buckets]) + init_col_oov_val = ops.convert_to_tensor(init_col_oov_val) + return_tensor = array_ops.concat([return_tensor, init_col_oov_val], 1) + + return return_tensor + + +def _load_and_remap_matrix_initializer(ckpt_path, + old_tensor_name, + new_row_vocab_size, + new_col_vocab_size, + old_row_vocab_file=None, + new_row_vocab_file=None, + old_col_vocab_file=None, + new_col_vocab_file=None, + num_row_oov_buckets=0, + num_col_oov_buckets=0, + initializer=None, + max_rows_in_memory=-1): + r"""Returns a var initializer for loading and remapping a 2-D (matrix) tensor. + + The returned initializer loads a 2-D (matrix) `Tensor` with name + `old_tensor_name` from the checkpoint at `ckpt_path`. It will reorder the + rows/columns according to the specified vocab files and append additional + out-of-vocabulary rows/columns according to the number of OOV buckets. + + The format of the file at the `{old,new}_{row,col}_vocab_file` path should be + a text file, with each line containing a single entity within the vocabulary. + Let the function `line_of(f, "x")` return the 0-indexed line number of the + entity "x" in file f, and the function `entity_at(f, i)` return the entity at + line i of file f. Then, row i of the new output matrix will be taken from row + `line_of(old_row_vocab_file, entity_at(new_row_vocab_file, i))` of the old + matrix. If any entity in `new_row_vocab_file` is not found in + `old_row_vocab_file`, that row is considered a "missing" row, and its values + will be initialized using the `initializer` arg. The same logic also applies + for the columns. 
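+
+  Put differently, whenever an entity is present in both the old and new vocab
+  files, the new output matrix takes
+  `new_matrix[i, j] = old_matrix[line_of(old_row_vocab_file,
+  entity_at(new_row_vocab_file, i)), line_of(old_col_vocab_file,
+  entity_at(new_col_vocab_file, j))]`.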
+ + For example, assuming that: + + * `old_row_vocab_file` contains "mercury\nvenus\nmars" + * `new_row_vocab_file` contains "venus\njupiter\nmercury" + * `old_col_vocab_file` contains "good\nbetter\nbest" + * `new_col_vocab_file` contains "good\nbest\nfantastic" + * `initializer` returns the natural numbers `[1, 2, 3, 4, ...]` + * `w(i, j)` represents the value from row i, column j of the old matrix + + Then the new output matrix will look like: + + `[[w(1, 0), w(1, 2), 1], + [2, 3, 4], + [w(0, 0), w(0, 2), 5]]` + + If we further specify that: + + * `num_row_oov_buckets` == 2 + * `num_col_oov_buckets` == 1 + + Then the new output matrix will look like: + + `[[w(1, 0), w(1, 2), 1, 12], + [2, 3, 4, 13], + [w(0, 0), w(0, 2), 5, 14], + [6, 7, 8, 15], + [9, 10, 11, 16]]` + + If `{old,new}_row_vocab_file` are None, we assume that the old and new row + vocab files are the same, and no row remapping is done. If + `{old,new}_col_vocab_file` are None, we assume that the old and new column + vocab files are the same, and no column remapping is done. + + The returned initializer only supports div-partitioning along the row axis. It + does not support partitioning along the column axis or mod-partitioning. + + NOTE: When this is used to warm-start variables, client code should use + `tf.lookup.index_table_from_tensor()` like + contrib/layers/python/layers/feature_column.py does, as opposed to + `tf.feature_to_id()` - in order to ensure the underlying lookup tables are the + same. + + Args: + ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) + from which the old matrix `Tensor` will be loaded. + old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint. + new_row_vocab_size: `int` specifying the number of entries in + `new_row_vocab_file`. If no row remapping is needed (no row vocab + provided), this should be equal to the number of rows to load from the old + matrix (which can theoretically be smaller than the number of rows in the + old matrix). + new_col_vocab_size: `int` specifying the number of entries in + `new_col_vocab_file`. If no column remapping is needed (no column vocab + provided), this should be equal to the number of columns in the old + matrix. + old_row_vocab_file: A scalar `Tensor` of type `string` containing the + path to the old row vocabulary file. Can be None, which represents no + remapping on the row axis. + new_row_vocab_file: A scalar `Tensor` of type `string` containing the path + to the new row vocabulary file. Can be None, which represents no remapping + on the row axis. + old_col_vocab_file: A scalar `Tensor` of type `string` containing the + path to the old column vocabulary file. Can be None, which represents no + remapping on the column axis. + new_col_vocab_file: A scalar `Tensor` of type `string` containing the path + to the new column vocabulary file. Can be None, which represents no + remapping on the column axis. + num_row_oov_buckets: `int` specifying the number of out-of-vocabulary rows + to append. Must be >= 0. + num_col_oov_buckets: `int` specifying the number of out-of-vocabulary + columns to append. Must be >= 0. + initializer: Initializer function to initialize missing values. Accepts a + 1-D tensor as the arg to specify the shape of the returned tensor. If + `None`, defaults to using `zeros_initializer()`. + max_rows_in_memory: `int` specifying the maximum number of rows to load from + the checkpoint at once. If less than or equal to 0, the entire matrix will + be loaded into memory. 
Setting this arg trades increased disk reads for + lower memory usage. + + Returns: + A variable initializer function that should be used to initialize a + (potentially partitioned) `Variable` whose complete shape is + `[new_row_vocab_size + num_row_oov_buckets, new_col_vocab_size + + num_col_oov_buckets]`. + + Raises: + TypeError: If `initializer` is specified but not callable. + """ + if initializer is None: + # TODO(b/25671353): Consider using sqrt(6/(fan_in + fan_out)) instead, from + # Glorot and Bengio, 2010. + initializer = init_ops.zeros_initializer() + + if not callable(initializer): + raise TypeError( + "initializer must be callable, instead of being {} of type {}.".format( + initializer, type(initializer))) + + def _initializer(shape, dtype=dtypes.float32, partition_info=None): + """Variable initializer. + + Args: + shape: Shape of `Tensor` to return. Should include OOV on both axes. + dtype: Must be float32. + partition_info: variable_scope._PartitionInfo. + + Returns: + `Tensor` of shape `shape`. + + Raises: + TypeError: If `dtype` is anything other than float32. + ValueError: For shape mismatch upon invocation. + """ + # Sanity checks. + if dtype != dtypes.float32: + raise TypeError( + "Currently, only float32 is supported. Received dtype: {}".format( + dtype)) + if len(shape) != 2: + raise ValueError("Expected 2-dim shape, but received: {}".format(shape)) + if shape[0] <= 0: + raise ValueError( + "Expected 1st dim of shape to be > 0, but received shape: {}".format( + shape)) + if shape[1] != (new_col_vocab_size + num_col_oov_buckets): + raise ValueError( + "Expected 2nd dim of shape to be new_col_vocab_size ({}) + " + "num_col_oov_buckets ({}) = {}, but received shape: {}".format( + new_col_vocab_size, num_col_oov_buckets, + new_col_vocab_size + num_col_oov_buckets, shape)) + + offset = 0 + if partition_info is not None: + offset = partition_info.single_offset(shape) + + if offset + shape[0] > new_row_vocab_size + num_row_oov_buckets: + raise ValueError( + "Trying to initialize {} additional rows after {} rows have already " + "been initialized, which would exceed expected total row count of " + "new_row_vocab_size ({}) + num_row_oov_buckets ({}) = {}.".format( + shape[0], offset, new_row_vocab_size, num_row_oov_buckets, + new_row_vocab_size + num_row_oov_buckets)) + + row_oov_buckets_to_use = min(shape[0], + max(0, offset + shape[0] - new_row_vocab_size)) + num_rows_to_load = shape[0] - row_oov_buckets_to_use + + return _load_and_remap_matrix( + ckpt_path=ckpt_path, + old_tensor_name=old_tensor_name, + new_row_vocab_offset=offset, + num_rows_to_load=num_rows_to_load, + new_col_vocab_size=new_col_vocab_size, + initializer=initializer, + old_row_vocab_file=old_row_vocab_file, + new_row_vocab_file=new_row_vocab_file, + old_col_vocab_file=old_col_vocab_file, + new_col_vocab_file=new_col_vocab_file, + num_row_oov_buckets=row_oov_buckets_to_use, + num_col_oov_buckets=num_col_oov_buckets, + max_rows_in_memory=max_rows_in_memory) + + return _initializer + + +def _load_embedding_initializer(ckpt_path, + embedding_tensor_name, + new_vocab_size, + embedding_dim, + old_vocab_file, + new_vocab_file, + num_oov_buckets=0, + initializer=None, + max_rows_in_memory=-1): + """Returns a variable initializer for loading pre-trained embeddings. + + Wrapper around `load_and_remap_matrix_initializer()` specialized for loading + embedding weights and remapping according to the provided vocab files. See + docs for `load_and_remap_matrix_initializer()` for more details. 
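+
+  A minimal usage sketch (the checkpoint path, tensor name and vocab file
+  names below are purely illustrative):
+
+    embedding_initializer = _load_embedding_initializer(
+        ckpt_path='/path/to/model.ckpt-5',
+        embedding_tensor_name='input_layer/embedding_weights',
+        new_vocab_size=100,
+        embedding_dim=16,
+        old_vocab_file='old_vocab.txt',
+        new_vocab_file='new_vocab.txt',
+        num_oov_buckets=1)
+    # Complete shape is [new_vocab_size + num_oov_buckets, embedding_dim].
+    embeddings = tf.get_variable(
+        'embeddings', shape=[101, 16], initializer=embedding_initializer)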
+ + NOTE: Only for use with div-partitioned variables / vocabularies. + + Args: + ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) + from which the old matrix `Tensor` will be loaded. + embedding_tensor_name: Name of the 2-D `Tensor` to load from checkpoint. + new_vocab_size: Number of entries in the new vocab. + embedding_dim: `int` specifying the dimension of the embedding vectors from + the checkpoint. Must match the number of columns in the old embedding + matrix. + old_vocab_file: A scalar `Tensor` of type `string` containing the + path to the old vocabulary file. + new_vocab_file: A scalar `Tensor` of type `string` containing the + path to the new vocabulary file. + num_oov_buckets: `int` specifying the number of out-of-vocabulary + buckets to use. Must be >= 0. + initializer: Initializer function that accepts a 1-D tensor as the arg to + specify the shape of the returned tensor. If `None`, defaults to using + `truncated_normal_initializer()`. + max_rows_in_memory: `int` specifying the maximum number of rows to load from + the checkpoint at once. If less than or equal to 0, the entire matrix will + be loaded into memory. Setting this arg trades increased disk reads for + lower memory usage. + + Returns: + A variable initializer function. + """ + if initializer is None: + # TODO(b/25671353): This should be kept in sync with the stddev used by + # feature_column.py's _EmbeddingColumn. + initializer = init_ops.truncated_normal_initializer( + stddev=1.0 / math.sqrt(embedding_dim)) + + return _load_and_remap_matrix_initializer( + ckpt_path=ckpt_path, + old_tensor_name=embedding_tensor_name, + new_row_vocab_size=new_vocab_size, + new_col_vocab_size=embedding_dim, + old_row_vocab_file=old_vocab_file, + new_row_vocab_file=new_vocab_file, + old_col_vocab_file=None, + new_col_vocab_file=None, + num_row_oov_buckets=num_oov_buckets, + num_col_oov_buckets=0, + initializer=initializer, + max_rows_in_memory=max_rows_in_memory) diff --git a/tensorflow/python/training/checkpoint_ops_test.py b/tensorflow/python/training/checkpoint_ops_test.py new file mode 100644 index 00000000000..39c4d2911f2 --- /dev/null +++ b/tensorflow/python/training/checkpoint_ops_test.py @@ -0,0 +1,305 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Functional tests for Python wrappers around warm-starting.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import numpy as np + +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import partitioned_variables +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables +from tensorflow.python.platform import test +from tensorflow.python.training import checkpoint_ops +from tensorflow.python.training import saver as saver_lib + + +class LoadAndRemapWrappersTest(test.TestCase): + """Tests for the functionality of the Python wrappers.""" + + def setUp(self): + ops.reset_default_graph() + # Create the checkpoint file in a temporary directory. + checkpoint_prefix = os.path.join(self.get_temp_dir(), 'model') + # 0., 1., ..., 79. reshaped into [5, 16]. + initializer = init_ops.constant_initializer( + np.reshape(np.linspace(0.0, 79, 5 * 16), (5, 16))) + with self.test_session() as sess: + with variable_scope.variable_scope('some_scope'): + variable_scope.get_variable(name='embeddings', shape=[5, 16], + initializer=initializer) + sess.run(variables.global_variables_initializer()) + saver = saver_lib.Saver() + saver.save(sess, checkpoint_prefix, global_step=5) + self.checkpoint_file = '{}-5'.format(checkpoint_prefix) + + # Create the vocabulary files. + self.new_feature_vocab_file = os.path.join( + self.get_temp_dir(), 'new_feature_vocab.txt') + with open(self.new_feature_vocab_file, 'w') as f: + f.write('\n'.join(['zero', 'one', 'two', 'three', 'four']) + '\n') + + self.old_feature_vocab_file = os.path.join( + self.get_temp_dir(), 'old_feature_vocab.txt') + with open(self.old_feature_vocab_file, 'w') as f: + f.write('\n'.join(['zero', 'one', 'two', 'three']) + '\n') + + self.new_class_vocab_file = os.path.join( + self.get_temp_dir(), 'new_class_vocab.txt') + with open(self.new_class_vocab_file, 'w') as f: + f.write('\n'.join(['MISSING', 'knitting', 'flask', 'eminem']) + '\n') + + self.old_class_vocab_file = os.path.join( + self.get_temp_dir(), 'old_class_vocab.txt') + with open(self.old_class_vocab_file, 'w') as f: + f.write('\n'.join(['knitting', 'eminem', 'MISSING']) + '\n') + + self.init_val = 42 + + def _init_val_initializer(shape, dtype=None, partition_info=None): + del dtype, partition_info # Unused by this unit-testing initializer. + return array_ops.tile( + constant_op.constant([[self.init_val]], dtype=dtypes.float32), shape) + + self.initializer = _init_val_initializer + + def test_load_and_remap_matrix(self): + """Tests the end-to-end loading / remapping of weights.""" + # _load_and_remap_matrix() is the generalized wrapper that takes in row and + # column vocabulary files, calls the relevant remappings, and returns the + # weight matrix. Take this example to be linear multi-class by providing + # both row and column vocabularies. 
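+    # The checkpoint variable created in setUp() is [5, 16] with value
+    # 16 * row + col in each cell, so the expected entries below are easy to
+    # trace; missing and OOV cells fall back to self.init_val (42).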
+ remapped_matrix = checkpoint_ops._load_and_remap_matrix( + new_row_vocab_file=self.new_feature_vocab_file, + old_row_vocab_file=self.old_feature_vocab_file, + num_rows_to_load=4, + new_col_vocab_file=self.new_class_vocab_file, + old_col_vocab_file=self.old_class_vocab_file, + new_col_vocab_size=4, + old_tensor_name='some_scope/embeddings', + ckpt_path=[self.checkpoint_file], + new_row_vocab_offset=1, + initializer=self.initializer, + num_row_oov_buckets=1, + num_col_oov_buckets=1) + + # [4 in vocab + 1 oov features, 4 in vocab + 1 oov classes]. The offset + # means we read + expected_remapped_matrix = np.concatenate( + [ + np.reshape([18, 34, 50, self.init_val, self.init_val], [5, 1]), + np.reshape([16, 32, 48, self.init_val, self.init_val], [5, 1]), + np.reshape([self.init_val] * 5, [5, 1]), + np.reshape([17, 33, 49, self.init_val, self.init_val], [5, 1]), + np.reshape([self.init_val] * 5, [5, 1]) + ], + axis=1) + + with self.test_session(): + self.assertAllClose(expected_remapped_matrix, remapped_matrix.eval()) + + def test_load_and_remap_output_layer_weight_initializer_linear(self): + """Tests for the output layer initializer in the linear multi-class case.""" + loading_initializer = (checkpoint_ops._load_and_remap_matrix_initializer( + new_row_vocab_size=5, + new_col_vocab_file=self.new_class_vocab_file, + old_col_vocab_file=self.old_class_vocab_file, + new_col_vocab_size=4, + old_tensor_name='some_scope/embeddings', + ckpt_path=[self.checkpoint_file], + new_row_vocab_file=self.new_feature_vocab_file, + old_row_vocab_file=self.old_feature_vocab_file, + num_row_oov_buckets=1, + num_col_oov_buckets=1, + initializer=self.initializer)) + + expected_remapped_matrix = np.concatenate( + [ + np.reshape([2, 18, 34, 50, self.init_val, self.init_val], [6, 1]), + np.reshape([0, 16, 32, 48, self.init_val, self.init_val], [6, 1]), + np.reshape([self.init_val] * 6, [6, 1]), + np.reshape([1, 17, 33, 49, self.init_val, self.init_val], [6, 1]), + np.reshape([self.init_val] * 6, [6, 1]) + ], + axis=1) + + # The new weight matrix is of size + # [5 feature vocab + 1 feature OOV, 4 class vocab + 1 class OOV]. Use a + # partitioned variable to confirm that the offset logic works. + remapped_matrix = variable_scope.get_variable( + name='linear/obtained_weight_matrix', + shape=[6, 5], + initializer=loading_initializer, + partitioner=partitioned_variables.fixed_size_partitioner(2)) + + with self.test_session(): + variables.global_variables_initializer().run() + self.assertAllClose(expected_remapped_matrix, + remapped_matrix.as_tensor().eval()) + + def test_load_and_remap_output_layer_weight_initializer_dnn_output(self): + """Tests for the output layer initializer in the DNN output case.""" + loading_initializer = (checkpoint_ops._load_and_remap_matrix_initializer( + new_row_vocab_size=5, + new_col_vocab_file=self.new_class_vocab_file, + old_col_vocab_file=self.old_class_vocab_file, + new_col_vocab_size=4, + old_tensor_name='some_scope/embeddings', + ckpt_path=[self.checkpoint_file], + num_col_oov_buckets=1, + initializer=self.initializer)) + + expected_remapped_matrix = np.concatenate( + [ + np.reshape([2, 18, 34, 50, 66], [5, 1]), + np.reshape([0, 16, 32, 48, 64], [5, 1]), + np.reshape([self.init_val] * 5, [5, 1]), + np.reshape([1, 17, 33, 49, 65], [5, 1]), + np.reshape([self.init_val] * 5, [5, 1]) + ], + axis=1) + + # The new weight matrix is of size + # [5-sized input layer, 4 class vocab + 1 class OOV]. 
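+    # A fixed-size partitioner with two shards splits the five rows into
+    # chunks of three and two, exercising the initializer's handling of the
+    # partition offset.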
+ remapped_matrix = variable_scope.get_variable( + name='dnn_output/obtained_weight_matrix', + shape=[5, 5], + initializer=loading_initializer, + partitioner=partitioned_variables.fixed_size_partitioner(2)) + + with self.test_session(): + variables.global_variables_initializer().run() + self.assertAllClose(expected_remapped_matrix, + remapped_matrix.as_tensor().eval()) + + def test_initializer_with_oov_only_partition(self): + """Tests for the output layer initializer where one partition is all OOV.""" + loading_initializer = (checkpoint_ops._load_and_remap_matrix_initializer( + new_row_vocab_size=5, + new_col_vocab_file=self.new_class_vocab_file, + old_col_vocab_file=self.old_class_vocab_file, + new_col_vocab_size=4, + old_tensor_name='some_scope/embeddings', + ckpt_path=[self.checkpoint_file], + new_row_vocab_file=self.new_feature_vocab_file, + old_row_vocab_file=self.old_feature_vocab_file, + num_row_oov_buckets=5, + num_col_oov_buckets=1, + initializer=self.initializer)) + + expected_remapped_matrix = np.concatenate( + [ + np.reshape([2, 18, 34, 50] + [self.init_val] * 6, [10, 1]), + np.reshape([0, 16, 32, 48] + [self.init_val] * 6, [10, 1]), + np.reshape([self.init_val] * 10, [10, 1]), + np.reshape([1, 17, 33, 49] + [self.init_val] * 6, [10, 1]), + np.reshape([self.init_val] * 10, [10, 1]), + ], + axis=1) + + # The new weight matrix is of size + # [5 feature vocab + 5 feature OOV, 4 class vocab + 1 class OOV]. The + # second partition has only OOV. + remapped_matrix = variable_scope.get_variable( + name='linear_all_oov/obtained_weight_matrix', + shape=[10, 5], + initializer=loading_initializer, + partitioner=partitioned_variables.fixed_size_partitioner(2)) + + with self.test_session(): + variables.global_variables_initializer().run() + self.assertAllClose(expected_remapped_matrix, + remapped_matrix.as_tensor().eval()) + + def test_load_and_remap_linear_multiclass_initializer_default_init(self): + """Tests where the zeros_initializer default is used for linear.""" + loading_initializer = (checkpoint_ops._load_and_remap_matrix_initializer( + new_row_vocab_size=5, + new_col_vocab_file=self.new_class_vocab_file, + old_col_vocab_file=self.old_class_vocab_file, + new_col_vocab_size=4, + old_tensor_name='some_scope/embeddings', + ckpt_path=[self.checkpoint_file], + new_row_vocab_file=self.new_feature_vocab_file, + old_row_vocab_file=self.old_feature_vocab_file, + num_row_oov_buckets=1, + num_col_oov_buckets=1)) + + expected_remapped_matrix = np.concatenate( + [ + np.reshape([2, 18, 34, 50, 0, 0], [6, 1]), + np.reshape([0, 16, 32, 48, 0, 0], [6, 1]), + np.reshape([0] * 6, [6, 1]), + np.reshape([1, 17, 33, 49, 0, 0], [6, 1]), + np.reshape([0] * 6, [6, 1]) + ], + axis=1) + + remapped_matrix = variable_scope.get_variable( + name='linear_init_fallback/obtained_weight_matrix', + shape=[6, 5], + initializer=loading_initializer, + partitioner=partitioned_variables.fixed_size_partitioner(2)) + + with self.test_session(): + variables.global_variables_initializer().run() + self.assertAllClose(expected_remapped_matrix, + remapped_matrix.as_tensor().eval()) + + def test_load_embedding_initializer(self): + """Tests for the load_embedding_initializer wrapper.""" + embedding_loading_initializer = (checkpoint_ops._load_embedding_initializer( + new_vocab_file=self.new_feature_vocab_file, + old_vocab_file=self.old_feature_vocab_file, + new_vocab_size=5, + embedding_dim=16, + embedding_tensor_name='some_scope/embeddings', + ckpt_path=[self.checkpoint_file], + num_oov_buckets=1, + initializer=self.initializer)) + 
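+    # New-vocab rows 0-3 ('zero'..'three') exist in the old vocab and keep the
+    # checkpoint values 0..63; the new row 'four' and the single OOV bucket row
+    # are filled with the custom initializer value (42).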
+ expected_remapped_embeddings = np.concatenate( + [ + np.reshape(range(64), [4, 16]), + np.reshape([self.init_val] * 32, [2, 16]), + ], + axis=0) + + # The new weight matrix is of size + # [5 feature vocab + 1 feature OOV, 16 (embedding dimension)], where the + # last vocab row (2nd last row) is newly initialized (wasn't found in + # previous vocab) and the actual last row is OOV and also newly initialized. + # Use a partitioned variable to confirm that the offset logic works. + remapped_embeddings = variable_scope.get_variable( + name='embedding/obtained_embedding_matrix', + shape=[6, 16], + initializer=embedding_loading_initializer, + partitioner=partitioned_variables.fixed_size_partitioner(2)) + + with self.test_session(): + variables.global_variables_initializer().run() + self.assertAllClose(expected_remapped_embeddings, + remapped_embeddings.as_tensor().eval()) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/tf_exported_symbols.lds b/tensorflow/tf_exported_symbols.lds index af5341a8d62..bddb87f00cb 100644 --- a/tensorflow/tf_exported_symbols.lds +++ b/tensorflow/tf_exported_symbols.lds @@ -2,4 +2,5 @@ *perftools*gputools* *tf_* *TF_* +*TFE_* *nsync_* diff --git a/tensorflow/tf_version_script.lds b/tensorflow/tf_version_script.lds index aeb7d66b321..11f66c5c8b2 100644 --- a/tensorflow/tf_version_script.lds +++ b/tensorflow/tf_version_script.lds @@ -3,6 +3,7 @@ tensorflow { *tensorflow*; *perftools*gputools*; *TF_*; + *TFE_*; *nsync_*; local: *; diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt index 667ae5cf6e5..8893594dc34 100644 --- a/tensorflow/tools/api/golden/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.pbtxt @@ -1098,7 +1098,7 @@ tf_module { } member_method { name: "gradients" - argspec: "args=[\'ys\', \'xs\', \'grad_ys\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\'], varargs=None, keywords=None, defaults=[\'None\', \'gradients\', \'False\', \'False\', \'None\'], " + argspec: "args=[\'ys\', \'xs\', \'grad_ys\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\', \'stop_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'gradients\', \'False\', \'False\', \'None\', \'None\'], " } member_method { name: "greater" @@ -1684,6 +1684,10 @@ tf_module { name: "serialize_sparse" argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "serialize_tensor" + argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "set_random_seed" argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/ci_build/update_version.py b/tensorflow/tools/ci_build/update_version.py index 6f3c3f15103..4405678a6b8 100755 --- a/tensorflow/tools/ci_build/update_version.py +++ b/tensorflow/tools/ci_build/update_version.py @@ -277,8 +277,8 @@ def check_for_lingering_string(lingering_string): formatted_string = lingering_string.replace(".", r"\.") try: linger_str_output = subprocess.check_output( - ['grep', '-rnoH', formatted_string, TF_SRC_DIR]) - linger_strs = linger_str_output.decode('utf8').split("\n") + ["grep", "-rnoH", formatted_string, TF_SRC_DIR]) + linger_strs = linger_str_output.decode("utf8").split("\n") except subprocess.CalledProcessError: linger_strs = [] diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index 
34cb19f9cb7..d62316964f8 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -158,6 +158,7 @@ sh_binary( "//tensorflow/contrib/ndlstm:ndlstm", "//tensorflow/contrib/nn:nn_py", "//tensorflow/contrib/predictor:predictor_pip", + "//tensorflow/contrib/receptive_field:receptive_field_pip", "//tensorflow/contrib/session_bundle:session_bundle_pip", "//tensorflow/contrib/signal:signal_py", "//tensorflow/contrib/slim:slim", diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 2c7acd809a8..ef342fe1272 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -573,11 +573,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): patched_http_archive( name = "boringssl", urls = [ - "http://mirror.bazel.build/github.com/google/boringssl/archive/bbcaa15b0647816b9a1a9b9e0d209cd6712f0105.tar.gz", - "https://github.com/google/boringssl/archive/bbcaa15b0647816b9a1a9b9e0d209cd6712f0105.tar.gz", # 2016-07-11 + "http://mirror.bazel.build/github.com/google/boringssl/archive/e3860009a091cd1bd2bc189cdbc3c6d095abde84.tar.gz", + "https://github.com/google/boringssl/archive/e3860009a091cd1bd2bc189cdbc3c6d095abde84.tar.gz", # 2017-07-07 ], - sha256 = "025264d6e9a7ad371f2f66d17a28b6627de0c9592dc2eb54afd062f68f1f9aa3", - strip_prefix = "boringssl-bbcaa15b0647816b9a1a9b9e0d209cd6712f0105", + sha256 = "02f5950f93c4fd3691771c07c9d04cf2999ab01383ff99da345249e93b0fcfb2", + strip_prefix = "boringssl-e3860009a091cd1bd2bc189cdbc3c6d095abde84", # Add patch to boringssl code to support s390x patch_file = str(Label("//third_party/boringssl:add_boringssl_s390x.patch")), ) @@ -683,11 +683,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): native.new_http_archive( name = "cub_archive", urls = [ - "http://mirror.bazel.build/github.com/NVlabs/cub/archive/69ceda618313df8e9cac6659d607b08949455d14.tar.gz", - "https://github.com/NVlabs/cub/archive/69ceda618313df8e9cac6659d607b08949455d14.tar.gz", + "http://mirror.bazel.build/github.com/NVlabs/cub/archive/1.7.3.zip", + "https://github.com/NVlabs/cub/archive/1.7.3.zip", ], - sha256 = "87e856522c283b8ea887c3b61d7d5b252d2dd74abac4f1d756d776e721223e82", - strip_prefix = "cub-69ceda618313df8e9cac6659d607b08949455d14", + sha256 = "b7ead9e291d34ffa8074243541c1380d63be63f88de23de8ee548db573b72ebe", + strip_prefix = "cub-1.7.3", build_file = str(Label("//third_party:cub.BUILD")), ) @@ -699,9 +699,9 @@ def tf_workspace(path_prefix="", tf_repo_name=""): native.http_archive( name = "bazel_toolchains", urls = [ - "http://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/bccee4855c049d34bac481083b4c68e2fab8cc50.tar.gz", - "https://github.com/bazelbuild/bazel-toolchains/archive/bccee4855c049d34bac481083b4c68e2fab8cc50.tar.gz", + "http://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/9dbd803ad3b9447430a296810197b09b3a710956.tar.gz", + "https://github.com/bazelbuild/bazel-toolchains/archive/9dbd803ad3b9447430a296810197b09b3a710956.tar.gz", ], - sha256 = "3903fd93b96b42067e00b7973a2c16c34e761ad7a0b55e1557d408f352849e41", - strip_prefix = "bazel-toolchains-bccee4855c049d34bac481083b4c68e2fab8cc50", + sha256 = "0799aa12db5260a499beb40f81744e760c59d055bfc5d271dd2c2ed4d5419faa", + strip_prefix = "bazel-toolchains-9dbd803ad3b9447430a296810197b09b3a710956", ) diff --git a/third_party/boringssl/add_boringssl_s390x.patch b/third_party/boringssl/add_boringssl_s390x.patch index 9a34a59a1d1..8b42d10e687 100644 --- a/third_party/boringssl/add_boringssl_s390x.patch +++ 
b/third_party/boringssl/add_boringssl_s390x.patch @@ -3,9 +3,9 @@ index 7a3adfb..88012ad 100644 --- a/src/include/openssl/base.h +++ b/src/include/openssl/base.h @@ -94,6 +94,8 @@ extern "C" { - #elif defined(__pnacl__) - #define OPENSSL_32_BIT #define OPENSSL_PNACL + #elif defined(__myriad2__) + #define OPENSSL_32_BIT +#elif defined(__s390x__) +#define OPENSSL_64_BIT #else diff --git a/third_party/gpus/crosstool/BUILD.tpl b/third_party/gpus/crosstool/BUILD.tpl index 7d8b6005135..98cb326572e 100644 --- a/third_party/gpus/crosstool/BUILD.tpl +++ b/third_party/gpus/crosstool/BUILD.tpl @@ -12,12 +12,12 @@ cc_toolchain_suite( cc_toolchain( name = "cc-compiler-local", - all_files = ":crosstool_wrapper_driver_is_not_gcc", + all_files = "%{linker_files}", compiler_files = ":empty", cpu = "local", dwp_files = ":empty", dynamic_runtime_libs = [":empty"], - linker_files = ":crosstool_wrapper_driver_is_not_gcc", + linker_files = "%{linker_files}", objcopy_files = ":empty", static_runtime_libs = [":empty"], strip_files = ":empty", @@ -30,12 +30,12 @@ cc_toolchain( cc_toolchain( name = "cc-compiler-darwin", - all_files = ":crosstool_wrapper_driver_is_not_gcc", + all_files = "%{linker_files}", compiler_files = ":empty", cpu = "darwin", dwp_files = ":empty", dynamic_runtime_libs = [":empty"], - linker_files = ":crosstool_wrapper_driver_is_not_gcc", + linker_files = "%{linker_files}", objcopy_files = ":empty", static_runtime_libs = [":empty"], strip_files = ":empty", diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index b85e565f362..4a0f4710881 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -971,7 +971,6 @@ def _create_local_cuda_repository(repository_ctx): ' ":cudnn-include",') }) # Set up crosstool/ - _file(repository_ctx, "crosstool:BUILD") cc = find_cc(repository_ctx) host_compiler_includes = _host_compiler_includes(repository_ctx, cc) cuda_defines = { @@ -981,11 +980,14 @@ def _create_local_cuda_repository(repository_ctx): } if _use_cuda_clang(repository_ctx): cuda_defines["%{clang_path}"] = cc + _tpl(repository_ctx, "crosstool:BUILD", {"%{linker_files}": ":empty"}) _tpl(repository_ctx, "crosstool:CROSSTOOL_clang", cuda_defines, out="crosstool/CROSSTOOL") else: nvcc_path = str(repository_ctx.path("%s/bin/nvcc%s" % (cuda_config.cuda_toolkit_path, ".exe" if cuda_config.cpu_value == "Windows" else ""))) + _tpl(repository_ctx, "crosstool:BUILD", + {"%{linker_files}": ":crosstool_wrapper_driver_is_not_gcc"}) _tpl(repository_ctx, "crosstool:CROSSTOOL_nvcc", cuda_defines, out="crosstool/CROSSTOOL") _tpl(repository_ctx, "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc",