diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md index 1a401997c64..2f3df7cda9c 100644 --- a/ISSUE_TEMPLATE.md +++ b/ISSUE_TEMPLATE.md @@ -4,7 +4,7 @@ https://stackoverflow.com/questions/tagged/tensorflow If you open a GitHub issue, here is our policy: -1. It must be a bug or a feature request. +1. It must be a bug, a feature request, or a significant problem with documentation (for small docs fixes please send a PR instead). 2. The form below must be filled out. 3. It shouldn't be a TensorBoard issue. Those go [here](https://github.com/tensorflow/tensorboard/issues). diff --git a/README.md b/README.md index 0c93813e584..c754c3f0db0 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ | **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** | |-----------------|---------------------|------------------|-------------------|---------------| -| [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) | +| [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) | **TensorFlow** is an open source software library for numerical computation using data flow graphs. The graph nodes represent mathematical operations, while diff --git a/RELEASE.md b/RELEASE.md index fdf10407fda..af6440acef5 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,18 +1,39 @@ # Release 1.5.0 ## Breaking Changes -* Prebuilt binaries are now built against CUDA 9 and cuDNN 7. +* Prebuilt binaries are now built against CUDA 9.0 and cuDNN 7. * Our Linux binaries are built using ubuntu 16 containers, potentially introducing glibc incompatibility issues with ubuntu 14. * Starting from 1.6 release, our prebuilt binaries will use AVX instructions. This may break TF on older CPUs. +## Known Bugs +* Using XLA:GPU with CUDA 9 and CUDA 9.1 results in garbage results and/or + `CUDA_ILLEGAL_ADDRESS` failures. 
+ + Google discovered in mid-December 2017 that the PTX-to-SASS compiler in CUDA 9 + and CUDA 9.1 sometimes does not properly compute the carry bit when + decomposing 64-bit address calculations with large offsets (e.g. `load [x + + large_constant]`) into 32-bit arithmetic in SASS. + + As a result, these versions of `ptxas` miscompile most XLA programs which use + more than 4GB of temp memory. This results in garbage results and/or + `CUDA_ERROR_ILLEGAL_ADDRESS` failures. + + A fix in CUDA 9.1.121 is expected in late February 2018. We do not expect a + fix for CUDA 9.0.x. Until the fix is available, the only workaround is to + [downgrade](https://developer.nvidia.com/cuda-toolkit-archive) to CUDA 8.0.x + or disable XLA:GPU. + + TensorFlow will print a warning if you use XLA:GPU with a known-bad version of + CUDA; see e00ba24c4038e7644da417ddc639169b6ea59122. + ## Major Features And Improvements * [Eager execution](https://github.com/tensorflow/tensorflow/tree/r1.5/tensorflow/contrib/eager) preview version is now available. * [TensorFlow Lite](https://github.com/tensorflow/tensorflow/tree/r1.5/tensorflow/contrib/lite) dev preview is now available. -* CUDA 9 and cuDNN 7 support. +* CUDA 9.0 and cuDNN 7 support. * Accelerated Linear Algebra (XLA): * Add `complex64` support to XLA compiler. * `bfloat` support is now added to XLA infrastructure. diff --git a/configure.py b/configure.py index 94827891c36..318e2c7a575 100644 --- a/configure.py +++ b/configure.py @@ -298,7 +298,7 @@ def get_var(environ_cp, System". enabled_by_default: boolean for default behavior. question: optional string for how to ask for user input. - yes_reply: optionanl string for reply when feature is enabled. + yes_reply: optional string for reply when feature is enabled. no_reply: optional string for reply when feature is disabled. Returns: @@ -411,7 +411,7 @@ def set_action_env_var(environ_cp, System". enabled_by_default: boolean for default behavior. question: optional string for how to ask for user input. - yes_reply: optionanl string for reply when feature is enabled. + yes_reply: optional string for reply when feature is enabled. no_reply: optional string for reply when feature is disabled. """ var = int( @@ -1354,6 +1354,7 @@ def main(): environ_cp['TF_NEED_GCP'] = '0' environ_cp['TF_NEED_HDFS'] = '0' environ_cp['TF_NEED_JEMALLOC'] = '0' + environ_cp['TF_NEED_KAFKA'] = '0' environ_cp['TF_NEED_OPENCL_SYCL'] = '0' environ_cp['TF_NEED_COMPUTECPP'] = '0' environ_cp['TF_NEED_OPENCL'] = '0' @@ -1372,6 +1373,8 @@ def main(): 'with_hdfs_support', True, 'hdfs') set_build_var(environ_cp, 'TF_NEED_S3', 'Amazon S3 File System', 'with_s3_support', True, 's3') + set_build_var(environ_cp, 'TF_NEED_KAFKA', 'Apache Kafka Platform', + 'with_kafka_support', False, 'kafka') set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support', False, 'xla') set_build_var(environ_cp, 'TF_NEED_GDR', 'GDR', 'with_gdr_support', diff --git a/tensorflow/BUILD b/tensorflow/BUILD index b26c5255255..c225cc1a74c 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -211,6 +211,12 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "with_kafka_support", + define_values = {"with_kafka_support": "true"}, + visibility = ["//visibility:public"], +) + # Crosses between platforms and file system libraries not supported on those # platforms due to limitations in nested select() statements. 
config_setting( @@ -544,8 +550,10 @@ filegroup( "//tensorflow/contrib/predictor:all_files", "//tensorflow/contrib/py2tf:all_files", "//tensorflow/contrib/py2tf/converters:all_files", + "//tensorflow/contrib/py2tf/impl:all_files", "//tensorflow/contrib/py2tf/pyct:all_files", "//tensorflow/contrib/py2tf/pyct/static_analysis:all_files", + "//tensorflow/contrib/py2tf/utils:all_files", "//tensorflow/contrib/quantize:all_files", "//tensorflow/contrib/receptive_field:all_files", "//tensorflow/contrib/reduce_slice_ops:all_files", diff --git a/tensorflow/SECURITY.md b/tensorflow/SECURITY.md new file mode 100644 index 00000000000..074eed29515 --- /dev/null +++ b/tensorflow/SECURITY.md @@ -0,0 +1,239 @@ +# Using TensorFlow Securely + +This document discusses how to safely deal with untrusted programs (models or +model parameters), and input data. Below, we also provide guidelines on how to +report vulnerabilities in TensorFlow. + +## TensorFlow models are programs + +TensorFlow's runtime system interprets and executes programs. What machine +learning practitioners term +[**models**](https://developers.google.com/machine-learning/glossary/#model) are +expressed as programs that TensorFlow executes. TensorFlow programs are encoded +as computation +[**graphs**](https://developers.google.com/machine-learning/glossary/#graph). +The model's parameters are often stored separately in **checkpoints**. + +At runtime, TensorFlow executes the computation graph using the parameters +provided. Note that the behavior of the computation graph may change +depending on the parameters provided. TensorFlow itself is not a sandbox. When +executing the computation graph, TensorFlow may read and write files, send and +receive data over the network, and even spawn additional processes. All these +tasks are performed with the permissions of the TensorFlow process. Allowing +for this flexibility makes for a powerful machine learning platform, +but it has implications for security. + +The computation graph may also accept **inputs**. Those inputs are the +data you supply to TensorFlow to train a model, or to use a model to run +inference on the data. + +**TensorFlow models are programs, and need to be treated as such from a security +perspective.** + +## Running untrusted models + +As a general rule: **Always** execute untrusted models inside a sandbox (e.g., +[nsjail](https://github.com/google/nsjail)). + +There are several ways in which a model could become untrusted. Obviously, if an +untrusted party supplies TensorFlow kernels, arbitrary code may be executed. +The same is true if the untrusted party provides Python code, such as the +Python code that generates TensorFlow graphs. + +Even if the untrusted party only supplies the serialized computation +graph (in form of a `GraphDef`, `SavedModel`, or equivalent on-disk format), the +set of computation primitives available to TensorFlow is powerful enough that +you should assume that the TensorFlow process effectively executes arbitrary +code. One common solution is to whitelist only a few safe Ops. While this is +possible in theory, we still recommend you sandbox the execution. + +It depends on the computation graph whether a user provided checkpoint is safe. +It is easily possible to create computation graphs in which malicious +checkpoints can trigger unsafe behavior. For example, consider a graph that +contains a `tf.cond` depending on the value of a `tf.Variable`. One branch of +the `tf.cond` is harmless, but the other is unsafe. 
Since the `tf.Variable` is
+stored in the checkpoint, whoever provides the checkpoint now has the ability to
+trigger unsafe behavior, even though the graph is not under their control.
+
+In other words, graphs can contain vulnerabilities of their own. To allow users
+to provide checkpoints to a model you run on their behalf (e.g., in order to
+compare model quality for a fixed model architecture), you must carefully audit
+your model, and we recommend you run the TensorFlow process in a sandbox.
+
+## Accepting untrusted inputs
+
+It is possible to write models that are secure in the sense that they can safely
+process untrusted inputs, assuming there are no bugs. There are two main reasons
+not to rely on this: first, it is easy to write models which must not be exposed
+to untrusted inputs, and second, there are bugs in any software system of
+sufficient complexity. Letting users control inputs could allow them to trigger
+bugs either in TensorFlow or in dependent libraries.
+
+In general, it is good practice to isolate the parts of any system that are
+exposed to untrusted (e.g., user-provided) inputs in a sandbox.
+
+A useful analogy for how a TensorFlow graph is executed is an interpreted
+programming language such as Python. While it is possible to write secure
+Python code which can be exposed to user-supplied inputs (by, e.g., carefully
+quoting and sanitizing input strings, size-checking input blobs, etc.), it is
+very easy to write Python programs which are insecure. Even secure Python code
+could be rendered insecure by a bug in the Python interpreter, or by a bug in a
+Python library it uses (e.g.,
+[this one](https://www.cvedetails.com/cve/CVE-2017-12852/)).
+
+## Running a TensorFlow server
+
+TensorFlow is a platform for distributed computing, and as such there is a
+TensorFlow server (`tf.train.Server`). **The TensorFlow server is meant for
+internal communication only. It is not built for use in an untrusted network.**
+
+For performance reasons, the default TensorFlow server does not include any
+authorization protocol and sends messages unencrypted. It accepts connections
+from anywhere, and executes the graphs it is sent without performing any checks.
+Therefore, if you run a `tf.train.Server` in your network, anybody with
+access to the network can execute what you should consider arbitrary code with
+the privileges of the process running the `tf.train.Server`.
+
+When running distributed TensorFlow, you must isolate the network in which the
+cluster lives. Cloud providers publish instructions for setting up isolated
+networks, which are sometimes branded as "virtual private cloud." Refer to the
+instructions for
+[GCP](https://cloud.google.com/compute/docs/networks-and-firewalls) and
+[AWS](https://aws.amazon.com/vpc/) for details.
+
+Note that `tf.train.Server` is different from the server created by
+`tensorflow/serving` (the default binary for which is called `ModelServer`).
+By default, `ModelServer` also has no built-in mechanism for authentication.
+Connecting it to an untrusted network allows anyone on this network to run the
+graphs known to the `ModelServer`. This means that an attacker may run
+graphs using untrusted inputs as described above, but they would not be able to
+execute arbitrary graphs. It is possible to safely expose a `ModelServer`
+directly to an untrusted network, **but only if the graphs it is configured to
+use have been carefully audited to be safe**.
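To make the isolation advice concrete, here is a minimal C++ sketch of bringing up a single-task in-process server with `NewServer` from `tensorflow/core/distributed_runtime/server_lib.h`. It is illustrative only and not part of this patch; the job/task layout is hypothetical, and real isolation must still come from the surrounding network (firewall or VPC), since the server itself performs no authentication or encryption.

```c++
#include <memory>

#include "tensorflow/core/distributed_runtime/server_lib.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/protobuf/cluster.pb.h"
#include "tensorflow/core/protobuf/tensorflow_server.pb.h"

// Illustrative sketch: start a single-process gRPC server for a one-task
// cluster. The address below is only the task's advertised endpoint; the
// server still accepts unauthenticated connections, so it must only be
// reachable from a trusted, isolated network.
tensorflow::Status StartSingleTaskServer(
    std::unique_ptr<tensorflow::ServerInterface>* server) {
  tensorflow::ServerDef server_def;
  server_def.set_job_name("local");
  server_def.set_task_index(0);
  server_def.set_protocol("grpc");
  tensorflow::JobDef* job = server_def.mutable_cluster()->add_job();
  job->set_name("local");
  (*job->mutable_tasks())[0] = "localhost:2222";
  TF_RETURN_IF_ERROR(tensorflow::NewServer(server_def, server));
  return (*server)->Start();
}
```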
+
+Similar to best practices for other servers, we recommend running any
+`ModelServer` with appropriate privileges (i.e., using a separate user with
+reduced permissions). In the spirit of defense in depth, we recommend
+authenticating requests to any TensorFlow server connected to an untrusted
+network, as well as sandboxing the server to minimize the adverse effects of
+any breach.
+
+## Vulnerabilities in TensorFlow
+
+TensorFlow is a large and complex system. It also depends on a large set of
+third-party libraries (e.g., `numpy`, `libjpeg-turbo`, PNG parsers, `protobuf`).
+It is possible that TensorFlow or its dependent libraries contain
+vulnerabilities that would allow triggering unexpected or dangerous behavior
+with specially crafted inputs.
+
+### What is a vulnerability?
+
+Given TensorFlow's flexibility, it is possible to specify computation graphs
+which exhibit unexpected or unwanted behaviors. The fact that TensorFlow models
+can perform arbitrary computations means that they may read and write files,
+communicate via the network, produce deadlocks and infinite loops, or run out
+of memory. Such behavior is a vulnerability only when it falls outside the
+specifications of the operations involved.
+
+A `FileWriter` writing a file is not unexpected behavior and therefore is not a
+vulnerability in TensorFlow. A `MatMul` allowing arbitrary binary code execution
+**is** a vulnerability.
+
+This is more subtle from a system perspective. For example, it is easy to cause
+a TensorFlow process to try to allocate more memory than is available by
+specifying a computation graph containing an ill-considered `tf.tile` operation.
+TensorFlow should exit cleanly in this case (it would raise an exception in
+Python, or return an error `Status` in C++). However, if the surrounding system
+is not expecting the possibility, such behavior could be used in a
+denial-of-service attack (or worse). Because TensorFlow behaves correctly, this
+is not a vulnerability in TensorFlow (although it would be a vulnerability of
+this hypothetical system).
+
+As a general rule, it is incorrect behavior for TensorFlow to access memory it
+does not own, or to terminate in an unclean way. Bugs in TensorFlow that lead to
+such behaviors constitute a vulnerability.
+
+One of the most critical parts of any system is input handling. If malicious
+input can trigger side effects or incorrect behavior, this is a bug, and likely
+a vulnerability.
+
+### Reporting vulnerabilities
+
+Please email reports about any security-related issues you find to
+`security@tensorflow.org`. This mail is delivered to a small security team. Your
+email will be acknowledged within one business day, and you'll receive a more
+detailed response to your email within 7 days indicating the next steps in
+handling your report. For critical problems, you may encrypt your report (see
+below).
+
+Please use a descriptive subject line for your report email. After the initial
+reply to your report, the security team will endeavor to keep you informed of
+the progress being made towards a fix and announcement.
+
+If you believe that an existing (public) issue is security-related, please send
+an email to `security@tensorflow.org`. The email should include the issue ID and
+a short description of why it should be handled according to this security
+policy.
+
+Once an issue is reported, TensorFlow uses the following disclosure process:
+
+* When a report is received, we confirm the issue and determine its severity.
+* If we know of specific third-party services or software based on TensorFlow + that require mitigation before publication, those projects will be notified. +* An advisory is prepared (but not published) which details the problem and + steps for mitigation. +* Wherever possible, fixes are prepared for the last minor release of the two + latest major releases, as well as the master branch. We will attempt to + commit these fixes as soon as possible, and as close together as + possible. +* Patch releases are published for all fixed released versions, a + notification is sent to discuss@tensorflow.org, and the advisory is published. + +Past security advisories are listed below. We credit reporters for identifying +security issues, although we keep your name confidential if you request it. + +#### Encryption key for `security@tensorflow.org` + +If your disclosure is extremely sensitive, you may choose to encrypt your +report using the key below. Please only use this for critical security +reports. + +``` +-----BEGIN PGP PUBLIC KEY BLOCK----- + +mQENBFpqdzwBCADTeAHLNEe9Vm77AxhmGP+CdjlY84O6DouOCDSq00zFYdIU/7aI +LjYwhEmDEvLnRCYeFGdIHVtW9YrVktqYE9HXVQC7nULU6U6cvkQbwHCdrjaDaylP +aJUXkNrrxibhx9YYdy465CfusAaZ0aM+T9DpcZg98SmsSml/HAiiY4mbg/yNVdPs +SEp/Ui4zdIBNNs6at2gGZrd4qWhdM0MqGJlehqdeUKRICE/mdedXwsWLM8AfEA0e +OeTVhZ+EtYCypiF4fVl/NsqJ/zhBJpCx/1FBI1Uf/lu2TE4eOS1FgmIqb2j4T+jY +e+4C8kGB405PAC0n50YpOrOs6k7fiQDjYmbNABEBAAG0LVRlbnNvckZsb3cgU2Vj +dXJpdHkgPHNlY3VyaXR5QHRlbnNvcmZsb3cub3JnPokBTgQTAQgAOBYhBEkvXzHm +gOJBnwP4Wxnef3wVoM2yBQJaanc8AhsDBQsJCAcCBhUKCQgLAgQWAgMBAh4BAheA +AAoJEBnef3wVoM2yNlkIAICqetv33MD9W6mPAXH3eon+KJoeHQHYOuwWfYkUF6CC +o+X2dlPqBSqMG3bFuTrrcwjr9w1V8HkNuzzOJvCm1CJVKaxMzPuXhBq5+DeT67+a +T/wK1L2R1bF0gs7Pp40W3np8iAFEh8sgqtxXvLGJLGDZ1Lnfdprg3HciqaVAiTum +HBFwszszZZ1wAnKJs5KVteFN7GSSng3qBcj0E0ql2nPGEqCVh+6RG/TU5C8gEsEf +3DX768M4okmFDKTzLNBm+l08kkBFt+P43rNK8dyC4PXk7yJa93SmS/dlK6DZ16Yw +2FS1StiZSVqygTW59rM5XNwdhKVXy2mf/RtNSr84gSi5AQ0EWmp3PAEIALInfBLR +N6fAUGPFj+K3za3PeD0fWDijlC9f4Ety/icwWPkOBdYVBn0atzI21thPRbfuUxfe +zr76xNNrtRRlbDSAChA1J5T86EflowcQor8dNC6fS+oHFCGeUjfEAm16P6mGTo0p +osdG2XnnTHOOEFbEUeWOwR/zT0QRaGGknoy2pc4doWcJptqJIdTl1K8xyBieik/b +nSoClqQdZJa4XA3H9G+F4NmoZGEguC5GGb2P9NHYAJ3MLHBHywZip8g9oojIwda+ +OCLL4UPEZ89cl0EyhXM0nIAmGn3Chdjfu3ebF0SeuToGN8E1goUs3qSE77ZdzIsR +BzZSDFrgmZH+uP0AEQEAAYkBNgQYAQgAIBYhBEkvXzHmgOJBnwP4Wxnef3wVoM2y +BQJaanc8AhsMAAoJEBnef3wVoM2yX4wIALcYZbQhSEzCsTl56UHofze6C3QuFQIH +J4MIKrkTfwiHlCujv7GASGU2Vtis5YEyOoMidUVLlwnebE388MmaJYRm0fhYq6lP +A3vnOCcczy1tbo846bRdv012zdUA+wY+mOITdOoUjAhYulUR0kiA2UdLSfYzbWwy +7Obq96Jb/cPRxk8jKUu2rqC/KDrkFDtAtjdIHh6nbbQhFuaRuWntISZgpIJxd8Bt +Gwi0imUVd9m9wZGuTbDGi6YTNk0GPpX5OMF5hjtM/objzTihSw9UN+65Y/oSQM81 +v//Fw6ZeY+HmRDFdirjD7wXtIuER4vqCryIqR6Xe9X8oJXz9L/Jhslc= +=CDME +-----END PGP PUBLIC KEY BLOCK----- +``` + +### Known vulnerabilities + +| Type | Versions affected | Reported by | Additional Information | +|------|:-----------------:|---------------------------------------| +| out of bounds read| <=1.4 | @zhangbo5891001 | [issue report](https://github.com/tensorflow/tensorflow/issues/14959) | + diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 3c7f041b39f..e7fb1dec53b 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -195,10 +195,10 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims, reinterpret_cast(data) % EIGEN_MAX_ALIGN_BYTES != 0) { // TF_STRING and TF_RESOURCE tensors have a different representation in // TF_Tensor than they do in tensorflow::Tensor. 
So a copy here is a waste - // (any alignement requirements will be taken care of by TF_TensorToTensor + // (any alignment requirements will be taken care of by TF_TensorToTensor // and TF_TensorFromTensor). // - // Other types have the same represntation, so copy only if it is safe to do + // Other types have the same representation, so copy only if it is safe to do // so. buf->data_ = allocate_tensor("TF_NewTensor", len); std::memcpy(buf->data_, data, len); @@ -2144,7 +2144,7 @@ Status CopyGraph(Graph* src_graph, Graph* dst_graph, opts.return_tensors.push_back(ToTensorId(nodes_to_return[i])); } - // TOOD(skyewm): change to OutputTensor + // TODO(skyewm): change to OutputTensor tensorflow::ImportGraphDefResults results; TF_RETURN_IF_ERROR( ImportGraphDef(opts, gdef, dst_graph, dst_refiner, &results)); diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index 74190cb135a..e62310d8114 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -46,6 +46,7 @@ tf_cuda_library( "//tensorflow/c:c_api", "//tensorflow/c:c_api_internal", "//tensorflow/core:core_cpu_lib", + "//tensorflow/core:framework", "//tensorflow/core:framework_internal", "//tensorflow/core:framework_lite", "//tensorflow/core:lib_internal", diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index a76c8f5ec05..d65b5928959 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -85,15 +85,7 @@ TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) { return nullptr; } - TFE_Context* ret = new TFE_Context(session); - ret->policy = opts->policy; - ret->pflr.reset(new tensorflow::ProcessFunctionLibraryRuntime( - ret->session->device_mgr, opts->session_options.options.env, - TF_GRAPH_DEF_VERSION, &ret->func_lib_def, {})); - ret->rendezvous = - new tensorflow::IntraProcessRendezvous(ret->session->device_mgr); - - return ret; + return new TFE_Context(*opts, session); } void TFE_DeleteContext(TFE_Context* ctx, TF_Status* status) { @@ -261,15 +253,6 @@ TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name, void TFE_DeleteOp(TFE_Op* op) { delete op; } -static void TFE_OpSetDeviceHelper(TFE_Op* op, tensorflow::Device* device, - TF_Status* status) { - // Questionable heuristic: Place the op on the same device as the first input - // placed outside of host memory? - if (IsCPU(op->device) && !IsCPU(device)) { - op->device = device; - } -} - void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status) { tensorflow::Device* d = nullptr; if (device_name != nullptr && strlen(device_name) > 0) { @@ -277,11 +260,24 @@ void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status) { op->ctx->session->device_mgr->LookupDevice(device_name, &d); if (!status->status.ok()) return; } - TFE_OpSetDeviceHelper(op, d, status); + op->device = d; +} + +const char* TFE_OpGetDevice(TFE_Op* op, TF_Status* status) { + tensorflow::Device* device = + (op->device == nullptr) ? op->ctx->devices()[0] : op->device; + return device->name().c_str(); } void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) { - TFE_OpSetDeviceHelper(op, h->d, status); + // Questionable heuristic ... + // + // Motivation: After an 'op' is placed on GPU because some of its earlier + // inputs are on GPU, we want to keep the 'op' there, even if some later + // inputs of it are not on GPU. 
+ if (IsCPU(op->device) && !IsCPU(h->d)) { + op->device = h->d; + } if (!status->status.ok()) return; op->inputs.push_back(h->t); op->input_devices.push_back(h->d); @@ -298,7 +294,7 @@ TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name, return TF_ATTR_INT; // The compiler requires that we return something. } status->status = - tensorflow::AttrTypeByName(op->attr_types, attr_name, &ret, is_list); + tensorflow::AttrTypeByName(*op->attr_types, attr_name, &ret, is_list); return ret; } diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h index 387de078948..6a2aff1591d 100644 --- a/tensorflow/c/eager/c_api.h +++ b/tensorflow/c/eager/c_api.h @@ -154,6 +154,9 @@ TF_CAPI_EXPORT extern void TFE_DeleteOp(TFE_Op* op); TF_CAPI_EXPORT extern void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status); +// The returned string remains valid throughout the lifetime of 'op'. +TF_CAPI_EXPORT extern const char* TFE_OpGetDevice(TFE_Op* op, + TF_Status* status); TF_CAPI_EXPORT extern void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status); diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h index a6f76c732f2..f2abffb7bc0 100644 --- a/tensorflow/c/eager/c_api_internal.h +++ b/tensorflow/c/eager/c_api_internal.h @@ -35,6 +35,7 @@ limitations under the License. #include "tensorflow/core/lib/gtl/stl_util.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/public/version.h" struct TFE_ContextOptions { TF_SessionOptions session_options; @@ -43,9 +44,15 @@ struct TFE_ContextOptions { }; struct TFE_Context { - explicit TFE_Context(TF_Session* s) : session(s) {} + explicit TFE_Context(const TFE_ContextOptions& opts, TF_Session* s) + : policy(opts.policy), + session(s), + rendezvous(new tensorflow::IntraProcessRendezvous(s->device_mgr)), + pflr(new tensorflow::ProcessFunctionLibraryRuntime( + session->device_mgr, opts.session_options.options.env, + TF_GRAPH_DEF_VERSION, &func_lib_def, {})) {} - TFE_ContextDevicePlacementPolicy policy; + const TFE_ContextDevicePlacementPolicy policy; // Note: we cannot use C++11 thread_local here as there is no concept of a // thread-local-object-local variable in C++11. @@ -54,8 +61,8 @@ struct TFE_Context { thread_local_policies GUARDED_BY(policy_map_mu); // TFE_Context is an extension of TF_Session. And TF_Session needs a TF_Graph. - TF_Session* session; - tensorflow::Rendezvous* rendezvous; + TF_Session* const session; + tensorflow::Rendezvous* const rendezvous; tensorflow::mutex functions_mu; tensorflow::FunctionLibraryDefinition func_lib_def GUARDED_BY(functions_mu){ @@ -64,14 +71,14 @@ struct TFE_Context { // One FunctionLibraryRuntime per device. // func_libs[i] is the FunctionLibraryRuntime corresponding to // session->devices[i]. - std::unique_ptr pflr; + const std::unique_ptr pflr; tensorflow::mutex cache_mu; std::unordered_map kernel_cache GUARDED_BY(cache_mu); - tensorflow::FunctionLibraryRuntime* func_lib(tensorflow::Device* d) { + tensorflow::FunctionLibraryRuntime* func_lib(tensorflow::Device* d) const { return pflr->GetFLR(d->name()); } @@ -100,6 +107,8 @@ struct TFE_TensorHandle { }; struct TFE_Op { + // t is NULL iff the TFE_Op corresponds to a TensorFlow function instead of a + // primitive operation. 
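As a usage note for the new `TFE_OpGetDevice` accessor introduced in this hunk: the sketch below is illustrative only and not part of this patch. It uses the same short device-name form as the new `SetAndGetOpDevices` test and relies on the documented guarantee that the returned string is owned by the op and stays valid for the op's lifetime.

```c++
#include <cstdio>

#include "tensorflow/c/c_api.h"
#include "tensorflow/c/eager/c_api.h"

// Illustrative sketch: place an op explicitly, then read the placement back.
// The short "CPU:0" form mirrors what the new test passes to TFE_OpSetDevice.
void InspectOpDevice(TFE_Op* op) {
  TF_Status* status = TF_NewStatus();
  TFE_OpSetDevice(op, "CPU:0", status);
  if (TF_GetCode(status) == TF_OK) {
    // The returned pointer is owned by 'op'; the caller must not free it.
    const char* device_name = TFE_OpGetDevice(op, status);
    if (TF_GetCode(status) == TF_OK && device_name != nullptr) {
      std::printf("op placed on %s\n", device_name);
    }
  }
  TF_DeleteStatus(status);
}
```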
TFE_Op(TFE_Context* ctx, const char* op, const tensorflow::AttrTypeMap* t) : ctx(ctx), name(op), attrs(op), attr_types(t), device(nullptr) {} diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index 18e7a64435e..b0409af87c2 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -60,6 +60,31 @@ TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b) { return op; } +// If there is a GPU device, returns true and sets 'gpu_device_name' +// accordingly. +bool GetGPUDeviceName(TFE_Context* ctx, string* gpu_device_name) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get()); + CHECK_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + const int num_devices = TF_DeviceListCount(devices); + for (int i = 0; i < num_devices; ++i) { + const string device_type(TF_DeviceListType(devices, i, status.get())); + CHECK_EQ(TF_GetCode(status.get()), TF_OK) << TF_Message(status.get()); + const string device_name(TF_DeviceListName(devices, i, status.get())); + CHECK_EQ(TF_GetCode(status.get()), TF_OK) << TF_Message(status.get()); + if (device_type == "GPU") { + *gpu_device_name = device_name; + LOG(INFO) << "Found GPU device " << device_name; + TF_DeleteDeviceList(devices); + return true; + } + } + TF_DeleteDeviceList(devices); + return false; +} + void BM_InitOp(int iters) { tensorflow::testing::StopTiming(); TF_Status* status = TF_NewStatus(); @@ -288,22 +313,15 @@ TEST(CAPI, TensorHandleSilentCopy) { TF_Tensor* t = TFE_TensorHandleResolve(hcpu, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get()); - ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - const int num_devices = TF_DeviceListCount(devices); - // Disable the test if no GPU is present. - if (num_devices > 1) { - const int device_to_use = 1; - const string name(TF_DeviceListName(devices, device_to_use, status.get())); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - - TFE_TensorHandle* hgpu = - TFE_TensorHandleCopyToDevice(hcpu, ctx, name.c_str(), status.get()); + string gpu_device_name; + if (GetGPUDeviceName(ctx, &gpu_device_name)) { + TFE_TensorHandle* hgpu = TFE_TensorHandleCopyToDevice( + hcpu, ctx, gpu_device_name.c_str(), status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); TFE_Op* matmul = MatMulOp(ctx, hcpu, hgpu); - TFE_OpSetDevice(matmul, name.c_str(), status.get()); + TFE_OpSetDevice(matmul, gpu_device_name.c_str(), status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); TFE_TensorHandle* retvals[1]; int num_retvals = 1; @@ -314,7 +332,6 @@ TEST(CAPI, TensorHandleSilentCopy) { TFE_DeleteTensorHandle(hgpu); } - TF_DeleteDeviceList(devices); TF_DeleteTensor(t); TFE_DeleteTensorHandle(hcpu); TFE_DeleteContext(ctx, status.get()); @@ -337,22 +354,15 @@ TEST(CAPI, TensorHandleSilentCopyLocal) { TF_Tensor* t = TFE_TensorHandleResolve(hcpu, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get()); - ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - const int num_devices = TF_DeviceListCount(devices); - // Disable the test if no GPU is present. 
- if (num_devices > 1) { - const int device_to_use = 1; - const string name(TF_DeviceListName(devices, device_to_use, status.get())); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - - TFE_TensorHandle* hgpu = - TFE_TensorHandleCopyToDevice(hcpu, ctx, name.c_str(), status.get()); + string gpu_device_name; + if (GetGPUDeviceName(ctx, &gpu_device_name)) { + TFE_TensorHandle* hgpu = TFE_TensorHandleCopyToDevice( + hcpu, ctx, gpu_device_name.c_str(), status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); TFE_Op* matmul = MatMulOp(ctx, hcpu, hgpu); - TFE_OpSetDevice(matmul, name.c_str(), status.get()); + TFE_OpSetDevice(matmul, gpu_device_name.c_str(), status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); TFE_TensorHandle* retvals[1]; int num_retvals = 1; @@ -363,13 +373,43 @@ TEST(CAPI, TensorHandleSilentCopyLocal) { TFE_DeleteTensorHandle(hgpu); } - TF_DeleteDeviceList(devices); TF_DeleteTensor(t); TFE_DeleteTensorHandle(hcpu); TFE_DeleteContext(ctx, status.get()); EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); } +TEST(CAPI, SetAndGetOpDevices) { + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_Context* ctx = TFE_NewContext(opts, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_TensorHandle* m = TestMatrixTensorHandle(); + TFE_Op* matmul = MatMulOp(ctx, m, m); + + // Disable the test if no GPU is present. + string gpu_device_name; + if (GetGPUDeviceName(ctx, &gpu_device_name)) { + TFE_OpSetDevice(matmul, "GPU:0", status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + const char* device_name = TFE_OpGetDevice(matmul, status); + ASSERT_TRUE(strstr(device_name, "GPU:0") != nullptr); + + TFE_OpSetDevice(matmul, "CPU:0", status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + device_name = TFE_OpGetDevice(matmul, status); + ASSERT_TRUE(strstr(device_name, "CPU:0") != nullptr); + } + + TFE_DeleteOp(matmul); + TFE_DeleteTensorHandle(m); + TFE_DeleteContext(ctx, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteStatus(status); +} + TEST(CAPI, Execute) { TF_Status* status = TF_NewStatus(); TFE_ContextOptions* opts = TFE_NewContextOptions(); diff --git a/tensorflow/c/eager/runtime.cc b/tensorflow/c/eager/runtime.cc index 3a9951e14de..12abfcba2f0 100644 --- a/tensorflow/c/eager/runtime.cc +++ b/tensorflow/c/eager/runtime.cc @@ -86,10 +86,9 @@ Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out) { return Status::OK(); } -Status AttrTypeByName(const AttrTypeMap* m, const string& attr_name, +Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name, TF_AttrType* out, unsigned char* is_list) { - CHECK(m); - auto* t = gtl::FindOrNull(*m, attr_name); + auto* t = gtl::FindOrNull(m, attr_name); if (t == nullptr) { return errors::InvalidArgument("Attribute '", attr_name, "' does not exist for this operation"); @@ -173,14 +172,14 @@ void CombineUnordered(const tensorflow::Fprint128& a, b->high64 += a.high64; } -inline tensorflow::Fprint128 CacheKeyHelper(const StringPiece& s, +inline tensorflow::Fprint128 CacheKeyHelper(StringPiece s, const tensorflow::Fprint128& b) { // TODO(agarwal): avoid ToString(). 
tensorflow::Fprint128 a = tensorflow::Fingerprint128(s.ToString()); return FingerprintCat128(a, b); } -inline tensorflow::Fprint128 CacheKeyHelper(const StringPiece& s, uint64 b) { +inline tensorflow::Fprint128 CacheKeyHelper(StringPiece s, uint64 b) { return CacheKeyHelper(s, {b, b}); } diff --git a/tensorflow/c/eager/runtime.h b/tensorflow/c/eager/runtime.h index e28a416e67f..4d20b5244a4 100644 --- a/tensorflow/c/eager/runtime.h +++ b/tensorflow/c/eager/runtime.h @@ -43,7 +43,7 @@ typedef std::unordered_map AttrTypeMap; Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out); // Looks for 'attr_name' in 'm' and sets 'out' and 'is_list'. -Status AttrTypeByName(const AttrTypeMap* m, const string& attr_name, +Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name, TF_AttrType* out, unsigned char* is_list); // KernelAndDevice::Init needs a NodeDef only to pass the attribute map through. diff --git a/tensorflow/c/eager/runtime_test.cc b/tensorflow/c/eager/runtime_test.cc index 2ccca66f672..643153058ce 100644 --- a/tensorflow/c/eager/runtime_test.cc +++ b/tensorflow/c/eager/runtime_test.cc @@ -63,17 +63,17 @@ TEST(AttrTypeMap, Lookup) { TF_AttrType t; unsigned char is_list = 1; - s = AttrTypeByName(m, "ThisAttribyteCannotPossiblyExist", &t, &is_list); + s = AttrTypeByName(*m, "ThisAttribyteCannotPossiblyExist", &t, &is_list); EXPECT_FALSE(s.ok()); EXPECT_NE(is_list, 0); - s = AttrTypeByName(m, "transpose_a", &t, &is_list); + s = AttrTypeByName(*m, "transpose_a", &t, &is_list); ASSERT_TRUE(s.ok()) << s; EXPECT_EQ(TF_ATTR_BOOL, t); EXPECT_EQ(is_list, 0); s = AttrTypeMapForOp("Squeeze", &m); ASSERT_TRUE(s.ok()) << s; - s = AttrTypeByName(m, "squeeze_dims", &t, &is_list); + s = AttrTypeByName(*m, "squeeze_dims", &t, &is_list); ASSERT_TRUE(s.ok()) << s; EXPECT_EQ(TF_ATTR_INT, t); EXPECT_NE(is_list, 0); diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h index 2b65e38f540..bdb0815d6b6 100644 --- a/tensorflow/c/eager/tape.h +++ b/tensorflow/c/eager/tape.h @@ -18,12 +18,12 @@ limitations under the License. // Language-agnostic gradient tape. Does not perform backpropagation, just // maintains the data structures required to do so. -#include -#include #include #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { @@ -54,11 +54,11 @@ struct OpTapeEntry { // Map from tensor_id to internally-defined operation-id of the operation which // produced this tensor. A value of -1 means that the tensor was directly // watched and not the result of any operation in the tape. -using TensorTape = std::unordered_map; +using TensorTape = gtl::FlatMap; // Map from operation-id to tape entry. template -using OpTape = std::unordered_map>; +using OpTape = gtl::FlatMap>; // Operations the tape needs to perform on tensors to do backpropagation. Named // "vspace" because a subset of these are related to a vector space, such as @@ -159,7 +159,7 @@ class GradientTape { // Map from tensor id to number of remaining usages (i.e. how many entries in // the tape refer to it); to aid in tape garbage collection. - std::unordered_map tensor_usage_; + gtl::FlatMap tensor_usage_; // If false, all activations are deleted in the first call to ComputeGradient. // Else, only when this is destructed. 
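The tape changes in this hunk replace `std::unordered_map`/`std::unordered_set` with `tensorflow::gtl::FlatMap`/`FlatSet`. The sketch below is illustrative only (not part of this patch) and shows why the swap is a drop-in change: the flat containers expose the same `operator[]`/`find`/`insert`/`erase`/`count` surface while using an open-addressing layout that tends to be more cache-friendly for small keys such as tensor ids.

```c++
#include "tensorflow/core/lib/gtl/flatmap.h"
#include "tensorflow/core/lib/gtl/flatset.h"
#include "tensorflow/core/platform/types.h"

// Illustrative sketch mirroring the tape's tensor-usage bookkeeping.
void FlatContainerSketch() {
  tensorflow::gtl::FlatMap<tensorflow::int64, tensorflow::int64> usage_counts;
  usage_counts[7] = 2;             // operator[] inserts, like unordered_map.
  auto it = usage_counts.find(7);  // find()/end() behave the same way.
  if (it != usage_counts.end() && --it->second == 0) {
    usage_counts.erase(7);         // erase-by-key, as in unordered_map.
  }

  tensorflow::gtl::FlatSet<tensorflow::int64> sources;
  sources.insert(7);
  const bool is_source = sources.count(7) > 0;
  (void)is_source;
}
```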
@@ -286,11 +286,11 @@ struct BackpropInitialState { // Map from tensor ID to how many references still exist for this tensor in // the tape. - std::unordered_map tensor_usage_counts; + gtl::FlatMap tensor_usage_counts; // Maps from op ID to how many output tensors of this op still need to have // their gradients computed. - std::unordered_map op_missing_tensor; + gtl::FlatMap op_missing_tensor; }; // If `persistent_tape` is true, op_tape is not changed and none of the @@ -301,8 +301,8 @@ struct BackpropInitialState { template BackpropInitialState PrepareBackprop( gtl::ArraySlice target, const TensorTape& tensor_tape, - OpTape* op_tape, - const std::unordered_set& sources_set, bool persistent_tape) { + OpTape* op_tape, const gtl::FlatSet& sources_set, + bool persistent_tape) { std::vector tensor_stack; tensor_stack.reserve(target.size()); for (auto t : target) { @@ -362,7 +362,7 @@ BackpropInitialState PrepareBackprop( template std::vector InitialStack( const OpTape& op_tape, - const std::unordered_map& op_missing_tensor) { + const gtl::FlatMap& op_missing_tensor) { std::vector result; for (auto& op_entry : op_tape) { if (op_missing_tensor.find(op_entry.first) == op_missing_tensor.end()) { @@ -373,13 +373,13 @@ std::vector InitialStack( } template -Status InitialGradients( - const VSpace& vspace, - gtl::ArraySlice target_tensor_ids, - gtl::ArraySlice output_gradients, const TensorTape& tensor_tape, - const OpTape& op_tape, - const std::unordered_map& tensor_usage_counts, - std::unordered_map>* result) { +Status InitialGradients(const VSpace& vspace, + gtl::ArraySlice target_tensor_ids, + gtl::ArraySlice output_gradients, + const TensorTape& tensor_tape, + const OpTape& op_tape, + const gtl::FlatMap& tensor_usage_counts, + gtl::FlatMap>* result) { for (int i = 0; i < target_tensor_ids.size(); ++i) { const int64 id = target_tensor_ids[i]; if (tensor_usage_counts.find(id) != tensor_usage_counts.end()) { @@ -441,13 +441,13 @@ Status GradientTape::ComputeGradient( gtl::ArraySlice source_tensor_ids, gtl::ArraySlice output_gradients, std::vector* result) { - std::unordered_set sources_set(source_tensor_ids.begin(), - source_tensor_ids.end()); + gtl::FlatSet sources_set(source_tensor_ids.begin(), + source_tensor_ids.end()); BackpropInitialState state = PrepareBackprop( target_tensor_ids, tensor_tape_, &op_tape_, sources_set, persistent_); std::vector op_stack = InitialStack(state.op_tape, state.op_missing_tensor); - std::unordered_map> gradients; + gtl::FlatMap> gradients; Status s = InitialGradients(vspace, target_tensor_ids, output_gradients, tensor_tape_, state.op_tape, state.tensor_usage_counts, &gradients); @@ -463,7 +463,7 @@ Status GradientTape::ComputeGradient( cleanup(); return s; } - std::unordered_map gradients_size; + gtl::FlatMap gradients_size; // TODO(apassos) multiple threads could be dequeuing from op_stack at the same // time, for better CPU backprop performance. 
VLOG(1) << "Initial stack:"; @@ -472,11 +472,10 @@ Status GradientTape::ComputeGradient( VLOG(1) << " " << t; } } - std::unordered_map> - functions_accept_none_for_indices({ - {"SoftmaxCrossEntropyWithLogits", {1}}, - {"FusedBatchNorm", {1, 2, 3, 4}}, - }); + gtl::FlatMap> functions_accept_none_for_indices({ + {"SoftmaxCrossEntropyWithLogits", {1}}, + {"FusedBatchNorm", {1, 2, 3, 4}}, + }); while (!op_stack.empty()) { const int64 op = op_stack.back(); VLOG(1) << "Popped " << op; diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index c9ade5fb83f..9060c19e9d2 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -433,6 +433,7 @@ tf_gen_op_wrappers_cc( "linalg_ops", "logging_ops", "lookup_ops", + "manip_ops", "math_ops", "nn_ops", "no_op", diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index acef098c7d0..faa1e378d07 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -96,7 +96,9 @@ Status FindMetaGraphDefToLoad(const SavedModel& saved_model_proto, Status LoadMetaGraphIntoSession(const MetaGraphDef& meta_graph_def, const SessionOptions& session_options, std::unique_ptr* session) { - session->reset(NewSession(session_options)); + Session* session_p = nullptr; + TF_RETURN_IF_ERROR(NewSession(session_options, &session_p)); + session->reset(session_p); return (*session)->Create(meta_graph_def.graph_def()); } diff --git a/tensorflow/cc/saved_model/loader_test.cc b/tensorflow/cc/saved_model/loader_test.cc index 0ad6b33bba5..4c64d2cfe3c 100644 --- a/tensorflow/cc/saved_model/loader_test.cc +++ b/tensorflow/cc/saved_model/loader_test.cc @@ -155,6 +155,24 @@ TEST_F(LoaderTest, NoTagMatchMultiple) { << st.error_message(); } +TEST_F(LoaderTest, SessionCreationFailure) { + SavedModelBundle bundle; + // Use invalid SessionOptions to cause session creation to fail. Default + // options work, so provide an invalid value for the target field. 
+ SessionOptions session_options; + constexpr char kInvalidTarget[] = "invalid target"; + session_options.target = kInvalidTarget; + RunOptions run_options; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + Status st = LoadSavedModel(session_options, run_options, export_dir, + {kSavedModelTagServe}, &bundle); + EXPECT_FALSE(st.ok()); + EXPECT_TRUE(StringPiece(st.error_message()).contains(kInvalidTarget)) + << st.error_message(); +} + TEST_F(LoaderTest, PbtxtFormat) { SavedModelBundle bundle; SessionOptions session_options; diff --git a/tensorflow/cc/tools/BUILD b/tensorflow/cc/tools/BUILD index 0a7c37383f9..97f66e79b8a 100644 --- a/tensorflow/cc/tools/BUILD +++ b/tensorflow/cc/tools/BUILD @@ -23,7 +23,6 @@ cc_library( "//tensorflow/core:core_cpu", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", - "//tensorflow/core:tensorflow", ], ) diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl index 2b9c83ba149..58572fea3db 100644 --- a/tensorflow/compiler/aot/tfcompile.bzl +++ b/tensorflow/compiler/aot/tfcompile.bzl @@ -4,7 +4,7 @@ To use from your BUILD file, add the following line to load the macro: -load("@org_tensorflow//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") +load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") Then call the macro like this: @@ -16,14 +16,15 @@ tf_library( ) """ -load("@org_tensorflow//tensorflow:tensorflow.bzl", "if_android", "tf_copts") +load("//tensorflow:tensorflow.bzl", + "if_android", "tf_cc_test", "tf_copts") def tf_library(name, graph, config, freeze_checkpoint=None, freeze_saver=None, cpp_class=None, gen_test=True, gen_benchmark=True, visibility=None, testonly=None, tfcompile_flags=None, - tfcompile_tool="@org_tensorflow//tensorflow/compiler/aot:tfcompile", + tfcompile_tool="//tensorflow/compiler/aot:tfcompile", include_standard_runtime_deps=True, deps=None, tags=None): """Runs tfcompile to compile a TensorFlow graph into executable code. @@ -119,9 +120,9 @@ def tf_library(name, graph, config, out_nodes_file, ] + freeze_saver_srcs, outs=[freeze_file], - cmd=("$(location @org_tensorflow//tensorflow/python/tools:freeze_graph)" + + cmd=("$(location //tensorflow/python/tools:freeze_graph)" + freeze_args), - tools=["@org_tensorflow//tensorflow/python/tools:freeze_graph"], + tools=["//tensorflow/python/tools:freeze_graph"], tags=tags, ) tfcompile_graph = freeze_file @@ -213,22 +214,22 @@ def tf_library(name, graph, config, # These deps are required by all tf_library targets even if # include_standard_runtime_deps is False. Without them, the # generated code will fail to compile. - "@org_tensorflow//tensorflow/compiler/tf2xla:xla_compiled_cpu_function", - "@org_tensorflow//tensorflow/core:framework_lite", + "//tensorflow/compiler/tf2xla:xla_compiled_cpu_function", + "//tensorflow/core:framework_lite", ] + (need_xla_data_proto and [ # If we're generating the program shape, we must depend on the proto. - "@org_tensorflow//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla:xla_data_proto", ] or []) + (include_standard_runtime_deps and [ # TODO(cwhipkey): only depend on kernel code that the model actually needed. 
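Returning to the SavedModel loader change earlier in this diff (`loader.cc`): the single-argument `NewSession(options)` overload returns a null pointer on failure without reporting why, which is what the new `SessionCreationFailure` test guards against. A standalone sketch of the checked pattern, illustrative only and not part of this patch:

```c++
#include <memory>

#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/public/session.h"
#include "tensorflow/core/public/session_options.h"

// Illustrative sketch: create a session via the Status-returning overload so
// that a bad SessionOptions (e.g., an unparsable target) is reported as an
// error instead of silently yielding a null session.
tensorflow::Status CreateSessionChecked(
    const tensorflow::SessionOptions& options,
    std::unique_ptr<tensorflow::Session>* session) {
  tensorflow::Session* raw_session = nullptr;
  TF_RETURN_IF_ERROR(tensorflow::NewSession(options, &raw_session));
  session->reset(raw_session);
  return tensorflow::Status::OK();
}
```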
- "@org_tensorflow//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_1d", - "@org_tensorflow//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_2d", - "@org_tensorflow//tensorflow/compiler/xla/service/cpu:cpu_runtime_avx", - "@org_tensorflow//tensorflow/compiler/xla/service/cpu:cpu_runtime_neon", - "@org_tensorflow//tensorflow/compiler/xla/service/cpu:cpu_runtime_sse4_1", - "@org_tensorflow//tensorflow/compiler/xla/service/cpu:runtime_conv2d", - "@org_tensorflow//tensorflow/compiler/xla/service/cpu:runtime_matmul", - "@org_tensorflow//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_conv2d", - "@org_tensorflow//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul", + "//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_1d", + "//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_2d", + "//tensorflow/compiler/xla/service/cpu:cpu_runtime_avx", + "//tensorflow/compiler/xla/service/cpu:cpu_runtime_neon", + "//tensorflow/compiler/xla/service/cpu:cpu_runtime_sse4_1", + "//tensorflow/compiler/xla/service/cpu:runtime_conv2d", + "//tensorflow/compiler/xla/service/cpu:runtime_matmul", + "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_conv2d", + "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul", "//third_party/eigen3", ] or []) + (deps or []), tags=tags, @@ -254,28 +255,32 @@ def tf_library(name, graph, config, name=("gen_" + test_name), testonly=1, srcs=[ - "@org_tensorflow//tensorflow/compiler/aot:test.cc", + "//tensorflow/compiler/aot:test.cc", header_file, ], outs=[test_file], cmd=("sed " + sed_replace + - " $(location @org_tensorflow//tensorflow/compiler/aot:test.cc) " + + " $(location //tensorflow/compiler/aot:test.cc) " + "> $(OUTS)"), tags=tags, ) - # The cc_test rule for the generated code. - native.cc_test( + # The cc_test rule for the generated code. To ensure that this works + # reliably across build configurations, we must use tf_cc_test instead of + # native.cc_test. This is related to how we build + # //tensorflow/core:lib -- see the note in tensorflow/core/BUILD + # for more details. + tf_cc_test( name=test_name, srcs=[test_file], deps=[ ":" + name, - "@org_tensorflow//tensorflow/compiler/aot:runtime", - "@org_tensorflow//tensorflow/compiler/aot:tf_library_test_main", - "@org_tensorflow//tensorflow/compiler/xla:executable_run_options", + "//tensorflow/compiler/aot:runtime", + "//tensorflow/compiler/aot:tf_library_test_main", + "//tensorflow/compiler/xla:executable_run_options", "//third_party/eigen3", - "@org_tensorflow//tensorflow/core:lib", - "@org_tensorflow//tensorflow/core:test", + "//tensorflow/core:lib", + "//tensorflow/core:test", ], tags=tags, ) @@ -283,7 +288,7 @@ def tf_library(name, graph, config, if gen_benchmark: benchmark_name = name + "_benchmark" benchmark_file = benchmark_name + ".cc" - benchmark_main = ("@org_tensorflow//tensorflow/compiler/aot:" + + benchmark_main = ("//tensorflow/compiler/aot:" + "benchmark_main.template") # Rule to rewrite benchmark.cc to produce the benchmark_file. @@ -301,7 +306,9 @@ def tf_library(name, graph, config, tags=tags, ) - # The cc_benchmark rule for the generated code. + # The cc_benchmark rule for the generated code. This does not need the + # tf_cc_binary since we (by deliberate design) do not depend on + # //tensorflow/core:lib. 
# # Note: to get smaller size on android for comparison, compile with: # --copt=-fvisibility=hidden @@ -315,12 +322,12 @@ def tf_library(name, graph, config, linkopts = if_android(["-pie", "-s"]), deps=[ ":" + name, - "@org_tensorflow//tensorflow/compiler/aot:benchmark", - "@org_tensorflow//tensorflow/compiler/aot:runtime", - "@org_tensorflow//tensorflow/compiler/xla:executable_run_options", + "//tensorflow/compiler/aot:benchmark", + "//tensorflow/compiler/aot:runtime", + "//tensorflow/compiler/xla:executable_run_options", "//third_party/eigen3", ] + if_android([ - "@org_tensorflow//tensorflow/compiler/aot:benchmark_extra_android", + "//tensorflow/compiler/aot:benchmark_extra_android", ]), tags=tags, ) @@ -330,11 +337,11 @@ def target_llvm_triple(): # TODO(toddw): Add target_triple for other targets. For details see: # http://llvm.org/docs/doxygen/html/Triple_8h_source.html return select({ - "@org_tensorflow//tensorflow:android_armeabi": "armv5-none-android", - "@org_tensorflow//tensorflow:android_arm": "armv7-none-android", - "@org_tensorflow//tensorflow:android_arm64": "aarch64-none-android", - "@org_tensorflow//tensorflow:android_x86": "i686-none-android", - "@org_tensorflow//tensorflow:linux_ppc64le": "ppc64le-ibm-linux-gnu", - "@org_tensorflow//tensorflow:darwin": "x86_64-none-darwin", + "//tensorflow:android_armeabi": "armv5-none-android", + "//tensorflow:android_arm": "armv7-none-android", + "//tensorflow:android_arm64": "aarch64-none-android", + "//tensorflow:android_x86": "i686-none-android", + "//tensorflow:linux_ppc64le": "ppc64le-ibm-linux-gnu", + "//tensorflow:darwin": "x86_64-none-darwin", "//conditions:default": "x86_64-pc-linux", }) diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc index 0de163d3a8f..9c372a01278 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc @@ -30,12 +30,14 @@ limitations under the License. #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/common_runtime/shape_refiner.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_def_util.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/graph_def_builder.h" #include "tensorflow/core/graph/tensor_id.h" #include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/lib/gtl/map_util.h" @@ -141,8 +143,7 @@ struct NodeSlot { // everything to use it. static const char* const kArgOp = "_Arg"; static const char* const kRetValOp = "_Retval"; -static const char* const kSendToHostOp = "_XlaSendToHost"; -static const char* const kRecvFromHostOp = "_XlaRecvFromHost"; +static const char* const kHostComputeOp = "_XlaHostCompute"; static const char* const kSendFromHostOp = "_XlaSendFromHost"; static const char* const kRecvAtHostOp = "_XlaRecvAtHost"; @@ -171,7 +172,8 @@ class Encapsulator { // Write a copy of the input graph to 'graph_out', where the subgraphs are // replaced with calls to the new functions. 
- Status BuildOutputGraph(bool parallel_checking, Graph* graph_out); + Status BuildOutputGraph(bool parallel_checking, Graph* graph_out, + FunctionLibraryDefinition* library); private: // A subgraph of the input, all marked with a common 'group_attribute' @@ -201,21 +203,29 @@ class Encapsulator { // .. . // RAH --> C --> SFH // - // The compiled cluster is as follows. STH is a SendToHost node which is the - // source of a channel to the RAH node above. RFH is a RecvFromHost node which - // is the destination of a channel from the SFH node above. There is a control - // edge that ensures RFH follows STH, which is used in shape inference to - // ensure that the shapes on the STH host channel are known before the RFH - // channel is compiled. + // The compiled cluster is as follows. HC is a HostCompute node which is the + // source of a channel to the RAH node above and the destination of a channel + // from the SFH node above. // - // Arg --> B --> STH ..> RFH --> D --> Retval + // Arg --> B --> HC --> D --> Retval // - // The channels STH/RAH and SFH/RFH each transmit a tuple, so there is at most - // one RAH and SFH in each compiled cluster. This design is preferred over - // adding separate Arg/Retval nodes for each transmitted value because it - // simplifies the host code that would like to limit communication between - // host and device and, e.g., raise only one interrupt per channel rather than - // one per transmitted value. + // The channels HC/RAH and SFH/HC each transmit multiple tensors, so there is + // at most one RAH and SFH in each outside_compilation cluster. This design is + // preferred over adding separate Arg/Retval nodes for each transmitted value + // because it allows optimizations to the host code that would like to limit + // communication between host and device and, e.g., raise only one interrupt + // per channel rather than one per transmitted value. + // + // The shapes of the outputs from the HC node in general cannot be determined + // until the shapes of its inputs are known at compile time, since e.g., + // above, the shape of C's outputs aren't known until the shape of its inputs + // are known. If the shapes of the HC's outputs can be determined during the + // rewrite, they are stored in the node's 'shapes' attr. Otherwise a minimal + // graph is stored in the shape_inference_graph attr. This graph can be used + // when compiling the HC Op to determined the shape of the SFH inputs given + // the shapes of any ancestor RAH outputs. If it can be determined that the + // shape of the SFH inputs will not be inferrable even once the shapes of the + // RAH outputs are known, an error is returned by the rewriter. class Subgraph { public: // Creates a graph to build the subgraph in, if it doesn't already exist, @@ -246,6 +256,10 @@ class Encapsulator { const std::unordered_map& node_images, Graph* graph_out); + // Returns the names of all the outside_compilation subgraphs in this + // Subgraph. + void GetOutsideCompilationSubgraphNames(std::vector* names) const; + // Returns the Node that inputs to the function should be wired up to. Node* GetCallNodeForInputs() const; @@ -305,15 +319,9 @@ class Encapsulator { void RecordOutsideCompilationOutputOrControl( const string& outside_compilation_id, const Edge* edge); - // Adds the SendToHost nodes for each outside_compilation subgraph once the - // edges have all been recorded via RecordOutsideCompilationInputOrControl. 
- Status AddSendsToOutsideCompilation( - const std::unordered_map& node_images); - - // Adds the RecvFromHost nodes for each outside_compilation subgraph once - // the edges have all been recorded via - // RecordOutsideCompilationOutputOrControl. - Status AddRecvsFromOutsideCompilation( + // Adds the HostCompute nodes for each outside_compilation subgraph. + Status AddHostComputes( + const string& subgraph_name, const std::unordered_map& node_images); // Creates the sequencer node if it doesn't exist, adding it to graph_out. @@ -323,10 +331,16 @@ class Encapsulator { // all the downstream nodes of call_node_outputs. void ConnectSequencerToOutputs(Graph* graph_out); + Status AddShapeInferenceInfo( + const string& outside_compilation_subgraph_name, + const std::vector& shapes, GraphDef* inference_graph); + + Status ReplaceFunctionDef(FunctionLibraryDefinition* library); + private: struct OutsideCompilationSubgraph { // Map from source (producer node/slot) tensors in the original graph to - // input index (slot number in the SendToHost/RecvAtHost nodes that will + // input index (slot number in the HostCompute/RecvAtHost nodes that will // be created) for the outside_compilation subgraph. std::unordered_map inputs; @@ -335,14 +349,14 @@ class Encapsulator { // outside_compilation subgraph. These are recorded by // RecordOutsideCompilationInputOrControl while walking all the subgraph // edges, and lifted control edges within the subgraph are added by - // AddSendsToOutsideCompilation once the _SendToHost node has been + // AddSendsToOutsideCompilation once the _HostCompute node has been // created. The matching control edge from _RecvAtHost to the // destination is added by CopyEdgeToOutputGraph. std::unordered_set control_inputs; // Maps from source (producer node/slot) and destination (consumer // node/slot) tensors in the original graph to output index (slot number - // in the SendFromHost/RecvFromHost nodes that will be created) for the + // in the SendFromHost/HostCompute nodes that will be created) for the // outside_compilation subgraph. std::unordered_map outputs_by_src; std::unordered_map outputs_by_dst; @@ -352,13 +366,13 @@ class Encapsulator { // containing compiled subgraph. These are recorded by // RecordOutsideCompilationOutputOrControl while walking all the subgraph // edges, and lifted control edges within the subgraph are added by - // AddRecvsFromToOutsideCompilation once the _RecvFromHost node has been + // AddRecvsFromToOutsideCompilation once the _HostCompute node has been // created. The matching control edge from the source to _SendFromHost to // the destination is added by CopyEdgeToOutputGraph. std::unordered_set control_outputs; - // _SendToHost node in the subgraph. Not owned. - Node* send_to_host = nullptr; + // Name of the _HostCompute node in the subgraph. + string host_compute_name; // _RecvAtHost node in the output graph. Not owned. Node* recv_at_host = nullptr; @@ -516,6 +530,59 @@ class Encapsulator { const std::unordered_map& node_images, bool parallel_checking, Graph* graph_out); + // Constructs a minimal shape inference graph that can be used to determine + // the shape of send_node at the time that the subgraph is compiled. + // recv_at_host_nodes contains the names of all the recv_at_host nodes that + // send_node might depend on. These recv_at_host nodes have shapes that are + // not known during the rewrite pass, but will be known at compile time. 
+ // + // If the shapes of all the inputs to send_node can be determined during the + // rewrite pass, on exit graphdef_out is empty and the shapes are returned in + // static_shape_out. Otherwise graphdef_out contains a graph that can be used + // for shape inference at compile time, where all the source nodes of the + // graph are either constants with known shapes, or nodes named in + // recv_at_host_nodes. + // + // A non-OK status is returned if neither of the above conditions can be + // satisfied, e.g., because send_node depends on a node that doesn't have a + // registered shape inference function. + Status DoStaticShapeInferenceForOutsideCompilationSend( + const Graph& graph_in, const ShapeRefiner& shape_refiner, + const std::unordered_set& recv_at_host_nodes, Node* send_node, + FunctionLibraryDefinition* library, + std::vector* static_shape_out, + std::unique_ptr* graphdef_out); + + // Makes a copy of graph containing only nodes that are ancestors of at least + // one node in send_from_host_nodes and store it in pruned_graph. On exit + // nodes_images contains a mapping from nodes in graph to nodes in + // pruned_graph. All functions in the copied graph are inlined. + Status MakePrunedGraphCopyAndInline( + const Graph& graph, const std::vector& sink_nodes, + std::unique_ptr* pruned_graph, + std::unordered_map* node_images, + FunctionLibraryDefinition* library); + + // Makes a copy of graph containing only nodes that are ancestors of a + // send_from_host node in an outside_compilation subgraph, and store it in + // pruned_graph. Also perform shape inference on the pruned graph, using + // shape_refiner. On exit node_images contains a mapping from nodes in graph + // to nodes in pruned_graph. + Status MakeGraphForOutsideCompilationSends( + const Graph& graph, std::unique_ptr* pruned_graph, + ShapeRefiner* shape_refiner, + std::unordered_map* node_images, + FunctionLibraryDefinition* library); + + // Performs static shape inference, as far as possible, for the send_from_host + // nodes in each outside_compilation subgraph. Where it is not possible to + // determine the shape statically, stores a serialized GraphDef in the + // HostCompute 'shape_inference_graph' attr, to be used at compile time for + // final inference. If the shapes are known statically they are stored in the + // HostCompute 'shapes' attr. + Status GetShapeInfoForOutsideCompilationSends( + Graph* graph_out, FunctionLibraryDefinition* library); + const string group_attribute_; const string outside_compilation_attribute_; const Graph* graph_in_; @@ -682,16 +749,20 @@ void Encapsulator::Subgraph::RecordOutsideCompilationOutputOrControl( } } -Status Encapsulator::Subgraph::AddSendsToOutsideCompilation( +Status Encapsulator::Subgraph::AddHostComputes( + const string& subgraph_name, const std::unordered_map& node_images) { for (auto& oc_subgraph_iter : outside_compilation_subgraphs_) { const string& oc_subgraph_name = oc_subgraph_iter.first; OutsideCompilationSubgraph& oc_subgraph = oc_subgraph_iter.second; - if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty()) { - // Build a _SendToHost node sending all the args of the appropriate - // types. - std::vector dtypes(oc_subgraph.inputs.size(), DT_INVALID); + if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty() || + !oc_subgraph.outputs_by_src.empty() || + !oc_subgraph.control_outputs.empty()) { + // Build a _HostCompute node. 
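As a recap of the DoStaticShapeInferenceForOutsideCompilationSend contract documented above, a sketch of the caller side; it assumes pruned_graph, shape_refiner, recv_at_host_names, send_node and library are in scope inside a Status-returning function, as they are in GetShapeInfoForOutsideCompilationSends further down.

    std::vector<TensorShapeProto> static_shape;
    std::unique_ptr<GraphDef> graphdef;
    TF_RETURN_IF_ERROR(DoStaticShapeInferenceForOutsideCompilationSend(
        *pruned_graph, shape_refiner, recv_at_host_names, send_node, library,
        &static_shape, &graphdef));
    if (graphdef == nullptr) {
      // Every input shape of the SendFromHost node was fully defined during the
      // rewrite; static_shape is written into the HostCompute 'shapes' attr.
    } else {
      // At least one shape depends on a RecvAtHost output; the minimal graph in
      // graphdef is serialized into the 'shape_inference_graph' attr and run at
      // compile time once those shapes are known.
    }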
std::vector inputs(oc_subgraph.inputs.size()); + std::vector input_dtypes(oc_subgraph.inputs.size(), DT_INVALID); + std::vector output_dtypes(oc_subgraph.outputs_by_src.size(), + DT_INVALID); for (const auto& input_src : oc_subgraph.inputs) { const Node* src_node = input_src.first.node; @@ -700,94 +771,64 @@ Status Encapsulator::Subgraph::AddSendsToOutsideCompilation( int input_index = input_src.second; DataType dtype = src_node->output_type(src_slot); - dtypes[input_index] = dtype; inputs[input_index].Reset(src_image->name(), src_slot, dtype); + input_dtypes[input_index] = dtype; } - NodeDef send_def; - NodeDefBuilder builder( - strings::StrCat("outside_compilation_", oc_subgraph_name, "_send"), - kSendToHostOp); - builder.Attr("dtypes", dtypes); + for (const auto& output : oc_subgraph.outputs_by_src) { + DataType dtype = output.first.dtype; + int output_index = output.second; + output_dtypes[output_index] = dtype; + } + + NodeDef host_compute_def; + NodeDefBuilder builder(strings::StrCat("outside_compilation_", + oc_subgraph_name, "_host_compute"), + kHostComputeOp); builder.Input(inputs); - Status s = builder.Finalize(&send_def); + builder.Attr("Tinputs", input_dtypes); + builder.Attr("Toutputs", output_dtypes); + builder.Attr("key", + strings::StrCat("host_compute_channel_", subgraph_name, "_", + oc_subgraph_name)); + Status s = builder.Finalize(&host_compute_def); if (!s.ok()) return s; - oc_subgraph.send_to_host = graph_->AddNode(send_def, &s); + Node* host_compute = graph_->AddNode(host_compute_def, &s); if (!s.ok()) return s; + oc_subgraph.host_compute_name = host_compute->name(); - // Connect the _SendToHost node to its producers in the subgraph. + // Connect the _HostCompute node to its producers in the subgraph. for (auto& input_src : oc_subgraph.inputs) { const Node* src_node = input_src.first.node; Node* src_image = node_images.at(src_node); int src_slot = input_src.first.slot; int input_index = input_src.second; - graph_->AddEdge(src_image, src_slot, oc_subgraph.send_to_host, - input_index); + graph_->AddEdge(src_image, src_slot, host_compute, input_index); } - // Connect the _SendToHost node to its control edge producers in the + // Connect the _HostCompute node to its control edge producers in the // subgraph. for (const auto& src_node : oc_subgraph.control_inputs) { Node* src_image = node_images.at(src_node); - graph_->AddControlEdge(src_image, oc_subgraph.send_to_host); - } - } - } - - return Status::OK(); -} - -Status Encapsulator::Subgraph::AddRecvsFromOutsideCompilation( - const std::unordered_map& node_images) { - for (auto& oc_subgraph_iter : outside_compilation_subgraphs_) { - const string& oc_subgraph_name = oc_subgraph_iter.first; - OutsideCompilationSubgraph& oc_subgraph = oc_subgraph_iter.second; - if (!oc_subgraph.outputs_by_src.empty() || - !oc_subgraph.control_outputs.empty()) { - // Build a _RecvFromHost node producing all the outputs of the appropriate - // types. 
- std::vector dtypes(oc_subgraph.outputs_by_src.size(), - DT_INVALID); - - for (const auto& output : oc_subgraph.outputs_by_src) { - DataType dtype = output.first.dtype; - int output_index = output.second; - dtypes[output_index] = dtype; + graph_->AddControlEdge(src_image, host_compute); } - NodeDef recv_def; - NodeDefBuilder builder( - strings::StrCat("outside_compilation_", oc_subgraph_name, "_recv"), - kRecvFromHostOp); - builder.Attr("dtypes", dtypes); - Status s = builder.Finalize(&recv_def); - if (!s.ok()) return s; - - Node* recv = graph_->AddNode(recv_def, &s); - if (!s.ok()) return s; - - // Connect the consumers in the subgraph to the _RecvFromHost node. + // Connect the consumers in the subgraph to the _HostCompute node. for (const auto& output : oc_subgraph.outputs_by_dst) { const Node* dst_node = output.first.node; Node* dst_image = node_images.at(dst_node); int dst_slot = output.first.slot; int output_index = output.second; - graph_->AddEdge(recv, output_index, dst_image, dst_slot); + graph_->AddEdge(host_compute, output_index, dst_image, dst_slot); } - // Connect the control edge consumers in the subgraph to the _RecvFromHost + // Connect the control edge consumers in the subgraph to the _HostCompute // node. for (const auto& dst_node : oc_subgraph.control_outputs) { Node* dst_image = node_images.at(dst_node); - graph_->AddControlEdge(recv, dst_image); - } - - // Add a control edge in the subgraph so that the _SendToHost node, if - // any, is compiled before the _RecvFromHost node. - if (oc_subgraph.send_to_host != nullptr) { - graph_->AddControlEdge(oc_subgraph.send_to_host, recv); + graph_->AddControlEdge(host_compute, dst_image); } } } @@ -882,6 +923,63 @@ Status Encapsulator::Subgraph::BuildFunctionDef( return Status::OK(); } +Status Encapsulator::Subgraph::AddShapeInferenceInfo( + const string& outside_compilation_subgraph_name, + const std::vector& shapes, GraphDef* inference_graph) { + OutsideCompilationSubgraph& oc_subgraph = + outside_compilation_subgraphs_.at(outside_compilation_subgraph_name); + + Node* host_compute = nullptr; + for (Node* n : graph_->nodes()) { + if (n->name() == oc_subgraph.host_compute_name) { + host_compute = n; + break; + } + } + if (host_compute == nullptr) { + return errors::InvalidArgument( + "After rewriting subgraph ", outside_compilation_subgraph_name, + " there is no HostCompute Op for outside compilation subgraph ", + oc_subgraph.host_compute_name); + } + + if (inference_graph == nullptr) { + host_compute->AddAttr("shape_inference_graph", ""); + host_compute->AddAttr("shapes", shapes); + } else { + string serialized_graph; + if (!inference_graph->SerializeToString(&serialized_graph)) { + return errors::Internal( + "Failed to serialize graph for outside compilation subgraph ", + oc_subgraph.host_compute_name); + } + host_compute->AddAttr("shape_inference_graph", serialized_graph); + host_compute->AddAttr("shapes", std::vector()); + } + return Status::OK(); +} + +Status Encapsulator::Subgraph::ReplaceFunctionDef( + FunctionLibraryDefinition* library) { + const string& name = call_node_def_.name(); + + FunctionDef fdef; + TF_RETURN_IF_ERROR(GraphToFunctionDef(*graph_, name, &fdef)); + + if (VLOG_IS_ON(1)) { + VLOG(2) << "Replace function def " << name; + dump_graph::DumpGraphToFile( + strings::StrCat("replace_encapsulate_fdef_graph_", name), *graph_, + library); + dump_graph::DumpFunctionDefToFile( + strings::StrCat("replace_encapsulate_fdef_", name), fdef); + } + + TF_RETURN_IF_ERROR(library->RemoveFunction(name)); + 
TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef)); + return Status::OK(); +} + Status Encapsulator::Subgraph::BuildParallelCheckOp( const std::unordered_map& node_images, Graph* graph_out) { @@ -980,7 +1078,9 @@ Status Encapsulator::Subgraph::AddRecvAtHostNode( NodeDefBuilder builder(strings::StrCat("outside_compilation_", subgraph_name, "_", oc_subgraph_name, "_recv"), kRecvAtHostOp); - builder.Attr("dtypes", dtypes); + builder.Attr("Toutputs", dtypes); + builder.Attr("key", strings::StrCat("host_compute_channel_", subgraph_name, + "_", oc_subgraph_name)); Status s = builder.Finalize(&recv_def); if (!s.ok()) return s; @@ -1020,7 +1120,9 @@ Status Encapsulator::Subgraph::AddSendFromHostNode( NodeDefBuilder builder(strings::StrCat("outside_compilation_", subgraph_name, "_", oc_subgraph_name, "_send"), kSendFromHostOp); - builder.Attr("dtypes", dtypes); + builder.Attr("Tinputs", dtypes); + builder.Attr("key", strings::StrCat("host_compute_channel_", subgraph_name, + "_", oc_subgraph_name)); builder.Input(inputs); Status s = builder.Finalize(&send_def); if (!s.ok()) return s; @@ -1062,6 +1164,13 @@ Status Encapsulator::Subgraph::AddOutsideCompilationHostIONodes( return Status::OK(); } +void Encapsulator::Subgraph::GetOutsideCompilationSubgraphNames( + std::vector* names) const { + for (auto& entry : outside_compilation_subgraphs_) { + names->push_back(entry.first); + } +} + Status Encapsulator::GetFunctionNameAttr( Node const* node, string* attr, string* outside_compilation_attr) const { Status s = GetNodeAttr(node->attrs(), group_attribute_, attr); @@ -1220,8 +1329,7 @@ Status Encapsulator::SplitIntoSubgraphs() { // single input and output node for it. for (auto& entry : subgraphs_) { Subgraph& subgraph = entry.second; - TF_RETURN_IF_ERROR(subgraph.AddSendsToOutsideCompilation(node_images)); - TF_RETURN_IF_ERROR(subgraph.AddRecvsFromOutsideCompilation(node_images)); + TF_RETURN_IF_ERROR(subgraph.AddHostComputes(entry.first, node_images)); } MarkGuaranteedConstants(*graph_in_, src_arg_pairs); @@ -1509,8 +1617,346 @@ Status Encapsulator::AddEdgesToOutputGraph( return Status::OK(); } -Status Encapsulator::BuildOutputGraph(bool parallel_checking, - Graph* graph_out) { +namespace { + +// Adds a dummy Const node to graph_out. The "constant" has the type of +// data_type and the shape indicated in 'shape'. The dummy node is not a valid +// Const node because it does not have any value defined, but this doesn't +// matter because it will only be used subsequently for shape inference. (It +// would be possible to add a switch statement over data_type to create a value +// for the constant, but that would entail maintaining the logic as new types +// are added, and is not necessary.) +Node* AddDummyShapedNode(DataType data_type, const TensorShapeProto& shape, + Graph* graph_out) { + TensorProto dummy_proto; + dummy_proto.set_dtype(data_type); + *dummy_proto.mutable_tensor_shape() = shape; + // Don't set any value field in the proto, since it is only going to be used + // for shape inference. + + GraphDefBuilder::Options options(graph_out, /*status=*/nullptr); + NodeBuilder node_builder(options.GetNameForOp("KnownShape"), "Const", + options.op_registry()); + node_builder.Attr("dtype", data_type).Attr("value", dummy_proto); + return options.FinalizeBuilder(&node_builder); +} + +// Adds a copy of node_in to graph_out and adds the mapping to +// copied_node_images. 
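A usage sketch for the helper above; the DT_FLOAT dtype and the [2] shape are illustrative, and graph_out is the shape-inference graph being assembled, as in the traversal below. Shape inference for Const reads the output shape from the 'value' attr's tensor_shape, so a dtype plus shape in the proto is all that is needed even though the node could never be executed.

    TensorShapeProto proto;
    proto.add_dim()->set_size(2);  // a known [2] vector
    Node* dummy = AddDummyShapedNode(DT_FLOAT, proto, graph_out.get());
    // Edges that read the original producer's output are redirected to read
    // dummy:0 instead, so downstream shape inference sees a fully defined shape.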
+Status CopyShapeInferenceNodeToGraph( + Node* node_in, const Node* send_node, + const std::unordered_map& dummy_node_images, + FunctionLibraryDefinition* library, + std::unordered_map* copied_node_images, Graph* graph_out) { + // Once all the ancestor nodes have been added to graph_out, add this node + // and connect it to its ancestors. + Node* node_out = graph_out->CopyNode(node_in); + (*copied_node_images)[node_in] = node_out; + // Don't bother to build the shape inference graph if there's a node with no + // shape inference function, since it would just result in an error later at + // compile time. + const OpRegistrationData* op_reg_data; + TF_RETURN_IF_ERROR(library->LookUp(node_in->type_string(), &op_reg_data)); + if (op_reg_data->shape_inference_fn == nullptr) { + return errors::InvalidArgument( + "Shape inference is not possible for outside_compilation " + "SendFromHost node ", + send_node->name(), " because it depends on node ", node_in->name(), + " which does not have a shape inference function registered."); + } + // Add all the edges to the newly copied node. + for (const Edge* in_edge : node_in->in_edges()) { + if (!in_edge->IsControlEdge()) { + Node* src = in_edge->src(); + const auto iter = dummy_node_images.find(src); + if (iter == dummy_node_images.end()) { + // The src is a copied node so use the original output port. + graph_out->AddEdge((*copied_node_images)[in_edge->src()], + in_edge->src_output(), node_out, + in_edge->dst_input()); + } else { + // The src is a dummy node so use output port 0. + graph_out->AddEdge(iter->second, 0, node_out, in_edge->dst_input()); + } + } + } + return Status::OK(); +} + +} // namespace + +Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend( + const Graph& graph_in, const ShapeRefiner& shape_refiner, + const std::unordered_set& recv_at_host_nodes, Node* send_node, + FunctionLibraryDefinition* library, + std::vector* static_shape_out, + std::unique_ptr* graphdef_out) { + // Maps from nodes in graph_in to nodes in graph_out. + // + // When an edge has fully defined shape the source node in graph_in is + // replaced in graph_out by a dummy constant node. The mapping from nodes + // in graph_in to dummy nodes is stored in dummy_node_images. + // + // When a node in graph_in has at least one ancestor that doesn't have fully + // defined shape, it is copied into graph_out. The mapping from nodes in + // graph_in to copied nodes is stored in copied_node_images. + // + // The two types of node are treated differently because, when adding edges to + // graph_out, an output from a dummy node always uses port 0, whereas an + // output from a copied node uses the same port that was used in graph_in. + std::unordered_map dummy_node_images; + std::unordered_map copied_node_images; + + std::unique_ptr graph_out(new Graph(graph_in.op_registry())); + graph_out->set_versions(graph_in.versions()); + static_shape_out->resize(send_node->num_inputs()); + + // We don't use the standard ReverseDFS because we want to cut off traversal + // whenever we find an output with fully defined shape. + // TODO(misard) make this work properly in the presence of control flow. + struct Work { + Node* node; + bool leave; // Are we entering or leaving node? 
+ }; + std::vector stack({{send_node, false}}); + std::vector visited(graph_in.num_node_ids(), false); + while (!stack.empty()) { + Work w = stack.back(); + stack.pop_back(); + Node* n = w.node; + + if (w.leave) { + TF_RETURN_IF_ERROR(CopyShapeInferenceNodeToGraph( + n, send_node, dummy_node_images, library, &copied_node_images, + graph_out.get())); + } else { + if (visited[n->id()]) continue; + visited[n->id()] = true; + + // Arrange to revisit when all done with all inputs. + stack.push_back(Work{n, true}); + + bool has_parent_with_unknown_shape = false; + for (const Edge* in_edge : n->in_edges()) { + if (!in_edge->IsControlEdge()) { + Node* src_node = in_edge->src(); + int src_port = in_edge->src_output(); + shape_inference::InferenceContext* context = + shape_refiner.GetContext(src_node); + shape_inference::ShapeHandle shape = context->output(src_port); + if (context->FullyDefined(shape)) { + // This ancestor has known shape, so instead of adding it to the + // stack, add a dummy node with that shape to graph_out and + // continue. + TensorShapeProto proto; + context->ShapeHandleToProto(shape, &proto); + dummy_node_images[src_node] = AddDummyShapedNode( + src_node->output_type(src_port), proto, graph_out.get()); + if (n == send_node) { + (*static_shape_out)[in_edge->dst_input()] = proto; + } + } else { + if (!visited[src_node->id()]) { + has_parent_with_unknown_shape = true; + stack.push_back({src_node, false}); + } + } + } + } + if (!has_parent_with_unknown_shape) { + if (n == send_node) { + // The shapes of all the inputs to send_node are statically known. We + // won't have to do any inference at compile time so return now: the + // shapes were stored in static_shape_out above. + graphdef_out->reset(); + return Status::OK(); + } else { + // Any shape that is being processed is either the original send node + // or has at least one output with statically-unknown shape. If the + // latter and it doesn't have any inputs with statically-unknown + // shape, then check that it is of the recv nodes that we can fill in + // the shape of at run-time later. If it isn't one of those, then we + // won't have any additional knowledge at compile time, so we already + // know we won't be able to do shape inference and we can return an + // error now. + if (recv_at_host_nodes.find(n->name()) == recv_at_host_nodes.end()) { + return errors::InvalidArgument( + "Shape inference is not possible for outside_compilation " + "SendFromHost node ", + send_node->name(), " because shape of node ", n->name(), + " will not be known at compilation time."); + } + } + } + } + } + + graphdef_out->reset(new GraphDef()); + graph_out->ToGraphDef(graphdef_out->get()); + + return Status::OK(); +} + +Status Encapsulator::MakePrunedGraphCopyAndInline( + const Graph& graph, const std::vector& sink_nodes, + std::unique_ptr* pruned_graph, + std::unordered_map* node_images, + FunctionLibraryDefinition* library) { + // First copy all ancestor nodes of sink_nodes into a new graph. + pruned_graph->reset(new Graph(library)); + (*pruned_graph)->set_versions(graph.versions()); + ReverseDFSFrom(graph, sink_nodes, + /*enter=*/nullptr, + /*leave=*/[&](Node* n) { + if (!n->IsSource()) { + Node* copied = (*pruned_graph)->CopyNode(n); + node_images->emplace(n, copied); + } + }); + + // Add all the edges between copied nodes. 
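The traversal above swaps the library ReverseDFS for a two-phase work stack so it can stop expanding whenever an input already has a fully defined shape. A generic sketch of that enter/leave pattern follows; Process and start are placeholders, not names from the pass.

    struct Work { Node* node; bool leave; };
    std::vector<Work> stack({{start, /*leave=*/false}});
    std::vector<bool> visited(graph.num_node_ids(), false);
    while (!stack.empty()) {
      Work w = stack.back();
      stack.pop_back();
      if (w.leave) {
        Process(w.node);                 // all of its inputs are already done
        continue;
      }
      if (visited[w.node->id()]) continue;
      visited[w.node->id()] = true;
      stack.push_back({w.node, true});   // revisit once the inputs are handled
      for (const Edge* e : w.node->in_edges()) {
        if (!e->IsControlEdge() && !visited[e->src()->id()]) {
          // The pass cuts the recursion here when e->src() already has a fully
          // defined output shape, substituting a dummy constant instead.
          stack.push_back({e->src(), false});
        }
      }
    }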
+ for (auto entry : *node_images) { + const Node* orig = entry.first; + Node* image = entry.second; + for (const Edge* out_edge : orig->out_edges()) { + auto iter = node_images->find(out_edge->dst()); + if (iter != node_images->end()) { + // The source and destination are both in the copied graph. + (*pruned_graph) + ->AddEdge(image, out_edge->src_output(), iter->second, + out_edge->dst_input()); + } + } + } + + // Find all the function call nodes, and inline them. + std::vector function_nodes; + for (auto node : (*pruned_graph)->nodes()) { + const OpRegistrationData* op_reg_data; + TF_RETURN_IF_ERROR(library->LookUp(node->type_string(), &op_reg_data)); + if (op_reg_data->is_function_op) { + function_nodes.push_back(node); + } + } + for (auto node : function_nodes) { + VLOG(2) << "Inlining function " << node->name(); + const FunctionDef* fdef = library->Find(node->type_string()); + if (fdef == nullptr) { + return errors::Internal("Failed to find function ", node->type_string(), + " in function library."); + } + FunctionBody* fbody = nullptr; + TF_RETURN_IF_ERROR( + FunctionDefToBodyHelper(*fdef, node->attrs(), library, + [library](const string& op, const OpDef** sig) { + return library->LookUpOpDef(op, sig); + }, + &fbody)); + InlineFunctionBody(*library, pruned_graph->get(), node, fbody); + delete fbody; + } + + return Status::OK(); +} + +Status Encapsulator::MakeGraphForOutsideCompilationSends( + const Graph& graph, std::unique_ptr* pruned_graph, + ShapeRefiner* shape_refiner, + std::unordered_map* node_images, + FunctionLibraryDefinition* library) { + // Find all the send_from_host nodes in all subgraphs, to use as roots for the + // pruning. + std::vector send_from_host_nodes; + for (auto& subgraph_entry : subgraphs_) { + Subgraph& subgraph = subgraph_entry.second; + std::vector outside_compilation_names; + subgraph.GetOutsideCompilationSubgraphNames(&outside_compilation_names); + for (const auto& name : outside_compilation_names) { + Node* send_node = subgraph.GetSendFromHostNode(name); + if (send_node != nullptr) { + send_from_host_nodes.push_back(send_node); + } + } + } + + // Make a copy of all the graph nodes needed to evaluate the send_from_host + // nodes, inlining any functions as needed. + TF_RETURN_IF_ERROR(MakePrunedGraphCopyAndInline( + graph, send_from_host_nodes, pruned_graph, node_images, library)); + + // Perform shape inference on the pruned graph. + shape_refiner->set_require_shape_inference_fns(false); + FixupSourceAndSinkEdges(pruned_graph->get()); + std::vector post_order; + GetReversePostOrder(*(*pruned_graph), &post_order); + for (auto node : post_order) { + // Ignore the status returned by the shape_refiner. At this point we want + // the best effort shapes, even if no shape function is registered for a + // node. + Status status = shape_refiner->AddNode(node); + if (!status.ok()) { + VLOG(1) << "Shape inference failed for node: " << status; + } + } + + return Status::OK(); +} + +Status Encapsulator::GetShapeInfoForOutsideCompilationSends( + Graph* graph_out, FunctionLibraryDefinition* library) { + std::unique_ptr pruned_graph; + ShapeRefiner shape_refiner(graph_out->versions(), graph_out->op_registry()); + std::unordered_map node_images; + TF_RETURN_IF_ERROR(MakeGraphForOutsideCompilationSends( + *graph_out, &pruned_graph, &shape_refiner, &node_images, library)); + + for (auto& subgraph_entry : subgraphs_) { + Subgraph& subgraph = subgraph_entry.second; + // Find all the recv_at_host nodes in this subgraph. 
+ std::vector outside_compilation_names; + subgraph.GetOutsideCompilationSubgraphNames(&outside_compilation_names); + std::unordered_set recv_at_host_names; + for (const auto& name : outside_compilation_names) { + Node* recv_node = subgraph.GetRecvAtHostNode(name); + if (recv_node != nullptr) { + recv_at_host_names.insert(recv_node->name()); + } + } + // For each send_from_host node, do as much shape inference as possible + // without knowing the shape of the recv_at_host nodes, and store the + // result, along with enough information to complete the job at compile time + // once the recv_at_host shapes are known. + for (const auto& name : outside_compilation_names) { + Node* send_node = subgraph.GetSendFromHostNode(name); + std::vector static_shape; + std::unique_ptr graphdef; + if (send_node != nullptr) { + TF_RETURN_IF_ERROR(DoStaticShapeInferenceForOutsideCompilationSend( + *pruned_graph, shape_refiner, recv_at_host_names, + node_images[send_node], library, &static_shape, &graphdef)); + if (graphdef == nullptr) { + VLOG(2) << "Send node " << send_node->name() << " shapes"; + for (int i = 0; i < static_shape.size(); ++i) { + VLOG(2) << static_shape[i].DebugString(); + } + } else { + VLOG(2) << "Send node " << send_node->name() << " graph\n" + << graphdef->DebugString(); + } + } + TF_RETURN_IF_ERROR( + subgraph.AddShapeInferenceInfo(name, static_shape, graphdef.get())); + } + if (!outside_compilation_names.empty()) { + TF_RETURN_IF_ERROR(subgraph.ReplaceFunctionDef(library)); + } + } + + return Status::OK(); +} + +Status Encapsulator::BuildOutputGraph(bool parallel_checking, Graph* graph_out, + FunctionLibraryDefinition* library) { // Map from nodes in the input graph to nodes in the output graph. std::unordered_map node_images; @@ -1522,6 +1968,9 @@ Status Encapsulator::BuildOutputGraph(bool parallel_checking, TF_RETURN_IF_ERROR( AddEdgesToOutputGraph(node_images, parallel_checking, graph_out)); + TF_RETURN_IF_ERROR( + GetShapeInfoForOutsideCompilationSends(graph_out, library)); + return Status::OK(); } @@ -1545,7 +1994,7 @@ Status EncapsulateSubgraphsInFunctions( std::unique_ptr out(new Graph(library)); out->set_versions(graph_in.versions()); TF_RETURN_IF_ERROR( - encapsulator.BuildOutputGraph(parallel_checking, out.get())); + encapsulator.BuildOutputGraph(parallel_checking, out.get(), library)); *graph_out = std::move(out); return Status::OK(); diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc index b100861d5e9..aed9cae0f17 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc @@ -29,17 +29,181 @@ limitations under the License. namespace tensorflow { namespace { -bool EqualFunctionDef(const FunctionDef& a, const FunctionDef& b, - string* diff) { - // TODO(phawkins) use a more sophisticated equality test. 
- if (a.DebugString() != b.DebugString()) { +template +bool EqualProtoMap(const ::tensorflow::protobuf::Map& a, + const ::tensorflow::protobuf::Map& b, + const std::function& key_to_string, + const std::function& value_to_string, + const std::function& compare, + const string& map_name, string* diff) { + for (const auto& elt_a : a) { + const auto iter = b.find(elt_a.first); + if (iter == b.end()) { + if (diff) { + *diff = strings::StrCat( + map_name, " expected: contains element with key '", + key_to_string(elt_a.first), "' got: map has no such element"); + } + return false; + } + if (!compare(elt_a.first, elt_a.second, iter->second)) { + if (diff) { + *diff = strings::StrCat(map_name, " expected: element with key '", + key_to_string(elt_a.first), " has value '", + value_to_string(elt_a.second), "' got: '", + value_to_string(iter->second), "'"); + } + return false; + } + } + for (const auto& elt_b : b) { + const auto iter = a.find(elt_b.first); + if (iter == a.end()) { + if (diff) { + *diff = strings::StrCat(map_name, " got: contains element with key '", + key_to_string(elt_b.first), + "' expected: map has no such element"); + } + return false; + } + } + return true; +} + +bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b, + const string& diff_preamble, string* diff) { + if (a.op() != b.op()) { if (diff) { - *diff = strings::StrCat("Definition mismatch for function ", - a.signature().name(), ", expected:\n", - a.DebugString(), "\ngot:\n", b.DebugString()); + *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(), + ", expected op '", a.op(), "' got '", b.op()); } return false; } + if (a.device() != b.device()) { + if (diff) { + *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(), + ", expected device '", a.device(), "' got '", + b.device()); + } + return false; + } + if (a.input_size() != b.input_size()) { + if (diff) { + *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(), + ", expected ", a.input_size(), " inputs got ", + b.input_size(), " expected:\n", a.DebugString(), + "\ngot:\n", b.DebugString()); + } + return false; + } + for (int i = 0; i < a.input_size(); ++i) { + if (a.input(i) != b.input(i)) { + if (diff) { + *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(), + " input ", i, ", expected ", a.input(i), + " got ", b.input(i), " expected:\n", + a.DebugString(), "\ngot:\n", b.DebugString()); + } + return false; + } + } + return EqualProtoMap( + a.attr(), b.attr(), [](const string& s) { return s; }, + [](const AttrValue& v) { return v.DebugString(); }, + [](const string& key, const AttrValue& av, const AttrValue& bv) { + if (key == "shape_inference_graph") { + // Default serialization of GraphDef is unstable because maps don't + // serialize deterministically. Rather than go through the hoops to + // turn on deterministic serialization of this attr just for this + // test, add logic here to compare determinstically. 
+ GraphDef ga; + if (!ga.ParseFromString(av.s())) { + return false; + } + GraphDef gb; + if (!gb.ParseFromString(bv.s())) { + return false; + } + return EqualGraphDef(ga, gb, nullptr); + } else { + return av.DebugString() == bv.DebugString(); + } + }, + strings::StrCat(diff_preamble, " attr mismatch for node ", a.name()), + diff); +} + +bool EqualFunctionDef(const FunctionDef& a, const FunctionDef& b, + string* diff) { + if (a.signature().DebugString() != b.signature().DebugString()) { + if (diff) { + *diff = strings::StrCat("Signature mismatch for function ", + a.signature().name(), ", expected:\n", + a.signature().DebugString(), "\ngot:\n", + b.signature().DebugString()); + } + return false; + } + if (!EqualProtoMap( + a.attr(), b.attr(), [](const string& s) { return s; }, + [](const AttrValue& v) { return v.DebugString(); }, + [](const string& key, const AttrValue& av, const AttrValue& bv) { + return av.DebugString() == bv.DebugString(); + }, + strings::StrCat("attr mismatch for function ", a.signature().name()), + diff)) { + return false; + } + if (!EqualProtoMap( + a.ret(), b.ret(), [](const string& s) { return s; }, + [](const string& s) { return s; }, + [](const string& key, const string& av, const string& bv) { + return av == bv; + }, + strings::StrCat("ret mismatch for function ", a.signature().name()), + diff)) { + return false; + } + for (int i = 0; i < a.node_def_size(); ++i) { + bool found = false; + for (int j = 0; j < b.node_def_size(); ++j) { + if (a.node_def(i).name() == b.node_def(j).name()) { + if (!EqualFunctionNodeDef( + a.node_def(i), b.node_def(j), + strings::StrCat("Function ", a.signature().name()), diff)) { + return false; + } + found = true; + break; + } + } + if (!found) { + if (diff) { + *diff = strings::StrCat("Function ", a.signature().name(), + ", expected: has node '", a.node_def(i).name(), + "' got: no node of that name"); + } + return false; + } + } + for (int i = 0; i < b.node_def_size(); ++i) { + bool found = false; + for (int j = 0; j < a.node_def_size(); ++j) { + if (b.node_def(i).name() == a.node_def(j).name()) { + found = true; + break; + } + } + if (!found) { + if (diff) { + *diff = strings::StrCat("Function ", a.signature().name(), + ", got: has node '", b.node_def(i).name(), + "' expected: no node of that name"); + } + return false; + } + } return true; } @@ -84,29 +248,64 @@ bool EqualFunctionDefLibrary(const FunctionDefLibrary& expected, // TODO(misard): remove these fake registrations once there are real Ops to be // compiled. 
-REGISTER_OP("_XlaSendToHost") - .Input("input: dtypes") - .Attr("dtypes: list(type) >= 0"); - -REGISTER_OP("_XlaRecvFromHost") - .Output("output: dtypes") - .Attr("dtypes: list(type) >= 0"); +REGISTER_OP("_XlaHostCompute") + .Input("inputs: Tinputs") + .Output("outputs: Toutputs") + .Attr("Tinputs: list(type) >= 0") + .Attr("Toutputs: list(type) >= 0") + .Attr("key: string") + .SetShapeFn(::tensorflow::shape_inference::UnknownShape); REGISTER_OP("_XlaSendFromHost") - .Input("input: dtypes") - .Attr("dtypes: list(type) >= 0"); + .Input("input: Tinputs") + .Attr("Tinputs: list(type) >= 0") + .Attr("key: string") + .SetShapeFn(::tensorflow::shape_inference::UnknownShape); REGISTER_OP("_XlaRecvAtHost") - .Output("output: dtypes") - .Attr("dtypes: list(type) >= 0"); + .Output("output: Toutputs") + .Attr("Toutputs: list(type) >= 0") + .Attr("key: string") + .SetShapeFn(::tensorflow::shape_inference::UnknownShape); -REGISTER_OP("InputTest").Output("o: float"); +REGISTER_OP("InputTest") + .Output("o: float") + .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { + c->set_output(0, c->UnknownShape()); + return Status::OK(); + }); -REGISTER_OP("UnaryTest").Input("a: float").Output("o: float"); +REGISTER_OP("InputTestShaped") + .Output("o: float") + .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { + c->set_output(0, c->Vector(2)); + return Status::OK(); + }); + +REGISTER_OP("UnaryTest") + .Input("a: float") + .Output("o: float") + .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { + ::tensorflow::shape_inference::ShapeHandle o; + TF_RETURN_IF_ERROR(c->Merge(c->UnknownShape(), c->input(0), &o)); + c->set_output(0, o); + return Status::OK(); + }); REGISTER_OP("BinaryTest") .Input("a: float") .Input("b: float") - .Output("o: float"); + .Output("o: float") + .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { + ::tensorflow::shape_inference::ShapeHandle o; + TF_RETURN_IF_ERROR(c->Merge(c->UnknownShape(), c->input(0), &o)); + c->set_output(0, o); + return Status::OK(); + }); +REGISTER_OP("BinaryTest2") + .Input("a: float") + .Input("b: float") + .Output("o: float") + .SetShapeFn(::tensorflow::shape_inference::UnknownShape); REGISTER_OP("AddNLikeTest") .Input("inputs: N * T") @@ -124,22 +323,48 @@ Node* Input(const GraphDefBuilder::Options& opts) { return ops::SourceOp("InputTest", opts); } -Node* RecvAtHost(const gtl::ArraySlice& dtypes, +Node* InputShaped(const GraphDefBuilder::Options& opts) { + return ops::SourceOp("InputTestShaped", opts); +} + +Node* KnownShape(const gtl::ArraySlice& shape, + const GraphDefBuilder::Options& opts) { + if (opts.HaveError()) return nullptr; + NodeBuilder node_builder(opts.GetNameForOp("Const"), "Const", + opts.op_registry()); + TensorProto value; + value.set_dtype(DT_FLOAT); + for (int dim : shape) { + value.mutable_tensor_shape()->add_dim()->set_size(dim); + } + return opts.WithAttr("value", value) + .WithAttr("dtype", DT_FLOAT) + .FinalizeBuilder(&node_builder); +} + +Node* RecvAtHost(const string& key, const gtl::ArraySlice& dtypes, const GraphDefBuilder::Options& opts) { if (opts.HaveError()) return nullptr; NodeBuilder node_builder(opts.GetNameForOp("_XlaRecvAtHost"), "_XlaRecvAtHost", opts.op_registry()); - return opts.WithAttr("dtypes", dtypes).FinalizeBuilder(&node_builder); + return opts.WithAttr("Toutputs", dtypes) + .WithAttr("key", key) + .FinalizeBuilder(&node_builder); } -Node* SendFromHost(const std::vector& inputs, - const gtl::ArraySlice& dtypes, +Node* SendFromHost(const string& 
key, const std::vector& inputs, const GraphDefBuilder::Options& opts) { if (opts.HaveError()) return nullptr; NodeBuilder node_builder(opts.GetNameForOp("_XlaSendFromHost"), "_XlaSendFromHost", opts.op_registry()); node_builder.Input(inputs); - return opts.WithAttr("dtypes", dtypes).FinalizeBuilder(&node_builder); + std::vector dtypes; + for (const auto& node : inputs) { + dtypes.push_back(node.dt); + } + return opts.WithAttr("key", key) + .WithAttr("Tinputs", dtypes) + .FinalizeBuilder(&node_builder); } Node* Unary(ops::NodeOut a, const GraphDefBuilder::Options& opts) { @@ -151,6 +376,11 @@ Node* Binary(ops::NodeOut a, ops::NodeOut b, return ops::BinaryOp("BinaryTest", std::move(a), std::move(b), opts); } +Node* BinaryUnknownShape(ops::NodeOut a, ops::NodeOut b, + const GraphDefBuilder::Options& opts) { + return ops::BinaryOp("BinaryTest2", std::move(a), std::move(b), opts); +} + Node* AddNLike(const std::vector& inputs, const GraphDefBuilder::Options& opts) { if (opts.HaveError()) return nullptr; @@ -576,6 +806,21 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) { FunctionDefLibrary library_expected; GraphDef graphdef_expected; + string shape_string_expected; + { + GraphDefBuilder shape(GraphDefBuilder::kFailImmediately); + Node* recv = + RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT}, + shape.opts().WithName("outside_compilation_F1_O1_recv")); + Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1), + shape.opts().WithName("E")); + SendFromHost("host_compute_channel_F1_O1", {e}, + shape.opts().WithName("outside_compilation_F1_O1_send")); + GraphDef shape_graph; + TF_EXPECT_OK(shape.ToGraphDef(&shape_graph)); + EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected)); + } + *library_expected.add_function() = test::function::XTimesTwo(); *library_expected.add_function() = FunctionDefHelper::Create( "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {}, @@ -584,19 +829,18 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) { {{"c"}, "BinaryTest", {"b_0_arg", "C:o:0"}, {}, {"C"}}, {{"F"}, "BinaryTest", - {"C:o:0", "outside_compilation_O1_recv:output:0"}, + {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"}, {}, - {"outside_compilation_O1_recv"}}, - {{"outside_compilation_O1_send"}, - "_XlaSendToHost", + {"outside_compilation_O1_host_compute"}}, + {{"outside_compilation_O1_host_compute"}, + "_XlaHostCompute", {"C:o:0", "c:o:0"}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", shape_string_expected}, + {"shapes", gtl::ArraySlice({})}}, {"c"}}, - {{"outside_compilation_O1_recv"}, - "_XlaRecvFromHost", - {}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT})}}, - {"outside_compilation_O1_send"}}, }, {{"f_0_retval", "F:o:0"}}); @@ -612,11 +856,11 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) { Node* call = b2.opts().FinalizeBuilder(&node_builder); Node* recv = - RecvAtHost({DT_FLOAT, DT_FLOAT}, + RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_recv")); Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1), b2.opts().WithName("E").WithControlInputs({recv, b})); - Node* send = SendFromHost({e}, {DT_FLOAT}, + Node* send = SendFromHost("host_compute_channel_F1_O1", {e}, b2.opts() .WithName("outside_compilation_F1_O1_send") .WithControlInput(e)); @@ -674,37 +918,71 @@ 
TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { FunctionDefLibrary library_expected; GraphDef graphdef_expected; + string shape_string_expected_1; + { + GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately); + Node* recv = + RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT}, + shape1.opts().WithName("outside_compilation_F1_O1_recv")); + Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1), + shape1.opts().WithName("E")); + SendFromHost("host_compute_channel_F1_O1", {e}, + shape1.opts().WithName("outside_compilation_F1_O1_send")); + GraphDef shape1_graph; + TF_EXPECT_OK(shape1.ToGraphDef(&shape1_graph)); + EXPECT_TRUE(shape1_graph.SerializeToString(&shape_string_expected_1)); + } + + string shape_string_expected_2; + { + GraphDefBuilder shape2(GraphDefBuilder::kFailImmediately); + Node* recv1 = + RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT}, + shape2.opts().WithName("outside_compilation_F1_O1_recv")); + Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1), + shape2.opts().WithName("E")); + Node* recv2 = + RecvAtHost("host_compute_channel_F1_O2", {DT_FLOAT, DT_FLOAT}, + shape2.opts().WithName("outside_compilation_F1_O2_recv")); + Node* h = Binary(ops::NodeOut(recv2, 0), e, shape2.opts().WithName("H")); + SendFromHost("host_compute_channel_F1_O2", {h}, + shape2.opts().WithName("outside_compilation_F1_O2_send")); + GraphDef shape2_graph; + TF_EXPECT_OK(shape2.ToGraphDef(&shape2_graph)); + EXPECT_TRUE(shape2_graph.SerializeToString(&shape_string_expected_2)); + } + *library_expected.add_function() = FunctionDefHelper::Create( "F1", {"a_0_arg:float", "b_0_arg:float"}, {"i_0_retval:float"}, {}, { {{"C"}, "UnaryTest", {"a_0_arg"}}, {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}, {}}, - {{"I"}, "UnaryTest", {"outside_compilation_O2_recv:output:0"}}, + {{"I"}, + "UnaryTest", + {"outside_compilation_O2_host_compute:outputs:0"}}, {{"F"}, "BinaryTest", - {"C:o:0", "outside_compilation_O1_recv:output:0"}, + {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"}, {}, - {"outside_compilation_O1_recv"}}, - {{"outside_compilation_O2_send"}, - "_XlaSendToHost", + {"outside_compilation_O1_host_compute"}}, + {{"outside_compilation_O2_host_compute"}, + "_XlaHostCompute", {"D:o:0", "F:o:0"}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"key", "host_compute_channel_F1_O2"}, + {"shape_inference_graph", shape_string_expected_2}, + {"shapes", gtl::ArraySlice({})}}, {"F"}}, - {{"outside_compilation_O1_send"}, - "_XlaSendToHost", + {{"outside_compilation_O1_host_compute"}, + "_XlaHostCompute", {"C:o:0", "D:o:0"}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", shape_string_expected_1}, + {"shapes", gtl::ArraySlice({})}}, {"D"}}, - {{"outside_compilation_O2_recv"}, - "_XlaRecvFromHost", - {}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT})}}, - {"outside_compilation_O2_send"}}, - {{"outside_compilation_O1_recv"}, - "_XlaRecvFromHost", - {}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT})}}, - {"outside_compilation_O1_send"}}, }, {{"i_0_retval", "I:o:0"}}); @@ -720,23 +998,24 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { Node* call = b2.opts().FinalizeBuilder(&node_builder); Node* recv1 = - RecvAtHost({DT_FLOAT, DT_FLOAT}, + RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, 
DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_recv")); Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1), b2.opts().WithName("E").WithControlInputs({recv1, b})); - Node* send1 = SendFromHost({e}, {DT_FLOAT}, + Node* send1 = SendFromHost("host_compute_channel_F1_O1", {e}, b2.opts() .WithName("outside_compilation_F1_O1_send") .WithControlInput(e)); Node* recv2 = - RecvAtHost({DT_FLOAT, DT_FLOAT}, + RecvAtHost("host_compute_channel_F1_O2", {DT_FLOAT, DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O2_recv")); Node* g = Binary(e, ops::NodeOut(recv2, 1), b2.opts().WithName("G").WithControlInputs({recv2, e})); Node* h = Binary(ops::NodeOut(recv2, 0), e, b2.opts().WithName("H")); - Node* send2 = SendFromHost( - {h}, {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O2_send")); + Node* send2 = + SendFromHost("host_compute_channel_F1_O2", {h}, + b2.opts().WithName("outside_compilation_F1_O2_send")); Node* s = NoOp(b2.opts() .WithName("F1_sequencer") @@ -758,8 +1037,8 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { { GraphDefBuilder b1(GraphDefBuilder::kFailImmediately); - Node* a = Input(b1.opts().WithName("A")); - Node* b = Input(b1.opts().WithName("B")); + Node* a = InputShaped(b1.opts().WithName("A")); + Node* b = InputShaped(b1.opts().WithName("B")); Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1")); Node* d = Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1")); @@ -791,6 +1070,24 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { FunctionDefLibrary library_expected; GraphDef graphdef_expected; + string shape_string_expected; + { + GraphDefBuilder shape(GraphDefBuilder::kFailImmediately); + Node* recv = + RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT}, + shape.opts().WithName("outside_compilation_F1_O1_recv")); + Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1), + shape.opts().WithName("E")); + SendFromHost("host_compute_channel_F1_O1", {e}, + shape.opts().WithName("outside_compilation_F1_O1_send")); + GraphDef shape_graph; + TF_EXPECT_OK(shape.ToGraphDef(&shape_graph)); + EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected)); + } + + TensorShapeProto shape_proto_expected; + shape_proto_expected.add_dim()->set_size(2); + *library_expected.add_function() = FunctionDefHelper::Create( "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float", "d_0_retval:float"}, {}, @@ -799,19 +1096,18 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}}, {{"F"}, "BinaryTest", - {"C:o:0", "outside_compilation_O1_recv:output:0"}, + {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"}, {}, - {"outside_compilation_O1_recv"}}, - {{"outside_compilation_O1_send"}, - "_XlaSendToHost", + {"outside_compilation_O1_host_compute"}}, + {{"outside_compilation_O1_host_compute"}, + "_XlaHostCompute", {"C:o:0", "D:o:0"}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", shape_string_expected}, + {"shapes", gtl::ArraySlice({})}}, {"D"}}, - {{"outside_compilation_O1_recv"}, - "_XlaRecvFromHost", - {}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT})}}, - {"outside_compilation_O1_send"}}, }, {{"d_0_retval", "D:o:0"}, {"f_0_retval", "F:o:0"}}); @@ -822,16 +1118,16 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { {{"G"}, "BinaryTest", {"e_0_arg", "f_0_arg"}}, 
{{"I"}, "BinaryTest", - {"f_0_arg", "outside_compilation_O1_recv:output:0"}}, - {{"outside_compilation_O1_send"}, - "_XlaSendToHost", + {"f_0_arg", "outside_compilation_O1_host_compute:outputs:0"}}, + {{"outside_compilation_O1_host_compute"}, + "_XlaHostCompute", {"G:o:0"}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT})}}}, - {{"outside_compilation_O1_recv"}, - "_XlaRecvFromHost", - {}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT})}}, - {"outside_compilation_O1_send"}}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"key", "host_compute_channel_F2_O1"}, + {"shape_inference_graph", ""}, + {"shapes", + gtl::ArraySlice({shape_proto_expected})}}}, }, {{"g_0_retval", "G:o:0"}, {"i_0_retval", "I:o:0"}}); @@ -839,15 +1135,15 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { std::unique_ptr lib_def( new FunctionLibraryDefinition(OpRegistry::Global(), library_expected)); GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get()); - Node* a = Input(b2.opts().WithName("A")); - Node* b = Input(b2.opts().WithName("B")); + Node* a = InputShaped(b2.opts().WithName("A")); + Node* b = InputShaped(b2.opts().WithName("B")); Node* recv1 = - RecvAtHost({DT_FLOAT, DT_FLOAT}, + RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_recv")); Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1), b2.opts().WithName("E").WithControlInputs({recv1, b})); - Node* send1 = SendFromHost({e}, {DT_FLOAT}, + Node* send1 = SendFromHost("host_compute_channel_F1_O1", {e}, b2.opts() .WithName("outside_compilation_F1_O1_send") .WithControlInput(e)); @@ -857,12 +1153,14 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { Node* s1 = NoOp( b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1})); - Node* recv2 = RecvAtHost( - {DT_FLOAT}, b2.opts().WithName("outside_compilation_F2_O1_recv")); + Node* recv2 = + RecvAtHost("host_compute_channel_F2_O1", {DT_FLOAT}, + b2.opts().WithName("outside_compilation_F2_O1_recv")); Node* h = Binary(ops::NodeOut(call1, 1), recv2, b2.opts().WithName("H").WithControlInput(s1)); - Node* send2 = SendFromHost( - {h}, {DT_FLOAT}, b2.opts().WithName("outside_compilation_F2_O1_send")); + Node* send2 = + SendFromHost("host_compute_channel_F2_O1", {h}, + b2.opts().WithName("outside_compilation_F2_O1_send")); NodeBuilder node_builder2("F2", "F2", lib_def.get()); node_builder2.Input(e).Input(call1); @@ -888,7 +1186,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) { { GraphDefBuilder b1(GraphDefBuilder::kFailImmediately); - Node* a = Input(b1.opts().WithName("A")); + Node* a = InputShaped(b1.opts().WithName("A")); Node* b = Input(b1.opts().WithName("B")); Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1")); Node* d = @@ -908,6 +1206,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) { FunctionDefLibrary library_expected; GraphDef graphdef_expected; + TensorShapeProto shape_proto_expected; + shape_proto_expected.add_dim()->set_size(2); + *library_expected.add_function() = FunctionDefHelper::Create( "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {}, { @@ -915,11 +1216,16 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) { {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}}, {{"F"}, "BinaryTest", - {"D:o:0", "outside_compilation_O1_recv:output:0"}}, - {{"outside_compilation_O1_recv"}, - "_XlaRecvFromHost", + {"D:o:0", "outside_compilation_O1_host_compute:outputs:0"}}, + 
{{"outside_compilation_O1_host_compute"}, + "_XlaHostCompute", {}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT})}}}, + {{"Tinputs", gtl::ArraySlice({})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", ""}, + {"shapes", + gtl::ArraySlice({shape_proto_expected})}}}, }, {{"f_0_retval", "F:o:0"}}); @@ -927,12 +1233,13 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) { std::unique_ptr lib_def( new FunctionLibraryDefinition(OpRegistry::Global(), library_expected)); GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get()); - Node* a = Input(b2.opts().WithName("A")); + Node* a = InputShaped(b2.opts().WithName("A")); Node* b = Input(b2.opts().WithName("B")); Node* e = Unary(a, b2.opts().WithName("E")); - Node* send1 = SendFromHost( - {e}, {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_send")); + Node* send1 = + SendFromHost("host_compute_channel_F1_O1", {e}, + b2.opts().WithName("outside_compilation_F1_O1_send")); NodeBuilder node_builder1("F1", "F1", lib_def.get()); node_builder1.Input(a).Input(b); Node* call1 = b2.opts().FinalizeBuilder(&node_builder1); @@ -954,7 +1261,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) { { GraphDefBuilder b1(GraphDefBuilder::kFailImmediately); - Node* a = Input(b1.opts().WithName("A")); + Node* a = InputShaped(b1.opts().WithName("A")); Node* b = Input(b1.opts().WithName("B")); Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1")); Node* d = @@ -975,6 +1282,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) { FunctionDefLibrary library_expected; GraphDef graphdef_expected; + TensorShapeProto shape_proto_expected; + shape_proto_expected.add_dim()->set_size(2); + *library_expected.add_function() = FunctionDefHelper::Create( "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {}, { @@ -982,17 +1292,17 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) { {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}}, {{"F"}, "BinaryTest", - {"D:o:0", "outside_compilation_O1_recv:output:0"}}, - {{"outside_compilation_O1_send"}, - "_XlaSendToHost", + {"D:o:0", "outside_compilation_O1_host_compute:outputs:0"}}, + {{"outside_compilation_O1_host_compute"}, + "_XlaHostCompute", {}, - {{"dtypes", gtl::ArraySlice({})}}, + {{"Tinputs", gtl::ArraySlice({})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", ""}, + {"shapes", + gtl::ArraySlice({shape_proto_expected})}}, {"D"}}, - {{"outside_compilation_O1_recv"}, - "_XlaRecvFromHost", - {}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT})}}, - {"outside_compilation_O1_send"}}, }, {{"f_0_retval", "F:o:0"}}); @@ -1000,14 +1310,16 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) { std::unique_ptr lib_def( new FunctionLibraryDefinition(OpRegistry::Global(), library_expected)); GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get()); - Node* a = Input(b2.opts().WithName("A")); + Node* a = InputShaped(b2.opts().WithName("A")); Node* b = Input(b2.opts().WithName("B")); Node* recv1 = - RecvAtHost({}, b2.opts().WithName("outside_compilation_F1_O1_recv")); + RecvAtHost("host_compute_channel_F1_O1", {}, + b2.opts().WithName("outside_compilation_F1_O1_recv")); Node* e = Unary(a, b2.opts().WithName("E").WithControlInput(recv1)); - Node* send1 = SendFromHost( - {e}, {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_send")); + Node* send1 = + SendFromHost("host_compute_channel_F1_O1", {e}, + 
b2.opts().WithName("outside_compilation_F1_O1_send")); NodeBuilder node_builder1("F1", "F1", lib_def.get()); node_builder1.Input(a).Input(b); Node* call1 = b2.opts().FinalizeBuilder(&node_builder1); @@ -1055,10 +1367,14 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) { {{"C"}, "UnaryTest", {"a_0_arg"}}, {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}}, {{"F"}, "UnaryTest", {"D:o:0"}}, - {{"outside_compilation_O1_send"}, - "_XlaSendToHost", + {{"outside_compilation_O1_host_compute"}, + "_XlaHostCompute", {"D:o:0"}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT})}}}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", ""}, + {"shapes", gtl::ArraySlice({})}}}, }, {{"f_0_retval", "F:o:0"}}); @@ -1069,8 +1385,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) { Node* a = Input(b2.opts().WithName("A")); Node* b = Input(b2.opts().WithName("B")); - Node* recv1 = RecvAtHost( - {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_recv")); + Node* recv1 = + RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT}, + b2.opts().WithName("outside_compilation_F1_O1_recv")); Node* e = Unary(recv1, b2.opts().WithName("E")); NodeBuilder node_builder1("F1", "F1", lib_def.get()); node_builder1.Input(a).Input(b); @@ -1118,16 +1435,19 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) { { {{"C"}, "UnaryTest", {"a_0_arg"}}, {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}}, - {{"F"}, "UnaryTest", {"D:o:0"}, {}, {"outside_compilation_O1_recv"}}, - {{"outside_compilation_O1_send"}, - "_XlaSendToHost", + {{"F"}, + "UnaryTest", {"D:o:0"}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT})}}}, - {{"outside_compilation_O1_recv"}, - "_XlaRecvFromHost", {}, - {{"dtypes", gtl::ArraySlice({})}}, - {"outside_compilation_O1_send"}}, + {"outside_compilation_O1_host_compute"}}, + {{"outside_compilation_O1_host_compute"}, + "_XlaHostCompute", + {"D:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", ""}, + {"shapes", gtl::ArraySlice({})}}}, }, {{"f_0_retval", "F:o:0"}}); @@ -1138,10 +1458,11 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) { Node* a = Input(b2.opts().WithName("A")); Node* b = Input(b2.opts().WithName("B")); - Node* recv1 = RecvAtHost( - {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_recv")); + Node* recv1 = + RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT}, + b2.opts().WithName("outside_compilation_F1_O1_recv")); Node* e = Unary(recv1, b2.opts().WithName("E")); - Node* send1 = SendFromHost({}, {}, + Node* send1 = SendFromHost("host_compute_channel_F1_O1", {}, b2.opts() .WithName("outside_compilation_F1_O1_send") .WithControlInput(e)); @@ -1215,5 +1536,110 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) { TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library); } +// Test for shape inference of outside compilation. +TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) { + FunctionDefLibrary library; + GraphDef graphdef; + + { + *library.add_function() = test::function::XTimesTwo(); + + GraphDefBuilder b1(GraphDefBuilder::kFailImmediately); + Node* a = InputShaped(b1.opts().WithName("A")); + Node* b = Input(b1.opts().WithName("B")); + // Give nodes 'c' and 'd' names that collide after lowercasing. 
+ Node* c = Unary(a, b1.opts().WithName("C")); + Node* d = Unary(b, b1.opts().WithName("c").WithControlInput(c).WithAttr( + "_encapsulate", "F1")); + Node* e = BinaryUnknownShape(c, d, + b1.opts() + .WithName("E") + .WithControlInputs({b, d}) + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* f = Binary(c, e, + b1.opts().WithName("F").WithControlInput(e).WithAttr( + "_encapsulate", "F1")); + Binary(a, f, b1.opts().WithName("G").WithControlInput(e)); + TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); + } + + TF_EXPECT_OK(Encapsulate(&graphdef, &library)); + + FunctionDefLibrary library_expected; + GraphDef graphdef_expected; + + string shape_string_expected; + { + GraphDefBuilder shape(GraphDefBuilder::kFailImmediately); + Node* known = KnownShape({2}, shape.opts().WithName("KnownShape/_0")); + Node* recv = + RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT}, + shape.opts().WithName("outside_compilation_F1_O1_recv")); + Node* e = BinaryUnknownShape(known, recv, shape.opts().WithName("E")); + SendFromHost("host_compute_channel_F1_O1", {e}, + shape.opts().WithName("outside_compilation_F1_O1_send")); + GraphDef shape_graph; + TF_EXPECT_OK(shape.ToGraphDef(&shape_graph)); + EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected)); + } + + *library_expected.add_function() = test::function::XTimesTwo(); + *library_expected.add_function() = FunctionDefHelper::Create( + "F1", {"b_0_arg:float", "c_0_arg:float"}, {"f_0_retval:float"}, {}, + { + {{"c"}, "UnaryTest", {"b_0_arg"}, {}, {}}, + {{"F"}, + "BinaryTest", + {"c_0_arg", "outside_compilation_O1_host_compute:outputs:0"}, + {}, + {"outside_compilation_O1_host_compute"}}, + {{"outside_compilation_O1_host_compute"}, + "_XlaHostCompute", + {"c:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", shape_string_expected}, + {"shapes", gtl::ArraySlice({})}}, + {"c"}}, + }, + {{"f_0_retval", "F:o:0"}}); + + { + std::unique_ptr lib_def( + new FunctionLibraryDefinition(OpRegistry::Global(), library_expected)); + GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get()); + Node* a = InputShaped(b2.opts().WithName("A")); + Node* b = Input(b2.opts().WithName("B")); + Node* c = Unary(a, b2.opts().WithName("C")); + + NodeBuilder node_builder("F1", "F1", lib_def.get()); + node_builder.Input(b).Input(c); + Node* call = + b2.opts().WithControlInputs({c}).FinalizeBuilder(&node_builder); + + Node* recv = + RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT}, + b2.opts().WithName("outside_compilation_F1_O1_recv")); + Node* e = BinaryUnknownShape( + c, ops::NodeOut(recv, 0), + b2.opts().WithName("E").WithControlInputs({recv, b})); + Node* send = SendFromHost("host_compute_channel_F1_O1", {e}, + b2.opts() + .WithName("outside_compilation_F1_O1_send") + .WithControlInput(e)); + + Node* s = NoOp( + b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send})); + + Binary(a, call, b2.opts().WithName("G").WithControlInputs({s, e})); + TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected)); + } + + TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef); + TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc index 4842877d9af..6353149e4af 100644 --- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc +++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc @@ -45,7 +45,7 @@ 
namespace tensorflow { // see comment on `AllowsAsynchronousDeallocation()`. class XlaAllocator : public xla::DeviceMemoryAllocator { public: - XlaAllocator(gpu::Platform* platform, OpKernelContext* op_context); + XlaAllocator(const gpu::Platform* platform, OpKernelContext* op_context); ~XlaAllocator() override; xla::StatusOr Allocate(int device_ordinal, uint64 size, bool retry_on_failure) override; @@ -79,7 +79,8 @@ class XlaAllocator : public xla::DeviceMemoryAllocator { std::unordered_map tensors_; }; -XlaAllocator::XlaAllocator(gpu::Platform* platform, OpKernelContext* op_context) +XlaAllocator::XlaAllocator(const gpu::Platform* platform, + OpKernelContext* op_context) : xla::DeviceMemoryAllocator(platform), op_context_(op_context) {} XlaAllocator::~XlaAllocator() = default; @@ -248,12 +249,16 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { xla::LocalClient* client = static_cast(cache->client()); + // Builds an XLA allocator for the device. + XlaAllocator xla_allocator(client->platform(), ctx); + XlaCompiler::Options options; options.client = client; options.device_type = &cache->device_type(); options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition(); options.graph_def_version = ctx->function_library()->graph_def_version(); options.allow_cpu_custom_calls = (platform_id_ == gpu::host::kHostPlatformId); + options.device_allocator = &xla_allocator; const XlaCompiler::CompilationResult* kernel; xla::LocalExecutable* executable; @@ -264,9 +269,6 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { VLOG(1) << "Executing XLA Computation..."; - // Builds an XLA allocator for the device. - XlaAllocator xla_allocator(client->platform(), ctx); - std::unique_ptr output; // Build xla::ShapedBuffers that point directly to the Tensor buffers. std::vector> arg_buffers; @@ -374,8 +376,6 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { OP_REQUIRES(ctx, write.input_index >= 0 && write.input_index < ctx->num_inputs(), errors::Internal("Invalid input index for variable write.")); - TensorShape write_shape; - OP_REQUIRES_OK(ctx, XLAShapeToTensorShape(write.shape, &write_shape)); gpu::DeviceMemoryBase buffer = output->buffer({output_num}); @@ -397,7 +397,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { // Looks up the owning Tensor by buffer address. 
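    // (write.shape is consumed directly here because compiler arguments and
    // resource updates now carry a TensorShape rather than an xla::Shape, so
    // the XLAShapeToTensorShape conversion removed above is no longer needed.)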
OP_REQUIRES_OK( - ctx, xla_allocator.MakeTensorFromBuffer(buffer, write.type, write_shape, + ctx, xla_allocator.MakeTensorFromBuffer(buffer, write.type, write.shape, variable->tensor())); ++output_num; } diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc index bfff52c55a7..6d854a920eb 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.cc +++ b/tensorflow/compiler/jit/xla_compilation_cache.cc @@ -148,8 +148,7 @@ Status BuildArguments(int num_constant_args, XlaCompiler::Argument& arg = (*args)[input_num]; arg.kind = XlaCompiler::Argument::kConstant; arg.type = input.dtype(); - TF_RETURN_IF_ERROR( - TensorShapeToXLAShape(input.dtype(), input.shape(), &arg.shape)); + arg.shape = input.shape(); arg.constant_value = input; ++input_num; } @@ -170,8 +169,7 @@ Status BuildArguments(int num_constant_args, arg.constant_value = input; } arg.type = input.dtype(); - TF_RETURN_IF_ERROR( - TensorShapeToXLAShape(input.dtype(), input.shape(), &arg.shape)); + arg.shape = input.shape(); ++input_num; } @@ -189,8 +187,7 @@ Status BuildArguments(int num_constant_args, if (variable_args[variable_id].present) { const Tensor& value = variable_args[variable_id].value; arg.type = value.dtype(); - TF_RETURN_IF_ERROR( - TensorShapeToXLAShape(value.dtype(), value.shape(), &arg.shape)); + arg.shape = value.shape(); arg.initialized = true; } else { // The values of uninitialized variables are not passed as inputs, since @@ -199,7 +196,7 @@ Status BuildArguments(int num_constant_args, // uninitialized variables. arg.initialized = false; arg.type = DT_INVALID; - arg.shape = xla::Shape(); + arg.shape = TensorShape(); } ++input_num; } @@ -223,6 +220,7 @@ Status XlaCompilationCache::BuildExecutable( xla::ExecutableBuildOptions build_options; build_options.set_device_ordinal(client_->default_device_ordinal()); build_options.set_result_layout(result.xla_output_shape); + build_options.set_device_allocator(options.device_allocator); auto compile_result = client_->Compile(*result.computation, argument_layouts, build_options); diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 314f5506b16..b0b038775f7 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -144,6 +144,21 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "matrix_triangular_solve_op_test", + size = "small", + srcs = ["matrix_triangular_solve_op_test.py"], + tags = ["optonly"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:math_ops", + "//tensorflow/python:platform_test", + "//tensorflow/python:training", + ], +) + tf_xla_py_test( name = "clustering_test", size = "small", @@ -240,6 +255,18 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "extract_image_patches_op_test", + size = "small", + srcs = ["extract_image_patches_op_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:platform_test", + ], +) + tf_xla_py_test( name = "fft_test", size = "medium", @@ -326,6 +353,19 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "matrix_band_part_test", + size = "medium", + srcs = ["matrix_band_part_test.py"], + tags = ["optonly"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:platform_test", + ], +) + tf_xla_py_test( name = "momentum_test", size = "small", @@ 
-437,6 +477,18 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "reverse_sequence_op_test", + size = "small", + srcs = ["reverse_sequence_op_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:platform_test", + ], +) + tf_xla_py_test( name = "rmsprop_test", size = "small", diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index c95fb1c5152..30a6d3a74d6 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -1181,6 +1181,50 @@ class BinaryOpsTest(XLATestCase): np.array([4, 5, 6], dtype=np.int32), expected=None) + def testMatrixSetDiag(self): + for dtype in self.numeric_types: + # Square + self._testBinary( + array_ops.matrix_set_diag, + np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0]], + dtype=dtype), + np.array([1.0, 2.0, 3.0], dtype=dtype), + expected=np.array([[1.0, 1.0, 0.0], [1.0, 2.0, 1.0], [1.0, 1.0, 3.0]], + dtype=dtype)) + + self._testBinary( + array_ops.matrix_set_diag, + np.array([[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0], [1.0, 0.0, 3.0]], + [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0], [2.0, 0.0, 6.0]]], + dtype=dtype), + np.array([[-1.0, 0.0, -3.0], [-4.0, -5.0, -6.0]], dtype=dtype), + expected=np.array( + [[[-1.0, 0.0, 3.0], [0.0, 0.0, 0.0], [1.0, 0.0, -3.0]], + [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0], [2.0, 0.0, -6.0]]], + dtype=dtype)) + + # Rectangular + self._testBinary( + array_ops.matrix_set_diag, + np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0]], dtype=dtype), + np.array([3.0, 4.0], dtype=dtype), + expected=np.array([[3.0, 1.0, 0.0], [1.0, 4.0, 1.0]], dtype=dtype)) + + self._testBinary( + array_ops.matrix_set_diag, + np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0]], dtype=dtype), + np.array([3.0, 4.0], dtype=dtype), + expected=np.array([[3.0, 1.0], [1.0, 4.0], [1.0, 1.0]], dtype=dtype)) + + self._testBinary( + array_ops.matrix_set_diag, + np.array([[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0]], + [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0]]], dtype=dtype), + np.array([[-1.0, -2.0], [-4.0, -5.0]], + dtype=dtype), + expected=np.array([[[-1.0, 0.0, 3.0], [0.0, -2.0, 0.0]], + [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0]]], + dtype=dtype)) if __name__ == "__main__": googletest.main() diff --git a/tensorflow/compiler/tests/extract_image_patches_op_test.py b/tensorflow/compiler/tests/extract_image_patches_op_test.py new file mode 100644 index 00000000000..0361702e7af --- /dev/null +++ b/tensorflow/compiler/tests/extract_image_patches_op_test.py @@ -0,0 +1,134 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Functional tests for ExtractImagePatches op.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import test + + +class ExtractImagePatches(XLATestCase): + """Functional tests for ExtractImagePatches op.""" + + def _VerifyValues(self, image, ksizes, strides, rates, padding, patches): + """Tests input-output pairs for the ExtractImagePatches op. + + Args: + image: Input tensor with shape: [batch, in_rows, in_cols, depth]. + ksizes: Patch size specified as: [ksize_rows, ksize_cols]. + strides: Output strides, specified as [stride_rows, stride_cols]. + rates: Atrous rates, specified as [rate_rows, rate_cols]. + padding: Padding type. + patches: Expected output. + """ + ksizes = [1] + ksizes + [1] + strides = [1] + strides + [1] + rates = [1] + rates + [1] + + with self.test_session(): + image_placeholder = array_ops.placeholder(dtypes.float32) + with self.test_scope(): + out_tensor = array_ops.extract_image_patches( + image_placeholder, + ksizes=ksizes, + strides=strides, + rates=rates, + padding=padding, + name="im2col") + feed_dict = {image_placeholder: image} + self.assertAllClose(patches, out_tensor.eval(feed_dict=feed_dict)) + + def testKsize1x1Stride1x1Rate1x1(self): + """Verifies that for 1x1 kernel the output equals the input.""" + # [2, 3, 4, 5] + image = np.reshape(range(120), [2, 3, 4, 5]) + # [2, 3, 4, 5] + patches = np.reshape(range(120), [2, 3, 4, 5]) + for padding in ["VALID", "SAME"]: + self._VerifyValues( + image, + ksizes=[1, 1], + strides=[1, 1], + rates=[1, 1], + padding=padding, + patches=patches) + + def testKsize1x1Stride2x3Rate1x1(self): + """Test for 1x1 kernel and strides.""" + # [2, 4, 5, 3] + image = np.reshape(range(120), [2, 4, 5, 3]) + # [2, 2, 2, 3] + patches = image[:, ::2, ::3, :] + for padding in ["VALID", "SAME"]: + self._VerifyValues( + image, + ksizes=[1, 1], + strides=[2, 3], + rates=[1, 1], + padding=padding, + patches=patches) + + def testKsize2x2Stride1x1Rate1x1Valid(self): + """Test for 2x2 kernel with VALID padding.""" + # [1, 2, 2, 1] + image = [[[[1], [2]], [[3], [4]]]] + # [1, 1, 1, 4] + patches = [[[[1, 2, 3, 4]]]] + self._VerifyValues( + image, + ksizes=[2, 2], + strides=[1, 1], + rates=[1, 1], + padding="VALID", + patches=patches) + + def testKsize2x2Stride1x1Rate1x1Same(self): + """Test for 2x2 kernel with SAME padding.""" + # [1, 2, 2, 1] + image = [[[[1], [2]], [[3], [4]]]] + # [1, 2, 2, 4] + patches = [[[[1, 2, 3, 4], [2, 0, 4, 0]], [[3, 4, 0, 0], [4, 0, 0, 0]]]] + self._VerifyValues( + image, + ksizes=[2, 2], + strides=[1, 1], + rates=[1, 1], + padding="SAME", + patches=patches) + + def testKsize2x2Stride1x1Rate2x2Valid(self): + """Test for 2x2 kernel with 2x2 dilation.""" + # [1, 2, 2, 1] + image = np.arange(16).reshape(1, 4, 4, 1).astype(np.float32) + # [1, 2, 2, 4] + patches = [[[[0, 2, 8, 10], [1, 3, 9, 11]], + [[4, 6, 12, 14], [5, 7, 13, 15]]]] + self._VerifyValues( + image, + ksizes=[2, 2], + strides=[1, 1], + rates=[2, 2], + padding="VALID", + patches=patches) + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/matrix_band_part_test.py b/tensorflow/compiler/tests/matrix_band_part_test.py new file mode 100644 index 00000000000..29394f9ea51 --- 
/dev/null +++ b/tensorflow/compiler/tests/matrix_band_part_test.py @@ -0,0 +1,64 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import test + + +class MatrixBandPartTest(XLATestCase): + + def _testMatrixBandPart(self, dtype, shape): + with self.test_session(): + batch_shape = shape[:-2] + mat = np.ones(shape).astype(dtype) + batch_mat = np.tile(mat, batch_shape + [1, 1]) + for lower in -1, 0, 1, shape[-2] - 1: + for upper in -1, 0, 1, shape[-1] - 1: + band_np = mat + if lower >= 0: + band_np = np.triu(band_np, -lower) + if upper >= 0: + band_np = np.tril(band_np, upper) + if batch_shape: + band_np = np.tile(band_np, batch_shape + [1, 1]) + + placeholder = array_ops.placeholder(dtype) + with self.test_scope(): + band = array_ops.matrix_band_part( + placeholder, + constant_op.constant(lower, dtype=dtypes.int32), + constant_op.constant(upper, dtype=dtypes.int32)) + feed_dict = {placeholder: batch_mat} + self.assertAllEqual(band_np, band.eval(feed_dict=feed_dict)) + + def testMatrixBandPart(self): + for dtype in self.float_types: + for batch_shape in [[], [2,], [1, 3, 2]]: + for rows in 1, 2, 7: + for cols in 1, 2, 7: + self._testMatrixBandPart(dtype, batch_shape + [rows, cols]) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py new file mode 100644 index 00000000000..cccb7f5789d --- /dev/null +++ b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py @@ -0,0 +1,130 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for tensorflow.ops.tf.MatrixTriangularSolve.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import itertools + +import numpy as np + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import linalg_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import test + + +def MakePlaceholder(x): + return array_ops.placeholder(dtypes.as_dtype(x.dtype), shape=x.shape) + + +class MatrixTriangularSolveOpTest(XLATestCase): + + def _VerifyTriangularSolveBase(self, sess, placeholder_a, placeholder_ca, + placeholder_b, a, clean_a, b, verification, + atol): + feed_dict = {placeholder_a: a, placeholder_ca: clean_a, placeholder_b: b} + verification_np = sess.run(verification, feed_dict) + self.assertAllClose(b, verification_np, atol=atol) + + def _VerifyTriangularSolve(self, a, b, lower, adjoint, atol): + clean_a = np.tril(a) if lower else np.triu(a) + with self.test_session() as sess: + placeholder_a = MakePlaceholder(a) + placeholder_ca = MakePlaceholder(clean_a) + placeholder_b = MakePlaceholder(b) + with self.test_scope(): + x = linalg_ops.matrix_triangular_solve( + placeholder_a, placeholder_b, lower=lower, adjoint=adjoint) + verification = math_ops.matmul(placeholder_ca, x, adjoint_a=adjoint) + self._VerifyTriangularSolveBase(sess, placeholder_a, placeholder_ca, + placeholder_b, a, clean_a, b, + verification, atol) + + def _VerifyTriangularSolveCombo(self, a, b, atol=1e-4): + transp = lambda x: np.swapaxes(x, -1, -2) + for lower, adjoint in itertools.product([True, False], repeat=2): + self._VerifyTriangularSolve( + a if lower else transp(a), b, lower, adjoint, atol) + + def testBasic(self): + rng = np.random.RandomState(0) + a = np.tril(rng.randn(5, 5)) + b = rng.randn(5, 7) + for dtype in self.float_types: + self._VerifyTriangularSolveCombo(a.astype(dtype), b.astype(dtype)) + + def testBasicNotActuallyTriangular(self): + rng = np.random.RandomState(0) + a = rng.randn(5, 5) # the `a` matrix is not lower-triangular + b = rng.randn(5, 7) + for dtype in self.float_types: + self._VerifyTriangularSolveCombo(a.astype(dtype), b.astype(dtype)) + + def testBasicComplexDtypes(self): + rng = np.random.RandomState(0) + a = np.tril(rng.randn(5, 5) + rng.randn(5, 5) * 1j) + b = rng.randn(5, 7) + rng.randn(5, 7) * 1j + for dtype in self.complex_types: + self._VerifyTriangularSolveCombo(a.astype(dtype), b.astype(dtype)) + + def testBatch(self): + rng = np.random.RandomState(0) + shapes = [((4, 3, 3), (4, 3, 5)), ((1, 2, 2), (1, 2, 1)), + ((1, 1, 1), (1, 1, 2)), ((2, 3, 4, 4), (2, 3, 4, 1))] + tuples = itertools.product(self.float_types, shapes) + for dtype, (a_shape, b_shape) in tuples: + n = a_shape[-1] + a = np.tril(rng.rand(*a_shape) - 0.5) / (2.0 * n) + np.eye(n) + b = rng.randn(*b_shape) + self._VerifyTriangularSolveCombo( + a.astype(dtype), b.astype(dtype), atol=1e-3) + + def testLarge(self): + n = 1024 + rng = np.random.RandomState(0) + a = np.tril(rng.rand(n, n) - 0.5) / (2.0 * n) + np.eye(n) + b = rng.randn(n, n) + self._VerifyTriangularSolve( + a.astype(np.float32), b.astype(np.float32), True, False, 1e-4) + + def testNonSquareCoefficientMatrix(self): + rng = np.random.RandomState(0) + for dtype in self.float_types: + a = rng.randn(3, 
4).astype(dtype) + b = rng.randn(4, 4).astype(dtype) + with self.assertRaises(ValueError): + linalg_ops.matrix_triangular_solve(a, b) + with self.assertRaises(ValueError): + linalg_ops.matrix_triangular_solve(a, b) + + def testWrongDimensions(self): + randn = np.random.RandomState(0).randn + for dtype in self.float_types: + lhs = constant_op.constant(randn(3, 3), dtype=dtype) + rhs = constant_op.constant(randn(4, 3), dtype=dtype) + with self.assertRaises(ValueError): + linalg_ops.matrix_triangular_solve(lhs, rhs) + with self.assertRaises(ValueError): + linalg_ops.matrix_triangular_solve(lhs, rhs) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/reverse_sequence_op_test.py b/tensorflow/compiler/tests/reverse_sequence_op_test.py new file mode 100644 index 00000000000..1a5d05094e5 --- /dev/null +++ b/tensorflow/compiler/tests/reverse_sequence_op_test.py @@ -0,0 +1,93 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for tensorflow.ops.reverse_sequence_op.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import test + + +class ReverseSequenceTest(XLATestCase): + + def _testReverseSequence(self, + x, + batch_axis, + seq_axis, + seq_lengths, + truth, + expected_err_re=None): + with self.test_session(): + p = array_ops.placeholder(dtypes.as_dtype(x.dtype)) + lengths = array_ops.placeholder(dtypes.as_dtype(seq_lengths.dtype)) + with self.test_scope(): + ans = array_ops.reverse_sequence( + p, batch_axis=batch_axis, seq_axis=seq_axis, seq_lengths=lengths) + if expected_err_re is None: + tf_ans = ans.eval(feed_dict={p: x, lengths: seq_lengths}) + self.assertAllClose(tf_ans, truth, atol=1e-10) + else: + with self.assertRaisesOpError(expected_err_re): + ans.eval(feed_dict={p: x, lengths: seq_lengths}) + + def testSimple(self): + x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int32) + expected = np.array([[1, 2, 3], [6, 5, 4], [8, 7, 9]], dtype=np.int32) + self._testReverseSequence( + x, + batch_axis=0, + seq_axis=1, + seq_lengths=np.array([1, 3, 2], np.int32), + truth=expected) + + def _testBasic(self, dtype, len_dtype): + x = np.asarray( + [[[1, 2, 3, 4], [5, 6, 7, 8]], [[9, 10, 11, 12], [13, 14, 15, 16]], + [[17, 18, 19, 20], [21, 22, 23, 24]]], + dtype=dtype) + x = x.reshape(3, 2, 4, 1, 1) + x = x.transpose([2, 1, 0, 3, 4]) # permute axes 0 <=> 2 + + # reverse dim 2 up to (0:3, none, 0:4) along dim=0 + seq_lengths = np.asarray([3, 0, 4], dtype=len_dtype) + + truth_orig = np.asarray( + [ + [[3, 2, 1, 4], [7, 6, 5, 8]], # reverse 0:3 + [[9, 10, 11, 12], [13, 14, 15, 16]], # reverse none + [[20, 19, 18, 17], [24, 23, 22, 21]] + ], # 
reverse 0:4 (all) + dtype=dtype) + truth_orig = truth_orig.reshape(3, 2, 4, 1, 1) + truth = truth_orig.transpose([2, 1, 0, 3, 4]) # permute axes 0 <=> 2 + + seq_axis = 0 # permute seq_axis and batch_axis (originally 2 and 0, resp.) + batch_axis = 2 + self._testReverseSequence(x, batch_axis, seq_axis, seq_lengths, truth) + + def testSeqLength(self): + for dtype in self.all_types: + for seq_dtype in self.int_types: + self._testBasic(dtype, seq_dtype) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index 8e4b8a38336..3d3e112f482 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -154,6 +154,21 @@ class UnaryOpsTest(XLATestCase): def testFloatOps(self): for dtype in self.float_types: + x = np.arange(-0.90, 0.90, 0.25) + self._assertOpOutputMatchesExpected( + math_ops.acos, + x.astype(dtype), + expected=np.arccos(x).astype(dtype)) + self._assertOpOutputMatchesExpected( + math_ops.asin, + x.astype(dtype), + expected=np.arcsin(x).astype(dtype)) + x = np.arange(-3, 3).reshape(1, 3, 2) + self._assertOpOutputMatchesExpected( + math_ops.atan, + x.astype(dtype), + expected=np.arctan(x).astype(dtype)) + self._assertOpOutputMatchesExpected( math_ops.acosh, np.array([1, 2, 3, 4], dtype=dtype), diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc index 1d9e0fb33ee..bf304102ede 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc @@ -427,16 +427,36 @@ Status FunctionalizeLoop(Graph* graph, Frame* frame, // identity nodes are values used by the loop body or condition. // The Identity node may have the wrong device so copy the device from // one of its outputs instead. + std::deque possible_exit; for (const Edge* edge : arg.switch_node->out_edges()) { - if (edge->src_output() == 0 && IsExit(edge->dst())) { + if (edge->src_output() == 0) { + possible_exit.push_back(edge); + } + if (IsIdentity(edge->dst())) { + TF_RETURN_IF_ERROR( + SetNodeShardingFromNeighbors(edge->dst(), /*out_edges=*/true)); + } + } + // TODO(b/67425339): Allow general graph between switch and exit. 
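A rough Python sketch of the worklist traversal added below (the helpers `is_exit`, `is_identity`, and `successors` are illustrative stand-ins, not TensorFlow APIs):

from collections import deque

def find_frame_exit(switch_output_edges, is_exit, is_identity, successors):
  """Follows chains of Identity nodes from a Switch until the Exit is found."""
  exit_node = None
  work = deque(switch_output_edges)
  while work:
    dst = work.popleft()
    if is_exit(dst):
      if exit_node is not None:
        raise ValueError("Duplicate Exit successors")
      exit_node = dst
    elif is_identity(dst):
      work.extend(successors(dst))  # keep walking through Identity nodes
    else:
      raise NotImplementedError("General graph between Switch and Exit")
  return exit_node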
+ while (!possible_exit.empty()) { + const Edge* edge = possible_exit.front(); + possible_exit.pop_front(); + if (IsExit(edge->dst())) { if (arg.exit != nullptr) { return errors::InvalidArgument("Duplicate Exit successors to ", arg.switch_node->name()); } arg.exit = edge->dst(); - } else if (StringPiece(edge->dst()->type_string()) == "Identity") { - TF_RETURN_IF_ERROR( - SetNodeShardingFromNeighbors(edge->dst(), /*out_edges=*/true)); + } else { + if (!IsIdentity(edge->dst())) { + return errors::Unimplemented("General graph between switch (", + arg.switch_node->name(), + ") and exit node of frame ", + frame->name, " not supported yet."); + } + for (const Edge* out : edge->dst()->out_edges()) { + possible_exit.push_back(out); + } } } } diff --git a/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md b/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md index 82b3b46a2f1..91351421bca 100644 --- a/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md +++ b/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md @@ -6,6 +6,9 @@ Operator | Type Constraint `Acosh` | `T={complex64,double,float}` `Add` | `T={complex64,double,float,int32,int64}` `AddN` | `T={complex64,double,float,int32,int64,uint32,uint64}` +`AdjustContrastv2` | +`AdjustHue` | +`AdjustSaturation` | `All` | `Tidx={int32,int64}` `Angle` | `Tout={double,float}`
`T={complex64}` `Any` | `Tidx={int32,int64}` @@ -34,7 +37,7 @@ Operator | Type Constraint `BroadcastGradientArgs` | `T={int32,int64}` `Cast` | `DstT={bool,complex64,double,float,int32,int64,uint32,uint64}`
`SrcT={bool,complex64,double,float,int32,int64,uint32,uint64}` `Ceil` | `T={double,float}` -`Cholesky` | `T={complex64,double,float}` +`Cholesky` | `T={double,float}` `Complex` | `Tout={complex64}`
`T={double,float}` `ComplexAbs` | `Tout={double,float}`
`T={complex64}` `Concat` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` @@ -68,7 +71,11 @@ Operator | Type Constraint `Exp` | `T={complex64,double,float}` `ExpandDims` | `Tdim={int32,int64}`
`T={bool,complex64,double,float,int32,int64,uint32,uint64}` `Expm1` | `T={complex64,double,float}` -`Fill` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` +`ExtractImagePatches` | `T={double,float,int32,int64,uint32,uint64}` +`FFT` | +`FFT2D` | +`FFT3D` | +`Fill` | `index_type={int32,int64}`
`T={bool,complex64,double,float,int32,int64,uint32,uint64}` `Floor` | `T={double,float}` `FloorDiv` | `T={complex64,double,float,int32,int64}` `FloorMod` | `T={double,float,int32,int64}` @@ -80,6 +87,13 @@ Operator | Type Constraint `GatherV2` | `Taxis={int32,int64}`
`Tindices={int32,int64}`
`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}` `Greater` | `T={double,float,int32,int64,uint32,uint64}` `GreaterEqual` | `T={double,float,int32,int64,uint32,uint64}` +`HSVToRGB` | `T={double,float}` +`IFFT` | +`IFFT2D` | +`IFFT3D` | +`IRFFT` | +`IRFFT2D` | +`IRFFT3D` | `Identity` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` `IdentityN` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` `Imag` | `Tout={double,float}`
`T={complex64}` @@ -105,11 +119,14 @@ Operator | Type Constraint `MatMul` | `T={complex64,double,float}` `MatrixDiag` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` `MatrixDiagPart` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` +`MatrixTriangularSolve` | `T={complex64,double,float}` `Max` | `Tidx={int32,int64}`
`T={complex64,double,float,int32,int64,uint32,uint64}` `MaxPool` | `T={double,float,int32,int64}` `MaxPool3D` | `T={float}` `MaxPool3DGrad` | `TInput={float}`
`T={float}` `MaxPoolGrad` | `T={double,float,int32,int64,uint32,uint64}` +`MaxPoolGradV2` | `T={double,float,int32,int64,uint32,uint64}` +`MaxPoolV2` | `T={double,float,int32,int64}` `Maximum` | `T={double,float,int32,int64}` `Mean` | `Tidx={int32,int64}`
`T={complex64,double,float,int32,int64,uint32,uint64}` `Min` | `Tidx={int32,int64}`
`T={complex64,double,float,int32,int64,uint32,uint64}` @@ -131,6 +148,10 @@ Operator | Type Constraint `PreventGradient` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` `Prod` | `Tidx={int32,int64}`
`T={complex64,double,float,int32,int64,uint32,uint64}` `QuantizeAndDequantizeV2` | `T={double,float}` +`RFFT` | +`RFFT2D` | +`RFFT3D` | +`RGBToHSV` | `T={double,float}` `RandomStandardNormal` | `dtype={float}` `RandomUniform` | `T={int32,int64}`
`dtype={double,float}` `RandomUniformInt` | `T={int32,int64}`
`Tout={int32,int64}` @@ -146,6 +167,8 @@ Operator | Type Constraint `Relu6Grad` | `T={double,float,int32,int64,uint32,uint64}` `ReluGrad` | `T={double,float,int32,int64,uint32,uint64}` `Reshape` | `Tshape={int32,int64}`
`T={bool,complex64,double,float,int32,int64,uint32,uint64}` +`ResizeBilinear` | `T={double,float,int32,int64}` +`ResizeBilinearGrad` | `T={double,float}` `ResourceApplyAdagrad` | `T={double,float}` `ResourceApplyAdam` | `T={double,float}` `ResourceApplyFtrl` | `T={double,float}` @@ -156,6 +179,7 @@ Operator | Type Constraint `ResourceGather` | `Tindices={int32,int64}`
`dtype={complex64,double,float,int32,int64,uint32,uint64}` `ResourceStridedSliceAssign` | `Index={int32,int64}`
`T={bool,complex64,double,float,int32,int64,uint32,uint64}` `Reverse` | `T={bool,complex64,double,float,int32,int64}` +`ReverseSequence` | `Tlen={int32,int64}`
`T={bool,complex64,double,float,int32,int64,uint32,uint64}` `ReverseV2` | `T={bool,complex64,double,float,int32,int64}`
`Tidx={int32,int64}` `RightShift` | `T={int32,int64,uint32,uint64}` `Rint` | `T={double,float}` diff --git a/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md b/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md index d4b7621ad28..b9bdb829d77 100644 --- a/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md +++ b/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md @@ -6,6 +6,9 @@ Operator | Type Constraint `Acosh` | `T={complex64,double,float}` `Add` | `T={complex64,double,float,int32,int64}` `AddN` | `T={complex64,double,float,int32,int64,uint32,uint64}` +`AdjustContrastv2` | +`AdjustHue` | +`AdjustSaturation` | `All` | `Tidx={int32,int64}` `Angle` | `Tout={double,float}`
`T={complex64}` `Any` | `Tidx={int32,int64}` @@ -34,7 +37,7 @@ Operator | Type Constraint `BroadcastGradientArgs` | `T={int32,int64}` `Cast` | `DstT={bool,complex64,double,float,int32,int64,uint32,uint64}`
`SrcT={bool,complex64,double,float,int32,int64,uint32,uint64}` `Ceil` | `T={double,float}` -`Cholesky` | `T={complex64,double,float}` +`Cholesky` | `T={double,float}` `Complex` | `Tout={complex64}`
`T={double,float}` `ComplexAbs` | `Tout={double,float}`
`T={complex64}` `Concat` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` @@ -68,7 +71,11 @@ Operator | Type Constraint `Exp` | `T={complex64,double,float}` `ExpandDims` | `Tdim={int32,int64}`
`T={bool,complex64,double,float,int32,int64,uint32,uint64}` `Expm1` | `T={complex64,double,float}` -`Fill` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` +`ExtractImagePatches` | `T={double,float,int32,int64,uint32,uint64}` +`FFT` | +`FFT2D` | +`FFT3D` | +`Fill` | `index_type={int32,int64}`
`T={bool,complex64,double,float,int32,int64,uint32,uint64}` `Floor` | `T={double,float}` `FloorDiv` | `T={complex64,double,float,int32,int64}` `FloorMod` | `T={double,float,int32,int64}` @@ -80,6 +87,13 @@ Operator | Type Constraint `GatherV2` | `Taxis={int32,int64}`
`Tindices={int32,int64}`
`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}` `Greater` | `T={double,float,int32,int64,uint32,uint64}` `GreaterEqual` | `T={double,float,int32,int64,uint32,uint64}` +`HSVToRGB` | `T={double,float}` +`IFFT` | +`IFFT2D` | +`IFFT3D` | +`IRFFT` | +`IRFFT2D` | +`IRFFT3D` | `Identity` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` `IdentityN` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` `Imag` | `Tout={double,float}`
`T={complex64}` @@ -105,11 +119,14 @@ Operator | Type Constraint `MatMul` | `T={complex64,double,float}` `MatrixDiag` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` `MatrixDiagPart` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` +`MatrixTriangularSolve` | `T={complex64,double,float}` `Max` | `Tidx={int32,int64}`
`T={complex64,double,float,int32,int64,uint32,uint64}` `MaxPool` | `T={double,float,int32,int64}` `MaxPool3D` | `T={float}` `MaxPool3DGrad` | `TInput={float}`
`T={float}` `MaxPoolGrad` | `T={double,float,int32,int64,uint32,uint64}` +`MaxPoolGradV2` | `T={double,float,int32,int64,uint32,uint64}` +`MaxPoolV2` | `T={double,float,int32,int64}` `Maximum` | `T={double,float,int32,int64}` `Mean` | `Tidx={int32,int64}`
`T={complex64,double,float,int32,int64,uint32,uint64}` `Min` | `Tidx={int32,int64}`
`T={complex64,double,float,int32,int64,uint32,uint64}` @@ -131,6 +148,10 @@ Operator | Type Constraint `PreventGradient` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` `Prod` | `Tidx={int32,int64}`
`T={complex64,double,float,int32,int64,uint32,uint64}` `QuantizeAndDequantizeV2` | `T={double,float}` +`RFFT` | +`RFFT2D` | +`RFFT3D` | +`RGBToHSV` | `T={double,float}` `Range` | `Tidx={double,float,int32,int64}` `Rank` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` `ReadVariableOp` | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}` @@ -143,6 +164,8 @@ Operator | Type Constraint `Relu6Grad` | `T={double,float,int32,int64,uint32,uint64}` `ReluGrad` | `T={double,float,int32,int64,uint32,uint64}` `Reshape` | `Tshape={int32,int64}`
`T={bool,complex64,double,float,int32,int64,uint32,uint64}` +`ResizeBilinear` | `T={double,float,int32,int64}` +`ResizeBilinearGrad` | `T={double,float}` `ResourceApplyAdagrad` | `T={double,float}` `ResourceApplyAdam` | `T={double,float}` `ResourceApplyFtrl` | `T={double,float}` @@ -153,6 +176,7 @@ Operator | Type Constraint `ResourceGather` | `Tindices={int32,int64}`
`dtype={complex64,double,float,int32,int64,uint32,uint64}` `ResourceStridedSliceAssign` | `Index={int32,int64}`
`T={bool,complex64,double,float,int32,int64,uint32,uint64}` `Reverse` | `T={bool,complex64,double,float,int32,int64}` +`ReverseSequence` | `Tlen={int32,int64}`
`T={bool,complex64,double,float,int32,int64,uint32,uint64}` `ReverseV2` | `T={bool,complex64,double,float,int32,int64}`
`Tidx={int32,int64}` `RightShift` | `T={int32,int64,uint32,uint64}` `Rint` | `T={double,float}` diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc index 02215b5112d..1418d95956e 100644 --- a/tensorflow/compiler/tf2xla/graph_compiler.cc +++ b/tensorflow/compiler/tf2xla/graph_compiler.cc @@ -60,9 +60,7 @@ Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph, for (int i = 0; i < args->size(); ++i) { XlaCompiler::Argument& arg = (*args)[i]; arg.type = ctx->input_type(i); - - TF_RETURN_IF_ERROR( - TensorShapeToXLAShape(arg.type, ctx->InputShape(i), &arg.shape)); + arg.shape = ctx->InputShape(i); if (arg.type == DT_RESOURCE) { return errors::InvalidArgument( diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index 5e1b01878b7..e9be6f8476d 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -31,6 +31,7 @@ tf_kernel_library( "diag_op.cc", "dynamic_stitch_op.cc", "elu_op.cc", + "extract_image_patches_op.cc", "fft_ops.cc", "fill_op.cc", "function_ops.cc", @@ -43,6 +44,9 @@ tf_kernel_library( "l2loss_op.cc", "lrn_ops.cc", "matmul_op.cc", + "matrix_band_part_op.cc", + "matrix_set_diag_op.cc", + "matrix_triangular_solve_op.cc", "mirror_pad_op.cc", "no_op.cc", "one_hot_op.cc", @@ -58,6 +62,7 @@ tf_kernel_library( "reshape_op.cc", "retval_op.cc", "reverse_op.cc", + "reverse_sequence_op.cc", "scan_ops.cc", "segment_reduction_ops.cc", "select_op.cc", @@ -92,6 +97,7 @@ tf_kernel_library( "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla/lib:batch_dot", "//tensorflow/compiler/tf2xla/lib:cholesky", + "//tensorflow/compiler/tf2xla/lib:triangular_solve", "//tensorflow/compiler/tf2xla/lib:util", "//tensorflow/compiler/tf2xla/ops:sendrecv_ops", "//tensorflow/compiler/xla:array4d", diff --git a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc index a015b8e0e89..b0ba25b9983 100644 --- a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc @@ -28,8 +28,9 @@ class BatchMatMulOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - auto result = - BatchDot(ctx->builder(), ctx->Input(0), ctx->Input(1), adj_x_, adj_y_); + auto result = BatchDot(ctx->builder(), ctx->Input(0), ctx->Input(1), + /*transpose_x=*/adj_x_, /*transpose_y=*/adj_y_, + /*conjugate_x=*/adj_x_, /*conjugate_y=*/adj_y_); OP_REQUIRES_OK(ctx, result.status()); ctx->SetOutput(0, result.ValueOrDie()); } diff --git a/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc b/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc index 87d858f7635..fe6651793dc 100644 --- a/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc @@ -33,7 +33,7 @@ class CholeskyOp : public XlaOpKernel { } }; -REGISTER_XLA_OP(Name("Cholesky"), CholeskyOp); +REGISTER_XLA_OP(Name("Cholesky").TypeConstraint("T", kFloatTypes), CholeskyOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc new file mode 100644 index 00000000000..b2970eae20a --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc @@ -0,0 +1,169 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/type_util.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/util/tensor_format.h" + +namespace tensorflow { + +namespace { + +class ExtractImagePatchesOp : public XlaOpKernel { + public: + explicit ExtractImagePatchesOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("ksizes", &ksizes_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &strides_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("rates", &dilations_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + const TensorFormat data_format = FORMAT_NHWC; + const int num_dims = ksizes_.size(); + + OP_REQUIRES( + ctx, num_dims >= 3, + errors::InvalidArgument("Kernel size must have at least 3 dimensions")); + const int num_spatial_dims = num_dims - 2; + + OP_REQUIRES(ctx, strides_.size() == num_dims, + errors::InvalidArgument("Sliding window strides field must " + "specify ", + num_dims, " dimensions")); + OP_REQUIRES(ctx, dilations_.size() == num_dims, + errors::InvalidArgument("Dilations field must " + "specify ", + num_dims, " dimensions")); + + int batch_dim = GetTensorBatchDimIndex(num_dims, data_format); + int feature_dim = GetTensorFeatureDimIndex(num_dims, data_format); + OP_REQUIRES( + ctx, ksizes_[batch_dim] == 1 && ksizes_[feature_dim] == 1, + errors::Unimplemented("Current implementation does not yet support " + "kernel sizes > 1 in the batch and depth " + "dimensions.")); + OP_REQUIRES( + ctx, strides_[batch_dim] == 1 && strides_[feature_dim] == 1, + errors::Unimplemented("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES( + ctx, dilations_[batch_dim] == 1 && dilations_[feature_dim] == 1, + errors::Unimplemented("Current implementation does not support " + "dilations in the batch and depth dimensions.")); + + for (int i = 0; i < num_spatial_dims; ++i) { + int input_dim = GetTensorSpatialDimIndex(num_dims, data_format, i); + OP_REQUIRES( + ctx, ksizes_[input_dim] >= 0, + errors::Unimplemented("Kernel size values must be non-negative; ", i, + "th spatial dimension had dilation ", + dilations_[input_dim])); + OP_REQUIRES(ctx, strides_[input_dim] >= 1, + errors::Unimplemented("Stride values must be positive; ", i, + "th spatial dimension had dilation ", + dilations_[input_dim])); + OP_REQUIRES(ctx, dilations_[input_dim] >= 1, + errors::Unimplemented("Dilation values must be positive; ", i, + "th spatial dimension had dilation ", + dilations_[input_dim])); + } + + xla::PrimitiveType type; + OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(ctx->input_type(0), &type)); + + const TensorShape input_shape = ctx->InputShape(0); + OP_REQUIRES( + ctx, input_shape.dims() == num_dims, + 
errors::InvalidArgument("input must be ", num_dims, "-dimensional", + input_shape.DebugString())); + const int64 depth = input_shape.dim_size(feature_dim); + + xla::ComputationBuilder* builder = ctx->builder(); + + // The following code is equivalent to: + // eye = np.eye(kH * kW * D).reshape([kH, kW, D, kH * kW * kD]) + int64 kernel_size = 1; + std::vector lhs_shape(num_dims, 1); + for (int i = 0; i < num_spatial_dims; ++i) { + int input_dim = GetTensorSpatialDimIndex(num_dims, data_format, i); + lhs_shape[i] = ksizes_[input_dim]; + kernel_size *= ksizes_[input_dim]; + } + lhs_shape[num_spatial_dims] = depth; + lhs_shape[num_spatial_dims + 1] = 1; + + // Builds an identity matrix as a broadcast equality of iotas. + // iota = np.arange(np.prod(ksize), depth) + // filter = np.equal(np.reshape(iota, [-1, 1]), iota).astype(np.float32) + xla::ComputationDataHandle iota; + TF_CHECK_OK(XlaHelpers::Iota(builder, DataType::DT_INT32, + kernel_size * depth, &iota)); + + auto lhs = builder->Reshape(iota, lhs_shape); + auto filter = builder->ConvertElementType( + builder->Eq(lhs, iota, {num_spatial_dims + 1}), type); + + xla::ConvolutionDimensionNumbers dims; + std::vector window_strides(num_spatial_dims); + std::vector lhs_dilation(num_spatial_dims, 1); + std::vector rhs_dilation(num_spatial_dims); + std::vector> padding(num_spatial_dims); + + dims.set_input_batch_dimension(batch_dim); + dims.set_output_batch_dimension(batch_dim); + dims.set_input_feature_dimension(feature_dim); + dims.set_output_feature_dimension(feature_dim); + dims.set_kernel_input_feature_dimension(num_spatial_dims); + dims.set_kernel_output_feature_dimension(num_spatial_dims + 1); + + for (int i = 0; i < num_spatial_dims; ++i) { + const int64 dim = GetTensorSpatialDimIndex(num_dims, data_format, i); + dims.add_input_spatial_dimensions(dim); + dims.add_kernel_spatial_dimensions(i); + dims.add_output_spatial_dimensions(dim); + window_strides[i] = strides_.at(dim); + rhs_dilation[i] = dilations_.at(dim); + + int64 unused_output_size; + OP_REQUIRES_OK( + ctx, GetWindowedOutputSizeVerboseV2( + input_shape.dim_size(dim), ksizes_[dim], rhs_dilation[i], + window_strides[i], padding_, &unused_output_size, + &padding[i].first, &padding[i].second)); + } + + xla::ComputationDataHandle conv = + builder->ConvGeneralDilated(ctx->Input(0), filter, window_strides, + padding, lhs_dilation, rhs_dilation, dims); + ctx->SetOutput(0, conv); + } + + protected: + std::vector ksizes_; + std::vector dilations_; + std::vector strides_; + Padding padding_; + + private: + TF_DISALLOW_COPY_AND_ASSIGN(ExtractImagePatchesOp); +}; + +REGISTER_XLA_OP(Name("ExtractImagePatches"), ExtractImagePatchesOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc new file mode 100644 index 00000000000..faa415a97b0 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc @@ -0,0 +1,98 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/framework/tensor_shape.h" + +namespace tensorflow { +namespace { + +class MatrixBandPartOp : public XlaOpKernel { + public: + explicit MatrixBandPartOp(OpKernelConstruction* context) + : XlaOpKernel(context) {} + + void Compile(XlaOpKernelContext* context) override { + const TensorShape input_shape = context->InputShape(0); + // Preliminary validation of sizes. + OP_REQUIRES(context, TensorShapeUtils::IsMatrixOrHigher(input_shape), + errors::InvalidArgument( + "input must be at least 2-dim, received shape: ", + input_shape.DebugString())); + + const TensorShape num_lower_in_shape = context->InputShape(1); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_lower_in_shape), + errors::InvalidArgument("num_lower must be scalar, got shape ", + num_lower_in_shape.DebugString())); + + const TensorShape num_upper_in_shape = context->InputShape(2); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_upper_in_shape), + errors::InvalidArgument("num_upper must be scalar, got shape ", + num_upper_in_shape.DebugString())); + + xla::ComputationBuilder* builder = context->builder(); + xla::ComputationDataHandle input = context->Input(0); + xla::ComputationDataHandle num_lower = context->Input(1); + xla::ComputationDataHandle num_upper = context->Input(2); + DataType input_type = context->input_type(0); + DataType index_type = context->input_type(1); + + TensorShape batch_shape = input_shape; + batch_shape.RemoveLastDims(2); + const int64 m = input_shape.dim_size(input_shape.dims() - 2); + const int64 n = input_shape.dim_size(input_shape.dims() - 1); + + // Compute 'offset', which is how many diagonals we are above/below the + // diagonal. + xla::ComputationDataHandle iota_m; + OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, index_type, m, &iota_m)); + + xla::ComputationDataHandle iota_n; + OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, index_type, n, &iota_n)); + + auto offset = builder->Sub(builder->Broadcast(iota_n, {m}), iota_m, + /*broadcast_dimensions=*/{0}); + + // If num_lower or num_upper are negative, include all lower/upper + // diagonals. 
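In NumPy terms, the band selection this kernel builds is roughly the following sketch (illustrative only; `band_part` is a stand-in name, not the TF kernel):

import numpy as np

def band_part(x, num_lower, num_upper):
  """Keeps entries whose diagonal offset (col - row) is in [-num_lower, num_upper]."""
  m, n = x.shape[-2:]
  num_lower = m if num_lower < 0 else num_lower  # negative bound = keep all
  num_upper = n if num_upper < 0 else num_upper
  offset = np.arange(n) - np.arange(m)[:, None]  # offset[i, j] = j - i
  keep = (-num_lower <= offset) & (offset <= num_upper)
  return np.where(keep, x, np.zeros_like(x))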
+ auto zero_index = XlaHelpers::Zero(builder, index_type); + num_lower = builder->Select( + builder->Lt(num_lower, zero_index), + XlaHelpers::IntegerLiteral(builder, index_type, m), num_lower); + num_upper = builder->Select( + builder->Lt(num_upper, zero_index), + XlaHelpers::IntegerLiteral(builder, index_type, n), num_upper); + + auto indicator = builder->And(builder->Le(builder->Neg(num_lower), offset), + builder->Le(offset, num_upper)); + indicator = builder->Broadcast(indicator, batch_shape.dim_sizes()); + + auto zero_input = XlaHelpers::Zero(builder, input_type); + auto output = builder->Select( + indicator, input, + builder->Broadcast(zero_input, input_shape.dim_sizes())); + + context->SetOutput(0, output); + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(MatrixBandPartOp); +}; +REGISTER_XLA_OP(Name("MatrixBandPart"), MatrixBandPartOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc new file mode 100644 index 00000000000..b2940bdcff7 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc @@ -0,0 +1,93 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" + +namespace tensorflow { + +class MatrixSetDiagOp : public XlaOpKernel { + public: + explicit MatrixSetDiagOp(OpKernelConstruction* context) + : XlaOpKernel(context) {} + + void Compile(XlaOpKernelContext* context) override { + const TensorShape input_shape = context->InputShape(0); + const TensorShape diag_shape = context->InputShape(1); + + const int rank = input_shape.dims(); + + // Preliminary validation of sizes. + OP_REQUIRES(context, TensorShapeUtils::IsMatrixOrHigher(input_shape), + errors::InvalidArgument( + "input must be at least 2-dim, received shape: ", + input_shape.DebugString())); + + // Check to make sure the last dimension of diag is equal to the smaller of + // the last two dimensions of input. 
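    // For an input of shape batch_shape + [m, n] the diagonal must have shape
    // batch_shape + [min(m, n)]; e.g. a [2, 3, 4] input takes a [2, 3] diagonal.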
+ const int64 m = input_shape.dim_size(rank - 2); + const int64 n = input_shape.dim_size(rank - 1); + const int64 min_dim = std::min(m, n); + + TensorShape batch_shape = input_shape; + batch_shape.RemoveLastDims(2); + + TensorShape expected_diag_shape = batch_shape; + expected_diag_shape.AddDim(min_dim); + OP_REQUIRES(context, expected_diag_shape == diag_shape, + errors::InvalidArgument( + "must have diagonal.shape == input.shape[:-2] + " + "min(input.shape[-2:]), but received input shape: ", + input_shape.DebugString(), + " and diagonal shape: ", diag_shape.DebugString())); + + xla::ComputationBuilder* builder = context->builder(); + xla::ComputationDataHandle input = context->Input(0); + xla::ComputationDataHandle diag = context->Input(1); + + auto zero = XlaHelpers::Zero(builder, context->input_type(0)); + + // Create an indicator tensor that is true only on the diagonal. + xla::ComputationDataHandle iota_m; + OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, DT_INT32, m, &iota_m)); + xla::ComputationDataHandle iota_n; + OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, DT_INT32, n, &iota_n)); + auto indicator = builder->Eq(iota_m, + builder->Broadcast(iota_n, {m}), + /*broadcast_dimensions=*/{0}); + indicator = builder->Broadcast(indicator, batch_shape.dim_sizes()); + + // Broadcast diag up to the input shape. Use an implicit broadcast (Add) + // because we need to broadcast on the right. + std::vector diag_broadcast_dims(rank - 1); + std::iota(diag_broadcast_dims.begin(), diag_broadcast_dims.end(), 0); + if (min_dim != m) { + diag_broadcast_dims.back() = rank - 1; + } + diag = builder->Add(diag, builder->Broadcast(zero, input_shape.dim_sizes()), + /*broadcast_dimensions=*/diag_broadcast_dims); + + auto output = builder->Select(indicator, diag, input); + context->SetOutput(0, output); + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(MatrixSetDiagOp); +}; + +REGISTER_XLA_OP(Name("MatrixSetDiag"), MatrixSetDiagOp); + +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc new file mode 100644 index 00000000000..eaed9314646 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc @@ -0,0 +1,50 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/lib/triangular_solve.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" + +namespace tensorflow { +namespace { + +class MatrixTriangularSolveOp : public XlaOpKernel { + public: + explicit MatrixTriangularSolveOp(OpKernelConstruction* ctx) + : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("lower", &lower_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("adjoint", &adjoint_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + auto result = TriangularSolve( + ctx->builder(), ctx->Input(0), ctx->Input(1), /*left_side=*/true, + /*lower=*/lower_, /*transpose_a=*/adjoint_, /*conjugate_a=*/adjoint_); + if (!result.ok()) { + ctx->SetStatus(result.status()); + return; + } + ctx->SetOutput(0, result.ValueOrDie()); + } + + private: + bool lower_; + bool adjoint_; +}; + +REGISTER_XLA_OP(Name("MatrixTriangularSolve"), MatrixTriangularSolveOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc index 0b5a38967ae..d4fb5dd4e06 100644 --- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc @@ -37,21 +37,23 @@ class PoolingOp : public XlaOpKernel { public: PoolingOp(OpKernelConstruction* ctx, int num_spatial_dims) : XlaOpKernel(ctx), num_spatial_dims_(num_spatial_dims) { - std::vector ksize_int; - std::vector stride_int; - OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_int)); - OP_REQUIRES(ctx, ksize_int.size() == num_dims(), - errors::InvalidArgument("Sliding window ksize field must " - "specify ", - num_dims(), " dimensions")); - OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_int)); - OP_REQUIRES(ctx, stride_int.size() == num_dims(), - errors::InvalidArgument("Sliding window stride field must " - "specify ", - num_dims(), " dimensions")); - for (int i = 0; i < num_dims(); ++i) { - ksize_.push_back(ksize_int[i]); - stride_.push_back(stride_int[i]); + if (ctx->num_inputs() == 1) { + std::vector ksize_int; + std::vector stride_int; + OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_int)); + OP_REQUIRES(ctx, ksize_int.size() == num_dims(), + errors::InvalidArgument("Sliding window ksize field must " + "specify ", + num_dims(), " dimensions")); + OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_int)); + OP_REQUIRES(ctx, stride_int.size() == num_dims(), + errors::InvalidArgument("Sliding window stride field must " + "specify ", + num_dims(), " dimensions")); + for (int i = 0; i < num_dims(); ++i) { + ksize_.push_back(ksize_int[i]); + stride_.push_back(stride_int[i]); + } } Padding padding; OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding)); @@ -77,6 +79,33 @@ class PoolingOp : public XlaOpKernel { xla::ComputationDataHandle input = ctx->Input(0); const TensorShape input_shape = ctx->InputShape(0); + std::vector ksize = ksize_; + std::vector stride = stride_; + if (ctx->num_inputs() != 1) { + const TensorShape ksize_shape = ctx->InputShape(1); + // Validate input sizes. 
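      // (In the MaxPoolV2 form the window dimensions are not attrs; ksize and
      // strides arrive as compile-time constant inputs 1 and 2 and are read here.)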
+ OP_REQUIRES(ctx, TensorShapeUtils::IsVector(ksize_shape), + errors::InvalidArgument("ksize must be a vector, not shape ", + ksize_shape.DebugString())); + OP_REQUIRES(ctx, ksize_shape.num_elements() == num_dims(), + errors::InvalidArgument("Sliding window ksize field must " + "specify ", + num_dims(), " dimensions")); + ksize.clear(); + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &ksize)); + + const TensorShape stride_shape = ctx->InputShape(2); + // Validate input sizes. + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(stride_shape), + errors::InvalidArgument("stride must be a vector, not shape ", + stride_shape.DebugString())); + OP_REQUIRES(ctx, stride_shape.num_elements() == num_dims(), + errors::InvalidArgument("Sliding window stride field must " + "specify ", + num_dims(), " dimensions")); + stride.clear(); + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(2, &stride)); + } OP_REQUIRES(ctx, input_shape.dims() == num_dims(), errors::InvalidArgument("Input to ", type_string(), " operator must have ", num_dims(), @@ -84,8 +113,8 @@ class PoolingOp : public XlaOpKernel { const DataType type = input_type(0); xla::ComputationDataHandle pooled = ctx->builder()->ReduceWindow( - input, InitValue(ctx->builder(), type), *Reduction(ctx, type), ksize_, - stride_, padding_); + input, InitValue(ctx->builder(), type), *Reduction(ctx, type), ksize, + stride, padding_); ctx->SetOutput(0, PostProcessOutput(ctx, pooled, type, input_shape)); } @@ -130,6 +159,10 @@ class MaxPool2DOp : public MaxPoolOp { } }; REGISTER_XLA_OP(Name("MaxPool"), MaxPool2DOp); +REGISTER_XLA_OP(Name("MaxPoolV2") + .CompileTimeConstInput("ksize") + .CompileTimeConstInput("strides"), + MaxPool2DOp); class MaxPool3DOp : public MaxPoolOp { public: @@ -243,22 +276,44 @@ class MaxPoolGradOp : public XlaOpKernel { public: MaxPoolGradOp(OpKernelConstruction* ctx, int num_spatial_dims) : XlaOpKernel(ctx), num_spatial_dims_(num_spatial_dims) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_)); - OP_REQUIRES(ctx, ksize_.size() == num_dims(), - errors::InvalidArgument("Sliding window ksize field must " - "specify ", - num_dims(), " dimensions")); - OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_)); - OP_REQUIRES(ctx, stride_.size() == num_dims(), - errors::InvalidArgument("Sliding window strides field must " - "specify ", - num_dims(), " dimensions")); + if (ctx->num_inputs() == 3) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_)); + } OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_)); } int num_dims() const { return num_spatial_dims_ + 2; } void Compile(XlaOpKernelContext* ctx) override { + if (ctx->num_inputs() != 3) { + OP_REQUIRES( + ctx, ctx->num_inputs() == 5, + errors::InvalidArgument("Must supply ksize and stride arguments.")); + const TensorShape ksize_shape = ctx->InputShape(3); + // Validate input sizes. + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(ksize_shape), + errors::InvalidArgument("ksize must be a vector, not shape ", + ksize_shape.DebugString())); + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(3, &ksize_)); + + const TensorShape stride_shape = ctx->InputShape(4); + // Validate input sizes. 
+ OP_REQUIRES(ctx, TensorShapeUtils::IsVector(stride_shape), + errors::InvalidArgument("stride must be a vector, not shape ", + stride_shape.DebugString())); + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(4, &stride_)); + } + + OP_REQUIRES(ctx, ksize_.size() == num_dims(), + errors::InvalidArgument("Sliding window ksize field must " + "specify ", + num_dims(), " dimensions")); + OP_REQUIRES(ctx, stride_.size() == num_dims(), + errors::InvalidArgument("Sliding window strides field must " + "specify ", + num_dims(), " dimensions")); + const TensorShape tensor_in_shape = ctx->InputShape(0); const TensorShape tensor_out_shape = ctx->InputShape(1); const TensorShape out_backprop_shape = ctx->InputShape(2); @@ -315,6 +370,10 @@ class MaxPool2DGradOp : public MaxPoolGradOp { } }; REGISTER_XLA_OP(Name("MaxPoolGrad"), MaxPool2DGradOp); +REGISTER_XLA_OP(Name("MaxPoolGradV2") + .CompileTimeConstInput("ksize") + .CompileTimeConstInput("strides"), + MaxPool2DGradOp); class MaxPool3DGradOp : public MaxPoolGradOp { public: diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc new file mode 100644 index 00000000000..6bc5d3adb09 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc @@ -0,0 +1,182 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/framework/tensor_shape.h" + +namespace tensorflow { +namespace { + +class ReverseSequenceOp : public XlaOpKernel { + public: + explicit ReverseSequenceOp(OpKernelConstruction* context) + : XlaOpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("batch_dim", &batch_dim_)); + OP_REQUIRES_OK(context, context->GetAttr("seq_dim", &seq_dim_)); + } + + void Compile(XlaOpKernelContext* context) override { + const TensorShape input_shape = context->InputShape(0); + const TensorShape seq_lens_shape = context->InputShape(1); + + OP_REQUIRES(context, TensorShapeUtils::IsVector(seq_lens_shape), + errors::InvalidArgument("seq_lens input must be 1-dim, not ", + seq_lens_shape.dims())); + OP_REQUIRES(context, batch_dim_ != seq_dim_, + errors::InvalidArgument("batch_dim == seq_dim == ", seq_dim_)); + OP_REQUIRES( + context, seq_dim_ < input_shape.dims(), + errors::InvalidArgument("seq_dim must be < input.dims()", "( ", + seq_dim_, " vs. ", input_shape.dims(), ")")); + OP_REQUIRES( + context, batch_dim_ < input_shape.dims(), + errors::InvalidArgument("batch_dim must be < input.dims()", "( ", + batch_dim_, " vs. 
", input_shape.dims(), ")")); + OP_REQUIRES( + context, + seq_lens_shape.num_elements() == input_shape.dim_size(batch_dim_), + errors::InvalidArgument("len(seq_lens) != input.dims(", batch_dim_, + "), ", "(", seq_lens_shape.num_elements(), + " vs. ", input_shape.dim_size(batch_dim_))); + + xla::ComputationBuilder* builder = context->builder(); + const auto input = context->Input(0); + const auto seq_lens = context->Input(1); + + const int64 batch_size = input_shape.dim_size(batch_dim_); + + const DataType input_type = context->input_type(0); + const DataType seq_lens_type = context->input_type(1); + const int64 max_seq_len = input_shape.dim_size(seq_dim_); + + xla::Shape input_xla_shape; + OP_REQUIRES_OK(context, TensorShapeToXLAShape(input_type, input_shape, + &input_xla_shape)); + xla::Shape seq_lens_xla_shape; + OP_REQUIRES_OK(context, TensorShapeToXLAShape(seq_lens_type, seq_lens_shape, + &seq_lens_xla_shape)); + + const auto tuple_shape = xla::ShapeUtil::MakeTupleShape({ + xla::ShapeUtil::MakeShape(seq_lens_xla_shape.element_type(), {}), + seq_lens_xla_shape, + input_xla_shape, + }); + + // For each entry in the batch, reverse the sequence. + // TODO(b/65689298): generalize the Map() operator to non-scalar cases and + // use it here, instead of a While loop. + + // Condition: lambda (i, _, _): i < batch_size + auto condition_builder = + builder->CreateSubBuilder("reverse_sequence_condition"); + { + auto param = condition_builder->Parameter(0, tuple_shape, "param"); + auto i = condition_builder->GetTupleElement(param, 0); + condition_builder->Lt( + i, XlaHelpers::IntegerLiteral(condition_builder.get(), seq_lens_type, + batch_size)); + } + auto condition = condition_builder->Build(); + OP_REQUIRES_OK(context, condition.status()); + + auto body_builder = builder->CreateSubBuilder("reverse_sequence_body"); + { + auto param = body_builder->Parameter(0, tuple_shape, "param"); + auto i = body_builder->GetTupleElement(param, 0); + auto seq_lens = body_builder->GetTupleElement(param, 1); + auto output = body_builder->GetTupleElement(param, 2); + + // seq_len is the sequence length of the current batch element (rank 1) + auto seq_len = body_builder->DynamicSlice( + seq_lens, body_builder->Reshape(i, {1}), {1}); + + // Indices is the offset of the batch element in the input. + auto indices = body_builder->Broadcast( + XlaHelpers::Zero(body_builder.get(), seq_lens_type), + {input_shape.dims()}); + indices = body_builder->DynamicUpdateSlice( + indices, body_builder->Reshape(i, {1}), + body_builder->Reshape( + XlaHelpers::IntegerLiteral(body_builder.get(), seq_lens_type, + batch_dim_), + {1})); + + // slice_indices is the offset of the start of the reversed sequence in + // the input. + auto slice_indices = body_builder->DynamicUpdateSlice( + indices, + body_builder->Sub(XlaHelpers::IntegerLiteral( + body_builder.get(), seq_lens_type, max_seq_len), + seq_len), + body_builder->Reshape( + XlaHelpers::IntegerLiteral(body_builder.get(), seq_lens_type, + seq_dim_), + {1})); + + // Slice out the reversed sequence. The slice will overflow the end of the + // sequence, and the contents of the overflow are implementation-defined. + // However, we will mask off these elements and replace them with elements + // from the original input so their values do not matter. + TensorShape slice_shape = input_shape; + slice_shape.set_dim(batch_dim_, 1); + auto slice = body_builder->DynamicSlice(output, slice_indices, + slice_shape.dim_sizes()); + + // Shift the reversed sequence to the left. 
+ output = body_builder->DynamicUpdateSlice(output, slice, indices); + + body_builder->Tuple( + {body_builder->Add( + i, XlaHelpers::One(body_builder.get(), seq_lens_type)), + seq_lens, output}); + } + auto body = body_builder->Build(); + OP_REQUIRES_OK(context, body.status()); + + auto loop_output = builder->While( + condition.ValueOrDie(), body.ValueOrDie(), + builder->Tuple({XlaHelpers::Zero(builder, seq_lens_type), seq_lens, + builder->Rev(input, {seq_dim_})})); + auto output = builder->GetTupleElement(loop_output, 2); + + // Mask out elements after the sequence length. + xla::ComputationDataHandle iota; + OP_REQUIRES_OK( + context, XlaHelpers::Iota(builder, seq_lens_type, max_seq_len, &iota)); + std::vector dims(input_shape.dims(), 1); + dims[batch_dim_] = batch_size; + auto mask = builder->Lt(iota, builder->Reshape(seq_lens, dims), {seq_dim_}); + + // Broadcast the mask up to the input shape. + mask = + builder->Or(mask, builder->Broadcast(builder->ConstantR0(false), + input_shape.dim_sizes())); + + output = builder->Select(mask, output, input); + context->SetOutput(0, output); + } + + private: + int32 batch_dim_; + int32 seq_dim_; +}; + +REGISTER_XLA_OP(Name("ReverseSequence"), ReverseSequenceOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc index d77fb768ef4..1a78c7ab9be 100644 --- a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc @@ -77,10 +77,8 @@ Status MaybeInitializeStack(xla::ComputationBuilder* builder, // Stack has not been initialized. xla::ComputationDataHandle zero = XlaHelpers::Zero(builder, resource->type()); - TF_RETURN_IF_ERROR(resource->SetValue( - dtype, - builder->Tuple({builder->Broadcast(zero, stack_shape.dim_sizes()), - builder->ConstantR0(0)}))); + TF_RETURN_IF_ERROR(resource->SetTypeAndShape(dtype, elem_shape)); + TF_RETURN_IF_ERROR(resource->SetZeroValue(builder)); } else { // Checks the expected shape matches the actual shape. TensorShape actual_shape; @@ -119,8 +117,8 @@ class StackOp : public XlaOpKernel { string name = strings::StrCat("Stack: ", stack_name_); OP_REQUIRES_OK( ctx, xc.CreateResource(XlaResource::kStack, -1, std::move(name), dtype_, - value, &resource)); - resource->set_tensor_array_size(size); + TensorShape(), value, /*tensor_array_size=*/size, + /*tensor_array_gradients=*/{}, &resource)); ctx->SetResourceOutput(0, resource); } @@ -164,11 +162,9 @@ class StackPushOp : public XlaOpKernel { // TODO(phawkins): We don't check the index is in bounds --- there is no // error mechanism in XLA. - OP_REQUIRES_OK( - ctx, - resource->SetValue( - dtype_, b->Tuple({b->DynamicUpdateSlice(ta, update, start_indices), - b->Add(index, b->ConstantR0(1))}))); + OP_REQUIRES_OK(ctx, resource->SetValue(b->Tuple( + {b->DynamicUpdateSlice(ta, update, start_indices), + b->Add(index, b->ConstantR0(1))}))); ctx->SetOutput(0, value); } @@ -208,7 +204,7 @@ class StackPopOp : public XlaOpKernel { xla::ComputationDataHandle index = b->GetTupleElement(state, 1); index = b->Sub(index, b->ConstantR0(1)); - OP_REQUIRES_OK(ctx, resource->SetValue(dtype_, b->Tuple({ta, index}))); + OP_REQUIRES_OK(ctx, resource->SetValue(b->Tuple({ta, index}))); // start_indices of the DynamicSlice are [index, 0, 0, ..., 0]. 
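The ReverseSequence kernel above reverses the whole input along seq_dim, copies each batch element's reversed prefix back into place with dynamic slices, and finally masks out positions beyond the sequence length. A plain NumPy sketch of the op's intended semantics (not of the XLA while loop), with illustrative names:

    import numpy as np

    def reverse_sequence_ref(x, seq_lens, seq_dim=1, batch_dim=0):
        # Reverse only the first seq_lens[b] entries of each batch element
        # along seq_dim; entries past the sequence length are left unchanged.
        out = np.array(x, copy=True)
        for b, length in enumerate(seq_lens):
            idx = [slice(None)] * x.ndim
            idx[batch_dim] = slice(b, b + 1)   # keep the batch axis
            idx[seq_dim] = slice(0, length)    # only the valid prefix
            out[tuple(idx)] = np.flip(x[tuple(idx)], axis=seq_dim)
        return out

    x = np.arange(12).reshape(2, 6)            # batch of 2, max_seq_len of 6
    print(reverse_sequence_ref(x, seq_lens=[3, 5]))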
auto start_indices = diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc index f0525a5fb86..91c169428c7 100644 --- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc @@ -231,6 +231,7 @@ class StridedSliceAssignOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, ctx->GetAttr("new_axis_mask", &new_axis_mask_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("shrink_axis_mask", &shrink_axis_mask_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("Index", &index_type_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); } void Compile(XlaOpKernelContext* ctx) override { @@ -252,9 +253,9 @@ class StridedSliceAssignOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, LiteralToHostTensor(strides_literal, index_type_, &strides_tensor)); - DataType lhs_type; TensorShape lhs_shape; - OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &lhs_type, &lhs_shape)); + xla::ComputationDataHandle lhs; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype_, &lhs_shape, &lhs)); const TensorShape rhs_shape = ctx->InputShape(4); @@ -282,9 +283,6 @@ class StridedSliceAssignOp : public XlaOpKernel { " does not match r-value shape ", rhs_shape.DebugString(), ". Automatic broadcasting not yet implemented.")); - xla::ComputationDataHandle lhs; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &lhs)); - xla::ComputationDataHandle rhs = ctx->Input(4); gtl::InlinedVector dimensions_to_reverse; @@ -320,13 +318,14 @@ class StridedSliceAssignOp : public XlaOpKernel { lhs, rhs, ctx->builder()->ConstantR1(slice_begin)); } - OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, lhs_type, lhs)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, lhs)); } private: int32 begin_mask_, end_mask_; int32 ellipsis_mask_, new_axis_mask_, shrink_axis_mask_; DataType index_type_; + DataType dtype_; }; REGISTER_XLA_OP(Name("ResourceStridedSliceAssign") diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc index 9224072a3cb..7cf9b796b91 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc @@ -62,15 +62,13 @@ Status MaybeInitializeTensorArray(xla::ComputationBuilder* builder, TF_RET_CHECK(resource->tensor_array_size() >= 0) << resource->name() << " size " << resource->tensor_array_size(); - TensorShape ta_shape; - ta_shape.AddDim(resource->tensor_array_size()); - ta_shape.AppendShape(elem_shape); if (!resource->initialized()) { xla::ComputationDataHandle zero = XlaHelpers::Zero(builder, resource->type()); - TF_RETURN_IF_ERROR(resource->SetValue( - dtype, builder->Broadcast(zero, ta_shape.dim_sizes()))); + + TF_RETURN_IF_ERROR(resource->SetTypeAndShape(dtype, elem_shape)); + TF_RETURN_IF_ERROR(resource->SetZeroValue(builder)); } else { // Checks the elem_shape matches the TensorArray shape. 
auto shape_or_status = builder->GetShape(resource->value()); @@ -80,6 +78,10 @@ Status MaybeInitializeTensorArray(xla::ComputationBuilder* builder, TensorShape shape; TF_RETURN_IF_ERROR( XLAShapeToTensorShape(*shape_or_status.ValueOrDie(), &shape)); + + TensorShape ta_shape; + ta_shape.AddDim(resource->tensor_array_size()); + ta_shape.AppendShape(elem_shape); if (ta_shape != shape) { return errors::InvalidArgument( "Mismatched TensorArray sizes: ", ta_shape.DebugString(), " vs ", @@ -114,10 +116,8 @@ Status CheckTensorArrayIsInitialized(const string& op_name, Status GetTensorArrayShape(const XlaResource* resource, xla::ComputationBuilder* builder, TensorShape* shape) { - TF_RETURN_IF_ERROR(resource->GetShape(builder, shape)); - if (shape->dims() < 1) { - return errors::InvalidArgument("TensorArray rank must be >= 1"); - } + *shape = resource->shape(); + shape->InsertDim(0, resource->tensor_array_size()); return Status::OK(); } @@ -160,8 +160,8 @@ class TensorArrayOp : public XlaOpKernel { // Initializes the TensorArray value if we know the element shape. // Otherwise, defer initialization to the first write. xla::ComputationDataHandle value; + TensorShape shape; if (element_shape_.IsFullyDefined()) { - TensorShape shape; CHECK(element_shape_.AsTensorShape(&shape)); TensorShape ta_shape; ta_shape.AddDim(size); @@ -175,8 +175,8 @@ class TensorArrayOp : public XlaOpKernel { string name = strings::StrCat("TensorArray: ", tensor_array_name_); OP_REQUIRES_OK( ctx, xc.CreateResource(XlaResource::kTensorArray, -1, std::move(name), - dtype_, value, &var)); - var->set_tensor_array_size(size); + dtype_, shape, value, /*tensor_array_size=*/size, + /*tensor_array_gradients=*/{}, &var)); ctx->SetResourceOutput(0, var); Tensor flow(DT_FLOAT, TensorShape({})); @@ -230,7 +230,7 @@ class TensorArrayWriteOp : public XlaOpKernel { xla::ComputationDataHandle written = DynamicAddSlice(b, ta, update, slice_shape.dim_sizes(), start_indices); - OP_REQUIRES_OK(ctx, resource->SetValue(dtype_, written)); + OP_REQUIRES_OK(ctx, resource->SetValue(written)); ctx->SetOutput(0, flow); } @@ -421,7 +421,7 @@ class TensorArrayScatterOp : public XlaOpKernel { } } - OP_REQUIRES_OK(ctx, resource->SetValue(dtype_, ta)); + OP_REQUIRES_OK(ctx, resource->SetValue(ta)); ctx->SetOutput(0, flow); } @@ -525,9 +525,8 @@ class TensorArraySplitOp : public XlaOpKernel { value_shape.DebugString(), " vs. 
", ta_shape.DebugString())); - OP_REQUIRES_OK( - ctx, resource->SetValue( - dtype_, b->Add(ta, b->Reshape(value, ta_shape.dim_sizes())))); + OP_REQUIRES_OK(ctx, resource->SetValue(b->Add( + ta, b->Reshape(value, ta_shape.dim_sizes())))); ctx->SetOutput(0, flow); } diff --git a/tensorflow/compiler/tf2xla/kernels/training_ops.cc b/tensorflow/compiler/tf2xla/kernels/training_ops.cc index 5534d1bfa13..f750f7003be 100644 --- a/tensorflow/compiler/tf2xla/kernels/training_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/training_ops.cc @@ -32,9 +32,24 @@ class ResourceApplyGradientDescent : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { xla::ComputationDataHandle handle; xla::ComputationBuilder* b = ctx->builder(); - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &handle)); + DataType type = ctx->input_type(1); + TensorShape var_shape; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &handle)); + + TensorShape alpha_shape = ctx->InputShape(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(alpha_shape), + errors::InvalidArgument("alpha is not a scalar: ", + alpha_shape.DebugString())); + + TensorShape delta_shape = ctx->InputShape(2); + OP_REQUIRES( + ctx, var_shape.IsSameSize(delta_shape), + errors::InvalidArgument("var and delta do not have the same shape: ", + var_shape.DebugString(), " vs ", + delta_shape.DebugString())); + handle = b->Sub(handle, b->Mul(ctx->Input(1), ctx->Input(2))); - OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, ctx->input_type(1), handle)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, handle)); } }; REGISTER_XLA_OP( @@ -52,18 +67,10 @@ class ResourceApplyMomentum : public XlaOpKernel { DataType type = ctx->input_type(2); - DataType var_type, accum_type; TensorShape var_shape, accum_shape; - OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape)); - OP_REQUIRES_OK(ctx, - ctx->GetVariableTypeAndShape(1, &accum_type, &accum_shape)); - - OP_REQUIRES( - ctx, type == var_type && type == accum_type, - errors::InvalidArgument( - "Types of variable arguments to ResourceApplyMomentum must match: ", - DataTypeString(type), " vs. ", DataTypeString(var_type), " and ", - DataTypeString(accum_type))); + xla::ComputationDataHandle var, accum; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &var)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, type, &accum_shape, &accum)); OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape), errors::InvalidArgument( @@ -86,10 +93,6 @@ class ResourceApplyMomentum : public XlaOpKernel { errors::InvalidArgument("momentum is not a scalar: ", momentum_shape.DebugString())); - xla::ComputationDataHandle var, accum; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var)); - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &accum)); - xla::ComputationDataHandle lr = ctx->Input(2); xla::ComputationDataHandle grad = ctx->Input(3); xla::ComputationDataHandle momentum = ctx->Input(4); @@ -122,18 +125,10 @@ class ResourceApplyAdagrad : public XlaOpKernel { DataType type = ctx->input_type(2); - DataType var_type, accum_type; TensorShape var_shape, accum_shape; - OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape)); - OP_REQUIRES_OK(ctx, - ctx->GetVariableTypeAndShape(1, &accum_type, &accum_shape)); - - OP_REQUIRES( - ctx, type == var_type && type == accum_type, - errors::InvalidArgument( - "Types of variable arguments to ResourceApplyAdagrad must match: ", - DataTypeString(type), " vs. 
", DataTypeString(var_type), " and ", - DataTypeString(accum_type))); + xla::ComputationDataHandle var, accum; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &var)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, type, &accum_shape, &accum)); OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape), errors::InvalidArgument( @@ -151,9 +146,6 @@ class ResourceApplyAdagrad : public XlaOpKernel { "var and grad do not have the same shape", var_shape.DebugString(), " ", grad_shape.DebugString())); - xla::ComputationDataHandle var, accum; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var)); - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &accum)); xla::ComputationDataHandle lr = ctx->Input(2); xla::ComputationDataHandle grad = ctx->Input(3); @@ -175,18 +167,11 @@ class ResourceApplyAdam : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - DataType var_type, m_type, v_type; TensorShape var_shape, m_shape, v_shape; - OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape)); - OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(1, &m_type, &m_shape)); - OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(2, &v_type, &v_shape)); - - OP_REQUIRES( - ctx, dtype_ == var_type && dtype_ == m_type && dtype_ == v_type, - errors::InvalidArgument( - "Types of variable arguments to ResourceApplyRMSProp must match: ", - DataTypeString(dtype_), " vs. ", DataTypeString(var_type), " vs. ", - DataTypeString(m_type), " vs. ", DataTypeString(v_type))); + xla::ComputationDataHandle var, m, v; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype_, &var_shape, &var)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, dtype_, &m_shape, &m)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, dtype_, &v_shape, &v)); TensorShape beta1_power_shape = ctx->InputShape(3); TensorShape beta2_power_shape = ctx->InputShape(4); @@ -228,10 +213,6 @@ class ResourceApplyAdam : public XlaOpKernel { "var and grad do not have the same shape", var_shape.DebugString(), " ", grad_shape.DebugString())); - xla::ComputationDataHandle var, m, v; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var)); - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &m)); - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, &v)); xla::ComputationDataHandle beta1_power = ctx->Input(3); xla::ComputationDataHandle beta2_power = ctx->Input(4); xla::ComputationDataHandle lr = ctx->Input(5); @@ -278,18 +259,11 @@ class ResourceApplyRMSProp : public XlaOpKernel { DataType type = ctx->input_type(3); - DataType var_type, ms_type, mom_type; TensorShape var_shape, ms_shape, mom_shape; - OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape)); - OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(1, &ms_type, &ms_shape)); - OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(2, &mom_type, &mom_shape)); - - OP_REQUIRES( - ctx, type == var_type && type == ms_type && type == mom_type, - errors::InvalidArgument( - "Types of variable arguments to ResourceApplyRMSProp must match: ", - DataTypeString(type), " vs. ", DataTypeString(var_type), " vs. ", - DataTypeString(ms_type), " vs. 
", DataTypeString(mom_type))); + xla::ComputationDataHandle var, ms, mom; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &var)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, type, &ms_shape, &ms)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, type, &mom_shape, &mom)); TensorShape lr_shape = ctx->InputShape(3); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_shape), @@ -323,10 +297,6 @@ class ResourceApplyRMSProp : public XlaOpKernel { "var and grad do not have the same shape", var_shape.DebugString(), " ", grad_shape.DebugString())); - xla::ComputationDataHandle var, ms, mom; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var)); - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &ms)); - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, &mom)); xla::ComputationDataHandle lr = ctx->Input(3); xla::ComputationDataHandle rho = ctx->Input(4); xla::ComputationDataHandle momentum = ctx->Input(5); @@ -373,20 +343,11 @@ void CompileFtrl(XlaOpKernelContext* ctx, DataType dtype, bool has_l2_shrinkage) { xla::ComputationBuilder* b = ctx->builder(); - DataType var_type, accum_type, linear_type; TensorShape var_shape, accum_shape, linear_shape; - OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape)); - OP_REQUIRES_OK(ctx, - ctx->GetVariableTypeAndShape(1, &accum_type, &accum_shape)); - OP_REQUIRES_OK(ctx, - ctx->GetVariableTypeAndShape(2, &linear_type, &linear_shape)); - - OP_REQUIRES( - ctx, dtype == var_type && dtype == accum_type && dtype == linear_type, - errors::InvalidArgument( - "Types of variable arguments to ResourceApplyFtrlV2 must match: ", - DataTypeString(dtype), " vs. ", DataTypeString(var_type), " and ", - DataTypeString(accum_type), " and ", DataTypeString(linear_type))); + xla::ComputationDataHandle var, accum, linear; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype, &var_shape, &var)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, dtype, &accum_shape, &accum)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, dtype, &linear_shape, &linear)); OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape), errors::InvalidArgument( @@ -438,10 +399,6 @@ void CompileFtrl(XlaOpKernelContext* ctx, DataType dtype, errors::InvalidArgument("lr_power is not a scalar: ", lr_power_shape.DebugString())); - xla::ComputationDataHandle var, accum, linear; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var)); - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &accum)); - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, &linear)); xla::ComputationDataHandle grad = ctx->Input(3); xla::ComputationDataHandle lr = ctx->Input(4); xla::ComputationDataHandle l1 = ctx->Input(5); diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc index a266e9013c4..0c5ad9e5255 100644 --- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc @@ -50,18 +50,41 @@ XLAJIT_MAKE_UNARY(Conj, b->Conj(x)); // Return x if x>0, otherwise -x. 
XLAJIT_MAKE_UNARY(Abs, b->Abs(x)); +// acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x)) +XLAJIT_MAKE_UNARY( + Acos, + b->Mul(XlaHelpers::FloatLiteral(b, input_type(0), 2.0), + b->Atan2(b->Pow(b->Sub(XlaHelpers::One(b, input_type(0)), + b->Mul(x, x)), + XlaHelpers::FloatLiteral(b, input_type(0), 0.5)), + b->Add(XlaHelpers::One(b, input_type(0)), x)))); + // acosh(x) = log(x + sqrt(x^2 - 1)) XLAJIT_MAKE_UNARY( Acosh, b->Log(b->Add(x, b->Pow(b->Sub(b->Mul(x, x), XlaHelpers::One(b, input_type(0))), XlaHelpers::FloatLiteral(b, input_type(0), 0.5))))); + +// asin(x) = 2 * atan(x / (1 + sqrt(1 - x^2))) +XLAJIT_MAKE_UNARY( + Asin, + b->Mul(XlaHelpers::FloatLiteral(b, input_type(0), 2.0), + b->Atan2(x, b->Add(XlaHelpers::One(b, input_type(0)), + b->Pow(b->Sub(XlaHelpers::One(b, input_type(0)), + b->Mul(x, x)), + XlaHelpers::FloatLiteral(b, input_type(0), + 0.5)))))); + // asinh(x) = log(x + sqrt(x^2 + 1)) XLAJIT_MAKE_UNARY( Asinh, b->Log(b->Add(x, b->Pow(b->Add(b->Mul(x, x), XlaHelpers::One(b, input_type(0))), XlaHelpers::FloatLiteral(b, input_type(0), 0.5))))); + +XLAJIT_MAKE_UNARY(Atan, b->Atan2(x, XlaHelpers::One(b, input_type(0)))); + // atanh(x) = 0.5 * log((1 + x) / (1 - x)) XLAJIT_MAKE_UNARY( Atanh, b->Mul(b->Log(b->Div(b->Add(XlaHelpers::One(b, input_type(0)), x), diff --git a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc index 68847ae7a2c..e4079ebf0b8 100644 --- a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc @@ -33,21 +33,29 @@ class VarIsInitializedOp : public XlaOpKernel { public: explicit VarIsInitializedOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationDataHandle handle; - bool initialized = ctx->ReadVariableInput(0, &handle).ok(); - ctx->SetOutput(0, ctx->builder()->ConstantR0(initialized)); + XlaResource* variable; + OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &variable)); + ctx->SetOutput(0, + ctx->builder()->ConstantR0(variable->initialized())); } }; REGISTER_XLA_OP(Name("VarIsInitializedOp"), VarIsInitializedOp); class ReadVariableOp : public XlaOpKernel { public: - explicit ReadVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + explicit ReadVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_)); + } + void Compile(XlaOpKernelContext* ctx) override { xla::ComputationDataHandle handle; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &handle)); + OP_REQUIRES_OK( + ctx, ctx->ReadVariableInput(0, dtype_, /*shape=*/nullptr, &handle)); ctx->SetOutput(0, handle); } + + private: + DataType dtype_; }; REGISTER_XLA_OP(Name("ReadVariableOp"), ReadVariableOp); @@ -65,10 +73,12 @@ class AssignAddVariableOp : public XlaOpKernel { public: explicit AssignAddVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { + DataType type = ctx->input_type(1); xla::ComputationDataHandle handle; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &handle)); + OP_REQUIRES_OK(ctx, + ctx->ReadVariableInput(0, type, /*shape=*/nullptr, &handle)); handle = ctx->builder()->Add(handle, ctx->Input(1)); - OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, ctx->input_type(1), handle)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, handle)); } }; REGISTER_XLA_OP( @@ -79,10 +89,12 @@ class AssignSubVariableOp : public XlaOpKernel { public: explicit AssignSubVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void 
Compile(XlaOpKernelContext* ctx) override { + DataType type = ctx->input_type(1); xla::ComputationDataHandle handle; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &handle)); + OP_REQUIRES_OK(ctx, + ctx->ReadVariableInput(0, type, /*shape=*/nullptr, &handle)); handle = ctx->builder()->Sub(handle, ctx->Input(1)); - OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, ctx->input_type(1), handle)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, handle)); } }; REGISTER_XLA_OP( @@ -95,28 +107,19 @@ class ResourceGatherOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { xla::ComputationBuilder* builder = ctx->builder(); - // Get the shape of the resource tensor. + DataType type = ctx->expected_output_dtype(0); + TensorShape resource_shape; - DataType resource_dtype; - OP_REQUIRES_OK( - ctx, ctx->GetVariableTypeAndShape(0, &resource_dtype, &resource_shape)); - - DataType expected_output_dtype = ctx->expected_output_dtype(0); - OP_REQUIRES(ctx, resource_dtype == expected_output_dtype, - errors::InvalidArgument( - "Variable dtype is ", DataTypeString(resource_dtype), - " but expected output dtype is ", - DataTypeString(expected_output_dtype), ".")); - xla::ComputationDataHandle resource_handle; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &resource_handle)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &resource_shape, + &resource_handle)); auto indices = ctx->Input(1); auto indices_shape = ctx->InputShape(1); DataType index_type = ctx->input_type(1); xla::ComputationDataHandle gather = XlaComputeGatherDynamicSlice( - ctx, resource_handle, resource_shape, indices, indices_shape, 0, - resource_dtype, index_type, builder); + ctx, resource_handle, resource_shape, indices, indices_shape, 0, type, + index_type, builder); ctx->SetOutput(0, gather); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc index 4a711e4d9b7..0ff1b65ae91 100644 --- a/tensorflow/compiler/tf2xla/kernels/while_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc @@ -58,9 +58,8 @@ Status MakeXlaCompilerArgumentsFromInputs( } arg.type = resource->type(); - if (arg.initialized) { - TF_RETURN_IF_ERROR(resource->PackedShape(ctx->builder(), &arg.shape)); - } else { + arg.shape = resource->shape(); + if (!arg.initialized) { *has_uninitialized_vars = true; } arg.tensor_array_size = resource->tensor_array_size(); @@ -70,14 +69,13 @@ Status MakeXlaCompilerArgumentsFromInputs( arg.name = resource->name(); VLOG(2) << " resource " << resource->name() << " type: " << DataTypeString(arg.type) - << " shape: " << xla::ShapeUtil::HumanString(arg.shape) + << " shape: " << arg.shape.DebugString() << " initialized: " << arg.initialized; } else { arg.kind = XlaCompiler::Argument::kParameter; arg.type = ctx->input_type(i); - TF_RETURN_IF_ERROR( - TensorShapeToXLAShape(arg.type, ctx->InputShape(i), &arg.shape)); + arg.shape = ctx->InputShape(i); } } return Status::OK(); @@ -154,17 +152,14 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { XlaCompiler::Argument& arg = arguments[update.input_index]; if (!arg.initialized) { VLOG(2) << "Update shape for argument " << update.input_index << " " - << xla::ShapeUtil::HumanString(update.shape); + << update.shape.DebugString(); arg.initialized = true; - xla::Shape shape = update.shape; - if (!update.tensor_array_gradients_accessed.empty()) { - shape = xla::ShapeUtil::GetTupleElementShape(shape, 0); - } - std::unique_ptr zero = - xla::Literal::CreateFromShape(shape); - OP_REQUIRES_OK(ctx, resource->SetValue( - 
update.type, builder->ConstantLiteral(*zero))); + arg.shape = update.shape; + OP_REQUIRES_OK(ctx, + resource->SetTypeAndShape(update.type, update.shape)); + + OP_REQUIRES_OK(ctx, resource->SetZeroValue(builder)); } // Add any TensorArray gradients touched by the body to the enclosing @@ -182,9 +177,6 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { for (const auto& gradient : resource->tensor_array_gradients()) { arg.tensor_array_gradients.insert(gradient.first); } - - // Recompute the argument shape. - OP_REQUIRES_OK(ctx, resource->PackedShape(ctx->builder(), &arg.shape)); } // Recompile the body with the "correct" resource shapes. VLOG(1) << "Recompiling body with corrected resource shapes"; @@ -292,13 +284,12 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { OP_REQUIRES_OK(ctx, resource->SetFromPack( arguments[update.input_index].tensor_array_gradients, - builder->GetTupleElement(while_result, pos), - /*reset_initial_values=*/false, builder)); + builder->GetTupleElement(while_result, pos), builder)); } VLOG(2) << "Loop-carried variable: pos: " << update.input_index << " name: " << resource->name() << " modified: " << update.modified << " type: " << DataTypeString(update.type) - << " shape: " << xla::ShapeUtil::HumanString(update.shape); + << " shape: " << update.shape.DebugString(); // Copies the identity of the resource variable from input to output // unchanged, even if the variable was not modified. ctx->op_kernel_context()->set_output( diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD index 21ad21f7373..d184f59e017 100644 --- a/tensorflow/compiler/tf2xla/lib/BUILD +++ b/tensorflow/compiler/tf2xla/lib/BUILD @@ -60,6 +60,8 @@ cc_library( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/core:lib", diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.cc b/tensorflow/compiler/tf2xla/lib/batch_dot.cc index 9b0e6174475..798f0fa7805 100644 --- a/tensorflow/compiler/tf2xla/lib/batch_dot.cc +++ b/tensorflow/compiler/tf2xla/lib/batch_dot.cc @@ -25,11 +25,10 @@ limitations under the License. namespace tensorflow { -// The current implementation simply unrolls the computation along the batch -// dimension. 
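The Acos/Asin/Atan rewrites added to unary_ops.cc above rely on standard half-angle identities expressed through atan2. They can be checked numerically; a short sketch in which NumPy stands in for the XLA Atan2/Pow primitives:

    import numpy as np

    x = np.linspace(-0.999, 0.999, 9)
    # acos(x) = 2 * atan2(sqrt(1 - x^2), 1 + x)
    acos_rewrite = 2.0 * np.arctan2(np.sqrt(1.0 - x * x), 1.0 + x)
    # asin(x) = 2 * atan2(x, 1 + sqrt(1 - x^2))
    asin_rewrite = 2.0 * np.arctan2(x, 1.0 + np.sqrt(1.0 - x * x))
    # atan(x) = atan2(x, 1)
    atan_rewrite = np.arctan2(x, np.ones_like(x))

    assert np.allclose(acos_rewrite, np.arccos(x))
    assert np.allclose(asin_rewrite, np.arcsin(x))
    assert np.allclose(atan_rewrite, np.arctan(x))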
xla::StatusOr BatchDot( xla::ComputationBuilder* builder, xla::ComputationDataHandle x, - xla::ComputationDataHandle y, bool transpose_x, bool transpose_y) { + xla::ComputationDataHandle y, bool transpose_x, bool transpose_y, + bool conjugate_x, bool conjugate_y) { TF_ASSIGN_OR_RETURN(std::unique_ptr x_shape, builder->GetShape(x)); TF_ASSIGN_OR_RETURN(std::unique_ptr y_shape, @@ -89,10 +88,10 @@ xla::StatusOr BatchDot( dimensions); } - if (x_shape->element_type() == xla::C64 && transpose_x) { + if (x_shape->element_type() == xla::C64 && conjugate_x) { x = builder->Conj(x); } - if (y_shape->element_type() == xla::C64 && transpose_y) { + if (y_shape->element_type() == xla::C64 && conjugate_y) { y = builder->Conj(y); } diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.h b/tensorflow/compiler/tf2xla/lib/batch_dot.h index b46bc7417d2..b230e885f10 100644 --- a/tensorflow/compiler/tf2xla/lib/batch_dot.h +++ b/tensorflow/compiler/tf2xla/lib/batch_dot.h @@ -27,7 +27,10 @@ namespace tensorflow { // viewed as an element of a batch), and arranges the individual results // in a single output tensor of the same batch size. Each of the // individual slices can optionally be transposed before multiplication by -// setting the `transpose_x` or `transpose_y` flag to `true`. +// setting the `transpose_x` or `transpose_y` flag to `true`. Similarly, each +// can be elementwise-complex-conjugated by setting the `conjugate_x` or +// `conjugate_y` flag to `true`. To apply a Hermitian adjoint to `x`, set both +// `transpose_x` and `conjugate_x` to `true`, and analogously for `y`. // // The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]` // and `[..., r_y, c_y]`. @@ -40,11 +43,10 @@ namespace tensorflow { // It is computed as: // // output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :]) -// TODO(phawkins): add an option to take the complex conjugate of the LHS or -// RHS. 
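As a reference for the contract documented above, the following NumPy sketch mirrors BatchDot's transpose/conjugate flags on 3-D inputs (one batch dimension); batch_dot_ref is an illustrative name, not part of the library:

    import numpy as np

    def batch_dot_ref(x, y, transpose_x=False, transpose_y=False,
                      conjugate_x=False, conjugate_y=False):
        # Optionally conjugate and/or swap the two innermost dimensions of
        # each operand, then multiply matching batch slices.
        if conjugate_x:
            x = np.conj(x)
        if conjugate_y:
            y = np.conj(y)
        if transpose_x:
            x = np.swapaxes(x, -1, -2)
        if transpose_y:
            y = np.swapaxes(y, -1, -2)
        return np.matmul(x, y)

    a = np.random.randn(2, 3, 4) + 1j * np.random.randn(2, 3, 4)
    b = np.random.randn(2, 3, 5) + 1j * np.random.randn(2, 3, 5)
    # Hermitian adjoint of `a`: set both transpose_x and conjugate_x to True.
    print(batch_dot_ref(a, b, transpose_x=True, conjugate_x=True).shape)  # (2, 4, 5)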
xla::StatusOr BatchDot( xla::ComputationBuilder* builder, xla::ComputationDataHandle x, - xla::ComputationDataHandle y, bool transpose_x, bool transpose_y); + xla::ComputationDataHandle y, bool transpose_x, bool transpose_y, + bool conjugate_x = false, bool conjugate_y = false); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.cc b/tensorflow/compiler/tf2xla/lib/cholesky.cc index b3cc489adf6..e795701181d 100644 --- a/tensorflow/compiler/tf2xla/lib/cholesky.cc +++ b/tensorflow/compiler/tf2xla/lib/cholesky.cc @@ -71,11 +71,14 @@ xla::StatusOr CholeskyUnblocked( SliceInMinorDims(builder, l, {j + 1, 0}, {n, j})); TF_ASSIGN_OR_RETURN(auto r_squared, BatchDot(builder, r, r, /*transpose_x=*/false, - /*transpose_y=*/true)); + /*transpose_y=*/true, /*conjugate_x=*/false, + /*conjugate_y=*/false)); new_d_squared = builder->Sub(new_d_squared, r_squared); TF_ASSIGN_OR_RETURN(br, BatchDot(builder, b, r, /*transpose_x=*/false, - /*transpose_y=*/true)); + /*transpose_y=*/true, + /*conjugate_x=*/false, + /*conjugate_y=*/false)); } auto new_d_inv = builder->Pow( new_d_squared, FloatLiteral(builder, shape->element_type(), -0.5)); @@ -134,7 +137,8 @@ xla::StatusOr Cholesky( SliceInMinorDims(builder, l, {i, 0}, {i + k, i})); TF_ASSIGN_OR_RETURN(auto delta, BatchDot(builder, lhs, rhs, /*transpose_x=*/false, - /*transpose_y=*/true)); + /*transpose_y=*/true, /*conjugate_x=*/false, + /*conjugate_y=*/false)); TF_ASSIGN_OR_RETURN(auto before, SliceInMinorDims(builder, a, {i, i}, {n, i + k})); TF_ASSIGN_OR_RETURN( @@ -155,6 +159,10 @@ xla::StatusOr Cholesky( SliceInMinorDims(builder, a, {i + k, i}, {n, i + k})); TF_ASSIGN_OR_RETURN(auto update, TriangularSolve(builder, factorized, panel, + /*left_side=*/false, + /*lower=*/true, + /*transpose_a=*/true, + /*conjugate_a=*/false, /*block_size=*/8)); TF_ASSIGN_OR_RETURN( l, UpdateSliceInMinorDims(builder, l, update, {i + k, i})); diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.h b/tensorflow/compiler/tf2xla/lib/cholesky.h index 2bead7359ba..e083a383be4 100644 --- a/tensorflow/compiler/tf2xla/lib/cholesky.h +++ b/tensorflow/compiler/tf2xla/lib/cholesky.h @@ -29,6 +29,7 @@ namespace tensorflow { // the block size to use. // TODO(phawkins): check for negative values on the diagonal and return an // error, instead of silently yielding NaNs. +// TODO(mattjj): handle the complex Hermitian case xla::StatusOr Cholesky( xla::ComputationBuilder* builder, xla::ComputationDataHandle a, int64 block_size = 256); diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc index 579944c3a38..7f72a6073df 100644 --- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc +++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc @@ -24,13 +24,15 @@ limitations under the License. 
#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" namespace tensorflow { xla::StatusOr TriangularSolve( xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a, - xla::ComputationDataHandle b, int64 block_size) { + xla::ComputationDataHandle b, bool left_side, bool lower, bool transpose_a, + bool conjugate_a, int64 block_size) { TF_ASSIGN_OR_RETURN(std::unique_ptr a_shape, builder->GetShape(a)); TF_ASSIGN_OR_RETURN(std::unique_ptr b_shape, @@ -60,14 +62,15 @@ xla::StatusOr TriangularSolve( batch_dimensions.push_back(a_size); } - const int64 n = xla::ShapeUtil::GetDimension(*a_shape, -1); - const int64 m = xla::ShapeUtil::GetDimension(*b_shape, -2); - if (n != xla::ShapeUtil::GetDimension(*a_shape, -2)) { + if (xla::ShapeUtil::GetDimension(*a_shape, -1) != + xla::ShapeUtil::GetDimension(*a_shape, -2)) { return errors::InvalidArgument( "The 'a' arguments to TriangularSolve must be square matrices: ", xla::ShapeUtil::HumanString(*a_shape)); } - if (n != xla::ShapeUtil::GetDimension(*b_shape, -1)) { + const int64 m = xla::ShapeUtil::GetDimension(*b_shape, -2); + const int64 n = xla::ShapeUtil::GetDimension(*b_shape, -1); + if ((left_side ? m : n) != xla::ShapeUtil::GetDimension(*a_shape, -1)) { return errors::InvalidArgument( "Arguments to TriangularSolve have incompatible matrix shapes: ", xla::ShapeUtil::HumanString(*a_shape), " vs ", @@ -89,6 +92,14 @@ xla::StatusOr TriangularSolve( return output; }; + // Applies a complex conjugation operation if `a` is complex and `conjugate_a` + // is true, otherwise returns its argument. + auto maybe_conj = [&](xla::ComputationBuilder* builder, + xla::ComputationDataHandle x) { + auto perform_conj = a_shape->element_type() == xla::C64 && conjugate_a; + return perform_conj ? builder->Conj(x) : x; + }; + std::map base_computations; auto get_base_triangular_solve = [&](int k) -> xla::StatusOr { @@ -103,19 +114,35 @@ xla::StatusOr TriangularSolve( prepend_batch_dims({k, k})), "a"); + std::array b_lastd; + if (left_side) { + b_lastd = {k, n}; + } else { + b_lastd = {m, k}; + } auto b_param = sub->Parameter(1, xla::ShapeUtil::MakeShape(b_shape->element_type(), - prepend_batch_dims({m, k})), + prepend_batch_dims(b_lastd)), "b"); - // TODO(phawkins): it might make sense to use a while loop here, rather - // than unrolling. - // TODO(phawkins): the left-looking variant of the algorithm might be more - // efficient at block size 1. - TF_RETURN_IF_ERROR(TriangularSolve(sub.get(), a_param, b_param, - /*block_size=*/1) - .status()); + // We use a left-looking subroutine on the block diagonal in some common + // cases, while falling back to a recursive call in unsupported cases. The + // left-looking subroutine is written with a While loop and so yields much + // faster compile times. Moreover, the left-looking variant can give + // higher performance on smaller (sub)problems. + if (left_side && lower) { + TF_RETURN_IF_ERROR(TriangularSolveLeftLooking(sub.get(), a_param, + b_param, transpose_a, + conjugate_a) + .status()); + } else { + TF_RETURN_IF_ERROR(TriangularSolve(sub.get(), a_param, b_param, + left_side, lower, transpose_a, + conjugate_a, + /*block_size=*/1) + .status()); + } TF_ASSIGN_OR_RETURN(computation, sub->Build()); } @@ -129,47 +156,396 @@ xla::StatusOr TriangularSolve( // Goto, Kazushige, and Robert Van De Geijn. 
"High-performance implementation // of the level-3 BLAS." ACM Transactions on Mathematical Software (TOMS) 35.1 // (2008): 4. - for (int64 i = 0; i < n; i += block_size) { - int64 k = std::min(block_size, n - i); - // if k > 1: - // output[..., :, i:i+k] = triangular_solve( - // a[..., i:i+k, ..., i:i+k], b[..., :, i:i+k], side='Right', - // kind='Lower', transpose=True, block_size=1) - // else: - // output[..., :, i] = b[..., :, i] / a[..., i, i] - TF_ASSIGN_OR_RETURN(auto a_slice, - SliceInMinorDims(builder, a, {i, i}, {i + k, i + k})); - TF_ASSIGN_OR_RETURN(auto b_slice, - SliceInMinorDims(builder, b, {0, i}, {m, i + k})); - xla::ComputationDataHandle update; - if (k > 1) { - TF_ASSIGN_OR_RETURN(xla::Computation * solve, - get_base_triangular_solve(k)); - update = builder->Call(*solve, {a_slice, b_slice}); - } else { - update = builder->Div(b_slice, a_slice); + // In the code comments below, T = lambda x: np.swapaxes(x, -1, -2) if + // conjugate_a is False, or T = lambda x: np.conj(np.swapaxes(x, -1, -2)) if + // conjugate_a is True. + + if (!left_side && lower == transpose_a) { + // for i in range(0, a.shape[-1], block_size): + for (int64 i = 0; i < n; i += block_size) { + int64 k = std::min(block_size, n - i); + + // output[..., :, i:i+k] = triangular_solve( + // a[..., i:i+k, i:i+k], b[..., :, i:i+k], ..., block_size=1) + TF_ASSIGN_OR_RETURN(auto a_slice, + SliceInMinorDims(builder, a, {i, i}, {i + k, i + k})); + TF_ASSIGN_OR_RETURN(auto b_slice, + SliceInMinorDims(builder, b, {0, i}, {m, i + k})); + xla::ComputationDataHandle update; + if (k > 1) { + TF_ASSIGN_OR_RETURN(xla::Computation * solve, + get_base_triangular_solve(k)); + update = builder->Call(*solve, {a_slice, b_slice}); + } else { + update = builder->Div(b_slice, maybe_conj(builder, a_slice)); + } + TF_ASSIGN_OR_RETURN( + output, UpdateSliceInMinorDims(builder, output, update, {0, i})); + + // if i + k < a.shape[-1]: + // a_slice_2 = a[..., i+k:, i:i+k] if lower else a[..., i:i+k, i+k:] + // a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2 + // b[..., :, i+k:] -= np.matmul(output[..., :, i:i+k], a_slice_2) + if (i + k < n) { + xla::ComputationDataHandle a_slice_2; + if (lower) { + TF_ASSIGN_OR_RETURN( + a_slice_2, SliceInMinorDims(builder, a, {i + k, i}, {n, i + k})); + } else { + TF_ASSIGN_OR_RETURN( + a_slice_2, SliceInMinorDims(builder, a, {i, i + k}, {i + k, n})); + } + + TF_ASSIGN_OR_RETURN(auto b_update, + BatchDot(builder, update, a_slice_2, + /*transpose_x=*/false, + /*transpose_y=*/transpose_a, + /*conjugate_x=*/false, + /*conjugate_y=*/conjugate_a)); + TF_ASSIGN_OR_RETURN(auto b_slice_2, + SliceInMinorDims(builder, b, {0, i + k}, {m, n})); + b_update = builder->Sub(b_slice_2, b_update); + TF_ASSIGN_OR_RETURN( + b, UpdateSliceInMinorDims(builder, b, b_update, {0, i + k})); + } } - TF_ASSIGN_OR_RETURN( - output, UpdateSliceInMinorDims(builder, output, update, {0, i})); - // b[..., :, i+k:] -= np.dot(output[..., :, i:i+k], - // np.transpose(..., a[i+k:, i:i+k])) - if (i + k < n) { - TF_ASSIGN_OR_RETURN(auto a_slice_2, - SliceInMinorDims(builder, a, {i + k, i}, {n, i + k})); - TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(builder, update, a_slice_2, - /*transpose_x=*/false, - /*transpose_y=*/true)); + } else if (left_side && lower != transpose_a) { + // for i in range(0, a.shape[-1], block_size): + for (int64 i = 0; i < m; i += block_size) { + int64 k = std::min(block_size, m - i); - TF_ASSIGN_OR_RETURN(auto b_slice_2, - SliceInMinorDims(builder, b, {0, i + k}, {m, n})); - b_update = builder->Sub(b_slice_2, 
b_update); + // output[..., i:i+k, :] = triangular_solve( + // a[..., i:i+k, i:i+k], b[..., i:i+k, :], ..., block_size=1) + TF_ASSIGN_OR_RETURN(auto a_slice, + SliceInMinorDims(builder, a, {i, i}, {i + k, i + k})); + TF_ASSIGN_OR_RETURN(auto b_slice, + SliceInMinorDims(builder, b, {i, 0}, {i + k, n})); + xla::ComputationDataHandle update; + if (k > 1) { + TF_ASSIGN_OR_RETURN(xla::Computation * solve, + get_base_triangular_solve(k)); + update = builder->Call(*solve, {a_slice, b_slice}); + } else { + update = builder->Div(b_slice, maybe_conj(builder, a_slice)); + } TF_ASSIGN_OR_RETURN( - b, UpdateSliceInMinorDims(builder, b, b_update, {0, i + k})); + output, UpdateSliceInMinorDims(builder, output, update, {i, 0})); + + // if i + k < a.shape[-1]: + // a_slice_2 = a[..., i+k:, i:i+k] if lower else a[..., i:i+k, i+k:] + // a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2 + // b[..., i+k:, :] -= np.matmul(a_slice_2, output[..., i:i+k, :]) + if (i + k < m) { + xla::ComputationDataHandle a_slice_2; + if (lower) { + TF_ASSIGN_OR_RETURN( + a_slice_2, SliceInMinorDims(builder, a, {i + k, i}, {m, i + k})); + } else { + TF_ASSIGN_OR_RETURN( + a_slice_2, SliceInMinorDims(builder, a, {i, i + k}, {i + k, m})); + } + + TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(builder, a_slice_2, update, + /*transpose_x=*/transpose_a, + /*transpose_y=*/false, + /*conjugate_x=*/conjugate_a, + /*conjugate_y=*/false)); + TF_ASSIGN_OR_RETURN(auto b_slice_2, + SliceInMinorDims(builder, b, {i + k, 0}, {m, n})); + b_update = builder->Sub(b_slice_2, b_update); + TF_ASSIGN_OR_RETURN( + b, UpdateSliceInMinorDims(builder, b, b_update, {i + k, 0})); + } + } + } else if (!left_side && lower != transpose_a) { + // for i in reversed(range(0, a.shape[-1], block_size)): + const int64 last_blk_ix = xla::RoundUpToNearest(n, block_size) - block_size; + for (int64 i = last_blk_ix; i >= 0; i -= block_size) { + int64 k = std::min(block_size, n - i); + + // output[..., :, i:i+k] triangular_solve( + // a[..., i:i+k, i:i+k], b[..., :, i:i+k], ..., block_size=1) + TF_ASSIGN_OR_RETURN(auto a_slice, + SliceInMinorDims(builder, a, {i, i}, {i + k, i + k})); + TF_ASSIGN_OR_RETURN(auto b_slice, + SliceInMinorDims(builder, b, {0, i}, {m, i + k})); + xla::ComputationDataHandle update; + if (k > 1) { + TF_ASSIGN_OR_RETURN(xla::Computation * solve, + get_base_triangular_solve(k)); + update = builder->Call(*solve, {a_slice, b_slice}); + } else { + update = builder->Div(b_slice, maybe_conj(builder, a_slice)); + } + TF_ASSIGN_OR_RETURN( + output, UpdateSliceInMinorDims(builder, output, update, {0, i})); + + // if i - k >= 0: + // a_slice_2 = a[..., i:i+k, :i] if lower else a[..., :i, i:i+k] + // a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2 + // b[..., :, :i] -= np.matmul(out[..., :, i:i+k], a_slice_2) + if (i - k >= 0) { + xla::ComputationDataHandle a_slice_2; + if (lower) { + TF_ASSIGN_OR_RETURN(a_slice_2, + SliceInMinorDims(builder, a, {i, 0}, {i + k, i})); + } else { + TF_ASSIGN_OR_RETURN(a_slice_2, + SliceInMinorDims(builder, a, {0, i}, {i, i + k})); + } + + TF_ASSIGN_OR_RETURN(auto b_update, + BatchDot(builder, update, a_slice_2, + /*transpose_x=*/false, + /*transpose_y=*/transpose_a, + /*conjugate_x=*/false, + /*conjugate_y=*/conjugate_a)); + TF_ASSIGN_OR_RETURN(auto b_slice_2, + SliceInMinorDims(builder, b, {0, 0}, {m, i})); + b_update = builder->Sub(b_slice_2, b_update); + TF_ASSIGN_OR_RETURN( + b, UpdateSliceInMinorDims(builder, b, b_update, {0, 0})); + } + } + } else { // left_side && lower == transpose_a + // for i in 
reversed(range(0, a.shape[-1], block_size)): + const int64 last_blk_ix = xla::RoundUpToNearest(m, block_size) - block_size; + for (int64 i = last_blk_ix; i >= 0; i -= block_size) { + int64 k = std::min(block_size, m - i); + + // output[..., i:i+k, :] triangular_solve( + // a[..., i:i+k, i:i+k], b[..., i:i+k, :], ..., block_size=1) + TF_ASSIGN_OR_RETURN(auto a_slice, + SliceInMinorDims(builder, a, {i, i}, {i + k, i + k})); + TF_ASSIGN_OR_RETURN(auto b_slice, + SliceInMinorDims(builder, b, {i, 0}, {i + k, n})); + xla::ComputationDataHandle update; + if (k > 1) { + TF_ASSIGN_OR_RETURN(xla::Computation * solve, + get_base_triangular_solve(k)); + update = builder->Call(*solve, {a_slice, b_slice}); + } else { + update = builder->Div(b_slice, maybe_conj(builder, a_slice)); + } + TF_ASSIGN_OR_RETURN( + output, UpdateSliceInMinorDims(builder, output, update, {i, 0})); + + // if i - k >= 0: + // a_slice_2 = a[..., i:i+k, :i] if lower else a[..., :i, i:i+k] + // a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2 + // b[..., :i, :] -= np.matmul(a_slice_2, out[..., i:i+k, :]) + if (i - k >= 0) { + xla::ComputationDataHandle a_slice_2; + if (lower) { + TF_ASSIGN_OR_RETURN(a_slice_2, + SliceInMinorDims(builder, a, {i, 0}, {i + k, i})); + } else { + TF_ASSIGN_OR_RETURN(a_slice_2, + SliceInMinorDims(builder, a, {0, i}, {i, i + k})); + } + + TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(builder, a_slice_2, update, + /*transpose_x=*/transpose_a, + /*transpose_y=*/false, + /*conjugate_x=*/conjugate_a, + /*conjugate_y=*/false)); + TF_ASSIGN_OR_RETURN(auto b_slice_2, + SliceInMinorDims(builder, b, {0, 0}, {i, n})); + b_update = builder->Sub(b_slice_2, b_update); + TF_ASSIGN_OR_RETURN( + b, UpdateSliceInMinorDims(builder, b, b_update, {0, 0})); + } } } + return output; } +xla::StatusOr TriangularSolveLeftLooking( + xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a, + const xla::ComputationDataHandle& b, bool transpose_a, bool conjugate_a) { + TF_ASSIGN_OR_RETURN(std::unique_ptr a_shape, + builder->GetShape(a)); + TF_ASSIGN_OR_RETURN(std::unique_ptr b_shape, + builder->GetShape(b)); + const int64 m = xla::ShapeUtil::GetDimension(*b_shape, -2); + const int64 n = xla::ShapeUtil::GetDimension(*b_shape, -1); + const int64 ndims = xla::ShapeUtil::Rank(*a_shape); + + std::vector batch_dimensions; + for (int i = 0; i < ndims - 2; ++i) { + int64 a_size = a_shape->dimensions(i); + batch_dimensions.push_back(a_size); + } + + auto prepend_batch_dims = [&](std::array indices) { + std::vector output(ndims); + std::copy(batch_dimensions.begin(), batch_dimensions.end(), output.begin()); + std::copy(indices.begin(), indices.end(), + output.begin() + batch_dimensions.size()); + return output; + }; + + auto maybe_conj = [&](xla::ComputationBuilder* builder, + xla::ComputationDataHandle x) { + auto perform_conj = a_shape->element_type() == xla::C64 && conjugate_a; + return perform_conj ? builder->Conj(x) : x; + }; + + // The main computation is performed in a While loop. + + // Allocate the output and set its first or last row, + // output = np.zeros_like(b) + // if transpose_a: + // output[..., m-1:, :] = b[..., m-1:, :] / a[..., m-1:, m-1:] + // else: + // output[..., :1, :] = b[..., :1, :] / a[..., :1, :1] + xla::ComputationDataHandle output = Zeros(builder, *b_shape); + { + auto i = transpose_a ? 
m - 1 : 0; + TF_ASSIGN_OR_RETURN(auto a_slice, + SliceInMinorDims(builder, a, {i, i}, {i + 1, i + 1})); + TF_ASSIGN_OR_RETURN(auto b_slice, + SliceInMinorDims(builder, b, {i, 0}, {i + 1, n})); + auto update = builder->Div(b_slice, maybe_conj(builder, a_slice)); + TF_ASSIGN_OR_RETURN( + output, UpdateSliceInMinorDims(builder, output, update, {i, 0})); + } + + // Construct the initial loop carry tuple, + // if transpose_a: + // init = (m-2, output, a, b) + // else: + // init = (1, output, a, b) + std::vector tuple_shapes = { + // The loop iteration counter is a scalar, incremented each iteration. + xla::ShapeUtil::MakeShape(xla::S32, {}), + // The output has the shape of b, with one row updated each iteration. + *b_shape, + // The coefficient matrix a is a loop invariant. + *a_shape, + // The right-hand-side matrix b is a loop invariant. + *b_shape}; + xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes); + auto init_i = builder->ConstantR0(transpose_a ? m - 2 : 1); + auto init = builder->Tuple({init_i, output, a, b}); + + // Construct the loop condition function, + // def cond_fun(loop_carry): + // i, output, a, b = loop_carry + // return i >= 0 if transpose_a else i < m + std::unique_ptr condb = + builder->CreateSubBuilder("TriangularSolveLeftLookingWhileCond"); + { + auto i = condb->GetTupleElement( + condb->Parameter(0, tuple_shape, + "TriangularSolveLeftLookingWhileTuple"), + 0); + if (transpose_a) { + condb->Ge(i, condb->ConstantR0(0)); + } else { + condb->Lt(i, condb->ConstantR0(m)); + } + } + TF_ASSIGN_OR_RETURN(auto cond, condb->Build()); + + // Construct the loop body function, + // def body_fun(loop_carry): + // i, output, a, b = loop_carry + // if transpose_a: + // a_row = np.swapaxes(a[..., i+1:, i:i+1], -1 -2) + // else: + // a_row = a[..., i:i+1, :i] + // result_row = b[..., i:i+1, :] - np.matmul(a_row, output[..., :, :]) + // output[..., i:i+1, :] = result_row / a[..., i:i+1, i:i+1] + // if transpose_a: + // return (i - 1, output, a, b) + // else: + // return (i + 1, output, a, b) + // We have to do some extra FLOPs propagating zeros in the matrix multiply + // because we can't have the size of its arguments depend on the loop counter. + std::unique_ptr bodyb = + builder->CreateSubBuilder("TriangularSolveLeftLookingWhileBody"); + { + auto input_tuple = bodyb->Parameter(0, tuple_shape, + "TriangularSolveLeftLookingWhileTuple"); + + // i, output, a, b = loop_carry + auto i = bodyb->GetTupleElement(input_tuple, 0); + auto body_out = bodyb->GetTupleElement(input_tuple, 1); + auto body_a = bodyb->GetTupleElement(input_tuple, 2); + auto body_b = bodyb->GetTupleElement(input_tuple, 3); + auto zero = bodyb->ConstantR0(0); + + // Set up some helper functions. 
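Stripped of the fixed-shape padding trick, the loop body sketched in the comments above is ordinary forward substitution. A plain NumPy version of the non-transposed, lower-triangular case (illustrative names; this models the algorithm, not the XLA While loop):

    import numpy as np

    def solve_left_looking_ref(a, b):
        # Row i of the output depends only on rows < i, as in body_fun above.
        m = b.shape[0]
        out = np.zeros_like(b)
        out[0:1, :] = b[0:1, :] / a[0, 0]
        for i in range(1, m):
            result_row = b[i:i + 1, :] - a[i:i + 1, :i] @ out[:i, :]
            out[i:i + 1, :] = result_row / a[i, i]
        return out

    a = np.tril(np.random.rand(5, 5)) + 2.0 * np.eye(5)   # lower triangular
    b = np.random.rand(5, 2)
    assert np.allclose(a @ solve_left_looking_ref(a, b), b)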
+ auto prepend_zeros = [&](std::array starts) { + auto zero = bodyb->Reshape(bodyb->ConstantR0(0), {1}); + std::vector padded_starts(ndims, zero); + padded_starts[ndims - 2] = bodyb->Reshape(starts[0], {1}); + padded_starts[ndims - 1] = bodyb->Reshape(starts[1], {1}); + return bodyb->ConcatInDim(padded_starts, 0); + }; + + auto dynamic_slice = [&](xla::ComputationDataHandle x, + std::array starts, + std::array sizes) { + auto padded_starts = prepend_zeros(starts); + auto padded_sizes = prepend_batch_dims(sizes); + return bodyb->DynamicSlice(x, padded_starts, padded_sizes); + }; + + auto update = [&](xla::ComputationDataHandle x, + xla::ComputationDataHandle update, + std::array starts) { + auto padded_starts = prepend_zeros(starts); + return bodyb->DynamicUpdateSlice(x, update, padded_starts); + }; + + // We'd like to implement this: + // if transpose_a: + // a_row = T(a[..., i+1:, i:i+1]) + // result_row = (b[..., i:i+1, :] + // - np.matmul(a_row, body_out[..., i+1:, :])) + // else: + // result_row = (b[..., i:i+1, :] + // - np.matmul(a[..., i:i+1, :i], body_out[..., :i, :])) + // But since we can't have intermediate array sizes depend on the loop + // counter, we instead exploit the fact that we initialized the output to + // all zeros and use that as zero-padding (doing unnecessary FLOPs). + xla::ComputationDataHandle a_row; + if (transpose_a) { + a_row = dynamic_slice(body_a, {zero, i}, {m, 1}); + } else { + a_row = dynamic_slice(body_a, {i, zero}, {1, m}); + } + TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(bodyb.get(), a_row, body_out, + /*transpose_x=*/transpose_a, + /*transpose_y=*/false, + /*conjugate_x=*/conjugate_a, + /*conjugate_y=*/false)); + auto result_row = + bodyb->Sub(dynamic_slice(body_b, {i, zero}, {1, n}), b_update); + + // body_out[..., i:i+1, :] = result_row / a[..., i:i+1, i:i+1] + auto a_elt = dynamic_slice(body_a, {i, i}, {1, 1}); + auto div_result = bodyb->Div(result_row, maybe_conj(bodyb.get(), a_elt)); + body_out = update(body_out, div_result, {i, zero}); + + // if transpose_a: + // return (i - 1, body_out, a, b) + // else: + // return (i + 1, body_out, a, b) + auto next_i = bodyb->Add(i, bodyb->ConstantR0(transpose_a ? -1 : 1)); + bodyb->Tuple({next_i, body_out, body_a, body_b}); + } + TF_ASSIGN_OR_RETURN(auto body, bodyb->Build()); + + // Construct the While loop and return the result, + // return while_loop(cond_fun, body_fun, init)[1] + auto triangular_solve_left_looking_while = builder->While(cond, body, init); + return builder->GetTupleElement(triangular_solve_left_looking_while, 1); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.h b/tensorflow/compiler/tf2xla/lib/triangular_solve.h index 501d026411c..e32223bfddd 100644 --- a/tensorflow/compiler/tf2xla/lib/triangular_solve.h +++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.h @@ -21,25 +21,50 @@ limitations under the License. namespace tensorflow { -// Solves systems of linear equations with upper or lower triangular matrices by -// backsubstitution. +// Solves systems of linear equations with lower or upper triangular coefficient +// matrices by forward- or back-substitution. Broadcasting along leading +// dimensions, this routine solves one of the matrix systems +// `op(a) * x = b`, or `x * op(a) = b`, +// for the variable `x` given `a` and `b`, where `op(a)` is either +// `op(a) = a`, or `op(a) = transpose(a)`, or `op(a) = conj(transpose(a))`. 
+// That is, the innermost matrices in the output satisfy a scalar system +// depending on the value of (left_side, transpose_a, conjugate_a) +// according to: +// (F, F, F) => `output[..., i, k] a[..., k, j] = b[..., i, j]`, +// (F, F, T) => `output[..., i, k] a*[..., k, j] = b[..., i, j]`, +// (F, T, F) => `output[..., i, k] a[..., j, k] = b[..., i, j]`, +// (F, T, T) => `output[..., i, k] a*[..., j, k] = b[..., i, j]`, +// (T, F, F) => ` a[..., i, k] output[..., k, j] = b[..., i, j]`, +// (T, F, T) => `a*[..., i, k] output[..., k, j] = b[..., i, j]`, +// (T, T, F) => ` a[..., i, k] output[..., j, k] = b[..., i, j]`, +// (T, T, T) => `a*[..., i, k] output[..., j, k] = b[..., i, j]`, +// where * denotes complex conjugation and where the index `k` is summed over. // -// `a` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form -// square matrices. The strictly upper triangular part of each inner-most matrix -// is assumed to be zero and not accessed. -// `b` is a tensor of shape `[..., M, K]`. -// -// The innermost matrices in the output satisfy matrix equations -// `output[..., i, j] * adjoint(a[..., k, j]) = b[..., i, k]`. +// `a` is a tensor of shape `[..., M, M]` whose innermost 2 dimensions form +// square matrices. If lower is true (false), then the strictly upper (lower) +// triangular part of each innermost matrix in `a` is assumed to be zero and is +// not accessed. +// `b` is a tensor of shape `[..., M, K]` if left_side is true, otherwise a +// tensor of shape `[..., K, M]`. +// `left_side` is a boolean, indicating whether to solve a system of the form +// op(a) * x = b (true) or x * op(a) = b (false). +// `lower` is a boolean, indicating whether the argument `a` is lower-triangular +// (true) or upper-triangular (false). +// `transpose_a` is a boolean indicating whether the matrix `a` is transposed. +// `conjugate_a` is a boolean indicating whether the entries of `a` are complex +// conjugated (independently of whether they are transposed), so that when both +// transpose_a and conjugate_a are true the effect is a Hermitian adjoint. // // Uses a blocked algorithm if `block_size` is > 1; if block_size == 1 then no // blocking is used. -// TODO(phawkins): equivalent to the BLAS TRSM routine with side=right, -// kind=lower, and transposed_a=true. Implement the other possible combinations -// of side, kind and transposed_a. xla::StatusOr<xla::ComputationDataHandle> TriangularSolve( xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a, - xla::ComputationDataHandle b, int64 block_size = 256); + xla::ComputationDataHandle b, bool left_side, bool lower, bool transpose_a, + bool conjugate_a, int64 block_size = 256); + +xla::StatusOr<xla::ComputationDataHandle> TriangularSolveLeftLooking( + xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a, + const xla::ComputationDataHandle& b, bool transpose_a, bool conjugate_a); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc index 671d9aa4fe0..66170706291 100644 --- a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc +++ b/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc @@ -27,32 +27,68 @@ limitations under the License.
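As a concrete reference for the semantics documented above, the following standalone sketch (not part of this change; plain C++ with no XLA dependencies) performs the scalar forward substitution corresponding to left_side=true, lower=true, transpose_a=false, conjugate_a=false. It is also the recurrence that TriangularSolveLeftLooking unrolls into an XLA While loop. The inputs match the SimpleLeftLowerNotranspose test below, so the printed values should agree with that test's expected output.

#include <cstdio>
#include <vector>

// Solves a * x = b for a lower-triangular m x m matrix a and an m x n
// right-hand side b, row by row (left-looking forward substitution).
std::vector<std::vector<double>> ForwardSubstitution(
    const std::vector<std::vector<double>>& a,
    const std::vector<std::vector<double>>& b) {
  const size_t m = b.size(), n = b[0].size();
  std::vector<std::vector<double>> x(m, std::vector<double>(n, 0.0));
  for (size_t i = 0; i < m; ++i) {
    for (size_t j = 0; j < n; ++j) {
      double acc = b[i][j];
      // Subtract the contributions of the already-solved rows (k < i).
      for (size_t k = 0; k < i; ++k) acc -= a[i][k] * x[k][j];
      x[i][j] = acc / a[i][i];  // Divide by the diagonal entry.
    }
  }
  return x;
}

int main() {
  // Same values as AValsLower() and BValsLeft() in the tests below.
  const std::vector<std::vector<double>> a = {
      {2, 0, 0, 0}, {3, 6, 0, 0}, {4, 7, 9, 0}, {5, 8, 10, 11}};
  const std::vector<std::vector<double>> b = {
      {1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}};
  for (const auto& row : ForwardSubstitution(a, b)) {
    for (double v : row) std::printf("%.8f ", v);
    std::printf("\n");
  }
  // Expected (to within rounding): 0.5 1 1.5 / 0.41666667 0.33333333 0.25 /
  // 0.23148148 0.18518519 0.13888889 / 0.16835017 0.13468013 0.10101010
  return 0;
}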
#include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" +#include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/status_test_util.h" namespace tensorflow { namespace { using TriangularSolveTest = xla::ClientLibraryTestBase; +using TriangularSolveLeftLookingTest = xla::ClientLibraryTestBase; +using complex64 = xla::complex64; -XLA_TEST_F(TriangularSolveTest, Simple) { +xla::Array2D AValsLower() { + return {{2, 0, 0, 0}, {3, 6, 0, 0}, {4, 7, 9, 0}, {5, 8, 10, 11}}; +} + +xla::Array2D AValsUpper() { + return {{2, 3, 4, 5}, {0, 6, 7, 8}, {0, 0, 9, 10}, {0, 0, 0, 11}}; +} + +xla::Array2D BValsRight() { + return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}; +} + +xla::Array2D BValsLeft() { + return {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}}; +} + +xla::Array2D AValsLowerComplex() { + return {{2, 0, 0, 0}, + {complex64(3, 1), 6, 0, 0}, + {4, complex64(7, 2), 9, 0}, + {5, 8, complex64(10, 3), 11}}; +} + +xla::Array2D AValsUpperComplex() { + return {{2, 3, complex64(4, 3), 5}, + {0, 6, complex64(7, 2), 8}, + {0, 0, complex64(9, 1), 10}, + {0, 0, 0, 11}}; +} + +xla::Array2D BValsRightComplex() { + return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}; +} + +xla::Array2D BValsLeftComplex() { + return {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}}; +} + +xla::Array2D AValsFull() { + return {{2, 0, 1, 2}, {3, 6, 0, 1}, {4, 7, 9, 0}, {5, 8, 10, 11}}; +} + +XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTranspose) { xla::ComputationBuilder builder(client_, TestName()); - xla::Array2D a_vals({ - {2, 0, 0, 0}, - {3, 6, 0, 0}, - {4, 7, 9, 0}, - {5, 8, 10, 11}, - }); - xla::Array2D b_vals({ - {1, 2, 3, 4}, - {5, 6, 7, 8}, - {9, 10, 11, 12}, - }); - xla::ComputationDataHandle a, b; - auto a_data = CreateR2Parameter(a_vals, 0, "a", &builder, &a); - auto b_data = CreateR2Parameter(b_vals, 1, "b", &builder, &b); - auto result = TriangularSolve(&builder, a, b, /*block_size=*/2); + auto a_data = CreateR2Parameter(AValsLower(), 0, "a", &builder, &a); + auto b_data = CreateR2Parameter(BValsRight(), 1, "b", &builder, &b); + auto result = TriangularSolve(&builder, a, b, + /*left_side=*/false, /*lower=*/true, + /*transpose_a=*/true, /*conjugate_a=*/false, + /*block_size=*/2); TF_ASSERT_OK(result.status()); xla::Array2D expected({ @@ -62,7 +98,267 @@ XLA_TEST_F(TriangularSolveTest, Simple) { }); ComputeAndCompareR2(&builder, expected, {a_data.get(), b_data.get()}, - xla::ErrorSpec(2e-3, 2e-3)); + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveTest, SimpleRightLowerNotranspose) { + xla::ComputationBuilder builder(client_, TestName()); + + xla::ComputationDataHandle a, b; + auto a_data = CreateR2Parameter(AValsLower(), 0, "a", &builder, &a); + auto b_data = CreateR2Parameter(BValsRight(), 1, "b", &builder, &b); + auto result = TriangularSolve(&builder, a, b, + /*left_side=*/false, /*lower=*/true, + /*transpose_a=*/false, /*conjugate_a=*/false, + /*block_size=*/2); + TF_ASSERT_OK(result.status()); + + xla::Array2D expected({ + {-0.16414141, -0.06902357, -0.07070707, 0.36363636}, + {0.64393939, 0.06565657, -0.03030303, 0.72727273}, + {1.4520202, 0.2003367, 0.01010101, 1.09090909}, + }); + + ComputeAndCompareR2(&builder, expected, {a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveTest, SimpleRightUpperTranspose) { + xla::ComputationBuilder builder(client_, TestName()); + + xla::ComputationDataHandle a, b; + auto 
a_data = CreateR2Parameter(AValsUpper(), 0, "a", &builder, &a); + auto b_data = CreateR2Parameter(BValsRight(), 1, "b", &builder, &b); + auto result = TriangularSolve(&builder, a, b, + /*left_side=*/false, /*lower=*/false, + /*transpose_a=*/true, /*conjugate_a=*/false, + /*block_size=*/2); + TF_ASSERT_OK(result.status()); + + xla::Array2D expected({ + {-0.16414141, -0.06902357, -0.07070707, 0.36363636}, + {0.64393939, 0.06565657, -0.03030303, 0.72727273}, + {1.4520202, 0.2003367, 0.01010101, 1.09090909}, + }); + + ComputeAndCompareR2(&builder, expected, {a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveTest, SimpleRightUpperNotranspose) { + xla::ComputationBuilder builder(client_, TestName()); + + xla::ComputationDataHandle a, b; + auto a_data = CreateR2Parameter(AValsUpper(), 0, "a", &builder, &a); + auto b_data = CreateR2Parameter(BValsRight(), 1, "b", &builder, &b); + auto result = TriangularSolve(&builder, a, b, + /*left_side=*/false, /*lower=*/false, + /*transpose_a=*/false, /*conjugate_a=*/false, + /*block_size=*/2); + TF_ASSERT_OK(result.status()); + + xla::Array2D expected({ + {0.5, 0.08333334, 0.04629629, 0.03367003}, + {2.5, -0.25, -0.1388889, -0.1010101}, + {4.5, -0.58333331, -0.32407406, -0.23569024}, + }); + + ComputeAndCompareR2(&builder, expected, {a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerTranspose) { + xla::ComputationBuilder builder(client_, TestName()); + + xla::ComputationDataHandle a, b; + auto a_data = CreateR2Parameter(AValsLower(), 0, "a", &builder, &a); + auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); + auto result = TriangularSolve(&builder, a, b, + /*left_side=*/true, /*lower=*/true, + /*transpose_a=*/true, /*conjugate_a=*/false, + /*block_size=*/2); + TF_ASSERT_OK(result.status()); + + xla::Array2D expected({ + {-0.89646465, -0.69444444, -0.49242424}, + {-0.27441077, -0.24074074, -0.20707071}, + {-0.23232323, -0.22222222, -0.21212121}, + {0.90909091, 1., 1.09090909}, + }); + + ComputeAndCompareR2(&builder, expected, {a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotranspose) { + xla::ComputationBuilder builder(client_, TestName()); + + xla::ComputationDataHandle a, b; + auto a_data = CreateR2Parameter(AValsLower(), 0, "a", &builder, &a); + auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); + auto result = TriangularSolve(&builder, a, b, + /*left_side=*/true, /*lower=*/true, + /*transpose_a=*/false, /*conjugate_a=*/false, + /*block_size=*/2); + TF_ASSERT_OK(result.status()); + + xla::Array2D expected({ + {0.5, 1.0, 1.5}, + {0.41666667, 0.33333333, 0.25}, + {0.23148148, 0.18518519, 0.13888889}, + {0.16835017, 0.13468013, 0.1010101}, + }); + + ComputeAndCompareR2(&builder, expected, {a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTranspose) { + xla::ComputationBuilder builder(client_, TestName()); + + xla::ComputationDataHandle a, b; + auto a_data = CreateR2Parameter(AValsUpper(), 0, "a", &builder, &a); + auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); + auto result = TriangularSolve(&builder, a, b, + /*left_side=*/true, /*lower=*/false, + /*transpose_a=*/true, /*conjugate_a=*/false, + /*block_size=*/2); + TF_ASSERT_OK(result.status()); + + xla::Array2D expected({ + {0.5, 1.0, 1.5}, + {0.41666667, 0.33333333, 0.25}, + {0.23148148, 0.18518519, 0.13888889}, + 
{0.16835017, 0.13468013, 0.1010101}, + }); + + ComputeAndCompareR2(&builder, expected, {a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperNotranspose) { + xla::ComputationBuilder builder(client_, TestName()); + + xla::ComputationDataHandle a, b; + auto a_data = CreateR2Parameter(AValsUpper(), 0, "a", &builder, &a); + auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); + auto result = TriangularSolve(&builder, a, b, + /*left_side=*/true, /*lower=*/false, + /*transpose_a=*/false, /*conjugate_a=*/false, + /*block_size=*/2); + TF_ASSERT_OK(result.status()); + + xla::Array2D expected({ + {-0.89646465, -0.69444444, -0.49242424}, + {-0.27441077, -0.24074074, -0.20707071}, + {-0.23232323, -0.22222222, -0.21212121}, + {0.90909091, 1., 1.09090909}, + }); + + ComputeAndCompareR2(&builder, expected, {a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTransposeConjugate) { + xla::ComputationBuilder builder(client_, TestName()); + + xla::ComputationDataHandle a, b; + auto a_data = + CreateR2Parameter(AValsLowerComplex(), 0, "a", &builder, &a); + auto b_data = + CreateR2Parameter(BValsRightComplex(), 1, "b", &builder, &b); + auto result = TriangularSolve(&builder, a, b, + /*left_side=*/false, /*lower=*/true, + /*transpose_a=*/true, /*conjugate_a=*/true, + /*block_size=*/2); + TF_ASSERT_OK(result.status()); + + xla::Array2D expected({ + {0.5, complex64(0.08333333, 0.08333333), + complex64(0.02777778, -0.0462963), complex64(0.06313131, -0.01094276)}, + {2.5, complex64(-0.25, 0.41666667), complex64(-0.23148148, -0.37962963), + complex64(0.08670034, -0.02104377)}, + {4.5, complex64(-0.58333333, 0.75), complex64(-0.49074074, -0.71296296), + complex64(0.11026936, -0.03114478)}, + }); + + ComputeAndCompareR2(&builder, expected, + {a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTransposeNoconjugate) { + xla::ComputationBuilder builder(client_, TestName()); + + xla::ComputationDataHandle a, b; + auto a_data = + CreateR2Parameter(AValsUpperComplex(), 0, "a", &builder, &a); + auto b_data = + CreateR2Parameter(BValsLeftComplex(), 1, "b", &builder, &b); + auto result = TriangularSolve(&builder, a, b, + /*left_side=*/true, /*lower=*/false, + /*transpose_a=*/true, /*conjugate_a=*/false, + /*block_size=*/2); + TF_ASSERT_OK(result.status()); + + xla::Array2D expected({ + {0.5, 1., 1.5}, + {0.41666667, 0.33333333, 0.25}, + {complex64(0.20020325, -2.81504065e-01), + complex64(0.13821138, -4.22764228e-01), + complex64(0.07621951, -5.64024390e-01)}, + {complex64(0.19678492, 2.55912786e-01), + complex64(0.17738359, 3.84331116e-01), + complex64(0.15798226, 5.12749446e-01)}, + }); + + ComputeAndCompareR2(&builder, expected, + {a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveLeftLookingTest, Simple) { + xla::ComputationBuilder builder(client_, TestName()); + + xla::ComputationDataHandle a, b; + auto a_data = CreateR2Parameter(AValsLower(), 0, "a", &builder, &a); + auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); + auto result = TriangularSolveLeftLooking(&builder, a, b, + /*transpose_a=*/false, + /*conjugate_a=*/false); + TF_ASSERT_OK(result.status()); + + xla::Array2D expected({ + {0.5, 1.0, 1.5}, + {0.41666667, 0.33333333, 0.25}, + {0.23148148, 0.18518519, 0.13888889}, + {0.16835017, 0.13468013, 0.1010101}, + }); + + ComputeAndCompareR2(&builder, expected, 
{a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveLeftLookingTest, NonzeroUpperTriangle) { + xla::ComputationBuilder builder(client_, TestName()); + + xla::ComputationDataHandle a, b; + auto a_data = CreateR2Parameter(AValsFull(), 0, "a", &builder, &a); + auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); + auto result = TriangularSolveLeftLooking(&builder, a, b, + /*transpose_a=*/false, + /*conjugate_a=*/false); + TF_ASSERT_OK(result.status()); + + xla::Array2D expected({ + {0.5, 1.0, 1.5}, + {0.41666667, 0.33333333, 0.25}, + {0.23148148, 0.18518519, 0.13888889}, + {0.16835017, 0.13468013, 0.1010101}, + }); + + ComputeAndCompareR2(&builder, expected, {a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); } } // namespace diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc index ce24b61b5dc..9b7492f8cf6 100644 --- a/tensorflow/compiler/tf2xla/lib/util.cc +++ b/tensorflow/compiler/tf2xla/lib/util.cc @@ -107,4 +107,15 @@ xla::StatusOr UpdateSliceInMinorDims( return UpdateSlice(builder, x, update, padded_start); } +xla::StatusOr TransposeInMinorDims( + xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x) { + TF_ASSIGN_OR_RETURN(std::unique_ptr shape, builder->GetShape(x)); + const int64 n_dims = xla::ShapeUtil::Rank(*shape); + TF_RET_CHECK(n_dims >= 2); + std::vector permutation(n_dims); + std::iota(permutation.begin(), permutation.end(), 0); + std::swap(permutation[n_dims - 1], permutation[n_dims - 2]); + return builder->Transpose(x, permutation); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/util.h b/tensorflow/compiler/tf2xla/lib/util.h index fb138b4f736..7f93102ee78 100644 --- a/tensorflow/compiler/tf2xla/lib/util.h +++ b/tensorflow/compiler/tf2xla/lib/util.h @@ -49,6 +49,10 @@ xla::StatusOr UpdateSliceInMinorDims( xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x, const xla::ComputationDataHandle& update, gtl::ArraySlice start); +// Transposes a stack of matrices `x` by swapping the last two dimensions. 
+xla::StatusOr TransposeInMinorDims( + xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_ diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc index 906f2290433..6051d7dffd7 100644 --- a/tensorflow/compiler/tf2xla/tf2xla.cc +++ b/tensorflow/compiler/tf2xla/tf2xla.cc @@ -241,9 +241,7 @@ Status CreateXlaArgs(const Graph& graph, XlaCompiler::Argument arg; arg.kind = XlaCompiler::Argument::kParameter; TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "T", &arg.type)); - TensorShape shape; - TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kShapeAttr, &shape)); - TF_RETURN_IF_ERROR(TensorShapeToXLAShape(arg.type, shape, &arg.shape)); + TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kShapeAttr, &arg.shape)); TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kDebugNameAttr, &arg.name)); xla_args->push_back(arg); } diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 69b265436bb..c5b4ec5b15f 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -66,13 +66,14 @@ Status CheckSignature(const DataTypeVector& types, bool XlaCompiler::Argument::operator==( const XlaCompiler::Argument& other) const { - if (std::tie(kind, resource_kind, type, name, tensor_array_size, + if (std::tie(kind, resource_kind, type, name, initialized, tensor_array_size, tensor_array_gradients) != std::tie(other.kind, other.resource_kind, other.type, other.name, - other.tensor_array_size, other.tensor_array_gradients)) { + other.initialized, other.tensor_array_size, + other.tensor_array_gradients)) { return false; } - if (!xla::ShapeUtil::Equal(shape, other.shape)) { + if (shape != other.shape) { return false; } if (constant_value.shape() != other.constant_value.shape()) { @@ -230,6 +231,64 @@ Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options, return Status::OK(); } +// Computes the XLA shape for argument 'arg'. 
+/*static*/ Status XlaCompiler::XLAShapeForArgument( + const XlaCompiler::Argument& arg, xla::Shape* xla_shape) { + switch (arg.kind) { + case XlaCompiler::Argument::kConstant: + return TensorShapeToXLAShape(arg.type, arg.constant_value.shape(), + xla_shape); + case XlaCompiler::Argument::kParameter: + return TensorShapeToXLAShape(arg.type, arg.shape, xla_shape); + case XlaCompiler::Argument::kResource: { + TF_RET_CHECK(arg.initialized); + + switch (arg.resource_kind) { + case XlaResource::kVariable: + return TensorShapeToXLAShape(arg.type, arg.shape, xla_shape); + case XlaResource::kTensorArray: { + if (arg.tensor_array_size < 0) { + return errors::InvalidArgument( + "Negative tensor_array_size in XLAShapeForArgument"); + } + TensorShape shape; + shape.AddDim(arg.tensor_array_size); + shape.AppendShape(arg.shape); + TF_RETURN_IF_ERROR(TensorShapeToXLAShape(arg.type, shape, xla_shape)); + + if (!arg.tensor_array_gradients.empty()) { + std::vector tuple_shape( + arg.tensor_array_gradients.size() + 1, *xla_shape); + *xla_shape = xla::ShapeUtil::MakeTupleShape(tuple_shape); + } + return Status::OK(); + } + case XlaResource::kStack: { + if (arg.tensor_array_size < 0) { + return errors::InvalidArgument( + "Negative tensor_array_size in XLAShapeForArgument"); + } + TensorShape shape; + shape.AddDim(arg.tensor_array_size); + shape.AppendShape(arg.shape); + xla::Shape buffer_shape; + TF_RETURN_IF_ERROR( + TensorShapeToXLAShape(arg.type, shape, &buffer_shape)); + *xla_shape = xla::ShapeUtil::MakeTupleShape( + {buffer_shape, xla::ShapeUtil::MakeShape(xla::S32, {})}); + return Status::OK(); + } + + case XlaResource::kInvalid: + return errors::Internal( + "Invalid resource type in XLAShapeForArgument()"); + } + } + case XlaCompiler::Argument::kInvalid: + return errors::Internal("Invalid argument type in XLAShapeForArgument()"); + } +} + namespace { Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr graph, @@ -275,8 +334,9 @@ Status BuildArguments(const Graph& graph, // Argument numbers of arguments and resources that are to be passed to the // XLA computation as runtime parameters. - std::vector parameters, resources; - parameters.reserve(args.size()); + input_mapping->clear(); + input_mapping->reserve(args.size()); + std::vector resources; resources.reserve(args.size()); // Fills in constant arguments, and computes non-constant argument order. @@ -290,18 +350,20 @@ Status BuildArguments(const Graph& graph, // TODO(phawkins): this code assumes that resource arguments do not // alias. XlaResource* resource; - TF_RETURN_IF_ERROR( - context->CreateResource(arg.resource_kind, i, arg.name, arg.type, - xla::ComputationDataHandle(), &resource)); - resource->set_tensor_array_size(arg.tensor_array_size); + TF_RETURN_IF_ERROR(context->CreateResource( + arg.resource_kind, i, arg.name, arg.type, arg.shape, + xla::ComputationDataHandle(), + /*tensor_array_size=*/arg.tensor_array_size, + /*tensor_array_gradients=*/arg.tensor_array_gradients, &resource)); arg_expression.set_resource(resource); if (arg.initialized) { resources.push_back(i); } break; - case XlaCompiler::Argument::kParameter: - parameters.push_back(i); + case XlaCompiler::Argument::kParameter: { + input_mapping->push_back(i); break; + } case XlaCompiler::Argument::kConstant: arg_expression.set_constant_value(arg.constant_value); break; @@ -312,19 +374,17 @@ Status BuildArguments(const Graph& graph, // Append parameters containing variable values after the other runtime // parameters. 
- parameters.insert(parameters.end(), resources.begin(), resources.end()); - if (parameters.empty()) { + input_mapping->insert(input_mapping->end(), resources.begin(), + resources.end()); + if (input_mapping->empty()) { return Status::OK(); } - std::vector arg_shapes; - arg_shapes.reserve(parameters.size()); - input_mapping->resize(parameters.size()); - for (std::vector::size_type i = 0; i < parameters.size(); ++i) { - const XlaCompiler::Argument& arg = args[parameters[i]]; + std::vector arg_shapes(input_mapping->size()); + for (std::vector::size_type i = 0; i < input_mapping->size(); ++i) { // Computes the shapes of non-constant arguments. - arg_shapes.push_back(arg.shape); - (*input_mapping)[i] = parameters[i]; + TF_RETURN_IF_ERROR(XlaCompiler::XLAShapeForArgument( + args[(*input_mapping)[i]], &arg_shapes[i])); } if (use_tuple_arg) { @@ -354,13 +414,13 @@ Status BuildArguments(const Graph& graph, } // Build parameter handles for non-constant arguments. - std::vector arg_handles(parameters.size()); + std::vector arg_handles(input_mapping->size()); if (use_tuple_arg) { xla::ComputationDataHandle tuple; if (is_entry_computation) { xla::OpSharding tuple_sharding; tuple_sharding.set_type(xla::OpSharding::Type::OpSharding_Type_TUPLE); - for (int64 parameter : parameters) { + for (int64 parameter : *input_mapping) { const int core = (*arg_cores)[parameter]; const int root_device = 0; *tuple_sharding.add_tuple_shardings() = @@ -373,16 +433,16 @@ Status BuildArguments(const Graph& graph, } else { tuple = builder->Parameter(0, (*input_shapes)[0], "arg_tuple"); } - for (std::vector::size_type i = 0; i < parameters.size(); ++i) { - const int core = (*arg_cores)[parameters[i]]; + for (std::vector::size_type i = 0; i < input_mapping->size(); ++i) { + const int core = (*arg_cores)[input_mapping->at(i)]; xla::ScopedShardingAssignment assign_sharding( builder, core == -1 ? tensorflow::gtl::optional() : xla::sharding_builder::AssignDevice(core)); arg_handles[i] = builder->GetTupleElement(tuple, i); } } else { - for (std::vector::size_type i = 0; i < parameters.size(); ++i) { - const int core = (*arg_cores)[parameters[i]]; + for (std::vector::size_type i = 0; i < input_mapping->size(); ++i) { + const int core = (*arg_cores)[input_mapping->at(i)]; xla::ScopedShardingAssignment assign_sharding( builder, core == -1 ? tensorflow::gtl::optional() : xla::sharding_builder::AssignDevice(core)); @@ -393,19 +453,18 @@ Status BuildArguments(const Graph& graph, // Fill in the handles in non-constant arguments. 
VLOG(2) << "XLA computation inputs:"; - for (std::vector::size_type i = 0; i < parameters.size(); ++i) { - const XlaCompiler::Argument& arg = args[parameters[i]]; + for (std::vector::size_type i = 0; i < input_mapping->size(); ++i) { + const XlaCompiler::Argument& arg = args[input_mapping->at(i)]; VLOG(2) << " XLA arg " << i << " shape: " << xla::ShapeUtil::HumanString(arg_shapes[i]) - << " name: " << arg.name << " TF arg " << parameters[i]; - XlaExpression& arg_expression = (*arg_expressions)[parameters[i]]; + << " name: " << arg.name << " TF arg " << input_mapping->at(i); + XlaExpression& arg_expression = (*arg_expressions)[input_mapping->at(i)]; switch (arg.kind) { case XlaCompiler::Argument::kResource: { TF_RET_CHECK(arg.initialized); XlaResource* resource = arg_expression.resource(); - TF_RETURN_IF_ERROR( - resource->SetFromPack(arg.tensor_array_gradients, arg_handles[i], - /*reset_initial_values=*/true, builder)); + TF_RETURN_IF_ERROR(resource->SetFromPack(arg.tensor_array_gradients, + arg_handles[i], builder)); VLOG(2) << " resource: num_gradients: " << arg.tensor_array_gradients.size(); break; @@ -486,6 +545,7 @@ Status BuildComputation( XlaCompiler::ResourceUpdate& update = resource_updates->back(); update.input_index = resource->arg_num(); update.type = resource->type(); + update.shape = resource->shape(); update.modified = modified; for (const auto& grad : resource->tensor_array_gradients()) { update.tensor_array_gradients_accessed.insert(grad.first); @@ -616,13 +676,6 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, ++computation_output; } } - - for (std::vector::size_type i = 0; - i < result->resource_updates.size(); ++i) { - result->resource_updates[i].shape = xla::ShapeUtil::GetTupleElementShape( - result->xla_output_shape, computation_output); - ++computation_output; - } return Status::OK(); } diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index 6a46e54f61c..b86c82c0ab5 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -104,9 +104,17 @@ class XlaCompiler { // is the type of the variable's value, not DT_RESOURCE. DataType type; - // The shape of the argument. If the argument is a resource, this is the - // shape of the resource's value. - xla::Shape shape; + // The shape of the argument. For: + // * a parameter: the shape of the parameter. + // * a constant: ignored; the shape given by constant_value is used + // instead. + // * an uninitialized resource: ignored. We don't yet know the shape of an + // uninitialized resource (otherwise we would have initialized it!) + // * an initialized variable: the shape of the variable's value. + // * an initialized TensorArray or Stack resource: the shape of an entry in + // the TensorArray/Stack. Note this is the size of a single entry, not the + // XLA data structure that represents the complete stack/array. + TensorShape shape; // The value of the argument, if it is a compile-time constant. Must be a // host-memory tensor. @@ -175,8 +183,9 @@ class XlaCompiler { int input_index; // Type and shape of the tensor to be written back. + // The `shape` field has the same meaning as the Argument::shape field. DataType type; - xla::Shape shape; + TensorShape shape; // Was the value of the variable modified by the computation? // (Always true, unless `return_updated_values_for_all_resources` is true.) 
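To illustrate the shape convention documented above (hypothetical sizes; plain C++ rather than the TensorShape/ShapeUtil calls used by XLAShapeForArgument earlier in this change): for an initialized TensorArray argument, `shape` is the per-entry shape, the buffer dimensions are `[tensor_array_size]` followed by that entry shape, and carrying gradients turns the parameter into a (1 + num_gradients)-element tuple of such buffers.

#include <cstdio>
#include <vector>

int main() {
  // Hypothetical argument: a TensorArray of 2 entries, each of shape [3, 4],
  // with one gradient ("grad1") attached.
  const long long tensor_array_size = 2;
  const std::vector<long long> entry_shape = {3, 4};  // Argument::shape
  const int num_gradients = 1;  // Argument::tensor_array_gradients.size()

  // Buffer shape: [tensor_array_size] followed by the entry shape.
  std::vector<long long> buffer_dims = {tensor_array_size};
  buffer_dims.insert(buffer_dims.end(), entry_shape.begin(), entry_shape.end());

  std::printf("buffer dims:");
  for (long long d : buffer_dims) std::printf(" %lld", d);  // 2 3 4
  // With gradients present, the XLA parameter is a tuple of identical buffers.
  std::printf("\ntuple arity: %d\n", 1 + num_gradients);  // 2
  return 0;
}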
@@ -235,6 +244,19 @@ class XlaCompiler { // device is created, and can be used to create metadata objects // that can be accessed by XLA op kernels. std::function* populate_resource_manager = nullptr; + + // If not nullptr, this memory allocator can be used by the compiler for + // temporary allocations it might want to make during compilation. + // + // For example, the compiler may want to try out different algorithms and + // choose the fastest one, and it might run those algorithms over buffers + // created using this allocator. + // + // The compiler can function correctly without an explicit allocator given + // here, but on some devices (notably, GPUs), TensorFlow tends to eagerly + // allocate most or all available memory on the device, leaving none for the + // compiler to access, unless it can use TensorFlow's allocator. + xla::DeviceMemoryAllocator* device_allocator = nullptr; }; explicit XlaCompiler(Options options); @@ -253,11 +275,10 @@ class XlaCompiler { const std::vector& args, CompilationResult* result); - Status PrepareArguments(xla::ComputationBuilder* builder, NameAttrList func, - const std::vector& types, - const std::vector& shapes, - const std::vector& expressions, - std::vector* args); + // Returns the shape of the XLA parameter for an argument 'arg'. + // See the class comment for more details about the argument passing + // convention. + static Status XLAShapeForArgument(const Argument& arg, xla::Shape* xla_shape); // Retrieves the channel handle associated with `key`. Allocates // a new channel handle if none exists. diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc index 7ebe4b75bc1..65de4dbad75 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc @@ -191,10 +191,10 @@ TEST_F(XlaCompilerTest, Simple) { std::vector args(2); args[0].kind = XlaCompiler::Argument::kParameter; args[0].type = DT_INT32; - args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2}); + args[0].shape = TensorShape({2}); args[1].kind = XlaCompiler::Argument::kParameter; args[1].type = DT_INT32; - args[1].shape = xla::ShapeUtil::MakeShape(xla::S32, {2}); + args[1].shape = TensorShape({2}); // Compiles the graph. XlaCompiler compiler(DefaultOptions()); @@ -242,10 +242,10 @@ TEST_F(XlaCompilerTest, HasSaneErrorOnNonCompileTimeConstantInputToReshape) { std::vector args(2); args[0].kind = XlaCompiler::Argument::kParameter; args[0].type = DT_INT32; - args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2}); + args[0].shape = TensorShape({2}); args[1].kind = XlaCompiler::Argument::kParameter; args[1].type = DT_INT32; - args[1].shape = xla::ShapeUtil::MakeShape(xla::S32, {2}); + args[1].shape = TensorShape({2}); // Compiles the graph. 
XlaCompiler compiler(DefaultOptions()); @@ -281,7 +281,7 @@ TEST_F(XlaCompilerTest, ConstantOutputs) { std::vector args(1); args[0].kind = XlaCompiler::Argument::kParameter; args[0].type = DT_INT32; - args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2}); + args[0].shape = TensorShape({2}); XlaCompiler::Options options = DefaultOptions(); XlaCompiler compiler(options); @@ -373,7 +373,7 @@ TEST_F(XlaCompilerTest, ResourceManager) { std::vector args(1); args[0].kind = XlaCompiler::Argument::kParameter; args[0].type = DT_INT32; - args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2}); + args[0].shape = TensorShape({2}); DummyResourceForTest* resource = new DummyResourceForTest(); @@ -420,7 +420,7 @@ TEST_F(XlaCompilerTest, DeterministicCompilation) { std::vector args(1); args[0].kind = XlaCompiler::Argument::kParameter; args[0].type = DT_INT32; - args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2}); + args[0].shape = TensorShape({2}); // Compiles the graph. auto options = DefaultOptions(); @@ -472,9 +472,7 @@ TEST_F(XlaCompilerTest, CanPassTensorArraysToAndFromComputation) { args[0].resource_kind = XlaResource::kTensorArray; args[0].initialized = true; args[0].type = DT_INT32; - args[0].shape = xla::ShapeUtil::MakeTupleShape( - {xla::ShapeUtil::MakeShape(xla::S32, {2}), - xla::ShapeUtil::MakeShape(xla::S32, {2})}); + args[0].shape = TensorShape({}); args[0].tensor_array_size = 2; args[0].tensor_array_gradients = {"grad2"}; @@ -540,9 +538,7 @@ TEST_F(XlaCompilerTest, UnwrittenTensorArrayGradientsAreNotComputationOutputs) { args[0].resource_kind = XlaResource::kTensorArray; args[0].initialized = true; args[0].type = DT_INT32; - args[0].shape = xla::ShapeUtil::MakeTupleShape( - {xla::ShapeUtil::MakeShape(xla::S32, {2}), - xla::ShapeUtil::MakeShape(xla::S32, {2})}); + args[0].shape = TensorShape({}); args[0].tensor_array_size = 2; args[0].tensor_array_gradients = {"grad1"}; @@ -574,9 +570,7 @@ TEST_F(XlaCompilerTest, NewTensorArrayGradientsAreComputationOutputs) { args[0].resource_kind = XlaResource::kTensorArray; args[0].initialized = true; args[0].type = DT_INT32; - args[0].shape = xla::ShapeUtil::MakeTupleShape( - {xla::ShapeUtil::MakeShape(xla::S32, {2}), - xla::ShapeUtil::MakeShape(xla::S32, {2})}); + args[0].shape = TensorShape({}); args[0].tensor_array_size = 2; args[0].tensor_array_gradients = {"grad1"}; diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc index e8d17e2e0a1..73878955e3f 100644 --- a/tensorflow/compiler/tf2xla/xla_context.cc +++ b/tensorflow/compiler/tf2xla/xla_context.cc @@ -103,12 +103,14 @@ Status XlaContext::AddConstRetval(int retval_index, DataType dtype, xla::ComputationBuilder* XlaContext::builder() { return builder_; } -Status XlaContext::CreateResource(XlaResource::Kind kind, int arg_num, - string name, DataType type, - const xla::ComputationDataHandle& handle, - XlaResource** resource) { +Status XlaContext::CreateResource( + XlaResource::Kind kind, int arg_num, string name, DataType type, + TensorShape shape, const xla::ComputationDataHandle& handle, + int64 tensor_array_size, const std::set& tensor_array_gradients, + XlaResource** resource) { resources_.emplace_back( - new XlaResource(kind, arg_num, std::move(name), type, handle)); + new XlaResource(kind, arg_num, std::move(name), type, std::move(shape), + handle, tensor_array_size, tensor_array_gradients)); *resource = resources_.back().get(); return Status::OK(); } diff --git a/tensorflow/compiler/tf2xla/xla_context.h 
b/tensorflow/compiler/tf2xla/xla_context.h index 1a7dafe8cdb..fac0352ae81 100644 --- a/tensorflow/compiler/tf2xla/xla_context.h +++ b/tensorflow/compiler/tf2xla/xla_context.h @@ -71,11 +71,15 @@ class XlaContext : public ResourceBase { Status AddConstRetval(int retval_index, DataType dtype, const xla::Literal& literal); - // Creates a resource with resource `kind` and initial type `type` and - // value `handle`. `name` is a descriptive name for use in error messages. + // Creates a resource with resource `kind` and initial value `handle`. `name` + // is a descriptive name for use in error messages. See the `XlaResource` + // constructor for a description of the remaining arguments. // Fails if the resource already exists. Status CreateResource(XlaResource::Kind kind, int arg_num, string name, - DataType type, const xla::ComputationDataHandle& handle, + DataType type, TensorShape shape, + const xla::ComputationDataHandle& handle, + int64 tensor_array_size, + const std::set& tensor_array_gradients, XlaResource** resource); const std::vector>& resources() { diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc index ee0aed672e1..ee29158646f 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc @@ -286,7 +286,8 @@ Status XlaOpKernelContext::ConstantInputList( } Status XlaOpKernelContext::ReadVariableInput( - int index, xla::ComputationDataHandle* value) { + int index, DataType type, TensorShape* shape, + xla::ComputationDataHandle* value) { const Tensor& tensor = context_->input(index); const XlaExpression* expression = CastExpressionFromTensor(tensor); XlaResource* variable = expression->resource(); @@ -296,7 +297,15 @@ Status XlaOpKernelContext::ReadVariableInput( return errors::InvalidArgument("Read of uninitialized variable ", variable->name()); } + if (variable->type() != type) { + return errors::InvalidArgument( + "Type mismatch for read of variable ", variable->name(), ". 
Expected ", + DataTypeString(type), "; got ", DataTypeString(variable->type())); + } *value = variable->value(); + if (shape) { + *shape = variable->shape(); + } return Status::OK(); } @@ -312,12 +321,7 @@ Status XlaOpKernelContext::GetVariableTypeAndShape(int index, DataType* type, variable->name()); } *type = variable->type(); - auto shape_or_status = builder()->GetShape(variable->value()); - if (!shape_or_status.ok()) { - return shape_or_status.status(); - } - TF_RETURN_IF_ERROR( - XLAShapeToTensorShape(*shape_or_status.ValueOrDie(), shape)); + *shape = variable->shape(); return Status::OK(); } @@ -405,7 +409,17 @@ Status XlaOpKernelContext::AssignVariable( XlaResource* variable = expression->resource(); TF_RET_CHECK(variable != nullptr); TF_RET_CHECK(variable->kind() == XlaResource::kVariable); - return variable->SetValue(type, handle); + + auto shape_or_status = builder()->GetShape(handle); + if (!shape_or_status.ok()) { + return shape_or_status.status(); + } + TensorShape shape; + TF_RETURN_IF_ERROR( + XLAShapeToTensorShape(*shape_or_status.ValueOrDie(), &shape)); + + TF_RETURN_IF_ERROR(variable->SetTypeAndShape(type, shape)); + return variable->SetValue(handle); } XlaCompiler* XlaOpKernelContext::compiler() const { diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h index 6d3b6db2289..e1fd0f55c6d 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.h +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h @@ -164,11 +164,16 @@ class XlaOpKernelContext { TensorShape* shape) const; // Reads the current value of the resouce variable referred to by input - // 'index'. - Status ReadVariableInput(int index, xla::ComputationDataHandle* value); + // 'index'. If `shape` is not nullptr, sets `*shape` to the shape of the + // variable. Returns an error if the variable has not been initialized, or if + // its type does not match `type`. + Status ReadVariableInput(int index, DataType type, TensorShape* shape, + xla::ComputationDataHandle* value); // Assigns the value `handle` to the variable referenced by input - // `input_index`. Marks the operator as having side effects. + // `input_index`. The variable must be of `type`. Returns an error if the + // variable has been initialized with a different type or with a + // different shape. Status AssignVariable(int input_index, DataType type, const xla::ComputationDataHandle& handle); diff --git a/tensorflow/compiler/tf2xla/xla_resource.cc b/tensorflow/compiler/tf2xla/xla_resource.cc index 9abac8bdaa7..c2075b44b82 100644 --- a/tensorflow/compiler/tf2xla/xla_resource.cc +++ b/tensorflow/compiler/tf2xla/xla_resource.cc @@ -25,51 +25,99 @@ limitations under the License. 
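The updated variable accessors above change how kernels read and assign resource variables: ReadVariableInput now takes the expected element type (and an optional shape out-parameter), and AssignVariable derives the shape from the assigned handle before calling SetTypeAndShape. A hypothetical kernel body using them might look like the sketch below (illustration only; the surrounding op registration and includes are omitted, and these names are not part of this change).

// Inside some XlaOpKernel::Compile(XlaOpKernelContext* ctx):
// read the variable at input 0, add input 1 to it, and write it back.
void CompileSketch(XlaOpKernelContext* ctx) {
  TensorShape var_shape;
  xla::ComputationDataHandle value;
  OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(/*index=*/0, DT_FLOAT,
                                             &var_shape, &value));
  xla::ComputationDataHandle updated =
      ctx->builder()->Add(value, ctx->Input(1));
  OP_REQUIRES_OK(ctx,
                 ctx->AssignVariable(/*input_index=*/0, DT_FLOAT, updated));
}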
namespace tensorflow { -XlaResource::XlaResource(Kind kind, int arg_num, string name, - DataType initial_type, - const xla::ComputationDataHandle& initial_value) +XlaResource::XlaResource(Kind kind, int arg_num, string name, DataType type, + TensorShape shape, + const xla::ComputationDataHandle& initial_value, + int64 tensor_array_size, + const std::set& tensor_array_gradients) : kind_(kind), arg_num_(arg_num), name_(std::move(name)), - type_(initial_type), + type_(type), + shape_(std::move(shape)), value_(initial_value), - initial_value_(initial_value) { + initial_value_(initial_value), + tensor_array_size_(tensor_array_size) { CHECK(kind_ != kInvalid); + + for (const string& gradient : tensor_array_gradients) { + tensor_array_gradients_[gradient].reset( + new XlaResource(/*kind=*/kTensorArray, /*arg_num=*/-1, + /*name=*/strings::StrCat("TensorArrayGrad: ", name_), + type_, shape_, xla::ComputationDataHandle(), + tensor_array_size_, /*tensor_array_gradients=*/{})); + } } -Status XlaResource::SetValue(DataType type, - const xla::ComputationDataHandle& value) { - if (type_ == DT_INVALID && type == DT_INVALID) { - return errors::InvalidArgument("Attempted to initialized resource ", name_, - " to an invalid type"); +Status XlaResource::SetTypeAndShape(DataType type, const TensorShape& shape) { + if (type == DT_INVALID) { + return errors::InvalidArgument("Attempted to set type of resource '", name_, + "'' to an invalid type"); } - if (type_ != DT_INVALID && type_ != type) { + if (initialized() && type_ != type) { return errors::InvalidArgument("Type of resource ", name_, " cannot be changed after initialization: " "old type was ", DataTypeString(type_), ", new type is ", DataTypeString(type)); } + if (initialized() && shape_ != shape) { + return errors::InvalidArgument("Shape of resource ", name_, + " cannot be changed after initialization: " + "old shape was ", + shape_.DebugString(), ", new shape is ", + shape.DebugString()); + } type_ = type; + shape_ = shape; + return Status::OK(); +} + +Status XlaResource::SetValue(const xla::ComputationDataHandle& value) { + if (type_ == DT_INVALID) { + return errors::InvalidArgument( + "Resource '", name_, + "' must be initialized with a valid type before use."); + } value_ = value; return Status::OK(); } -Status XlaResource::GetXlaShape(xla::ComputationBuilder* builder, - xla::Shape* shape) const { - auto shape_or_status = builder->GetShape(value_); - if (!shape_or_status.ok()) { - return shape_or_status.status(); +Status XlaResource::SetZeroValue(xla::ComputationBuilder* builder) { + if (type_ == DT_INVALID) { + return errors::InvalidArgument( + "Resource '", name_, + "' must be initialized with a valid type before use."); } - *shape = *shape_or_status.ValueOrDie(); - return Status::OK(); -} + switch (kind_) { + case kVariable: { + value_ = builder->Broadcast(XlaHelpers::Zero(builder, type_), + shape_.dim_sizes()); + break; + } + case kTensorArray: { + TensorShape ta_shape; + ta_shape.AddDim(tensor_array_size_); + ta_shape.AppendShape(shape_); + value_ = builder->Broadcast(XlaHelpers::Zero(builder, type_), + ta_shape.dim_sizes()); + break; + } + case kStack: { + TensorShape ta_shape; + ta_shape.AddDim(tensor_array_size_); + ta_shape.AppendShape(shape_); + value_ = + builder->Tuple({builder->Broadcast(XlaHelpers::Zero(builder, type_), + ta_shape.dim_sizes()), + builder->ConstantR0(0)}); + break; + } -Status XlaResource::GetShape(xla::ComputationBuilder* builder, - TensorShape* shape) const { - xla::Shape xla_shape; - 
TF_RETURN_IF_ERROR(GetXlaShape(builder, &xla_shape)); - TF_RETURN_IF_ERROR(XLAShapeToTensorShape(xla_shape, shape)); + case kInvalid: + default: + LOG(FATAL) << "Invalid resource type"; + } return Status::OK(); } @@ -82,36 +130,20 @@ Status XlaResource::GetOrCreateTensorArrayGradient( std::unique_ptr& gradient = tensor_array_gradients_[source]; if (!gradient) { TensorShape ta_shape; - TF_RETURN_IF_ERROR(GetShape(builder, &ta_shape)); + ta_shape.AddDim(tensor_array_size_); + ta_shape.AppendShape(shape_); xla::ComputationDataHandle gradient_value = builder->Broadcast( XlaHelpers::Zero(builder, type_), ta_shape.dim_sizes()); gradient.reset( new XlaResource(/*kind=*/kTensorArray, /*arg_num=*/-1, /*name=*/strings::StrCat("TensorArrayGrad: ", name_), - type_, gradient_value)); - gradient->tensor_array_size_ = tensor_array_size_; + type_, shape_, gradient_value, tensor_array_size_, + /*tensor_array_gradients=*/{})); } *gradient_out = gradient.get(); return Status::OK(); } -Status XlaResource::PackedShape(xla::ComputationBuilder* builder, - xla::Shape* packed_shape) const { - if (tensor_array_gradients_.empty()) { - return GetXlaShape(builder, packed_shape); - } - TF_RET_CHECK(kind_ == kTensorArray); - std::vector elem_shapes(1 + tensor_array_gradients_.size()); - int pos = 0; - TF_RETURN_IF_ERROR(GetXlaShape(builder, &elem_shapes[pos++])); - for (const auto& gradient : tensor_array_gradients_) { - TF_RETURN_IF_ERROR( - gradient.second->GetXlaShape(builder, &elem_shapes[pos++])); - } - *packed_shape = xla::ShapeUtil::MakeTupleShape(elem_shapes); - return Status::OK(); -} - Status XlaResource::Pack(xla::ComputationDataHandle* pack, xla::ComputationBuilder* builder) const { if (tensor_array_gradients_.empty()) { @@ -130,27 +162,32 @@ Status XlaResource::Pack(xla::ComputationDataHandle* pack, Status XlaResource::SetFromPack(const std::set& gradient_sources, const xla::ComputationDataHandle& pack, - bool reset_initial_values, xla::ComputationBuilder* builder) { if (gradient_sources.empty()) { + if (!initialized()) { + initial_value_ = pack; + } value_ = pack; } else { TF_RET_CHECK(kind_ == kTensorArray); int pos = 0; - value_ = builder->GetTupleElement(pack, pos++); + auto v = builder->GetTupleElement(pack, pos++); + if (!initialized()) { + initial_value_ = v; + } + value_ = v; + for (const auto& source : gradient_sources) { XlaResource* gradient; TF_RETURN_IF_ERROR( GetOrCreateTensorArrayGradient(source, builder, &gradient)); - gradient->value_ = builder->GetTupleElement(pack, pos++); - if (reset_initial_values) { - gradient->initial_value_ = gradient->value_; + auto v = builder->GetTupleElement(pack, pos++); + if (!gradient->initialized()) { + gradient->initial_value_ = v; } + gradient->value_ = v; } } - if (reset_initial_values) { - initial_value_ = value_; - } return Status::OK(); } diff --git a/tensorflow/compiler/tf2xla/xla_resource.h b/tensorflow/compiler/tf2xla/xla_resource.h index 6b46089e4f5..1bb2c7274ec 100644 --- a/tensorflow/compiler/tf2xla/xla_resource.h +++ b/tensorflow/compiler/tf2xla/xla_resource.h @@ -36,8 +36,11 @@ class XlaResource { kStack, }; - XlaResource(Kind kind, int arg_num, string name, DataType initial_type, - const xla::ComputationDataHandle& initial_value); + XlaResource(Kind kind, int arg_num, string name, DataType type, + TensorShape shape, + const xla::ComputationDataHandle& initial_value, + int64 tensor_array_size, + const std::set& tensor_array_gradients); XlaResource(const XlaResource&) = delete; XlaResource(XlaResource&&) = delete; @@ -60,6 +63,12 @@ class 
XlaResource { // a resource is first initialized we do not yet know its type, so we keep // track of its type dynamically. DataType type() const { return type_; } + + // Shape of the resource. For an uninitialized resource, this is ignored. + // For a Variable, this is the shape of the value. For a TensorArray or Stack + // this is the shape of each entry in the TensorArray/Stack. + const TensorShape& shape() const { return shape_; } + const xla::ComputationDataHandle& value() const { return value_; } // Value of the resource at computation entry. Used to detect which @@ -68,17 +77,19 @@ class XlaResource { return initial_value_; } + // A variable is initialized if it has a value. bool initialized() const { return value_.handle() > 0; } - // Sets the current type/value of the resource. - Status SetValue(DataType type, const xla::ComputationDataHandle& value); + // Sets the type and shape of the resource. The type and shape of a resource + // must not change once the variable has been initialized. + Status SetTypeAndShape(DataType type, const TensorShape& shape); - // Returns the shape of the resource as an xla::Shape. - Status GetXlaShape(xla::ComputationBuilder* builder, xla::Shape* shape) const; + // Sets the current value of the resource. Returns an error if the type is not + // set to a valid value. + Status SetValue(const xla::ComputationDataHandle& value); - // Returns the shape of the resource as an TensorShape. Fails if the shape is - // not representable as a TensorShape. - Status GetShape(xla::ComputationBuilder* builder, TensorShape* shape) const; + // Sets the current value of the resource to an all-zero value. + Status SetZeroValue(xla::ComputationBuilder* builder); // Looks up the gradient for `source`, or creates it if it does not already // exist. The call target must be an initialized TensorArray resource. A @@ -96,10 +107,6 @@ class XlaResource { Status Pack(xla::ComputationDataHandle* pack, xla::ComputationBuilder* builder) const; - // Returns the shape of the `pack` value computed by `Pack()`. - Status PackedShape(xla::ComputationBuilder* builder, - xla::Shape* packed_shape) const; - // Updates the resource with values from `pack`. If `gradient_sources` is // non-empty, treats `pack` as a tuple that represents a TensorArray and // its gradients, and unpacks and updates the gradient resources. @@ -108,14 +115,14 @@ class XlaResource { // Opposite of Pack(). Status SetFromPack(const std::set& gradient_sources, const xla::ComputationDataHandle& pack, - bool reset_initial_values, xla::ComputationBuilder* builder); - // TensorArray-specific fields + // TensorArray and Stack specific fields // 'tensor_array_size' stores the expected size of the TensorArray or Stack. // We need to store this since sometimes TensorArrays must be initialized // lazily since we do not know the element shape at construction time. + // Used by both TensorArrays and Stacks. 
int64 tensor_array_size() const { return tensor_array_size_; } void set_tensor_array_size(int64 size) { tensor_array_size_ = size; } @@ -136,6 +143,7 @@ class XlaResource { const string name_; DataType type_; + TensorShape shape_; xla::ComputationDataHandle value_; xla::ComputationDataHandle initial_value_; diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index c22fd37129c..34e733bc8d8 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -88,7 +88,6 @@ cc_library( visibility = [":friends"], deps = [ "//tensorflow/core:framework_lite", - "//tensorflow/core:lib", "//third_party/eigen3", ], ) diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD index 952109dde2d..02356699a25 100644 --- a/tensorflow/compiler/xla/client/BUILD +++ b/tensorflow/compiler/xla/client/BUILD @@ -80,6 +80,18 @@ cc_library( ], ) +cc_library( + name = "executable_build_options", + srcs = ["executable_build_options.cc"], + hdrs = ["executable_build_options.h"], + deps = [ + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:device_memory_allocator", + "//tensorflow/core:lib", + ], +) + cc_library( name = "local_client", srcs = ["local_client.cc"], @@ -87,6 +99,7 @@ cc_library( deps = [ ":client", ":computation", + ":executable_build_options", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc new file mode 100644 index 00000000000..804e34f5e75 --- /dev/null +++ b/tensorflow/compiler/xla/client/executable_build_options.cc @@ -0,0 +1,79 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/client/executable_build_options.h" + +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/core/lib/strings/stringprintf.h" + +namespace xla { + +ExecutableBuildOptions& ExecutableBuildOptions::set_device_allocator( + DeviceMemoryAllocator* allocator) { + device_allocator_ = allocator; + return *this; +} + +DeviceMemoryAllocator* ExecutableBuildOptions::device_allocator() const { + return device_allocator_; +} + +ExecutableBuildOptions& ExecutableBuildOptions::set_device_ordinal( + int device_ordinal) { + CHECK_GE(device_ordinal, 0); + device_ordinal_ = device_ordinal; + return *this; +} + +int ExecutableBuildOptions::device_ordinal() const { return device_ordinal_; } + +ExecutableBuildOptions& ExecutableBuildOptions::set_result_layout( + const Shape& shape_with_layout) { + result_layout_set_ = true; + result_layout_ = shape_with_layout; + return *this; +} + +const Shape* ExecutableBuildOptions::result_layout() const { + return result_layout_set_ ? 
&result_layout_ : nullptr; +} + +string ExecutableBuildOptions::ToString() const { + string result_layout = "nullopt"; + if (result_layout_set_) { + result_layout = ShapeUtil::HumanStringWithLayout(result_layout_); + } + string generate_hlo_graph = "nullopt"; + if (generate_hlo_graph_.has_value()) { + generate_hlo_graph = generate_hlo_graph_.value(); + } + return tensorflow::strings::Printf( + "ExecutableBuildOptions{device_ordinal=%d, result_layout=%s, " + "generate_hlo_graph=%s}", + device_ordinal_, result_layout.c_str(), generate_hlo_graph.c_str()); +} + +ExecutableBuildOptions& ExecutableBuildOptions::set_generate_hlo_graph( + string regex) { + generate_hlo_graph_ = std::move(regex); + return *this; +} + +const tensorflow::gtl::optional& +ExecutableBuildOptions::generate_hlo_graph() const { + return generate_hlo_graph_; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h new file mode 100644 index 00000000000..3a52dbac9ad --- /dev/null +++ b/tensorflow/compiler/xla/client/executable_build_options.h @@ -0,0 +1,74 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_EXECUTABLE_BUILD_OPTIONS_H_ +#define TENSORFLOW_COMPILER_XLA_CLIENT_EXECUTABLE_BUILD_OPTIONS_H_ + +#include "tensorflow/compiler/xla/service/device_memory_allocator.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/gtl/optional.h" + +namespace xla { + +// Class containing options for building an LocalExecutable with +// LocalClient::Compile. +class ExecutableBuildOptions { + public: + // If set, this is the device to build the computation for. Valid + // device_ordinal values are: 0 to # of devices - 1. These values are + // identical to the device ordinal values used by StreamExecutor. The built + // executable will be executable on any device equivalent to the specified + // device as determined by Backend::devices_equivalent(). A value of -1 + // indicates this option has not been set. + ExecutableBuildOptions& set_device_ordinal(int device_ordinal); + int device_ordinal() const; + + // If set, this specifies the layout of the result of the computation. If not + // set, the service will chose the layout of the result. A Shape is used to + // store the layout to accommodate tuple result shapes. A value of nullptr + // indicates the option has not been set. + ExecutableBuildOptions& set_result_layout(const Shape& shape_with_layout); + const Shape* result_layout() const; + + // If set, this specifies an allocator that can be used to allocate temporary + // space on the device during compilation. For example, the compiler might + // want to run various algorithms on the device and pick the fastest one -- it + // might allocate buffers for use by these algorithms using this allocator. 
+ // + // This does not need to be the same as the DeviceMemoryAllocator passed when + // running the executable. + ExecutableBuildOptions& set_device_allocator( + DeviceMemoryAllocator* allocator); + DeviceMemoryAllocator* device_allocator() const; + + // If set, specifies a regexp of HLO graphs to dump (as in DebugOptions). + ExecutableBuildOptions& set_generate_hlo_graph(string regex); + const tensorflow::gtl::optional& generate_hlo_graph() const; + + // Returns a string representation of the build options, suitable for + // debugging. + string ToString() const; + + private: + int device_ordinal_ = -1; + Shape result_layout_; + bool result_layout_set_ = false; + tensorflow::gtl::optional generate_hlo_graph_; + DeviceMemoryAllocator* device_allocator_ = nullptr; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_CLIENT_EXECUTABLE_BUILD_OPTIONS_H_ diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index fbeedfcecdd..ef98dbb6403 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -30,25 +30,6 @@ using xla::source_map_util::InvalidParameterArgument; namespace xla { -ExecutableBuildOptions& ExecutableBuildOptions::set_device_ordinal( - int device_ordinal) { - device_ordinal_ = device_ordinal; - return *this; -} - -int ExecutableBuildOptions::device_ordinal() const { return device_ordinal_; } - -ExecutableBuildOptions& ExecutableBuildOptions::set_result_layout( - const Shape& shape_with_layout) { - result_layout_set_ = true; - result_layout_ = shape_with_layout; - return *this; -} - -const Shape* ExecutableBuildOptions::result_layout() const { - return result_layout_set_ ? &result_layout_ : nullptr; -} - namespace { StatusOr BorrowStreamForDevice(int device_ordinal, Backend* backend) { @@ -60,16 +41,18 @@ StatusOr BorrowStreamForDevice(int device_ordinal, } // namespace LocalExecutable::LocalExecutable(std::unique_ptr executable, - Backend* backend, int device_ordinal, - const ExecutableBuildOptions& build_options) + Backend* backend, + ExecutableBuildOptions build_options) : executable_(std::move(executable)), backend_(backend), - build_device_ordinal_(device_ordinal), - build_options_(build_options) {} + build_options_(std::move(build_options)) { + CHECK_GE(build_options_.device_ordinal(), 0) + << "Must have a valid device ordinal that the executable was built for."; +} tensorflow::Status LocalExecutable::ValidateExecutionOptions( const tensorflow::gtl::ArraySlice arguments, - const ExecutableRunOptions& options, const Backend& backend) { + const ExecutableRunOptions& run_options, const Backend& backend) { const ComputationLayout& computation_layout = executable_->module_config().entry_computation_layout(); @@ -93,14 +76,14 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions( } } - if (options.stream() != nullptr) { - if (!options.stream()->ok()) { + if (run_options.stream() != nullptr) { + if (!run_options.stream()->ok()) { return InvalidArgument("stream is uninitialized or in an error state"); } // Check stream matches service platform. const se::Platform* stream_platform = - options.stream()->parent()->platform(); + run_options.stream()->parent()->platform(); if (stream_platform != backend_->platform()) { return InvalidArgument( "stream is for platform %s, but service targets platform %s", @@ -110,7 +93,7 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions( // Cannot specify device_ordinal with a stream. 
The stream determines these // values. - if (options.device_ordinal() != -1) { + if (run_options.device_ordinal() != -1) { return InvalidArgument( "cannot set both device ordinal and stream options in " "ExecutableRunOptions; the stream determines the device ordinal"); @@ -119,34 +102,34 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions( // Verify that the device the executable was built for is equivalent to the // device it will run on. - int run_device_ordinal = options.device_ordinal() == -1 + int run_device_ordinal = run_options.device_ordinal() == -1 ? backend_->default_device_ordinal() - : options.device_ordinal(); - TF_ASSIGN_OR_RETURN( - bool devices_equivalent, - backend_->devices_equivalent(run_device_ordinal, build_device_ordinal_)); + : run_options.device_ordinal(); + TF_ASSIGN_OR_RETURN(bool devices_equivalent, + backend_->devices_equivalent( + run_device_ordinal, build_options_.device_ordinal())); if (!devices_equivalent) { TF_ASSIGN_OR_RETURN(se::StreamExecutor * run_executor, backend_->stream_executor(run_device_ordinal)); TF_ASSIGN_OR_RETURN(se::StreamExecutor * build_executor, - backend_->stream_executor(build_device_ordinal_)); + backend_->stream_executor(build_device_ordinal())); return InvalidArgument( "executable is built for device %s of type \"%s\"; cannot run it on " "device %s of type \"%s\"", - backend_->device_name(build_device_ordinal_).c_str(), + backend_->device_name(build_device_ordinal()).c_str(), build_executor->GetDeviceDescription().name().c_str(), backend_->device_name(run_device_ordinal).c_str(), run_executor->GetDeviceDescription().name().c_str()); } - if (!options.allocator()) { + if (!run_options.allocator()) { return InvalidArgument("an allocator must be provided to ExecuteLocally"); } - if (options.allocator()->platform() != backend.platform()) { + if (run_options.allocator()->platform() != backend.platform()) { return InvalidArgument( "allocator platform (%s) does not match service platform (%s)", - options.allocator()->platform()->Name().c_str(), + run_options.allocator()->platform()->Name().c_str(), backend.platform()->Name().c_str()); } @@ -155,23 +138,22 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions( StatusOr> LocalExecutable::Run( const tensorflow::gtl::ArraySlice arguments, - const ExecutableRunOptions& options) { - TF_RETURN_IF_ERROR(ValidateExecutionOptions(arguments, options, *backend_)); - - ExecutableRunOptions actual_options = options; + ExecutableRunOptions run_options) { + TF_RETURN_IF_ERROR( + ValidateExecutionOptions(arguments, run_options, *backend_)); Backend::StreamPtr stream; - if (options.stream() == nullptr) { + if (run_options.stream() == nullptr) { // NB! The lifetime of `stream` needs to match the lifetime of // `actual_options` (otherwise we will end up using a returned stream in // ExecuteOnStreamWrapper), which is why it isn't declared in the inner "if" // scope. 
TF_ASSIGN_OR_RETURN( - stream, BorrowStreamForDevice(options.device_ordinal(), backend_)); - actual_options.set_stream(stream.get()); + stream, BorrowStreamForDevice(run_options.device_ordinal(), backend_)); + run_options.set_stream(stream.get()); } - if (options.allocator() == nullptr) { - actual_options.set_allocator(backend_->memory_allocator()); + if (run_options.allocator() == nullptr) { + run_options.set_allocator(backend_->memory_allocator()); } // For local client execution on CPU backends: @@ -180,7 +162,7 @@ StatusOr> LocalExecutable::Run( // *) The thread pool used for XLA CPU ops is from // backend_->eigen_intra_op_thread_pool(). ServiceExecutableRunOptions service_options( - actual_options, backend_->StreamBorrower(), + run_options, backend_->StreamBorrower(), backend_->eigen_intra_op_thread_pool()); if (executable_->dumping()) { @@ -189,9 +171,8 @@ StatusOr> LocalExecutable::Run( TF_ASSIGN_OR_RETURN( std::unique_ptr result, executable_->ExecuteOnStreamWrapper( - &service_options, options.execution_profile(), arguments)); - return ScopedShapedBuffer::MakeScoped(result.get(), - actual_options.allocator()); + &service_options, run_options.execution_profile(), arguments)); + return ScopedShapedBuffer::MakeScoped(result.get(), run_options.allocator()); } StatusOr> LocalExecutable::ExecuteAndDump( @@ -267,16 +248,19 @@ StatusOr> LocalClient::Compile( const Computation& computation, const tensorflow::gtl::ArraySlice argument_layouts, const ExecutableBuildOptions& options) { - int device_ordinal = options.device_ordinal() == -1 - ? default_device_ordinal() - : options.device_ordinal(); - TF_ASSIGN_OR_RETURN(std::unique_ptr executable, - local_service_->CompileExecutable( - computation.handle(), argument_layouts, - options.result_layout(), device_ordinal)); + ExecutableBuildOptions updated_options = options; + if (options.device_ordinal() == -1) { + updated_options.set_device_ordinal(default_device_ordinal()); + VLOG(3) << "Set device ordinal to default value of: " + << updated_options.device_ordinal(); + } + TF_ASSIGN_OR_RETURN( + std::unique_ptr executable, + local_service_->CompileExecutable(computation.handle(), argument_layouts, + updated_options)); return WrapUnique(new LocalExecutable(std::move(executable), local_service_->mutable_backend(), - device_ordinal, options)); + updated_options)); } StatusOr> diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h index 19fd14f76bc..b52a30f5a0b 100644 --- a/tensorflow/compiler/xla/client/local_client.h +++ b/tensorflow/compiler/xla/client/local_client.h @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/computation.h" +#include "tensorflow/compiler/xla/client/executable_build_options.h" #include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" @@ -33,39 +34,13 @@ limitations under the License. namespace xla { -// Class containing options for building an LocalExecutable with -// LocalClient::Compile. -class ExecutableBuildOptions { - public: - // If set, this is the device to build the computation for. Valid - // device_ordinal values are: 0 to # of devices - 1. These values are - // identical to the device ordinal values used by StreamExecutor. 
The built - // executable will be executable on any device equivalent to the specified - // device as determined by Backend::devices_equivalent(). A value of -1 - // indicates this option has not been set. - ExecutableBuildOptions& set_device_ordinal(int device_ordinal); - int device_ordinal() const; - - // If set, this specifies the layout of the result of the computation. If not - // set, the service will chose the layout of the result. A Shape is used to - // store the layout to accommodate tuple result shapes. A value of nullptr - // indicates the option has not been set. - ExecutableBuildOptions& set_result_layout(const Shape& shape_with_layout); - const Shape* result_layout() const; - - private: - int device_ordinal_ = -1; - Shape result_layout_; - bool result_layout_set_ = false; -}; - class LocalExecutable { public: // Run the compiled computation with the given arguments and options and // return the result. StatusOr> Run( const tensorflow::gtl::ArraySlice arguments, - const ExecutableRunOptions& options); + ExecutableRunOptions run_options); // Return the layout (contained in a shape) of the result produced by the // computation. @@ -88,8 +63,7 @@ class LocalExecutable { // Constructor invoked by LocalClient. LocalExecutable(std::unique_ptr executable, Backend* backend, - int device_ordinal, - const ExecutableBuildOptions& build_options); + ExecutableBuildOptions build_options); // Validates that the given arguments and options satisfy various constraints // of the computation. @@ -117,19 +91,19 @@ class LocalExecutable { StatusOr> LiteralFromShapedBuffer( const ShapedBuffer& shaped_buffer); + // The ordinal of the device which this executable was compiled for. The + // executable can run on all equivalent devices (as determined by + // Backend::devices_equivalent). + int build_device_ordinal() const { return build_options_.device_ordinal(); } + // Compiled computation. std::unique_ptr executable_; // Execution backend. - Backend* backend_; - - // The ordinal of the device which this executable was compiled for. The - // executable can run on all equivalent devices (as determined by - // Backend::devices_equivalent). - int build_device_ordinal_; + Backend* backend_ = nullptr; // Options used to build the executable. 
- const ExecutableBuildOptions& build_options_; + const ExecutableBuildOptions build_options_; }; // An XLA Client specialization for use when the client and service run in diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc index fe3a4d2f6df..c8ed3e3a2b0 100644 --- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc +++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc @@ -221,13 +221,19 @@ void AllocateFlags() { flag_values->xla_gpu_disable_multi_streaming(), "If true, multi-streaming in the GPU backend is disabled."), tensorflow::Flag( - "xla_dump_hlo_proto_to", flag_values->mutable_xla_dump_hlo_proto_to(), - "Dump compilation artifacts as proto binary into this directory."), + "xla_dump_optimized_hlo_proto_to", + flag_values->mutable_xla_dump_optimized_hlo_proto_to(), + "Dump Hlo after all hlo passes are executed as proto binary into " + "this directory."), tensorflow::Flag( - "xla_dump_prepass_hlo_proto_to", - flag_values->mutable_xla_dump_prepass_hlo_proto_to(), - "Dump compilation artifacts, before hlo passes are executed, as " - "proto binary into this directory."), + "xla_dump_unoptimized_hlo_proto_to", + flag_values->mutable_xla_dump_unoptimized_hlo_proto_to(), + "Dump HLO before any hlo passes are executed as proto binary into " + "this directory."), + tensorflow::Flag("xla_dump_per_pass_hlo_proto_to", + flag_values->mutable_xla_dump_per_pass_hlo_proto_to(), + "Dump HLO after each pass as an HloProto in binary file " + "format into this directory."), tensorflow::Flag( "xla_test_all_output_layouts", bool_setter_for(&DebugOptions::set_xla_test_all_output_layouts), diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h index e0196509a74..2b68b8f177d 100644 --- a/tensorflow/compiler/xla/literal_util.h +++ b/tensorflow/compiler/xla/literal_util.h @@ -486,6 +486,7 @@ class Literal { std::vector> elements); // Returns a string representation of the literal value. + // Warning: this function can take minutes for multi-million element Literals. 
string ToString(bool print_layout = false) const; // Invokes the "per cell" callback for each element in the provided diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index a8ca0e3ea01..e2972f06016 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -49,6 +49,7 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client:executable_build_options", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/core:framework_lite", diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc index 37f1eada2bc..8386acf0cd4 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.cc +++ b/tensorflow/compiler/xla/python/local_computation_builder.cc @@ -98,15 +98,25 @@ const std::unique_ptr& LocalShapedBuffer::shaped_buffer() return shaped_buffer_; } +static StatusOr> ToBuffer( + LocalClient* client, int device_ordinal, const Literal& arg) { + return client->LiteralToShapedBuffer(arg, device_ordinal, + client->backend().memory_allocator()); +} + /* static */ -LocalShapedBuffer* LocalShapedBuffer::FromLiteral(const Literal& argument) { +LocalShapedBuffer* LocalShapedBuffer::FromLiteral( + const Literal& argument, + const tensorflow::gtl::optional& shape_with_layout) { LocalClient* client = GetOrCreateLocalClient(); - std::unique_ptr buf = - client - ->LiteralToShapedBuffer(argument, - /*device_ordinal=*/0, - client->backend().memory_allocator()) - .ConsumeValueOrDie(); + std::unique_ptr buf; + if (shape_with_layout) { + std::unique_ptr relaid = + argument.Relayout(shape_with_layout.value()); + buf = ToBuffer(client, /*device_ordinal=*/0, *relaid).ConsumeValueOrDie(); + } else { + buf = ToBuffer(client, /*device_ordinal=*/0, argument).ConsumeValueOrDie(); + } return new LocalShapedBuffer(std::move(buf)); } @@ -120,7 +130,8 @@ CompiledLocalComputation::CompiledLocalComputation( : executable_(std::move(executable)) {} StatusOr> CompiledLocalComputation::Execute( - const std::vector& arguments) { + const std::vector& arguments, + const std::vector>& shapes_with_layout) { LocalClient* client = GetOrCreateLocalClient(); VLOG(1) << "Execution requested with " << GetReplicaCount() << " replicas."; @@ -133,7 +144,8 @@ StatusOr> CompiledLocalComputation::Execute( GetReplicaCount()); for (int replica = 0; replica < GetReplicaCount(); ++replica) { - pool.Schedule([this, client, replica, &arguments, &results] { + pool.Schedule([this, client, replica, &arguments, &shapes_with_layout, + &results] { StatusOr device_ordinal_status = client->ReplicaNumberToDeviceOrdinal(replica); if (!device_ordinal_status.ok()) { @@ -144,18 +156,28 @@ StatusOr> CompiledLocalComputation::Execute( VLOG(3) << "Replica " << replica << " mapped to device ordinal for execution: " << device_ordinal; + // Transfer arguments in std::vector> scoped_buffers; scoped_buffers.reserve(arguments.size()); - for (const Literal& argument : arguments) { - StatusOr> pushed = - client->LiteralToShapedBuffer( - argument, device_ordinal, - client->backend().memory_allocator()); + for (int i = 0; i < arguments.size(); ++i) { + const Literal& argument = arguments[i]; + const tensorflow::gtl::optional& shape_with_layout = + shapes_with_layout[i]; + + StatusOr> pushed; + if (shape_with_layout) { + 
std::unique_ptr relaid = + argument.Relayout(shape_with_layout.value()); + pushed = ToBuffer(client, device_ordinal, *relaid); + } else { + pushed = ToBuffer(client, device_ordinal, argument); + } if (!pushed.ok()) { results[replica] = pushed.status(); return; } + scoped_buffers.push_back(std::move(pushed).ValueOrDie()); } @@ -233,7 +255,8 @@ LocalComputation::LocalComputation(Computation computation) : computation_(std::move(computation)) {} StatusOr LocalComputation::Compile( - const std::vector& argument_shapes) { + const std::vector& argument_shapes, + const ExecutableBuildOptions* build_options) { std::vector argument_shape_pointers; argument_shape_pointers.reserve(argument_shapes.size()); for (auto& argument_shape : argument_shapes) { @@ -242,6 +265,9 @@ StatusOr LocalComputation::Compile( LocalClient* client = GetOrCreateLocalClient(); ExecutableBuildOptions options; + if (build_options != nullptr) { + options = *build_options; + } TF_ASSIGN_OR_RETURN( auto local_executable, client->Compile(computation_, argument_shape_pointers, options)); @@ -363,12 +389,6 @@ LocalComputationBuilder::SelectAndScatterWithGeneralPadding( source, init_value, scatter.computation()); } -ComputationDataHandle LocalComputationBuilder::Select( - const ComputationDataHandle& pred, const ComputationDataHandle& on_true, - const ComputationDataHandle& on_false) { - return builder_.Select(pred, on_true, on_false); -} - ComputationDataHandle LocalComputationBuilder::Tuple( tensorflow::gtl::ArraySlice elements) { return builder_.Tuple(elements); @@ -384,6 +404,12 @@ ComputationDataHandle LocalComputationBuilder::Dot( return builder_.Dot(lhs, rhs); } +ComputationDataHandle LocalComputationBuilder::DotGeneral( + const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, + const DotDimensionNumbers& dimension_numbers) { + return builder_.DotGeneral(lhs, rhs, dimension_numbers); +} + ComputationDataHandle LocalComputationBuilder::ConvGeneralDilated( const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, tensorflow::gtl::ArraySlice window_strides, @@ -483,6 +509,15 @@ ComputationDataHandle LocalComputationBuilder::While( tensorflow::gtl::ArraySlice broadcast_dimensions), \ (lhs, rhs, broadcast_dimensions)) +#define _FORWARD_TRIOP(method_name) \ + _FORWARD( \ + method_name, ComputationDataHandle, \ + (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \ + const ComputationDataHandle& ehs), \ + (lhs, rhs, ehs)) + +_FORWARD_TRIOP(Select) +_FORWARD_TRIOP(Clamp) _FORWARD_BINOP(Eq) _FORWARD_BINOP(Ne) _FORWARD_BINOP(Ge) @@ -503,6 +538,7 @@ _FORWARD_UNOP(Abs) _FORWARD_UNOP(Exp) _FORWARD_UNOP(Floor) _FORWARD_UNOP(Ceil) +_FORWARD_UNOP(Round) _FORWARD_UNOP(Log) _FORWARD_UNOP(Sign) _FORWARD_UNOP(Cos) @@ -519,6 +555,7 @@ _FORWARD_UNOP(Sort) #undef _FORWARD #undef _FORWARD_UNOP #undef _FORWARD_BINOP +#undef _FORWARD_TRIOP void DeleteLocalShapedBuffer(LocalShapedBuffer* local_shaped_buffer) { delete local_shaped_buffer; diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h index e5503cd52fa..f39d15cff70 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.h +++ b/tensorflow/compiler/xla/python/local_computation_builder.h @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/executable_build_options.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -58,7 +59,9 @@ StatusOr > TransferFromOutfeedLocalReplica( // client. class LocalShapedBuffer { public: - static LocalShapedBuffer* FromLiteral(const Literal& argument); + static LocalShapedBuffer* FromLiteral( + const Literal& argument, + const tensorflow::gtl::optional& shape_with_layout); LocalShapedBuffer(std::unique_ptr shaped_buffer); const std::unique_ptr& shaped_buffer() const; std::unique_ptr ToLiteral() const; @@ -76,8 +79,15 @@ class LocalShapedBuffer { class CompiledLocalComputation { public: CompiledLocalComputation(std::unique_ptr executable); + + // Execute the computation with the given argument literals, and + // with optionally-specified argument layouts. The literals will be + // re-laid out according to the corresponding elements of + // shapes_with_layout. StatusOr > Execute( - const std::vector& arguments); + const std::vector& arguments, + const std::vector >& shapes_with_layout); + LocalShapedBuffer* ExecuteWithShapedBuffers( tensorflow::gtl::ArraySlice argument_handles); @@ -93,7 +103,8 @@ class LocalComputation { public: LocalComputation(Computation computation); StatusOr Compile( - const std::vector& argument_shapes); + const std::vector& argument_shapes, + const ExecutableBuildOptions* build_options); const Computation& computation() const; private: @@ -172,10 +183,6 @@ class LocalComputationBuilder { const ComputationDataHandle& source, const ComputationDataHandle& init_value, const LocalComputation& scatter); - ComputationDataHandle Select(const ComputationDataHandle& pred, - const ComputationDataHandle& on_true, - const ComputationDataHandle& on_false); - ComputationDataHandle Tuple( tensorflow::gtl::ArraySlice elements); @@ -185,6 +192,10 @@ class LocalComputationBuilder { ComputationDataHandle Dot(const ComputationDataHandle& lhs, const ComputationDataHandle& rhs); + ComputationDataHandle DotGeneral( + const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, + const DotDimensionNumbers& dimension_numbers); + ComputationDataHandle ConvGeneralDilated( const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, tensorflow::gtl::ArraySlice window_strides, @@ -252,6 +263,14 @@ class LocalComputationBuilder { (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \ tensorflow::gtl::ArraySlice broadcast_dimensions)) +#define _FORWARD_TRIOP(method_name) \ + _FORWARD( \ + method_name, ComputationDataHandle, \ + (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \ + const ComputationDataHandle& ehs)) + + _FORWARD_TRIOP(Select) + _FORWARD_TRIOP(Clamp) _FORWARD_BINOP(Eq) _FORWARD_BINOP(Ne) _FORWARD_BINOP(Ge) @@ -272,6 +291,7 @@ class LocalComputationBuilder { _FORWARD_UNOP(Exp) _FORWARD_UNOP(Floor) _FORWARD_UNOP(Ceil) + _FORWARD_UNOP(Round) _FORWARD_UNOP(Log) _FORWARD_UNOP(Sign) _FORWARD_UNOP(Cos) @@ -288,6 +308,7 @@ class LocalComputationBuilder { #undef _FORWARD #undef _FORWARD_UNOP #undef _FORWARD_BINOP +#undef _FORWARD_TRIOP private: ComputationBuilder builder_; diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i index 31789259609..5ea75550c96 100644 --- 
a/tensorflow/compiler/xla/python/local_computation_builder.i +++ b/tensorflow/compiler/xla/python/local_computation_builder.i @@ -27,12 +27,14 @@ limitations under the License. // ArraySlice <- sequence of int // Literal <-> (nested tuple of) numpy ndarray // std::vector <- sequence of (nested tuple of) ndarray -// Shape <-> pair holding (dtype, dimensions) -// std::vector <- sequence of shape information pairs +// Shape -> pair holding (dtype, dimensions) +// <- object duck-typed as xla_client.Shape +// std::vector <- sequence of xla_client.Shape objects // PrimitiveType <- int // ArraySlice> <- sequence of int pairs // PaddingConfig proto <- corresponding Python proto // ConvolutionDimensionNumbers proto <- corresponding Python proto +// DotDimensionNumbers proto <- corresponding Python proto // // Arrows indicate whether a conversion only ever occurs in one // direction, or whether it is maintained bidirectionally. @@ -55,7 +57,7 @@ limitations under the License. // translates to a tuple-shaped XLA Literal, whose component subshapes // are a 2x3 F32-shaped literal followed by two tuple-shaped literals. // -// The Python objects corresponding to C++ Shapes have the type: +// Shapes output by C++ become Python objects with the type: // // T = (dtype, S) // S = DIMENSIONS | TUPLE_SHAPES @@ -176,6 +178,16 @@ tensorflow::ImportNumpy(); } } +%typemap(out) StatusOr< std::unique_ptr > { + if ($1.ok()) { + std::unique_ptr value = $1.ConsumeValueOrDie(); + $result = numpy::PyObjectFromXlaLiteral(*value); + } else { + PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str()); + return NULL; + } +} + %typemap(out) StatusOr { if ($1.ok()) { auto* value = $1.ValueOrDie(); @@ -343,15 +355,31 @@ tensorflow::ImportNumpy(); // Shape %typemap(in) const Shape& (Shape temp) { - Status shape_status = numpy::CheckPyShapeInfo($input); - if (!shape_status.ok()) { - PyErr_SetString(PyExc_RuntimeError, shape_status.ToString().c_str()); + StatusOr statusor = numpy::XlaShapeFromPyShape($input); + if (!statusor.ok()) { + PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str()); return NULL; } - temp = numpy::XlaShapeFromPyShapeInfo($input); + temp = std::move(statusor).ValueOrDie(); $1 = &temp; } +%typemap(in) const tensorflow::gtl::optional& ( + tensorflow::gtl::optional temp) { + if ($input == Py_None) { + temp = tensorflow::gtl::nullopt; + $1 = &temp; + } else { + StatusOr statusor = numpy::XlaShapeFromPyShape($input); + if (!statusor.ok()) { + PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str()); + return NULL; + } + temp = std::move(statusor).ValueOrDie(); + $1 = &temp; + } +} + %typemap(out) std::unique_ptr { $result = numpy::PyShapeInfoFromXlaShape(*$1); } @@ -364,14 +392,37 @@ tensorflow::ImportNumpy(); const int size = PySequence_Size($input); for (int i = 0; i < size; ++i) { PyObject* o = PySequence_GetItem($input, i); - Status shape_status = numpy::CheckPyShapeInfo(o); - if (!shape_status.ok()) { - PyErr_SetString(PyExc_RuntimeError, shape_status.ToString().c_str()); - Py_DECREF(o); + StatusOr statusor = numpy::XlaShapeFromPyShape(o); + Py_DECREF(o); + if (!statusor.ok()) { + PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str()); return NULL; } - temps.push_back(numpy::XlaShapeFromPyShapeInfo(o)); - Py_DECREF(o); + temps.push_back(statusor.ConsumeValueOrDie()); + } + $1 = &temps; +} + +%typemap(in) const std::vector >& ( + std::vector > temps) { + if (!PySequence_Check($input)) { + PyErr_SetString(PyExc_TypeError, "Argument is not a 
sequence"); + return NULL; + } + const int size = PySequence_Size($input); + for (int i = 0; i < size; ++i) { + PyObject* o = PySequence_GetItem($input, i); + if (o == Py_None) { + temps.push_back(tensorflow::gtl::nullopt); + } else { + StatusOr statusor = numpy::XlaShapeFromPyShape(o); + Py_DECREF(o); + if (!statusor.ok()) { + PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str()); + return NULL; + } + temps.push_back(statusor.ConsumeValueOrDie()); + } } $1 = &temps; } @@ -461,6 +512,135 @@ tensorflow::ImportNumpy(); $1 = temps; } +// DotDimensionNumbers + +%typemap(in) const DotDimensionNumbers& + (DotDimensionNumbers dimension_numbers) { + int length; + + /* lhs_contracting_dimensions */ + PyObject* lhs_contracting_dimensions = PyObject_GetAttrString( + $input, "lhs_contracting_dimensions"); + if (!lhs_contracting_dimensions) { + return NULL; + } + + length = PySequence_Size(lhs_contracting_dimensions); + if (length == -1) { + Py_DECREF(lhs_contracting_dimensions); + return NULL; + } + + for (int i = 0; i < length; ++i) { + PyObject* item = PySequence_GetItem(lhs_contracting_dimensions, i); + if (!item) { + Py_DECREF(lhs_contracting_dimensions); + return NULL; + } + const int64 dimension = numpy::PyIntOrPyLongToLong(item); + if (dimension == -1 && PyErr_Occurred()) { + Py_DECREF(item); + Py_DECREF(lhs_contracting_dimensions); + return NULL; + } + dimension_numbers.add_lhs_contracting_dimensions(dimension); + Py_DECREF(item); + } + Py_DECREF(lhs_contracting_dimensions); + + /* rhs_contracting_dimensions */ + PyObject* rhs_contracting_dimensions = PyObject_GetAttrString( + $input, "rhs_contracting_dimensions"); + if (!lhs_contracting_dimensions) { + return NULL; + } + + length = PySequence_Size(rhs_contracting_dimensions); + if (length == -1) { + Py_DECREF(rhs_contracting_dimensions); + return NULL; + } + + for (int i = 0; i < length; ++i) { + PyObject* item = PySequence_GetItem(rhs_contracting_dimensions, i); + if (!item) { + Py_DECREF(rhs_contracting_dimensions); + return NULL; + } + const int64 dimension = numpy::PyIntOrPyLongToLong(item); + if (dimension == -1 && PyErr_Occurred()) { + Py_DECREF(item); + Py_DECREF(rhs_contracting_dimensions); + return NULL; + } + dimension_numbers.add_rhs_contracting_dimensions(dimension); + Py_DECREF(item); + } + Py_DECREF(rhs_contracting_dimensions); + + /* lhs_batch_dimensions */ + PyObject* lhs_batch_dimensions = PyObject_GetAttrString( + $input, "lhs_batch_dimensions"); + if (!lhs_batch_dimensions) { + return NULL; + } + + length = PySequence_Size(lhs_batch_dimensions); + if (length == -1) { + Py_DECREF(lhs_batch_dimensions); + return NULL; + } + + for (int i = 0; i < length; ++i) { + PyObject* item = PySequence_GetItem(lhs_batch_dimensions, i); + if (!item) { + Py_DECREF(lhs_batch_dimensions); + return NULL; + } + const int64 dimension = numpy::PyIntOrPyLongToLong(item); + if (dimension == -1 && PyErr_Occurred()) { + Py_DECREF(item); + Py_DECREF(lhs_batch_dimensions); + return NULL; + } + dimension_numbers.add_lhs_batch_dimensions(dimension); + Py_DECREF(item); + } + Py_DECREF(lhs_batch_dimensions); + + /* rhs_batch_dimensions */ + PyObject* rhs_batch_dimensions = PyObject_GetAttrString( + $input, "rhs_batch_dimensions"); + if (!rhs_batch_dimensions) { + return NULL; + } + + length = PySequence_Size(rhs_batch_dimensions); + if (length == -1) { + Py_DECREF(rhs_batch_dimensions); + return NULL; + } + + for (int i = 0; i < length; ++i) { + PyObject* item = PySequence_GetItem(rhs_batch_dimensions, i); + if (!item) { + 
Py_DECREF(rhs_batch_dimensions); + return NULL; + } + const int64 dimension = numpy::PyIntOrPyLongToLong(item); + if (dimension == -1 && PyErr_Occurred()) { + Py_DECREF(item); + Py_DECREF(rhs_batch_dimensions); + return NULL; + } + dimension_numbers.add_rhs_batch_dimensions(dimension); + Py_DECREF(item); + } + Py_DECREF(rhs_batch_dimensions); + + $1 = &dimension_numbers; +} + // PaddingConfig %typemap(in) const PaddingConfig& @@ -623,6 +803,30 @@ tensorflow::ImportNumpy(); $1 = &dimension_numbers; } +// ExecutableBuildOptions + +%typemap(in) const ExecutableBuildOptions* + (ExecutableBuildOptions build_options) { + if ($input == Py_None) { + $1 = NULL; + } else { + PyObject* o = PyObject_GetAttrString($input, "generate_hlo_graph"); + if (!o) { + return NULL; + } + if (o != Py_None) { + if (!PyString_Check(o)) { + PyErr_SetString(PyExc_TypeError, "ExecutableBuildOptions.generate_hlo_graph must be a string or None."); + return NULL; + } + build_options.set_generate_hlo_graph(PyString_AsString(o)); + } + Py_DECREF(o); + + $1 = &build_options; + } +} + %ignoreall %unignore xla; %unignore xla::swig; @@ -667,6 +871,7 @@ tensorflow::ImportNumpy(); %unignore xla::swig::LocalComputationBuilder::Call; %unignore xla::swig::LocalComputationBuilder::Transpose; %unignore xla::swig::LocalComputationBuilder::Rev; +%unignore xla::swig::LocalComputationBuilder::Clamp; %unignore xla::swig::LocalComputationBuilder::Map; %unignore xla::swig::LocalComputationBuilder::Reduce; %unignore xla::swig::LocalComputationBuilder::ReduceWindowWithGeneralPadding; @@ -681,6 +886,7 @@ tensorflow::ImportNumpy(); %unignore xla::swig::LocalComputationBuilder::Lt; %unignore xla::swig::LocalComputationBuilder::Le; %unignore xla::swig::LocalComputationBuilder::Dot; +%unignore xla::swig::LocalComputationBuilder::DotGeneral; %unignore xla::swig::LocalComputationBuilder::ConvGeneralDilated; %unignore xla::swig::LocalComputationBuilder::Add; %unignore xla::swig::LocalComputationBuilder::Sub; @@ -696,6 +902,7 @@ tensorflow::ImportNumpy(); %unignore xla::swig::LocalComputationBuilder::Exp; %unignore xla::swig::LocalComputationBuilder::Floor; %unignore xla::swig::LocalComputationBuilder::Ceil; +%unignore xla::swig::LocalComputationBuilder::Round; %unignore xla::swig::LocalComputationBuilder::Log; %unignore xla::swig::LocalComputationBuilder::Sign; %unignore xla::swig::LocalComputationBuilder::Cos; diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc index 5c722623e31..3d87480728a 100644 --- a/tensorflow/compiler/xla/python/numpy_bridge.cc +++ b/tensorflow/compiler/xla/python/numpy_bridge.cc @@ -176,85 +176,107 @@ static string PyObjectCppRepr(PyObject* o) { return ExtractStringAndDecref(r); } -Status CheckPyShapeInfo(PyObject* o) { +StatusOr XlaShapeFromPyShape(PyObject* o) { auto error = [o](const string& prefix) { return InvalidArgument("%s; got %s", prefix.c_str(), PyObjectCppRepr(o).c_str()); }; - // The object is a tuple (a pair) - if (!PyTuple_Check(o)) { - return error("Shape record must be a tuple"); - } - if (PyTuple_Size(o) != 2) { - return error("Shape record tuple must be of length 2"); - } - // It has a first element, which is a numpy dtype object - PyObject* first = PyTuple_GetItem(o, 0); - if (first == nullptr) { - return error("Tuple has no item 0 (shape dtype)"); - } - if (first->ob_type != &PyArrayDescr_Type) { - return error( - "Shape record does not have a numpy dtype as its first element"); - } - const int np_type = NumpyTypenum(first); - if 
(!NumpyTypeIsValid(np_type)) { - return error("Shape record has an invalid integer dtype"); - } - - // It has a second element, which is a tuple, either of shape - // records or of Python ints - PyObject* second = PyTuple_GetItem(o, 1); - if (!second) { - return error("Tuple has no item 0 (shape dimensions)"); - } - if (!PyTuple_Check(second)) { - return error("Shape record does not have a tuple as its second element"); - } - const int length = PyTuple_Size(second); - const PrimitiveType element_type = NumpyTypeToPrimitiveType(np_type); - for (int i = 0; i < length; i++) { - PyObject* dimension = PyTuple_GetItem(second, i); - if (element_type == TUPLE) { - VLOG(3) << "element_type is tuple, checking member: " << i; - Status result = CheckPyShapeInfo(dimension); - if (!result.ok()) { - return AddStatus( - result, tensorflow::strings::StrCat("Validating tuple member ", i, - " of ", PyObjectCppRepr(o))); - } - } else if (!CheckPyIntOrLong(dimension)) { - return error("Non-tuple shape record has a non-integer dimension"); + auto get_attr = [o, &error](const string& field) -> StatusOr { + PyObject* result = + PyObject_GetAttrString(o, const_cast(field.c_str())); + if (result == nullptr) { + return error(tensorflow::strings::StrCat( + "Failed to get attribute of Shape object:", field)); } + return result; + }; + + auto call_method = [o, &error](const string& method) -> StatusOr { + PyObject* result = + PyObject_CallMethod(o, const_cast(method.c_str()), nullptr); + if (result == nullptr) { + return error(tensorflow::strings::StrCat( + "Failed to call method of shape object:", method)); + } + return result; + }; + + PyObject* np_type; + TF_ASSIGN_OR_RETURN(np_type, get_attr("np_dtype")); + if (np_type->ob_type != &PyArrayDescr_Type) { + return error("Shape attribute np_dtype is not an integer numpy dtype"); } + if (!NumpyTypeIsValid(NumpyTypenum(np_type))) { + return error("Shape attribute np_dtype is not a valid integer numpy dtype"); + } + const PrimitiveType element_type = + NumpyTypeToPrimitiveType(NumpyTypenum(np_type)); + Py_DECREF(np_type); - return Status::OK(); -} - -// Precondition: CheckPyShapeInfo(o) -Shape XlaShapeFromPyShapeInfo(PyObject* o) { - const int np_type = NumpyTypenum(PyTuple_GetItem(o, 0)); - const PrimitiveType element_type = NumpyTypeToPrimitiveType(np_type); - PyObject* py_dimensions = PyTuple_GetItem(o, 1); - const int length = PyTuple_Size(py_dimensions); if (element_type == TUPLE) { + PyObject* py_subshapes; + TF_ASSIGN_OR_RETURN(py_subshapes, call_method("tuple_shapes")); + if (!PyTuple_Check(py_subshapes)) { + return error( + "Return value of Shape method tuple_shapes() is not a tuple"); + } + const int length = PyTuple_Size(py_subshapes); std::vector subshapes; subshapes.reserve(length); for (int i = 0; i < length; i++) { - subshapes.push_back( - XlaShapeFromPyShapeInfo(PyTuple_GetItem(py_dimensions, i))); + TF_ASSIGN_OR_RETURN( + const Shape& subshape, + XlaShapeFromPyShape(PyTuple_GetItem(py_subshapes, i))); + subshapes.push_back(subshape); } + Py_DECREF(py_subshapes); return ShapeUtil::MakeTupleShape(subshapes); } else { + PyObject* py_dimensions; + PyObject* py_minor_to_major; + TF_ASSIGN_OR_RETURN(py_dimensions, call_method("dimensions")); + TF_ASSIGN_OR_RETURN(py_minor_to_major, call_method("minor_to_major")); + if (!PyTuple_Check(py_dimensions)) { + return error("Return value of Shape method dimensions() is not a tuple"); + } + if (py_minor_to_major != Py_None && !PyTuple_Check(py_minor_to_major)) { + return error( + "Return value of Shape method 
minor_to_major() is neither a tuple " + "nor None"); + } + const int length = PyTuple_Size(py_dimensions); + if (py_minor_to_major != Py_None && + length != PyTuple_Size(py_minor_to_major)) { + return error( + "Shape methods dimensions() and minor_to_major() return " + "different-length tuples"); + } std::vector dimensions(length); + std::vector minor_to_major(length); for (int i = 0; i < length; i++) { dimensions[i] = PyIntOrPyLongToLong(PyTuple_GetItem(py_dimensions, i)); - if (dimensions[i] == -1) { - CHECK(!PyErr_Occurred()); + if (dimensions[i] == -1 && PyErr_Occurred()) { + return error("Dimension is not an int"); + } + + if (py_minor_to_major != Py_None) { + minor_to_major[i] = + PyIntOrPyLongToLong(PyTuple_GetItem(py_minor_to_major, i)); + if (minor_to_major[i] == -1 && PyErr_Occurred()) { + return error("Minor-to-major value is not an int"); + } } } - return ShapeUtil::MakeShape(element_type, dimensions); + bool with_layout = py_minor_to_major != Py_None; + Py_DECREF(py_dimensions); + Py_DECREF(py_minor_to_major); + if (with_layout) { + return ShapeUtil::MakeShapeWithLayout(element_type, dimensions, + minor_to_major); + } else { + return ShapeUtil::MakeShape(element_type, dimensions); + } } } diff --git a/tensorflow/compiler/xla/python/numpy_bridge.h b/tensorflow/compiler/xla/python/numpy_bridge.h index 6ff1c34cfc5..adfcc3b8588 100644 --- a/tensorflow/compiler/xla/python/numpy_bridge.h +++ b/tensorflow/compiler/xla/python/numpy_bridge.h @@ -56,15 +56,11 @@ bool NumpyTypeIsValid(int np_type); // The return value is a new reference. PyObject* PyShapeInfoFromXlaShape(const Shape& shape); -// Returns the outcome of a best-effort check that the Python object -// is a pair of the form (numpy dtype, dimensions), as produced by -// PyShapeInfoFromXlaShape. -Status CheckPyShapeInfo(PyObject* o); - -// Performs the inverse conversion to that of PyShapeInfoFromXlaShape. +// Converts a Python object with a method interface mathing that of +// xla_client.Shape into an XLA Shape object. // // The return value is a new reference. -Shape XlaShapeFromPyShapeInfo(PyObject* o); +StatusOr XlaShapeFromPyShape(PyObject* o); // Converts a PyObject that represents operation metadata into protocol buffer // form. diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index 66ace613a0c..2c693f05816 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -89,6 +89,7 @@ _UNARY_OPS = [ 'Abs', 'Exp', 'Floor', + 'Round', 'Ceil', 'Log', 'Sign', @@ -155,9 +156,14 @@ class LocalBuffer(object): self._delete = c_api.DeleteLocalShapedBuffer @staticmethod - def from_py(npval): + def from_py(npval, layout_fn=None): npval = require_numpy_array_layout(npval) - return LocalBuffer(c_api.LocalShapedBuffer.FromLiteral(npval)) + if layout_fn: + shape = Shape.from_numpy(npval) + shape = shape.map_leaves(layout_fn) + else: + shape = None + return LocalBuffer(c_api.LocalShapedBuffer.FromLiteral(npval, shape)) def to_py(self): return self.c_local_shaped_buffer.ToLiteral() @@ -182,13 +188,17 @@ class Shape(object): represents an XLA tuple. 
""" - def __init__(self, np_dtype, dimensions): + def __init__(self, np_dtype, dimensions, minor_to_major=None): + assert isinstance(dimensions, tuple) self.np_dtype = np_dtype self._dimensions = dimensions + self._minor_to_major = minor_to_major + self._check_minor_to_major() def __repr__(self): - return 'xla_client.Shape(np_dtype={!r}, dimensions={!r})'.format( - self.np_dtype, self._dimensions) + return ('xla_client.Shape(np_dtype={!r}, dimensions={!r}, ' + 'minor_to_major={!r})').format(self.np_dtype, self._dimensions, + self._minor_to_major) def element_type(self): return DTYPE_TO_XLA_ELEMENT_TYPE[str(self.np_dtype)] @@ -201,11 +211,49 @@ class Shape(object): raise ValueError('Tuple shape has no dimensions') return self._dimensions + def minor_to_major(self): + return self._minor_to_major + def tuple_shapes(self): if not self.is_tuple(): raise ValueError('Shape is not a tuple shape') return self._dimensions + def rank(self): + return len(self.dimensions()) + + def map_leaves(self, f): + """Map f over each leaf-level array subshape. + + Args: + f: The function to apply. Whenever f returns None, the identity is + applied instead. + + Returns: + A new Shape with the mapped leaves. + """ + if self.is_tuple(): + children = tuple(child.map_leaves(f) for child in self.tuple_shapes()) + return Shape(np.dtype('O'), children) + else: + mapped = f(self) + return self if mapped is None else mapped + + def _check_minor_to_major(self): + mtm = self._minor_to_major + if self.is_tuple(): + assert mtm is None, self + if mtm is not None: + assert self.rank() == len(mtm), self + assert sorted(mtm) == range(len(mtm)), self + + def update_minor_to_major(self, minor_to_major): + if not isinstance(minor_to_major, tuple): + raise TypeError('minor_to_major must be a tuple') + updated = Shape(self.np_dtype, tuple(self.dimensions()), minor_to_major) + updated._check_minor_to_major() # pylint: disable=protected-access + return updated + @staticmethod def from_numpy(npval): @@ -222,23 +270,10 @@ def _wrap_shape(shape_info): dtype, dims = shape_info element_type = DTYPE_TO_XLA_ELEMENT_TYPE[str(dtype)] if element_type == xla_data_pb2.TUPLE: - dims = [_wrap_shape(subshape_info) for subshape_info in dims] + dims = tuple(_wrap_shape(subshape_info) for subshape_info in dims) return Shape(dtype, dims) -def _unwrap_shape(shape): - if shape.is_tuple(): - components = tuple( - _unwrap_shape(subshape) for subshape in shape.tuple_shapes()) - else: - components = shape.dimensions() - return (shape.np_dtype, components) - - -def _unwrap_shapes(shapes): - return [_unwrap_shape(shape) for shape in shapes] - - def _wrap_data_handle(handle): cdh = xla_data_pb2.ComputationDataHandle() cdh.handle = handle @@ -260,6 +295,17 @@ def require_numpy_array_layout(value): return np.require(value, requirements=['C', 'A']) +class CompileOptions(object): + """Python object for XLA compile options. + + These options can be passed to the 'compile' step when using a local XLA + client. + """ + + def __init__(self): + self.generate_hlo_graph = None + + def transfer_to_infeed(value, replica_number=None): """Transfers the given value into the XLA infeed queue. @@ -291,8 +337,7 @@ def transfer_from_outfeed(shape, replica_number=None): Returns: The literal value that is produced from the outfeed queue. 
""" - return c_api.TransferFromOutfeedLocalReplica( - _unwrap_shape(shape), replica_number or 0) + return c_api.TransferFromOutfeedLocalReplica(shape, replica_number or 0) class LocalComputation(object): @@ -313,22 +358,39 @@ class LocalComputation(object): else: self._delete = c_api.DeleteLocalComputation - def Compile(self, argument_shapes=()): + def Compile(self, argument_shapes=(), compile_options=None, layout_fn=None): if self.is_compiled: raise ValueError('Attempt to compile a compiled local XLA computation.') + if layout_fn: + argument_shapes = [ + shape.map_leaves(layout_fn) for shape in argument_shapes + ] return LocalComputation( - self.c_local_computation.Compile(_unwrap_shapes(argument_shapes)), + self.c_local_computation.Compile(argument_shapes, compile_options), is_compiled=True) - def CompileWithExampleArguments(self, arguments=()): + def CompileWithExampleArguments(self, + arguments=(), + compile_options=None, + layout_fn=None): return self.Compile( - argument_shapes=[Shape.from_numpy(arg) for arg in arguments]) + argument_shapes=[Shape.from_numpy(arg) for arg in arguments], + compile_options=compile_options, + layout_fn=layout_fn) - def Execute(self, arguments=()): + def Execute(self, arguments=(), layout_fn=None): + """Execute with Python values as arguments and return value.""" if not self.is_compiled: raise ValueError('Cannot execute an uncompiled local XLA computation.') + argument_shapes = [Shape.from_numpy(arg) for arg in arguments] + if layout_fn: + argument_shapes = [ + shape.map_leaves(layout_fn) for shape in argument_shapes + ] + else: + argument_shapes = [None for shape in argument_shapes] arguments = tuple(map(require_numpy_array_layout, arguments)) - return self.c_local_computation.Execute(arguments) + return self.c_local_computation.Execute(arguments, argument_shapes) def ExecuteWithLocalBuffers(self, arguments=()): """Execute with LocalBuffer arguments and return value.""" @@ -384,7 +446,7 @@ class ComputationBuilder(object): Returns: A ComputationDataHandle message. """ - return _wrap_data_handle(self._client.Infeed(_unwrap_shape(shape))) + return _wrap_data_handle(self._client.Infeed(shape)) def Outfeed(self, operand): """Enqueues an outfeed op onto the computation. @@ -393,7 +455,7 @@ class ComputationBuilder(object): outfeed queue for subsequent dequeue via the client API. """ self._client.Outfeed( - _unwrap_data_handle(operand), _unwrap_shape(self.GetShape(operand)), + _unwrap_data_handle(operand), self.GetShape(operand), ''.encode('utf-8')) def Constant(self, value): @@ -484,8 +546,7 @@ class ComputationBuilder(object): parameter_num = next(self._parameter_numbering) return _wrap_data_handle( - self._client.Parameter( - parameter_num, _unwrap_shape(shape), name.encode('utf8'))) + self._client.Parameter(parameter_num, shape, name.encode('utf8'))) def ParameterFromNumpy(self, value, name=None, parameter_num=None): """Enqueues a Parameter op onto the computation. @@ -606,6 +667,13 @@ class ComputationBuilder(object): return _wrap_data_handle( self._client.Rev(_unwrap_data_handle(operand), dimensions)) + def Clamp(self, min, operand, max): # pylint: disable=redefined-builtin + """Clamp op.""" + return _wrap_data_handle( + self._client.Clamp(_unwrap_data_handle(min), + _unwrap_data_handle(operand), + _unwrap_data_handle(max))) + def SelectAndScatter(self, operand, select, window_dimensions, window_strides, padding, source, init_value, scatter): """Select and scatter op, used by the gradient of ReduceWindow. 
@@ -825,8 +893,7 @@ class ComputationBuilder(object): shape = Shape(self.GetShape(mu).np_dtype, dims) return _wrap_data_handle( self._client.RngNormal( - _unwrap_data_handle(mu), _unwrap_data_handle(sigma), - _unwrap_shape(shape))) + _unwrap_data_handle(mu), _unwrap_data_handle(sigma), shape)) def RngUniform(self, a, b, dims): """Enqueues an RngUniform operation onto the computation. @@ -846,8 +913,7 @@ class ComputationBuilder(object): shape = Shape(self.GetShape(a).np_dtype, dims) return _wrap_data_handle( self._client.RngUniform( - _unwrap_data_handle(a), _unwrap_data_handle(b), - _unwrap_shape(shape))) + _unwrap_data_handle(a), _unwrap_data_handle(b), shape)) def While(self, cond, body, init): """Enqueues a While operation onto the computation. @@ -865,10 +931,37 @@ class ComputationBuilder(object): _unwrap_data_handle(init))) def Dot(self, lhs, rhs): - """Matrix multiplication between lhs and rhs.""" + """Enqueues a dot operation onto the computation. + + Args: + lhs: ComputationDataHandle for the rank 1 or rank 2 left-hand-side array. + rhs: ComputationDataHandle for the rank 1 or rank 2 right-hand-side array. + + Returns: a ComputationDataHandle representing the Dot operation. + """ return _wrap_data_handle( self._client.Dot(_unwrap_data_handle(lhs), _unwrap_data_handle(rhs))) + def DotGeneral(self, lhs, rhs, dimension_numbers): + """Enqueues a general dot operation onto the computation. + + Args: + lhs: ComputationDataHandle for the left-hand-side array. + rhs: ComputationDataHandle for the right-hand-side array. + dimension_numbers: either an xla_data_pb2.DotDimensionNumbers or a nested + tuple ((lhs_contract, rhs_contract), (lhs_batch, rhs_batch)) of lists of + integers representing the dimensions to treat as contracting dimensions + and batch dimensions on each input operand. + + Returns: a ComputationDataHandle representing the DotGeneral operation. + """ + if not isinstance(dimension_numbers, xla_data_pb2.DotDimensionNumbers): + dimension_numbers = GetDotDimensionsFromLists(dimension_numbers) + return _wrap_data_handle( + self._client.DotGeneral( + _unwrap_data_handle(lhs), _unwrap_data_handle(rhs), + dimension_numbers)) + def Conv(self, lhs, rhs, window_strides, padding): """Enqueues a Conv operation onto the computation. @@ -979,7 +1072,7 @@ def initialize_replica_count(replica_count): Args: replica_count: number of replicas that are desired for set up during XLA - initalization. + initialization. Raises: A runtime exception if the XLA service has already been initialized. 
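The DotGeneral wrapper documented above accepts dimension_numbers either as an xla_data_pb2.DotDimensionNumbers proto or as a nested tuple ((lhs_contract, rhs_contract), (lhs_batch, rhs_batch)); the helper GetDotDimensionsFromLists (added below) converts the tuple form to the proto. A short sketch of the batched-matmul case exercised by the new unit test, using the nested-tuple form; the builder name and the Build/Compile scaffolding are assumed from the existing test helpers.

import numpy as np
from tensorflow.compiler.xla.python import xla_client

lhs = np.random.randn(10, 3, 4).astype(np.float32)
rhs = np.random.randn(10, 4, 5).astype(np.float32)

builder = xla_client.ComputationBuilder("dot_general_example")  # hypothetical name
# Contract lhs dimension 2 against rhs dimension 1, and treat dimension 0 of
# both operands as a batch dimension; for these shapes this matches
# np.matmul(lhs, rhs) with result shape (10, 3, 5).
dimension_numbers = (([2], [1]), ([0], [0]))
builder.DotGeneral(builder.Constant(lhs), builder.Constant(rhs),
                   dimension_numbers)
compiled = builder.Build().CompileWithExampleArguments(())
result = compiled.Execute(())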
@@ -1005,3 +1098,13 @@ def GetPaddingConfigFromTriples(triples): dimension.edge_padding_high = hi dimension.interior_padding = interior return padding_config + + +def GetDotDimensionsFromLists(dimension_numbers): + (lhs_contract, rhs_contract), (lhs_batch, rhs_batch) = dimension_numbers + dot_dims_proto = xla_data_pb2.DotDimensionNumbers() + dot_dims_proto.lhs_contracting_dimensions.extend(lhs_contract) + dot_dims_proto.rhs_contracting_dimensions.extend(rhs_contract) + dot_dims_proto.lhs_batch_dimensions.extend(lhs_batch) + dot_dims_proto.rhs_batch_dimensions.extend(rhs_batch) + return dot_dims_proto diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py index c0413b9bbc3..421fba40e32 100644 --- a/tensorflow/compiler/xla/python/xla_client_test.py +++ b/tensorflow/compiler/xla/python/xla_client_test.py @@ -444,6 +444,30 @@ class SingleOpTest(LocalComputationTest): c.Dot(c.Constant(lhs), c.Constant(rhs)) self._ExecuteAndCompareClose(c, expected=np.dot(lhs, rhs)) + def testDotGeneral(self): + c = self._NewComputation() + rng = np.random.RandomState(0) + lhs = NumpyArrayF32(rng.randn(10, 3, 4)) + rhs = NumpyArrayF32(rng.randn(10, 4, 5)) + dimension_numbers = (([2], [1]), ([0], [0])) + c.DotGeneral(c.Constant(lhs), c.Constant(rhs), dimension_numbers) + self._ExecuteAndCompareClose(c, expected=np.matmul(lhs, rhs)) + + def testDotGeneralWithDotDimensionNumbersProto(self): + c = self._NewComputation() + rng = np.random.RandomState(0) + lhs = NumpyArrayF32(rng.randn(10, 3, 4)) + rhs = NumpyArrayF32(rng.randn(10, 4, 5)) + + dimension_numbers = xla_client.xla_data_pb2.DotDimensionNumbers() + dimension_numbers.lhs_contracting_dimensions.append(2) + dimension_numbers.rhs_contracting_dimensions.append(1) + dimension_numbers.lhs_batch_dimensions.append(0) + dimension_numbers.rhs_batch_dimensions.append(0) + + c.DotGeneral(c.Constant(lhs), c.Constant(rhs), dimension_numbers) + self._ExecuteAndCompareClose(c, expected=np.matmul(lhs, rhs)) + def testConvF32Same(self): c = self._NewComputation() a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32") @@ -496,6 +520,12 @@ class SingleOpTest(LocalComputationTest): c.Exp(c.Constant(arr)) self._ExecuteAndCompareClose(c, expected=np.exp(arr)) + def testRound(self): + c = self._NewComputation() + arr = NumpyArrayF32([3.3, 12.1]) + c.Round(c.Constant(arr)) + self._ExecuteAndCompareClose(c, expected=np.round(arr)) + def testLog(self): c = self._NewComputation() arr = NumpyArrayF32([3.3, 12.1]) @@ -699,6 +729,23 @@ class SingleOpTest(LocalComputationTest): self._ExecuteAndCompareExact( c, expected=[[[6, 5], [8, 7]], [[2, 1], [4, 3]]]) + def testClampF32(self): + c = self._NewComputation() + c.Clamp( + c.Constant(NumpyArrayF32(-1)), + c.Constant(NumpyArrayF32([-2, -1, 0, 1, 2, 3])), + c.Constant(NumpyArrayF32(2))) + self._ExecuteAndCompareExact(c, expected=[-1, -1, 0, 1, 2, 2]) + + # TODO(b/72689392): re-enable when bug S32 resolved + def DISABLED_testClampS32(self): + c = self._NewComputation() + c.Clamp( + c.Constant(NumpyArrayS32(-1)), + c.Constant(NumpyArrayS32([-2, -1, 0, 1, 2, 3])), + c.Constant(NumpyArrayS32(2))) + self._ExecuteAndCompareExact(c, expected=[-1, 0, 1, 2, 2]) + def testSelect(self): c = self._NewComputation() c.Select( diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 469acc330cb..0f2d0a9e96e 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -509,6 +509,7 @@ cc_library( 
"//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:executable_build_options", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", ], @@ -1110,8 +1111,6 @@ cc_library( ":hlo", ":hlo_evaluator", ":hlo_pass", - ":tuple_util", - ":while_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/core:lib", ], @@ -1156,6 +1155,34 @@ tf_cc_test( ], ) +cc_library( + name = "implicit_broadcast_remover", + srcs = ["implicit_broadcast_remover.cc"], + hdrs = ["implicit_broadcast_remover.h"], + deps = [ + ":hlo", + ":hlo_dce", + ":hlo_pass", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/core:lib", + ], +) + +tf_cc_test( + name = "implicit_broadcast_remover_test", + srcs = ["implicit_broadcast_remover_test.cc"], + deps = [ + ":hlo_matchers", + ":implicit_broadcast_remover", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla/tests:hlo_verified_test_base", + ], +) + cc_library( name = "dot_decomposer", srcs = ["dot_decomposer.cc"], @@ -1825,7 +1852,9 @@ tf_cc_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:test_utils", + "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:lib", + "//tensorflow/core:test", ], ) @@ -1856,6 +1885,7 @@ cc_library( ":hlo", ":hlo_graph_dumper", ":hlo_pass", + ":hlo_proto_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index ba82e822b21..fb857559f97 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -1618,9 +1618,12 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) { reduce, HloInstruction::CreateBroadcast(reduce->shape(), init_value, {})); } + // A Transpose feeding a reduce can simply permute the reduction dimensions - // field. - if (arg->opcode() == HloOpcode::kTranspose) { + // field if the output of the reduce is a vector or scalar. Higher ranked + // result may require a transpose of the output. 
+ if (ShapeUtil::Rank(reduce->shape()) <= 1 && + arg->opcode() == HloOpcode::kTranspose) { auto transpose_dimensions = arg->dimensions(); std::vector new_reduce_dimensions; for (auto dim : dimensions) { diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc index d5594dc07c8..774b11478c6 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment.cc @@ -997,14 +997,15 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( auto color = single_colored_set.first; VLOG(2) << "Simulating heap for color " << color; int64 alignment = assignment->color_alignment_(color); + HeapSimulator::Options options; + options.buffers_to_assign = &single_colored_set.second; TF_ASSIGN_OR_RETURN( const HeapSimulator::Result result, HeapSimulator::Run(MakeUnique( MakeUnique(alignment)), assignment->module(), module_sequence, assignment->points_to_analysis(), - assignment->buffer_size_, - &single_colored_set.second)); + assignment->buffer_size_, options)); AssignBuffersFromHeapSimulator(result, assignment, single_colored_set.first); } @@ -1024,14 +1025,15 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( auto color = single_colored_set.first; VLOG(2) << "Simulating heap for color " << color; int64 alignment = assignment->color_alignment_(color); + HeapSimulator::Options options; + options.buffers_to_assign = &single_colored_set.second; TF_ASSIGN_OR_RETURN( const HeapSimulator::Result result, HeapSimulator::Run(MakeUnique( MakeUnique(alignment)), *computation, *instruction_sequence, assignment->points_to_analysis(), - assignment->buffer_size_, - &single_colored_set.second)); + assignment->buffer_size_, options)); AssignBuffersFromHeapSimulator(result, assignment, single_colored_set.first); } diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h index fc67330f5cb..74fd24edf88 100644 --- a/tensorflow/compiler/xla/service/compiler.h +++ b/tensorflow/compiler/xla/service/compiler.h @@ -72,8 +72,18 @@ class AotCompilationOptions { // Returns the ID of the platform to which these options apply. virtual perftools::gputools::Platform::Id PlatformId() const = 0; + // Optional allocator that may be used for allocating temp space on the device + // during compilation. + DeviceMemoryAllocator* device_allocator() const { return device_allocator_; } + void set_device_allocator(DeviceMemoryAllocator* device_allocator) { + device_allocator_ = device_allocator; + } + protected: AotCompilationOptions() = default; + + private: + DeviceMemoryAllocator* device_allocator_ = nullptr; }; // Abstract compiler interface that is subclassed for compilation on a @@ -99,9 +109,16 @@ class Compiler { // Runs Hlo passes to optimize the given Hlo module, returns the optimized // module. + // + // If device_allocator is not null, the compiler may use it to allocate temp + // space on the device for use during compilation. For example, the compiler + // may allocate buffers on the device and then run variants of a given + // algorithm over those buffers, to see which variant is fastest. Any space + // allocated should be deallocated before this function returns. 
virtual StatusOr> RunHloPasses( std::unique_ptr module, - perftools::gputools::StreamExecutor* executor) = 0; + perftools::gputools::StreamExecutor* executor, + DeviceMemoryAllocator* device_allocator) = 0; // Compiles the HLO module for execution on a device given by the executor, // and returns an executable object or an error status. No HLO passes are @@ -112,21 +129,27 @@ class Compiler { // The compiler may optionally specialize to the individual device // (not just type of device) indicated by the executor. // + // device_allocator is optional; see RunHloPasses. + // // Use the overload below to compile computations that run in parallel. virtual StatusOr> RunBackend( std::unique_ptr module, - perftools::gputools::StreamExecutor* executor) = 0; + perftools::gputools::StreamExecutor* executor, + DeviceMemoryAllocator* device_allocator) = 0; // Compiles a set of HLO modules that can run in parallel, potentially // communicating data between the modules, and returns a corresponding // sequence of executable objects. // + // device_allocator is optional; see RunHloPasses. + // // TODO(b/68666782): Remove this method after adding support for multiple // modules to RunHloPasses and RunBackends. virtual StatusOr>> Compile( std::vector> modules, std::vector> - stream_exec) = 0; + stream_exec, + DeviceMemoryAllocator* device_allocator) = 0; // Compiles the HLO module for ahead-of-time execution. This is intended for // use in static compilation. diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index 33af77e1a81..d13a97bcc9a 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -437,7 +437,8 @@ Status VerifyLlvmModule(const llvm::Module& llvm_module) { StatusOr> CpuCompiler::RunHloPasses( std::unique_ptr module, - perftools::gputools::StreamExecutor* /*stream_exec*/) { + perftools::gputools::StreamExecutor* /*stream_exec*/, + DeviceMemoryAllocator* /*device_allocator*/) { VLOG(2) << "Before optimization:"; XLA_VLOG_LINES(2, module->ToString()); @@ -450,7 +451,8 @@ StatusOr> CpuCompiler::RunHloPasses( StatusOr> CpuCompiler::RunBackend( std::unique_ptr module, - perftools::gputools::StreamExecutor* stream_exec) { + perftools::gputools::StreamExecutor* stream_exec, + DeviceMemoryAllocator* /*device_allocator*/) { const string timer_message = "Compiling [" + module->name() + "] for CPU using JIT"; XLA_SCOPED_LOGGING_TIMER(timer_message); @@ -517,8 +519,8 @@ StatusOr> CpuCompiler::RunBackend( // ownership is std::moved. const bool embed_ir_in_executable = module->config().debug_options().xla_embed_ir_in_executable(); - const string xla_dump_hlo_proto_to = - module->config().debug_options().xla_dump_hlo_proto_to(); + const string xla_dump_optimized_hlo_proto_to = + module->config().debug_options().xla_dump_optimized_hlo_proto_to(); if (options::CpuParallelBackendRequested(module->config())) { VLOG(1) << "Using parallel cpu backend"; @@ -538,10 +540,10 @@ StatusOr> CpuCompiler::RunBackend( // print one ourselves. 
XLA_VLOG_LINES(2, assignment->ToString()); - if (!xla_dump_hlo_proto_to.empty()) { + if (!xla_dump_optimized_hlo_proto_to.empty()) { HloProto proto = MakeHloProto(*module, *assignment); TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory( - proto, xla_dump_hlo_proto_to, module->name())); + proto, xla_dump_optimized_hlo_proto_to, module->name())); } // If we are using the parallel CPU backend, we need to create map from @@ -647,10 +649,10 @@ StatusOr> CpuCompiler::RunBackend( // print one ourselves. XLA_VLOG_LINES(2, assignment->ToString()); - if (!xla_dump_hlo_proto_to.empty()) { + if (!xla_dump_optimized_hlo_proto_to.empty()) { HloProto proto = MakeHloProto(*module, *assignment); TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory( - proto, xla_dump_hlo_proto_to, module->name())); + proto, xla_dump_optimized_hlo_proto_to, module->name())); } // Each computation is a single function. Emit all embedded computations @@ -826,12 +828,12 @@ CpuCompiler::CompileAheadOfTime(std::vector> modules, // print one ourselves. XLA_VLOG_LINES(2, assignment->ToString()); - const string xla_dump_hlo_proto_to = - module->config().debug_options().xla_dump_hlo_proto_to(); - if (!xla_dump_hlo_proto_to.empty()) { + const string xla_dump_optimized_hlo_proto_to = + module->config().debug_options().xla_dump_optimized_hlo_proto_to(); + if (!xla_dump_optimized_hlo_proto_to.empty()) { HloProto proto = MakeHloProto(*module, *assignment); TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory( - proto, xla_dump_hlo_proto_to, module->name())); + proto, xla_dump_optimized_hlo_proto_to, module->name())); } IrEmitter ir_emitter(*module, *assignment, &llvm_module, diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h index ebed7058d8f..3498139ab95 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h @@ -118,11 +118,13 @@ class CpuCompiler : public LLVMCompiler { StatusOr> RunHloPasses( std::unique_ptr module, - perftools::gputools::StreamExecutor* stream_exec) override; + perftools::gputools::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) override; StatusOr> RunBackend( std::unique_ptr module, - perftools::gputools::StreamExecutor* stream_exec) override; + perftools::gputools::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) override; StatusOr>> CompileAheadOfTime(std::vector> modules, diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 71e81331897..0b2d3d47463 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -479,7 +479,7 @@ Status IrEmitter::HandleOutfeed(HloInstruction* outfeed) { Status IrEmitter::HandleSort(HloInstruction* sort) { // TODO(b/26783907): Implement sort on CPU. - return Unimplemented("Sort is not supported on CPU (b/26783907)."); + return Unimplemented("Sort is not implemented on CPU."); } Status IrEmitter::HandleTuple(HloInstruction* tuple) { @@ -522,7 +522,7 @@ Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) { // TODO(b/31410564): Implement dilation for reduce-window. if (window_util::HasDilation(window)) { return Unimplemented( - "Dilation for reduce-window not implemented on CPU. See b/31410564."); + "Dilation for ReduceWindow is not implemented on CPU."); } // The called computation should have been emitted previously. 
@@ -625,8 +625,7 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) { // TODO(b/31410564): Implement dilation for select-and-scatter. if (window_util::HasDilation(window)) { return Unimplemented( - "Dilation for select-and-scatter not implemented on CPU. " - "See b/31410564."); + "Dilation for SelectAndScatter is not implemented on CPU. "); } // The select and scatter computations should have been emitted previously. @@ -1196,8 +1195,7 @@ Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) { } // TODO(b/33011107): Support cross replica sum on CPU. - return Unimplemented( - "Cross replica sum is not implemented on CPU. See b/33011107."); + return Unimplemented("CrossReplicaSum is not implemented on CPU."); } // Fills up the free variables in 'index_with_free_var' with values from @@ -1811,12 +1809,12 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce) { Status IrEmitter::HandleSend(HloInstruction* send) { // TODO(b/33942983): Support Send/Recv on CPU. - return Unimplemented("Send is not implemented on CPU. See b/33942983."); + return Unimplemented("Send is not implemented on CPU."); } Status IrEmitter::HandleSendDone(HloInstruction* send_done) { // TODO(b/33942983): Support Send/Recv on CPU. - return Unimplemented("Send-done is not implemented on CPU. See b/33942983."); + return Unimplemented("Send-done is not implemented on CPU."); } Status IrEmitter::HandleSlice(HloInstruction* slice) { @@ -1981,12 +1979,12 @@ Status IrEmitter::HandleDynamicUpdateSlice( Status IrEmitter::HandleRecv(HloInstruction* recv) { // TODO(b/33942983): Support Send/Recv on CPU. - return Unimplemented("Recv is not implemented on CPU. See b/33942983."); + return Unimplemented("Recv is not implemented on CPU."); } Status IrEmitter::HandleRecvDone(HloInstruction* recv_done) { // TODO(b/33942983): Support Send/Recv on CPU. - return Unimplemented("Recv-done is not implemented on CPU. See b/33942983."); + return Unimplemented("Recv-done is not implemented on CPU."); } Status IrEmitter::HandlePad(HloInstruction* pad) { @@ -1995,10 +1993,10 @@ Status IrEmitter::HandlePad(HloInstruction* pad) { for (auto& padding_dimension : pad->padding_config().dimensions()) { if (padding_dimension.edge_padding_low() < 0 || padding_dimension.edge_padding_high() < 0) { - return Unimplemented( - "Negative padding not supported in the CPU backend (b/34628603); " - "this should have been eliminated at the HLO level: %s", - pad->padding_config().ShortDebugString().c_str()); + return InternalErrorStrCat( + "Encountered negative padding in IrEmitter on CPU. " + "This should have been eliminated at the HLO level. ", + pad->ToString()); } } diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.cc b/tensorflow/compiler/xla/service/device_memory_allocator.cc index 2e4b0a52305..78e7aa48acc 100644 --- a/tensorflow/compiler/xla/service/device_memory_allocator.cc +++ b/tensorflow/compiler/xla/service/device_memory_allocator.cc @@ -24,7 +24,7 @@ limitations under the License. 
namespace xla { StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator( - perftools::gputools::Platform* platform, + const perftools::gputools::Platform* platform, tensorflow::gtl::ArraySlice stream_executors) : DeviceMemoryAllocator(platform), diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.h b/tensorflow/compiler/xla/service/device_memory_allocator.h index 00caefab667..39dfad84c1c 100644 --- a/tensorflow/compiler/xla/service/device_memory_allocator.h +++ b/tensorflow/compiler/xla/service/device_memory_allocator.h @@ -33,7 +33,7 @@ class DeviceMemoryAllocator { public: // Parameter platform indicates which platform the allocator allocates memory // on. Must be non-null. - explicit DeviceMemoryAllocator(perftools::gputools::Platform* platform) + explicit DeviceMemoryAllocator(const perftools::gputools::Platform* platform) : platform_(platform) {} virtual ~DeviceMemoryAllocator() {} @@ -49,14 +49,14 @@ class DeviceMemoryAllocator { int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) = 0; // Return the platform that the allocator allocates memory on. - perftools::gputools::Platform* platform() const { return platform_; } + const perftools::gputools::Platform* platform() const { return platform_; } // Can we call Deallocate() as soon as a computation has been scheduled on // a stream, or do we have to wait for the computation to complete first? virtual bool AllowsAsynchronousDeallocation() const = 0; protected: - perftools::gputools::Platform* platform_; + const perftools::gputools::Platform* platform_; }; // Default memory allocator for a platform which uses @@ -64,7 +64,7 @@ class DeviceMemoryAllocator { class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator { public: StreamExecutorMemoryAllocator( - perftools::gputools::Platform* platform, + const perftools::gputools::Platform* platform, tensorflow::gtl::ArraySlice stream_executors); diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index 9780bac16ec..4468adbadbf 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -428,7 +428,7 @@ StatusOr ElementalIrEmitter::EmitFloatUnaryOp( llvm::Intrinsic::round, {operand_value}, {operand_value->getType()}, ir_builder_); case HloOpcode::kSign: { - // TODO(b/32151903): Ensure consistent sign behavior for -0.0 + // TODO(b/32151903): Ensure consistent sign behavior for -0.0. auto type = operand_value->getType(); auto zero = llvm::ConstantFP::get(type, 0.0); auto oeq = ir_builder_->CreateFCmpOEQ(operand_value, zero); @@ -870,7 +870,10 @@ llvm::Value* ElementalIrEmitter::EmitFloatMin(llvm::Value* lhs_value, StatusOr ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type, llvm::Value* x) const { if (prim_type != F32) { - return Unimplemented("inverse erf only implemented for F32 (b/34339814)"); + // TODO(b/34339814): Implement inverse erf for F64. + return Unimplemented( + "Inverse erf is only implemented for element " + "type F32."); } auto getFloat = [&](const float f) { return llvm::ConstantFP::get(ir_builder_->getFloatTy(), f); @@ -1040,17 +1043,9 @@ StatusOr ElementalIrEmitter::EmitIntegerBinaryOp( is_signed ? llvm::CmpInst::ICMP_SGE : llvm::CmpInst::ICMP_UGE, lhs_value, rhs_value, ir_builder_); case HloOpcode::kMinimum: - return ir_builder_->CreateSelect( - ir_builder_->CreateICmp( - is_signed ? 
llvm::ICmpInst::ICMP_SLE : llvm::ICmpInst::ICMP_ULE, - lhs_value, rhs_value), - lhs_value, rhs_value); + return EmitIntegralMin(lhs_value, rhs_value, is_signed); case HloOpcode::kMaximum: - return ir_builder_->CreateSelect( - ir_builder_->CreateICmp( - is_signed ? llvm::ICmpInst::ICMP_SGE : llvm::ICmpInst::ICMP_UGE, - lhs_value, rhs_value), - lhs_value, rhs_value); + return EmitIntegralMax(lhs_value, rhs_value, is_signed); case HloOpcode::kAnd: return ir_builder_->CreateAnd(lhs_value, rhs_value); case HloOpcode::kOr: @@ -1067,6 +1062,26 @@ StatusOr ElementalIrEmitter::EmitIntegerBinaryOp( } } +llvm::Value* ElementalIrEmitter::EmitIntegralMax(llvm::Value* lhs_value, + llvm::Value* rhs_value, + bool is_signed) const { + return ir_builder_->CreateSelect( + ir_builder_->CreateICmp( + is_signed ? llvm::ICmpInst::ICMP_SGE : llvm::ICmpInst::ICMP_UGE, + lhs_value, rhs_value), + lhs_value, rhs_value); +} + +llvm::Value* ElementalIrEmitter::EmitIntegralMin(llvm::Value* lhs_value, + llvm::Value* rhs_value, + bool is_signed) const { + return ir_builder_->CreateSelect( + ir_builder_->CreateICmp( + is_signed ? llvm::ICmpInst::ICMP_SLE : llvm::ICmpInst::ICMP_ULE, + lhs_value, rhs_value), + lhs_value, rhs_value); +} + llvm_ir::IrArray::Index ElementalIrEmitter::ElementwiseSourceIndex( const llvm_ir::IrArray::Index& target_index, const HloInstruction& hlo, int64 operand_no) const { @@ -1363,7 +1378,18 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( TF_ASSIGN_OR_RETURN(llvm::Value * max_value, operand_to_generator.at(hlo->operand(2))( ElementwiseSourceIndex(index, *hlo, 2))); - return EmitFloatMin(max_value, EmitFloatMax(min_value, arg_value)); + PrimitiveType prim_type = hlo->shape().element_type(); + if (primitive_util::IsFloatingPointType(prim_type)) { + return EmitFloatMin(max_value, EmitFloatMax(min_value, arg_value)); + } else if (primitive_util::IsIntegralType(prim_type)) { + bool is_signed = primitive_util::IsSignedIntegralType(prim_type); + return EmitIntegralMin( + max_value, EmitIntegralMax(min_value, arg_value, is_signed), + is_signed); + } else { + return Unimplemented("Clamp unimplemented for %s", + PrimitiveType_Name(prim_type).c_str()); + } }; case HloOpcode::kReducePrecision: return [this, hlo, &operand_to_generator]( diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h index 1a48eb5fcb9..c516a826d9e 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h @@ -86,6 +86,12 @@ class ElementalIrEmitter { virtual llvm::Value* EmitFloatMin(llvm::Value* lhs_value, llvm::Value* rhs_value) const; + llvm::Value* EmitIntegralMax(llvm::Value* lhs_value, llvm::Value* rhs_value, + bool is_signed) const; + + llvm::Value* EmitIntegralMin(llvm::Value* lhs_value, llvm::Value* rhs_value, + bool is_signed) const; + virtual StatusOr EmitErfInv(PrimitiveType prim_type, llvm::Value* value) const; diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 3c3328b9cd2..7df01f7edd1 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -131,6 +131,7 @@ cc_library( "ir_emitter_context.h", ], deps = [ + ":cudnn_convolution_runner", ":elemental_ir_emitter", ":gpu_constants", ":gpu_executable", @@ -262,6 +263,7 @@ cc_library( ], deps = [ ":buffer_allocations", + ":cudnn_convolution_runner", ":infeed_manager", ":ir_emission_utils", ":partition_assignment", @@ 
-309,9 +311,41 @@ cc_library( ) cc_library( - name = "convolution_folding", - srcs = ["convolution_folding.cc"], - hdrs = ["convolution_folding.h"], + name = "cudnn_convolution_algorithm_picker", + srcs = ["cudnn_convolution_algorithm_picker.cc"], + hdrs = ["cudnn_convolution_algorithm_picker.h"], + deps = [ + ":cudnn_convolution_runner", + ":gpu_executable", + ":ir_emission_utils", + "//tensorflow/compiler/xla/service:device_memory_allocator", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_pass", + "//tensorflow/core:lib", + "//tensorflow/core:stream_executor_no_cuda", + ], +) + +cc_library( + name = "cudnn_convolution_runner", + srcs = ["cudnn_convolution_runner.cc"], + hdrs = ["cudnn_convolution_runner.h"], + deps = [ + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:stream_executor_no_cuda", + ], +) + +cc_library( + name = "cudnn_convolution_rewriter", + srcs = ["cudnn_convolution_rewriter.cc"], + hdrs = ["cudnn_convolution_rewriter.h"], deps = [ ":ir_emission_utils", "//tensorflow/compiler/xla:literal_util", @@ -325,15 +359,18 @@ cc_library( ) tf_cc_test( - name = "convolution_folding_test", - srcs = ["convolution_folding_test.cc"], + name = "cudnn_convolution_rewriter_test", + srcs = ["cudnn_convolution_rewriter_test.cc"], deps = [ - ":convolution_folding", + ":cudnn_convolution_rewriter", + ":ir_emission_utils", + "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_matchers", "//tensorflow/compiler/xla/service:shape_inference", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep "//tensorflow/core:test", ], ) @@ -446,7 +483,8 @@ cc_library( srcs = ["gpu_compiler.cc"], hdrs = ["gpu_compiler.h"], deps = [ - ":convolution_folding", + ":cudnn_convolution_algorithm_picker", + ":cudnn_convolution_rewriter", ":fusion_merger", ":gpu_constants", ":gpu_copy_insertion", @@ -514,7 +552,6 @@ cc_library( "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_pass", - "@llvm//:core", ], ) diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc index 899cc5c83b9..f76f15929d1 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include +#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/strings/strcat.h" @@ -36,366 +37,69 @@ using se::dnn::DataLayout; using se::dnn::FilterDescriptor; using se::dnn::FilterLayout; -ConvolveScratchAllocator::ConvolveScratchAllocator( - int device_ordinal, DeviceMemoryAllocator* memory_allocator) - : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {} - -ConvolveScratchAllocator::~ConvolveScratchAllocator() { - for (auto& allocated_buffer : allocated_buffers_) { - if (!memory_allocator_->Deallocate(device_ordinal_, &allocated_buffer) - .ok()) { - // The program can still continue with failed deallocation. - LOG(ERROR) << "Failed to deallocate the allocated buffer: " - << allocated_buffer.opaque(); - } - } -} - -int64 ConvolveScratchAllocator::GetMemoryLimitInBytes(se::Stream* stream) { - constexpr int64 kConvolveScratchSize = 1LL << 32; // 4GB by default. - return kConvolveScratchSize; -} - -se::port::StatusOr> -ConvolveScratchAllocator::AllocateBytes(se::Stream* stream, int64 byte_size) { - CHECK_GE(byte_size, 0) << "byte_size must be positive."; - if (byte_size > GetMemoryLimitInBytes(stream)) { - return se::port::Status( - se::port::error::RESOURCE_EXHAUSTED, - tensorflow::strings::Printf( - "Allocating %lld bytes exceeds the memory limit of %lld bytes.", - byte_size, GetMemoryLimitInBytes(stream))); - } - - auto status_or_memory = - memory_allocator_->Allocate(device_ordinal_, byte_size, - /*retry_on_failure=*/false); - if (!status_or_memory.ok()) { - return se::port::Status(se::port::error::RESOURCE_EXHAUSTED, - tensorflow::strings::Printf( - "Failed to allocate %lld bytes on device %d.", - byte_size, device_ordinal_)); - } - se::DeviceMemoryBase allocated_buffer = status_or_memory.ValueOrDie(); - allocated_buffers_.push_back(allocated_buffer); - total_allocated_bytes_ += byte_size; - return se::DeviceMemory(allocated_buffer); -} - -string ConvolutionKindToString( - ConvolutionThunk::ConvolutionKind convolution_kind) { - switch (convolution_kind) { - case ConvolutionThunk::ConvolutionKind::kForward: - return "forward"; - case ConvolutionThunk::ConvolutionKind::kBackwardFilter: - return "backward_filter"; - case ConvolutionThunk::ConvolutionKind::kBackwardInput: - return "backward_input"; - } - return "unknown convolution kind"; -} - ConvolutionThunk::ConvolutionThunk( - ConvolutionKind convolution_kind, - const BufferAllocation::Slice& input_buffer, + CudnnConvKind convolution_kind, const BufferAllocation::Slice& input_buffer, const BufferAllocation::Slice& filter_buffer, - const BufferAllocation::Slice& output_buffer, const Shape& input_shape, + const BufferAllocation::Slice& output_buffer, + const BufferAllocation::Slice& tuple_result_buffer, + const BufferAllocation::Slice& scratch_buffer, const Shape& input_shape, const Shape& filter_shape, const Shape& output_shape, const Window& window, - const ConvolutionDimensionNumbers& dim_nums, const HloInstruction* hlo) + const ConvolutionDimensionNumbers& dim_nums, int64 algorithm, + const HloInstruction* hlo) : Thunk(Kind::kConvolution, hlo), convolution_kind_(convolution_kind), input_buffer_(input_buffer), filter_buffer_(filter_buffer), output_buffer_(output_buffer), + tuple_result_buffer_(tuple_result_buffer), + scratch_buffer_(scratch_buffer), input_shape_(input_shape), filter_shape_(filter_shape), output_shape_(output_shape), window_(window), - dim_nums_(dim_nums) {} + 
dim_nums_(dim_nums), + algorithm_(algorithm) {} -tensorflow::Status ConvolutionThunk::ExecuteOnStream( +Status ConvolutionThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream) { - VLOG(3) << "Convolution kind: " << ConvolutionKindToString(convolution_kind_); - VLOG(3) << "input shape: { " << input_shape_.ShortDebugString() << " }"; - VLOG(3) << "filter shape: { " << filter_shape_.ShortDebugString() << " }"; - VLOG(3) << "Output shape: { " << output_shape_.ShortDebugString() << " }"; - VLOG(3) << "Dim nums: { " << dim_nums_.ShortDebugString() << " }"; - VLOG(3) << "Window: { " << window_.ShortDebugString() << " }"; - - const int num_dimensions = window_.dimensions_size(); - CHECK_LE(num_dimensions, 3); - // cuDNN does not support 1D convolutions. We therefore express 1D - // convolutions as 2D convolutions where the first spatial dimension is 1. - // This matches the behavior of TF (see definition of conv1d in - // tensorflow/python/ops/nn_ops.py). - const int effective_num_dimensions = std::max(2, num_dimensions); - - CHECK_EQ(F32, output_shape_.element_type()); - CHECK_EQ(num_dimensions, dim_nums_.input_spatial_dimensions_size()); - CHECK_EQ(num_dimensions, dim_nums_.kernel_spatial_dimensions_size()); - CHECK_EQ(num_dimensions, dim_nums_.output_spatial_dimensions_size()); - for (const WindowDimension& dim : window_.dimensions()) { - CHECK_EQ(dim.padding_low(), dim.padding_high()); - } - - // cuDNN's convolution APIs support the BDYX layout for activations/output and - // the OIYX layout for weights. - BatchDescriptor input_descriptor(effective_num_dimensions); - input_descriptor.set_layout(DataLayout::kBatchDepthYX) - .set_feature_map_count( - input_shape_.dimensions(dim_nums_.input_feature_dimension())) - .set_count(input_shape_.dimensions(dim_nums_.input_batch_dimension())); - for (int dim = 0; dim < num_dimensions; ++dim) { - // Note that the dimensions are reversed. The same holds below. 
- input_descriptor.set_spatial_dim( - static_cast(effective_num_dimensions - dim - 1), - input_shape_.dimensions(dim_nums_.input_spatial_dimensions(dim))); - } - - FilterDescriptor filter_descriptor(effective_num_dimensions); - filter_descriptor.set_layout(FilterLayout::kOutputInputYX) - .set_input_feature_map_count( - filter_shape_.dimensions(dim_nums_.kernel_input_feature_dimension())) - .set_output_feature_map_count(filter_shape_.dimensions( - dim_nums_.kernel_output_feature_dimension())); - for (int dim = 0; dim < num_dimensions; ++dim) { - filter_descriptor.set_spatial_dim( - static_cast(effective_num_dimensions - dim - 1), - filter_shape_.dimensions(dim_nums_.kernel_spatial_dimensions(dim))); - } - - ConvolutionDescriptor convolution_descriptor(effective_num_dimensions); - for (int dim = 0; dim < num_dimensions; ++dim) { - convolution_descriptor - .set_zero_padding( - static_cast(effective_num_dimensions - dim - 1), - window_.dimensions(dim).padding_low()) - .set_filter_stride( - static_cast(effective_num_dimensions - dim - 1), - window_.dimensions(dim).stride()); - } - - BatchDescriptor output_descriptor(effective_num_dimensions); - output_descriptor.set_layout(DataLayout::kBatchDepthYX) - .set_feature_map_count( - output_shape_.dimensions(dim_nums_.output_feature_dimension())) - .set_count(output_shape_.dimensions(dim_nums_.output_batch_dimension())); - for (int dim = 0; dim < num_dimensions; ++dim) { - output_descriptor.set_spatial_dim( - static_cast(effective_num_dimensions - dim - 1), - output_shape_.dimensions(dim_nums_.output_spatial_dimensions(dim))); - } - - // Add a singleton dimension in the 1D convolution case. - if (num_dimensions == 1) { - input_descriptor.set_spatial_dim(static_cast(0), 1); - output_descriptor.set_spatial_dim(static_cast(0), 1); - filter_descriptor.set_spatial_dim(static_cast(0), 1); - convolution_descriptor - .set_zero_padding(static_cast(0), 0) - .set_filter_stride(static_cast(0), 1); - } - se::DeviceMemory input_data( buffer_allocations.GetDeviceAddress(input_buffer_)); se::DeviceMemory filter_data( buffer_allocations.GetDeviceAddress(filter_buffer_)); se::DeviceMemory output_data( buffer_allocations.GetDeviceAddress(output_buffer_)); - return ConvolveWithTune(input_descriptor, input_data, filter_descriptor, - filter_data, output_descriptor, output_data, - convolution_descriptor, buffer_allocations, stream); -} + se::DeviceMemoryBase scratch = + buffer_allocations.GetDeviceAddress(scratch_buffer_); -tensorflow::Status ConvolutionThunk::Convolve( - const BatchDescriptor& input_descriptor, se::DeviceMemory input_data, - const FilterDescriptor& filter_descriptor, - se::DeviceMemory filter_data, - const BatchDescriptor& output_descriptor, - se::DeviceMemory output_data, - const ConvolutionDescriptor& convolution_descriptor, - const se::dnn::AlgorithmConfig& algorithm_config, se::Stream* stream, - ConvolveScratchAllocator* scratch_allocator, - se::dnn::ProfileResult* profile_result) { - bool launch_ok; - switch (convolution_kind_) { - case ConvolutionKind::kBackwardFilter: - launch_ok = - stream - ->ThenConvolveBackwardFilterWithAlgorithm( - input_descriptor, input_data, output_descriptor, output_data, - convolution_descriptor, filter_descriptor, &filter_data, - scratch_allocator, algorithm_config, profile_result) - .ok(); - break; - case ConvolutionKind::kBackwardInput: - launch_ok = stream - ->ThenConvolveBackwardDataWithAlgorithm( - filter_descriptor, filter_data, output_descriptor, - output_data, convolution_descriptor, input_descriptor, - 
&input_data, scratch_allocator, algorithm_config, - profile_result) - .ok(); - break; - case ConvolutionKind::kForward: - launch_ok = - stream - ->ThenConvolveWithAlgorithm( - input_descriptor, input_data, filter_descriptor, filter_data, - convolution_descriptor, output_descriptor, &output_data, - scratch_allocator, algorithm_config, profile_result) - .ok(); - break; - } - if (launch_ok) { - return tensorflow::Status::OK(); - } - return InternalError( - "Unable to launch convolution for thunk %p with type %s and algorithm " - "(%lld, %lld)", - this, ConvolutionKindToString(convolution_kind_).c_str(), - algorithm_config.algorithm().algo_id(), - algorithm_config.algorithm_no_scratch().algo_id()); -} + se::dnn::AlgorithmConfig algorithm_config( + se::dnn::AlgorithmDesc(algorithm_, /*use_tensor_ops=*/false)); -std::vector ConvolutionThunk::GetAlgorithms( - bool with_winograd_nonfused, se::StreamExecutor* stream_exec) const { - std::vector algorithms; - switch (convolution_kind_) { - case ConvolutionKind::kBackwardFilter: - CHECK(stream_exec->GetConvolveBackwardFilterAlgorithms( - with_winograd_nonfused, &algorithms)); - break; - case ConvolutionKind::kBackwardInput: - CHECK(stream_exec->GetConvolveBackwardDataAlgorithms( - with_winograd_nonfused, &algorithms)); - break; - case ConvolutionKind::kForward: - CHECK(stream_exec->GetConvolveAlgorithms(with_winograd_nonfused, - &algorithms)); - break; - } - return algorithms; -} + TF_RETURN_IF_ERROR(RunCudnnConvolution( + convolution_kind_, input_shape_, filter_shape_, output_shape_, input_data, + filter_data, output_data, scratch, window_, dim_nums_, algorithm_config, + stream)); -static string AlgorithmToString(const se::dnn::AlgorithmDesc& algo) { - if (algo.tensor_ops_enabled()) { - return tensorflow::strings::StrCat(algo.algo_id(), "+TC"); - } - return tensorflow::strings::StrCat(algo.algo_id()); -} - -// Determines whether we can safely perform a winograd non-fused convolution for -// the given input and output descriptors. This works around b/68264959, an -// integer overflow in cuDNNv5 and cuDNNv6. -static bool ShouldIncludeWinogradNonfusedAlgo( - const BatchDescriptor& input_descriptor, - const BatchDescriptor& output_descriptor) { - int64 batch = input_descriptor.count(); - int64 in_depths = input_descriptor.feature_map_count(); - int64 in_rows = input_descriptor.height(); - int64 in_cols = input_descriptor.width(); - int64 out_depths = output_descriptor.feature_map_count(); - - int64 total_size = 16 * std::ceil(batch / 16.0) * - std::max(in_depths, out_depths) * in_cols * in_rows * - sizeof(float); - int64 threshold = 1L << 31; - - return total_size < threshold; -} - -tensorflow::Status ConvolutionThunk::ConvolveWithTune( - const BatchDescriptor& input_descriptor, se::DeviceMemory input_data, - const FilterDescriptor& filter_descriptor, - se::DeviceMemory filter_data, - const BatchDescriptor& output_descriptor, - se::DeviceMemory output_data, - const ConvolutionDescriptor& convolution_descriptor, - const BufferAllocations& buffer_allocations, se::Stream* stream) { - // TODO(b/29126320): Try cudnn v5's new auto-tuner when it's rolled out. - if (!best_algorithm_.has_value()) { - best_algorithm_.emplace(); - - // Auto-tuning either is disabled or only happens in the first run of this - // function. 
- VLOG(2) << "Profiling for best convolution algorithm used for " - "ConvolutionThunk: " - << this; - - bool with_winograd_nonfused = - ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor); - - se::dnn::ProfileResult best_result; - se::dnn::ProfileResult best_result_without_scratch; - std::vector algorithms = - GetAlgorithms(with_winograd_nonfused, stream->parent()); - for (auto algorithm : algorithms) { - ConvolveScratchAllocator scratch_allocator( - buffer_allocations.device_ordinal(), - buffer_allocations.memory_allocator()); - se::dnn::ProfileResult profile_result; - VLOG(3) << "Trying algorithm " << AlgorithmToString(algorithm) - << " for ConvolutionThunk: " << this; - bool launch_ok = - Convolve(input_descriptor, input_data, filter_descriptor, filter_data, - output_descriptor, output_data, convolution_descriptor, - se::dnn::AlgorithmConfig(algorithm, algorithm), stream, - &scratch_allocator, &profile_result) - .ok(); - if (launch_ok && profile_result.is_valid()) { - VLOG(3) << "Run of algorithm " << AlgorithmToString(algorithm) - << " for ConvolutionThunk " << this << " succeeded, taking " - << profile_result.elapsed_time_in_ms() - << "ms. (Best result: " << best_result.elapsed_time_in_ms() - << "ms)"; - if (profile_result.elapsed_time_in_ms() < - best_result.elapsed_time_in_ms()) { - best_result = profile_result; - } - if (scratch_allocator.TotalAllocatedBytes() == 0 && - profile_result.elapsed_time_in_ms() < - best_result_without_scratch.elapsed_time_in_ms()) { - best_result_without_scratch = profile_result; - } - } else { - VLOG(3) << "Run of algorithm " << AlgorithmToString(algorithm) - << " for ConvolutionThunk " << this << " failed."; - } + // Figure out which of output/input/filter is the result produced by this op, + // and write the result tuple. + void* result_ptr = [&] { + switch (convolution_kind_) { + case CudnnConvKind::kForward: + return output_data.opaque(); + case CudnnConvKind::kBackwardInput: + return input_data.opaque(); + case CudnnConvKind::kBackwardFilter: + return filter_data.opaque(); } + }(); + void* ptrs[] = {result_ptr, scratch.opaque()}; + se::DeviceMemory tuple_addr( + buffer_allocations.GetDeviceAddress(tuple_result_buffer_)); + stream->ThenMemcpyH2D(ptrs, &tuple_addr); - if (best_result.is_valid()) { - best_algorithm_->set_algorithm(best_result.algorithm()); - } else { - LOG(ERROR) << "No convolution algorithm works with profiling. Fall back " - "to the default algorithm."; - best_algorithm_->set_algorithm(AlgorithmDesc()); - } - - if (best_result_without_scratch.is_valid()) { - best_algorithm_->set_algorithm_no_scratch( - best_result_without_scratch.algorithm()); - } else { - LOG(ERROR) << "No convolution algorithm without scratch works with " - "profiling. 
Fall back " - "to the default algorithm."; - best_algorithm_->set_algorithm_no_scratch(AlgorithmDesc()); - } - } - - { - VLOG(2) << "Using convolution algorithm (" - << AlgorithmToString(best_algorithm_->algorithm()) << ", " - << AlgorithmToString(best_algorithm_->algorithm_no_scratch()) - << ") for ConvolutionThunk: " << this; - ConvolveScratchAllocator scratch_allocator( - buffer_allocations.device_ordinal(), - buffer_allocations.memory_allocator()); - return Convolve(input_descriptor, input_data, filter_descriptor, - filter_data, output_descriptor, output_data, - convolution_descriptor, *best_algorithm_, stream, - &scratch_allocator, nullptr); + if (!stream->ok()) { + return InternalError("ConvolutionThunk::ExecuteOnStream failed."); } + return Status::OK(); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h index 46c94d0bf1e..ca9ef5277b3 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h" #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -30,106 +31,47 @@ limitations under the License. namespace xla { namespace gpu { -// A one-time scratch allocator for forward and backward convolution. The -// scratch buffers allocated are released on destruction. -// -// Not thread-safe. -class ConvolveScratchAllocator : public perftools::gputools::ScratchAllocator { - public: - ConvolveScratchAllocator(int device_ordinal, - DeviceMemoryAllocator* memory_allocator); - - ~ConvolveScratchAllocator() override; - - int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override; - - int64 TotalAllocatedBytes() { return total_allocated_bytes_; } - - perftools::gputools::port::StatusOr> - AllocateBytes(perftools::gputools::Stream* stream, int64 byte_size) override; - - private: - const int device_ordinal_; - DeviceMemoryAllocator* memory_allocator_; - std::vector allocated_buffers_; - int64 total_allocated_bytes_ = 0; -}; - // This class stores everything that StreamExecutor needs to launch a BNN // convolution. It is generated by IrEmitter. // // This is thread-compatible. class ConvolutionThunk : public Thunk { public: - // ConvolutionThunk performs one of the following types of convolution. - enum class ConvolutionKind { - kBackwardFilter, // Backward convolution for filter. - kBackwardInput, // Backward convolution for input. - kForward, // Forward convolution. - }; - - // Constructs a thunk for launching a DNN convolution. + // Constructs a thunk for launching a DNN convolution. When run, it will + // write a tuple (result, scratch_memory) into `tuple_result_buffer`. + // + // `algorithm` is a cudnn algorithm number. `algorithm == -1` indicates that + // we should use the default (i.e. baseline) cudnn algorithm. + // + // Note that "output" here doesn't refer to the output from running this + // thunk, but rather to the "output" of a hypothetical forward convolution + // that corresponds to this input+filter+output triple. 
That is, the result + // generated by this thunk is "output" for forward convs, "input" for + // backward-input convs, and "filter" for backward-filter convs. + // // Semantics of null hlo_instruction argument are as in Thunk. - ConvolutionThunk(ConvolutionKind convolution_kind, + ConvolutionThunk(CudnnConvKind convolution_kind, const BufferAllocation::Slice& input_buffer, const BufferAllocation::Slice& filter_buffer, const BufferAllocation::Slice& output_buffer, + const BufferAllocation::Slice& tuple_result_buffer, + const BufferAllocation::Slice& scratch_buffer, const Shape& input_shape, const Shape& filter_shape, const Shape& output_shape, const Window& window, - const ConvolutionDimensionNumbers& dnums, + const ConvolutionDimensionNumbers& dim_nums, int64 algorithm, const HloInstruction* hlo); ConvolutionThunk(const ConvolutionThunk&) = delete; ConvolutionThunk& operator=(const ConvolutionThunk&) = delete; - // Does the convolution for the thunk on "stream". Auto-tuning happens on the - // first run of this function. - tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; - - // Returns true if the next run of ExecuteOnStream will do autotuning. If so, - // we want the GPU to be quiescent during autotuning, so as not to introduce - // noise in our results. - bool ShouldHaltAllActivityBeforeRunning( - perftools::gputools::Stream*) override { - return !best_algorithm_.has_value(); - } - - // Return true if scratch memory is needed to execute the thunk, that is - // either the best algorithm hasn't been chosen or the best algorithm is not - // the same as the no-scratch algorithm. This is because that the execution - // of the thunk is asynchronous, and the scratch allocator goes out of - // scope before the thunk finishes execution. Returning true tells the stream - // executor to make future thunks wait for this thunk to avoid reusing the - // deallocated scratch memory until this thunk is done with it. - bool ShouldBlockFutureThunks() { - if (!best_algorithm_.has_value()) { - return true; - } - - const perftools::gputools::dnn::AlgorithmDesc& best_alg = - best_algorithm_->algorithm(); - const perftools::gputools::dnn::AlgorithmDesc& no_scratch_best_alg = - best_algorithm_->algorithm_no_scratch(); - return (!best_alg.is_default() || !no_scratch_best_alg.is_default() || - !(best_alg == no_scratch_best_alg)); - } + // Does the convolution for the thunk on "stream". 
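A small sketch (names are illustrative, not from the patch) of the convention spelled out in the comment above: each convolution kind reads two of the (input, filter, output) buffers of the corresponding forward convolution and produces the third, and that produced buffer is what the thunk writes into the result tuple alongside the scratch memory.

# Illustration only.
RESULT_BUFFER = {
    "kForward":        "output",  # reads input and filter
    "kBackwardInput":  "input",   # reads filter and output
    "kBackwardFilter": "filter",  # reads input and output
}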
+ Status ExecuteOnStream(const BufferAllocations& buffer_allocations, + perftools::gputools::Stream* stream) override; private: - tensorflow::Status ConvolveWithTune( - const perftools::gputools::dnn::BatchDescriptor& input_descriptor, - perftools::gputools::DeviceMemory input_data, - const perftools::gputools::dnn::FilterDescriptor& filter_descriptor, - perftools::gputools::DeviceMemory filter_data, - const perftools::gputools::dnn::BatchDescriptor& output_descriptor, - perftools::gputools::DeviceMemory output_data, - const perftools::gputools::dnn::ConvolutionDescriptor& - convolution_descriptor, - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream); + class ScratchAllocator; - tensorflow::Status Convolve( + Status Convolve( const perftools::gputools::dnn::BatchDescriptor& input_descriptor, perftools::gputools::DeviceMemory input_data, const perftools::gputools::dnn::FilterDescriptor& filter_descriptor, @@ -139,40 +81,26 @@ class ConvolutionThunk : public Thunk { const perftools::gputools::dnn::ConvolutionDescriptor& convolution_descriptor, const perftools::gputools::dnn::AlgorithmConfig& algorithm_config, - perftools::gputools::Stream* stream, - ConvolveScratchAllocator* scratch_allocator, + perftools::gputools::Stream* stream, ScratchAllocator* scratch_allocator, perftools::gputools::dnn::ProfileResult* profile_result); - // Returns the convolve algorithms that can be used for this ConvolutionThunk. - std::vector GetAlgorithms( - bool with_winograd_nonfused, - perftools::gputools::StreamExecutor* stream_exec) const; - - // Fastest cuDNN convolution algorithm for this thunk learned from - // auto-tuning. If auto-tuning is disabled or failed, best_algorithm_ is set - // to the default value, indicating cuDNN's convolution will choose the best - // algorithm from some heuristics based on its parameters. - tensorflow::gtl::optional - best_algorithm_; - - const ConvolutionKind convolution_kind_; + const CudnnConvKind convolution_kind_; const BufferAllocation::Slice input_buffer_; const BufferAllocation::Slice filter_buffer_; const BufferAllocation::Slice output_buffer_; + const BufferAllocation::Slice tuple_result_buffer_; + const BufferAllocation::Slice scratch_buffer_; const Shape input_shape_; const Shape filter_shape_; const Shape output_shape_; const Window window_; - const ConvolutionDimensionNumbers dim_nums_; + int64 algorithm_; }; -string ConvolutionKindToString( - ConvolutionThunk::ConvolutionKind convolution_kind); - } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc new file mode 100644 index 00000000000..621b2d510fa --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc @@ -0,0 +1,370 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h" +#include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/core/lib/gtl/optional.h" +#include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/lib/strings/strcat.h" + +namespace xla { +namespace gpu { +namespace { + +namespace se = perftools::gputools; + +using se::DeviceMemoryBase; +using se::dnn::AlgorithmConfig; +using se::dnn::AlgorithmDesc; +using tensorflow::gtl::nullopt; +using tensorflow::gtl::optional; + +class ScratchAllocator : public se::ScratchAllocator { + public: + ScratchAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator) + : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {} + + ~ScratchAllocator() override; + + int64 GetMemoryLimitInBytes(se::Stream* stream) override { + return 1LL << 32; // 4GB. TODO(jlebar): Tune this? + } + int64 TotalAllocatedBytes() { return total_allocated_bytes_; } + + se::port::StatusOr> AllocateBytes( + se::Stream* stream, int64 byte_size) override; + + private: + const int device_ordinal_; + DeviceMemoryAllocator* memory_allocator_; + std::vector allocated_buffers_; + int64 total_allocated_bytes_ = 0; +}; + +ScratchAllocator::~ScratchAllocator() { + for (auto& allocated_buffer : allocated_buffers_) { + if (!memory_allocator_->Deallocate(device_ordinal_, &allocated_buffer) + .ok()) { + // The program can still continue with failed deallocation. + LOG(ERROR) << "Failed to deallocate the allocated buffer: " + << allocated_buffer.opaque(); + } + } +} + +se::port::StatusOr> ScratchAllocator::AllocateBytes( + se::Stream* stream, int64 byte_size) { + CHECK_GE(byte_size, 0) << "byte_size must be positive."; + if (byte_size > GetMemoryLimitInBytes(stream)) { + return se::port::Status( + se::port::error::RESOURCE_EXHAUSTED, + tensorflow::strings::Printf( + "Allocating %lld bytes exceeds the memory limit of %lld bytes.", + byte_size, GetMemoryLimitInBytes(stream))); + } + + auto status_or_memory = + memory_allocator_->Allocate(device_ordinal_, byte_size, + /*retry_on_failure=*/false); + if (!status_or_memory.ok()) { + return se::port::Status(se::port::error::RESOURCE_EXHAUSTED, + tensorflow::strings::Printf( + "Failed to allocate %lld bytes on device %d.", + byte_size, device_ordinal_)); + } + se::DeviceMemoryBase allocated_buffer = status_or_memory.ValueOrDie(); + allocated_buffers_.push_back(allocated_buffer); + total_allocated_bytes_ += byte_size; + return se::DeviceMemory(allocated_buffer); +} + +// Determines whether we can safely perform a winograd non-fused convolution for +// the given input and output shapes. This works around b/68264959, an integer +// overflow in cuDNNv5 and cuDNNv6. +// +// TODO(jlebar): We shouldn't need this check for cuDNNv7. +bool ShouldIncludeWinogradNonfusedAlgo( + const Shape& input_shape, const Shape& output_shape, + const ConvolutionDimensionNumbers& dnums) { + int64 batch = input_shape.dimensions(dnums.input_batch_dimension()); + int64 in_depths = input_shape.dimensions(dnums.input_feature_dimension()); + int64 in_rows = input_shape.dimensions(dnums.input_spatial_dimensions(0)); + int64 in_cols = + dnums.input_spatial_dimensions_size() == 1 + ? 
1 + : input_shape.dimensions(dnums.input_spatial_dimensions(1)); + int64 out_depths = output_shape.dimensions(dnums.output_feature_dimension()); + + int64 total_size = CeilOfRatio(batch, int64{16}) * + std::max(in_depths, out_depths) * in_cols * in_rows * + sizeof(float); + + const int64 threshold = 1L << 31; + return total_size < threshold; +} + +std::vector GetAlgorithms(CudnnConvKind kind, + bool with_winograd_nonfused, + se::StreamExecutor* stream_exec_) { + std::vector algorithms; + switch (kind) { + case CudnnConvKind::kBackwardFilter: + CHECK(stream_exec_->GetConvolveBackwardFilterAlgorithms( + with_winograd_nonfused, &algorithms)); + break; + case CudnnConvKind::kBackwardInput: + CHECK(stream_exec_->GetConvolveBackwardDataAlgorithms( + with_winograd_nonfused, &algorithms)); + break; + case CudnnConvKind::kForward: + CHECK(stream_exec_->GetConvolveAlgorithms(with_winograd_nonfused, + &algorithms)); + break; + } + + // Remove any algorithms with tensor math enabled. These have lower precision + // than regular algorithms, and we don't yet have a way to turn this on/off in + // XLA. + algorithms.erase(std::remove_if(algorithms.begin(), algorithms.end(), + [&](const AlgorithmDesc& a) { + return a.tensor_ops_enabled(); + }), + algorithms.end()); + + return algorithms; +} + +string AlgorithmToString(const AlgorithmDesc& algo) { + if (algo.tensor_ops_enabled()) { + return tensorflow::strings::StrCat(algo.algo_id(), "+TC"); + } + return tensorflow::strings::StrCat(algo.algo_id()); +} + +string NumBytesToString(int64 bytes) { + return tensorflow::strings::StrCat( + tensorflow::strings::HumanReadableNumBytes(bytes), " (", bytes, "B)"); +} + +} // anonymous namespace + +// We could have caching here so that we don't redo this work for two identical +// convolutions. Unfortunately our cache key would have to be a tuple +// containing the protos passed to this function, and we have no utility for +// hashing protos. We could write our own hash functions, but they'd silently +// break if we ever added a field to one of the protos. Perhaps we could hack +// using the binary-encoded proto as the hash key, on the assumption that two +// protos being binary-equal is a sufficient, if not necessary, condition for +// proper equality. But that would still leave us open to having unnecessary +// cache misses and doing extra work. Overall, caching doesn't seem worth the +// trouble, but we may want to revisit this if we ever find a model where +// caching would speed up compilation a lot. +optional> +CudnnConvolutionAlgorithmPicker::PickBestAlgorithm( + CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape, + const Shape& output_shape, const Window& window, + const ConvolutionDimensionNumbers& dnums, HloInstruction* instr) { + // Create a stream for us to do our work on. + se::Stream stream{stream_exec_}; + stream.Init(); + const auto device_ordinal = stream_exec_->device_ordinal(); + + // allocator either points to this->allocator_ or, if that's null, to a + // StreamExecutorMemoryAllocator for stream_exec_. + DeviceMemoryAllocator* allocator; + optional se_allocator; + if (allocator_ != nullptr) { + allocator = allocator_; + } else { + se_allocator.emplace( + stream_exec_->platform(), + tensorflow::gtl::ArraySlice({stream_exec_})); + allocator = &*se_allocator; + } + + // Allocate space for the input, filter, and output of the convolution. We + // use a ScratchAllocator for this instead of calling allocator_ directly so + // that our allocations don't leak. 
+ // + // We don't put any data in these buffers, because (in theory, anyway) the + // speed of a conv isn't affected by the data being convolved. + ScratchAllocator input_output_allocator(device_ordinal, allocator); + se::port::StatusOr input_buf = + input_output_allocator.AllocateBytes(&stream, + ShapeUtil::ByteSizeOf(input_shape)); + se::port::StatusOr filter_buf = + input_output_allocator.AllocateBytes(&stream, + ShapeUtil::ByteSizeOf(filter_shape)); + se::port::StatusOr output_buf = + input_output_allocator.AllocateBytes(&stream, + ShapeUtil::ByteSizeOf(output_shape)); + if (!input_buf.ok() || !filter_buf.ok() || !output_buf.ok()) { + LOG(WARNING) + << "Couldn't allocate space for input/filter/output of convolution " + << instr->ToString() << ". Falling back to default algorithm."; + return nullopt; + } + + const bool use_winograd_nonfused = + ShouldIncludeWinogradNonfusedAlgo(input_shape, output_shape, dnums); + se::dnn::ProfileResult best_result; + int64 best_result_bytes_used = 0; + for (const AlgorithmDesc& alg : + GetAlgorithms(kind, use_winograd_nonfused, stream_exec_)) { + ScratchAllocator scratch_allocator(device_ordinal, allocator); + se::dnn::ProfileResult profile_result; + VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for " + << instr->ToString(); + + bool launch_ok = + RunCudnnConvolution(kind, input_shape, filter_shape, output_shape, + se::DeviceMemory(input_buf.ValueOrDie()), + se::DeviceMemory(filter_buf.ValueOrDie()), + se::DeviceMemory(output_buf.ValueOrDie()), + &scratch_allocator, window, dnums, + AlgorithmConfig(alg), &stream, &profile_result) + .ok(); + + if (launch_ok && profile_result.is_valid()) { + int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes(); + VLOG(3) << "Run of algorithm " << AlgorithmToString(alg) + << " succeeded, taking " << profile_result.elapsed_time_in_ms() + << "ms and using " << NumBytesToString(scratch_bytes_used) + << " of scratch (Best result: " + << best_result.elapsed_time_in_ms() << "ms, " + << NumBytesToString(best_result_bytes_used) << " of scratch)"; + if (profile_result.elapsed_time_in_ms() < + best_result.elapsed_time_in_ms()) { + best_result = profile_result; + best_result_bytes_used = scratch_bytes_used; + } + } else { + VLOG(3) << "Run of algorithm " << AlgorithmToString(alg) << " failed."; + } + } + if (best_result.is_valid()) { + VLOG(2) << "Best algorithm for " << instr->ToString() << ": " + << AlgorithmToString(best_result.algorithm()) << ", takes " + << best_result.elapsed_time_in_ms() << "ms, and uses " + << best_result_bytes_used << "B of scratch memory."; + return std::make_pair(best_result.algorithm().algo_id(), + best_result_bytes_used); + } + + LOG(WARNING) << "All algorithms tried for convolution " << instr->ToString() + << " failed. 
Falling back to default algorithm."; + return nullopt; +} + +StatusOr CudnnConvolutionAlgorithmPicker::RunOnInstruction( + HloInstruction* instr) { + CHECK(IsCustomCallToDnnConvolution(*instr)); + + const auto& call_target = instr->custom_call_target(); + const auto& lhs_shape = instr->operand(0)->shape(); + const auto& rhs_shape = instr->operand(1)->shape(); + const auto& conv_result_shape = instr->shape().tuple_shapes(0); + optional> alg_and_scratch_bytes; + if (call_target == kCudnnConvForwardCallTarget) { + alg_and_scratch_bytes = PickBestAlgorithm( + CudnnConvKind::kForward, /*input_shape=*/lhs_shape, + /*filter_shape=*/rhs_shape, /*output_shape=*/conv_result_shape, + instr->window(), instr->convolution_dimension_numbers(), instr); + } else if (call_target == kCudnnConvBackwardInputCallTarget) { + alg_and_scratch_bytes = PickBestAlgorithm( + CudnnConvKind::kBackwardInput, /*input_shape=*/conv_result_shape, + /*filter_shape=*/rhs_shape, /*output_shape=*/lhs_shape, instr->window(), + instr->convolution_dimension_numbers(), instr); + } else if (call_target == kCudnnConvBackwardFilterCallTarget) { + alg_and_scratch_bytes = PickBestAlgorithm( + CudnnConvKind::kBackwardFilter, /*input_shape=*/lhs_shape, + /*filter_shape=*/conv_result_shape, /*output_shape=*/rhs_shape, + instr->window(), instr->convolution_dimension_numbers(), instr); + } else { + LOG(FATAL) << "Unknown custom call target for cudnn conv: " + << instr->ToString(); + } + + if (!alg_and_scratch_bytes.has_value()) { + return false; + } + + int64 algorithm; + int64 scratch_bytes; + std::tie(algorithm, scratch_bytes) = *alg_and_scratch_bytes; + + VLOG(1) << "Setting cudnn conv to use algorithm " << algorithm << " and " + << NumBytesToString(scratch_bytes) + << " of scratch memory: " << instr->ToString(); + + // Replace instr with a new CustomCall which has the correct algorithm, and + // whose output shape has the appropriate amount of scratch memory. + HloComputation* computation = instr->parent(); + Shape new_call_shape = + ShapeUtil::MakeTupleShape({instr->shape().tuple_shapes(0), + ShapeUtil::MakeShape(U8, {scratch_bytes})}); + HloInstruction* algorithm_hlo = computation->AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(algorithm))); + HloInstruction* new_call = + computation->AddInstruction(HloInstruction::CreateCustomCall( + new_call_shape, + {instr->mutable_operand(0), instr->mutable_operand(1), algorithm_hlo}, + instr->custom_call_target())); + new_call->set_window(instr->window()); + new_call->set_convolution_dimension_numbers( + instr->convolution_dimension_numbers()); + + // Repackage new_call so it has the same shape as the original call, namely + // (conv_result, u8[0]). 
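+  // For example (shapes illustrative): if the conv result is f32[16,16,32,64]
+  // and the picker chose 1024 bytes of scratch, then
+  //   new_call:  (f32[16,16,32,64], u8[1024])
+  //   new_tuple: tuple(gte(new_call, 0), u8[0] constant)
+  // so consumers that expected the original (conv_result, u8[0]) shape are
+  // unaffected, and the unused u8[0] element can later be simplified away.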
+ HloInstruction* new_tuple = + computation->AddInstruction(HloInstruction::CreateTuple( + {computation->AddInstruction(HloInstruction::CreateGetTupleElement( + new_call_shape.tuple_shapes(0), new_call, 0)), + computation->AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR1({})))})); + + TF_RETURN_IF_ERROR(instr->parent()->ReplaceInstruction(instr, new_tuple)); + return true; +} + +StatusOr CudnnConvolutionAlgorithmPicker::RunOnComputation( + HloComputation* computation) { + std::vector convs; + for (auto* instr : computation->instructions()) { + if (IsCustomCallToDnnConvolution(*instr)) { + convs.push_back(instr); + } + } + + bool changed = false; + for (auto* instr : convs) { + TF_ASSIGN_OR_RETURN(bool result, RunOnInstruction(instr)); + changed |= result; + } + return changed; +} + +StatusOr CudnnConvolutionAlgorithmPicker::Run(HloModule* module) { + bool changed = false; + for (HloComputation* computation : module->MakeNonfusionComputations()) { + TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation)); + changed |= result; + } + return changed; +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h new file mode 100644 index 00000000000..10e49daee5d --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h @@ -0,0 +1,62 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_ + +#include "tensorflow/compiler/xla/service/device_memory_allocator.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/core/lib/gtl/optional.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" + +namespace xla { +namespace gpu { + +// Modifies CustomCalls to cudnn convolutions, choosing the best algorithm for +// each and adding explicit scratch space to the CustomCalls. +class CudnnConvolutionAlgorithmPicker : public HloPassInterface { + public: + // If the `allocator` parameter is not null, we will use it to allocate temp + // memory while timing the various convolution algorithms. If it's null, + // we'll use the default allocator on the StreamExecutor. 
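+  //
+  // Example usage (a sketch; this mirrors how GpuCompiler wires the pass into
+  // its "conv_canonicalization" pipeline in this change -- variable names are
+  // illustrative):
+  //
+  //   HloPassPipeline pipeline("conv_canonicalization");
+  //   pipeline.AddPass<CudnnConvolutionRewriter>();
+  //   pipeline.AddPass<CudnnConvolutionAlgorithmPicker>(stream_exec,
+  //                                                     /*allocator=*/nullptr);
+  //   TF_RETURN_IF_ERROR(pipeline.Run(module).status());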
+ CudnnConvolutionAlgorithmPicker( + perftools::gputools::StreamExecutor* stream_exec, + DeviceMemoryAllocator* allocator) + : stream_exec_(stream_exec), allocator_(allocator) {} + + tensorflow::StringPiece name() const override { + return "cudnn-convolution-algorithm-picker"; + } + + StatusOr Run(HloModule* module) override; + + private: + StatusOr RunOnComputation(HloComputation* computation); + StatusOr RunOnInstruction(HloInstruction* instr); + tensorflow::gtl::optional> PickBestAlgorithm( + CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape, + const Shape& output_shape, const Window& window, + const ConvolutionDimensionNumbers& dnums, HloInstruction* instr); + + perftools::gputools::StreamExecutor* stream_exec_; // never null + DeviceMemoryAllocator* allocator_; // may be null +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_ diff --git a/tensorflow/compiler/xla/service/gpu/convolution_folding.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc similarity index 83% rename from tensorflow/compiler/xla/service/gpu/convolution_folding.cc rename to tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc index b0626ca3bc9..e0c73aa73ac 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_folding.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc @@ -1,4 +1,4 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/service/gpu/convolution_folding.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h" #include #include @@ -33,14 +33,32 @@ namespace xla { namespace gpu { namespace { + +bool CanImplementAsCudnnForwardConv(HloInstruction* conv) { + const ConvolutionDimensionNumbers& dnums = + conv->convolution_dimension_numbers(); + if (dnums.input_spatial_dimensions_size() > 3) { + return false; + } + + // CuDNN does not accept zero-element arguments + if (ShapeUtil::HasZeroElements(conv->operand(0)->shape()) || + ShapeUtil::HasZeroElements(conv->operand(1)->shape())) { + return false; + } + + if (window_util::HasWindowReversal(conv->window())) { + return false; + } + return true; +} + // Try to match a backward filter pattern that contains "conv". // Precondition: "conv" is a kConvolution. -std::tuple, Window, - ConvolutionDimensionNumbers> -MatchBackwardFilter(HloInstruction* conv) { +std::tuple MatchBackwardFilter( + HloInstruction* conv) { const auto no_match_result = - std::make_tuple(false, std::vector(), Window(), - ConvolutionDimensionNumbers()); + std::make_tuple(false, Window(), ConvolutionDimensionNumbers()); // Step 1: match the instruction pattern without considering the paddings and // dimension numbers just yet. 
We may need some generic pattern matcher // similar to third_party/llvm/llvm/include/llvm/IR/PatternMatch.h @@ -190,18 +208,15 @@ MatchBackwardFilter(HloInstruction* conv) { backward_conv_dnums.add_kernel_spatial_dimensions(output_spatial_dims[i]); } - return std::make_tuple(true, std::vector({conv}), - backward_conv_window, backward_conv_dnums); + return std::make_tuple(true, backward_conv_window, backward_conv_dnums); } // Try to match a backward input pattern that contains "conv". // Precondition: "conv" is a kConvolution. -std::tuple, Window, - ConvolutionDimensionNumbers> -MatchBackwardInput(HloInstruction* conv) { +std::tuple MatchBackwardInput( + HloInstruction* conv) { const auto no_match_result = - std::make_tuple(false, std::vector(), Window(), - ConvolutionDimensionNumbers()); + std::make_tuple(false, Window(), ConvolutionDimensionNumbers()); // Match instruction pattern. CHECK_EQ(HloOpcode::kConvolution, conv->opcode()); @@ -374,16 +389,63 @@ MatchBackwardInput(HloInstruction* conv) { dnums.set_kernel_output_feature_dimension( conv->convolution_dimension_numbers().kernel_input_feature_dimension()); - return std::make_tuple(true, - std::vector({conv, reverse_filter}), - new_window, dnums); + return std::make_tuple(true, new_window, dnums); } -} // namespace -StatusOr ConvolutionFolding::Run(HloModule* module) { - HloComputation* entry_computation = module->entry_computation(); +// Tries to rewrite a single convolution into a call to cudnn. +StatusOr RunOnInstruction(HloInstruction* conv) { + CHECK_EQ(conv->opcode(), HloOpcode::kConvolution); + + HloInstruction* custom_call = [&]() -> HloInstruction* { + bool match; + Window window; + ConvolutionDimensionNumbers dnums; + + std::tie(match, window, dnums) = MatchBackwardFilter(conv); + if (match) { + return CreateCudnnConvBackwardFilter( + conv->shape(), conv->mutable_operand(0), conv->mutable_operand(1), + window, dnums); + } + + std::tie(match, window, dnums) = MatchBackwardInput(conv); + if (match) { + // Backward input conv subsumes the conv plus the reverse in operand 1. + HloInstruction* reverse = conv->mutable_operand(1); + CHECK_EQ(reverse->opcode(), HloOpcode::kReverse); + HloInstruction* rhs = reverse->mutable_operand(0); + + return CreateCudnnConvBackwardInput( + conv->shape(), conv->mutable_operand(0), rhs, window, dnums); + } + + // If all else fails, try a forward convolution. + if (CanImplementAsCudnnForwardConv(conv)) { + return CreateCudnnConvForward(conv->shape(), conv->mutable_operand(0), + conv->mutable_operand(1), conv->window(), + conv->convolution_dimension_numbers()); + } + + return nullptr; + }(); + + if (custom_call == nullptr) { + return false; + } + + // The CustomCall returns a tuple (conv_result, scratch_memory). Extract out + // the conv result and replace `conv` with it. + TF_RETURN_IF_ERROR(conv->parent()->ReplaceWithNewInstruction( + conv, + HloInstruction::CreateGetTupleElement(conv->shape(), custom_call, 0))); + return true; +} + +// Rewrites the convolutions in the given computation into calls to cudnn. +// Returns true if it made any changes. 
+StatusOr RunOnComputation(HloComputation* computation) { std::vector convs; - for (auto* hlo : entry_computation->instructions()) { + for (auto* hlo : computation->instructions()) { if (hlo->opcode() == HloOpcode::kConvolution) { convs.push_back(hlo); } @@ -391,41 +453,18 @@ StatusOr ConvolutionFolding::Run(HloModule* module) { bool changed = false; for (HloInstruction* conv : convs) { - bool match; - std::vector hlos_to_fuse; - Window window; - ConvolutionDimensionNumbers dnums; - std::tie(match, hlos_to_fuse, window, dnums) = MatchBackwardFilter(conv); - if (match) { - VLOG(2) << "Fuse instructions"; - for (HloInstruction* hlo_to_fuse : hlos_to_fuse) { - VLOG(2) << " " << hlo_to_fuse->ToString(); - } - HloInstruction* backward_convolution = - entry_computation->CreateFusionInstructionForBackwardConvolution( - hlos_to_fuse, HloInstruction::FusionKind::kConvBackwardFilter, - window, dnums); - VLOG(2) << "to backward filter convolution"; - VLOG(2) << " " << backward_convolution->ToString(); - changed = true; - continue; - } + TF_ASSIGN_OR_RETURN(bool result, RunOnInstruction(conv)); + changed |= result; + } + return changed; +} +} // namespace - std::tie(match, hlos_to_fuse, window, dnums) = MatchBackwardInput(conv); - if (match) { - VLOG(2) << "Fuse instructions"; - for (HloInstruction* hlo_to_fuse : hlos_to_fuse) { - VLOG(2) << " " << hlo_to_fuse->ToString(); - } - HloInstruction* backward_convolution = - entry_computation->CreateFusionInstructionForBackwardConvolution( - hlos_to_fuse, HloInstruction::FusionKind::kConvBackwardInput, - window, dnums); - VLOG(2) << "to backward input convolution"; - VLOG(2) << " " << backward_convolution->ToString(); - changed = true; - continue; - } +StatusOr CudnnConvolutionRewriter::Run(HloModule* module) { + bool changed = false; + for (HloComputation* computation : module->MakeNonfusionComputations()) { + TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation)); + changed |= result; } return changed; } diff --git a/tensorflow/compiler/xla/service/gpu/convolution_folding.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h similarity index 63% rename from tensorflow/compiler/xla/service/gpu/convolution_folding.h rename to tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h index f9c898721f8..0c0578d8884 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_folding.h +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h @@ -1,4 +1,4 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONVOLUTION_FOLDING_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONVOLUTION_FOLDING_H_ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_REWRITER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_REWRITER_H_ #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" @@ -22,10 +22,12 @@ limitations under the License. 
namespace xla { namespace gpu { -class ConvolutionFolding : public HloPassInterface { +// Rewrites plain convolutions, backwards-filter convolutions, and +// backwards-input convolutions into CustomCall HLOs that call into cuDNN. +class CudnnConvolutionRewriter : public HloPassInterface { public: tensorflow::StringPiece name() const override { - return "convolution-folding"; + return "cudnn-convolution-rewriter"; } StatusOr Run(HloModule* module) override; @@ -34,4 +36,4 @@ class ConvolutionFolding : public HloPassInterface { } // namespace gpu } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONVOLUTION_FOLDING_H_ +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_REWRITER_H_ diff --git a/tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter_test.cc similarity index 82% rename from tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc rename to tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter_test.cc index 34e6bdb117d..65588b6aaf2 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter_test.cc @@ -1,4 +1,4 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,23 +13,29 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/service/gpu/convolution_folding.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/shape_inference.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/core/platform/test.h" namespace xla { namespace gpu { +namespace { -class ConvolutionFoldingTest : public HloTestBase { +namespace op = xla::testing::opcode_matchers; + +class CudnnConvolutionRewriterTest : public HloTestBase { public: - ConvolutionFoldingTest() { + CudnnConvolutionRewriterTest() { for (int i = 0; i < 2; ++i) { WindowDimension* window_dim = default_conv_window_.add_dimensions(); window_dim->set_size(1); @@ -44,7 +50,8 @@ class ConvolutionFoldingTest : public HloTestBase { // the batch and feature dimension in the activations, and treat the batch // dimension in gradients as the input feature dimension in the filter. // - // TODO(jingyue): Add more tests on NCHW input order which TF also supports. + // TODO(jingyue): Add more tests on NCHW input order, which TF also + // supports. 
tf_default_dnums_for_backward_filter_.set_input_batch_dimension(3); tf_default_dnums_for_backward_filter_.set_input_feature_dimension(0); tf_default_dnums_for_backward_filter_.add_input_spatial_dimensions(1); @@ -74,9 +81,8 @@ class ConvolutionFoldingTest : public HloTestBase { } protected: - bool FoldConvolution(HloModule* module) { - ConvolutionFolding convolution_folding; - return convolution_folding.Run(module).ValueOrDie(); + bool RunPass(HloModule* module) { + return CudnnConvolutionRewriter().Run(module).ValueOrDie(); } // A convolution window with stride 1 and zero padding. The size fields are @@ -86,7 +92,7 @@ class ConvolutionFoldingTest : public HloTestBase { ConvolutionDimensionNumbers tf_default_dnums_for_backward_input_; }; -TEST_F(ConvolutionFoldingTest, BackwardFilterConvolve) { +TEST_F(CudnnConvolutionRewriterTest, BackwardFilterConvolve) { HloComputation::Builder builder(TestName()); HloInstruction* activations = builder.AddInstruction(HloInstruction::CreateParameter( @@ -108,14 +114,13 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolve) { auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(module.get())); - EXPECT_EQ(HloOpcode::kFusion, - entry_computation->root_instruction()->opcode()); - EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardFilter == - entry_computation->root_instruction()->fusion_kind()); + EXPECT_TRUE(RunPass(module.get())); + EXPECT_THAT(entry_computation->root_instruction(), + op::GetTupleElement( + op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0)); } -TEST_F(ConvolutionFoldingTest, +TEST_F(CudnnConvolutionRewriterTest, BackwardFilterConvolveEquivalentToForwardConvolution) { HloComputation::Builder builder(TestName()); HloInstruction* activations = @@ -135,12 +140,17 @@ TEST_F(ConvolutionFoldingTest, tf_default_dnums_for_backward_filter_)); auto module = CreateNewModule(); - module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(module.get())); + HloComputation* entry_computation = + module->AddEntryComputation(builder.Build()); + EXPECT_TRUE(RunPass(module.get())); + EXPECT_THAT(entry_computation->root_instruction(), + op::GetTupleElement( + op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0)); } // Extracted from block35 training. -TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithPaddedActivations) { +TEST_F(CudnnConvolutionRewriterTest, + BackwardFilterConvolveWithPaddedActivations) { auto builder = HloComputation::Builder(TestName()); HloInstruction* activations = builder.AddInstruction(HloInstruction::CreateParameter( @@ -162,15 +172,15 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithPaddedActivations) { auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(module.get())); - EXPECT_EQ(HloOpcode::kFusion, - entry_computation->root_instruction()->opcode()); - EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardFilter == - entry_computation->root_instruction()->fusion_kind()); + EXPECT_TRUE(RunPass(module.get())); + EXPECT_THAT(entry_computation->root_instruction(), + op::GetTupleElement( + op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0)); } // Extracted from inception v3 training. 
-TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithPaddedGradients) { +TEST_F(CudnnConvolutionRewriterTest, + BackwardFilterConvolveWithPaddedGradients) { auto builder = HloComputation::Builder(TestName()); HloInstruction* activations = builder.AddInstruction(HloInstruction::CreateParameter( @@ -192,14 +202,13 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithPaddedGradients) { auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(module.get())); - EXPECT_EQ(HloOpcode::kFusion, - entry_computation->root_instruction()->opcode()); - EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardFilter == - entry_computation->root_instruction()->fusion_kind()); + EXPECT_TRUE(RunPass(module.get())); + EXPECT_THAT(entry_computation->root_instruction(), + op::GetTupleElement( + op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0)); } -TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithUnevenPadding) { +TEST_F(CudnnConvolutionRewriterTest, BackwardFilterConvolveWithUnevenPadding) { auto builder = HloComputation::Builder(TestName()); HloInstruction* activations = builder.AddInstruction(HloInstruction::CreateParameter( @@ -221,14 +230,13 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithUnevenPadding) { auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(module.get())); - EXPECT_EQ(HloOpcode::kFusion, - entry_computation->root_instruction()->opcode()); - EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardFilter == - entry_computation->root_instruction()->fusion_kind()); + EXPECT_TRUE(RunPass(module.get())); + EXPECT_THAT(entry_computation->root_instruction(), + op::GetTupleElement( + op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0)); } -TEST_F(ConvolutionFoldingTest, BackwardInputConvolveEvenPadding) { +TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolveEvenPadding) { auto builder = HloComputation::Builder(TestName()); HloInstruction* output = builder.AddInstruction(HloInstruction::CreateParameter( @@ -272,14 +280,15 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveEvenPadding) { auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(module.get())); - EXPECT_EQ(HloOpcode::kFusion, - entry_computation->root_instruction()->opcode()); - EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardInput == - entry_computation->root_instruction()->fusion_kind()); + EXPECT_TRUE(RunPass(module.get())); + + ASSERT_THAT(entry_computation->root_instruction(), + op::GetTupleElement( + op::CustomCall(kCudnnConvBackwardInputCallTarget), 0)); + const HloInstruction* custom_call = + entry_computation->root_instruction()->operand(0); for (int i = 0; i < 2; ++i) { - const WindowDimension& window_dim = - entry_computation->root_instruction()->window().dimensions(i); + const WindowDimension& window_dim = custom_call->window().dimensions(i); // Low padding of the backward input convolution // = kernel_size - 1 - low padding on gradients. 
EXPECT_EQ(3, window_dim.padding_low()); @@ -291,7 +300,7 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveEvenPadding) { // Convolve([abc], [x], base_dilation=2) // = Convolve([abc], Reverse([x]), base_dilation=2) // = BackwardInputConvolve([abc], [x], stride=2) -TEST_F(ConvolutionFoldingTest, BackwardInputConvolve1x1Filter) { +TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolve1x1Filter) { auto builder = HloComputation::Builder(TestName()); // NHWC dimension order. HloInstruction* output = @@ -316,17 +325,16 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolve1x1Filter) { auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(module.get())); - EXPECT_EQ(HloOpcode::kFusion, - entry_computation->root_instruction()->opcode()); - EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardInput == - entry_computation->root_instruction()->fusion_kind()); + EXPECT_TRUE(RunPass(module.get())); + EXPECT_THAT(entry_computation->root_instruction(), + op::GetTupleElement( + op::CustomCall(kCudnnConvBackwardInputCallTarget), 0)); } // BackwardInputConvolve([abc], [x], stride=1) is equivalent to // ForwardConvolve([abc], [x], stride=1). No need to fold it into backward input // convolution. -TEST_F(ConvolutionFoldingTest, +TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolve1x1FilterEquivalentToForwardConvolve) { auto builder = HloComputation::Builder(TestName()); // NHWC dimension order. @@ -347,8 +355,12 @@ TEST_F(ConvolutionFoldingTest, tf_default_dnums_for_backward_input_)); auto module = CreateNewModule(); - module->AddEntryComputation(builder.Build()); - EXPECT_FALSE(FoldConvolution(module.get())); + HloComputation* entry_computation = + module->AddEntryComputation(builder.Build()); + EXPECT_TRUE(RunPass(module.get())); + EXPECT_THAT( + entry_computation->root_instruction(), + op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0)); } // Extracted from Inception V3 training. @@ -365,7 +377,8 @@ TEST_F(ConvolutionFoldingTest, // 20x10x10x192 // // Gradients are padded unevenly. 
-TEST_F(ConvolutionFoldingTest, BackwardInputConvolveUnevenPaddingOnGradients) { +TEST_F(CudnnConvolutionRewriterTest, + BackwardInputConvolveUnevenPaddingOnGradients) { auto builder = HloComputation::Builder(TestName()); HloInstruction* output = builder.AddInstruction(HloInstruction::CreateParameter( @@ -397,14 +410,14 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveUnevenPaddingOnGradients) { auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(module.get())); - EXPECT_EQ(HloOpcode::kFusion, - entry_computation->root_instruction()->opcode()); - EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardInput == - entry_computation->root_instruction()->fusion_kind()); + EXPECT_TRUE(RunPass(module.get())); + ASSERT_THAT(entry_computation->root_instruction(), + op::GetTupleElement( + op::CustomCall(kCudnnConvBackwardInputCallTarget), 0)); + const HloInstruction* custom_call = + entry_computation->root_instruction()->operand(0); for (int i = 0; i < 2; ++i) { - const WindowDimension& window_dim = - entry_computation->root_instruction()->window().dimensions(i); + const WindowDimension& window_dim = custom_call->window().dimensions(i); EXPECT_EQ(0, window_dim.padding_low()); EXPECT_EQ(0, window_dim.padding_high()); EXPECT_EQ(2, window_dim.stride()); @@ -413,7 +426,7 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveUnevenPaddingOnGradients) { // Similar to BackwardInputConvolveUnevenPadding, but the low padding of the // gradients exceeds kernel_size - 1. Therefore, this pattern cannot be fused. -TEST_F(ConvolutionFoldingTest, BackwardInputConvolveLowPaddingTooLarge) { +TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolveLowPaddingTooLarge) { auto builder = HloComputation::Builder(TestName()); HloInstruction* output = builder.AddInstruction(HloInstruction::CreateParameter( @@ -442,8 +455,12 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveLowPaddingTooLarge) { .ValueOrDie())); auto module = CreateNewModule(); - module->AddEntryComputation(builder.Build()); - EXPECT_FALSE(FoldConvolution(module.get())); + HloComputation* entry_computation = + module->AddEntryComputation(builder.Build()); + EXPECT_TRUE(RunPass(module.get())); + EXPECT_THAT( + entry_computation->root_instruction(), + op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0)); } // Extracted from //learning/brain/google/xla/benchmarks/resnet.py @@ -460,7 +477,7 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveLowPaddingTooLarge) { // // We should fuse BC even though padding on activations is uneven, because // PadInsertion will canonicalize the fusion HLO. -TEST_F(ConvolutionFoldingTest, +TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolveUnevenPaddingOnActivations) { auto builder = HloComputation::Builder(TestName()); // The gradients are in NCHW layout. 
@@ -493,13 +510,12 @@ TEST_F(ConvolutionFoldingTest, auto module = CreateNewModule(); const HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(module.get())); - const HloInstruction* backward_conv = entry_computation->root_instruction(); - EXPECT_EQ(HloOpcode::kFusion, backward_conv->opcode()); - EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardInput == - backward_conv->fusion_kind()); + EXPECT_TRUE(RunPass(module.get())); + ASSERT_THAT(entry_computation->root_instruction(), + op::GetTupleElement( + op::CustomCall(kCudnnConvBackwardInputCallTarget), 0)); const WindowDimension& backward_conv_col_dim = - backward_conv->window().dimensions(1); + entry_computation->root_instruction()->operand(0)->window().dimensions(1); EXPECT_EQ(0, backward_conv_col_dim.padding_low()); EXPECT_EQ(1, backward_conv_col_dim.padding_high()); } @@ -515,7 +531,7 @@ TEST_F(ConvolutionFoldingTest, // // We currently don't fuse BC because PadInsertion doesn't support negative // padding on the gradients of backward convolution (b/32744257). -TEST_F(ConvolutionFoldingTest, +TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolveNegativePaddingHighOnActivations) { auto builder = HloComputation::Builder(TestName()); // The gradients are in NCHW layout. @@ -544,9 +560,14 @@ TEST_F(ConvolutionFoldingTest, .ValueOrDie())); auto module = CreateNewModule(); - module->AddEntryComputation(builder.Build()); - EXPECT_FALSE(FoldConvolution(module.get())); + HloComputation* entry_computation = + module->AddEntryComputation(builder.Build()); + EXPECT_TRUE(RunPass(module.get())); + EXPECT_THAT( + entry_computation->root_instruction(), + op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0)); } +} // anonymous namespace } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc new file mode 100644 index 00000000000..f5f52cf62bf --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc @@ -0,0 +1,221 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/util.h" + +namespace xla { +namespace gpu { +namespace { + +namespace se = ::perftools::gputools; + +using se::DeviceMemory; +using se::DeviceMemoryBase; +using se::Stream; +using se::dnn::AlgorithmConfig; +using se::dnn::BatchDescriptor; +using se::dnn::ConvolutionDescriptor; +using se::dnn::DataLayout; +using se::dnn::DimIndex; +using se::dnn::FilterDescriptor; +using se::dnn::FilterLayout; +using se::dnn::ProfileResult; + +// A StreamExecutor ScratchAllocator that wraps a single XLA allocation, +// returning it (in its entirety) the first time Allocate() is called. +class ScratchBufAllocator : public se::ScratchAllocator { + public: + explicit ScratchBufAllocator(se::DeviceMemoryBase scratch) + : scratch_(scratch) {} + + ~ScratchBufAllocator() override = default; + + int64 GetMemoryLimitInBytes(se::Stream* /*stream*/) override { + return scratch_.size(); + } + + se::port::StatusOr> AllocateBytes( + se::Stream* stream, int64 byte_size) override { + if (allocated_) { + return se::port::InternalError( + "Can't allocate twice from a ScratchBufAllocator."); + } + if (byte_size > scratch_.size()) { + return se::port::InternalError(tensorflow::strings::StrCat( + "Can't allocate ", byte_size, + " bytes from a ScratchBufAllocator of size ", scratch_.size())); + } + + allocated_ = true; + return se::DeviceMemory(scratch_); + } + + private: + se::DeviceMemoryBase scratch_; + bool allocated_ = false; +}; + +} // anonymous namespace + +string CudnnConvKindToString(CudnnConvKind kind) { + switch (kind) { + case CudnnConvKind::kForward: + return "forward"; + case CudnnConvKind::kBackwardFilter: + return "backward_filter"; + case CudnnConvKind::kBackwardInput: + return "backward_input"; + } +} + +Status RunCudnnConvolution(CudnnConvKind kind, const Shape& input_shape, + const Shape& filter_shape, const Shape& output_shape, + DeviceMemory input_buf, + DeviceMemory filter_buf, + DeviceMemory output_buf, + DeviceMemoryBase scratch_buf, const Window& window, + const ConvolutionDimensionNumbers& dnums, + AlgorithmConfig algorithm, Stream* stream, + ProfileResult* profile_result /*= nullptr*/) { + ScratchBufAllocator scratch_allocator(scratch_buf); + return RunCudnnConvolution(kind, input_shape, filter_shape, output_shape, + input_buf, filter_buf, output_buf, + &scratch_allocator, window, dnums, algorithm, + stream, profile_result); +} + +Status RunCudnnConvolution( + CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape, + const Shape& output_shape, DeviceMemory input_buf, + DeviceMemory filter_buf, DeviceMemory output_buf, + se::ScratchAllocator* scratch_allocator, const Window& window, + const ConvolutionDimensionNumbers& dnums, AlgorithmConfig algorithm, + Stream* stream, ProfileResult* profile_result /*= nullptr*/) { + VLOG(3) << "Convolution kind: " << CudnnConvKindToString(kind); + VLOG(3) << "input shape: { " << ShapeUtil::HumanString(input_shape) << " }"; + VLOG(3) << "filter shape: { " << ShapeUtil::HumanString(filter_shape) << " }"; + VLOG(3) << "Output shape: { " << ShapeUtil::HumanString(output_shape) << " }"; + VLOG(3) << "Window: { " << window.ShortDebugString() << " }"; + VLOG(3) << "Dim nums: { " << dnums.ShortDebugString() << " }"; + + const int num_dimensions = 
window.dimensions_size(); + CHECK_LE(num_dimensions, 3); + // cuDNN does not support 1D convolutions. We therefore express 1D + // convolutions as 2D convolutions where the first spatial dimension is 1. + // This matches the behavior of TF (see definition of conv1d in + // tensorflow/python/ops/nn_ops.py). + const int effective_num_dimensions = std::max(2, num_dimensions); + + CHECK_EQ(F32, output_shape.element_type()) + << ShapeUtil::HumanString(output_shape); + CHECK_EQ(num_dimensions, dnums.input_spatial_dimensions_size()); + CHECK_EQ(num_dimensions, dnums.kernel_spatial_dimensions_size()); + CHECK_EQ(num_dimensions, dnums.output_spatial_dimensions_size()); + for (const WindowDimension& dim : window.dimensions()) { + CHECK_EQ(dim.padding_low(), dim.padding_high()); + } + + // cuDNN's convolution APIs support the BDYX layout for activations/output and + // the OIYX layout for weights. + BatchDescriptor input_descriptor(effective_num_dimensions); + input_descriptor.set_layout(DataLayout::kBatchDepthYX) + .set_feature_map_count( + input_shape.dimensions(dnums.input_feature_dimension())) + .set_count(input_shape.dimensions(dnums.input_batch_dimension())); + for (int dim = 0; dim < num_dimensions; ++dim) { + // Note that the dimensions are reversed. The same holds below. + input_descriptor.set_spatial_dim( + static_cast(effective_num_dimensions - dim - 1), + input_shape.dimensions(dnums.input_spatial_dimensions(dim))); + } + + FilterDescriptor filter_descriptor(effective_num_dimensions); + filter_descriptor.set_layout(FilterLayout::kOutputInputYX) + .set_input_feature_map_count( + filter_shape.dimensions(dnums.kernel_input_feature_dimension())) + .set_output_feature_map_count( + filter_shape.dimensions(dnums.kernel_output_feature_dimension())); + for (int dim = 0; dim < num_dimensions; ++dim) { + filter_descriptor.set_spatial_dim( + static_cast(effective_num_dimensions - dim - 1), + filter_shape.dimensions(dnums.kernel_spatial_dimensions(dim))); + } + + ConvolutionDescriptor convolution_descriptor(effective_num_dimensions); + for (int dim = 0; dim < num_dimensions; ++dim) { + convolution_descriptor + .set_zero_padding( + static_cast(effective_num_dimensions - dim - 1), + window.dimensions(dim).padding_low()) + .set_filter_stride( + static_cast(effective_num_dimensions - dim - 1), + window.dimensions(dim).stride()); + } + + BatchDescriptor output_descriptor(effective_num_dimensions); + output_descriptor.set_layout(DataLayout::kBatchDepthYX) + .set_feature_map_count( + output_shape.dimensions(dnums.output_feature_dimension())) + .set_count(output_shape.dimensions(dnums.output_batch_dimension())); + for (int dim = 0; dim < num_dimensions; ++dim) { + output_descriptor.set_spatial_dim( + static_cast(effective_num_dimensions - dim - 1), + output_shape.dimensions(dnums.output_spatial_dimensions(dim))); + } + + // Add a singleton dimension in the 1D convolution case. 
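+  // For example (sizes illustrative), a 1D convolution over a width-100 input
+  // with a width-3 filter is issued to cudnn as a 2D convolution over a 1x100
+  // input with a 1x3 filter, using stride 1 and zero padding in the singleton
+  // dimension.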
+ if (num_dimensions == 1) { + input_descriptor.set_spatial_dim(static_cast(0), 1); + output_descriptor.set_spatial_dim(static_cast(0), 1); + filter_descriptor.set_spatial_dim(static_cast(0), 1); + convolution_descriptor.set_zero_padding(static_cast(0), 0) + .set_filter_stride(static_cast(0), 1); + } + + switch (kind) { + case CudnnConvKind::kForward: + stream->ThenConvolveWithAlgorithm( + input_descriptor, input_buf, filter_descriptor, filter_buf, + convolution_descriptor, output_descriptor, &output_buf, + scratch_allocator, algorithm, profile_result); + break; + case CudnnConvKind::kBackwardInput: + stream->ThenConvolveBackwardDataWithAlgorithm( + filter_descriptor, filter_buf, output_descriptor, output_buf, + convolution_descriptor, input_descriptor, &input_buf, + scratch_allocator, algorithm, profile_result); + break; + case CudnnConvKind::kBackwardFilter: + stream->ThenConvolveBackwardFilterWithAlgorithm( + input_descriptor, input_buf, output_descriptor, output_buf, + convolution_descriptor, filter_descriptor, &filter_buf, + scratch_allocator, algorithm, profile_result); + break; + } + + if (!stream->ok()) { + return InternalError( + "Unable to launch convolution with type %s and algorithm (%lld, %lld)", + CudnnConvKindToString(kind).c_str(), algorithm.algorithm().algo_id(), + algorithm.algorithm_no_scratch().algo_id()); + } + return Status::OK(); +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h new file mode 100644 index 00000000000..b101f76510c --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h @@ -0,0 +1,97 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_RUNNER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_RUNNER_H_ + +#include "tensorflow/compiler/xla/status.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" + +namespace xla { +namespace gpu { + +// This file contains low-level routines for running cudnn convolutions. + +// Different types of convolutions supported by cudnn. +// +// A way to think about these is that a convolution is defined by three arrays +// -- the "input", the "filter", and the "output" -- and given any two of these, +// we can compute the third. For example, a backward-input convolution takes as +// input a filter and an "output" and produces an "input" such that if one were +// to do a forward convolution of "input" using filter, the result would be +// something with the same shape as "output". +// +// This way of thinking is not correct if you look at the values produced. 
For +// example, a backward-input convolution is not actually the mathematical +// inverse of a forward convolution. But it's right as far as the shapes and +// "connectivity" (i.e. which elements of the input affect which elements of +// the output) are concerned. +enum class CudnnConvKind { + kForward, // input + filter => output + kBackwardInput, // filter + output => input + kBackwardFilter, // input + output => filter +}; + +// Converts a CudnnConvKind value to a string. +string CudnnConvKindToString(CudnnConvKind kind); + +// Calls into cudnn to run the specified convolution. +// +// Note that depending on the value of CudnnConvKind, the result of this call +// may be written into input_buf, filter_buf, or output_buf! +// +// At the moment we only support cudnn convolutions over floats. +// +// We provide one overload which takes a scratch buffer, and another which takes +// an allocator which is responsible for allocating the scratch space. In +// theory the second one shouldn't be necessary -- users of this function could +// just ask cudnn how much scratch space it needs for a particular convolution. +// But in practice, StreamExecutor does not expose such an API, and in the name +// of parsimony, perhaps it's better not to add it. Instead, the first time you +// call a convolution, you should call the version that takes a scratch +// allocator and take note of how much memory is used. The next time you call +// the same conv, you can provide an explicitly preallocated scratch buffer of +// that size, if you like. +Status RunCudnnConvolution( + CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape, + const Shape& output_shape, + perftools::gputools::DeviceMemory input_buf, + perftools::gputools::DeviceMemory filter_buf, + perftools::gputools::DeviceMemory output_buf, + perftools::gputools::DeviceMemoryBase scratch_buf, const Window& window, + const ConvolutionDimensionNumbers& dnums, + perftools::gputools::dnn::AlgorithmConfig algorithm, + perftools::gputools::Stream* stream, + perftools::gputools::dnn::ProfileResult* profile_result = nullptr); + +Status RunCudnnConvolution( + CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape, + const Shape& output_shape, + perftools::gputools::DeviceMemory input_buf, + perftools::gputools::DeviceMemory filter_buf, + perftools::gputools::DeviceMemory output_buf, + perftools::gputools::ScratchAllocator* scratch_allocator, + const Window& window, const ConvolutionDimensionNumbers& dnums, + perftools::gputools::dnn::AlgorithmConfig algorithm, + perftools::gputools::Stream* stream, + perftools::gputools::dnn::ProfileResult* profile_result = nullptr); + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_RUNNER_H_ diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index 0cca3ca0926..12ec266ff38 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -35,8 +35,9 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/call_inliner.h" #include "tensorflow/compiler/xla/service/dot_decomposer.h" #include "tensorflow/compiler/xla/service/flatten_call_graph.h" -#include "tensorflow/compiler/xla/service/gpu/convolution_folding.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h" #include "tensorflow/compiler/xla/service/gpu/fusion_merger.h" #include "tensorflow/compiler/xla/service/gpu/gpu_constants.h" #include "tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h" @@ -127,7 +128,9 @@ string GetLibdeviceDir(const string& config_cuda_data_dir) { } // Runs optimization passes on the given HLO module. -tensorflow::Status OptimizeHloModule(HloModule* hlo_module) { +tensorflow::Status OptimizeHloModule(HloModule* hlo_module, + se::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) { { HloPassPipeline pipeline("optimization"); pipeline.AddInvariantChecker(); @@ -143,6 +146,7 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module) { // most ops. pipeline.AddPass(BF16, F32); pipeline.AddPass(); + { auto& pass = pipeline.AddPass>("simplification"); @@ -173,7 +177,7 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module) { pass.AddPass(); pass.AddPass(); } - pipeline.AddPass(); + pipeline.AddPass( [](const HloInstruction& dot, const TransposeFolding::OperandIndices& candidate_operands) { @@ -185,6 +189,58 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module) { pipeline.AddPass(); TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); } + + { + // Convert convolutions into CustomCalls to cudnn, then canonicalize them + // (PadInsertion). + HloPassPipeline pipeline("conv_canonicalization"); + pipeline.AddInvariantChecker(); + pipeline.AddPass(); + pipeline.AddPass(); + + // Choose the fastest algorithm for each conv. + // + // In theory doing this here is way too early: It needs to happen after + // layout assignment, because the layout of the inputs/outputs affects the + // speed of the conv. But currently we only allow only one input/output + // layout when calling cudnn, so there's no ambiguity. + // + // We pick the algorithm at this early stage so we can generate better HLO. + // After CudnnConvolutionRewriter, our convolutions are CustomCalls which + // return a tuple (conv_result, scratch_memory), and the each conv uses 0 + // bytes of scratch: + // + // customcall = (f32[...], f32[0]) + // return gte(customcall, 0) + // + // The algorithm picker then chooses the best algorithm, and potentially + // increases the scratch space. It replaces customcall with new_tuple, + // giving us the following: + // + // new_customcall = (f32[...], f32[N]) + // new_tuple = tuple(gte(new_customcall, 0), constant f32[0]) + // return gte(new_tuple, 0) + // + // The new tuple and gte instructions then be simplified away, because + // nobody is expected to use the scratch value. + // + // However, if we were to run CudnnConvolutionAlgorithmPicker after layout + // assignment, fusion would already have run, and the gte(customcall, 0) + // would probably already be into a fusion node. We can't simplify across + // HloComputation boundaries, so in this case we wouldn't be able to + // simplify away the new_tuple bits. + // + // We'll need to revisit this if we ever allow multiple layouts for the + // inputs/outputs of a cudnn convolution. 
+ pipeline.AddPass(stream_exec, + device_allocator); + // Clean up new_tuple described above. + pipeline.AddPass(); + pipeline.AddPass(); + + TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); + } + { HloPassFix fusion("fusion"); fusion.AddInvariantChecker(); @@ -220,9 +276,10 @@ tensorflow::Status PrepareHloModuleForIrEmitting(HloModule* hlo_module) { // the parameter. HloPassPipeline pipeline("GPU-ir-emit-prepare"); pipeline.AddInvariantChecker(); - pipeline.AddPass(); + pipeline.AddPass( hlo_module->mutable_entry_computation_layout()); + // The LayoutAssignment pass may leave behind kCopy instructions which are // duplicate or NOPs, so remove them with algebraic simplification and CSE. pipeline.AddPass>( @@ -410,16 +467,19 @@ GpuCompiler::GpuCompiler() .getPointerSize(0 /* default address space */)) {} StatusOr> GpuCompiler::RunHloPasses( - std::unique_ptr module, se::StreamExecutor* /*stream_exec*/) { + std::unique_ptr module, se::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) { XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunHloPasses"); Tracing::TraceMe annotation("HLO Transforms", module->name(), /*is_expensive=*/true); - TF_RETURN_IF_ERROR(OptimizeHloModule(module.get())); + TF_RETURN_IF_ERROR( + OptimizeHloModule(module.get(), stream_exec, device_allocator)); return std::move(module); } StatusOr> GpuCompiler::RunBackend( - std::unique_ptr module, se::StreamExecutor* stream_exec) { + std::unique_ptr module, se::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) { XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend"); TF_RET_CHECK(stream_exec != nullptr); @@ -459,16 +519,17 @@ StatusOr> GpuCompiler::RunBackend( /*color_alignment=*/[](LogicalBuffer::Color) { return kCudaMallocAlignBytes; })); - // BufferAssignment::ToString() includes a header, so no need for us to - // print one ourselves. + // BufferAssignment::Stats::ToString() and BufferAssignment::ToString() + // include headers, so no need for us to print them ourselves. 
+ XLA_VLOG_LINES(1, buffer_assignment->GetStats().ToString()); XLA_VLOG_LINES(2, buffer_assignment->ToString()); XLA_VLOG_LINES(2, module->ToString()); - const string xla_dump_hlo_proto_to = - module->config().debug_options().xla_dump_hlo_proto_to(); - if (!xla_dump_hlo_proto_to.empty()) { + const string xla_dump_optimized_hlo_proto_to = + module->config().debug_options().xla_dump_optimized_hlo_proto_to(); + if (!xla_dump_optimized_hlo_proto_to.empty()) { HloProto proto = MakeHloProto(*module, *buffer_assignment); TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory( - proto, xla_dump_hlo_proto_to, module->name())); + proto, xla_dump_optimized_hlo_proto_to, module->name())); } IrEmitterContext ir_emitter_context(module.get(), buffer_assignment.get(), diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h index 18e34340205..c352d4d8462 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h @@ -51,11 +51,13 @@ class GpuCompiler : public LLVMCompiler { StatusOr> RunHloPasses( std::unique_ptr module, - perftools::gputools::StreamExecutor* stream_exec) override; + perftools::gputools::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) override; StatusOr> RunBackend( std::unique_ptr module, - perftools::gputools::StreamExecutor* stream_exec) override; + perftools::gputools::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) override; StatusOr>> CompileAheadOfTime(std::vector> module, diff --git a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc index e3b493c6630..88bf5a74fa0 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc @@ -78,6 +78,12 @@ StatusOr GpuCopyInsertion::Run(HloModule* module) { for (int64 i = 0; i < hlo->operand_count() - 2; ++i) { TF_RETURN_IF_ERROR(copy_operand_if_constant(i)); } + } else if (IsCustomCallToDnnConvolution(*hlo)) { + // The last argument to a CUDNN convolution is its algorithm, which must + // be an HLO constant -- it shouldn't be copied. + for (int64 i = 0; i < hlo->operand_count() - 1; ++i) { + TF_RETURN_IF_ERROR(copy_operand_if_constant(i)); + } } else if (ImplementedAsLibraryCall(*hlo)) { // For all other library calls, materialize all the operands into memory. for (int64 i = 0; i < hlo->operand_count(); ++i) { diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc index 58915f1f62f..89f1e625884 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc @@ -28,122 +28,114 @@ limitations under the License. namespace xla { namespace gpu { +// cuDNN convolutions are called with specific layouts on the input, output, +// and filter: +// +// input: DataLayout::kBatchDepthYX +// output: DataLayout::kBatchDepthYX +// filter: FilterLayout::kOutputInputYX +// +// The order dimensions in the constant name is major-to-minor (eg, the +// most-major dimension of the input is batch, most-minor is X). The +// specific dimension numbers these named dimensions correspond to is +// determined by the ConvolutionDimensionNumbers argument. Y is spatial +// dimension 0, and X is spatial dimension 1. +// +// TODO(b/29399649): Be more flexible about handling layouts of cuDNN calls. 
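+//
+// For example (dimension numbers illustrative): with NHWC dimension numbers
+// -- batch = 0, spatial = {1, 2}, feature = 3 -- the constraint-building code
+// below produces the minor-to-major order {2, 1, 3, 0} for the input, i.e. the
+// data is laid out physically as NCHW, which is cuDNN's kBatchDepthYX.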
+static Status AddBackendConstraintsToDnnConvCustomCall( + HloInstruction* instr, LayoutConstraints* constraints) { + CHECK(IsCustomCallToDnnConvolution(*instr)) << instr->ToString(); + Shape input_shape; + Shape filter_shape; + Shape output_shape; + const auto& target = instr->custom_call_target(); + if (target == kCudnnConvForwardCallTarget) { + input_shape = instr->operand(0)->shape(); + filter_shape = instr->operand(1)->shape(); + output_shape = instr->shape().tuple_shapes(0); + } else if (target == kCudnnConvBackwardInputCallTarget) { + input_shape = instr->shape().tuple_shapes(0); + filter_shape = instr->operand(1)->shape(); + output_shape = instr->operand(0)->shape(); + } else if (target == kCudnnConvBackwardFilterCallTarget) { + input_shape = instr->operand(0)->shape(); + filter_shape = instr->shape().tuple_shapes(0); + output_shape = instr->operand(1)->shape(); + } else { + LOG(FATAL) << "Unexpected custom call target: " + << instr->custom_call_target(); + } + + // Construct minor-to-major dimension orders for operands and result. + // cuDNN's convolution APIs support the BDYX layout for activations/output + // and the OIYX layout for weights. + // TODO(b/29399649): Be more flexible about handling layouts of cuDNN + // calls after we switch to cuDNN v5. + const ConvolutionDimensionNumbers& dimension_numbers = + instr->convolution_dimension_numbers(); + std::vector input_layout; + for (int i = dimension_numbers.input_spatial_dimensions_size() - 1; i >= 0; + --i) { + input_layout.push_back(dimension_numbers.input_spatial_dimensions(i)); + } + input_layout.push_back(dimension_numbers.input_feature_dimension()); + input_layout.push_back(dimension_numbers.input_batch_dimension()); + *input_shape.mutable_layout() = LayoutUtil::MakeLayout(input_layout); + + std::vector filter_layout; + for (int i = dimension_numbers.kernel_spatial_dimensions_size() - 1; i >= 0; + --i) { + filter_layout.push_back(dimension_numbers.kernel_spatial_dimensions(i)); + } + filter_layout.push_back(dimension_numbers.kernel_input_feature_dimension()); + filter_layout.push_back(dimension_numbers.kernel_output_feature_dimension()); + *filter_shape.mutable_layout() = LayoutUtil::MakeLayout(filter_layout); + + std::vector output_layout; + for (int i = dimension_numbers.output_spatial_dimensions_size() - 1; i >= 0; + --i) { + output_layout.push_back(dimension_numbers.output_spatial_dimensions(i)); + } + output_layout.push_back(dimension_numbers.output_feature_dimension()); + output_layout.push_back(dimension_numbers.output_batch_dimension()); + *output_shape.mutable_layout() = LayoutUtil::MakeLayout(output_layout); + + // The custom call returns a tuple of (actual_result, scratch_buffer); + // call_result_buf is the logical buffer for actual_result, the thing that + // contains the result of the conv call. + TF_ASSIGN_OR_RETURN(const LogicalBuffer* call_result_buf, + constraints->points_to_analysis().GetBufferDefinedAt( + instr, /*index=*/{0})); + + // Set layouts of the instructions' shapes. 
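+  // Which of {input, filter, output} is constrained as the call's result
+  // buffer (rather than as an operand) depends on the convolution kind,
+  // mirroring the shape assignments at the top of this function.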
+ if (target == kCudnnConvForwardCallTarget) { + TF_RETURN_IF_ERROR(constraints->SetOperandLayout(input_shape, instr, 0)); + TF_RETURN_IF_ERROR(constraints->SetOperandLayout(filter_shape, instr, 1)); + TF_RETURN_IF_ERROR( + constraints->SetBufferLayout(output_shape.layout(), *call_result_buf)); + } else if (target == kCudnnConvBackwardInputCallTarget) { + TF_RETURN_IF_ERROR(constraints->SetOperandLayout(output_shape, instr, 0)); + TF_RETURN_IF_ERROR(constraints->SetOperandLayout(filter_shape, instr, 1)); + TF_RETURN_IF_ERROR( + constraints->SetBufferLayout(input_shape.layout(), *call_result_buf)); + } else if (target == kCudnnConvBackwardFilterCallTarget) { + TF_RETURN_IF_ERROR(constraints->SetOperandLayout(input_shape, instr, 0)); + TF_RETURN_IF_ERROR(constraints->SetOperandLayout(output_shape, instr, 1)); + TF_RETURN_IF_ERROR( + constraints->SetBufferLayout(filter_shape.layout(), *call_result_buf)); + } else { + LOG(FATAL) << "Unexpected custom call target: " + << instr->custom_call_target(); + } + return Status::OK(); +} + Status GpuLayoutAssignment::AddBackendConstraints( LayoutConstraints* constraints) { for (auto* instruction : constraints->computation()->instructions()) { - // cuDNN is called with specific layouts on the input, output, and filter: - // - // input: DataLayout::kBatchDepthYX - // output: DataLayout::kBatchDepthYX - // filter: FilterLayout::kOutputInputYX - // - // The order dimensions in the constant name is major-to-minor (eg, the - // most-major dimension of the input is batch, most-minor is X). The - // specific dimension numbers these named dimensions correspond to is - // determined by the ConvolutionDimensionNumbers argument. Y is spatial - // dimension 0, and X is spatial dimension 1. - // - // TODO(b/29399649): Be more flexible about handling layouts of cuDNN calls. - if (ImplementedAsDnnConvolution(*instruction)) { - HloInstruction* input = nullptr; - HloInstruction* filter = nullptr; - HloInstruction* output = nullptr; - if (instruction->opcode() == HloOpcode::kConvolution) { - input = instruction->mutable_operand(0); - filter = instruction->mutable_operand(1); - output = instruction; - } else { - CHECK_EQ(HloOpcode::kFusion, instruction->opcode()); - switch (instruction->fusion_kind()) { - case HloInstruction::FusionKind::kConvBackwardFilter: - // filter = BackwardFilterConvolve(input, output) - input = instruction->mutable_operand(0); - filter = instruction; - output = instruction->mutable_operand(1); - break; - case HloInstruction::FusionKind::kConvBackwardInput: - // input = BackwardInputConvolve(output, filter) - input = instruction; - filter = instruction->mutable_operand(1); - output = instruction->mutable_operand(0); - break; - default: - LOG(FATAL) << "Not a convolution-fusion"; - } - } - - // Construct minor-to-major dimension orders for operands and result. - // cuDNN's convolution APIs support the BDYX layout for activations/output - // and the OIYX layout for weights. - // TODO(b/29399649): Be more flexible about handling layouts of cuDNN - // calls after we switch to cuDNN v5. 
- const ConvolutionDimensionNumbers& dimension_numbers = - instruction->convolution_dimension_numbers(); - std::vector input_layout; - for (int i = dimension_numbers.input_spatial_dimensions_size() - 1; - i >= 0; --i) { - input_layout.push_back(dimension_numbers.input_spatial_dimensions(i)); - } - input_layout.push_back(dimension_numbers.input_feature_dimension()); - input_layout.push_back(dimension_numbers.input_batch_dimension()); - Shape input_shape(input->shape()); - *input_shape.mutable_layout() = LayoutUtil::MakeLayout(input_layout); - - std::vector filter_layout; - for (int i = dimension_numbers.kernel_spatial_dimensions_size() - 1; - i >= 0; --i) { - filter_layout.push_back(dimension_numbers.kernel_spatial_dimensions(i)); - } - filter_layout.push_back( - dimension_numbers.kernel_input_feature_dimension()); - filter_layout.push_back( - dimension_numbers.kernel_output_feature_dimension()); - Shape filter_shape(filter->shape()); - *filter_shape.mutable_layout() = LayoutUtil::MakeLayout(filter_layout); - - std::vector output_layout; - for (int i = dimension_numbers.output_spatial_dimensions_size() - 1; - i >= 0; --i) { - output_layout.push_back(dimension_numbers.output_spatial_dimensions(i)); - } - output_layout.push_back(dimension_numbers.output_feature_dimension()); - output_layout.push_back(dimension_numbers.output_batch_dimension()); - Shape output_shape(output->shape()); - *output_shape.mutable_layout() = LayoutUtil::MakeLayout(output_layout); - - // Set layouts of the instructions' shapes. - if (instruction->opcode() == HloOpcode::kConvolution) { - TF_RETURN_IF_ERROR( - constraints->SetOperandLayout(input_shape, output, 0)); - TF_RETURN_IF_ERROR( - constraints->SetOperandLayout(filter_shape, output, 1)); - TF_RETURN_IF_ERROR( - constraints->SetInstructionLayout(output_shape, output)); - } else { - CHECK_EQ(HloOpcode::kFusion, instruction->opcode()); - switch (instruction->fusion_kind()) { - case HloInstruction::FusionKind::kConvBackwardFilter: - // filter = BackwardFilterConvolve(input, output) - TF_RETURN_IF_ERROR( - constraints->SetOperandLayout(input_shape, filter, 0)); - TF_RETURN_IF_ERROR( - constraints->SetInstructionLayout(filter_shape, filter)); - TF_RETURN_IF_ERROR( - constraints->SetOperandLayout(output_shape, filter, 1)); - break; - case HloInstruction::FusionKind::kConvBackwardInput: - // input = BackwardInputConvolve(output, filter) - TF_RETURN_IF_ERROR( - constraints->SetInstructionLayout(input_shape, input)); - TF_RETURN_IF_ERROR( - constraints->SetOperandLayout(output_shape, input, 0)); - TF_RETURN_IF_ERROR( - constraints->SetOperandLayout(filter_shape, input, 1)); - break; - default: - LOG(FATAL) << "Not a convolution-fusion"; - } - } + if (IsCustomCallToDnnConvolution(*instruction)) { + TF_RETURN_IF_ERROR( + AddBackendConstraintsToDnnConvCustomCall(instruction, constraints)); } } return Status::OK(); @@ -151,9 +143,12 @@ Status GpuLayoutAssignment::AddBackendConstraints( bool GpuLayoutAssignment::CustomCallRequiresMajorFirstLayout( const HloInstruction* instruction) { - // Inputs to cudnn batchnorm custom calls don't need the major-first layout - // (i.e. {n, n-1, ...0}) -- we can handle any layout. - return !IsCustomCallToDnnBatchNorm(*instruction); + // - Inputs to cudnn batchnorm custom calls don't need the major-first layout + // (i.e. {n, n-1, ...0}) -- we can handle any layout. + // - Inputs to cudnn convolution require custom layouts handled in + // AddBackendConstraints. 
+ return !IsCustomCallToDnnBatchNorm(*instruction) && + !IsCustomCallToDnnConvolution(*instruction); } Status GpuLayoutAssignment::PropagateOperandConstraint( diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc index c2115c49993..dd4426ca7b9 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc +++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc @@ -191,7 +191,11 @@ static bool BuffersInvariantWithinConsumer( llvm_ir::IrArray HloToIrBindings::GetIrArray(const HloInstruction& hlo, const HloInstruction& consumer, const ShapeIndex& shape_index) { - llvm_ir::IrArray ir_array(GetBasePointer(hlo, shape_index), + llvm::Value* base_ptr = GetBasePointer(hlo, shape_index); + CHECK_NE(base_ptr, nullptr) + << "Buffer not assigned for shape_index " << shape_index.ToString() + << " of " << hlo.ToString(); + llvm_ir::IrArray ir_array(base_ptr, ShapeUtil::GetSubshape(hlo.shape(), shape_index)); alias_analysis_.AddAliasingInformationToIrArray(hlo, &ir_array); diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc index 1d47ffde433..2d6dad27a59 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc @@ -137,49 +137,6 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfDotUnfused) { .ValueOrDie()); } -TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfConvolutionUnfused) { - HloComputation::Builder builder(TestName()); - auto input = builder.AddInstruction(HloInstruction::CreateParameter( - 0, ShapeUtil::MakeShape(F32, {1, 1, 1, 3}), "input")); - auto filter = builder.AddInstruction(HloInstruction::CreateParameter( - 1, ShapeUtil::MakeShape(F32, {1, 1, 1, 2}), "filter")); - - Window conv_window; - WindowDimension* conv_window_row = conv_window.add_dimensions(); - conv_window_row->set_size(1); - WindowDimension* conv_window_col = conv_window.add_dimensions(); - conv_window_col->set_size(2); - conv_window_col->set_padding_high(1); - - ConvolutionDimensionNumbers conv_dnums; - conv_dnums.set_input_batch_dimension(0); - conv_dnums.set_output_batch_dimension(0); - conv_dnums.set_input_feature_dimension(1); - conv_dnums.set_output_feature_dimension(1); - conv_dnums.add_input_spatial_dimensions(2); - conv_dnums.add_output_spatial_dimensions(2); - conv_dnums.add_input_spatial_dimensions(3); - conv_dnums.add_output_spatial_dimensions(3); - conv_dnums.set_kernel_output_feature_dimension(0); - conv_dnums.set_kernel_input_feature_dimension(1); - conv_dnums.add_kernel_spatial_dimensions(2); - conv_dnums.add_kernel_spatial_dimensions(3); - - auto conv = builder.AddInstruction( - HloInstruction::CreateConvolve(ShapeUtil::MakeShape(F32, {1, 1, 1, 3}), - input, filter, conv_window, conv_dnums)); - auto transpose = builder.AddInstruction(HloInstruction::CreateTranspose( - ShapeUtil::MakeShape(F32, {3, 1, 1, 1}), conv, {3, 2, 1, 0})); - builder.AddInstruction( - HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {3}), transpose)); - - auto module = CreateNewModule(); - module->AddEntryComputation(builder.Build()); - EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true) - .Run(module.get()) - .ValueOrDie()); -} - TEST_F(InstructionFusionTest, GetTupleElementFused) { HloComputation::Builder builder(TestName()); Shape data_shape = ShapeUtil::MakeShape(F32, {8}); diff --git 
a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc index 76566a9e3db..2f65edffea8 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc @@ -90,43 +90,6 @@ bool ImplementedAsGemm(const HloInstruction& hlo) { return false; } -bool ImplementedAsDnnConvolution(const HloInstruction& hlo) { - // We can only do this if the HLO is unnested. - if (hlo.parent() != hlo.GetModule()->entry_computation()) { - return false; - } - - // Forward convolution. - if (hlo.opcode() == HloOpcode::kConvolution) { - const ConvolutionDimensionNumbers& dnums = - hlo.convolution_dimension_numbers(); - if (dnums.input_spatial_dimensions_size() > 3) { - return false; - } - - // CuDNN does not accept zero-element arguments - if (ShapeUtil::HasZeroElements(hlo.operand(0)->shape()) || - ShapeUtil::HasZeroElements(hlo.operand(1)->shape())) { - return false; - } - - if (window_util::HasWindowReversal(hlo.window())) { - return false; - } - - return true; - } - - // Backward convolution. - if (hlo.opcode() == HloOpcode::kFusion && - (hlo.fusion_kind() == HloInstruction::FusionKind::kConvBackwardFilter || - hlo.fusion_kind() == HloInstruction::FusionKind::kConvBackwardInput)) { - return true; - } - - return false; -} - const char* const kCudnnBatchNormForwardInferenceCallTarget = "__cudnn$batchNormalizationForwardInference"; const char* const kCudnnBatchNormForwardTrainingCallTarget = @@ -144,9 +107,76 @@ bool IsCustomCallToDnnBatchNorm(const HloInstruction& hlo) { target == kCudnnBatchNormBackwardCallTarget; } +const char* const kCudnnConvForwardCallTarget = "__cudnn$convForward"; +const char* const kCudnnConvBackwardInputCallTarget = + "__cudnn$convBackwardInput"; +const char* const kCudnnConvBackwardFilterCallTarget = + "__cudnn$convBackwardFilter"; + +bool IsCustomCallToDnnConvolution(const HloInstruction& hlo) { + if (hlo.opcode() != HloOpcode::kCustomCall) { + return false; + } + const auto& target = hlo.custom_call_target(); + return target == kCudnnConvForwardCallTarget || + target == kCudnnConvBackwardInputCallTarget || + target == kCudnnConvBackwardFilterCallTarget; +} + bool ImplementedAsLibraryCall(const HloInstruction& hlo) { - return ImplementedAsGemm(hlo) || ImplementedAsDnnConvolution(hlo) || - IsCustomCallToDnnBatchNorm(hlo); + return ImplementedAsGemm(hlo) || IsCustomCallToDnnBatchNorm(hlo) || + IsCustomCallToDnnConvolution(hlo); +} + +static HloInstruction* CreateCudnnConv( + const char* call_target, const Shape& shape, HloInstruction* lhs, + HloInstruction* rhs, const Window& window, + const ConvolutionDimensionNumbers& dnums) { + HloComputation* computation = lhs->parent(); + + // This call returns a tuple of (conv_result, scratch_memory), where + // conv_result is the actual result of the convolution, and scratch_memory is + // temporary memory used by cudnn. + // + // At the moment, we don't know how much scratch memory this conv is going to + // use, so we put u8[0] in this place. Later on another pass will choose + // which conv algorithm to use, and at that point we'll modify the shape of + // this second tuple element. + Shape call_shape = + ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U8, {0})}); + + // Our CustomCall takes three arguments: The conv lhs and rhs, and the cudnn + // algorithm to use. It's up to a later pass to choose the algorithm, so to + // indicate that we haven't yet made a choice, we specify -1 for that arg.
+ HloInstruction* negative_one = computation->AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(-1))); + HloInstruction* custom_call = + computation->AddInstruction(HloInstruction::CreateCustomCall( + call_shape, {lhs, rhs, negative_one}, call_target)); + custom_call->set_window(window); + custom_call->set_convolution_dimension_numbers(dnums); + return custom_call; +} + +HloInstruction* CreateCudnnConvForward( + const Shape& shape, HloInstruction* input, HloInstruction* kernel, + const Window& window, const ConvolutionDimensionNumbers& dnums) { + return CreateCudnnConv(kCudnnConvForwardCallTarget, shape, input, kernel, + window, dnums); +} + +HloInstruction* CreateCudnnConvBackwardInput( + const Shape& shape, HloInstruction* output, HloInstruction* reverse_filter, + const Window& window, const ConvolutionDimensionNumbers& dnums) { + return CreateCudnnConv(kCudnnConvBackwardInputCallTarget, shape, output, + reverse_filter, window, dnums); +} + +HloInstruction* CreateCudnnConvBackwardFilter( + const Shape& shape, HloInstruction* input, HloInstruction* output, + const Window& window, const ConvolutionDimensionNumbers& dnums) { + return CreateCudnnConv(kCudnnConvBackwardFilterCallTarget, shape, input, + output, window, dnums); } bool IsReductionToVector(const HloInstruction& reduce) { diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h index d24ed9879d0..7ad9680bfb4 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h @@ -22,6 +22,9 @@ limitations under the License. #include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +// TODO(jlebar): Move functions related to cublas/cudnn to a separate file; they +// don't belong in "ir_emission_utils". + namespace xla { namespace gpu { @@ -30,9 +33,6 @@ constexpr int64 kWarpSize = 32; // Returns true if `hlo` will be implemented as a call to BLAS gemm. bool ImplementedAsGemm(const HloInstruction& hlo); -// Returns true if `hlo` will be implemented as a call to cuDNN convolution. -bool ImplementedAsDnnConvolution(const HloInstruction& hlo); - // A call to cuDNN for batch normalization is represented as CustomCall HLO with // a call target equal to one of these strings. // @@ -58,6 +58,60 @@ extern const char* const kCudnnBatchNormBackwardCallTarget; // sequence of generic HLOs or to a cuDNN CustomCall. bool IsCustomCallToDnnBatchNorm(const HloInstruction& hlo); +// A call to cuDNN for convolution (forward, backward filter, or backward input) +// is represented as a CustomCall HLO with a call target equal to one of these +// strings. +// +// These CustomCalls have window() and convolution_dimension_numbers() set like +// regular convolution ops. They have the same LHS and RHS operands, plus one +// additional int64 operand, representing which cudnn algorithm to run. This +// operand must be an HLO constant. A value of -1 means that the implementation +// is free to choose the best algorithm it can. +// +// These calls output a tuple (conv_result, scratch_memory), where conv_result +// is the actual result of the convolution, and scratch_memory is temporary +// memory used by cudnn. Callers shouldn't inspect scratch_memory, as its value +// is not well-defined. +// +// CudnnConvolutionRewriter lowers kConvolution HLOs to these custom calls. +// When it does so, it chooses algorithm -1 and 0 bytes of scratch space. 
Later +// on in the pipeline, CudnnConvolutionAlgorithmChooser chooses an explicit +// algorithm for each conv and sets the amount of scratch space needed. +// +// (Representing the scratch memory as an output may seem strange at first, but +// it's quite sensible, from a certain point of view. The scratch buffer is a +// location in memory that the conv can write into, but which it can't legally +// read from, at least until it's written something first. But that's exactly +// the definition of an output buffer.) +extern const char* const kCudnnConvForwardCallTarget; +extern const char* const kCudnnConvBackwardInputCallTarget; +extern const char* const kCudnnConvBackwardFilterCallTarget; + +// Returns true if `hlo` will be implemented as a call to a cuDNN convolution +// routine. +// +// This returns true if `hlo` is a CustomCall HLO with a call target equal to +// one of the kCudnnConvFoo constants above, but returns *false* for HLOs with a +// kConvolution opcode. +bool IsCustomCallToDnnConvolution(const HloInstruction& hlo); + +// Creates a CustomCall for a cudnn forward/backward-input/backward-filter conv. +// Note that these CustomCalls return a tuple (conv_result, scratch_memory). If +// you want just the conv result, you'll need to get-tuple-element the value +// returned by this function. +// +// The created cudnn call will use the default cudnn algorithm and no scratch +// space. +HloInstruction* CreateCudnnConvForward( + const Shape& shape, HloInstruction* input, HloInstruction* kernel, + const Window& window, const ConvolutionDimensionNumbers& dnums); +HloInstruction* CreateCudnnConvBackwardInput( + const Shape& shape, HloInstruction* output, HloInstruction* reverse_filter, + const Window& window, const ConvolutionDimensionNumbers& dnums); +HloInstruction* CreateCudnnConvBackwardFilter( + const Shape& shape, HloInstruction* input, HloInstruction* output, + const Window& window, const ConvolutionDimensionNumbers& dnums); + // Returns true if `hlo` will be implemented as a library call, e.g. cuBLAS gemm // or cuDNN convolution. bool ImplementedAsLibraryCall(const HloInstruction& hlo); diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc index 23b72c3f71d..affd2ffa8ed 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc @@ -615,8 +615,7 @@ Status IrEmitter::HandleFft(HloInstruction* fft) { Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) { // TODO(b/33011107): Support cross replica sum on GPU. - return Unimplemented( - "Cross replica sum not implemented on GPU. See b/33011107."); + return Unimplemented("CrossReplicaSum is not implemented on GPU."); } Status IrEmitter::HandleParameter(HloInstruction* parameter) { @@ -710,11 +709,13 @@ Status IrEmitter::HandleCustomCall(HloInstruction*) { } Status IrEmitter::HandleInfeed(HloInstruction*) { - return Unimplemented("Infeed is not supported on GPU (b/30467474)."); + // TODO(b/30467474): Implement infeed on GPU. + return Unimplemented("Infeed is not supported on GPU."); } Status IrEmitter::HandleOutfeed(HloInstruction*) { - return Unimplemented("Outfeed is not supported on GPU (b/34359662)."); + // TODO(b/34359662): Implement outfeed on GPU. 
+ return Unimplemented("Outfeed is not supported on GPU."); } Status IrEmitter::HandleRng(HloInstruction* random) { diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h index 3aa178410f0..9031a838f92 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h @@ -336,9 +336,6 @@ class IrEmitterUnnested : public IrEmitter { // Thunk object. std::unique_ptr BuildKernelThunk(const HloInstruction* inst); - // Returns a ConvolutionThunk that calls DNN to implement `inst`. - std::unique_ptr BuildConvolutionThunk(const HloInstruction* inst); - // Returns a FftThunk that calls cuFFT to implement `inst`. std::unique_ptr BuildFftThunk(const HloInstruction* inst); diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index fc8783e753d..a4847f6ca99 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -32,6 +32,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h" #include "tensorflow/compiler/xla/service/gpu/copy_thunk.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h" #include "tensorflow/compiler/xla/service/gpu/fft_thunk.h" #include "tensorflow/compiler/xla/service/gpu/for_thunk.h" #include "tensorflow/compiler/xla/service/gpu/gemm_thunk.h" @@ -278,10 +279,6 @@ Status IrEmitterUnnested::HandleConditional(HloInstruction* conditional) { } Status IrEmitterUnnested::HandleConvolution(HloInstruction* convolution) { - if (ImplementedAsDnnConvolution(*convolution)) { - thunk_sequence_->emplace_back(BuildConvolutionThunk(convolution)); - return Status::OK(); - } thunk_sequence_->emplace_back(BuildKernelThunk(convolution)); return IrEmitter::HandleConvolution(convolution); } @@ -380,6 +377,71 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) { return Status::OK(); } + if (IsCustomCallToDnnConvolution(*custom_call)) { + const auto& assn = ir_emitter_context_->buffer_assignment(); + const auto& lhs_shape = custom_call->operand(0)->shape(); + const auto& rhs_shape = custom_call->operand(1)->shape(); + const auto& conv_result_shape = custom_call->shape().tuple_shapes(0); + auto lhs_slice = GetAllocationSlice(*custom_call->operand(0)); + auto rhs_slice = GetAllocationSlice(*custom_call->operand(1)); + auto tuple_result_slice = GetAllocationSlice(*custom_call); + auto conv_result_slice = assn.GetUniqueSlice(custom_call, {0}).ValueOrDie(); + auto scratch_slice = assn.GetUniqueSlice(custom_call, {1}).ValueOrDie(); + + const HloInstruction* algorithm_inst = custom_call->operand(2); + CHECK(algorithm_inst->IsConstant()) << algorithm_inst->ToString(); + int64 algorithm = algorithm_inst->literal().Get({}); + + const auto& target = custom_call->custom_call_target(); + std::unique_ptr thunk; + if (target == kCudnnConvForwardCallTarget) { + thunk = MakeUnique( + CudnnConvKind::kForward, + /*input_buffer=*/lhs_slice, + /*filter_buffer=*/rhs_slice, + /*output_buffer=*/conv_result_slice, + /*tuple_result_buffer=*/tuple_result_slice, + /*scratch_buffer=*/scratch_slice, + /*input_shape=*/lhs_shape, + /*filter_shape=*/rhs_shape, + /*output_shape=*/conv_result_shape, // + custom_call->window(), custom_call->convolution_dimension_numbers(), + algorithm, custom_call); + } else if 
(target == kCudnnConvBackwardInputCallTarget) { + thunk = MakeUnique( + CudnnConvKind::kBackwardInput, + /*input_buffer=*/conv_result_slice, + /*filter_buffer=*/rhs_slice, + /*output_buffer=*/lhs_slice, + /*tuple_result_buffer=*/tuple_result_slice, + /*scratch_buffer=*/scratch_slice, + /*input_shape=*/conv_result_shape, + /*filter_shape=*/rhs_shape, + /*output_shape=*/lhs_shape, // + custom_call->window(), custom_call->convolution_dimension_numbers(), + algorithm, custom_call); + } else if (target == kCudnnConvBackwardFilterCallTarget) { + thunk = MakeUnique( + CudnnConvKind::kBackwardFilter, + /*input_buffer=*/lhs_slice, + /*filter_buffer=*/conv_result_slice, + /*output_buffer=*/rhs_slice, + /*tuple_result_buffer=*/tuple_result_slice, + /*scratch_buffer=*/scratch_slice, + /*input_shape=*/lhs_shape, + /*filter_shape=*/conv_result_shape, + /*output_shape=*/rhs_shape, // + custom_call->window(), custom_call->convolution_dimension_numbers(), + algorithm, custom_call); + } else { + LOG(FATAL) << "Unexpected custom call target: " + << custom_call->custom_call_target(); + } + + thunk_sequence_->emplace_back(std::move(thunk)); + return Status::OK(); + } + return IrEmitter::HandleCustomCall(custom_call); } @@ -500,10 +562,6 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { thunk_sequence_->emplace_back(BuildGemmThunk(fusion)); return Status::OK(); } - if (ImplementedAsDnnConvolution(*fusion)) { - thunk_sequence_->emplace_back(BuildConvolutionThunk(fusion)); - return Status::OK(); - } thunk_sequence_->emplace_back(BuildKernelThunk(fusion)); return IrEmitter::HandleFusion(fusion); } @@ -1599,24 +1657,24 @@ Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) { } Status IrEmitterUnnested::HandleTuple(HloInstruction* tuple) { - tensorflow::gtl::ArraySlice operands(tuple->operands()); - bool all_tuple_elements_have_buffer = std::all_of( - operands.begin(), operands.end(), [this](HloInstruction* tuple_element) { + bool all_tuple_elements_have_buffer = + c_all_of(tuple->operands(), [&](HloInstruction* tuple_element) { return ir_emitter_context_->buffer_assignment().HasTopLevelAllocation( tuple_element); }); - // Tuples (especially output tuples) can take too many tuple elements, - // causing the kernel emitted exceeds the parameter space limit - // (b/31336476). As an optimization, if all tuple elements have a buffer, we - // collect their buffer addresses in a host array, and then copy that array - // to the tuple's buffer. + // Tuples (especially tuples that are the final result of a computation) can + // be so huge that if we were to emit a kernel that took each tuple element as + // a parameter, we would exceed the max allowable number of parameters to a + // GPU kernel, b/31336476. As an optimization, if all tuple elements have a + // buffer, we collect their buffer addresses in a host array, and then copy + // that array to the tuple's buffer. // // Some tuple elements (e.g. const or bitcast of const) might not have a - // buffer -- their contents are stored in code. In that case, we fall back - // to emitting kernels which have access to their buffer addresses in code. + // buffer -- their contents are stored in code. In that case, we fall back to + // emitting kernels which have access to their buffer addresses in code. 
if (all_tuple_elements_have_buffer) { std::vector tuple_element_buffers; - for (const HloInstruction* tuple_element : operands) { + for (const HloInstruction* tuple_element : tuple->operands()) { tuple_element_buffers.push_back(GetAllocationSlice(*tuple_element)); } thunk_sequence_->emplace_back(MakeUnique( @@ -1658,8 +1716,7 @@ Status IrEmitterUnnested::HandleSelectAndScatter( // TODO(b/31410564): Implement dilation rate for select-and-scatter. if (window_util::HasDilation(window)) { return Unimplemented( - "Dilation for select-and-scatter not implemented on GPU. " - "See b/31410564."); + "Dilation for SelectAndScatter not implemented on GPU."); } // kSelectAndScatter is implemented as two kernel launches: the first launch @@ -2012,52 +2069,6 @@ std::unique_ptr IrEmitterUnnested::BuildGemmThunk( LOG(FATAL) << "Cannot build a GemmThunk for " << inst->ToString(); } -std::unique_ptr IrEmitterUnnested::BuildConvolutionThunk( - const HloInstruction* inst) { - const HloInstruction* lhs = inst->operand(0); - const HloInstruction* rhs = inst->operand(1); - if (inst->opcode() == HloOpcode::kConvolution) { - // Forward covolution. - return MakeUnique( - ConvolutionThunk::ConvolutionKind::kForward, - /*input_buffer=*/GetAllocationSlice(*lhs), - /*filter_buffer=*/GetAllocationSlice(*rhs), - /*output_buffer=*/GetAllocationSlice(*inst), - /*input_shape=*/lhs->shape(), - /*filter_shape=*/rhs->shape(), - /*output_shape=*/inst->shape(), inst->window(), - inst->convolution_dimension_numbers(), inst); - } - - // Backward filter convolution, which takes the input (activations) and the - // gradients, and computes the filter. - CHECK_EQ(HloOpcode::kFusion, inst->opcode()); - switch (inst->fusion_kind()) { - case HloInstruction::FusionKind::kConvBackwardFilter: - return MakeUnique( - ConvolutionThunk::ConvolutionKind::kBackwardFilter, - /*input_buffer=*/GetAllocationSlice(*lhs), - /*filter_buffer=*/GetAllocationSlice(*inst), - /*output_buffer=*/GetAllocationSlice(*rhs), - /*input_shape=*/lhs->shape(), - /*filter_shape=*/inst->shape(), - /*output_shape=*/rhs->shape(), inst->window(), - inst->convolution_dimension_numbers(), inst); - case HloInstruction::FusionKind::kConvBackwardInput: - return MakeUnique( - ConvolutionThunk::ConvolutionKind::kBackwardInput, - /*input_buffer=*/GetAllocationSlice(*inst), - /*filter_buffer=*/GetAllocationSlice(*rhs), - /*output_buffer=*/GetAllocationSlice(*lhs), - /*input_shape=*/inst->shape(), - /*filter_shape=*/rhs->shape(), - /*output_shape=*/lhs->shape(), inst->window(), - inst->convolution_dimension_numbers(), inst); - default: - LOG(FATAL) << "Not a convolution-fusion"; - } -} - std::unique_ptr IrEmitterUnnested::BuildFftThunk( const HloInstruction* inst) { const HloInstruction* operand = inst->operand(0); diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc index 2923a79af0a..25846dc6cd4 100644 --- a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc +++ b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc @@ -27,7 +27,7 @@ namespace gpu { namespace { bool IsForwardConvolutionCanonical(const HloInstruction& conv) { - CHECK_EQ(HloOpcode::kConvolution, conv.opcode()); + CHECK_EQ(conv.custom_call_target(), kCudnnConvForwardCallTarget); return window_util::HasSymmetricPadding(conv.window()) && !window_util::HasNegativePadding(conv.window()) && !window_util::HasDilation(conv.window()); @@ -47,6 +47,12 @@ HloInstruction* MaybePaddedAndSlicedInput( window_util::HasBaseDilation(conv_window)) { // If padding is 
uneven or has dilation, we insert a kPad instruction that // applies positive padding and dilation. + // + // TODO(phawkins): If conv_window has asymmetric padding, perhaps instead of + // moving all the padding into an explicit pad op, we should keep as much + // padding inside of cudnn as possible, on the assumption that padding + // within cudnn is basically free, whereas a kPad's cost increases as the + // amount of padding increases. PaddingConfig padding_config = MakeNoPaddingConfig(input->shape().dimensions_size()); for (size_t i = 0; i < conv_dnums.input_spatial_dimensions().size(); ++i) { @@ -167,14 +173,17 @@ bool PadInsertion::CanonicalizeForwardConvolution(HloInstruction* conv) { dim->set_window_dilation(1); } + // The conv CustomCall returns a tuple (conv_result, scratch_buffer). Extract + // out the shape of conv_result. + Shape old_conv_shape = conv->shape().tuple_shapes(0); + VLOG(1) << "Canonicalizing forward conv"; - auto new_conv = HloInstruction::CreateConvolve( - conv->shape(), new_input, new_kernel, new_conv_window, - conv->convolution_dimension_numbers()); + auto new_conv = CreateCudnnConvForward(old_conv_shape, new_input, new_kernel, + new_conv_window, + conv->convolution_dimension_numbers()); VLOG(1) << "Replacing:\n " << conv->ToString() << "\nwith:\n " << new_conv->ToString(); - TF_CHECK_OK( - conv->parent()->ReplaceWithNewInstruction(conv, std::move(new_conv))); + TF_CHECK_OK(conv->parent()->ReplaceInstruction(conv, new_conv)); return true; } @@ -190,6 +199,8 @@ void IncreasePaddingHighBy(int64 delta, WindowDimension* window_dim) { bool PadInsertion::CanonicalizeBackwardFilterConvolution( HloInstruction* backward_conv) { + CHECK_EQ(backward_conv->custom_call_target(), + kCudnnConvBackwardFilterCallTarget); if (window_util::HasSymmetricPadding(backward_conv->window())) { return false; } @@ -202,15 +213,11 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution( // ABCD0 = Pad(ABCD, padding_high=1) // BackwardFilterConv(ABCD0, xyz, padding_low=pading_high=1) // We choose the lesser of padding_low and padding_high as the new padding. - HloInstruction* forward_conv = backward_conv->fused_expression_root(); HloInstruction* input = backward_conv->mutable_operand(0); - Window new_forward_conv_window = forward_conv->window(); Window new_backward_conv_window = backward_conv->window(); // input_padding_config is the config of the kPad to be inserted. PaddingConfig input_padding_config = MakeNoPaddingConfig(ShapeUtil::Rank(input->shape())); - ConvolutionDimensionNumbers forward_conv_dnums = - forward_conv->convolution_dimension_numbers(); ConvolutionDimensionNumbers backward_conv_dnums = backward_conv->convolution_dimension_numbers(); for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) { @@ -222,11 +229,7 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution( // cuDNN convolution (which doesn't support negative padding) to fail. return false; } - // If the backward convolution has uneven padding on the activations, we - // move some padding on the larger end to "internal" padding, so that the - // backward convolution produces larger weight gradients which get sliced - // later. Therefore, the amount of new padding (low or high) is the minimum - // of the amount of old padding low and old padding high. + // Compute the new, even padding for the backward conv operation. 
int64 new_conv_padding = std::min(padding_low, padding_high); int64 dim = backward_conv_dnums.input_spatial_dimensions(i); input_padding_config.mutable_dimensions(dim)->set_edge_padding_low( @@ -237,14 +240,9 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution( // Since we move some padding from the backward convolution to the kPad, we // need to accordingly reduce the padding amount of the backward convolution // and its inner forward convolution. - IncreasePaddingLowBy(-(padding_low - new_conv_padding), - new_backward_conv_window.mutable_dimensions(i)); - IncreasePaddingHighBy(-(padding_high - new_conv_padding), - new_backward_conv_window.mutable_dimensions(i)); - IncreasePaddingLowBy(-(padding_low - new_conv_padding), - new_forward_conv_window.mutable_dimensions(i)); - IncreasePaddingHighBy(-(padding_high - new_conv_padding), - new_forward_conv_window.mutable_dimensions(i)); + auto* new_dim = new_backward_conv_window.mutable_dimensions(i); + new_dim->set_padding_low(new_conv_padding); + new_dim->set_padding_high(new_conv_padding); } // Create a new backward convolution replacing the old one. @@ -260,19 +258,12 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution( .ConsumeValueOrDie(), input, padding, input_padding_config)); - HloInstruction* new_forward_conv = - computation->AddInstruction(HloInstruction::CreateConvolve( - ShapeInference::InferConvolveShape( - padded_input->shape(), output->shape(), new_forward_conv_window, - forward_conv_dnums) - .ConsumeValueOrDie(), - padded_input, output, new_forward_conv_window, forward_conv_dnums)); - - // Fuse the new forward convolution to the new backward convolution. - HloInstruction* new_backward_conv = - computation->CreateFusionInstructionForBackwardConvolution( - {new_forward_conv}, HloInstruction::FusionKind::kConvBackwardFilter, - new_backward_conv_window, backward_conv_dnums); + // The shape of the backward_conv CustomCall is a tuple (conv_result, + // scratch_buffer). Extract out the shape of conv_result. + Shape backward_conv_shape = backward_conv->shape().tuple_shapes(0); + HloInstruction* new_backward_conv = CreateCudnnConvBackwardFilter( + backward_conv_shape, padded_input, output, new_backward_conv_window, + backward_conv_dnums); VLOG(1) << "Canonicalizing backward filter conv"; VLOG(1) << "Replacing:\n " << backward_conv->ToString() << "\nwith:\n " @@ -289,14 +280,15 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution( return false; } - HloInstruction* forward_conv = backward_conv->fused_expression_root(); - HloInstruction* reverse_filter = forward_conv->mutable_operand(1); - Window new_forward_conv_window = forward_conv->window(); Window new_backward_conv_window = backward_conv->window(); - ConvolutionDimensionNumbers forward_conv_dnums = - forward_conv->convolution_dimension_numbers(); ConvolutionDimensionNumbers backward_conv_dnums = backward_conv->convolution_dimension_numbers(); + + // The backward_conv CustomCall returns a tuple (conv_result, scratch_memory). + // Get the shape of conv_result. 
+ Shape backward_conv_shape = backward_conv->shape().tuple_shapes(0); + + Shape new_backward_conv_shape = backward_conv_shape; for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) { int64 padding_low = backward_conv->window().dimensions(i).padding_low(); int64 padding_high = backward_conv->window().dimensions(i).padding_high(); @@ -315,41 +307,38 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution( // where the amount of padding low is larger, we can canonicalize it to // [B A] = BackwardInputConvolve([a b], [x y z], padding=(low=1,high=1)) // [A] = Slice([B A]) - // For consistency, we need to increase the low padding of the inner - // convolution by 1 as well because the input is larger now. if (padding_low > padding_high) { IncreasePaddingLowBy(padding_high - padding_low, new_backward_conv_window.mutable_dimensions(i)); - IncreasePaddingLowBy(padding_low - padding_high, - new_forward_conv_window.mutable_dimensions(i)); } else if (padding_low < padding_high) { IncreasePaddingHighBy(padding_low - padding_high, new_backward_conv_window.mutable_dimensions(i)); - IncreasePaddingHighBy(padding_high - padding_low, - new_forward_conv_window.mutable_dimensions(i)); } + // Decreasing the padding by X *increases* the size of our output by X. + int64 dim = backward_conv_dnums.output_spatial_dimensions(i); + new_backward_conv_shape.set_dimensions( + dim, new_backward_conv_shape.dimensions(dim) + + std::abs(padding_low - padding_high)); } // Create a new backward convolution replacing the old one. HloComputation* computation = backward_conv->parent(); HloInstruction* output = backward_conv->mutable_operand(0); HloInstruction* filter = backward_conv->mutable_operand(1); - HloInstruction* new_reverse_filter = - computation->AddInstruction(HloInstruction::CreateReverse( - filter->shape(), filter, reverse_filter->dimensions())); - HloInstruction* new_forward_conv = - computation->AddInstruction(HloInstruction::CreateConvolve( - ShapeInference::InferConvolveShape( - output->shape(), new_reverse_filter->shape(), - new_forward_conv_window, forward_conv_dnums) - .ConsumeValueOrDie(), - output, new_reverse_filter, new_forward_conv_window, - forward_conv_dnums)); + + HloInstruction* new_backward_conv_call = CreateCudnnConvBackwardInput( + new_backward_conv_shape, output, filter, new_backward_conv_window, + backward_conv_dnums); + + // The CustomCall created above returns a tuple (conv_result, scratch_memory). + // Extract out the two elements. HloInstruction* new_backward_conv = - computation->CreateFusionInstructionForBackwardConvolution( - {new_forward_conv, new_reverse_filter}, - HloInstruction::FusionKind::kConvBackwardInput, - new_backward_conv_window, backward_conv_dnums); + computation->AddInstruction(HloInstruction::CreateGetTupleElement( + new_backward_conv_shape, new_backward_conv_call, 0)); + HloInstruction* new_backward_conv_scratch = + computation->AddInstruction(HloInstruction::CreateGetTupleElement( + new_backward_conv_call->shape().tuple_shapes(1), + new_backward_conv_call, 1)); // Slice the new backward convolution. // @@ -377,22 +366,25 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution( } // Replace the old backward convolution with the slice. 
- CHECK(ShapeUtil::Compatible( + Shape slice_shape = ShapeInference::InferSliceShape(new_backward_conv->shape(), start_indices, limit_indices, strides) - .ConsumeValueOrDie(), - backward_conv->shape())); + .ConsumeValueOrDie(); + CHECK(ShapeUtil::Compatible(slice_shape, backward_conv_shape)) + << ShapeUtil::HumanString(slice_shape) << " vs " + << ShapeUtil::HumanString(backward_conv_shape); - auto slice = - HloInstruction::CreateSlice(backward_conv->shape(), new_backward_conv, - start_indices, limit_indices, strides); + HloInstruction* slice = computation->AddInstruction( + HloInstruction::CreateSlice(backward_conv_shape, new_backward_conv, + start_indices, limit_indices, strides)); + HloInstruction* new_tuple = computation->AddInstruction( + HloInstruction::CreateTuple({slice, new_backward_conv_scratch})); VLOG(1) << "Canonicalizing backward input conv"; VLOG(1) << "Replacing:\n " << backward_conv->ToString() << "\nwith:\n " - << slice->ToString(); + << new_tuple->ToString(); - TF_CHECK_OK( - computation->ReplaceWithNewInstruction(backward_conv, std::move(slice))); + TF_CHECK_OK(computation->ReplaceInstruction(backward_conv, new_tuple)); return true; } @@ -400,18 +392,17 @@ StatusOr PadInsertion::Run(HloModule* module) { bool changed = false; for (HloInstruction* instruction : module->entry_computation()->MakeInstructionPostOrder()) { - if (instruction->opcode() == HloOpcode::kConvolution) { - changed |= CanonicalizeForwardConvolution(instruction); - } else if (instruction->opcode() == HloOpcode::kFusion) { - switch (instruction->fusion_kind()) { - case HloInstruction::FusionKind::kConvBackwardFilter: - changed |= CanonicalizeBackwardFilterConvolution(instruction); - break; - case HloInstruction::FusionKind::kConvBackwardInput: - changed |= CanonicalizeBackwardInputConvolution(instruction); - break; - default: - break; + if (IsCustomCallToDnnConvolution(*instruction)) { + const auto& target = instruction->custom_call_target(); + if (target == kCudnnConvForwardCallTarget) { + changed |= CanonicalizeForwardConvolution(instruction); + } else if (target == kCudnnConvBackwardFilterCallTarget) { + changed |= CanonicalizeBackwardFilterConvolution(instruction); + } else if (target == kCudnnConvBackwardInputCallTarget) { + changed |= CanonicalizeBackwardInputConvolution(instruction); + } else { + LOG(FATAL) << "Unknown custom call target for cudnn conv: " + << instruction->ToString(); } } } diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h index 934e7e1919f..8ed63a854a7 100644 --- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h @@ -42,6 +42,11 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter { const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder); + // Constructs a loop emitter for a loop that generates one element of each of N + // arrays on each iteration. + // + // This is used in multi-output fusion. target_element_generator should + // produce a struct with N elements, one for each of target_arrays.
ParallelLoopEmitter( const llvm_ir::ElementGenerator& target_element_generator, tensorflow::gtl::ArraySlice target_arrays, diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc index 34e2f7ee206..cde5877e29f 100644 --- a/tensorflow/compiler/xla/service/heap_simulator.cc +++ b/tensorflow/compiler/xla/service/heap_simulator.cc @@ -64,10 +64,8 @@ StatusOr HeapSimulator::Run( std::unique_ptr algorithm, const HloModule& module, const SequentialHloOrdering::HloModuleSequence& module_sequence, const TuplePointsToAnalysis& points_to_analysis, - const LogicalBuffer::SizeFunction& size_fn, - const FlatSet* buffers_to_assign) { - HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign, - &module_sequence); + const LogicalBuffer::SizeFunction& size_fn, const Options& options) { + HeapSimulator heap(std::move(algorithm), size_fn, options, &module_sequence); const HloComputation* entry_computation = module.entry_computation(); const std::vector& instruction_sequence = FindOrDie(module_sequence, entry_computation); @@ -81,9 +79,8 @@ StatusOr HeapSimulator::Run( std::unique_ptr algorithm, const HloComputation& computation, const std::vector& instruction_sequence, const TuplePointsToAnalysis& points_to_analysis, - const LogicalBuffer::SizeFunction& size_fn, - const FlatSet* buffers_to_assign) { - HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign, + const LogicalBuffer::SizeFunction& size_fn, const Options& options) { + HeapSimulator heap(std::move(algorithm), size_fn, options, /*module_sequence=*/nullptr); TF_RETURN_IF_ERROR(heap.RunComputation(computation, instruction_sequence, points_to_analysis)); @@ -199,15 +196,17 @@ Status HeapSimulator::RunComputation( // We can only share with the operand buffer if it is about to be freed; // we must be the last user of the buffer. 
bool shared = false; - for (const LogicalBuffer* operand_buffer : operand_buffers_to_free) { - if (buffer->instruction()->IsUserOf(operand_buffer->instruction()) && - buffer->instruction()->opcode() != HloOpcode::kCopy && - CanShareOperandBufferWithUser( - operand_buffer->instruction(), operand_buffer->index(), - buffer->instruction(), buffer->index(), points_to_analysis)) { - ShareBuffer(buffer, operand_buffer, instruction); - shared = true; - break; + if (options_.may_reuse_operand_buffers) { + for (const LogicalBuffer* operand_buffer : operand_buffers_to_free) { + if (buffer->instruction()->IsUserOf(operand_buffer->instruction()) && + buffer->instruction()->opcode() != HloOpcode::kCopy && + CanShareOperandBufferWithUser( + operand_buffer->instruction(), operand_buffer->index(), + buffer->instruction(), buffer->index(), points_to_analysis)) { + ShareBuffer(buffer, operand_buffer, instruction); + shared = true; + break; + } } } @@ -266,13 +265,12 @@ Status HeapSimulator::RunComputation( HeapSimulator::HeapSimulator( std::unique_ptr algorithm, - const LogicalBuffer::SizeFunction& size_fn, - const FlatSet* buffers_to_assign, + const LogicalBuffer::SizeFunction& size_fn, const Options& options, const SequentialHloOrdering::HloModuleSequence* module_sequence) : no_fragmentation_stats_(MakeUnique()), algorithm_(std::move(algorithm)), size_fn_(size_fn), - buffers_to_assign_(buffers_to_assign), + options_(options), module_sequence_(module_sequence) { debug_trace_.set_whole_module_simulation(module_sequence_ != nullptr); } @@ -280,13 +278,16 @@ HeapSimulator::HeapSimulator( HeapSimulator::~HeapSimulator() {} bool HeapSimulator::IgnoreBuffer(const LogicalBuffer* buffer) const { - // Buffers for constants are ignored, as with BufferAssigner. Also ignore - // buffers that we're not meant to assign. + // Buffers for constants are ignored unless the alloc_constants option is + // set. Also ignore buffers that we're not meant to assign. // // TODO(b/32248867): For consistency, constants should get allocations. - return buffer->instruction()->opcode() == HloOpcode::kConstant || - (buffers_to_assign_ != nullptr && - buffers_to_assign_->count(buffer) == 0); + if (!options_.alloc_constants && + buffer->instruction()->opcode() == HloOpcode::kConstant) { + return true; + } + return options_.buffers_to_assign != nullptr && + options_.buffers_to_assign->count(buffer) == 0; } // Alloc always calls the underlying heap algorithm. @@ -400,8 +401,8 @@ HeapSimulator::Result HeapSimulator::Finish() { } // If we were told to assign specific buffers, make sure we've assigned // exactly that many buffers. - if (buffers_to_assign_ != nullptr) { - CHECK_EQ(buffers_to_assign_->size(), result.chunk_map.size()); + if (options_.buffers_to_assign != nullptr) { + CHECK_EQ(options_.buffers_to_assign->size(), result.chunk_map.size()); } } diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h index 88a8698d161..636f19dd39f 100644 --- a/tensorflow/compiler/xla/service/heap_simulator.h +++ b/tensorflow/compiler/xla/service/heap_simulator.h @@ -67,6 +67,23 @@ class HeapSimulator { HeapSimulatorTrace debug_trace; }; + // The different options to be passed to the Run() APIs. + struct Options { + Options() + : may_reuse_operand_buffers(true), + alloc_constants(false), + buffers_to_assign(nullptr) {} + + // Whether a buffer about to be Free()-ed can be recycled for a newborn + // one, hence collapsing Free()+Alloc() calls (default true).
+ bool may_reuse_operand_buffers; + // Whether to issue Alloc() and Free() calls for constants (default false). + bool alloc_constants; + // If 'buffers_to_assign' is provided, only those buffers are assigned + // offsets, otherwise all buffers defined by the instructions are assigned. + const tensorflow::gtl::FlatSet* buffers_to_assign; + }; + // Run the heap simulation with the given algorithm, assuming the given // module_sequence, which must contain a topologically-consistent total // ordering of all instructions within each computation. The result is invalid @@ -76,15 +93,12 @@ class HeapSimulator { // to running on a per-computation basis, since we can re-use buffer space for // called sub-computations. // - // If 'buffers_to_assign' is provided, only those buffers are assigned - // offsets, otherwise all buffers defined by the instructions are assigned. static StatusOr Run( std::unique_ptr algorithm, const HloModule& module, const SequentialHloOrdering::HloModuleSequence& module_sequence, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_fn, - const tensorflow::gtl::FlatSet* buffers_to_assign = - nullptr); + const Options& options = Options()); // Same as above, but runs on a single computation. The 'instruction_sequence' // must contain a topologically-consistent total ordering of all instructions @@ -96,8 +110,7 @@ class HeapSimulator { const std::vector& instruction_sequence, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_fn, - const tensorflow::gtl::FlatSet* buffers_to_assign = - nullptr); + const Options& options = Options()); private: // If 'module_sequence' is non-null, it is used to find kCall and kWhile @@ -105,8 +118,7 @@ class HeapSimulator { // be run recursively. I.e. the simulation is run over the whole module. 
HeapSimulator( std::unique_ptr algorithm, - const LogicalBuffer::SizeFunction& size_fn, - const tensorflow::gtl::FlatSet* buffers_to_assign, + const LogicalBuffer::SizeFunction& size_fn, const Options& options, const SequentialHloOrdering::HloModuleSequence* module_sequence); ~HeapSimulator(); @@ -130,7 +142,7 @@ class HeapSimulator { const std::unique_ptr no_fragmentation_stats_; const std::unique_ptr algorithm_; const LogicalBuffer::SizeFunction size_fn_; - const tensorflow::gtl::FlatSet* buffers_to_assign_; + const Options options_; const SequentialHloOrdering::HloModuleSequence* module_sequence_; // In addition to Alloc and Free, the heap simulator exposes a concept of diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc index a63affa06ca..5432419e4a2 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.cc +++ b/tensorflow/compiler/xla/service/hlo_computation.cc @@ -461,20 +461,6 @@ HloInstruction* HloComputation::CreateFusionInstruction( return fusion_instruction; } -HloInstruction* HloComputation::CreateFusionInstructionForBackwardConvolution( - tensorflow::gtl::ArraySlice instructions_to_fuse, - HloInstruction::FusionKind fusion_kind, const Window& window, - const ConvolutionDimensionNumbers& conv_dnums) { - CHECK(HloInstruction::FusionKind::kConvBackwardFilter == fusion_kind || - HloInstruction::FusionKind::kConvBackwardInput == fusion_kind); - HloInstruction* root = instructions_to_fuse.front(); - HloInstruction* fusion_instruction = - AddInstruction(HloInstruction::CreateFusionForBackwardConvolution( - root->shape(), fusion_kind, window, conv_dnums, root)); - FuseInstructionsInto(instructions_to_fuse, fusion_instruction); - return fusion_instruction; -} - StatusOr HloComputation::DeepCopyHelper( HloInstruction* instruction, const ShapeTree* indices_to_copy, ShapeTree* copies_added, ShapeIndex* index) { @@ -577,8 +563,11 @@ Status HloComputation::ReplaceWithNewInstruction( Status HloComputation::ReplaceInstruction(HloInstruction* old_instruction, HloInstruction* new_instruction) { - TF_RET_CHECK(ShapeUtil::Compatible(old_instruction->shape(), - new_instruction->shape())); + TF_RET_CHECK( + ShapeUtil::Compatible(old_instruction->shape(), new_instruction->shape())) + << ShapeUtil::HumanString(old_instruction->shape()) << " vs " + << ShapeUtil::HumanString(new_instruction->shape()); + VLOG(10) << "transformed " << old_instruction->ToString() << " to " << new_instruction->ToString(); // Try to add metadata for HLO instructions that are created to replace diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h index 6436815f910..061c59abe5e 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.h +++ b/tensorflow/compiler/xla/service/hlo_computation.h @@ -224,15 +224,6 @@ class HloComputation { tensorflow::gtl::ArraySlice instructions_to_fuse, HloInstruction::FusionKind fusion_kind); - // Creates a fusion instruction that represents a backward convolution. This - // is similar to CreateFusionInstruction but takes window and conv_dnums which - // indicate the window and convolution dimension numbers of the backward - // convolution. 
- HloInstruction* CreateFusionInstructionForBackwardConvolution( - tensorflow::gtl::ArraySlice instructions_to_fuse, - HloInstruction::FusionKind fusion_kind, const Window& window, - const ConvolutionDimensionNumbers& conv_dnums); - // Create a deep copy of the given instruction and return the instruction // producing the copied result. All instructions performing the copy are added // to the computation. For array-shaped values, this method trivially returns diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index cd54eb74d18..9cd5a1e2b71 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -469,7 +469,13 @@ Status HloCostAnalysis::HandleCall(const HloInstruction* call) { } Status HloCostAnalysis::HandleCustomCall(const HloInstruction*) { - return Unimplemented("Custom-call is not implemented for HLO cost analysis."); + // We can't do anything sane with CustomCalls, since we don't know what they + // do, and returning an error status will stop iteration over this + // computation, which is probably also not what we want. So just punt and + // return OK. This will cause all of the properties to be reported as 0, + // which is fine. + current_should_compute_bottleneck_time_ = false; + return Status::OK(); } Status HloCostAnalysis::HandleSort(const HloInstruction* sort) { diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc index e3f5c17e35f..ab604064d5e 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc @@ -40,6 +40,7 @@ limitations under the License. #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/core/lib/core/bitmap.h" +#include "tensorflow/core/lib/core/casts.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/stringpiece.h" @@ -1706,6 +1707,115 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault { return HandleCos(cos); } + template ::value>::type* = nullptr> + Status HandleReducePrecision(HloInstruction* reduce_precision) { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[reduce_precision], + ElementWiseUnaryOp(reduce_precision, [reduce_precision]( + ElementwiseT elem) { + uint32_t value_as_int = tensorflow::bit_cast(elem); + const uint32_t mantissa_bits = reduce_precision->mantissa_bits(); + const uint32_t exponent_bits = reduce_precision->exponent_bits(); + + // Code is based on the CPU/GPU implementation in LLVM-emitting code. + // + // Bits in float type: + // mantissa : bits [0:22] + // exponent : bits [23:30] + // sign : bits [31] + if (mantissa_bits < 23) { + const uint32_t last_mantissa_bit_mask = 1u << (23 - mantissa_bits); + + // Compute rounding bias for round-to-nearest with ties to even. + // This is equal to a base value of 0111... plus one bit if the last + // remaining mantissa bit is 1. + const uint32_t base_rounding_bias = + (last_mantissa_bit_mask >> 1) - 1; + const uint32_t x_last_mantissa_bit = + (value_as_int & last_mantissa_bit_mask) >> (23 - mantissa_bits); + const uint32_t x_rounding_bias = + x_last_mantissa_bit + base_rounding_bias; + + // Add rounding bias, and mask out truncated bits. 
Note that the + // case where adding the rounding bias overflows into the exponent + // bits is correct; the non-masked mantissa bits will all be zero, + // and the exponent will be incremented by one. + const uint32_t truncation_mask = ~(last_mantissa_bit_mask - 1); + value_as_int = value_as_int + x_rounding_bias; + value_as_int = value_as_int & truncation_mask; + } + if (exponent_bits < 8) { + // Masks for f32 values. + const uint32_t f32_sign_bit_mask = 1u << 31; + const uint32_t f32_exp_bits_mask = 0xffu << 23; + + // An exponent of 2^(n-1)-1 -- that is, 0111... with the zero in the + // most- significant bit -- is equal to 1.0f for all exponent sizes. + // Adding 2^(n-1)-1 to this gives us the highest non-infinite + // exponent for a bit- size of n, and subtracting 2^(n-1)-1 from + // this gives us the lowest' exponent (corresponding to 0.0f). + // + // Thus, the f32 exponent corresponding to the highest non-infinite + // exponent for a bit size of n is (2^7-1) + 2^(n-1)-1, and the f32 + // exponent corresponding to the lowest exponent for a bit size of n + // is (2^7-1) - 2^(n-1)-1. + // + // Note that we have already checked that exponents_bits >= 1. + const uint32_t f32_exponent_bias = (1 << 7) - 1; + const uint32_t reduced_exponent_bias = + (1 << (exponent_bits - 1)) - 1; + const uint32_t reduced_max_exponent = + f32_exponent_bias + reduced_exponent_bias; + const uint32_t reduced_min_exponent = + f32_exponent_bias - reduced_exponent_bias; + + // Do we overflow or underflow? + const uint32_t x_exponent = value_as_int & f32_exp_bits_mask; + const bool x_overflows = x_exponent > (reduced_max_exponent << 23); + const bool x_underflows = + x_exponent <= (reduced_min_exponent << 23); + + // Compute appropriately-signed values of zero and infinity. + const uint32_t x_signed_zero = value_as_int & f32_sign_bit_mask; + const uint32_t x_signed_inf = x_signed_zero | f32_exp_bits_mask; + + // Force to zero or infinity if overflow or underflow. (Note that + // this truncates all denormal values to zero, rather than rounding + // them.) + value_as_int = x_overflows ? x_signed_inf : value_as_int; + value_as_int = x_underflows ? x_signed_zero : value_as_int; + } + + float reduced_result = tensorflow::bit_cast(value_as_int); + if (std::isnan(elem)) { + reduced_result = mantissa_bits > 0 + ? elem + : std::numeric_limits::infinity(); + } + return reduced_result; + })); + return Status::OK(); + } + + template ::value>::type* = nullptr> + Status HandleReducePrecision(HloInstruction* reduce_precision) { + return InvalidArgument("Double not supported for reduce precision"); + } + + template < + typename NativeT, + typename std::enable_if::value || + is_complex_t::value>::type* = nullptr> + Status HandleReducePrecision(HloInstruction* reduce_precision) { + return InvalidArgument("Unsupported type for reduce precision"); + } + + Status HandleReducePrecision(HloInstruction* reduce_precision) override { + return HandleReducePrecision(reduce_precision); + } + private: template StatusOr> DynamicSlice( diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index f7c6435002d..44fcd36370d 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -1063,14 +1063,19 @@ string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) { // node -- there the shape and layout is present in the output node. 
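// ---------------------------------------------------------------------------
// Aside (illustrative only): the rounding-bias trick used in
// HandleReducePrecision above can be written as a tiny standalone function.
// This is a minimal sketch of the mantissa-rounding step only -- the exponent
// reduction and NaN handling shown above are omitted, and the helper name is
// ours, not TensorFlow's.
#include <cstdint>
#include <cstring>

float RoundMantissa(float f, int mantissa_bits) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));  // bit-cast float -> uint32
  if (mantissa_bits < 23) {
    const uint32_t last_kept_bit = 1u << (23 - mantissa_bits);
    // Round to nearest, ties to even: bias of 0111...1 plus the last kept bit.
    const uint32_t base_bias = (last_kept_bit >> 1) - 1;
    const uint32_t last_kept = (bits & last_kept_bit) ? 1u : 0u;
    bits += base_bias + last_kept;       // a carry into the exponent is correct
    bits &= ~(last_kept_bit - 1);        // truncate the discarded mantissa bits
  }
  float result;
  std::memcpy(&result, &bits, sizeof(result));
  return result;
}
// ---------------------------------------------------------------------------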
if (instr->opcode() != HloOpcode::kFusion || !ShouldShowFusionSubcomputation(instr)) { - string instr_shape = ShapeUtil::HumanString(instr->shape()); - - // Show layout of non-tuple shapes with more than one dimension. - if (LayoutUtil::HasLayout(instr->shape()) && - instr->shape().dimensions_size() > 1 && - !ShapeUtil::IsTuple(instr->shape())) { - StrAppend(&instr_shape, "{", - Join(LayoutUtil::MinorToMajor(instr->shape()), ","), "}"); + // Show layout of instructions with more than one dimension. Don't show + // layout on tuples or tensors with just one dimension (which only have one + // possible layout) to avoid visual noise. + bool shape_is_multidim = false; + ShapeUtil::ForEachSubshape(instr->shape(), + [&](const Shape& s, const ShapeIndex&) { + shape_is_multidim |= s.dimensions_size() > 1; + }); + string instr_shape; + if (instr->opcode() != HloOpcode::kTuple && shape_is_multidim) { + instr_shape = ShapeUtil::HumanStringWithLayout(instr->shape()); + } else { + instr_shape = ShapeUtil::HumanString(instr->shape()); } // Some instructions have giant tuples as their shapes, so truncate the @@ -1421,9 +1426,11 @@ void DumpText(const HloModule& module, const string& label, string MaybeDumpHloModule(const HloModule& module, const string& label, const HloExecutionProfile* profile) { - VLOG(2) << "MaybeDumpHloModule called on module " << module.name(); - string graph_url; const DebugOptions& debug_options = module.config().debug_options(); + VLOG(2) << "MaybeDumpHloModule called on module " << module.name() + << " with generate_hlo_graph regex \"" + << debug_options.xla_generate_hlo_graph() << "\""; + string graph_url; if (!debug_options.xla_generate_hlo_graph().empty() && RE2::PartialMatch(module.name(), debug_options.xla_generate_hlo_graph())) { diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index a889c35aeb2..fac6b434054 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -763,16 +763,13 @@ HloInstruction::CreateBroadcastSequence( return instruction; } -// We put the fusion kind into the instruction's name for transpose-dot and -// backward-conv fusions, since those fusions are really just describing a type -// of dot/conv rather than generating a novel computation. +// We put the fusion kind into the instruction's name for transpose-dot fusions, +// since those fusions are really just describing a type of dot rather than +// generating a novel computation. 
static string FusionNodeName(HloInstruction::FusionKind fusion_kind) { switch (fusion_kind) { case HloInstruction::FusionKind::kTransposeDot: return "dot_fusion"; - case HloInstruction::FusionKind::kConvBackwardInput: - case HloInstruction::FusionKind::kConvBackwardFilter: - return "conv_fusion"; default: return "fusion"; } @@ -804,18 +801,6 @@ static string FusionNodeName(HloInstruction::FusionKind fusion_kind) { return instruction; } -/* static */ std::unique_ptr -HloInstruction::CreateFusionForBackwardConvolution( - const Shape& shape, FusionKind fusion_kind, const Window& window, - const ConvolutionDimensionNumbers& conv_dnums, HloInstruction* fused_root) { - std::unique_ptr fusion = - CreateFusion(shape, fusion_kind, fused_root); - fusion->window_ = MakeUnique(window); - fusion->convolution_dimension_numbers_ = - MakeUnique(conv_dnums); - return fusion; -} - void HloInstruction::MergeFusionInstruction( HloInstruction* instruction_to_merge) { CHECK_EQ(opcode_, HloOpcode::kFusion); @@ -2318,7 +2303,7 @@ string HloInstruction::ToCategory() const { return "data formatting"; } - auto conv_category = [&] { + if (opcode() == HloOpcode::kConvolution) { string category = "convolution"; if (window_util::HasBaseDilation(window())) { category += " base-dilated"; @@ -2327,10 +2312,6 @@ string HloInstruction::ToCategory() const { category += " window-dilated"; } return category; - }; - - if (opcode() == HloOpcode::kConvolution) { - return conv_category(); } // Give transpose-dot and backwards-conv fusions the categories "dot" and @@ -2348,9 +2329,6 @@ string HloInstruction::ToCategory() const { return "output fusion"; case FusionKind::kTransposeDot: return "dot"; - case FusionKind::kConvBackwardFilter: - case FusionKind::kConvBackwardInput: - return conv_category(); case FusionKind::kCustom: return "custom fusion"; } @@ -3125,10 +3103,6 @@ string ToString(HloInstruction::FusionKind kind) { return "kOutput"; case HloInstruction::FusionKind::kTransposeDot: return "kTransposeDot"; - case HloInstruction::FusionKind::kConvBackwardFilter: - return "kConvBackwardFilter"; - case HloInstruction::FusionKind::kConvBackwardInput: - return "kConvBackwardInput"; case HloInstruction::FusionKind::kCustom: return "kCustom"; } @@ -3148,12 +3122,6 @@ StatusOr StringToFusionKind( if (kind_name == "kTransposeDot") { return HloInstruction::FusionKind::kTransposeDot; } - if (kind_name == "kConvBackwardFilter") { - return HloInstruction::FusionKind::kConvBackwardFilter; - } - if (kind_name == "kConvBackwardInput") { - return HloInstruction::FusionKind::kConvBackwardInput; - } if (kind_name == "kCustom") { return HloInstruction::FusionKind::kCustom; } @@ -3261,7 +3229,13 @@ string HloInstruction::ConvolutionDimensionNumbersToString() const { result += "_"; append_dims(rhs_dims, operand(1)->shape()); result += "->"; - append_dims(output_dims, shape()); + + // A convolution can be represented as a kConvolution HLO or as a CustomCall + // that returns a tuple, the first element of which is the result of the + // convolution. + Shape this_shape = + ShapeUtil::IsTuple(shape()) ? 
shape().tuple_shapes(0) : shape(); + append_dims(output_dims, this_shape); return result; } diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 5e89dc79bea..bce9ebdda84 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -162,17 +162,14 @@ class HloPrintOptions { class HloInstruction { public: enum class FusionKind { - kLoop, // Fused into a loop. - kInput, // Op's input is fused into the op itself. - kOutput, // Op's output is fused into the op itself. - // REQUIRES: At least one operand buffer must be able - // to alias the output buffer. - kTransposeDot, // Fused into a dot with transposed operands. - kConvBackwardFilter, // Fused into a backward filter convolution. - kConvBackwardInput, // Fused into a backward input convolution. - - kCustom, // Custom category for backend-specific fusions that - // do not match any of the more specific ones. + kLoop, // Fused into a loop. + kInput, // Op's input is fused into the op itself. + kOutput, // Op's output is fused into the op itself. + // REQUIRES: At least one operand buffer must be able + // to alias the output buffer. + kTransposeDot, // Fused into a dot with transposed operands. + kCustom, // Custom category for backend-specific fusions that + // do not match any of the more specific ones. }; ~HloInstruction(); @@ -466,14 +463,6 @@ class HloInstruction { tensorflow::gtl::ArraySlice operands, HloComputation* fusion_computation); - // Creates a fusion instruction that represents backward convolution. This is - // similar to CreateFusion, but with extra arguments indicating the window and - // dimemsion mapping of the backward convolution. - static std::unique_ptr CreateFusionForBackwardConvolution( - const Shape& shape, FusionKind fusion_kind, const Window& window, - const ConvolutionDimensionNumbers& conv_dnums, - HloInstruction* fused_root); - // Creates a call instruction that applies the given computation on the given // operands. "shape" is the resultant shape. static std::unique_ptr CreateCall( @@ -885,8 +874,8 @@ class HloInstruction { // Returns true if this instruction is a fusion instruction that generates // multiple outputs. const bool IsMultiOutputFusion() const { - return (opcode() == HloOpcode::kFusion && - fused_expression_root()->opcode() == HloOpcode::kTuple); + return opcode() == HloOpcode::kFusion && + fused_expression_root()->opcode() == HloOpcode::kTuple; } FusionKind fusion_kind() const { @@ -1052,13 +1041,23 @@ class HloInstruction { return *padding_config_; } - // Returns data on the dimension numbers used for a convolution - // operation. + // Returns data on the dimension numbers used for a convolution operation, + // which may be a kConvolution instruction or a kCustomCall that implements a + // convolution. const ConvolutionDimensionNumbers& convolution_dimension_numbers() const { CHECK(convolution_dimension_numbers_ != nullptr); return *convolution_dimension_numbers_; } + // Sets the convolution dimension numbers on this instruction. In general you + // shouldn't need to call this; instead, specify the convolution dimension + // numbers when you create the instruction. 
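// Hypothetical usage sketch (ours): a backend that represents a convolution as
// a CustomCall can attach the dimension numbers right after creating the
// instruction. The call target string and the variable names below are made up
// for illustration; CreateCustomCall and convolution_dimension_numbers() are
// the APIs shown elsewhere in this change.
//
//   HloInstruction* conv_call =
//       computation->AddInstruction(HloInstruction::CreateCustomCall(
//           conv_result_shape, {input, filter}, "__custom_conv_impl"));
//   conv_call->set_convolution_dimension_numbers(
//       original_conv->convolution_dimension_numbers());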
+ void set_convolution_dimension_numbers( + const ConvolutionDimensionNumbers& dnums) { + convolution_dimension_numbers_ = + MakeUnique(dnums); + } + FftType fft_type() const { CHECK_EQ(HloOpcode::kFft, opcode_); return fft_type_; diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc index 4255d608662..bc74c4bc10c 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers.cc +++ b/tensorflow/compiler/xla/service/hlo_matchers.cc @@ -102,6 +102,36 @@ bool HloGetTupleElementMatcher::MatchAndExplain( return true; } +void HloCustomCallMatcher::DescribeTo(std::ostream* os) const { + HloMatcher::DescribeTo(os); + *os << " with call target that "; + call_target_matcher_.DescribeTo(os); +} + +bool HloCustomCallMatcher::MatchAndExplain( + const HloInstruction* instruction, + ::testing::MatchResultListener* listener) const { + if (!HloMatcher::MatchAndExplain(instruction, listener)) { + return false; + } + ::testing::StringMatchResultListener sub_listener; + bool result = ExplainMatchResult( + call_target_matcher_, instruction->custom_call_target(), &sub_listener); + if (sub_listener.str().empty()) { + sub_listener << " that "; + + std::stringstream desc_stream; + if (result) { + call_target_matcher_.DescribeTo(&desc_stream); + } else { + call_target_matcher_.DescribeNegationTo(&desc_stream); + } + sub_listener << desc_stream.str(); + } + *listener << "custom-call with call target" << sub_listener.str(); + return result; +} + } // namespace testing void PrintTo(const HloInstruction* inst, ::std::ostream* os) { diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h index 9206cdac05f..103f04a2cb7 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers.h +++ b/tensorflow/compiler/xla/service/hlo_matchers.h @@ -56,8 +56,8 @@ class HloParameterMatcher : public HloMatcher { // index to match. class HloGetTupleElementMatcher : public HloMatcher { public: - explicit HloGetTupleElementMatcher( - ::testing::Matcher operand, int64 tuple_index) + HloGetTupleElementMatcher(::testing::Matcher operand, + int64 tuple_index) : HloMatcher(HloOpcode::kGetTupleElement, /*operands=*/{operand}), tuple_index_(tuple_index) {} @@ -68,6 +68,24 @@ class HloGetTupleElementMatcher : public HloMatcher { int64 tuple_index_; }; +// Custom matcher for custom-call instructions, which accepts a matcher for its +// call target. +class HloCustomCallMatcher : public HloMatcher { + public: + HloCustomCallMatcher( + ::testing::Matcher call_target_matcher, + std::vector<::testing::Matcher> operands) + : HloMatcher(HloOpcode::kCustomCall, operands), + call_target_matcher_(call_target_matcher) {} + + bool MatchAndExplain(const HloInstruction* instruction, + ::testing::MatchResultListener* listener) const override; + void DescribeTo(std::ostream* os) const override; + + private: + ::testing::Matcher call_target_matcher_; +}; + // HloInstruction* matchers for opcode and operands. Example: // namespace op = xla::opcode_matchers; // EXPECT_THAT(instruction, @@ -94,7 +112,6 @@ HLO_MATCHER(Convert); HLO_MATCHER(Convolution); HLO_MATCHER(Copy); HLO_MATCHER(CrossReplicaSum); -HLO_MATCHER(CustomCall); HLO_MATCHER(Divide); HLO_MATCHER(Dot); HLO_MATCHER(DynamicSlice); @@ -184,6 +201,36 @@ inline ::testing::Matcher GetTupleElement() { new ::xla::testing::HloMatcher(HloOpcode::kGetTupleElement, {})); } +// - CustomCall(T, operand1, ..., operandN) matches a CustomCall with call +// target T and the given operands. 
+// +// - CustomCall(operand1, ..., operandN) matches any CustomCall HLO with the +// given operands. +// +// - CustomCall() matches any CustomCall HLO at all. +template +inline ::testing::Matcher CustomCall( + ::testing::Matcher call_target_matcher, M... operands) { + return ::testing::MakeMatcher(new ::xla::testing::HloCustomCallMatcher( + call_target_matcher, {operands...})); +} +// This overload of CustomCall(A, B, C, ...) exists iff A is not convertible to +// ::testing::Matcher. In that case, we want to prefer the overload +// above. +template >::value, + void>::type*> +inline ::testing::Matcher CustomCall( + FirstM operands_first, M... operands_rest) { + return ::testing::MakeMatcher(new ::xla::testing::HloMatcher( + HloOpcode::kCustomCall, {operands_first, operands_rest...})); +} +inline ::testing::Matcher CustomCall() { + return ::testing::MakeMatcher( + new ::xla::testing::HloMatcher(HloOpcode::kCustomCall, {})); +} + #undef HLO_MATCHER } // namespace opcode_matchers diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc index 1465d1cacdc..1c21703a45e 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers_test.cc +++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc @@ -23,6 +23,12 @@ using ::testing::Eq; namespace xla { namespace { +string DescribeHloMatcher(const ::testing::Matcher& m) { + std::stringstream ss; + m.DescribeTo(&ss); + return ss.str(); +} + template string Explain(const T& t, const M& m) { ::testing::StringMatchResultListener listener; @@ -67,5 +73,32 @@ TEST(HloMatchersTest, Test) { "add")); } +TEST(HloMatchersTest, CustomCallMatcher) { + auto c1 = HloInstruction::CreateConstant(Literal::CreateR1({1, 2, 3})); + auto c2 = HloInstruction::CreateConstant(Literal::CreateR1({1, 2, 3})); + auto call = HloInstruction::CreateCustomCall( + ShapeUtil::MakeShape(F32, {1}), {c1.get(), c2.get()}, "foo_target"); + + EXPECT_THAT(call.get(), op::CustomCall()); + EXPECT_THAT(call.get(), op::CustomCall(c1.get(), c2.get())); + EXPECT_THAT(call.get(), op::CustomCall("foo_target")); + EXPECT_THAT(call.get(), op::CustomCall("foo_target", c1.get(), c2.get())); + EXPECT_THAT(call.get(), op::CustomCall(::testing::StartsWith("foo"))); + EXPECT_THAT(call.get(), + op::CustomCall(::testing::Not(::testing::StartsWith("bar")))); + + // Wrong number of operands. + EXPECT_THAT(call.get(), ::testing::Not(op::CustomCall(c1.get()))); + + // Call target does not match. 
+ EXPECT_THAT(call.get(), + ::testing::Not(op::CustomCall(::testing::StartsWith("bar")))); + + EXPECT_THAT(Explain(call.get(), op::CustomCall("bar")), + R"(custom-call with call target that isn't equal to "bar")"); + EXPECT_THAT(DescribeHloMatcher(op::CustomCall("foo_target")), + R"(custom-call with call target that is equal to "foo_target")"); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc index 99d8dd04e52..60270b0595d 100644 --- a/tensorflow/compiler/xla/service/hlo_module.cc +++ b/tensorflow/compiler/xla/service/hlo_module.cc @@ -38,12 +38,16 @@ HloModule::HloModule(const string& name, : name_(NameUniquer::GetSanitizedName(name)), config_(config), has_entry_computation_handle_(true), - entry_computation_handle_(entry_computation_handle) {} + entry_computation_handle_(entry_computation_handle), + unique_id_(next_unique_module_id_++) {} HloModule::HloModule(const string& name) - : name_(NameUniquer::GetSanitizedName(name)) {} + : name_(NameUniquer::GetSanitizedName(name)), + unique_id_(next_unique_module_id_++) {} HloModule::HloModule(const string& name, const HloModuleConfig& config) - : name_(NameUniquer::GetSanitizedName(name)), config_(config) {} + : name_(NameUniquer::GetSanitizedName(name)), + config_(config), + unique_id_(next_unique_module_id_++) {} HloComputation* HloModule::AddComputationInternal( std::unique_ptr computation, bool is_entry, @@ -564,4 +568,6 @@ uint64 HloModule::RandomNew64() const { return rng_(); } +/* static */ std::atomic HloModule::next_unique_module_id_(0); + } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h index e377654d024..4bfe8d89ce0 100644 --- a/tensorflow/compiler/xla/service/hlo_module.h +++ b/tensorflow/compiler/xla/service/hlo_module.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MODULE_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MODULE_H_ +#include #include #include #include @@ -201,6 +202,10 @@ class HloModule { // this point are guaranteed to be in the range [0..NumUniqueInstructionIds()) int NumUniqueInstructionIds() const { return next_unique_id_; } + // Returns an id that is unique to this module across all modules created over + // the lifetime of this process. + int unique_id() const { return unique_id_; } + private: HloComputation* AddComputationInternal( std::unique_ptr computation, bool is_entry, @@ -227,6 +232,11 @@ class HloModule { NameUniquer computation_name_uniquer_{/*separator=*/"."}; NameUniquer instruction_name_uniquer_{/*separator=*/"."}; int next_unique_id_ = 0; + + // Used to keep track of the next unique module id that should be assigned. + static std::atomic next_unique_module_id_; + // A unique id to label modules with. 
+ int unique_id_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_module_test.cc b/tensorflow/compiler/xla/service/hlo_module_test.cc index cd51fa4e854..7f28a804bfe 100644 --- a/tensorflow/compiler/xla/service/hlo_module_test.cc +++ b/tensorflow/compiler/xla/service/hlo_module_test.cc @@ -188,6 +188,12 @@ TEST_F(HloModuleTest, LargeConstantToString) { module->ToString(HloPrintOptions().set_print_large_constants(true))); } +TEST_F(HloModuleTest, UniqueModuleId) { + auto module_a = CreateNewModule(); + auto module_b = CreateNewModule(); + EXPECT_NE(module_a->unique_id(), module_b->unique_id()); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc index 53bd46a641a..5120775737b 100644 --- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc +++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" +#include "tensorflow/compiler/xla/service/hlo_proto_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" @@ -32,12 +33,28 @@ using ::tensorflow::strings::StrCat; namespace xla { namespace { -void DumpModule(const HloModule& module, - const string& message) { +void DumpModuleGraph(const HloModule& module, const string& message) { hlo_graph_dumper::MaybeDumpHloModule(module, message); VLOG(3) << "HLO " << message << ":"; XLA_VLOG_LINES(3, module.ToString()); } + +void DumpModuleProto(const HloModule& module, const string& dump_to, + const string& pipeline_name, const string& pass_name) { + static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED); + static auto* const module_id_to_pass_number = + new tensorflow::gtl::FlatMap(); + + tensorflow::mutex_lock lock(mu); + const int64 pass_number = (*module_id_to_pass_number)[module.unique_id()]++; + + const string mod_name = SanitizeFileName(tensorflow::strings::Printf( + "module_%04d.%04lld.%s.after_%s", module.unique_id(), pass_number, + pipeline_name.c_str(), pass_name.c_str())); + + TF_QCHECK_OK(protobuf_util::DumpProtoToDirectory(MakeHloProto(module), + dump_to, mod_name)); +} } // namespace StatusOr HloPassPipeline::Run(HloModule* module) { @@ -78,6 +95,13 @@ StatusOr HloPassPipeline::Run(HloModule* module) { string message; TF_RETURN_IF_ERROR( run_invariant_checkers(StrCat("before running pipeline: ", name()))); + const string xla_dump_per_pass_hlo_proto_to = + module->config().debug_options().xla_dump_per_pass_hlo_proto_to(); + if (!xla_dump_per_pass_hlo_proto_to.empty()) { + DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to, name().ToString(), + "pipeline_start"); + } + for (auto& pass : passes_) { if (disabled_passes.count(pass->name().ToString()) > 0) { VLOG(1) << " Skipping HLO pass " << pass->name() @@ -90,17 +114,21 @@ StatusOr HloPassPipeline::Run(HloModule* module) { // Emit label containing: "after foo-pass, before bar-pass". 
message.clear(); StrAppend(&message, prefix, ", before ", pass->name()); - DumpModule(*module, message); + DumpModuleGraph(*module, message); TF_ASSIGN_OR_RETURN(bool changed_this_pass, pass->Run(module)); TF_RETURN_IF_ERROR( run_invariant_checkers(StrCat("after running pass: ", pass->name()))); + if (!xla_dump_per_pass_hlo_proto_to.empty()) { + DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to, + name().ToString(), pass->name().ToString()); + } changed |= changed_this_pass; prefix.clear(); StrAppend(&prefix, name(), ": after ", pass->name()); } - DumpModule(*module, prefix + ", pipeline end"); + DumpModuleGraph(*module, prefix + ", pipeline end"); return changed; } diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc index 204a8bf7486..41b079eb799 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.cc +++ b/tensorflow/compiler/xla/service/hlo_runner.cc @@ -47,22 +47,11 @@ HloRunner::CreateModuleFromString(const tensorflow::StringPiece hlo_string, return tools::Parse(hlo_string, config); } -/*static*/ StatusOr> -HloRunner::ReadModuleFromHloProtoFile(const std::string& filename, - const DebugOptions& debug_options) { - HloProto proto; - - const Status s = - tensorflow::ReadBinaryProto(tensorflow::Env::Default(), filename, &proto); - - if (!s.ok()) { - const Status s2 = - tensorflow::ReadTextProto(tensorflow::Env::Default(), filename, &proto); - if (!s2.ok()) { - return Status(s2.code(), s.error_message() + "\n" + s2.error_message()); - } - } +namespace { +// Creates an HloModule from the given proto. +StatusOr> HloProtoToModule( + const HloProto& proto, const DebugOptions& debug_options) { TF_ASSIGN_OR_RETURN( HloModuleConfig config, HloModule::CreateModuleConfigFromProto(proto.hlo_module())); @@ -72,9 +61,29 @@ HloRunner::ReadModuleFromHloProtoFile(const std::string& filename, return std::move(module); } +} // namespace + /*static*/ StatusOr> -HloRunner::ReadModuleFromHloTextDumpFile(const std::string& filename, +HloRunner::ReadModuleFromBinaryProtoFile(const std::string& filename, const DebugOptions& debug_options) { + HloProto proto; + TF_RETURN_IF_ERROR(tensorflow::ReadBinaryProto(tensorflow::Env::Default(), + filename, &proto)); + return HloProtoToModule(proto, debug_options); +} + +/*static*/ StatusOr> +HloRunner::ReadModuleFromTextProtoFile(const std::string& filename, + const DebugOptions& debug_options) { + HloProto proto; + TF_RETURN_IF_ERROR( + tensorflow::ReadTextProto(tensorflow::Env::Default(), filename, &proto)); + return HloProtoToModule(proto, debug_options); +} + +/*static*/ StatusOr> +HloRunner::ReadModuleFromHloTextFile(const std::string& filename, + const DebugOptions& debug_options) { string hlo_string; TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(), filename, &hlo_string)); @@ -83,19 +92,6 @@ HloRunner::ReadModuleFromHloTextDumpFile(const std::string& filename, return tools::Parse(hlo_string, config); } -/*static*/ StatusOr> HloRunner::ReadModule( - const std::string& filename, const DebugOptions& debug_options) { - auto module = HloRunner::ReadModuleFromHloProtoFile(filename, debug_options); - if (module.ok()) { - return module; - } - const std::string e = module.status().error_message(); - module = HloRunner::ReadModuleFromHloTextDumpFile(filename, debug_options); - return module.ok() ? 
std::move(module) - : Status(module.status().code(), - e + "\n" + module.status().error_message()); -} - // Define this in .cc file to avoid having to include eigen or forward declare // these types in the header. struct HloRunner::EigenThreadPoolWrapper { @@ -121,12 +117,14 @@ StatusOr> HloRunner::ExecuteInternal( if (run_hlo_passes) { TF_ASSIGN_OR_RETURN( module, backend().compiler()->RunHloPasses( - std::move(module), backend().default_stream_executor())); + std::move(module), backend().default_stream_executor(), + /*device_allocator=*/nullptr)); } TF_ASSIGN_OR_RETURN( std::unique_ptr executable, backend().compiler()->RunBackend(std::move(module), - backend().default_stream_executor())); + backend().default_stream_executor(), + /*device_allocator=*/nullptr)); se::Stream stream(backend().default_stream_executor()); stream.Init(); diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h index d4b221fb52d..cbaebc68bee 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.h +++ b/tensorflow/compiler/xla/service/hlo_runner.h @@ -52,21 +52,15 @@ class HloRunner { const DebugOptions& debug_options); // Reads the proto file in xla.HloProto format, creates and returns the - // HloModule. Will try to parse the filename as binary proto, then try as - // text proto if that fails. - static StatusOr> ReadModuleFromHloProtoFile( + // HloModule. + static StatusOr> ReadModuleFromBinaryProtoFile( + const std::string& filename, const DebugOptions& debug_options); + static StatusOr> ReadModuleFromTextProtoFile( const std::string& filename, const DebugOptions& debug_options); // Reads the hlo text dump file in HloModule::ToString format, creates and // returns the HloModule. - static StatusOr> ReadModuleFromHloTextDumpFile( - const std::string& filename, const DebugOptions& debug_options); - - // Tries to parse the filename specified first as binary proto format, then - // as a textual proto format, then textual IR, then gives up if both fail. - // ReadModuleFromHloProtoFile or ReadModuleFromHloTextDumpFile should be used - // explicitly when you know the format, this if you don't. - static StatusOr> ReadModule( + static StatusOr> ReadModuleFromHloTextFile( const std::string& filename, const DebugOptions& debug_options); // Executes the given module with given literals as input and returns the diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 6e46f945e0a..04d46565466 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -687,7 +687,8 @@ StatusOr HloVerifier::Run(HloModule* module) { instructions[instruction->name()] = instruction; } - TF_RETURN_IF_ERROR(computation->Accept(shape_verifier_.get())); + std::unique_ptr shape_verifier = shape_verifier_factory_(); + TF_RETURN_IF_ERROR(computation->Accept(shape_verifier.get())); } return false; diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h index 5a1d864e03d..26d53dec1e5 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.h +++ b/tensorflow/compiler/xla/service/hlo_verifier.h @@ -106,10 +106,14 @@ class ShapeVerifier : public DfsHloVisitor { class HloVerifier : public HloPassInterface { public: // Uses standard shape inference. - explicit HloVerifier() : shape_verifier_(MakeUnique()) {} + explicit HloVerifier() + : shape_verifier_factory_([] { return MakeUnique(); }) {} + // Uses custom shape verification. 
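// Why a factory rather than a single shared ShapeVerifier: a DfsHloVisitor
// keeps per-run state, so reusing one instance across Run() calls leaks state
// between runs (see the ResetsShapeVerifierState test below). A minimal sketch
// of the idea, with made-up names:
//
//   HloVerifier verifier([] { return MakeUnique<MyShapeVerifier>(); });
//   verifier.Run(module_a);   // gets a fresh MyShapeVerifier for this run
//   verifier.Run(module_b);   // gets another fresh one; no carried-over state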
- explicit HloVerifier(std::unique_ptr shape_verifier) - : shape_verifier_(std::move(shape_verifier)) {} + explicit HloVerifier( + std::function()> shape_verifier_factory) + : shape_verifier_factory_(std::move(shape_verifier_factory)) {} + ~HloVerifier() override = default; tensorflow::StringPiece name() const override { return "verifier"; } @@ -121,8 +125,11 @@ class HloVerifier : public HloPassInterface { // CHECKs various invariants of a fusion instruction. Status CheckFusionInstruction(HloInstruction* fusion) const; - // Verifies shapes match inferred expectations. - std::unique_ptr shape_verifier_; + // Creates a ShapeVerifier that checks that shapes match inferred + // expectations. This is a factory function because ShapeVerifier, Note that + // ShapeVerifier, being a DfsHloVisitor, is stateful. We want a clean object + // for each run of the verifier. + std::function()> shape_verifier_factory_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc index 2a3b55decc5..c92db0be14d 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc @@ -97,5 +97,31 @@ TEST_F(HloVerifierTest, DifferentOperandParents) { HasSubstr("is in a different computation")); } +TEST_F(HloVerifierTest, ResetsShapeVerifierState) { + HloComputation::Builder builder(TestName()); + Shape s1 = ShapeUtil::MakeShape(F32, {1}); + Shape s2 = ShapeUtil::MakeShape(F32, {2}); + + HloInstruction* param = + builder.AddInstruction(HloInstruction::CreateParameter(0, s1, "param")); + + // Create an add instruction with the incorrect shape. + HloInstruction* add = builder.AddInstruction( + HloInstruction::CreateBinary(s2, HloOpcode::kAdd, param, param)); + + // In order to trigger the bug we're checking for, the instruction with the + // bad shape can't be the root of the computation. + builder.AddInstruction( + HloInstruction::CreateBinary(s2, HloOpcode::kMultiply, add, add)); + + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + + // Run the verifier twice. It should fail both times, because it shouldn't + // carry state in its DFS visitor between runs. + EXPECT_FALSE(verifier().Run(module.get()).status().ok()); + EXPECT_FALSE(verifier().Run(module.get()).status().ok()); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover.cc b/tensorflow/compiler/xla/service/implicit_broadcast_remover.cc new file mode 100644 index 00000000000..ada21345014 --- /dev/null +++ b/tensorflow/compiler/xla/service/implicit_broadcast_remover.cc @@ -0,0 +1,124 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/implicit_broadcast_remover.h" + +#include +#include +#include +#include +#include +#include + +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_dce.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { + +namespace { + +// Visitor for removing implicit broadcasts. +class ImplicitBroadcastVisitor : public DfsHloVisitorWithDefault { + public: + Status DefaultAction(HloInstruction* hlo_instruction) override { + return Status::OK(); + } + + Status HandleElementwiseBinary(HloInstruction* hlo) override { + return ReplaceImplicitBroadcastOperands(hlo); + } + + Status HandleClamp(HloInstruction* hlo) override { + // Clamp is the only element-wise ternary operation. + return ReplaceImplicitBroadcastOperands(hlo); + } + + // Returns whether any modification has been made to any visited instruction. + bool changed() const { return changed_; } + + private: + // Iterates through the operands of 'hlo' and replace any operands which are + // implicitly broadcast with the equivalent sequence of broadcast and reshape + // instructions. An operand is considered to be implicitly broadcast if the + // operand shape does have the same dimensions as the shape of 'hlo'. + Status ReplaceImplicitBroadcastOperands(HloInstruction* hlo) { + auto fadd = [hlo](std::unique_ptr x) { + return hlo->parent()->AddInstruction(std::move(x)); + }; + std::vector operands; + bool operands_changed = false; + for (int i = 0; i < hlo->operand_count(); ++i) { + HloInstruction* operand = hlo->mutable_operand(i); + if (!ShapeUtil::SameDimensions(hlo->shape(), operand->shape())) { + HloInstruction* new_operand = hlo->parent()->AddInstruction( + HloInstruction::CreateBroadcastSequence(hlo->shape(), operand, + fadd)); + operands.push_back(new_operand); + operands_changed = true; + } else { + operands.push_back(operand); + } + } + if (operands_changed) { + // Create a new HLO instruction because the HloInstruction::Replace* + // methods check that the shape does not change with the replacement. + HloInstruction* new_hlo = hlo->parent()->AddInstruction( + hlo->CloneWithNewOperands(hlo->shape(), operands)); + TF_RETURN_IF_ERROR(hlo->ReplaceAllUsesWith(new_hlo)); + changed_ = true; + } + return Status::OK(); + } + + bool changed_ = false; +}; + +} // namespace + +StatusOr ImplicitBroadcastRemover::Run(HloModule* module) { + VLOG(1) << "Removing implicit broadcast from module " << module->name(); + XLA_VLOG_LINES(2, + "Before removing implicit broadcasts:\n" + module->ToString()); + + ImplicitBroadcastVisitor visitor; + for (HloComputation* computation : module->computations()) { + TF_RETURN_IF_ERROR(computation->Accept(&visitor)); + } + + if (visitor.changed()) { + // HLO instructions with implicitly broadcast operands are cloned and left + // for dead. Remove them. 
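// For illustration (ours, not part of this change): the overall rewrite this
// pass performs, shown as HLO text with made-up names and shapes. An operand
// whose dimensions differ from the instruction's shape is made explicit:
//
//   before:  %add = f32[2,4] add(f32[2,4] %x, f32[] %y)   // %y implicitly broadcast
//   after:   %y.bcast = f32[2,4] broadcast(f32[] %y), dimensions={}
//            %add     = f32[2,4] add(f32[2,4] %x, f32[2,4] %y.bcast)
//
// Degenerate dimensions (e.g. f32[1,4] against f32[2,4]) get a reshape first
// and then a broadcast, which is what CreateBroadcastSequence emits above.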
+ HloDCE dce; + TF_RETURN_IF_ERROR(dce.Run(module).status()); + } + + XLA_VLOG_LINES(2, + "After removing implicit broadcasts:\n" + module->ToString()); + + return visitor.changed(); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover.h b/tensorflow/compiler/xla/service/implicit_broadcast_remover.h new file mode 100644 index 00000000000..aa325dc8a35 --- /dev/null +++ b/tensorflow/compiler/xla/service/implicit_broadcast_remover.h @@ -0,0 +1,42 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_IMPLICIT_BROADCAST_REMOVER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_IMPLICIT_BROADCAST_REMOVER_H_ + +#include + +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { + +// Pass which replaces all implicit broadcasts with their equivalent sequence of +// explicit broadcast and reshape instructions. +class ImplicitBroadcastRemover : public HloPassInterface { + public: + ImplicitBroadcastRemover() {} + ~ImplicitBroadcastRemover() override {} + + tensorflow::StringPiece name() const override { + return "implicit-broadcast-remover"; + } + + StatusOr Run(HloModule* module) override; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_IMPLICIT_BROADCAST_REMOVER_H_ diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc b/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc new file mode 100644 index 00000000000..8c7b38dd1bf --- /dev/null +++ b/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc @@ -0,0 +1,176 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/implicit_broadcast_remover.h" + +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" + +namespace op = xla::testing::opcode_matchers; + +namespace xla { +namespace { + +class ImplicitBroadcastRemoverTest : public HloVerifiedTestBase { + protected: + ImplicitBroadcastRemover remover_; +}; + +TEST_F(ImplicitBroadcastRemoverTest, NoImplicitBroadcast) { + auto builder = HloComputation::Builder(TestName()); + + const Shape shape = ShapeUtil::MakeShape(F32, {2, 4}); + auto param0 = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0")); + auto param1 = + builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1")); + builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1)); + + HloComputation* computation = module().AddEntryComputation(builder.Build()); + + EXPECT_FALSE(remover_.Run(&module()).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), + op::Add(op::Parameter(), op::Parameter())); +} + +TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcast) { + auto builder = HloComputation::Builder(TestName()); + + const Shape shape = ShapeUtil::MakeShape(F32, {2, 4}); + auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {}), "scalar_param")); + auto param1 = + builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1")); + builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kPower, param0, param1)); + + HloComputation* computation = module().AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + + EXPECT_FALSE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape())); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape())); + + EXPECT_TRUE(remover_.Run(&module()).ValueOrDie()); + root = computation->root_instruction(); + + EXPECT_THAT(root, op::Power(op::Broadcast(op::Parameter()), op::Parameter())); + + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape())); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape())); +} + +TEST_F(ImplicitBroadcastRemoverTest, DegenerateDimensionBroadcast) { + auto builder = HloComputation::Builder(TestName()); + + const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6}); + auto param0 = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0")); + auto param1 = builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {1, 4, 1}), "p1")); + builder.AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kSubtract, param0, param1)); + + HloComputation* computation = module().AddEntryComputation(builder.Build()); + + EXPECT_TRUE(remover_.Run(&module()).ValueOrDie()); + + HloInstruction* root = computation->root_instruction(); + EXPECT_THAT(root, op::Subtract(op::Parameter(), + op::Broadcast(op::Reshape(op::Parameter())))); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape())); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape())); +} + +TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcastToDegenerateDimensions) { + auto builder = HloComputation::Builder(TestName()); + + const Shape shape = 
ShapeUtil::MakeShape(F32, {1, 4, 1}); + auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {}), "scalar_param")); + auto param1 = + builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1")); + builder.AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kSubtract, param0, param1)); + + HloComputation* computation = module().AddEntryComputation(builder.Build()); + + EXPECT_TRUE(remover_.Run(&module()).ValueOrDie()); + + HloInstruction* root = computation->root_instruction(); + EXPECT_THAT(root, + op::Subtract(op::Broadcast(op::Parameter()), op::Parameter())); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape())); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape())); +} + +TEST_F(ImplicitBroadcastRemoverTest, TernaryDegenerateDimensionBroadcast) { + auto builder = HloComputation::Builder(TestName()); + + const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6, 8}); + auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {1, 4, 1, 8}), "p0")); + auto param1 = builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {1, 1, 6, 8}), "p1")); + auto param2 = builder.AddInstruction(HloInstruction::CreateParameter( + 2, ShapeUtil::MakeShape(F32, {2, 1, 6, 8}), "p2")); + builder.AddInstruction(HloInstruction::CreateTernary(shape, HloOpcode::kClamp, + param0, param1, param2)); + + HloComputation* computation = module().AddEntryComputation(builder.Build()); + + EXPECT_TRUE(remover_.Run(&module()).ValueOrDie()); + + HloInstruction* root = computation->root_instruction(); + EXPECT_THAT(root, op::Clamp(op::Broadcast(op::Reshape(op::Parameter())), + op::Broadcast(op::Reshape(op::Parameter())), + op::Broadcast(op::Reshape(op::Parameter())))); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape())); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape())); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(2)->shape())); +} + +TEST_F(ImplicitBroadcastRemoverTest, + TernaryScalarAndDegenerateDimensionBroadcast) { + auto builder = HloComputation::Builder(TestName()); + + const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6}); + auto param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0")); + auto param1 = builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {1, 4, 6}), "p1")); + auto param2 = + builder.AddInstruction(HloInstruction::CreateParameter(2, shape, "p2")); + builder.AddInstruction(HloInstruction::CreateTernary(shape, HloOpcode::kClamp, + param0, param1, param2)); + + HloComputation* computation = module().AddEntryComputation(builder.Build()); + + EXPECT_TRUE(remover_.Run(&module()).ValueOrDie()); + + HloInstruction* root = computation->root_instruction(); + EXPECT_THAT(root, op::Clamp(op::Broadcast(op::Parameter()), + op::Broadcast(op::Reshape(op::Parameter())), + op::Parameter())); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape())); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape())); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(2)->shape())); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc index dc63a2224d6..c83880e0309 100644 --- 
a/tensorflow/compiler/xla/service/interpreter/compiler.cc +++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc @@ -70,15 +70,16 @@ Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) { } StatusOr> InterpreterCompiler::RunHloPasses( - std::unique_ptr hlo_module, - se::StreamExecutor* /*stream_exec*/) { + std::unique_ptr hlo_module, se::StreamExecutor* /*stream_exec*/, + DeviceMemoryAllocator* /*device_allocator*/) { VLOG(1) << "Run hlo passes on graph " << hlo_module->name(); TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get())); return std::move(hlo_module); } StatusOr> InterpreterCompiler::RunBackend( - std::unique_ptr hlo_module, se::StreamExecutor* stream_exec) { + std::unique_ptr hlo_module, se::StreamExecutor* stream_exec, + DeviceMemoryAllocator* /*device_allocator*/) { TF_RET_CHECK(stream_exec != nullptr); VLOG(1) << "Run backend " << hlo_module->name(); @@ -96,7 +97,8 @@ StatusOr> InterpreterCompiler::RunBackend( StatusOr>> InterpreterCompiler::Compile( std::vector> /*hlo_modules*/, - std::vector> /*stream_execs*/) { + std::vector> /*stream_execs*/, + DeviceMemoryAllocator* /*device_allocator*/) { return tensorflow::errors::Unimplemented( "Compilation of multiple HLO modules is not supported on Interpreter."); } diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.h b/tensorflow/compiler/xla/service/interpreter/compiler.h index 278cf518422..c8660c04d86 100644 --- a/tensorflow/compiler/xla/service/interpreter/compiler.h +++ b/tensorflow/compiler/xla/service/interpreter/compiler.h @@ -45,16 +45,19 @@ class InterpreterCompiler : public Compiler { StatusOr> RunHloPasses( std::unique_ptr hlo_module, - perftools::gputools::StreamExecutor* stream_exec) override; + perftools::gputools::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) override; StatusOr> RunBackend( std::unique_ptr hlo_module, - perftools::gputools::StreamExecutor* stream_exec) override; + perftools::gputools::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) override; StatusOr>> Compile( std::vector> hlo_modules, std::vector> - stream_exec) override; + stream_exec, + DeviceMemoryAllocator* device_allocator) override; StatusOr>> CompileAheadOfTime(std::vector> hlo_modules, diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index bbea6bee565..5413b95cfb6 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -1236,7 +1236,8 @@ Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout, // instruction itself. 
Status SetFusionLayouts(HloInstruction* fusion) { TF_RET_CHECK(fusion->opcode() == HloOpcode::kFusion); - for (auto* fused_instruction : fusion->fused_instructions()) { + for (auto* fused_instruction : + fusion->fused_instructions_computation()->MakeInstructionPostOrder()) { if (fused_instruction->opcode() == HloOpcode::kParameter) { const HloInstruction* fusion_operand = fusion->operand(fused_instruction->parameter_number()); @@ -1251,11 +1252,22 @@ Status SetFusionLayouts(HloInstruction* fusion) { ShapeUtil::Compatible(fusion->shape(), fused_instruction->shape())); TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes( fusion->shape(), fused_instruction->mutable_shape())); - } else if (fused_instruction->opcode() != HloOpcode::kConstant && - fused_instruction->opcode() != HloOpcode::kGetTupleElement && - fused_instruction->opcode() != HloOpcode::kInfeed) { - // Internal fused instructions with the exception of constants - // and infeed need no layout. + } else if (fused_instruction->opcode() == HloOpcode::kGetTupleElement) { + // A GTE inherits its layout from its operand (which should ultimately be + // a parameter). + TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes( + fused_instruction->operand(0)->shape().tuple_shapes( + fused_instruction->tuple_index()), + fused_instruction->mutable_shape())); + } else if (fused_instruction->opcode() == HloOpcode::kConstant) { + // Give constants the layout of their literal. + TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes( + fused_instruction->literal().shape(), + fused_instruction->mutable_shape())); + } else if (fused_instruction->opcode() == HloOpcode::kInfeed) { + // Nop; leave the infeed layout alone. + } else { + // Other instructions don't have layouts inside of fusion nodes. LayoutUtil::ClearLayout(fused_instruction->mutable_shape()); } } @@ -1367,20 +1379,6 @@ Status LayoutAssignment::RunOnComputation( << ")"; VLOG(2) << " ComputationLayout = " << computation_layout.ToString(); - // Clear existing layouts of the instructions. All layouts must be assigned by - // the LayoutAssignment pass, except for Infeed, Outfeed, Parameters and the - // computation result. The latter two are specified in computation_layout, so - // we only need to keep the existing layouts for Infeed and Outfeed. Clearing - // the layouts here avoids hiding potential bugs in the layout assignment pass - // that may accidently use the existing layout. - for (HloInstruction* instruction : computation->instructions()) { - if (instruction->opcode() == HloOpcode::kInfeed || - instruction->opcode() == HloOpcode::kOutfeed) { - continue; - } - LayoutUtil::ClearLayout(instruction->mutable_shape()); - } - // Construct LayoutConstraints with all layout constraints of the computation. LayoutConstraints constraints(points_to_analysis, computation); @@ -1458,6 +1456,18 @@ StatusOr LayoutAssignment::Run(HloModule* module) { // is handled before its caller computation. This ensures that the layout of // all callers of a computation will agree. for (auto* computation : module->MakeComputationPostOrder()) { + // Clear existing layouts of the instructions. All layouts must be assigned + // by the LayoutAssignment pass, except for those on infeeds, parameters, + // and the computation result. The latter two are specified in + // computation_layout, so we only need to keep the existing layouts for + // infeeds. Clearing the layouts here avoids hiding potential bugs in the + // layout assignment pass that may accidently use the existing layout. 
+ for (HloInstruction* instruction : computation->instructions()) { + if (instruction->opcode() != HloOpcode::kInfeed) { + LayoutUtil::ClearLayout(instruction->mutable_shape()); + } + } + if (computation == module->entry_computation()) { TF_RETURN_IF_ERROR(RunOnComputation( *entry_computation_layout_, *points_to_analysis, diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc index d51c0d1dfb7..e269a13459f 100644 --- a/tensorflow/compiler/xla/service/layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc @@ -35,9 +35,11 @@ limitations under the License. #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/test_utils.h" +#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/gtl/array_slice.h" namespace op = xla::testing::opcode_matchers; @@ -587,5 +589,74 @@ TEST_F(LayoutAssignmentTest, TransposeToBitcastToUser) { EXPECT_TRUE(ShapeUtil::TransposeIsBitcast(transpose->operand(0)->shape(), transpose->shape(), {2, 3, 0, 1})); } + +// A GTE inside of a fusion node inherits the layout of its operand (which +// should, if we keep following operands, eventually be a parameter). +TEST_F(LayoutAssignmentTest, GTEInheritsLayoutFromOperand) { + const char* module_str = R"( + HloModule test_module + + fused_computation { + fparam = (f32[2,2,2], (f32[2,2,2], f32[2,2,2])) parameter(0) + gte0 = f32[2,2,2] get-tuple-element(fparam), index=0 + gte1 = (f32[2,2,2], f32[2,2,2]) get-tuple-element(fparam), index=1 + gte1a = f32[2,2,2] get-tuple-element(gte1), index=0 + gte1b = f32[2,2,2] get-tuple-element(gte1), index=1 + add = f32[2,2,2] add(gte1a, gte1b) + ROOT fresult = f32[2,2,2] add(gte0, add) + } + + ENTRY entry_computation { + param = (f32[2,2,2], (f32[2,2,2], f32[2,2,2])) parameter(0) + ROOT fusion = + f32[2,2,2] fusion(param), kind=kLoop, calls=fused_computation + } + )"; + + auto module = tools::Parse(module_str).ValueOrDie(); + ComputationLayout computation_layout( + module->entry_computation()->ComputeProgramShape()); + Shape param_shape = ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {0, 1, 2}), + ShapeUtil::MakeTupleShape({ + ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {1, 2, 0}), + ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {2, 0, 1}), + })}); + TF_ASSERT_OK( + computation_layout.mutable_parameter_layout(0)->CopyLayoutFromShape( + param_shape)); + computation_layout.mutable_result_layout()->ResetLayout( + LayoutUtil::MakeLayout({2, 1, 0})); + AssignLayouts(module.get(), &computation_layout); + + HloComputation* fused_computation = *std::find_if( + module->computations().begin(), module->computations().end(), + [](const HloComputation* c) { return c->name() == "fused_computation"; }); + + auto fused_instr = [&](const string& name) { + auto it = std::find_if( + fused_computation->instructions().begin(), + fused_computation->instructions().end(), + [&](const HloInstruction* i) { return i->name() == name; }); + CHECK(it != fused_computation->instructions().end()); + return *it; + }; + + EXPECT_THAT(fused_instr("gte0")->shape().layout().minor_to_major(), + ElementsAre(0, 1, 2)); + EXPECT_THAT( + 
fused_instr("gte1")->shape().tuple_shapes(0).layout().minor_to_major(), + ElementsAre(1, 2, 0)); + EXPECT_THAT( + fused_instr("gte1")->shape().tuple_shapes(1).layout().minor_to_major(), + ElementsAre(2, 0, 1)); + EXPECT_THAT(fused_instr("gte1a")->shape().layout().minor_to_major(), + ElementsAre(1, 2, 0)); + EXPECT_THAT(fused_instr("gte1b")->shape().layout().minor_to_major(), + ElementsAre(2, 0, 1)); + EXPECT_THAT(fused_instr("fresult")->shape().layout().minor_to_major(), + ElementsAre(2, 1, 0)); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_compiler.cc b/tensorflow/compiler/xla/service/llvm_compiler.cc index 34f3419269a..f98fc0400a7 100644 --- a/tensorflow/compiler/xla/service/llvm_compiler.cc +++ b/tensorflow/compiler/xla/service/llvm_compiler.cc @@ -18,8 +18,8 @@ limitations under the License. namespace xla { StatusOr>> LLVMCompiler::Compile( std::vector> modules, - std::vector> - stream_execs) { + std::vector> stream_execs, + DeviceMemoryAllocator* device_allocator) { std::vector> result; for (size_t i = 0; i < modules.size(); i++) { if (stream_execs[i].size() != 1) { @@ -27,10 +27,12 @@ StatusOr>> LLVMCompiler::Compile( "Model partitioning not implemented for the CPU/GPU compilers!"); } - TF_ASSIGN_OR_RETURN( - modules[i], RunHloPasses(std::move(modules[i]), stream_execs[i][0])); + TF_ASSIGN_OR_RETURN(modules[i], + RunHloPasses(std::move(modules[i]), stream_execs[i][0], + device_allocator)); TF_ASSIGN_OR_RETURN(std::unique_ptr executable, - RunBackend(std::move(modules[i]), stream_execs[i][0])); + RunBackend(std::move(modules[i]), stream_execs[i][0], + device_allocator)); result.push_back(std::move(executable)); } diff --git a/tensorflow/compiler/xla/service/llvm_compiler.h b/tensorflow/compiler/xla/service/llvm_compiler.h index c5393cef4f9..d74e81bb7f6 100644 --- a/tensorflow/compiler/xla/service/llvm_compiler.h +++ b/tensorflow/compiler/xla/service/llvm_compiler.h @@ -60,17 +60,20 @@ class LLVMCompiler : public Compiler { // Bring in // StatusOr> RunBackend( // std::unique_ptr module, - // perftools::gputools::StreamExecutor* stream_exec) + // perftools::gputools::StreamExecutor* stream_exec, + // DeviceMemoryAllocator* device_allocator) // StatusOr> RunHloPasses( // std::unique_ptr module, - // perftools::gputools::StreamExecutor* stream_exec) + // perftools::gputools::StreamExecutor* stream_exec, + // DeviceMemoryAllocator* device_allocator) using Compiler::RunBackend; using Compiler::RunHloPasses; StatusOr>> Compile( std::vector> modules, std::vector> - stream_execs) override; + stream_execs, + DeviceMemoryAllocator* device_allocator) override; protected: ModuleHook user_pre_optimization_hook_; diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h index 9ad7cd82cb8..242062e6167 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h +++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h @@ -34,6 +34,12 @@ namespace xla { // Unlike IrEmitter, this creates host functions which emit IR to generate the // output element at the given index. It is used to generate fused operations. +// +// This class handles both vanilla fusion and multi-output fusion. In the MOF +// case, the fusion node ends with a kTuple instruction, and the root generator +// returned by this emitter returns an LLVM struct with N elements, one for each +// element of the arrays in the tuple. It follows that the arrays in the tuple +// must have the same length. 
class FusedIrEmitter : public DfsHloVisitorWithDefault { public: using Generator = llvm_ir::ElementGenerator; diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc index a5f7c850c33..b6b918ec78a 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc @@ -51,37 +51,40 @@ LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator, shape_(target_array.GetShape()), ir_builder_(ir_builder) {} +static LoopEmitter::BodyEmitter MakeBodyEmitterForMultiOutputFusion( + const ElementGenerator& target_element_generator, + const std::vector& target_arrays, llvm::IRBuilder<>* ir_builder) { + return [=](const llvm_ir::IrArray::Index array_index) { + TF_ASSIGN_OR_RETURN(llvm::Value * target_element, + target_element_generator(array_index)); + CHECK(target_element->getType()->isStructTy()) + << "This BodyEmitter is for multi-output fusion, but target element " + "generator does not produce values of struct type."; + CHECK_EQ(target_element->getType()->getStructNumElements(), + target_arrays.size()); + + for (int64 i = 0; i < target_arrays.size(); ++i) { + target_arrays[i].EmitWriteArrayElement( + array_index, ir_builder->CreateExtractValue(target_element, i), + ir_builder); + } + return Status::OK(); + }; +} + LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator, tensorflow::gtl::ArraySlice target_arrays, llvm::IRBuilder<>* ir_builder) - : body_emitter_([=](const llvm_ir::IrArray::Index array_index) - -> ::tensorflow::Status { - // Convert target_element_generator to a BodyEmitter. - TF_ASSIGN_OR_RETURN(llvm::Value * target_element, - target_element_generator(array_index)); - if (target_arrays.size() == 1) { - target_arrays[0].EmitWriteArrayElement(array_index, target_element, - ir_builder); - return tensorflow::Status::OK(); - } - - for (int64 i = 0; i < target_arrays.size(); ++i) { - target_arrays[i].EmitWriteArrayElement( - array_index, ir_builder_->CreateExtractValue(target_element, i), - ir_builder); - } - return tensorflow::Status::OK(); - }), + : body_emitter_(MakeBodyEmitterForMultiOutputFusion( + target_element_generator, + std::vector(target_arrays.begin(), target_arrays.end()), + ir_builder)), + shape_(target_arrays[0].GetShape()), ir_builder_(ir_builder) { - if (target_arrays.size() > 1) { - // The sanity check for multiple outputs. - shape_ = target_arrays[0].GetShape(); - for (int64 i = 1; i < target_arrays.size(); ++i) { - const Shape& element_shape = target_arrays[i].GetShape(); - CHECK(ShapeUtil::SameDimensions(shape_, element_shape)); - } - } else { - shape_ = target_arrays[0].GetShape(); + // Sanity check: In multi-output fusion, all shapes produced must have the + // same dimensions. + for (const IrArray& array : target_arrays) { + CHECK(ShapeUtil::SameDimensions(shape_, array.GetShape())); } } diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h index 1ef1dc24644..0fc528439a0 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h +++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h @@ -47,10 +47,16 @@ class LoopEmitter { // element of the given target array. LoopEmitter(const ElementGenerator& target_element_generator, const IrArray& target_array, llvm::IRBuilder<>* ir_builder); - // Same as previous method except emits multiple targets in an array. 
+ + // Constructs a LoopEmitter that emits one element into each of N separate + // arrays on each iteration of the loop. + // + // This is used for multi-output fusion. target_element_generator must + // produce an LLVM struct with N elements. LoopEmitter(const ElementGenerator& target_element_generator, tensorflow::gtl::ArraySlice target_arrays, llvm::IRBuilder<>* ir_builder); + LoopEmitter(const LoopEmitter&) = delete; LoopEmitter& operator=(const LoopEmitter&) = delete; virtual ~LoopEmitter() = default; diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc index f30530db08e..07f989d4fae 100644 --- a/tensorflow/compiler/xla/service/local_service.cc +++ b/tensorflow/compiler/xla/service/local_service.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "tensorflow/compiler/xla/client/executable_build_options.h" #include "tensorflow/compiler/xla/execution_options_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/backend.h" @@ -71,7 +72,7 @@ LocalService::LocalService(const ServiceOptions& options, StatusOr> LocalService::CompileExecutable( const ComputationHandle& computation, const tensorflow::gtl::ArraySlice argument_layouts, - const Shape* result_layout, int device_ordinal) { + const ExecutableBuildOptions& build_options) { TF_ASSIGN_OR_RETURN(UserComputation * user_computation, computation_tracker_.Resolve(computation)); VersionedComputationHandle versioned_handle = @@ -112,14 +113,19 @@ StatusOr> LocalService::CompileExecutable( ShapeUtil::HumanString(argument_shape).c_str()); } } - if (result_layout != nullptr) { - TF_RETURN_IF_ERROR( - ValidateResultShapeWithLayout(*result_layout, program_shape->result())); + if (build_options.result_layout() != nullptr) { + TF_RETURN_IF_ERROR(ValidateResultShapeWithLayout( + *build_options.result_layout(), program_shape->result())); } ExecutionOptions execution_options = CreateDefaultExecutionOptions(); - if (result_layout != nullptr) { - *execution_options.mutable_shape_with_output_layout() = *result_layout; + if (build_options.generate_hlo_graph().has_value()) { + execution_options.mutable_debug_options()->set_xla_generate_hlo_graph( + build_options.generate_hlo_graph().value()); + } + if (build_options.result_layout() != nullptr) { + *execution_options.mutable_shape_with_output_layout() = + *build_options.result_layout(); } else { *execution_options.mutable_shape_with_output_layout() = program_shape->result(); @@ -131,11 +137,13 @@ StatusOr> LocalService::CompileExecutable( CreateModuleConfig(*program_shape, argument_layouts, &execution_options, *user_computation)); - TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor, - execute_backend_->stream_executor(device_ordinal)); + TF_ASSIGN_OR_RETURN( + se::StreamExecutor * executor, + execute_backend_->stream_executor(build_options.device_ordinal())); return BuildExecutable(versioned_handle, std::move(module_config), - execute_backend_.get(), executor); + execute_backend_.get(), executor, + build_options.device_allocator()); } StatusOr LocalService::ReplicaNumberToDeviceOrdinal(int replica_number) { diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h index acbc7268252..15e120685e1 100644 --- a/tensorflow/compiler/xla/service/local_service.h +++ b/tensorflow/compiler/xla/service/local_service.h @@ -18,6 +18,7 @@ limitations under the License. 
#include +#include "tensorflow/compiler/xla/client/executable_build_options.h" #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" @@ -41,11 +42,13 @@ class LocalService : public Service { // Builds an Executable with the given argument layouts and options. If // result_layout is non-null, then the executable is compiled to produce a - // result of the given layout. + // result of the given layout. If device_allocator is non-null, then the + // compiler may use it to allocate temp space on the device. The compiler is + // responsible for freeing any memory it allocates this way. StatusOr> CompileExecutable( const ComputationHandle& computation, const tensorflow::gtl::ArraySlice argument_layouts, - const Shape* result_layout, int device_ordinal); + const ExecutableBuildOptions& options); // Returns the device ordinal that corresponds to the given replica number. // diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index 849df1d8e63..98dfc89867a 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -337,7 +337,8 @@ StatusOr>> Service::BuildExecutables( std::vector versioned_handles, std::vector> module_configs, Backend* backend, - std::vector> executors) { + std::vector> executors, + DeviceMemoryAllocator* device_allocator) { VLOG(1) << Printf("BuildExecutable on service %p", this); // Dump computation proto state if flag is set. @@ -383,7 +384,8 @@ StatusOr>> Service::BuildExecutables( TF_ASSIGN_OR_RETURN( std::vector> executables, - backend->compiler()->Compile(std::move(modules), std::move(executors))); + backend->compiler()->Compile(std::move(modules), std::move(executors), + device_allocator)); for (size_t i = 0; i < versioned_handles.size(); ++i) { if (!module_configs[i]->debug_options().xla_dump_executions_to().empty()) { @@ -396,8 +398,8 @@ StatusOr>> Service::BuildExecutables( StatusOr> Service::BuildExecutable( const VersionedComputationHandle& versioned_handle, - std::unique_ptr module_config, - Backend* backend, se::StreamExecutor* executor) { + std::unique_ptr module_config, Backend* backend, + se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator) { VLOG(1) << Printf("BuildExecutable on service %p with handle %s", this, versioned_handle.ToString().c_str()); @@ -430,11 +432,12 @@ StatusOr> Service::BuildExecutable( TF_RETURN_IF_ERROR(MaybeDumpHloModule(*module)); TF_ASSIGN_OR_RETURN( - module, backend->compiler()->RunHloPasses(std::move(module), executor)); + module, backend->compiler()->RunHloPasses(std::move(module), executor, + device_allocator)); - TF_ASSIGN_OR_RETURN( - std::unique_ptr executable, - backend->compiler()->RunBackend(std::move(module), executor)); + TF_ASSIGN_OR_RETURN(std::unique_ptr executable, + backend->compiler()->RunBackend( + std::move(module), executor, device_allocator)); if (!other_directory_path.empty()) { executable->set_session_module(std::move(session_module)); @@ -445,9 +448,9 @@ StatusOr> Service::BuildExecutable( StatusOr> Service::BuildAndCacheExecutable( const VersionedComputationHandle& versioned_handle, - std::unique_ptr module_config, - Backend* backend, perftools::gputools::StreamExecutor* executor, - ExecutionProfile* profile) { + std::unique_ptr module_config, Backend* backend, + perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile, + DeviceMemoryAllocator* device_allocator) 
{ std::shared_ptr executable = compilation_cache_.LookUp(versioned_handle, *module_config); @@ -469,7 +472,7 @@ StatusOr> Service::BuildAndCacheExecutable( TF_ASSIGN_OR_RETURN( std::unique_ptr executable_unique_ptr, BuildExecutable(versioned_handle, std::move(module_config), backend, - executor)); + executor, device_allocator)); if (profile != nullptr) { uint64 end_micros = tensorflow::Env::Default()->NowMicros(); @@ -771,10 +774,14 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg, // Build the user computations into HloModules and compile to generate the // executables. + // + // TODO(jlebar): There's currently no way to pass a device allocator to + // ExecuteParallel, so we have to pass a null device_allocator below. TF_ASSIGN_OR_RETURN( std::vector> executables, BuildExecutables(versioned_handles, std::move(module_configs), - execute_backend_.get(), all_executors)); + execute_backend_.get(), all_executors, + /*device_allocator=*/nullptr)); std::vector executable_ptrs; executable_ptrs.reserve(executables.size()); for (const auto& executable : executables) { @@ -1446,9 +1453,9 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) { handle_status = computation->AddInfeedInstruction(arg->infeed_request()); break; case OpRequest::kOutfeedRequest: - TF_RETURN_IF_ERROR( - computation->AddOutfeedInstruction(arg->outfeed_request())); - return tensorflow::Status::OK(); + handle_status = + computation->AddOutfeedInstruction(arg->outfeed_request()); + break; case OpRequest::kMapRequest: { TF_ASSIGN_OR_RETURN( UserComputation * to_apply, @@ -1612,14 +1619,14 @@ StatusOr> Service::Replicas( } Status Service::MaybeDumpHloModule(const HloModule& module) const { - const string xla_dump_prepass_hlo_proto_to = - module.config().debug_options().xla_dump_prepass_hlo_proto_to(); - if (xla_dump_prepass_hlo_proto_to.empty()) { + const string xla_dump_unoptimized_hlo_proto_to = + module.config().debug_options().xla_dump_unoptimized_hlo_proto_to(); + if (xla_dump_unoptimized_hlo_proto_to.empty()) { return Status::OK(); } HloProto proto = MakeHloProto(module); return protobuf_util::DumpProtoToDirectory( - proto, xla_dump_prepass_hlo_proto_to, module.name()); + proto, xla_dump_unoptimized_hlo_proto_to, module.name()); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h index ca77e8fe3a6..6ce24197115 100644 --- a/tensorflow/compiler/xla/service/service.h +++ b/tensorflow/compiler/xla/service/service.h @@ -280,10 +280,15 @@ class Service : public ServiceInterface { const UserComputation& user_computation); // Builds an Executable for the given parameters. + // + // If device_allocator is not null, the compiler may use it to allocate temp + // buffers, which the compiler is responsible for freeing. The allocator + // given here need not match the allocator used when running the executable. StatusOr> BuildExecutable( const VersionedComputationHandle& versioned_handle, - std::unique_ptr module_config, - Backend* backend, perftools::gputools::StreamExecutor* executor); + std::unique_ptr module_config, Backend* backend, + perftools::gputools::StreamExecutor* executor, + DeviceMemoryAllocator* device_allocator = nullptr); // Same as BuildExecutable() above, but builds a list of Executables for the // given computations that may interact with each other. 
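// An illustrative sketch, not part of the patch: the comment added above
// spells out the device_allocator contract -- if non-null, the compiler may
// borrow it for compile-time scratch buffers and must free everything it
// allocated before returning, and the allocator used at run time may be a
// different one. A self-contained restatement of that contract
// (ScratchAllocator and CompileStep are invented names, not XLA's
// DeviceMemoryAllocator API):

#include <cstddef>
#include <cstdlib>

class ScratchAllocator {
 public:
  void* Allocate(std::size_t bytes) { return std::malloc(bytes); }
  void Deallocate(void* ptr) { std::free(ptr); }
};

void CompileStep(ScratchAllocator* device_allocator /* may be null */) {
  if (device_allocator == nullptr) {
    // Call sites without an allocator pass nullptr (as the updated tests do)
    // and the compiler takes a path that needs no device scratch memory.
    return;
  }
  void* scratch = device_allocator->Allocate(1 << 20);
  // ... e.g. measure or autotune using `scratch` during compilation ...
  device_allocator->Deallocate(scratch);  // the compiler frees what it allocated
}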
@@ -291,16 +296,17 @@ class Service : public ServiceInterface { std::vector versioned_handles, std::vector> module_configs, Backend* backend, - std::vector> executors); + std::vector> executors, + DeviceMemoryAllocator* device_allocator); // Similar to BuildExecutable, but look in the compilation cache for the // executable first. If the executable is not in the cache, it is built and // inserted into the cache. StatusOr> BuildAndCacheExecutable( const VersionedComputationHandle& versioned_handle, - std::unique_ptr module_config, - Backend* backend, perftools::gputools::StreamExecutor* executor, - ExecutionProfile* profile); + std::unique_ptr module_config, Backend* backend, + perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile, + DeviceMemoryAllocator* device_allocator = nullptr); // Runs the given executable with the given arguments and register the result // in the allocation tracker. The handle of the result from the tracker is diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index a6d6c8b27f8..4ba6da6ccc4 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -37,6 +37,9 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" +using tensorflow::str_util::Join; +using tensorflow::strings::Printf; + namespace xla { namespace { @@ -934,7 +937,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( "inferring shape for <%s>(%s, %s) with broadcast_dimensions={%s}", BinaryOperation_Name(operation).c_str(), ShapeUtil::HumanString(lhs).c_str(), ShapeUtil::HumanString(rhs).c_str(), - tensorflow::str_util::Join(broadcast_dimensions, ", ").c_str()); + Join(broadcast_dimensions, ", ").c_str()); TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs)); TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs)); @@ -1097,7 +1100,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "Map operation requires all operands to have the same shape; got: " "%s", - tensorflow::str_util::Join(pieces, ", ").c_str()); + Join(pieces, ", ").c_str()); } // Check that dimensions.size == arg_shape.dimensions_size() (we currently @@ -1114,7 +1117,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( if (dimensions[i] != i) { return InvalidArgument( "Map requires monotonically increasing dimension numbers, found: %s ", - tensorflow::str_util::Join(dimensions, ", ").c_str()); + Join(dimensions, ", ").c_str()); } } @@ -1914,21 +1917,28 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( const Shape& arg, tensorflow::gtl::ArraySlice starts, tensorflow::gtl::ArraySlice limits, tensorflow::gtl::ArraySlice strides) { + auto error = [&](const string& message) { + return InvalidArgument( + "%s in slice operation; argument shape: %s; starts: {%s}; limits: " + "{%s}; strides: {%s}", + message.c_str(), ShapeUtil::HumanString(arg).c_str(), + Join(starts, ",").c_str(), Join(limits, ",").c_str(), + Join(strides, ",").c_str()); + }; TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(arg, "operand of slice")); VLOG(2) << tensorflow::strings::Printf( "slicing shape %s starts={%s} limits={%s}", - ShapeUtil::HumanString(arg).c_str(), - tensorflow::str_util::Join(starts, ", ").c_str(), - tensorflow::str_util::Join(limits, ", ").c_str()); + ShapeUtil::HumanString(arg).c_str(), Join(starts, ", ").c_str(), + Join(limits, ", ").c_str()); if (starts.size() != 
limits.size()) { - return InvalidArgument("slice start and limit sizes differ: %zu vs %zu", - starts.size(), limits.size()); + return error(Printf("slice start and limit sizes differ: %zu vs %zu", + starts.size(), limits.size())); } if (starts.size() != strides.size()) { - return InvalidArgument("slice start and strides sizes differ: %zu vs %zu", - starts.size(), strides.size()); + return error(Printf("slice start and strides sizes differ: %zu vs %zu", + starts.size(), strides.size())); } if (starts.size() != ShapeUtil::Rank(arg)) { @@ -1947,20 +1957,20 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( start_index); } if (limit_index > arg.dimensions(dimension)) { - return InvalidArgument( - "limit index (%lld) must be less than or equal to dimension " - "size (%lld)", - limit_index, arg.dimensions(dimension)); + return error( + Printf("limit index (%lld) must be less than or equal to dimension " + "size (%lld)", + limit_index, arg.dimensions(dimension))); } VLOG(2) << tensorflow::strings::Printf("starts[%lld] = %lld", dimension, start_index); VLOG(2) << tensorflow::strings::Printf("limits[%lld] = %lld", dimension, limit_index); if (start_index > limit_index) { - return InvalidArgument( - "limit index (%lld) must be greater or equal to " - "start index (%lld) in slice with positive stride", - limit_index, start_index); + return error( + Printf("limit index (%lld) must be greater or equal to " + "start index (%lld) in slice with positive stride", + limit_index, start_index)); } if (stride <= 0) { return InvalidArgument("stride (%lld) must be positive", stride); @@ -1983,7 +1993,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( "slicing shape %s at dynamic start_indices %s with slice_sizes={%s}", ShapeUtil::HumanString(operand_shape).c_str(), ShapeUtil::HumanString(start_indices_shape).c_str(), - tensorflow::str_util::Join(slice_sizes, ", ").c_str()); + Join(slice_sizes, ", ").c_str()); if (ShapeUtil::Rank(start_indices_shape) != 1) { return InvalidArgument( @@ -2280,8 +2290,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "Reshape dimensions [%s] are not a permutation of the operand " "dimensions (operand shape is %s).", - tensorflow::str_util::Join(dimensions, ",").c_str(), - ShapeUtil::HumanString(operand).c_str()); + Join(dimensions, ",").c_str(), ShapeUtil::HumanString(operand).c_str()); } return inferred_shape; @@ -2373,8 +2382,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( // The applied function's arity equals the number of arguments. 
if (arg_shapes.size() != to_apply.parameters_size()) { string computation_signature = ShapeUtil::HumanString(to_apply); - string argument_shapes = tensorflow::str_util::Join( - arg_shapes, ", ", [](string* out, const Shape* shape) { + string argument_shapes = + Join(arg_shapes, ", ", [](string* out, const Shape* shape) { tensorflow::strings::StrAppend(out, ShapeUtil::HumanString(*shape)); }); return InvalidArgument( diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc index 99d87f3b550..026c0211657 100644 --- a/tensorflow/compiler/xla/service/shape_inference_test.cc +++ b/tensorflow/compiler/xla/service/shape_inference_test.cc @@ -1512,5 +1512,20 @@ TEST_F(ShapeInferenceTest, Conditional) { "must have the same shape")); } +TEST_F(ShapeInferenceTest, BadSlice) { + auto arg = ShapeUtil::MakeShape(F32, {4}); + StatusOr statusor = + ShapeInference::InferSliceShape(arg, {0}, {5}, {1}); + ASSERT_FALSE(statusor.ok()); + + LOG(INFO) << statusor.status(); + + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("less than or equal to dimension size")) + << statusor.status(); + EXPECT_THAT(statusor.status().error_message(), HasSubstr("argument shape")) + << statusor.status(); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc index 2ea6507900e..ef9c80b0431 100644 --- a/tensorflow/compiler/xla/service/user_computation.cc +++ b/tensorflow/compiler/xla/service/user_computation.cc @@ -1185,7 +1185,7 @@ StatusOr UserComputation::AddInfeedInstruction( return handle; } -Status UserComputation::AddOutfeedInstruction( +StatusOr UserComputation::AddOutfeedInstruction( const OutfeedRequest& outfeed_request) { tensorflow::mutex_lock lock(mutex_); @@ -1197,8 +1197,6 @@ Status UserComputation::AddOutfeedInstruction( // Verify that operand is valid. TF_RETURN_IF_ERROR(LookUpRequest(outfeed_request.operand()).status()); - // No handle is returned, but a handle must be assigned to this instruction - // for computation versioning. ComputationDataHandle handle = CreateComputationDataHandle(); OperationRequest& request = (*session_computation_.mutable_requests())[handle.handle()]; @@ -1209,7 +1207,7 @@ Status UserComputation::AddOutfeedInstruction( VLOG(1) << "AddOutfeedInstruction (" << GetVersionedHandleInternal() << "), data handle " << handle.handle() << ": " << outfeed_request.ShortDebugString(); - return Status::OK(); + return handle; } StatusOr UserComputation::AddCallInstruction( diff --git a/tensorflow/compiler/xla/service/user_computation.h b/tensorflow/compiler/xla/service/user_computation.h index 4f92e58877a..54bb24d6d7f 100644 --- a/tensorflow/compiler/xla/service/user_computation.h +++ b/tensorflow/compiler/xla/service/user_computation.h @@ -146,7 +146,8 @@ class UserComputation { const InfeedRequest& infeed_request); // Enqueues an outfeed instruction onto this user computation. - Status AddOutfeedInstruction(const OutfeedRequest& outfeed_request); + StatusOr AddOutfeedInstruction( + const OutfeedRequest& outfeed_request); // Enqueues a call instruction onto this user computation. 
StatusOr AddCallInstruction( diff --git a/tensorflow/compiler/xla/service/user_computation_test.cc b/tensorflow/compiler/xla/service/user_computation_test.cc index ca02115863e..2fa163953f6 100644 --- a/tensorflow/compiler/xla/service/user_computation_test.cc +++ b/tensorflow/compiler/xla/service/user_computation_test.cc @@ -67,7 +67,8 @@ TEST_F(UserComputationTest, SimpleComputation) { *outfeed_request.mutable_operand() = constant_handle; *outfeed_request.mutable_shape() = kVectorShape; outfeed_request.set_outfeed_config("abc"); - TF_ASSERT_OK(computation.AddOutfeedInstruction(outfeed_request)); + TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle outfeed_handle, + computation.AddOutfeedInstruction(outfeed_request)); auto hlo_resolver = [](const VersionedComputationHandle& handle) { return nullptr; diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index cba73322fa9..d63e16ce2bf 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -475,8 +475,6 @@ StatusOr StringToPrimitiveType(const string& name) { if (LayoutUtil::HasLayout(shape)) { tensorflow::strings::StrAppend(&result, LayoutUtil::HumanString(shape.layout())); - } else { - tensorflow::strings::StrAppend(&result, "{no layout}"); } } return result; diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index 4410647f848..d4820d1b6d6 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -578,6 +578,7 @@ xla_test( xla_test( name = "reduce_precision_test", srcs = ["reduce_precision_test.cc"], + tags = ["enable_for_xla_interpreter"], deps = [ "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:literal_util", diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc index 56fc21d019b..52e14a1f7b9 100644 --- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc +++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc @@ -1893,6 +1893,26 @@ XLA_TEST_F(ArrayElementwiseOpTest, ClampF32ScalarVector) { error_spec_); } +XLA_TEST_F(ArrayElementwiseOpTest, ClampS32Vector) { + ComputationBuilder builder(client_, TestName()); + auto min_vector = builder.ConstantR1({1, -6, 1, 2, 0, -5}); + auto arg_vector = builder.ConstantR1({2, 10, -5, 1, 4, 10}); + auto max_vector = builder.ConstantR1({3, 0, 25, 5, 123, -1}); + auto clamp = builder.Clamp(min_vector, arg_vector, max_vector); + + ComputeAndCompareR1(&builder, {2, 0, 1, 2, 4, -1}, {}); +} + +XLA_TEST_F(ArrayElementwiseOpTest, ClampU32Vector) { + ComputationBuilder builder(client_, TestName()); + auto min_vector = builder.ConstantR1({1, 2, 1, 2, 0, ~0u - 4}); + auto arg_vector = builder.ConstantR1({2, 10, 5, 1, 4, 10}); + auto max_vector = builder.ConstantR1({3, 5, 25, 5, 123, ~0u}); + auto clamp = builder.Clamp(min_vector, arg_vector, max_vector); + + ComputeAndCompareR1(&builder, {2, 5, 5, 2, 4, ~0u - 4}, {}); +} + XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersF32s) { ComputationBuilder builder(client_, TestName()); diff --git a/tensorflow/compiler/xla/tests/codegen_test_base.cc b/tensorflow/compiler/xla/tests/codegen_test_base.cc index e472408dcf7..022641394f1 100644 --- a/tensorflow/compiler/xla/tests/codegen_test_base.cc +++ b/tensorflow/compiler/xla/tests/codegen_test_base.cc @@ -21,9 +21,11 @@ StatusOr> CodegenTestBase::CompileToExecutable( std::unique_ptr hlo_module) { TF_ASSIGN_OR_RETURN(hlo_module, backend().compiler()->RunHloPasses( 
std::move(hlo_module), - backend().default_stream_executor())); + backend().default_stream_executor(), + /*device_allocator=*/nullptr)); return backend().compiler()->RunBackend(std::move(hlo_module), - backend().default_stream_executor()); + backend().default_stream_executor(), + /*device_allocator=*/nullptr); } StatusOr> diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc index 7c1a993b478..9f5806c5e16 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.cc +++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc @@ -230,7 +230,7 @@ template const string& filename, const tensorflow::gtl::optional& error, const std::function& reference_preprocessor) { auto module_or_status = - HloRunner::ReadModule(filename, GetDebugOptionsForTest()); + HloRunner::ReadModuleFromHloTextFile(filename, GetDebugOptionsForTest()); if (!module_or_status.ok()) { return ::testing::AssertionFailure() << "failed reading hlo module from file"; @@ -258,7 +258,7 @@ template const string& filename, const tensorflow::gtl::optional& error, const std::function& reference_preprocessor) { auto module_or_status = - HloRunner::ReadModule(filename, GetDebugOptionsForTest()); + HloRunner::ReadModuleFromHloTextFile(filename, GetDebugOptionsForTest()); if (!module_or_status.ok()) { return ::testing::AssertionFailure() << "failed reading hlo module from file"; diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc index f8205de702f..474d2547aeb 100644 --- a/tensorflow/compiler/xla/tests/literal_test_util.cc +++ b/tensorflow/compiler/xla/tests/literal_test_util.cc @@ -355,9 +355,9 @@ class NearComparator { // temporary files on failure. Returns true if literals match. bool ExpectNear(const Literal& expected, const Literal& actual) { VLOG(1) << "expected:"; - XLA_VLOG_LINES(1, expected.ToString()); + XLA_VLOG_LINES(1, TruncateHugeLiteral(expected)); VLOG(1) << "actual:"; - XLA_VLOG_LINES(1, actual.ToString()); + XLA_VLOG_LINES(1, TruncateHugeLiteral(actual)); // If the shapes mismatch, we simply fail the expectation instead of // printing out data, as it's a type error rather than a value error. 
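// An illustrative sketch, not part of the patch: in the NearComparator hunks
// above and below, a value pair is a miscompare only when its NaN-ness
// differs or it violates BOTH the absolute and the relative bound of the
// ErrorSpec. A self-contained restatement of that predicate (ErrorSpecSketch
// and ValuesNearSketch are invented names):

#include <cmath>

struct ErrorSpecSketch {
  float abs;  // absolute error bound
  float rel;  // relative error bound
};

bool ValuesNearSketch(float expected, float actual,
                      const ErrorSpecSketch& spec) {
  const float abs_diff = std::fabs(actual - expected);
  const float rel_err = abs_diff / std::fabs(expected);
  const bool nan_mismatch = std::isnan(expected) != std::isnan(actual);
  return !(nan_mismatch || (abs_diff >= spec.abs && rel_err >= spec.rel));
}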
@@ -376,7 +376,12 @@ class NearComparator { abs_expected_miscompare_sum_ = 0.0; max_rel_err_ = 0.0; max_abs_err_ = 0.0; + first_linear_index_ = -1; + last_linear_index_ = -1; + max_rel_linear_index_ = -1; + max_abs_linear_index_ = -1; miscompares_ = Literal(ShapeUtil::ChangeElementType(actual.shape(), PRED)); + miscompares_.PopulateWithValue(false); multi_index_.resize(expected.shape().dimensions_size(), 0); switch (expected.shape().element_type()) { @@ -404,21 +409,33 @@ class NearComparator { if (num_miscompares_ > 0) { if (!VLOG_IS_ON(1)) { LOG(INFO) << "expected: " << ShapeUtil::HumanString(expected.shape()) - << " " << expected.ToString(); + << " " << TruncateHugeLiteral(expected); LOG(INFO) << "actual: " << ShapeUtil::HumanString(actual.shape()) - << " " << actual.ToString(); + << " " << TruncateHugeLiteral(actual); + LOG(INFO) << "Dumping literals to temp files..."; + WriteLiteralToTempFile(expected, "expected"); + WriteLiteralToTempFile(actual, "actual"); + WriteLiteralToTempFile(miscompares_, "miscompares"); } EXPECT_TRUE(num_miscompares_ == 0) << "\nmax relative mismatch at index " - << LiteralTestUtil::MultiIndexAsString(max_rel_multi_index_) + << LiteralTestUtil::MultiIndexAsString( + IndexUtil::LinearIndexToMultidimensionalIndex( + actual.shape(), max_rel_linear_index_)) << "\nmaximum relative error " << max_rel_err_ << "\nmax absolute mismatch at index " - << LiteralTestUtil::MultiIndexAsString(max_abs_multi_index_) + << LiteralTestUtil::MultiIndexAsString( + IndexUtil::LinearIndexToMultidimensionalIndex( + actual.shape(), max_abs_linear_index_)) << "\nmaximum absolute error " << max_abs_err_ << "\nfirst mismatch at index " - << LiteralTestUtil::MultiIndexAsString(first_multi_index_) + << LiteralTestUtil::MultiIndexAsString( + IndexUtil::LinearIndexToMultidimensionalIndex( + actual.shape(), first_linear_index_)) << "\nlast mismatch at index " - << LiteralTestUtil::MultiIndexAsString(last_multi_index_) + << LiteralTestUtil::MultiIndexAsString( + IndexUtil::LinearIndexToMultidimensionalIndex( + actual.shape(), last_linear_index_)) << "\ntotal absolute error " << abs_diff_sum_ << "\ntotal absolute error of miscompares " << abs_diff_miscompare_sum_ << "\ntotal relative error " @@ -426,10 +443,6 @@ class NearComparator { << "\ntotal relative error of miscompares " << (abs_diff_miscompare_sum_ / abs_expected_miscompare_sum_) << "\nfailure count " << num_miscompares_; - - WriteLiteralToTempFile(expected, "expected"); - WriteLiteralToTempFile(actual, "actual"); - WriteLiteralToTempFile(miscompares_, "miscompares"); } return num_miscompares_ == 0; } @@ -457,57 +470,93 @@ class NearComparator { return true; } - float abs_diff = std::abs(actual - expected); - float rel_err = abs_diff / std::abs(expected); + const float abs_diff = std::abs(actual - expected); + const float rel_err = abs_diff / std::abs(expected); + const bool nan_mismatch = NanMismatch(expected, actual); + const bool mismatch = + (nan_mismatch || (abs_diff >= error_.abs && rel_err >= error_.rel)); + return !mismatch; + } + + // Assumes that expected vs actual fail ExpectValuesNear. 
+ template + void UpdateAndLogMiscompares(const NativeT expected, const NativeT actual, + const Shape& shape, const int64 linear_index) { + const float abs_diff = std::abs(actual - expected); + const float rel_err = abs_diff / std::abs(expected); abs_diff_sum_ += abs_diff; abs_expected_sum_ += std::abs(expected); - if (rel_err > max_rel_err_) { + if (rel_err > max_rel_err_ || std::isnan(rel_err)) { max_rel_err_ = rel_err; - max_rel_multi_index_ = multi_index_; + max_rel_linear_index_ = linear_index; } - if (abs_diff > max_abs_err_) { + if (abs_diff > max_abs_err_ || std::isnan(abs_diff)) { max_abs_err_ = abs_diff; - max_abs_multi_index_ = multi_index_; + max_abs_linear_index_ = linear_index; } - VLOG(10) << tensorflow::strings::Printf( - "index %s abs_diff %f rel_err %f", - LiteralTestUtil::MultiIndexAsString(multi_index_).c_str(), abs_diff, - rel_err); - bool nan_mismatch = NanMismatch(expected, actual); - bool mismatch = - (nan_mismatch || (abs_diff >= error_.abs && rel_err >= error_.rel)); - if (mismatch) { - abs_diff_miscompare_sum_ += abs_diff; - abs_expected_miscompare_sum_ += std::abs(expected); - const int64 kMaxFailures = 2; - if (num_miscompares_ < kMaxFailures) { - ::testing::Message msg; - msg << "mismatch at index " - << LiteralTestUtil::MultiIndexAsString(multi_index_) << " abs diff " - << abs_diff << " rel err " << rel_err << " failure #" - << num_miscompares_; - ExpectNear(expected, actual, msg); - } else if (num_miscompares_ == kMaxFailures) { - LOG(ERROR) - << "reached max 'loud' failure count; silently proceeding..."; - } - if (num_miscompares_ == 0) { - first_multi_index_ = multi_index_; - } - num_miscompares_++; - last_multi_index_ = multi_index_; + if (VLOG_IS_ON(10)) { + VLOG(10) << tensorflow::strings::Printf( + "index %s abs_diff %f rel_err %f", + LiteralTestUtil::MultiIndexAsString( + IndexUtil::LinearIndexToMultidimensionalIndex(shape, + linear_index)) + .c_str(), + abs_diff, rel_err); } - return !mismatch; + abs_diff_miscompare_sum_ += abs_diff; + abs_expected_miscompare_sum_ += std::abs(expected); + const int64 kMaxFailures = 2; + if (num_miscompares_ < kMaxFailures) { + const auto multi_index = + IndexUtil::LinearIndexToMultidimensionalIndex(shape, linear_index); + ::testing::Message msg; + msg << "mismatch at index " + << LiteralTestUtil::MultiIndexAsString(multi_index) << " abs diff " + << abs_diff << " rel err " << rel_err << " failure #" + << num_miscompares_; + ExpectNear(expected, actual, msg); + } else if (num_miscompares_ == kMaxFailures) { + LOG(ERROR) << "reached max 'loud' failure count; silently proceeding..."; + } + if (num_miscompares_ == 0) { + first_linear_index_ = linear_index; + } + num_miscompares_++; + last_linear_index_ = linear_index; + miscompares_.data()[linear_index] = true; } // Recursive function which compares the two given literals elementwise. template void ExpectLiteralsNear(const Literal& expected, const Literal& actual, int64 dimension) { + // Fast path optimization for the case were layouts match. 
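// (Expanding the fast-path comment above: when the two literals share a
// layout, element i of one flat data buffer lines up with element i of the
// other, so they can be compared linearly, skipping the recursive multi-index
// walk used below when the layouts differ.)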
+ if (LayoutUtil::Equal(actual.shape().layout(), expected.shape().layout())) { + tensorflow::gtl::ArraySlice expected_data = + expected.data(); + tensorflow::gtl::ArraySlice actual_data = + actual.data(); + const int64 len = expected_data.size(); + for (int64 i = 0; i < len; ++i) { + const bool near = ExpectValuesNear(expected_data[i], actual_data[i]); + if (!near) { + UpdateAndLogMiscompares(expected_data[i], actual_data[i], + actual.shape(), i); + } + } + return; + } + if (dimension == expected.shape().dimensions_size()) { bool near = ExpectValuesNear(expected.Get(multi_index_), actual.Get(multi_index_)); - miscompares_.Set(multi_index_, !near); + if (!near) { + UpdateAndLogMiscompares( + expected.Get(multi_index_), + actual.Get(multi_index_), actual.shape(), + IndexUtil::MultidimensionalIndexToLinearIndex(actual.shape(), + multi_index_)); + } } else { for (int64 i = 0; i < expected.shape().dimensions(dimension); ++i) { multi_index_[dimension] = i; @@ -528,6 +577,32 @@ class NearComparator { LOG(ERROR) << "wrote to " << name << " file: " << filename; } + // Gets the total element count. For tuples, this is not the count of tuple + // elements, but the sum of elements of each tuple element. + int64 RecursiveElementCount(const Shape& shape) { + if (ShapeUtil::IsTuple(shape)) { + const int64 tuple_elements = ShapeUtil::TupleElementCount(shape); + int64 total = 0; + for (int64 i = 0; i < tuple_elements; ++i) { + total += + RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i)); + } + return total; + } else { + return ShapeUtil::ElementsIn(shape); + } + } + + // Calling ToString on a literal with over 100 million elements takes around + // 3 minutes. The utility of printing a literal with >1000 elements is + // questionable, especially when writing the Literal proto to disk is orders + // of magnitude faster. + string TruncateHugeLiteral(const Literal& literal) { + return RecursiveElementCount(literal.shape()) < 1000 + ? literal.ToString() + : "[TRUNCATED, Literal with more than 1000 values]"; + } + ErrorSpec error_; // Number of element miscomparisons encountered so far. 
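// An illustrative sketch, not part of the patch: RecursiveElementCount above
// sums leaf array elements across arbitrarily nested tuple shapes, and
// TruncateHugeLiteral switches to the placeholder string once that sum
// reaches 1000. The recursion restated over a stand-in type (ShapeSketch is
// invented; the real code walks xla::Shape via ShapeUtil):

#include <cstdint>
#include <vector>

struct ShapeSketch {
  int64_t elements = 0;             // leaf: number of array elements
  std::vector<ShapeSketch> tuple;   // non-empty => this is a tuple shape
};

int64_t RecursiveElementCountSketch(const ShapeSketch& shape) {
  if (shape.tuple.empty()) {
    return shape.elements;
  }
  int64_t total = 0;
  for (const ShapeSketch& element : shape.tuple) {
    total += RecursiveElementCountSketch(element);
  }
  return total;
}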
@@ -548,10 +623,10 @@ class NearComparator { double abs_expected_miscompare_sum_; float max_rel_err_; float max_abs_err_; - std::vector first_multi_index_; - std::vector last_multi_index_; - std::vector max_rel_multi_index_; - std::vector max_abs_multi_index_; + int64 first_linear_index_; + int64 last_linear_index_; + int64 max_rel_linear_index_; + int64 max_abs_linear_index_; }; template <> @@ -584,6 +659,23 @@ bool NearComparator::ExpectValuesNear(half expected, half actual) { static_cast(std::move(actual))); } +template <> +void NearComparator::UpdateAndLogMiscompares( + const bfloat16 expected, const bfloat16 actual, const Shape& shape, + const int64 linear_index) { + UpdateAndLogMiscompares(static_cast(expected), + static_cast(actual), shape, linear_index); +} + +template <> +void NearComparator::UpdateAndLogMiscompares(half expected, half actual, + const Shape& shape, + const int64 linear_index) { + UpdateAndLogMiscompares(static_cast(std::move(expected)), + static_cast(std::move(actual)), shape, + linear_index); +} + } // namespace /* static */ ::testing::AssertionResult LiteralTestUtil::Near( diff --git a/tensorflow/compiler/xla/tests/literal_test_util_test.cc b/tensorflow/compiler/xla/tests/literal_test_util_test.cc index e477784557a..3a421f84582 100644 --- a/tensorflow/compiler/xla/tests/literal_test_util_test.cc +++ b/tensorflow/compiler/xla/tests/literal_test_util_test.cc @@ -97,5 +97,29 @@ TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) { } } +TEST(LiteralTestUtilTest, NearComparatorR1) { + auto a = + Literal::CreateR1({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}); + auto b = + Literal::CreateR1({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}); + EXPECT_TRUE(LiteralTestUtil::Near(*a, *b, ErrorSpec{0.0001})); +} + +TEST(LiteralTestUtilTest, NearComparatorR1Nan) { + auto a = + Literal::CreateR1({0.0, 0.1, 0.2, 0.3, NAN, 0.5, 0.6, 0.7, 0.8}); + auto b = + Literal::CreateR1({0.0, 0.1, 0.2, 0.3, NAN, 0.5, 0.6, 0.7, 0.8}); + EXPECT_TRUE(LiteralTestUtil::Near(*a, *b, ErrorSpec{0.0001})); +} + +TEST(LiteralTestUtil, NearComparatorDifferentLengths) { + auto a = + Literal::CreateR1({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}); + auto b = Literal::CreateR1({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7}); + EXPECT_FALSE(LiteralTestUtil::Near(*a, *b, ErrorSpec{0.0001})); + EXPECT_FALSE(LiteralTestUtil::Near(*b, *a, ErrorSpec{0.0001})); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc index b5b95967ff9..7e92439c494 100644 --- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc +++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc @@ -74,7 +74,8 @@ class LLVMCompilerTest : public ::testing::Test { ASSERT_TRUE(compiler ->RunBackend(std::move(hlo_module), - backend_->default_stream_executor()) + backend_->default_stream_executor(), + /*device_allocator=*/nullptr) .ok()); // Test that hooks were called. 
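// An illustrative sketch, not part of the patch: the comparator hunks above
// now store flat indices (first_linear_index_ and friends) and convert back
// to a multi-index with IndexUtil::LinearIndexToMultidimensionalIndex only
// when printing a failure. The decoding, assuming XLA's convention that
// minor_to_major lists dimensions from most-minor to most-major
// (LinearToMultiSketch is an invented name, not the IndexUtil API):

#include <cstdint>
#include <vector>

std::vector<int64_t> LinearToMultiSketch(
    int64_t linear, const std::vector<int64_t>& dims,
    const std::vector<int64_t>& minor_to_major) {
  std::vector<int64_t> multi(dims.size(), 0);
  for (int64_t dim : minor_to_major) {  // peel off most-minor dimensions first
    multi[dim] = linear % dims[dim];
    linear /= dims[dim];
  }
  return multi;
}

// For example, with dims = {2, 3} and minor_to_major = {1, 0} (row-major),
// linear index 4 decodes to the multi-index {1, 1}.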
@@ -98,7 +99,8 @@ class LLVMCompilerTest : public ::testing::Test { executors.push_back({backend_->default_stream_executor()}); executors.push_back({backend_->default_stream_executor()}); - EXPECT_IS_OK(compiler->Compile(std::move(modules), std::move(executors))); + EXPECT_IS_OK(compiler->Compile(std::move(modules), std::move(executors), + /*device_allocator=*/nullptr)); } private: diff --git a/tensorflow/compiler/xla/tests/pad_test.cc b/tensorflow/compiler/xla/tests/pad_test.cc index 3fd83a4c3b1..8cef8dd34dc 100644 --- a/tensorflow/compiler/xla/tests/pad_test.cc +++ b/tensorflow/compiler/xla/tests/pad_test.cc @@ -33,6 +33,14 @@ limitations under the License. namespace xla { namespace { +#ifdef XLA_BACKEND_SUPPORTS_BFLOAT16 +// Tests both F32 and BF16. +static std::array use_bfloat16_params{false, true}; +#else +// Only tests F32. +static std::array use_bfloat16_params{false}; +#endif + class PadTest : public ClientLibraryTestBase { protected: PadTest() { @@ -61,8 +69,22 @@ class PadTest : public ClientLibraryTestBase { PaddingConfig r4_padding_on_dim0_dim1_; }; +class PadTestFloat : public PadTest, + public ::testing::WithParamInterface { + protected: + PadTestFloat() { set_use_bfloat16(GetParam()); } + + ErrorSpec DefaultErrorSpec() const { + if (use_bfloat16()) { + return ErrorSpec(1e-3, 1e-3); + } else { + return ErrorSpec(1e-5, 1e-5); + } + } +}; + // Tests a Pad() with a zero-element input and output. -XLA_TEST_F(PadTest, Pad1DS0ToS0Array) { +XLA_TEST_P(PadTestFloat, Pad1DS0ToS0Array) { ComputationBuilder b(client_, TestName()); // Set up the padding configuration {low: 0, high: 0, interior: 0}. PaddingConfig padding_config; @@ -71,12 +93,13 @@ XLA_TEST_F(PadTest, Pad1DS0ToS0Array) { dimension->set_edge_padding_high(0); dimension->set_interior_padding(0); - b.Pad(b.ConstantR1({}), b.ConstantR0(0.1), padding_config); - ComputeAndCompareR1(&b, {}, {}, ErrorSpec(0.0001)); + b.Pad(AddParam(*Literal::CreateR1({}), &b), + AddParam(*Literal::CreateR0(0.1), &b), padding_config); + ComputeAndCompareR1(&b, {}, {}, DefaultErrorSpec()); } // Tests a Pad() with a zero-element input but a non-zero-element output. -XLA_TEST_F(PadTest, Pad1DS0ToS5Array) { +XLA_TEST_P(PadTestFloat, Pad1DS0ToS5Array) { ComputationBuilder b(client_, TestName()); // Set up the padding configuration {low: 3, high: 0, interior: 1}. PaddingConfig padding_config; @@ -85,12 +108,13 @@ XLA_TEST_F(PadTest, Pad1DS0ToS5Array) { dimension->set_edge_padding_high(4); dimension->set_interior_padding(7); - b.Pad(b.ConstantR1({}), b.ConstantR0(0.1), padding_config); + b.Pad(AddParam(*Literal::CreateR1({}), &b), + AddParam(*Literal::CreateR0(0.1), &b), padding_config); ComputeAndCompareR1(&b, std::vector(5, 0.1), {}, - ErrorSpec(0.0001)); + DefaultErrorSpec()); } -XLA_TEST_F(PadTest, Pad1DS3Array) { +XLA_TEST_P(PadTestFloat, Pad1DS3Array) { ComputationBuilder b(client_, TestName()); // Set up the padding configuration {low: 3, high: 0, interior: 1}. 
PaddingConfig padding_config; @@ -99,21 +123,21 @@ XLA_TEST_F(PadTest, Pad1DS3Array) { dimension->set_edge_padding_high(0); dimension->set_interior_padding(1); - b.Pad(b.ConstantR1({1, 2, 3}), b.ConstantR0(0.1), - padding_config); + b.Pad(AddParam(*Literal::CreateR1({1, 2, 3}), &b), + AddParam(*Literal::CreateR0(0.1), &b), padding_config); std::vector expected({0.1, 0.1, 0.1, 1, 0.1, 2, 0.1, 3}); - ComputeAndCompareR1(&b, expected, {}, ErrorSpec(0.0001)); + ComputeAndCompareR1(&b, expected, {}, DefaultErrorSpec()); } -XLA_TEST_F(PadTest, Pad4D_2x0x3x2_FloatArray) { +XLA_TEST_P(PadTestFloat, Pad4D_2x0x3x2_FloatArray) { ComputationBuilder b(client_, TestName()); - b.Pad(b.ConstantR4FromArray4D(Array4D(2, 0, 3, 2)), - b.ConstantR0(1.5), r4_padding_on_dim0_dim1_); + b.Pad(AddParam(Array4D(2, 0, 3, 2), &b), + AddParam(*Literal::CreateR0(1.5), &b), r4_padding_on_dim0_dim1_); ComputeAndCompareR4(&b, Array4D(5, 2, 3, 2, 1.5f), {}, - ErrorSpec(0.0001)); + DefaultErrorSpec()); } -TEST_F(PadTest, Pad4DFloat_1x1x3x2_Array) { +TEST_P(PadTestFloat, Pad4DFloat_1x1x3x2_Array) { ComputationBuilder b(client_, TestName()); auto input = MakeUnique>(1, 1, 3, 2); Array2D input_xy({ @@ -123,7 +147,7 @@ TEST_F(PadTest, Pad4DFloat_1x1x3x2_Array) { }); input->FillWithYX(input_xy); - b.Pad(b.ConstantR4FromArray4D(*input), b.ConstantR0(1.5), + b.Pad(AddParam(*input, &b), AddParam(*Literal::CreateR0(1.5), &b), r4_padding_on_dim0_dim1_); auto expected = MakeUnique>(2, 3, 3, 2); @@ -134,15 +158,15 @@ TEST_F(PadTest, Pad4DFloat_1x1x3x2_Array) { (*expected)(1, 0, 1, 1) = 4.0f; (*expected)(1, 0, 2, 0) = 5.0f; (*expected)(1, 0, 2, 1) = 6.0f; - ComputeAndCompareR4(&b, *expected, {}, ErrorSpec(0.0001)); + ComputeAndCompareR4(&b, *expected, {}, DefaultErrorSpec()); } -TEST_F(PadTest, Pad4DFloatArrayWithInteriorPadding) { +TEST_P(PadTestFloat, Pad4DFloatArrayWithInteriorPadding) { ComputationBuilder b(client_, TestName()); const float pad_value = 1.5f; Array4D input(3, 2, 1, 1, {1, 2, 3, 4, 5, 6}); - b.Pad(b.ConstantR4FromArray4D(input), b.ConstantR0(pad_value), + b.Pad(AddParam(input, &b), AddParam(*Literal::CreateR0(pad_value), &b), r4_padding_on_dim0_dim1_); auto expected = MakeUnique>(8, 5, 1, 1); @@ -156,7 +180,7 @@ TEST_F(PadTest, Pad4DFloatArrayWithInteriorPadding) { ComputeAndCompareR4(&b, *expected, {}, ErrorSpec(0.0001)); } -TEST_F(PadTest, Pad4DFloatArrayMinorFirstSmall) { +TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstSmall) { ComputationBuilder b(client_, TestName()); PaddingConfig padding_config; @@ -184,7 +208,8 @@ TEST_F(PadTest, Pad4DFloatArrayMinorFirstSmall) { auto input = Literal::CreateR4FromArray4D(input_array); input = input->Relayout(layout); - b.Pad(b.ConstantLiteral(*input), b.ConstantR0(pad_value), padding_config); + b.Pad(AddParam(*input, &b), + AddParam(*Literal::CreateR0(pad_value), &b), padding_config); Array4D expected_array(1, 1, 5, 8); expected_array.Fill(pad_value); @@ -197,7 +222,7 @@ TEST_F(PadTest, Pad4DFloatArrayMinorFirstSmall) { ComputeAndCompareR4(&b, expected_array, {}, ErrorSpec(0.0001)); } -XLA_TEST_F(PadTest, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) { +XLA_TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) { ComputationBuilder b(client_, TestName()); PaddingConfig padding_config; @@ -229,7 +254,8 @@ XLA_TEST_F(PadTest, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) { auto input = Literal::CreateR4FromArray4D(input_array); input = input->Relayout(layout); - b.Pad(b.ConstantLiteral(*input), b.ConstantR0(pad_value), padding_config); + 
b.Pad(AddParam(*input, &b), + AddParam(*Literal::CreateR0(pad_value), &b), padding_config); Array4D expected_array(1, 25, 17, 11); expected_array.Fill(pad_value); @@ -249,7 +275,7 @@ XLA_TEST_F(PadTest, Pad4DU8Array) { }); input->FillWithYX(input_xy); - b.Pad(b.ConstantR4FromArray4D(*input), b.ConstantR0(35), + b.Pad(AddParam(*input, &b), b.ConstantR0(35), r4_padding_on_dim0_dim1_); auto expected = MakeUnique>(2, 3, 3, 2); @@ -277,8 +303,7 @@ XLA_TEST_F(PadTest, Pad4DPredArray) { auto ones = MakeUnique>(2, 3, 3, 2); zeros->Fill(0); ones->Fill(1); - b.Select(padded, b.ConstantR4FromArray4D(*ones), - b.ConstantR4FromArray4D(*zeros)); + b.Select(padded, AddParam(*ones, &b), AddParam(*zeros, &b)); auto expected = MakeUnique>(2, 3, 3, 2); expected->Fill(0); @@ -291,10 +316,12 @@ XLA_TEST_F(PadTest, Pad4DPredArray) { ComputeAndCompareR4(&b, *expected, {}); } -XLA_TEST_F(PadTest, Large2DPad) { +XLA_TEST_P(PadTestFloat, Large2DPad) { ComputationBuilder b(client_, TestName()); - auto input = b.Parameter(0, ShapeUtil::MakeShape(F32, {4, 4}), "input"); + auto ones = MakeUnique>(4, 4); + ones->Fill(1.0f); + auto input = AddParam(*ones, &b); PaddingConfig padding_config = MakeNoPaddingConfig(2); for (int dim : {0, 1}) { padding_config.mutable_dimensions(dim)->set_edge_padding_low( @@ -302,25 +329,22 @@ XLA_TEST_F(PadTest, Large2DPad) { padding_config.mutable_dimensions(dim)->set_edge_padding_high(58 + 100 * dim); } - auto padded = b.Pad(input, b.ConstantR0(0.0f), padding_config); - - auto ones = MakeUnique>(4, 4); - ones->Fill(1.0f); - auto input_literal = Literal::CreateR2FromArray2D(*ones); - std::unique_ptr input_data = - client_->TransferToServer(*input_literal).ConsumeValueOrDie(); + auto padded = b.Pad(input, AddParam(*Literal::CreateR0(0.0f), &b), + padding_config); auto expected = ReferenceUtil::PadArray2D(*ones, padding_config, 0.0f); - ComputeAndCompareR2(&b, *expected, {input_data.get()}); + ComputeAndCompareR2(&b, *expected, {}, DefaultErrorSpec()); } -XLA_TEST_F(PadTest, AllTypes2DPad) { +XLA_TEST_P(PadTestFloat, AllTypes2DPad) { ComputationBuilder b(client_, TestName()); constexpr int64 in_rows = 35; constexpr int64 in_cols = 35; - auto input = - b.Parameter(0, ShapeUtil::MakeShape(F32, {in_rows, in_cols}), "input"); + auto operand = MakeUnique>(in_rows, in_cols); + operand->FillUnique(0.0f); + auto input = AddParam(*operand, &b); + PaddingConfig padding_config = MakeNoPaddingConfig(2); padding_config.mutable_dimensions(0)->set_edge_padding_low(7); padding_config.mutable_dimensions(0)->set_edge_padding_high(5); @@ -328,20 +352,14 @@ XLA_TEST_F(PadTest, AllTypes2DPad) { padding_config.mutable_dimensions(1)->set_edge_padding_low(6); padding_config.mutable_dimensions(1)->set_edge_padding_high(4); padding_config.mutable_dimensions(1)->set_interior_padding(2); - auto padded = b.Pad(input, b.ConstantR0(3.14f), padding_config); - - auto operand = MakeUnique>(in_rows, in_cols); - operand->FillUnique(0.0f); - auto input_literal = Literal::CreateR2FromArray2D(*operand); - std::unique_ptr input_data = - client_->TransferToServer(*input_literal).ConsumeValueOrDie(); + auto padded = b.Pad(input, AddParam(*Literal::CreateR0(3.14f), &b), + padding_config); auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 3.14f); - ComputeAndCompareR2(&b, *expected, {input_data.get()}, - ErrorSpec{0.0001}); + ComputeAndCompareR2(&b, *expected, {}, DefaultErrorSpec()); } -XLA_TEST_F(PadTest, High2DPad) { +XLA_TEST_P(PadTestFloat, High2DPad) { ComputationBuilder b(client_, TestName()); constexpr int64 
in_rows = 129; @@ -349,8 +367,9 @@ XLA_TEST_F(PadTest, High2DPad) { constexpr int64 low_padding = 0; int64 high_padding[2] = {5, 7}; constexpr int64 interior_padding = 0; - auto input = - b.Parameter(0, ShapeUtil::MakeShape(F32, {in_rows, in_cols}), "input"); + auto operand = MakeUnique>(in_rows, in_cols); + operand->FillUnique(1.0f); + auto input = AddParam(*operand, &b); PaddingConfig padding_config = MakeNoPaddingConfig(2); for (int dim : {0, 1}) { padding_config.mutable_dimensions(dim)->set_edge_padding_low(low_padding); @@ -359,20 +378,15 @@ XLA_TEST_F(PadTest, High2DPad) { padding_config.mutable_dimensions(dim)->set_interior_padding( interior_padding); } - auto padded = b.Pad(input, b.ConstantR0(2.718f), padding_config); + auto padded = b.Pad(input, AddParam(*Literal::CreateR0(2.718f), &b), + padding_config); - auto operand = MakeUnique>(in_rows, in_cols); - operand->FillUnique(1.0f); - auto input_literal = Literal::CreateR2FromArray2D(*operand); auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f); - std::unique_ptr input_data = - client_->TransferToServer(*input_literal).ConsumeValueOrDie(); - ComputeAndCompareR2(&b, *expected, {input_data.get()}, - ErrorSpec(0.0001)); + ComputeAndCompareR2(&b, *expected, {}, DefaultErrorSpec()); } -XLA_TEST_F(PadTest, NegativePadding2D) { +XLA_TEST_P(PadTestFloat, NegativePadding2D) { ComputationBuilder b(client_, TestName()); constexpr int64 in_rows = 129; @@ -380,8 +394,9 @@ XLA_TEST_F(PadTest, NegativePadding2D) { int64 low_padding[2] = {-1, -2}; int64 high_padding[2] = {-3, 4}; constexpr int64 interior_padding = 0; - auto input = - b.Parameter(0, ShapeUtil::MakeShape(F32, {in_rows, in_cols}), "input"); + auto operand = MakeUnique>(in_rows, in_cols); + operand->FillUnique(1.0f); + auto input = AddParam(*operand, &b); PaddingConfig padding_config = MakeNoPaddingConfig(2); for (int dim : {0, 1}) { padding_config.mutable_dimensions(dim)->set_edge_padding_low( @@ -391,20 +406,15 @@ XLA_TEST_F(PadTest, NegativePadding2D) { padding_config.mutable_dimensions(dim)->set_interior_padding( interior_padding); } - auto padded = b.Pad(input, b.ConstantR0(2.718f), padding_config); + auto padded = b.Pad(input, AddParam(*Literal::CreateR0(2.718f), &b), + padding_config); - auto operand = MakeUnique>(in_rows, in_cols); - operand->FillUnique(1.0f); - auto input_literal = Literal::CreateR2FromArray2D(*operand); auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f); - std::unique_ptr input_data = - client_->TransferToServer(*input_literal).ConsumeValueOrDie(); - ComputeAndCompareR2(&b, *expected, {input_data.get()}, - ErrorSpec(0.0001)); + ComputeAndCompareR2(&b, *expected, {}, DefaultErrorSpec()); } -XLA_TEST_F(PadTest, NegativeAndInteriorPadding2D) { +XLA_TEST_P(PadTestFloat, NegativeAndInteriorPadding2D) { ComputationBuilder b(client_, TestName()); constexpr int64 in_rows = 8; @@ -412,8 +422,9 @@ XLA_TEST_F(PadTest, NegativeAndInteriorPadding2D) { int64 low_padding[2] = {4, -1}; int64 high_padding[2] = {-2, -4}; int64 interior_padding[2] = {1, 2}; - auto input = - b.Parameter(0, ShapeUtil::MakeShape(F32, {in_rows, in_cols}), "input"); + auto operand = MakeUnique>(in_rows, in_cols); + operand->FillUnique(1.0f); + auto input = AddParam(*operand, &b); PaddingConfig padding_config = MakeNoPaddingConfig(2); for (int dim : {0, 1}) { padding_config.mutable_dimensions(dim)->set_edge_padding_low( @@ -423,44 +434,40 @@ XLA_TEST_F(PadTest, NegativeAndInteriorPadding2D) { 
padding_config.mutable_dimensions(dim)->set_interior_padding( interior_padding[dim]); } - auto padded = b.Pad(input, b.ConstantR0(2.718f), padding_config); + auto padded = b.Pad(input, AddParam(*Literal::CreateR0(2.718f), &b), + padding_config); - auto operand = MakeUnique>(in_rows, in_cols); - operand->FillUnique(1.0f); - auto input_literal = Literal::CreateR2FromArray2D(*operand); auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f); - std::unique_ptr input_data = - client_->TransferToServer(*input_literal).ConsumeValueOrDie(); - ComputeAndCompareR2(&b, *expected, {input_data.get()}, - ErrorSpec(0.0001)); + ComputeAndCompareR2(&b, *expected, {}, DefaultErrorSpec()); } // Regression test for b/31827337. -XLA_TEST_F(PadTest, ReducePad) { +XLA_TEST_P(PadTestFloat, ReducePad) { ComputationBuilder b(client_, TestName()); - auto input = b.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2, 2, 2}), "input"); + auto ones = MakeUnique>(2, 2, 2, 2); + ones->Fill(1.0); + auto input = AddParam(*ones, &b); - Computation add_f32 = CreateScalarAddComputation(F32, &b); - auto reduce = b.Reduce(input, b.ConstantR0(0.0), add_f32, {0}); + Computation add = CreateScalarAddComputation(FloatType(), &b); + auto reduce = + b.Reduce(input, AddParam(*Literal::CreateR0(0.0), &b), add, {0}); PaddingConfig padding_config = MakeNoPaddingConfig(3); padding_config.mutable_dimensions(0)->set_edge_padding_low(1); padding_config.mutable_dimensions(0)->set_edge_padding_high(1); - auto pad = b.Pad(reduce, b.ConstantR0(0.0), padding_config); - - auto ones = MakeUnique>(2, 2, 2, 2); - ones->Fill(1.0); - auto input_literal = Literal::CreateR4FromArray4D(*ones); - std::unique_ptr input_data = - client_->TransferToServer(*input_literal).ConsumeValueOrDie(); + auto padded = b.Pad(reduce, AddParam(*Literal::CreateR0(0.0f), &b), + padding_config); Array3D expected({{{0.0, 0.0}, {0.0, 0.0}}, {{2.0, 2.0}, {2.0, 2.0}}, {{2.0, 2.0}, {2.0, 2.0}}, {{0.0, 0.0}, {0.0, 0.0}}}); - ComputeAndCompareR3(&b, expected, {input_data.get()}); + ComputeAndCompareR3(&b, expected, {}, DefaultErrorSpec()); } +INSTANTIATE_TEST_CASE_P(PadTestFloatInstantiation, PadTestFloat, + ::testing::ValuesIn(use_bfloat16_params)); + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/reduce_precision_test.cc b/tensorflow/compiler/xla/tests/reduce_precision_test.cc index 4756ba09689..dc7ce3253ce 100644 --- a/tensorflow/compiler/xla/tests/reduce_precision_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_precision_test.cc @@ -249,7 +249,9 @@ INSTANTIATE_TEST_CASE_P(ReducePrecisionAccuracyTest, // ReducePrecisionInsertion passes. class ReducePrecisionInsertionTest : public ClientLibraryTestBase {}; -XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionBeforeFusion) { +// The interpreter has no fusion pass, so skip this test. +XLA_TEST_F(ReducePrecisionInsertionTest, + DISABLED_ON_INTERPRETER(ReducePrecisionBeforeFusion)) { ComputationBuilder builder(client_, TestName()); std::unique_ptr a_literal = Literal::CreateR1({1.00001}); @@ -276,7 +278,9 @@ XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionBeforeFusion) { ComputeAndCompareR1(&builder, {0.0f}, {a_data.get()}); } -XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedAfterFusion) { +// The interpreter has no fusion pass, so skip this test. 
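The PadTest changes above exercise edge (low/high), interior, and negative padding through a PaddingConfig. As a reading aid only (not part of the patch), here is a rough NumPy sketch of the per-dimension behavior that a reference like ReferenceUtil::PadArray2D applies; the helper name and structure below are hypothetical.

```python
import numpy as np

def pad_dim(values, pad_value, low, high, interior):
    # Hypothetical helper, one dimension of an XLA-style PaddingConfig:
    # insert `interior` pad values between neighbors, then apply edge padding,
    # where a negative edge amount trims elements instead of adding them.
    out = []
    for i, v in enumerate(values):
        if i > 0:
            out.extend([pad_value] * interior)
        out.append(v)
    out = [pad_value] * low + out if low >= 0 else out[-low:]
    out = out + [pad_value] * high if high >= 0 else out[:high]
    return np.array(out)

# low=1, high=2, interior=1 turns [1, 2, 3] into [0, 1, 0, 2, 0, 3, 0, 0];
# low=-1 instead drops the first element of the interior-padded result.
print(pad_dim([1, 2, 3], 0, low=1, high=2, interior=1))
print(pad_dim([1, 2, 3], 0, low=-1, high=0, interior=1))
```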
+XLA_TEST_F(ReducePrecisionInsertionTest, + DISABLED_ON_INTERPRETER(ReducePrecisionSkippedAfterFusion)) { ComputationBuilder builder(client_, TestName()); std::unique_ptr a_literal = Literal::CreateR1({1.00001}); @@ -300,7 +304,9 @@ XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedAfterFusion) { ComputeAndCompareR1(&builder, {-1.00001f}, {a_data.get()}); } -XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionAddedAfterFusion) { +// The interpreter has no fusion pass, so skip this test. +XLA_TEST_F(ReducePrecisionInsertionTest, + DISABLED_ON_INTERPRETER(ReducePrecisionAddedAfterFusion)) { ComputationBuilder builder(client_, TestName()); std::unique_ptr a_literal = Literal::CreateR1({1.00001}); @@ -322,7 +328,9 @@ XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionAddedAfterFusion) { ComputeAndCompareR1(&builder, {-1.0f}, {a_data.get()}); } -XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedFusionContains) { +// The interpreter has no fusion pass, so skip this test. +XLA_TEST_F(ReducePrecisionInsertionTest, + DISABLED_ON_INTERPRETER(ReducePrecisionSkippedFusionContains)) { ComputationBuilder builder(client_, TestName()); std::unique_ptr a_literal = Literal::CreateR1({1.00001}); @@ -345,7 +353,9 @@ XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedFusionContains) { ComputeAndCompareR1(&builder, {-1.00001f}, {a_data.get()}); } -XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionAddedFusionContains) { +// The interpreter has no fusion pass, so skip this test. +XLA_TEST_F(ReducePrecisionInsertionTest, + DISABLED_ON_INTERPRETER(ReducePrecisionAddedFusionContains)) { ComputationBuilder builder(client_, TestName()); std::unique_ptr a_literal = Literal::CreateR1({1.00001}); diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc index a766fa2db0e..50d7b5074d2 100644 --- a/tensorflow/compiler/xla/tests/reduce_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_test.cc @@ -494,6 +494,26 @@ XLA_TEST_F(ReduceTest, TransposeAndReduceElementwiseR2_111x50_To_R1) { ErrorSpec(0.01, 1e-4)); } +// Test that algebraic simplifier does not incorrectly fold a transpose into a +// reduction operation. 
+XLA_TEST_F(ReduceTest, TransposeAndReduceR3_12x111x50_To_R2) { + ComputationBuilder builder(client_, TestName()); + Computation add_f32 = CreateScalarAddComputation(F32, &builder); + const Shape input_shape = ShapeUtil::MakeShape(F32, {12, 111, 50}); + ComputationDataHandle input = builder.Parameter(0, input_shape, "input"); + ComputationDataHandle zero = builder.ConstantR0(0.0); + ComputationDataHandle transpose = + builder.Transpose(input, /*permutation=*/{1, 0, 2}); + ComputationDataHandle reduce = + builder.Reduce(transpose, zero, add_f32, /*dimensions_to_reduce=*/{0}); + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr input_data, + MakeFakeLiteral(input_shape)); + + ComputeAndCompare(&builder, reduce, {std::move(*input_data)}, + ErrorSpec(0.01, 1e-4)); +} + XLA_TEST_F(ReduceTest, Reshape_111x2x25Reduce_111x50_To_R1) { const int64 rows = 111, cols = 50; diff --git a/tensorflow/compiler/xla/tests/scalar_computations_test.cc b/tensorflow/compiler/xla/tests/scalar_computations_test.cc index debf2d2d317..4da6ee91607 100644 --- a/tensorflow/compiler/xla/tests/scalar_computations_test.cc +++ b/tensorflow/compiler/xla/tests/scalar_computations_test.cc @@ -737,7 +737,61 @@ XLA_TEST_F(ScalarComputationsTest, PowScalar) { ComputeAndCompareR0(&builder, 8.0, {}, error_spec_); } -XLA_TEST_F(ScalarComputationsTest, ClampScalarHigh) { +XLA_TEST_F(ScalarComputationsTest, ClampScalarHighS32) { + ComputationBuilder builder(client_, TestName()); + builder.Clamp(builder.ConstantR0(-1), // The lower bound. + builder.ConstantR0(5), // The operand to be clamped. + builder.ConstantR0(3)); // The upper bound. + + ComputeAndCompareR0(&builder, 3, {}); +} + +XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleS32) { + ComputationBuilder builder(client_, TestName()); + builder.Clamp(builder.ConstantR0(-1), // The lower bound. + builder.ConstantR0(2), // The operand to be clamped. + builder.ConstantR0(3)); // The upper bound. + + ComputeAndCompareR0(&builder, 2, {}); +} + +XLA_TEST_F(ScalarComputationsTest, ClampScalarLowS32) { + ComputationBuilder builder(client_, TestName()); + builder.Clamp(builder.ConstantR0(-1), // The lower bound. + builder.ConstantR0(-5), // The operand to be clamped. + builder.ConstantR0(3)); // The upper bound. + + ComputeAndCompareR0(&builder, -1, {}); +} + +XLA_TEST_F(ScalarComputationsTest, ClampScalarHighU32) { + ComputationBuilder builder(client_, TestName()); + builder.Clamp(builder.ConstantR0(1), // The lower bound. + builder.ConstantR0(5), // The operand to be clamped. + builder.ConstantR0(3)); // The upper bound. + + ComputeAndCompareR0(&builder, 3, {}); +} + +XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleU32) { + ComputationBuilder builder(client_, TestName()); + builder.Clamp(builder.ConstantR0(1), // The lower bound. + builder.ConstantR0(2), // The operand to be clamped. + builder.ConstantR0(3)); // The upper bound. + + ComputeAndCompareR0(&builder, 2, {}); +} + +XLA_TEST_F(ScalarComputationsTest, ClampScalarLowU32) { + ComputationBuilder builder(client_, TestName()); + builder.Clamp(builder.ConstantR0(1), // The lower bound. + builder.ConstantR0(0), // The operand to be clamped. + builder.ConstantR0(3)); // The upper bound. + + ComputeAndCompareR0(&builder, 1, {}); +} + +XLA_TEST_F(ScalarComputationsTest, ClampScalarHighF32) { ComputationBuilder builder(client_, TestName()); builder.Clamp(builder.ConstantR0(2.0f), // The lower bound. builder.ConstantR0(5.0f), // The operand to be clamped. 
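The new ReduceTest above guards against the algebraic simplifier folding a transpose into a reduce with the wrong dimension mapping. A quick NumPy sanity check of the equivalence being exercised (illustration only; shapes taken from the test):

```python
import numpy as np

x = np.random.rand(12, 111, 50)
# Reducing dimension 0 of transpose(x, {1, 0, 2}) matches reducing dimension 1
# of x directly; a correct fold must remap the reduced dimension this way.
via_transpose = x.transpose(1, 0, 2).sum(axis=0)   # shape (12, 50)
direct = x.sum(axis=1)                             # shape (12, 50)
np.testing.assert_allclose(via_transpose, direct)
```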
@@ -746,7 +800,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarHigh) { ComputeAndCompareR0(&builder, 3.0, {}, error_spec_); } -XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddle) { +XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleF32) { ComputationBuilder builder(client_, TestName()); builder.Clamp(builder.ConstantR0(2.0f), // The lower bound. builder.ConstantR0(2.5f), // The operand to be clamped. @@ -755,7 +809,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddle) { ComputeAndCompareR0(&builder, 2.5, {}, error_spec_); } -XLA_TEST_F(ScalarComputationsTest, ClampScalarLow) { +XLA_TEST_F(ScalarComputationsTest, ClampScalarLowF32) { ComputationBuilder builder(client_, TestName()); builder.Clamp(builder.ConstantR0(2.0f), // The lower bound. builder.ConstantR0(-5.0f), // The operand to be clamped. @@ -852,5 +906,12 @@ XLA_TEST_F(ScalarComputationsTest, SqrtF320) { ComputeAndCompareR0(&builder, 0.0f, {zero_data.get()}, error_spec_); } +XLA_TEST_F(ScalarComputationsTest, RoundScalar) { + ComputationBuilder builder(client_, TestName()); + builder.Round(builder.ConstantR0(1.4f)); + + ComputeAndCompareR0(&builder, 1.0f, {}, error_spec_); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc index 0e90a323583..b060fb13b14 100644 --- a/tensorflow/compiler/xla/tests/test_utils.cc +++ b/tensorflow/compiler/xla/tests/test_utils.cc @@ -24,51 +24,127 @@ namespace xla { namespace { template -void PopulateWithRandomFloatingPointData(Literal* literal) { +void PopulateWithRandomFloatingPointData(Literal* literal, + std::minstd_rand0* engine) { CHECK_EQ(literal->shape().element_type(), primitive_util::NativeToPrimitiveType()); - std::minstd_rand0 engine; - // Create uniform numbers between 1 and 1.125 ot avoid creating denormal + // Create uniform numbers between 1 and 1.125 to avoid creating denormal // numbers. std::uniform_real_distribution generator(1.0f, 1.125f); + const bool should_index_bias = ShapeUtil::ElementsIn(literal->shape()) > 1000; TF_CHECK_OK(literal->Populate( [&](tensorflow::gtl::ArraySlice indices) { - // Generate a random uniforma number from -0.0625 and 0.0625 and bias it - // with a position dependent number with mean 0.037109375. These number + // Generate a random uniform number from -0.0625 and 0.0625 and bias it + // with a position dependent number with mean 0.037109375. These number // should allow for long chains of accumulation without being too close - // to zero or to large to accumulate all numbers accurately. - return (generator(engine) - 1.0625) + - static_cast(Product(indices) % 113 - 47) / - static_cast(256.0f); + // to zero or too large to accumulate all numbers accurately. Only do + // this for large literals where the number of elements is much greater + // than 47 otherwise only negative values are produced. + // + // The value is positionally biased using a product of the indices. Add + // one to each index value to avoid collapsing to zero if any of the + // indices are zero. + int64 index_product = 1; + for (int64 i : indices) { + index_product *= (1 + i); + } + const int64 negative_bias = should_index_bias ? 47 : 0; + FloatT index_bias = + static_cast(index_product % 113 - negative_bias) / + static_cast(256.0f); + return (generator(*engine) - 1.0625) + index_bias; })); } // The standard library does not have a case for bfloat16, unsurprisingly, so we // handle that one specially. 
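The comment above explains why PopulateWithRandomFloatingPointData only applies the negative bias of 47 to large literals. A small illustration of that reasoning (not part of the patch; the Python helper below is hypothetical):

```python
def index_bias(indices, negative_bias):
    # For a tiny literal every index product stays far below 47, so subtracting
    # 47 would make every positional bias (and hence most values) negative; for
    # large literals the products wrap around modulo 113 and both signs occur.
    index_product = 1
    for i in indices:
        index_product *= (1 + i)   # +1 keeps a zero index from zeroing the product
    return (index_product % 113 - negative_bias) / 256.0

print([index_bias((i,), negative_bias=47) for i in range(4)])   # all negative
biases = [index_bias((i,), negative_bias=47) for i in range(200)]
print(min(biases), max(biases))   # roughly -0.18 .. 0.25: both signs present
```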
template <> -void PopulateWithRandomFloatingPointData(Literal* literal) { +void PopulateWithRandomFloatingPointData(Literal* literal, + std::minstd_rand0* engine) { CHECK_EQ(literal->shape().element_type(), BF16); - std::minstd_rand0 engine; std::uniform_real_distribution generator(-0.9f, 1.0f); TF_CHECK_OK(literal->Populate( [&](tensorflow::gtl::ArraySlice /*indices*/) { - return static_cast(generator(engine)); + return static_cast(generator(*engine)); })); } template -void PopulateWithRandomIntegralData(Literal* literal) { +void PopulateWithRandomIntegralData(Literal* literal, + std::minstd_rand0* engine) { CHECK_EQ(literal->shape().element_type(), primitive_util::NativeToPrimitiveType()); - std::minstd_rand0 engine; std::uniform_int_distribution generator( std::numeric_limits::lowest(), std::numeric_limits::max()); TF_CHECK_OK(literal->Populate( [&](tensorflow::gtl::ArraySlice /*indices*/) { - return generator(engine); + return generator(*engine); })); } +// Similar to MakeFakeLiteral but takes a random number generator engine to +// enable reusing the engine across randomly generated literals. +StatusOr> MakeFakeLiteralInternal( + const Shape& shape, std::minstd_rand0* engine) { + if (ShapeUtil::IsTuple(shape)) { + std::vector> elements; + for (const Shape& element_shape : shape.tuple_shapes()) { + TF_ASSIGN_OR_RETURN(std::unique_ptr element, + MakeFakeLiteralInternal(element_shape, engine)); + elements.push_back(std::move(element)); + } + return Literal::MakeTupleOwned(std::move(elements)); + } + std::unique_ptr literal = Literal::CreateFromShape(shape); + switch (shape.element_type()) { + case BF16: + PopulateWithRandomFloatingPointData(literal.get(), engine); + break; + case F32: + PopulateWithRandomFloatingPointData(literal.get(), engine); + break; + case F64: + PopulateWithRandomFloatingPointData(literal.get(), engine); + break; + case S8: + PopulateWithRandomIntegralData(literal.get(), engine); + break; + case U8: + PopulateWithRandomIntegralData(literal.get(), engine); + break; + case S16: + PopulateWithRandomIntegralData(literal.get(), engine); + break; + case U16: + PopulateWithRandomIntegralData(literal.get(), engine); + break; + case S32: + PopulateWithRandomIntegralData(literal.get(), engine); + break; + case U32: + PopulateWithRandomIntegralData(literal.get(), engine); + break; + case S64: + PopulateWithRandomIntegralData(literal.get(), engine); + break; + case U64: + PopulateWithRandomIntegralData(literal.get(), engine); + break; + case PRED: { + std::uniform_int_distribution generator(0, 1); + TF_CHECK_OK(literal->Populate( + [&](tensorflow::gtl::ArraySlice /*indices*/) { + return generator(*engine); + })); + break; + } + default: + return Unimplemented("Unsupported type for fake literal generation: %s", + ShapeUtil::HumanString(shape).c_str()); + } + return std::move(literal); +} + // Matches binary addition computations. bool LooksLikeSum(const HloComputation& computation) { const HloInstruction* const root = computation.root_instruction(); @@ -95,15 +171,15 @@ bool NeedsZeroInitValue(const HloUse& use) { // Generate random values that are constrained to the input_shape minus the // output_shape so as not to produce wrapping slices, for instance. 
std::unique_ptr MakeRandomNonwrappingSliceIndex( - const Shape& input_shape, const Shape& slice_shape) { + const Shape& input_shape, const Shape& slice_shape, + std::minstd_rand0* engine) { const int64 rank = ShapeUtil::Rank(input_shape); std::vector start_indices(rank); - std::minstd_rand0 engine; for (int i = 0; i < rank; ++i) { const int32 upper_bound = ShapeUtil::GetDimension(input_shape, i) - ShapeUtil::GetDimension(slice_shape, i); std::uniform_int_distribution generator(0, upper_bound); - start_indices[i] = generator(engine); + start_indices[i] = generator(*engine); } return Literal::CreateR1(start_indices); } @@ -150,7 +226,7 @@ std::vector FindConstrainedUses( // zero in the case of init_values for reductions). StatusOr> CreateLiteralForConstrainedUses( const tensorflow::gtl::ArraySlice constrained_uses, - const HloInstruction& param) { + const HloInstruction& param, std::minstd_rand0* engine) { HloInstruction* needs_index = nullptr; HloInstruction* needs_zero = nullptr; for (HloInstruction* use : constrained_uses) { @@ -185,93 +261,39 @@ StatusOr> CreateLiteralForConstrainedUses( } if (needs_index != nullptr) { return MakeRandomNonwrappingSliceIndex(needs_index->operand(0)->shape(), - needs_index->shape()); + needs_index->shape(), engine); } else if (needs_zero != nullptr) { return Literal::CreateFromShape(param.shape()); } else { - return MakeFakeLiteral(param.shape()); + return MakeFakeLiteralInternal(param.shape(), engine); } } // Given a module entry parameter, use the dataflow analysis to see if a // special case literal must be created, or if we can generate fake data. StatusOr> MakeConstrainedArgument( - const HloDataflowAnalysis& dataflow, const HloInstruction& param) { + const HloDataflowAnalysis& dataflow, const HloInstruction& param, + std::minstd_rand0* engine) { const auto constrained_uses = FindConstrainedUses(dataflow, param); - return CreateLiteralForConstrainedUses(constrained_uses, param); + return CreateLiteralForConstrainedUses(constrained_uses, param, engine); } } // namespace StatusOr> MakeFakeLiteral(const Shape& shape) { - if (ShapeUtil::IsTuple(shape)) { - std::vector> elements; - for (const Shape& element_shape : shape.tuple_shapes()) { - TF_ASSIGN_OR_RETURN(std::unique_ptr element, - MakeFakeLiteral(element_shape)); - elements.push_back(std::move(element)); - } - return Literal::MakeTupleOwned(std::move(elements)); - } - std::unique_ptr literal = Literal::CreateFromShape(shape); - switch (shape.element_type()) { - case BF16: - PopulateWithRandomFloatingPointData(literal.get()); - break; - case F32: - PopulateWithRandomFloatingPointData(literal.get()); - break; - case F64: - PopulateWithRandomFloatingPointData(literal.get()); - break; - case S8: - PopulateWithRandomIntegralData(literal.get()); - break; - case U8: - PopulateWithRandomIntegralData(literal.get()); - break; - case S16: - PopulateWithRandomIntegralData(literal.get()); - break; - case U16: - PopulateWithRandomIntegralData(literal.get()); - break; - case S32: - PopulateWithRandomIntegralData(literal.get()); - break; - case U32: - PopulateWithRandomIntegralData(literal.get()); - break; - case S64: - PopulateWithRandomIntegralData(literal.get()); - break; - case U64: - PopulateWithRandomIntegralData(literal.get()); - break; - case PRED: { - std::uniform_int_distribution generator(0, 1); - std::minstd_rand0 engine; - TF_CHECK_OK(literal->Populate( - [&](tensorflow::gtl::ArraySlice /*indices*/) { - return generator(engine); - })); - break; - } - default: - return Unimplemented("Unsupported type for 
fake literal generation: %s", - ShapeUtil::HumanString(shape).c_str()); - } - return std::move(literal); + std::minstd_rand0 engine; + return MakeFakeLiteralInternal(shape, &engine); } StatusOr>> MakeFakeArguments( HloModule* const module) { TF_ASSIGN_OR_RETURN(auto dataflow, HloDataflowAnalysis::Run(module)); const auto params = module->entry_computation()->parameter_instructions(); + std::minstd_rand0 engine; std::vector> arguments(params.size()); for (int i = 0; i < params.size(); ++i) { - TF_ASSIGN_OR_RETURN(arguments[i], - MakeConstrainedArgument(*dataflow, *params[i])); + TF_ASSIGN_OR_RETURN( + arguments[i], MakeConstrainedArgument(*dataflow, *params[i], &engine)); } return std::move(arguments); } diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc index 5ede37b8737..b82f1c81c84 100644 --- a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc +++ b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc @@ -85,10 +85,12 @@ void RealMain(tensorflow::gtl::ArraySlice args) { for (int i = 0; i < program_shape->parameters_size(); ++i) { layouts.push_back(&program_shape->parameters(i)); } + ExecutableBuildOptions build_options; + build_options.set_device_ordinal(0); + build_options.set_result_layout(program_shape->result()); StatusOr> executable = local_service->CompileExecutable(computation.handle(), layouts, - &program_shape->result(), - /*device_ordinal=*/0); + build_options); const HloModule& module = executable.ValueOrDie()->module(); diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc index 24417a0cb82..05c0fdf97d2 100644 --- a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc +++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc @@ -60,10 +60,13 @@ void RealMain(tensorflow::gtl::ArraySlice args, bool compile) { for (int i = 0; i < program_shape->parameters_size(); ++i) { layouts.push_back(&program_shape->parameters(i)); } + + ExecutableBuildOptions build_options; + build_options.set_device_ordinal(0); + build_options.set_result_layout(program_shape->result()); StatusOr> executable = local_service->CompileExecutable(computation.handle(), layouts, - &program_shape->result(), - /*device_ordinal=*/0); + build_options); const HloModule& module = executable.ValueOrDie()->module(); diff --git a/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc b/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc index 4e02e17db65..8460ae3e499 100644 --- a/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc +++ b/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc @@ -19,7 +19,7 @@ limitations under the License. // // Reads one serilized Hlo module, convert it into JSON format and dump into // some output directory. some_binaray_proto is obtained by serializing Hlo -// module to disk using --xla_dump_hlo_proto_to debug optoin. +// module to disk using --xla_dump_optimized_hlo_proto_to debug option. 
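MakeFakeArguments above now threads a single std::minstd_rand0 through every MakeFakeLiteralInternal call. Presumably this avoids separate default-seeded engines handing every parameter the identical pseudo-random stream; a rough Python analogue of that pitfall (illustration only, not the library's API):

```python
import random

def fake_values(engine=None):
    # Stand-in for the C++ code: a default-constructed engine always starts
    # from the same seed, so per-call engines repeat the same stream.
    engine = engine if engine is not None else random.Random(0)
    return [engine.random() for _ in range(3)]

print(fake_values() == fake_values())               # True: each call repeats itself
shared = random.Random(0)
print(fake_values(shared) == fake_values(shared))   # False: a shared engine advances
```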
#include #include diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc index 42e7f91f26f..d9c4d094b8e 100644 --- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc +++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc @@ -2173,7 +2173,7 @@ bool HloParser::ParseConvolutionDimensionNumbers( // // {[2:3:4], [5:6:7], [8:9]} // -// The the parsed result will be: +// The parsed result will be: // // {/*starts=*/{2, 5, 8}, /*limits=*/{3, 6, 9}, /*strides=*/{4, 7, 1}} // diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc index b0209050350..1f0c626bbb2 100644 --- a/tensorflow/compiler/xla/util.cc +++ b/tensorflow/compiler/xla/util.cc @@ -339,7 +339,7 @@ std::vector> CommonFactors( string SanitizeFileName(string file_name) { for (char& c : file_name) { - if (c == '/' || c == '\\' || c == '[' || c == ']') { + if (c == '/' || c == '\\' || c == '[' || c == ']' || c == ' ') { c = '_'; } } diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h index 4bc2d632cd8..08df5b12b3a 100644 --- a/tensorflow/compiler/xla/util.h +++ b/tensorflow/compiler/xla/util.h @@ -217,6 +217,24 @@ Status Unavailable(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2); // Passed-varargs variant of the InvalidArgument factory above. Status InvalidArgumentV(const char* format, va_list args); +template +Status UnimplementedStrCat(Args&&... concat) { + return Unimplemented( + "%s", tensorflow::strings::StrCat(std::forward(concat)...).c_str()); +} + +template +Status InternalErrorStrCat(Args&&... concat) { + return InternalError( + "%s", tensorflow::strings::StrCat(std::forward(concat)...).c_str()); +} + +template +Status ResourceExhaustedStrCat(Args&&... concat) { + return ResourceExhausted( + "%s", tensorflow::strings::StrCat(std::forward(concat)...).c_str()); +} + // Splits the lines of the original, replaces leading whitespace with the prefix // given by "indentation", and returns the string joined by newlines again. As a // side effect, any additional trailing whitespace is removed. @@ -342,7 +360,7 @@ T CeilOfRatio(T dividend, T divisor) { } // Rounds the value up to a multiple of the divisor by first calling CeilOfRatio -// then multiplying by the divisor. For example: RoundUpToMultiple(13, 8) => 16 +// then multiplying by the divisor. For example: RoundUpToNearest(13, 8) => 16 template T RoundUpToNearest(T value, T divisor) { return CeilOfRatio(value, divisor) * divisor; @@ -350,7 +368,7 @@ T RoundUpToNearest(T value, T divisor) { // Rounds the value down to a multiple of the divisor by first calling // FloorOfRatio then multiplying by the divisor. For example: -// RoundUpToMultiple(13, 8) => 8 +// RoundDownToNearest(13, 8) => 8 template T RoundDownToNearest(T value, T divisor) { return FloorOfRatio(value, divisor) * divisor; diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index e1ed08c8480..56162ab44e2 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -82,8 +82,9 @@ message DebugOptions { // Dump all HLO modules as text into the provided directory path. string xla_generate_hlo_text_to = 7; - // Dump compilation artifacts in binary proto into this directory. - string xla_dump_hlo_proto_to = 8; + // Dump Hlo after all hlo passes are executed as proto binary into this + // directory. + string xla_dump_optimized_hlo_proto_to = 8; // Instrument the computation to collect per-HLO cycle counts. 
bool xla_hlo_profile = 9; @@ -179,9 +180,13 @@ message DebugOptions { // ops. bool xla_gpu_use_cudnn_batchnorm = 94; - // Dump compilation artifacts, before hlo passes are executed, in binary proto - // into this directory. - string xla_dump_prepass_hlo_proto_to = 95; + // Dump HLO before any hlo passes are executed as proto binary into this + // directory. + string xla_dump_unoptimized_hlo_proto_to = 95; + + // Dump HLO after each pass as an HloProto in binary file format into this + // directory. + string xla_dump_per_pass_hlo_proto_to = 96; // Extra options to pass to the compilation backend; specific interpretation // of these values is left to the backend. diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD index e04ce01f437..3e9664a0187 100644 --- a/tensorflow/contrib/BUILD +++ b/tensorflow/contrib/BUILD @@ -25,6 +25,7 @@ py_library( "//tensorflow/contrib/bayesflow:bayesflow_py", "//tensorflow/contrib/boosted_trees:init_py", "//tensorflow/contrib/cloud:cloud_py", + "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip", "//tensorflow/contrib/cluster_resolver:cluster_resolver_py", "//tensorflow/contrib/coder:coder_ops_py", "//tensorflow/contrib/compiler:compiler_py", @@ -49,6 +50,7 @@ py_library( "//tensorflow/contrib/image:single_image_random_dot_stereograms_py", "//tensorflow/contrib/input_pipeline:input_pipeline_py", "//tensorflow/contrib/integrate:integrate_py", + "//tensorflow/contrib/kafka", "//tensorflow/contrib/keras", "//tensorflow/contrib/kernel_methods", "//tensorflow/contrib/kfac", @@ -77,6 +79,7 @@ py_library( "//tensorflow/contrib/predictor", "//tensorflow/contrib/quantization:quantization_py", "//tensorflow/contrib/quantize:quantize_graph", + "//tensorflow/contrib/py2tf", "//tensorflow/contrib/receptive_field:receptive_field_py", "//tensorflow/contrib/reduce_slice_ops:reduce_slice_ops_py", "//tensorflow/contrib/remote_fused_graph/pylib:remote_fused_graph_ops_py", @@ -143,6 +146,7 @@ cc_library( "//tensorflow/contrib/factorization:all_ops", "//tensorflow/contrib/framework:all_ops", "//tensorflow/contrib/input_pipeline:input_pipeline_ops_op_lib", + "//tensorflow/contrib/kafka:kafka_ops_op_lib", "//tensorflow/contrib/layers:sparse_feature_cross_op_op_lib", "//tensorflow/contrib/nccl:nccl_ops_op_lib", "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_ops_op_lib", diff --git a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java index dc5b9fb8874..e51e3f747b5 100644 --- a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java +++ b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java @@ -194,6 +194,13 @@ public class TensorFlowInferenceInterface { * @param outputNames A list of output nodes which should be filled by the inference pass. */ public void run(String[] outputNames, boolean enableStats) { + run(outputNames, enableStats, new String[] {}); + } + + /** + * An overloaded version of runInference that allows supplying targetNodeNames as well + */ + public void run(String[] outputNames, boolean enableStats, String[] targetNodeNames) { // Release any Tensors from the previous run calls. closeFetches(); @@ -204,6 +211,11 @@ public class TensorFlowInferenceInterface { runner.fetch(tid.name, tid.outputIndex); } + // Add targets. + for (String t : targetNodeNames) { + runner.addTarget(t); + } + // Run the session. 
try { if (enableStats) { diff --git a/tensorflow/contrib/android/jni/run_stats_jni.cc b/tensorflow/contrib/android/jni/run_stats_jni.cc index 119fa9cd2c3..707853b59be 100644 --- a/tensorflow/contrib/android/jni/run_stats_jni.cc +++ b/tensorflow/contrib/android/jni/run_stats_jni.cc @@ -21,8 +21,8 @@ limitations under the License. #include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/util/stat_summarizer.h" -using tensorflow::StatSummarizer; using tensorflow::RunMetadata; +using tensorflow::StatSummarizer; namespace { StatSummarizer* requireHandle(JNIEnv* env, jlong handle) { diff --git a/tensorflow/contrib/bayesflow/BUILD b/tensorflow/contrib/bayesflow/BUILD index 11c3c037c4e..6e0f0a05726 100644 --- a/tensorflow/contrib/bayesflow/BUILD +++ b/tensorflow/contrib/bayesflow/BUILD @@ -217,6 +217,7 @@ cuda_py_test( "//tensorflow/python:platform_test", "//tensorflow/python:random_seed", ], + tags = ["notsan"], ) cuda_py_test( diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py index cbc66b6dc13..d244d2f4f53 100644 --- a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py +++ b/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py @@ -19,29 +19,36 @@ from __future__ import division from __future__ import print_function import numpy as np -from scipy import special from scipy import stats from tensorflow.contrib.bayesflow.python.ops import hmc +from tensorflow.contrib.bayesflow.python.ops.hmc_impl import _compute_energy_change +from tensorflow.contrib.bayesflow.python.ops.hmc_impl import _leapfrog_integrator +from tensorflow.contrib.distributions.python.ops import independent as independent_lib +from tensorflow.python.framework import ops from tensorflow.python.framework import random_seed from tensorflow.python.ops import array_ops -from tensorflow.python.ops import gradients_impl +from tensorflow.python.ops import gradients_impl as gradients_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops +from tensorflow.python.ops.distributions import gamma as gamma_lib +from tensorflow.python.ops.distributions import normal as normal_lib from tensorflow.python.platform import test -from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.platform import tf_logging as logging_ops + + +def _reduce_variance(x, axis=None, keepdims=False): + sample_mean = math_ops.reduce_mean(x, axis, keepdims=True) + return math_ops.reduce_mean( + math_ops.squared_difference(x, sample_mean), axis, keepdims) -# TODO(b/66964210): Test float16. class HMCTest(test.TestCase): def setUp(self): self._shape_param = 5. self._rate_param = 10. - self._expected_x = (special.digamma(self._shape_param) - - np.log(self._rate_param)) - self._expected_exp_x = self._shape_param / self._rate_param random_seed.set_random_seed(10003) np.random.seed(10003) @@ -63,63 +70,46 @@ class HMCTest(test.TestCase): self._rate_param * math_ops.exp(x), event_dims) - def _log_gamma_log_prob_grad(self, x, event_dims=()): - """Computes log-pdf and gradient of a log-gamma random variable. - - Args: - x: Value of the random variable. - event_dims: Dimensions not to treat as independent. Default is (), - i.e., all dimensions are independent. - - Returns: - log_prob: The log-pdf up to a normalizing constant. - grad: The gradient of the log-pdf with respect to x. 
- """ - return (math_ops.reduce_sum(self._shape_param * x - - self._rate_param * math_ops.exp(x), - event_dims), - self._shape_param - self._rate_param * math_ops.exp(x)) - - def _n_event_dims(self, x_shape, event_dims): - return np.prod([int(x_shape[i]) for i in event_dims]) - - def _integrator_conserves_energy(self, x, event_dims, sess, + def _integrator_conserves_energy(self, x, independent_chain_ndims, sess, feed_dict=None): - def potential_and_grad(x): - log_prob, grad = self._log_gamma_log_prob_grad(x, event_dims) - return -log_prob, -grad - - step_size = array_ops.placeholder(np.float32, [], name='step_size') - hmc_lf_steps = array_ops.placeholder(np.int32, [], name='hmc_lf_steps') + step_size = array_ops.placeholder(np.float32, [], name="step_size") + hmc_lf_steps = array_ops.placeholder(np.int32, [], name="hmc_lf_steps") if feed_dict is None: feed_dict = {} feed_dict[hmc_lf_steps] = 1000 + event_dims = math_ops.range(independent_chain_ndims, + array_ops.rank(x)) + m = random_ops.random_normal(array_ops.shape(x)) - potential_0, grad_0 = potential_and_grad(x) - old_energy = potential_0 + 0.5 * math_ops.reduce_sum(m * m, - event_dims) + log_prob_0 = self._log_gamma_log_prob(x, event_dims) + grad_0 = gradients_ops.gradients(log_prob_0, x) + old_energy = -log_prob_0 + 0.5 * math_ops.reduce_sum(m**2., event_dims) - _, new_m, potential_1, _ = ( - hmc.leapfrog_integrator(step_size, hmc_lf_steps, x, - m, potential_and_grad, grad_0)) + new_m, _, log_prob_1, _ = _leapfrog_integrator( + current_momentums=[m], + target_log_prob_fn=lambda x: self._log_gamma_log_prob(x, event_dims), + current_state_parts=[x], + step_sizes=[step_size], + num_leapfrog_steps=hmc_lf_steps, + current_target_log_prob=log_prob_0, + current_grads_target_log_prob=grad_0) + new_m = new_m[0] - new_energy = potential_1 + 0.5 * math_ops.reduce_sum(new_m * new_m, + new_energy = -log_prob_1 + 0.5 * math_ops.reduce_sum(new_m * new_m, event_dims) x_shape = sess.run(x, feed_dict).shape - n_event_dims = self._n_event_dims(x_shape, event_dims) - feed_dict[step_size] = 0.1 / n_event_dims - old_energy_val, new_energy_val = sess.run([old_energy, new_energy], - feed_dict) - logging.vlog(1, 'average energy change: {}'.format( - abs(old_energy_val - new_energy_val).mean())) + event_size = np.prod(x_shape[independent_chain_ndims:]) + feed_dict[step_size] = 0.1 / event_size + old_energy_, new_energy_ = sess.run([old_energy, new_energy], + feed_dict) + logging_ops.vlog(1, "average energy relative change: {}".format( + (1. - new_energy_ / old_energy_).mean())) + self.assertAllClose(old_energy_, new_energy_, atol=0., rtol=0.02) - self.assertAllEqual(np.ones_like(new_energy_val, dtype=np.bool), - abs(old_energy_val - new_energy_val) < 1.) - - def _integrator_conserves_energy_wrapper(self, event_dims): + def _integrator_conserves_energy_wrapper(self, independent_chain_ndims): """Tests the long-term energy conservation of the leapfrog integrator. The leapfrog integrator is symplectic, so for sufficiently small step @@ -127,135 +117,167 @@ class HMCTest(test.TestCase): the energy of the system blowing up or collapsing. Args: - event_dims: A tuple of dimensions that should not be treated as - independent. This allows for multiple chains to be run independently - in parallel. Default is (), i.e., all dimensions are independent. + independent_chain_ndims: Python `int` scalar representing the number of + dims associated with independent chains. 
""" with self.test_session() as sess: - x_ph = array_ops.placeholder(np.float32, name='x_ph') - - feed_dict = {x_ph: np.zeros([50, 10, 2])} - self._integrator_conserves_energy(x_ph, event_dims, sess, feed_dict) + x_ph = array_ops.placeholder(np.float32, name="x_ph") + feed_dict = {x_ph: np.random.rand(50, 10, 2)} + self._integrator_conserves_energy(x_ph, independent_chain_ndims, + sess, feed_dict) def testIntegratorEnergyConservationNullShape(self): - self._integrator_conserves_energy_wrapper([]) + self._integrator_conserves_energy_wrapper(0) def testIntegratorEnergyConservation1(self): - self._integrator_conserves_energy_wrapper([1]) + self._integrator_conserves_energy_wrapper(1) def testIntegratorEnergyConservation2(self): - self._integrator_conserves_energy_wrapper([2]) + self._integrator_conserves_energy_wrapper(2) - def testIntegratorEnergyConservation12(self): - self._integrator_conserves_energy_wrapper([1, 2]) + def testIntegratorEnergyConservation3(self): + self._integrator_conserves_energy_wrapper(3) - def testIntegratorEnergyConservation012(self): - self._integrator_conserves_energy_wrapper([0, 1, 2]) - - def _chain_gets_correct_expectations(self, x, event_dims, sess, - feed_dict=None): + def _chain_gets_correct_expectations(self, x, independent_chain_ndims, + sess, feed_dict=None): def log_gamma_log_prob(x): + event_dims = math_ops.range(independent_chain_ndims, + array_ops.rank(x)) return self._log_gamma_log_prob(x, event_dims) - step_size = array_ops.placeholder(np.float32, [], name='step_size') - hmc_lf_steps = array_ops.placeholder(np.int32, [], name='hmc_lf_steps') - hmc_n_steps = array_ops.placeholder(np.int32, [], name='hmc_n_steps') + num_results = array_ops.placeholder( + np.int32, [], name="num_results") + step_size = array_ops.placeholder( + np.float32, [], name="step_size") + num_leapfrog_steps = array_ops.placeholder( + np.int32, [], name="num_leapfrog_steps") if feed_dict is None: feed_dict = {} - feed_dict.update({step_size: 0.1, - hmc_lf_steps: 2, - hmc_n_steps: 300}) + feed_dict.update({num_results: 150, + step_size: 0.1, + num_leapfrog_steps: 2}) - sample_chain, acceptance_prob_chain = hmc.chain([hmc_n_steps], - step_size, - hmc_lf_steps, - x, log_gamma_log_prob, - event_dims) + samples, kernel_results = hmc.sample_chain( + num_results=num_results, + target_log_prob_fn=log_gamma_log_prob, + current_state=x, + step_size=step_size, + num_leapfrog_steps=num_leapfrog_steps, + num_burnin_steps=150, + seed=42) - acceptance_probs, samples = sess.run([acceptance_prob_chain, sample_chain], - feed_dict) - samples = samples[feed_dict[hmc_n_steps] // 2:] - expected_x_est = samples.mean() - expected_exp_x_est = np.exp(samples).mean() + expected_x = (math_ops.digamma(self._shape_param) + - np.log(self._rate_param)) - logging.vlog(1, 'True E[x, exp(x)]: {}\t{}'.format( - self._expected_x, self._expected_exp_x)) - logging.vlog(1, 'Estimated E[x, exp(x)]: {}\t{}'.format( - expected_x_est, expected_exp_x_est)) - self.assertNear(expected_x_est, self._expected_x, 2e-2) - self.assertNear(expected_exp_x_est, self._expected_exp_x, 2e-2) - self.assertTrue((acceptance_probs > 0.5).all()) - self.assertTrue((acceptance_probs <= 1.0).all()) + expected_exp_x = self._shape_param / self._rate_param - def _chain_gets_correct_expectations_wrapper(self, event_dims): + acceptance_probs_, samples_, expected_x_ = sess.run( + [kernel_results.acceptance_probs, samples, expected_x], + feed_dict) + + actual_x = samples_.mean() + actual_exp_x = np.exp(samples_).mean() + + logging_ops.vlog(1, "True E[x, 
exp(x)]: {}\t{}".format( + expected_x_, expected_exp_x)) + logging_ops.vlog(1, "Estimated E[x, exp(x)]: {}\t{}".format( + actual_x, actual_exp_x)) + self.assertNear(actual_x, expected_x_, 2e-2) + self.assertNear(actual_exp_x, expected_exp_x, 2e-2) + self.assertTrue((acceptance_probs_ > 0.5).all()) + self.assertTrue((acceptance_probs_ <= 1.0).all()) + + def _chain_gets_correct_expectations_wrapper(self, independent_chain_ndims): with self.test_session() as sess: - x_ph = array_ops.placeholder(np.float32, name='x_ph') - - feed_dict = {x_ph: np.zeros([50, 10, 2])} - self._chain_gets_correct_expectations(x_ph, event_dims, sess, - feed_dict) + x_ph = array_ops.placeholder(np.float32, name="x_ph") + feed_dict = {x_ph: np.random.rand(50, 10, 2)} + self._chain_gets_correct_expectations(x_ph, independent_chain_ndims, + sess, feed_dict) def testHMCChainExpectationsNullShape(self): - self._chain_gets_correct_expectations_wrapper([]) + self._chain_gets_correct_expectations_wrapper(0) def testHMCChainExpectations1(self): - self._chain_gets_correct_expectations_wrapper([1]) + self._chain_gets_correct_expectations_wrapper(1) def testHMCChainExpectations2(self): - self._chain_gets_correct_expectations_wrapper([2]) + self._chain_gets_correct_expectations_wrapper(2) - def testHMCChainExpectations12(self): - self._chain_gets_correct_expectations_wrapper([1, 2]) - - def _kernel_leaves_target_invariant(self, initial_draws, event_dims, + def _kernel_leaves_target_invariant(self, initial_draws, + independent_chain_ndims, sess, feed_dict=None): def log_gamma_log_prob(x): + event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x)) return self._log_gamma_log_prob(x, event_dims) def fake_log_prob(x): """Cooled version of the target distribution.""" return 1.1 * log_gamma_log_prob(x) - step_size = array_ops.placeholder(np.float32, [], name='step_size') + step_size = array_ops.placeholder(np.float32, [], name="step_size") if feed_dict is None: feed_dict = {} feed_dict[step_size] = 0.4 - sample, acceptance_probs, _, _ = hmc.kernel(step_size, 5, initial_draws, - log_gamma_log_prob, event_dims) - bad_sample, bad_acceptance_probs, _, _ = hmc.kernel( - step_size, 5, initial_draws, fake_log_prob, event_dims) - (acceptance_probs_val, bad_acceptance_probs_val, initial_draws_val, - updated_draws_val, fake_draws_val) = sess.run([acceptance_probs, - bad_acceptance_probs, - initial_draws, sample, - bad_sample], feed_dict) + sample, kernel_results = hmc.kernel( + target_log_prob_fn=log_gamma_log_prob, + current_state=initial_draws, + step_size=step_size, + num_leapfrog_steps=5, + seed=43) + + bad_sample, bad_kernel_results = hmc.kernel( + target_log_prob_fn=fake_log_prob, + current_state=initial_draws, + step_size=step_size, + num_leapfrog_steps=5, + seed=44) + + [ + acceptance_probs_, + bad_acceptance_probs_, + initial_draws_, + updated_draws_, + fake_draws_, + ] = sess.run([ + kernel_results.acceptance_probs, + bad_kernel_results.acceptance_probs, + initial_draws, + sample, + bad_sample, + ], feed_dict) + # Confirm step size is small enough that we usually accept. - self.assertGreater(acceptance_probs_val.mean(), 0.5) - self.assertGreater(bad_acceptance_probs_val.mean(), 0.5) + self.assertGreater(acceptance_probs_.mean(), 0.5) + self.assertGreater(bad_acceptance_probs_.mean(), 0.5) + # Confirm step size is large enough that we sometimes reject. 
- self.assertLess(acceptance_probs_val.mean(), 0.99) - self.assertLess(bad_acceptance_probs_val.mean(), 0.99) - _, ks_p_value_true = stats.ks_2samp(initial_draws_val.flatten(), - updated_draws_val.flatten()) - _, ks_p_value_fake = stats.ks_2samp(initial_draws_val.flatten(), - fake_draws_val.flatten()) - logging.vlog(1, 'acceptance rate for true target: {}'.format( - acceptance_probs_val.mean())) - logging.vlog(1, 'acceptance rate for fake target: {}'.format( - bad_acceptance_probs_val.mean())) - logging.vlog(1, 'K-S p-value for true target: {}'.format(ks_p_value_true)) - logging.vlog(1, 'K-S p-value for fake target: {}'.format(ks_p_value_fake)) + self.assertLess(acceptance_probs_.mean(), 0.99) + self.assertLess(bad_acceptance_probs_.mean(), 0.99) + + _, ks_p_value_true = stats.ks_2samp(initial_draws_.flatten(), + updated_draws_.flatten()) + _, ks_p_value_fake = stats.ks_2samp(initial_draws_.flatten(), + fake_draws_.flatten()) + + logging_ops.vlog(1, "acceptance rate for true target: {}".format( + acceptance_probs_.mean())) + logging_ops.vlog(1, "acceptance rate for fake target: {}".format( + bad_acceptance_probs_.mean())) + logging_ops.vlog(1, "K-S p-value for true target: {}".format( + ks_p_value_true)) + logging_ops.vlog(1, "K-S p-value for fake target: {}".format( + ks_p_value_fake)) # Make sure that the MCMC update hasn't changed the empirical CDF much. self.assertGreater(ks_p_value_true, 1e-3) # Confirm that targeting the wrong distribution does # significantly change the empirical CDF. self.assertLess(ks_p_value_fake, 1e-6) - def _kernel_leaves_target_invariant_wrapper(self, event_dims): + def _kernel_leaves_target_invariant_wrapper(self, independent_chain_ndims): """Tests that the kernel leaves the target distribution invariant. Draws some independent samples from the target distribution, @@ -267,86 +289,116 @@ class HMCTest(test.TestCase): does change the target distribution. (And that we can detect that.) Args: - event_dims: A tuple of dimensions that should not be treated as - independent. This allows for multiple chains to be run independently - in parallel. Default is (), i.e., all dimensions are independent. + independent_chain_ndims: Python `int` scalar representing the number of + dims associated with independent chains. 
""" with self.test_session() as sess: initial_draws = np.log(np.random.gamma(self._shape_param, size=[50000, 2, 2])) initial_draws -= np.log(self._rate_param) - x_ph = array_ops.placeholder(np.float32, name='x_ph') + x_ph = array_ops.placeholder(np.float32, name="x_ph") feed_dict = {x_ph: initial_draws} - self._kernel_leaves_target_invariant(x_ph, event_dims, sess, - feed_dict) - - def testKernelLeavesTargetInvariantNullShape(self): - self._kernel_leaves_target_invariant_wrapper([]) + self._kernel_leaves_target_invariant(x_ph, independent_chain_ndims, + sess, feed_dict) def testKernelLeavesTargetInvariant1(self): - self._kernel_leaves_target_invariant_wrapper([1]) + self._kernel_leaves_target_invariant_wrapper(1) def testKernelLeavesTargetInvariant2(self): - self._kernel_leaves_target_invariant_wrapper([2]) + self._kernel_leaves_target_invariant_wrapper(2) - def testKernelLeavesTargetInvariant12(self): - self._kernel_leaves_target_invariant_wrapper([1, 2]) + def testKernelLeavesTargetInvariant3(self): + self._kernel_leaves_target_invariant_wrapper(3) - def _ais_gets_correct_log_normalizer(self, init, event_dims, sess, - feed_dict=None): + def _ais_gets_correct_log_normalizer(self, init, independent_chain_ndims, + sess, feed_dict=None): def proposal_log_prob(x): - return math_ops.reduce_sum(-0.5 * x * x - 0.5 * np.log(2*np.pi), - event_dims) + event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x)) + return -0.5 * math_ops.reduce_sum(x**2. + np.log(2 * np.pi), + axis=event_dims) def target_log_prob(x): + event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x)) return self._log_gamma_log_prob(x, event_dims) if feed_dict is None: feed_dict = {} - w, _, _ = hmc.ais_chain(200, 0.5, 2, init, target_log_prob, - proposal_log_prob, event_dims) + num_steps = 200 - w_val = sess.run(w, feed_dict) - init_shape = sess.run(init, feed_dict).shape - normalizer_multiplier = np.prod([init_shape[i] for i in event_dims]) + _, ais_weights, _ = hmc.sample_annealed_importance_chain( + proposal_log_prob_fn=proposal_log_prob, + num_steps=num_steps, + target_log_prob_fn=target_log_prob, + step_size=0.5, + current_state=init, + num_leapfrog_steps=2, + seed=45) - true_normalizer = -self._shape_param * np.log(self._rate_param) - true_normalizer += special.gammaln(self._shape_param) - true_normalizer *= normalizer_multiplier + event_shape = array_ops.shape(init)[independent_chain_ndims:] + event_size = math_ops.reduce_prod(event_shape) - n_weights = np.prod(w_val.shape) - normalized_w = np.exp(w_val - true_normalizer) - standard_error = np.std(normalized_w) / np.sqrt(n_weights) - logging.vlog(1, 'True normalizer {}, estimated {}, n_weights {}'.format( - true_normalizer, np.log(normalized_w.mean()) + true_normalizer, - n_weights)) - self.assertNear(normalized_w.mean(), 1.0, 4.0 * standard_error) + log_true_normalizer = ( + -self._shape_param * math_ops.log(self._rate_param) + + math_ops.lgamma(self._shape_param)) + log_true_normalizer *= math_ops.cast(event_size, log_true_normalizer.dtype) - def _ais_gets_correct_log_normalizer_wrapper(self, event_dims): + log_estimated_normalizer = (math_ops.reduce_logsumexp(ais_weights) + - np.log(num_steps)) + + ratio_estimate_true = math_ops.exp(ais_weights - log_true_normalizer) + ais_weights_size = array_ops.size(ais_weights) + standard_error = math_ops.sqrt( + _reduce_variance(ratio_estimate_true) + / math_ops.cast(ais_weights_size, ratio_estimate_true.dtype)) + + [ + ratio_estimate_true_, + log_true_normalizer_, + log_estimated_normalizer_, + 
standard_error_, + ais_weights_size_, + event_size_, + ] = sess.run([ + ratio_estimate_true, + log_true_normalizer, + log_estimated_normalizer, + standard_error, + ais_weights_size, + event_size, + ], feed_dict) + + logging_ops.vlog(1, " log_true_normalizer: {}\n" + " log_estimated_normalizer: {}\n" + " ais_weights_size: {}\n" + " event_size: {}\n".format( + log_true_normalizer_, + log_estimated_normalizer_, + ais_weights_size_, + event_size_)) + self.assertNear(ratio_estimate_true_.mean(), 1., 4. * standard_error_) + + def _ais_gets_correct_log_normalizer_wrapper(self, independent_chain_ndims): """Tests that AIS yields reasonable estimates of normalizers.""" with self.test_session() as sess: - x_ph = array_ops.placeholder(np.float32, name='x_ph') - + x_ph = array_ops.placeholder(np.float32, name="x_ph") initial_draws = np.random.normal(size=[30, 2, 1]) - feed_dict = {x_ph: initial_draws} - - self._ais_gets_correct_log_normalizer(x_ph, event_dims, sess, - feed_dict) - - def testAISNullShape(self): - self._ais_gets_correct_log_normalizer_wrapper([]) + self._ais_gets_correct_log_normalizer( + x_ph, + independent_chain_ndims, + sess, + feed_dict={x_ph: initial_draws}) def testAIS1(self): - self._ais_gets_correct_log_normalizer_wrapper([1]) + self._ais_gets_correct_log_normalizer_wrapper(1) def testAIS2(self): - self._ais_gets_correct_log_normalizer_wrapper([2]) + self._ais_gets_correct_log_normalizer_wrapper(2) - def testAIS12(self): - self._ais_gets_correct_log_normalizer_wrapper([1, 2]) + def testAIS3(self): + self._ais_gets_correct_log_normalizer_wrapper(3) def testNanRejection(self): """Tests that an update that yields NaN potentials gets rejected. @@ -359,24 +411,29 @@ class HMCTest(test.TestCase): """ def _unbounded_exponential_log_prob(x): """An exponential distribution with log-likelihood NaN for x < 0.""" - per_element_potentials = array_ops.where(x < 0, - np.nan * array_ops.ones_like(x), - -x) + per_element_potentials = array_ops.where( + x < 0., + array_ops.fill(array_ops.shape(x), x.dtype.as_numpy_dtype(np.nan)), + -x) return math_ops.reduce_sum(per_element_potentials) with self.test_session() as sess: initial_x = math_ops.linspace(0.01, 5, 10) - updated_x, acceptance_probs, _, _ = hmc.kernel( - 2., 5, initial_x, _unbounded_exponential_log_prob, [0]) - initial_x_val, updated_x_val, acceptance_probs_val = sess.run( - [initial_x, updated_x, acceptance_probs]) + updated_x, kernel_results = hmc.kernel( + target_log_prob_fn=_unbounded_exponential_log_prob, + current_state=initial_x, + step_size=2., + num_leapfrog_steps=5, + seed=46) + initial_x_, updated_x_, acceptance_probs_ = sess.run( + [initial_x, updated_x, kernel_results.acceptance_probs]) - logging.vlog(1, 'initial_x = {}'.format(initial_x_val)) - logging.vlog(1, 'updated_x = {}'.format(updated_x_val)) - logging.vlog(1, 'acceptance_probs = {}'.format(acceptance_probs_val)) + logging_ops.vlog(1, "initial_x = {}".format(initial_x_)) + logging_ops.vlog(1, "updated_x = {}".format(updated_x_)) + logging_ops.vlog(1, "acceptance_probs = {}".format(acceptance_probs_)) - self.assertAllEqual(initial_x_val, updated_x_val) - self.assertEqual(acceptance_probs_val, 0.) + self.assertAllEqual(initial_x_, updated_x_) + self.assertEqual(acceptance_probs_, 0.) 
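For reference (illustration only, not part of the patch): the closed-form log normalizer used in _ais_gets_correct_log_normalizer above follows from substituting u = exp(x) in the unnormalized density exp(shape*x - rate*exp(x)), which integrates to Gamma(shape) / rate^shape. A quick numerical check with the values from HMCTest.setUp:

```python
import numpy as np
from scipy import integrate, special

shape_param, rate_param = 5., 10.
log_z_closed_form = special.gammaln(shape_param) - shape_param * np.log(rate_param)
z_numeric, _ = integrate.quad(
    lambda x: np.exp(shape_param * x - rate_param * np.exp(x)), -20., 5.)
np.testing.assert_allclose(np.log(z_numeric), log_z_closed_form, rtol=1e-4)
```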
def testNanFromGradsDontPropagate(self): """Test that update with NaN gradients does not cause NaN in results.""" @@ -385,60 +442,195 @@ class HMCTest(test.TestCase): with self.test_session() as sess: initial_x = math_ops.linspace(0.01, 5, 10) - updated_x, acceptance_probs, new_log_prob, new_grad = hmc.kernel( - 2., 5, initial_x, _nan_log_prob_with_nan_gradient, [0]) - initial_x_val, updated_x_val, acceptance_probs_val = sess.run( - [initial_x, updated_x, acceptance_probs]) + updated_x, kernel_results = hmc.kernel( + target_log_prob_fn=_nan_log_prob_with_nan_gradient, + current_state=initial_x, + step_size=2., + num_leapfrog_steps=5, + seed=47) + initial_x_, updated_x_, acceptance_probs_ = sess.run( + [initial_x, updated_x, kernel_results.acceptance_probs]) - logging.vlog(1, 'initial_x = {}'.format(initial_x_val)) - logging.vlog(1, 'updated_x = {}'.format(updated_x_val)) - logging.vlog(1, 'acceptance_probs = {}'.format(acceptance_probs_val)) + logging_ops.vlog(1, "initial_x = {}".format(initial_x_)) + logging_ops.vlog(1, "updated_x = {}".format(updated_x_)) + logging_ops.vlog(1, "acceptance_probs = {}".format(acceptance_probs_)) - self.assertAllEqual(initial_x_val, updated_x_val) - self.assertEqual(acceptance_probs_val, 0.) + self.assertAllEqual(initial_x_, updated_x_) + self.assertEqual(acceptance_probs_, 0.) self.assertAllFinite( - gradients_impl.gradients(updated_x, initial_x)[0].eval()) - self.assertTrue( - gradients_impl.gradients(new_grad, initial_x)[0] is None) + gradients_ops.gradients(updated_x, initial_x)[0].eval()) + self.assertAllEqual([True], [g is None for g in gradients_ops.gradients( + kernel_results.proposed_grads_target_log_prob, initial_x)]) + self.assertAllEqual([False], [g is None for g in gradients_ops.gradients( + kernel_results.proposed_grads_target_log_prob, + kernel_results.proposed_state)]) # Gradients of the acceptance probs and new log prob are not finite. - _ = new_log_prob # Prevent unused arg error. 
# self.assertAllFinite( - # gradients_impl.gradients(acceptance_probs, initial_x)[0].eval()) + # gradients_ops.gradients(acceptance_probs, initial_x)[0].eval()) # self.assertAllFinite( - # gradients_impl.gradients(new_log_prob, initial_x)[0].eval()) + # gradients_ops.gradients(new_log_prob, initial_x)[0].eval()) + + def _testChainWorksDtype(self, dtype): + states, kernel_results = hmc.sample_chain( + num_results=10, + target_log_prob_fn=lambda x: -math_ops.reduce_sum(x**2., axis=-1), + current_state=np.zeros(5).astype(dtype), + step_size=0.01, + num_leapfrog_steps=10, + seed=48) + with self.test_session() as sess: + states_, acceptance_probs_ = sess.run( + [states, kernel_results.acceptance_probs]) + self.assertEqual(dtype, states_.dtype) + self.assertEqual(dtype, acceptance_probs_.dtype) def testChainWorksIn64Bit(self): - def log_prob(x): - return - math_ops.reduce_sum(x * x, axis=-1) - states, acceptance_probs = hmc.chain( - n_iterations=10, - step_size=np.float64(0.01), - n_leapfrog_steps=10, - initial_x=np.zeros(5).astype(np.float64), - target_log_prob_fn=log_prob, - event_dims=[-1]) - with self.test_session() as sess: - states_, acceptance_probs_ = sess.run([states, acceptance_probs]) - self.assertEqual(np.float64, states_.dtype) - self.assertEqual(np.float64, acceptance_probs_.dtype) + self._testChainWorksDtype(np.float64) def testChainWorksIn16Bit(self): - def log_prob(x): - return - math_ops.reduce_sum(x * x, axis=-1) - states, acceptance_probs = hmc.chain( - n_iterations=10, - step_size=np.float16(0.01), - n_leapfrog_steps=10, - initial_x=np.zeros(5).astype(np.float16), - target_log_prob_fn=log_prob, - event_dims=[-1]) + self._testChainWorksDtype(np.float16) + + +class _EnergyComputationTest(object): + + def testHandlesNanFromPotential(self): with self.test_session() as sess: - states_, acceptance_probs_ = sess.run([states, acceptance_probs]) - self.assertEqual(np.float16, states_.dtype) - self.assertEqual(np.float16, acceptance_probs_.dtype) + x = [1, np.inf, -np.inf, np.nan] + target_log_prob, proposed_target_log_prob = [ + self.dtype(x.flatten()) for x in np.meshgrid(x, x)] + num_chains = len(target_log_prob) + dummy_momentums = [-1, 1] + momentums = [self.dtype([dummy_momentums] * num_chains)] + proposed_momentums = [self.dtype([dummy_momentums] * num_chains)] + + target_log_prob = ops.convert_to_tensor(target_log_prob) + momentums = [ops.convert_to_tensor(momentums[0])] + proposed_target_log_prob = ops.convert_to_tensor(proposed_target_log_prob) + proposed_momentums = [ops.convert_to_tensor(proposed_momentums[0])] + + energy = _compute_energy_change( + target_log_prob, + momentums, + proposed_target_log_prob, + proposed_momentums, + independent_chain_ndims=1) + grads = gradients_ops.gradients(energy, momentums) + + [actual_energy, grads_] = sess.run([energy, grads]) + + # Ensure energy is `inf` (note: that's positive inf) in weird cases and + # finite otherwise. + expected_energy = self.dtype([0] + [np.inf]*(num_chains - 1)) + self.assertAllEqual(expected_energy, actual_energy) + + # Ensure gradient is finite. 
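The _EnergyComputationTest cases below this change check that a non-finite potential or kinetic term yields a +inf energy change, which is what forces the proposal to be rejected. A tiny sketch of that effect (illustration only; the helper is hypothetical, not the library's API):

```python
import numpy as np

def acceptance_prob(energy_change):
    # Treat any non-finite energy change as +inf so that exp(-inf) = 0 and the
    # Metropolis acceptance probability for the proposal becomes zero.
    energy_change = np.where(np.isfinite(energy_change), energy_change, np.inf)
    return np.minimum(1., np.exp(-energy_change))

print(acceptance_prob(np.array([0.1, np.nan, np.inf, -np.inf])))
# -> [0.9048  0.  0.  0.]
```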
+ self.assertAllEqual(np.ones_like(grads_).astype(np.bool), + np.isfinite(grads_)) + + def testHandlesNanFromKinetic(self): + with self.test_session() as sess: + x = [1, np.inf, -np.inf, np.nan] + momentums, proposed_momentums = [ + [np.reshape(self.dtype(x), [-1, 1])] + for x in np.meshgrid(x, x)] + num_chains = len(momentums[0]) + target_log_prob = np.ones(num_chains, self.dtype) + proposed_target_log_prob = np.ones(num_chains, self.dtype) + + target_log_prob = ops.convert_to_tensor(target_log_prob) + momentums = [ops.convert_to_tensor(momentums[0])] + proposed_target_log_prob = ops.convert_to_tensor(proposed_target_log_prob) + proposed_momentums = [ops.convert_to_tensor(proposed_momentums[0])] + + energy = _compute_energy_change( + target_log_prob, + momentums, + proposed_target_log_prob, + proposed_momentums, + independent_chain_ndims=1) + grads = gradients_ops.gradients(energy, momentums) + + [actual_energy, grads_] = sess.run([energy, grads]) + + # Ensure energy is `inf` (note: that's positive inf) in weird cases and + # finite otherwise. + expected_energy = self.dtype([0] + [np.inf]*(num_chains - 1)) + self.assertAllEqual(expected_energy, actual_energy) + + # Ensure gradient is finite. + g = grads_[0].reshape([len(x), len(x)])[:, 0] + self.assertAllEqual(np.ones_like(g).astype(np.bool), np.isfinite(g)) + + # The remaining gradients are nan because the momentum was itself nan or + # inf. + g = grads_[0].reshape([len(x), len(x)])[:, 1:] + self.assertAllEqual(np.ones_like(g).astype(np.bool), np.isnan(g)) -if __name__ == '__main__': +class EnergyComputationTest16(test.TestCase, _EnergyComputationTest): + dtype = np.float16 + + +class EnergyComputationTest32(test.TestCase, _EnergyComputationTest): + dtype = np.float32 + + +class EnergyComputationTest64(test.TestCase, _EnergyComputationTest): + dtype = np.float64 + + +class _HMCHandlesLists(object): + + def testStateParts(self): + with self.test_session() as sess: + dist_x = normal_lib.Normal(loc=self.dtype(0), scale=self.dtype(1)) + dist_y = independent_lib.Independent( + gamma_lib.Gamma(concentration=self.dtype([1, 2]), + rate=self.dtype([0.5, 0.75])), + reinterpreted_batch_ndims=1) + def target_log_prob(x, y): + return dist_x.log_prob(x) + dist_y.log_prob(y) + x0 = [dist_x.sample(seed=1), dist_y.sample(seed=2)] + samples, _ = hmc.sample_chain( + num_results=int(2e3), + target_log_prob_fn=target_log_prob, + current_state=x0, + step_size=0.85, + num_leapfrog_steps=3, + num_burnin_steps=int(250), + seed=49) + actual_means = [math_ops.reduce_mean(s, axis=0) for s in samples] + actual_vars = [_reduce_variance(s, axis=0) for s in samples] + expected_means = [dist_x.mean(), dist_y.mean()] + expected_vars = [dist_x.variance(), dist_y.variance()] + [ + actual_means_, + actual_vars_, + expected_means_, + expected_vars_, + ] = sess.run([ + actual_means, + actual_vars, + expected_means, + expected_vars, + ]) + self.assertAllClose(expected_means_, actual_means_, atol=0.05, rtol=0.16) + self.assertAllClose(expected_vars_, actual_vars_, atol=0., rtol=0.40) + + +class HMCHandlesLists16(_HMCHandlesLists, test.TestCase): + dtype = np.float16 + + +class HMCHandlesLists32(_HMCHandlesLists, test.TestCase): + dtype = np.float32 + + +class HMCHandlesLists64(_HMCHandlesLists, test.TestCase): + dtype = np.float64 + + +if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py b/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py index fdc12e3b214..d44fe6529a7 100644 --- 
a/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py +++ b/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py @@ -31,8 +31,7 @@ __all__ = [ ] -def custom_gradient(fx, gx, x, axis=(), - fx_gx_manually_stopped=False, +def custom_gradient(fx, gx, x, axis=(), fx_gx_manually_stopped=False, name=None): """Enables specifying a custom gradient. @@ -43,7 +42,8 @@ def custom_gradient(fx, gx, x, axis=(), h(x) = x * stop_gradient(g(x)) + stop_gradient(f(x) - x * g(x)) ``` - is such that `h(x) = stop_gradient(f(x))` and `grad[h(x), x] = stop_gradient(g(x)).` + is such that `h(x) = stop_gradient(f(x))` and `grad[h(x), x] = + stop_gradient(g(x)).` In addition to scalar-domain/scalar-range functions, this function also supports tensor-domain/scalar-range functions. However, in the latter case it diff --git a/tensorflow/contrib/bayesflow/python/ops/hmc.py b/tensorflow/contrib/bayesflow/python/ops/hmc.py index 977d42fc16b..7fd5652c5c3 100644 --- a/tensorflow/contrib/bayesflow/python/ops/hmc.py +++ b/tensorflow/contrib/bayesflow/python/ops/hmc.py @@ -12,8 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Hamiltonian Monte Carlo, a gradient-based MCMC algorithm. -""" +"""Hamiltonian Monte Carlo, a gradient-based MCMC algorithm.""" from __future__ import absolute_import from __future__ import division @@ -24,11 +23,9 @@ from tensorflow.contrib.bayesflow.python.ops.hmc_impl import * # pylint: disabl from tensorflow.python.util import all_util _allowed_symbols = [ - 'chain', - 'kernel', - 'leapfrog_integrator', - 'leapfrog_step', - 'ais_chain' + "sample_chain", + "sample_annealed_importance_chain", + "kernel", ] all_util.remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py b/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py index 5685a942e98..f7a11c21d8a 100644 --- a/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py +++ b/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py @@ -14,17 +14,16 @@ # ============================================================================== """Hamiltonian Monte Carlo, a gradient-based MCMC algorithm. 
-@@chain -@@update -@@leapfrog_integrator -@@leapfrog_step -@@ais_chain +@@sample_chain +@@sample_annealed_importance_chain +@@kernel """ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import collections import numpy as np from tensorflow.python.framework import dtypes @@ -32,168 +31,292 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import functional_ops -from tensorflow.python.ops import gradients_impl +from tensorflow.python.ops import gradients_impl as gradients_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops -from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.ops.distributions import util as distributions_util __all__ = [ - 'chain', - 'kernel', - 'leapfrog_integrator', - 'leapfrog_step', - 'ais_chain' + "sample_chain", + "sample_annealed_importance_chain", + "kernel", ] -def _make_potential_and_grad(target_log_prob_fn): - def potential_and_grad(x): - log_prob_result = -target_log_prob_fn(x) - grad_result = gradients_impl.gradients(math_ops.reduce_sum(log_prob_result), - x)[0] - return log_prob_result, grad_result - return potential_and_grad +KernelResults = collections.namedtuple( + "KernelResults", + [ + "acceptance_probs", + "current_grads_target_log_prob", # "Current result" means "accepted". + "current_target_log_prob", # "Current result" means "accepted". + "energy_change", + "is_accepted", + "proposed_grads_target_log_prob", + "proposed_state", + "proposed_target_log_prob", + "random_positive", + ]) -def chain(n_iterations, step_size, n_leapfrog_steps, initial_x, - target_log_prob_fn, event_dims=(), name=None): +def _make_dummy_kernel_results( + dummy_state, + dummy_target_log_prob, + dummy_grads_target_log_prob): + return KernelResults( + acceptance_probs=dummy_target_log_prob, + current_grads_target_log_prob=dummy_grads_target_log_prob, + current_target_log_prob=dummy_target_log_prob, + energy_change=dummy_target_log_prob, + is_accepted=array_ops.ones_like(dummy_target_log_prob, dtypes.bool), + proposed_grads_target_log_prob=dummy_grads_target_log_prob, + proposed_state=dummy_state, + proposed_target_log_prob=dummy_target_log_prob, + random_positive=dummy_target_log_prob, + ) + + +def sample_chain( + num_results, + target_log_prob_fn, + current_state, + step_size, + num_leapfrog_steps, + num_burnin_steps=0, + num_steps_between_results=0, + seed=None, + current_target_log_prob=None, + current_grads_target_log_prob=None, + name=None): """Runs multiple iterations of one or more Hamiltonian Monte Carlo chains. - Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC) - algorithm that takes a series of gradient-informed steps to produce - a Metropolis proposal. This function samples from an HMC Markov - chain whose initial state is `initial_x` and whose stationary - distribution has log-density `target_log_prob_fn()`. - - This function can update multiple chains in parallel. It assumes - that all dimensions of `initial_x` not specified in `event_dims` are - independent, and should therefore be updated independently. The - output of `target_log_prob_fn()` should sum log-probabilities across - all event dimensions. 
Slices along dimensions not in `event_dims` - may have different target distributions; this is up to + Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC) algorithm + that takes a series of gradient-informed steps to produce a Metropolis + proposal. This function samples from an HMC Markov chain at `current_state` + and whose stationary distribution has log-unnormalized-density `target_log_prob_fn()`. - This function basically just wraps `hmc.kernel()` in a tf.scan() loop. + This function samples from multiple chains in parallel. It assumes that the + the leftmost dimensions of (each) `current_state` (part) index an independent + chain. The function `target_log_prob_fn()` sums log-probabilities across + event dimensions (i.e., current state (part) rightmost dimensions). Each + element of the output of `target_log_prob_fn()` represents the (possibly + unnormalized) log-probability of the joint distribution over (all) the current + state (parts). - Args: - n_iterations: Integer number of Markov chain updates to run. - step_size: Scalar step size or array of step sizes for the - leapfrog integrator. Broadcasts to the shape of - `initial_x`. Larger step sizes lead to faster progress, but - too-large step sizes make rejection exponentially more likely. - When possible, it's often helpful to match per-variable step - sizes to the standard deviations of the target distribution in - each variable. - n_leapfrog_steps: Integer number of steps to run the leapfrog - integrator for. Total progress per HMC step is roughly - proportional to step_size * n_leapfrog_steps. - initial_x: Tensor of initial state(s) of the Markov chain(s). - target_log_prob_fn: Python callable which takes an argument like `initial_x` - and returns its (possibly unnormalized) log-density under the target - distribution. - event_dims: List of dimensions that should not be treated as - independent. This allows for multiple chains to be run independently - in parallel. Default is (), i.e., all dimensions are independent. - name: Python `str` name prefixed to Ops created by this function. + The `current_state` can be represented as a single `Tensor` or a `list` of + `Tensors` which collectively represent the current state. When specifying a + `list`, one must also specify a list of `step_size`s. - Returns: - acceptance_probs: Tensor with the acceptance probabilities for each - iteration. Has shape matching `target_log_prob_fn(initial_x)`. - chain_states: Tensor with the state of the Markov chain at each iteration. - Has shape `[n_iterations, initial_x.shape[0],...,initial_x.shape[-1]`. + Only one out of every `num_steps_between_samples + 1` steps is included in the + returned results. This "thinning" comes at a cost of reduced statistical + power, while reducing memory requirements and autocorrelation. For more + discussion see [1]. + + [1]: "Statistically efficient thinning of a Markov chain sampler." + Art B. Owen. April 2017. + http://statweb.stanford.edu/~owen/reports/bestthinning.pdf #### Examples: - ```python - # Sampling from a standard normal (note `log_joint()` is unnormalized): - def log_joint(x): - return tf.reduce_sum(-0.5 * tf.square(x)) - chain, acceptance_probs = hmc.chain(1000, 0.5, 2, tf.zeros(10), log_joint, - event_dims=[0]) - # Discard first half of chain as warmup/burn-in - warmed_up = chain[500:] - mean_est = tf.reduce_mean(warmed_up, 0) - var_est = tf.reduce_mean(tf.square(warmed_up), 0) - tf.square(mean_est) - ``` + ##### Sample from a diagonal-variance Gaussian. 
```python - # Sampling from a diagonal-variance Gaussian: - variances = tf.linspace(1., 3., 10) - def log_joint(x): - return tf.reduce_sum(-0.5 / variances * tf.square(x)) - chain, acceptance_probs = hmc.chain(1000, 0.5, 2, tf.zeros(10), log_joint, - event_dims=[0]) - # Discard first half of chain as warmup/burn-in - warmed_up = chain[500:] - mean_est = tf.reduce_mean(warmed_up, 0) - var_est = tf.reduce_mean(tf.square(warmed_up), 0) - tf.square(mean_est) + tfd = tf.contrib.distributions + + def make_likelihood(true_variances): + return tfd.MultivariateNormalDiag( + scale_diag=tf.sqrt(true_variances)) + + dims = 10 + dtype = np.float32 + true_variances = tf.linspace(dtype(1), dtype(3), dims) + likelihood = make_likelihood(true_variances) + + states, kernel_results = hmc.sample_chain( + num_results=1000, + target_log_prob_fn=likelihood.log_prob, + current_state=tf.zeros(dims), + step_size=0.5, + num_leapfrog_steps=2, + num_burnin_steps=500) + + # Compute sample stats. + sample_mean = tf.reduce_mean(states, axis=0) + sample_var = tf.reduce_mean( + tf.squared_difference(states, sample_mean), + axis=0) ``` + ##### Sampling from factor-analysis posteriors with known factors. + + I.e., + + ```none + for i=1..n: + w[i] ~ Normal(0, eye(d)) # prior + x[i] ~ Normal(loc=matmul(w[i], F)) # likelihood + ``` + + where `F` denotes factors. + ```python - # Sampling from factor-analysis posteriors with known factors W: - # mu[i, j] ~ Normal(0, 1) - # x[i] ~ Normal(matmul(mu[i], W), I) - def log_joint(mu, x, W): - prior = -0.5 * tf.reduce_sum(tf.square(mu), 1) - x_mean = tf.matmul(mu, W) - likelihood = -0.5 * tf.reduce_sum(tf.square(x - x_mean), 1) - return prior + likelihood - chain, acceptance_probs = hmc.chain(1000, 0.1, 2, - tf.zeros([x.shape[0], W.shape[0]]), - lambda mu: log_joint(mu, x, W), - event_dims=[1]) - # Discard first half of chain as warmup/burn-in - warmed_up = chain[500:] - mean_est = tf.reduce_mean(warmed_up, 0) - var_est = tf.reduce_mean(tf.square(warmed_up), 0) - tf.square(mean_est) + tfd = tf.contrib.distributions + + def make_prior(dims, dtype): + return tfd.MultivariateNormalDiag( + loc=tf.zeros(dims, dtype)) + + def make_likelihood(weights, factors): + return tfd.MultivariateNormalDiag( + loc=tf.tensordot(weights, factors, axes=[[0], [-1]])) + + # Setup data. + num_weights = 10 + num_factors = 4 + num_chains = 100 + dtype = np.float32 + + prior = make_prior(num_weights, dtype) + weights = prior.sample(num_chains) + factors = np.random.randn(num_factors, num_weights).astype(dtype) + x = make_likelihood(weights, factors).sample(num_chains) + + def target_log_prob(w): + # Target joint is: `f(w) = p(w, x | factors)`. + return prior.log_prob(w) + make_likelihood(w, factors).log_prob(x) + + # Get `num_results` samples from `num_chains` independent chains. + chains_states, kernels_results = hmc.sample_chain( + num_results=1000, + target_log_prob_fn=target_log_prob, + current_state=tf.zeros([num_chains, dims], dtype), + step_size=0.1, + num_leapfrog_steps=2, + num_burnin_steps=500) + + # Compute sample stats. + sample_mean = tf.reduce_mean(chains_states, axis=[0, 1]) + sample_var = tf.reduce_mean( + tf.squared_difference(chains_states, sample_mean), + axis=[0, 1]) ``` - ```python - # Sampling from the posterior of a Bayesian regression model.: + Args: + num_results: Integer number of Markov chain draws. 
+ target_log_prob_fn: Python callable which takes an argument like + `current_state` (or `*current_state` if it's a list) and returns its + (possibly unnormalized) log-density under the target distribution. + current_state: `Tensor` or Python `list` of `Tensor`s representing the + current state(s) of the Markov chain(s). The first `r` dimensions index + independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`. + step_size: `Tensor` or Python `list` of `Tensor`s representing the step size + for the leapfrog integrator. Must broadcast with the shape of + `current_state`. Larger step sizes lead to faster progress, but too-large + step sizes make rejection exponentially more likely. When possible, it's + often helpful to match per-variable step sizes to the standard deviations + of the target distribution in each variable. + num_leapfrog_steps: Integer number of steps to run the leapfrog integrator + for. Total progress per HMC step is roughly proportional to `step_size * + num_leapfrog_steps`. + num_burnin_steps: Integer number of chain steps to take before starting to + collect results. + Default value: 0 (i.e., no burn-in). + num_steps_between_results: Integer number of chain steps between collecting + a result. Only one out of every `num_steps_between_samples + 1` steps is + included in the returned results. This "thinning" comes at a cost of + reduced statistical power, while reducing memory requirements and + autocorrelation. For more discussion see [1]. + Default value: 0 (i.e., no subsampling). + seed: Python integer to seed the random number generator. + current_target_log_prob: (Optional) `Tensor` representing the value of + `target_log_prob_fn` at the `current_state`. The only reason to specify + this argument is to reduce TF graph size. + Default value: `None` (i.e., compute as needed). + current_grads_target_log_prob: (Optional) Python list of `Tensor`s + representing gradient of `target_log_prob` at the `current_state` and wrt + the `current_state`. Must have same shape as `current_state`. The only + reason to specify this argument is to reduce TF graph size. + Default value: `None` (i.e., compute as needed). + name: Python `str` name prefixed to Ops created by this function. + Default value: `None` (i.e., "hmc_sample_chain"). - # Run 100 chains in parallel, each with a different initialization. - initial_beta = tf.random_normal([100, x.shape[1]]) - chain, acceptance_probs = hmc.chain(1000, 0.1, 10, initial_beta, - log_joint_partial, event_dims=[1]) - # Discard first halves of chains as warmup/burn-in - warmed_up = chain[500:] - # Averaging across samples within a chain and across chains - mean_est = tf.reduce_mean(warmed_up, [0, 1]) - var_est = tf.reduce_mean(tf.square(warmed_up), [0, 1]) - tf.square(mean_est) - ``` + Returns: + accepted_states: Tensor or Python list of `Tensor`s representing the + state(s) of the Markov chain(s) at each result step. Has same shape as + input `current_state` but with a prepended `num_results`-size dimension. + kernel_results: `collections.namedtuple` of internal calculations used to + advance the chain. 
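The docstring above describes thinning via `num_steps_between_results`, but neither example exercises it. The sketch below is an editorial addition, not part of this change; the standard-normal target, step size, and seed are assumptions chosen purely for illustration.

```python
# Minimal sketch of burn-in plus thinning with the new `sample_chain`
# signature. The target, step size, and seed here are illustrative
# assumptions, not values taken from this diff.
import tensorflow as tf
from tensorflow.contrib.bayesflow.python.ops import hmc  # module touched by this diff

def target_log_prob(x):
  # Unnormalized standard-normal log-density, summed over the event dimension.
  return -0.5 * tf.reduce_sum(x**2., axis=-1)

states, kernel_results = hmc.sample_chain(
    num_results=500,
    target_log_prob_fn=target_log_prob,
    current_state=tf.zeros(4),
    step_size=0.5,
    num_leapfrog_steps=3,
    num_burnin_steps=200,           # discard warm-up iterations
    num_steps_between_results=4,    # keep 1 of every 5 post-burn-in steps
    seed=42)

with tf.Session() as sess:
  states_, acceptance_probs_ = sess.run(
      [states, kernel_results.acceptance_probs])
  print(states_.shape)              # (500, 4): `num_results` is prepended
  print(acceptance_probs_.mean())   # crude acceptance-rate diagnostic
```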
""" - with ops.name_scope(name, 'hmc_chain', [n_iterations, step_size, - n_leapfrog_steps, initial_x]): - initial_x = ops.convert_to_tensor(initial_x, name='initial_x') - non_event_shape = array_ops.shape(target_log_prob_fn(initial_x)) + with ops.name_scope( + name, "hmc_sample_chain", + [num_results, current_state, step_size, num_leapfrog_steps, + num_burnin_steps, num_steps_between_results, seed, + current_target_log_prob, current_grads_target_log_prob]): + with ops.name_scope("initialize"): + [ + current_state, + step_size, + current_target_log_prob, + current_grads_target_log_prob, + ] = _prepare_args( + target_log_prob_fn, current_state, step_size, + current_target_log_prob, current_grads_target_log_prob) + def _run_chain(num_steps, current_state, seed, kernel_results): + """Runs the chain(s) for `num_steps`.""" + def _loop_body(iter_, current_state, kernel_results): + return [iter_ + 1] + list(kernel( + target_log_prob_fn, + current_state, + step_size, + num_leapfrog_steps, + seed, + kernel_results.current_target_log_prob, + kernel_results.current_grads_target_log_prob)) + return control_flow_ops.while_loop( + cond=lambda iter_, *args: iter_ < num_steps, + body=_loop_body, + loop_vars=[0, current_state, kernel_results])[1:] # Lop-off "iter_". - def body(a, _): - updated_x, acceptance_probs, log_prob, grad = kernel( - step_size, n_leapfrog_steps, a[0], target_log_prob_fn, event_dims, - a[2], a[3]) - return updated_x, acceptance_probs, log_prob, grad + def _scan_body(args_list, _): + """Closure which implements `tf.scan` body.""" + current_state, kernel_results = args_list + return _run_chain(num_steps_between_results + 1, current_state, seed, + kernel_results) + + current_state, kernel_results = _run_chain( + num_burnin_steps, + current_state, + distributions_util.gen_new_seed( + seed, salt="hmc_sample_chain_burnin"), + _make_dummy_kernel_results( + current_state, + current_target_log_prob, + current_grads_target_log_prob)) - potential_and_grad = _make_potential_and_grad(target_log_prob_fn) - potential, grad = potential_and_grad(initial_x) return functional_ops.scan( - body, array_ops.zeros(n_iterations, dtype=initial_x.dtype), - (initial_x, - array_ops.zeros(non_event_shape, dtype=initial_x.dtype), - -potential, -grad))[:2] + fn=_scan_body, + elems=array_ops.zeros(num_results, dtype=dtypes.bool), # Dummy arg. + initializer=[current_state, kernel_results]) -def ais_chain(n_iterations, step_size, n_leapfrog_steps, initial_x, - target_log_prob_fn, proposal_log_prob_fn, event_dims=(), - name=None): +def sample_annealed_importance_chain( + proposal_log_prob_fn, + num_steps, + target_log_prob_fn, + current_state, + step_size, + num_leapfrog_steps, + seed=None, + name=None): """Runs annealed importance sampling (AIS) to estimate normalizing constants. - This routine uses Hamiltonian Monte Carlo to sample from a series of + This function uses Hamiltonian Monte Carlo to sample from a series of distributions that slowly interpolates between an initial "proposal" - distribution + distribution: `exp(proposal_log_prob_fn(x) - proposal_log_normalizer)` - and the target distribution + and the target distribution: `exp(target_log_prob_fn(x) - target_log_normalizer)`, @@ -202,113 +325,183 @@ def ais_chain(n_iterations, step_size, n_leapfrog_steps, initial_x, normalizing constants of the initial distribution and the target distribution: - E[exp(w)] = exp(target_log_normalizer - proposal_log_normalizer). - - Args: - n_iterations: Integer number of Markov chain updates to run. 
More - iterations means more expense, but smoother annealing between q - and p, which in turn means exponentially lower variance for the - normalizing constant estimator. - step_size: Scalar step size or array of step sizes for the - leapfrog integrator. Broadcasts to the shape of - `initial_x`. Larger step sizes lead to faster progress, but - too-large step sizes make rejection exponentially more likely. - When possible, it's often helpful to match per-variable step - sizes to the standard deviations of the target distribution in - each variable. - n_leapfrog_steps: Integer number of steps to run the leapfrog - integrator for. Total progress per HMC step is roughly - proportional to step_size * n_leapfrog_steps. - initial_x: Tensor of initial state(s) of the Markov chain(s). Must - be a sample from q, or results will be incorrect. - target_log_prob_fn: Python callable which takes an argument like `initial_x` - and returns its (possibly unnormalized) log-density under the target - distribution. - proposal_log_prob_fn: Python callable that returns the log density of the - initial distribution. - event_dims: List of dimensions that should not be treated as - independent. This allows for multiple chains to be run independently - in parallel. Default is (), i.e., all dimensions are independent. - name: Python `str` name prefixed to Ops created by this function. - - Returns: - ais_weights: Tensor with the estimated weight(s). Has shape matching - `target_log_prob_fn(initial_x)`. - chain_states: Tensor with the state(s) of the Markov chain(s) the final - iteration. Has shape matching `initial_x`. - acceptance_probs: Tensor with the acceptance probabilities for the final - iteration. Has shape matching `target_log_prob_fn(initial_x)`. + `E[exp(ais_weights)] = exp(target_log_normalizer - proposal_log_normalizer)`. #### Examples: - ```python - # Estimating the normalizing constant of a log-gamma distribution: - def proposal_log_prob(x): - # Standard normal log-probability. This is properly normalized. - return tf.reduce_sum(-0.5 * tf.square(x) - 0.5 * np.log(2 * np.pi), 1) - def target_log_prob(x): - # Unnormalized log-gamma(2, 3) distribution. - # True normalizer is (lgamma(2) - 2 * log(3)) * x.shape[1] - return tf.reduce_sum(2. * x - 3. * tf.exp(x), 1) - # Run 100 AIS chains in parallel - initial_x = tf.random_normal([100, 20]) - w, _, _ = hmc.ais_chain(1000, 0.2, 2, initial_x, target_log_prob, - proposal_log_prob, event_dims=[1]) - log_normalizer_estimate = tf.reduce_logsumexp(w) - np.log(100) - ``` + ##### Estimate the normalizing constant of a log-gamma distribution. ```python - # Estimating the marginal likelihood of a Bayesian regression model: - base_measure = -0.5 * np.log(2 * np.pi) - def proposal_log_prob(x): - # Standard normal log-probability. This is properly normalized. - return tf.reduce_sum(-0.5 * tf.square(x) + base_measure, 1) - def regression_log_joint(beta, x, y): - # This function returns a vector whose ith element is log p(beta[i], y | x). - # Each row of beta corresponds to the state of an independent Markov chain. 
- log_prior = tf.reduce_sum(-0.5 * tf.square(beta) + base_measure, 1) - means = tf.matmul(beta, x, transpose_b=True) - log_likelihood = tf.reduce_sum(-0.5 * tf.square(y - means) + - base_measure, 1) - return log_prior + log_likelihood - def log_joint_partial(beta): - return regression_log_joint(beta, x, y) + tfd = tf.contrib.distributions + # Run 100 AIS chains in parallel - initial_beta = tf.random_normal([100, x.shape[1]]) - w, beta_samples, _ = hmc.ais_chain(1000, 0.1, 2, initial_beta, - log_joint_partial, proposal_log_prob, - event_dims=[1]) - log_normalizer_estimate = tf.reduce_logsumexp(w) - np.log(100) + num_chains = 100 + dims = 20 + dtype = np.float32 + + proposal = tfd.MultivatiateNormalDiag( + loc=tf.zeros([dims], dtype=dtype)) + + target = tfd.TransformedDistribution( + distribution=tfd.Gamma(concentration=dtype(2), + rate=dtype(3)), + bijector=tfd.bijectors.Invert(tfd.bijectors.Exp()), + event_shape=[dims]) + + chains_state, ais_weights, kernels_results = ( + hmc.sample_annealed_importance_chain( + proposal_log_prob_fn=proposal.log_prob, + num_steps=1000, + target_log_prob_fn=target.log_prob, + step_size=0.2, + current_state=proposal.sample(num_chains), + num_leapfrog_steps=2)) + + log_estimated_normalizer = (tf.reduce_logsumexp(ais_weights) + - np.log(num_chains)) + log_true_normalizer = tf.lgamma(2.) - 2. * tf.log(3.) ``` + + ##### Estimate marginal likelihood of a Bayesian regression model. + + ```python + tfd = tf.contrib.distributions + + def make_prior(dims, dtype): + return tfd.MultivariateNormalDiag( + loc=tf.zeros(dims, dtype)) + + def make_likelihood(weights, x): + return tfd.MultivariateNormalDiag( + loc=tf.tensordot(weights, x, axes=[[0], [-1]])) + + # Run 100 AIS chains in parallel + num_chains = 100 + dims = 10 + dtype = np.float32 + + # Make training data. + x = np.random.randn(num_chains, dims).astype(dtype) + true_weights = np.random.randn(dims).astype(dtype) + y = np.dot(x, true_weights) + np.random.randn(num_chains) + + # Setup model. + prior = make_prior(dims, dtype) + def target_log_prob_fn(weights): + return prior.log_prob(weights) + make_likelihood(weights, x).log_prob(y) + + proposal = tfd.MultivariateNormalDiag( + loc=tf.zeros(dims, dtype)) + + weight_samples, ais_weights, kernel_results = ( + hmc.sample_annealed_importance_chain( + num_steps=1000, + proposal_log_prob_fn=proposal.log_prob, + target_log_prob_fn=target_log_prob_fn + current_state=tf.zeros([num_chains, dims], dtype), + step_size=0.1, + num_leapfrog_steps=2)) + log_normalizer_estimate = (tf.reduce_logsumexp(ais_weights) + - np.log(num_chains)) + ``` + + Args: + proposal_log_prob_fn: Python callable that returns the log density of the + initial distribution. + num_steps: Integer number of Markov chain updates to run. More + iterations means more expense, but smoother annealing between q + and p, which in turn means exponentially lower variance for the + normalizing constant estimator. + target_log_prob_fn: Python callable which takes an argument like + `current_state` (or `*current_state` if it's a list) and returns its + (possibly unnormalized) log-density under the target distribution. + current_state: `Tensor` or Python `list` of `Tensor`s representing the + current state(s) of the Markov chain(s). The first `r` dimensions index + independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`. + step_size: `Tensor` or Python `list` of `Tensor`s representing the step size + for the leapfrog integrator. Must broadcast with the shape of + `current_state`. 
Larger step sizes lead to faster progress, but too-large + step sizes make rejection exponentially more likely. When possible, it's + often helpful to match per-variable step sizes to the standard deviations + of the target distribution in each variable. + num_leapfrog_steps: Integer number of steps to run the leapfrog integrator + for. Total progress per HMC step is roughly proportional to `step_size * + num_leapfrog_steps`. + seed: Python integer to seed the random number generator. + name: Python `str` name prefixed to Ops created by this function. + Default value: `None` (i.e., "hmc_sample_annealed_importance_chain"). + + Returns: + accepted_state: `Tensor` or Python list of `Tensor`s representing the + state(s) of the Markov chain(s) at the final iteration. Has same shape as + input `current_state`. + ais_weights: Tensor with the estimated weight(s). Has shape matching + `target_log_prob_fn(current_state)`. """ - with ops.name_scope(name, 'hmc_ais_chain', - [n_iterations, step_size, n_leapfrog_steps, initial_x]): - non_event_shape = array_ops.shape(target_log_prob_fn(initial_x)) + def make_convex_combined_log_prob_fn(iter_): + def _fn(*args): + p = proposal_log_prob_fn(*args) + t = target_log_prob_fn(*args) + dtype = p.dtype.base_dtype + beta = (math_ops.cast(iter_ + 1, dtype) + / math_ops.cast(num_steps, dtype)) + return (1. - beta) * p + beta * t + return _fn - beta_series = math_ops.linspace(0., 1., n_iterations+1)[1:] - def _body(a, beta): # pylint: disable=missing-docstring - def log_prob_beta(x): - return ((1 - beta) * proposal_log_prob_fn(x) + - beta * target_log_prob_fn(x)) - last_x = a[0] - w = a[2] - w += (1. / n_iterations) * (target_log_prob_fn(last_x) - - proposal_log_prob_fn(last_x)) - # TODO(b/66917083): There's an opportunity for gradient reuse here. 
- updated_x, acceptance_probs, _, _ = kernel(step_size, n_leapfrog_steps, - last_x, log_prob_beta, - event_dims) - return updated_x, acceptance_probs, w + with ops.name_scope( + name, "hmc_sample_annealed_importance_chain", + [num_steps, current_state, step_size, num_leapfrog_steps, seed]): + with ops.name_scope("initialize"): + [ + current_state, + step_size, + current_log_prob, + current_grads_log_prob, + ] = _prepare_args( + make_convex_combined_log_prob_fn(iter_=0), + current_state, + step_size, + description="convex_combined_log_prob") + def _loop_body(iter_, ais_weights, current_state, kernel_results): + """Closure which implements `tf.while_loop` body.""" + current_state_parts = (list(current_state) + if _is_list_like(current_state) + else [current_state]) + ais_weights += ((target_log_prob_fn(*current_state_parts) + - proposal_log_prob_fn(*current_state_parts)) + / math_ops.cast(num_steps, ais_weights.dtype)) + return [iter_ + 1, ais_weights] + list(kernel( + make_convex_combined_log_prob_fn(iter_), + current_state, + step_size, + num_leapfrog_steps, + seed, + kernel_results.current_target_log_prob, + kernel_results.current_grads_target_log_prob)) - x, acceptance_probs, w = functional_ops.scan( - _body, beta_series, - (initial_x, array_ops.zeros(non_event_shape, dtype=initial_x.dtype), - array_ops.zeros(non_event_shape, dtype=initial_x.dtype))) - return w[-1], x[-1], acceptance_probs[-1] + [ais_weights, current_state, kernel_results] = control_flow_ops.while_loop( + cond=lambda iter_, *args: iter_ < num_steps, + body=_loop_body, + loop_vars=[ + 0, # iter_ + array_ops.zeros_like(current_log_prob), # ais_weights + current_state, + _make_dummy_kernel_results(current_state, + current_log_prob, + current_grads_log_prob), + ])[1:] # Lop-off "iter_". + + return [current_state, ais_weights, kernel_results] -def kernel(step_size, n_leapfrog_steps, x, target_log_prob_fn, event_dims=(), - x_log_prob=None, x_grad=None, name=None): +def kernel(target_log_prob_fn, + current_state, + step_size, + num_leapfrog_steps, + seed=None, + current_target_log_prob=None, + current_grads_target_log_prob=None, + name=None): """Runs one iteration of Hamiltonian Monte Carlo. Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC) @@ -316,334 +509,625 @@ def kernel(step_size, n_leapfrog_steps, x, target_log_prob_fn, event_dims=(), a Metropolis proposal. This function applies one step of HMC to randomly update the variable `x`. - This function can update multiple chains in parallel. It assumes - that all dimensions of `x` not specified in `event_dims` are - independent, and should therefore be updated independently. The - output of `target_log_prob_fn()` should sum log-probabilities across - all event dimensions. Slices along dimensions not in `event_dims` - may have different target distributions; for example, if - `event_dims == (1,)`, then `x[0, :]` could have a different target - distribution from x[1, :]. This is up to `target_log_prob_fn()`. - - Args: - step_size: Scalar step size or array of step sizes for the - leapfrog integrator. Broadcasts to the shape of - `x`. Larger step sizes lead to faster progress, but - too-large step sizes make rejection exponentially more likely. - When possible, it's often helpful to match per-variable step - sizes to the standard deviations of the target distribution in - each variable. - n_leapfrog_steps: Integer number of steps to run the leapfrog - integrator for. Total progress per HMC step is roughly - proportional to step_size * n_leapfrog_steps. 
- x: Tensor containing the value(s) of the random variable(s) to update. - target_log_prob_fn: Python callable which takes an argument like `initial_x` - and returns its (possibly unnormalized) log-density under the target - distribution. - event_dims: List of dimensions that should not be treated as - independent. This allows for multiple chains to be run independently - in parallel. Default is (), i.e., all dimensions are independent. - x_log_prob (optional): Tensor containing the cached output of a previous - call to `target_log_prob_fn()` evaluated at `x` (such as that provided by - a previous call to `kernel()`). Providing `x_log_prob` and - `x_grad` saves one gradient computation per call to `kernel()`. - x_grad (optional): Tensor containing the cached gradient of - `target_log_prob_fn()` evaluated at `x` (such as that provided by - a previous call to `kernel()`). Providing `x_log_prob` and - `x_grad` saves one gradient computation per call to `kernel()`. - name: Python `str` name prefixed to Ops created by this function. - - Returns: - updated_x: The updated variable(s) x. Has shape matching `initial_x`. - acceptance_probs: Tensor with the acceptance probabilities for the final - iteration. This is useful for diagnosing step size problems etc. Has - shape matching `target_log_prob_fn(initial_x)`. - new_log_prob: The value of `target_log_prob_fn()` evaluated at `updated_x`. - new_grad: The value of the gradient of `target_log_prob_fn()` evaluated at - `updated_x`. + This function can update multiple chains in parallel. It assumes that all + leftmost dimensions of `current_state` index independent chain states (and are + therefore updated independently). The output of `target_log_prob_fn()` should + sum log-probabilities across all event dimensions. Slices along the rightmost + dimensions may have different target distributions; for example, + `current_state[0, :]` could have a different target distribution from + `current_state[1, :]`. This is up to `target_log_prob_fn()`. (The number of + independent chains is `tf.size(target_log_prob_fn(*current_state))`.) #### Examples: + ##### Simple chain with warm-up. 
+ ```python + tfd = tf.contrib.distributions + # Tuning acceptance rates: + dtype = np.float32 target_accept_rate = 0.631 - def target_log_prob(x): - # Standard normal - return tf.reduce_sum(-0.5 * tf.square(x)) - initial_x = tf.zeros([10]) - initial_log_prob = target_log_prob(initial_x) - initial_grad = tf.gradients(initial_log_prob, initial_x)[0] - # Algorithm state - x = tf.Variable(initial_x, name='x') - step_size = tf.Variable(1., name='step_size') - last_log_prob = tf.Variable(initial_log_prob, name='last_log_prob') - last_grad = tf.Variable(initial_grad, name='last_grad') - # Compute updates - new_x, acceptance_prob, log_prob, grad = hmc.kernel(step_size, 3, x, - target_log_prob, - event_dims=[0], - x_log_prob=last_log_prob) - x_update = tf.assign(x, new_x) - log_prob_update = tf.assign(last_log_prob, log_prob) - grad_update = tf.assign(last_grad, grad) - step_size_update = tf.assign(step_size, - tf.where(acceptance_prob > target_accept_rate, - step_size * 1.01, step_size / 1.01)) - adaptive_updates = [x_update, log_prob_update, grad_update, step_size_update] - sampling_updates = [x_update, log_prob_update, grad_update] + num_warmup_iter = 500 + num_chain_iter = 500 + + x = tf.get_variable(name="x", initializer=dtype(1)) + step_size = tf.get_variable(name="step_size", initializer=dtype(1)) + + target = tfd.Normal(loc=dtype(0), scale=dtype(1)) + + new_x, other_results = hmc.kernel( + target_log_prob_fn=target.log_prob, + current_state=x, + step_size=step_size, + num_leapfrog_steps=3)[:4] + + x_update = x.assign(new_x) + + step_size_update = step_size.assign_add( + step_size * tf.where( + other_results.acceptance_probs > target_accept_rate, + 0.01, -0.01)) + + warmup = tf.group([x_update, step_size_update]) + + tf.global_variables_initializer().run() + + sess.graph.finalize() # No more graph building. - sess = tf.Session() - sess.run(tf.global_variables_initializer()) # Warm up the sampler and adapt the step size - for i in xrange(500): - sess.run(adaptive_updates) + for _ in xrange(num_warmup_iter): + sess.run(warmup) + # Collect samples without adapting step size - samples = np.zeros([500, 10]) - for i in xrange(500): - x_val, _ = sess.run([new_x, sampling_updates]) - samples[i] = x_val + samples = np.zeros([num_chain_iter]) + for i in xrange(num_chain_iter): + _, x_, target_log_prob_, grad_ = sess.run([ + x_update, + x, + other_results.target_log_prob, + other_results.grads_target_log_prob]) + samples[i] = x_ + + print(samples.mean(), samples.std()) + ``` + + ##### Sample from more complicated posterior. 
+ + I.e., + + ```none + W ~ MVN(loc=0, scale=sigma * eye(dims)) + for i=1...num_samples: + X[i] ~ MVN(loc=0, scale=eye(dims)) + eps[i] ~ Normal(loc=0, scale=1) + Y[i] = X[i].T * W + eps[i] ``` ```python - # Empirical-Bayes estimation of a hyperparameter by MCMC-EM: + tfd = tf.contrib.distributions - # Problem setup - N = 150 - D = 10 - x = np.random.randn(N, D).astype(np.float32) - true_sigma = 0.5 - true_beta = true_sigma * np.random.randn(D).astype(np.float32) - y = x.dot(true_beta) + np.random.randn(N).astype(np.float32) + def make_training_data(num_samples, dims, sigma): + dt = np.asarray(sigma).dtype + zeros = tf.zeros(dims, dtype=dt) + x = tfd.MultivariateNormalDiag( + loc=zeros).sample(num_samples, seed=1) + w = tfd.MultivariateNormalDiag( + loc=zeros, + scale_identity_multiplier=sigma).sample(seed=2) + noise = tfd.Normal( + loc=dt(0), + scale=dt(1)).sample(num_samples, seed=3) + y = tf.tensordot(x, w, axes=[[1], [0]]) + noise + return y, x, w + + def make_prior(sigma, dims): + # p(w | sigma) + return tfd.MultivariateNormalDiag( + loc=tf.zeros([dims], dtype=sigma.dtype), + scale_identity_multiplier=sigma) + + def make_likelihood(x, w): + # p(y | x, w) + return tfd.MultivariateNormalDiag( + loc=tf.tensordot(x, w, axes=[[1], [0]])) + + # Setup assumptions. + dtype = np.float32 + num_samples = 150 + dims = 10 + num_iters = int(5e3) + + true_sigma = dtype(0.5) + y, x, true_weights = make_training_data(num_samples, dims, true_sigma) + + # Estimate of `log(true_sigma)`. + log_sigma = tf.get_variable(name="log_sigma", initializer=dtype(0)) + sigma = tf.exp(log_sigma) + + # State of the Markov chain. + weights = tf.get_variable( + name="weights", + initializer=np.random.randn(dims).astype(dtype)) + + prior = make_prior(sigma, dims) + + def joint_log_prob_fn(w): + # f(w) = log p(w, y | x) + return prior.log_prob(w) + make_likelihood(x, w).log_prob(y) + + weights_update = weights.assign( + hmc.kernel(target_log_prob_fn=joint_log_prob, + current_state=weights, + step_size=0.1, + num_leapfrog_steps=5)[0]) + + with tf.control_dependencies([weights_update]): + loss = -prior.log_prob(weights) - def log_prior(beta, log_sigma): - return tf.reduce_sum(-0.5 / tf.exp(2 * log_sigma) * tf.square(beta) - - log_sigma) - def regression_log_joint(beta, log_sigma, x, y): - # This function returns log p(beta | log_sigma) + log p(y | x, beta). 
- means = tf.matmul(tf.expand_dims(beta, 0), x, transpose_b=True) - means = tf.squeeze(means) - log_likelihood = tf.reduce_sum(-0.5 * tf.square(y - means)) - return log_prior(beta, log_sigma) + log_likelihood - def log_joint_partial(beta): - return regression_log_joint(beta, log_sigma, x, y) - # Our estimate of log(sigma) - log_sigma = tf.Variable(0., name='log_sigma') - # The state of the Markov chain - beta = tf.Variable(tf.random_normal([x.shape[1]]), name='beta') - new_beta, _, _, _ = hmc.kernel(0.1, 5, beta, log_joint_partial, - event_dims=[0]) - beta_update = tf.assign(beta, new_beta) optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01) - with tf.control_dependencies([beta_update]): - log_sigma_update = optimizer.minimize(-log_prior(beta, log_sigma), - var_list=[log_sigma]) + log_sigma_update = optimizer.minimize(loss, var_list=[log_sigma]) - sess = tf.Session() - sess.run(tf.global_variables_initializer()) - log_sigma_history = np.zeros(1000) - for i in xrange(1000): - log_sigma_val, _ = sess.run([log_sigma, log_sigma_update]) - log_sigma_history[i] = log_sigma_val - # Should converge to something close to true_sigma - plt.plot(np.exp(log_sigma_history)) + sess.graph.finalize() # No more graph building. + + tf.global_variables_initializer().run() + + sigma_history = np.zeros(num_iters, dtype) + weights_history = np.zeros([num_iters, dims], dtype) + + for i in xrange(num_iters): + _, sigma_, weights_, _ = sess.run([log_sigma_update, sigma, weights]) + weights_history[i, :] = weights_ + sigma_history[i] = sigma_ + + true_weights_ = sess.run(true_weights) + + # Should converge to something close to true_sigma. + plt.plot(sigma_history); + plt.ylabel("sigma"); + plt.xlabel("iteration"); ``` - """ - with ops.name_scope(name, 'hmc_kernel', [step_size, n_leapfrog_steps, x]): - potential_and_grad = _make_potential_and_grad(target_log_prob_fn) - x = ops.convert_to_tensor(x, name='x') - - x_shape = array_ops.shape(x) - m = random_ops.random_normal(x_shape, dtype=x.dtype) - - kinetic_0 = 0.5 * math_ops.reduce_sum(math_ops.square(m), event_dims) - - if (x_log_prob is not None) and (x_grad is not None): - log_potential_0, grad_0 = -x_log_prob, -x_grad # pylint: disable=invalid-unary-operand-type - else: - if x_log_prob is not None: - logging.warn('x_log_prob was provided, but x_grad was not,' - ' so x_log_prob was not used.') - if x_grad is not None: - logging.warn('x_grad was provided, but x_log_prob was not,' - ' so x_grad was not used.') - log_potential_0, grad_0 = potential_and_grad(x) - - new_x, new_m, log_potential_1, grad_1 = leapfrog_integrator( - step_size, n_leapfrog_steps, x, m, potential_and_grad, grad_0) - - kinetic_1 = 0.5 * math_ops.reduce_sum(math_ops.square(new_m), event_dims) - - energy_change = log_potential_1 - log_potential_0 + kinetic_1 - kinetic_0 - # Treat NaN as infinite energy (and therefore guaranteed rejection). - energy_change = array_ops.where( - math_ops.is_nan(energy_change), - array_ops.fill(array_ops.shape(energy_change), - energy_change.dtype.as_numpy_dtype(np.inf)), - energy_change) - acceptance_probs = math_ops.exp(math_ops.minimum(-energy_change, 0.)) - accepted = ( - random_ops.random_uniform( - array_ops.shape(acceptance_probs), dtype=x.dtype) - < acceptance_probs) - new_log_prob = -array_ops.where(accepted, log_potential_1, log_potential_0) - - # TODO(b/65738010): This should work, but it doesn't for now. 
- # reduced_shape = math_ops.reduced_shape(x_shape, event_dims) - reduced_shape = array_ops.shape(math_ops.reduce_sum(x, event_dims, - keep_dims=True)) - accepted = array_ops.reshape(accepted, reduced_shape) - accepted = math_ops.logical_or( - accepted, math_ops.cast(array_ops.zeros_like(x), dtypes.bool)) - new_x = array_ops.where(accepted, new_x, x) - new_grad = -array_ops.where(accepted, grad_1, grad_0) - - # TODO(langmore) Gradients of acceptance_probs and new_log_prob with respect - # to initial_x will propagate NaNs (see testNanFromGradsDontPropagate). This - # should be fixed. - return new_x, acceptance_probs, new_log_prob, new_grad - - -def leapfrog_integrator(step_size, n_steps, initial_position, initial_momentum, - potential_and_grad, initial_grad, name=None): - """Applies `n_steps` steps of the leapfrog integrator. - - This just wraps `leapfrog_step()` in a `tf.while_loop()`, reusing - gradient computations where possible. Args: - step_size: Scalar step size or array of step sizes for the - leapfrog integrator. Broadcasts to the shape of - `initial_position`. Larger step sizes lead to faster progress, but - too-large step sizes lead to larger discretization error and - worse energy conservation. - n_steps: Number of steps to run the leapfrog integrator. - initial_position: Tensor containing the value(s) of the position variable(s) - to update. - initial_momentum: Tensor containing the value(s) of the momentum variable(s) - to update. - potential_and_grad: Python callable that takes a position tensor like - `initial_position` and returns the potential energy and its gradient at - that position. - initial_grad: Tensor with the value of the gradient of the potential energy - at `initial_position`. + target_log_prob_fn: Python callable which takes an argument like + `current_state` (or `*current_state` if it's a list) and returns its + (possibly unnormalized) log-density under the target distribution. + current_state: `Tensor` or Python `list` of `Tensor`s representing the + current state(s) of the Markov chain(s). The first `r` dimensions index + independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`. + step_size: `Tensor` or Python `list` of `Tensor`s representing the step size + for the leapfrog integrator. Must broadcast with the shape of + `current_state`. Larger step sizes lead to faster progress, but too-large + step sizes make rejection exponentially more likely. When possible, it's + often helpful to match per-variable step sizes to the standard deviations + of the target distribution in each variable. + num_leapfrog_steps: Integer number of steps to run the leapfrog integrator + for. Total progress per HMC step is roughly proportional to `step_size * + num_leapfrog_steps`. + seed: Python integer to seed the random number generator. + current_target_log_prob: (Optional) `Tensor` representing the value of + `target_log_prob_fn` at the `current_state`. The only reason to + specify this argument is to reduce TF graph size. + Default value: `None` (i.e., compute as needed). + current_grads_target_log_prob: (Optional) Python list of `Tensor`s + representing gradient of `current_target_log_prob` at the `current_state` + and wrt the `current_state`. Must have same shape as `current_state`. The + only reason to specify this argument is to reduce TF graph size. + Default value: `None` (i.e., compute as needed). name: Python `str` name prefixed to Ops created by this function. + Default value: `None` (i.e., "hmc_kernel"). 
Returns: - updated_position: Updated value of the position. - updated_momentum: Updated value of the momentum. - new_potential: Potential energy of the new position. Has shape matching - `potential_and_grad(initial_position)`. - new_grad: Gradient from potential_and_grad() evaluated at the new position. - Has shape matching `initial_position`. + accepted_state: Tensor or Python list of `Tensor`s representing the state(s) + of the Markov chain(s) at each result step. Has same shape as + `current_state`. + acceptance_probs: Tensor with the acceptance probabilities for each + iteration. Has shape matching `target_log_prob_fn(current_state)`. + accepted_target_log_prob: `Tensor` representing the value of + `target_log_prob_fn` at `accepted_state`. + accepted_grads_target_log_prob: Python `list` of `Tensor`s representing the + gradient of `accepted_target_log_prob` wrt each `accepted_state`. - Example: Simple quadratic potential. + Raises: + ValueError: if there isn't one `step_size` or a list with same length as + `current_state`. + """ + with ops.name_scope( + name, "hmc_kernel", + [current_state, step_size, num_leapfrog_steps, seed, + current_target_log_prob, current_grads_target_log_prob]): + with ops.name_scope("initialize"): + [current_state_parts, step_sizes, current_target_log_prob, + current_grads_target_log_prob] = _prepare_args( + target_log_prob_fn, current_state, step_size, + current_target_log_prob, current_grads_target_log_prob, + maybe_expand=True) + independent_chain_ndims = distributions_util.prefer_static_rank( + current_target_log_prob) + def init_momentum(s): + return random_ops.random_normal( + shape=array_ops.shape(s), + dtype=s.dtype.base_dtype, + seed=distributions_util.gen_new_seed( + seed, salt="hmc_kernel_momentums")) + current_momentums = [init_momentum(s) for s in current_state_parts] + + [ + proposed_momentums, + proposed_state_parts, + proposed_target_log_prob, + proposed_grads_target_log_prob, + ] = _leapfrog_integrator(current_momentums, + target_log_prob_fn, + current_state_parts, + step_sizes, + num_leapfrog_steps, + current_target_log_prob, + current_grads_target_log_prob) + + energy_change = _compute_energy_change(current_target_log_prob, + current_momentums, + proposed_target_log_prob, + proposed_momentums, + independent_chain_ndims) + + # u < exp(min(-energy, 0)), where u~Uniform[0,1) + # ==> -log(u) >= max(e, 0) + # ==> -log(u) >= e + # (Perhaps surprisingly, we don't have a better way to obtain a random + # uniform from positive reals, i.e., `tf.random_uniform(minval=0, + # maxval=np.inf)` won't work.) 
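As a side note on the comment above, a standalone NumPy check (an editorial sketch, not part of the diff) makes the equivalence concrete: comparing the exponential variate `-log(u)` against the energy change gives the same accept/reject decision as the usual `u < exp(min(-energy_change, 0))` test, and a NaN energy change fails the comparison, i.e. is rejected.

```python
# Editorial sketch: `-log(u) >= energy_change`, with u ~ Uniform[0, 1), makes
# the same decision as `u < exp(min(-energy_change, 0))`; the two can differ
# only on the measure-zero boundary u == exp(-energy_change).
import numpy as np

rng = np.random.RandomState(0)
u = rng.uniform(size=10000)
energy_change = 3. * rng.randn(10000)

accept_exponential = -np.log(u) >= energy_change
accept_metropolis = u < np.exp(np.minimum(-energy_change, 0.))
assert (accept_exponential == accept_metropolis).all()

# A NaN energy change fails the comparison, so it is always rejected.
assert not (-np.log(0.5) >= np.nan)
```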
+ random_uniform = random_ops.random_uniform( + shape=array_ops.shape(energy_change), + dtype=energy_change.dtype, + seed=seed) + random_positive = -math_ops.log(random_uniform) + is_accepted = random_positive >= energy_change + + accepted_target_log_prob = array_ops.where(is_accepted, + proposed_target_log_prob, + current_target_log_prob) + + accepted_state_parts = [_choose(is_accepted, + proposed_state_part, + current_state_part, + independent_chain_ndims) + for current_state_part, proposed_state_part + in zip(current_state_parts, proposed_state_parts)] + + accepted_grads_target_log_prob = [ + _choose(is_accepted, + proposed_grad, + grad, + independent_chain_ndims) + for proposed_grad, grad + in zip(proposed_grads_target_log_prob, current_grads_target_log_prob)] + + maybe_flatten = lambda x: x if _is_list_like(current_state) else x[0] + return [ + maybe_flatten(accepted_state_parts), + KernelResults( + acceptance_probs=math_ops.exp(math_ops.minimum(-energy_change, 0.)), + current_grads_target_log_prob=accepted_grads_target_log_prob, + current_target_log_prob=accepted_target_log_prob, + energy_change=energy_change, + is_accepted=is_accepted, + proposed_grads_target_log_prob=proposed_grads_target_log_prob, + proposed_state=maybe_flatten(proposed_state_parts), + proposed_target_log_prob=proposed_target_log_prob, + random_positive=random_positive, + ), + ] + + +def _leapfrog_integrator(current_momentums, + target_log_prob_fn, + current_state_parts, + step_sizes, + num_leapfrog_steps, + current_target_log_prob=None, + current_grads_target_log_prob=None, + name=None): + """Applies `num_leapfrog_steps` of the leapfrog integrator. + + Assumes a simple quadratic kinetic energy function: `0.5 ||momentum||**2`. + + #### Examples: + + ##### Simple quadratic potential. ```python - def potential_and_grad(position): - return tf.reduce_sum(0.5 * tf.square(position)), position + tfd = tf.contrib.distributions + + dims = 10 + num_iter = int(1e3) + dtype = np.float32 + position = tf.placeholder(np.float32) momentum = tf.placeholder(np.float32) - potential, grad = potential_and_grad(position) - new_position, new_momentum, new_potential, new_grad = hmc.leapfrog_integrator( - 0.1, 3, position, momentum, potential_and_grad, grad) - sess = tf.Session() - position_val = np.random.randn(10) - momentum_val = np.random.randn(10) - potential_val, grad_val = sess.run([potential, grad], - {position: position_val}) - positions = np.zeros([100, 10]) - for i in xrange(100): - position_val, momentum_val, potential_val, grad_val = sess.run( - [new_position, new_momentum, new_potential, new_grad], - {position: position_val, momentum: momentum_val}) - positions[i] = position_val - # Should trace out sinusoidal dynamics. - plt.plot(positions[:, 0]) + [ + new_momentums, + new_positions, + ] = hmc._leapfrog_integrator( + current_momentums=[momentum], + target_log_prob_fn=tfd.MultivariateNormalDiag( + loc=tf.zeros(dims, dtype)).log_prob, + current_state_parts=[position], + step_sizes=0.1, + num_leapfrog_steps=3)[:2] + + sess.graph.finalize() # No more graph building. + + momentum_ = np.random.randn(dims).astype(dtype) + position_ = np.random.randn(dims).astype(dtype) + + positions = np.zeros([num_iter, dims], dtype) + for i in xrange(num_iter): + position_, momentum_ = sess.run( + [new_momentums[0], new_position[0]], + feed_dict={position: position_, momentum: momentum_}) + positions[i] = position_ + + plt.plot(positions[:, 0]); # Sinusoidal. 
``` - """ - def leapfrog_wrapper(step_size, x, m, grad, l): - x, m, _, grad = leapfrog_step(step_size, x, m, potential_and_grad, grad) - return step_size, x, m, grad, l + 1 - - def counter_fn(a, b, c, d, counter): # pylint: disable=unused-argument - return counter < n_steps - - with ops.name_scope(name, 'leapfrog_integrator', - [step_size, n_steps, initial_position, initial_momentum, - initial_grad]): - _, new_x, new_m, new_grad, _ = control_flow_ops.while_loop( - counter_fn, leapfrog_wrapper, [step_size, initial_position, - initial_momentum, initial_grad, - array_ops.constant(0)], back_prop=False) - # We're counting on the runtime to eliminate this redundant computation. - new_potential, new_grad = potential_and_grad(new_x) - return new_x, new_m, new_potential, new_grad - - -def leapfrog_step(step_size, position, momentum, potential_and_grad, grad, - name=None): - """Applies one step of the leapfrog integrator. - - Assumes a simple quadratic kinetic energy function: 0.5 * ||momentum||^2. Args: - step_size: Scalar step size or array of step sizes for the - leapfrog integrator. Broadcasts to the shape of - `position`. Larger step sizes lead to faster progress, but - too-large step sizes lead to larger discretization error and - worse energy conservation. - position: Tensor containing the value(s) of the position variable(s) - to update. - momentum: Tensor containing the value(s) of the momentum variable(s) - to update. - potential_and_grad: Python callable that takes a position tensor like - `position` and returns the potential energy and its gradient at that - position. - grad: Tensor with the value of the gradient of the potential energy - at `position`. + current_momentums: Tensor containing the value(s) of the momentum + variable(s) to update. + target_log_prob_fn: Python callable which takes an argument like + `*current_state_parts` and returns its (possibly unnormalized) log-density + under the target distribution. + current_state_parts: Python `list` of `Tensor`s representing the current + state(s) of the Markov chain(s). The first `independent_chain_ndims` of + the `Tensor`(s) index different chains. + step_sizes: Python `list` of `Tensor`s representing the step size for the + leapfrog integrator. Must broadcast with the shape of + `current_state_parts`. Larger step sizes lead to faster progress, but + too-large step sizes make rejection exponentially more likely. When + possible, it's often helpful to match per-variable step sizes to the + standard deviations of the target distribution in each variable. + num_leapfrog_steps: Integer number of steps to run the leapfrog integrator + for. Total progress per HMC step is roughly proportional to `step_size * + num_leapfrog_steps`. + current_target_log_prob: (Optional) `Tensor` representing the value of + `target_log_prob_fn(*current_state_parts)`. The only reason to specify + this argument is to reduce TF graph size. + Default value: `None` (i.e., compute as needed). + current_grads_target_log_prob: (Optional) Python list of `Tensor`s + representing gradient of `target_log_prob_fn(*current_state_parts`) wrt + `current_state_parts`. Must have same shape as `current_state_parts`. The + only reason to specify this argument is to reduce TF graph size. + Default value: `None` (i.e., compute as needed). name: Python `str` name prefixed to Ops created by this function. + Default value: `None` (i.e., "hmc_leapfrog_integrator"). Returns: - updated_position: Updated value of the position. - updated_momentum: Updated value of the momentum. 
- new_potential: Potential energy of the new position. Has shape matching - `potential_and_grad(position)`. - new_grad: Gradient from potential_and_grad() evaluated at the new position. - Has shape matching `position`. + proposed_momentums: Updated value of the momentum. + proposed_state_parts: Tensor or Python list of `Tensor`s representing the + state(s) of the Markov chain(s) at each result step. Has same shape as + input `current_state_parts`. + proposed_target_log_prob: `Tensor` representing the value of + `target_log_prob_fn` at `accepted_state`. + proposed_grads_target_log_prob: Gradient of `proposed_target_log_prob` wrt + `accepted_state`. - Example: Simple quadratic potential. - - ```python - def potential_and_grad(position): - # Simple quadratic potential - return tf.reduce_sum(0.5 * tf.square(position)), position - position = tf.placeholder(np.float32) - momentum = tf.placeholder(np.float32) - potential, grad = potential_and_grad(position) - new_position, new_momentum, new_potential, new_grad = hmc.leapfrog_step( - 0.1, position, momentum, potential_and_grad, grad) - - sess = tf.Session() - position_val = np.random.randn(10) - momentum_val = np.random.randn(10) - potential_val, grad_val = sess.run([potential, grad], - {position: position_val}) - positions = np.zeros([100, 10]) - for i in xrange(100): - position_val, momentum_val, potential_val, grad_val = sess.run( - [new_position, new_momentum, new_potential, new_grad], - {position: position_val, momentum: momentum_val}) - positions[i] = position_val - # Should trace out sinusoidal dynamics. - plt.plot(positions[:, 0]) - ``` + Raises: + ValueError: if `len(momentums) != len(state_parts)`. + ValueError: if `len(state_parts) != len(step_sizes)`. + ValueError: if `len(state_parts) != len(grads_target_log_prob)`. + TypeError: if `not target_log_prob.dtype.is_floating`. """ - with ops.name_scope(name, 'leapfrog_step', [step_size, position, momentum, - grad]): - momentum -= 0.5 * step_size * grad - position += step_size * momentum - potential, grad = potential_and_grad(position) - momentum -= 0.5 * step_size * grad + def _loop_body(step, + current_momentums, + current_state_parts, + ignore_current_target_log_prob, # pylint: disable=unused-argument + current_grads_target_log_prob): + return [step + 1] + list(_leapfrog_step(current_momentums, + target_log_prob_fn, + current_state_parts, + step_sizes, + current_grads_target_log_prob)) - return position, momentum, potential, grad + with ops.name_scope( + name, "hmc_leapfrog_integrator", + [current_momentums, current_state_parts, step_sizes, num_leapfrog_steps, + current_target_log_prob, current_grads_target_log_prob]): + if len(current_momentums) != len(current_state_parts): + raise ValueError("`momentums` must be in one-to-one correspondence " + "with `state_parts`") + num_leapfrog_steps = ops.convert_to_tensor(num_leapfrog_steps, + name="num_leapfrog_steps") + current_target_log_prob, current_grads_target_log_prob = ( + _maybe_call_fn_and_grads( + target_log_prob_fn, + current_state_parts, + current_target_log_prob, + current_grads_target_log_prob)) + return control_flow_ops.while_loop( + cond=lambda iter_, *args: iter_ < num_leapfrog_steps, + body=_loop_body, + loop_vars=[ + 0, # iter_ + current_momentums, + current_state_parts, + current_target_log_prob, + current_grads_target_log_prob, + ], + back_prop=False)[1:] # Lop-off "iter_". 
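`_leapfrog_integrator` above iterates `_leapfrog_step` (defined next), which applies the standard half-step momentum, full-step position, half-step momentum update using the gradient of the target log-density and the quadratic kinetic energy `0.5 ||momentum||**2` assumed in the docstring. The following is a minimal NumPy sketch of that scheme, not the TensorFlow implementation in this patch; `leapfrog` and `grad_log_prob` are illustrative names only.

```python
import numpy as np

def leapfrog(position, momentum, step_size, num_leapfrog_steps, grad_log_prob):
  """Runs `num_leapfrog_steps` leapfrog updates, mirroring `_leapfrog_step`."""
  grad = grad_log_prob(position)
  for _ in range(num_leapfrog_steps):
    momentum = momentum + 0.5 * step_size * grad   # half-step momentum
    position = position + step_size * momentum     # full-step position
    grad = grad_log_prob(position)
    momentum = momentum + 0.5 * step_size * grad   # half-step momentum
  return position, momentum

# Standard-normal target, as in the docstring example:
# log p(x) = -0.5 * ||x||**2 + const, so grad log p(x) = -x.
position = np.random.randn(10).astype(np.float32)
momentum = np.random.randn(10).astype(np.float32)
position, momentum = leapfrog(position, momentum, step_size=0.1,
                              num_leapfrog_steps=3, grad_log_prob=lambda x: -x)
```

For this quadratic target the resulting dynamics are simple harmonic motion, which is why the docstring example expects the plotted position coordinate to trace out a sinusoid.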
+ + +def _leapfrog_step(current_momentums, + target_log_prob_fn, + current_state_parts, + step_sizes, + current_grads_target_log_prob, + name=None): + """Applies one step of the leapfrog integrator.""" + with ops.name_scope( + name, "_leapfrog_step", + [current_momentums, current_state_parts, step_sizes, + current_grads_target_log_prob]): + proposed_momentums = [m + 0.5 * ss * g for m, ss, g + in zip(current_momentums, + step_sizes, + current_grads_target_log_prob)] + proposed_state_parts = [x + ss * m for x, ss, m + in zip(current_state_parts, + step_sizes, + proposed_momentums)] + proposed_target_log_prob = target_log_prob_fn(*proposed_state_parts) + if not proposed_target_log_prob.dtype.is_floating: + raise TypeError("`target_log_prob_fn` must produce a `Tensor` " + "with `float` `dtype`.") + proposed_grads_target_log_prob = gradients_ops.gradients( + proposed_target_log_prob, proposed_state_parts) + if any(g is None for g in proposed_grads_target_log_prob): + raise ValueError( + "Encountered `None` gradient. Does your target `target_log_prob_fn` " + "access all `tf.Variable`s via `tf.get_variable`?\n" + " current_state_parts: {}\n" + " proposed_state_parts: {}\n" + " proposed_grads_target_log_prob: {}".format( + current_state_parts, + proposed_state_parts, + proposed_grads_target_log_prob)) + proposed_momentums = [m + 0.5 * ss * g for m, ss, g + in zip(proposed_momentums, + step_sizes, + proposed_grads_target_log_prob)] + return [ + proposed_momentums, + proposed_state_parts, + proposed_target_log_prob, + proposed_grads_target_log_prob, + ] + + +def _compute_energy_change(current_target_log_prob, + current_momentums, + proposed_target_log_prob, + proposed_momentums, + independent_chain_ndims, + name=None): + """Helper to `kernel` which computes the energy change.""" + with ops.name_scope( + name, "compute_energy_change", + ([current_target_log_prob, proposed_target_log_prob, + independent_chain_ndims] + + current_momentums + proposed_momentums)): + # Abbreviate lk0=log_kinetic_energy and lk1=proposed_log_kinetic_energy + # since they're a mouthful and lets us inline more. + lk0, lk1 = [], [] + for current_momentum, proposed_momentum in zip(current_momentums, + proposed_momentums): + axis = math_ops.range(independent_chain_ndims, + array_ops.rank(current_momentum)) + lk0.append(_log_sum_sq(current_momentum, axis)) + lk1.append(_log_sum_sq(proposed_momentum, axis)) + + lk0 = -np.log(2.) + math_ops.reduce_logsumexp(array_ops.stack(lk0, axis=-1), + axis=-1) + lk1 = -np.log(2.) + math_ops.reduce_logsumexp(array_ops.stack(lk1, axis=-1), + axis=-1) + lp0 = -current_target_log_prob # log_potential + lp1 = -proposed_target_log_prob # proposed_log_potential + x = array_ops.stack([lp1, math_ops.exp(lk1), -lp0, -math_ops.exp(lk0)], + axis=-1) + + # The sum is NaN if any element is NaN or we see both +Inf and -Inf. + # Thus we will replace such rows with infinite energy change which implies + # rejection. Recall that float-comparisons with NaN are always False. 
+ is_sum_determinate = ( + math_ops.reduce_all(math_ops.is_finite(x) | (x >= 0.), axis=-1) & + math_ops.reduce_all(math_ops.is_finite(x) | (x <= 0.), axis=-1)) + is_sum_determinate = array_ops.tile( + is_sum_determinate[..., array_ops.newaxis], + multiples=array_ops.concat([ + array_ops.ones(array_ops.rank(is_sum_determinate), + dtype=dtypes.int32), + [4], + ], axis=0)) + x = array_ops.where(is_sum_determinate, + x, + array_ops.fill(array_ops.shape(x), + value=x.dtype.as_numpy_dtype(np.inf))) + + return math_ops.reduce_sum(x, axis=-1) + + +def _choose(is_accepted, + accepted, + rejected, + independent_chain_ndims, + name=None): + """Helper to `kernel` which expand_dims `is_accepted` to apply tf.where.""" + def _expand_is_accepted_like(x): + with ops.name_scope("_choose"): + expand_shape = array_ops.concat([ + array_ops.shape(is_accepted), + array_ops.ones([array_ops.rank(x) - array_ops.rank(is_accepted)], + dtype=dtypes.int32), + ], axis=0) + multiples = array_ops.concat([ + array_ops.ones([array_ops.rank(is_accepted)], dtype=dtypes.int32), + array_ops.shape(x)[independent_chain_ndims:], + ], axis=0) + m = array_ops.tile(array_ops.reshape(is_accepted, expand_shape), + multiples) + m.set_shape(x.shape) + return m + with ops.name_scope(name, "_choose", values=[ + is_accepted, accepted, rejected, independent_chain_ndims]): + return array_ops.where(_expand_is_accepted_like(accepted), + accepted, + rejected) + + +def _maybe_call_fn_and_grads(fn, + fn_arg_list, + fn_result=None, + grads_fn_result=None, + description="target_log_prob"): + """Helper which computes `fn_result` and `grads` if needed.""" + fn_arg_list = (list(fn_arg_list) if _is_list_like(fn_arg_list) + else [fn_arg_list]) + if fn_result is None: + fn_result = fn(*fn_arg_list) + if not fn_result.dtype.is_floating: + raise TypeError("`{}` must be a `Tensor` with `float` `dtype`.".format( + description)) + if grads_fn_result is None: + grads_fn_result = gradients_ops.gradients( + fn_result, fn_arg_list) + if len(fn_arg_list) != len(grads_fn_result): + raise ValueError("`{}` must be in one-to-one correspondence with " + "`grads_{}`".format(*[description]*2)) + if any(g is None for g in grads_fn_result): + raise ValueError("Encountered `None` gradient.") + return fn_result, grads_fn_result + + +def _prepare_args(target_log_prob_fn, state, step_size, + target_log_prob=None, grads_target_log_prob=None, + maybe_expand=False, description="target_log_prob"): + """Helper which processes input args to meet list-like assumptions.""" + state_parts = list(state) if _is_list_like(state) else [state] + state_parts = [ops.convert_to_tensor(s, name="state") + for s in state_parts] + target_log_prob, grads_target_log_prob = _maybe_call_fn_and_grads( + target_log_prob_fn, + state_parts, + target_log_prob, + grads_target_log_prob, + description) + step_sizes = list(step_size) if _is_list_like(step_size) else [step_size] + step_sizes = [ + ops.convert_to_tensor( + s, name="step_size", dtype=target_log_prob.dtype) + for s in step_sizes] + if len(step_sizes) == 1: + step_sizes *= len(state_parts) + if len(state_parts) != len(step_sizes): + raise ValueError("There should be exactly one `step_size` or it should " + "have same length as `current_state`.") + if maybe_expand: + maybe_flatten = lambda x: x + else: + maybe_flatten = lambda x: x if _is_list_like(state) else x[0] + return [ + maybe_flatten(state_parts), + maybe_flatten(step_sizes), + target_log_prob, + grads_target_log_prob, + ] + + +def _is_list_like(x): + """Helper which returns `True` if input is 
`list`-like.""" + return isinstance(x, (tuple, list)) + + +def _log_sum_sq(x, axis=None): + """Computes log(sum(x**2)).""" + return math_ops.reduce_logsumexp(2. * math_ops.log(math_ops.abs(x)), axis) diff --git a/tensorflow/contrib/boosted_trees/kernels/model_ops.cc b/tensorflow/contrib/boosted_trees/kernels/model_ops.cc index 4b5d5ba0de6..754b7bc3270 100644 --- a/tensorflow/contrib/boosted_trees/kernels/model_ops.cc +++ b/tensorflow/contrib/boosted_trees/kernels/model_ops.cc @@ -48,8 +48,9 @@ class CreateTreeEnsembleVariableOp : public OpKernel { if (!result->InitFromSerialized(tree_ensemble_config_t->scalar()(), stamp_token)) { result->Unref(); - OP_REQUIRES(context, false, errors::InvalidArgument( - "Unable to parse tree ensemble config.")); + OP_REQUIRES( + context, false, + errors::InvalidArgument("Unable to parse tree ensemble config.")); } // Only create one, if one does not exist already. Report status for all diff --git a/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc b/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc index f8086b0c2bb..b3fe38614e0 100644 --- a/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc +++ b/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc @@ -47,8 +47,8 @@ namespace boosted_trees { using boosted_trees::learner::LearnerConfig; using boosted_trees::learner::LearningRateConfig; using boosted_trees::learner::LearningRateDropoutDrivenConfig; -using boosted_trees::models::MultipleAdditiveTrees; using boosted_trees::models::DecisionTreeEnsembleResource; +using boosted_trees::models::MultipleAdditiveTrees; using boosted_trees::utils::DropoutUtils; using boosted_trees::utils::TensorUtils; diff --git a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc index 88f30064076..0f4c2298f56 100644 --- a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc +++ b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc @@ -36,13 +36,12 @@ namespace tensorflow { using ::boosted_trees::QuantileConfig; -using boosted_trees::utils::TensorUtils; using boosted_trees::QuantileStreamResource; +using boosted_trees::utils::TensorUtils; namespace { const char* const kExampleWeightsName = "example_weights"; const char* const kMaxElementsName = "max_elements"; -const char* const kHandleName = "handle"; const char* const kNextStampTokenName = "next_stamp_token"; const char* const kStampTokenName = "stamp_token"; const char* const kAreBucketsReadyName = "are_buckets_ready"; @@ -52,7 +51,6 @@ const char* const kNumSparseFeaturesName = "num_sparse_features"; const char* const kSparseBucketsName = "sparse_buckets"; const char* const kSparseValuesName = "sparse_values"; const char* const kSparseIndicesName = "sparse_indices"; -const char* const kSparseStreamsStateName = "sparse_streams_state"; const char* const kSparseSummariesName = "sparse_summaries"; const char* const kSparseConfigName = "sparse_config"; const char* const kSparseOutputTensorName = "sparse_quantiles"; @@ -60,7 +58,6 @@ const char* const kSparseOutputTensorName = "sparse_quantiles"; const char* const kDenseBucketsName = "dense_buckets"; const char* const kDenseConfigName = "dense_config"; const char* const kDenseOutputTensorName = "dense_quantiles"; -const char* const kDenseStreamsStateName = "dense_streams_state"; const char* const kDenseSummariesName = "dense_summaries"; const char* const kDenseValuesName = "dense_values"; const char* const kNumDenseFeaturesName = "num_dense_features"; @@ -387,7 +384,7 @@ class 
MakeQuantileSummariesOp : public OpKernel { protobuf::Arena arena; ::boosted_trees::QuantileSummaryState* summary_proto = protobuf::Arena::CreateMessage< - ::boosted_trees::QuantileSummaryState>(&arena); + ::boosted_trees::QuantileSummaryState>(&arena); const auto& summary = stream.GetFinalSummary(); CopySummaryToProto(summary, summary_proto); // Output to tensor. diff --git a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc index 18b4abd654e..44a8ffaf4b2 100644 --- a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc +++ b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc @@ -34,10 +34,10 @@ namespace tensorflow { +using boosted_trees::learner::LearnerConfig_MultiClassStrategy; using boosted_trees::learner::SplitInfo; using boosted_trees::learner::stochastic::GradientStats; using boosted_trees::learner::stochastic::NodeStats; -using boosted_trees::learner::LearnerConfig_MultiClassStrategy; namespace { const int32 DUMMY_FEATURE_DIMENSION = -1; @@ -47,9 +47,8 @@ class BaseBuildSplitOp : public OpKernel { public: explicit BaseBuildSplitOp(OpKernelConstruction* const context) : OpKernel(context) { - OP_REQUIRES_OK( - context, - context->GetAttr("feature_column_group_id", &feature_column_group_id_)); + OP_REQUIRES_OK(context, context->GetAttr("feature_column_group_id", + &feature_column_group_id_)); OP_REQUIRES_OK(context, context->GetAttr("l1_regularization", &l1_regularization_)); OP_REQUIRES_OK(context, diff --git a/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc b/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc index a9a229c8ae0..90a0655201f 100644 --- a/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc +++ b/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc @@ -134,10 +134,9 @@ void SerializeScalarAccumulatorToOutput( OpKernelContext* context) { int64 num_slots = accumulator_resource.values().size(); Tensor* partition_ids_t = nullptr; - OP_REQUIRES_OK( - context, - context->allocate_output("output_partition_ids", TensorShape({num_slots}), - &partition_ids_t)); + OP_REQUIRES_OK(context, context->allocate_output("output_partition_ids", + TensorShape({num_slots}), + &partition_ids_t)); auto partition_ids = partition_ids_t->vec(); // Feature ids tensor has ids of feature columns and their dimensions. 
@@ -149,15 +148,14 @@ void SerializeScalarAccumulatorToOutput( Tensor* gradients_t = nullptr; OP_REQUIRES_OK( - context, - context->allocate_output("output_gradients", TensorShape({num_slots}), - &gradients_t)); + context, context->allocate_output( + "output_gradients", TensorShape({num_slots}), &gradients_t)); auto gradients = gradients_t->vec(); Tensor* hessians_t = nullptr; - OP_REQUIRES_OK(context, - context->allocate_output( - "output_hessians", TensorShape({num_slots}), &hessians_t)); + OP_REQUIRES_OK( + context, context->allocate_output("output_hessians", + TensorShape({num_slots}), &hessians_t)); auto hessians = hessians_t->vec(); int i = 0; @@ -177,10 +175,9 @@ void SerializeTensorAccumulatorToOutput( OpKernelContext* context) { int64 num_slots = accumulator_resource.values().size(); Tensor* partition_ids_t = nullptr; - OP_REQUIRES_OK( - context, - context->allocate_output("output_partition_ids", TensorShape({num_slots}), - &partition_ids_t)); + OP_REQUIRES_OK(context, context->allocate_output("output_partition_ids", + TensorShape({num_slots}), + &partition_ids_t)); auto partition_ids = partition_ids_t->vec(); Tensor* feature_ids_t = nullptr; @@ -202,9 +199,8 @@ void SerializeTensorAccumulatorToOutput( int64 num_hessian_elements = hessian_shape.num_elements(); hessian_shape.InsertDim(0, num_slots); Tensor* hessians_t = nullptr; - OP_REQUIRES_OK( - context, - context->allocate_output("output_hessians", hessian_shape, &hessians_t)); + OP_REQUIRES_OK(context, context->allocate_output("output_hessians", + hessian_shape, &hessians_t)); auto hessians = hessians_t->flat_outer_dims(); int i = 0; diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc index f867e77d3ef..8bca132acfd 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc +++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc @@ -17,8 +17,8 @@ #include "tensorflow/core/framework/tensor_testutil.h" #include "tensorflow/core/platform/test.h" -using tensorflow::test::AsTensor; using std::vector; +using tensorflow::test::AsTensor; namespace tensorflow { namespace boosted_trees { diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h index 1c4181f1b13..8ad97fedc92 100644 --- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h +++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h @@ -15,9 +15,9 @@ #ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_ #define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_ +#include #include #include -#include #include "tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h" #include "tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h" diff --git a/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc b/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc index cbe26ba918d..705b65e9db9 100644 --- a/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc +++ b/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc @@ -22,9 +22,9 @@ namespace tensorflow { namespace boosted_trees { namespace testutil { +using boosted_trees::trees::DenseFloatBinarySplit; using tensorflow::boosted_trees::trees::DecisionTreeConfig; using 
tensorflow::boosted_trees::trees::TreeNode; -using boosted_trees::trees::DenseFloatBinarySplit; namespace { diff --git a/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc b/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc index 9de3e32b097..609519e8b11 100644 --- a/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc +++ b/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc @@ -25,8 +25,8 @@ namespace boosted_trees { namespace utils { namespace { -using test::AsTensor; using errors::InvalidArgument; +using test::AsTensor; class BatchFeaturesTest : public ::testing::Test {}; diff --git a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc index 38f0151255b..db34db998a7 100644 --- a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc +++ b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc @@ -23,10 +23,10 @@ #include "tensorflow/core/lib/random/simple_philox.h" #include "tensorflow/core/platform/logging.h" +using tensorflow::Status; using tensorflow::boosted_trees::learner::LearningRateDropoutDrivenConfig; using tensorflow::random::PhiloxRandom; using tensorflow::random::SimplePhilox; -using tensorflow::Status; namespace tensorflow { namespace boosted_trees { diff --git a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc index ce7632e5898..02f972c8e00 100644 --- a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc +++ b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc @@ -26,9 +26,9 @@ #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/env.h" +using std::unordered_set; using tensorflow::boosted_trees::learner::LearningRateDropoutDrivenConfig; using tensorflow::boosted_trees::trees::DecisionTreeEnsembleConfig; -using std::unordered_set; namespace tensorflow { namespace boosted_trees { diff --git a/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc b/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc index bb57dcf8ae7..ae99d53a2cf 100644 --- a/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc +++ b/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc @@ -19,8 +19,8 @@ namespace tensorflow { namespace boosted_trees { -using shape_inference::InferenceContext; using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; using shape_inference::ShapeHandle; REGISTER_RESOURCE_HANDLE_OP(QuantileStreamResource); diff --git a/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc index 0d27ddaf3a1..5d0ebbf73ce 100644 --- a/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc +++ b/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc @@ -18,9 +18,9 @@ namespace tensorflow { +using shape_inference::DimensionHandle; using shape_inference::InferenceContext; using shape_inference::ShapeHandle; -using shape_inference::DimensionHandle; REGISTER_OP("BuildDenseInequalitySplits") .Attr("feature_column_group_id: int") diff --git a/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc b/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc index 0354f7853cb..179505eef01 100644 --- a/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc +++ b/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc @@ -19,9 +19,9 @@ namespace tensorflow { namespace boosted_trees { +using 
shape_inference::DimensionHandle; using shape_inference::InferenceContext; using shape_inference::ShapeHandle; -using shape_inference::DimensionHandle; REGISTER_RESOURCE_HANDLE_OP(StatsAccumulatorScalarResource); diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py index eefa7ef0dcc..81f58de28cb 100644 --- a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py +++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py @@ -183,11 +183,10 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase): self.assertEqual(num_quantiles + 1, len(buckets)) self.assertAllEqual([2030, 2040, 2050, 2060], buckets) - def _testStreamingQuantileBucketsHelper(self, inputs): + def _testStreamingQuantileBucketsHelper( + self, inputs, num_quantiles=3, expected_buckets=None): """Helper to test quantile buckets on different inputs.""" - # Use 3 quantiles, 4 boundaries for simplicity. - num_quantiles = 3 # set generate_quantiles to True since the test will generate fewer # boundaries otherwise. with self.test_session() as sess: @@ -213,7 +212,10 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase): buckets, are_ready_flush = (sess.run( [buckets, are_ready_flush])) self.assertEqual(True, are_ready_flush) + # By default, use 3 quantiles, 4 boundaries for simplicity. self.assertEqual(num_quantiles + 1, len(buckets)) + if expected_buckets: + self.assertAllEqual(buckets, expected_buckets) def testStreamingQuantileBucketsRepeatedSingleValue(self): inputs = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] @@ -231,6 +233,28 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase): inputs = [5] self._testStreamingQuantileBucketsHelper(inputs) + def testStreamingQuantileBucketsEqualDistributionInSequence(self): + # Input pattern is of the form [1, 1, 1, 2, 2, 2, 3, 3, 3, ...] + ones = 100 * [1] + inputs = [] + for i in range(1, 101): + inputs += [i * k for k in ones] + # Expect 100 equally spaced buckets. + expected_buckets = range(1, 101) + self._testStreamingQuantileBucketsHelper( + inputs, num_quantiles=99, expected_buckets=expected_buckets) + + def testStreamingQuantileBucketsEqualDistributionInterleaved(self): + # Input pattern is of the form [1, 2, 3, 1, 2, 3, 1, 2, 3, ...] + sequence = range(1, 101) + inputs = [] + for _ in range(1, 101): + inputs += sequence + # Expect 100 equally spaced buckets. + expected_buckets = range(1, 101) + self._testStreamingQuantileBucketsHelper( + inputs, num_quantiles=99, expected_buckets=expected_buckets) + def testStreamingQuantileBuckets(self): """Sets up the quantile summary op test as follows. diff --git a/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py b/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py index b281a4c6d1c..7a5f329b7ab 100644 --- a/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py +++ b/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py @@ -81,32 +81,32 @@ def _scheduled_stamp_resource_op_runner(batch, stamp): if not batch: return arg_keys = set(batch[0].args.keys()) - grouped_args = collections.defaultdict(list) + grouped_args = collections.OrderedDict() resource_handles = [] # Check that the set of arguments is the same across all the scheduled ops. 
for op in batch: if set(op.args.keys()) != arg_keys: raise ValueError("Mismatching arguments: %s, %s.", op.args, arg_keys) for key in arg_keys: - grouped_args[key].append(op.args[key]) + grouped_args.setdefault(key, []).append(op.args[key]) resource_handles.append(op.resource_handle) # Move all the inputs to the op device in one RPC. - grouped_args = { - k: _move_tensors(v, resource_handles[0].device) - for k, v in grouped_args.items() - } + grouped_args = collections.OrderedDict( + (k, _move_tensors(v, resource_handles[0].device)) + for k, v in sorted(grouped_args.items())) with ops.device(resource_handles[0].device): return batch[0].op(resource_handles, stamp, **grouped_args) def run_handler_scheduled_ops(per_handler_ops, stamp, worker_device): """Given a dictionary of ops for each handler, runs them in batch.""" - batched_ops = collections.defaultdict(list) + batched_ops = collections.OrderedDict() # Group the ops by their batching_key. Ops that share the same batching key # can be executed together. - for handler in sorted(per_handler_ops.keys()): + for handler in per_handler_ops.keys(): for op in per_handler_ops[handler]: - batched_ops[(op.batching_key(), op.batch_runner_fn())].append(op) + key = (op.batching_key(), op.batch_runner_fn()) + batched_ops.setdefault(key, []).append(op) op_results = {} for batch in batched_ops.values(): # Run each of the batched ops using its runner. diff --git a/tensorflow/contrib/boosted_trees/python/training/__init__.py b/tensorflow/contrib/boosted_trees/python/training/__init__.py new file mode 100644 index 00000000000..b569ac5fdb6 --- /dev/null +++ b/tensorflow/contrib/boosted_trees/python/training/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""training module under boosted_trees.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/__init__.py b/tensorflow/contrib/boosted_trees/python/training/functions/__init__.py new file mode 100644 index 00000000000..c1750117cd7 --- /dev/null +++ b/tensorflow/contrib/boosted_trees/python/training/functions/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""functions module under boosted_trees.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py index b95956dae2a..f0b66dcbbe1 100644 --- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py +++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import collections import copy from tensorflow.contrib import learn @@ -163,7 +164,7 @@ def extract_features(features, feature_columns): scope = "gbdt" with variable_scope.variable_scope(scope): feature_columns = list(feature_columns) - transformed_features = {} + transformed_features = collections.OrderedDict() for fc in feature_columns: # pylint: disable=protected-access if isinstance(fc, feature_column_lib._EmbeddingColumn): @@ -681,13 +682,13 @@ class GradientBoostedDecisionTreeModel(object): control_flow_ops.no_op)) # Update handler stats. - handler_reads = {} + handler_reads = collections.OrderedDict() for handler in handlers: handler_reads[handler] = handler.scheduled_reads() handler_results = batch_ops_utils.run_handler_scheduled_ops( handler_reads, ensemble_stamp, worker_device) - per_handler_updates = {} + per_handler_updates = collections.OrderedDict() # Two values per handler. First one is if the handler is active for the # current layer. The second one is if the handler is going to be active # for the next layer. diff --git a/tensorflow/contrib/boosted_trees/python/utils/__init__.py b/tensorflow/contrib/boosted_trees/python/utils/__init__.py new file mode 100644 index 00000000000..6ceb150c265 --- /dev/null +++ b/tensorflow/contrib/boosted_trees/python/utils/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""utils module under boosted_trees.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h index 59f23332983..fea6b15640d 100644 --- a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h +++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h @@ -399,6 +399,6 @@ const string kTestEmptyRow = R"({ }]}]})"; } // namespace -} // namepsace tensorflow +} // namespace tensorflow #endif // TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_ diff --git a/tensorflow/contrib/cluster_resolver/BUILD b/tensorflow/contrib/cluster_resolver/BUILD index 15abd2be038..80e18a43a71 100644 --- a/tensorflow/contrib/cluster_resolver/BUILD +++ b/tensorflow/contrib/cluster_resolver/BUILD @@ -34,6 +34,7 @@ py_library( ":cluster_resolver_py", ":gce_cluster_resolver_py", ":tpu_cluster_resolver_py", + "//tensorflow/python:util", ], ) diff --git a/tensorflow/contrib/cluster_resolver/__init__.py b/tensorflow/contrib/cluster_resolver/__init__.py index d17501e87e7..b4d8cd4a7cf 100644 --- a/tensorflow/contrib/cluster_resolver/__init__.py +++ b/tensorflow/contrib/cluster_resolver/__init__.py @@ -26,3 +26,15 @@ from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import from tensorflow.contrib.cluster_resolver.python.training.gce_cluster_resolver import GceClusterResolver from tensorflow.contrib.cluster_resolver.python.training.tpu_cluster_resolver import TPUClusterResolver # pylint: enable=wildcard-import,unused-import + +from tensorflow.python.util.all_util import remove_undocumented + +_allowed_symbols = [ + 'ClusterResolver', + 'SimpleClusterResolver', + 'UnionClusterResolver', + 'GceClusterResolver', + 'TPUClusterResolver', +] + +remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py index 2e75ac226ea..a6a6e642e4e 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py +++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py @@ -143,7 +143,8 @@ class TPUClusterResolver(ClusterResolver): request = self._service.projects().locations().nodes().get(name=full_name) response = request.execute() - instance_url = '%s:%s' % (response['ipAddress'], response['port']) - worker_list.append(instance_url) + if 'health' in response and response['health'] == 'HEALTHY': + instance_url = '%s:%s' % (response['ipAddress'], response['port']) + worker_list.append(instance_url) return ClusterSpec({self._job_name: worker_list}) diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py index 0c4730613af..4fd34629cf7 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py +++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py @@ -105,7 +105,8 @@ class TPUClusterResolverTest(test.TestCase): tpu_map = { 'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': { 'ipAddress': '10.1.2.3', - 'port': '8470' + 'port': '8470', + 'health': 'HEALTHY' } } @@ -126,7 +127,8 @@ class 
TPUClusterResolverTest(test.TestCase): tpu_map = { 'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': { 'ipAddress': '10.1.2.3', - 'port': '8470' + 'port': '8470', + 'health': 'HEALTHY' } } @@ -147,11 +149,13 @@ class TPUClusterResolverTest(test.TestCase): tpu_map = { 'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': { 'ipAddress': '10.1.2.3', - 'port': '8470' + 'port': '8470', + 'health': 'HEALTHY' }, 'projects/test-project/locations/us-central1-c/nodes/test-tpu-2': { 'ipAddress': '10.4.5.6', - 'port': '8470' + 'port': '8470', + 'health': 'HEALTHY' } } @@ -169,15 +173,54 @@ class TPUClusterResolverTest(test.TestCase): """ self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto) + def testHealthyTpuNodeRetrieval(self): + tpu_map = { + 'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': { + 'ipAddress': '10.1.2.3', + 'port': '8470', + 'health': 'HEALTHY' + }, + 'projects/test-project/locations/us-central1-c/nodes/test-tpu-2': { + 'ipAddress': '10.4.5.6', + 'port': '8470', + }, + 'projects/test-project/locations/us-central1-c/nodes/test-tpu-3': { + 'ipAddress': '10.7.8.9', + 'port': '8470', + 'health': 'UNHEALTHY' + } + } + + tpu_cluster_resolver = TPUClusterResolver( + project='test-project', + zone='us-central1-c', + tpu_names=['test-tpu-2', 'test-tpu-1', 'test-tpu-3'], + credentials=None, + service=self.mock_service_client(tpu_map=tpu_map)) + + actual_cluster_spec = tpu_cluster_resolver.cluster_spec() + expected_proto = """ + job { + name: 'tpu_worker' + tasks { + key: 0 + value: '10.1.2.3:8470' + } + } + """ + self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto) + def testGetMasterMultipleEntries(self): tpu_map = { 'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': { 'ipAddress': '10.1.2.3', - 'port': '8470' + 'port': '8470', + 'health': 'HEALTHY' }, 'projects/test-project/locations/us-central1-c/nodes/test-tpu-2': { 'ipAddress': '10.4.5.6', - 'port': '8470' + 'port': '8470', + 'health': 'HEALTHY' } } diff --git a/tensorflow/contrib/cmake/external/protobuf.cmake b/tensorflow/contrib/cmake/external/protobuf.cmake index aedb793d2ae..fd05fa6d472 100644 --- a/tensorflow/contrib/cmake/external/protobuf.cmake +++ b/tensorflow/contrib/cmake/external/protobuf.cmake @@ -16,7 +16,7 @@ include (ExternalProject) set(PROTOBUF_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/src) set(PROTOBUF_URL https://github.com/google/protobuf.git) -set(PROTOBUF_TAG b04e5cba356212e4e8c66c61bbe0c3a20537c5b9) +set(PROTOBUF_TAG 396336eb961b75f03b25824fe86cf6490fb75e3a) if(WIN32) set(protobuf_STATIC_LIBRARIES diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt index 1260b81b07a..3bae1edadb8 100644 --- a/tensorflow/contrib/cmake/python_modules.txt +++ b/tensorflow/contrib/cmake/python_modules.txt @@ -6,6 +6,7 @@ tensorflow/core/example tensorflow/core/framework tensorflow/core/lib tensorflow/core/lib/core +tensorflow/core/profiler tensorflow/core/protobuf tensorflow/core/util tensorflow/examples @@ -216,6 +217,8 @@ tensorflow/contrib/input_pipeline/python/ops tensorflow/contrib/integrate tensorflow/contrib/integrate/python tensorflow/contrib/integrate/python/ops +tensorflow/contrib/kafka/python +tensorflow/contrib/kafka/python/ops tensorflow/contrib/keras tensorflow/contrib/keras/api tensorflow/contrib/keras/api/keras diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake index 138993db352..c42bc35ce7e 100644 --- 
a/tensorflow/contrib/cmake/tf_core_ops.cmake +++ b/tensorflow/contrib/cmake/tf_core_ops.cmake @@ -30,6 +30,7 @@ set(tf_op_lib_names "list_ops" "lookup_ops" "logging_ops" + "manip_ops" "math_ops" "nn_ops" "no_op" diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake index 8862390d2b6..34c466fa01e 100755 --- a/tensorflow/contrib/cmake/tf_python.cmake +++ b/tensorflow/contrib/cmake/tf_python.cmake @@ -307,7 +307,7 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name) # containing the wrappers. add_custom_command( OUTPUT ${GENERATE_PYTHON_OP_LIB_DESTINATION} - COMMAND ${tf_python_op_lib_name}_gen_python ${tensorflow_source_dir}/tensorflow/core/api_def/base_api,${tensorflow_source_dir}/tensorflow/core/api_def/python_api @${tensorflow_source_dir}/tensorflow/python/ops/hidden_ops.txt ${require_shape_fn} > ${GENERATE_PYTHON_OP_LIB_DESTINATION} + COMMAND ${tf_python_op_lib_name}_gen_python ${tensorflow_source_dir}/tensorflow/core/api_def/base_api,${tensorflow_source_dir}/tensorflow/core/api_def/python_api ${require_shape_fn} > ${GENERATE_PYTHON_OP_LIB_DESTINATION} DEPENDS ${tf_python_op_lib_name}_gen_python ) @@ -335,6 +335,7 @@ GENERATE_PYTHON_OP_LIB("list_ops") GENERATE_PYTHON_OP_LIB("logging_ops") GENERATE_PYTHON_OP_LIB("lookup_ops") GENERATE_PYTHON_OP_LIB("nn_ops") +GENERATE_PYTHON_OP_LIB("manip_ops") GENERATE_PYTHON_OP_LIB("parsing_ops") GENERATE_PYTHON_OP_LIB("random_ops") GENERATE_PYTHON_OP_LIB("remote_fused_graph_ops" diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py index f67698eb99a..77ea914380d 100644 --- a/tensorflow/contrib/cmake/tools/create_def_file.py +++ b/tensorflow/contrib/cmake/tools/create_def_file.py @@ -31,6 +31,7 @@ from __future__ import division from __future__ import print_function import argparse +import codecs import io import os import re @@ -103,7 +104,7 @@ def main(): for lib_path in args.input: proc = subprocess.Popen([DUMPBIN, "/nologo", "/linkermember:1", lib_path], stdout=subprocess.PIPE) - for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"): + for line in codecs.getreader("utf-8")(proc.stdout): cols = line.split() if len(cols) < 2: continue @@ -131,7 +132,7 @@ def main(): # We compare on undname but use the decorated name from candidates. dupes = 0 proc = subprocess.Popen([UNDNAME, tmpfile.name], stdout=subprocess.PIPE) - for idx, line in enumerate(io.TextIOWrapper(proc.stdout, encoding="utf-8")): + for idx, line in enumerate(codecs.getreader("utf-8")(proc.stdout)): decorated = candidates[idx] if decorated in taken: # Symbol is already in output, done. diff --git a/tensorflow/contrib/coder/README.md b/tensorflow/contrib/coder/README.md index e1e867db5aa..c6c379c4588 100644 --- a/tensorflow/contrib/coder/README.md +++ b/tensorflow/contrib/coder/README.md @@ -30,7 +30,7 @@ following sense: around, - The number of CDF axes does not extend, i.e., `CDF.ndim == data.ndim + 1`. 
-In the previous example where data has shape (10, 10), the followings are +In the previous example where data has shape (10, 10), the following are acceptable CDF shapes: - (10, 10, 65) diff --git a/tensorflow/contrib/coder/kernels/range_coder.cc b/tensorflow/contrib/coder/kernels/range_coder.cc index f4f076b6c4e..21b35155ff3 100644 --- a/tensorflow/contrib/coder/kernels/range_coder.cc +++ b/tensorflow/contrib/coder/kernels/range_coder.cc @@ -276,7 +276,7 @@ void RangeEncoder::Finalize(string* sink) { } } else if (base_ != 0) { // If base == 0, then pick 0 from [base, base + size) and no zeros are - // explcitly written. + // explicitly written. // // Otherwise, pick (base + (2^16 - base[16:0])), i.e., round up base to the // next multiple of 2^16. As 2^16 < size, this value should be in the diff --git a/tensorflow/contrib/compiler/jit_test.py b/tensorflow/contrib/compiler/jit_test.py index 2108e42bce4..29a593f6bcf 100644 --- a/tensorflow/contrib/compiler/jit_test.py +++ b/tensorflow/contrib/compiler/jit_test.py @@ -24,6 +24,7 @@ from tensorflow.python.framework import function from tensorflow.python.framework import op_def_registry from tensorflow.python.framework import ops from tensorflow.python.framework import random_seed +from tensorflow.python.framework import test_util from tensorflow.python.ops import gradients from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops @@ -169,6 +170,7 @@ class JITTest(test.TestCase): self.assertEqual(b"jit_scope_0", func_attrs["_XlaScope"].s) +@test_util.with_c_api class CompilationEnabledInGradientTest(test.TestCase): def testCompilationInGradient(self): @@ -188,7 +190,7 @@ class CompilationEnabledInGradientTest(test.TestCase): for cg in c_grad_ops: self.assertTrue(cg.get_attr("_XlaCompile")) for ncg in nc_grad_ops: - with self.assertRaisesRegexp(ValueError, "No attr named"): + with self.assertRaisesRegexp(ValueError, "[Nn]o attr named"): ncg.get_attr("_XlaCompile") # d/dx (x ** 4) = 4 * (x ** 3) diff --git a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc b/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc index 9e41e678571..1a79bf066c3 100644 --- a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc +++ b/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc @@ -251,9 +251,8 @@ REGISTER_OP("CudnnRNNParamsToCanonical") TF_RETURN_IF_ERROR(c->GetAttr("num_params", &num_params)); // Set shape for weight matrices for (int i = 0; i < num_params; i++) { - c->set_output(i, - c->Matrix(InferenceContext::kUnknownDim, - InferenceContext::kUnknownDim)); + c->set_output(i, c->Matrix(InferenceContext::kUnknownDim, + InferenceContext::kUnknownDim)); } // Set shape for bias vectors for (int i = 0; i < num_params; i++) { @@ -300,6 +299,7 @@ upcoming training or inferences. num_params: number of parameter sets for all layers. Each layer may contain multiple parameter sets, with each set consisting of a weight matrix and a bias vector. 
-)doc", kCudnnRNNCommonAttrs)); +)doc", + kCudnnRNNCommonAttrs)); } // namespace tensorflow diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py index 4fc5ff1bd18..56c562a3bad 100644 --- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py +++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py @@ -20,6 +20,7 @@ from __future__ import print_function import time +from six.moves import xrange from tensorflow.contrib import rnn as contrib_rnn from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops from tensorflow.contrib.rnn.python.ops import lstm_ops diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD index 1cf0202fd88..04a21f2b0f5 100644 --- a/tensorflow/contrib/data/python/kernel_tests/BUILD +++ b/tensorflow/contrib/data/python/kernel_tests/BUILD @@ -126,6 +126,7 @@ py_library( "//tensorflow/python:client_testlib", "//tensorflow/python:errors", "//tensorflow/python:framework_ops", + "//tensorflow/python:lookup_ops", "//tensorflow/python:platform", "//tensorflow/python:sparse_tensor", "//tensorflow/python:training", diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py index 015f69c5673..0c2827b1e49 100644 --- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py @@ -744,6 +744,23 @@ class BatchDatasetSerializationTest( lambda: self._build_dataset_dense_to_sparse(diff_comp), num_outputs) + def _sparse(self, i): + return sparse_tensor.SparseTensorValue( + indices=[[0]], values=(i * [1]), dense_shape=[1]) + + def _build_dataset_sparse(self, batch_size=5): + return dataset_ops.Dataset.range(10).map(self._sparse).batch(batch_size) + + def testSparseCore(self): + self.run_core_tests(self._build_dataset_sparse, + lambda: self._build_dataset_sparse(2), 2) + + def _build_dataset_nested_sparse(self): + return dataset_ops.Dataset.range(10).map(self._sparse).batch(5).batch(2) + + def testNestedSparseCore(self): + self.run_core_tests(self._build_dataset_nested_sparse, None, 1) + class PaddedBatchDatasetSerializationTest( dataset_serialization_test_base.DatasetSerializationTestBase): diff --git a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py index 4d984bb4d76..6de93059d8c 100644 --- a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py @@ -41,8 +41,7 @@ class GroupByWindowTest(test.TestCase): dataset_ops.Dataset.from_tensor_slices(components).map(lambda x: x * x) .apply( grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4), - 4)) - .make_initializable_iterator()) + 4)).make_initializable_iterator()) init_op = iterator.initializer get_next = iterator.get_next() @@ -53,7 +52,8 @@ class GroupByWindowTest(test.TestCase): while True: result = sess.run(get_next) self.assertTrue( - all(x % 2 == 0 for x in result) or all(x % 2 == 1) + all(x % 2 == 0 + for x in result) or all(x % 2 == 1) for x in result) counts.append(result.shape[0]) @@ -116,8 +116,8 @@ class GroupByWindowTest(test.TestCase): iterator = ( dataset_ops.Dataset.from_tensor_slices(components) .map(lambda x: (x, ops.convert_to_tensor([x * x]))).apply( - 
grouping.group_by_window(lambda x, _: x % 2, reduce_func, 32)) - .make_initializable_iterator()) + grouping.group_by_window(lambda x, _: x % 2, reduce_func, + 32)).make_initializable_iterator()) init_op = iterator.initializer get_next = iterator.get_next() @@ -136,7 +136,8 @@ class GroupByWindowTest(test.TestCase): window.padded_batch( 4, padded_shapes=tensor_shape.TensorShape([None])), window.padded_batch( - 4, padded_shapes=ops.convert_to_tensor([(key + 1) * 10])),)) + 4, padded_shapes=ops.convert_to_tensor([(key + 1) * 10])), + )) iterator = ( dataset_ops.Dataset.from_tensor_slices(components) @@ -200,9 +201,10 @@ class BucketTest(test.TestCase): # dynamically and does not rely on static shape information about # the arguments. return dataset_ops.Dataset.zip( - (dataset_ops.Dataset.from_tensors(bucket), window.padded_batch( - 32, (tensor_shape.TensorShape([]), tensor_shape.TensorShape([None]), - tensor_shape.TensorShape([3]))))) + (dataset_ops.Dataset.from_tensors(bucket), + window.padded_batch( + 32, (tensor_shape.TensorShape([]), tensor_shape.TensorShape( + [None]), tensor_shape.TensorShape([3]))))) def testSingleBucket(self): @@ -307,12 +309,13 @@ class BucketTest(test.TestCase): def _dynamic_pad_fn(bucket, window, _): return dataset_ops.Dataset.zip( - (dataset_ops.Dataset.from_tensors(bucket), window.padded_batch( - 32, { - "x": tensor_shape.TensorShape([]), - "y": tensor_shape.TensorShape([None]), - "z": tensor_shape.TensorShape([3]) - }))) + (dataset_ops.Dataset.from_tensors(bucket), + window.padded_batch( + 32, { + "x": tensor_shape.TensorShape([]), + "y": tensor_shape.TensorShape([None]), + "z": tensor_shape.TensorShape([3]) + }))) input_dataset = ( dataset_ops.Dataset.from_tensor_slices(math_ops.range(128)).map(_map_fn) diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py index 7cde6e05b24..dbc35097ddd 100644 --- a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py +++ b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py @@ -24,9 +24,11 @@ import numpy as np from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops from tensorflow.python.data.ops import iterator_ops +from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor +from tensorflow.python.ops import lookup_ops from tensorflow.python.ops import variables from tensorflow.python.platform import gfile from tensorflow.python.platform import test @@ -34,14 +36,29 @@ from tensorflow.python.training import saver as saver_lib from tensorflow.python.util import nest +def remove_variants(get_next_op): + # TODO(b/72408568): Remove this once session.run can get + # variant tensors. + """Remove variants from a nest structure, so sess.run will execute.""" + + def _remove_variant(x): + if isinstance(x, ops.Tensor) and x.dtype == dtypes.variant: + return () + else: + return x + + return nest.map_structure(_remove_variant, get_next_op) + + class DatasetSerializationTestBase(test.TestCase): """Base class for testing serializable datasets.""" def tearDown(self): self._delete_ckpt() - # TODO(b/70988345): Support native `tf.SparseTensor` objects and get rid of - # `sparse_tensors` argument. 
+ # TODO(b/72657739): Remove sparse_tensor argument, which is to test the + # (deprecated) saveable `SparseTensorSliceDataset`, once the API + # `from_sparse_tensor_slices()`and related tests are deleted. def run_core_tests(self, ds_fn1, ds_fn2, num_outputs, sparse_tensors=False): """Runs the core tests. @@ -233,10 +250,10 @@ class DatasetSerializationTestBase(test.TestCase): saver = self._import_meta_graph() init_op, get_next_op = self._get_iterator_ops_from_collection( ds_fn, sparse_tensors=sparse_tensors) + get_next_op = remove_variants(get_next_op) with self.test_session(graph=g) as sess: self._restore(saver, sess) - sess.run(variables.global_variables_initializer()) - sess.run(init_op) + self._initialize(init_op, sess) for _ in range(num_outputs): actual.append(sess.run(get_next_op)) if verify_exhausted: @@ -296,6 +313,7 @@ class DatasetSerializationTestBase(test.TestCase): with ops.Graph().as_default() as g: _, get_next_op, saver = self._build_graph( ds_fn2, sparse_tensors=sparse_tensors) + get_next_op = remove_variants(get_next_op) with self.test_session(graph=g) as sess: self._restore(saver, sess) for _ in range(num_outputs - break_point): @@ -356,6 +374,7 @@ class DatasetSerializationTestBase(test.TestCase): with ops.Graph().as_default() as g: get_next_op, saver = self._build_empty_graph( ds_fn, sparse_tensors=sparse_tensors) + get_next_op = remove_variants(get_next_op) with self.test_session(graph=g) as sess: self._restore(saver, sess) for _ in range(num_outputs - break_point): @@ -389,9 +408,9 @@ class DatasetSerializationTestBase(test.TestCase): with ops.Graph().as_default() as g: init_op, get_next_op, saver = self._build_graph( ds_fn, sparse_tensors=sparse_tensors) + get_next_op = remove_variants(get_next_op) with self.test_session(graph=g) as sess: - sess.run(variables.global_variables_initializer()) - sess.run(init_op) + self._initialize(init_op, sess) for _ in range(break_point): sess.run(get_next_op) with self.assertRaises(error): @@ -485,20 +504,20 @@ class DatasetSerializationTestBase(test.TestCase): else: init_op, get_next_op, saver = self._build_graph( ds_fn, sparse_tensors=sparse_tensors) + get_next_op = remove_variants(get_next_op) return init_op, get_next_op, saver for i in range(len(break_points) + 1): with ops.Graph().as_default() as g: init_op, get_next_op, saver = get_ops() + get_next_op = remove_variants(get_next_op) with self.test_session(graph=g) as sess: if ckpt_saved: if init_before_restore: - sess.run(variables.global_variables_initializer()) - sess.run(init_op) + self._initialize(init_op, sess) self._restore(saver, sess) else: - sess.run(variables.global_variables_initializer()) - sess.run(init_op) + self._initialize(init_op, sess) start = break_points[i - 1] if i > 0 else 0 end = break_points[i] if i < len(break_points) else num_outputs num_iters = end - start @@ -562,13 +581,16 @@ class DatasetSerializationTestBase(test.TestCase): get_next = sparse_tensor.SparseTensor(*iterator.get_next()) else: get_next = iterator.get_next() - self._add_iterator_ops_to_collection(init_op, get_next, sparse_tensors) + self._add_iterator_ops_to_collection(init_op, get_next, ds_fn, + sparse_tensors) saver = saver_lib.Saver(allow_empty=True) return init_op, get_next, saver def _build_empty_graph(self, ds_fn, sparse_tensors=False): iterator = iterator_ops.Iterator.from_structure( - self._get_output_types(ds_fn), self._get_output_shapes(ds_fn)) + self._get_output_types(ds_fn), + output_shapes=self._get_output_shapes(ds_fn), + output_classes=self._get_output_classes(ds_fn)) 
saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator) ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable) if sparse_tensors: @@ -581,12 +603,19 @@ class DatasetSerializationTestBase(test.TestCase): def _add_iterator_ops_to_collection(self, init_op, get_next, + ds_fn, sparse_tensors=False): ops.add_to_collection("iterator_ops", init_op) # `get_next` may be a tuple e.g. in TensorSliceDataset. Since Collections # do not support tuples we flatten the tensors and restore the shape in # `_get_iterator_ops_from_collection`. - if sparse_tensors: + + # TODO(shivaniagrwal): `output_classes` is a nested structure of classes, + # this base class is specific to current test cases. Update when tests are + # added with `output_classes` as a nested structure with at least one of the + # component being `tf.SparseTensor`. + if (sparse_tensors or + self._get_output_classes(ds_fn) is sparse_tensor.SparseTensor): ops.add_to_collection("iterator_ops", get_next.indices) ops.add_to_collection("iterator_ops", get_next.values) ops.add_to_collection("iterator_ops", get_next.dense_shape) @@ -596,7 +625,8 @@ class DatasetSerializationTestBase(test.TestCase): def _get_iterator_ops_from_collection(self, ds_fn, sparse_tensors=False): all_ops = ops.get_collection("iterator_ops") - if sparse_tensors: + if (sparse_tensors or + self._get_output_classes(ds_fn) is sparse_tensor.SparseTensor): init_op, indices, values, dense_shape = all_ops return init_op, sparse_tensor.SparseTensor(indices, values, dense_shape) else: @@ -611,6 +641,10 @@ class DatasetSerializationTestBase(test.TestCase): with ops.Graph().as_default(): return ds_fn().output_shapes + def _get_output_classes(self, ds_fn): + with ops.Graph().as_default(): + return ds_fn().output_classes + def _ckpt_path(self): return os.path.join(self.get_temp_dir(), "iterator") @@ -621,8 +655,14 @@ class DatasetSerializationTestBase(test.TestCase): saver.save(sess, self._ckpt_path()) def _restore(self, saver, sess): + sess.run(lookup_ops.tables_initializer()) saver.restore(sess, self._latest_ckpt()) + def _initialize(self, init_op, sess): + sess.run(variables.global_variables_initializer()) + sess.run(lookup_ops.tables_initializer()) + sess.run(init_op) + def _import_meta_graph(self): meta_file_path = self._ckpt_path() + ".meta" return saver_lib.import_meta_graph(meta_file_path) diff --git a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py index 5921be2ae89..06883934d04 100644 --- a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py @@ -194,6 +194,10 @@ class FilterDatasetSerializationTest( return dataset_ops.Dataset.range(10).map(_map_fn).filter(_filter_fn).map( lambda x, i: x) + def testSparseCore(self): + num_outputs = 5 + self.run_core_tests(self._build_sparse_filter, None, num_outputs) + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py index d4fbaa5cdcd..86d69495ef4 100644 --- a/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py @@ -225,6 +225,21 @@ class FlatMapDatasetSerializationTest( self.verify_error_on_save(build_ds, 500, errors.InvalidArgumentError) + def testSparseCore(self): + + def _map_fn(i): + return 
sparse_tensor.SparseTensorValue( + indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2]) + + def _flat_map_fn(x): + return dataset_ops.Dataset.from_tensor_slices( + sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values)) + + def _build_ds(): + return dataset_ops.Dataset.range(10).map(_map_fn).flat_map(_flat_map_fn) + + self.run_core_tests(_build_ds, None, 20) + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py index b1937c08f34..db8429512bf 100644 --- a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py @@ -252,6 +252,22 @@ class InterleaveDatasetSeriazationTest( None, num_outputs) # pylint: enable=g-long-lambda + def testSparseCore(self): + + def _map_fn(i): + return sparse_tensor.SparseTensorValue( + indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2]) + + def _interleave_fn(x): + return dataset_ops.Dataset.from_tensor_slices( + sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values)) + + def _build_dataset(): + return dataset_ops.Dataset.range(10).map(_map_fn).interleave( + _interleave_fn, cycle_length=1) + + self.run_core_tests(_build_dataset, None, 20) + class ParallelInterleaveDatasetTest(test.TestCase): diff --git a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py index dd8247bfd47..d3ce89298be 100644 --- a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py @@ -805,6 +805,21 @@ class MapDatasetSerializationTest( self.run_core_tests(_build_ds, None, num_outputs) + def testSparseCore(self): + + def _sparse(i): + return sparse_tensor.SparseTensorValue( + indices=np.array([[0, 0]]), + values=(i * np.array([1])), + dense_shape=np.array([1, 1])) + + def _build_ds(num_outputs): + return contrib_dataset_ops.Dataset.range(num_outputs).map(_sparse) + + num_outputs = 10 + self.run_core_tests(lambda: _build_ds(num_outputs), + lambda: _build_ds(int(num_outputs / 2)), num_outputs) + class ParallelMapDatasetSerializationTest( dataset_serialization_test_base.DatasetSerializationTestBase): @@ -851,7 +866,8 @@ class ParallelMapDatasetSerializationTest( return random_ops.random_uniform( (), 0, 10, dtype=dtypes.int32) * math_ops.to_int32(x) - return contrib_dataset_ops.Dataset.range(100).map(_map_fn) + return contrib_dataset_ops.Dataset.range(100).map( + _map_fn, num_parallel_calls=2).prefetch(2) self.verify_error_on_save(_build_ds, 15, errors.InvalidArgumentError) @@ -861,7 +877,8 @@ class ParallelMapDatasetSerializationTest( counter_var = variable_scope.get_variable( "counter", (), dtypes.int32, use_resource=True) return (contrib_dataset_ops.Dataset.from_tensors(0).repeat(10).map( - lambda _: counter_var.assign_add(1))) + lambda _: counter_var.assign_add(1), + num_parallel_calls=2).prefetch(2)) self.verify_error_on_save(_build_ds, 15, errors.InvalidArgumentError) @@ -870,7 +887,7 @@ class ParallelMapDatasetSerializationTest( def _build_ds(): constant_var = constant_op.constant(5) return (contrib_dataset_ops.Dataset.from_tensors(0).repeat(10).map( - lambda x: x + constant_var)) + lambda x: x + constant_var, num_parallel_calls=2).prefetch(2)) self.run_core_tests(_build_ds, None, 10) @@ -883,7 +900,8 @@ class 
ParallelMapDatasetSerializationTest( def defun_fn(x): return constant_op.constant(1000) + math_ops.to_int32(x) - return contrib_dataset_ops.Dataset.range(num_outputs).map(defun_fn) + return contrib_dataset_ops.Dataset.range(num_outputs).map( + defun_fn, num_parallel_calls=2).prefetch(2) self.run_core_tests(_build_ds, None, num_outputs) @@ -901,7 +919,8 @@ class ParallelMapDatasetSerializationTest( return constant_op.constant(11000) + defun_fn_deep(math_ops.to_int32(x)) - return contrib_dataset_ops.Dataset.range(num_outputs).map(defun_fn) + return contrib_dataset_ops.Dataset.range(num_outputs).map( + defun_fn, num_parallel_calls=2).prefetch(2) self.run_core_tests(_build_ds, None, num_outputs) diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py index 76c07b2c999..6eb512dec67 100644 --- a/tensorflow/contrib/data/python/ops/batching.py +++ b/tensorflow/contrib/data/python/ops/batching.py @@ -403,7 +403,7 @@ def map_and_batch(map_func, batch_size, num_parallel_batches=1): num_parallel_batches: A `tf.int64` scalar `tf.Tensor`, representing the number of batches to create in parallel. On one hand, higher values can help mitigate the effect of stragglers. On the other hand, higher values - can increasing contention if CPU is scarce. + can increase contention if CPU is scarce. Returns: A `Dataset` transformation function, which can be passed to diff --git a/tensorflow/contrib/data/python/ops/stats_ops.py b/tensorflow/contrib/data/python/ops/stats_ops.py index 1dd0729513c..9cd1701c397 100644 --- a/tensorflow/contrib/data/python/ops/stats_ops.py +++ b/tensorflow/contrib/data/python/ops/stats_ops.py @@ -20,6 +20,7 @@ from __future__ import print_function from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import iterator_ops from tensorflow.python.data.util import nest +from tensorflow.python.data.util import sparse from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import gen_dataset_ops @@ -161,8 +162,10 @@ class _StatsDataset(dataset_ops.Dataset): return self._op_function( self._input_dataset._as_variant_tensor(), # pylint: disable=protected-access self._tag, - output_shapes=nest.flatten(self.output_shapes), - output_types=nest.flatten(self.output_types)) + output_types=nest.flatten( + sparse.as_dense_types(self.output_types, self.output_classes)), + output_shapes=nest.flatten( + sparse.as_dense_shapes(self.output_shapes, self.output_classes))) @property def output_shapes(self): diff --git a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py index a255d4fc890..31d24aa9ea0 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py @@ -23,10 +23,15 @@ import itertools import numpy as np from tensorflow.contrib.distributions.python.ops import distribution_util +from tensorflow.contrib.distributions.python.ops import mixture +from tensorflow.contrib.distributions.python.ops import mixture_same_family +from tensorflow.contrib.distributions.python.ops import mvn_diag from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops +from tensorflow.python.ops.distributions import categorical +from 
tensorflow.python.ops.distributions import normal from tensorflow.python.ops.linalg import linear_operator_diag import tensorflow.python.ops.nn_grad # pylint: disable=unused-import from tensorflow.python.platform import test @@ -395,6 +400,41 @@ class MixtureStddevTest(test.TestCase): self.assertAllClose(actual_devs, expected_devs) +class PadMixtureDimensionsTest(test.TestCase): + + def test_pad_mixture_dimensions_mixture(self): + with self.test_session() as sess: + gm = mixture.Mixture( + cat=categorical.Categorical(probs=[[0.3, 0.7]]), + components=[ + normal.Normal(loc=[-1.0], scale=[1.0]), + normal.Normal(loc=[1.0], scale=[0.5]) + ]) + + x = array_ops.constant([[1.0, 2.0], [3.0, 4.0]]) + x_pad = distribution_util.pad_mixture_dimensions( + x, gm, gm.cat, gm.event_shape.ndims) + x_out, x_pad_out = sess.run([x, x_pad]) + + self.assertAllEqual(x_pad_out.shape, [2, 2]) + self.assertAllEqual(x_out.reshape([-1]), x_pad_out.reshape([-1])) + + def test_pad_mixture_dimensions_mixture_same_family(self): + with self.test_session() as sess: + gm = mixture_same_family.MixtureSameFamily( + mixture_distribution=categorical.Categorical(probs=[0.3, 0.7]), + components_distribution=mvn_diag.MultivariateNormalDiag( + loc=[[-1., 1], [1, -1]], scale_identity_multiplier=[1.0, 0.5])) + + x = array_ops.constant([[1.0, 2.0], [3.0, 4.0]]) + x_pad = distribution_util.pad_mixture_dimensions( + x, gm, gm.mixture_distribution, gm.event_shape.ndims) + x_out, x_pad_out = sess.run([x, x_pad]) + + self.assertAllEqual(x_pad_out.shape, [2, 2, 1]) + self.assertAllEqual(x_out.reshape([-1]), x_pad_out.reshape([-1])) + + class _PadTest(object): def testNegAxisCorrectness(self): diff --git a/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py b/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py new file mode 100644 index 00000000000..ea3c86b5c0f --- /dev/null +++ b/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py @@ -0,0 +1,388 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import importlib + +import numpy as np + +from tensorflow.contrib.distributions.python.ops import kumaraswamy as kumaraswamy_lib +from tensorflow.python.client import session +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import random_seed +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import test +from tensorflow.python.platform import tf_logging + + +def try_import(name): # pylint: disable=invalid-name + module = None + try: + module = importlib.import_module(name) + except ImportError as e: + tf_logging.warning("Could not import %s: %s" % (name, str(e))) + return module + + +special = try_import("scipy.special") +stats = try_import("scipy.stats") + + +def _kumaraswamy_mode(a, b): + a = np.asarray(a) + b = np.asarray(b) + return ((a - 1) / (a * b - 1))**(1 / a) + + +def _kumaraswamy_moment(a, b, n): + a = np.asarray(a) + b = np.asarray(b) + return b * special.beta(1.0 + n / a, b) + + +def _harmonic_number(b): + b = np.asarray(b) + return special.psi(b + 1) - special.psi(1) + + +def _kumaraswamy_cdf(a, b, x): + a = np.asarray(a) + b = np.asarray(b) + x = np.asarray(x) + return 1 - (1 - x**a)**b + + +def _kumaraswamy_pdf(a, b, x): + a = np.asarray(a) + b = np.asarray(b) + x = np.asarray(x) + return a * b * x ** (a - 1) * (1 - x ** a) ** (b - 1) + + +class KumaraswamyTest(test.TestCase): + + def testSimpleShapes(self): + with self.test_session(): + a = np.random.rand(3) + b = np.random.rand(3) + dist = kumaraswamy_lib.Kumaraswamy(a, b) + self.assertAllEqual([], dist.event_shape_tensor().eval()) + self.assertAllEqual([3], dist.batch_shape_tensor().eval()) + self.assertEqual(tensor_shape.TensorShape([]), dist.event_shape) + self.assertEqual(tensor_shape.TensorShape([3]), dist.batch_shape) + + def testComplexShapes(self): + with self.test_session(): + a = np.random.rand(3, 2, 2) + b = np.random.rand(3, 2, 2) + dist = kumaraswamy_lib.Kumaraswamy(a, b) + self.assertAllEqual([], dist.event_shape_tensor().eval()) + self.assertAllEqual([3, 2, 2], dist.batch_shape_tensor().eval()) + self.assertEqual(tensor_shape.TensorShape([]), dist.event_shape) + self.assertEqual(tensor_shape.TensorShape([3, 2, 2]), dist.batch_shape) + + def testComplexShapesBroadcast(self): + with self.test_session(): + a = np.random.rand(3, 2, 2) + b = np.random.rand(2, 2) + dist = kumaraswamy_lib.Kumaraswamy(a, b) + self.assertAllEqual([], dist.event_shape_tensor().eval()) + self.assertAllEqual([3, 2, 2], dist.batch_shape_tensor().eval()) + self.assertEqual(tensor_shape.TensorShape([]), dist.event_shape) + self.assertEqual(tensor_shape.TensorShape([3, 2, 2]), dist.batch_shape) + + def testAProperty(self): + a = [[1., 2, 3]] + b = [[2., 4, 3]] + with self.test_session(): + dist = kumaraswamy_lib.Kumaraswamy(a, b) + self.assertEqual([1, 3], dist.concentration1.get_shape()) + self.assertAllClose(a, dist.concentration1.eval()) + + def testBProperty(self): + a = [[1., 2, 3]] + b = [[2., 4, 3]] + with self.test_session(): + dist = kumaraswamy_lib.Kumaraswamy(a, b) + self.assertEqual([1, 3], dist.concentration0.get_shape()) + self.assertAllClose(b, dist.concentration0.eval()) + + def testPdfXProper(self): + a = [[1., 2, 3]] + b = [[2., 4, 3]] + with self.test_session(): + dist = kumaraswamy_lib.Kumaraswamy(a, b, 
validate_args=True) + dist.prob([.1, .3, .6]).eval() + dist.prob([.2, .3, .5]).eval() + # Either condition can trigger. + with self.assertRaisesOpError("sample must be positive"): + dist.prob([-1., 0.1, 0.5]).eval() + with self.assertRaisesOpError("sample must be positive"): + dist.prob([0., 0.1, 0.5]).eval() + with self.assertRaisesOpError("sample must be no larger than `1`"): + dist.prob([.1, .2, 1.2]).eval() + + def testPdfTwoBatches(self): + with self.test_session(): + a = [1., 2] + b = [1., 2] + x = [.5, .5] + dist = kumaraswamy_lib.Kumaraswamy(a, b) + pdf = dist.prob(x) + expected_pdf = _kumaraswamy_pdf(a, b, x) + self.assertAllClose(expected_pdf, pdf.eval()) + self.assertEqual((2,), pdf.get_shape()) + + def testPdfTwoBatchesNontrivialX(self): + with self.test_session(): + a = [1., 2] + b = [1., 2] + x = [.3, .7] + dist = kumaraswamy_lib.Kumaraswamy(a, b) + pdf = dist.prob(x) + expected_pdf = _kumaraswamy_pdf(a, b, x) + self.assertAllClose(expected_pdf, pdf.eval()) + self.assertEqual((2,), pdf.get_shape()) + + def testPdfUniformZeroBatch(self): + with self.test_session(): + # This is equivalent to a uniform distribution + a = 1. + b = 1. + x = np.array([.1, .2, .3, .5, .8], dtype=np.float32) + dist = kumaraswamy_lib.Kumaraswamy(a, b) + pdf = dist.prob(x) + expected_pdf = _kumaraswamy_pdf(a, b, x) + self.assertAllClose(expected_pdf, pdf.eval()) + self.assertEqual((5,), pdf.get_shape()) + + def testPdfAStretchedInBroadcastWhenSameRank(self): + with self.test_session(): + a = [[1., 2]] + b = [[1., 2]] + x = [[.5, .5], [.3, .7]] + dist = kumaraswamy_lib.Kumaraswamy(a, b) + pdf = dist.prob(x) + expected_pdf = _kumaraswamy_pdf(a, b, x) + self.assertAllClose(expected_pdf, pdf.eval()) + self.assertEqual((2, 2), pdf.get_shape()) + + def testPdfAStretchedInBroadcastWhenLowerRank(self): + with self.test_session(): + a = [1., 2] + b = [1., 2] + x = [[.5, .5], [.2, .8]] + pdf = kumaraswamy_lib.Kumaraswamy(a, b).prob(x) + expected_pdf = _kumaraswamy_pdf(a, b, x) + self.assertAllClose(expected_pdf, pdf.eval()) + self.assertEqual((2, 2), pdf.get_shape()) + + def testPdfXStretchedInBroadcastWhenSameRank(self): + with self.test_session(): + a = [[1., 2], [2., 3]] + b = [[1., 2], [2., 3]] + x = [[.5, .5]] + pdf = kumaraswamy_lib.Kumaraswamy(a, b).prob(x) + expected_pdf = _kumaraswamy_pdf(a, b, x) + self.assertAllClose(expected_pdf, pdf.eval()) + self.assertEqual((2, 2), pdf.get_shape()) + + def testPdfXStretchedInBroadcastWhenLowerRank(self): + with self.test_session(): + a = [[1., 2], [2., 3]] + b = [[1., 2], [2., 3]] + x = [.5, .5] + pdf = kumaraswamy_lib.Kumaraswamy(a, b).prob(x) + expected_pdf = _kumaraswamy_pdf(a, b, x) + self.assertAllClose(expected_pdf, pdf.eval()) + self.assertEqual((2, 2), pdf.get_shape()) + + def testKumaraswamyMean(self): + with session.Session(): + a = [1., 2, 3] + b = [2., 4, 1.2] + dist = kumaraswamy_lib.Kumaraswamy(a, b) + self.assertEqual(dist.mean().get_shape(), (3,)) + if not stats: + return + expected_mean = _kumaraswamy_moment(a, b, 1) + self.assertAllClose(expected_mean, dist.mean().eval()) + + def testKumaraswamyVariance(self): + with session.Session(): + a = [1., 2, 3] + b = [2., 4, 1.2] + dist = kumaraswamy_lib.Kumaraswamy(a, b) + self.assertEqual(dist.variance().get_shape(), (3,)) + if not stats: + return + expected_variance = _kumaraswamy_moment(a, b, 2) - _kumaraswamy_moment( + a, b, 1)**2 + self.assertAllClose(expected_variance, dist.variance().eval()) + + def testKumaraswamyMode(self): + with session.Session(): + a = np.array([1.1, 2, 3]) + b = 
np.array([2., 4, 1.2]) + expected_mode = _kumaraswamy_mode(a, b) + dist = kumaraswamy_lib.Kumaraswamy(a, b) + self.assertEqual(dist.mode().get_shape(), (3,)) + self.assertAllClose(expected_mode, dist.mode().eval()) + + def testKumaraswamyModeInvalid(self): + with session.Session(): + a = np.array([1., 2, 3]) + b = np.array([2., 4, 1.2]) + dist = kumaraswamy_lib.Kumaraswamy(a, b, allow_nan_stats=False) + with self.assertRaisesOpError("Condition x < y.*"): + dist.mode().eval() + + a = np.array([2., 2, 3]) + b = np.array([1., 4, 1.2]) + dist = kumaraswamy_lib.Kumaraswamy(a, b, allow_nan_stats=False) + with self.assertRaisesOpError("Condition x < y.*"): + dist.mode().eval() + + def testKumaraswamyModeEnableAllowNanStats(self): + with session.Session(): + a = np.array([1., 2, 3]) + b = np.array([2., 4, 1.2]) + dist = kumaraswamy_lib.Kumaraswamy(a, b, allow_nan_stats=True) + + expected_mode = _kumaraswamy_mode(a, b) + expected_mode[0] = np.nan + self.assertEqual((3,), dist.mode().get_shape()) + self.assertAllClose(expected_mode, dist.mode().eval()) + + a = np.array([2., 2, 3]) + b = np.array([1., 4, 1.2]) + dist = kumaraswamy_lib.Kumaraswamy(a, b, allow_nan_stats=True) + + expected_mode = _kumaraswamy_mode(a, b) + expected_mode[0] = np.nan + self.assertEqual((3,), dist.mode().get_shape()) + self.assertAllClose(expected_mode, dist.mode().eval()) + + def testKumaraswamyEntropy(self): + with session.Session(): + a = np.array([1., 2, 3]) + b = np.array([2., 4, 1.2]) + dist = kumaraswamy_lib.Kumaraswamy(a, b) + self.assertEqual(dist.entropy().get_shape(), (3,)) + if not stats: + return + expected_entropy = (1 - 1. / a) + ( + 1 - 1. / b) * _harmonic_number(b) + np.log(a * b) + self.assertAllClose(expected_entropy, dist.entropy().eval()) + + def testKumaraswamySample(self): + with self.test_session(): + a = 1. + b = 2. + kumaraswamy = kumaraswamy_lib.Kumaraswamy(a, b) + n = constant_op.constant(100000) + samples = kumaraswamy.sample(n) + sample_values = samples.eval() + self.assertEqual(sample_values.shape, (100000,)) + self.assertFalse(np.any(sample_values < 0.0)) + if not stats: + return + self.assertLess( + stats.kstest( + # Kumaraswamy is a univariate distribution. + sample_values, + lambda x: _kumaraswamy_cdf(1., 2., x))[0], + 0.01) + # The standard error of the sample mean is 1 / (sqrt(18 * n)) + expected_mean = _kumaraswamy_moment(a, b, 1) + self.assertAllClose(sample_values.mean(axis=0), expected_mean, atol=1e-2) + expected_variance = _kumaraswamy_moment(a, b, 2) - _kumaraswamy_moment( + a, b, 1)**2 + self.assertAllClose( + np.cov(sample_values, rowvar=0), expected_variance, atol=1e-1) + + # Test that sampling with the same seed twice gives the same results. + def testKumaraswamySampleMultipleTimes(self): + with self.test_session(): + a_val = 1. + b_val = 2. 
+ n_val = 100 + + random_seed.set_random_seed(654321) + kumaraswamy1 = kumaraswamy_lib.Kumaraswamy( + concentration1=a_val, concentration0=b_val, name="kumaraswamy1") + samples1 = kumaraswamy1.sample(n_val, seed=123456).eval() + + random_seed.set_random_seed(654321) + kumaraswamy2 = kumaraswamy_lib.Kumaraswamy( + concentration1=a_val, concentration0=b_val, name="kumaraswamy2") + samples2 = kumaraswamy2.sample(n_val, seed=123456).eval() + + self.assertAllClose(samples1, samples2) + + def testKumaraswamySampleMultidimensional(self): + with self.test_session(): + a = np.random.rand(3, 2, 2).astype(np.float32) + b = np.random.rand(3, 2, 2).astype(np.float32) + kumaraswamy = kumaraswamy_lib.Kumaraswamy(a, b) + n = constant_op.constant(100000) + samples = kumaraswamy.sample(n) + sample_values = samples.eval() + self.assertEqual(sample_values.shape, (100000, 3, 2, 2)) + self.assertFalse(np.any(sample_values < 0.0)) + if not stats: + return + self.assertAllClose( + sample_values[:, 1, :].mean(axis=0), + _kumaraswamy_moment(a, b, 1)[1, :], + atol=1e-1) + + def testKumaraswamyCdf(self): + with self.test_session(): + shape = (30, 40, 50) + for dt in (np.float32, np.float64): + a = 10. * np.random.random(shape).astype(dt) + b = 10. * np.random.random(shape).astype(dt) + x = np.random.random(shape).astype(dt) + actual = kumaraswamy_lib.Kumaraswamy(a, b).cdf(x).eval() + self.assertAllEqual(np.ones(shape, dtype=np.bool), 0. <= x) + self.assertAllEqual(np.ones(shape, dtype=np.bool), 1. >= x) + if not stats: + return + self.assertAllClose( + _kumaraswamy_cdf(a, b, x), actual, rtol=1e-4, atol=0) + + def testKumaraswamyLogCdf(self): + with self.test_session(): + shape = (30, 40, 50) + for dt in (np.float32, np.float64): + a = 10. * np.random.random(shape).astype(dt) + b = 10. * np.random.random(shape).astype(dt) + x = np.random.random(shape).astype(dt) + actual = math_ops.exp(kumaraswamy_lib.Kumaraswamy(a, + b).log_cdf(x)).eval() + self.assertAllEqual(np.ones(shape, dtype=np.bool), 0. <= x) + self.assertAllEqual(np.ones(shape, dtype=np.bool), 1. 
>= x) + if not stats: + return + self.assertAllClose( + _kumaraswamy_cdf(a, b, x), actual, rtol=1e-4, atol=0) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py index 1e514fe0ff2..02064891758 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py @@ -107,7 +107,7 @@ def _test_capture_normal_sample_outputs(): ds.Normal._call_sample_n = true_normal_call_sample_n -def make_univariate_mixture(batch_shape, num_components): +def make_univariate_mixture(batch_shape, num_components, use_static_graph): batch_shape = ops.convert_to_tensor(batch_shape, dtypes.int32) logits = random_ops.random_uniform( array_ops.concat((batch_shape, [num_components]), axis=0), @@ -119,11 +119,11 @@ def make_univariate_mixture(batch_shape, num_components): for _ in range(num_components) ] cat = ds.Categorical(logits, dtype=dtypes.int32) - return ds.Mixture(cat, components) + return ds.Mixture(cat, components, use_static_graph=use_static_graph) def make_multivariate_mixture(batch_shape, num_components, event_shape, - batch_shape_tensor=None): + use_static_graph, batch_shape_tensor=None): if batch_shape_tensor is None: batch_shape_tensor = batch_shape batch_shape_tensor = ops.convert_to_tensor(batch_shape_tensor, dtypes.int32) @@ -145,15 +145,17 @@ def make_multivariate_mixture(batch_shape, num_components, event_shape, loc=loc, scale_diag=scale_diag) components = [create_component() for _ in range(num_components)] cat = ds.Categorical(logits, dtype=dtypes.int32) - return ds.Mixture(cat, components) + return ds.Mixture(cat, components, use_static_graph=use_static_graph) class MixtureTest(test.TestCase): + use_static_graph = False def testShapes(self): with self.test_session(): for batch_shape in ([], [1], [2, 3, 4]): - dist = make_univariate_mixture(batch_shape, num_components=10) + dist = make_univariate_mixture(batch_shape, num_components=10, + use_static_graph=self.use_static_graph) self.assertAllEqual(batch_shape, dist.batch_shape) self.assertAllEqual(batch_shape, dist.batch_shape_tensor().eval()) self.assertAllEqual([], dist.event_shape) @@ -161,7 +163,8 @@ class MixtureTest(test.TestCase): for event_shape in ([1], [2]): dist = make_multivariate_mixture( - batch_shape, num_components=10, event_shape=event_shape) + batch_shape, num_components=10, event_shape=event_shape, + use_static_graph=self.use_static_graph) self.assertAllEqual(batch_shape, dist.batch_shape) self.assertAllEqual(batch_shape, dist.batch_shape_tensor().eval()) self.assertAllEqual(event_shape, dist.event_shape) @@ -172,7 +175,8 @@ class MixtureTest(test.TestCase): r"cat.num_classes != len"): ds.Mixture( ds.Categorical([0.1, 0.5]), # 2 classes - [ds.Normal(loc=1.0, scale=2.0)]) + [ds.Normal(loc=1.0, scale=2.0)], + use_static_graph=self.use_static_graph) with self.assertRaisesWithPredicateMatch( ValueError, r"\(\) and \(2,\) are not compatible"): # The value error is raised because the batch shapes of the @@ -185,13 +189,15 @@ class MixtureTest(test.TestCase): loc=1.0, scale=2.0), # scalar dist ds.Normal( loc=[1.0, 1.0], scale=[2.0, 2.0]) - ]) + ], + use_static_graph=self.use_static_graph) with self.assertRaisesWithPredicateMatch(ValueError, r"Could not infer"): cat_logits = array_ops.placeholder(shape=[1, None], dtype=dtypes.float32) ds.Mixture( ds.Categorical(cat_logits), [ds.Normal( - loc=[1.0], 
scale=[2.0])]) + loc=[1.0], scale=[2.0])], + use_static_graph=self.use_static_graph) def testBrokenShapesDynamic(self): with self.test_session(): @@ -203,29 +209,37 @@ class MixtureTest(test.TestCase): loc=d0_param, scale=d0_param), ds.Normal( loc=d1_param, scale=d1_param) ], - validate_args=True) - with self.assertRaisesOpError(r"batch shape must match"): + validate_args=True, + use_static_graph=self.use_static_graph) + + if self.use_static_graph: + error_string = r"Shapes of all inputs must match" + else: + error_string = r"batch shape must match" + + with self.assertRaisesOpError(error_string): d.sample().eval(feed_dict={d0_param: [2.0, 3.0], d1_param: [1.0]}) - with self.assertRaisesOpError(r"batch shape must match"): + with self.assertRaisesOpError(error_string): d.sample().eval(feed_dict={d0_param: [2.0, 3.0], d1_param: 1.0}) def testBrokenTypes(self): with self.assertRaisesWithPredicateMatch(TypeError, "Categorical"): - ds.Mixture(None, []) + ds.Mixture(None, [], use_static_graph=self.use_static_graph) cat = ds.Categorical([0.3, 0.2]) # components must be a list of distributions with self.assertRaisesWithPredicateMatch( TypeError, "all .* must be Distribution instances"): - ds.Mixture(cat, [None]) + ds.Mixture(cat, [None], use_static_graph=self.use_static_graph) with self.assertRaisesWithPredicateMatch(TypeError, "same dtype"): ds.Mixture( cat, [ ds.Normal(loc=[1.0], scale=[2.0]), ds.Normal(loc=[np.float16(1.0)], scale=[np.float16(2.0)]), - ]) + ], use_static_graph=self.use_static_graph) with self.assertRaisesWithPredicateMatch(ValueError, "non-empty list"): - ds.Mixture(ds.Categorical([0.3, 0.2]), None) + ds.Mixture(ds.Categorical([0.3, 0.2]), None, + use_static_graph=self.use_static_graph) # TODO(ebrevdo): once distribution Domains have been added, add a # test to ensure that the domains of the distributions in a @@ -235,7 +249,8 @@ class MixtureTest(test.TestCase): with self.test_session() as sess: for batch_shape in ((), (2,), (2, 3)): dist = make_univariate_mixture( - batch_shape=batch_shape, num_components=2) + batch_shape=batch_shape, num_components=2, + use_static_graph=self.use_static_graph) mean = dist.mean() self.assertEqual(batch_shape, mean.get_shape()) @@ -256,7 +271,8 @@ class MixtureTest(test.TestCase): with self.test_session() as sess: for batch_shape in ((), (2,), (2, 3)): dist = make_multivariate_mixture( - batch_shape=batch_shape, num_components=2, event_shape=(4,)) + batch_shape=batch_shape, num_components=2, event_shape=(4,), + use_static_graph=self.use_static_graph) mean = dist.mean() self.assertEqual(batch_shape + (4,), mean.get_shape()) @@ -283,7 +299,8 @@ class MixtureTest(test.TestCase): with self.test_session() as sess: for batch_shape in ((), (2,), (2, 3)): dist = make_univariate_mixture( - batch_shape=batch_shape, num_components=num_components) + batch_shape=batch_shape, num_components=num_components, + use_static_graph=self.use_static_graph) dev = dist.stddev() self.assertEqual(batch_shape, dev.get_shape()) @@ -325,7 +342,8 @@ class MixtureTest(test.TestCase): dist = make_multivariate_mixture( batch_shape=batch_shape, num_components=num_components, - event_shape=(4,)) + event_shape=(4,), + use_static_graph=self.use_static_graph) dev = dist.stddev() self.assertEqual(batch_shape + (4,), dev.get_shape()) @@ -371,7 +389,8 @@ class MixtureTest(test.TestCase): scale=component_devs[0]), ds.Normal(loc=component_means[1], scale=component_devs[1]), - ]) + ], + use_static_graph=self.use_static_graph) mix_dev = mixture_dist.stddev() with self.test_session() as 
sess: actual_stddev = sess.run(mix_dev) @@ -379,7 +398,8 @@ class MixtureTest(test.TestCase): def testProbScalarUnivariate(self): with self.test_session() as sess: - dist = make_univariate_mixture(batch_shape=[], num_components=2) + dist = make_univariate_mixture(batch_shape=[], num_components=2, + use_static_graph=self.use_static_graph) for x in [ np.array( [1.0, 2.0], dtype=np.float32), np.array( @@ -405,7 +425,8 @@ class MixtureTest(test.TestCase): def testProbScalarMultivariate(self): with self.test_session() as sess: dist = make_multivariate_mixture( - batch_shape=[], num_components=2, event_shape=[3]) + batch_shape=[], num_components=2, event_shape=[3], + use_static_graph=self.use_static_graph) for x in [ np.array( [[-1.0, 0.0, 1.0], [0.5, 1.0, -0.3]], dtype=np.float32), np.array( @@ -432,7 +453,8 @@ class MixtureTest(test.TestCase): def testProbBatchUnivariate(self): with self.test_session() as sess: - dist = make_univariate_mixture(batch_shape=[2, 3], num_components=2) + dist = make_univariate_mixture(batch_shape=[2, 3], num_components=2, + use_static_graph=self.use_static_graph) for x in [ np.random.randn(2, 3).astype(np.float32), @@ -459,7 +481,8 @@ class MixtureTest(test.TestCase): def testProbBatchMultivariate(self): with self.test_session() as sess: dist = make_multivariate_mixture( - batch_shape=[2, 3], num_components=2, event_shape=[4]) + batch_shape=[2, 3], num_components=2, event_shape=[4], + use_static_graph=self.use_static_graph) for x in [ np.random.randn(2, 3, 4).astype(np.float32), @@ -487,7 +510,8 @@ class MixtureTest(test.TestCase): num_components = 3 batch_shape = [] dist = make_univariate_mixture( - batch_shape=batch_shape, num_components=num_components) + batch_shape=batch_shape, num_components=num_components, + use_static_graph=self.use_static_graph) n = 4 with _test_capture_normal_sample_outputs() as component_samples: samples = dist.sample(n, seed=123) @@ -502,7 +526,10 @@ class MixtureTest(test.TestCase): which_c = np.where(cat_sample_values == c)[0] size_c = which_c.size # Scalar Batch univariate case: batch_size == 1, rank 1 - which_dist_samples = dist_sample_values[c][:size_c] + if self.use_static_graph: + which_dist_samples = dist_sample_values[c][which_c] + else: + which_dist_samples = dist_sample_values[c][:size_c] self.assertAllClose(which_dist_samples, sample_values[which_c]) # Test that sampling with the same seed twice gives the same results. 
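The indexing switch in the sampling tests (`dist_sample_values[c][which_c]` versus `dist_sample_values[c][:size_c]`) reflects how the two code paths lay out per-component draws: with the default dynamic indexing only the draws routed to component `c` are actually sampled, so they sit contiguously at the front of that component's output, while with `use_static_graph=True` every component is sampled for every draw and a one-hot mask over the categorical sample picks the matching entry, so positions line up with `which_c`. A minimal NumPy sketch of that selection logic follows; the names (`component_draws`, `cat`) are illustrative only and not part of the TensorFlow API.

```python
import numpy as np

rng = np.random.RandomState(0)
n, num_components = 6, 3

# Static-graph path: every component is sampled for every one of the n draws.
component_draws = rng.randn(num_components, n)      # [k, n] hypothetical draws
cat = rng.randint(0, num_components, size=n)        # categorical index per draw

# A one-hot mask over components selects each draw's component via a sum,
# mirroring the reduce_sum(x * mask) used by the static sampling path.
mask = np.eye(num_components)[cat]                  # [n, k]
selected = (component_draws.T * mask).sum(axis=-1)  # [n]

# Positions line up with the categorical sample, hence indexing with which_c.
for c in range(num_components):
    which_c = np.where(cat == c)[0]
    np.testing.assert_allclose(selected[which_c], component_draws[c][which_c])
```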
@@ -522,7 +549,8 @@ class MixtureTest(test.TestCase): ] cat = ds.Categorical( logits, dtype=dtypes.int32, name="cat1") - dist1 = ds.Mixture(cat, components, name="mixture1") + dist1 = ds.Mixture(cat, components, name="mixture1", + use_static_graph=self.use_static_graph) samples1 = dist1.sample(n, seed=123456).eval() random_seed.set_random_seed(654321) @@ -532,7 +560,8 @@ class MixtureTest(test.TestCase): ] cat2 = ds.Categorical( logits, dtype=dtypes.int32, name="cat2") - dist2 = ds.Mixture(cat2, components2, name="mixture2") + dist2 = ds.Mixture(cat2, components2, name="mixture2", + use_static_graph=self.use_static_graph) samples2 = dist2.sample(n, seed=123456).eval() self.assertAllClose(samples1, samples2) @@ -541,7 +570,8 @@ class MixtureTest(test.TestCase): with self.test_session() as sess: num_components = 3 dist = make_multivariate_mixture( - batch_shape=[], num_components=num_components, event_shape=[2]) + batch_shape=[], num_components=num_components, event_shape=[2], + use_static_graph=self.use_static_graph) n = 4 with _test_capture_mvndiag_sample_outputs() as component_samples: samples = dist.sample(n, seed=123) @@ -555,14 +585,18 @@ class MixtureTest(test.TestCase): which_c = np.where(cat_sample_values == c)[0] size_c = which_c.size # Scalar Batch multivariate case: batch_size == 1, rank 2 - which_dist_samples = dist_sample_values[c][:size_c, :] + if self.use_static_graph: + which_dist_samples = dist_sample_values[c][which_c, :] + else: + which_dist_samples = dist_sample_values[c][:size_c, :] self.assertAllClose(which_dist_samples, sample_values[which_c, :]) def testSampleBatchUnivariate(self): with self.test_session() as sess: num_components = 3 dist = make_univariate_mixture( - batch_shape=[2, 3], num_components=num_components) + batch_shape=[2, 3], num_components=num_components, + use_static_graph=self.use_static_graph) n = 4 with _test_capture_normal_sample_outputs() as component_samples: samples = dist.sample(n, seed=123) @@ -576,8 +610,12 @@ class MixtureTest(test.TestCase): which_c_s, which_c_b0, which_c_b1 = np.where(cat_sample_values == c) size_c = which_c_s.size # Batch univariate case: batch_size == [2, 3], rank 3 - which_dist_samples = dist_sample_values[c][range(size_c), which_c_b0, - which_c_b1] + if self.use_static_graph: + which_dist_samples = dist_sample_values[c][which_c_s, which_c_b0, + which_c_b1] + else: + which_dist_samples = dist_sample_values[c][range(size_c), which_c_b0, + which_c_b1] self.assertAllClose(which_dist_samples, sample_values[which_c_s, which_c_b0, which_c_b1]) @@ -594,7 +632,8 @@ class MixtureTest(test.TestCase): dist = make_multivariate_mixture( batch_shape=batch_shape, num_components=num_components, event_shape=[4], - batch_shape_tensor=batch_shape_tensor) + batch_shape_tensor=batch_shape_tensor, + use_static_graph=self.use_static_graph) n = 5 with _test_capture_mvndiag_sample_outputs() as component_samples: samples = dist.sample(n, seed=123) @@ -617,8 +656,12 @@ class MixtureTest(test.TestCase): which_c_s, which_c_b0, which_c_b1 = np.where(cat_sample_values == c) size_c = which_c_s.size # Batch univariate case: batch_size == [2, 3], rank 4 (multivariate) - which_dist_samples = dist_sample_values[c][range(size_c), which_c_b0, - which_c_b1, :] + if self.use_static_graph: + which_dist_samples = dist_sample_values[c][which_c_s, which_c_b0, + which_c_b1, :] + else: + which_dist_samples = dist_sample_values[c][range(size_c), which_c_b0, + which_c_b1, :] self.assertAllClose(which_dist_samples, sample_values[which_c_s, which_c_b0, which_c_b1, :]) 
@@ -632,7 +675,8 @@ class MixtureTest(test.TestCase): with self.test_session() as sess: for batch_shape in ((), (2,), (2, 3)): dist = make_multivariate_mixture( - batch_shape=batch_shape, num_components=2, event_shape=(4,)) + batch_shape=batch_shape, num_components=2, event_shape=(4,), + use_static_graph=self.use_static_graph) entropy_lower_bound = dist.entropy_lower_bound() self.assertEqual(batch_shape, entropy_lower_bound.get_shape()) @@ -673,7 +717,8 @@ class MixtureTest(test.TestCase): cat_tf = ds.Categorical(probs=mixture_weights) components_tf = [ds.Normal(loc=mu, scale=sigma) for (mu, sigma) in zip(means, sigmas)] - mixture_tf = ds.Mixture(cat=cat_tf, components=components_tf) + mixture_tf = ds.Mixture(cat=cat_tf, components=components_tf, + use_static_graph=self.use_static_graph) x_tensor = array_ops.placeholder(shape=(), dtype=dtypes.float32) @@ -721,7 +766,8 @@ class MixtureTest(test.TestCase): cat_tf = ds.Categorical(probs=mixture_weights) components_tf = [ds.Normal(loc=mu, scale=sigma) for (mu, sigma) in zip(means, sigmas)] - mixture_tf = ds.Mixture(cat=cat_tf, components=components_tf) + mixture_tf = ds.Mixture(cat=cat_tf, components=components_tf, + use_static_graph=self.use_static_graph) x_tensor = array_ops.placeholder(shape=psize, dtype=dtypes.float32) xs_to_check = [ @@ -760,12 +806,18 @@ class MixtureTest(test.TestCase): gm = ds.Mixture( cat=ds.Categorical(probs=[.3, .7]), components=[ds.Gamma(1., 2.), - ds.Gamma(2., 1.)]) + ds.Gamma(2., 1.)], + use_static_graph=self.use_static_graph) x_ = gm.sample().eval() self.assertAllEqual([], x_.shape) +class MixtureStaticSampleTest(MixtureTest): + use_static_graph = True + + class MixtureBenchmark(test.Benchmark): + use_static_graph = False def _runSamplingBenchmark(self, name, create_distribution, use_gpu, num_components, batch_size, num_features, @@ -811,7 +863,7 @@ class MixtureBenchmark(test.Benchmark): components = list( ds.MultivariateNormalDiag( loc=mu, scale_diag=sigma) for (mu, sigma) in zip(mus, sigmas)) - return ds.Mixture(cat, components) + return ds.Mixture(cat, components, use_static_graph=self.use_static_graph) for use_gpu in False, True: if use_gpu and not test.is_gpu_available(): @@ -853,7 +905,7 @@ class MixtureBenchmark(test.Benchmark): ds.MultivariateNormalTriL( loc=mu, scale_tril=linalg_ops.cholesky(sigma)) for (mu, sigma) in zip(mus, sigmas)) - return ds.Mixture(cat, components) + return ds.Mixture(cat, components, use_static_graph=self.use_static_graph) for use_gpu in False, True: if use_gpu and not test.is_gpu_available(): @@ -872,5 +924,9 @@ class MixtureBenchmark(test.Benchmark): sample_size=sample_size) +class MixtureStaticSampleBenchmark(MixtureBenchmark): + use_static_graph = True + + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py index dc8ae1eed19..5251dbcb574 100644 --- a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py +++ b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py @@ -237,6 +237,11 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector): return y event_size = array_ops.shape(x)[-1] + # If the event size is available at graph construction time, we can inform + # the graph compiler of the maximum number of steps. If not, + # static_event_size will be None, and the maximum_iterations argument will + # have no effect. 
+ static_event_size = x.shape.with_rank_at_least(1)[-1].value y0 = array_ops.zeros_like(x, name="y0") # call the template once to ensure creation _ = self._shift_and_log_scale_fn(y0) @@ -258,7 +263,8 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector): _, y = control_flow_ops.while_loop( cond=lambda index, _: index < event_size, body=_loop_body, - loop_vars=[0, y0]) + loop_vars=(0, y0), + maximum_iterations=static_event_size) return y def _inverse(self, y): diff --git a/tensorflow/contrib/distributions/python/ops/distribution_util.py b/tensorflow/contrib/distributions/python/ops/distribution_util.py index a4d249d41ec..289e1d50e11 100644 --- a/tensorflow/contrib/distributions/python/ops/distribution_util.py +++ b/tensorflow/contrib/distributions/python/ops/distribution_util.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib import linalg +from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops @@ -442,6 +443,44 @@ def maybe_check_scalar_distribution( return assertions +def pad_mixture_dimensions(x, mixture_distribution, categorical_distribution, + event_ndims): + """Pad dimensions of event tensors for mixture distributions. + + See `Mixture._sample_n` and `MixtureSameFamily._sample_n` for usage examples. + + Args: + x: event tensor to pad. + mixture_distribution: Base distribution of the mixture. + categorical_distribution: `Categorical` distribution that mixes the base + distribution. + event_ndims: Integer specifying the number of event dimensions in the event + tensor. + + Returns: + A padded version of `x` that can broadcast with `categorical_distribution`. + """ + with ops.name_scope("pad_mix_dims", values=[x]): + def _get_ndims(d): + if d.batch_shape.ndims is not None: + return d.batch_shape.ndims + return array_ops.shape(d.batch_shape_tensor())[0] + dist_batch_ndims = _get_ndims(mixture_distribution) + cat_batch_ndims = _get_ndims(categorical_distribution) + pad_ndims = array_ops.where( + categorical_distribution.is_scalar_batch(), + dist_batch_ndims, + dist_batch_ndims - cat_batch_ndims) + s = array_ops.shape(x) + x = array_ops.reshape(x, shape=array_ops.concat([ + s[:-1], + array_ops.ones([pad_ndims], dtype=dtypes.int32), + s[-1:], + array_ops.ones([event_ndims], dtype=dtypes.int32), + ], axis=0)) + return x + + def static_value(x): """Returns the static value of a `Tensor` or `None`.""" return tensor_util.constant_value(ops.convert_to_tensor(x)) diff --git a/tensorflow/contrib/distributions/python/ops/kumaraswamy.py b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py new file mode 100644 index 00000000000..74d5d8773cf --- /dev/null +++ b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py @@ -0,0 +1,258 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""The Kumaraswamy distribution class.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import random_ops +from tensorflow.python.ops import special_math_ops +from tensorflow.python.ops.distributions import beta +from tensorflow.python.ops.distributions import distribution +from tensorflow.python.ops.distributions import util as distribution_util +from tensorflow.python.util.tf_export import tf_export + +__all__ = [ + "Kumaraswamy", +] + +_kumaraswamy_sample_note = """Note: `x` must have dtype `self.dtype` and be in +`[0, 1].` It must have a shape compatible with `self.batch_shape()`.""" + + +def _harmonic_number(x): + """Compute the harmonic number from its analytic continuation. + + Derivation from [1] and Euler's constant [2]. + [1] - + https://en.wikipedia.org/wiki/Digamma_function#Relation_to_harmonic_numbers + [2] - https://en.wikipedia.org/wiki/Euler%E2%80%93Mascheroni_constant + + + Args: + x: input float. + + Returns: + z: The analytic continuation of the harmonic number for the input. + + """ + one = array_ops.ones([], dtype=x.dtype) + return math_ops.digamma(x + one) - math_ops.digamma(one) + + +@tf_export("distributions.Kumaraswamy") +class Kumaraswamy(beta.Beta): + """Kumaraswamy distribution. + + The Kumaraswamy distribution is defined over the `(0, 1)` interval using + parameters + `concentration1` (aka "alpha") and `concentration0` (aka "beta"). It has a + shape similar to the Beta distribution, but is reparameterizeable. + + #### Mathematical Details + + The probability density function (pdf) is, + + ```none + pdf(x; alpha, beta) = alpha * beta * x**(alpha - 1) * (1 - x**alpha)**(beta - + 1) + ``` + + where: + + * `concentration1 = alpha`, + * `concentration0 = beta`, + + Distribution parameters are automatically broadcast in all functions; see + examples for details. + + #### Examples + + ```python + # Create a batch of three Kumaraswamy distributions. + alpha = [1, 2, 3] + beta = [1, 2, 3] + dist = Kumaraswamy(alpha, beta) + + dist.sample([4, 5]) # Shape [4, 5, 3] + + # `x` has three batch entries, each with two samples. + x = [[.1, .4, .5], + [.2, .3, .5]] + # Calculate the probability of each pair of samples under the corresponding + # distribution in `dist`. + dist.prob(x) # Shape [2, 3] + ``` + + ```python + # Create batch_shape=[2, 3] via parameter broadcast: + alpha = [[1.], [2]] # Shape [2, 1] + beta = [3., 4, 5] # Shape [3] + dist = Kumaraswamy(alpha, beta) + + # alpha broadcast as: [[1., 1, 1,], + # [2, 2, 2]] + # beta broadcast as: [[3., 4, 5], + # [3, 4, 5]] + # batch_Shape [2, 3] + dist.sample([4, 5]) # Shape [4, 5, 2, 3] + + x = [.2, .3, .5] + # x will be broadcast as [[.2, .3, .5], + # [.2, .3, .5]], + # thus matching batch_shape [2, 3]. + dist.prob(x) # Shape [2, 3] + ``` + + """ + + def __init__(self, + concentration1=None, + concentration0=None, + validate_args=False, + allow_nan_stats=True, + name="Kumaraswamy"): + """Initialize a batch of Kumaraswamy distributions. + + Args: + concentration1: Positive floating-point `Tensor` indicating mean + number of successes; aka "alpha". 
Implies `self.dtype` and + `self.batch_shape`, i.e., + `concentration1.shape = [N1, N2, ..., Nm] = self.batch_shape`. + concentration0: Positive floating-point `Tensor` indicating mean + number of failures; aka "beta". Otherwise has same semantics as + `concentration1`. + validate_args: Python `bool`, default `False`. When `True` distribution + parameters are checked for validity despite possibly degrading runtime + performance. When `False` invalid inputs may silently render incorrect + outputs. + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics + (e.g., mean, mode, variance) use the value "`NaN`" to indicate the + result is undefined. When `False`, an exception is raised if one or + more of the statistic's batch members are undefined. + name: Python `str` name prefixed to Ops created by this class. + """ + super(Kumaraswamy, self).__init__( + concentration1=concentration1, + concentration0=concentration0, + validate_args=validate_args, + allow_nan_stats=allow_nan_stats, + name=name) + self._reparameterization_type = distribution.FULLY_REPARAMETERIZED + + def _sample_n(self, n, seed=None): + expanded_concentration1 = array_ops.ones_like( + self.total_concentration, dtype=self.dtype) * self.concentration1 + expanded_concentration0 = array_ops.ones_like( + self.total_concentration, dtype=self.dtype) * self.concentration0 + shape = array_ops.concat([[n], self.batch_shape_tensor()], 0) + uniform_sample = random_ops.random_uniform( + shape=shape, minval=0.0, maxval=1.0, dtype=self.dtype, seed=seed) + + kumaraswamy_sample = (1 - uniform_sample**(1. / expanded_concentration0))**( + 1. / expanded_concentration1) + return kumaraswamy_sample + + @distribution_util.AppendDocstring(_kumaraswamy_sample_note) + def _log_cdf(self, x): + a = self.concentration1 + b = self.concentration0 + return math_ops.log1p(-(1 - x**a)**b) + + @distribution_util.AppendDocstring(_kumaraswamy_sample_note) + def _cdf(self, x): + a = self.concentration1 + b = self.concentration0 + return 1 - (1 - x**a)**b + + def _survival_function(self, x): + a = self.concentration1 + b = self.concentration0 + return (1 - x**a)**b + + def _log_survival_function(self, x): + a = self.concentration1 + b = self.concentration0 + return b * math_ops.log1p(-x**a) + + def _log_unnormalized_prob(self, x): + x = self._maybe_assert_valid_sample(x) + a = self.concentration1 + b = self.concentration0 + return (a - 1) * math_ops.log(x) + (b - 1) * math_ops.log1p(-x**a) + + def _log_normalization(self): + a = self.concentration1 + b = self.concentration0 + return -(math_ops.log(a) + math_ops.log(b)) + + def _entropy(self): + a = self.concentration1 + b = self.concentration0 + return (1 - 1. / a) + ( + 1 - 1. / b) * _harmonic_number(b) + math_ops.log(a) + math_ops.log(b) + + def _moment(self, n): + """Compute the n'th (uncentered) moment.""" + expanded_concentration1 = array_ops.ones_like( + self.total_concentration, dtype=self.dtype) * self.concentration1 + expanded_concentration0 = array_ops.ones_like( + self.total_concentration, dtype=self.dtype) * self.concentration0 + beta_arg0 = 1 + n / expanded_concentration1 + beta_arg = array_ops.stack([beta_arg0, expanded_concentration0], -1) + log_moment = math_ops.log(expanded_concentration0) + special_math_ops.lbeta( + beta_arg) + return math_ops.exp(log_moment) + + def _mean(self): + return self._moment(1) + + def _variance(self): + # TODO(b/72696533): Investigate a more numerically stable version. 
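+ # Computes Var(X) = E[X^2] - (E[X])^2 from the first two raw moments; subtracting nearly equal moments can lose floating-point precision.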
+ return self._moment(2) - math_ops.square(self._moment(1)) + + @distribution_util.AppendDocstring( + """Note: The mode is undefined when `concentration1 <= 1` or + `concentration0 <= 1`. If `self.allow_nan_stats` is `True`, `NaN` + is used for undefined modes. If `self.allow_nan_stats` is `False` an + exception is raised when one or more modes are undefined.""") + def _mode(self): + a = self.concentration1 + b = self.concentration0 + mode = ((a - 1) / (a * b - 1))**(1. / a) + if self.allow_nan_stats: + nan = array_ops.fill( + self.batch_shape_tensor(), + np.array(np.nan, dtype=self.dtype.as_numpy_dtype), + name="nan") + is_defined = (self.concentration1 > 1.) & (self.concentration0 > 1.) + return array_ops.where(is_defined, mode, nan) + return control_flow_ops.with_dependencies([ + check_ops.assert_less( + array_ops.ones([], dtype=self.dtype), + self.concentration1, + message="Mode undefined for concentration1 <= 1."), + check_ops.assert_less( + array_ops.ones([], dtype=self.dtype), + self.concentration0, + message="Mode undefined for concentration0 <= 1.") + ], mode) diff --git a/tensorflow/contrib/distributions/python/ops/mixture.py b/tensorflow/contrib/distributions/python/ops/mixture.py index f2d492f5489..cef6a143fc6 100644 --- a/tensorflow/contrib/distributions/python/ops/mixture.py +++ b/tensorflow/contrib/distributions/python/ops/mixture.py @@ -71,6 +71,7 @@ class Mixture(distribution.Distribution): components, validate_args=False, allow_nan_stats=True, + use_static_graph=False, name="Mixture"): """Initialize a Mixture distribution. @@ -96,6 +97,11 @@ class Mixture(distribution.Distribution): exception if a statistic (e.g. mean/mode/etc...) is undefined for any batch member. If `True`, batch members with valid parameters leading to undefined statistics will return NaN for this statistic. + use_static_graph: Calls to `sample` will not rely on dynamic tensor + indexing, allowing for some static graph compilation optimizations, but + at the expense of sampling all underlying distributions in the mixture. + (Possibly useful when running on TPUs). + Default value: `False` (i.e., use dynamic indexing). name: A name for this distribution (optional). Raises: @@ -178,6 +184,10 @@ class Mixture(distribution.Distribution): self._static_event_shape = static_event_shape self._static_batch_shape = static_batch_shape + self._use_static_graph = use_static_graph + if use_static_graph and static_num_components is None: + raise ValueError("Number of categories must be known statically when " + "`static_sample=True`.") # We let the Mixture distribution access _graph_parents since its arguably # more like a baseclass. graph_parents = self._cat._graph_parents # pylint: disable=protected-access @@ -292,6 +302,31 @@ class Mixture(distribution.Distribution): return mixture_log_cdf def _sample_n(self, n, seed=None): + if self._use_static_graph: + # This sampling approach is almost the same as the approach used by + # `MixtureSameFamily`. The differences are due to having a list of + # `Distribution` objects rather than a single object, and maintaining + # random seed management that is consistent with the non-static code path. 
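+ # The static path below draws categorical indices for all n samples, samples every component, stacks the per-component draws along a new components axis, and then selects each draw's component with a padded one-hot mask and a reduce_sum.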
+ samples = [] + cat_samples = self.cat.sample(n, seed=seed) + for c in range(self.num_components): + seed = distribution_util.gen_new_seed(seed, "mixture") + samples.append(self.components[c].sample(n, seed=seed)) + x = array_ops.stack( + samples, -self._static_event_shape.ndims - 1) # [n, B, k, E] + npdt = x.dtype.as_numpy_dtype + mask = array_ops.one_hot( + indices=cat_samples, # [n, B] + depth=self._num_components, # == k + on_value=np.ones([], dtype=npdt), + off_value=np.zeros([], dtype=npdt)) # [n, B, k] + mask = distribution_utils.pad_mixture_dimensions( + mask, self, self._cat, + self._static_event_shape.ndims) # [n, B, k, [1]*e] + return math_ops.reduce_sum( + x * mask, + axis=-1 - self._static_event_shape.ndims) # [n, B, E] + with ops.control_dependencies(self._assertions): n = ops.convert_to_tensor(n, name="n") static_n = tensor_util.constant_value(n) diff --git a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py index 49afbea7f05..b93bdc5ab40 100644 --- a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py +++ b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np -from tensorflow.python.framework import dtypes +from tensorflow.contrib.distributions.python.ops import distribution_util as distribution_utils from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops @@ -239,7 +239,9 @@ class MixtureSameFamily(distribution.Distribution): depth=self._num_components, # == k on_value=np.ones([], dtype=npdt), off_value=np.zeros([], dtype=npdt)) # [n, B, k] - mask = self._pad_mix_dims(mask) # [n, B, k, [1]*e] + mask = distribution_utils.pad_mixture_dimensions( + mask, self, self.mixture_distribution, + self._event_shape().ndims) # [n, B, k, [1]*e] return math_ops.reduce_sum( x * mask, axis=-1 - self._event_ndims) # [n, B, E] @@ -254,8 +256,9 @@ class MixtureSameFamily(distribution.Distribution): def _mean(self): with ops.control_dependencies(self._runtime_assertions): - probs = self._pad_mix_dims( - self.mixture_distribution.probs) # [B, k, [1]*e] + probs = distribution_utils.pad_mixture_dimensions( + self.mixture_distribution.probs, self, self.mixture_distribution, + self._event_shape().ndims) # [B, k, [1]*e] return math_ops.reduce_sum( probs * self.components_distribution.mean(), axis=-1 - self._event_ndims) # [B, E] @@ -271,8 +274,9 @@ class MixtureSameFamily(distribution.Distribution): def _variance(self): with ops.control_dependencies(self._runtime_assertions): # Law of total variance: Var(Y) = E[Var(Y|X)] + Var(E[Y|X]) - probs = self._pad_mix_dims( - self.mixture_distribution.probs) # [B, k, [1]*e] + probs = distribution_utils.pad_mixture_dimensions( + self.mixture_distribution.probs, self, self.mixture_distribution, + self._event_shape().ndims) # [B, k, [1]*e] mean_cond_var = math_ops.reduce_sum( probs * self.components_distribution.variance(), axis=-1 - self._event_ndims) # [B, E] @@ -291,8 +295,12 @@ class MixtureSameFamily(distribution.Distribution): with ops.control_dependencies(self._runtime_assertions): # Law of total variance: Var(Y) = E[Var(Y|X)] + Var(E[Y|X]) - probs = self._pad_mix_dims(self._pad_mix_dims( - self.mixture_distribution.probs)) # [B, k, 1, 1] + probs = distribution_utils.pad_mixture_dimensions( + distribution_utils.pad_mixture_dimensions( + self.mixture_distribution.probs, self, 
self.mixture_distribution, + self._event_shape().ndims), + self, self.mixture_distribution, + self._event_shape().ndims) # [B, k, 1, 1] mean_cond_var = math_ops.reduce_sum( probs * self.components_distribution.covariance(), axis=-3) # [B, e, e] @@ -312,27 +320,6 @@ class MixtureSameFamily(distribution.Distribution): shape[:d], [1], shape[d:]], axis=0)) return x - def _pad_mix_dims(self, x): - with ops.name_scope("pad_mix_dims", values=[x]): - def _get_ndims(d): - if d.batch_shape.ndims is not None: - return d.batch_shape.ndims - return array_ops.shape(d.batch_shape_tensor())[0] - dist_batch_ndims = _get_ndims(self) - cat_batch_ndims = _get_ndims(self.mixture_distribution) - pad_ndims = array_ops.where( - self.mixture_distribution.is_scalar_batch(), - dist_batch_ndims, - dist_batch_ndims - cat_batch_ndims) - s = array_ops.shape(x) - x = array_ops.reshape(x, shape=array_ops.concat([ - s[:-1], - array_ops.ones([pad_ndims], dtype=dtypes.int32), - s[-1:], - array_ops.ones([self._event_ndims], dtype=dtypes.int32), - ], axis=0)) - return x - def _outer_squared_difference(x, y): """Convenience function analogous to tf.squared_difference.""" diff --git a/tensorflow/contrib/eager/README.md b/tensorflow/contrib/eager/README.md index 09242ee47dd..9d2ca07c3a2 100644 --- a/tensorflow/contrib/eager/README.md +++ b/tensorflow/contrib/eager/README.md @@ -41,28 +41,8 @@ support for distributed and multi-GPU training and CPU performance. ## Installation -Since eager execution is not yet part of a TensorFlow release, using it requires -either [building from source](https://www.tensorflow.org/install/install_sources) -or the latest nightly builds. The nightly builds are available as: - -- [`pip` packages](https://github.com/tensorflow/tensorflow/blob/master/README.md#installation) and - -- [docker](https://hub.docker.com/r/tensorflow/tensorflow/) images. - -For example, to run the latest nightly docker image: - -```sh -# If you have a GPU, use https://github.com/NVIDIA/nvidia-docker -nvidia-docker pull tensorflow/tensorflow:nightly-gpu -nvidia-docker run -it -p 8888:8888 tensorflow/tensorflow:nightly-gpu - -# If you do not have a GPU, use the CPU-only image -docker pull tensorflow/tensorflow:nightly -docker run -it -p 8888:8888 tensorflow/tensorflow:nightly -``` - -And then visit http://localhost:8888 in your browser for a Jupyter notebook -environment. Try out the notebooks below. +Eager execution is included in TensorFlow versions 1.5 and above. 
+Installation instructions at https://www.tensorflow.org/install/ ## Documentation diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py index 544a3eafc08..d177bfeab2d 100644 --- a/tensorflow/contrib/eager/python/datasets.py +++ b/tensorflow/contrib/eager/python/datasets.py @@ -112,7 +112,7 @@ class Iterator(object): remote_fn.add_to_graph(None) target = constant_op.constant("/device:CPU:0") with ops.device(self._device): - self._buffer_resource_handle = prefetching_ops.function_buffering_resource( + self._buffer_resource_handle = prefetching_ops.function_buffering_resource( # pylint: disable=line-too-long string_arg=iter_string_handle, f=remote_fn, target_device=target, @@ -120,8 +120,9 @@ class Iterator(object): thread_pool_size=1, container="", shared_name=_generate_shared_name("function_buffer_resource")) - self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter( - handle=self._buffer_resource_handle, handle_device=self._device) + self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter( # pylint: disable=line-too-long + handle=self._buffer_resource_handle, + handle_device=self._device) def __iter__(self): return self diff --git a/tensorflow/contrib/eager/python/examples/mnist/mnist.py b/tensorflow/contrib/eager/python/examples/mnist/mnist.py index 2a7be95811f..ed7dbc89046 100644 --- a/tensorflow/contrib/eager/python/examples/mnist/mnist.py +++ b/tensorflow/contrib/eager/python/examples/mnist/mnist.py @@ -95,8 +95,7 @@ class MNISTModel(tfe.Network): x = self.max_pool2d(x) x = tf.layers.flatten(x) x = self.fc1(x) - if training: - x = self.dropout(x) + x = self.dropout(x, training=training) x = self.fc2(x) return x diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py index 76e06269b6b..1f7beee6852 100644 --- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py +++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py @@ -22,6 +22,7 @@ import gc import tempfile import time +from six.moves import xrange import tensorflow as tf import tensorflow.contrib.eager as tfe diff --git a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py index 84e25cf81a2..19b0104c807 100644 --- a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py +++ b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py @@ -26,6 +26,7 @@ import tempfile import time import numpy as np +from six.moves import xrange import tensorflow as tf # pylint: disable=g-bad-import-order diff --git a/tensorflow/contrib/eager/python/g3doc/guide.md b/tensorflow/contrib/eager/python/g3doc/guide.md index 7eea93ce1f5..ffc1d0332ea 100644 --- a/tensorflow/contrib/eager/python/g3doc/guide.md +++ b/tensorflow/contrib/eager/python/g3doc/guide.md @@ -19,29 +19,34 @@ to models defined without using eager execution. ## Installation -Eager execution is **not** included in the latest release (version 1.4) of -TensorFlow. To use it, you will need to [build TensorFlow from -source](https://www.tensorflow.org/install/install_sources) or install the -nightly builds. +Eager execution is included in TensorFlow versions 1.5 and above. +Installation instructions at https://www.tensorflow.org/install/ -For example, the nightly builds can be installed using `pip`: +The contents of this guide are compatible with TensorFlow 1.5. 
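For readers checking such an install, the following is a minimal sketch of enabling eager execution and running one op immediately; it relies only on the `tfe.enable_eager_execution()` entry point documented in `tensorflow/contrib/eager/python/tfe.py`, and the matrix values are arbitrary.

```python
# Minimal sketch: verify eager execution in a TensorFlow 1.5+ install.
# enable_eager_execution() must be called once, at program startup,
# before any other TensorFlow operations are created.
import tensorflow as tf
import tensorflow.contrib.eager as tfe

tfe.enable_eager_execution()

x = tf.constant([[2.0, 0.0], [0.0, 2.0]])
print(tf.matmul(x, x))  # Runs immediately and prints the concrete result.
```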
+However, if you run into bugs that are fixed in source but not the
+release, you may want to either [build from
+source](https://www.tensorflow.org/install/install_sources)
+or try the latest nightly builds. The nightly builds are available as:
 
-- `pip install tf-nightly` (for CPU-only TensorFlow)
-- `pip install tf-nightly-gpu` (for GPU-enabled TensorFlow)
+- [`pip` packages](https://github.com/tensorflow/tensorflow/blob/master/README.md#installation) and
 
-Or using `docker`, with [Jupyter Notebook](http://jupyter.org/) support:
+- [docker](https://hub.docker.com/r/tensorflow/tensorflow/) images.
+
+For example, to run the latest nightly docker image:
 
 ```sh
-# For CPU-only TensorFlow
+# If you have a GPU, use https://github.com/NVIDIA/nvidia-docker
+docker pull tensorflow/tensorflow:nightly-gpu
+docker run --runtime=nvidia -it -p 8888:8888 tensorflow/tensorflow:nightly-gpu
+
+# If you do not have a GPU, use the CPU-only image
 docker pull tensorflow/tensorflow:nightly
 docker run -it -p 8888:8888 tensorflow/tensorflow:nightly
-
-# For GPU-enabled TensorFlow:
-# (Requires https://github.com/NVIDIA/nvidia-docker)
-nvidia-docker pull tensorflow/tensorflow:nightly-gpu
-nvidia-docker run -it -p 8888:8888 tensorflow/tensorflow:nightly-gpu
 ```
 
+And then visit http://localhost:8888 in your browser for a Jupyter notebook
+environment.
+
 ## Getting Started
 
 With TensorFlow installed, eager execution is enabled via a single call:
diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index bf029ca5f9d..ea8dbf2b46e 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -291,6 +291,9 @@ class Mean(Metric):
     Args:
       values: Tensor with the per-example value.
       weights: Optional weighting of each example. Defaults to 1.
+
+    Returns:
+      The arguments, for easy chaining.
     """
     if weights is None:
       self.denom.assign_add(
@@ -302,6 +305,9 @@ class Mean(Metric):
       self.denom.assign_add(math_ops.reduce_sum(weights))
       values = math_ops.cast(values, self.dtype) * weights
       self.numer.assign_add(math_ops.reduce_sum(values))
+    if weights is None:
+      return values
+    return values, weights
 
   def result(self):
     t = self.numer / self.denom
@@ -329,7 +335,13 @@ class Accuracy(Mean):
         per element of the Tensor.
       predictions: Tensor with the predicted label for each example.
       weights: Optional weighting of each example. Defaults to 1.
+
+    Returns:
+      The arguments, for easy chaining.
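To show what the new return value enables, here is a rough sketch of chaining one metric's update into another, mirroring the `testMetricsChain` case added below; the module path is taken from the files touched in this hunk, and the expected values in the comments are illustrative.

```python
# Sketch: because call() now returns its arguments, one metric's update can
# feed directly into another computation without re-plumbing the input.
import tensorflow.contrib.eager as tfe
from tensorflow.contrib.eager.python import metrics

tfe.enable_eager_execution()

batch_loss = metrics.Mean()
epoch_loss = metrics.Mean()

for loss in [0.3, 0.1]:
  # batch_loss(loss) updates the metric and returns `loss`, so the same value
  # can be passed straight through to a second metric.
  epoch_loss(batch_loss(loss))

print(batch_loss.result())  # expected ~0.2
print(epoch_loss.result())  # expected ~0.2
```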
""" matches = math_ops.equal(labels, predictions) matches = math_ops.cast(matches, dtypes.float64) super(Accuracy, self).call(matches, weights=weights) + if weights is None: + return labels, predictions + return labels, predictions, weights diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py index 9cf34fd9b2d..a9ecaa3f8bc 100644 --- a/tensorflow/contrib/eager/python/metrics_test.py +++ b/tensorflow/contrib/eager/python/metrics_test.py @@ -180,6 +180,19 @@ class MetricsTest(test.TestCase): m2 = metrics.Mean() m2(2) + def testMetricsChain(self): + with context.graph_mode(), self.test_session(): + m1 = metrics.Mean() + m2 = metrics.Mean(name="m2") + update_m2 = m2(3.0) + update_m2_2 = m2(m1(1.0)) + m1.init_variables().run() + m2.init_variables().run() + update_m2.eval() + update_m2_2.eval() + self.assertAllEqual(m2.result().eval(), 2.0) + self.assertAllEqual(m1.result().eval(), 1.0) + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/eager/python/saver_test.py b/tensorflow/contrib/eager/python/saver_test.py index abc7e3690c7..1a7f7b85e68 100644 --- a/tensorflow/contrib/eager/python/saver_test.py +++ b/tensorflow/contrib/eager/python/saver_test.py @@ -73,16 +73,6 @@ class SaverTest(test.TestCase): with self.assertRaisesRegexp(ValueError, 'v1'): saver.save(ckpt_prefix) - def testDifferentGraphError(self): - with ops.device(self._dev()): - with ops.Graph().as_default(): - v1 = resource_variable_ops.ResourceVariable(1.0, name='v1') - with ops.Graph().as_default(): - saver = _saver.Saver([v1]) - ckpt_prefix = os.path.join(test.get_temp_dir(), 'ckpt') - with self.assertRaisesRegexp(ValueError, 'Graph'): - saver.save(ckpt_prefix) - def testSameObjectOK(self): with ops.device(self._dev()): v1 = resource_variable_ops.ResourceVariable(1.0, name='v1') diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py index 712d1cb94d2..d32bebf90c1 100644 --- a/tensorflow/contrib/eager/python/tfe.py +++ b/tensorflow/contrib/eager/python/tfe.py @@ -59,7 +59,6 @@ To use, at program startup, call `tfe.enable_eager_execution()`. @@in_eager_mode @@in_graph_mode -@@IsolateTest @@run_test_in_graph_and_eager_modes @@DEVICE_PLACEMENT_EXPLICIT @@ -101,7 +100,6 @@ from tensorflow.python.eager.execution_callbacks import nan_callback from tensorflow.python.eager.execution_callbacks import seterr from tensorflow.python.framework.ops import enable_eager_execution from tensorflow.python.framework.ops import eager_run as run -from tensorflow.python.framework.test_util import IsolateTest from tensorflow.python.framework.test_util import run_in_graph_and_eager_modes as run_test_in_graph_and_eager_modes from tensorflow.python.ops.resource_variable_ops import ResourceVariable as Variable from tensorflow.python.ops.variable_scope import EagerVariableStore diff --git a/tensorflow/contrib/eager/python/tfe_test.py b/tensorflow/contrib/eager/python/tfe_test.py index 0dedb2fd7c0..b6659c2a179 100644 --- a/tensorflow/contrib/eager/python/tfe_test.py +++ b/tensorflow/contrib/eager/python/tfe_test.py @@ -102,10 +102,6 @@ class TFETest(test_util.TensorFlowTestCase): # Expect at least one device. 
self.assertTrue(tfe.list_devices()) - def testNumGPUs(self): - devices = tfe.list_devices() - self.assertEqual(len(devices) - 1, tfe.num_gpus()) - def testAddCheckNumericsOpsRaisesError(self): with self.assertRaisesRegexp( RuntimeError, diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py index caa9dd83233..c9153c93527 100644 --- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py +++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py @@ -457,6 +457,13 @@ def _get_local_devices(device_type): def _split_batch(features, labels, number_of_shards, device): """Split input features and labes into batches.""" + def ensure_divisible_by_shards(sequence): + batch_size = ops_lib.convert_to_tensor(sequence).get_shape()[0] + if batch_size % number_of_shards != 0: + raise ValueError( + 'Batch size {} needs to be divisible by the number of GPUs, which ' + 'is {}.'.format(batch_size, number_of_shards)) + def split_dictionary(dictionary): """Split a dictionary into shards.""" shards = [{} for _ in range(number_of_shards)] @@ -467,6 +474,7 @@ def _split_batch(features, labels, number_of_shards, device): sp_input=tensor, num_split=number_of_shards, axis=0)): shards[i][name] = shard else: + ensure_divisible_by_shards(tensor) for i, shard in enumerate(array_ops.split(tensor, number_of_shards)): shards[i][name] = shard return shards @@ -476,6 +484,7 @@ def _split_batch(features, labels, number_of_shards, device): if isinstance(features, dict): feature_shards = split_dictionary(features) else: + ensure_divisible_by_shards(features) feature_shards = array_ops.split(features, number_of_shards) if labels is None: @@ -483,6 +492,7 @@ def _split_batch(features, labels, number_of_shards, device): elif isinstance(labels, dict): label_shards = split_dictionary(labels) else: + ensure_divisible_by_shards(labels) label_shards = array_ops.split(labels, number_of_shards) return feature_shards, label_shards diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py index 03d31226af6..6936f8a1312 100644 --- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py +++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py @@ -37,6 +37,7 @@ from tensorflow.python.feature_column import feature_column from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops as ops_lib +from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops @@ -433,6 +434,17 @@ class ReplicateModelTest(test_util.TensorFlowTestCase): 'probabilities': np.array([[0.1], [0.02]]) }, session.run(estimator_spec.predictions)) + def test_batch_size_that_is_not_divisible_by_the_number_of_gpus(self): + features = np.array([[1.0], [2.0], [3.0]]) + labels = np.array([[1.0], [2.0], [3.0]]) + + with self.assertRaisesRegexp( + ValueError, '.*Batch.+size.+needs.+to.+be.+divisible.+by.+GPUs.+'): + replicated_model_fn = replicate_model_fn.replicate_model_fn( + self.model_fn, devices=['/gpu:0', '/gpu:1']) + _ = replicated_model_fn( + features, labels, model_fn_lib.ModeKeys.TRAIN, self.params) + def test_unsupported_loss_reduction(self): with self.assertRaisesRegexp(ValueError, 
'.+none.+reduction.+is.+specified.+'): @@ -981,8 +993,13 @@ class SplitBatchTest(test_util.TensorFlowTestCase): return list(map(evaluate_items, first_list)), list( map(evaluate_items, second_list)) + def assertSparseValuesEqual(self, a, b): + self.assertAllEqual(a.indices, b.indices) + self.assertAllEqual(a.values, b.values) + self.assertAllEqual(a.dense_shape, b.dense_shape) + def test_simple_half_split(self): - with self.test_session() as session: # pylint: disable=unused-variable + with self.test_session(): features = [0.0, 1.0, 2.0, 3.0] labels = [10.0, 11.0, 12.0, 13.0] feature_shards, label_shards = replicate_model_fn._split_batch( @@ -995,7 +1012,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase): self.assertAllEqual([[10.0, 11.0], [12.0, 13.0]], label_shards) def test_to_each_their_own(self): - with self.test_session() as session: # pylint: disable=unused-variable + with self.test_session(): features = [0.0, 1.0, 2.0, 3.0] labels = [10.0, 11.0, 12.0, 13.0] feature_shards, label_shards = replicate_model_fn._split_batch( @@ -1008,7 +1025,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase): self.assertAllEqual([[10.0], [11.0], [12.0], [13.0]], label_shards) def test_one_batch(self): - with self.test_session() as session: # pylint: disable=unused-variable + with self.test_session(): features = [0.0, 1.0, 2.0, 3.0] labels = [10.0, 11.0, 12.0, 13.0] feature_shards, label_shards = replicate_model_fn._split_batch( @@ -1021,7 +1038,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase): self.assertAllEqual([[10.0, 11.0, 12.0, 13.0]], label_shards) def test_half_split_in_dictionary(self): - with self.test_session() as session: # pylint: disable=unused-variable + with self.test_session(): features = {'first': [0.0, 1.0, 2.0, 3.0], 'second': [4.0, 5.0, 6.0, 7.0]} labels = [10.0, 11.0, 12.0, 13.0] @@ -1035,6 +1052,60 @@ class SplitBatchTest(test_util.TensorFlowTestCase): self.assertAllEqual([10.0, 11.0], label_shards[0].eval()) self.assertAllEqual([12.0, 13.0], label_shards[1].eval()) + def test_sparse_tensor_can_be_split_unevenly(self): + with self.test_session(): + features = { + 'x': + sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 2], [2, 2]], + values=[1.0, 2.0, 3.0], + dense_shape=[3, 4]) + } + labels = np.array([[1.0], [2.0]]) + + feature_shards, label_shards = replicate_model_fn._split_batch( + features, labels, 2, device='/gpu:0') + + self.assertSparseValuesEqual( + sparse_tensor.SparseTensorValue( + indices=[[0, 0], [1, 2]], values=[1., 2.], dense_shape=[2, 4]), + feature_shards[0]['x'].eval()) + self.assertSparseValuesEqual( + sparse_tensor.SparseTensorValue( + indices=[[0, 2]], values=[3.], dense_shape=[1, 4]), + feature_shards[1]['x'].eval()) + self.assertAllEqual([[1.0]], label_shards[0].eval()) + self.assertAllEqual([[2.0]], label_shards[1].eval()) + + def test_sparse_tensor_can_be_split_unevenly_repeated_row(self): + with self.test_session(): + features = { + 'x': + sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 0], [1, 1]], + values=[1.0, 2.0, 3.0], + dense_shape=[3, 4]) + } + labels = np.array([[1.0], [2.0]]) + + feature_shards, label_shards = replicate_model_fn._split_batch( + features, labels, 2, device='/gpu:0') + + print(feature_shards[0]['x'].eval()) + print(feature_shards[1]['x'].eval()) + self.assertSparseValuesEqual( + sparse_tensor.SparseTensorValue( + indices=[[0, 0], [1, 0], [1, 1]], + values=[1., 2., 3.], + dense_shape=[2, 4]), feature_shards[0]['x'].eval()) + + second_batch = feature_shards[1]['x'].eval() + 
self.assertFalse(len(second_batch.indices)) + self.assertFalse(len(second_batch.values)) + self.assertAllEqual([1, 4], second_batch.dense_shape) + self.assertAllEqual([[1.0]], label_shards[0].eval()) + self.assertAllEqual([[2.0]], label_shards[1].eval()) + def test_one_batch_in_dictionary(self): with self.test_session() as session: # pylint: disable=unused-variable features = {'first': [0.0, 1.0, 2.0, 3.0], 'second': [4.0, 5.0, 6.0, 7.0]} diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD index fe86a20ab1f..180f1b68f3b 100644 --- a/tensorflow/contrib/factorization/BUILD +++ b/tensorflow/contrib/factorization/BUILD @@ -221,6 +221,7 @@ py_test( name = "kmeans_test", size = "medium", srcs = ["python/ops/kmeans_test.py"], + shard_count = 4, srcs_version = "PY2AND3", tags = ["notsan"], # b/67512932 deps = [ diff --git a/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc b/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc index 31d08bfb65e..a8c5d0763c2 100644 --- a/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc +++ b/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc @@ -57,11 +57,11 @@ typedef Eigen::Map< class MaskedMatmulOp : public OpKernel { public: - explicit MaskedMatmulOp(OpKernelConstruction* context) - : OpKernel(context) { - OP_REQUIRES_OK(context, context->MatchSignature( - {DT_FLOAT, DT_FLOAT, DT_INT64, DT_BOOL, DT_BOOL}, - {DT_FLOAT})); + explicit MaskedMatmulOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK( + context, + context->MatchSignature( + {DT_FLOAT, DT_FLOAT, DT_INT64, DT_BOOL, DT_BOOL}, {DT_FLOAT})); } void Compute(OpKernelContext* context) override { @@ -110,12 +110,11 @@ class MaskedMatmulOp : public OpKernel { num_nonzero_elements, 2); Tensor* prod_values_tensor; - OP_REQUIRES_OK(context, - context->allocate_output( - 0, TensorShape({num_nonzero_elements}), - &prod_values_tensor)); - EigenMatFloatMap prod_values(prod_values_tensor->vec().data(), - 1, num_nonzero_elements); + OP_REQUIRES_OK(context, context->allocate_output( + 0, TensorShape({num_nonzero_elements}), + &prod_values_tensor)); + EigenMatFloatMap prod_values(prod_values_tensor->vec().data(), 1, + num_nonzero_elements); auto get_a_index = [&indices_mat, &a_dim_0](int64 i) { int64 a_index = internal::SubtleMustCopy(indices_mat(i, 0)); @@ -182,8 +181,8 @@ class MaskedMatmulOp : public OpKernel { } }; // Shard the work. - worker_threads.workers->ParallelFor( - num_nonzero_elements, cost_per_unit, work); + worker_threads.workers->ParallelFor(num_nonzero_elements, cost_per_unit, + work); } }; REGISTER_KERNEL_BUILDER(Name("MaskedMatmul").Device(DEVICE_CPU), diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py index 4d0f9b24240..c861cfff544 100644 --- a/tensorflow/contrib/factorization/python/ops/kmeans.py +++ b/tensorflow/contrib/factorization/python/ops/kmeans.py @@ -143,7 +143,7 @@ class _ModelFn(object): def model_fn(self, features, mode, config): """Model function for the estimator. - Note that this does not take a `1abels` arg. This works, but `input_fn` must + Note that this does not take a `labels` arg. This works, but `input_fn` must return either `features` or, equivalently, `(features, None)`. 
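A small sketch of an `input_fn` that satisfies this contract follows; the `KMeansClustering` estimator and its `train()` call are assumed from the public `tf.contrib.factorization` API rather than shown in this hunk, and the random points are placeholder data.

```python
import numpy as np
import tensorflow as tf

def input_fn():
  # K-means is unsupervised, so there are no labels: return (features, None)
  # (returning just the features tensor would work equally well).
  points = np.random.rand(1000, 2).astype(np.float32)
  return tf.convert_to_tensor(points), None

kmeans = tf.contrib.factorization.KMeansClustering(num_clusters=5)
kmeans.train(input_fn, steps=10)
```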
Args: diff --git a/tensorflow/contrib/ffmpeg/decode_video_op.cc b/tensorflow/contrib/ffmpeg/decode_video_op.cc index d44032968d5..6f8ad486d10 100644 --- a/tensorflow/contrib/ffmpeg/decode_video_op.cc +++ b/tensorflow/contrib/ffmpeg/decode_video_op.cc @@ -102,16 +102,12 @@ REGISTER_OP("DecodeVideo") return Status::OK(); }) .Doc(R"doc( -Processes the contents of an audio file into a tensor using FFmpeg to decode +Processes the contents of an video file into a tensor using FFmpeg to decode the file. -One row of the tensor is created for each channel in the audio file. Each -channel contains audio samples starting at the beginning of the audio and -having `1/samples_per_second` time between them. If the `channel_count` is -different from the contents of the file, channels will be merged or created. - -contents: The binary audio file contents, as a string or rank-0 string - tensor. +contents: The binary contents of the video file to decode. This is a + scalar. +output: A rank-4 `Tensor` that has `[frames, height, width, 3]` RGB as output. )doc"); } // namespace ffmpeg diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc index c85b1837ab5..e61221a6b0d 100644 --- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc +++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc @@ -47,20 +47,19 @@ std::vector FfmpegAudioCommandLine(const string& input_filename, int32 channel_count, const string& stream) { std::vector command({ - "-nostats", // No additional progress display. - "-nostdin", // No interactive commands accepted. - "-f", input_format_id, // eg: "mp3" - "-probesize", StrCat(kDefaultProbeSize), "-i", input_filename, - "-loglevel", "error", // Print errors only. - "-hide_banner", // Skip printing build options, version, etc. - "-map_metadata", "-1", // Copy global metadata from input to output. - "-vn", // No video recording. - "-ac:a:0", StrCat(channel_count), "-ar:a:0", - StrCat(samples_per_second), - // Output set (in several ways) to signed 16-bit little-endian ints. - "-codec:a:0", "pcm_s16le", "-sample_fmt", "s16", "-f", "s16le", - "-sn", // No subtitle recording. - "-y" // Overwrite output file. + "-nostats", // No additional progress display. + "-nostdin", // No interactive commands accepted. + "-f", input_format_id, // eg: "mp3" + "-probesize", StrCat(kDefaultProbeSize), "-i", input_filename, + "-loglevel", "error", // Print errors only. + "-hide_banner", // Skip printing build options, version, etc. + "-map_metadata", "-1", // Copy global metadata from input to output. + "-vn", // No video recording. + "-ac:a:0", StrCat(channel_count), "-ar:a:0", StrCat(samples_per_second), + // Output set (in several ways) to signed 16-bit little-endian ints. + "-codec:a:0", "pcm_s16le", "-sample_fmt", "s16", "-f", "s16le", + "-sn", // No subtitle recording. + "-y" // Overwrite output file. }); if (!stream.empty()) { command.emplace_back("-map"); @@ -75,21 +74,13 @@ std::vector FfmpegVideoCommandLine(const string& input_filename, const string& output_filename) { return {"-nostats", // No additional progress display. "-nostdin", // No interactive commands accepted. - "-i", - input_filename, - "-f", - "image2pipe", - "-probesize", - StrCat(kDefaultProbeSize), - "-loglevel", + "-i", input_filename, "-f", "image2pipe", "-probesize", + StrCat(kDefaultProbeSize), "-loglevel", // Info is needed to get the information about stream, etc. // It is generated to a separate file, not stdout/stderr. 
"info", "-hide_banner", // Skip printing build options, version, etc. - "-vcodec", - "rawvideo", - "-pix_fmt", - "rgb24", + "-vcodec", "rawvideo", "-pix_fmt", "rgb24", "-y", // Overwrite output file. StrCat(output_filename)}; } diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc index 85b61b26163..05728b3d375 100644 --- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc +++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc @@ -32,10 +32,8 @@ namespace tensorflow { namespace ffmpeg { namespace { -const char kTestWavFilename[] = - "contrib/ffmpeg/testdata/mono_10khz.wav"; -const char kTestMp3Filename[] = - "contrib/ffmpeg/testdata/test_sound1.mp3"; +const char kTestWavFilename[] = "contrib/ffmpeg/testdata/mono_10khz.wav"; +const char kTestMp3Filename[] = "contrib/ffmpeg/testdata/test_sound1.mp3"; // Set to true via a command line flag iff the test is expected to have FFmpeg // installed. @@ -139,7 +137,7 @@ TEST(FfmpegLibTest, TestRoundTripWav) { } // namespace ffmpeg } // namespace tensorflow -int main(int argc, char **argv) { +int main(int argc, char** argv) { tensorflow::string usage = tensorflow::ffmpeg::ParseTestFlags(&argc, argv); testing::InitGoogleTest(&argc, argv); if (argc != 1) { diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc index 36fc71794b0..d6c885a3242 100644 --- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc +++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc @@ -20,8 +20,6 @@ #include #include - -#include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/env.h" diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py index 673c5178422..503b868aaa6 100644 --- a/tensorflow/contrib/framework/__init__.py +++ b/tensorflow/contrib/framework/__init__.py @@ -53,6 +53,7 @@ See the @{$python/contrib.framework} guide. @@assign_from_values_fn @@create_global_step @@filter_variables +@@fuse_op @@get_global_step @@get_or_create_global_step @@get_local_variables diff --git a/tensorflow/contrib/framework/kernels/zero_initializer_op.cc b/tensorflow/contrib/framework/kernels/zero_initializer_op.cc index 6677dca752f..5bf6b675295 100644 --- a/tensorflow/contrib/framework/kernels/zero_initializer_op.cc +++ b/tensorflow/contrib/framework/kernels/zero_initializer_op.cc @@ -21,8 +21,8 @@ limitations under the License. 
#include "tensorflow/contrib/framework/kernels/zero_initializer_op.h" -#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" namespace tensorflow { @@ -81,8 +81,8 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); #define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); #undef REGISTER_GPU_KERNELS -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #undef REGISTER_KERNELS -} // namespace tensorflow +} // namespace tensorflow diff --git a/tensorflow/contrib/framework/kernels/zero_initializer_op.h b/tensorflow/contrib/framework/kernels/zero_initializer_op.h index 14c9268efa8..99389a5ab6a 100644 --- a/tensorflow/contrib/framework/kernels/zero_initializer_op.h +++ b/tensorflow/contrib/framework/kernels/zero_initializer_op.h @@ -29,5 +29,5 @@ struct TensorSetZero { }; } // namespace functor -} // end namespace tensorflow -#endif // TENSORFLOW_CONTRIB_FRAMEWORK_KERNELS_ZERO_INITIALIZER_OP_H_ +} // end namespace tensorflow +#endif // TENSORFLOW_CONTRIB_FRAMEWORK_KERNELS_ZERO_INITIALIZER_OP_H_ diff --git a/tensorflow/contrib/framework/ops/variable_ops.cc b/tensorflow/contrib/framework/ops/variable_ops.cc index 1ee8e1498cf..706134ba9a5 100644 --- a/tensorflow/contrib/framework/ops/variable_ops.cc +++ b/tensorflow/contrib/framework/ops/variable_ops.cc @@ -26,8 +26,8 @@ REGISTER_OP("ZeroInitializer") .Attr("T: realnumbertype") .SetAllowsUninitializedInput() .SetShapeFn([](InferenceContext* c) { - c->set_output(0, c->input(0)); - return Status::OK(); + c->set_output(0, c->input(0)); + return Status::OK(); }) .Doc(R"doc( Initialize 'ref' with all zeros. This op requires that the tensor is not diff --git a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py index b5e9f8df792..6f65fe771eb 100644 --- a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py +++ b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py @@ -31,7 +31,6 @@ from tensorflow.python.ops import variables from tensorflow.python.platform import googletest - class AccumulateNV2Test(test_util.TensorFlowTestCase): """Tests of the new, differentiable version of accumulate_n""" @@ -62,8 +61,9 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase): accum_n = av2.accumulate_n_v2(input_vars) sess.run(variables.global_variables_initializer()) accum_n_grad = gradients.gradients(accum_n, input_vars) - self.assertAllEqual(np.repeat(1.0, num_inputs), # d/dx (x + y + ...) = 1 - [g.eval() for g in accum_n_grad]) + self.assertAllEqual( + np.repeat(1.0, num_inputs), # d/dx (x + y + ...) = 1 + [g.eval() for g in accum_n_grad]) # The tests below used to be in a separate class under cwise_ops_test.py, # which did not run in the default test target. 
@@ -75,8 +75,8 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase): np.random.rand(16, 16, 16, 16).astype(np.float32) for _ in range(20) ] random_tensors = [ - ops.convert_to_tensor( - x, dtype=dtypes_lib.float32) for x in random_arrays + ops.convert_to_tensor(x, dtype=dtypes_lib.float32) + for x in random_arrays ] tf_val = av2.accumulate_n_v2(random_tensors) np_val = random_arrays[0] @@ -95,21 +95,21 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase): with self.assertRaises(ValueError): a = variables.Variable(0.2) b = variables.Variable(0.1) - tf_val = av2.accumulate_n_v2([a,b], shape=[2,2]) # Should be shape=[] + tf_val = av2.accumulate_n_v2([a, b], shape=[2, 2]) # Should be shape=[] def testIncompatibleShapes(self): with self.test_session(): with self.assertRaises(ValueError): - a = variables.Variable(np.array([0.1,0.2])) - b = variables.Variable(np.array([[0.3],[0.4]])) - tf_val = av2.accumulate_n_v2([a,b]) + a = variables.Variable(np.array([0.1, 0.2])) + b = variables.Variable(np.array([[0.3], [0.4]])) + tf_val = av2.accumulate_n_v2([a, b]) def testWrongType(self): with self.test_session(): with self.assertRaises(TypeError): a = variables.Variable(0.2, dtype=np.float32) b = variables.Variable(0.1, dtype=np.float32) - tf_val = av2.accumulate_n_v2([a,b], tensor_dtype=np.int32) + tf_val = av2.accumulate_n_v2([a, b], tensor_dtype=np.int32) def testWrongTypeOneInput(self): # Scenario that used to trigger a bug, even when testWrongType() worked diff --git a/tensorflow/contrib/framework/python/ops/arg_scope.py b/tensorflow/contrib/framework/python/ops/arg_scope.py index 2bce00fde24..409657fe1da 100644 --- a/tensorflow/contrib/framework/python/ops/arg_scope.py +++ b/tensorflow/contrib/framework/python/ops/arg_scope.py @@ -53,7 +53,8 @@ net = layers.conv2d(net, 256, [5, 5], scope='conv2') ``` - Example of how to use tf.contrib.framework.add_arg_scope to enable your function to be called within an arg_scope later: + Example of how to use tf.contrib.framework.add_arg_scope to enable your + function to be called within an arg_scope later: @tf.contrib.framework.add_arg_scope def conv2d(*args, **kwargs) @@ -65,11 +66,10 @@ from __future__ import print_function from tensorflow.python.util import tf_contextlib from tensorflow.python.util import tf_decorator -__all__ = ['arg_scope', - 'add_arg_scope', - 'current_arg_scope', - 'has_arg_scope', - 'arg_scoped_arguments'] +__all__ = [ + 'arg_scope', 'add_arg_scope', 'current_arg_scope', 'has_arg_scope', + 'arg_scoped_arguments' +] _ARGSTACK = [{}] @@ -172,6 +172,7 @@ def add_arg_scope(func): Returns: A tuple with the decorated function func_with_args(). 
""" + def func_with_args(*args, **kwargs): current_scope = current_arg_scope() current_args = kwargs @@ -180,6 +181,7 @@ def add_arg_scope(func): current_args = current_scope[key_func].copy() current_args.update(kwargs) return func(*args, **current_args) + _add_op(func) setattr(func_with_args, '_key_op', _key_op(func)) return tf_decorator.make_decorator(func, func_with_args) diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py index a65d4bc50ff..96cdd8b1ca4 100644 --- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py +++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py @@ -116,7 +116,7 @@ def build_fused_conv_bias_relu_graph(device, input_shape, filter_shape, strides, for _ in range(1, num_iters): with ops.control_dependencies([fused_out]): # pylint: disable=g-line-too-long - fused_out = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation( + fused_out = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation( # pylint: disable=line-too-long inp, filt, bias, @@ -166,10 +166,10 @@ class FusedConv2DBiasActivationBenchmark(test.Benchmark): duration = (time.time() - start_time) / num_iters print("%s inputshape:%s filtershape:%s strides:%s padding:%s " - "%d iters: %.8f sec" % - (device, str(input_shape).replace(" ", ""), - str(filter_shape).replace(" ", ""), - str(strides).replace(" ", ""), padding, num_iters, duration)) + "%d iters: %.8f sec" % (device, str(input_shape).replace(" ", ""), + str(filter_shape).replace(" ", ""), + str(strides).replace(" ", ""), padding, + num_iters, duration)) name_template = ( "conv2d_{device}_input_shape_{inputshape}_filter_shape_{filtershape}_" "strides_{strides}_padding_{padding}") diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py index 0d51c282a89..082c42eba18 100644 --- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py +++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py @@ -59,7 +59,11 @@ _summary_type_map = { class GANEstimator(estimator.Estimator): """An estimator for Generative Adversarial Networks (GANs). - This Estimator is backed by TFGAN. + This Estimator is backed by TFGAN. The network functions follow the TFGAN API + except for one exception: if either `generator_fn` or `discriminator_fn` have + an argument called `mode`, then the tf.Estimator mode is passed in for that + argument. This helps with operations like batch normalization, which have + different train and evaluation behavior. Example: @@ -233,9 +237,11 @@ def _gan_model_fn( def _make_gan_model(generator_fn, discriminator_fn, real_data, generator_inputs, generator_scope, add_summaries, mode): """Make a `GANModel`, and optionally pass in `mode`.""" - # If `generator_fn` has an argument `mode`, pass mode to it. + # If network functions have an argument `mode`, pass mode to it. 
if 'mode' in inspect.getargspec(generator_fn).args: generator_fn = functools.partial(generator_fn, mode=mode) + if 'mode' in inspect.getargspec(discriminator_fn).args: + discriminator_fn = functools.partial(discriminator_fn, mode=mode) gan_model = tfgan_train.gan_model( generator_fn, discriminator_fn, diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py index e752f0bcccd..387a62bd741 100644 --- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py +++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py @@ -54,7 +54,8 @@ def generator_fn(noise_dict, mode): return layers.fully_connected(noise, noise.shape[1].value) -def discriminator_fn(data, _): +def discriminator_fn(data, unused_conditioning, mode): + del unused_conditioning, mode return layers.fully_connected(data, 1) @@ -99,7 +100,6 @@ def mock_head(testcase, expected_generator_inputs, expected_real_data, else: testcase.assertEqual(discriminator_scope_name, gan_model.discriminator_scope.name) - testcase.assertEqual(_or_none(discriminator_fn), gan_model.discriminator_fn) with ops.control_dependencies(assertions): if mode == model_fn_lib.ModeKeys.TRAIN: diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py index 986a5ff6dcb..d9b07e62f89 100644 --- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py +++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py @@ -28,6 +28,7 @@ from __future__ import division from __future__ import print_function import functools +import os import sys import tarfile @@ -189,20 +190,31 @@ def get_graph_def_from_resource(filename): return graph_pb2.GraphDef.FromString(resource_loader.load_resource(filename)) -def get_graph_def_from_url_tarball(url, filename): - """Get a GraphDef proto from a tarball on the web.""" - def _progress(count, block_size, total_size): - sys.stdout.write('\r>> Downloading %s %.1f%%' % ( - url, float(count * block_size) / float(total_size) * 100.0)) - sys.stdout.flush() - tar_filename, _ = urllib.request.urlretrieve(url, reporthook=_progress) +def get_graph_def_from_url_tarball(url, filename, tar_filename=None): + """Get a GraphDef proto from a tarball on the web. + + Args: + url: Web address of tarball + filename: Filename of graph definition within tarball + tar_filename: Temporary download filename (None = always download) + + Returns: + A GraphDef loaded from a file in the downloaded tarball. 
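A hedged usage sketch of the new `tar_filename` argument: the import path mirrors the file being patched, the constants are the module-level Inception names used by `_default_graph_def_fn`, and network access is assumed on the first run only.

```python
import os

from tensorflow.contrib.gan.python.eval.python import classifier_metrics_impl as cm

# Reuse a previously downloaded tarball if it is already on disk; this mirrors
# what the module's default graph_def_fn now does.
graph_def = cm.get_graph_def_from_url_tarball(
    cm.INCEPTION_URL,
    cm.INCEPTION_FROZEN_GRAPH,
    tar_filename=os.path.basename(cm.INCEPTION_URL))
```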
+ """ + if not (tar_filename and os.path.exists(tar_filename)): + def _progress(count, block_size, total_size): + sys.stdout.write('\r>> Downloading %s %.1f%%' % ( + url, float(count * block_size) / float(total_size) * 100.0)) + sys.stdout.flush() + tar_filename, _ = urllib.request.urlretrieve(url, tar_filename, _progress) with tarfile.open(tar_filename, 'r:gz') as tar: proto_str = tar.extractfile(filename).read() return graph_pb2.GraphDef.FromString(proto_str) def _default_graph_def_fn(): - return get_graph_def_from_url_tarball(INCEPTION_URL, INCEPTION_FROZEN_GRAPH) + return get_graph_def_from_url_tarball(INCEPTION_URL, INCEPTION_FROZEN_GRAPH, + os.path.basename(INCEPTION_URL)) def run_inception(images, diff --git a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py index b960af28eaa..871f1ad54e2 100644 --- a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py +++ b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py @@ -84,11 +84,11 @@ class ClassifierMetricsTest(test.TestCase): self.assertAllClose( np.array([0.014, 0.014], 'f'), np.array([x[0] for x in wscores], 'f'), - rtol=0.1) + rtol=0.15) self.assertAllClose( np.array([0.014, 0.020], 'f'), np.array([x[1] for x in wscores], 'f'), - rtol=0.1) + rtol=0.15) def test_sliced_wasserstein_distance_svd(self): """Test the distance.""" diff --git a/tensorflow/contrib/gdr/README.md b/tensorflow/contrib/gdr/README.md index 34ce60b3608..8242d93f129 100644 --- a/tensorflow/contrib/gdr/README.md +++ b/tensorflow/contrib/gdr/README.md @@ -119,4 +119,4 @@ In the original design (as in the reference), tensor buffers are only registered Reference === -Bairen Yi, Jiacheng Xia, Li Chen, and Kai Chen. 2017. Towards Zero Copy Dataflows using RDMA. In Proceedings of SIGCOMM Posters and Demos'17, Los Angeles, CA, USA, August 22-24, 2017, 3 pages. https://doi.org/10.1145/3123878.3123907 +Bairen Yi, Jiacheng Xia, Li Chen, and Kai Chen. 2017. Towards Zero Copy Dataflows using RDMA. In Proceedings of SIGCOMM Posters and Demos'17, Los Angeles, CA, USA, August 22-24, 2017, 3 pages. 
https://doi.org/10.1145/3123878.3131975 diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc index 5c7ac744289..81e70ae30a4 100644 --- a/tensorflow/contrib/gdr/gdr_memory_manager.cc +++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc @@ -86,8 +86,9 @@ int TryToReadNumaNode(ibv_device* device) { if (strings::safe_strto32(content, &value)) { if (value < 0) { LOG(INFO) << "Successful NUMA node read from SysFS had negative value (" - << value << "), but there must be at least one NUMA node" - ", so returning NUMA node zero"; + << value + << "), but there must be at least one NUMA node" + ", so returning NUMA node zero"; return 0; } LOG(INFO) << "NUMA node for device: " << device->name << " is " << value; @@ -290,8 +291,8 @@ Status GdrMemoryManager::Init() { // Host memory allocators for (Allocator* allocator : allocators) { auto* visitable_allocator = dynamic_cast(allocator); - CHECK(visitable_allocator) << "is not visitable for instrumentation" - << allocator->Name(); + CHECK(visitable_allocator) + << "is not visitable for instrumentation" << allocator->Name(); // Make sure we don't instrument the same allocator twice if (instrumented_.find(allocator) == std::end(instrumented_)) { visitable_allocator->AddAllocVisitor(alloc_visitor); @@ -635,8 +636,8 @@ void GdrMemoryManager::TensorFromTransportOptions( } else { checksum = GPUUtil::Checksum(*tensor); } - CHECK(checksum == remote_mr.checksum()) << "Checksum mismatch: " << checksum - << "!=" << remote_mr.checksum(); + CHECK(checksum == remote_mr.checksum()) + << "Checksum mismatch: " << checksum << "!=" << remote_mr.checksum(); #endif } done(Status::OK()); diff --git a/tensorflow/contrib/image/kernels/image_ops.cc b/tensorflow/contrib/image/kernels/image_ops.cc index 6adf837ca0a..c2e32da133b 100644 --- a/tensorflow/contrib/image/kernels/image_ops.cc +++ b/tensorflow/contrib/image/kernels/image_ops.cc @@ -43,9 +43,9 @@ template struct FillProjectiveTransform; typedef Eigen::ThreadPoolDevice CPUDevice; using functor::FillProjectiveTransform; +using generator::Interpolation; using generator::INTERPOLATION_BILINEAR; using generator::INTERPOLATION_NEAREST; -using generator::Interpolation; using generator::ProjectiveGenerator; template @@ -72,11 +72,12 @@ class ImageProjectiveTransform : public OpKernel { const Tensor& transform_t = ctx->input(1); OP_REQUIRES(ctx, images_t.shape().dims() == 4, errors::InvalidArgument("Input images must have rank 4")); - OP_REQUIRES(ctx, (TensorShapeUtils::IsMatrix(transform_t.shape()) && - (transform_t.dim_size(0) == images_t.dim_size(0) || - transform_t.dim_size(0) == 1) && - transform_t.dim_size(1) == - ProjectiveGenerator::kNumParameters), + OP_REQUIRES(ctx, + (TensorShapeUtils::IsMatrix(transform_t.shape()) && + (transform_t.dim_size(0) == images_t.dim_size(0) || + transform_t.dim_size(0) == 1) && + transform_t.dim_size(1) == + ProjectiveGenerator::kNumParameters), errors::InvalidArgument( "Input transform should be num_images x 8 or 1 x 8")); auto images = images_t.tensor(); diff --git a/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc b/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc index 9f0bf37aed3..8f9a5c28039 100755 --- a/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc +++ b/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc @@ -143,8 +143,8 @@ class SingleImageRandomDotStereogramsOp : public OpKernel { } data_box_left = deltaX_border_image / 2; // 
Center DATA in X dimension - data_box_width = data_Xwindow; // width of scan line - data_box_height = data_Ywindow; // hight of image + data_box_width = data_Xwindow; // width of scan line + data_box_height = data_Ywindow; // hight of image const T* inputZ = input_tensor.flat().data(); // Flatten input Z buffer diff --git a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc index 1f41f243f2e..8139d4272d6 100755 --- a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc +++ b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc @@ -58,7 +58,9 @@ REGISTER_OP("SingleImageRandomDotStereograms") int colors; TF_RETURN_IF_ERROR(c->GetAttr("number_colors", &colors)); - c->set_output(0, c->MakeShape({y_dim, x_dim, colors > 256? c->MakeDim(3) : c->MakeDim(1)})); + c->set_output( + 0, c->MakeShape( + {y_dim, x_dim, colors > 256 ? c->MakeDim(3) : c->MakeDim(1)})); return Status::OK(); }) .Doc(R"doc( diff --git a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py index bb766e59d2c..d4a6a5bcbb5 100755 --- a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py +++ b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py @@ -26,18 +26,20 @@ _sirds_ops = loader.load_op_library( resource_loader.get_path_to_datafile( "_single_image_random_dot_stereograms.so")) -def single_image_random_dot_stereograms( - depth_values, - hidden_surface_removal=None, - convergence_dots_size=None, - dots_per_inch=None, - eye_separation=None, mu=None, - normalize=None, normalize_max=None, - normalize_min=None, - border_level=None, - number_colors=None, - output_image_shape=None, - output_data_window=None): + +def single_image_random_dot_stereograms(depth_values, + hidden_surface_removal=None, + convergence_dots_size=None, + dots_per_inch=None, + eye_separation=None, + mu=None, + normalize=None, + normalize_max=None, + normalize_min=None, + border_level=None, + number_colors=None, + output_image_shape=None, + output_data_window=None): """Output a RandomDotStereogram Tensor for export via encode_PNG/JPG OP. Given the 2-D tensor 'depth_values' with encoded Z values, this operation @@ -45,7 +47,8 @@ def single_image_random_dot_stereograms( for the encode_PNG/JPG ops. Be careful with image compression as this may corrupt the encode 3-D data witin the image. - Based upon [this paper](http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper). + Based upon [this + paper](http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper). 
This outputs a SIRDS image as picture_out.png: @@ -113,7 +116,8 @@ def single_image_random_dot_stereograms( hidden_surface_removal=hidden_surface_removal, convergence_dots_size=convergence_dots_size, dots_per_inch=dots_per_inch, - eye_separation=eye_separation, mu=mu, + eye_separation=eye_separation, + mu=mu, normalize=normalize, normalize_max=normalize_max, normalize_min=normalize_min, @@ -123,4 +127,5 @@ def single_image_random_dot_stereograms( output_data_window=output_data_window) return result + ops.NotDifferentiable("SingleImageRandomDotStereograms") diff --git a/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc b/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc index ca288c1f737..886f6798150 100644 --- a/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc +++ b/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc @@ -34,9 +34,8 @@ class ObtainNextOp : public OpKernel { // Allocate output. Tensor* output_tensor = nullptr; - OP_REQUIRES_OK( - ctx, - ctx->allocate_output("out_element", TensorShape({}), &output_tensor)); + OP_REQUIRES_OK(ctx, ctx->allocate_output("out_element", TensorShape({}), + &output_tensor)); // Obtain mutex for the "counter" tensor. mutex* mu; diff --git a/tensorflow/contrib/kafka/BUILD b/tensorflow/contrib/kafka/BUILD new file mode 100644 index 00000000000..f7593aa462c --- /dev/null +++ b/tensorflow/contrib/kafka/BUILD @@ -0,0 +1,104 @@ +package( + default_visibility = ["//visibility:private"], +) + +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs") +load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py") +load("//tensorflow:tensorflow.bzl", "tf_kernel_library") +load("//tensorflow:tensorflow.bzl", "tf_py_test") + +tf_kernel_library( + name = "kafka_kernels", + srcs = ["kernels/kafka_dataset_ops.cc"], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core/kernels:bounds_check_lib", + "//tensorflow/core/kernels:dataset", + "//third_party/eigen3", + "@kafka//:kafka", + ], +) + +tf_gen_op_libs( + op_lib_names = ["kafka_ops"], + deps = [ + "//tensorflow/core:lib", + ], +) + +tf_gen_op_wrapper_py( + name = "gen_kafka_ops", + out = "python/ops/gen_kafka_ops.py", + require_shape_functions = True, + deps = [":kafka_ops_op_lib"], +) + +py_library( + name = "kafka", + srcs = [ + "__init__.py", + "python/ops/kafka_dataset_ops.py", + ], + srcs_version = "PY2AND3", + visibility = ["//visibility:public"], + deps = [ + ":gen_kafka_ops", + "//tensorflow/contrib/util:util_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:control_flow_ops", + "//tensorflow/python:framework", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:platform", + "//tensorflow/python:state_ops", + "//tensorflow/python:training", + "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/data/ops:iterator_ops", + "//tensorflow/python/data/ops:readers", + ], +) + +# The Kafka server has to be setup before running the test. +# The Kafka server is setup through Docker so the Docker engine +# has to be installed. 
+#
+# Once the Docker engine is ready:
+# To set up the Kafka server:
+# $ bash tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh start kafka
+#
+# After the test is complete:
+# To tear down the Kafka server:
+# $ bash tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh stop kafka
+tf_py_test(
+    name = "kafka_test",
+    srcs = ["python/kernel_tests/kafka_test.py"],
+    additional_deps = [
+        ":kafka",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+    tags = [
+        "manual",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/kafka/__init__.py b/tensorflow/contrib/kafka/__init__.py
new file mode 100644
index 00000000000..4d755c40568
--- /dev/null
+++ b/tensorflow/contrib/kafka/__init__.py
@@ -0,0 +1,32 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Kafka Dataset.
+
+@@KafkaDataset
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.kafka.python.ops.kafka_dataset_ops import KafkaDataset
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    "KafkaDataset",
+]
+
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
new file mode 100644
index 00000000000..88ef5f35711
--- /dev/null
+++ b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
@@ -0,0 +1,321 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#include "tensorflow/core/kernels/dataset.h" + +#include "tensorflow/core/framework/tensor.h" + +#include "src-cpp/rdkafkacpp.h" + +namespace tensorflow { + +class KafkaDatasetOp : public DatasetOpKernel { + public: + using DatasetOpKernel::DatasetOpKernel; + + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override { + const Tensor* topics_tensor; + OP_REQUIRES_OK(ctx, ctx->input("topics", &topics_tensor)); + OP_REQUIRES( + ctx, topics_tensor->dims() <= 1, + errors::InvalidArgument("`topics` must be a scalar or a vector.")); + + std::vector topics; + topics.reserve(topics_tensor->NumElements()); + for (int i = 0; i < topics_tensor->NumElements(); ++i) { + topics.push_back(topics_tensor->flat()(i)); + } + + std::string servers = ""; + OP_REQUIRES_OK(ctx, + ParseScalarArgument(ctx, "servers", &servers)); + std::string group = ""; + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "group", &group)); + bool eof = false; + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "eof", &eof)); + int64 timeout = -1; + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "timeout", &timeout)); + OP_REQUIRES(ctx, (timeout > 0), + errors::InvalidArgument( + "Timeout value should be large than 0, got ", timeout)); + *output = new Dataset(ctx, std::move(topics), servers, group, eof, timeout); + } + + private: + class Dataset : public GraphDatasetBase { + public: + Dataset(OpKernelContext* ctx, std::vector topics, + const string& servers, const string& group, const bool eof, + const int64 timeout) + : GraphDatasetBase(ctx), + topics_(std::move(topics)), + servers_(servers), + group_(group), + eof_(eof), + timeout_(timeout) {} + + std::unique_ptr MakeIterator( + const string& prefix) const override { + return std::unique_ptr( + new Iterator({this, strings::StrCat(prefix, "::Kafka")})); + } + + const DataTypeVector& output_dtypes() const override { + static DataTypeVector* dtypes = new DataTypeVector({DT_STRING}); + return *dtypes; + } + + const std::vector& output_shapes() const override { + static std::vector* shapes = + new std::vector({{}}); + return *shapes; + } + + string DebugString() override { return "KafkaDatasetOp::Dataset"; } + + protected: + Status AsGraphDefInternal(DatasetGraphDefBuilder* b, + Node** output) const override { + Node* topics = nullptr; + TF_RETURN_IF_ERROR(b->AddVector(topics_, &topics)); + Node* servers = nullptr; + TF_RETURN_IF_ERROR(b->AddScalar(servers_, &servers)); + Node* group = nullptr; + TF_RETURN_IF_ERROR(b->AddScalar(group_, &group)); + Node* eof = nullptr; + TF_RETURN_IF_ERROR(b->AddScalar(eof_, &eof)); + Node* timeout = nullptr; + TF_RETURN_IF_ERROR(b->AddScalar(timeout_, &timeout)); + TF_RETURN_IF_ERROR( + b->AddDataset(this, {topics, servers, group, eof, timeout}, output)); + return Status::OK(); + } + + private: + class Iterator : public DatasetIterator { + public: + explicit Iterator(const Params& params) + : DatasetIterator(params) {} + + Status GetNextInternal(IteratorContext* ctx, + std::vector* out_tensors, + bool* end_of_sequence) override { + mutex_lock l(mu_); + do { + // We are currently processing a topic, so try to read the next line. + if (consumer_.get()) { + while (true) { + if (limit_ >= 0 && + (topic_partition_->offset() >= limit_ || offset_ >= limit_)) { + // EOF current topic + break; + } + std::unique_ptr message( + consumer_->consume(dataset()->timeout_)); + if (message->err() == RdKafka::ERR_NO_ERROR) { + // Produce the line as output. 
+ Tensor line_tensor(cpu_allocator(), DT_STRING, {}); + line_tensor.scalar()() = + std::string(static_cast(message->payload()), + message->len()); + out_tensors->emplace_back(std::move(line_tensor)); + *end_of_sequence = false; + // Sync offset + offset_ = message->offset(); + return Status::OK(); + } + + if (message->err() == RdKafka::ERR__PARTITION_EOF && + dataset()->eof_) { + // EOF current topic + break; + } + if (message->err() != RdKafka::ERR__TIMED_OUT) { + return errors::Internal("Failed to consume:", + message->errstr()); + } + message.reset(nullptr); + consumer_->poll(0); + } + + // We have reached the end of the current topic, so maybe + // move on to next topic. + ResetStreamsLocked(); + ++current_topic_index_; + } + + // Iteration ends when there are no more topic to process. + if (current_topic_index_ == dataset()->topics_.size()) { + *end_of_sequence = true; + return Status::OK(); + } + + TF_RETURN_IF_ERROR(SetupStreamsLocked(ctx->env())); + } while (true); + } + + protected: + Status SaveInternal(IteratorStateWriter* writer) override { + mutex_lock l(mu_); + TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("current_topic_index"), + current_topic_index_)); + + // `consumer_` is empty if + // 1. GetNext has not been called even once. + // 2. All topics have been read and iterator has been exhausted. + if (consumer_.get()) { + TF_RETURN_IF_ERROR( + writer->WriteScalar(full_name("current_pos"), offset_)); + } + return Status::OK(); + } + + Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) override { + mutex_lock l(mu_); + ResetStreamsLocked(); + int64 current_topic_index; + TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("current_topic_index"), + ¤t_topic_index)); + current_topic_index_ = size_t(current_topic_index); + // The key "current_pos" is written only if the iterator was saved + // with an open topic. + if (reader->Contains(full_name("current_pos"))) { + int64 current_pos; + TF_RETURN_IF_ERROR( + reader->ReadScalar(full_name("current_pos"), ¤t_pos)); + + TF_RETURN_IF_ERROR(SetupStreamsLocked(ctx->env())); + topic_partition_->set_offset(current_pos); + if (topic_partition_->offset() != current_pos) { + return errors::Internal("Failed to restore to offset ", + current_pos); + } + offset_ = current_pos; + } + return Status::OK(); + } + + private: + // Sets up Kafka streams to read from the topic at + // `current_topic_index_`. + Status SetupStreamsLocked(Env* env) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (current_topic_index_ >= dataset()->topics_.size()) { + return errors::InvalidArgument( + "current_topic_index_:", current_topic_index_, + " >= topics_.size():", dataset()->topics_.size()); + } + + // Actually move on to next topic. 
+        string entry = dataset()->topics_[current_topic_index_];
+
+        std::vector<string> parts = str_util::Split(entry, ":");
+        if (parts.size() < 1) {
+          return errors::InvalidArgument("Invalid parameters: ", entry);
+        }
+        string topic = parts[0];
+        int32 partition = 0;
+        if (parts.size() > 1) {
+          if (!strings::safe_strto32(parts[1], &partition)) {
+            return errors::InvalidArgument("Invalid parameters: ", entry);
+          }
+        }
+        int64 offset = 0;
+        if (parts.size() > 2) {
+          if (!strings::safe_strto64(parts[2], &offset)) {
+            return errors::InvalidArgument("Invalid parameters: ", entry);
+          }
+        }
+
+        topic_partition_.reset(
+            RdKafka::TopicPartition::create(topic, partition, offset));
+
+        offset_ = topic_partition_->offset();
+        limit_ = -1;
+        if (parts.size() > 3) {
+          if (!strings::safe_strto64(parts[3], &limit_)) {
+            return errors::InvalidArgument("Invalid parameters: ", entry);
+          }
+        }
+
+        std::unique_ptr<RdKafka::Conf> conf(
+            RdKafka::Conf::create(RdKafka::Conf::CONF_GLOBAL));
+        std::unique_ptr<RdKafka::Conf> topic_conf(
+            RdKafka::Conf::create(RdKafka::Conf::CONF_TOPIC));
+
+        std::string errstr;
+
+        RdKafka::Conf::ConfResult result =
+            conf->set("default_topic_conf", topic_conf.get(), errstr);
+        if (result != RdKafka::Conf::CONF_OK) {
+          return errors::Internal("Failed to set default_topic_conf:", errstr);
+        }
+
+        result = conf->set("bootstrap.servers", dataset()->servers_, errstr);
+        if (result != RdKafka::Conf::CONF_OK) {
+          return errors::Internal("Failed to set bootstrap.servers ",
+                                  dataset()->servers_, ":", errstr);
+        }
+        result = conf->set("group.id", dataset()->group_, errstr);
+        if (result != RdKafka::Conf::CONF_OK) {
+          return errors::Internal("Failed to set group.id ", dataset()->group_,
+                                  ":", errstr);
+        }
+
+        consumer_.reset(RdKafka::KafkaConsumer::create(conf.get(), errstr));
+        if (!consumer_.get()) {
+          return errors::Internal("Failed to create consumer:", errstr);
+        }
+
+        std::vector<RdKafka::TopicPartition*> partitions;
+        partitions.emplace_back(topic_partition_.get());
+        RdKafka::ErrorCode err = consumer_->assign(partitions);
+        if (err != RdKafka::ERR_NO_ERROR) {
+          return errors::Internal(
+              "Failed to assign partition [", topic_partition_->topic(), ", ",
+              topic_partition_->partition(), ", ", topic_partition_->offset(),
+              "]:", RdKafka::err2str(err));
+        }
+
+        return Status::OK();
+      }
+
+      // Resets all Kafka streams.
+      void ResetStreamsLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        consumer_->unassign();
+        consumer_->close();
+        consumer_.reset(nullptr);
+      }
+
+      mutex mu_;
+      size_t current_topic_index_ GUARDED_BY(mu_) = 0;
+      int64 offset_ GUARDED_BY(mu_) = 0;
+      int64 limit_ GUARDED_BY(mu_) = -1;
+      std::unique_ptr<RdKafka::TopicPartition> topic_partition_ GUARDED_BY(mu_);
+      std::unique_ptr<RdKafka::KafkaConsumer> consumer_ GUARDED_BY(mu_);
+    };
+
+    const std::vector<string> topics_;
+    const std::string servers_;
+    const std::string group_;
+    const bool eof_;
+    const int64 timeout_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("KafkaDataset").Device(DEVICE_CPU),
+                        KafkaDatasetOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/kafka/ops/kafka_ops.cc b/tensorflow/contrib/kafka/ops/kafka_ops.cc
new file mode 100644
index 00000000000..8cdf16103ba
--- /dev/null
+++ b/tensorflow/contrib/kafka/ops/kafka_ops.cc
@@ -0,0 +1,44 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +REGISTER_OP("KafkaDataset") + .Input("topics: string") + .Input("servers: string") + .Input("group: string") + .Input("eof: bool") + .Input("timeout: int64") + .Output("handle: variant") + .SetIsStateful() + .SetShapeFn(shape_inference::ScalarShape) + .Doc(R"doc( +Creates a dataset that emits the messages of one or more Kafka topics. + +topics: A `tf.string` tensor containing one or more subscriptions, + in the format of [topic:partition:offset:length], + by default length is -1 for unlimited. +servers: A list of bootstrap servers. +group: The consumer group id. +eof: If True, the kafka reader will stop on EOF. +timeout: The timeout value for the Kafka Consumer to wait + (in millisecond). +)doc"); + +} // namespace tensorflow diff --git a/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py new file mode 100644 index 00000000000..94cf6b5ace6 --- /dev/null +++ b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py @@ -0,0 +1,117 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. +# ============================================================================== +"""Tests for KafkaDataset.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import os + +from tensorflow.contrib.kafka.python.ops import kafka_dataset_ops +from tensorflow.python.data.ops import iterator_ops +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.lib.io import python_io +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import io_ops +from tensorflow.python.platform import test +from tensorflow.python.util import compat + +class KafkaDatasetTest(test.TestCase): + + def setUp(self): + # The Kafka server has to be setup before the test + # and tear down after the test manually. + # The docker engine has to be installed. 
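+    #
+    # Once the broker is up, a minimal use of the dataset looks roughly like
+    # the sketch below (the broker address and topic are the ones created by
+    # kafka_test.sh; `tf` stands for the public TensorFlow API and is only
+    # illustrative):
+    #
+    #   dataset = kafka_dataset_ops.KafkaDataset(
+    #       topics=["test:0:0:-1"], servers="localhost:9092",
+    #       group="test", eof=True)
+    #   next_element = dataset.make_one_shot_iterator().get_next()
+    #   with tf.Session() as sess:
+    #     print(sess.run(next_element))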
+    #
+    # To set up the Kafka server:
+    # $ bash kafka_test.sh start kafka
+    #
+    # To tear down the Kafka server:
+    # $ bash kafka_test.sh stop kafka
+    pass
+
+  def testKafkaDataset(self):
+    topics = array_ops.placeholder(dtypes.string, shape=[None])
+    num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
+    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
+
+    repeat_dataset = kafka_dataset_ops.KafkaDataset(
+        topics, group="test", eof=True).repeat(num_epochs)
+    batch_dataset = repeat_dataset.batch(batch_size)
+
+    iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
+    init_op = iterator.make_initializer(repeat_dataset)
+    init_batch_op = iterator.make_initializer(batch_dataset)
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      # Basic test: read from topic 0.
+      sess.run(
+          init_op, feed_dict={topics: ["test:0:0:4"],
+                              num_epochs: 1})
+      for i in range(5):
+        self.assertEqual("D"+str(i), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Basic test: read from topic 1.
+      sess.run(
+          init_op, feed_dict={topics: ["test:0:5:-1"],
+                              num_epochs: 1})
+      for i in range(5):
+        self.assertEqual("D"+str(i + 5), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Basic test: read from both topics.
+      sess.run(init_op, feed_dict={topics: ["test:0:0:4", "test:0:5:-1"],
+                                   num_epochs: 1})
+      for j in range(2):
+        for i in range(5):
+          self.assertEqual("D"+str(i + j * 5), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test repeated iteration through both files.
+      sess.run(init_op, feed_dict={topics: ["test:0:0:4", "test:0:5:-1"],
+                                   num_epochs: 10})
+      for _ in range(10):
+        for j in range(2):
+          for i in range(5):
+            self.assertEqual("D"+str(i + j * 5), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test batched and repeated iteration through both files.
+      sess.run(
+          init_batch_op,
+          feed_dict={topics: ["test:0:0:4", "test:0:5:-1"],
+                     num_epochs: 10,
+                     batch_size: 5})
+      for _ in range(10):
+        self.assertAllEqual(["D"+str(i) for i in range(5)],
+                            sess.run(get_next))
+        self.assertAllEqual(["D"+str(i + 5) for i in range(5)],
+                            sess.run(get_next))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh
new file mode 100644
index 00000000000..7997c127311
--- /dev/null
+++ b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+
+set -e
+set -o pipefail
+
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 start|stop <container_name>" >&2
+  exit 1
+fi
+
+container=$2
+if [ "$1" == "start" ]; then
+  docker run -d --rm --net=host --name=$container spotify/kafka
+  echo Wait 5 secs until kafka is up and running
+  sleep 5
+  echo Create test topic
+  docker exec $container bash -c '/opt/kafka_2.11-0.10.1.0/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic test'
+  echo Create test message
+  docker exec $container bash -c 'echo -e "D0\nD1\nD2\nD3\nD4\nD5\nD6\nD7\nD8\nD9" > /test'
+  echo Produce test message
+  docker exec $container bash -c '/opt/kafka_2.11-0.10.1.0/bin/kafka-console-producer.sh --topic test --broker-list 127.0.0.1:9092 < /test'
+
+  echo Container $container started successfully
+elif [ "$1" == "stop" ]; then
+  docker rm -f $container
+
+  echo Container $container stopped successfully
+else
+  echo "Usage: $0 start|stop <container_name>" >&2
+  exit 1
+fi
+
+
+
diff --git a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
new file mode 100644
index 00000000000..e561f595a40
--- /dev/null
+++ b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
@@ -0,0 +1,73 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Kafka Dataset."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.kafka.python.ops import gen_kafka_ops
+from tensorflow.contrib.util import loader
+from tensorflow.python.data.ops.readers import Dataset
+from tensorflow.python.framework import common_shapes
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.platform import resource_loader
+
+class KafkaDataset(Dataset):
+  """A Kafka Dataset that consumes messages.
+  """
+
+  def __init__(
+      self, topics, servers="localhost", group="", eof=False, timeout=1000):
+    """Create a KafkaDataset.
+
+    Args:
+      topics: A `tf.string` tensor containing one or more subscriptions,
+        in the format of [topic:partition:offset:length],
+        by default length is -1 for unlimited.
+ servers: A list of bootstrap servers. + group: The consumer group id. + eof: If True, the kafka reader will stop on EOF. + timeout: The timeout value for the Kafka Consumer to wait + (in millisecond). + """ + super(KafkaDataset, self).__init__() + self._topics = ops.convert_to_tensor( + topics, dtype=dtypes.string, name="topics") + self._servers = ops.convert_to_tensor( + servers, dtype=dtypes.string, name="servers") + self._group = ops.convert_to_tensor( + group, dtype=dtypes.string, name="group") + self._eof = ops.convert_to_tensor( + eof, dtype=dtypes.bool, name="eof") + self._timeout = ops.convert_to_tensor( + timeout, dtype=dtypes.int64, name="timeout") + + def _as_variant_tensor(self): + return gen_kafka_ops.kafka_dataset( + self._topics, self._servers, self._group, self._eof, self._timeout) + + @property + def output_classes(self): + return ops.Tensor + + @property + def output_shapes(self): + return tensor_shape.scalar() + + @property + def output_types(self): + return dtypes.string diff --git a/tensorflow/contrib/kernel_methods/python/losses_test.py b/tensorflow/contrib/kernel_methods/python/losses_test.py index d38d8041ce1..72507539f81 100644 --- a/tensorflow/contrib/kernel_methods/python/losses_test.py +++ b/tensorflow/contrib/kernel_methods/python/losses_test.py @@ -119,19 +119,20 @@ class SparseMulticlassHingeLossTest(test.TestCase): def testUnknownShape(self): """Result keeps same with `testZeroLossInt32Labels`""" - logits_np = np.array([[1.2, -1.4, -1.0], - [1.4, 1.8, 4.0], - [0.5, 1.8, -1.0]]) + logits_np = np.array([[1.2, -1.4, -1.0], [1.4, 1.8, 4.0], [0.5, 1.8, -1.0]]) labels_np = np.array([0, 2, 1], dtype=np.int32) - logits_shapes = [[3, 3], # batch_size, num_classes - [None, 3], - [3, None], - [None, None]] + logits_shapes = [ + [3, 3], # batch_size, num_classes + [None, 3], + [3, None], + [None, None] + ] for batch_size, num_classes in logits_shapes: with self.test_session(): - logits = array_ops.placeholder(dtypes.float32, shape=(batch_size, num_classes)) + logits = array_ops.placeholder( + dtypes.float32, shape=(batch_size, num_classes)) labels = array_ops.placeholder(dtypes.int32, shape=(batch_size,)) loss = losses.sparse_multiclass_hinge_loss(labels, logits) result = loss.eval(feed_dict={logits: logits_np, labels: labels_np}) diff --git a/tensorflow/contrib/kfac/examples/mlp.py b/tensorflow/contrib/kfac/examples/mlp.py index 0f0dbb53f45..87eed03888c 100644 --- a/tensorflow/contrib/kfac/examples/mlp.py +++ b/tensorflow/contrib/kfac/examples/mlp.py @@ -317,7 +317,10 @@ def train_mnist_estimator(data_dir, num_epochs, use_fake_data=False): return tf.estimator.EstimatorSpec( mode=mode, loss=loss, train_op=train_op, training_hooks=hooks) + run_config = tf.estimator.RunConfig( + model_dir="/tmp/mnist", save_checkpoints_steps=1, keep_checkpoint_max=100) + # Train until input_fn() is empty with Estimator. This is a prerequisite for # TPU compatibility. 
- estimator = tf.estimator.Estimator(model_fn=model_fn) + estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config) estimator.train(input_fn=input_fn) diff --git a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py index 9436caf9618..0d2fa706f58 100644 --- a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py +++ b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py @@ -457,7 +457,9 @@ class ConvDiagonalFB(FisherBlock): self._num_locations = ( inputs_shape[1] * inputs_shape[2] // (self._strides[1] * self._strides[2])) - self._damping = normalize_damping(damping, self._num_locations) + + self._damping = (self._num_locations + * normalize_damping(damping, self._num_locations)) self._factor = self._layer_collection.make_or_get_factor( fisher_factors.ConvDiagonalFactor, diff --git a/tensorflow/contrib/kfac/python/ops/fisher_factors.py b/tensorflow/contrib/kfac/python/ops/fisher_factors.py index f59168cbc05..bcba18ae147 100644 --- a/tensorflow/contrib/kfac/python/ops/fisher_factors.py +++ b/tensorflow/contrib/kfac/python/ops/fisher_factors.py @@ -31,6 +31,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn from tensorflow.python.ops import special_math_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables @@ -111,6 +112,54 @@ def diagonal_covariance_initializer(shape, dtype, partition_info): # pylint: di return array_ops.ones(shape, dtype) +def extract_image_patches(image, ksizes, strides, padding, name=None): + """Extracts image patches for an N-dimensional convolution. + + This function is a compatibility wrapper over tf.extract_image_patches(), as + ExtractImagePatches isn't yet implemented in XLA. + + Args: + image: Tensor of shape [batch, in_x, in_y, ..., in_channels]. Input images. + All dimensions except 'batch' must be defined. + ksizes: [filter_x, filter_y, ...]. Spatial shape of filter in each + dimension. + strides: [stride_x, stride_y, ...]. Spatial stride for filter in each + dimension. + padding: str. "VALID" or "SAME". + name: str or None. name of Op. + + Returns: + result: [batch, out_x, out_y, ..., filter_x, filter_y, ..., in_channels]. + Contains image patches to which conv kernel would be applied for each + output location. [out_x, out_y, ...] depends on padding. + """ + if not utils.on_tpu(): + return array_ops.extract_image_patches( + image, + ksizes=([1] + list(ksizes) + [1]), + strides=([1] + list(strides) + [1]), + rates=[1, 1, 1, 1], + padding=padding, + name=name) + + with tf_ops.name_scope(name, "extract_image_patches", + [image, ksizes, strides, padding]): + batch = image.shape.as_list()[0] + in_channels = image.shape.as_list()[-1] + + # Map each input feature to a location in the output. + out_channels = np.prod(ksizes) * in_channels + filters = linalg_ops.eye(out_channels), + filters = array_ops.reshape(filters, ksizes + [in_channels, out_channels]) + + result = nn.convolution(image, filters, padding, strides=strides) + out_spatial = result.shape.as_list()[1:-1] + result = array_ops.reshape( + result, [batch or -1] + out_spatial + ksizes + [in_channels]) + + return result + + def compute_cov(tensor, tensor_right=None, normalizer=None): """Compute the empirical second moment of the rows of a 2D Tensor. 
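For context, the TPU branch of the extract_image_patches() wrapper above builds patches by convolving the input with a reshaped identity matrix, so that each output channel copies exactly one (row offset, column offset, input channel) tap of the patch. A minimal sketch of that equivalence against tf.extract_image_patches, assuming the public TF 1.x API and a toy NHWC input (all sizes and names below are illustrative, not taken from the patch):

# Check: identity-filter convolution reproduces tf.extract_image_patches.
import numpy as np
import tensorflow as tf

ksizes, strides, in_channels = [2, 2], [1, 1], 3
out_channels = int(np.prod(ksizes)) * in_channels  # 2 * 2 * 3 = 12

image = tf.constant(np.random.rand(2, 5, 5, in_channels), dtype=tf.float32)

# Each output channel of the identity filter picks out one patch element.
filters = tf.reshape(tf.eye(out_channels),
                     ksizes + [in_channels, out_channels])
via_conv = tf.nn.convolution(image, filters, "VALID", strides=strides)

reference = tf.extract_image_patches(
    image,
    ksizes=[1] + ksizes + [1],
    strides=[1] + strides + [1],
    rates=[1, 1, 1, 1],
    padding="VALID")

with tf.Session() as sess:
  a, b = sess.run([via_conv, reference])
  print(np.allclose(a, b))  # Expected: True; both have shape [2, 4, 4, 12].

Both paths flatten the patch with the input channel varying fastest, which is why the reshaped identity matrix lines up with the reference op's channel ordering.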
@@ -668,11 +717,10 @@ class ConvDiagonalFactor(DiagonalFactor): # TODO(b/64144716): there is potential here for a big savings in terms # of memory use. - patches = array_ops.extract_image_patches( + patches = extract_image_patches( self._inputs, - ksizes=[1, filter_height, filter_width, 1], - strides=self._strides, - rates=[1, 1, 1, 1], + ksizes=[filter_height, filter_width], + strides=self._strides[1:-1], padding=self._padding) if self._has_bias: @@ -816,11 +864,10 @@ class ConvInputKroneckerFactor(InverseProvidingFactor): # TODO(b/64144716): there is potential here for a big savings in terms of # memory use. - patches = array_ops.extract_image_patches( + patches = extract_image_patches( self._inputs, - ksizes=[1, filter_height, filter_width, 1], - strides=self._strides, - rates=[1, 1, 1, 1], + ksizes=[filter_height, filter_width], + strides=self._strides[1:-1], padding=self._padding) flatten_size = (filter_height * filter_width * in_channels) diff --git a/tensorflow/contrib/kfac/python/ops/utils_lib.py b/tensorflow/contrib/kfac/python/ops/utils_lib.py index cc48e3c69f2..fe8e39c212c 100644 --- a/tensorflow/contrib/kfac/python/ops/utils_lib.py +++ b/tensorflow/contrib/kfac/python/ops/utils_lib.py @@ -24,6 +24,7 @@ from tensorflow.python.util.all_util import remove_undocumented # pylint: enable=unused-import,line-too-long,wildcard-import _allowed_symbols = [ + "set_global_constants", "SequenceDict", "tensors_to_column", "column_to_tensors", diff --git a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc index 932c5ab9924..01893d60615 100644 --- a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc +++ b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc @@ -423,8 +423,9 @@ class SparseFeatureCrossOp : public OpKernel { "Input values should be a std::vector but received shape ", values_list_in[i].shape().DebugString(), " at position ", i)); OP_REQUIRES( - context, indices_list_in[i].shape().dim_size(0) == - values_list_in[i].shape().dim_size(0), + context, + indices_list_in[i].shape().dim_size(0) == + values_list_in[i].shape().dim_size(0), errors::InvalidArgument( "Expected size of values to be ", indices_list_in[i].shape().dim_size(0), " got ", diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py index c8e3307ee8b..fb7b2e315ef 100644 --- a/tensorflow/contrib/layers/python/layers/layers.py +++ b/tensorflow/contrib/layers/python/layers/layers.py @@ -60,12 +60,12 @@ __all__ = [ 'conv2d_in_plane', 'conv2d_transpose', 'conv3d_transpose', 'convolution', 'convolution2d', 'convolution2d_in_plane', 'convolution2d_transpose', 'convolution3d', 'convolution3d_transpose', 'dense_to_sparse', - 'dropout', 'elu', 'flatten', - 'fully_connected', 'GDN', 'gdn', 'layer_norm', 'linear', 'pool', - 'max_pool2d', 'max_pool3d', 'one_hot_encoding', 'relu', 'relu6', 'repeat', - 'scale_gradient', 'separable_conv2d', 'separable_convolution2d', 'softmax', - 'spatial_softmax', 'stack', 'unit_norm', 'legacy_fully_connected', - 'legacy_linear', 'legacy_relu', 'maxout' + 'dropout', 'elu', 'flatten', 'fully_connected', 'GDN', 'gdn', 'layer_norm', + 'linear', 'pool', 'max_pool2d', 'max_pool3d', 'one_hot_encoding', 'relu', + 'relu6', 'repeat', 'scale_gradient', 'separable_conv2d', + 'separable_convolution2d', 'softmax', 'spatial_softmax', 'stack', + 'unit_norm', 'legacy_fully_connected', 'legacy_linear', 'legacy_relu', + 'maxout' ] DATA_FORMAT_NCHW = 'NCHW' @@ 
-1418,7 +1418,9 @@ def dense_to_sparse(tensor, eos_token=0, outputs_collections=None, scope=None): with variable_scope.variable_scope( scope, 'dense_to_sparse', [tensor]) as sc: tensor = ops.convert_to_tensor(tensor) - indices = array_ops.where(math_ops.not_equal(tensor, constant_op.constant(eos_token, tensor.dtype))) + indices = array_ops.where( + math_ops.not_equal( + tensor, constant_op.constant(eos_token, tensor.dtype))) values = array_ops.gather_nd(tensor, indices) shape = array_ops.shape(tensor, out_type=dtypes.int64) outputs = sparse_tensor.SparseTensor(indices, values, shape) diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py index c5790c76221..8945690db8e 100644 --- a/tensorflow/contrib/layers/python/layers/layers_test.py +++ b/tensorflow/contrib/layers/python/layers/layers_test.py @@ -127,8 +127,8 @@ class AvgPool3DTest(test.TestCase): def testInvalidDataFormat(self): depth, height, width = 3, 6, 9 images = np.random.uniform(size=(5, depth, height, width, 3)) - with self.assertRaisesRegexp(ValueError, - 'data_format has to be either NCDHW or NDHWC.'): + with self.assertRaisesRegexp( + ValueError, 'data_format has to be either NCDHW or NDHWC.'): _layers.avg_pool3d(images, [3, 3, 3], data_format='CDHWN') def testCreateAvgPool(self): @@ -148,7 +148,8 @@ class AvgPool3DTest(test.TestCase): def testCollectOutputs(self): depth, height, width = 3, 6, 9 images = random_ops.random_uniform((5, depth, height, width, 3), seed=1) - output = _layers.avg_pool3d(images, [3, 3, 3], outputs_collections='outputs') + output = _layers.avg_pool3d( + images, [3, 3, 3], outputs_collections='outputs') output_collected = ops.get_collection('outputs')[0] self.assertEqual(output_collected.aliases, ['AvgPool3D']) self.assertEqual(output_collected, output) @@ -183,7 +184,8 @@ class AvgPool3DTest(test.TestCase): depth, height, width = 3, 6, 9 images = random_ops.random_uniform((5, depth, height, width, 3), seed=1) output = _layers.avg_pool3d(images, [3, 3, 3], stride=1, padding='SAME') - self.assertListEqual(output.get_shape().as_list(), [5, depth, height, width, 3]) + self.assertListEqual(output.get_shape().as_list(), + [5, depth, height, width, 3]) def testGlobalAvgPool(self): depth, height, width = 3, 6, 9 @@ -515,7 +517,9 @@ class ConvolutionTest(test.TestCase): with arg_scope( [layers_lib.convolution2d], normalizer_fn=_layers.batch_norm, - normalizer_params={'decay': 0.9}): + normalizer_params={ + 'decay': 0.9 + }): net = layers_lib.convolution2d(images, 32, [3, 3]) net = layers_lib.convolution2d(net, 32, [3, 3]) self.assertEqual(len(variables.get_variables()), 8) @@ -529,7 +533,9 @@ class ConvolutionTest(test.TestCase): with arg_scope( [layers_lib.convolution2d], normalizer_fn=_layers.batch_norm, - normalizer_params={'decay': 0.9}): + normalizer_params={ + 'decay': 0.9 + }): net = layers_lib.convolution2d(images, 32, [3, 3], scope='Conv') net = layers_lib.convolution2d( net, 32, [3, 3], scope='Conv', reuse=True) @@ -1031,7 +1037,8 @@ class Convolution2dTransposeTests(test.TestCase): for _ in range(10): num_filters = 1 input_size = [ - 1, np.random.randint(1, max_image_size), + 1, + np.random.randint(1, max_image_size), np.random.randint(1, max_image_size), 1 ] filter_size = [ @@ -1185,8 +1192,10 @@ class ConvolutionInPlaneTest(test.TestCase): with self.test_session() as sess: sess.run(init_op) - result = sess.run(horz_gradients, - feed_dict={image: np.ones((1, 10, 10, 1))}) + result = sess.run( + horz_gradients, feed_dict={ + 
image: np.ones((1, 10, 10, 1)) + }) expected = np.zeros((1, 10, 9, 1)) self.assertAllEqual(result, expected) @@ -1299,7 +1308,8 @@ class DenseToSparseTest(test.TestCase): expected_constant = np.reshape(np.arange(24, dtype=np.int64), (3, 4, 2)) tensor = constant_op.constant(expected_constant) sparse = _layers.dense_to_sparse(tensor) - dense = sparse_ops.sparse_to_dense(sparse.indices, sparse.dense_shape, sparse.values) + dense = sparse_ops.sparse_to_dense( + sparse.indices, sparse.dense_shape, sparse.values) with self.test_session() as sess: constant = sess.run(dense) self.assertAllEqual(expected_constant, constant) @@ -1418,8 +1428,7 @@ class FlattenTest(test.TestCase): with ops.Graph().as_default() as g, self.test_session(g): inputs = array_ops.placeholder(dtype=dtypes.float32) inputs.set_shape(tensor_shape.TensorShape((5,))) - with self.assertRaisesRegexp(ValueError, - 'incompatible with the layer'): + with self.assertRaisesRegexp(ValueError, 'incompatible with the layer'): _layers.flatten(inputs) def testUnknownLastDim(self): @@ -1729,7 +1738,9 @@ class FCTest(test.TestCase): with arg_scope( [_layers.fully_connected], normalizer_fn=_layers.batch_norm, - normalizer_params={'decay': 0.9}): + normalizer_params={ + 'decay': 0.9 + }): net = _layers.fully_connected(images, 27) net = _layers.fully_connected(net, 27) self.assertEqual(len(variables.get_variables()), 8) @@ -1745,7 +1756,9 @@ class FCTest(test.TestCase): with arg_scope( [_layers.fully_connected], normalizer_fn=_layers.batch_norm, - normalizer_params={'decay': 0.9}): + normalizer_params={ + 'decay': 0.9 + }): net = _layers.fully_connected(images, 27, scope='fc1') net = _layers.fully_connected(net, 27, scope='fc1', reuse=True) self.assertEqual(len(variables.get_variables()), 4) @@ -1762,8 +1775,8 @@ class BatchNormTest(test.TestCase): def testBatchNormCenterFalse(self): a = array_ops.placeholder(dtype=dtypes.float32, shape=(10, 10, 10, 10)) # Test that center=False builds a valid graph. 
- _layers.batch_norm(a, center=False, data_format='NCHW', - zero_debias_moving_mean=True) + _layers.batch_norm( + a, center=False, data_format='NCHW', zero_debias_moving_mean=True) def testUnknownShape(self): with ops.Graph().as_default() as g, self.test_session(g): @@ -1800,8 +1813,8 @@ class BatchNormTest(test.TestCase): images = np.random.uniform(size=(5, height, width, 3)).astype( dtype.as_numpy_dtype) output = _layers.batch_norm(images, fused=fused) - expected_name = ('BatchNorm/FusedBatchNorm' if fused else - 'BatchNorm/batchnorm') + expected_name = ('BatchNorm/FusedBatchNorm' + if fused else 'BatchNorm/batchnorm') self.assertTrue(output.op.name.startswith(expected_name)) self.assertListEqual(output.get_shape().as_list(), [5, height, width, 3]) self.assertEqual( @@ -2020,8 +2033,8 @@ class BatchNormTest(test.TestCase): expected_var = np.var(image_values, axis=axis) if fused: # Add Bessel's correction - expected_var, _ = self._addBesselsCorrection(batch_size * height * - width, expected_var) + expected_var, _ = self._addBesselsCorrection( + batch_size * height * width, expected_var) images = constant_op.constant( image_values, shape=image_shape, dtype=dtypes.float32) output = _layers.batch_norm( @@ -2540,8 +2553,8 @@ class BatchNormTest(test.TestCase): expected_var = np.var(image_values, axis=axis) if fused: # Add Bessel's correction - expected_var, _ = self._addBesselsCorrection(batch_size * height * - width, expected_var) + expected_var, _ = self._addBesselsCorrection( + batch_size * height * width, expected_var) images = constant_op.constant( image_values, shape=image_shape, dtype=dtypes.float32) output = _layers.batch_norm( @@ -2571,8 +2584,9 @@ class BatchNormTest(test.TestCase): np_output, new_images_gradients = sess.run([output, images_gradients]) # The outputs should be close to 0.0 mean and 1.0 variance self.assertAllClose( - np.mean( - np_output, axis=axis), [0] * channels, rtol=0.001, atol=0.001) + np.mean(np_output, axis=axis), [0] * channels, + rtol=0.001, + atol=0.001) self.assertAllClose( np.var(np_output, axis=axis), [1] * channels, rtol=0.01, atol=0.01) # The gradients should change slowly while updating moving_mean. 
@@ -2600,14 +2614,14 @@ class BatchNormTest(test.TestCase): channels = 3 with self.test_session() as sess: images = (np.ones((5, height, width, channels)) * 9.0).astype('f') - beta = init_ops.constant_initializer((np.ones(channels) * 5.0).astype( - 'f')) - gamma = init_ops.constant_initializer((np.ones(channels) * 2.0).astype( - 'f')) - mean = init_ops.constant_initializer((np.ones(channels) * 5.0).astype( - 'f')) - variance = init_ops.constant_initializer((np.ones(channels) * 4.0).astype( - 'f')) + beta = init_ops.constant_initializer( + (np.ones(channels) * 5.0).astype('f')) + gamma = init_ops.constant_initializer( + (np.ones(channels) * 2.0).astype('f')) + mean = init_ops.constant_initializer( + (np.ones(channels) * 5.0).astype('f')) + variance = init_ops.constant_initializer( + (np.ones(channels) * 4.0).astype('f')) output = _layers.batch_norm( images, is_training=False, @@ -2628,21 +2642,18 @@ class BatchNormTest(test.TestCase): with self.test_session(use_gpu=True) as sess: images = np.arange(np.product(shape), dtype=np.float32).reshape(shape) beta = init_ops.constant_initializer( - np.arange( - 2, channels + 2, dtype=np.float32)) + np.arange(2, channels + 2, dtype=np.float32)) gamma = init_ops.constant_initializer( - np.arange( - 10, channels + 10, dtype=np.float32) * 2.0) + np.arange(10, channels + 10, dtype=np.float32) * 2.0) mean = init_ops.constant_initializer( - np.arange( - 3, channels + 3, dtype=np.float32) * 5.0) + np.arange(3, channels + 3, dtype=np.float32) * 5.0) variance = init_ops.constant_initializer( - np.arange( - 1, channels + 1, dtype=np.float32) * 4.0) + np.arange(1, channels + 1, dtype=np.float32) * 4.0) if data_format == 'NCHW': # Reshape inputs from NHWC to NCHW format. images = array_ops.transpose( - images, [0, len(shape) - 1] + list(range(1, len(shape) - 1))) + images, [0, len(shape) - 1] + list(range(1, + len(shape) - 1))) output = _layers.batch_norm( images, is_training=is_training, @@ -2745,16 +2756,16 @@ class BatchNormTest(test.TestCase): # Tests that the adjustment is appropriately passed to and used by the core # BN layer. all_adjustments = [] + def _create_adjustment(shape): adjustments = [array_ops.ones(shape[-1:]), array_ops.zeros(shape[-1:])] all_adjustments.extend(adjustments) return adjustments + depth = 8 images = array_ops.zeros([10, 5, 5, depth]) output = _layers.batch_norm( - images, - is_training=True, - adjustment=_create_adjustment) + images, is_training=True, adjustment=_create_adjustment) self.assertListEqual(output.shape.as_list(), images.shape.as_list()) self.assertEqual(len(all_adjustments), 2) self.assertListEqual(all_adjustments[0].shape.as_list(), [depth]) @@ -2819,7 +2830,10 @@ class LayerNormTest(test.TestCase): # output_train and output_eval should be the same. 
self.assertAllClose(sess.run([output_train]), sess.run([output_eval])) - def doOutputTest(self, input_shape, tol=1e-5, begin_norm_axis=1, + def doOutputTest(self, + input_shape, + tol=1e-5, + begin_norm_axis=1, dtype=dtypes.float64): expected_mean = np.zeros(input_shape[:begin_norm_axis]) expected_var = np.ones(input_shape[:begin_norm_axis]) @@ -2850,13 +2864,10 @@ class LayerNormTest(test.TestCase): # Layer-norm implemented in numpy eps = 1e-12 expected_out = ( - (gamma * ( - input_values - - np.mean(input_values, axis=moments_axis, keepdims=True)) - / np.sqrt( - eps - + np.var(input_values, axis=moments_axis, keepdims=True))) - + beta) + (gamma * (input_values - np.mean( + input_values, axis=moments_axis, keepdims=True)) / + np.sqrt(eps + np.var( + input_values, axis=moments_axis, keepdims=True))) + beta) self.assertAllClose(expected_mean, mean, atol=tol, rtol=tol) self.assertAllClose(expected_var, var, atol=tol) # The full computation gets a bigger tolerance @@ -2874,10 +2885,10 @@ class LayerNormTest(test.TestCase): def testOutput4DInputNormOnInnermostAxis(self): # Equivalent tests - self.doOutputTest((100, 10, 10, 3), begin_norm_axis=3, tol=1e-4, - dtype=dtypes.float64) - self.doOutputTest((100, 10, 10, 3), begin_norm_axis=-1, tol=1e-4, - dtype=dtypes.float64) + self.doOutputTest( + (100, 10, 10, 3), begin_norm_axis=3, tol=1e-4, dtype=dtypes.float64) + self.doOutputTest( + (100, 10, 10, 3), begin_norm_axis=-1, tol=1e-4, dtype=dtypes.float64) def testOutputSmallInput(self): self.doOutputTest((10, 10, 10, 30)) @@ -2914,7 +2925,7 @@ class GDNTest(test.TestCase): x = np.random.uniform(size=(1, 2, 3, 4)[:ndim]) y = self._runGDN(x, x.shape, False, 'channels_last') self.assertEqual(x.shape, y.shape) - self.assertAllClose(y, x / np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6) + self.assertAllClose(y, x / np.sqrt(1 + .1 * (x**2)), rtol=0, atol=1e-6) def testChannelsFirst(self): # `bias_add` doesn't support NCHW on CPU. 
@@ -2923,8 +2934,7 @@ class GDNTest(test.TestCase): x = np.random.uniform(size=(4, 3, 2, 1)[:ndim]) y = self._runGDN(x, x.shape, False, 'channels_first') self.assertEqual(x.shape, y.shape) - self.assertAllClose( - y, x / np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6) + self.assertAllClose(y, x / np.sqrt(1 + .1 * (x**2)), rtol=0, atol=1e-6) def testWrongDims(self): for ndim in [1, 2, 6]: @@ -2936,7 +2946,7 @@ class GDNTest(test.TestCase): x = np.random.uniform(size=(1, 2, 3, 4)) y = self._runGDN(x, x.shape, True, 'channels_last') self.assertEqual(x.shape, y.shape) - self.assertAllClose(y, x * np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6) + self.assertAllClose(y, x * np.sqrt(1 + .1 * (x**2)), rtol=0, atol=1e-6) class MaxPool2DTest(test.TestCase): @@ -3013,20 +3023,22 @@ class MaxPool3DTest(test.TestCase): def testInvalidDataFormat(self): depth, height, width = 3, 6, 9 images = np.random.uniform(size=(5, depth, height, width, 3)) - with self.assertRaisesRegexp(ValueError, - 'data_format has to be either NCDHW or NDHWC.'): + with self.assertRaisesRegexp( + ValueError, 'data_format has to be either NCDHW or NDHWC.'): _layers.max_pool3d(images, [3, 3, 3], data_format='CDHWN') def testCreateMaxPool(self): depth, height, width = 3, 6, 9 - images = np.random.uniform(size=(5, depth, height, width, 3)).astype(np.float32) + images = np.random.uniform(size=(5, depth, height, width, 3)).astype( + np.float32) output = _layers.max_pool3d(images, [3, 3, 3]) self.assertEqual(output.op.name, 'MaxPool3D/MaxPool3D') self.assertListEqual(output.get_shape().as_list(), [5, 1, 2, 4, 3]) def testCreateMaxPoolNCDHW(self): depth, height, width = 3, 6, 9 - images = np.random.uniform(size=(5, 3, depth, height, width)).astype(np.float32) + images = np.random.uniform(size=(5, 3, depth, height, width)).astype( + np.float32) output = _layers.max_pool3d(images, [3, 3, 3], data_format='NCDHW') self.assertEquals(output.op.name, 'MaxPool3D/transpose_1') self.assertListEqual(output.get_shape().as_list(), [5, 3, 1, 2, 4]) @@ -3034,7 +3046,8 @@ class MaxPool3DTest(test.TestCase): def testCollectOutputs(self): depth, height, width = 3, 6, 9 images = random_ops.random_uniform((5, depth, height, width, 3), seed=1) - output = _layers.max_pool3d(images, [3, 3, 3], outputs_collections='outputs') + output = _layers.max_pool3d( + images, [3, 3, 3], outputs_collections='outputs') output_collected = ops.get_collection('outputs')[0] self.assertEqual(output_collected.aliases, ['MaxPool3D']) self.assertEqual(output_collected, output) @@ -3069,7 +3082,8 @@ class MaxPool3DTest(test.TestCase): depth, height, width = 3, 6, 9 images = random_ops.random_uniform((5, depth, height, width, 3), seed=1) output = _layers.max_pool3d(images, [3, 3, 3], stride=1, padding='SAME') - self.assertListEqual(output.get_shape().as_list(), [5, depth, height, width, 3]) + self.assertListEqual(output.get_shape().as_list(), + [5, depth, height, width, 3]) def testGlobalMaxPool(self): depth, height, width = 3, 6, 9 @@ -3481,8 +3495,7 @@ class SpatialSoftmaxTests(test.TestCase): sess.run(variables_lib.global_variables_initializer()) feed_dict = {features: np_features} keypoints = sess.run(spatial_softmax, feed_dict) - self.assertAllEqual(keypoints.shape, - (batch_shape[0], batch_shape[3] * 2)) + self.assertAllEqual(keypoints.shape, (batch_shape[0], batch_shape[3] * 2)) def testSpatialSoftmaxShapeNCHW(self): batch_shape = (2, 2, 35, 35) @@ -3493,8 +3506,7 @@ class SpatialSoftmaxTests(test.TestCase): sess.run(variables_lib.global_variables_initializer()) feed_dict = 
{features: np_features} keypoints = sess.run(spatial_softmax, feed_dict) - self.assertAllEqual(keypoints.shape, - (batch_shape[0], batch_shape[1] * 2)) + self.assertAllEqual(keypoints.shape, (batch_shape[0], batch_shape[1] * 2)) def testTwoMaxActivationsSameChannel(self): batch_size, height, width, nchannels = (2, 35, 35, 1) @@ -3513,8 +3525,8 @@ class SpatialSoftmaxTests(test.TestCase): x_loc = [avg_x] y_loc = [avg_y] - np_keypoints = self._SpatialSoftmax( - x_loc, y_loc, height, width, batch_size, nchannels) + np_keypoints = self._SpatialSoftmax(x_loc, y_loc, height, width, batch_size, + nchannels) # Make sure expected location keypoints matches actual location keypoints. with self.test_session() as sess: @@ -3532,13 +3544,13 @@ class SpatialSoftmaxTests(test.TestCase): spatial_softmax = _layers.spatial_softmax(features) np_features = np.zeros(batch_shape, dtype=np.float32) - edges = [(0, 0), (0, width-1), (height-1, 0), (height-1, width-1)] + edges = [(0, 0), (0, width - 1), (height - 1, 0), (height - 1, width - 1)] x_loc, y_loc = zip(*edges) for c in range(nchannels): np_features[:, x_loc[c], y_loc[c], c] = 100. - np_keypoints = self._SpatialSoftmax( - x_loc, y_loc, height, width, batch_size, nchannels) + np_keypoints = self._SpatialSoftmax(x_loc, y_loc, height, width, batch_size, + nchannels) # Make sure expected location keypoints matches actual location keypoints. with self.test_session() as sess: @@ -3567,10 +3579,10 @@ class SpatialSoftmaxTests(test.TestCase): np_features1[:, x_loc[c], y_loc[c], c] = 100. np_features2[:, x_loc[c], y_loc[c], c] = 100. - np_keypoints1 = self._SpatialSoftmax( - x_loc, y_loc, height1, width1, batch_size, nchannels) - np_keypoints2 = self._SpatialSoftmax( - x_loc, y_loc, height2, width2, batch_size, nchannels) + np_keypoints1 = self._SpatialSoftmax(x_loc, y_loc, height1, width1, + batch_size, nchannels) + np_keypoints2 = self._SpatialSoftmax(x_loc, y_loc, height2, width2, + batch_size, nchannels) # Make sure expected location keypoints matches actual location keypoints. with self.test_session() as sess: @@ -3596,8 +3608,8 @@ class SpatialSoftmaxTests(test.TestCase): for c in range(nchannels): np_features[:, x_loc[c], y_loc[c], c] = 100. - np_keypoints = self._SpatialSoftmax( - x_loc, y_loc, height, width, batch_size, nchannels) + np_keypoints = self._SpatialSoftmax(x_loc, y_loc, height, width, batch_size, + nchannels) # Make sure expected location keypoints matches actual location keypoints. with self.test_session() as sess: @@ -3619,8 +3631,8 @@ class SpatialSoftmaxTests(test.TestCase): for c in range(nchannels): np_features[:, c, x_loc[c], y_loc[c]] = 100. - np_keypoints = self._SpatialSoftmax( - x_loc, y_loc, height, width, batch_size, nchannels) + np_keypoints = self._SpatialSoftmax(x_loc, y_loc, height, width, batch_size, + nchannels) # Make sure expected location keypoints matches actual location keypoints. 
with self.test_session() as sess: @@ -3715,8 +3727,7 @@ class UnitNormTests(test.TestCase): image = random_ops.random_uniform((height, width, 3)) output = _layers.unit_norm(image, dim=dim, epsilon=1e-6) norms = math_ops.sqrt( - math_ops.reduce_sum( - math_ops.square(output), reduction_indices=dim)) + math_ops.reduce_sum(math_ops.square(output), reduction_indices=dim)) shape = [height, width, 3] del shape[dim] @@ -3752,8 +3763,7 @@ class UnitNormTests(test.TestCase): image = array_ops.placeholder(dtypes.float32, (None, None, 3)) output = _layers.unit_norm(image, dim=dim, epsilon=1e-6) norms = math_ops.sqrt( - math_ops.reduce_sum( - math_ops.square(output), reduction_indices=dim)) + math_ops.reduce_sum(math_ops.square(output), reduction_indices=dim)) with self.test_session(): actual = norms.eval({image: placeholder_value}) @@ -3817,8 +3827,8 @@ class PoincareNormalizeTest(test.TestCase): with self.test_session(): x_tf = constant_op.constant(x_np, name='x') y_tf = _layers.poincare_normalize(x_tf, dim) - err = gradient_checker.compute_gradient_error(x_tf, x_shape, - y_tf, x_shape) + err = gradient_checker.compute_gradient_error(x_tf, x_shape, y_tf, + x_shape) print('PoinCareNormalize gradient err = %g ' % err) self.assertLess(err, 1e-4) @@ -3830,14 +3840,9 @@ class LegacyFullyConnectedTest(test.TestCase): test.TestCase.setUp(self) random_seed.set_random_seed(1234) self.input = constant_op.constant([[1., 2., 3.], [-4., 15., -6.]]) - self.input_3_dim_arr = [[[1., 1.1, 1.2], - [2., 2.1, 2.2], - [3., 3.1, 3.2], - [4., 4.1, 4.2]], - [[5., 5.1, 5.2], - [6., 6.1, 6.2], - [7., 7.1, 7.2], - [8., 8.1, 8.2]]] + self.input_3_dim_arr = [[[1., 1.1, 1.2], [2., 2.1, 2.2], [3., 3.1, 3.2], + [4., 4.1, 4.2]], [[5., 5.1, 5.2], [6., 6.1, 6.2], + [7., 7.1, 7.2], [8., 8.1, 8.2]]] self.input_3_dim = constant_op.constant(self.input_3_dim_arr) assert not ops.get_collection(ops.GraphKeys.SUMMARIES) @@ -3932,15 +3937,10 @@ class LegacyFullyConnectedTest(test.TestCase): self._custom_initializers(self.input, 2, [[13.0, 13.0], [11.0, 11.0]]) def test_custom_initializers_multi_dim(self): - self._custom_initializers(self.input_3_dim, 2, - [[[7.6, 7.6], - [13.6, 13.6], - [19.6, 19.6], - [25.6, 25.6]], - [[31.6, 31.6], - [37.6, 37.6], - [43.6, 43.6], - [49.6, 49.6]]]) + self._custom_initializers( + self.input_3_dim, 2, + [[[7.6, 7.6], [13.6, 13.6], [19.6, 19.6], [25.6, 25.6]], + [[31.6, 31.6], [37.6, 37.6], [43.6, 43.6], [49.6, 49.6]]]) def test_custom_collections(self): layers_lib.legacy_relu( @@ -4050,12 +4050,16 @@ class LegacyFullyConnectedTest(test.TestCase): with self.test_session() as sess: variables_lib.global_variables_initializer().run() # we can feed in input with first dimension 2 - shape_value = sess.run(array_ops.shape(y), - feed_dict={x: self.input_3_dim_arr}) + shape_value = sess.run( + array_ops.shape(y), feed_dict={ + x: self.input_3_dim_arr + }) self.assertAllClose(shape_value, [2, 4, 1]) # we can feed in input with first dimension 1 - shape_value = sess.run(array_ops.shape(y), - feed_dict={x: [self.input_3_dim_arr[0]]}) + shape_value = sess.run( + array_ops.shape(y), feed_dict={ + x: [self.input_3_dim_arr[0]] + }) self.assertAllClose(shape_value, [1, 4, 1]) # we cannot feed in input with inconsistent dimensions with self.assertRaises(ValueError): diff --git a/tensorflow/contrib/learn/python/learn/datasets/base.py b/tensorflow/contrib/learn/python/learn/datasets/base.py index 71978d43944..18bf16e246b 100644 --- a/tensorflow/contrib/learn/python/learn/datasets/base.py +++ 
b/tensorflow/contrib/learn/python/learn/datasets/base.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Base utilities for loading datasets.""" from __future__ import absolute_import @@ -100,9 +99,7 @@ def load_iris(data_path=None): module_path = path.dirname(__file__) data_path = path.join(module_path, 'data', 'iris.csv') return load_csv_with_header( - data_path, - target_dtype=np.int, - features_dtype=np.float) + data_path, target_dtype=np.int, features_dtype=np.float) def load_boston(data_path=None): @@ -118,16 +115,10 @@ def load_boston(data_path=None): module_path = path.dirname(__file__) data_path = path.join(module_path, 'data', 'boston_house_prices.csv') return load_csv_with_header( - data_path, - target_dtype=np.float, - features_dtype=np.float) + data_path, target_dtype=np.float, features_dtype=np.float) -def retry(initial_delay, - max_delay, - factor=2.0, - jitter=0.25, - is_retriable=None): +def retry(initial_delay, max_delay, factor=2.0, jitter=0.25, is_retriable=None): """Simple decorator for wrapping retriable functions. Args: @@ -152,7 +143,7 @@ def retry(initial_delay, def delays(): delay = initial_delay while delay <= max_delay: - yield delay * random.uniform(1 - jitter, 1 + jitter) + yield delay * random.uniform(1 - jitter, 1 + jitter) delay *= factor def wrap(fn): @@ -172,7 +163,9 @@ def retry(initial_delay, else: raise return fn(*args, **kwargs) + return wrapped_fn + return wrap diff --git a/tensorflow/contrib/learn/python/learn/datasets/mnist.py b/tensorflow/contrib/learn/python/learn/datasets/mnist.py index 1f3295747e1..37f9175015a 100644 --- a/tensorflow/contrib/learn/python/learn/datasets/mnist.py +++ b/tensorflow/contrib/learn/python/learn/datasets/mnist.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Functions for downloading and reading MNIST data.""" from __future__ import absolute_import @@ -123,8 +122,8 @@ class DataSet(object): numpy.random.seed(seed1 if seed is None else seed2) dtype = dtypes.as_dtype(dtype).base_dtype if dtype not in (dtypes.uint8, dtypes.float32): - raise TypeError('Invalid image dtype %r, expected uint8 or float32' % - dtype) + raise TypeError( + 'Invalid image dtype %r, expected uint8 or float32' % dtype) if fake_data: self._num_examples = 10000 self.one_hot = one_hot @@ -202,7 +201,9 @@ class DataSet(object): end = self._index_in_epoch images_new_part = self._images[start:end] labels_new_part = self._labels[start:end] - return numpy.concatenate((images_rest_part, images_new_part), axis=0) , numpy.concatenate((labels_rest_part, labels_new_part), axis=0) + return numpy.concatenate( + (images_rest_part, images_new_part), axis=0), numpy.concatenate( + (labels_rest_part, labels_new_part), axis=0) else: self._index_in_epoch += batch_size end = self._index_in_epoch @@ -257,16 +258,14 @@ def read_data_sets(train_dir, test_labels = extract_labels(f, one_hot=one_hot) if not 0 <= validation_size <= len(train_images): - raise ValueError( - 'Validation size should be between 0 and {}. Received: {}.' - .format(len(train_images), validation_size)) + raise ValueError('Validation size should be between 0 and {}. Received: {}.' 
+ .format(len(train_images), validation_size)) validation_images = train_images[:validation_size] validation_labels = train_labels[:validation_size] train_images = train_images[validation_size:] train_labels = train_labels[validation_size:] - options = dict(dtype=dtype, reshape=reshape, seed=seed) train = DataSet(train_images, train_labels, **options) diff --git a/tensorflow/contrib/learn/python/learn/datasets/synthetic.py b/tensorflow/contrib/learn/python/learn/datasets/synthetic.py index 649996c49cc..9a843168c27 100644 --- a/tensorflow/contrib/learn/python/learn/datasets/synthetic.py +++ b/tensorflow/contrib/learn/python/learn/datasets/synthetic.py @@ -151,7 +151,7 @@ def spirals(n_samples=100, # Add more points if n_samples is not divisible by n_classes (unbalanced!) extras = n_samples % n_classes if extras > 0: - x_exrta, y_extra = _modes[mode](np.random.rand(extras) * 2 * np.pi, *args, + x_extra, y_extra = _modes[mode](np.random.rand(extras) * 2 * np.pi, *args, **kwargs) spir_x = np.append(spir_x, x_extra) spir_y = np.append(spir_y, y_extra) diff --git a/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py b/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py index 5340afab46e..5809995c8c7 100644 --- a/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py +++ b/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py @@ -24,12 +24,14 @@ from tensorflow.python.platform import test from tensorflow.contrib.learn.python.learn import datasets from tensorflow.contrib.learn.python.learn.datasets import synthetic + class SyntheticTest(test.TestCase): """Test synthetic dataset generation""" def test_make_dataset(self): """Test if the synthetic routine wrapper complains about the name""" - self.assertRaises(ValueError, datasets.make_dataset, name='_non_existing_name') + self.assertRaises( + ValueError, datasets.make_dataset, name='_non_existing_name') def test_all_datasets_callable(self): """Test if all methods inside the `SYNTHETIC` are callable""" @@ -52,9 +54,10 @@ class SyntheticTest(test.TestCase): """ n_samples = 100 n_classes = 2 - circ = synthetic.circles(n_samples = n_samples, noise = None, n_classes = n_classes) + circ = synthetic.circles( + n_samples=n_samples, noise=None, n_classes=n_classes) self.assertIsInstance(circ, datasets.base.Dataset) - self.assertTupleEqual(circ.data.shape, (n_samples,2)) + self.assertTupleEqual(circ.data.shape, (n_samples, 2)) self.assertTupleEqual(circ.target.shape, (n_samples,)) self.assertSetEqual(set(circ.target), set(range(n_classes))) @@ -67,17 +70,24 @@ class SyntheticTest(test.TestCase): """ seed = 42 noise = 0.1 - circ0 = synthetic.circles(n_samples = 100, noise = noise, n_classes = 2, seed = seed) - circ1 = synthetic.circles(n_samples = 100, noise = noise, n_classes = 2, seed = seed) + circ0 = synthetic.circles( + n_samples=100, noise=noise, n_classes=2, seed=seed) + circ1 = synthetic.circles( + n_samples=100, noise=noise, n_classes=2, seed=seed) np.testing.assert_array_equal(circ0.data, circ1.data) np.testing.assert_array_equal(circ0.target, circ1.target) - circ1 = synthetic.circles(n_samples = 100, noise = noise, n_classes = 2, seed = seed+1) - self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.data, circ1.data) - self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.target, circ1.target) + circ1 = synthetic.circles( + n_samples=100, noise=noise, n_classes=2, seed=seed + 1) + self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.data, + circ1.data) 
+ self.assertRaises(AssertionError, np.testing.assert_array_equal, + circ0.target, circ1.target) - circ1 = synthetic.circles(n_samples = 100, noise = noise/2., n_classes = 2, seed = seed) - self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.data, circ1.data) + circ1 = synthetic.circles( + n_samples=100, noise=noise / 2., n_classes=2, seed=seed) + self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.data, + circ1.data) def test_spirals(self): """Test if the circles are generated correctly @@ -89,13 +99,14 @@ class SyntheticTest(test.TestCase): - returned `target` shape is (n_samples,) - set of unique classes range is [0, n_classes) """ - self.assertRaises(ValueError, synthetic.spirals, mode='_unknown_mode_spiral_') + self.assertRaises( + ValueError, synthetic.spirals, mode='_unknown_mode_spiral_') n_samples = 100 modes = ('archimedes', 'bernoulli', 'fermat') for mode in modes: - spir = synthetic.spirals(n_samples = n_samples, noise = None, mode = mode) + spir = synthetic.spirals(n_samples=n_samples, noise=None, mode=mode) self.assertIsInstance(spir, datasets.base.Dataset) - self.assertTupleEqual(spir.data.shape, (n_samples,2)) + self.assertTupleEqual(spir.data.shape, (n_samples, 2)) self.assertTupleEqual(spir.target.shape, (n_samples,)) self.assertSetEqual(set(spir.target), set(range(2))) @@ -110,18 +121,24 @@ class SyntheticTest(test.TestCase): noise = 0.1 modes = ('archimedes', 'bernoulli', 'fermat') for mode in modes: - spir0 = synthetic.spirals(n_samples = 1000, noise = noise, seed = seed) - spir1 = synthetic.spirals(n_samples = 1000, noise = noise, seed = seed) + spir0 = synthetic.spirals(n_samples=1000, noise=noise, seed=seed) + spir1 = synthetic.spirals(n_samples=1000, noise=noise, seed=seed) np.testing.assert_array_equal(spir0.data, spir1.data) np.testing.assert_array_equal(spir0.target, spir1.target) - spir1 = synthetic.spirals(n_samples = 1000, noise = noise, seed = seed+1) - self.assertRaises(AssertionError, np.testing.assert_array_equal, spir0.data, spir1.data) - self.assertRaises(AssertionError, np.testing.assert_array_equal, spir0.target, spir1.target) + spir1 = synthetic.spirals(n_samples=1000, noise=noise, seed=seed + 1) + self.assertRaises(AssertionError, np.testing.assert_array_equal, + spir0.data, spir1.data) + self.assertRaises(AssertionError, np.testing.assert_array_equal, + spir0.target, spir1.target) - spir1 = synthetic.spirals(n_samples = 1000, noise = noise/2., seed = seed) - self.assertRaises(AssertionError, np.testing.assert_array_equal, spir0.data, spir1.data) + spir1 = synthetic.spirals(n_samples=1000, noise=noise / 2., seed=seed) + self.assertRaises(AssertionError, np.testing.assert_array_equal, + spir0.data, spir1.data) + + def test_spirals_synthetic(self): + synthetic.spirals(3) -if __name__ == "__main__": +if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/learn/python/learn/estimators/debug_test.py b/tensorflow/contrib/learn/python/learn/estimators/debug_test.py index 6b125534a42..b968aeed1b7 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/debug_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/debug_test.py @@ -44,7 +44,6 @@ from tensorflow.python.ops import math_ops from tensorflow.python.platform import test from tensorflow.python.training import input as input_lib - NUM_EXAMPLES = 100 N_CLASSES = 5 # Cardinality of multiclass labels. LABEL_DIMENSION = 3 # Dimensionality of regression labels. @@ -52,8 +51,10 @@ LABEL_DIMENSION = 3 # Dimensionality of regression labels. 
def _train_test_split(features_and_labels): features, labels = features_and_labels - train_set = (features[:int(len(features) / 2)], labels[:int(len(features) / 2)]) - test_set = (features[int(len(features) / 2):], labels[int(len(features) / 2):]) + train_set = (features[:int(len(features) / 2)], + labels[:int(len(features) / 2)]) + test_set = (features[int(len(features) / 2):], + labels[int(len(features) / 2):]) return train_set, test_set @@ -86,17 +87,17 @@ class DebugClassifierTest(test.TestCase): (train_features, train_labels), (test_features, test_labels) = _train_test_split( [self.features, self.labels]) - majority_class, _ = max(collections.Counter(train_labels).items(), - key=operator.itemgetter(1)) + majority_class, _ = max( + collections.Counter(train_labels).items(), key=operator.itemgetter(1)) expected_prediction = np.vstack( [[majority_class] for _ in range(test_labels.shape[0])]) classifier = debug.DebugClassifier(n_classes=N_CLASSES) - classifier.fit(input_fn=_input_fn_builder(train_features, train_labels), - steps=50) + classifier.fit( + input_fn=_input_fn_builder(train_features, train_labels), steps=50) - pred = classifier.predict_classes(input_fn=_input_fn_builder(test_features, - None)) + pred = classifier.predict_classes( + input_fn=_input_fn_builder(test_features, None)) self.assertAllEqual(expected_prediction, np.vstack(pred)) def testPredictBinary(self): @@ -105,34 +106,34 @@ class DebugClassifierTest(test.TestCase): test_labels) = _train_test_split( [self.features, self.binary_labels]) - majority_class, _ = max(collections.Counter(train_labels).items(), - key=operator.itemgetter(1)) + majority_class, _ = max( + collections.Counter(train_labels).items(), key=operator.itemgetter(1)) expected_prediction = np.vstack( [[majority_class] for _ in range(test_labels.shape[0])]) classifier = debug.DebugClassifier(n_classes=2) - classifier.fit(input_fn=_input_fn_builder(train_features, train_labels), - steps=50) + classifier.fit( + input_fn=_input_fn_builder(train_features, train_labels), steps=50) - pred = classifier.predict_classes(input_fn=_input_fn_builder(test_features, - None)) + pred = classifier.predict_classes( + input_fn=_input_fn_builder(test_features, None)) self.assertAllEqual(expected_prediction, np.vstack(pred)) - (train_features, train_labels), ( - test_features, test_labels) = _train_test_split( - [self.features, self.binary_float_labels]) + (train_features, + train_labels), (test_features, test_labels) = _train_test_split( + [self.features, self.binary_float_labels]) - majority_class, _ = max(collections.Counter(train_labels).items(), - key=operator.itemgetter(1)) + majority_class, _ = max( + collections.Counter(train_labels).items(), key=operator.itemgetter(1)) expected_prediction = np.vstack( [[majority_class] for _ in range(test_labels.shape[0])]) classifier = debug.DebugClassifier(n_classes=2) - classifier.fit(input_fn=_input_fn_builder(train_features, train_labels), - steps=50) + classifier.fit( + input_fn=_input_fn_builder(train_features, train_labels), steps=50) - pred = classifier.predict_classes(input_fn=_input_fn_builder(test_features, - None)) + pred = classifier.predict_classes( + input_fn=_input_fn_builder(test_features, None)) self.assertAllEqual(expected_prediction, np.vstack(pred)) def testPredictProba(self): @@ -150,8 +151,8 @@ class DebugClassifierTest(test.TestCase): [class_distribution for _ in range(test_labels.shape[0])]) classifier = debug.DebugClassifier(n_classes=N_CLASSES) - classifier.fit(input_fn=_input_fn_builder(train_features, 
train_labels), - steps=50) + classifier.fit( + input_fn=_input_fn_builder(train_features, train_labels), steps=50) pred = classifier.predict_proba( input_fn=_input_fn_builder(test_features, None)) @@ -173,17 +174,17 @@ class DebugClassifierTest(test.TestCase): [class_distribution for _ in range(test_labels.shape[0])]) classifier = debug.DebugClassifier(n_classes=2) - classifier.fit(input_fn=_input_fn_builder(train_features, train_labels), - steps=50) + classifier.fit( + input_fn=_input_fn_builder(train_features, train_labels), steps=50) pred = classifier.predict_proba( input_fn=_input_fn_builder(test_features, None)) self.assertAllClose(expected_prediction, np.vstack(pred), atol=0.1) - (train_features, train_labels), ( - test_features, test_labels) = _train_test_split( - [self.features, self.binary_float_labels]) + (train_features, + train_labels), (test_features, test_labels) = _train_test_split( + [self.features, self.binary_float_labels]) class_distribution = np.zeros((1, 2)) for label in train_labels: @@ -194,8 +195,8 @@ class DebugClassifierTest(test.TestCase): [class_distribution for _ in range(test_labels.shape[0])]) classifier = debug.DebugClassifier(n_classes=2) - classifier.fit(input_fn=_input_fn_builder(train_features, train_labels), - steps=50) + classifier.fit( + input_fn=_input_fn_builder(train_features, train_labels), steps=50) pred = classifier.predict_proba( input_fn=_input_fn_builder(test_features, None)) @@ -232,13 +233,12 @@ class DebugClassifierTest(test.TestCase): def _input_fn(): iris = test_data.prepare_iris_data_for_logistic_regression() return { - 'feature': constant_op.constant( - iris.data, dtype=dtypes.float32) + 'feature': constant_op.constant(iris.data, dtype=dtypes.float32) }, constant_op.constant( iris.target, shape=[100], dtype=dtypes.int32) - classifier = debug.DebugClassifier(config=run_config.RunConfig( - tf_random_seed=1)) + classifier = debug.DebugClassifier( + config=run_config.RunConfig(tf_random_seed=1)) classifier.fit(input_fn=_input_fn, steps=5) scores = classifier.evaluate(input_fn=_input_fn, steps=1) self.assertIn('loss', scores) @@ -342,8 +342,7 @@ class DebugClassifierTest(test.TestCase): def _input_fn(): iris = base.load_iris() return { - 'feature': constant_op.constant( - iris.data, dtype=dtypes.float32) + 'feature': constant_op.constant(iris.data, dtype=dtypes.float32) }, constant_op.constant( iris.target, shape=[150], dtype=dtypes.int32) @@ -387,7 +386,9 @@ class DebugClassifierTest(test.TestCase): # Create 4 rows, one of them (y = x), three of them (y=Not(x)) # The logistic prediction should be (y = 0.25). labels = constant_op.constant([[1], [0], [0], [0]]) - features = {'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),} + features = { + 'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32), + } return features, labels classifier = debug.DebugClassifier(n_classes=2) @@ -404,8 +405,7 @@ class DebugClassifierTest(test.TestCase): # The logistic prediction should be (y = 0.25). labels = constant_op.constant([[1.], [0.], [0.], [0.]]) features = { - 'x': array_ops.ones( - shape=[4, 1], dtype=dtypes.float32), + 'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32), 'w': constant_op.constant([[1.], [1.], [1.], [1.]]) } return features, labels @@ -414,8 +414,7 @@ class DebugClassifierTest(test.TestCase): # 4 rows, with different weights. 
labels = constant_op.constant([[1.], [0.], [0.], [0.]]) features = { - 'x': array_ops.ones( - shape=[4, 1], dtype=dtypes.float32), + 'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32), 'w': constant_op.constant([[7.], [1.], [1.], [1.]]) } return features, labels @@ -438,8 +437,7 @@ class DebugClassifierTest(test.TestCase): # than (y=Not(x)) due to the relative higher weight of the first row. labels = constant_op.constant([[1], [0], [0], [0]]) features = { - 'x': array_ops.ones( - shape=[4, 1], dtype=dtypes.float32), + 'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32), 'w': constant_op.constant([[100.], [3.], [2.], [2.]]) } return features, labels @@ -448,8 +446,7 @@ class DebugClassifierTest(test.TestCase): # Create 4 rows (y = x) labels = constant_op.constant([[1], [1], [1], [1]]) features = { - 'x': array_ops.ones( - shape=[4, 1], dtype=dtypes.float32), + 'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32), 'w': constant_op.constant([[1.], [1.], [1.], [1.]]) } return features, labels @@ -469,8 +466,7 @@ class DebugClassifierTest(test.TestCase): features = { 'x': input_lib.limit_epochs( - array_ops.ones( - shape=[4, 1], dtype=dtypes.float32), + array_ops.ones(shape=[4, 1], dtype=dtypes.float32), num_epochs=num_epochs), } return features, labels @@ -578,12 +574,11 @@ class DebugClassifierTest(test.TestCase): language = feature_column.sparse_column_with_hash_bucket('language', 100) feature_columns = [ feature_column.real_valued_column('age'), - feature_column.embedding_column( - language, dimension=1) + feature_column.embedding_column(language, dimension=1) ] - classifier = debug.DebugClassifier(config=run_config.RunConfig( - tf_random_seed=1)) + classifier = debug.DebugClassifier( + config=run_config.RunConfig(tf_random_seed=1)) classifier.fit(input_fn=input_fn, steps=5) def default_input_fn(unused_estimator, examples): @@ -614,8 +609,8 @@ class DebugRegressorTest(test.TestCase): classifier.fit( input_fn=_input_fn_builder(train_features, train_labels), steps=50) - pred = classifier.predict_scores(input_fn=_input_fn_builder(test_features, - None)) + pred = classifier.predict_scores( + input_fn=_input_fn_builder(test_features, None)) self.assertAllClose(expected_prediction, np.vstack(pred), atol=0.1) def testExperimentIntegration(self): @@ -698,7 +693,9 @@ class DebugRegressorTest(test.TestCase): # Create 4 rows, one of them (y = x), three of them (y=Not(x)) # The algorithm should learn (y = 0.25). 
labels = constant_op.constant([[1.], [0.], [0.], [0.]]) - features = {'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),} + features = { + 'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32), + } return features, labels regressor = debug.DebugRegressor( @@ -853,5 +850,6 @@ class DebugRegressorTest(test.TestCase): predictions2 = list(regressor2.predict_scores(input_fn=predict_input_fn)) self.assertAllClose(predictions, predictions2) + if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py index 12f9bba531a..2bd57597c2e 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py @@ -1224,7 +1224,7 @@ class DNNRegressorTest(test.TestCase): self, predictions, expected_shape): predictions_nparray = np.array(predictions) self.assertAllEqual(expected_shape, predictions_nparray.shape) - self.assertTrue(np.issubdtype(predictions_nparray.dtype, np.float)) + self.assertTrue(np.issubdtype(predictions_nparray.dtype, np.floating)) def testPredict_AsIterableFalse(self): """Tests predict method with as_iterable=False.""" diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py index 8d59fe66d98..63d0f1e1d45 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py @@ -600,7 +600,8 @@ class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable, input_fn=None, batch_size=None, outputs=None, - as_iterable=True): + as_iterable=True, + iterate_batches=False): """Returns predictions for given features. Args: @@ -616,6 +617,9 @@ class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable, for each example until inputs are exhausted. Note: The inputs must terminate if you want the iterable to terminate (e.g. be sure to pass num_epochs=1 if you are using something like read_batch_features). + iterate_batches: If True, yield the whole batch at once instead of + decomposing the batch into individual samples. Only relevant when + as_iterable is True. Returns: A numpy array of predicted classes or regression values if the @@ -635,7 +639,8 @@ class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable, input_fn=input_fn, feed_fn=feed_fn, outputs=outputs, - as_iterable=as_iterable) + as_iterable=as_iterable, + iterate_batches=iterate_batches) def get_variable_value(self, name): """Returns value of the variable given by name. 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py index 9d7c1a099aa..d4a46b41d0c 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py @@ -41,7 +41,6 @@ from tensorflow.python.platform import test from tensorflow.python.training import input as input_lib from tensorflow.python.training import queue_runner_impl - _BOSTON_INPUT_DIM = 13 _IRIS_INPUT_DIM = 4 @@ -93,8 +92,8 @@ def boston_eval_fn(): constant_op.constant(boston.data), [n_examples, _BOSTON_INPUT_DIM]) labels = array_ops.reshape( constant_op.constant(boston.target), [n_examples, 1]) - return array_ops.concat([features, features], 0), array_ops.concat( - [labels, labels], 0) + return array_ops.concat([features, features], + 0), array_ops.concat([labels, labels], 0) def extract(data, key): @@ -129,7 +128,10 @@ def linear_model_fn(features, labels, mode): (_, features), = features.items() prediction, loss = (models.linear_regression_zero_init(features, labels)) train_op = optimizers.optimize_loss( - loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1) + loss, + training_util.get_global_step(), + optimizer='Adagrad', + learning_rate=0.1) return prediction, loss, train_op @@ -139,7 +141,10 @@ def linear_model_fn_with_model_fn_ops(features, labels, mode): model_fn.ModeKeys.INFER) prediction, loss = (models.linear_regression_zero_init(features, labels)) train_op = optimizers.optimize_loss( - loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1) + loss, + training_util.get_global_step(), + optimizer='Adagrad', + learning_rate=0.1) return model_fn.ModelFnOps( mode=mode, predictions=prediction, loss=loss, train_op=train_op) @@ -150,7 +155,10 @@ def logistic_model_no_mode_fn(features, labels): labels = array_ops.one_hot(labels, 3, 1, 0) prediction, loss = (models.logistic_regression_zero_init(features, labels)) train_op = optimizers.optimize_loss( - loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1) + loss, + training_util.get_global_step(), + optimizer='Adagrad', + learning_rate=0.1) return { 'class': math_ops.argmax(prediction, 1), 'prob': prediction @@ -173,7 +181,9 @@ class EstimatorInputTest(test.TestCase): scores = est.evaluate( x=boston_input, y=float64_target, - metrics={'MSE': metric_ops.streaming_mean_squared_error}) + metrics={ + 'MSE': metric_ops.streaming_mean_squared_error + }) del est # Create another estimator object with the same output dir. 
est2 = estimator.Estimator(model_fn=linear_model_fn, model_dir=output_dir) @@ -182,7 +192,9 @@ class EstimatorInputTest(test.TestCase): scores2 = est2.evaluate( x=boston_input, y=float64_target, - metrics={'MSE': metric_ops.streaming_mean_squared_error}) + metrics={ + 'MSE': metric_ops.streaming_mean_squared_error + }) self.assertAllClose(scores2['MSE'], scores['MSE']) predictions = np.array(list(est2.predict(x=boston_input))) other_score = _sklearn.mean_squared_error(predictions, @@ -197,7 +209,9 @@ class EstimatorInputTest(test.TestCase): scores = est.score( x=boston.data, y=float64_labels, - metrics={'MSE': metric_ops.streaming_mean_squared_error}) + metrics={ + 'MSE': metric_ops.streaming_mean_squared_error + }) predictions = np.array(list(est.predict(x=boston.data))) other_score = _sklearn.mean_squared_error(predictions, boston.target) self.assertAllClose(scores['MSE'], other_score) @@ -213,7 +227,9 @@ class EstimatorInputTest(test.TestCase): scores = est.evaluate( x=boston_input, y=float64_target, - metrics={'MSE': metric_ops.streaming_mean_squared_error}) + metrics={ + 'MSE': metric_ops.streaming_mean_squared_error + }) predictions = np.array(list(est.predict(x=boston_input))) other_score = _sklearn.mean_squared_error(predictions, boston.target) self.assertAllClose(other_score, scores['MSE']) @@ -228,14 +244,15 @@ class EstimatorInputTest(test.TestCase): scores = est.score( x=iris.data, y=iris.target, - metrics={('accuracy', 'class'): metric_ops.streaming_accuracy}) + metrics={ + ('accuracy', 'class'): metric_ops.streaming_accuracy + }) predictions = est.predict(x=iris.data) predictions_class = est.predict(x=iris.data, outputs=['class'])['class'] self.assertEqual(predictions['prob'].shape[0], iris.target.shape[0]) self.assertAllClose(predictions['class'], predictions_class) - self.assertAllClose( - predictions['class'], np.argmax( - predictions['prob'], axis=1)) + self.assertAllClose(predictions['class'], + np.argmax(predictions['prob'], axis=1)) other_score = _sklearn.accuracy_score(iris.target, predictions['class']) self.assertAllClose(scores['accuracy'], other_score) self.assertTrue('global_step' in scores) @@ -250,17 +267,18 @@ class EstimatorInputTest(test.TestCase): scores = est.evaluate( x=iris_data, y=iris_target, - metrics={('accuracy', 'class'): metric_ops.streaming_accuracy}) + metrics={ + ('accuracy', 'class'): metric_ops.streaming_accuracy + }) predictions = list(est.predict(x=iris_data)) predictions_class = list(est.predict(x=iris_data, outputs=['class'])) self.assertEqual(len(predictions), iris.target.shape[0]) classes_batch = np.array([p['class'] for p in predictions]) self.assertAllClose(classes_batch, np.array([p['class'] for p in predictions_class])) - self.assertAllClose( - classes_batch, - np.argmax( - np.array([p['prob'] for p in predictions]), axis=1)) + self.assertAllClose(classes_batch, + np.argmax( + np.array([p['prob'] for p in predictions]), axis=1)) other_score = _sklearn.accuracy_score(iris.target, classes_batch) self.assertAllClose(other_score, scores['accuracy']) self.assertTrue('global_step' in scores) diff --git a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py index 656d68b7688..ac2d10011e2 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py @@ -57,7 +57,10 @@ def _logistic_regression_model_fn(features, labels, mode): 
predictions = math_ops.sigmoid(logits) loss = losses.sigmoid_cross_entropy(labels, logits) train_op = optimizers.optimize_loss( - loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1) + loss, + training_util.get_global_step(), + optimizer='Adagrad', + learning_rate=0.1) return predictions, loss, train_op diff --git a/tensorflow/contrib/learn/python/learn/evaluable.py b/tensorflow/contrib/learn/python/learn/evaluable.py index 66e15265171..8f6cd39864b 100644 --- a/tensorflow/contrib/learn/python/learn/evaluable.py +++ b/tensorflow/contrib/learn/python/learn/evaluable.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """`Evaluable` interface.""" from __future__ import absolute_import @@ -59,9 +58,12 @@ class Evaluable(object): for which this evaluation was performed. Args: - x: Matrix of shape [n_samples, n_features...] or dictionary of many matrices - containing the input samples for fitting the model. Can be iterator that returns - arrays of features or dictionary of array of features. If set, `input_fn` must + x: Matrix of shape [n_samples, n_features...] or dictionary of many + matrices + containing the input samples for fitting the model. Can be iterator that + returns + arrays of features or dictionary of array of features. If set, + `input_fn` must be `None`. y: Vector or matrix [n_samples] or [n_samples, n_outputs] containing the label values (class labels in classification, real numbers in diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py index 9576ff21c24..bec976afd27 100644 --- a/tensorflow/contrib/learn/python/learn/experiment.py +++ b/tensorflow/contrib/learn/python/learn/experiment.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== - """Experiment class collecting information needed for a single training run.""" from __future__ import absolute_import @@ -43,7 +42,6 @@ from tensorflow.python.training import saver from tensorflow.python.training import server_lib from tensorflow.python.util import compat - __all__ = ["Experiment"] @@ -278,8 +276,7 @@ class Experiment(object): self._train_steps_per_iteration = train_steps_per_iteration if (self._train_steps_per_iteration is not None and not isinstance(self._train_steps_per_iteration, int)): - raise ValueError( - "`train_steps_per_iteration` must be an integer.") + raise ValueError("`train_steps_per_iteration` must be an integer.") @property def estimator(self): @@ -359,9 +356,10 @@ class Experiment(object): config.cluster_spec and config.master): self._start_server() elif config.cluster_spec and config.master: - raise ValueError('For distributed runtime, Experiment class only works with' - 'tf.contrib.learn.RunConfig for now, but provided {}' - .format(type(config))) + raise ValueError( + "For distributed runtime, Experiment class only works with" + "tf.contrib.learn.RunConfig for now, but provided {}".format( + type(config))) extra_hooks = [] if delay_secs is None: @@ -414,11 +412,12 @@ class Experiment(object): logging.info("Waiting %d secs before starting eval.", delay_secs) time.sleep(delay_secs) - return self._call_evaluate(input_fn=self._eval_input_fn, - steps=self._eval_steps, - metrics=self._eval_metrics, - name=(name or "one_pass"), - hooks=self._eval_hooks) + return self._call_evaluate( + input_fn=self._eval_input_fn, + steps=self._eval_steps, + metrics=self._eval_metrics, + name=(name or "one_pass"), + hooks=self._eval_hooks) @deprecated( "2016-10-23", @@ -499,15 +498,12 @@ class Experiment(object): previous_path = None eval_result = None last_warning_time = 0 - while (not predicate_fn or - predicate_fn( - eval_result, - checkpoint_path=previous_path if eval_result else None)): + while (not predicate_fn or predicate_fn( + eval_result, checkpoint_path=previous_path if eval_result else None)): # Exit if we have already reached number of steps to train. if self._has_training_stopped(eval_result): logging.info("Exiting continuous eval, global_step=%s >= " - "train_step=%s", - eval_result[ops.GraphKeys.GLOBAL_STEP], + "train_step=%s", eval_result[ops.GraphKeys.GLOBAL_STEP], self._train_steps) return @@ -528,12 +524,13 @@ class Experiment(object): logging.warning(error_msg) last_warning_time = time.time() else: - eval_result = self._call_evaluate(input_fn=input_fn, - steps=self._eval_steps, - metrics=self._eval_metrics, - name=name, - checkpoint_path=latest_path, - hooks=self._eval_hooks) + eval_result = self._call_evaluate( + input_fn=input_fn, + steps=self._eval_steps, + metrics=self._eval_metrics, + name=name, + checkpoint_path=latest_path, + hooks=self._eval_hooks) # Ensure eval result is not None for next round of evaluation. 
if not eval_result: eval_result = {} @@ -558,8 +555,8 @@ class Experiment(object): return False global_step = eval_result.get(ops.GraphKeys.GLOBAL_STEP) - return global_step and self._train_steps and ( - global_step >= self._train_steps) + return global_step and self._train_steps and (global_step >= + self._train_steps) def continuous_eval(self, delay_secs=None, @@ -678,8 +675,7 @@ class Experiment(object): return eval_result, export_results @experimental - def continuous_train_and_eval(self, - continuous_eval_predicate_fn=None): + def continuous_train_and_eval(self, continuous_eval_predicate_fn=None): """Interleaves training and evaluation. The frequency of evaluation is controlled by the `train_steps_per_iteration` @@ -752,10 +748,9 @@ class Experiment(object): elif self._train_steps is not None: train_steps_per_iteration = int(self._train_steps / 10) - while (not predicate_fn or - predicate_fn( - eval_result, - checkpoint_path=latest_checkpoint if eval_result else None)): + while (not predicate_fn or predicate_fn( + eval_result, checkpoint_path=latest_checkpoint + if eval_result else None)): if self._has_training_stopped(eval_result): # Exits once max steps of training is satisfied. @@ -785,8 +780,7 @@ class Experiment(object): def _maybe_export(self, eval_result, checkpoint_path=None): """Export the Estimator using export_fn, if defined.""" export_dir_base = os.path.join( - compat.as_bytes(self._estimator.model_dir), - compat.as_bytes("export")) + compat.as_bytes(self._estimator.model_dir), compat.as_bytes("export")) export_results = [] for strategy in self._export_strategies: @@ -824,10 +818,11 @@ class Experiment(object): hooks=self._train_monitors, saving_listeners=self._saving_listeners) - eval_result = self._call_evaluate(input_fn=self._eval_input_fn, - steps=1, - metrics=self._eval_metrics, - name="one_pass") + eval_result = self._call_evaluate( + input_fn=self._eval_input_fn, + steps=1, + metrics=self._eval_metrics, + name="one_pass") _ = self._maybe_export(eval_result) return eval_result @@ -849,9 +844,14 @@ class Experiment(object): server.start() return server - def _call_train(self, _sentinel=None, # pylint: disable=invalid-name, - input_fn=None, steps=None, hooks=None, max_steps=None, - saving_listeners=None): + def _call_train( + self, + _sentinel=None, # pylint: disable=invalid-name, + input_fn=None, + steps=None, + hooks=None, + max_steps=None, + saving_listeners=None): if _sentinel is not None: raise ValueError("_call_train should be called with keyword args only") @@ -867,14 +867,18 @@ class Experiment(object): hooks=hooks, saving_listeners=saving_listeners) else: - return self._estimator.fit(input_fn=input_fn, - steps=steps, - max_steps=max_steps, - monitors=hooks) + return self._estimator.fit( + input_fn=input_fn, steps=steps, max_steps=max_steps, monitors=hooks) - def _call_evaluate(self, _sentinel=None, # pylint: disable=invalid-name, - input_fn=None, steps=None, metrics=None, name=None, - checkpoint_path=None, hooks=None): + def _call_evaluate( + self, + _sentinel=None, # pylint: disable=invalid-name, + input_fn=None, + steps=None, + metrics=None, + name=None, + checkpoint_path=None, + hooks=None): if _sentinel is not None: raise ValueError("_call_evaluate should be called with keyword args only") @@ -882,18 +886,20 @@ class Experiment(object): if metrics is not None: raise ValueError( "`eval_metrics` must be `None` with `tf.estimator.Estimator`") - return self._estimator.evaluate(input_fn=input_fn, - steps=steps, - name=name, - checkpoint_path=checkpoint_path, - 
hooks=hooks) + return self._estimator.evaluate( + input_fn=input_fn, + steps=steps, + name=name, + checkpoint_path=checkpoint_path, + hooks=hooks) else: - return self._estimator.evaluate(input_fn=input_fn, - steps=steps, - metrics=metrics, - name=name, - checkpoint_path=checkpoint_path, - hooks=hooks) + return self._estimator.evaluate( + input_fn=input_fn, + steps=steps, + metrics=metrics, + name=name, + checkpoint_path=checkpoint_path, + hooks=hooks) @contextlib.contextmanager diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py index f36a778b529..96be8b1bc40 100644 --- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py +++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py @@ -35,6 +35,7 @@ from tensorflow.python.platform import tf_logging as logging # pylint: disable=g-multiple-import,g-bad-import-order from .pandas_io import HAS_PANDAS, extract_pandas_data, extract_pandas_matrix, extract_pandas_labels from .dask_io import HAS_DASK, extract_dask_data, extract_dask_labels + # pylint: enable=g-multiple-import,g-bad-import-order @@ -74,11 +75,11 @@ def _get_in_out_shape(x_shape, y_shape, n_classes, batch_size=None): if not y_is_dict: output_shape = out_el_shape(y_shape, n_classes) else: - output_shape = dict([ - (k, out_el_shape(v, n_classes[k] - if n_classes is not None and k in n_classes else None)) - for k, v in list(y_shape.items()) - ]) + output_shape = dict([(k, + out_el_shape(v, n_classes[k] + if n_classes is not None and + k in n_classes else None)) + for k, v in list(y_shape.items())]) return input_shape, output_shape, batch_size @@ -314,23 +315,23 @@ class DataFeeder(object): input_dtype: DType of input (or dictionary of shapes). output_dtype: DType of output (or dictionary of shapes. """ - x_is_dict, y_is_dict = isinstance(x, dict), y is not None and isinstance( - y, dict) + x_is_dict, y_is_dict = isinstance( + x, dict), y is not None and isinstance(y, dict) if isinstance(y, list): y = np.array(y) self._x = dict([(k, check_array(v, v.dtype)) for k, v in list(x.items()) ]) if x_is_dict else check_array(x, x.dtype) - self._y = None if y is None else ( - dict([(k, check_array(v, v.dtype)) for k, v in list(y.items())]) - if y_is_dict else check_array(y, y.dtype)) + self._y = None if y is None else (dict( + [(k, check_array(v, v.dtype)) for k, v in list(y.items())]) + if y_is_dict else check_array(y, y.dtype)) # self.n_classes is not None means we're converting raw target indices # to one-hot. if n_classes is not None: if not y_is_dict: - y_dtype = (np.int64 - if n_classes is not None and n_classes > 1 else np.float32) + y_dtype = ( + np.int64 if n_classes is not None and n_classes > 1 else np.float32) self._y = (None if y is None else check_array(y, dtype=y_dtype)) self.n_classes = n_classes @@ -352,8 +353,8 @@ class DataFeeder(object): # self._output_dtype == np.float32 when y is None self._output_dtype = ( dict([(k, _check_dtype(v.dtype)) for k, v in list(self._y.items())]) - if y_is_dict else ( - _check_dtype(self._y.dtype) if y is not None else np.float32)) + if y_is_dict else (_check_dtype(self._y.dtype) + if y is not None else np.float32)) # self.n_classes is None means we're passing in raw target indices if n_classes is not None and y_is_dict: @@ -478,8 +479,8 @@ class DataFeeder(object): # Assign input features from random indices. 
def extract(data, indices): - return (np.array(_access(data, indices)).reshape((indices.shape[0], 1)) if - len(data.shape) == 1 else _access(data, indices)) + return (np.array(_access(data, indices)).reshape((indices.shape[0], 1)) + if len(data.shape) == 1 else _access(data, indices)) # assign labels from random indices def assign_label(data, shape, dtype, n_classes, indices): @@ -511,16 +512,18 @@ class DataFeeder(object): feed_dict[self._epoch_placeholder.name] = [self.epoch] # Take next batch of indices. - x_len = list(self._x.values())[0].shape[ - 0] if x_is_dict else self._x.shape[0] + x_len = list( + self._x.values())[0].shape[0] if x_is_dict else self._x.shape[0] end = min(x_len, self.offset + self._batch_size) batch_indices = self.indices[self.offset:end] # adding input placeholder feed_dict.update( dict([(self._input_placeholder[k].name, extract(v, batch_indices)) - for k, v in list(self._x.items())]) if x_is_dict else - {self._input_placeholder.name: extract(self._x, batch_indices)}) + for k, v in list(self._x.items())]) if x_is_dict else { + self._input_placeholder.name: + extract(self._x, batch_indices) + }) # move offset and reset it if necessary self.offset += self._batch_size @@ -545,7 +548,8 @@ class DataFeeder(object): assign_label(v, shape, dtype, n_classes, batch_indices) }) else: - shape, dtype, n_classes = self.output_shape, self._output_dtype, self.n_classes + shape, dtype, n_classes = (self.output_shape, self._output_dtype, + self.n_classes) feed_dict.update({ self._output_placeholder.name: assign_label(self._y, shape, dtype, n_classes, batch_indices) @@ -621,8 +625,9 @@ class StreamingDataFeeder(DataFeeder): elif y is None: y_first_el_shape = None else: - y_first_el_shape = ([1] + list(y_first_el[0].shape if isinstance( - y_first_el, list) else y_first_el.shape)) + y_first_el_shape = ( + [1] + list(y_first_el[0].shape + if isinstance(y_first_el, list) else y_first_el.shape)) self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape( x_first_el_shape, y_first_el_shape, n_classes, batch_size) @@ -683,8 +688,8 @@ class StreamingDataFeeder(DataFeeder): if shape is None: return None elif isinstance(shape, dict): - return dict([(k, np.zeros(shape[k], dtype[k])) - for k in list(shape.keys())]) + return dict( + [(k, np.zeros(shape[k], dtype[k])) for k in list(shape.keys())]) else: return np.zeros(shape, dtype=dtype) diff --git a/tensorflow/contrib/learn/python/learn/monitors.py b/tensorflow/contrib/learn/python/learn/monitors.py index 0948dee7e2f..51381a7427c 100644 --- a/tensorflow/contrib/learn/python/learn/monitors.py +++ b/tensorflow/contrib/learn/python/learn/monitors.py @@ -879,7 +879,7 @@ class GraphDump(BaseMonitor): this_output = self.data[step] if step in self.data else {} other_output = other_dump.data[step] if step in other_dump.data else {} for key in this_output: - if not isinstance(key, str) and not isinstance(key, unicode): + if not isinstance(key, six.string_types): continue if key not in other_output: raise ValueError("%s missing at step %s.", (key, step)) diff --git a/tensorflow/contrib/learn/python/learn/trainable.py b/tensorflow/contrib/learn/python/learn/trainable.py index 972fec026f2..429b6040be2 100644 --- a/tensorflow/contrib/learn/python/learn/trainable.py +++ b/tensorflow/contrib/learn/python/learn/trainable.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== - """`Trainable` interface.""" from __future__ import absolute_import @@ -28,18 +27,31 @@ class Trainable(object): __metaclass__ = abc.ABCMeta @abc.abstractmethod - def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None, - monitors=None, max_steps=None): + def fit(self, + x=None, + y=None, + input_fn=None, + steps=None, + batch_size=None, + monitors=None, + max_steps=None): """Trains a model given training data `x` predictions and `y` labels. Args: - x: Matrix of shape [n_samples, n_features...] or the dictionary of Matrices. - Can be iterator that returns arrays of features or dictionary of arrays of features. - The training input samples for fitting the model. If set, `input_fn` must be `None`. - y: Vector or matrix [n_samples] or [n_samples, n_outputs] or the dictionary of same. - Can be iterator that returns array of labels or dictionary of array of labels. - The training label values (class labels in classification, real numbers in regression). - If set, `input_fn` must be `None`. Note: For classification, label values must + x: Matrix of shape [n_samples, n_features...] or the dictionary of + Matrices. + Can be iterator that returns arrays of features or dictionary of arrays + of features. + The training input samples for fitting the model. If set, `input_fn` + must be `None`. + y: Vector or matrix [n_samples] or [n_samples, n_outputs] or the + dictionary of same. + Can be iterator that returns array of labels or dictionary of array of + labels. + The training label values (class labels in classification, real numbers + in regression). + If set, `input_fn` must be `None`. Note: For classification, label + values must be integers representing the class index (i.e. values from 0 to n_classes-1). input_fn: Input function returning a tuple of: diff --git a/tensorflow/contrib/linear_optimizer/BUILD b/tensorflow/contrib/linear_optimizer/BUILD index fe2f183ac97..cea3627ed56 100644 --- a/tensorflow/contrib/linear_optimizer/BUILD +++ b/tensorflow/contrib/linear_optimizer/BUILD @@ -126,6 +126,7 @@ py_library( py_test( name = "sdca_estimator_test", srcs = ["python/sdca_estimator_test.py"], + shard_count = 4, srcs_version = "PY2AND3", deps = [ ":sdca_estimator_py", diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD index 13350c5a438..cc0e20f75ee 100644 --- a/tensorflow/contrib/lite/BUILD +++ b/tensorflow/contrib/lite/BUILD @@ -53,6 +53,8 @@ cc_test( srcs = ["arena_planner_test.cc"], deps = [ ":arena_planner", + "//tensorflow/contrib/lite/testing:util", + "//tensorflow/core:lib", "@com_google_googletest//:gtest", ], ) @@ -167,6 +169,7 @@ cc_test( deps = [ ":framework", ":string_util", + "//tensorflow/contrib/lite/testing:util", "@com_google_googletest//:gtest", ], ) diff --git a/tensorflow/contrib/lite/arena_planner_test.cc b/tensorflow/contrib/lite/arena_planner_test.cc index c27c327abc6..e10611e6d43 100644 --- a/tensorflow/contrib/lite/arena_planner_test.cc +++ b/tensorflow/contrib/lite/arena_planner_test.cc @@ -18,6 +18,8 @@ limitations under the License. 
#include #include +#include "tensorflow/contrib/lite/testing/util.h" +#include "tensorflow/core/platform/logging.h" namespace tflite { namespace { @@ -464,9 +466,7 @@ TEST_F(ArenaPlannerTest, LargerGraphAndStepwiseAllocation) { } // namespace tflite int main(int argc, char** argv) { - // ::tflite::LogToStderr(); - FLAGS_logtostderr = true; - + ::tflite::LogToStderr(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h index 8338fde8acb..a1037a525c3 100644 --- a/tensorflow/contrib/lite/builtin_op_data.h +++ b/tensorflow/contrib/lite/builtin_op_data.h @@ -116,25 +116,9 @@ typedef struct { } TfLiteAddParams; typedef struct { - // Number of spatial dimensions. - // For now only NHWC is supported, and the value should always be 2. - int num_spatial_dimensions; - // TODO(ahentz): We can't have dynamic data in this struct, at least not yet. - // For now we will fix the maximum possible number of dimensions. - int block_shape[2]; - int before_paddings[2]; - int after_paddings[2]; } TfLiteSpaceToBatchNDParams; typedef struct { - // Number of spatial dimensions. - // For now only NHWC is supported, and the value should always be 2. - int num_spatial_dimensions; - // TODO(ahentz): We can't have dynamic data in this struct, at least not yet. - // For now we will fix the maximum possible number of dimensions. - int block_shape[2]; - int before_crops[2]; - int after_crops[2]; } TfLiteBatchToSpaceNDParams; typedef struct { @@ -204,17 +188,9 @@ typedef struct { } TfLiteGatherParams; typedef struct { - // TODO(ahentz): We can't have dynamic data in this struct, at least not yet. - // For now we will fix the maximum possible number of dimensions. - int perm[8]; - int num_dimensions; } TfLiteTransposeParams; typedef struct { - // TODO(ahentz): We can't have dynamic data in this struct, at least not yet. - // For now we will fix the maximum possible number of dimensions. 
- int axis[8]; - int num_axis_dimensions; bool keep_dims; } TfLiteMeanParams; diff --git a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm index 10f31bb6f17..d74e275f043 100644 --- a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm +++ b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm @@ -225,14 +225,8 @@ static void GetTopN(const uint8_t* prediction, const int prediction_size, const assert(pixelBuffer != NULL); OSType sourcePixelFormat = CVPixelBufferGetPixelFormatType(pixelBuffer); - int doReverseChannels; - if (kCVPixelFormatType_32ARGB == sourcePixelFormat) { - doReverseChannels = 1; - } else if (kCVPixelFormatType_32BGRA == sourcePixelFormat) { - doReverseChannels = 0; - } else { - assert(false); // Unknown source format - } + assert(sourcePixelFormat == kCVPixelFormatType_32ARGB || + sourcePixelFormat == kCVPixelFormatType_32BGRA); const int sourceRowBytes = (int)CVPixelBufferGetBytesPerRow(pixelBuffer); const int image_width = (int)CVPixelBufferGetWidth(pixelBuffer); diff --git a/tensorflow/contrib/lite/examples/label_image/BUILD b/tensorflow/contrib/lite/examples/label_image/BUILD index 476d85c0314..d216cdf69ba 100644 --- a/tensorflow/contrib/lite/examples/label_image/BUILD +++ b/tensorflow/contrib/lite/examples/label_image/BUILD @@ -42,7 +42,10 @@ cc_library( "bitmap_helpers_impl.h", "label_image.h", ], - deps = ["//tensorflow/contrib/lite:string"], + deps = [ + "//tensorflow/contrib/lite:string", + "//tensorflow/contrib/lite/kernels:builtin_ops", + ], ) # TODO(ahentz): Test disabled as it has a memory leek from read_bmp diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h index 860e27e5ba9..471fda2ba46 100644 --- a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h +++ b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h @@ -26,15 +26,15 @@ uint8_t* read_bmp(const std::string& input_bmp_name, int* width, int* height, int* channels, Settings* s); template -void downsize(T* out, uint8_t* in, int image_height, int image_width, - int image_channels, int wanted_height, int wanted_width, - int wanted_channels, Settings* s); +void resize(T* out, uint8_t* in, int image_height, int image_width, + int image_channels, int wanted_height, int wanted_width, + int wanted_channels, Settings* s); // explicit instantiation -template void downsize(uint8_t*, unsigned char*, int, int, int, int, - int, int, Settings*); -template void downsize(float*, unsigned char*, int, int, int, int, int, - int, Settings*); +template void resize(uint8_t*, unsigned char*, int, int, int, int, + int, int, Settings*); +template void resize(float*, unsigned char*, int, int, int, int, int, + int, Settings*); } // namespace label_image } // namespace tflite diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h index 64a931082b0..33ea695dda8 100644 --- a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h +++ b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h @@ -16,30 +16,76 @@ limitations under the License. 
#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H #define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H +#include "tensorflow/contrib/lite/builtin_op_data.h" +#include "tensorflow/contrib/lite/interpreter.h" +#include "tensorflow/contrib/lite/kernels/register.h" +#include "tensorflow/contrib/lite/string_util.h" +#include "tensorflow/contrib/lite/version.h" + #include "tensorflow/contrib/lite/examples/label_image/label_image.h" namespace tflite { namespace label_image { template -void downsize(T* out, uint8_t* in, int image_height, int image_width, - int image_channels, int wanted_height, int wanted_width, - int wanted_channels, Settings* s) { - for (int y = 0; y < wanted_height; ++y) { - const int in_y = (y * image_height) / wanted_height; - uint8_t* in_row = in + (in_y * image_width * image_channels); - T* out_row = out + (y * wanted_width * wanted_channels); - for (int x = 0; x < wanted_width; ++x) { - const int in_x = (x * image_width) / wanted_width; - uint8_t* in_pixel = in_row + (in_x * image_channels); - T* out_pixel = out_row + (x * wanted_channels); - for (int c = 0; c < wanted_channels; ++c) { - if (s->input_floating) - out_pixel[c] = (in_pixel[c] - s->input_mean) / s->input_std; - else - out_pixel[c] = in_pixel[c]; - } - } +void resize(T* out, uint8_t* in, int image_height, int image_width, + int image_channels, int wanted_height, int wanted_width, + int wanted_channels, Settings* s) { + + int number_of_pixels = image_height * image_width * image_channels; + std::unique_ptr interpreter(new Interpreter); + + int base_index = 0; + + // two inputs: input and new_sizes + interpreter->AddTensors(2, &base_index); + // one output + interpreter->AddTensors(1, &base_index); + // set input and output tensors + interpreter->SetInputs({0, 1}); + interpreter->SetOutputs({2}); + + // set paramters of tensors + TfLiteQuantizationParams quant; + interpreter->SetTensorParametersReadWrite( + 0, kTfLiteFloat32, "input", + {1, image_height, image_width, image_channels}, quant); + interpreter->SetTensorParametersReadWrite(1, kTfLiteInt32, "new_size", {2}, + quant); + interpreter->SetTensorParametersReadWrite( + 2, kTfLiteFloat32, "output", + {1, wanted_height, wanted_width, wanted_channels}, quant); + + ops::builtin::BuiltinOpResolver resolver; + TfLiteRegistration* resize_op = + resolver.FindOp(BuiltinOperator_RESIZE_BILINEAR); + interpreter->AddNodeWithParameters({0, 1}, {2}, nullptr, 0, nullptr, + resize_op, nullptr); + + interpreter->AllocateTensors(); + + // fill input image + // in[] are integers, cannot do memcpy() directly + auto input = interpreter->typed_tensor(0); + for (int i = 0; i < number_of_pixels; i++) { + input[i] = in[i]; + } + + // fill new_sizes + interpreter->typed_tensor(1)[0] = wanted_height; + interpreter->typed_tensor(1)[1] = wanted_width; + + interpreter->Invoke(); + + auto output = interpreter->typed_tensor(2); + auto output_number_of_pixels = + wanted_height * wanted_height * wanted_channels; + + for (int i = 0; i < output_number_of_pixels; i++) { + if (s->input_floating) + out[i] = (output[i] - s->input_mean) / s->input_std; + else + out[i] = (uint8_t)output[i]; } } diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.cc b/tensorflow/contrib/lite/examples/label_image/label_image.cc index d7f49ad8757..a78900122ef 100644 --- a/tensorflow/contrib/lite/examples/label_image/label_image.cc +++ b/tensorflow/contrib/lite/examples/label_image/label_image.cc @@ -151,14 +151,14 @@ void RunInference(Settings* s) { 
switch (interpreter->tensor(input)->type) { case kTfLiteFloat32: s->input_floating = true; - downsize(interpreter->typed_tensor(input), in, - image_height, image_width, image_channels, - wanted_height, wanted_width, wanted_channels, s); + resize(interpreter->typed_tensor(input), in, + image_height, image_width, image_channels, + wanted_height, wanted_width, wanted_channels, s); break; case kTfLiteUInt8: - downsize(interpreter->typed_tensor(input), in, - image_height, image_width, image_channels, - wanted_height, wanted_width, wanted_channels, s); + resize(interpreter->typed_tensor(input), in, + image_height, image_width, image_channels, + wanted_height, wanted_width, wanted_channels, s); break; default: LOG(FATAL) << "cannot handle input type " diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc index 69a597dc5a2..a8db149eaae 100644 --- a/tensorflow/contrib/lite/interpreter.cc +++ b/tensorflow/contrib/lite/interpreter.cc @@ -36,6 +36,10 @@ constexpr const int kSlotsToReserve = 128; namespace tflite { // A trivial implementation of GraphInfo around the Interpreter. +// NOTE: this interpreter info represents the subset of the +// graph that is executed according to execution plan. Thus, +// the indices are execution plan indices rather than raw node +// indices. class InterpreterInfo : public GraphInfo { public: explicit InterpreterInfo(Interpreter* interpreter) @@ -45,9 +49,12 @@ class InterpreterInfo : public GraphInfo { TfLiteTensor* tensor(size_t index) override { return interpreter_->tensor(index); } - size_t num_nodes() const override { return interpreter_->nodes_size(); } + size_t num_nodes() const override { + return interpreter_->execution_plan().size(); + } const TfLiteNode& node(size_t index) const override { - return interpreter_->node_and_registration(index)->first; + int node_index = interpreter_->execution_plan()[index]; + return interpreter_->node_and_registration(node_index)->first; } const std::vector& inputs() const override { return interpreter_->inputs(); @@ -73,7 +80,7 @@ Interpreter::Interpreter(ErrorReporter* error_reporter) // Reserve some space for the tensors to avoid excessive resizing. 
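The InterpreterInfo note above changes what GraphInfo consumers (such as the memory planner) observe: they now see only the subset of nodes selected by the execution plan, addressed by plan position rather than raw node id. A minimal sketch of that indirection, assuming the GraphInfo interface shown in this hunk; the helper name and the graph_info.h include path are illustrative only and not part of the patch:

#include <cstdio>

#include "tensorflow/contrib/lite/graph_info.h"  // assumed header for GraphInfo

// Hypothetical helper, not in the patch: walks whatever subset of the graph
// the execution plan selects. With InterpreterInfo, node(i) resolves plan
// position i to nodes_and_registration_[execution_plan_[i]] internally.
void DumpPlannedNodes(const tflite::GraphInfo& info) {
  for (size_t i = 0; i < info.num_nodes(); ++i) {
    const TfLiteNode& node = info.node(i);
    std::printf("plan position %zu: %d inputs, %d outputs\n", i,
                node.inputs->size, node.outputs->size);
  }
}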
tensors_.reserve(kSlotsToReserve); nodes_and_registration_.reserve(kSlotsToReserve); - next_node_to_prepare_ = 0; + next_execution_plan_index_to_prepare_ = 0; UseNNAPI(false); } @@ -160,7 +167,7 @@ TfLiteIntArray* convertVectorToTfLiteIntArray(const std::vector& x) { } // namespace TfLiteStatus Interpreter::AllocateTensors() { - next_node_to_prepare_ = 0; + next_execution_plan_index_to_prepare_ = 0; if (memory_planner_) { TF_LITE_ENSURE_STATUS(memory_planner_->ResetAllocations()); } @@ -190,7 +197,8 @@ TfLiteStatus Interpreter::AddNodeWithParameters( &context_, CheckTensorIndices("node outputs", outputs.data(), outputs.size())); - if (node_index) *node_index = nodes_and_registration_.size(); + int new_node_index = nodes_and_registration_.size(); + if (node_index) *node_index = new_node_index; nodes_and_registration_.resize(nodes_and_registration_.size() + 1); auto& node_and_reg = nodes_and_registration_.back(); TfLiteNode& node = node_and_reg.first; @@ -213,6 +221,7 @@ TfLiteStatus Interpreter::AddNodeWithParameters( } node.builtin_data = builtin_data_deleter.release(); node_and_reg.second = *registration; + execution_plan_.push_back(new_node_index); return kTfLiteOk; } @@ -240,16 +249,19 @@ bool HasDynamicTensor(const TfLiteContext& context, return false; } -TfLiteStatus Interpreter::PrepareOpsStartingAt(int first_node, - int* last_node_prepared) { - for (int i = first_node; i < nodes_and_registration_.size(); i++) { - TfLiteNode& node = nodes_and_registration_[i].first; - const TfLiteRegistration& registration = nodes_and_registration_[i].second; +TfLiteStatus Interpreter::PrepareOpsStartingAt( + int first_execution_plan_index, int* last_execution_plan_index_prepared) { + for (int execution_plan_index = first_execution_plan_index; + execution_plan_index < execution_plan_.size(); execution_plan_index++) { + int node_index = execution_plan_[execution_plan_index]; + TfLiteNode& node = nodes_and_registration_[node_index].first; + const TfLiteRegistration& registration = + nodes_and_registration_[node_index].second; if (OpPrepare(registration, &node) == kTfLiteError) { return kTfLiteError; } - *last_node_prepared = i; + *last_execution_plan_index_prepared = execution_plan_index; // Discontinue if the node has dynamic outputs. 
Note that we don't // stop for dynamic temporary tensors since they won't affect the @@ -268,14 +280,14 @@ TfLiteStatus Interpreter::PrepareOpsAndTensors() { memory_planner_->PlanAllocations(); } - int last_node_prepared = 0; + int last_exec_plan_index_prepared = 0; - TF_LITE_ENSURE_STATUS( - PrepareOpsStartingAt(next_node_to_prepare_, &last_node_prepared)); + TF_LITE_ENSURE_STATUS(PrepareOpsStartingAt( + next_execution_plan_index_to_prepare_, &last_exec_plan_index_prepared)); TF_LITE_ENSURE_STATUS(memory_planner_->ExecuteAllocations( - next_node_to_prepare_, last_node_prepared)); + next_execution_plan_index_to_prepare_, last_exec_plan_index_prepared)); - next_node_to_prepare_ = last_node_prepared + 1; + next_execution_plan_index_to_prepare_ = last_exec_plan_index_prepared + 1; return kTfLiteOk; } @@ -291,7 +303,8 @@ TfLiteStatus Interpreter::Invoke() { TfLiteStatus status = kTfLiteOk; if (nnapi_delegate_) { - if (next_node_to_prepare_ == nodes_and_registration_.size()) { + TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors()); + if (next_execution_plan_index_to_prepare_ == execution_plan_.size()) { TF_LITE_ENSURE_OK(&context_, nnapi_delegate_->Invoke(this)); return kTfLiteOk; } else { @@ -311,13 +324,17 @@ TfLiteStatus Interpreter::Invoke() { // TODO(b/71913981): we should force recalculation in the presence of dynamic // tensors, because they may have new value which in turn may affect shapes // and allocations. - for (int i = 0; i < nodes_and_registration_.size(); i++) { - if (i == next_node_to_prepare_) { + for (int execution_plan_index = 0; + execution_plan_index < execution_plan_.size(); execution_plan_index++) { + if (execution_plan_index == next_execution_plan_index_to_prepare_) { TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors()); - TF_LITE_ENSURE(&context_, next_node_to_prepare_ >= i); + TF_LITE_ENSURE(&context_, next_execution_plan_index_to_prepare_ >= + execution_plan_index); } - TfLiteNode& node = nodes_and_registration_[i].first; - const TfLiteRegistration& registration = nodes_and_registration_[i].second; + int node_index = execution_plan_[execution_plan_index]; + TfLiteNode& node = nodes_and_registration_[node_index].first; + const TfLiteRegistration& registration = + nodes_and_registration_[node_index].second; if (OpInvoke(registration, &node) == kTfLiteError) { status = kTfLiteError; } @@ -421,6 +438,14 @@ TfLiteStatus Interpreter::SetTensorParametersReadWrite( return kTfLiteOk; } +TfLiteStatus Interpreter::SetExecutionPlan(const std::vector& new_plan) { + for (int node_index : new_plan) { + TF_LITE_ENSURE(&context_, node_index >= 0 && node_index < nodes_size()); + } + execution_plan_ = new_plan; + return kTfLiteOk; +} + TfLiteStatus Interpreter::ResizeTensorImpl(TfLiteTensor* tensor, TfLiteIntArray* new_size) { // Note that in theory we could resize kTfLiteArenaRwPersistent tensors too. diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h index 4f732769f9f..c822557d02c 100644 --- a/tensorflow/contrib/lite/interpreter.h +++ b/tensorflow/contrib/lite/interpreter.h @@ -108,7 +108,7 @@ class Interpreter { // Adds a node with the given parameters and returns the index of the new // node in `node_index` (optionally). Interpreter will take ownership of - // `builtin_data` and destroy it with `delete`. Ownership of 'init_data' + // `builtin_data` and destroy it with `free`. Ownership of 'init_data' // remains with the caller. 
TfLiteStatus AddNodeWithParameters(const std::vector& inputs, const std::vector& outputs, @@ -166,12 +166,19 @@ class Interpreter { // Return the number of ops in the model. int nodes_size() const { return nodes_and_registration_.size(); } + // WARNING: Experimental interface, subject to change + const std::vector& execution_plan() const { return execution_plan_; } + + // WARNING: Experimental interface, subject to change + // Overrides execution plan. This bounds checks indices sent in. + TfLiteStatus SetExecutionPlan(const std::vector& new_plan); + // Get a tensor data structure. // TODO(aselle): Create a safe ArrayHandle interface to avoid exposing this // read/write access to structure TfLiteTensor* tensor(int tensor_index) { if (tensor_index >= context_.tensors_size || tensor_index < 0) - return nullptr; + return nullptr; return &context_.tensors[tensor_index]; } @@ -279,7 +286,8 @@ class Interpreter { // dynamic tensors is found or all ops have been prepared. Fill // 'last_node_prepared' with the id of the op containing dynamic tensors, or // the last in the graph. - TfLiteStatus PrepareOpsStartingAt(int first_node, int* last_node_prepared); + TfLiteStatus PrepareOpsStartingAt(int first_execution_plan_index, + int* last_execution_plan_index_prepared); // Tensors needed by the interpreter. Use `AddTensors` to add more blank // tensor entries. Note, `tensors_.data()` needs to be synchronized to the @@ -354,7 +362,14 @@ class Interpreter { // node id, and execute the node to generate the output tensor before continue // to allocate successors. This process repeats until all nodes are executed. // NOTE: this relies on the order of nodes that is in topological order. - int next_node_to_prepare_; + int next_execution_plan_index_to_prepare_; + + // WARNING: This is an experimental interface that is subject to change. + // This is a list of node indices (to index into nodes_and_registration). + // This represents a valid topological sort (dependency ordered) execution + // plan. In particular, it is valid for this ordering to contain only a + // subset of the node indices. + std::vector execution_plan_; // Whether to delegate to NN API std::unique_ptr nnapi_delegate_; diff --git a/tensorflow/contrib/lite/interpreter_test.cc b/tensorflow/contrib/lite/interpreter_test.cc index edff2109430..cfda19d72cd 100644 --- a/tensorflow/contrib/lite/interpreter_test.cc +++ b/tensorflow/contrib/lite/interpreter_test.cc @@ -17,6 +17,7 @@ limitations under the License. 
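The interpreter.h hunk above exposes the experimental execution-plan override that the interpreter_test.cc changes below exercise. A minimal usage sketch under the same assumptions as the TestExecutionPlan fixture further down (a two-node graph already built and allocated elsewhere; the function name is hypothetical and not part of the patch):

#include "tensorflow/contrib/lite/interpreter.h"

// Hypothetical caller, not in the patch: restrict execution to node 1 only.
// SetExecutionPlan() bounds-checks the indices; Invoke() then walks the
// overridden plan instead of every registered node.
TfLiteStatus RunOnlySecondNode(tflite::Interpreter* interpreter) {
  TfLiteStatus status = interpreter->SetExecutionPlan({1});
  if (status != kTfLiteOk) return status;
  return interpreter->Invoke();
}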
#include #include "tensorflow/contrib/lite/error_reporter.h" #include "tensorflow/contrib/lite/string_util.h" +#include "tensorflow/contrib/lite/testing/util.h" namespace tflite { namespace { @@ -282,6 +283,51 @@ TEST(BasicInterpreter, NoOpInterpreter) { ASSERT_EQ(interpreter.Invoke(), kTfLiteOk); } +TEST(BasicInterpreter, ResizingTensors) { + Interpreter interpreter; + ASSERT_EQ(interpreter.AddTensors(1), kTfLiteOk); + ASSERT_EQ(interpreter.SetInputs({0}), kTfLiteOk); + ASSERT_EQ(interpreter.SetOutputs({0}), kTfLiteOk); + + ASSERT_EQ(interpreter.SetTensorParametersReadWrite( + 0, kTfLiteFloat32, "", {3}, TfLiteQuantizationParams()), + kTfLiteOk); + + int t = interpreter.inputs()[0]; + TfLiteTensor* tensor = interpreter.tensor(t); + + ASSERT_EQ(interpreter.ResizeInputTensor(t, {1, 2, 3}), kTfLiteOk); + EXPECT_EQ(tensor->bytes, 6 * sizeof(float)); + ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk); + + tensor->data.f[5] = 0.123f; + + // Changing from kTfLiteArenaRw to kTfLiteDynamic is quite complicate: we need + // to unset data.raw, otherwise Realloc will try to free that memory. + tensor->data.raw = nullptr; + tensor->allocation_type = kTfLiteDynamic; + + ASSERT_EQ(interpreter.ResizeInputTensor(t, {1, 2, 4}), kTfLiteOk); + EXPECT_EQ(tensor->bytes, 8 * sizeof(float)); + ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk); + + // TODO(ahentz): We shouldn't have to force reallocation, but + // ResizeInputTensor doesn't realloc dynamic tensors. Also note that + // TfLiteTensorRealloc(tensor->bytes, tensor) is a no-op. + TfLiteTensorRealloc(9 * sizeof(float), tensor); + tensor->data.f[7] = 0.123f; + + ASSERT_EQ(interpreter.ResizeInputTensor(t, {2, 2, 4}), kTfLiteOk); + EXPECT_EQ(tensor->bytes, 16 * sizeof(float)); + ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk); + + // TODO(ahentz): We shouldn't have to force reallocation, but + // ResizeInputTensor doesn't realloc dynamic tensors. Also note that + // TfLiteTensorRealloc(tensor->bytes, tensor) is a no-op. + TfLiteTensorRealloc(17 * sizeof(float), tensor); + tensor->data.f[15] = 0.123f; +} + TEST(BasicInterpreter, OneOpInterpreter) { Interpreter interpreter; ASSERT_EQ(interpreter.AddTensors(2), kTfLiteOk); @@ -514,13 +560,138 @@ TEST(BasicInterpreter, TestCustomErrorReporter) { ASSERT_EQ(reporter.calls, 1); } +// Test fixture that allows playing with execution plans. It creates a two +// node graph that can be executed in either [0,1] order or [1,0] order. +// The CopyOp records when it is invoked in the class member run_order_ +// so we can test whether the execution plan was honored. +class TestExecutionPlan : public ::testing::Test { + // Encapsulates the node ids and provides them to a C primitive data type + // Allocatable with placement new, but never destructed, so make sure this + // doesn't own any heap allocated data. This is then is used as op local + // data to allow access to the test fixture data. 
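The ResizingTensors test above spells out the hand-off from arena allocation to kTfLiteDynamic. Folded into one helper as a sketch only; the helper is hypothetical and mirrors the test's TODO caveats rather than resolving them:

#include <vector>

#include "tensorflow/contrib/lite/interpreter.h"

// Hypothetical helper, not in the patch: switch a tensor to kTfLiteDynamic and
// grow it, following the exact sequence used in the test above.
TfLiteStatus MakeDynamicAndResize(tflite::Interpreter* interpreter,
                                  int tensor_index,
                                  const std::vector<int>& new_dims,
                                  size_t new_bytes) {
  TfLiteTensor* tensor = interpreter->tensor(tensor_index);
  // Unset data.raw first; otherwise TfLiteTensorRealloc would try to free
  // memory still owned by the arena.
  tensor->data.raw = nullptr;
  tensor->allocation_type = kTfLiteDynamic;
  TfLiteStatus status = interpreter->ResizeInputTensor(tensor_index, new_dims);
  if (status != kTfLiteOk) return status;
  status = interpreter->AllocateTensors();
  if (status != kTfLiteOk) return status;
  // ResizeInputTensor does not realloc dynamic tensors, so force it here;
  // asking for exactly tensor->bytes would be a no-op.
  TfLiteTensorRealloc(new_bytes, tensor);
  return kTfLiteOk;
}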
+ class CallReporting { + public: + CallReporting(int node_id, std::vector* run_order) + : node_id_(node_id), run_order_(run_order) {} + + void Record() { run_order_->push_back(node_id_); } + + private: + // The node id for this particular node + int node_id_; + // A pointer to the global run-order + std::vector* run_order_; + }; + + // Build a kernel registration for an op that copies its one input + // to an output + TfLiteRegistration CopyOpRegistration() { + TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr}; + + reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { + // Set output size to input size + TfLiteTensor* tensor0 = &context->tensors[node->inputs->data[0]]; + TfLiteTensor* tensor1 = &context->tensors[node->outputs->data[0]]; + TfLiteIntArray* newSize = TfLiteIntArrayCopy(tensor0->dims); + return context->ResizeTensor(context, tensor1, newSize); + }; + + reg.invoke = [](TfLiteContext* context, TfLiteNode* node) { + CallReporting* call_reporting = + reinterpret_cast(node->builtin_data); + // Copy input data to output data. + TfLiteTensor* a0 = &context->tensors[node->inputs->data[0]]; + TfLiteTensor* a1 = &context->tensors[node->outputs->data[0]]; + int num = a0->dims->data[0]; + for (int i = 0; i < num; i++) { + a1->data.f[i] = a0->data.f[i]; + } + call_reporting->Record(); + return kTfLiteOk; + }; + return reg; + } + + // Adds a copy node going from tensor `input` to output tensor `output`. + // Note, input is used as the node_id. Inject run_order as op accessible + // data. Note: this is a little strange of a way to do this, but it is + // using op functionality to avoid static global variables. + void MakeCopyNode(int input, int output) { + // Ownership of call_reporting is taken by interpreter (malloc is used due + // to nodes being a C99 interface so free() is used). + TfLiteRegistration copy_op = CopyOpRegistration(); + CallReporting* call_reporting_1 = + reinterpret_cast(malloc(sizeof(CallReporting))); + new (call_reporting_1) CallReporting(input, &run_order_); + ASSERT_EQ(interpreter_.AddNodeWithParameters( + {0}, {2}, nullptr, 0, + reinterpret_cast(call_reporting_1), ©_op), + kTfLiteOk); + ASSERT_EQ(interpreter_.ResizeInputTensor(input, {3}), kTfLiteOk); + } + + void SetUp() final { + // Add two inputs and two outputs that don't depend on each other + ASSERT_EQ(interpreter_.AddTensors(4), kTfLiteOk); + interpreter_.SetInputs({0, 1}); + interpreter_.SetOutputs({2, 3}); + TfLiteQuantizationParams quantized; + for (int tensor_index = 0; tensor_index < 4; tensor_index++) { + ASSERT_EQ(interpreter_.SetTensorParametersReadWrite( + tensor_index, kTfLiteFloat32, "", {3}, quantized), + kTfLiteOk); + } + + // Define two copy functions that also use the user_data to report that + // they were called. + // i.e. tensor[2] = copy(tensor[0]); tensor[3] = copy(tensor[1]); + // thus we can reorder the two nodes arbitrary and still satisfy dependency + // order. 
+ MakeCopyNode(0, 2); + MakeCopyNode(1, 3); + + ASSERT_EQ(interpreter_.AllocateTensors(), kTfLiteOk); + } + + protected: + Interpreter interpreter_; + + // list of node_ids that were run + std::vector run_order_; +}; + +TEST_F(TestExecutionPlan, DefaultExecutionPlan) { + // Check default order + ASSERT_EQ(interpreter_.Invoke(), kTfLiteOk); + ASSERT_EQ(run_order_, std::vector({0, 1})); +} + +TEST_F(TestExecutionPlan, ReversedExecutionPlan) { + // Check reversed order + interpreter_.SetExecutionPlan({1, 0}); + ASSERT_EQ(interpreter_.Invoke(), kTfLiteOk); + ASSERT_EQ(run_order_, std::vector({1, 0})); +} + +TEST_F(TestExecutionPlan, SubsetExecutionPlan) { + // Check running only node index 1 + interpreter_.SetExecutionPlan({1}); + ASSERT_EQ(interpreter_.Invoke(), kTfLiteOk); + ASSERT_EQ(run_order_, std::vector({1})); +} + +TEST_F(TestExecutionPlan, NullExecutionPlan) { + // Check nothing executed. + interpreter_.SetExecutionPlan({}); + ASSERT_EQ(interpreter_.Invoke(), kTfLiteOk); + ASSERT_EQ(run_order_, std::vector()); +} + } // namespace } // namespace tflite int main(int argc, char** argv) { -#ifdef OS_LINUX - FLAGS_logtostderr = true; -#endif + ::tflite::LogToStderr(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD index 4195e7553c4..a8ef0daede4 100644 --- a/tensorflow/contrib/lite/kernels/BUILD +++ b/tensorflow/contrib/lite/kernels/BUILD @@ -71,6 +71,32 @@ cc_library( ], ) +cc_library( + name = "kernel_util", + srcs = [ + "kernel_util.cc", + ], + hdrs = [ + "kernel_util.h", + ], + deps = [ + "//tensorflow/contrib/lite:builtin_op_data", + "//tensorflow/contrib/lite:context", + "//tensorflow/contrib/lite/kernels/internal:round", + ], +) + +tf_cc_test( + name = "kernel_util_test", + size = "small", + srcs = ["kernel_util_test.cc"], + deps = [ + ":kernel_util", + "//tensorflow/contrib/lite/testing:util", + "@com_google_googletest//:gtest", + ], +) + cc_library( name = "builtin_ops", srcs = [ @@ -78,6 +104,7 @@ cc_library( "add.cc", "basic_rnn.cc", "batch_to_space_nd.cc", + "bidirectional_sequence_rnn.cc", "concatenation.cc", "conv.cc", "depthwise_conv.cc", @@ -87,7 +114,6 @@ cc_library( "fully_connected.cc", "gather.cc", "hashtable_lookup.cc", - "kernel_util.cc", "l2norm.cc", "local_response_norm.cc", "lsh_projection.cc", @@ -111,7 +137,6 @@ cc_library( "unidirectional_sequence_rnn.cc", ], hdrs = [ - "kernel_util.h", "padding.h", "register.h", ], @@ -125,11 +150,13 @@ cc_library( }), deps = [ ":activation_functor", + ":kernel_util", ":op_macros", "//tensorflow/contrib/lite:builtin_op_data", "//tensorflow/contrib/lite:framework", "//tensorflow/contrib/lite:string_util", "//tensorflow/contrib/lite/kernels:gemm_support", + "//tensorflow/contrib/lite/kernels/internal:kernel_utils", "//tensorflow/contrib/lite/kernels/internal:optimized", "//tensorflow/contrib/lite/kernels/internal:optimized_base", "//tensorflow/contrib/lite/kernels/internal:quantization_util", @@ -223,6 +250,7 @@ tf_cc_test( ":builtin_ops", "//tensorflow/contrib/lite:framework", "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_absl//absl/memory", "@com_google_googletest//:gtest", ], ) @@ -263,6 +291,18 @@ tf_cc_test( ], ) +tf_cc_test( + name = "bidirectional_sequence_rnn_test", + size = "small", + srcs = ["bidirectional_sequence_rnn_test.cc"], + deps = [ + ":builtin_ops", + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_googletest//:gtest", + ], +) + 
tf_cc_test( name = "unidirectional_sequence_rnn_test", size = "small", diff --git a/tensorflow/contrib/lite/kernels/activations.cc b/tensorflow/contrib/lite/kernels/activations.cc index 8ac93bc8c8d..3c5c77815d0 100644 --- a/tensorflow/contrib/lite/kernels/activations.cc +++ b/tensorflow/contrib/lite/kernels/activations.cc @@ -15,8 +15,8 @@ limitations under the License. #include #include #include -#include #include +#include #include #include @@ -134,8 +134,7 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) { float* out = output->data.f; for (; in < in_end; in++, out++) *out = std::max(0.f, *in); return kTfLiteOk; - } - break; + } break; default: context->ReportError(context, "Only float32 supported currently."); return kTfLiteError; @@ -173,8 +172,7 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) { float* out = output->data.f; for (; in < in_end; in++, out++) *out = std::min(std::max(0.f, *in), 6.f); return kTfLiteOk; - } - break; + } break; default: context->ReportError(context, "Only float32 supported currently."); return kTfLiteError; @@ -192,8 +190,7 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) { float* out = output->data.f; for (; in < in_end; in++, out++) *out = std::tanh(*in); return kTfLiteOk; - } - break; + } break; default: context->ReportError(context, "Only float32 supported currently."); return kTfLiteError; diff --git a/tensorflow/contrib/lite/kernels/add.cc b/tensorflow/contrib/lite/kernels/add.cc index 0e10a249aba..63ea89df56b 100644 --- a/tensorflow/contrib/lite/kernels/add.cc +++ b/tensorflow/contrib/lite/kernels/add.cc @@ -37,7 +37,23 @@ constexpr int kInputTensor1 = 0; constexpr int kInputTensor2 = 1; constexpr int kOutputTensor = 0; +struct OpData { + bool requires_broadcast; +}; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + auto* data = new OpData; + data->requires_broadcast = false; + return data; +} + +void Free(TfLiteContext* context, void* buffer) { + delete reinterpret_cast(buffer); +} + TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + OpData* data = reinterpret_cast(node->user_data); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); @@ -45,43 +61,56 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE_EQ(context, NumDimensions(input1), NumDimensions(input2)); - for (int i = 0; i < NumDimensions(input1); ++i) { - TF_LITE_ENSURE_EQ(context, SizeOfDimension(input1, i), - SizeOfDimension(input2, i)); + TF_LITE_ENSURE_EQ(context, input1->type, input2->type); + output->type = input2->type; + + data->requires_broadcast = !HaveSameShapes(input1, input2); + + TfLiteIntArray* output_size = nullptr; + if (data->requires_broadcast) { + TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast( + context, input1, input2, &output_size)); + } else { + output_size = TfLiteIntArrayCopy(input1->dims); } - TF_LITE_ENSURE_EQ(context, input1->type, output->type); - TF_LITE_ENSURE_EQ(context, input2->type, output->type); - - TfLiteIntArray* output_size = TfLiteIntArrayCopy(input1->dims); return context->ResizeTensor(context, output, output_size); } template void EvalAddFloat(TfLiteContext* context, TfLiteNode* node, - TfLiteAddParams* params, TfLiteTensor* input1, - TfLiteTensor* input2, TfLiteTensor* output) { + TfLiteAddParams* params, const OpData* data, + TfLiteTensor* 
input1, TfLiteTensor* input2, + TfLiteTensor* output) { float output_activation_min, output_activation_max; CalculateActivationRangeFloat(params->activation, &output_activation_min, &output_activation_max); -#define TF_LITE_ADD(type) \ - type::Add(GetTensorData(input1), GetTensorDims(input1), \ - GetTensorData(input2), GetTensorDims(input2), \ - output_activation_min, output_activation_max, \ - GetTensorData(output), GetTensorDims(output)) - if (kernel_type == kReference) { - TF_LITE_ADD(reference_ops); +#define TF_LITE_ADD(type, opname) \ + type::opname(GetTensorData(input1), GetTensorDims(input1), \ + GetTensorData(input2), GetTensorDims(input2), \ + output_activation_min, output_activation_max, \ + GetTensorData(output), GetTensorDims(output)) + if (kernel_type == kReference) { + if (data->requires_broadcast) { + TF_LITE_ADD(reference_ops, BroadcastAdd); } else { - TF_LITE_ADD(optimized_ops); + TF_LITE_ADD(reference_ops, Add); + } + } else { + if (data->requires_broadcast) { + TF_LITE_ADD(optimized_ops, BroadcastAdd); + } else { + TF_LITE_ADD(optimized_ops, Add); + } } #undef TF_LITE_ADD } template void EvalAddQuantized(TfLiteContext* context, TfLiteNode* node, - TfLiteAddParams* params, TfLiteTensor* input1, - TfLiteTensor* input2, TfLiteTensor* output) { + TfLiteAddParams* params, const OpData* data, + TfLiteTensor* input1, TfLiteTensor* input2, + TfLiteTensor* output) { auto input1_offset = -input1->params.zero_point; auto input2_offset = -input2->params.zero_point; auto output_offset = output->params.zero_point; @@ -112,19 +141,20 @@ void EvalAddQuantized(TfLiteContext* context, TfLiteNode* node, CalculateActivationRangeUint8(params->activation, output, &output_activation_min, &output_activation_max); -#define TF_LITE_ADD(type) \ - type::BroadcastAdd( \ - left_shift, GetTensorData(input1), GetTensorDims(input1), \ - input1_offset, input1_multiplier, input1_shift, \ - GetTensorData(input2), GetTensorDims(input2), input2_offset, \ - input2_multiplier, input2_shift, output_offset, output_multiplier, \ - output_shift, output_activation_min, output_activation_max, \ - GetTensorData(output), GetTensorDims(output)); - +#define TF_LITE_ADD(type, opname) \ + type::opname(left_shift, GetTensorData(input1), \ + GetTensorDims(input1), input1_offset, input1_multiplier, \ + input1_shift, GetTensorData(input2), \ + GetTensorDims(input2), input2_offset, input2_multiplier, \ + input2_shift, output_offset, output_multiplier, output_shift, \ + output_activation_min, output_activation_max, \ + GetTensorData(output), GetTensorDims(output)); + // The quantized version of Add doesn't support activations, so we + // always use BroadcastAdd. 
if (kernel_type == kReference) { - TF_LITE_ADD(reference_ops); + TF_LITE_ADD(reference_ops, BroadcastAdd); } else { - TF_LITE_ADD(optimized_ops); + TF_LITE_ADD(optimized_ops, BroadcastAdd); } #undef TF_LITE_ADD } @@ -132,15 +162,17 @@ void EvalAddQuantized(TfLiteContext* context, TfLiteNode* node, template TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { auto* params = reinterpret_cast(node->builtin_data); + OpData* data = reinterpret_cast(node->user_data); TfLiteTensor* input1 = GetInput(context, node, kInputTensor1); TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); if (output->type == kTfLiteFloat32) { - EvalAddFloat(context, node, params, input1, input2, output); + EvalAddFloat(context, node, params, data, input1, input2, + output); } else if (output->type == kTfLiteUInt8) { - EvalAddQuantized(context, node, params, input1, input2, + EvalAddQuantized(context, node, params, data, input1, input2, output); } else { context->ReportError(context, @@ -154,19 +186,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace add TfLiteRegistration* Register_ADD_REF() { - static TfLiteRegistration r = {nullptr, nullptr, add::Prepare, + static TfLiteRegistration r = {add::Init, add::Free, add::Prepare, add::Eval}; return &r; } TfLiteRegistration* Register_ADD_GENERIC_OPT() { - static TfLiteRegistration r = {nullptr, nullptr, add::Prepare, + static TfLiteRegistration r = {add::Init, add::Free, add::Prepare, add::Eval}; return &r; } TfLiteRegistration* Register_ADD_NEON_OPT() { - static TfLiteRegistration r = {nullptr, nullptr, add::Prepare, + static TfLiteRegistration r = {add::Init, add::Free, add::Prepare, add::Eval}; return &r; } diff --git a/tensorflow/contrib/lite/kernels/add_test.cc b/tensorflow/contrib/lite/kernels/add_test.cc index 306dfc3e803..956d05bed51 100644 --- a/tensorflow/contrib/lite/kernels/add_test.cc +++ b/tensorflow/contrib/lite/kernels/add_test.cc @@ -25,10 +25,11 @@ using ::testing::ElementsAreArray; class BaseAddOpModel : public SingleOpModel { public: - BaseAddOpModel(const TensorData& input, const TensorData& output, + BaseAddOpModel(const TensorData& input1, const TensorData& input2, + const TensorData& output, ActivationFunctionType activation_type) { - input1_ = AddInput(input); - input2_ = AddInput(input); + input1_ = AddInput(input1); + input2_ = AddInput(input2); output_ = AddOutput(output); SetBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions, CreateAddOptions(builder_, activation_type).Union()); @@ -70,6 +71,7 @@ float GetTolerance(int min, int max) { TEST(FloatAddOpModel, NoActivation) { FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, + {TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE); m.PopulateTensor(m.input1(), {-2.0, 0.2, 0.7, 0.8}); m.PopulateTensor(m.input2(), {0.1, 0.2, 0.3, 0.5}); @@ -78,9 +80,9 @@ TEST(FloatAddOpModel, NoActivation) { } TEST(FloatAddOpModel, ActivationRELU_N1_TO_1) { - FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, - {TensorType_FLOAT32, {}}, - ActivationFunctionType_RELU_N1_TO_1); + FloatAddOpModel m( + {TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}}, + {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU_N1_TO_1); m.PopulateTensor(m.input1(), {-2.0, 0.2, 0.7, 0.8}); m.PopulateTensor(m.input2(), {0.1, 0.2, 0.3, 0.5}); m.Invoke(); @@ -92,6 +94,7 @@ TEST(FloatAddOpModel, VariousInputShapes) { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; for (int i = 
0; i < test_shapes.size(); ++i) { FloatAddOpModel m({TensorType_FLOAT32, test_shapes[i]}, + {TensorType_FLOAT32, test_shapes[i]}, {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE); m.PopulateTensor(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0}); m.PopulateTensor(m.input2(), {0.1, 0.2, 0.3, 0.5, 1.1, 0.1}); @@ -102,6 +105,23 @@ TEST(FloatAddOpModel, VariousInputShapes) { } } +TEST(FloatAddOpModel, WithBroadcast) { + std::vector> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + FloatAddOpModel m({TensorType_FLOAT32, test_shapes[i]}, + {TensorType_FLOAT32, {}}, // always a scalar + {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE); + m.PopulateTensor(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0}); + m.PopulateTensor(m.input2(), {0.1}); + m.Invoke(); + EXPECT_THAT( + m.GetOutput(), + ElementsAreArray(ArrayFloatNear({-1.9, 0.3, 0.8, 0.9, 1.2, 2.1}))) + << "With shape number " << i; + } +} + TEST(QuantizedAddOpModel, QuantizedTestsNoActivation) { float kQuantizedTolerance = GetTolerance(-1.0, 1.0); std::vector> inputs1 = { @@ -112,6 +132,7 @@ TEST(QuantizedAddOpModel, QuantizedTestsNoActivation) { {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}}; for (int i = 0; i < inputs1.size(); ++i) { QuantizedAddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, + {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {TensorType_UINT8, {}, -1.0, 1.0}, ActivationFunctionType_NONE); m.QuantizeAndPopulate(m.input1(), inputs1[i]); @@ -133,6 +154,7 @@ TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU_N1_TO_1) { {-0.2, 0.6, -0.1, 0.8}}; for (int i = 0; i < inputs1.size(); ++i) { QuantizedAddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, + {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {TensorType_UINT8, {}, -1.0, 1.0}, ActivationFunctionType_RELU_N1_TO_1); m.QuantizeAndPopulate(m.input1(), inputs1[i]); @@ -150,6 +172,7 @@ TEST(QuantizedAddOpModel, QuantizedVariousInputShapes) { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; for (int i = 0; i < test_shapes.size(); ++i) { QuantizedAddOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0}, + {TensorType_UINT8, test_shapes[i], -3.0, 3.0}, {TensorType_UINT8, {}, -3.0, 3.0}, ActivationFunctionType_NONE); m.QuantizeAndPopulate(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0}); @@ -162,6 +185,25 @@ TEST(QuantizedAddOpModel, QuantizedVariousInputShapes) { } } +TEST(QuantizedAddOpModel, QuantizedWithBroadcast) { + float kQuantizedTolerance = GetTolerance(-3.0, 3.0); + std::vector> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + QuantizedAddOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0}, + {TensorType_UINT8, {}, -3.0, 3.0}, + {TensorType_UINT8, {}, -3.0, 3.0}, + ActivationFunctionType_NONE); + m.QuantizeAndPopulate(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0}); + m.QuantizeAndPopulate(m.input2(), {0.1}); + m.Invoke(); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({-1.9, 0.3, 0.8, 0.9, 1.2, 2.1}, + kQuantizedTolerance))) + << "With shape number " << i; + } +} + } // namespace } // namespace tflite int main(int argc, char** argv) { diff --git a/tensorflow/contrib/lite/kernels/basic_rnn.cc b/tensorflow/contrib/lite/kernels/basic_rnn.cc index 3cee43c68b2..2c5074eca31 100644 --- a/tensorflow/contrib/lite/kernels/basic_rnn.cc +++ b/tensorflow/contrib/lite/kernels/basic_rnn.cc @@ -15,14 +15,15 @@ limitations under the License. 
#include #include #include -#include #include +#include #include #include #include "tensorflow/contrib/lite/builtin_op_data.h" #include "tensorflow/contrib/lite/context.h" #include "tensorflow/contrib/lite/kernels/activation_functor.h" +#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" namespace tflite { @@ -76,8 +77,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(2); output_size_array->data[0] = batch_size; output_size_array->data[1] = num_units; - TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, output, - output_size_array)); + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, output, output_size_array)); return kTfLiteOk; } @@ -101,50 +102,20 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const int batch_size = input->dims->data[0]; const int num_units = input_weights->dims->data[0]; const int input_size = input->dims->data[1]; - const int input_weights_stride = input_weights->dims->data[1]; - const int recurrent_weights_stride = recurrent_weights->dims->data[1]; - // For each batch - for (int b = 0; b < batch_size; b++) { - // Initialize the pointer to input, output and bias. - const float* input_ptr_batch = input->data.f + b * input_size; - float* output_ptr_batch = output->data.f + b * num_units; - float* hidden_state_ptr_batch = hidden_state->data.f + b * num_units; - - // Initialize input_weights and recurrent_weights. - const float* input_weights_ptr = input_weights->data.f; - const float* recurrent_weights_ptr = recurrent_weights->data.f; - - // Output = bias - for (int o = 0; o < num_units; o++) { - output_ptr_batch[o] = bias_ptr[o]; - } - - // Output += input * input_weights - for (int o = 0; o < num_units; o++) { - for (int i = 0; i < input_size; i++) { - output_ptr_batch[o] += input_ptr_batch[i] * input_weights_ptr[i]; - } - input_weights_ptr += input_weights_stride; - } - - // Output += recurrent_weights * hidden_state - for (int o = 0; o < num_units; o++) { - for (int h = 0; h < num_units; h++) { - output_ptr_batch[o] += - hidden_state_ptr_batch[h] * recurrent_weights_ptr[h]; - } - recurrent_weights_ptr += recurrent_weights_stride; - } - - // Output = activation(Output) and update hidden_state - for (int o = 0; o < num_units; o++) { - output_ptr_batch[o] = - (ActivationFunctor(params->activation))(output_ptr_batch[o]); - hidden_state_ptr_batch[o] = output_ptr_batch[o]; - } - } + // Initialize the pointer to hidden state. + float* hidden_state_ptr_batch = hidden_state->data.f; + // Initialize the pointer to input and output. + const float* input_ptr_batch = input->data.f; + float* output_ptr_batch = output->data.f; + // Initialize input_weights and recurrent_weights. + const float* input_weights_ptr = input_weights->data.f; + const float* recurrent_weights_ptr = recurrent_weights->data.f; + kernel_utils::RnnBatchStep(input_ptr_batch, input_weights_ptr, + recurrent_weights_ptr, bias_ptr, input_size, + num_units, batch_size, params->activation, + hidden_state_ptr_batch, output_ptr_batch); return kTfLiteOk; } diff --git a/tensorflow/contrib/lite/kernels/basic_rnn_test.cc b/tensorflow/contrib/lite/kernels/basic_rnn_test.cc index 5ecccb985e9..fa7ef525db4 100644 --- a/tensorflow/contrib/lite/kernels/basic_rnn_test.cc +++ b/tensorflow/contrib/lite/kernels/basic_rnn_test.cc @@ -14,8 +14,8 @@ limitations under the License. 
==============================================================================*/ // Unit test for TFLite RNN op. -#include #include +#include #include #include @@ -120,8 +120,7 @@ static float rnn_golden_output[] = { 0.415153, 0.210318, 0, 0, 0, 0, 0, 2.02616, 0, 0.728256, 0.84183, 0.0907453, - 0.628881, 3.58099, 1.49974, 0 -}; + 0.628881, 3.58099, 1.49974, 0}; class RNNOpModel : public SingleOpModel { public: diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc index 0eed680fdcc..889239f9321 100644 --- a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc +++ b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc @@ -35,12 +35,14 @@ enum KernelType { struct BatchToSpaceNDContext { BatchToSpaceNDContext(TfLiteContext* context, TfLiteNode* node) { - params = reinterpret_cast(node->builtin_data); input = GetInput(context, node, 0); + block_shape = GetInput(context, node, 1); + crops = GetInput(context, node, 2); output = GetOutput(context, node, 0); } - TfLiteBatchToSpaceNDParams* params; TfLiteTensor* input; + TfLiteTensor* block_shape; + TfLiteTensor* crops; TfLiteTensor* output; }; @@ -48,23 +50,28 @@ struct BatchToSpaceNDContext { // The 4D array need to have exactly 2 spatial dimensions. // TODO(ycling): Support arbitrary dimension in BatchToSpaceND. const int kInputDimensionNum = 4; -const int kOutputDimensionNum = 4; +const int kBlockSizeDimensionNum = 1; const int kSpatialDimensionNum = 2; -TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - // The 2nd tensor (block_shape) and the 3rd tensor (crops) are ignored now. - TF_LITE_ENSURE(context, NumInputs(node) >= 1 && NumInputs(node) <= 3); - TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); +TfLiteStatus ResizeOutputTensor(TfLiteContext* context, + BatchToSpaceNDContext* op_context) { + TfLiteIntArray* input_size = op_context->input->dims; + const int* block_shape = GetTensorData(op_context->block_shape); + const int* crops = GetTensorData(op_context->crops); - BatchToSpaceNDContext op_context(context, node); - TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.input), - kInputDimensionNum); - TF_LITE_ENSURE_EQ(context, op_context.params->num_spatial_dimensions, + TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->block_shape), + kBlockSizeDimensionNum); + TF_LITE_ENSURE_EQ(context, op_context->block_shape->dims->data[0], + kSpatialDimensionNum); + TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->crops), kSpatialDimensionNum); - TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type); - const TfLiteIntArray* input_size = op_context.input->dims; - const int* block_shape = op_context.params->block_shape; + // TODO(ycling): Add crops as part of calculation. Remove check for a crops + // containing all zeroes. + TF_LITE_ENSURE_EQ(context, crops[0], 0); + TF_LITE_ENSURE_EQ(context, crops[1], 0); + TF_LITE_ENSURE_EQ(context, crops[2], 0); + TF_LITE_ENSURE_EQ(context, crops[3], 0); // Number of batch must be multiple of (block_shape[0] * block_shape[1]). 
TF_LITE_ENSURE_EQ(context, @@ -76,27 +83,48 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const int output_width = input_size->data[2] * block_shape[1]; const int output_channel_size = input_size->data[3]; - TfLiteIntArray* output_size = TfLiteIntArrayCreate(kOutputDimensionNum); + TfLiteIntArray* output_size = TfLiteIntArrayCopy(input_size); output_size->data[0] = output_batch_size; output_size->data[1] = output_height; output_size->data[2] = output_width; output_size->data[3] = output_channel_size; - return context->ResizeTensor(context, op_context.output, output_size); + return context->ResizeTensor(context, op_context->output, output_size); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 3); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + BatchToSpaceNDContext op_context(context, node); + TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.input), + kInputDimensionNum); + TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type); + + if (!IsConstantTensor(op_context.block_shape) || + !IsConstantTensor(op_context.crops)) { + SetTensorToDynamic(op_context.output); + return kTfLiteOk; + } + return ResizeOutputTensor(context, &op_context); } template TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { BatchToSpaceNDContext op_context(context, node); - int block_shape_dims_array[1] = {kSpatialDimensionNum}; - Dims<4> block_shape_dims = GetTensorDims(block_shape_dims_array, 1); + // Resize the output tensor if the output tensor is dynamic. + if (IsDynamicTensor(op_context.output)) { + TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context)); + TfLiteTensorRealloc(op_context.output->bytes, op_context.output); + } -#define TF_LITE_BATCH_TO_SPACE_ND(type, scalar) \ - type::BatchToSpaceND(GetTensorData(op_context.input), \ - GetTensorDims(op_context.input), \ - op_context.params->block_shape, block_shape_dims, \ - GetTensorData(op_context.output), \ +#define TF_LITE_BATCH_TO_SPACE_ND(type, scalar) \ + type::BatchToSpaceND(GetTensorData(op_context.input), \ + GetTensorDims(op_context.input), \ + GetTensorData(op_context.block_shape), \ + GetTensorDims(op_context.block_shape), \ + GetTensorData(op_context.output), \ GetTensorDims(op_context.output)) switch (op_context.input->type) { // Already know in/out types are same. 
case kTfLiteFloat32: diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc b/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc index 3ec4efbebce..8485cde1b40 100644 --- a/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc +++ b/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc @@ -26,36 +26,76 @@ using ::testing::ElementsAreArray; class BatchToSpaceNDOpModel : public SingleOpModel { public: - BatchToSpaceNDOpModel(std::initializer_list input_shape, - std::initializer_list block_shape, - std::initializer_list before_crops, - std::initializer_list after_crops) { - input_ = AddInput(TensorType_FLOAT32); - output_ = AddOutput(TensorType_FLOAT32); - SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND, - BuiltinOptions_BatchToSpaceNDOptions, - CreateBatchToSpaceNDOptions( - builder_, builder_.CreateVector(block_shape), - builder_.CreateVector(before_crops), - builder_.CreateVector(after_crops)) - .Union()); - BuildInterpreter({input_shape}); - } - void SetInput(std::initializer_list data) { PopulateTensor(input_, data); } + void SetBlockShape(std::initializer_list data) { + PopulateTensor(block_shape_, data); + } + + void SetCrops(std::initializer_list data) { + PopulateTensor(crops_, data); + } + std::vector GetOutput() { return ExtractVector(output_); } std::vector GetOutputShape() { return GetTensorShape(output_); } - private: + protected: int input_; + int block_shape_; + int crops_; int output_; }; -TEST(BatchToSpaceNDOpTest, SimpleTest) { - BatchToSpaceNDOpModel m({4, 2, 2, 1}, {2, 2}, {0, 0}, {0, 0}); +// Tests case where block_shape and crops are const tensors. +// +// Example usage is as follows: +// BatchToSpaceNDOpConstModel m(input_shape, block_shape, crops); +// m.SetInput(input_data); +// m.Invoke(); +class BatchToSpaceNDOpConstModel : public BatchToSpaceNDOpModel { + public: + BatchToSpaceNDOpConstModel(std::initializer_list input_shape, + std::initializer_list block_shape, + std::initializer_list crops) { + input_ = AddInput(TensorType_FLOAT32); + block_shape_ = AddConstInput(TensorType_INT32, block_shape, {2}); + crops_ = AddConstInput(TensorType_INT32, crops, {2, 2}); + output_ = AddOutput(TensorType_FLOAT32); + + SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND, + BuiltinOptions_BatchToSpaceNDOptions, + CreateBatchToSpaceNDOptions(builder_).Union()); + BuildInterpreter({input_shape}); + } +}; + +// Tests case where block_shape and crops are non-const tensors. 
+// +// Example usage is as follows: +// BatchToSpaceNDOpDynamicModel m(input_shape); +// m.SetInput(input_data); +// m.SetBlockShape(block_shape); +// m.SetPaddings(crops); +// m.Invoke(); +class BatchToSpaceNDOpDynamicModel : public BatchToSpaceNDOpModel { + public: + BatchToSpaceNDOpDynamicModel(std::initializer_list input_shape) { + input_ = AddInput(TensorType_FLOAT32); + block_shape_ = AddInput(TensorType_INT32); + crops_ = AddInput(TensorType_INT32); + output_ = AddOutput(TensorType_FLOAT32); + + SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND, + BuiltinOptions_BatchToSpaceNDOptions, + CreateBatchToSpaceNDOptions(builder_).Union()); + BuildInterpreter({input_shape, {2}, {2, 2}}); + } +}; + +TEST(BatchToSpaceNDOpTest, SimpleConstTest) { + BatchToSpaceNDOpConstModel m({4, 2, 2, 1}, {2, 2}, {0, 0, 0, 0}); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1})); @@ -63,11 +103,35 @@ TEST(BatchToSpaceNDOpTest, SimpleTest) { 4, 8, 11, 15, 12, 16})); } +TEST(BatchToSpaceNDOpTest, SimpleDynamicTest) { + BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1}); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + m.SetBlockShape({2, 2}); + m.SetCrops({0, 0, 0, 0}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5, 2, 6, 9, 13, 10, 14, 3, 7, + 4, 8, 11, 15, 12, 16})); +} + TEST(BatchToSpaceNDOpTest, InvalidShapeTest) { - EXPECT_DEATH(BatchToSpaceNDOpModel({3, 2, 2, 1}, {2, 2}, {0, 0}, {0, 0}), + EXPECT_DEATH(BatchToSpaceNDOpConstModel({3, 2, 2, 1}, {2, 2}, {0, 0, 0, 0}), "Cannot allocate tensors"); } +TEST(BatchToSpaceNDOpTest, InvalidCropsConstTest) { + EXPECT_DEATH(BatchToSpaceNDOpConstModel({3, 2, 2, 1}, {2, 2}, {0, 0, 0, 1}), + "1 != 0"); +} + +TEST(BatchToSpaceNDOpTest, InvalidCropsDynamicTest) { + BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1}); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + m.SetBlockShape({2, 2}); + m.SetCrops({0, 0, 1, 0}); + EXPECT_DEATH(m.Invoke(), "1 != 0"); +} + } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc new file mode 100644 index 00000000000..aa24c1f34cd --- /dev/null +++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc @@ -0,0 +1,205 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include +#include +#include +#include +#include +#include + +#include "tensorflow/contrib/lite/builtin_op_data.h" +#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/kernels/activation_functor.h" +#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h" +#include "tensorflow/contrib/lite/kernels/op_macros.h" + +namespace tflite { +namespace ops { +namespace builtin { +namespace bidirectional_sequence_rnn { + +constexpr int kInputTensor = 0; +// Forward and backward cell tensors. +constexpr int kFwWeightsTensor = 1; +constexpr int kFwRecurrentWeightsTensor = 2; +constexpr int kFwBiasTensor = 3; +constexpr int kBwWeightsTensor = 4; +constexpr int kBwRecurrentWeightsTensor = 5; +constexpr int kBwBiasTensor = 6; +// State and output tensors. +constexpr int kFwHiddenStateTensor = 0; +constexpr int kFwOutputTensor = 1; +constexpr int kBwHiddenStateTensor = 2; +constexpr int kBwOutputTensor = 3; + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + // Check we have all the inputs and outputs we need. + TF_LITE_ENSURE_EQ(context, node->inputs->size, 7); + TF_LITE_ENSURE_EQ(context, node->outputs->size, 4); + + TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]]; + TfLiteTensor* fw_input_weights = + &context->tensors[node->inputs->data[kFwWeightsTensor]]; + TfLiteTensor* fw_recurrent_weights = + &context->tensors[node->inputs->data[kFwRecurrentWeightsTensor]]; + TfLiteTensor* fw_bias = &context->tensors[node->inputs->data[kFwBiasTensor]]; + TfLiteTensor* bw_input_weights = + &context->tensors[node->inputs->data[kBwWeightsTensor]]; + TfLiteTensor* bw_recurrent_weights = + &context->tensors[node->inputs->data[kBwRecurrentWeightsTensor]]; + TfLiteTensor* bw_bias = &context->tensors[node->inputs->data[kBwBiasTensor]]; + + // Check all the parameters of tensor match within themselves and match the + // input configuration. + const int batch_size = input->dims->data[0]; + const int max_time = input->dims->data[1]; + const int fw_num_units = fw_input_weights->dims->data[0]; + const int bw_num_units = bw_input_weights->dims->data[0]; + TF_LITE_ASSERT_EQ(input->dims->data[2], fw_input_weights->dims->data[1]); + TF_LITE_ASSERT_EQ(input->dims->data[2], bw_input_weights->dims->data[1]); + TF_LITE_ASSERT_EQ(fw_input_weights->dims->data[0], fw_bias->dims->data[0]); + TF_LITE_ASSERT_EQ(bw_input_weights->dims->data[0], bw_bias->dims->data[0]); + TF_LITE_ASSERT_EQ(fw_recurrent_weights->dims->data[0], + fw_bias->dims->data[0]); + TF_LITE_ASSERT_EQ(bw_recurrent_weights->dims->data[1], + bw_bias->dims->data[0]); + + TfLiteTensor* fw_output = + &context->tensors[node->outputs->data[kFwOutputTensor]]; + TfLiteTensor* bw_output = + &context->tensors[node->outputs->data[kBwOutputTensor]]; + + // Resize hidden states. 
+ TfLiteIntArray* fw_hidden_state_size_array = TfLiteIntArrayCreate(2); + fw_hidden_state_size_array->data[0] = batch_size; + fw_hidden_state_size_array->data[1] = fw_num_units; + TfLiteTensor* fw_hidden_state = + &context->tensors[node->outputs->data[kFwHiddenStateTensor]]; + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, fw_hidden_state, + fw_hidden_state_size_array)); + + TfLiteIntArray* bw_hidden_state_size_array = TfLiteIntArrayCreate(2); + bw_hidden_state_size_array->data[0] = batch_size; + bw_hidden_state_size_array->data[1] = fw_num_units; + TfLiteTensor* bw_hidden_state = + &context->tensors[node->outputs->data[kBwHiddenStateTensor]]; + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, bw_hidden_state, + bw_hidden_state_size_array)); + + // Mark hidden states as a persistent tensor. + fw_hidden_state->allocation_type = kTfLiteArenaRwPersistent; + bw_hidden_state->allocation_type = kTfLiteArenaRwPersistent; + + // Resize outputs. + TfLiteIntArray* fw_output_size_array = TfLiteIntArrayCreate(3); + fw_output_size_array->data[0] = batch_size; + fw_output_size_array->data[1] = max_time; + fw_output_size_array->data[2] = fw_num_units; + TF_LITE_ENSURE_OK( + context, context->ResizeTensor(context, fw_output, fw_output_size_array)); + TfLiteIntArray* bw_output_size_array = TfLiteIntArrayCreate(3); + bw_output_size_array->data[0] = batch_size; + bw_output_size_array->data[1] = max_time; + bw_output_size_array->data[2] = bw_num_units; + TF_LITE_ENSURE_OK( + context, context->ResizeTensor(context, bw_output, bw_output_size_array)); + + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + auto* params = reinterpret_cast(node->builtin_data); + + TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]]; + TfLiteTensor* fw_input_weights = + &context->tensors[node->inputs->data[kFwWeightsTensor]]; + TfLiteTensor* fw_recurrent_weights = + &context->tensors[node->inputs->data[kFwRecurrentWeightsTensor]]; + TfLiteTensor* fw_bias = &context->tensors[node->inputs->data[kFwBiasTensor]]; + TfLiteTensor* fw_hidden_state = + &context->tensors[node->outputs->data[kFwHiddenStateTensor]]; + TfLiteTensor* fw_output = + &context->tensors[node->outputs->data[kFwOutputTensor]]; + + TfLiteTensor* bw_input_weights = + &context->tensors[node->inputs->data[kBwWeightsTensor]]; + TfLiteTensor* bw_recurrent_weights = + &context->tensors[node->inputs->data[kBwRecurrentWeightsTensor]]; + TfLiteTensor* bw_bias = &context->tensors[node->inputs->data[kBwBiasTensor]]; + TfLiteTensor* bw_hidden_state = + &context->tensors[node->outputs->data[kBwHiddenStateTensor]]; + TfLiteTensor* bw_output = + &context->tensors[node->outputs->data[kBwOutputTensor]]; + + const int batch_size = input->dims->data[0]; + const int max_time = input->dims->data[1]; + const int input_size = input->dims->data[2]; + + const int fw_num_units = fw_input_weights->dims->data[0]; + const float* fw_bias_ptr = fw_bias->data.f; + const float* fw_input_weights_ptr = fw_input_weights->data.f; + const float* fw_recurrent_weights_ptr = fw_recurrent_weights->data.f; + + const int bw_num_units = bw_input_weights->dims->data[0]; + const float* bw_bias_ptr = bw_bias->data.f; + const float* bw_input_weights_ptr = bw_input_weights->data.f; + const float* bw_recurrent_weights_ptr = bw_recurrent_weights->data.f; + + for (int b = 0; b < batch_size; b++) { + // Forward cell. 
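+    // The forward cell walks the sequence from s = 0 to max_time - 1, one
+    // batch entry at a time, carrying fw_hidden_state across steps through
+    // RnnBatchStep; the backward cell below does the same in reverse order.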
+ float* fw_hidden_state_ptr_batch = + fw_hidden_state->data.f + b * fw_num_units; + for (int s = 0; s < max_time; s++) { + const float* input_ptr_batch = + input->data.f + b * input_size * max_time + s * input_size; + float* output_ptr_batch = + fw_output->data.f + b * fw_num_units * max_time + s * fw_num_units; + + kernel_utils::RnnBatchStep( + input_ptr_batch, fw_input_weights_ptr, fw_recurrent_weights_ptr, + fw_bias_ptr, input_size, fw_num_units, /*batch_size=*/1, + params->activation, fw_hidden_state_ptr_batch, output_ptr_batch); + } + // Backward cell. + float* bw_hidden_state_ptr_batch = + bw_hidden_state->data.f + b * bw_num_units; + for (int s = max_time - 1; s >= 0; s--) { + const float* input_ptr_batch = + input->data.f + b * input_size * max_time + s * input_size; + float* output_ptr_batch = + bw_output->data.f + b * bw_num_units * max_time + s * bw_num_units; + + kernel_utils::RnnBatchStep( + input_ptr_batch, bw_input_weights_ptr, bw_recurrent_weights_ptr, + bw_bias_ptr, input_size, bw_num_units, /*batch_size=*/1, + params->activation, bw_hidden_state_ptr_batch, output_ptr_batch); + } + } + return kTfLiteOk; +} + +} // namespace bidirectional_sequence_rnn + +TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_RNN() { + static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr, + bidirectional_sequence_rnn::Prepare, + bidirectional_sequence_rnn::Eval}; + return &r; +} + +} // namespace builtin +} // namespace ops +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc new file mode 100644 index 00000000000..12f4ff97cfd --- /dev/null +++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc @@ -0,0 +1,931 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Unit test for TFLite Bidirectional RNN op. 
+ +#include +#include + +#include +#include +#include "tensorflow/contrib/lite/interpreter.h" +#include "tensorflow/contrib/lite/kernels/register.h" +#include "tensorflow/contrib/lite/kernels/test_util.h" +#include "tensorflow/contrib/lite/model.h" + +namespace tflite { +namespace { + +using ::testing::ElementsAreArray; + +static float rnn_input[] = { + 0.23689353, 0.285385, 0.037029743, -0.19858193, -0.27569133, + 0.43773448, 0.60379338, 0.35562468, -0.69424844, -0.93421471, + -0.87287879, 0.37144363, -0.62476718, 0.23791671, 0.40060222, + 0.1356622, -0.99774903, -0.98858172, -0.38952237, -0.47685933, + 0.31073618, 0.71511042, -0.63767755, -0.31729108, 0.33468103, + 0.75801885, 0.30660987, -0.37354088, 0.77002847, -0.62747043, + -0.68572164, 0.0069220066, 0.65791464, 0.35130811, 0.80834007, + -0.61777675, -0.21095741, 0.41213346, 0.73784804, 0.094794154, + 0.47791874, 0.86496925, -0.53376222, 0.85315156, 0.10288584, + 0.86684, -0.011186242, 0.10513687, 0.87825835, 0.59929144, + 0.62827742, 0.18899453, 0.31440187, 0.99059987, 0.87170351, + -0.35091716, 0.74861872, 0.17831337, 0.2755419, 0.51864719, + 0.55084288, 0.58982027, -0.47443086, 0.20875752, -0.058871567, + -0.66609079, 0.59098077, 0.73017097, 0.74604273, 0.32882881, + -0.17503482, 0.22396147, 0.19379807, 0.29120302, 0.077113032, + -0.70331609, 0.15804303, -0.93407321, 0.40182066, 0.036301374, + 0.66521823, 0.0300982, -0.7747041, -0.02038002, 0.020698071, + -0.90300065, 0.62870288, -0.23068321, 0.27531278, -0.095755219, + -0.712036, -0.17384434, -0.50593495, -0.18646687, -0.96508682, + 0.43519354, 0.14744234, 0.62589407, 0.1653645, -0.10651493, + -0.045277178, 0.99032974, -0.88255352, -0.85147917, 0.28153265, + 0.19455957, -0.55479527, -0.56042433, 0.26048636, 0.84702539, + 0.47587705, -0.074295521, -0.12287641, 0.70117295, 0.90532446, + 0.89782166, 0.79817224, 0.53402734, -0.33286154, 0.073485017, + -0.56172788, -0.044897556, 0.89964068, -0.067662835, 0.76863563, + 0.93455386, -0.6324693, -0.083922029}; + +static float rnn_golden_fw_output[] = { + 0.496726, 0, 0.965996, 0, 0.0584254, 0, + 0, 0.12315, 0, 0, 0.612266, 0.456601, + 0, 0.52286, 1.16099, 0.0291232, + + 0, 0, 0.524901, 0, 0, 0, + 0, 1.02116, 0, 1.35762, 0, 0.356909, + 0.436415, 0.0355727, 0, 0, + + 0, 0, 0, 0.262335, 0, 0, + 0, 1.33992, 0, 2.9739, 0, 0, + 1.31914, 2.66147, 0, 0, + + 0.942568, 0, 0, 0, 0.025507, 0, + 0, 0, 0.321429, 0.569141, 1.25274, 1.57719, + 0.8158, 1.21805, 0.586239, 0.25427, + + 1.04436, 0, 0.630725, 0, 0.133801, 0.210693, + 0.363026, 0, 0.533426, 0, 1.25926, 0.722707, + 0, 1.22031, 1.30117, 0.495867, + + 0.222187, 0, 0.72725, 0, 0.767003, 0, + 0, 0.147835, 0, 0, 0, 0.608758, + 0.469394, 0.00720298, 0.927537, 0, + + 0.856974, 0.424257, 0, 0, 0.937329, 0, + 0, 0, 0.476425, 0, 0.566017, 0.418462, + 0.141911, 0.996214, 1.13063, 0, + + 0.967899, 0, 0, 0, 0.0831304, 0, + 0, 1.00378, 0, 0, 0, 1.44818, + 1.01768, 0.943891, 0.502745, 0, + + 0.940135, 0, 0, 0, 0, 0, + 0, 2.13243, 0, 0.71208, 0.123918, 1.53907, + 1.30225, 1.59644, 0.70222, 0, + + 0.804329, 0, 0.430576, 0, 0.505872, 0.509603, + 0.343448, 0, 0.107756, 0.614544, 1.44549, 1.52311, + 0.0454298, 0.300267, 0.562784, 0.395095, + + 0.228154, 0, 0.675323, 0, 1.70536, 0.766217, + 0, 0, 0, 0.735363, 0.0759267, 1.91017, + 0.941888, 0, 0, 0, + + 0, 0, 1.5909, 0, 0, 0, + 0, 0.5755, 0, 0.184687, 0, 1.56296, + 0.625285, 0, 0, 0, + + 0, 0, 0.0857888, 0, 0, 0, + 0, 0.488383, 0.252786, 0, 0, 0, + 1.02817, 1.85665, 0, 0, + + 0.00981836, 0, 1.06371, 0, 0, 0, + 0, 0, 0, 0.290445, 0.316406, 0, + 0.304161, 1.25079, 
0.0707152, 0, + + 0.986264, 0.309201, 0, 0, 0, 0, + 0, 1.64896, 0.346248, 0, 0.918175, 0.78884, + 0.524981, 1.92076, 2.07013, 0.333244, + + 0.415153, 0.210318, 0, 0, 0, 0, + 0, 2.02616, 0, 0.728256, 0.84183, 0.0907453, + 0.628881, 3.58099, 1.49974, 0}; + +static float rnn_golden_bw_output[] = { + 0.496726, 0, 1.00883, 0, 0.0584256, 0, 0, + 0.236412, 0, 0, 0.612267, 0.487726, 0, 0.54883, + 1.16099, 0.0291233, 0, 0, 0.428302, 0, 0, + 0, 0, 1.13262, 0, 1.64415, 0, 0.311249, + 0.570804, 0.259696, 0, 0, 0, 0, 0, + 0.262334, 0, 0, 0, 1.23781, 0, 2.86532, + 0, 0, 1.34389, 2.76409, 0, 0, 1.03969, + 0, 0.00410865, 0, 0.0470295, 0, 0, 0, + 0.371556, 0.27175, 1.36614, 1.63956, 0.683887, 1.06176, 0.719552, + 0.301314, 0.971195, 0, 0.697143, 0, 0.215219, 0.210693, + 0.363027, 0, 0.501283, 0, 1.13399, 0.623774, 0, + 1.09851, 1.33313, 0.470441, 0.210965, 0, 0.664178, 0, + 0.839686, 0, 0, 0.147834, 0, 0, 0, + 0.58786, 0.490128, 0, 0.905806, 0, 0.932134, 0.424257, + 0, 0, 0.860629, 0, 0, 0, 0.476425, + 0, 0.566017, 0.513721, 0.207341, 1.09508, 1.08385, 0, + 0.973787, 0, 0, 0, 0, 0, 0, + 1.20698, 0, 0, 0, 1.56135, 1.12369, 0.99588, + 0.459803, 0, 0.915854, 0, 0, 0, 0, + 0, 0, 2.03206, 0, 0.773264, 0.267228, 1.55012, + 1.202, 1.51611, 0.701202, 0, 0.725088, 0, 0.509069, + 0, 0.671349, 0.581129, 0.343447, 0, 0.107755, 0.611838, + 1.4331, 1.55871, 0.015242, 0.140624, 0.492562, 0.395095, 0.147722, + 0, 0.784925, 0, 1.65477, 0.715257, 0, 0, + 0, 0.685024, 0, 1.89505, 1.00037, 0, 0, + 0, 0, 0, 1.52659, 0, 0, 0, + 0, 0.618583, 0, 0.11115, 0, 1.37194, 0.630225, + 0, 0, 0, 0, 0, 0.0322124, 0, + 0, 0, 0, 0.430834, 0.252786, 0, 0, + 0, 0.991297, 1.98451, 0, 0, 0.111511, 0, + 1.05513, 0, 0, 0, 0, 0, 0, + 0.290445, 0.412559, 0.0429958, 0.256564, 1.27858, 0.289948, 0, + 1.01693, 0.327141, 0, 0, 0, 0, 0, + 1.83508, 0.346248, 0, 0.961535, 0.790026, 0.552203, 2.13457, + 2.19233, 0.333244, 0.316526, 0.179398, 0, 0, 0, + 0, 0, 1.86126, 0, 0.728256, 0.750013, 0.011861, + 0.576383, 3.38891, 1.29273, 0}; + +constexpr std::initializer_list weights = { + 0.461459, 0.153381, 0.529743, -0.00371218, 0.676267, -0.211346, + 0.317493, 0.969689, -0.343251, 0.186423, 0.398151, 0.152399, + 0.448504, 0.317662, 0.523556, -0.323514, 0.480877, 0.333113, + -0.757714, -0.674487, -0.643585, 0.217766, -0.0251462, 0.79512, + -0.595574, -0.422444, 0.371572, -0.452178, -0.556069, -0.482188, + -0.685456, -0.727851, 0.841829, 0.551535, -0.232336, 0.729158, + -0.00294906, -0.69754, 0.766073, -0.178424, 0.369513, -0.423241, + 0.548547, -0.0152023, -0.757482, -0.85491, 0.251331, -0.989183, + 0.306261, -0.340716, 0.886103, -0.0726757, -0.723523, -0.784303, + 0.0354295, 0.566564, -0.485469, -0.620498, 0.832546, 0.697884, + -0.279115, 0.294415, -0.584313, 0.548772, 0.0648819, 0.968726, + 0.723834, -0.0080452, -0.350386, -0.272803, 0.115121, -0.412644, + -0.824713, -0.992843, -0.592904, -0.417893, 0.863791, -0.423461, + -0.147601, -0.770664, -0.479006, 0.654782, 0.587314, -0.639158, + 0.816969, -0.337228, 0.659878, 0.73107, 0.754768, -0.337042, + 0.0960841, 0.368357, 0.244191, -0.817703, -0.211223, 0.442012, + 0.37225, -0.623598, -0.405423, 0.455101, 0.673656, -0.145345, + -0.511346, -0.901675, -0.81252, -0.127006, 0.809865, -0.721884, + 0.636255, 0.868989, -0.347973, -0.10179, -0.777449, 0.917274, + 0.819286, 0.206218, -0.00785118, 0.167141, 0.45872, 0.972934, + -0.276798, 0.837861, 0.747958, -0.0151566, -0.330057, -0.469077, + 0.277308, 0.415818}; + +static float endtoend_input[] = { + 0.996808, 0.060710, 0.981855, 0.570017, 0.525164, 0.796859, 0.696547, + 
0.505925, 0.991844, 0.461208, 0.949371, 0.027624, 0.539236, 0.841854, + 0.915222, 0.538569, 0.069375, 0.237905, 0.903700, 0.441703, 0.536196, + 0.402724, 0.761635, 0.025063, 0.082592, 0.688245, 0.239310, 0.256931, + 0.658900, 0.105695, 0.301983, 0.655708, 0.166405, 0.283837, 0.225725, + 0.691569, 0.080696, 0.922272, 0.197494, 0.072540, 0.383481, 0.146865, + 0.100163, 0.922717, 0.988720, 0.015386, 0.461286, 0.058095, 0.253290, + 0.364986, 0.499797, 0.789487, 0.767709, 0.261433, 0.814549, 0.850302, + 0.949678, 0.053859, 0.107233, 0.608577, 0.159554, 0.409215, 0.264285, + 0.325960, 0.693053, 0.490011, 0.017529, 0.773749, 0.412283, 0.215023, + 0.846288, 0.795764, 0.361889, 0.946452, 0.718481, 0.350608, 0.961837, + 0.179767, 0.408703, 0.215128, 0.544753, 0.908500, 0.004614, 0.312462, + 0.169933, 0.819163, 0.162764, 0.119611, 0.873022, 0.269997, 0.728188, + 0.032576, 0.679212, 0.992474, 0.358536, 0.372265, 0.482484, 0.376065, + 0.146014, 0.894767, 0.591088, 0.992302, 0.690531, 0.952977, 0.938754, + 0.409012, 0.303585, 0.900591, 0.588780, 0.712287, 0.115719, 0.133533, + 0.620788, 0.120334, 0.445995, 0.790720, 0.939497, 0.608759, 0.910331, + 0.812519, 0.878756, 0.638519, 0.845096, 0.557968, 0.630993, 0.203632, + 0.930233, 0.113477, 0.579697, 0.076247, 0.008244, 0.170785, 0.068549, + 0.698776, 0.123761, 0.007303, 0.107788, 0.427346, 0.907894, 0.696568, + 0.139633, 0.023613, 0.830100, 0.760421, 0.143947, 0.276096, 0.551141, + 0.083444, 0.884855, 0.461472, 0.895963, 0.763611, 0.099992, 0.741059, + 0.321579, 0.730984, 0.944691, 0.251812, 0.844461, 0.524388, 0.328059, + 0.852706, 0.695172, 0.396607, 0.551482, 0.818934, 0.403910, 0.659270, + 0.246280, 0.311804, 0.355838, 0.385913, 0.335418, 0.185938, 0.146334, + 0.479364, 0.462034, 0.697475, 0.562808, 0.346888, 0.158948, 0.458771, + 0.110499, 0.258939, 0.199830, 0.432078, 0.989924, 0.144521, 0.683890, + 0.834385, 0.668908, 0.011949, 0.687091, 0.364081, 0.408556, 0.238572, + 0.183015, 0.812466, 0.897842, 0.429294, 0.124271, 0.253680, 0.815207, + 0.459688, 0.439618, 0.961541, 0.939053, 0.901651, 0.659016, 0.501861, + 0.248539, 0.817964, 0.960632, 0.359038, 0.076903, 0.160462, 0.791117, + 0.066826, 0.304983, 0.475007, 0.901211, 0.973891, 0.486955, 0.588302, + 0.337972, 0.895512, 0.826874, 0.520987, 0.707978, 0.724716, 0.950281, + 0.832249, 0.978396, 0.765488, 0.291937, 0.418014, 0.727029, 0.230990, + 0.319665, 0.386045, 0.732850, 0.568204, 0.204009, 0.693482, 0.927242, + 0.280912, 0.853944, 0.718359, 0.347738, 0.158927, 0.193366, 0.248950, + 0.132818, 0.680321, 0.837252, 0.470790, 0.575833, 0.664126, 0.991777, + 0.283811, 0.388843, 0.942058, 0.116060, 0.367239, 0.707546, 0.407997, + 0.785253, 0.434575, 0.638986, 0.104917, 0.820620, 0.371837, 0.673121, + 0.024629, 0.065319, 0.600363, 0.305541, 0.919263, 0.318722, 0.653279, + 0.078190, 0.512088, 0.902229, 0.211009, 0.192409, 0.739480, 0.681799, + 0.768242, 0.403607, 0.673576, 0.052052, 0.792450, 0.615634, 0.168112, + 0.159689, 0.323180, 0.576109, 0.944941, 0.757755, 0.215095, 0.049858, + 0.578375, 0.586932, 0.722979, 0.603003, 0.652251, 0.323343, 0.908544, + 0.571514, 0.642065, 0.561823, 0.649704, 0.154153, 0.464051, 0.860713, + 0.346562, 0.203532, 0.542512, 0.114804, 0.607139, 0.216088, 0.166856, + 0.399588, 0.831722, 0.334968, 0.559277, 0.154902, 0.911077, 0.504218, + 0.912656, 0.126172, 0.554076, 0.491031, 0.713104, 0.277055, 0.094034, + 0.365355, 0.600398, 0.002578, 0.936869, 0.242463, 0.564401, 0.586574, + 0.396616, 0.028452, 0.447287, 0.743178, 0.231984, 0.989799, 0.857982, + 0.839122, 0.205887, 
0.024838, 0.238711, 0.037608, 0.359806, 0.797987, + 0.192510, 0.270883, 0.302205, 0.105166, 0.397055, 0.856281, 0.596197, + 0.110160, 0.133336, 0.690231, 0.475515, 0.733734, 0.692809, 0.412384, + 0.976196, 0.257209, 0.998958, 0.372812, 0.285661, 0.446245, 0.115990, + 0.517645, 0.436044, 0.973972, 0.356767, 0.641930, 0.998810, 0.595478, + 0.679539, 0.358617, 0.393465, 0.872049, 0.629500, 0.695670, 0.977215, + 0.026555, 0.551951, 0.573412, 0.136715, 0.685287, 0.263643, 0.612229, + 0.419020, 0.956451, 0.024613, 0.395216, 0.213661, 0.023572, 0.768029, + 0.499322, 0.469816, 0.884019, 0.016967, 0.905860, 0.857991, 0.373734, + 0.547791, 0.856802, 0.969211, 0.227330, 0.215418, 0.362676, 0.099378, + 0.844918, 0.058346, 0.076594, 0.871473, 0.610297, 0.650006, 0.008188, + 0.295583, 0.913648, 0.620417, 0.714603, 0.870100, 0.645031, 0.109820, + 0.083760, 0.668602, 0.877849, 0.583082, 0.138419, 0.761868, 0.600049, + 0.044279, 0.619859, 0.973783, 0.592069, 0.476661, 0.942994, 0.819399, + 0.692079, 0.305670, 0.918778, 0.536997, 0.364016, 0.995371, 0.408470, + 0.974313, 0.645377, 0.416658, 0.269896, 0.559025, 0.037075, 0.984499, + 0.429125, 0.682105, 0.094319, 0.512885, 0.350707, 0.972168, 0.095967, + 0.489126, 0.734035, 0.696016, 0.533405, 0.353894, 0.669799, 0.125474, + 0.830555, 0.612793, 0.944873, 0.522634, 0.918463, 0.863651, 0.059631, + 0.282479, 0.859022, 0.468101, 0.256791, 0.504398, 0.884758, 0.526687, + 0.063423, 0.921833, 0.511186, 0.492548, 0.603939, 0.605505, 0.005433, + 0.954646, 0.577673, 0.101400, 0.443772, 0.311708, 0.797417, 0.977176, + 0.665602, 0.467216, 0.102650, 0.496157, 0.080009, 0.047524, 0.018791, + 0.998471, 0.911174, 0.078422, 0.280950, 0.770196, 0.546523, 0.537741, + 0.274594, 0.431281, 0.064428, 0.338017, 0.353115, 0.575615, 0.830565, + 0.957053, 0.181120, 0.835998, 0.911699, 0.758793, 0.937398, 0.355471, + 0.070501, 0.734815, 0.332647, 0.736103, 0.202031, 0.435297, 0.232261, + 0.282039, 0.482821, 0.251052, 0.280511, 0.393995, 0.329474, 0.561460, + 0.164191, 0.875997, 0.099202, 0.438785, 0.307278, 0.163630, 0.776802, + 0.660393, 0.739244, 0.607367, 0.617446, 0.920364, 0.443365, 0.529145, + 0.679157, 0.380763, 0.884616, 0.749658, 0.115578, 0.217263, 0.485761, + 0.317609, 0.652560, 0.718021, 0.599648, 0.135381, 0.969073, 0.880159, + 0.529376, 0.298547, 0.441619, 0.693567, 0.174544, 0.540821, 0.132351, + 0.481822, 0.704450, 0.909153, 0.142215, 0.443695, 0.516520, 0.759661, + 0.364059, 0.959885, 0.288806, 0.043216, 0.340648, 0.173422, 0.792874, + 0.456226, 0.390685, 0.278634, 0.773834, 0.043245, 0.996656, 0.373483, + 0.178625, 0.965729, 0.253641, 0.708001, 0.264276, 0.695260, 0.401568, + 0.438820, 0.236081, 0.533919, 0.920642, 0.940531, 0.443072, 0.062857, + 0.384226, 0.959592, 0.822518, 0.748285, 0.919477, 0.111325, 0.791501, + 0.260124, 0.284747, 0.584375, 0.716350, 0.675431, 0.863009, 0.490184, + 0.718676, 0.859665, 0.863666, 0.897301, 0.825393, 0.117308, 0.605302, + 0.089669, 0.812568, 0.006870, 0.528489, 0.048649, 0.540788, 0.449131, + 0.989180, 0.983860, 0.511988, 0.373407, 0.943452, 0.334506, 0.121692, + 0.862929, 0.445831, 0.913193, 0.123053, 0.730578, 0.497568, 0.839402, + 0.406009, 0.360577, 0.329586, 0.124685, 0.220241, 0.193253, 0.021986, + 0.045634, 0.310560, 0.627288, 0.135303, 0.123128, 0.634158, 0.663792, + 0.171777, 0.174946, 0.112923, 0.160958, 0.158806, 0.624911, 0.534364, + 0.102259, 0.959418, 0.656056, 0.965187, 0.405249, 0.569249, 0.088240, + 0.135827, 0.066817, 0.927642, 0.541836, 0.427393, 0.257229, 0.666520, + 0.647634, 0.450481, 0.688506, 0.693269, 
0.761042, 0.315794, 0.828572, + 0.884170, 0.949952, 0.492364, 0.055947, 0.124898, 0.605288, 0.216905, + 0.283705, 0.230199, 0.751269, 0.385963, 0.189616, 0.407326, 0.351151, + 0.594865, 0.976575, 0.439391, 0.730692, 0.043392, 0.367033, 0.272527, + 0.470785, 0.624261, 0.939048, 0.118419, 0.074743, 0.627554, 0.811688, + 0.835784, 0.943348, 0.640260, 0.719954, 0.893300, 0.132625, 0.775901, + 0.018199, 0.737913, 0.992806, 0.301903, 0.968111, 0.744076, 0.687867, + 0.157728, 0.151401, 0.039017, 0.752593, 0.127976, 0.478408, 0.483284, + 0.171368, 0.845441, 0.755811, 0.642153, 0.469702, 0.694859, 0.760572, + 0.544445, 0.322413, 0.572260, 0.380229, 0.265761, 0.212521, 0.100183, + 0.159062, 0.345146, 0.876084, 0.177261, 0.083058, 0.868891, 0.479164, + 0.051169, 0.612966, 0.167030, 0.208897, 0.764367, 0.206048, 0.961490, + 0.892343, 0.684456, 0.444774, 0.063711, 0.529896, 0.200585, 0.705863, + 0.999598, 0.895444, 0.466435, 0.544043, 0.217857, 0.038696, 0.924272, + 0.483618, 0.251217, 0.024455, 0.642680, 0.596362, 0.900539, 0.819941, + 0.679420, 0.769430, 0.299105, 0.730590, 0.382396, 0.466135, 0.939487, + 0.146763, 0.672183, 0.900977, 0.039106, 0.356638, 0.345750, 0.102817, + 0.886535, 0.546336, 0.808681, 0.886133, 0.441780, 0.275116, 0.430176, + 0.659637, 0.313812, 0.354448, 0.143255, 0.565028, 0.378903, 0.785935, + 0.161391, 0.279443, 0.605876, 0.840811, 0.048873, 0.904980, 0.571401, + 0.431269, 0.371115, 0.510887, 0.578032, 0.043298, 0.411864, 0.617138, + 0.399936, 0.757614, 0.719955, 0.286471, 0.303950, 0.528636, 0.172604, + 0.745730, 0.803752, 0.602780, 0.405367, 0.117564, 0.957228, 0.548622, + 0.682592, 0.336131, 0.334557, 0.843983, 0.615574, 0.940433, 0.684794, + 0.664447, 0.845413, 0.256194, 0.095715, 0.216529, 0.767082, 0.673747, + 0.259827, 0.178946, 0.290885, 0.659763, 0.936560, 0.010840, 0.946234, + 0.240510, 0.539476, 0.118838, 0.986240, 0.343228, 0.721618, 0.391606, + 0.460792, 0.678846, 0.940228, 0.143384, 0.014977, 0.274785, 0.987367, + 0.630551, 0.215218, 0.672161, 0.294998, 0.060631, 0.928355, 0.390713, + 0.277160, 0.695436, 0.064460, 0.536987, 0.874382, 0.355345, 0.196751, + 0.810942, 0.366185, 0.142985, 0.051452, 0.905661, 0.261823, 0.037691, + 0.248889, 0.983441, 0.429297, 0.709681, 0.662286, 0.369525, 0.853066, + 0.677263, 0.644310, 0.840433, 0.307814, 0.859528, 0.512593, 0.602812, + 0.920160, 0.440948, 0.993525, 0.197320, 0.136384, 0.057984, 0.734307, + 0.010766, 0.413329, 0.931058, 0.821707, 0.779514, 0.074043, 0.873159, + 0.685175, 0.335865, 0.910850, 0.934065, 0.319306, 0.340147, 0.643746, + 0.981592, 0.709673, 0.496812, 0.658856, 0.353983, 0.337245, 0.966670, + 0.213511, 0.849838, 0.569482, 0.133671, 0.290786, 0.563007, 0.330991, + 0.427170, 0.620991, 0.065299, 0.437936, 0.034320, 0.996356, 0.259643, + 0.813834, 0.070399, 0.132802, 0.499009, 0.406265, 0.043652, 0.433074, + 0.725570, 0.383800, 0.076820, 0.707163, 0.093473, 0.573632, 0.366018, + 0.447456, 0.910877, 0.332688, 0.660967, 0.760714, 0.902170, 0.794638, + 0.051500, 0.465177, 0.125630, 0.478670, 0.086168, 0.190928, 0.916605, + 0.120488, 0.187285, 0.176248, 0.934322, 0.257684, 0.309050, 0.433331, + 0.663949, 0.352703, 0.866405, 0.389519, 0.736502, 0.943226, 0.096682, + 0.829975, 0.516858, 0.462700, 0.277430, 0.427734, 0.795388, 0.938398, + 0.188449, 0.697558, 0.733036, 0.239948, 0.162735, 0.858666, 0.718618, + 0.248903, 0.049594, 0.635223, 0.369391, 0.236879, 0.811472, 0.303713, + 0.494563, 0.120522, 0.737044, 0.158511, 0.473225, 0.603450, 0.548030, + 0.209727, 0.546675, 0.644712, 0.039702, 0.063533, 0.107412, 
0.317132, + 0.491267, 0.902800, 0.255530, 0.679716, 0.600359, 0.988566, 0.919664, + 0.763094, 0.847232, 0.638283, 0.011997, 0.896825, 0.273506, 0.381388, + 0.133704, 0.084978, 0.685101, 0.628267, 0.205500, 0.422145, 0.786778, + 0.678725, 0.025595, 0.334808, 0.888452, 0.572271, 0.979520, 0.928154, + 0.635804, 0.086932, 0.245286, 0.127071, 0.989732, 0.500816, 0.806787, + 0.590091, 0.489382, 0.726451, 0.353185, 0.336614, 0.364734, 0.365182, + 0.233439, 0.638240, 0.746570, 0.367143, 0.723218, 0.431671, 0.995410, + 0.928718, 0.853816, 0.782188, 0.607442, 0.879411, 0.116995, 0.495894, + 0.451682, 0.096515, 0.424048, 0.087485, 0.183447, 0.669334, 0.214556, + 0.173179, 0.170151, 0.021343, 0.763269, 0.659533, 0.747794, 0.116454, + 0.996147, 0.112528, 0.481635, 0.229586, 0.750768, 0.228205, 0.596730, + 0.473985, 0.659876, 0.592139, 0.402703, 0.513692, 0.374327, 0.010145, + 0.393103, 0.491322, 0.506039, 0.844785, 0.587837, 0.930088, 0.932270, + 0.771284, 0.599422, 0.146826, 0.944463, 0.769573, 0.168169, 0.707732, + 0.429106, 0.915964, 0.824186, 0.425253, 0.028492, 0.305821, 0.654839, + 0.779259, 0.534026, 0.251569, 0.253245, 0.193901, 0.843708, 0.655947, + 0.707593, 0.218035, 0.666093, 0.100696, 0.709357, 0.172132, 0.945481, + 0.297195, 0.102220, 0.877751, 0.068479, 0.701642, 0.024577, 0.012941, + 0.471215, 0.192747, 0.720673, 0.900321, 0.108710, 0.544859, 0.325574, + 0.137202, 0.850679, 0.980413, 0.916462, 0.384705, 0.231982, 0.169706, + 0.578607, 0.075690, 0.825654, 0.286200, 0.293725, 0.491746, 0.386896, + 0.003083, 0.663878, 0.332377, 0.300278, 0.766098, 0.210128, 0.368756, + 0.467740, 0.234705, 0.381697, 0.938955, 0.427451, 0.102370, 0.839275, + 0.536162, 0.647229, 0.164849, 0.673364, 0.497908, 0.145262, 0.589825, + 0.882613, 0.377244, 0.759532, 0.461220, 0.452934, 0.585185, 0.747420, + 0.746660, 0.076932, 0.134316, 0.749743, 0.740810, 0.466692, 0.050020, + 0.506908, 0.676820, 0.418776, 0.974648, 0.911525, 0.800474, 0.913602, + 0.338976, 0.902844, 0.752878, 0.875138, 0.550072, 0.917727, 0.548502, + 0.047981, 0.062989, 0.138327, 0.930594, 0.440233, 0.897859, 0.391814, + 0.893168, 0.483044, 0.139234, 0.639828, 0.559975, 0.273549, 0.389570, + 0.300785, 0.740242, 0.439590, 0.807693, 0.417062, 0.858367, 0.782341, + 0.328586, 0.658840, 0.695943, 0.667562, 0.561684, 0.448821, 0.542700, + 0.111756, 0.366548, 0.091202, 0.159737, 0.429537, 0.229529, 0.090331, + 0.869770, 0.127388, 0.482145, 0.762938, 0.610432, 0.621379, 0.402765, + 0.170407, 0.894928, 0.792336, 0.471192, 0.635170, 0.231926, 0.278886, + 0.052232, 0.090293, 0.061226, 0.380818, 0.749133, 0.757170, 0.048380, + 0.310817, 0.205990, 0.591080, 0.422573, 0.572538, 0.682282, 0.582310, + 0.002075, 0.911812, 0.672641, 0.871845, 0.039199, 0.154786, 0.634783, + 0.649631, 0.776165, 0.037548, 0.820038, 0.671093, 0.829884, 0.291231, + 0.306263, 0.061810, 0.570116, 0.358495, 0.152103, 0.631343, 0.739313, + 0.901236, 0.388512, 0.787693, 0.212053, 0.594503, 0.378773, 0.634626, + 0.167040, 0.061056, 0.216937, 0.169115, 0.972867, 0.889578, 0.040960, + 0.012067, 0.044364, 0.675743, 0.661698, 0.820529, 0.713291, 0.481736, + 0.491623, 0.543175, 0.772966, 0.797886, 0.604985, 0.343083, 0.156380, + 0.757088, 0.974425, 0.895693, 0.658324, 0.362938, 0.683386, 0.870376, + 0.957440, 0.062159, 0.505002, 0.124481, 0.123215, 0.721939, 0.293596, + 0.096082, 0.611517, 0.334556, 0.108149, 0.655881, 0.010299, 0.769846, + 0.476411, 0.723590, 0.251582, 0.968033, 0.266765, 0.024548, 0.765919, + 0.871750, 0.367631, 0.922299, 0.628838, 0.342056, 0.817992, 0.287162, + 0.704994, 
0.501378, 0.157538, 0.662434, 0.563537, 0.662541, 0.786915, + 0.686752, 0.384480, 0.080511, 0.782834, 0.995997, 0.415067, 0.890983, + 0.651878, 0.425365, 0.660829, 0.128289, 0.148956, 0.912411, 0.096322, + 0.415721, 0.936959, 0.862241, 0.287471, 0.304590, 0.784540, 0.916309, + 0.646646, 0.602533, 0.203471, 0.351640, 0.103911, 0.361009, 0.014074, + 0.667448, 0.023550, 0.800989, 0.354200, 0.408030, 0.881500, 0.137034, + 0.404026, 0.296566, 0.028017, 0.055904, 0.721932, 0.688846, 0.184193, + 0.870887, 0.601257, 0.280515, 0.286608, 0.538216, 0.142755, 0.574079, + 0.842806, 0.927296, 0.490388, 0.489452, 0.529828, 0.693859, 0.841092, + 0.633739, 0.054869, 0.855167, 0.301187, 0.078419, 0.656156, 0.655388, + 0.486448, 0.537656, 0.792422, 0.890475, 0.834222, 0.820439, 0.946379, + 0.556153, 0.509285, 0.130571, 0.427041, 0.110542, 0.411086, 0.713648, + 0.648758, 0.553842, 0.287727, 0.491563, 0.481137, 0.778116, 0.981015, + 0.010966, 0.471975, 0.822107, 0.644705, 0.526844, 0.677274, 0.945892, + 0.605263, 0.333430, 0.601280, 0.091711, 0.871086, 0.393702, 0.982186, + 0.705307, 0.214141, 0.928564, 0.261461, 0.723426, 0.059136, 0.688501, + 0.833968, 0.470222, 0.402150, 0.482725, 0.024063, 0.689877, 0.974289, + 0.505201, 0.467993, 0.955304, 0.516166, 0.939968, 0.777411, 0.160871, + 0.466812, 0.454685, 0.106763, 0.072075, 0.788115, 0.708043, 0.163786, + 0.659201, 0.101744, 0.145971, 0.364508, 0.315885, 0.074536, 0.625969, + 0.039311, 0.133672, 0.314471, 0.873279, 0.603893, 0.716620, 0.356004, + 0.627957, 0.406498, 0.330292, 0.133157, 0.874490, 0.285596, 0.649324, + 0.814458, 0.063007, 0.810195, 0.281270, 0.517693, 0.916958, 0.353345, + 0.305808, 0.625000, 0.517131, 0.965009, 0.726745, 0.663102, 0.329518, + 0.042630, 0.737638, 0.955487, 0.081940, 0.871310, 0.269957, 0.955219, + 0.475203, 0.986578, 0.311223, 0.103160, 0.393075, 0.641515, 0.236317, + 0.267566, 0.927112, 0.885641, 0.082024, 0.990119, 0.695835, 0.363295, + 0.507812, 0.612793, 0.716640, 0.813620, 0.237793, 0.233770, 0.778629, + 0.964538, 0.896872, 0.108147, 0.007167, 0.634510, 0.063633, 0.089108, + 0.505820, 0.333591, 0.044327, 0.981023, 0.320168, 0.355550, 0.084182, + 0.713244, 0.997065, 0.320499, 0.980810, 0.924177, 0.206140, 0.062834, + 0.914296, 0.901975, 0.426129, 0.422107, 0.514768, 0.142768, 0.235727, + 0.752561, 0.376539, 0.014356, 0.717099, 0.273411, 0.122502, 0.724266, + 0.907921, 0.186136, 0.813374, 0.413741, 0.519726, 0.857701, 0.394764, + 0.839895, 0.213251, 0.478946, 0.553139, 0.210317, 0.799446, 0.533948, + 0.134493, 0.005586, 0.596782, 0.048789, 0.907561, 0.022911, 0.470896, + 0.422329, 0.165679, 0.706623, 0.174890, 0.542218, 0.720979, 0.891989, + 0.815629, 0.843481, 0.616255, 0.723551, 0.029617, 0.429630, 0.137292, + 0.549343, 0.287331, 0.532056, 0.389238, 0.500583, 0.011002, 0.942377, + 0.710899, 0.810448, 0.476326, 0.845392, 0.816033, 0.073108, 0.894181, + 0.723594, 0.096019, 0.365077, 0.145923, 0.261699, 0.071700, 0.320813, + 0.803917, 0.792679, 0.212802, 0.619546, 0.636160, 0.829057, 0.343096, + 0.665777, 0.258687, 0.480388, 0.215121, 0.546018, 0.012444, 0.604359, + 0.046601, 0.023446, 0.546736, 0.757500, 0.833893, 0.023062, 0.602892, + 0.649927, 0.096170, 0.497074, 0.373521, 0.192189, 0.862151, 0.519444, + 0.453887, 0.933851, 0.840257, 0.257804, 0.726531, 0.053058, 0.877350, + 0.362691, 0.882115, 0.220446, 0.028468, 0.140802, 0.700834, 0.243589, + 0.686821, 0.713278, 0.847948, 0.733421, 0.736723, 0.394684, 0.490921, + 0.570617, 0.417746, 0.093813, 0.220543, 0.513916, 0.590887, 0.594064, + 0.706105, 0.453038, 0.113508, 
0.159992, 0.386889, 0.953765, 0.417796, + 0.113420, 0.006823, 0.295146, 0.476111, 0.888938, 0.515592, 0.504579, + 0.029741, 0.216426, 0.748168, 0.716561, 0.929703, 0.596117, 0.449982, + 0.666427, 0.990801, 0.940903, 0.237043, 0.408547, 0.034717, 0.457587, + 0.922463, 0.625603, 0.051651, 0.628568, 0.078641, 0.165159, 0.788560, + 0.465530, 0.118923, 0.206356, 0.578950, 0.125746, 0.501502, 0.055060, + 0.014685, 0.017094, 0.559640, 0.044425, 0.233519, 0.307808, 0.760986, + 0.163223, 0.903925, 0.210969, 0.829650, 0.894726, 0.151872, 0.066693, + 0.303273, 0.186589, 0.524279, 0.225736, 0.812192, 0.575930, 0.854304, + 0.890833, 0.741089, 0.642864, 0.356363, 0.860012, 0.849220, 0.935313, + 0.985758, 0.350722, 0.990373, 0.000443, 0.367815, 0.550013, 0.044868, + 0.601335, 0.857820, 0.805855, 0.764557, 0.761745, 0.016823, 0.594207, + 0.656471, 0.168696, 0.660900, 0.959744, 0.355284, 0.185179, 0.185480, + 0.167477, 0.761110, 0.039784, 0.058310, 0.502199, 0.682648, 0.414673, + 0.362211, 0.531868, 0.349985, 0.347969, 0.882589, 0.340358, 0.348412, + 0.250404, 0.890371, 0.393280, 0.851739, 0.748191, 0.199135, 0.616297, + 0.509936, 0.215958, 0.210504, 0.166407, 0.384654, 0.871404, 0.126151, + 0.739938, 0.056583, 0.311631, 0.907415, 0.817693, 0.351415, 0.965724, + 0.319891, 0.034062, 0.380397, 0.682102, 0.565930, 0.730382, 0.030072, + 0.448519, 0.070741, 0.378484, 0.698924, 0.961112, 0.771764, 0.550663, + 0.709303, 0.970899, 0.166959, 0.219239, 0.186857, 0.377463, 0.385647, + 0.571511, 0.248867, 0.511798, 0.311449, 0.305450, 0.823429, 0.218864, + 0.123142, 0.174844, 0.184588, 0.443034, 0.208906, 0.564986, 0.125136, + 0.774836, 0.295368, 0.155207, 0.223355, 0.366109, 0.533691, 0.922279, + 0.327221, 0.305455, 0.472942, 0.036524, 0.276354, 0.639901, 0.255763, + 0.463211, 0.017364, 0.641410, 0.034722, 0.266231, 0.153207, 0.346171, + 0.571680, 0.976636, 0.565036, 0.694822, 0.151480, 0.749624, 0.137856, + 0.360386, 0.314610, 0.262992, 0.135222, 0.609978, 0.418200, 0.358578, + 0.976087, 0.951891, 0.280856, 0.303307, 0.257346, 0.753798, 0.339831, + 0.533700, 0.393699, 0.595594, 0.996911, 0.411063, 0.237003, 0.031634, + 0.677294, 0.390211, 0.377805, 0.248974, 0.366847, 0.942841, 0.943796, + 0.518327, 0.692465, 0.081653, 0.878713, 0.007074, 0.344645, 0.013936, + 0.617052, 0.762845, 0.372513, 0.593138, 0.714736, 0.653370, 0.896446, + 0.972082, 0.407168, 0.236276, 0.505782, 0.800867, 0.831870, 0.502693, + 0.211930, 0.068873, 0.534327, 0.889224, 0.459084, 0.912132, 0.138197, + 0.825931, 0.854972, 0.081994, 0.344259, 0.547437, 0.163646, 0.222972, + 0.554511, 0.508291, 0.236908, 0.171563, 0.271135, 0.609421, 0.764701, + 0.985871, 0.262790, 0.661147, 0.957953, 0.669958, 0.897423, 0.463734, + 0.470825, 0.729293, 0.966427, 0.682755, 0.798166, 0.500754, 0.571978, + 0.257251, 0.412886, 0.710176, 0.083182, 0.267858, 0.792169, 0.427441, + 0.815295, 0.955815, 0.650413, 0.369805, 0.464106, 0.887320, 0.541368, + 0.735242, 0.496741, 0.306069, 0.721113, 0.759531, 0.967216, 0.679065, + 0.429489, 0.864639, 0.142799, 0.900314, 0.593932, 0.109227, 0.583069, + 0.392098, 0.609981, 0.155047, 0.649349, 0.022867, 0.865222, 0.732531, + 0.290725, 0.657392, 0.159972, 0.106019, 0.613207, 0.810384, 0.475824, + 0.077313, 0.697704, 0.017192, 0.812555}; + +static float golden_endtoend_output[] = { + -1.881211, -0.028385, -3.585066, 1.939770, -3.461155, 1.280415, -4.408978, + 0.608663, -2.704937, 1.859742, -5.777429, 2.691839, -1.049012, 1.640870, + -4.856245, 1.604236, 0.992707, 0.422858, -4.307465, 1.887332, -0.884831, + -0.154277, -2.634801, 
0.586827, -1.849960, 1.399608, -4.531559, 1.943591, + 0.271676, -2.893054, -2.066826, 0.235467, -1.248263, -1.164534, -2.640174, + -0.112878, -4.386484, 1.253024, -4.135623, 1.068984, -0.043579, -0.832957, + -3.257258, -0.514396, -1.651174, 0.638630, -4.364372, 1.548441, -0.289455, + 0.539845, -4.097627, 0.635001, -0.465071, -0.927701, -2.481498, 0.356616, + -2.355012, 0.728806, -3.340283, 1.609038, -4.786268, -0.532272, -1.886150, + 0.254797, 0.746620, -1.657134, -3.264265, 0.525551, -1.756837, 0.845446, + -5.572190, 1.715797, -2.856942, 3.394245, -5.803662, 2.281806, -3.014739, + 2.616136, -4.728482, 1.659984, -2.106307, 2.711709, -6.173832, 1.352869, + -0.038035, 0.107619, -4.279774, 2.341930, -0.980413, -0.119538, -4.049717, + 1.172128, -3.477744, 2.602274, -6.231380, 2.537300, -0.862214, 0.568722, + -3.858362, 0.197867, -1.725885, 3.687312, -7.067363, 2.403544, -0.944963, + 0.235639, -3.250094, 0.659117, -1.459576, 0.426128, -3.637207, 1.030386, + -4.224351, 3.516220, -6.053367, 0.993473, -2.182416, -0.762625, -1.884405, + -0.113736, -2.572602, 0.329290, -1.913233, 0.517418, -0.019757, 0.203176, + -3.715881, 0.482136, -1.912823, 1.357907, -5.473043, 1.714658, -3.177160, + 0.089285, -3.127669, 1.268076, 0.772498, -1.622712, -3.850314, 0.436124, + -1.495983, 3.439982, -7.623405, 1.726721, -0.423979, 0.180201, -2.902406, + 0.986457, -1.845638, 0.460903, -5.359343, -1.133931, -1.074456, 0.717304, + -3.519856, 1.012126, -0.562301, 1.881967, -6.716627, 2.525036, 0.945480, + 0.337081, -5.210562, 2.572035, -0.943370, 0.442026, -2.666313, 0.411296, + 0.002787, -0.000735, -2.498933, 0.771719, -3.568153, 3.833721, -6.617026, + 2.813922, -0.573970, 1.025208, -3.909923, 1.722648, -1.406849, 0.719783, + -5.207438, 1.819442, -0.530895, -0.010887, -2.939614, 0.971225, -1.660297, + 1.345243, -4.454571, 2.244876, -2.021213, 1.756090, -4.880947, 0.364597, + -2.380270, 2.763117, -5.613013, 2.137534, 0.289101, -2.279400, -3.365582, + 0.170028, -1.142254, -0.709604, -3.656223, 1.804870, -0.854690, 0.592102, + -5.010415, 2.462687, -1.474710, 0.566002, -3.621819, -0.391946, -0.423524, + -0.631428, -3.513310, 0.962825, -1.480262, 0.319791, -3.610137, 1.842339, + -0.250073, 1.182022, -6.249267, 1.604172, 1.153759, -0.734054, -4.620415, + -0.030858, 0.050911, 1.524406, -4.724010, 1.451846, -3.277104, 2.414182, + -4.605285, 1.846092, -1.503047, -0.618200, -2.746546, -0.459332, -0.980326, + -1.199977, -2.043865, -0.165793, -2.214698, 3.108281, -7.127830, -0.123065, + 1.244948, -3.039923, -4.660061, -0.225957, -0.307210, -1.513205, -2.456005, + 0.840048, -0.741445, 2.328635, -6.015267, 2.723240, -1.381171, -0.728878, + -5.114925, -0.362034, -0.574923, 0.518080, -3.892457, 1.798948, 0.435119, + -0.371696, -2.807571, 1.302864, -2.063052, 1.036388, -4.232038, 1.397059, + -1.615668, -1.511019, -3.095508, 1.290955, -3.428723, 2.000287, -4.196487, + 1.566983, 0.196957, 0.224343, -4.926359, -0.691975, -0.214941, 1.546821, + -5.384868, 2.290820, -1.878865, 0.493692, -4.129823, 2.112036, 0.516558, + -2.553077, -2.717338, 0.017146, -2.016057, 1.628995, -4.240602, 1.189533, + -5.460220, 1.254738, -4.214903, 0.755659, -2.893235, 2.937762, -6.169453, + 2.035456, -5.613212, -0.122254, -1.973646, -0.060619, -2.119598, 1.413512, + -4.938738, 1.890244, 0.544169, -2.062413, -3.329637, -0.062515, -1.855805, + -0.791297, -2.570353, 0.607615, 0.305812, 0.338930, -4.150270, 2.274937, + 0.042653, 0.133825, -3.538155, 1.523639, -3.173690, -1.496599, -2.414655, + 0.464687, -1.448998, -0.368907, -3.520129, 0.203382, -2.443626, 1.266233, + 
-3.393848, 0.605911, -0.015353, 1.402006, -4.441003, 1.419281, 0.603587, + 0.434146, -4.966566, 2.171872, -0.688264, -0.009981, -4.461103, 1.538354, + -5.029816, -0.264424, -1.713510, -0.315258, -1.891606, 0.252074, -2.419428, + 0.043970, -1.291143, 2.048704, -4.590105, 0.524734, -1.889576, 0.134836, + -3.462745, 1.390663, -0.112773, 0.402735, -4.203784, 1.381043, -1.201634, + -1.968277, -1.425637, -0.181725, -1.250742, -2.102041, -3.925464, -1.256797, + -3.701354, -1.754610, -1.917231, -1.455910, -1.838006, 2.041781, -5.666212, + 2.752957, -2.659553, 2.553637, -4.872212, 1.443437, -2.081846, 3.311263, + -5.912457, 1.871049, 0.196148, -0.307044, -4.024967, 2.149149, 0.361809, + 0.620415, -5.939984, 0.180672, -1.209180, -0.269122, -3.240285, 1.460315, + -1.040803, 1.125700, -6.060366, 0.887767, -3.214111, 1.314368, -3.026808, + 1.023640, -3.815175, 1.795642, -4.355603, 1.064454, -0.046472, 0.618463, + -5.941646, 2.861891, -2.852155, -0.990457, -2.624445, 1.794494, -1.176747, + -0.358159, -3.206776, 1.138721, -2.819523, -1.825522, -1.450902, -0.187312, + -0.808727, 0.636872, -4.120567, 1.192623, 0.810731, -1.768519, -3.699450, + 1.527116, -2.772720, 3.012835, -5.912736, 1.599365, -4.696381, 2.234591, + -4.139552, 1.061768, -1.880089, 3.596274, -7.006379, 2.382152, -3.158115, + 3.844430, -7.044156, 2.307596, -2.473970, 1.312644, -5.467269, 0.197154, + -1.530040, 1.762275, -5.550757, 0.630276, -3.048947, 1.043777, -3.096658, + 1.345893, -1.329494, 2.065748, -4.711032, 2.227600, -0.413321, -0.032428, + -4.599650, 1.668734, -4.351490, -0.200022, -2.359903, 0.021997, 0.116028, + 1.159718, -5.093972, -0.142951, -2.409895, 0.906133, -2.728812, 0.809932, + -2.597363, 0.494130, -2.357861, 0.369825, -2.165235, 1.148522, -3.130562, + 0.759034, 0.646335, -1.463660, -3.508299, 1.059679, -1.485465, 1.007319, + -4.340716, 1.789864, -1.590654, 1.612324, -4.452007, 2.389805, -5.200148, + -1.068398, -1.306923, -0.472408, -0.392165, -0.524996, -2.933478, 1.518430, + -1.287781, 0.113422, -3.020525, 1.338359, -0.105982, 0.936014, -4.132197, + 1.836807, -0.616589, -1.029716, -3.271347, 0.284889, -2.653359, 2.135829, + -4.643613, 1.627981, 0.287733, -2.017263, -2.776574, 1.184792, 1.004161, + -1.483019, -4.339290, -0.787322, 0.582420, 1.137839, -5.673941, -0.001862, + -1.219142, 0.532561, -4.457245, 1.826807, -3.343291, 3.034610, -6.179855, + 2.235917, -4.369989, 4.018128, -6.632714, 0.926585, -0.485469, 0.536073, + -4.179557, 1.489637, -0.521762, 1.636089, -6.137912, 1.500867, -4.086009, + 1.961372, -3.688977, 1.358220, -1.544034, 1.763837, -4.357567, 1.852201, + -2.018725, 1.046264, -6.211127, 1.609419, -0.118441, 1.602284, -6.242423, + 1.518578, -0.604078, 1.106613, -5.393445, 2.595629, 0.142712, -1.903953, + -2.821177, 0.032758, -0.009152, 0.184628, -4.227636, 2.046843, -2.240138, + 1.256176, -5.108516, -0.308447, -2.998571, 4.657396, -7.582112, 2.510951, + -3.535784, 1.704560, -5.068484, 1.318466, -3.058265, 3.073172, -6.998089, + 3.178849, -2.420286, 2.277806, -4.999528, 1.423890, -1.672914, 0.447460, + -4.088940, 1.351087, -1.051546, -0.417955, -4.042147, 1.604102, -1.700931, + 2.796663, -6.497579, 2.857974, -0.240828, 0.858001, -5.778933, 2.778508, + -0.406211, 1.300766, -5.073671, 2.089362, -0.201673, 1.588396, -6.000150, + 2.185055, -2.332125, 0.768216, -2.609184, 0.327277, -3.358943, -1.020736, + -2.389984, 0.315512, -0.561905, 1.948740, -6.408485, 2.231985, -0.603652, + 0.661829, -5.070386, -1.063058, -0.624796, 1.375772, -4.379606, 1.929358, + -1.047263, 0.739100, -5.217857, 2.127625, -5.025338, 
0.650344, -2.068460, + 0.076936, -0.457505, -1.050984, -1.917765, 1.150908, 0.782625, 0.855595, + -5.321719, 0.787209, -0.460232, 1.106736, -5.552326, 2.801043, -0.360217, + -0.434432, -4.273378, 0.967556, -0.972652, 0.874811, -5.429918, -0.331039, + 0.115477, 0.111883, -5.418786, 1.240546, -1.842794, 0.505880, -3.676064, + -0.682369, 1.858984, -0.742566, -5.784060, 0.673239, -1.280398, 0.280842, + -4.848077, 2.214860, -0.785100, -0.588488, -2.438206, 0.786651, -1.568752, + 1.935400, -6.320256, 2.125338, -1.476457, -1.651941, -2.695734, 0.007338, + -3.280860, 2.310385, -5.319578, 1.890123, -0.775723, 0.630606, -4.321582, + 1.085521, -1.847371, 1.188521, -4.596577, 2.056443, -2.340172, -0.108501, + -3.156392, 0.933279, -0.495331, 0.122405, -5.171133, 1.763245, -0.796913, + 2.310487, -7.247197, 2.401678, -1.908860, 0.043798, -2.393796, 0.573806, + -0.608531, 0.154710, -4.669001, 0.750680, 0.468380, 0.392591, -4.755001, + 2.615217, -1.957774, 1.153513, -4.530099, 1.124362, -3.569415, 1.697154, + -3.536335, 0.910758, -2.976264, 1.833129, -4.287203, -0.547050, -2.409768, + 0.061585, -1.324116, 0.268497, -2.962222, -1.524245, -2.063413, 0.442058, + -4.292337, 3.538863, -6.699603, 1.718664, -2.290363, 1.994596, -6.245037, + -0.433084, -0.367059, 1.020297, -4.940721, 2.902264, -0.577056, -0.709887, + -5.001413, -0.268316, -1.112048, -1.083307, -1.753492, 0.209973, 0.139540, + 0.917602, -5.232745, 2.538467, -2.139234, -0.187388, -1.837249, -0.478582, + -0.731653, -0.481550, -2.531261, 1.044770, 0.707750, 0.279971, -3.221119, + 1.552074, -2.373144, 0.859518, -3.665156, 1.620278, -1.440871, -0.525581, + -2.758271, 1.491873, -2.302013, 1.119935, -5.257080, 2.627170, -3.174739, + 1.363282, -4.831639, 1.101076, -4.337008, 2.689639, -5.165915, 1.069201, + -1.882078, -0.120370, -2.287967, 1.147619, -1.403616, 1.077150, -5.084296, + 1.658236, -0.919642, 0.487423, -3.001075, 0.741268, 0.107300, 0.943556, + -3.544311, 1.000239, -1.627171, 2.871253, -5.179172, 1.429893, -0.826040, + 0.188670, -4.499894, 1.013447, -2.101299, 0.317516, -3.452141, -0.833776, + -1.362144, 1.272437, -4.449355, 1.613591, -2.039873, 2.613175, -6.229640, + 1.659790, -1.595520, -0.237462, -2.744997, 0.337841, 0.148981, -1.703771, + -2.388023, 1.276469, 1.058508, -0.401642, -4.680769, 0.861881, -1.336381, + 1.153080, -2.834378, 0.721075, 0.900115, 1.360511, -5.573611, 0.949182, + -2.970844, 2.017563, -5.186108, -0.201038, -1.192824, 0.610142, -4.450919, + -0.897114, -1.812093, 0.422310, -5.245487, 0.256549, 0.320275, -2.324150, + -2.967040, -0.260536, -0.721467, 0.454148, -5.058031, 0.526370, -0.895656, + 0.732240, -3.327363, 1.353953, -1.277912, -0.483171, -1.926713, 0.065044, + -2.167506, -0.196606, -1.923437, 0.604962, -2.088319, 1.406834, -5.227296, + 2.247351, -4.421744, 1.729791, -5.007922, 1.264769, -0.897019, 0.922902, + -3.887108, 2.087432, -1.310226, -0.101938, -3.359082, -0.079662, -0.514988, + -0.963179, -4.038209, 2.223278, -0.590083, -2.310458, -1.748338, 0.363406, + -0.540731, -0.885913, -4.179595, 2.216781, -3.044339, -0.447100, -2.446098, + 0.931101, -1.676190, 2.096175, -4.980755, 2.262151, -1.095047, 1.897516, + -5.996138, 2.191038, 0.297128, -0.780974, -2.884299, 1.195408, -0.521065, + -1.955837, -3.091064, -0.404183, -1.961519, 4.076096, -7.521851, 2.242064, + -1.988043, 0.303300, -2.422585, 0.322230, -3.377634, 3.499955, -7.084434, + 2.375587, -0.718851, 2.150076, -5.412241, 2.374280, -2.006088, 2.229828, + -5.848188, 2.543077, -2.171042, 2.096026, -5.300007, 0.141405, -1.187745, + 0.105340, -4.003816, 1.034281, 
-3.980804, 1.856709, -5.103042, 0.623737, + -2.080307, 0.896140, -3.104050, 0.983158, -0.424898, -1.154270, -3.805728, + 1.978917, -1.314387, 1.235096, -3.148906, 1.113173, 0.111713, 2.055213, + -7.565283, 2.100342}; +constexpr std::initializer_list biases = { + 0.065691948, -0.69055247, 0.1107955, -0.97084129, -0.23957068, -0.23566568, + -0.389184, 0.47481549, -0.4791103, 0.29931796, 0.10463274, 0.83918178, + 0.37197268, 0.61957061, 0.3956964, -0.37609905}; + +constexpr std::initializer_list recurrent_weights = { + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1}; + +class BidirectionalRNNOpModel : public SingleOpModel { + public: + BidirectionalRNNOpModel(int batches, int sequence_len, int fw_units, + int bw_units, int input_size) + : batches_(batches), + sequence_len_(sequence_len), + fw_units_(fw_units), + bw_units_(bw_units), + input_size_(input_size) { + input_ = AddInput(TensorType_FLOAT32); + fw_weights_ = AddInput(TensorType_FLOAT32); + fw_recurrent_weights_ = AddInput(TensorType_FLOAT32); + fw_bias_ = AddInput(TensorType_FLOAT32); + fw_hidden_state_ = AddOutput(TensorType_FLOAT32); + fw_output_ = AddOutput(TensorType_FLOAT32); + bw_weights_ = AddInput(TensorType_FLOAT32); + bw_recurrent_weights_ = AddInput(TensorType_FLOAT32); + bw_bias_ = AddInput(TensorType_FLOAT32); + bw_hidden_state_ = AddOutput(TensorType_FLOAT32); + bw_output_ = AddOutput(TensorType_FLOAT32); + SetBuiltinOp(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN, + BuiltinOptions_SequenceRNNOptions, + CreateSequenceRNNOptions(builder_, /*time_major=*/false, + ActivationFunctionType_RELU) + .Union()); + BuildInterpreter({ + {batches_, sequence_len_, input_size_}, // input + {fw_units_, input_size_}, // fw_weights + {fw_units_, fw_units_}, // fw_recurrent_weights + {fw_units_}, // fw_bias + {bw_units_, input_size_}, // bw_weights + {bw_units_, bw_units_}, // bw_recurrent_weights + {bw_units_} // bw_bias + }); + } + + void SetFwBias(std::initializer_list f) { + PopulateTensor(fw_bias_, f); + } + + void SetBwBias(std::initializer_list f) { + PopulateTensor(bw_bias_, f); + } + + void SetFwWeights(std::initializer_list f) { + PopulateTensor(fw_weights_, f); + } + + void SetBwWeights(std::initializer_list f) { + PopulateTensor(bw_weights_, f); + } + + void SetFwRecurrentWeights(std::initializer_list f) { + PopulateTensor(fw_recurrent_weights_, f); + } + + void SetBwRecurrentWeights(std::initializer_list f) { + PopulateTensor(bw_recurrent_weights_, f); + } + + void SetInput(std::initializer_list data) { + PopulateTensor(input_, data); + } + + void SetInput(int offset, float* begin, float* end) { + PopulateTensor(input_, offset, begin, end); + } + + void ResetHiddenStates() { + const int fw_zero_buffer_size = fw_units_ * batches_; + std::unique_ptr 
fw_zero_buffer(new float[fw_zero_buffer_size]); + memset(fw_zero_buffer.get(), 0, fw_zero_buffer_size * sizeof(float)); + PopulateTensor(fw_hidden_state_, 0, fw_zero_buffer.get(), + fw_zero_buffer.get() + fw_zero_buffer_size); + const int bw_zero_buffer_size = bw_units_ * batches_; + std::unique_ptr bw_zero_buffer(new float[bw_zero_buffer_size]); + memset(bw_zero_buffer.get(), 0, bw_zero_buffer_size * sizeof(float)); + PopulateTensor(bw_hidden_state_, 0, bw_zero_buffer.get(), + bw_zero_buffer.get() + bw_zero_buffer_size); + } + + std::vector GetFwOutput() { return ExtractVector(fw_output_); } + std::vector GetBwOutput() { return ExtractVector(bw_output_); } + + int input_size() { return input_size_; } + int num_fw_units() { return fw_units_; } + int num_bw_units() { return bw_units_; } + int num_batches() { return batches_; } + int sequence_len() { return sequence_len_; } + + private: + int input_; + int fw_weights_; + int fw_recurrent_weights_; + int fw_bias_; + int fw_hidden_state_; + int fw_output_; + int bw_weights_; + int bw_recurrent_weights_; + int bw_bias_; + int bw_hidden_state_; + int bw_output_; + + int batches_; + int sequence_len_; + int fw_units_; + int bw_units_; + int input_size_; +}; + +// TODO(mirkov): add another test which directly compares to TF once TOCO +// supports the conversion from dynamic_rnn with BasicRNNCell. +TEST(BidirectionalRNNOpTest, BlackBoxTest) { + BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16, + /*fw_units=*/16, /*bw_units=*/16, + /*input_size=*/8); + rnn.SetFwWeights(weights); + rnn.SetBwWeights(weights); + rnn.SetFwBias(biases); + rnn.SetBwBias(biases); + rnn.SetFwRecurrentWeights(recurrent_weights); + rnn.SetBwRecurrentWeights(recurrent_weights); + + rnn.ResetHiddenStates(); + const int input_sequence_size = rnn.input_size() * rnn.sequence_len(); + float* batch_start = rnn_input; + float* batch_end = batch_start + input_sequence_size; + rnn.SetInput(0, batch_start, batch_end); + rnn.SetInput(input_sequence_size, batch_start, batch_end); + + rnn.Invoke(); + + float* golden_fw_start = rnn_golden_fw_output; + float* golden_fw_end = + golden_fw_start + rnn.num_fw_units() * rnn.sequence_len(); + std::vector fw_expected; + fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end); + fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end); + EXPECT_THAT(rnn.GetFwOutput(), ElementsAreArray(ArrayFloatNear(fw_expected))); + + float* golden_bw_start = rnn_golden_bw_output; + float* golden_bw_end = + golden_bw_start + rnn.num_bw_units() * rnn.sequence_len(); + std::vector bw_expected; + bw_expected.insert(bw_expected.end(), golden_bw_start, golden_bw_end); + bw_expected.insert(bw_expected.end(), golden_bw_start, golden_bw_end); + EXPECT_THAT(rnn.GetBwOutput(), ElementsAreArray(ArrayFloatNear(bw_expected))); +} + +// Check that if the input sequence is reversed the outputs are the same just +// forward and backward are swapped (and reversed). +TEST(BidirectionalRNNOpTest, BlackBoxTestReverseInputs) { + BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16, + /*fw_units=*/16, /*bw_units=*/16, + /*input_size=*/8); + rnn.SetFwWeights(weights); + rnn.SetBwWeights(weights); + rnn.SetFwBias(biases); + rnn.SetBwBias(biases); + rnn.SetFwRecurrentWeights(recurrent_weights); + rnn.SetBwRecurrentWeights(recurrent_weights); + + rnn.ResetHiddenStates(); + + // Reverse inputs in each batch: in_1, in_2,..., in_k is inserted in the + // following order: [in_k,..., in_2, in_1, in_k,...,in_2, in_1]. 
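+  // For example, step i of the original sequence lands at offset
+  // (sequence_len - i - 1) * input_size in the first batch, and at that same
+  // offset shifted by sequence_len * input_size in the second batch.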
+ for (int i = 0; i < rnn.sequence_len(); i++) { + float* batch_start = rnn_input + i * rnn.input_size(); + float* batch_end = batch_start + rnn.input_size(); + const int reverse_idx = rnn.sequence_len() - i - 1; + rnn.SetInput(reverse_idx * rnn.input_size(), batch_start, batch_end); + rnn.SetInput((rnn.sequence_len() + reverse_idx) * rnn.input_size(), + batch_start, batch_end); + } + + rnn.Invoke(); + + // The forward and backward outputs are swapped. + std::vector fw_expected; // consider using std::deque instead. + for (int i = 0; i < rnn.sequence_len(); i++) { + float* golden_fw_start = rnn_golden_bw_output + i * rnn.num_fw_units(); + float* golden_fw_end = golden_fw_start + rnn.num_fw_units(); + fw_expected.insert(fw_expected.begin(), golden_fw_start, golden_fw_end); + } + fw_expected.insert(fw_expected.end(), fw_expected.begin(), fw_expected.end()); + EXPECT_THAT(rnn.GetFwOutput(), ElementsAreArray(ArrayFloatNear(fw_expected))); + + std::vector bw_expected; + for (int i = 0; i < rnn.sequence_len(); i++) { + float* golden_bw_start = rnn_golden_fw_output + i * rnn.num_bw_units(); + float* golden_bw_end = golden_bw_start + rnn.num_bw_units(); + bw_expected.insert(bw_expected.begin(), golden_bw_start, golden_bw_end); + } + bw_expected.insert(bw_expected.end(), bw_expected.begin(), bw_expected.end()); + EXPECT_THAT(rnn.GetBwOutput(), ElementsAreArray(ArrayFloatNear(bw_expected))); +} + +// Tests an end-to-end neural network with a Bidirectional RNN followed by a +// DNN that aggregates the outputs from the two sequences. +TEST(BidirectionalRNNOpTest, EndToEndTest) { + BidirectionalRNNOpModel rnn(/*batches=*/1, /*sequence_len=*/4, + /*fw_units=*/16, /*bw_units=*/16, + /*input_size=*/8); + const int output_size = 4; + float dnn_weights[] = { + -0.5782342, -0.052212059, 0.73036242, -0.81216097, -0.80088139, + -0.23420811, -0.39647382, 0.31423986, 0.61819065, -0.73659575, + -0.89698344, -0.8931554, -0.0845688, 0.5617367, 0.38415289, + -0.11487955, -0.7617774, 0.17927337, 0.15726972, 0.059798479, + 0.19009054, -0.27616632, -0.39142907, 0.77744663, -0.046830714, + -0.6603595, 0.21945822, 0.051494241, 0.23785079, 0.19239247, + -0.53268754, 0.65961659, -0.85981959, -0.80232513, 0.84745562, + -0.66070104, -0.036533296, -0.54901814, 0.65353882, -0.41834265, + -0.28561389, 0.75655544, -0.31149811, 0.62981737, 0.31829214, + -0.92734522, -0.48506218, 0.55651462, 0.25192821, 0.67220747, + -0.3836869, -0.55798125, -0.60395885, 0.22488403, -0.78053463, + 0.3492105, 0.56452453, 0.4389236, -0.59929526, -0.19762468, + -0.36868393, -0.13198286, -0.53800809, -0.22850353}; + + std::initializer_list dnn_biases = { + 0.29177809, -0.98799044, 0.065919638, 0.68781924}; + + rnn.SetFwWeights(weights); + rnn.SetBwWeights(weights); + rnn.SetFwBias(biases); + rnn.SetBwBias(biases); + rnn.SetFwRecurrentWeights(recurrent_weights); + rnn.SetBwRecurrentWeights(recurrent_weights); + + rnn.ResetHiddenStates(); + + const int input_sequence_size = rnn.input_size() * rnn.sequence_len(); + const int output_sequence_size = output_size * rnn.sequence_len(); + const int num_examples = 64; + for (int k = 0; k < num_examples; k++) { + float* batch_start = endtoend_input + k * input_sequence_size; + float* batch_end = batch_start + input_sequence_size; + rnn.SetInput(0, batch_start, batch_end); + + rnn.Invoke(); + + std::vector fw_output = rnn.GetFwOutput(); + std::vector bw_output = rnn.GetBwOutput(); + EXPECT_EQ(fw_output.size(), bw_output.size()); + + std::transform(fw_output.begin(), fw_output.end(), bw_output.begin(), + 
fw_output.begin(), std::plus()); + + std::vector sequence_result; + for (int s = 0; s < rnn.sequence_len(); s++) { + const float* rnn_output = fw_output.data() + s * rnn.num_fw_units(); + std::vector results(dnn_biases); + for (int i = 0; i < output_size; i++) { + for (int j = 0; j < rnn.num_fw_units(); j++) { + results[i] += *(rnn_output + j) * dnn_weights[output_size * j + i]; + } + } + sequence_result.insert(sequence_result.end(), results.begin(), + results.end()); + } + + float* golden_start = golden_endtoend_output + k * output_sequence_size; + float* golden_end = golden_start + output_sequence_size; + + std::vector expected; + expected.insert(expected.end(), golden_start, golden_end); + EXPECT_THAT(sequence_result, ElementsAreArray(ArrayFloatNear(expected))); + } +} + +} // namespace +} // namespace tflite + +int main(int argc, char** argv) { + // On Linux, add: tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc index 37f499a4d09..7a456474341 100644 --- a/tensorflow/contrib/lite/kernels/conv.cc +++ b/tensorflow/contrib/lite/kernels/conv.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/contrib/lite/builtin_op_data.h" #include "tensorflow/contrib/lite/context.h" #include "tensorflow/contrib/lite/kernels/gemm_support.h" +#include "tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/quantization_util.h" @@ -38,11 +39,16 @@ namespace ops { namespace builtin { namespace conv { -// This file has three implementation of Conv. +// This file has 4 implementations of Conv. enum KernelType { kReference, kGenericOptimized, // Neon-free - kNeonOptimized, + kMultithreadOptimized, + // The kernel uses the CBLAS interface for matrix multiplication. + // It's fast when an optimized CBLAS implementation is available (e.g. Apple + // Accelerate Framework), and it's slow when falling back to naive + // implementation.
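+  // The CBLAS kernel is implemented in internal/optimized/cblas_conv.h.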
+ kCblasOptimized, }; struct OpData { @@ -290,26 +296,34 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node, auto filter_offset = -filter->params.zero_point; auto output_offset = output->params.zero_point; - if (kernel_type == kReference) { - reference_ops::Conv( - GetTensorData(input), GetTensorDims(input), input_offset, - GetTensorData(filter), GetTensorDims(filter), filter_offset, - GetTensorData(bias), GetTensorDims(bias), params->stride_width, - params->stride_height, data->padding.width, data->padding.height, - output_offset, data->output_multiplier, data->output_shift, - data->output_activation_min, data->output_activation_max, - GetTensorData(output), GetTensorDims(output), - GetTensorData(im2col), GetTensorDims(im2col), gemm_context); - } else { - optimized_ops::Conv( - GetTensorData(input), GetTensorDims(input), input_offset, - GetTensorData(filter), GetTensorDims(filter), filter_offset, - GetTensorData(bias), GetTensorDims(bias), params->stride_width, - params->stride_height, data->padding.width, data->padding.height, - output_offset, data->output_multiplier, data->output_shift, - data->output_activation_min, data->output_activation_max, - GetTensorData(output), GetTensorDims(output), - GetTensorData(im2col), GetTensorDims(im2col), gemm_context); + switch (kernel_type) { + case kReference: + reference_ops::Conv( + GetTensorData(input), GetTensorDims(input), input_offset, + GetTensorData(filter), GetTensorDims(filter), filter_offset, + GetTensorData(bias), GetTensorDims(bias), + params->stride_width, params->stride_height, data->padding.width, + data->padding.height, output_offset, data->output_multiplier, + data->output_shift, data->output_activation_min, + data->output_activation_max, GetTensorData(output), + GetTensorDims(output), GetTensorData(im2col), + GetTensorDims(im2col), gemm_context); + break; + case kGenericOptimized: + case kMultithreadOptimized: + case kCblasOptimized: + // There is only one optimized implementation for Quantized Conv. 
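+      // kGenericOptimized, kMultithreadOptimized and kCblasOptimized all fall
+      // through to the same optimized_ops::Conv call below.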
+ optimized_ops::Conv( + GetTensorData(input), GetTensorDims(input), input_offset, + GetTensorData(filter), GetTensorDims(filter), filter_offset, + GetTensorData(bias), GetTensorDims(bias), + params->stride_width, params->stride_height, data->padding.width, + data->padding.height, output_offset, data->output_multiplier, + data->output_shift, data->output_activation_min, + data->output_activation_max, GetTensorData(output), + GetTensorDims(output), GetTensorData(im2col), + GetTensorDims(im2col), gemm_context); + break; } } @@ -322,31 +336,57 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node, CalculateActivationRangeFloat(params->activation, &output_activation_min, &output_activation_max); - if (kernel_type == kReference) { - reference_ops::Conv(GetTensorData(input), GetTensorDims(input), - GetTensorData(filter), GetTensorDims(filter), - GetTensorData(bias), GetTensorDims(bias), - params->stride_width, params->stride_height, - data->padding.width, data->padding.height, - output_activation_min, output_activation_max, - GetTensorData(output), GetTensorDims(output), - GetTensorData(im2col), GetTensorDims(im2col)); - } else { - const float* filter_data; - if (data->need_hwcn_weights) { - filter_data = GetTensorData(hwcn_weights); - } else { - filter_data = GetTensorData(filter); + switch (kernel_type) { + case kReference: { + reference_ops::Conv(GetTensorData(input), GetTensorDims(input), + GetTensorData(filter), GetTensorDims(filter), + GetTensorData(bias), GetTensorDims(bias), + params->stride_width, params->stride_height, + data->padding.width, data->padding.height, + output_activation_min, output_activation_max, + GetTensorData(output), GetTensorDims(output), + GetTensorData(im2col), GetTensorDims(im2col)); + break; + } + case kGenericOptimized: { + optimized_ops::Conv(GetTensorData(input), GetTensorDims(input), + GetTensorData(filter), GetTensorDims(filter), + GetTensorData(bias), GetTensorDims(bias), + params->stride_width, params->stride_height, + data->padding.width, data->padding.height, + output_activation_min, output_activation_max, + GetTensorData(output), GetTensorDims(output), + GetTensorData(im2col), GetTensorDims(im2col)); + break; + } + case kMultithreadOptimized: { + const float* filter_data; + if (data->need_hwcn_weights) { + filter_data = GetTensorData(hwcn_weights); + } else { + filter_data = GetTensorData(filter); + } + multithreaded_ops::Conv( + GetTensorData(input), GetTensorDims(input), filter_data, + GetTensorDims(filter), GetTensorData(bias), + GetTensorDims(bias), params->stride_width, params->stride_height, + data->padding.width, data->padding.height, params->padding, + output_activation_min, output_activation_max, + GetTensorData(output), GetTensorDims(output), + GetTensorData(im2col), GetTensorDims(im2col)); + break; + } + case kCblasOptimized: { + cblas_ops::Conv(GetTensorData(input), GetTensorDims(input), + GetTensorData(filter), GetTensorDims(filter), + GetTensorData(bias), GetTensorDims(bias), + params->stride_width, params->stride_height, + data->padding.width, data->padding.height, + output_activation_min, output_activation_max, + GetTensorData(output), GetTensorDims(output), + GetTensorData(im2col), GetTensorDims(im2col)); + break; } - - multithreaded_ops::Conv( - GetTensorData(input), GetTensorDims(input), filter_data, - GetTensorDims(filter), GetTensorData(bias), GetTensorDims(bias), - params->stride_width, params->stride_height, data->padding.width, - data->padding.height, params->padding, output_activation_min, - output_activation_max, 
GetTensorData(output), - GetTensorDims(output), GetTensorData(im2col), - GetTensorDims(im2col)); } } @@ -407,17 +447,25 @@ TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT() { return &r; } -TfLiteRegistration* Register_CONVOLUTION_NEON_OPT() { +TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT() { static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare, - conv::Eval}; + conv::Eval}; + return &r; +} + +TfLiteRegistration* Register_CONVOLUTION_CBLAS_OPT() { + static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare, + conv::Eval}; return &r; } TfLiteRegistration* Register_CONV_2D() { -#ifdef USE_NEON - return Register_CONVOLUTION_NEON_OPT(); +// TODO(ycling): Define a compilation flag and use the CBLAS kernel when a +// fast CBLAS implementation is available. +#ifdef TFLITE_USE_CBLAS_CONVOLUTION_KERNEL + return Register_CONVOLUTION_CBLAS_OPT(); #else - return Register_CONVOLUTION_GENERIC_OPT(); + return Register_CONVOLUTION_MULTITHREADED_OPT(); #endif } diff --git a/tensorflow/contrib/lite/kernels/conv_test.cc b/tensorflow/contrib/lite/kernels/conv_test.cc index 1d0a81c3135..d2393c3c97b 100644 --- a/tensorflow/contrib/lite/kernels/conv_test.cc +++ b/tensorflow/contrib/lite/kernels/conv_test.cc @@ -15,12 +15,25 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" #include "tensorflow/contrib/lite/interpreter.h" #include "tensorflow/contrib/lite/kernels/register.h" #include "tensorflow/contrib/lite/kernels/test_util.h" #include "tensorflow/contrib/lite/model.h" namespace tflite { + +namespace ops { +namespace builtin { + +TfLiteRegistration* Register_CONVOLUTION_REF(); +TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT(); +TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT(); +TfLiteRegistration* Register_CONVOLUTION_CBLAS_OPT(); + +} // namespace builtin +} // namespace ops + namespace { using ::testing::ElementsAreArray; @@ -30,9 +43,9 @@ class BaseConvolutionOpModel : public SingleOpModel { // TODO(ahentz): Also test different activation types, bias, padding types, // stride values.
BaseConvolutionOpModel( - const TensorData& input, const TensorData& filter, - const TensorData& output, int stride_width = 2, int stride_height = 2, - enum Padding padding = Padding_VALID, + TfLiteRegistration* registration, const TensorData& input, + const TensorData& filter, const TensorData& output, int stride_width = 2, + int stride_height = 2, enum Padding padding = Padding_VALID, enum ActivationFunctionType activation = ActivationFunctionType_NONE) { input_ = AddInput(input); filter_ = AddInput(filter); @@ -62,6 +75,8 @@ class BaseConvolutionOpModel : public SingleOpModel { stride_height, activation) .Union()); + resolver_ = absl::make_unique(BuiltinOperator_CONV_2D, + registration); BuildInterpreter({GetShape(input_), GetShape(filter_), GetShape(bias_)}); } @@ -83,12 +98,26 @@ class ConvolutionOpModel : public BaseConvolutionOpModel { void SetInput(std::initializer_list data) { PopulateTensor(input_, data); } - std::vector GetOutput() { return ExtractVector(output_); } }; -TEST(ConvolutionOpTest, SimpleTestFloat32) { - ConvolutionOpModel m({TensorType_FLOAT32, {2, 2, 4, 1}}, +const auto kKernelMap = new std::map({ + {"Reference", ops::builtin::Register_CONVOLUTION_REF()}, + {"GenericOptimized", ops::builtin::Register_CONVOLUTION_GENERIC_OPT()}, + {"MultithreadedOptimized", + ops::builtin::Register_CONVOLUTION_MULTITHREADED_OPT()}, + {"CblasOptimized", ops::builtin::Register_CONVOLUTION_CBLAS_OPT()}, +}); + +class ConvolutionOpTest : public SingleOpTest { + protected: + const std::map& GetKernelMap() override { + return *kKernelMap; + } +}; + +TEST_P(ConvolutionOpTest, SimpleTestFloat32) { + ConvolutionOpModel m(GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 1}}, {TensorType_FLOAT32, {3, 2, 2, 1}}, {TensorType_FLOAT32, {}}); @@ -117,8 +146,8 @@ TEST(ConvolutionOpTest, SimpleTestFloat32) { })); } -TEST(ConvolutionOpTest, SimpleTestFloat32WithAnisotropicStrides) { - ConvolutionOpModel m({TensorType_FLOAT32, {1, 3, 6, 1}}, +TEST_P(ConvolutionOpTest, SimpleTestFloat32WithAnisotropicStrides) { + ConvolutionOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 3, 6, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {}}, /*stride_width=*/3, /*stride_height=*/1); @@ -139,7 +168,7 @@ TEST(ConvolutionOpTest, SimpleTestFloat32WithAnisotropicStrides) { })); } -TEST(ConvolutionOpTest, HandCalculatedFloat32) { +TEST_P(ConvolutionOpTest, HandCalculatedFloat32) { const int depth = 1; const int image_width = 4; const int image_height = 3; @@ -150,6 +179,7 @@ TEST(ConvolutionOpTest, HandCalculatedFloat32) { const int stride_height = 1; const Padding padding = Padding_SAME; ConvolutionOpModel m( + GetRegistration(), {TensorType_FLOAT32, {image_batch_count, image_height, image_width, depth}}, {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}}, @@ -192,7 +222,7 @@ TEST(ConvolutionOpTest, HandCalculatedFloat32) { 178, 187, 234, 261, 121})); } -TEST(ConvolutionOpTest, HandCalculatedWithBiasFloat32) { +TEST_P(ConvolutionOpTest, HandCalculatedWithBiasFloat32) { const int depth = 1; const int image_width = 4; const int image_height = 3; @@ -203,6 +233,7 @@ TEST(ConvolutionOpTest, HandCalculatedWithBiasFloat32) { const int stride_height = 1; const Padding padding = Padding_SAME; ConvolutionOpModel m( + GetRegistration(), {TensorType_FLOAT32, {image_batch_count, image_height, image_width, depth}}, {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}}, @@ -245,7 +276,7 @@ TEST(ConvolutionOpTest, HandCalculatedWithBiasFloat32) { 367, 188, 197, 244, 271, 131})); } 
-TEST(ConvolutionOpTest, HandCalculatedWithReluFloat32) { +TEST_P(ConvolutionOpTest, HandCalculatedWithReluFloat32) { const int depth = 1; const int image_width = 4; const int image_height = 3; @@ -256,6 +287,7 @@ TEST(ConvolutionOpTest, HandCalculatedWithReluFloat32) { const int stride_height = 1; const Padding padding = Padding_SAME; ConvolutionOpModel m( + GetRegistration(), {TensorType_FLOAT32, {image_batch_count, image_height, image_width, depth}}, {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}}, @@ -300,7 +332,7 @@ TEST(ConvolutionOpTest, HandCalculatedWithReluFloat32) { ElementsAreArray({0, 0, 0, 0, 35, 112, 157, 0, 0, 34, 61, 0})); } -TEST(ConvolutionOpTest, HandCalculatedValidFloat32) { +TEST_P(ConvolutionOpTest, HandCalculatedValidFloat32) { const int depth = 1; const int image_width = 4; const int image_height = 3; @@ -311,6 +343,7 @@ TEST(ConvolutionOpTest, HandCalculatedValidFloat32) { const int stride_height = 1; const Padding padding = Padding_VALID; ConvolutionOpModel m( + GetRegistration(), {TensorType_FLOAT32, {image_batch_count, image_height, image_width, depth}}, {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}}, @@ -366,8 +399,9 @@ class QuantizedConvolutionOpModel : public BaseConvolutionOpModel { // In this tests we set the input and output scales so that the results // match exactly the 'non-quantized' version. -TEST(ConvolutionOpTest, SimpleTestQuantized) { - QuantizedConvolutionOpModel m({TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64}, +TEST_P(ConvolutionOpTest, SimpleTestQuantized) { + QuantizedConvolutionOpModel m(GetRegistration(), + {TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64}, {TensorType_UINT8, {3, 2, 2, 1}, -63.5, 64}, {TensorType_UINT8, {}, -127, 128}); m.SetInput({ @@ -405,8 +439,9 @@ TEST(ConvolutionOpTest, SimpleTestQuantized) { })); } -TEST(ConvolutionOpTest, SimpleTestQuantizedWithAnisotropicStrides) { - QuantizedConvolutionOpModel m({TensorType_UINT8, {1, 3, 6, 1}, -63.5, 64}, +TEST_P(ConvolutionOpTest, SimpleTestQuantizedWithAnisotropicStrides) { + QuantizedConvolutionOpModel m(GetRegistration(), + {TensorType_UINT8, {1, 3, 6, 1}, -63.5, 64}, {TensorType_UINT8, {1, 2, 2, 1}, -63.5, 64}, {TensorType_UINT8, {}, -127, 128}, /*stride_width=*/3, /*stride_height=*/1); @@ -430,6 +465,11 @@ TEST(ConvolutionOpTest, SimpleTestQuantizedWithAnisotropicStrides) { 167, 93, // })); } + +INSTANTIATE_TEST_CASE_P( + ConvolutionOpTest, ConvolutionOpTest, + ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap))); + } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc index dcdc5fffad9..ef2b5422253 100644 --- a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc +++ b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc @@ -123,18 +123,16 @@ TEST(EmbeddingLookupOpTest, SimpleTestSqrtn) { [](int i, int j, int k) { return i + j / 10.0f + k / 100.0f; }); m.Invoke(); - EXPECT_THAT( - m.GetOutput(), - ElementsAreArray(ArrayFloatNear({ - 1.00, 1.01, 1.10, 1.11, 1.20, 1.21, // Row 1 - 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, // - - 6.00f / std::sqrt(20.0f), 6.06f / std::sqrt(20.0f), - 6.60f / std::sqrt(20.0f), 6.66f / std::sqrt(20.0f), - 7.20f / std::sqrt(20.0f), - 7.26f / - std::sqrt( - 20.0f), // 2 * Row 3 + 4 * Row 0, // 2 * Row 3 + 4 * Row 0 - }))); + EXPECT_THAT(m.GetOutput(), + ElementsAreArray(ArrayFloatNear({ + 1.00, 1.01, 1.10, 1.11, 1.20, 1.21, // Row 1 + 0.00, 0.00, 0.00, 0.00, 
0.00, 0.00, // - + 6.00f / std::sqrt(20.0f), 6.06f / std::sqrt(20.0f), + 6.60f / std::sqrt(20.0f), 6.66f / std::sqrt(20.0f), + 7.20f / std::sqrt(20.0f), + 7.26f / std::sqrt(20.0f), // 2 * Row 3 + 4 * Row 0, // 2 * + // Row 3 + 4 * Row 0 + }))); } TEST(EmbeddingLookupOpTest, Indices3DTest) { diff --git a/tensorflow/contrib/lite/kernels/gather_test.cc b/tensorflow/contrib/lite/kernels/gather_test.cc index 658d977b8dc..cdadbeda188 100644 --- a/tensorflow/contrib/lite/kernels/gather_test.cc +++ b/tensorflow/contrib/lite/kernels/gather_test.cc @@ -81,10 +81,8 @@ TEST(GatherOpTest, Test0DIndex) { m.SetInputFloat({-2.0, 0.2, 0.7, 0.8}); m.SetPositions({1}); m.Invoke(); - EXPECT_THAT(m.GetOutputFloat(), - ElementsAreArray(ArrayFloatNear({0.7, 0.8}))); - EXPECT_THAT(m.GetOutputShape(), - ElementsAreArray({2})); + EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray(ArrayFloatNear({0.7, 0.8}))); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); } TEST(GatherOpTest, Test0DIndexWith0DResult) { @@ -94,8 +92,7 @@ TEST(GatherOpTest, Test0DIndexWith0DResult) { m.SetInputFloat({1.0, 2.0, 3.0}); m.SetPositions({1}); m.Invoke(); - EXPECT_THAT(m.GetOutputFloat(), - ElementsAreArray(ArrayFloatNear({2.0}))); + EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray(ArrayFloatNear({2.0}))); EXPECT_TRUE(m.GetOutputShape().empty()); } diff --git a/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc b/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc index cb6038f9009..ba0ed5ce063 100644 --- a/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc +++ b/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc @@ -116,7 +116,10 @@ TEST(HashtableLookupOpTest, Test2DInput) { 1.0, 1.1, // 1-st item }))); EXPECT_THAT(m.GetHit(), ElementsAreArray({ - 1, 0, 1, 1, + 1, + 0, + 1, + 1, })); } diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD index 38b032c6de7..404c7d37183 100644 --- a/tensorflow/contrib/lite/kernels/internal/BUILD +++ b/tensorflow/contrib/lite/kernels/internal/BUILD @@ -124,6 +124,13 @@ config_setting( }, ) +config_setting( + name = "darwin_x86_64", + values = { + "cpu": "darwin_x86_64", + }, +) + config_setting( name = "freebsd", values = { @@ -154,6 +161,7 @@ cc_library( ":x86": tflite_deps_intel, ":x86_64": tflite_deps_intel, ":darwin": tflite_deps_intel, + ":darwin_x86_64": tflite_deps_intel, ":freebsd": tflite_deps_intel, "//conditions:default": [], }), @@ -162,6 +170,8 @@ cc_library( cc_library( name = "optimized", hdrs = [ + "optimized/cblas_conv.h", + "optimized/cblas_reference.h", "optimized/eigen_spatial_convolutions.h", "optimized/eigen_tensor_reduced_instantiations_oss.h", "optimized/multithreaded_conv.h", @@ -232,6 +242,7 @@ cc_library( ":x86": tflite_deps_intel, ":x86_64": tflite_deps_intel, ":darwin": tflite_deps_intel, + ":darwin_x86_64": tflite_deps_intel, ":freebsd": tflite_deps_intel, "//conditions:default": [], }), @@ -284,6 +295,16 @@ cc_library( ], ) +cc_library( + name = "kernel_utils", + srcs = ["kernel_utils.cc"], + hdrs = ["kernel_utils.h"], + deps = [ + ":tensor_utils", + "//tensorflow/contrib/lite:builtin_op_data", + ], +) + cc_library( name = "tensor_utils", srcs = [ diff --git a/tensorflow/contrib/lite/kernels/internal/compatibility.h b/tensorflow/contrib/lite/kernels/internal/compatibility.h index 1d963afb7e1..51426bb1c58 100644 --- a/tensorflow/contrib/lite/kernels/internal/compatibility.h +++ b/tensorflow/contrib/lite/kernels/internal/compatibility.h @@ -27,6 +27,10 @@ limitations under the License. 
#define TFLITE_DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : assert(false) #endif +#ifndef TFLITE_DCHECK_NE +#define TFLITE_DCHECK_NE(x, y) ((x) != (y)) ? (void)0 : assert(false) +#endif + #ifndef TFLITE_DCHECK_GE #define TFLITE_DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : assert(false) #endif @@ -52,6 +56,10 @@ limitations under the License. #define TFLITE_CHECK_EQ(x, y) ((x) == (y)) ? (void)0 : abort() #endif +#ifndef TFLITE_CHECK_NE +#define TFLITE_CHECK_NE(x, y) ((x) != (y)) ? (void)0 : abort() +#endif + #ifndef TFLITE_CHECK_GE #define TFLITE_CHECK_GE(x, y) ((x) >= (y)) ? (void)0 : abort() #endif diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc new file mode 100644 index 00000000000..510395126ce --- /dev/null +++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc @@ -0,0 +1,44 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h" + +namespace tflite { +namespace kernel_utils { + +void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr, + const float* recurrent_weights_ptr, const float* bias_ptr, + int input_size, int num_units, int batch_size, + TfLiteFusedActivation activation, + float* hidden_state_ptr_batch, float* output_ptr_batch) { + // Output = bias + tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size, + output_ptr_batch); + // Output += input * input_weights + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + input_weights_ptr, num_units, input_size, input_ptr_batch, batch_size, + output_ptr_batch, /*result_stride=*/1); + // Output += recurrent_weights * hidden_state + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + recurrent_weights_ptr, num_units, num_units, hidden_state_ptr_batch, + batch_size, output_ptr_batch, /*result_stride=*/1); + // Output = activation(Output) and update hidden_state + tensor_utils::ApplyActivationToVector( + output_ptr_batch, num_units * batch_size, activation, output_ptr_batch); + tensor_utils::VectorBatchVectorAssign(output_ptr_batch, num_units, batch_size, + hidden_state_ptr_batch); +} + +} // namespace kernel_utils +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.h b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h new file mode 100644 index 00000000000..9872d4500b8 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h @@ -0,0 +1,40 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_ +#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_ + +#include "tensorflow/contrib/lite/builtin_op_data.h" + +namespace tflite { +namespace kernel_utils { + +// Performs an RNN batch inference step for inputs specified by input_ptr_batch. +// The RNN cell is specified by the pointers to its input and recurrent weights, +// and biases, along with the input size, number of units, activation. +// +// The pointers to the hidden state and the output are updated as a result. +// +// The pointers with the suffix "_batch" point to data aligned in batch_major +// order, and each step processes batch_size many inputs from input_ptr_batch, +// and updates batch_size many outputs and hidden states. +void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr, + const float* recurrent_weights_ptr, const float* bias_ptr, + int input_size, int num_units, int batch_size, + TfLiteFusedActivation activation, + float* hidden_state_ptr_batch, float* output_ptr_batch); + +} // namespace kernel_utils +} // namespace tflite +#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_ diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h new file mode 100644 index 00000000000..fcb9fac6713 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h @@ -0,0 +1,89 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_CONV_H_ +#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_CONV_H_ + +// The Conv implementation based on the CBLAS interface. This is only used on +// iOS for now, utilizing Apple's Accelerate framework. + +// TODO(ycling): Update the BUILD file and integrate with Apple Accelerate +// Framework when it's available.
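+// Until then, the kernel falls back to the slow, test-only cblas_sgemm
+// reference implementation in cblas_reference.h.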
+#include "tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+
+namespace tflite {
+namespace cblas_ops {
+
+inline void Conv(const float* input_data, const Dims<4>& input_dims,
+                 const float* filter_data, const Dims<4>& filter_dims,
+                 const float* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, float output_activation_min,
+                 float output_activation_max, float* output_data,
+                 const Dims<4>& output_dims, float* im2col_data,
+                 const Dims<4>& im2col_dims) {
+  gemmlowp::ScopedProfilingLabel label("Conv/cblas");
+
+  const float* gemm_input_data = nullptr;
+  const Dims<4>* gemm_input_dims = nullptr;
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const bool need_im2col = stride_width != 1 || stride_height != 1 ||
+                           filter_width != 1 || filter_height != 1;
+  if (need_im2col) {
+    TFLITE_DCHECK(im2col_data);
+    optimized_ops::Im2col(input_data, input_dims, stride_width, stride_height,
+                          pad_width, pad_height, filter_height, filter_width, 0,
+                          im2col_data, im2col_dims);
+    gemm_input_data = im2col_data;
+    gemm_input_dims = &im2col_dims;
+  } else {
+    TFLITE_DCHECK(!im2col_data);
+    gemm_input_data = input_data;
+    gemm_input_dims = &input_dims;
+  }
+
+  // The following code computes the matrix multiplication c = a * transpose(b)
+  // with CBLAS, where:
+  // * `a` is a matrix with dimensions (m, k).
+  // * `b` is a matrix with dimensions (n, k), so transpose(b) is (k, n).
+  // * `c` is a matrix with dimensions (m, n).
+  // The naming of variables is aligned with the CBLAS specification here.
+  const float* a = gemm_input_data;
+  const float* b = filter_data;
+  float* c = output_data;
+  int m = gemm_input_dims->sizes[1] * gemm_input_dims->sizes[2] *
+          gemm_input_dims->sizes[3];
+  int n = output_dims.sizes[0];
+  int k = gemm_input_dims->sizes[0];
+  // The strides of matrices a, b, and c, respectively.
+  int stride_a = k;
+  int stride_b = k;
+  int stride_c = n;
+
+  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, 1.0f, a,
+              stride_a, b, stride_b, 0.0f, c, stride_c);
+
+  optimized_ops::AddBiasAndEvalActivationFunction(
+      bias_data, bias_dims, output_data, output_dims, output_activation_min,
+      output_activation_max);
+}
+
+}  // namespace cblas_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_CONV_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h
new file mode 100644
index 00000000000..65789157437
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_REFERENCE_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_REFERENCE_H_
+
+#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
+
+// A reference implementation for a small subset of the CBLAS interface.
+// This is only used for testing the CBLAS implementation, and should never be
+// used in production code.
+
+namespace tflite {
+namespace cblas_ops {
+
+// The following code follows the original CBLAS specification, and it might
+// conflict with the TensorFlow naming convention.
+// TODO(ycling): Find another way to test CBLAS with bazel, without writing
+// a reference implementation by ourselves.
+enum CBLAS_ORDER { CblasRowMajor = 0, CblasColMajor = 1 };
+
+enum CBLAS_TRANSPOSE { CblasNoTrans = 0, CblasTrans = 1, CblasConjTrans = 2 };
+
+// A reference implementation for matrix multiplication.
+// The following code computes the matrix multiplication c = a * transpose(b)
+// with CBLAS, where:
+// * `a` is a matrix with dimensions (m, k).
+// * `b` is a matrix with dimensions (n, k), so transpose(b) is (k, n).
+// * `c` is a matrix with dimensions (m, n).
+// The naming of variables is aligned with the CBLAS specification here.
+void cblas_sgemm(const enum CBLAS_ORDER order,
+                 const enum CBLAS_TRANSPOSE trans_a,
+                 const enum CBLAS_TRANSPOSE trans_b, const int m, const int n,
+                 const int k, const float alpha, const float *a,
+                 const int stride_a, const float *b, const int stride_b,
+                 const float beta, float *c, const int stride_c) {
+  TFLITE_DCHECK(order == CblasRowMajor);
+  TFLITE_DCHECK(trans_a == CblasNoTrans);
+  TFLITE_DCHECK(trans_b == CblasTrans);
+  for (int row = 0; row < m; ++row) {
+    for (int col = 0; col < n; ++col) {
+      float value = beta * c[stride_c * row + col];
+      for (int idx = 0; idx < k; ++idx) {
+        value += alpha * a[stride_a * row + idx] * b[stride_b * col + idx];
+      }
+      c[stride_c * row + col] = value;
+    }
+  }
+}
+
+}  // namespace cblas_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_REFERENCE_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h b/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
index 629783d7e58..e0eca2e736b 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
@@ -36,15 +36,11 @@ inline bool TestCPUFeatureNeon() {
 #elif defined USE_NEON || defined __ARM_NEON
-inline bool TestCPUFeatureNeon() {
-  return true;
-}
+inline bool TestCPUFeatureNeon() { return true; }
 #else
-inline bool TestCPUFeatureNeon() {
-  return false;
-}
+inline bool TestCPUFeatureNeon() { return false; }
 #endif
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
index 81796e295d9..e2c87df80bd 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
@@ -992,11 +992,11 @@ inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
       for (int k = 0; k < 4; k++) {
         acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
       }
-      for (int k = 0; k < 4; k++) {
-        acc[k] = vmaxq_f32(
-            vdupq_n_f32(output_activation_min),
-            vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
-      }
+      for (int k = 0; k < 4; k++) {
+        acc[k] =
vmaxq_f32( + vdupq_n_f32(output_activation_min), + vminq_f32(vdupq_n_f32(output_activation_max), acc[k])); + } for (int k = 0; k < 4; k++) { vst1q_f32(output_ptr + 4 * k, acc[k]); } diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h index f21fbf532ac..ce3cde76999 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h @@ -39,7 +39,6 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #endif - namespace Eigen { /** SpatialConvolution @@ -215,13 +214,12 @@ EIGEN_DEVICE_FUNC } // TODO(yangke): choose() is defined in TensorContraction.h -- consider // moving it to somewhere more "common". - return - input - .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, - row_in_stride, col_in_stride, padding_type) - .reshape(pre_contract_dims) - .contract(kernel.reshape(kernel_dims), contract_dims) - .reshape(post_contract_dims); + return input + .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, + row_in_stride, col_in_stride, padding_type) + .reshape(pre_contract_dims) + .contract(kernel.reshape(kernel_dims), contract_dims) + .reshape(post_contract_dims); } } // end namespace Eigen diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h index 31bade26f98..4bcf4993e92 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h @@ -2370,13 +2370,15 @@ inline int StartIndex(int start, int stride, int dim, bool masked) { return masked ? (stride > 0 ? 0 : dim - 1) : start; } -inline int StopIndex(int stop, int stride, int dim, bool masked) { - return masked ? (stride > 0 ? dim : -1) : stop; +inline int StopIndex(int start, int stop, int stride, int dim, bool masked, + bool shrink_axis_masked) { + return shrink_axis_masked ? stride > 0 ? start + 1 : start - 1 + : masked ? (stride > 0 ? 
dim : -1) : stop; } template inline void StridedSlice(const T* input_data, const Dims<4>& input_dims, - int begin_mask, int end_mask, + int begin_mask, int end_mask, int shrink_axis_mask, const std::vector& starts, const std::vector& stops, const std::vector& strides, T* output_data, @@ -2387,19 +2389,23 @@ inline void StridedSlice(const T* input_data, const Dims<4>& input_dims, const int start_b = StartIndex(starts[3], strides[3], input_dims.sizes[3], begin_mask & 8); const int stop_b = - StopIndex(stops[3], strides[3], input_dims.sizes[3], end_mask & 8); + StopIndex(start_b, stops[3], strides[3], input_dims.sizes[3], + end_mask & 8, shrink_axis_mask & 8); const int start_h = StartIndex(starts[2], strides[2], input_dims.sizes[2], begin_mask & 4); const int stop_h = - StopIndex(stops[2], strides[2], input_dims.sizes[2], end_mask & 4); + StopIndex(start_h, stops[2], strides[2], input_dims.sizes[2], + end_mask & 4, shrink_axis_mask & 4); const int start_w = StartIndex(starts[1], strides[1], input_dims.sizes[1], begin_mask & 2); const int stop_w = - StopIndex(stops[1], strides[1], input_dims.sizes[1], end_mask & 2); + StopIndex(start_w, stops[1], strides[1], input_dims.sizes[1], + end_mask & 2, shrink_axis_mask & 2); const int start_d = StartIndex(starts[0], strides[0], input_dims.sizes[0], begin_mask & 1); const int stop_d = - StopIndex(stops[0], strides[0], input_dims.sizes[0], end_mask & 1); + StopIndex(start_d, stops[0], strides[0], input_dims.sizes[0], + end_mask & 1, shrink_axis_mask & 1); T* out_ptr = output_data; for (int in_b = start_b; LoopCondition(in_b, stop_b, strides[3]); @@ -2417,6 +2423,18 @@ inline void StridedSlice(const T* input_data, const Dims<4>& input_dims, } } +template +inline void StridedSlice(const T* input_data, const Dims<4>& input_dims, + int begin_mask, int end_mask, + const std::vector& starts, + const std::vector& stops, + const std::vector& strides, T* output_data, + const Dims<4>& output_dims) { + StridedSlice(input_data, input_dims, begin_mask, end_mask, + /*shrink_axis_mask=*/0, starts, stops, strides, output_data, + output_dims); +} + template inline void Slice(const T* input_data, const Dims<4>& input_dims, const std::vector& begin, const std::vector& size, diff --git a/tensorflow/contrib/lite/kernels/kernel_util.cc b/tensorflow/contrib/lite/kernels/kernel_util.cc index b0546c00cf9..955e8c5764c 100644 --- a/tensorflow/contrib/lite/kernels/kernel_util.cc +++ b/tensorflow/contrib/lite/kernels/kernel_util.cc @@ -13,8 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include "tensorflow/contrib/lite/kernels/kernel_util.h" + #include #include +#include + #include "tensorflow/contrib/lite/kernels/internal/round.h" namespace tflite { @@ -84,4 +87,27 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation, } } +bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2) { + return TfLiteIntArrayEqual(input1->dims, input2->dims); +} + +TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context, + TfLiteTensor* input1, + TfLiteTensor* input2, + TfLiteIntArray** output_shape) { + int64_t dims1 = NumDimensions(input1); + int64_t dims2 = NumDimensions(input2); + int64_t out_dims = std::max(dims1, dims2); + std::unique_ptr shape( + TfLiteIntArrayCreate(out_dims), TfLiteIntArrayFree); + for (int i = 0; i < out_dims; ++i) { + int64_t d1 = i >= dims1 ? 
1 : SizeOfDimension(input1, dims1 - i - 1); + int64_t d2 = i >= dims2 ? 1 : SizeOfDimension(input2, dims2 - i - 1); + TF_LITE_ENSURE(context, d1 == d2 || d1 == 1 || d2 == 1); + shape->data[out_dims - i - 1] = std::max(d1, d2); + } + *output_shape = shape.release(); + return kTfLiteOk; +} + } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/kernel_util.h b/tensorflow/contrib/lite/kernels/kernel_util.h index bfdfba00f51..28f53b9fbbc 100644 --- a/tensorflow/contrib/lite/kernels/kernel_util.h +++ b/tensorflow/contrib/lite/kernels/kernel_util.h @@ -35,6 +35,14 @@ inline TfLiteTensor* GetOutput(TfLiteContext* context, TfLiteNode* node, inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; } inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; } +inline int64_t NumElements(const TfLiteTensor* t) { + int64_t count = 1; + for (int i = 0; i < NumDimensions(t); ++i) { + count *= SizeOfDimension(t, i); + } + return count; +} + inline TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context, const TfLiteNode* node, int index) { const bool use_tensor = node->inputs->data[index] != kOptionalTensor; @@ -57,7 +65,10 @@ inline bool IsDynamicTensor(TfLiteTensor* tensor) { // Sets tensor to dynamic. inline void SetTensorToDynamic(TfLiteTensor* tensor) { - tensor->allocation_type = kTfLiteDynamic; + if (tensor->allocation_type != kTfLiteDynamic) { + tensor->allocation_type = kTfLiteDynamic; + tensor->data.raw = nullptr; + } } // Calculates the multiplication factor for a quantized convolution (or @@ -76,6 +87,15 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation, float* activation_min, float* activation_max); +// Return true if the given tensors have the same shape. +bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2); + +// Calculate the output_shape that is necessary for element-wise operations +// with broadcasting involving the two input tensors. +TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context, + TfLiteTensor* input1, + TfLiteTensor* input2, + TfLiteIntArray** output_shape); } // namespace tflite #endif // TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_ diff --git a/tensorflow/contrib/lite/kernels/kernel_util_test.cc b/tensorflow/contrib/lite/kernels/kernel_util_test.cc new file mode 100644 index 00000000000..c65b68970f6 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/kernel_util_test.cc @@ -0,0 +1,152 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/contrib/lite/kernels/kernel_util.h" + +#include +#include +#include "tensorflow/contrib/lite/testing/util.h" + +namespace tflite { +namespace { + +void ReportError(TfLiteContext* context, const char* format, ...) 
{} + +class KernelUtilTest : public ::testing::Test { + public: + KernelUtilTest() { + context_.ReportError = ReportError; + + tensor1_.dims = nullptr; + tensor2_.dims = nullptr; + tensor1_.allocation_type = kTfLiteMmapRo; + tensor2_.allocation_type = kTfLiteMmapRo; + } + ~KernelUtilTest() { + TfLiteTensorFree(&tensor1_); + TfLiteTensorFree(&tensor2_); + } + + void SetShape(TfLiteTensor* tensor, std::initializer_list dims) { + TfLiteTensorFree(tensor); + tensor->dims = TfLiteIntArrayCreate(dims.size()); + int i = 0; + for (int d : dims) { + tensor->dims->data[i] = d; + ++i; + } + } + + std::vector GetShape(TfLiteIntArray* dims) { + std::vector result; + for (int i = 0; i < dims->size; ++i) { + result.push_back(dims->data[i]); + } + return result; + } + + protected: + TfLiteContext context_; + TfLiteTensor tensor1_; + TfLiteTensor tensor2_; +}; + +TEST_F(KernelUtilTest, SameShapeEmpty) { + EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_)); + + SetShape(&tensor1_, {1, 2, 3}); + EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_)); + + SetShape(&tensor2_, {1, 2}); + EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_)); + + SetShape(&tensor2_, {1, 2, 3, 4}); + EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_)); + + SetShape(&tensor2_, {1, 2, 3}); + EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_)); + + SetShape(&tensor2_, {}); + EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_)); + + SetShape(&tensor1_, {}); + EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_)); +} + +TEST_F(KernelUtilTest, BroadcastShapeIncompatibleDim) { + TfLiteIntArray* output = nullptr; + SetShape(&tensor1_, {1, 2}); + SetShape(&tensor2_, {1, 3}); + EXPECT_NE(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_, + &tensor2_, &output)); + EXPECT_EQ(output, nullptr); +} + +TEST_F(KernelUtilTest, BroadcastShapeOnes) { + TfLiteIntArray* output = nullptr; + SetShape(&tensor1_, {1, 1}); + SetShape(&tensor2_, {1, 3}); + EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_, + &tensor2_, &output)); + TfLiteIntArrayFree(output); + + SetShape(&tensor1_, {1, 2}); + SetShape(&tensor2_, {1, 1}); + EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_, + &tensor2_, &output)); + TfLiteIntArrayFree(output); +} + +TEST_F(KernelUtilTest, BroadcastShapeScalars) { + TfLiteIntArray* output = nullptr; + SetShape(&tensor1_, {1, 2}); + SetShape(&tensor2_, {}); + EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_, + &tensor2_, &output)); + EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2)); + TfLiteIntArrayFree(output); + + SetShape(&tensor1_, {}); + SetShape(&tensor2_, {2}); + EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_, + &tensor2_, &output)); + EXPECT_THAT(GetShape(output), ::testing::ElementsAre(2)); + TfLiteIntArrayFree(output); +} + +TEST_F(KernelUtilTest, BroadcastShapeDifferentSizes) { + TfLiteIntArray* output = nullptr; + SetShape(&tensor1_, {1, 2}); + SetShape(&tensor2_, {3, 1, 1}); + EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_, + &tensor2_, &output)); + EXPECT_THAT(GetShape(output), ::testing::ElementsAre(3, 1, 2)); + TfLiteIntArrayFree(output); + + SetShape(&tensor1_, {1, 2, 3, 4}); + SetShape(&tensor2_, {1, 3, 1}); + EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_, + &tensor2_, &output)); + EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2, 3, 4)); + TfLiteIntArrayFree(output); +} + +} // namespace +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + 
::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/kernels/mean.cc b/tensorflow/contrib/lite/kernels/mean.cc index 540e5a364dd..ec1c4020276 100644 --- a/tensorflow/contrib/lite/kernels/mean.cc +++ b/tensorflow/contrib/lite/kernels/mean.cc @@ -35,10 +35,12 @@ struct MeanContext { MeanContext(TfLiteContext* context, TfLiteNode* node) { params = reinterpret_cast(node->builtin_data); input = GetInput(context, node, 0); + axis = GetInput(context, node, 1); output = GetOutput(context, node, 0); } TfLiteMeanParams* params; TfLiteTensor* input; + TfLiteTensor* axis; TfLiteTensor* output; }; @@ -54,45 +56,26 @@ void Free(TfLiteContext* context, void* buffer) { delete reinterpret_cast(buffer); } -TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2); - TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - - MeanContext op_context(context, node); - int input_num_dims = NumDimensions(op_context.input); - int axis_num_dims = op_context.params->num_axis_dimensions; - - // Creates a temp index to iterate through input data. - int* scratch_tensor_index = reinterpret_cast(node->user_data); - TfLiteIntArrayFree(node->temporaries); - node->temporaries = TfLiteIntArrayCreate(2); - node->temporaries->data[0] = *scratch_tensor_index; - TfLiteTensor* scratch_tensor = &context->tensors[node->temporaries->data[0]]; - scratch_tensor->type = kTfLiteInt32; - scratch_tensor->allocation_type = kTfLiteArenaRw; - TfLiteIntArray* index_size = TfLiteIntArrayCreate(1); - index_size->data[0] = input_num_dims; - TF_LITE_ENSURE_OK(context, - context->ResizeTensor(context, scratch_tensor, index_size)); - - // Creates a temp tensor to store resolved axis given input data. - node->temporaries->data[1] = *scratch_tensor_index + 1; - TfLiteTensor* axis_tensor = &context->tensors[node->temporaries->data[1]]; - axis_tensor->type = kTfLiteInt32; - axis_tensor->allocation_type = kTfLiteArenaRw; +// Resizes the temp tensor that stores resolved axis. +TfLiteStatus ResizeTempAxis(TfLiteContext* context, MeanContext* op_context, + TfLiteTensor* resolved_axis) { TfLiteIntArray* axis_size = TfLiteIntArrayCreate(1); - axis_size->data[0] = op_context.params->num_axis_dimensions; - TF_LITE_ENSURE_OK(context, - context->ResizeTensor(context, axis_tensor, axis_size)); + axis_size->data[0] = static_cast(NumElements(op_context->axis)); + return context->ResizeTensor(context, resolved_axis, axis_size); +} - // Determines size of output tensor. - const TfLiteIntArray* input_dims = op_context.input->dims; - const int* axis = op_context.params->axis; - if (op_context.params->keep_dims) { +// Resizes output array based on the input size and resolved axis. 
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context, + MeanContext* op_context) { + size_t num_axis = NumElements(op_context->axis); + const TfLiteIntArray* input_dims = op_context->input->dims; + int input_num_dims = NumDimensions(op_context->input); + const int* axis = GetTensorData(op_context->axis); + if (op_context->params->keep_dims) { TfLiteIntArray* output_dims = TfLiteIntArrayCreate(input_num_dims); for (int idx = 0; idx < input_num_dims; ++idx) { bool is_axis = false; - for (int axis_idx = 0; axis_idx < axis_num_dims; ++axis_idx) { + for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) { if (axis[axis_idx] == idx || axis[axis_idx] + input_num_dims == idx) { is_axis = true; break; @@ -104,11 +87,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { output_dims->data[idx] = input_dims->data[idx]; } } - return context->ResizeTensor(context, op_context.output, output_dims); + return context->ResizeTensor(context, op_context->output, output_dims); } else { // Calculates size of reducing axis. - int num_reduce_axis = axis_num_dims; - for (int i = 0; i < axis_num_dims; ++i) { + int num_reduce_axis = num_axis; + for (int i = 0; i < num_axis; ++i) { int current = axis[i]; if (current < 0) { current += input_num_dims; @@ -131,7 +114,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { int num_skip_axis = 0; for (int idx = 0; idx < input_num_dims; ++idx) { bool is_axis = false; - for (int axis_idx = 0; axis_idx < axis_num_dims; ++axis_idx) { + for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) { if (axis[axis_idx] == idx || axis[axis_idx] + input_num_dims == idx) { ++num_skip_axis; is_axis = true; @@ -142,24 +125,76 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { output_dims->data[idx - num_skip_axis] = input_dims->data[idx]; } } - return context->ResizeTensor(context, op_context.output, output_dims); + return context->ResizeTensor(context, op_context->output, output_dims); } } +// Initializes temp tensors to store index and resolved axis. +TfLiteStatus InitializeTemporaries(TfLiteContext* context, TfLiteNode* node, + MeanContext* op_context) { + // Creates a temp index to iterate through input data. + int* scratch_tensor_index = reinterpret_cast(node->user_data); + TfLiteIntArrayFree(node->temporaries); + node->temporaries = TfLiteIntArrayCreate(2); + node->temporaries->data[0] = *scratch_tensor_index; + TfLiteTensor* scratch_tensor = &context->tensors[node->temporaries->data[0]]; + scratch_tensor->type = kTfLiteInt32; + scratch_tensor->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* index_size = TfLiteIntArrayCreate(1); + index_size->data[0] = NumDimensions(op_context->input); + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, scratch_tensor, index_size)); + + // Creates a temp tensor to store resolved axis given input data. + node->temporaries->data[1] = *scratch_tensor_index + 1; + TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]]; + resolved_axis->type = kTfLiteInt32; + return kTfLiteOk; +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + MeanContext op_context(context, node); + TF_LITE_ENSURE_OK(context, InitializeTemporaries(context, node, &op_context)); + + TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]]; + // Leaves work to Eval if axis is not constant; else resizes output. 
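// A concrete illustration of the resize, assuming the shapes used by the
// const-axis tests in mean_test.cc below: a {4, 3, 2} input reduced over
// resolved axes {0, 1} yields an output of shape {2} when keep_dims is false,
// and {1, 1, 2} when keep_dims is true.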
+ if (!IsConstantTensor(op_context.axis)) { + SetTensorToDynamic(op_context.output); + SetTensorToDynamic(resolved_axis); + return kTfLiteOk; + } + resolved_axis->allocation_type = kTfLiteArenaRw; + TF_LITE_ENSURE_OK(context, + ResizeTempAxis(context, &op_context, resolved_axis)); + return ResizeOutputTensor(context, &op_context); +} + template TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { MeanContext op_context(context, node); + int num_axis = static_cast(NumElements(op_context.axis)); TfLiteTensor* temp_index = &context->tensors[node->temporaries->data[0]]; TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]]; + // Resize the output tensor if the output tensor is dynamic. + if (IsDynamicTensor(op_context.output)) { + TF_LITE_ENSURE_OK(context, + ResizeTempAxis(context, &op_context, resolved_axis)); + TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context)); + TfLiteTensorRealloc(resolved_axis->bytes, resolved_axis); + TfLiteTensorRealloc(op_context.output->bytes, op_context.output); + } -#define TF_LITE_MEAN(kernel_type, data_type) \ - kernel_type::Mean<>( \ - GetTensorData(op_context.input), \ - op_context.input->dims->data, op_context.input->dims->size, \ - GetTensorData(op_context.output), \ - op_context.output->dims->data, op_context.output->dims->size, \ - op_context.params->axis, op_context.params->num_axis_dimensions, \ - op_context.params->keep_dims, GetTensorData(temp_index), \ +#define TF_LITE_MEAN(kernel_type, data_type) \ + kernel_type::Mean<>( \ + GetTensorData(op_context.input), \ + op_context.input->dims->data, op_context.input->dims->size, \ + GetTensorData(op_context.output), \ + op_context.output->dims->data, op_context.output->dims->size, \ + GetTensorData(op_context.axis), num_axis, \ + op_context.params->keep_dims, GetTensorData(temp_index), \ GetTensorData(resolved_axis)) if (kernel_type == kReference) { diff --git a/tensorflow/contrib/lite/kernels/mean_test.cc b/tensorflow/contrib/lite/kernels/mean_test.cc index 4305c0632f5..c4c53c2ded3 100644 --- a/tensorflow/contrib/lite/kernels/mean_test.cc +++ b/tensorflow/contrib/lite/kernels/mean_test.cc @@ -25,58 +25,108 @@ using ::testing::ElementsAreArray; class BaseMeanOpModel : public SingleOpModel { public: - BaseMeanOpModel(const TensorData& input, const TensorData& output, - std::initializer_list axis, bool keep_dims) { - input_ = AddInput(input); - output_ = AddOutput(output); - SetBuiltinOp( - BuiltinOperator_MEAN, BuiltinOptions_MeanOptions, - CreateMeanOptions(builder_, builder_.CreateVector(axis), keep_dims) - .Union()); - BuildInterpreter({GetShape(input_)}); - } + void SetAxis(std::initializer_list data) { PopulateTensor(axis_, data); } - int input() { return input_; } - - protected: - int input_; - int output_; -}; - -class FloatMeanOpModel : public BaseMeanOpModel { - public: - using BaseMeanOpModel::BaseMeanOpModel; - - void SetInput(std::initializer_list data) { + template + void SetInput(std::initializer_list data) { PopulateTensor(input_, data); } - std::vector GetOutput() { return ExtractVector(output_); } + template + std::vector GetOutput() { + return ExtractVector(output_); + } + std::vector GetOutputShape() { return GetTensorShape(output_); } + + protected: + int input_; + int axis_; + int output_; }; -TEST(FloatMeanOpTest, NotKeepDims) { +// Model for the tests case where axis is a const tensor. 
+class MeanOpConstModel : public BaseMeanOpModel { + public: + MeanOpConstModel(const TensorData& input, const TensorData& output, + std::initializer_list axis_shape, + std::initializer_list axis, bool keep_dims) { + input_ = AddInput(input); + axis_ = AddConstInput(TensorType_INT32, axis, axis_shape); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_MEAN, BuiltinOptions_MeanOptions, + CreateMeanOptions(builder_, keep_dims).Union()); + BuildInterpreter({GetShape(input_)}); + } +}; + +// Model for the tests case where axis is a dynamic tensor. +class MeanOpDynamicModel : public BaseMeanOpModel { + public: + MeanOpDynamicModel(const TensorData& input, const TensorData& output, + const TensorData& axis, bool keep_dims) { + input_ = AddInput(input); + axis_ = AddInput(axis); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_MEAN, BuiltinOptions_MeanOptions, + CreateMeanOptions(builder_, keep_dims).Union()); + BuildInterpreter({GetShape(input_)}); + } +}; + +TEST(ConstMeanOpTest, NotKeepDims) { std::initializer_list data = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; - FloatMeanOpModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {2}}, - {1, 0, -3, -3}, false); + MeanOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {2}}, + {4}, {1, 0, -3, -3}, false); m.SetInput(data); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); - EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({12, 13}))); + EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({12, 13}))); } -TEST(FloatMeanOpTest, KeepDims) { +TEST(ConstMeanOpTest, KeepDims) { std::initializer_list data = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; - FloatMeanOpModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {3}}, - {0, 2}, true); + MeanOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {3}}, + {2}, {0, 2}, true); m.SetInput(data); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1})); - EXPECT_THAT(m.GetOutput(), + EXPECT_THAT(m.GetOutput(), + ElementsAreArray(ArrayFloatNear({10.5, 12.5, 14.5}))); +} + +TEST(DynamicMeanOpTest, NotKeepDims) { + std::initializer_list data = { + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, + 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + MeanOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}}, + {TensorType_FLOAT32, {2}}, {TensorType_INT32, {4}}, + false); + std::initializer_list axis = {1, 0, -3, -3}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({12, 13}))); +} + +TEST(DynamicMeanOpTest, KeepDims) { + std::initializer_list data = { + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, + 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + MeanOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}}, + {TensorType_FLOAT32, {3}}, {TensorType_INT32, {2}}, + true); + std::initializer_list axis = {0, 2}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({10.5, 12.5, 14.5}))); } diff --git a/tensorflow/contrib/lite/kernels/mul.cc b/tensorflow/contrib/lite/kernels/mul.cc index 
81c73f25231..54575019de4 100644 --- a/tensorflow/contrib/lite/kernels/mul.cc +++ b/tensorflow/contrib/lite/kernels/mul.cc @@ -37,7 +37,23 @@ constexpr int kInputTensor1 = 0; constexpr int kInputTensor2 = 1; constexpr int kOutputTensor = 0; +struct OpData { + bool requires_broadcast; +}; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + auto* data = new OpData; + data->requires_broadcast = false; + return data; +} + +void Free(TfLiteContext* context, void* buffer) { + delete reinterpret_cast(buffer); +} + TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + OpData* data = reinterpret_cast(node->user_data); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); @@ -45,43 +61,56 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE_EQ(context, NumDimensions(input1), NumDimensions(input2)); - for (int i = 0; i < NumDimensions(input1); ++i) { - TF_LITE_ENSURE_EQ(context, SizeOfDimension(input1, i), - SizeOfDimension(input2, i)); + TF_LITE_ENSURE_EQ(context, input1->type, input2->type); + output->type = input2->type; + + data->requires_broadcast = !HaveSameShapes(input1, input2); + + TfLiteIntArray* output_size = nullptr; + if (data->requires_broadcast) { + TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast( + context, input1, input2, &output_size)); + } else { + output_size = TfLiteIntArrayCopy(input1->dims); } - TF_LITE_ENSURE_EQ(context, input1->type, output->type); - TF_LITE_ENSURE_EQ(context, input2->type, output->type); - - TfLiteIntArray* output_size = TfLiteIntArrayCopy(input1->dims); return context->ResizeTensor(context, output, output_size); } template void EvalFloat(TfLiteContext* context, TfLiteNode* node, - TfLiteMulParams* params, TfLiteTensor* input1, - TfLiteTensor* input2, TfLiteTensor* output) { + TfLiteMulParams* params, const OpData* data, + TfLiteTensor* input1, TfLiteTensor* input2, + TfLiteTensor* output) { float output_activation_min, output_activation_max; CalculateActivationRangeFloat(params->activation, &output_activation_min, &output_activation_max); -#define TF_LITE_MUL(type) \ - type::Mul(GetTensorData(input1), GetTensorDims(input1), \ - GetTensorData(input2), GetTensorDims(input2), \ - output_activation_min, output_activation_max, \ - GetTensorData(output), GetTensorDims(output)) +#define TF_LITE_MUL(type, opname) \ + type::opname(GetTensorData(input1), GetTensorDims(input1), \ + GetTensorData(input2), GetTensorDims(input2), \ + output_activation_min, output_activation_max, \ + GetTensorData(output), GetTensorDims(output)) if (kernel_type == kReference) { - TF_LITE_MUL(reference_ops); + if (data->requires_broadcast) { + TF_LITE_MUL(reference_ops, BroadcastMul); + } else { + TF_LITE_MUL(reference_ops, Mul); + } } else { - TF_LITE_MUL(optimized_ops); + if (data->requires_broadcast) { + TF_LITE_MUL(optimized_ops, BroadcastMul); + } else { + TF_LITE_MUL(optimized_ops, Mul); + } } #undef TF_LITE_MUL } template void EvalQuantized(TfLiteContext* context, TfLiteNode* node, - TfLiteMulParams* params, TfLiteTensor* input1, - TfLiteTensor* input2, TfLiteTensor* output) { + TfLiteMulParams* params, const OpData* data, + TfLiteTensor* input1, TfLiteTensor* input2, + TfLiteTensor* output) { auto input1_offset = -input1->params.zero_point; auto input2_offset = -input2->params.zero_point; auto output_offset = output->params.zero_point; @@ 
-98,17 +127,19 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node, CalculateActivationRangeUint8(params->activation, output, &output_activation_min, &output_activation_max); -#define TF_LITE_MUL(type) \ - type::BroadcastMul(GetTensorData(input1), GetTensorDims(input1), \ - input1_offset, GetTensorData(input2), \ - GetTensorDims(input2), input2_offset, output_offset, \ - output_multiplier, output_shift, output_activation_min, \ - output_activation_max, GetTensorData(output), \ - GetTensorDims(output)); +#define TF_LITE_MUL(type, opname) \ + type::opname(GetTensorData(input1), GetTensorDims(input1), \ + input1_offset, GetTensorData(input2), \ + GetTensorDims(input2), input2_offset, output_offset, \ + output_multiplier, output_shift, output_activation_min, \ + output_activation_max, GetTensorData(output), \ + GetTensorDims(output)); + // The quantized version of Mul doesn't support activations, so we + // always use BroadcastMul. if (kernel_type == kReference) { - TF_LITE_MUL(reference_ops); + TF_LITE_MUL(reference_ops, BroadcastMul); } else { - TF_LITE_MUL(optimized_ops); + TF_LITE_MUL(optimized_ops, BroadcastMul); } #undef TF_LITE_MUL } @@ -116,15 +147,17 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node, template TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { auto* params = reinterpret_cast(node->builtin_data); + OpData* data = reinterpret_cast(node->user_data); TfLiteTensor* input1 = GetInput(context, node, kInputTensor1); TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); if (output->type == kTfLiteFloat32) { - EvalFloat(context, node, params, input1, input2, output); + EvalFloat(context, node, params, data, input1, input2, output); } else if (output->type == kTfLiteUInt8) { - EvalQuantized(context, node, params, input1, input2, output); + EvalQuantized(context, node, params, data, input1, input2, + output); } else { context->ReportError(context, "Mul only supports FLOAT32 and quantized UINT8 now."); @@ -137,19 +170,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace mul TfLiteRegistration* Register_MUL_REF() { - static TfLiteRegistration r = {nullptr, nullptr, mul::Prepare, + static TfLiteRegistration r = {mul::Init, mul::Free, mul::Prepare, mul::Eval}; return &r; } TfLiteRegistration* Register_MUL_GENERIC_OPT() { - static TfLiteRegistration r = {nullptr, nullptr, mul::Prepare, + static TfLiteRegistration r = {mul::Init, mul::Free, mul::Prepare, mul::Eval}; return &r; } TfLiteRegistration* Register_MUL_NEON_OPT() { - static TfLiteRegistration r = {nullptr, nullptr, mul::Prepare, + static TfLiteRegistration r = {mul::Init, mul::Free, mul::Prepare, mul::Eval}; return &r; } diff --git a/tensorflow/contrib/lite/kernels/mul_test.cc b/tensorflow/contrib/lite/kernels/mul_test.cc index 8838b300c0a..f1a30f82634 100644 --- a/tensorflow/contrib/lite/kernels/mul_test.cc +++ b/tensorflow/contrib/lite/kernels/mul_test.cc @@ -25,10 +25,11 @@ using ::testing::ElementsAreArray; class BaseMulOpModel : public SingleOpModel { public: - BaseMulOpModel(TensorData input, TensorData output, + BaseMulOpModel(const TensorData& input1, const TensorData& input2, + const TensorData& output, ActivationFunctionType activation_type) { - input1_ = AddInput(input); - input2_ = AddInput(input); + input1_ = AddInput(input1); + input2_ = AddInput(input2); output_ = AddOutput(output); SetBuiltinOp(BuiltinOperator_MUL, BuiltinOptions_MulOptions, CreateMulOptions(builder_, 
activation_type).Union()); @@ -70,6 +71,7 @@ class QuantizedMulOpModel : public BaseMulOpModel { TEST(FloatMulOpTest, NoActivation) { FloatMulOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, + {TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE); m.PopulateTensor(m.input1(), {-2.0, 0.2, 0.7, 0.8}); m.PopulateTensor(m.input2(), {0.1, 0.2, 0.3, 0.5}); @@ -79,9 +81,9 @@ TEST(FloatMulOpTest, NoActivation) { } TEST(FloatMulOpTest, ActivationRELU_N1_TO_1) { - FloatMulOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, - {TensorType_FLOAT32, {}}, - ActivationFunctionType_RELU_N1_TO_1); + FloatMulOpModel m( + {TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}}, + {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU_N1_TO_1); m.PopulateTensor(m.input1(), {-2.0, 0.2, 0.7, 0.8}); m.PopulateTensor(m.input2(), {0.1, 0.2, 0.3, 5}); m.Invoke(); @@ -94,6 +96,7 @@ TEST(FloatMulOpTest, VariousInputShapes) { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; for (int i = 0; i < test_shapes.size(); ++i) { FloatMulOpModel m({TensorType_FLOAT32, test_shapes[i]}, + {TensorType_FLOAT32, test_shapes[i]}, {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE); m.PopulateTensor(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0}); m.PopulateTensor(m.input2(), {0.1, 0.2, 0.3, 0.5, 1.1, 0.1}); @@ -105,8 +108,26 @@ TEST(FloatMulOpTest, VariousInputShapes) { } } +TEST(FloatMulOpTest, WithBroadcast) { + std::vector> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + FloatMulOpModel m({TensorType_FLOAT32, test_shapes[i]}, + {TensorType_FLOAT32, {}}, // always a scalar + {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE); + m.PopulateTensor(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0}); + m.PopulateTensor(m.input2(), {0.1}); + m.Invoke(); + EXPECT_THAT( + m.GetOutput(), + ElementsAreArray(ArrayFloatNear({-0.2, 0.02, 0.07, 0.08, 0.11, 0.2}))) + << "With shape number " << i; + } +} + TEST(QuantizedMulOpTest, NoActivation) { QuantizedMulOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, + {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {TensorType_UINT8, {}, -1.0, 1.0}, ActivationFunctionType_NONE); m.QuantizeAndPopulate(m.input1(), {-0.8, 0.2, 0.9, 0.7}); @@ -117,6 +138,32 @@ TEST(QuantizedMulOpTest, NoActivation) { kQuantizedTolerance))); } +// for quantized Mul, the error shouldn't exceed 2*step +float GetTolerance(int min, int max) { + float kQuantizedStep = (max - min) / 255.0; + float kQuantizedTolerance = 2.0 * kQuantizedStep; + return kQuantizedTolerance; +} + +TEST(QuantizedMulOpTest, WithBroadcast) { + float kQuantizedTolerance = GetTolerance(-3.0, 3.0); + std::vector> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + QuantizedMulOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0}, + {TensorType_UINT8, {}, -3.0, 3.0}, // always a scalar + {TensorType_UINT8, {}, -3.0, 3.0}, + ActivationFunctionType_NONE); + m.QuantizeAndPopulate(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0}); + m.QuantizeAndPopulate(m.input2(), {0.1}); + m.Invoke(); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear( + {-0.2, 0.02, 0.07, 0.08, 0.11, 0.2}, kQuantizedTolerance))) + << "With shape number " << i; + } +} + } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/optional_tensor_test.cc b/tensorflow/contrib/lite/kernels/optional_tensor_test.cc index 17166715ca3..cee3ec6197c 100644 --- a/tensorflow/contrib/lite/kernels/optional_tensor_test.cc +++ 
b/tensorflow/contrib/lite/kernels/optional_tensor_test.cc @@ -243,7 +243,6 @@ class LSTMOpModel : public SingleOpModel { int n_output_; }; - TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) { const int n_batch = 1; const int n_input = 2; @@ -282,7 +281,6 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) { {0}, // projection_bias tensor }); - lstm.SetInputToCellWeights({-0.49770179, -0.27711356, -0.09624726, 0.05100781, 0.04717243, 0.48944736, -0.38535351, -0.17212132}); diff --git a/tensorflow/contrib/lite/kernels/pad.cc b/tensorflow/contrib/lite/kernels/pad.cc index 4003ed10df4..48114e5a406 100644 --- a/tensorflow/contrib/lite/kernels/pad.cc +++ b/tensorflow/contrib/lite/kernels/pad.cc @@ -177,9 +177,7 @@ TfLiteRegistration* Register_PAD_GENERIC_OPT() { return &r; } -TfLiteRegistration* Register_PAD() { - return Register_PAD_GENERIC_OPT(); -} +TfLiteRegistration* Register_PAD() { return Register_PAD_GENERIC_OPT(); } } // namespace builtin } // namespace ops diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc index f605deaa5b4..1fb779fd517 100644 --- a/tensorflow/contrib/lite/kernels/register.cc +++ b/tensorflow/contrib/lite/kernels/register.cc @@ -31,6 +31,7 @@ TfLiteRegistration* Register_CONV_2D(); TfLiteRegistration* Register_DEPTHWISE_CONV_2D(); TfLiteRegistration* Register_SVDF(); TfLiteRegistration* Register_RNN(); +TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_RNN(); TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_RNN(); TfLiteRegistration* Register_EMBEDDING_LOOKUP(); TfLiteRegistration* Register_EMBEDDING_LOOKUP_SPARSE(); @@ -73,6 +74,8 @@ BuiltinOpResolver::BuiltinOpResolver() { AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, Register_DEPTHWISE_CONV_2D()); AddBuiltin(BuiltinOperator_SVDF, Register_SVDF()); AddBuiltin(BuiltinOperator_RNN, Register_RNN()); + AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN, + Register_BIDIRECTIONAL_SEQUENCE_RNN()); AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN, Register_UNIDIRECTIONAL_SEQUENCE_RNN()); AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP, Register_EMBEDDING_LOOKUP()); diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear.cc b/tensorflow/contrib/lite/kernels/resize_bilinear.cc index 9a419af0238..4a2101f2468 100644 --- a/tensorflow/contrib/lite/kernels/resize_bilinear.cc +++ b/tensorflow/contrib/lite/kernels/resize_bilinear.cc @@ -36,6 +36,17 @@ constexpr int kInputTensor = 0; constexpr int kSizeTensor = 1; constexpr int kOutputTensor = 0; +TfLiteStatus ResizeOutputTensor(TfLiteContext* context, TfLiteTensor* input, + TfLiteTensor* size, TfLiteTensor* output) { + TfLiteIntArray* output_size = TfLiteIntArrayCreate(4); + output_size->data[0] = input->dims->data[0]; + const int32* size_data = GetTensorData(size); + output_size->data[1] = size_data[0]; + output_size->data[2] = size_data[1]; + output_size->data[3] = input->dims->data[3]; + return context->ResizeTensor(context, output, output_size); +} + TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); @@ -55,9 +66,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // integers. output->type = kTfLiteFloat32; - // TODO(ahentz): if the input is constant, we can allocate here. 
- output->allocation_type = kTfLiteDynamic; - return kTfLiteOk; + if (!IsConstantTensor(size)) { + SetTensorToDynamic(output); + return kTfLiteOk; + } + return ResizeOutputTensor(context, input, size, output); } template @@ -66,15 +79,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, kOutputTensor); TfLiteTensor* size = GetInput(context, node, kSizeTensor); - // TODO(ahentz): we only need to do this here if it wasn't done in Eval(). - TfLiteIntArray* output_size = TfLiteIntArrayCreate(4); - output_size->data[0] = input->dims->data[0]; - const int32* size_data = GetTensorData(size); - output_size->data[1] = size_data[0]; - output_size->data[2] = size_data[1]; - output_size->data[3] = input->dims->data[3]; - context->ResizeTensor(context, output, output_size); - TfLiteTensorRealloc(output->bytes, output); + if (IsDynamicTensor(output)) { + TF_LITE_ENSURE_OK(context, + ResizeOutputTensor(context, input, size, output)); + TfLiteTensorRealloc(output->bytes, output); + } if (output->type == kTfLiteFloat32) { #define TF_LITE_RESIZE_BILINEAR(type) \ diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc index 2b1aaf654f8..4e03f3820a5 100644 --- a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc +++ b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc @@ -25,14 +25,24 @@ using ::testing::ElementsAreArray; class ResizeBilinearOpModel : public SingleOpModel { public: - ResizeBilinearOpModel(std::initializer_list input_shape) { - input_ = AddInput(TensorType_FLOAT32); - size_ = AddInput(TensorType_INT32); - output_ = AddOutput(TensorType_FLOAT32); + ResizeBilinearOpModel(const TensorData& input, + std::initializer_list size_data = {}) { + bool const_size = size_data.size() != 0; + input_ = AddInput(input); + if (const_size) { + size_ = AddConstInput(TensorType_INT32, size_data, {2}); + } else { + size_ = AddInput({TensorType_INT32, {2}}); + } + output_ = AddOutput(TensorType_FLOAT32); // Always float. 
SetBuiltinOp(BuiltinOperator_RESIZE_BILINEAR, BuiltinOptions_ResizeBilinearOptions, CreateResizeBilinearOptions(builder_).Union()); - BuildInterpreter({input_shape, {2}}); + if (const_size) { + BuildInterpreter({GetShape(input_)}); + } else { + BuildInterpreter({GetShape(input_), GetShape(size_)}); + } } void SetInput(std::initializer_list data) { @@ -49,23 +59,33 @@ class ResizeBilinearOpModel : public SingleOpModel { }; TEST(ResizeBilinearOpTest, HorizontalResize) { - ResizeBilinearOpModel m({1, 1, 2, 1}); + ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 1, 2, 1}}); m.SetInput({3, 6}); m.SetSize({1, 3}); m.Invoke(); EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 5, 6}))); + + ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 1, 2, 1}}, {1, 3}); + const_m.SetInput({3, 6}); + const_m.Invoke(); + EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 5, 6}))); } TEST(ResizeBilinearOpTest, VerticalResize) { - ResizeBilinearOpModel m({1, 2, 1, 1}); + ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 1, 1}}); m.SetInput({3, 9}); m.SetSize({3, 1}); m.Invoke(); EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 7, 9}))); + + ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 1, 1}}, {3, 1}); + const_m.SetInput({3, 9}); + const_m.Invoke(); + EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 7, 9}))); } TEST(ResizeBilinearOpTest, TwoDimensionalResize) { - ResizeBilinearOpModel m({1, 2, 2, 1}); + ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}}); m.SetInput({ 3, 6, // 9, 12 // @@ -77,10 +97,22 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResize) { 7, 9, 10, // 9, 11, 12, // }))); + + ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 2, 1}}, {3, 3}); + const_m.SetInput({ + 3, 6, // + 9, 12 // + }); + const_m.Invoke(); + EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({ + 3, 5, 6, // + 7, 9, 10, // + 9, 11, 12, // + }))); } TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) { - ResizeBilinearOpModel m({2, 2, 2, 1}); + ResizeBilinearOpModel m({TensorType_FLOAT32, {2, 2, 2, 1}}); m.SetInput({ 3, 6, // 9, 12, // @@ -97,10 +129,27 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) { 8, 12, 14, // 10, 14, 16, // }))); + + ResizeBilinearOpModel const_m({TensorType_FLOAT32, {2, 2, 2, 1}}, {3, 3}); + const_m.SetInput({ + 3, 6, // + 9, 12, // + 4, 10, // + 10, 16 // + }); + const_m.Invoke(); + EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({ + 3, 5, 6, // + 7, 9, 10, // + 9, 11, 12, // + 4, 8, 10, // + 8, 12, 14, // + 10, 14, 16, // + }))); } TEST(ResizeBilinearOpTest, ThreeDimensionalResize) { - ResizeBilinearOpModel m({1, 2, 2, 2}); + ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 2}}); m.SetInput({ 3, 4, 6, 10, // 9, 10, 12, 16, // @@ -112,6 +161,18 @@ TEST(ResizeBilinearOpTest, ThreeDimensionalResize) { 7, 8, 9, 12, 10, 14, // 9, 10, 11, 14, 12, 16, // }))); + + ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 2, 2}}, {3, 3}); + const_m.SetInput({ + 3, 4, 6, 10, // + 9, 10, 12, 16, // + }); + const_m.Invoke(); + EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({ + 3, 4, 5, 8, 6, 10, // + 7, 8, 9, 12, 10, 14, // + 9, 10, 11, 14, 12, 16, // + }))); } } // namespace diff --git a/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc b/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc index 2e22d0db56a..e2e1873f770 100644 --- a/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc +++ 
b/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc @@ -33,17 +33,16 @@ enum KernelType { kGenericOptimized, }; -// Inputs specified in the 2nd tensor (block_shape) and 3rd tensor (paddings) -// are ignored. Only use the `block_shape` and `paddings` specified in params. -// TODO(nupurgarg): Support inputs as tensors in SpaceToBatchND. struct SpaceToBatchNDContext { SpaceToBatchNDContext(TfLiteContext* context, TfLiteNode* node) { - params = reinterpret_cast(node->builtin_data); input = GetInput(context, node, 0); + block_shape = GetInput(context, node, 1); + paddings = GetInput(context, node, 2); output = GetOutput(context, node, 0); } - TfLiteSpaceToBatchNDParams* params; TfLiteTensor* input; + TfLiteTensor* block_shape; + TfLiteTensor* paddings; TfLiteTensor* output; }; @@ -51,32 +50,29 @@ struct SpaceToBatchNDContext { // The 4D array need to have exactly 2 spatial dimensions. // TODO(nupurgarg): Support arbitrary dimension in SpaceToBatchND. const int kInputDimensionNum = 4; -const int kOutputDimensionNum = 4; +const int kBlockSizeDimensionNum = 1; const int kSpatialDimensionNum = 2; -const int kPaddingDimensionNum = 4; -TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - TF_LITE_ENSURE(context, NumInputs(node) >= 1 && NumInputs(node) <= 3); - TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); +TfLiteStatus ResizeOutputTensor(TfLiteContext* context, + SpaceToBatchNDContext* op_context) { + TfLiteIntArray* input_size = op_context->input->dims; + const int32* block_shape = GetTensorData(op_context->block_shape); + const int32* paddings_data = GetTensorData(op_context->paddings); - SpaceToBatchNDContext op_context(context, node); - TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.input), - kInputDimensionNum); - TF_LITE_ENSURE_EQ(context, op_context.params->num_spatial_dimensions, + TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->block_shape), + kBlockSizeDimensionNum); + TF_LITE_ENSURE_EQ(context, op_context->block_shape->dims->data[0], + kSpatialDimensionNum); + TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->paddings), kSpatialDimensionNum); - TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type); - const TfLiteIntArray* input_size = op_context.input->dims; - const int* block_shape = op_context.params->block_shape; - - TfLiteIntArray* output_size = TfLiteIntArrayCreate(kOutputDimensionNum); + TfLiteIntArray* output_size = TfLiteIntArrayCopy(input_size); // Ensures the input height and width (with padding) is a multiple of block // shape height and width. 
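// A worked example, assuming the shapes from SimplePaddingConstTest below: an
// input of shape {1, 5, 2, 1} with block_shape {3, 2} and paddings
// {1, 0, 2, 0} pads the spatial dims to {6, 4}; both divide evenly by the
// block shape, so the output shape is {1 * 3 * 2, 6 / 3, 4 / 2, 1} =
// {6, 2, 2, 1}.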
for (int dim = 0; dim < kSpatialDimensionNum; ++dim) { - int final_dim_size = - (input_size->data[dim + 1] + op_context.params->before_paddings[dim] + - op_context.params->after_paddings[dim]); + int final_dim_size = (input_size->data[dim + 1] + paddings_data[dim * 2] + + paddings_data[dim * 2 + 1]); TF_LITE_ENSURE_EQ(context, final_dim_size % block_shape[dim], 0); output_size->data[dim + 1] = final_dim_size / block_shape[dim]; } @@ -88,33 +84,44 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { output_size->data[0] = output_batch_size; output_size->data[3] = output_channel_size; - return context->ResizeTensor(context, op_context.output, output_size); + return context->ResizeTensor(context, op_context->output, output_size); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 3); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + SpaceToBatchNDContext op_context(context, node); + TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.input), + kInputDimensionNum); + TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type); + + if (!IsConstantTensor(op_context.block_shape) || + !IsConstantTensor(op_context.paddings)) { + SetTensorToDynamic(op_context.output); + return kTfLiteOk; + } + return ResizeOutputTensor(context, &op_context); } template TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { SpaceToBatchNDContext op_context(context, node); - int block_shape_dims_array[1] = {kSpatialDimensionNum}; - Dims<4> block_shape_dims = GetTensorDims(block_shape_dims_array, 1); + // Resize the output tensor if the output tensor is dynamic. + if (IsDynamicTensor(op_context.output)) { + TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context)); + TfLiteTensorRealloc(op_context.output->bytes, op_context.output); + } - // Initialize padding array in the format accepted by the kernel code. - // TODO(nupurgarg): Make kernel code accept padding array format that is - // consistent with Pad operation (i.e. before_paddings and after_paddings). - TfLiteIntArray* padding_data = TfLiteIntArrayCreate(kPaddingDimensionNum); - padding_data->data[0] = op_context.params->before_paddings[0]; - padding_data->data[1] = op_context.params->after_paddings[0]; - padding_data->data[2] = op_context.params->before_paddings[1]; - padding_data->data[3] = op_context.params->after_paddings[1]; - int padding_dims_array[1] = {kPaddingDimensionNum}; - Dims<4> padding_dims = GetTensorDims(padding_dims_array, 1); - -#define TF_LITE_SPACE_TO_BATCH_ND(type, scalar) \ - type::SpaceToBatchND(GetTensorData(op_context.input), \ - GetTensorDims(op_context.input), \ - op_context.params->block_shape, block_shape_dims, \ - padding_data->data, padding_dims, \ - GetTensorData(op_context.output), \ +#define TF_LITE_SPACE_TO_BATCH_ND(type, scalar) \ + type::SpaceToBatchND(GetTensorData(op_context.input), \ + GetTensorDims(op_context.input), \ + GetTensorData(op_context.block_shape), \ + GetTensorDims(op_context.block_shape), \ + GetTensorData(op_context.paddings), \ + GetTensorDims(op_context.paddings), \ + GetTensorData(op_context.output), \ GetTensorDims(op_context.output)) switch (op_context.input->type) { // Already know in/out types are same. 
case kTfLiteFloat32: @@ -151,8 +158,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteError; } #undef TF_LITE_SPACE_TO_BATCH_ND - - TfLiteIntArrayFree(padding_data); return kTfLiteOk; } diff --git a/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc b/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc index 45a6aef73d0..92a4a037d58 100644 --- a/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc +++ b/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc @@ -26,41 +26,81 @@ using ::testing::ElementsAreArray; class SpaceToBatchNDOpModel : public SingleOpModel { public: - SpaceToBatchNDOpModel(std::initializer_list input_shape, - std::initializer_list block_shape, - std::initializer_list before_paddings, - std::initializer_list after_paddings) { - input_ = AddInput(TensorType_FLOAT32); - output_ = AddOutput(TensorType_FLOAT32); - SetBuiltinOp(BuiltinOperator_SPACE_TO_BATCH_ND, - BuiltinOptions_SpaceToBatchNDOptions, - CreateSpaceToBatchNDOptions( - builder_, builder_.CreateVector(block_shape), - builder_.CreateVector(before_paddings), - builder_.CreateVector(after_paddings)) - .Union()); - BuildInterpreter({input_shape}); - } - void SetInput(std::initializer_list data) { PopulateTensor(input_, data); } + void SetBlockShape(std::initializer_list data) { + PopulateTensor(block_shape_, data); + } + + void SetPaddings(std::initializer_list data) { + PopulateTensor(paddings_, data); + } + std::vector GetOutput() { return ExtractVector(output_); } std::vector GetOutputShape() { return GetTensorShape(output_); } - private: + protected: int input_; + int block_shape_; + int paddings_; int output_; }; +// Tests case where block_shape and paddings are const tensors. +// +// Example usage is as follows: +// SpaceToBatchNDOpConstModel m(input_shape, block_shape, paddings); +// m.SetInput(input_data); +// m.Invoke(); +class SpaceToBatchNDOpConstModel : public SpaceToBatchNDOpModel { + public: + SpaceToBatchNDOpConstModel(std::initializer_list input_shape, + std::initializer_list block_shape, + std::initializer_list paddings) { + input_ = AddInput(TensorType_FLOAT32); + block_shape_ = AddConstInput(TensorType_INT32, block_shape, {2}); + paddings_ = AddConstInput(TensorType_INT32, paddings, {2, 2}); + output_ = AddOutput(TensorType_FLOAT32); + + SetBuiltinOp(BuiltinOperator_SPACE_TO_BATCH_ND, + BuiltinOptions_SpaceToBatchNDOptions, + CreateSpaceToBatchNDOptions(builder_).Union()); + BuildInterpreter({input_shape}); + } +}; + +// Tests case where block_shape and paddings are non-const tensors. 
+// +// Example usage is as follows: +// SpaceToBatchNDOpDynamicModel m(input_shape); +// m.SetInput(input_data); +// m.SetBlockShape(block_shape); +// m.SetPaddings(paddings); +// m.Invoke(); +class SpaceToBatchNDOpDynamicModel : public SpaceToBatchNDOpModel { + public: + SpaceToBatchNDOpDynamicModel(std::initializer_list input_shape) { + input_ = AddInput(TensorType_FLOAT32); + block_shape_ = AddInput(TensorType_INT32); + paddings_ = AddInput(TensorType_INT32); + output_ = AddOutput(TensorType_FLOAT32); + + SetBuiltinOp(BuiltinOperator_SPACE_TO_BATCH_ND, + BuiltinOptions_SpaceToBatchNDOptions, + CreateSpaceToBatchNDOptions(builder_).Union()); + BuildInterpreter({input_shape, {2}, {2, 2}}); + } +}; + TEST(SpaceToBatchNDOpTest, InvalidShapeTest) { - EXPECT_DEATH(SpaceToBatchNDOpModel({1, 3, 3, 1}, {2, 2}, {0, 0}, {0, 0}), + EXPECT_DEATH(SpaceToBatchNDOpConstModel({1, 3, 3, 1}, {2, 2}, {0, 0, 0, 0}), "Cannot allocate tensors"); } -TEST(SpaceToBatchNDOpTest, SimpleTest) { - SpaceToBatchNDOpModel m({1, 4, 4, 1}, {2, 2}, {0, 0}, {0, 0}); +TEST(SpaceToBatchNDOpTest, SimpleConstTest) { + SpaceToBatchNDOpConstModel m({1, 4, 4, 1}, {2, 2}, {0, 0, 0, 0}); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 2, 1})); @@ -68,8 +108,19 @@ TEST(SpaceToBatchNDOpTest, SimpleTest) { 13, 15, 6, 8, 14, 16})); } -TEST(SpaceToBatchNDOpTest, MultipleInputBatches) { - SpaceToBatchNDOpModel m({2, 2, 4, 1}, {2, 2}, {0, 0}, {0, 0}); +TEST(SpaceToBatchNDOpTest, SimpleDynamicTest) { + SpaceToBatchNDOpDynamicModel m({1, 4, 4, 1}); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + m.SetBlockShape({2, 2}); + m.SetPaddings({0, 0, 0, 0}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 2, 1})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 9, 11, 2, 4, 10, 12, 5, 7, + 13, 15, 6, 8, 14, 16})); +} + +TEST(SpaceToBatchNDOpTest, MultipleInputBatchesConstTest) { + SpaceToBatchNDOpConstModel m({2, 2, 4, 1}, {2, 2}, {0, 0, 0, 0}); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({8, 1, 2, 1})); @@ -77,8 +128,19 @@ TEST(SpaceToBatchNDOpTest, MultipleInputBatches) { 13, 15, 6, 8, 14, 16})); } -TEST(SpaceToBatchNDOpTest, SimplePadding) { - SpaceToBatchNDOpModel m({1, 5, 2, 1}, {3, 2}, {1, 2}, {0, 0}); +TEST(SpaceToBatchNDOpTest, MultipleInputBatchesDynamicTest) { + SpaceToBatchNDOpDynamicModel m({2, 2, 4, 1}); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + m.SetBlockShape({2, 2}); + m.SetPaddings({0, 0, 0, 0}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({8, 1, 2, 1})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 9, 11, 2, 4, 10, 12, 5, 7, + 13, 15, 6, 8, 14, 16})); +} + +TEST(SpaceToBatchNDOpTest, SimplePaddingConstTest) { + SpaceToBatchNDOpConstModel m({1, 5, 2, 1}, {3, 2}, {1, 0, 2, 0}); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1})); @@ -88,8 +150,21 @@ TEST(SpaceToBatchNDOpTest, SimplePadding) { })); } -TEST(SpaceToBatchNDOpTest, ComplexPadding) { - SpaceToBatchNDOpModel m({1, 4, 2, 1}, {3, 2}, {1, 2}, {1, 4}); +TEST(SpaceToBatchNDOpTest, SimplePaddingDynamicTest) { + SpaceToBatchNDOpDynamicModel m({1, 5, 2, 1}); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); + m.SetBlockShape({3, 2}); + m.SetPaddings({1, 0, 2, 0}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 
2, 2, 1})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({ + 0, 0, 0, 5, 0, 0, 0, 6, 0, 1, 0, 7, + 0, 2, 0, 8, 0, 3, 0, 9, 0, 4, 0, 10, + })); +} + +TEST(SpaceToBatchNDOpTest, ComplexPaddingConstTest) { + SpaceToBatchNDOpConstModel m({1, 4, 2, 1}, {3, 2}, {1, 1, 2, 4}); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1})); @@ -100,6 +175,20 @@ TEST(SpaceToBatchNDOpTest, ComplexPadding) { })); } +TEST(SpaceToBatchNDOpTest, ComplexPaddingDynamicTest) { + SpaceToBatchNDOpDynamicModel m({1, 4, 2, 1}); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8}); + m.SetBlockShape({3, 2}); + m.SetPaddings({1, 1, 2, 4}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({ + 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, + 0, 1, 0, 0, 0, 7, 0, 0, 0, 2, 0, 0, 0, 8, 0, 0, + 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, + })); +} + } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/strided_slice.cc b/tensorflow/contrib/lite/kernels/strided_slice.cc index 91ba4a9b785..c4ffdf79d3a 100644 --- a/tensorflow/contrib/lite/kernels/strided_slice.cc +++ b/tensorflow/contrib/lite/kernels/strided_slice.cc @@ -57,65 +57,6 @@ struct StridedSliceContext { int dims; }; -TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - TF_LITE_ENSURE_EQ(context, NumInputs(node), 4); - TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - - StridedSliceContext op_context(context, node); - - // Ensure validity of input tensor and its dimension - TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.begin), 1); - TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.end), 1); - TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.strides), 1); - TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type); - // Only INT32 begin/end/strides are supported - // TODO(soroosh) add support for INT64 - TF_LITE_ENSURE_EQ(context, op_context.begin->type, kTfLiteInt32); - TF_LITE_ENSURE_EQ(context, op_context.end->type, kTfLiteInt32); - TF_LITE_ENSURE_EQ(context, op_context.strides->type, kTfLiteInt32); - TF_LITE_ENSURE_MSG(context, op_context.dims <= 4, - "StridedSlice op only supports 1D-4D input arrays."); - - // TODO(soroosh): add the following missing functionalities - TF_LITE_ENSURE_MSG(context, op_context.params->ellipsis_mask == 0, - "ellipsis_mask is not implemented yet."); - TF_LITE_ENSURE_MSG(context, op_context.params->new_axis_mask == 0, - "new_axis_mask is not implemented yet."); - TF_LITE_ENSURE_MSG(context, op_context.params->shrink_axis_mask == 0, - "shrink_axis_mask is not implemented yet."); - - // TODO(soroosh): optimize for constant tensors to do allocation in Prepare - op_context.output->allocation_type = kTfLiteDynamic; - return kTfLiteOk; -} // namespace strided_slice - -// TODO(soroosh): consolidate with BytesRequired in interpreter.h -TfLiteStatus BytesRequired(TfLiteContext* context, TfLiteType type, - const int* dims, int dims_size, size_t* bytes) { - // TODO(aselle): Check for overflow here using overflow.h in TensorFlow - // MultiplyWithoutOverflow. 
- TF_LITE_ENSURE(context, bytes != nullptr); - size_t count = 1; - for (int k = 0; k < dims_size; k++) count *= dims[k]; - switch (type) { - case kTfLiteFloat32: - *bytes = sizeof(float) * count; - break; - case kTfLiteInt32: - *bytes = sizeof(int32_t) * count; - break; - case kTfLiteUInt8: - *bytes = sizeof(uint8_t) * count; - break; - case kTfLiteInt64: - *bytes = sizeof(int64_t) * count; - break; - default: - return kTfLiteError; - } - return kTfLiteOk; -} - // Reverse order of bits in the mask to match the expected order in kernel inline int ReverseMaskBits(int mask, int num_dimensions) { int out = 0; @@ -146,40 +87,111 @@ inline int32_t ClampedIndex(int32_t index, int dim, bool pos_stride) { std::min(std::max(index, -dim), dim - 1), dim)); } +inline int32_t GetBeginValueAtIndex(StridedSliceContext* op_context, int idx) { + const int dim = op_context->input->dims->data[idx]; + const bool pos_stride = GetTensorData(op_context->strides)[idx] > 0; + return op_context->params->begin_mask & (1 << idx) + ? pos_stride ? 0 : dim - 1 + : ClampedIndex(GetTensorData(op_context->begin)[idx], dim, + pos_stride); +} + +inline int32_t GetEndValueAtIndex(StridedSliceContext* op_context, int idx) { + const int dim = op_context->input->dims->data[idx]; + const bool pos_stride = GetTensorData(op_context->strides)[idx] > 0; + return op_context->params->end_mask & (1 << idx) + ? pos_stride ? dim : -1 + : ClampedIndex(GetTensorData(op_context->end)[idx], dim, + pos_stride); +} + +// Processes the indexing tensors (begin, end and strides) to resize the +// output tensor. This function is callable from both Prepare() and Eval() as +// long as the caller ensures the indexing tensors are present. +TfLiteStatus ResizeOutputTensor(TfLiteContext* context, + StridedSliceContext* op_context) { + std::vector output_shape_vector; + + for (int idx = op_context->dims - 1; idx >= 0; --idx) { + int32_t stride = GetTensorData(op_context->strides)[idx]; + TF_LITE_ENSURE_MSG(context, stride != 0, "stride value has to be non-zero"); + + int32_t begin = GetBeginValueAtIndex(op_context, idx); + int32_t end = GetEndValueAtIndex(op_context, idx); + + // This is valid for both positive and negative strides + int32_t dim_shape = ceil((end - begin) / static_cast(stride)); + dim_shape = dim_shape < 0 ? 
0 : dim_shape; + if (!(op_context->params->shrink_axis_mask & (1 << idx))) { + output_shape_vector.push_back(dim_shape); + } + } + + TfLiteIntArray* output_shape = + TfLiteIntArrayCreate(output_shape_vector.size()); + + std::reverse_copy(output_shape_vector.begin(), output_shape_vector.end(), + output_shape->data); + + TF_LITE_ENSURE_STATUS( + context->ResizeTensor(context, op_context->output, output_shape)); + + return kTfLiteOk; +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 4); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + StridedSliceContext op_context(context, node); + + // Ensure validity of input tensor and its dimension + TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.begin), 1); + TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.end), 1); + TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.strides), 1); + TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type); + // Only INT32 begin/end/strides are supported + // TODO(soroosh) add support for INT64 + TF_LITE_ENSURE_EQ(context, op_context.begin->type, kTfLiteInt32); + TF_LITE_ENSURE_EQ(context, op_context.end->type, kTfLiteInt32); + TF_LITE_ENSURE_EQ(context, op_context.strides->type, kTfLiteInt32); + TF_LITE_ENSURE_MSG(context, op_context.dims <= 4, + "StridedSlice op only supports 1D-4D input arrays."); + + // TODO(soroosh): add the following missing functionalities + TF_LITE_ENSURE_MSG(context, op_context.params->ellipsis_mask == 0, + "ellipsis_mask is not implemented yet."); + TF_LITE_ENSURE_MSG(context, op_context.params->new_axis_mask == 0, + "new_axis_mask is not implemented yet."); + + // Postpone allocation of output if any of the indexing tensors is not + // constant + if (!(IsConstantTensor(op_context.begin) && + IsConstantTensor(op_context.end) && + IsConstantTensor(op_context.strides))) { + SetTensorToDynamic(op_context.output); + return kTfLiteOk; + } + return ResizeOutputTensor(context, &op_context); +} + template TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { StridedSliceContext op_context(context, node); - std::vector starts; - std::vector stops; - std::vector strides; + if (IsDynamicTensor(op_context.output)) { + TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context)); + TfLiteTensorRealloc(op_context.output->bytes, op_context.output); + } + + std::vector starts; + std::vector stops; + std::vector strides; - // Determine size of output tensor and map indices - TfLiteIntArray* output_shape = TfLiteIntArrayCreate(op_context.dims); for (int idx = op_context.dims - 1; idx >= 0; --idx) { - int dim = op_context.input->dims->data[idx]; - int32_t stride = GetTensorData(op_context.strides)[idx]; - TF_LITE_ENSURE_MSG(context, stride != 0, "stride value has to be non-zero"); - bool pos_stride = stride > 0; - - int32_t begin = - op_context.params->begin_mask & (1 << idx) - ? pos_stride ? 0 : dim - 1 - : ClampedIndex(GetTensorData(op_context.begin)[idx], dim, - pos_stride); - int32_t end = - op_context.params->end_mask & (1 << idx) - ? pos_stride ? dim : -1 - : ClampedIndex(GetTensorData(op_context.end)[idx], dim, - pos_stride); - - // This is valid for both positive and negative strides - output_shape->data[idx] = ceil((end - begin) / static_cast(stride)); - output_shape->data[idx] = - output_shape->data[idx] < 0 ? 
0 : output_shape->data[idx]; - starts.emplace_back(begin); - stops.emplace_back(end); - strides.emplace_back(stride); + starts.emplace_back(GetBeginValueAtIndex(&op_context, idx)); + stops.emplace_back(GetEndValueAtIndex(&op_context, idx)); + strides.emplace_back(GetTensorData(op_context.strides)[idx]); } for (int i = op_context.dims; i < kMaxDim; i++) { @@ -188,27 +200,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { strides.emplace_back(1); } - TF_LITE_ENSURE_STATUS( - context->ResizeTensor(context, op_context.output, output_shape)); - - size_t required_bytes; - TF_LITE_ENSURE_OK( - context, - BytesRequired(context, op_context.output->type, output_shape->data, - output_shape->size, &required_bytes)); - TfLiteTensorRealloc(required_bytes, op_context.output); - op_context.params->begin_mask = ReverseMaskBits(op_context.params->begin_mask, op_context.dims); op_context.params->end_mask = ReverseMaskBits(op_context.params->end_mask, op_context.dims); + op_context.params->shrink_axis_mask = + ReverseMaskBits(op_context.params->shrink_axis_mask, op_context.dims); -#define TF_LITE_STRIDED_SLICE(kernel_type, data_type) \ - kernel_type::StridedSlice( \ - GetTensorData(op_context.input), \ - GetTensorDims(op_context.input), op_context.params->begin_mask, \ - op_context.params->end_mask, starts, stops, strides, \ - GetTensorData(op_context.output), \ +#define TF_LITE_STRIDED_SLICE(kernel_type, data_type) \ + kernel_type::StridedSlice( \ + GetTensorData(op_context.input), \ + GetTensorDims(op_context.input), op_context.params->begin_mask, \ + op_context.params->end_mask, op_context.params->shrink_axis_mask, \ + starts, stops, strides, GetTensorData(op_context.output), \ GetTensorDims(op_context.output)) switch (op_context.input->type) { diff --git a/tensorflow/contrib/lite/kernels/strided_slice_test.cc b/tensorflow/contrib/lite/kernels/strided_slice_test.cc index cd4a364682c..5cac04b3836 100644 --- a/tensorflow/contrib/lite/kernels/strided_slice_test.cc +++ b/tensorflow/contrib/lite/kernels/strided_slice_test.cc @@ -21,6 +21,7 @@ limitations under the License. 
namespace tflite { namespace { +using ::int32; using ::testing::ElementsAreArray; class StridedSliceOpModel : public SingleOpModel { @@ -79,8 +80,6 @@ TEST(StridedSliceOpTest, UnssupportedArgs) { "ellipsis_mask is not implemented yet."); EXPECT_DEATH(StridedSliceOpModel({3, 2}, {2}, {2}, {2}, 0, 0, 0, 1, 0), "new_axis_mask is not implemented yet."); - EXPECT_DEATH(StridedSliceOpModel({3, 2}, {2}, {2}, {2}, 0, 0, 0, 0, 1), - "shrink_axis_mask is not implemented yet."); } TEST(StridedSliceOpTest, In1D) { @@ -213,6 +212,7 @@ TEST(StridedSliceOpTest, In1D_EndMask) { EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3})); EXPECT_THAT(m.GetOutput(), ElementsAreArray({2, 3, 4})); } + TEST(StridedSliceOpTest, In1D_NegStride) { StridedSliceOpModel m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0); m.SetInput({1, 2, 3}); @@ -234,6 +234,7 @@ TEST(StridedSliceOpTest, In1D_EvenLenStride2) { EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1})); EXPECT_THAT(m.GetOutput(), ElementsAreArray({1})); } + TEST(StridedSliceOpTest, In1D_OddLenStride2) { StridedSliceOpModel m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0); m.SetInput({1, 2, 3}); @@ -255,6 +256,7 @@ TEST(StridedSliceOpTest, In2D_Identity) { EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3})); EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6})); } + TEST(StridedSliceOpTest, In2D) { StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0); m.SetInput({1, 2, 3, 4, 5, 6}); @@ -320,6 +322,7 @@ TEST(StridedSliceOpTest, In2D_NegStrideBeginMask) { EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3})); EXPECT_THAT(m.GetOutput(), ElementsAreArray({6, 5, 4})); } + TEST(StridedSliceOpTest, In2D_NegStrideEndMask) { StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0); m.SetInput({1, 2, 3, 4, 5, 6}); @@ -354,6 +357,7 @@ TEST(StridedSliceOpTest, In3D_NegStride) { EXPECT_THAT(m.GetOutput(), ElementsAreArray({12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1})); } + TEST(StridedSliceOpTest, In3D_Strided2) { StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); @@ -365,6 +369,159 @@ TEST(StridedSliceOpTest, In3D_Strided2) { EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5})); } +TEST(StridedSliceOpTest, In1D_ShrinkAxisMask1) { + StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1); + m.SetInput({1, 2, 3, 4}); + m.SetBegin({1}); + m.SetEnd({3}); + m.SetStrides({1}); + m.Invoke(); + EXPECT_TRUE(m.GetOutputShape().empty()); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({2})); +} + +TEST(StridedSliceOpTest, In1D_EmptyOutputShrinkAxisMask1) { + StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1); + m.SetInput({1, 2, 3, 4}); + m.SetBegin({2}); + m.SetEnd({1}); + m.SetStrides({1}); + m.Invoke(); + EXPECT_TRUE(m.GetOutputShape().empty()); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({3})); +} + +TEST(StridedSliceOpTest, In1D_BeginMaskShrinkAxisMask1) { + StridedSliceOpModel m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 1); + m.SetInput({1, 2, 3, 4}); + m.SetBegin({1}); + m.SetEnd({3}); + m.SetStrides({1}); + m.Invoke(); + EXPECT_TRUE(m.GetOutputShape().empty()); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1})); +} + +TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStrideShrinkAxisMask1) { + StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1); + m.SetInput({1, 2, 3, 4}); + m.SetBegin({-2}); + m.SetEnd({-3}); + m.SetStrides({-1}); + m.Invoke(); + EXPECT_TRUE(m.GetOutputShape().empty()); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({3})); +} + +TEST(StridedSliceOpTest, In2D_ShrinkAxisMask1) { + 
StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 1); + m.SetInput({1, 2, 3, 4, 5, 6}); + m.SetBegin({0, 0}); + m.SetEnd({2, 3}); + m.SetStrides({1, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3})); +} + +TEST(StridedSliceOpTest, In2D_ShrinkAxisMask2) { + StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 2); + m.SetInput({1, 2, 3, 4, 5, 6}); + m.SetBegin({0, 0}); + m.SetEnd({2, 3}); + m.SetStrides({1, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 4})); +} + +TEST(StridedSliceOpTest, In2D_ShrinkAxisMask3) { + StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 3); + m.SetInput({1, 2, 3, 4, 5, 6}); + m.SetBegin({0, 0}); + m.SetEnd({2, 3}); + m.SetStrides({1, 1}); + m.Invoke(); + EXPECT_TRUE(m.GetOutputShape().empty()); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1})); +} + +TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1) { + StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 1); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + m.SetBegin({0, 0, 0}); + m.SetEnd({2, 3, 2}); + m.SetStrides({1, 1, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6})); +} + +TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis2) { + StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 2); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + m.SetBegin({0, 0, 0}); + m.SetEnd({2, 3, 2}); + m.SetStrides({1, 1, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 7, 8})); +} + +TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis3) { + StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 3); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + m.SetBegin({0, 0, 0}); + m.SetEnd({2, 3, 2}); + m.SetStrides({1, 1, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2})); +} + +TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis4) { + StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 4); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + m.SetBegin({0, 0, 0}); + m.SetEnd({2, 3, 2}); + m.SetStrides({1, 1, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 5, 7, 9, 11})); +} + +TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis5) { + StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 5); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + m.SetBegin({0, 0, 0}); + m.SetEnd({2, 3, 2}); + m.SetStrides({1, 1, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 5})); +} + +TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis6) { + StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 6); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + m.SetBegin({0, 0, 0}); + m.SetEnd({2, 3, 2}); + m.SetStrides({1, 1, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 7})); +} + +TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis7) { + StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 7); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + m.SetBegin({0, 0, 0}); + m.SetEnd({2, 3, 2}); + m.SetStrides({1, 
1, 1}); + m.Invoke(); + EXPECT_TRUE(m.GetOutputShape().empty()); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1})); +} } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/svdf.cc b/tensorflow/contrib/lite/kernels/svdf.cc index 72f705fe424..c69755447d5 100644 --- a/tensorflow/contrib/lite/kernels/svdf.cc +++ b/tensorflow/contrib/lite/kernels/svdf.cc @@ -15,8 +15,8 @@ limitations under the License. #include #include #include -#include #include +#include #include #include diff --git a/tensorflow/contrib/lite/kernels/svdf_test.cc b/tensorflow/contrib/lite/kernels/svdf_test.cc index 4de2ceaf053..0f166dc69b9 100644 --- a/tensorflow/contrib/lite/kernels/svdf_test.cc +++ b/tensorflow/contrib/lite/kernels/svdf_test.cc @@ -14,8 +14,8 @@ limitations under the License. ==============================================================================*/ // Unit test for TFLite SVDF op. -#include #include +#include #include #include diff --git a/tensorflow/contrib/lite/kernels/test_util.cc b/tensorflow/contrib/lite/kernels/test_util.cc index 3a58e7ec321..6f56aa6bf38 100644 --- a/tensorflow/contrib/lite/kernels/test_util.cc +++ b/tensorflow/contrib/lite/kernels/test_util.cc @@ -172,11 +172,14 @@ void SingleOpModel::BuildInterpreter( auto* model = GetModel(builder_.GetBufferPointer()); - ops::builtin::BuiltinOpResolver builtins; - for (const auto& reg : custom_registrations_) { - builtins.AddCustom(reg.first.data(), reg.second()); + if (!resolver_) { + auto resolver = new ops::builtin::BuiltinOpResolver(); + for (const auto& reg : custom_registrations_) { + resolver->AddCustom(reg.first.data(), reg.second()); + } + resolver_ = std::unique_ptr(resolver); } - InterpreterBuilder(model, builtins)(&interpreter_); + InterpreterBuilder(model, *resolver_)(&interpreter_); CHECK(interpreter_ != nullptr); diff --git a/tensorflow/contrib/lite/kernels/test_util.h b/tensorflow/contrib/lite/kernels/test_util.h index cc445299ff9..7d476ba1eaf 100644 --- a/tensorflow/contrib/lite/kernels/test_util.h +++ b/tensorflow/contrib/lite/kernels/test_util.h @@ -85,6 +85,23 @@ struct TensorData { int32_t zero_point; }; +class SingleOpResolver : public OpResolver { + public: + SingleOpResolver(const BuiltinOperator op, TfLiteRegistration* registration) + : op_(op), registration_(registration) {} + TfLiteRegistration* FindOp(BuiltinOperator op) const override { + if (op == op_) { + return registration_; + } + return nullptr; + } + TfLiteRegistration* FindOp(const char* op) const override { return nullptr; } + + private: + const BuiltinOperator op_; + TfLiteRegistration* registration_; +}; + class SingleOpModel { public: SingleOpModel() {} @@ -178,11 +195,16 @@ class SingleOpModel { return result; } + void SetResolver(std::unique_ptr resolver) { + resolver_ = std::move(resolver); + } + protected: int32_t GetTensorSize(int index) const; flatbuffers::FlatBufferBuilder builder_; std::unique_ptr interpreter_; + std::unique_ptr resolver_; private: int AddTensor(TensorData t, std::initializer_list data); @@ -197,6 +219,36 @@ class SingleOpModel { std::map> custom_registrations_; }; +// Base class for single op unit tests. +// The tests are parameterized to test multiple kernels for a single op. +// The parameters are strings like "optimized" and "reference" to have better +// readability in test reports. +// +// To use this class: +// * Define a constant map from strings to TfLiteRegistration. +// * Implement a test class that inherits SingleOpTest. 
+// * Instantiate the test cases with SingleOpTest::GetKernelTags helper +// function. +// * Call GetRegistration to get the TfLiteRegistration to be used before +// building the interpreter. +class SingleOpTest : public ::testing::TestWithParam { + public: + static std::vector GetKernelTags( + const std::map& kernel_map) { + std::vector tags; + for (auto it : kernel_map) { + tags.push_back(it.first); + } + return tags; + } + + protected: + virtual const std::map& GetKernelMap() = 0; + TfLiteRegistration* GetRegistration() { + return GetKernelMap().at(GetParam()); + } +}; + // Strings have a special implementation that is in test_util.cc template <> std::vector SingleOpModel::ExtractVector(int index); diff --git a/tensorflow/contrib/lite/kernels/transpose.cc b/tensorflow/contrib/lite/kernels/transpose.cc index 75d8136b6a2..093814bc449 100644 --- a/tensorflow/contrib/lite/kernels/transpose.cc +++ b/tensorflow/contrib/lite/kernels/transpose.cc @@ -31,60 +31,78 @@ enum KernelType { kReference, }; -// TODO(nupurgarg): Permutation arrays represented as a tensor are ignored. Only -// use the `perm` specified in `params`. struct TransposeContext { TransposeContext(TfLiteContext* context, TfLiteNode* node) { - params = reinterpret_cast(node->builtin_data); input = GetInput(context, node, 0); + perm = GetInput(context, node, 1); output = GetOutput(context, node, 0); } - TfLiteTransposeParams* params; TfLiteTensor* input; + TfLiteTensor* perm; TfLiteTensor* output; }; -TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2); - TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); +TfLiteStatus ResizeOutputTensor(TfLiteContext* context, + TransposeContext* op_context) { + int dims = NumDimensions(op_context->input); + const int* perm_data = GetTensorData(op_context->perm); - TransposeContext op_context(context, node); - int dims = NumDimensions(op_context.input); - - // Ensure validity of input tensor and permutation array. - TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type); - TF_LITE_ENSURE_EQ(context, dims, op_context.params->num_dimensions); - TF_LITE_ENSURE_MSG(context, dims <= 4, - "Transpose op only supports 1D-4D input arrays."); + // Ensure validity of the permutations tensor as a 1D tensor. + TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->perm), 1); + TF_LITE_ENSURE_EQ(context, op_context->perm->dims->data[0], dims); for (int idx = 0; idx < dims; ++idx) { - TF_LITE_ENSURE_MSG(context, - op_context.params->perm[idx] >= 0 && - op_context.params->perm[idx] < dims, + TF_LITE_ENSURE_MSG(context, (perm_data[idx] >= 0 && perm_data[idx] < dims), "Transpose op permutations array is out of bounds."); } // Determine size of output tensor. 
- const TfLiteIntArray* input_size = op_context.input->dims; - TfLiteIntArray* output_size = TfLiteIntArrayCreate(dims); + TfLiteIntArray* input_size = op_context->input->dims; + TfLiteIntArray* output_size = TfLiteIntArrayCopy(input_size); for (int idx = 0; idx < dims; ++idx) { - output_size->data[idx] = input_size->data[op_context.params->perm[idx]]; + output_size->data[idx] = input_size->data[perm_data[idx]]; } - return context->ResizeTensor(context, op_context.output, output_size); + return context->ResizeTensor(context, op_context->output, output_size); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + TransposeContext op_context(context, node); + + // Ensure validity of input tensor. + TF_LITE_ENSURE_MSG(context, NumDimensions(op_context.input) <= 4, + "Transpose op only supports 1D-4D input arrays."); + TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type); + + if (!IsConstantTensor(op_context.perm)) { + SetTensorToDynamic(op_context.output); + return kTfLiteOk; + } + return ResizeOutputTensor(context, &op_context); } template TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TransposeContext op_context(context, node); + // Resize the output tensor if the output tensor is dynamic. + if (IsDynamicTensor(op_context.output)) { + TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context)); + TfLiteTensorRealloc(op_context.output->bytes, op_context.output); + } + // Reverse the permuted axes and convert to 4D due to the way Dims are // constructed in GetTensorDims. + const int* perm_data = GetTensorData(op_context.perm); + const int size = op_context.perm->dims->data[0]; const int kOutputDimensionNum = 4; int reversed_perm[kOutputDimensionNum]; - int size = op_context.params->num_dimensions; + for (int output_k = 0, input_k = size - 1; output_k < size; ++output_k, --input_k) { - reversed_perm[output_k] = size - op_context.params->perm[input_k] - 1; + reversed_perm[output_k] = size - perm_data[input_k] - 1; } for (int k = size; k < kOutputDimensionNum; ++k) { reversed_perm[k] = k; diff --git a/tensorflow/contrib/lite/kernels/transpose_test.cc b/tensorflow/contrib/lite/kernels/transpose_test.cc index 7f5832cd5fa..337bc144b96 100644 --- a/tensorflow/contrib/lite/kernels/transpose_test.cc +++ b/tensorflow/contrib/lite/kernels/transpose_test.cc @@ -127,59 +127,110 @@ TEST(TransposeTest, TestRefOps4D) { class TransposeOpModel : public SingleOpModel { public: - TransposeOpModel(std::initializer_list input_shape, - std::initializer_list perm) { - input_ = AddInput(TensorType_FLOAT32); - output_ = AddOutput(TensorType_FLOAT32); - SetBuiltinOp( - BuiltinOperator_TRANSPOSE, BuiltinOptions_TransposeOptions, - CreateTransposeOptions(builder_, builder_.CreateVector(perm)) - .Union()); - BuildInterpreter({input_shape}); - } - void SetInput(std::initializer_list data) { PopulateTensor(input_, data); } + void SetPerm(std::initializer_list data) { + PopulateTensor(perm_, data); + } + std::vector GetOutput() { return ExtractVector(output_); } std::vector GetOutputShape() { return GetTensorShape(output_); } - private: + protected: int input_; + int perm_; int output_; }; +// Tests case where perm is a const tensor. 
+// +// Example usage is as follows: +// SpaceToBatchNDOpConstModel m(input_shape, perm_shape, perm_data); +// m.SetInput(input_data); +// m.Invoke(); +class TransposeOpConstModel : public TransposeOpModel { + public: + TransposeOpConstModel(std::initializer_list input_shape, + std::initializer_list perm_shape, + std::initializer_list perm) { + input_ = AddInput(TensorType_FLOAT32); + perm_ = AddConstInput(TensorType_INT32, perm, perm_shape); + output_ = AddOutput(TensorType_FLOAT32); + SetBuiltinOp(BuiltinOperator_TRANSPOSE, BuiltinOptions_TransposeOptions, + CreateTransposeOptions(builder_).Union()); + BuildInterpreter({input_shape}); + } +}; + +// Tests case where perm is a non-const tensor. +// +// Example usage is as follows: +// TransposeOpDynamicModel m(input_shape, perm_shape); +// m.SetInput(input_data); +// m.SetPerm(perm_data); +// m.Invoke(); +class TransposeOpDynamicModel : public TransposeOpModel { + public: + TransposeOpDynamicModel(std::initializer_list input_shape, + std::initializer_list perm_shape) { + input_ = AddInput(TensorType_FLOAT32); + perm_ = AddInput(TensorType_INT32); + output_ = AddOutput(TensorType_FLOAT32); + SetBuiltinOp(BuiltinOperator_TRANSPOSE, BuiltinOptions_TransposeOptions, + CreateTransposeOptions(builder_).Union()); + BuildInterpreter({input_shape, perm_shape}); + } +}; + TEST(TransposeTest, TestUnequalPermSize) { - EXPECT_DEATH(TransposeOpModel({1, 3, 3, 1}, {2, 2}), - "dims != op_context.params->num_dimensions"); + EXPECT_DEATH(TransposeOpConstModel({1, 3, 3, 1}, {2}, {2, 2}), "2 != 4"); } TEST(TransposeTest, TestPermOutOfBounds) { - EXPECT_DEATH(TransposeOpModel({1, 3, 3, 1}, {0, -1, -2, -3}), + EXPECT_DEATH(TransposeOpConstModel({1, 3, 3, 1}, {4}, {0, -1, -2, -3}), "Transpose op permutations array is out of bounds."); - EXPECT_DEATH(TransposeOpModel({1, 3, 3, 1}, {0, 1, 2, 4}), + EXPECT_DEATH(TransposeOpConstModel({1, 3, 3, 1}, {4}, {0, 1, 2, 4}), "Transpose op permutations array is out of bounds."); } -TEST(TransposeTest, Test1DInputTensor) { - TransposeOpModel m({3}, {0}); +TEST(TransposeTest, Test1DInputConstTensor) { + TransposeOpConstModel m({3}, {1}, {0}); m.SetInput({1, 2, 3}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3})); EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3})); } -TEST(TransposeTest, Test2DInputTensor) { - TransposeOpModel m({3, 2}, {1, 0}); +TEST(TransposeTest, Test1DInputDynamicTensor) { + TransposeOpDynamicModel m({3}, {1}); + m.SetInput({1, 2, 3}); + m.SetPerm({0}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3})); +} + +TEST(TransposeTest, Test2DInputConstTensor) { + TransposeOpConstModel m({3, 2}, {2}, {1, 0}); m.SetInput({0, 1, 2, 3, 4, 5}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3})); EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 2, 4, 1, 3, 5})); } -TEST(TransposeTest, Test3DInputTensor) { - TransposeOpModel m({2, 3, 4}, {2, 0, 1}); +TEST(TransposeTest, Test2DInputDynamicTensor) { + TransposeOpDynamicModel m({3, 2}, {2}); + m.SetInput({0, 1, 2, 3, 4, 5}); + m.SetPerm({1, 0}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 2, 4, 1, 3, 5})); +} + +TEST(TransposeTest, Test3DInputConstTensor) { + TransposeOpConstModel m({2, 3, 4}, {3}, {2, 0, 1}); m.SetInput({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}); m.Invoke(); @@ -189,29 +240,50 @@ TEST(TransposeTest, 
Test3DInputTensor) { 2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23})); } +TEST(TransposeTest, Test3DInputDynamicTensor) { + TransposeOpDynamicModel m({2, 3, 4}, {3}); + m.SetInput({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}); + m.SetPerm({2, 0, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 3})); + EXPECT_THAT(m.GetOutput(), + ElementsAreArray({0, 4, 8, 12, 16, 20, 1, 5, 9, 13, 17, 21, + 2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23})); +} + TEST(TransposeTest, Test5DInputTensor) { - EXPECT_DEATH(TransposeOpModel({1, 2, 3, 4, 5}, {0, 1, 2, 3, 4}), + EXPECT_DEATH(TransposeOpConstModel({1, 2, 3, 4, 5}, {5}, {0, 1, 2, 3, 4}), "Transpose op only supports 1D-4D input arrays."); } -TEST(TransposeTest, SimpleTestNoReorder) { - TransposeOpModel m({1, 2, 3, 1}, {0, 1, 2, 3}); +TEST(TransposeTest, SimpleTestNoReorderConstTensor) { + TransposeOpConstModel m({1, 2, 3, 1}, {4}, {0, 1, 2, 3}); m.SetInput({1, 2, 3, 4, 5, 6}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 3, 1})); EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6})); } -TEST(TransposeTest, SimpleTestWithReorder) { - TransposeOpModel m({1, 2, 3, 1}, {2, 1, 3, 0}); +TEST(TransposeTest, SimpleTestNoReorderDynamicTensor) { + TransposeOpDynamicModel m({1, 2, 3, 1}, {4}); + m.SetInput({1, 2, 3, 4, 5, 6}); + m.SetPerm({0, 1, 2, 3}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 3, 1})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6})); +} + +TEST(TransposeTest, SimpleTestWithReorderConstTensor) { + TransposeOpConstModel m({1, 2, 3, 1}, {4}, {2, 1, 3, 0}); m.SetInput({1, 2, 3, 4, 5, 6}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2, 1, 1})); EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 4, 2, 5, 3, 6})); } -TEST(TransposeTest, ComplexTestWithReorder) { - TransposeOpModel m({2, 3, 4, 5}, {2, 0, 1, 3}); +TEST(TransposeTest, ComplexTestWithReorderConstTensor) { + TransposeOpConstModel m({2, 3, 4, 5}, {4}, {2, 0, 1, 3}); m.SetInput({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, @@ -237,6 +309,34 @@ TEST(TransposeTest, ComplexTestWithReorder) { EXPECT_THAT(m.GetOutput(), result); } +TEST(TransposeTest, ComplexTestWithReorderDynamicTensor) { + TransposeOpDynamicModel m({2, 3, 4, 5}, {4}); + m.SetInput({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, + 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119}); + m.SetPerm({2, 0, 1, 3}); + m.Invoke(); + + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 3, 5})); + auto result = ElementsAreArray( + {0, 1, 2, 3, 4, 20, 21, 22, 23, 24, 40, 41, 42, 43, 44, + 60, 61, 62, 63, 64, 80, 81, 82, 83, 84, 100, 101, 102, 103, 104, + 5, 6, 7, 8, 9, 25, 26, 27, 28, 29, 45, 46, 47, 48, 49, + 65, 66, 67, 68, 69, 85, 86, 87, 88, 89, 105, 106, 107, 108, 109, + 10, 11, 12, 13, 14, 30, 31, 32, 33, 34, 50, 51, 52, 53, 54, + 70, 71, 72, 73, 74, 90, 91, 92, 93, 94, 110, 111, 112, 113, 114, + 15, 16, 17, 18, 19, 35, 36, 37, 38, 39, 55, 56, 57, 58, 59, + 75, 76, 77, 78, 79, 
95, 96, 97, 98, 99, 115, 116, 117, 118, 119}); + EXPECT_THAT(m.GetOutput(), result); +} + } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc index f5f1ec2cf3f..ac00c37b67d 100644 --- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc +++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc @@ -15,14 +15,15 @@ limitations under the License. #include #include #include -#include #include +#include #include #include #include "tensorflow/contrib/lite/builtin_op_data.h" #include "tensorflow/contrib/lite/context.h" #include "tensorflow/contrib/lite/kernels/activation_functor.h" +#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" namespace tflite { @@ -82,48 +83,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { output_size_array->data[0] = (time_major) ? max_time : batch_size; output_size_array->data[1] = (time_major) ? batch_size : max_time; output_size_array->data[2] = num_units; - TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, output, - output_size_array)); + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, output, output_size_array)); return kTfLiteOk; } -namespace { -void RnnStep(const float* input_ptr_batch, const float* input_weights_ptr, - const float* recurrent_weights_ptr, const float* bias_ptr, - int input_size, int num_units, int input_weights_stride, - int recurrent_weights_stride, TfLiteFusedActivation activation, - float* hidden_state_ptr_batch, float* output_ptr_batch) { - // Output = bias - for (int o = 0; o < num_units; o++) { - output_ptr_batch[o] = bias_ptr[o]; - } - - // Output += input * input_weights - for (int o = 0; o < num_units; o++) { - for (int i = 0; i < input_size; i++) { - output_ptr_batch[o] += input_ptr_batch[i] * input_weights_ptr[i]; - } - input_weights_ptr += input_weights_stride; - } - - // Output += recurrent_weights * hidden_state - for (int o = 0; o < num_units; o++) { - for (int h = 0; h < num_units; h++) { - output_ptr_batch[o] += - hidden_state_ptr_batch[h] * recurrent_weights_ptr[h]; - } - recurrent_weights_ptr += recurrent_weights_stride; - } - - // Output = activation(Output) and update hidden_state - for (int o = 0; o < num_units; o++) { - output_ptr_batch[o] = (ActivationFunctor(activation))(output_ptr_batch[o]); - hidden_state_ptr_batch[o] = output_ptr_batch[o]; - } -} -} // namespace - TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { auto* params = reinterpret_cast(node->builtin_data); @@ -147,30 +112,25 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { (time_major) ? input->dims->data[0] : input->dims->data[1]; const int num_units = input_weights->dims->data[0]; const int input_size = input->dims->data[2]; - const int input_weights_stride = input_weights->dims->data[1]; - const int recurrent_weights_stride = recurrent_weights->dims->data[1]; // Initialize input_weights and recurrent_weights. const float* input_weights_ptr = input_weights->data.f; const float* recurrent_weights_ptr = recurrent_weights->data.f; if (time_major) { - // Unroll the sequence + // Initialize the pointer to hidden state. + float* hidden_state_ptr_batch = hidden_state->data.f; + // Unroll the sequence and use batch batch operations for efficiency. for (int s = 0; s < max_time; s++) { - for (int b = 0; b < batch_size; b++) { - // Initialize the pointer to hidden state. 
- float* hidden_state_ptr_batch = hidden_state->data.f + b * num_units; - // Initialize the pointer to input and output. - const float* input_ptr_batch = - input->data.f + s * input_size * batch_size + b * input_size; - float* output_ptr_batch = - output->data.f + s * num_units * batch_size + b * num_units; + // Initialize the pointer to input and output. + const float* input_ptr_batch = + input->data.f + s * input_size * batch_size; + float* output_ptr_batch = output->data.f + s * num_units * batch_size; - RnnStep(input_ptr_batch, input_weights_ptr, recurrent_weights_ptr, - bias_ptr, input_size, num_units, input_weights_stride, - recurrent_weights_stride, params->activation, - hidden_state_ptr_batch, output_ptr_batch); - } + kernel_utils::RnnBatchStep(input_ptr_batch, input_weights_ptr, + recurrent_weights_ptr, bias_ptr, input_size, + num_units, batch_size, params->activation, + hidden_state_ptr_batch, output_ptr_batch); } } else { // For each batch @@ -184,10 +144,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { float* output_ptr_batch = output->data.f + b * num_units * max_time + s * num_units; - RnnStep(input_ptr_batch, input_weights_ptr, recurrent_weights_ptr, - bias_ptr, input_size, num_units, input_weights_stride, - recurrent_weights_stride, params->activation, - hidden_state_ptr_batch, output_ptr_batch); + kernel_utils::RnnBatchStep( + input_ptr_batch, input_weights_ptr, recurrent_weights_ptr, bias_ptr, + input_size, num_units, /*batch_size=*/1, params->activation, + hidden_state_ptr_batch, output_ptr_batch); } } } diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc index 82c680ec3d8..7e32969763b 100644 --- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc +++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc @@ -14,8 +14,8 @@ limitations under the License. ==============================================================================*/ // Unit test for TFLite Sequential RNN op. -#include #include +#include #include #include @@ -120,8 +120,7 @@ static float rnn_golden_output[] = { 0.415153, 0.210318, 0, 0, 0, 0, 0, 2.02616, 0, 0.728256, 0.84183, 0.0907453, - 0.628881, 3.58099, 1.49974, 0 -}; + 0.628881, 3.58099, 1.49974, 0}; class UnidirectionalRNNOpModel : public SingleOpModel { public: diff --git a/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh b/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh new file mode 100755 index 00000000000..b58ae266017 --- /dev/null +++ b/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh @@ -0,0 +1,81 @@ +#!/bin/bash -x +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +set -e + +echo "Starting" +TFLITE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/.." 
+ +TMP_DIR=$(mktemp -d) +echo "Package dir: " $TMP_DIR +FW_DIR=$TMP_DIR/tensorflow_lite_ios_frameworks +FW_DIR_TFLITE=$FW_DIR/tensorflow_lite.framework +FW_DIR_TFLITE_HDRS=$FW_DIR_TFLITE/Headers + +echo "Creating target Headers directories" +mkdir -p $FW_DIR_TFLITE_HDRS + +echo "Headers, populating: TensorFlow Lite" +cd $TFLITE_DIR/../../.. + +find tensorflow/contrib/lite -name '*.h' \ + -not -path 'tensorflow/contrib/lite/downloads/*' \ + -not -path 'tensorflow/contrib/lite/examples/*' \ + -not -path 'tensorflow/contrib/lite/gen/*' \ + -not -path 'tensorflow/contrib/lite/toco/*' \ + -not -path 'tensorflow/contrib/lite/nnapi/*' \ + -not -path 'tensorflow/contrib/lite/java/*' \ + | tar -cf $FW_DIR_TFLITE_HDRS/tmp.tar -T - +cd $FW_DIR_TFLITE_HDRS +tar xf tmp.tar +rm -f tmp.tar + +echo "Headers, populating: Flatbuffer" +cd $TFLITE_DIR/downloads/flatbuffers/include/ +find . -name '*.h' | tar -cf $FW_DIR_TFLITE_HDRS/tmp.tar -T - +cd $FW_DIR_TFLITE_HDRS +tar xf tmp.tar +rm -f tmp.tar + +cd $TFLITE_DIR/../../.. +echo "Generate master LICENSE file and copy to target" +bazel build //tensorflow/tools/lib_package:clicenses_generate +cp $TFLITE_DIR/../../../bazel-genfiles/tensorflow/tools/lib_package/include/tensorflow/c/LICENSE \ + $FW_DIR_TFLITE + +echo "Copying static libraries" +cp $TFLITE_DIR/gen/lib/libtensorflow-lite.a \ + $FW_DIR_TFLITE/tensorflow_lite + +# This is required, otherwise they interfere with the documentation of the +# pod at cocoapods.org. +echo "Remove all README files" +cd $FW_DIR_TFLITE_HDRS +find . -type f -name README\* -exec rm -f {} \; +find . -type f -name readme\* -exec rm -f {} \; + +TARGET_GEN_LOCATION="$TFLITE_DIR/gen/ios_frameworks" +echo "Moving results to target: " $TARGET_GEN_LOCATION +cd $FW_DIR +zip -q -r tensorflow_lite.framework.zip tensorflow_lite.framework -x .DS_Store +rm -rf $TARGET_GEN_LOCATION +mkdir -p $TARGET_GEN_LOCATION +cp -r tensorflow_lite.framework.zip $TARGET_GEN_LOCATION + +echo "Cleaning up" +rm -rf $TMP_DIR + +echo "Finished" diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc index 415d984ad8c..b36bfcef843 100644 --- a/tensorflow/contrib/lite/model.cc +++ b/tensorflow/contrib/lite/model.cc @@ -328,6 +328,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type, builtin_data = reinterpret_cast(params); break; } + case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN: case BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN: { TfLiteSequenceRNNParams* params = MallocPOD(); if (auto* sequence_rnn_params = @@ -515,62 +516,18 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type, break; } case BuiltinOperator_SPACE_TO_BATCH_ND: { - auto* params = MallocPOD(); - if (auto* schema_params = - op->builtin_options_as_SpaceToBatchNDOptions()) { - const auto& block_shape = schema_params->block_shape(); - FlatBufferIntVectorToArray(sizeof(params->block_shape), block_shape, - params->block_shape, error_reporter); - const auto& before_paddings = schema_params->before_paddings(); - FlatBufferIntVectorToArray(sizeof(params->before_paddings), - before_paddings, params->before_paddings, - error_reporter); - const auto& after_paddings = schema_params->after_paddings(); - FlatBufferIntVectorToArray(sizeof(params->after_paddings), - after_paddings, params->after_paddings, - error_reporter); - params->num_spatial_dimensions = block_shape->Length(); - } - builtin_data = reinterpret_cast(params); break; } case BuiltinOperator_BATCH_TO_SPACE_ND: { - auto* params = MallocPOD(); - if (auto* schema_params = - 
op->builtin_options_as_BatchToSpaceNDOptions()) { - const auto& block_shape = schema_params->block_shape(); - FlatBufferIntVectorToArray(sizeof(params->block_shape), block_shape, - params->block_shape, error_reporter); - const auto& before_crops = schema_params->before_crops(); - FlatBufferIntVectorToArray(sizeof(params->before_crops), before_crops, - params->before_crops, error_reporter); - const auto& after_crops = schema_params->after_crops(); - FlatBufferIntVectorToArray(sizeof(params->after_crops), after_crops, - params->after_crops, error_reporter); - params->num_spatial_dimensions = block_shape->Length(); - } - builtin_data = reinterpret_cast(params); break; } case BuiltinOperator_TRANSPOSE: { - auto* params = MallocPOD(); - if (auto* schema_params = op->builtin_options_as_TransposeOptions()) { - const auto& perm = schema_params->perm(); - FlatBufferIntVectorToArray(sizeof(params->perm), perm, params->perm, - error_reporter); - params->num_dimensions = perm->Length(); - } - builtin_data = reinterpret_cast(params); break; } case BuiltinOperator_MEAN: { auto* params = MallocPOD(); if (auto* schema_params = op->builtin_options_as_MeanOptions()) { - const auto& axis = schema_params->axis(); - FlatBufferIntVectorToArray(sizeof(params->axis), axis, params->axis, - error_reporter); params->keep_dims = schema_params->keep_dims(); - params->num_axis_dimensions = axis->Length(); } builtin_data = reinterpret_cast(params); break; diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h index 7019c29959f..76032771af2 100644 --- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h +++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h @@ -1571,7 +1571,7 @@ inline int ANeuralNetworksModel_addOperation(ANeuralNetworksModel* model, } /** - * Specfifies which operands will be the model's inputs and outputs. + * Specifies which operands will be the model's inputs and outputs. * * An operand cannot be used for both input and output. Doing so will * return an error. 
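
Note: the following standalone C++ sketch is not part of the patch; it restates the output-shape arithmetic that the new `ResizeOutputTensor` in `space_to_batch_nd.cc` (hunks above) performs once `block_shape` and `paddings` arrive as input tensors rather than builtin params. The helper name and the use of `std::vector` in place of `TfLiteIntArray` are illustrative only.

```cpp
// Each spatial dimension is padded, then divided by the block size, and the
// batch dimension grows by the block volume; channels are unchanged. This
// mirrors the kernel's arithmetic but is a sketch, not the TF Lite API.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int32_t> SpaceToBatchNDOutputShape(
    const std::vector<int32_t>& input_shape,  // NHWC, size 4
    const std::vector<int32_t>& block_shape,  // size 2
    const std::vector<int32_t>& paddings) {   // {h_before, h_after, w_before, w_after}
  assert(input_shape.size() == 4 && block_shape.size() == 2 && paddings.size() == 4);

  std::vector<int32_t> output_shape = input_shape;
  int32_t batch = input_shape[0];
  for (int dim = 0; dim < 2; ++dim) {
    // The padded spatial extent must divide evenly by the block size, which
    // the kernel enforces with TF_LITE_ENSURE_EQ(final_dim_size % block, 0).
    const int32_t final_dim_size =
        input_shape[dim + 1] + paddings[dim * 2] + paddings[dim * 2 + 1];
    assert(final_dim_size % block_shape[dim] == 0);
    output_shape[dim + 1] = final_dim_size / block_shape[dim];
    batch *= block_shape[dim];
  }
  output_shape[0] = batch;
  return output_shape;
}

int main() {
  // Matches SimplePaddingConstTest: input {1,5,2,1}, block {3,2},
  // paddings {1,0,2,0} -> output {6,2,2,1}.
  for (int32_t d : SpaceToBatchNDOutputShape({1, 5, 2, 1}, {3, 2}, {1, 0, 2, 0}))
    std::cout << d << " ";
  std::cout << "\n";
  return 0;
}
```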
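Similarly, a standalone sketch (again not part of the patch) of the per-dimension rule applied by the new `ResizeOutputTensor` in `strided_slice.cc`, including the `shrink_axis_mask` handling exercised by the added tests. For brevity it assumes non-negative begin/end indices and positive strides; the kernel additionally honors `begin_mask`/`end_mask` and remaps negative indices via `ClampedIndex`.

```cpp
// dim_shape = ceil((end - begin) / stride), floored at 0; a set bit in
// shrink_axis_mask drops that dimension from the output shape entirely.
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int32_t> StridedSliceOutputShape(const std::vector<int32_t>& input_shape,
                                             const std::vector<int32_t>& begin,
                                             const std::vector<int32_t>& end,
                                             const std::vector<int32_t>& strides,
                                             int shrink_axis_mask) {
  std::vector<int32_t> output_shape;
  for (size_t idx = 0; idx < input_shape.size(); ++idx) {
    int32_t dim_shape = static_cast<int32_t>(
        std::ceil((end[idx] - begin[idx]) / static_cast<float>(strides[idx])));
    if (dim_shape < 0) dim_shape = 0;
    if (!(shrink_axis_mask & (1 << idx))) {
      output_shape.push_back(dim_shape);
    }
  }
  return output_shape;
}

int main() {
  // Mirrors In2D_ShrinkAxisMask2: a {2, 3} input with begin {0, 0}, end {2, 3},
  // strides {1, 1} and shrink_axis_mask = 2 keeps only the first dimension,
  // giving output shape {2} (values {1, 4} in the test).
  for (int32_t d : StridedSliceOutputShape({2, 3}, {0, 0}, {2, 3}, {1, 1}, 2))
    std::cout << d << " ";
  std::cout << "\n";
  return 0;
}
```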
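Finally, a small sketch of the axis reversal that `transpose.cc`'s `Eval` now derives from the `perm` input tensor before calling the reference/optimized kernels. The surrounding `Dims<4>` plumbing is omitted, so this shows only the index arithmetic; the function name is illustrative.

```cpp
// reversed_perm[output_k] = size - perm[input_k] - 1, walking perm from its
// last element to its first, then identity for the axes padded out to 4D.
#include <array>
#include <iostream>
#include <vector>

std::array<int, 4> ReversedPerm(const std::vector<int>& perm) {
  const int size = static_cast<int>(perm.size());
  std::array<int, 4> reversed_perm{};
  for (int output_k = 0, input_k = size - 1; output_k < size;
       ++output_k, --input_k) {
    reversed_perm[output_k] = size - perm[input_k] - 1;
  }
  for (int k = size; k < 4; ++k) {
    reversed_perm[k] = k;  // trailing padded axes stay in place
  }
  return reversed_perm;
}

int main() {
  // For the 3D test permutation {2, 0, 1} this yields {1, 2, 0, 3}.
  for (int v : ReversedPerm({2, 0, 1})) std::cout << v << " ";
  std::cout << "\n";
  return 0;
}
```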
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc index d5b9319407a..da9ceec2f14 100644 --- a/tensorflow/contrib/lite/nnapi_delegate.cc +++ b/tensorflow/contrib/lite/nnapi_delegate.cc @@ -319,6 +319,7 @@ void AddOpsAndParams(tflite::Interpreter* interpreter, case tflite::BuiltinOperator_SVDF: case tflite::BuiltinOperator_HASHTABLE_LOOKUP: case tflite::BuiltinOperator_RNN: + case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN: case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN: case tflite::BuiltinOperator_EMBEDDING_LOOKUP: case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE: diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD index 3d6a3ec0fd4..2d8c49b7d7a 100644 --- a/tensorflow/contrib/lite/python/BUILD +++ b/tensorflow/contrib/lite/python/BUILD @@ -13,6 +13,7 @@ py_library( srcs_version = "PY2AND3", visibility = ["//visibility:public"], deps = [ + ":op_hint", "//tensorflow/contrib/lite/toco:model_flags_proto_py", "//tensorflow/contrib/lite/toco:toco_flags_proto_py", "//tensorflow/contrib/lite/toco/python:tensorflow_wrap_toco", @@ -20,6 +21,17 @@ py_library( ], ) +py_library( + name = "op_hint", + srcs = ["op_hint.py"], + srcs_version = "PY2AND3", + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/contrib/framework:framework_py", + "//tensorflow/python:platform", + ], +) + py_test( name = "lite_test", srcs = ["lite_test.py"], @@ -27,6 +39,7 @@ py_test( tags = ["no_oss"], deps = [ ":lite", + ":op_hint", "//tensorflow/python:array_ops", "//tensorflow/python:client_testlib", "//tensorflow/python:dtypes", diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py index 3c369774bed..5d2f2165376 100644 --- a/tensorflow/contrib/lite/python/lite.py +++ b/tensorflow/contrib/lite/python/lite.py @@ -18,16 +18,21 @@ EXPERIMENTAL: APIs here are unstable and likely to change without notice. 
@@toco_convert @@toco_convert_protos +@@OpHint +@@convert_op_hints_to_stubs """ from __future__ import absolute_import from __future__ import division from __future__ import print_function - import os import subprocess import tempfile +# pylint: disable=unused-import +from tensorflow.contrib.lite.python.op_hint import convert_op_hints_to_stubs +from tensorflow.contrib.lite.python.op_hint import OpHint +# pylint: enable=unused-import from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2 from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2 from tensorflow.contrib.lite.toco import types_pb2 as _types_pb2 diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/lite_test.py index 7d55f3fe6fe..b8b4510188b 100644 --- a/tensorflow/contrib/lite/python/lite_test.py +++ b/tensorflow/contrib/lite/python/lite_test.py @@ -18,10 +18,14 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib.lite.python import lite +from tensorflow.contrib.lite.python.op_hint import _tensor_name_base as _tensor_name_base from tensorflow.python.client import session from tensorflow.python.framework import dtypes from tensorflow.python.framework import test_util +from tensorflow.python.framework.graph_util_impl import _bfs_for_reachable_nodes +from tensorflow.python.framework.graph_util_impl import _extract_graph_summary from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops from tensorflow.python.platform import test @@ -35,7 +39,8 @@ class LiteTest(test_util.TensorFlowTestCase): # Try running on valid graph result = lite.toco_convert(sess.graph_def, [in_tensor], [out_tensor]) self.assertTrue(result) - # TODO(aselle): remove tests that fail. + # TODO(aselle): remove tests that fail (we must get TOCO to not fatal + # all the time). # Try running on identity graph (known fail) # with self.assertRaisesRegexp(RuntimeError, "!model->operators.empty()"): # result = lite.toco_convert(sess.graph_def, [in_tensor], [in_tensor]) @@ -51,5 +56,116 @@ class LiteTest(test_util.TensorFlowTestCase): quantized_input_stats=[(0., 1.)]) self.assertTrue(result) + +class LiteTestOpHint(test_util.TensorFlowTestCase): + """Test the hint to stub functionality.""" + + def _getGraphOpTypes(self, graphdef, output_nodes): + """Returns used op types in `graphdef` reachable from `output_nodes`. + + This is used to check that after the stub transformation the expected + nodes are there. Typically use this with self.assertCountEqual(...). + + NOTE: this is not a exact test that the graph is the correct output, but + it balances compact expressibility of test with sanity checking. + + Args: + graphdef: TensorFlow proto graphdef. + output_nodes: A list of output node names that we need to reach. + + Returns: + A set of node types reachable from `output_nodes`. + """ + name_to_input_name, name_to_node, _ = ( + _extract_graph_summary(graphdef)) + # Find all nodes that are needed by the outputs + used_node_names = _bfs_for_reachable_nodes(output_nodes, name_to_input_name) + return set([name_to_node[node_name].op for node_name in used_node_names]) + + def _countIdentities(self, nodes): + """Count the number of "Identity" op types in the list of proto nodes. + + Args: + nodes: NodeDefs of the graph. + + Returns: + The number of nodes with op type "Identity" found. 
+ """ + return len([x for x in nodes if x.op == "Identity"]) + + def testSwishLiteHint(self): + """Makes a custom op swish and makes sure it gets converted as a unit.""" + image = array_ops.constant([1., 2., 3., 4.]) + swish_scale = array_ops.constant(1.0) + + def _swish(input_tensor, scale): + custom = lite.OpHint("cool_activation") + input_tensor, scale = custom.add_inputs(input_tensor, scale) + output = math_ops.sigmoid(input_tensor) * input_tensor * scale + output, = custom.add_outputs(output) + return output + output = array_ops.identity(_swish(image, swish_scale), name="ModelOutput") + + with self.test_session() as sess: + # check if identities have been put into the graph (2 input, 1 output, + # and 1 final output). + self.assertEqual(self._countIdentities(sess.graph_def.node), 4) + + stubbed_graphdef = lite.convert_op_hints_to_stubs(sess) + + self.assertCountEqual( + self._getGraphOpTypes( + stubbed_graphdef, output_nodes=[_tensor_name_base(output)]), + ["cool_activation", "Const", "Identity"]) + + def testScaleAndBiasAndIdentity(self): + """This tests a scaled add which has 3 inputs and 2 outputs.""" + a = array_ops.constant(1.) + x = array_ops.constant([2., 3.]) + b = array_ops.constant([4., 5.]) + + def _scaled_and_bias_and_identity(a, x, b): + custom = lite.OpHint("scale_and_bias_and_identity") + a, x, b = custom.add_inputs(a, x, b) + return custom.add_outputs(a * x + b, x) + output = array_ops.identity(_scaled_and_bias_and_identity(a, x, b), + name="ModelOutput") + + with self.test_session() as sess: + # make sure one identity for each input (3) and output (2) => 3 + 2 = 5 + # +1 for the final output + self.assertEqual(self._countIdentities(sess.graph_def.node), 6) + + stubbed_graphdef = lite.convert_op_hints_to_stubs(sess) + + self.assertCountEqual( + self._getGraphOpTypes( + stubbed_graphdef, output_nodes=[_tensor_name_base(output)]), + ["scale_and_bias_and_identity", "Const", "Identity", "Pack"]) + + def testTwoFunctions(self): + """Tests if two functions are converted correctly.""" + a = array_ops.constant([1.]) + b = array_ops.constant([1.]) + def _double_values(x): + custom = lite.OpHint("add_test") + x = custom.add_inputs(x) + output = math_ops.multiply(x, x) + output, = custom.add_outputs(output) + return output + output = array_ops.identity( + math_ops.add(_double_values(a), _double_values(b)), name="ModelOutput") + + with self.test_session() as sess: + # make sure one identity for each input (2) and output (2) => 2 + 2 + # +1 for the final output + self.assertEqual(self._countIdentities(sess.graph_def.node), 5) + stubbed_graphdef = lite.convert_op_hints_to_stubs(sess) + self.assertCountEqual( + self._getGraphOpTypes( + stubbed_graphdef, output_nodes=[_tensor_name_base(output)]), + ["add_test", "Const", "Identity", "Add"]) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/lite/python/op_hint.py b/tensorflow/contrib/lite/python/op_hint.py new file mode 100644 index 00000000000..7c587e38b16 --- /dev/null +++ b/tensorflow/contrib/lite/python/op_hint.py @@ -0,0 +1,291 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Define tflite op hints (intrinsic operations). + +This essentially allows defining a TensorFlow API for tflite operations in +Python with hints on how they are represented in TensorFlow Lite. This basically +is a form of tflite intrinsic. It wraps a subpart of a TensorFlow execution +graph and is useful for LSTMs and other complicated TensorFlow constructions +that are difficult to pattern match in TOCO, but are represented by a single +accelerated tflite op. + +Example: + def tflite_cool_activation(input): + # A cool activation function. + custom = tf.contrib.lite.OpHint("cool_activation") + input = custom.add_inputs(input) + output = tf.sigmoid(input) * input + custom.add_outputs(output) + return output + + image = tf.placeholder(tf.float32, (1, 16, 16, 1)) + output = tf.identity(tflite_cool_activation(image)) + + session = tf.Session() + + graphdef_to_convert = tf.contrib.lite.convert_op_hints_to_stubs(session) + tflite_graph = tf.contrib.lite.toco_convert(graphdef_to_convert, + [image], [output]) + [image], [output]) + with open("/tmp/graph.fb", "wb") as fp: + fp.write(tflite_graph) + +How does it work?: + +OpHint is a helper that you use when defining a vanilla python function. +It allows you to wrap arguments with tf.identities with some custom attributes. +These attributes allow you to find the original block of ops that was created. +For example, if you use cool_activation above you essentially get: + +a_input = tf.identity() +result = tf.multiply(tf.sigmoid(a_input), a_input) +output = tf.identity() + +a_input, output are identities that have parameters representing +what argument they are, what the name of the function they should turn into +in tf lite as well as a guid that uniquely identifies a particular invocation. + +Once you have built your whole tensorflow graph, you can run it and train it +as usual, but after you have done that, you need to convert the graph into +a form that replaces these subgraphs wrapped in identities to stub ops. These +ops don't actually exist in the normal TensorFlow runtime, but will be +understood by toco later. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections as _collections +import itertools as _itertools +import uuid as _uuid + +from tensorflow.contrib import framework as _framework +from tensorflow.python.framework import ops as _ops +from tensorflow.python.ops import array_ops as _array_ops +from tensorflow.python.util.all_util import remove_undocumented + + +class OpHint(object): + """A class that helps build tflite function invocations. + + It allows you to take a bunch of TensorFlow ops and annotate the construction + such that toco knows how to convert it to tflite. This embeds a pseudo + function in a TensorFlow graph. This allows embedding high-level API usage + information in a lower level TensorFlow implementation so that an alternative + implementation can be substituted later. 
+ + Essentially, any "input" into this pseudo op is fed into an identity, and + attributes are added to that input before being used by the constituent ops + that make up the pseudo op. A similar process is done to any output that + is to be exported from the current op. + + TODO(aselle): When TensorFlow functions functionality works for arbitrary + constructs, this mechanism can be retired and changed to use python defun's. + """ + + # Attr constants that are used for representation in the GraphDef + FUNCTION_NAME_ATTR = "_tflite_function_name" + FUNCTION_UUID_ATTR = "_tflite_function_uuid" + FUNCTION_INPUT_INDEX_ATTR = "_tflite_function_input_index" + FUNCTION_OUTPUT_INDEX_ATTR = "_tflite_function_output_index" + + def __init__(self, function_name, **kwargs): + """Create a OpHint. + + Args: + function_name: Name of the function (the custom op name in tflite) + **kwargs: Keyword arguments of any constant attributes for the function. + """ + self._function_name = function_name + self._unique_function_id = _uuid.uuid1().hex # TODO(aselle): Unique enough? + self._curr_input_index = 0 + self._curr_output_index = 0 + self._attrs_to_store_later = kwargs + self._stored_attrs = False + + def _setattr(self, dest_op, name, value): + tensor_value = _ops.convert_to_tensor(value) + dest_op.op.node_def.attr[name].tensor.CopyFrom( + tensor_value.op.node_def.attr["value"].tensor) + + def add_inputs(self, *args): + """Add a sequence of inputs to the function invocation. + + Args: + *args: List of inputs to be converted (should be Tf.Tensor). + Returns: + Wrapped inputs (identity standins that have additional metadata). These + are also are also tf.Tensor's. + """ + + def augmented_identity(arg): + identity_op = _array_ops.identity(arg) + attr = identity_op.op.node_def.attr + attr[OpHint.FUNCTION_NAME_ATTR].s = self._function_name + attr[OpHint.FUNCTION_UUID_ATTR].s = self._unique_function_id + attr[OpHint.FUNCTION_INPUT_INDEX_ATTR].i = self._curr_input_index + self._curr_input_index += 1 + return identity_op + + return [augmented_identity(arg) for arg in args] + + def add_outputs(self, *args): + """Add a sequence of outputs to the function invocation. + + Args: + *args: List of outputs to be converted (should be tf.Tensor). + Returns: + Wrapped outputs (identity standins that have additional metadata). These + are also tf.Tensor's. + """ + + def augmented_identity(arg): + identity_op = _array_ops.identity(arg) + attr = identity_op.op.node_def.attr + attr[OpHint.FUNCTION_NAME_ATTR].s = self._function_name + attr[OpHint.FUNCTION_UUID_ATTR].s = self._unique_function_id + attr[OpHint.FUNCTION_OUTPUT_INDEX_ATTR].i = self._curr_output_index + self._curr_output_index += 1 + return identity_op + + wrapped_outputs = [augmented_identity(arg) for arg in args] + + if not self._stored_attrs: + for key, value in self._attrs_to_store_later.iteritems(): + self._setattr(wrapped_outputs[0], "_tflite_attr_" + key, value) + self._stored_attrs = True + + return wrapped_outputs + + +class _LiteFuncCall(object): + """Represent a TensorFlow Lite custom function. + + This is uses to accumulate found hints in the graphdef into a single + conceptual unit. + + Properties: + self.inputs: inputs to the op (hash from index # to argument) + self.outputs: outputs to the op (hash from index # to argument) + self.function_name: the tflite custom op name to use + self.uuid: a unique call id for this particular call (i.e. + multiple function calls would have the same function_name but different + uuids. 
+ self.params: A param name to key value for op constant data. I.e. for + axis on a reduction, strides on a convolution, etc. + """ + + def __init__(self): + self.inputs = {} + self.outputs = {} + self.function_name = None + self.uuid = None + self.params = {} + + def __str__(self): + return "tflite function %s call %s\n\tinputs: %r\n\toutputs: %r" % ( + self.function_name, self.uuid, self.inputs, self.outputs) + + +def _find_all_hints_in_graph_def(session): + """Look at the current default graph and return a list of LiteFuncCall objs. + + Args: + session: A TensorFlow session that contains the graph to convert. + Returns: + a list of `LifeFuncCall` objects in the form + + """ + func_calls = _collections.defaultdict(_LiteFuncCall) + seen_ops = set() + + for op in session.graph.get_operations(): + for operand in _itertools.chain(op.inputs, op.outputs): + if operand in seen_ops: + continue + seen_ops.add(operand) + attr = operand.op.node_def.attr + uuid = attr[OpHint.FUNCTION_UUID_ATTR].s + if OpHint.FUNCTION_UUID_ATTR not in attr: + continue + call_def = func_calls[uuid] + call_def.uuid = uuid + if OpHint.FUNCTION_UUID_ATTR in attr: + call_def.function_name = attr[OpHint.FUNCTION_NAME_ATTR].s + if OpHint.FUNCTION_INPUT_INDEX_ATTR in attr: + call_def.inputs[attr[OpHint.FUNCTION_INPUT_INDEX_ATTR].i] = operand + if OpHint.FUNCTION_OUTPUT_INDEX_ATTR in attr: + call_def.outputs[attr[OpHint.FUNCTION_OUTPUT_INDEX_ATTR].i] = operand + + for a in attr: + if a.startswith("_tflite_attr_"): + # TODO(aselle): Remember the attribute tensors so we can put them + # in collapse. + call_def.params[a.replace("_tflite_attr_,", "")] = attr[a].tensor + + return func_calls + + +def _tensor_name_base(full_tensor_name): + """Removes the device assignment code from a tensor. + + e.g. _tensor_name_base("foo:3") => "foo" + + Args: + full_tensor_name: A tensor name that is annotated with a device placement + (this is what tensor flow introspection gives). + Returns: + A name without any device assignment. + """ + return full_tensor_name.name.split(":")[0] + + +def convert_op_hints_to_stubs(session): + """Converts a graphdef with LiteOp hints into stub operations. + + This is used to prepare for toco conversion of complex intrinsic usages. + + Args: + session: A TensorFlow session that contains the graph to convert. + Returns: + A new graphdef with all ops contained in OpHints being replaced by + a single op call with the right parameters. 
+ """ + hints = _find_all_hints_in_graph_def(session) + current_graph_def = session.graph_def + for call in hints.values(): + input_names = [None] * len(call.inputs) + output_names = [None] * len(call.outputs) + output_dtypes = [None] * len(call.outputs) + output_quantized = False + for input_index, tensor in call.inputs.items(): + input_names[input_index] = _tensor_name_base(tensor) + for output_index, tensor in call.outputs.items(): + output_names[output_index] = _tensor_name_base(tensor) + output_dtypes[output_index] = tensor.dtype.as_datatype_enum + # TODO(aselle): Support quantized flag properly + current_graph_def = _framework.fuse_op( + current_graph_def, input_names, output_names, output_dtypes, + output_quantized, call.uuid, call.function_name) + for node in current_graph_def.node: + if node.name == call.uuid: + for param, tensor in call.params.items(): + node.attr[param].tensor.CopyFrom(tensor) + return current_graph_def + + +_allowed_symbols = ["OpHint", "convert_op_hints_to_stubs"] +remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs index ec202cd4073..c0b220e8721 100644 --- a/tensorflow/contrib/lite/schema/schema.fbs +++ b/tensorflow/contrib/lite/schema/schema.fbs @@ -119,6 +119,7 @@ enum BuiltinOperator : byte { SQUEEZE = 43, UNIDIRECTIONAL_SEQUENCE_LSTM = 44, STRIDED_SLICE = 45, + BIDIRECTIONAL_SEQUENCE_RNN = 46, } // Options for the builtin operators. @@ -224,6 +225,12 @@ table SequenceRNNOptions { fused_activation_function:ActivationFunctionType; } +// An implementation of TensorFlow bidrectional_dynamic_rnn with RNNCell. +table BidirectionalSequenceRNNOptions { + time_major:bool; + fused_activation_function:ActivationFunctionType; +} + // An implementation of TensorFlow fully_connected (a.k.a Dense) layer. 
table FullyConnectedOptions { fused_activation_function:ActivationFunctionType; @@ -282,15 +289,9 @@ table ReshapeOptions { } table SpaceToBatchNDOptions { - block_shape:[int]; - before_paddings:[int]; - after_paddings:[int]; } table BatchToSpaceNDOptions { - block_shape:[int]; - before_crops:[int]; - after_crops:[int]; } table SkipGramOptions { @@ -326,11 +327,9 @@ table GatherOptions { } table TransposeOptions { - perm:[int]; } table MeanOptions { - axis:[int]; keep_dims: bool; } diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h index c04a73a2bf0..29f3a17be7a 100755 --- a/tensorflow/contrib/lite/schema/schema_generated.h +++ b/tensorflow/contrib/lite/schema/schema_generated.h @@ -51,6 +51,9 @@ struct RNNOptionsT; struct SequenceRNNOptions; struct SequenceRNNOptionsT; +struct BidirectionalSequenceRNNOptions; +struct BidirectionalSequenceRNNOptionsT; + struct FullyConnectedOptions; struct FullyConnectedOptionsT; @@ -211,11 +214,12 @@ enum BuiltinOperator { BuiltinOperator_SQUEEZE = 43, BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM = 44, BuiltinOperator_STRIDED_SLICE = 45, + BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN = 46, BuiltinOperator_MIN = BuiltinOperator_ADD, - BuiltinOperator_MAX = BuiltinOperator_STRIDED_SLICE + BuiltinOperator_MAX = BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN }; -inline BuiltinOperator (&EnumValuesBuiltinOperator())[43] { +inline BuiltinOperator (&EnumValuesBuiltinOperator())[44] { static BuiltinOperator values[] = { BuiltinOperator_ADD, BuiltinOperator_AVERAGE_POOL_2D, @@ -259,7 +263,8 @@ inline BuiltinOperator (&EnumValuesBuiltinOperator())[43] { BuiltinOperator_DIV, BuiltinOperator_SQUEEZE, BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM, - BuiltinOperator_STRIDED_SLICE}; + BuiltinOperator_STRIDED_SLICE, + BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN}; return values; } @@ -310,6 +315,7 @@ inline const char **EnumNamesBuiltinOperator() { "SQUEEZE", "UNIDIRECTIONAL_SEQUENCE_LSTM", "STRIDED_SLICE", + "BIDIRECTIONAL_SEQUENCE_RNN", nullptr}; return names; } @@ -2005,6 +2011,85 @@ flatbuffers::Offset CreateSequenceRNNOptions( flatbuffers::FlatBufferBuilder &_fbb, const SequenceRNNOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +struct BidirectionalSequenceRNNOptionsT : public flatbuffers::NativeTable { + typedef BidirectionalSequenceRNNOptions TableType; + bool time_major; + ActivationFunctionType fused_activation_function; + BidirectionalSequenceRNNOptionsT() + : time_major(false), + fused_activation_function(ActivationFunctionType_NONE) {} +}; + +struct BidirectionalSequenceRNNOptions FLATBUFFERS_FINAL_CLASS + : private flatbuffers::Table { + typedef BidirectionalSequenceRNNOptionsT NativeTableType; + enum { VT_TIME_MAJOR = 4, VT_FUSED_ACTIVATION_FUNCTION = 6 }; + bool time_major() const { return GetField(VT_TIME_MAJOR, 0) != 0; } + ActivationFunctionType fused_activation_function() const { + return static_cast( + GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_TIME_MAJOR) && + VerifyField(verifier, VT_FUSED_ACTIVATION_FUNCTION) && + verifier.EndTable(); + } + BidirectionalSequenceRNNOptionsT *UnPack( + const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo( + BidirectionalSequenceRNNOptionsT *_o, + const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack( + flatbuffers::FlatBufferBuilder 
&_fbb, + const BidirectionalSequenceRNNOptionsT *_o, + const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct BidirectionalSequenceRNNOptionsBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_time_major(bool time_major) { + fbb_.AddElement(BidirectionalSequenceRNNOptions::VT_TIME_MAJOR, + static_cast(time_major), 0); + } + void add_fused_activation_function( + ActivationFunctionType fused_activation_function) { + fbb_.AddElement( + BidirectionalSequenceRNNOptions::VT_FUSED_ACTIVATION_FUNCTION, + static_cast(fused_activation_function), 0); + } + explicit BidirectionalSequenceRNNOptionsBuilder( + flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + BidirectionalSequenceRNNOptionsBuilder &operator=( + const BidirectionalSequenceRNNOptionsBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset +CreateBidirectionalSequenceRNNOptions( + flatbuffers::FlatBufferBuilder &_fbb, bool time_major = false, + ActivationFunctionType fused_activation_function = + ActivationFunctionType_NONE) { + BidirectionalSequenceRNNOptionsBuilder builder_(_fbb); + builder_.add_fused_activation_function(fused_activation_function); + builder_.add_time_major(time_major); + return builder_.Finish(); +} + +flatbuffers::Offset +CreateBidirectionalSequenceRNNOptions( + flatbuffers::FlatBufferBuilder &_fbb, + const BidirectionalSequenceRNNOptionsT *_o, + const flatbuffers::rehasher_function_t *_rehasher = nullptr); + struct FullyConnectedOptionsT : public flatbuffers::NativeTable { typedef FullyConnectedOptions TableType; ActivationFunctionType fused_activation_function; @@ -2541,21 +2626,14 @@ flatbuffers::Offset CreateLSTMOptions( struct ResizeBilinearOptionsT : public flatbuffers::NativeTable { typedef ResizeBilinearOptions TableType; - int32_t new_height; - int32_t new_width; - ResizeBilinearOptionsT() : new_height(0), new_width(0) {} + ResizeBilinearOptionsT() {} }; struct ResizeBilinearOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { typedef ResizeBilinearOptionsT NativeTableType; - enum { VT_NEW_HEIGHT = 4, VT_NEW_WIDTH = 6 }; - int32_t new_height() const { return GetField(VT_NEW_HEIGHT, 0); } - int32_t new_width() const { return GetField(VT_NEW_WIDTH, 0); } bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && - VerifyField(verifier, VT_NEW_HEIGHT) && - VerifyField(verifier, VT_NEW_WIDTH) && verifier.EndTable(); + return VerifyTableStart(verifier) && verifier.EndTable(); } ResizeBilinearOptionsT *UnPack( const flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -2570,13 +2648,6 @@ struct ResizeBilinearOptions FLATBUFFERS_FINAL_CLASS struct ResizeBilinearOptionsBuilder { flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; - void add_new_height(int32_t new_height) { - fbb_.AddElement(ResizeBilinearOptions::VT_NEW_HEIGHT, new_height, - 0); - } - void add_new_width(int32_t new_width) { - fbb_.AddElement(ResizeBilinearOptions::VT_NEW_WIDTH, new_width, 0); - } explicit ResizeBilinearOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -2590,11 +2661,8 @@ struct ResizeBilinearOptionsBuilder { }; inline flatbuffers::Offset CreateResizeBilinearOptions( - flatbuffers::FlatBufferBuilder &_fbb, int32_t new_height = 0, - int32_t new_width = 0) { + flatbuffers::FlatBufferBuilder &_fbb) { 
ResizeBilinearOptionsBuilder builder_(_fbb); - builder_.add_new_width(new_width); - builder_.add_new_height(new_height); return builder_.Finish(); } @@ -2766,33 +2834,14 @@ flatbuffers::Offset CreateReshapeOptions( struct SpaceToBatchNDOptionsT : public flatbuffers::NativeTable { typedef SpaceToBatchNDOptions TableType; - std::vector block_shape; - std::vector before_paddings; - std::vector after_paddings; SpaceToBatchNDOptionsT() {} }; struct SpaceToBatchNDOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { typedef SpaceToBatchNDOptionsT NativeTableType; - enum { VT_BLOCK_SHAPE = 4, VT_BEFORE_PADDINGS = 6, VT_AFTER_PADDINGS = 8 }; - const flatbuffers::Vector *block_shape() const { - return GetPointer *>(VT_BLOCK_SHAPE); - } - const flatbuffers::Vector *before_paddings() const { - return GetPointer *>(VT_BEFORE_PADDINGS); - } - const flatbuffers::Vector *after_paddings() const { - return GetPointer *>(VT_AFTER_PADDINGS); - } bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && - VerifyOffset(verifier, VT_BLOCK_SHAPE) && - verifier.Verify(block_shape()) && - VerifyOffset(verifier, VT_BEFORE_PADDINGS) && - verifier.Verify(before_paddings()) && - VerifyOffset(verifier, VT_AFTER_PADDINGS) && - verifier.Verify(after_paddings()) && verifier.EndTable(); + return VerifyTableStart(verifier) && verifier.EndTable(); } SpaceToBatchNDOptionsT *UnPack( const flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -2807,18 +2856,6 @@ struct SpaceToBatchNDOptions FLATBUFFERS_FINAL_CLASS struct SpaceToBatchNDOptionsBuilder { flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; - void add_block_shape( - flatbuffers::Offset> block_shape) { - fbb_.AddOffset(SpaceToBatchNDOptions::VT_BLOCK_SHAPE, block_shape); - } - void add_before_paddings( - flatbuffers::Offset> before_paddings) { - fbb_.AddOffset(SpaceToBatchNDOptions::VT_BEFORE_PADDINGS, before_paddings); - } - void add_after_paddings( - flatbuffers::Offset> after_paddings) { - fbb_.AddOffset(SpaceToBatchNDOptions::VT_AFTER_PADDINGS, after_paddings); - } explicit SpaceToBatchNDOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -2832,62 +2869,25 @@ struct SpaceToBatchNDOptionsBuilder { }; inline flatbuffers::Offset CreateSpaceToBatchNDOptions( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset> block_shape = 0, - flatbuffers::Offset> before_paddings = 0, - flatbuffers::Offset> after_paddings = 0) { + flatbuffers::FlatBufferBuilder &_fbb) { SpaceToBatchNDOptionsBuilder builder_(_fbb); - builder_.add_after_paddings(after_paddings); - builder_.add_before_paddings(before_paddings); - builder_.add_block_shape(block_shape); return builder_.Finish(); } -inline flatbuffers::Offset -CreateSpaceToBatchNDOptionsDirect( - flatbuffers::FlatBufferBuilder &_fbb, - const std::vector *block_shape = nullptr, - const std::vector *before_paddings = nullptr, - const std::vector *after_paddings = nullptr) { - return tflite::CreateSpaceToBatchNDOptions( - _fbb, block_shape ? _fbb.CreateVector(*block_shape) : 0, - before_paddings ? _fbb.CreateVector(*before_paddings) : 0, - after_paddings ? 
_fbb.CreateVector(*after_paddings) : 0); -} - flatbuffers::Offset CreateSpaceToBatchNDOptions( flatbuffers::FlatBufferBuilder &_fbb, const SpaceToBatchNDOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); struct BatchToSpaceNDOptionsT : public flatbuffers::NativeTable { typedef BatchToSpaceNDOptions TableType; - std::vector block_shape; - std::vector before_crops; - std::vector after_crops; BatchToSpaceNDOptionsT() {} }; struct BatchToSpaceNDOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { typedef BatchToSpaceNDOptionsT NativeTableType; - enum { VT_BLOCK_SHAPE = 4, VT_BEFORE_CROPS = 6, VT_AFTER_CROPS = 8 }; - const flatbuffers::Vector *block_shape() const { - return GetPointer *>(VT_BLOCK_SHAPE); - } - const flatbuffers::Vector *before_crops() const { - return GetPointer *>(VT_BEFORE_CROPS); - } - const flatbuffers::Vector *after_crops() const { - return GetPointer *>(VT_AFTER_CROPS); - } bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && - VerifyOffset(verifier, VT_BLOCK_SHAPE) && - verifier.Verify(block_shape()) && - VerifyOffset(verifier, VT_BEFORE_CROPS) && - verifier.Verify(before_crops()) && - VerifyOffset(verifier, VT_AFTER_CROPS) && - verifier.Verify(after_crops()) && verifier.EndTable(); + return VerifyTableStart(verifier) && verifier.EndTable(); } BatchToSpaceNDOptionsT *UnPack( const flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -2902,18 +2902,6 @@ struct BatchToSpaceNDOptions FLATBUFFERS_FINAL_CLASS struct BatchToSpaceNDOptionsBuilder { flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; - void add_block_shape( - flatbuffers::Offset> block_shape) { - fbb_.AddOffset(BatchToSpaceNDOptions::VT_BLOCK_SHAPE, block_shape); - } - void add_before_crops( - flatbuffers::Offset> before_crops) { - fbb_.AddOffset(BatchToSpaceNDOptions::VT_BEFORE_CROPS, before_crops); - } - void add_after_crops( - flatbuffers::Offset> after_crops) { - fbb_.AddOffset(BatchToSpaceNDOptions::VT_AFTER_CROPS, after_crops); - } explicit BatchToSpaceNDOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -2927,29 +2915,11 @@ struct BatchToSpaceNDOptionsBuilder { }; inline flatbuffers::Offset CreateBatchToSpaceNDOptions( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset> block_shape = 0, - flatbuffers::Offset> before_crops = 0, - flatbuffers::Offset> after_crops = 0) { + flatbuffers::FlatBufferBuilder &_fbb) { BatchToSpaceNDOptionsBuilder builder_(_fbb); - builder_.add_after_crops(after_crops); - builder_.add_before_crops(before_crops); - builder_.add_block_shape(block_shape); return builder_.Finish(); } -inline flatbuffers::Offset -CreateBatchToSpaceNDOptionsDirect( - flatbuffers::FlatBufferBuilder &_fbb, - const std::vector *block_shape = nullptr, - const std::vector *before_crops = nullptr, - const std::vector *after_crops = nullptr) { - return tflite::CreateBatchToSpaceNDOptions( - _fbb, block_shape ? _fbb.CreateVector(*block_shape) : 0, - before_crops ? _fbb.CreateVector(*before_crops) : 0, - after_crops ? 
_fbb.CreateVector(*after_crops) : 0); -} - flatbuffers::Offset CreateBatchToSpaceNDOptions( flatbuffers::FlatBufferBuilder &_fbb, const BatchToSpaceNDOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); @@ -3324,19 +3294,13 @@ flatbuffers::Offset CreateGatherOptions( struct TransposeOptionsT : public flatbuffers::NativeTable { typedef TransposeOptions TableType; - std::vector perm; TransposeOptionsT() {} }; struct TransposeOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { typedef TransposeOptionsT NativeTableType; - enum { VT_PERM = 4 }; - const flatbuffers::Vector *perm() const { - return GetPointer *>(VT_PERM); - } bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_PERM) && - verifier.Verify(perm()) && verifier.EndTable(); + return VerifyTableStart(verifier) && verifier.EndTable(); } TransposeOptionsT *UnPack( const flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -3351,9 +3315,6 @@ struct TransposeOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { struct TransposeOptionsBuilder { flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; - void add_perm(flatbuffers::Offset> perm) { - fbb_.AddOffset(TransposeOptions::VT_PERM, perm); - } explicit TransposeOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -3367,41 +3328,27 @@ struct TransposeOptionsBuilder { }; inline flatbuffers::Offset CreateTransposeOptions( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset> perm = 0) { + flatbuffers::FlatBufferBuilder &_fbb) { TransposeOptionsBuilder builder_(_fbb); - builder_.add_perm(perm); return builder_.Finish(); } -inline flatbuffers::Offset CreateTransposeOptionsDirect( - flatbuffers::FlatBufferBuilder &_fbb, - const std::vector *perm = nullptr) { - return tflite::CreateTransposeOptions( - _fbb, perm ? 
_fbb.CreateVector(*perm) : 0); -} - flatbuffers::Offset CreateTransposeOptions( flatbuffers::FlatBufferBuilder &_fbb, const TransposeOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); struct MeanOptionsT : public flatbuffers::NativeTable { typedef MeanOptions TableType; - std::vector axis; bool keep_dims; MeanOptionsT() : keep_dims(false) {} }; struct MeanOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { typedef MeanOptionsT NativeTableType; - enum { VT_AXIS = 4, VT_KEEP_DIMS = 6 }; - const flatbuffers::Vector *axis() const { - return GetPointer *>(VT_AXIS); - } + enum { VT_KEEP_DIMS = 4 }; bool keep_dims() const { return GetField(VT_KEEP_DIMS, 0) != 0; } bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_AXIS) && - verifier.Verify(axis()) && + return VerifyTableStart(verifier) && VerifyField(verifier, VT_KEEP_DIMS) && verifier.EndTable(); } MeanOptionsT *UnPack( @@ -3417,9 +3364,6 @@ struct MeanOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { struct MeanOptionsBuilder { flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; - void add_axis(flatbuffers::Offset> axis) { - fbb_.AddOffset(MeanOptions::VT_AXIS, axis); - } void add_keep_dims(bool keep_dims) { fbb_.AddElement(MeanOptions::VT_KEEP_DIMS, static_cast(keep_dims), 0); @@ -3437,22 +3381,12 @@ struct MeanOptionsBuilder { }; inline flatbuffers::Offset CreateMeanOptions( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset> axis = 0, - bool keep_dims = false) { + flatbuffers::FlatBufferBuilder &_fbb, bool keep_dims = false) { MeanOptionsBuilder builder_(_fbb); - builder_.add_axis(axis); builder_.add_keep_dims(keep_dims); return builder_.Finish(); } -inline flatbuffers::Offset CreateMeanOptionsDirect( - flatbuffers::FlatBufferBuilder &_fbb, - const std::vector *axis = nullptr, bool keep_dims = false) { - return tflite::CreateMeanOptions( - _fbb, axis ? 
_fbb.CreateVector(*axis) : 0, keep_dims); -} - flatbuffers::Offset CreateMeanOptions( flatbuffers::FlatBufferBuilder &_fbb, const MeanOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); @@ -5098,6 +5032,56 @@ inline flatbuffers::Offset CreateSequenceRNNOptions( _fused_activation_function); } +inline BidirectionalSequenceRNNOptionsT * +BidirectionalSequenceRNNOptions::UnPack( + const flatbuffers::resolver_function_t *_resolver) const { + auto _o = new BidirectionalSequenceRNNOptionsT(); + UnPackTo(_o, _resolver); + return _o; +} + +inline void BidirectionalSequenceRNNOptions::UnPackTo( + BidirectionalSequenceRNNOptionsT *_o, + const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { + auto _e = time_major(); + _o->time_major = _e; + }; + { + auto _e = fused_activation_function(); + _o->fused_activation_function = _e; + }; +} + +inline flatbuffers::Offset +BidirectionalSequenceRNNOptions::Pack( + flatbuffers::FlatBufferBuilder &_fbb, + const BidirectionalSequenceRNNOptionsT *_o, + const flatbuffers::rehasher_function_t *_rehasher) { + return CreateBidirectionalSequenceRNNOptions(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset +CreateBidirectionalSequenceRNNOptions( + flatbuffers::FlatBufferBuilder &_fbb, + const BidirectionalSequenceRNNOptionsT *_o, + const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { + flatbuffers::FlatBufferBuilder *__fbb; + const BidirectionalSequenceRNNOptionsT *__o; + const flatbuffers::rehasher_function_t *__rehasher; + } _va = {&_fbb, _o, _rehasher}; + (void)_va; + auto _time_major = _o->time_major; + auto _fused_activation_function = _o->fused_activation_function; + return tflite::CreateBidirectionalSequenceRNNOptions( + _fbb, _time_major, _fused_activation_function); +} + inline FullyConnectedOptionsT *FullyConnectedOptions::UnPack( const flatbuffers::resolver_function_t *_resolver) const { auto _o = new FullyConnectedOptionsT(); @@ -5457,14 +5441,6 @@ inline void ResizeBilinearOptions::UnPackTo( const flatbuffers::resolver_function_t *_resolver) const { (void)_o; (void)_resolver; - { - auto _e = new_height(); - _o->new_height = _e; - }; - { - auto _e = new_width(); - _o->new_width = _e; - }; } inline flatbuffers::Offset ResizeBilinearOptions::Pack( @@ -5484,9 +5460,7 @@ inline flatbuffers::Offset CreateResizeBilinearOptions( const flatbuffers::rehasher_function_t *__rehasher; } _va = {&_fbb, _o, _rehasher}; (void)_va; - auto _new_height = _o->new_height; - auto _new_width = _o->new_width; - return tflite::CreateResizeBilinearOptions(_fbb, _new_height, _new_width); + return tflite::CreateResizeBilinearOptions(_fbb); } inline CallOptionsT *CallOptions::UnPack( @@ -5616,33 +5590,6 @@ inline void SpaceToBatchNDOptions::UnPackTo( const flatbuffers::resolver_function_t *_resolver) const { (void)_o; (void)_resolver; - { - auto _e = block_shape(); - if (_e) { - _o->block_shape.resize(_e->size()); - for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { - _o->block_shape[_i] = _e->Get(_i); - } - } - }; - { - auto _e = before_paddings(); - if (_e) { - _o->before_paddings.resize(_e->size()); - for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { - _o->before_paddings[_i] = _e->Get(_i); - } - } - }; - { - auto _e = after_paddings(); - if (_e) { - _o->after_paddings.resize(_e->size()); - for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { - _o->after_paddings[_i] = _e->Get(_i); - } - } - }; } inline flatbuffers::Offset 
SpaceToBatchNDOptions::Pack( @@ -5662,14 +5609,7 @@ inline flatbuffers::Offset CreateSpaceToBatchNDOptions( const flatbuffers::rehasher_function_t *__rehasher; } _va = {&_fbb, _o, _rehasher}; (void)_va; - auto _block_shape = - _o->block_shape.size() ? _fbb.CreateVector(_o->block_shape) : 0; - auto _before_paddings = - _o->before_paddings.size() ? _fbb.CreateVector(_o->before_paddings) : 0; - auto _after_paddings = - _o->after_paddings.size() ? _fbb.CreateVector(_o->after_paddings) : 0; - return tflite::CreateSpaceToBatchNDOptions(_fbb, _block_shape, - _before_paddings, _after_paddings); + return tflite::CreateSpaceToBatchNDOptions(_fbb); } inline BatchToSpaceNDOptionsT *BatchToSpaceNDOptions::UnPack( @@ -5684,33 +5624,6 @@ inline void BatchToSpaceNDOptions::UnPackTo( const flatbuffers::resolver_function_t *_resolver) const { (void)_o; (void)_resolver; - { - auto _e = block_shape(); - if (_e) { - _o->block_shape.resize(_e->size()); - for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { - _o->block_shape[_i] = _e->Get(_i); - } - } - }; - { - auto _e = before_crops(); - if (_e) { - _o->before_crops.resize(_e->size()); - for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { - _o->before_crops[_i] = _e->Get(_i); - } - } - }; - { - auto _e = after_crops(); - if (_e) { - _o->after_crops.resize(_e->size()); - for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { - _o->after_crops[_i] = _e->Get(_i); - } - } - }; } inline flatbuffers::Offset BatchToSpaceNDOptions::Pack( @@ -5730,14 +5643,7 @@ inline flatbuffers::Offset CreateBatchToSpaceNDOptions( const flatbuffers::rehasher_function_t *__rehasher; } _va = {&_fbb, _o, _rehasher}; (void)_va; - auto _block_shape = - _o->block_shape.size() ? _fbb.CreateVector(_o->block_shape) : 0; - auto _before_crops = - _o->before_crops.size() ? _fbb.CreateVector(_o->before_crops) : 0; - auto _after_crops = - _o->after_crops.size() ? _fbb.CreateVector(_o->after_crops) : 0; - return tflite::CreateBatchToSpaceNDOptions(_fbb, _block_shape, _before_crops, - _after_crops); + return tflite::CreateBatchToSpaceNDOptions(_fbb); } inline SkipGramOptionsT *SkipGramOptions::UnPack( @@ -5999,15 +5905,6 @@ inline void TransposeOptions::UnPackTo( const flatbuffers::resolver_function_t *_resolver) const { (void)_o; (void)_resolver; - { - auto _e = perm(); - if (_e) { - _o->perm.resize(_e->size()); - for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { - _o->perm[_i] = _e->Get(_i); - } - } - }; } inline flatbuffers::Offset TransposeOptions::Pack( @@ -6027,8 +5924,7 @@ inline flatbuffers::Offset CreateTransposeOptions( const flatbuffers::rehasher_function_t *__rehasher; } _va = {&_fbb, _o, _rehasher}; (void)_va; - auto _perm = _o->perm.size() ? _fbb.CreateVector(_o->perm) : 0; - return tflite::CreateTransposeOptions(_fbb, _perm); + return tflite::CreateTransposeOptions(_fbb); } inline MeanOptionsT *MeanOptions::UnPack( @@ -6042,15 +5938,6 @@ inline void MeanOptions::UnPackTo( MeanOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const { (void)_o; (void)_resolver; - { - auto _e = axis(); - if (_e) { - _o->axis.resize(_e->size()); - for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { - _o->axis[_i] = _e->Get(_i); - } - } - }; { auto _e = keep_dims(); _o->keep_dims = _e; @@ -6074,9 +5961,8 @@ inline flatbuffers::Offset CreateMeanOptions( const flatbuffers::rehasher_function_t *__rehasher; } _va = {&_fbb, _o, _rehasher}; (void)_va; - auto _axis = _o->axis.size() ? 
_fbb.CreateVector(_o->axis) : 0; auto _keep_dims = _o->keep_dims; - return tflite::CreateMeanOptions(_fbb, _axis, _keep_dims); + return tflite::CreateMeanOptions(_fbb, _keep_dims); } inline SqueezeOptionsT *SqueezeOptions::UnPack( diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD index 50e8ca75f8e..b949045128f 100644 --- a/tensorflow/contrib/lite/testing/BUILD +++ b/tensorflow/contrib/lite/testing/BUILD @@ -197,7 +197,7 @@ cc_binary( tf_cc_test( name = "generated_examples_zip_test", - size = "medium", + size = "large", srcs = ["generated_examples_zip_test.cc"], args = [ "--zip_files_dir=tensorflow/contrib/lite/testing/optest", @@ -206,7 +206,7 @@ tf_cc_test( "--unzip_binary_path=/usr/bin/unzip", ], data = [":optest"], - shard_count = 10, + shard_count = 20, tags = ["no_oss"], deps = [ ":parse_testdata_lib", diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py index a6393516578..b2227a7c98f 100644 --- a/tensorflow/contrib/lite/testing/generate_examples.py +++ b/tensorflow/contrib/lite/testing/generate_examples.py @@ -94,7 +94,8 @@ KNOWN_BUGS = { r"softmax.*input_shape=\[1,3,4,3\]": "67749831", # SpaceToDepth only supports float32. r"space_to_depth.*(float16|int32|uint8|int64)": "68018134", - # BatchToSpaceND doesn't support cropping. + # BatchToSpaceND doesn't support cropping. This catches test cases with + # const tensors as crops. r"batch_to_space_nd.*crops=\[\[1,1\],\[1,1\]\]": "70594634", # BatchToSpaceND only supports 4D tensors. r"batch_to_space_nd.*input_shape=\[8,2,2,2,1,1\]": "70594733", @@ -694,6 +695,7 @@ def make_mean_tests(zip_path): [2, 1], [2, 1, 0], [2, 0, 1], -1, -2, -3, [1, -1], [0, -1], [-1, 0], [-1, -2, -3], [0, 0, 0], [2, 2, 0], [1, 0, -3, -3] ], + "const_axis": [True, False], "keep_dims": [True, False], }, { "input_dtype": [tf.float32, tf.int32, tf.int64], @@ -704,6 +706,7 @@ def make_mean_tests(zip_path): -3, -4, [0, -2], [2, 3, -1, 0], [3, 1, 2, -3], [3, -4], [2, 2, 2], [2, 2, 3], [-3, -3, -4], [-3, 2, 1] ], + "const_axis": [True, False], "keep_dims": [True, False], }] @@ -713,17 +716,31 @@ def make_mean_tests(zip_path): dtype=parameters["input_dtype"], name="input", shape=parameters["input_shape"]) + + # Get axis as either a placeholder or constants. + if parameters["const_axis"]: + axis = parameters["axis"] + input_tensors = [input_tensor] + else: + if isinstance(parameters["axis"], list): + shape = [len(parameters["axis"])] + else: + shape = [0] # shape for None or integers. 
+ axis = tf.placeholder(dtype=tf.int32, name="axis", shape=shape) + input_tensors = [input_tensor, axis] + out = tf.reduce_mean( - input_tensor, - axis=parameters["axis"], - keep_dims=parameters["keep_dims"]) - return [input_tensor], [out] + input_tensor, axis=axis, keep_dims=parameters["keep_dims"]) + return input_tensors, [out] def build_inputs(parameters, sess, inputs, outputs): - input_values = create_tensor_data(parameters["input_dtype"], - parameters["input_shape"]) - return [input_values], sess.run( - outputs, feed_dict=dict(zip(inputs, [input_values]))) + values = [ + create_tensor_data(parameters["input_dtype"], parameters["input_shape"]) + ] + if not parameters["const_axis"]: + if parameters["axis"]: + values.append(np.array(parameters["axis"])) + return values, sess.run(outputs, feed_dict=dict(zip(inputs, values))) make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs) @@ -1318,12 +1335,16 @@ def make_space_to_batch_nd_tests(zip_path): "input_shape": [[1, 2, 2, 3], [2, 2, 4, 1]], "block_shape": [[1, 3], [2, 2]], "paddings": [[[0, 0], [0, 0]], [[0, 0], [2, 0]], [[1, 1], [1, 1]]], + "constant_block_shape": [True, False], + "constant_paddings": [True, False], }, { "dtype": [tf.float32], "input_shape": [[2, 3, 7, 3]], "block_shape": [[1, 3], [2, 2]], "paddings": [[[0, 0], [2, 0]], [[1, 0], [1, 0]]], + "constant_block_shape": [True, False], + "constant_paddings": [True, False], }, # Non-4D use case: 1 bath dimension, 3 spatial dimensions, 2 others. { @@ -1331,23 +1352,47 @@ def make_space_to_batch_nd_tests(zip_path): "input_shape": [[1, 4, 4, 4, 1, 1]], "block_shape": [[2, 2, 2]], "paddings": [[[0, 0], [0, 0], [0, 0]]], + "constant_block_shape": [True, False], + "constant_paddings": [True, False], }, ] def build_graph(parameters): + """Build a space_to_batch graph given `parameters`.""" input_tensor = tf.placeholder( dtype=parameters["dtype"], name="input", shape=parameters["input_shape"]) - out = tf.space_to_batch_nd(input_tensor, parameters["block_shape"], - parameters["paddings"]) - return [input_tensor], [out] + input_tensors = [input_tensor] + + # Get block_shape either as a const or as a placeholder (tensor). + if parameters["constant_block_shape"]: + block_shape = parameters["block_shape"] + else: + shape = [len(parameters["block_shape"])] + block_shape = tf.placeholder(dtype=tf.int32, name="shape", shape=shape) + input_tensors.append(block_shape) + + # Get paddings either as a const or as a placeholder (tensor). 
+ if parameters["constant_paddings"]: + paddings = parameters["paddings"] + else: + shape = [len(parameters["paddings"]), 2] + paddings = tf.placeholder(dtype=tf.int32, name="paddings", shape=shape) + input_tensors.append(paddings) + + out = tf.space_to_batch_nd(input_tensor, block_shape, paddings) + return input_tensors, [out] def build_inputs(parameters, sess, inputs, outputs): - input_values = create_tensor_data(parameters["dtype"], - parameters["input_shape"]) - return [input_values], sess.run( - outputs, feed_dict=dict(zip(inputs, [input_values]))) + values = [ + create_tensor_data(parameters["dtype"], parameters["input_shape"]) + ] + if not parameters["constant_block_shape"]: + values.append(np.array(parameters["block_shape"])) + if not parameters["constant_paddings"]: + values.append(np.array(parameters["paddings"])) + return values, sess.run(outputs, feed_dict=dict(zip(inputs, values))) make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs) @@ -1361,6 +1406,8 @@ def make_batch_to_space_nd_tests(zip_path): "input_shape": [[12, 2, 2, 1]], "block_shape": [[1, 4], [2, 2], [3, 4]], "crops": [[[0, 0], [0, 0]], [[1, 1], [1, 1]]], + "constant_block_shape": [True, False], + "constant_crops": [True, False], }, # Non-4D use case: 1 bath dimension, 3 spatial dimensions, 2 others. { @@ -1368,23 +1415,47 @@ def make_batch_to_space_nd_tests(zip_path): "input_shape": [[8, 2, 2, 2, 1, 1]], "block_shape": [[2, 2, 2]], "crops": [[[0, 0], [0, 0], [0, 0]]], + "constant_block_shape": [True, False], + "constant_crops": [True, False], }, ] def build_graph(parameters): + """Build a batch_to_space graph given `parameters`.""" input_tensor = tf.placeholder( dtype=parameters["dtype"], name="input", shape=parameters["input_shape"]) - out = tf.batch_to_space_nd(input_tensor, parameters["block_shape"], - parameters["crops"]) - return [input_tensor], [out] + input_tensors = [input_tensor] + + # Get block_shape either as a const or as a placeholder (tensor). + if parameters["constant_block_shape"]: + block_shape = parameters["block_shape"] + else: + shape = [len(parameters["block_shape"])] + block_shape = tf.placeholder(dtype=tf.int32, name="shape", shape=shape) + input_tensors.append(block_shape) + + # Get crops either as a const or as a placeholder (tensor). 
+ if parameters["constant_crops"]: + crops = parameters["crops"] + else: + shape = [len(parameters["crops"]), 2] + crops = tf.placeholder(dtype=tf.int32, name="crops", shape=shape) + input_tensors.append(crops) + + out = tf.batch_to_space_nd(input_tensor, block_shape, crops) + return input_tensors, [out] def build_inputs(parameters, sess, inputs, outputs): - input_values = create_tensor_data(parameters["dtype"], - parameters["input_shape"]) - return [input_values], sess.run( - outputs, feed_dict=dict(zip(inputs, [input_values]))) + values = [ + create_tensor_data(parameters["dtype"], parameters["input_shape"]) + ] + if not parameters["constant_block_shape"]: + values.append(np.array(parameters["block_shape"])) + if not parameters["constant_crops"]: + values.append(np.array(parameters["crops"])) + return values, sess.run(outputs, feed_dict=dict(zip(inputs, values))) make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs) @@ -1397,29 +1468,44 @@ def make_transpose_tests(zip_path): "dtype": [tf.int32, tf.int64, tf.float32], "input_shape": [[2, 2, 3]], "perm": [[0, 1, 2], [0, 2, 1]], + "constant_perm": [True, False], }, { "dtype": [tf.float32], "input_shape": [[1, 2, 3, 4]], "perm": [[0, 1, 2, 3], [3, 0, 1, 2]], + "constant_perm": [True, False], }, { "dtype": [tf.float32], "input_shape": [[1, 2, 3, 4, 5]], "perm": [[0, 1, 2, 3, 4]], + "constant_perm": [True, False], }] def build_graph(parameters): + """Build a transpose graph given `parameters`.""" input_tensor = tf.placeholder( dtype=parameters["dtype"], name="input", shape=parameters["input_shape"]) - out = tf.transpose(input_tensor, perm=parameters["perm"]) - return [input_tensor], [out] + + if parameters["constant_perm"]: + perm = parameters["perm"] + input_tensors = [input_tensor] + else: + shape = [len(parameters["perm"]), 2] + perm = tf.placeholder(dtype=tf.int32, name="perm", shape=shape) + input_tensors = [input_tensor, perm] + + out = tf.transpose(input_tensor, perm=perm) + return input_tensors, [out] def build_inputs(parameters, sess, inputs, outputs): - input_values = create_tensor_data(parameters["dtype"], - parameters["input_shape"]) - return [input_values], sess.run( - outputs, feed_dict=dict(zip(inputs, [input_values]))) + values = [ + create_tensor_data(parameters["dtype"], parameters["input_shape"]) + ] + if not parameters["constant_perm"]: + values.append(np.array(parameters["perm"])) + return values, sess.run(outputs, feed_dict=dict(zip(inputs, values))) make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs) @@ -1474,9 +1560,11 @@ def make_strided_slice_tests(zip_path): "input_shape": [[12, 2, 2, 5]], "begin": [[0, 0, 0, 0], [1, 0, 1, 0]], "end": [[8, 2, 2, 3], [12, 2, 2, 5]], - "strides": [None, [1, 1, 1, 1], [2, 1, 3, 1]], - "begin_mask": [None, 0, 1, 2, 8], - "end_mask": [None, 0, 1, 2, 8], + "strides": [None, [2, 1, 3, 1]], + "begin_mask": [None, 1, 8], + "end_mask": [None, 1, 8], + "shrink_axis_mask": [None, 1, 8, 11, 15, -1], + "constant_indices": [False, True], }, # 2-D { @@ -1485,9 +1573,11 @@ def make_strided_slice_tests(zip_path): "input_shape": [[2, 3]], "begin": [[0, 0], [1, 0]], "end": [[2, 3], [2, 2]], - "strides": [None, [1, 1], [2, 2]], - "begin_mask": [None, 0, 1, 2], - "end_mask": [None, 0, 1, 2], + "strides": [None, [2, 2]], + "begin_mask": [None, 1, 2], + "end_mask": [None, 1, 2], + "shrink_axis_mask": [None, 1, 2, 3, -1], + "constant_indices": [False, True], }, # Negative strides { @@ -1497,8 +1587,10 @@ def make_strided_slice_tests(zip_path): "begin": [[0, -1]], "end": 
[[2, -3]], "strides": [[1, -1]], - "begin_mask": [None, 0, 1, 2], - "end_mask": [None, 0, 1, 2], + "begin_mask": [None, 1, 2], + "end_mask": [None, 1, 2], + "shrink_axis_mask": [None, 1, 2, 3, -1], + "constant_indices": [False], }, ] @@ -1508,23 +1600,29 @@ def make_strided_slice_tests(zip_path): dtype=parameters["dtype"], name="input", shape=parameters["input_shape"]) - begin = tf.placeholder( - dtype=parameters["index_type"], - name="begin", - shape=[len(parameters["input_shape"])]) - end = tf.placeholder( - dtype=parameters["index_type"], - name="end", - shape=[len(parameters["input_shape"])]) - strides = ( - tf.placeholder( - dtype=parameters["index_type"], - name="strides", - shape=[len(parameters["input_shape"])]) - if parameters["strides"] is not None else None) - tensors = [input_tensor, begin, end] - if strides is not None: - tensors.append(strides) + if parameters["constant_indices"]: + begin = parameters["begin"] + end = parameters["end"] + strides = parameters["strides"] + tensors = [input_tensor] + else: + begin = tf.placeholder( + dtype=parameters["index_type"], + name="begin", + shape=[len(parameters["input_shape"])]) + end = tf.placeholder( + dtype=parameters["index_type"], + name="end", + shape=[len(parameters["input_shape"])]) + strides = ( + tf.placeholder( + dtype=parameters["index_type"], + name="strides", + shape=[len(parameters["input_shape"])]) + if parameters["strides"] is not None else None) + tensors = [input_tensor, begin, end] + if strides is not None: + tensors.append(strides) out = tf.strided_slice( input_tensor, begin, @@ -1539,14 +1637,17 @@ def make_strided_slice_tests(zip_path): input_values = create_tensor_data(parameters["dtype"], parameters["input_shape"]) index_type = _TF_TYPE_INFO[parameters["index_type"]][0] - begin_values = np.array(parameters["begin"]).astype(index_type) - end_values = np.array(parameters["end"]).astype(index_type) - stride_values = ( - np.array(parameters["strides"]).astype(index_type) - if parameters["strides"] is not None else None) - values = [input_values, begin_values, end_values] - if stride_values is not None: - values.append(stride_values) + values = [input_values] + if not parameters["constant_indices"]: + begin_values = np.array(parameters["begin"]).astype(index_type) + end_values = np.array(parameters["end"]).astype(index_type) + stride_values = ( + np.array(parameters["strides"]).astype(index_type) + if parameters["strides"] is not None else None) + values.append(begin_values) + values.append(end_values) + if stride_values is not None: + values.append(stride_values) return values, sess.run(outputs, feed_dict=dict(zip(inputs, values))) diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc index 41652a07d21..e8b425a5925 100644 --- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc +++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc @@ -47,9 +47,7 @@ tensorflow::Env* env = tensorflow::Env::Default(); // Key is a substring of the test name and value is a bug number. // TODO(ahentz): make sure we clean this list up frequently. std::map kBrokenTests = { - // Add doesn't support broadcasting. - {R"(^\/adda.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"}, - {R"(^\/mula.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"}, + // Sub and Div don't support broadcasting. 
{R"(^\/diva.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"}, {R"(^\/suba.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"}, @@ -67,7 +65,11 @@ std::map kBrokenTests = { // L2Norm only supports tensors with 4D or fewer. {R"(^\/l2normdim=.*,epsilon=.*,input_shape=\[.,.,.,.,.*\])", "67963684"}, - // SpaceToBatch only supports 4D tensors. + // BatchToSpaceND doesn't support cropping. This catches test cases with + // non-const tensors as crops. + {R"(^\/batch_to_space_nd.*crops=\[\[1,1\],\[1,1\]\])", "70594634"}, + + // SpaceToBatchND only supports 4D tensors. {R"(^\/space_to_batch_nd.*input_shape=\[1,4,4,4,1,1\])", "70848787"}, // L2Norm only works for dim=-1. @@ -92,7 +94,7 @@ std::map kBrokenTests = { {R"(^\/resize_bilinearalign_corners=True,.*,size=\[5,6\])", "72401483"}, // Transpose only supports 1D-4D input tensors. - {R"(^\/transposedtype=.*,input_shape=\[.,.,.,.,.\],perm=.*)", "71545879"}, + {R"(^\/transpose.*input_shape=\[.,.,.,.,.\])", "71545879"}, }; // Allows test data to be unzipped into a temporary directory and makes @@ -239,8 +241,7 @@ INSTANTIATE_TESTS(avg_pool) INSTANTIATE_TESTS(space_to_batch_nd) INSTANTIATE_TESTS(batch_to_space_nd) INSTANTIATE_TESTS(concat) -// TODO(b/71642435) re-enable this test -// INSTANTIATE_TESTS(constant) +INSTANTIATE_TESTS(constant) INSTANTIATE_TESTS(control_dep) INSTANTIATE_TESTS(conv) INSTANTIATE_TESTS(depthwiseconv) diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD index 6fc7e5e3fdd..20c156a9326 100644 --- a/tensorflow/contrib/lite/toco/BUILD +++ b/tensorflow/contrib/lite/toco/BUILD @@ -205,6 +205,7 @@ cc_library( "graph_transformations/remove_trivial_quantized_activation_func.cc", "graph_transformations/remove_trivial_reshape.cc", "graph_transformations/remove_unused_op.cc", + "graph_transformations/reorder_activation_functions.cc", "graph_transformations/resolve_batch_normalization.cc", "graph_transformations/resolve_batch_to_space_nd_attributes.cc", "graph_transformations/resolve_constant_binary.cc", diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc index 529df3cd2e5..4c70b01a9d0 100644 --- a/tensorflow/contrib/lite/toco/export_tensorflow.cc +++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc @@ -621,7 +621,8 @@ void ConvertSoftmaxOperator(const Model& model, const SoftmaxOperator& src_op, GraphDef* tensorflow_graph) { string softmax_input; Operator* providing_op = GetOpWithOutput(model, src_op.inputs[0]); - if (providing_op->type == OperatorType::kTensorFlowReshape) { + if (providing_op != nullptr && + providing_op->type == OperatorType::kTensorFlowReshape) { softmax_input = src_op.inputs[0]; } else { // Insert a reshape operator that reduces the dimensions down to the 2 that diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc b/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc index 88e59664ec4..ab943f72d1d 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc @@ -68,12 +68,7 @@ bool FuseActivationFunctions::Run(Model* model, std::size_t op_index) { return false; } - // TODO(b/72172404): Great many ops don't support activation function - // fusing. Switch to a categorizing function instead. 
- if (op->type == OperatorType::kConcatenation || - op->type == OperatorType::kSlice || - op->type == OperatorType::kTensorFlowReshape || - op->type == OperatorType::kTensorFlowSplit) { + if (!OperatorSupportsFusedActivation(op->type)) { AddMessageF( "Not fusing activation function because the %s op doesn't support it", LogName(*op)); diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h index e11bebcd4e0..cf90ebe9969 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h +++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h @@ -144,6 +144,7 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveConstantUnaryOperator) DECLARE_GRAPH_TRANSFORMATION(CreateIm2colArrays) DECLARE_GRAPH_TRANSFORMATION(DropIm2colArrays) DECLARE_GRAPH_TRANSFORMATION(ReadFakeQuantMinMax) +DECLARE_GRAPH_TRANSFORMATION(ReorderActivationFunctions) DECLARE_GRAPH_TRANSFORMATION(ResolveReorderAxes) DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowConcat) DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowMatMul) diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc index 082820fddcf..c363b93394f 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc @@ -16,7 +16,6 @@ limitations under the License. #include #include -#include "absl/strings/string_view.h" #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h" #include "tensorflow/contrib/lite/toco/model.h" #include "tensorflow/contrib/lite/toco/tooling_util.h" @@ -202,23 +201,6 @@ bool MatchOperatorInputs(const Operator& op, const Model& model, return true; } -absl::string_view FindLongestCommonPrefix(absl::string_view a, - absl::string_view b) { - if (a.empty() || b.empty()) return absl::string_view(); - - const char* pa = a.data(); - const char* pb = b.data(); - size_t count = 0; - const ssize_t limit = std::min(a.size(), b.size()); - while (count < limit && *pa == *pb) { - ++pa; - ++pb; - ++count; - } - - return absl::string_view(a.data(), count); -} - } // namespace bool IdentifyLstmCell::Run(Model* model, std::size_t op_index) { diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc index 4fb3b6ae7a5..7f26884bc15 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc @@ -1120,7 +1120,8 @@ void ProcessStridedSliceOperator(Model* model, StridedSliceOperator* op) { stop += input_array.shape().dims(i); } - int dim_size = (stop - start) / op->strides[i]; + int dim_size = ceil((stop - start) / static_cast(op->strides[i])); + dim_size = dim_size < 0 ? 0 : dim_size; if (op->shrink_axis_mask & mask) { CHECK_EQ(dim_size, 1) << "Output size for an axis must compute to 1 when " "shrinking that axis"; diff --git a/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc b/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc new file mode 100644 index 00000000000..cabbc4d313b --- /dev/null +++ b/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc @@ -0,0 +1,85 @@ +/* Copyright 2017 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include +#include +#include + +#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h" +#include "tensorflow/contrib/lite/toco/model.h" +#include "tensorflow/contrib/lite/toco/runtime/types.h" +#include "tensorflow/contrib/lite/toco/tooling_util.h" +#include "tensorflow/core/platform/logging.h" + +namespace toco { + +bool ReorderActivationFunctions::Run(Model* model, std::size_t op_index) { + const auto ac_it = model->operators.begin() + op_index; + std::unique_ptr& ac_op = *ac_it; + DCHECK(ac_op); + + if (ac_op->type != OperatorType::kRelu6 && + ac_op->type != OperatorType::kRelu1 && + ac_op->type != OperatorType::kRelu) { + return false; + } + + auto exchange_it = FindOpWithOutput(*model, ac_op->inputs[0]); + if (exchange_it == model->operators.end()) return false; + // Find the op producing the array passed to this activation function + std::unique_ptr& exchange_op = *exchange_it; + DCHECK(exchange_op); + + if (exchange_op->type != OperatorType::kTensorFlowReshape) { + return false; + } + + DCHECK_EQ(exchange_op->outputs[0], ac_op->inputs[0]); + const auto& exchange_op_input = exchange_op->inputs[0]; + const auto& intermediate_array = exchange_op->outputs[0]; + const auto& ac_op_output = ac_op->outputs[0]; + + int count_ops_consuming_output = + CountOpsWithInput(*model, intermediate_array); + DCHECK_GE(count_ops_consuming_output, 1); + if (count_ops_consuming_output > 1) { + AddMessageF( + "Not exchanging activation function with %s because it is consumed by " + "more than 1 other operator", + LogName(*exchange_op)); + return false; + } + + // Rewire by changing inputs, including all consumers. + Operator* consumer = GetFirstOpWithInput(*model, ac_op_output); + while (consumer) { + for (int i = 0; i < consumer->inputs.size(); ++i) { + if (consumer->inputs[i] == ac_op_output) { + consumer->inputs[i] = intermediate_array; + } + } + consumer = GetFirstOpWithInput(*model, ac_op_output); + } + ac_op->inputs[0] = exchange_op_input; + exchange_op->inputs[0] = ac_op_output; + + // Finally, reorder operators. Note that this only works when there are no + // other direct descendents of the exchange_op. + ac_op.swap(exchange_op); + + return true; +} + +} // namespace toco diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc index 5ac449749ad..db68968bad1 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc @@ -73,7 +73,7 @@ void CopyTensorSegments(const std::vector& input_arrays, // Receives a series of input arrays of type Array and an integer showing the // axis on which those arrays will be concatenated. It returns the concatenated -// arrray. +// array. 
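The ReorderActivationFunctions pass added above hoists a Relu/Relu1/Relu6 over a preceding Reshape so the activation ends up adjacent to an operator that can actually absorb it (compare the OperatorSupportsFusedActivation helper introduced in tooling_util further down in this patch). The swap is only sound because these activations are elementwise and a reshape never changes element values, so the two operations commute. A minimal standalone sketch of that property, deliberately independent of toco's types:

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Elementwise ReLU6: the same math regardless of the tensor's shape.
std::vector<float> Relu6(std::vector<float> values) {
  for (float& v : values) v = std::min(6.0f, std::max(0.0f, v));
  return values;
}

// A "reshape" only reinterprets the shape metadata; the flat buffer of
// element values is untouched.
std::vector<float> Reshape(const std::vector<float>& values) { return values; }

int main() {
  const std::vector<float> data = {-3.0f, 0.5f, 7.0f, 2.0f};
  // ReLU6 applied after the reshape equals ReLU6 applied before it, which is
  // what makes it legal for the pass to swap the two operators.
  assert(Relu6(Reshape(data)) == Reshape(Relu6(data)));
  return 0;
}
```

Note that the pass still refuses to swap when the intermediate array feeds more than one consumer, since rewiring it would silently change those other consumers' inputs.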
template void ConcatenateTensorBuffers(const std::vector& input_arrays, int concatenation_axis, diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc index ca378af4c5c..9862dbe99d5 100644 --- a/tensorflow/contrib/lite/toco/import_tensorflow.cc +++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc @@ -173,7 +173,8 @@ void ImportFloatArray(const TensorProto& input_tensor, Array* output_array) { } auto& output_float_data = output_array->GetMutableBuffer().data; - output_float_data.resize(input_flat_size); + output_float_data.resize(RequiredBufferSizeForShape(output_array->shape()), + 0.f); if (input_tensor.float_val_size() == 1) { for (int i = 0; i < input_flat_size; i++) { output_float_data[i] = input_tensor.float_val(0); @@ -203,7 +204,7 @@ void ImportQuint8Array(const TensorProto& input_tensor, Array* output_array) { } auto& output_int_data = output_array->GetMutableBuffer().data; - output_int_data.resize(input_flat_size); + output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0); if (input_tensor.int_val_size()) { for (int i = 0; i < input_tensor.int_val_size(); i++) { output_int_data[i] = input_tensor.int_val(i); @@ -229,7 +230,7 @@ void ImportInt32Array(const TensorProto& input_tensor, Array* output_array) { } auto& output_int_data = output_array->GetMutableBuffer().data; - output_int_data.resize(input_flat_size); + output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0); if (input_tensor.int_val_size()) { for (int i = 0; i < input_tensor.int_val_size(); i++) { output_int_data[i] = input_tensor.int_val(i); @@ -255,7 +256,7 @@ void ImportInt64Array(const TensorProto& input_tensor, Array* output_array) { } auto& output_int_data = output_array->GetMutableBuffer().data; - output_int_data.resize(input_flat_size); + output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0); if (input_tensor.int64_val_size()) { for (int i = 0; i < input_tensor.int64_val_size(); i++) { output_int_data[i] = input_tensor.int64_val(i); @@ -281,7 +282,7 @@ void ImportStringArray(const TensorProto& input_tensor, Array* output_array) { } auto& output_string_data = output_array->GetMutableBuffer().data; - output_string_data.resize(input_flat_size); + output_string_data.resize(RequiredBufferSizeForShape(output_array->shape())); if (input_flat_size != input_tensor.string_val_size()) { LOG(FATAL) << "Input_content string_val doesn't have the right " "dimensions for this string tensor."; diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h index d1af371fd4c..6fba8f2629f 100644 --- a/tensorflow/contrib/lite/toco/model.h +++ b/tensorflow/contrib/lite/toco/model.h @@ -15,6 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_ #define TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_ +#include #include #include #include diff --git a/tensorflow/contrib/lite/toco/tensorflow_util.cc b/tensorflow/contrib/lite/toco/tensorflow_util.cc index 82e2800ca2f..0e7e9c41a06 100644 --- a/tensorflow/contrib/lite/toco/tensorflow_util.cc +++ b/tensorflow/contrib/lite/toco/tensorflow_util.cc @@ -51,7 +51,8 @@ void LogDumpGraphDef(int log_level, const string& message, BEGIN DUMP OF TENSORFLOW GRAPHDEF (%s) There are %d nodes. 
There are %zu different op types: -)MSG", message, tf_graph.node_size(), ops.size()); +)MSG", + message, tf_graph.node_size(), ops.size()); for (const auto& op : ops) { toco::port::AppendF(&dump, " %s\n", op); } @@ -63,7 +64,8 @@ PROTO DUMP BEGIN NODE: name = %s op = %s inputs = [ -)MSG", node.name(), node.op()); +)MSG", + node.name(), node.op()); for (const auto& input : node.input()) { toco::port::AppendF(&dump, " %s\n", input); } diff --git a/tensorflow/contrib/lite/toco/tflite/BUILD b/tensorflow/contrib/lite/toco/tflite/BUILD index 72c92665644..a2b8145a672 100644 --- a/tensorflow/contrib/lite/toco/tflite/BUILD +++ b/tensorflow/contrib/lite/toco/tflite/BUILD @@ -117,6 +117,7 @@ cc_library( ":types", "//tensorflow/contrib/lite/schema:schema_fbs", "//tensorflow/contrib/lite/toco:model", + "//tensorflow/contrib/lite/toco:tooling_util", "@flatbuffers", ], ) diff --git a/tensorflow/contrib/lite/toco/tflite/export.cc b/tensorflow/contrib/lite/toco/tflite/export.cc index 391ef87029d..27719599708 100644 --- a/tensorflow/contrib/lite/toco/tflite/export.cc +++ b/tensorflow/contrib/lite/toco/tflite/export.cc @@ -26,6 +26,9 @@ namespace toco { namespace tflite { +using flatbuffers::FlatBufferBuilder; +using flatbuffers::Offset; +using flatbuffers::Vector; using ::tflite::Buffer; using ::tflite::BuiltinOperator; using ::tflite::BuiltinOperator_CUSTOM; @@ -39,9 +42,6 @@ using ::tflite::Operator; using ::tflite::OperatorCode; using ::tflite::SubGraph; using ::tflite::Tensor; -using flatbuffers::FlatBufferBuilder; -using flatbuffers::Offset; -using flatbuffers::Vector; namespace { diff --git a/tensorflow/contrib/lite/toco/tflite/import.cc b/tensorflow/contrib/lite/toco/tflite/import.cc index bbf201fd288..5b1ab514b23 100644 --- a/tensorflow/contrib/lite/toco/tflite/import.cc +++ b/tensorflow/contrib/lite/toco/tflite/import.cc @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/contrib/lite/schema/schema_generated.h" #include "tensorflow/contrib/lite/toco/tflite/operator.h" #include "tensorflow/contrib/lite/toco/tflite/types.h" +#include "tensorflow/contrib/lite/toco/tooling_util.h" namespace toco { @@ -119,8 +120,16 @@ void ImportOperators( auto inputs = input_op->inputs(); for (int i = 0; i < inputs->Length(); i++) { auto input_index = inputs->Get(i); - const string& input_name = tensors_table.at(input_index); - op->inputs.push_back(input_name); + // input_index == -1 indicates optional tensor. 
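In the TFLite flatbuffer an operator input index of -1 marks an omitted optional tensor; the old importer indexed `tensors_table` with it directly and would throw. The branch that follows instead creates a placeholder optional array under an unused name. A simplified, self-contained sketch of that pattern (ResolveInputs and its name probing are illustrative stand-ins rather than toco's actual API, though the suffix probing mirrors the AvailableArrayName helper amended later in this patch):

```cpp
#include <iostream>
#include <set>
#include <string>
#include <vector>

// Maps flatbuffer tensor indices to array names, treating -1 as "optional
// input omitted" and materialising a fresh placeholder name for it.
std::vector<std::string> ResolveInputs(const std::vector<int>& indices,
                                       const std::vector<std::string>& tensors,
                                       std::set<std::string>* known_arrays) {
  std::vector<std::string> names;
  for (int index : indices) {
    if (index != -1) {
      names.push_back(tensors.at(index));
      continue;
    }
    // Probe numeric suffixes until the name is unused.
    std::string name = "OptionalTensor";
    for (int suffix = 0; known_arrays->count(name); ++suffix) {
      name = "OptionalTensor_" + std::to_string(suffix);
    }
    known_arrays->insert(name);
    names.push_back(name);
  }
  return names;
}

int main() {
  std::set<std::string> known = {"weights", "OptionalTensor"};
  for (const auto& name : ResolveInputs({0, -1, -1}, {"weights"}, &known)) {
    std::cout << name << "\n";  // weights, OptionalTensor_0, OptionalTensor_1
  }
  return 0;
}
```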
+ if (input_index != -1) { + const string& input_name = tensors_table.at(input_index); + op->inputs.push_back(input_name); + } else { + const string& tensor_name = + toco::AvailableArrayName(*model, "OptionalTensor"); + model->CreateOptionalArray(tensor_name); + op->inputs.push_back(tensor_name); + } } auto outputs = input_op->outputs(); for (int i = 0; i < outputs->Length(); i++) { diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc index 298f49025f9..461494fd99e 100644 --- a/tensorflow/contrib/lite/toco/tflite/operator.cc +++ b/tensorflow/contrib/lite/toco/tflite/operator.cc @@ -140,25 +140,11 @@ class SpaceToBatchND flatbuffers::Offset WriteOptions( const TocoOperator& op, flatbuffers::FlatBufferBuilder* builder) const override { - auto block_shape = builder->CreateVector(op.block_shape); - auto before_paddings = builder->CreateVector(op.before_paddings); - auto after_paddings = builder->CreateVector(op.after_paddings); - return ::tflite::CreateSpaceToBatchNDOptions( - *builder, block_shape, before_paddings, after_paddings); + return ::tflite::CreateSpaceToBatchNDOptions(*builder); } void ReadOptions(const TfLiteOptions& options, - TocoOperator* op) const override { - op->block_shape.insert(op->block_shape.end(), - options.block_shape()->begin(), - options.block_shape()->end()); - op->before_paddings.insert(op->before_paddings.end(), - options.before_paddings()->begin(), - options.before_paddings()->end()); - op->after_paddings.insert(op->after_paddings.end(), - options.after_paddings()->begin(), - options.after_paddings()->end()); - } + TocoOperator* op) const override {} }; class Sub : public BuiltinOperator WriteOptions( const TocoOperator& op, flatbuffers::FlatBufferBuilder* builder) const override { - auto block_shape = builder->CreateVector(op.block_shape); - auto before_crops = builder->CreateVector(op.before_crops); - auto after_crops = builder->CreateVector(op.after_crops); - return ::tflite::CreateBatchToSpaceNDOptions(*builder, block_shape, - before_crops, after_crops); + return ::tflite::CreateBatchToSpaceNDOptions(*builder); } void ReadOptions(const TfLiteOptions& options, - TocoOperator* op) const override { - op->block_shape.insert(op->block_shape.end(), - options.block_shape()->begin(), - options.block_shape()->end()); - op->before_crops.insert(op->before_crops.end(), - options.before_crops()->begin(), - options.before_crops()->end()); - op->after_crops.insert(op->after_crops.end(), - options.after_crops()->begin(), - options.after_crops()->end()); - } + TocoOperator* op) const override {} }; class Cast : public CustomOperator { @@ -478,8 +450,7 @@ class Pad : public BuiltinOperator WriteOptions( const TocoOperator& op, flatbuffers::FlatBufferBuilder* builder) const override { - return ::tflite::CreateTransposeOptions(*builder, - builder->CreateVector(op.perm)); + return ::tflite::CreateTransposeOptions(*builder); } void ReadOptions(const TfLiteOptions& options, - TocoOperator* op) const override { - op->perm.insert(op->perm.end(), options.perm()->begin(), - options.perm()->end()); - } + TocoOperator* op) const override {} }; class Mean : public BuiltinOperator WriteOptions( const TocoOperator& op, flatbuffers::FlatBufferBuilder* builder) const override { - auto axis = builder->CreateVector(op.axis); - return ::tflite::CreateMeanOptions(*builder, axis, op.keep_dims); + return ::tflite::CreateMeanOptions(*builder, op.keep_dims); } void ReadOptions(const TfLiteOptions& options, TocoOperator* op) const override { - 
op->axis.insert(op->axis.end(), options.axis()->begin(), - options.axis()->end()); op->keep_dims = options.keep_dims(); } }; diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc index 9036a16d1c9..6daa296282e 100644 --- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc +++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc @@ -119,40 +119,12 @@ TEST_F(OperatorTest, BuiltinAdd) { output_toco_op->fused_activation_function); } -TEST_F(OperatorTest, BuiltinSpaceToBatchND) { - SpaceToBatchNDOperator op; - op.block_shape = {2, 2}; - op.before_paddings = {1, 2}; - op.after_paddings = {3, 4}; - - auto output_toco_op = SerializeAndDeserialize( - GetOperator("SPACE_TO_BATCH_ND", OperatorType::kSpaceToBatchND), op); - EXPECT_EQ(op.block_shape, output_toco_op->block_shape); - EXPECT_EQ(op.before_paddings, output_toco_op->before_paddings); - EXPECT_EQ(op.after_paddings, output_toco_op->after_paddings); -} - -TEST_F(OperatorTest, BuiltinBatchToSpaceND) { - BatchToSpaceNDOperator op; - op.block_shape = {2, 2}; - op.before_crops = {1, 2}; - op.after_crops = {3, 4}; - - auto output_toco_op = SerializeAndDeserialize( - GetOperator("BATCH_TO_SPACE_ND", OperatorType::kBatchToSpaceND), op); - EXPECT_EQ(op.block_shape, output_toco_op->block_shape); - EXPECT_EQ(op.before_crops, output_toco_op->before_crops); - EXPECT_EQ(op.after_crops, output_toco_op->after_crops); -} - TEST_F(OperatorTest, BuiltinMean) { MeanOperator op; - op.axis = {1, 2}; op.keep_dims = false; auto output_toco_op = SerializeAndDeserialize(GetOperator("MEAN", OperatorType::kMean), op); - EXPECT_EQ(op.axis, output_toco_op->axis); EXPECT_EQ(op.keep_dims, output_toco_op->keep_dims); } @@ -370,15 +342,6 @@ TEST_F(OperatorTest, Svdf) { EXPECT_EQ(op.rank, output_toco_op->rank); } -TEST_F(OperatorTest, Transpose) { - TransposeOperator op; - op.perm = {0, 1, 2, 3}; - - auto output_toco_op = SerializeAndDeserialize( - GetOperator("TRANSPOSE", OperatorType::kTranspose), op); - EXPECT_EQ(op.perm, output_toco_op->perm); -} - TEST_F(OperatorTest, Squeeze) { SqueezeOperator op; op.squeeze_dims = {-2, -3, 4, 1, 4}; diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc index f8281f3a572..c5a62fdb620 100644 --- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc +++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc @@ -44,9 +44,11 @@ bool ParseTocoFlagsFromCommandLineFlags( "For Protobuf formats, the binary format will be used."), Flag("input_format", parsed_flags.input_format.bind(), parsed_flags.input_format.default_value(), - "Input file format. One of: tensorflow_graphdef, "), + "Input file format. One of: TENSORFLOW_GRAPHDEF, TFLITE."), Flag("output_format", parsed_flags.output_format.bind(), - parsed_flags.output_format.default_value(), "Output file format."), + parsed_flags.output_format.default_value(), + "Output file format. " + "One of TENSORFLOW_GRAPHDEF, TFLITE, GRAPHVIZ_DOT."), Flag("default_ranges_min", parsed_flags.default_ranges_min.bind(), parsed_flags.default_ranges_min.default_value(), "If defined, will be used as the default value for the min bound " @@ -58,11 +60,13 @@ bool ParseTocoFlagsFromCommandLineFlags( Flag("inference_type", parsed_flags.inference_type.bind(), parsed_flags.inference_type.default_value(), "Target data type of arrays in the output file (for input_arrays, " - "this may be overridden by inference_input_type)."), + "this may be overridden by inference_input_type). 
" + "One of FLOAT, QUANTIZED_UINT8."), Flag("inference_input_type", parsed_flags.inference_input_type.bind(), parsed_flags.inference_input_type.default_value(), - "Target data type of input arrays. If not specified, inference_type " - "is used."), + "Target data type of input arrays. " + "If not specified, inference_type is used. " + "One of FLOAT, QUANTIZED_UINT8."), Flag("input_type", parsed_flags.input_type.bind(), parsed_flags.input_type.default_value(), "Deprecated ambiguous flag that set both --input_data_types and " @@ -76,35 +80,31 @@ bool ParseTocoFlagsFromCommandLineFlags( Flag("drop_fake_quant", parsed_flags.drop_fake_quant.bind(), parsed_flags.drop_fake_quant.default_value(), - "Ignore and discard FakeQuant nodes. For instance, that can be used " - "to " + "Ignore and discard FakeQuant nodes. For instance, to " "generate plain float code without fake-quantization from a " - "quantized " - "graph."), + "quantized graph."), Flag( "reorder_across_fake_quant", parsed_flags.reorder_across_fake_quant.bind(), parsed_flags.reorder_across_fake_quant.default_value(), "Normally, FakeQuant nodes must be strict boundaries for graph " "transformations, in order to ensure that quantized inference has " - "the " - "exact same arithmetic behavior as quantized training --- which is " - "the " - "whole point of quantized training and of FakeQuant nodes in the " - "first " - "place. However, that entails subtle requirements on where exactly " + "the exact same arithmetic behavior as quantized training --- which " + "is the whole point of quantized training and of FakeQuant nodes in " + "the first place. " + "However, that entails subtle requirements on where exactly " "FakeQuant nodes must be placed in the graph. Some quantized graphs " "have FakeQuant nodes at unexpected locations, that prevent graph " "transformations that are necessary in order to generate inference " "code for these graphs. 
Such graphs should be fixed, but as a "
          "temporary work-around, setting this reorder_across_fake_quant flag "
-         "allows toco to perform necessary graph transformaitons on them, "
+         "allows TOCO to perform necessary graph transformations on them, "
          "at the cost of no longer faithfully matching inference and training "
          "arithmetic."),
      Flag("allow_custom_ops", parsed_flags.allow_custom_ops.bind(),
           parsed_flags.allow_custom_ops.default_value(),
-          "If true, allow TOCO to create TF Lite Custom operators for all the"
-          "unsupported Tensorflow ops."),
+          "If true, allow TOCO to create TF Lite Custom operators for all the "
+          "unsupported TensorFlow ops."),
      Flag(
          "drop_control_dependency",
          parsed_flags.drop_control_dependency.bind(),
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index 727df1cc76a..b715881774b 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -68,6 +68,7 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new ResolveTensorFlowMatMul);
   transformations->Add(new FuseBinaryIntoPrecedingAffine);
   transformations->Add(new FuseBinaryIntoFollowingAffine);
+  transformations->Add(new ReorderActivationFunctions);
   transformations->Add(new ResolveBatchNormalization);
   transformations->Add(new ResolveConstantBinaryOperator);
   transformations->Add(new ResolveConstantFill);
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 187c426a5b3..d0bf66c542b 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -33,6 +33,24 @@ limitations under the License.
 namespace toco {
+// Find the longest common prefix of two strings.
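The FindLongestCommonPrefix helper deleted from identify_lstm.cc earlier in this patch reappears here in tooling_util, with a matching declaration added to tooling_util.h below, so other code can reuse it; its loop bound also becomes an unsigned size_t. The definition continues immediately after this note. As a quick standalone restatement of its contract on sample inputs:

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <string>

// Same contract as the helper below: the result is the run of leading
// characters shared by both strings (empty if either input is empty).
std::string LongestCommonPrefix(const std::string& a, const std::string& b) {
  std::size_t count = 0;
  const std::size_t limit = std::min(a.size(), b.size());
  while (count < limit && a[count] == b[count]) ++count;
  return a.substr(0, count);
}

int main() {
  assert(LongestCommonPrefix("lstm_1/kernel", "lstm_1/bias") == "lstm_1/");
  assert(LongestCommonPrefix("conv", "pool").empty());
  assert(LongestCommonPrefix("", "anything").empty());
  return 0;
}
```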
+absl::string_view FindLongestCommonPrefix(absl::string_view a, + absl::string_view b) { + if (a.empty() || b.empty()) return absl::string_view(); + + const char* pa = a.data(); + const char* pb = b.data(); + size_t count = 0; + const size_t limit = std::min(a.size(), b.size()); + while (count < limit && *pa == *pb) { + ++pa; + ++pb; + ++count; + } + + return absl::string_view(a.data(), count); +} + string LogName(const Operator& op) { const string& opname = HelpfulOperatorTypeName(op); if (op.outputs.empty()) { @@ -286,6 +304,19 @@ string HelpfulOperatorTypeName(const Operator& op) { return OperatorTypeName(op.type); } +bool OperatorSupportsFusedActivation(OperatorType type) { + switch (type) { + case OperatorType::kConcatenation: + case OperatorType::kSlice: + case OperatorType::kSqueeze: + case OperatorType::kTensorFlowReshape: + case OperatorType::kTensorFlowSplit: + return false; + default: + return true; + } +} + void LogSummary(int log_level, const Model& model) { VLOG(log_level) << "Operators summary (" << model.operators.size() << " operators):"; @@ -1317,13 +1348,14 @@ bool IsAllocatableTransientArray(const Model& model, const string& array_name) { } string AvailableArrayName(const Model& model, const string& name) { - if (!model.HasArray(name) && !model.optional_arrays.count(name)) { + if (!model.HasArray(name) && !model.IsOptionalArray(name)) { return name; } const int kNumSuffixesToTry = 1000; for (int i = 0; i < kNumSuffixesToTry; i++) { const string& name_with_suffix = toco::port::StringF("%s_%d", name, i); - if (!model.HasArray(name_with_suffix)) { + if (!model.HasArray(name_with_suffix) && + !model.IsOptionalArray(name_with_suffix)) { return name_with_suffix; } } diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h index 2ac51c7e5bb..a7e77a02eb1 100644 --- a/tensorflow/contrib/lite/toco/tooling_util.h +++ b/tensorflow/contrib/lite/toco/tooling_util.h @@ -23,6 +23,7 @@ limitations under the License. 
#include #include +#include "absl/strings/string_view.h" #include "tensorflow/core/platform/logging.h" #if TOCO_SUPPORT_PORTABLE_PROTOS #include "third_party/protobuf/src/google/protobuf/text_format.h" @@ -49,6 +50,8 @@ namespace toco { constexpr int kLogLevelModelChanged = 1; constexpr int kLogLevelModelUnchanged = 2; +absl::string_view FindLongestCommonPrefix(absl::string_view a, + absl::string_view b); string LogName(const Operator& op); bool IsInputArray(const Model& model, const string& name); @@ -79,6 +82,8 @@ std::vector>::iterator FindOp(Model& model, const char* OperatorTypeName(OperatorType type); string HelpfulOperatorTypeName(const Operator& op); +bool OperatorSupportsFusedActivation(OperatorType type); + void DumpGraphvizVideoFrame(const Model& model); void LogDump(int log_level, const string& message, const Model& model); void LogSummary(int log_level, const string& message, const Model& model); diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD index 1bffcfb9873..6786b161845 100644 --- a/tensorflow/contrib/lite/tools/BUILD +++ b/tensorflow/contrib/lite/tools/BUILD @@ -99,8 +99,11 @@ cc_library( srcs = ["verifier.cc"], hdrs = ["verifier.h"], deps = [ + "//tensorflow/contrib/lite:framework", "//tensorflow/contrib/lite:schema_fbs_version", + "//tensorflow/contrib/lite:string_util", "//tensorflow/contrib/lite/schema:schema_fbs", + "@com_google_absl//absl/base:core_headers", ], ) @@ -112,8 +115,10 @@ cc_test( ":verifier", "//tensorflow/contrib/lite:framework", "//tensorflow/contrib/lite:schema_fbs_version", + "//tensorflow/contrib/lite:string_util", "//tensorflow/contrib/lite/schema:schema_fbs", "//tensorflow/contrib/lite/testing:util", + "//tensorflow/core:framework_lite", "@com_google_googletest//:gtest", "@flatbuffers", ], diff --git a/tensorflow/contrib/lite/tools/verifier.cc b/tensorflow/contrib/lite/tools/verifier.cc index 95a08953798..726e2aaa316 100644 --- a/tensorflow/contrib/lite/tools/verifier.cc +++ b/tensorflow/contrib/lite/tools/verifier.cc @@ -14,13 +14,32 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/contrib/lite/tools/verifier.h" +#include #include "tensorflow/contrib/lite/schema/schema_generated.h" +#include "tensorflow/contrib/lite/string_util.h" #include "tensorflow/contrib/lite/version.h" namespace tflite { namespace { +// Reports error message when the reporter is set. +void ReportError(ErrorReporter* error_reporter, const char* format, ...) { + if (error_reporter) { + va_list args; + va_start(args, format); + error_reporter->Report(format, args); + va_end(args); + } +} + +// Returns the int32_t value pointed by ptr. +const uint32_t* GetIntPtr(const char* ptr) { + return reinterpret_cast(ptr); +} + +// Verifies flatbuffer format of the model contents and returns the in-memory +// model. 
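verifier.cc now threads an ErrorReporter through every check so callers can learn why a model was rejected, and each helper stays silent when no reporter is supplied (one of the updated tests below passes a null reporter explicitly). VerifyFlatbufferAndGetModel, introduced by the comment just above, still performs the raw flatbuffer check directly below this note. A generic, self-contained sketch of the optional-reporter pattern (MiniReporter and StderrReporter are invented stand-ins, not TFLite classes; the real ErrorReporter::Report takes a printf-style format and arguments rather than a preformatted string):

```cpp
#include <cstdarg>
#include <cstdio>

// Hypothetical minimal reporter interface standing in for an error callback.
class MiniReporter {
 public:
  virtual ~MiniReporter() = default;
  virtual void Report(const char* message) = 0;
};

class StderrReporter : public MiniReporter {
 public:
  void Report(const char* message) override {
    std::fprintf(stderr, "verifier: %s\n", message);
  }
};

// Formats the message and forwards it only when a reporter was supplied,
// mirroring the null check in the new ReportError helper above.
void ReportError(MiniReporter* reporter, const char* format, ...) {
  if (reporter == nullptr) return;
  char buffer[256];
  va_list args;
  va_start(args, format);
  std::vsnprintf(buffer, sizeof(buffer), format, args);
  va_end(args);
  reporter->Report(buffer);
}

int main() {
  StderrReporter reporter;
  ReportError(&reporter, "Invalid tensor buffer index: %d", 2);
  ReportError(nullptr, "dropped silently");  // no reporter, no output
  return 0;
}
```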
const Model* VerifyFlatbufferAndGetModel(const void* buf, size_t len) { ::flatbuffers::Verifier verifier(static_cast(buf), len); if (VerifyModelBuffer(verifier)) { @@ -30,14 +49,159 @@ const Model* VerifyFlatbufferAndGetModel(const void* buf, size_t len) { } } -} // namespace +const uint32_t kMaxNumString = UINT_MAX / sizeof(int32_t) - 2; -bool Verify(const void* buf, size_t len) { - const Model* model = VerifyFlatbufferAndGetModel(buf, len); - if (model == nullptr) { +// Verifies string tensor has legit buffer contents that follow the schema +// defined in lite/string_util.h +bool VerifyStringTensorBuffer(const Buffer& buffer, + ErrorReporter* error_reporter) { + uint32_t buffer_size = buffer.data()->size(); + const char* buffer_ptr = reinterpret_cast(buffer.data()->data()); + + uint32_t num_strings = *GetIntPtr(buffer_ptr); + if (num_strings > kMaxNumString) { + ReportError(error_reporter, + "String tensor has invalid num of string set: %d", num_strings); + return false; + } + uint32_t header_offsets = + static_cast(num_strings + 2) * sizeof(int32_t); + + if (buffer_size < header_offsets) { + ReportError(error_reporter, + "String tensor buffer requires at least %d bytes, but is " + "allocated with %d bytes", + header_offsets, buffer_size); return false; } - return model->version() == TFLITE_SCHEMA_VERSION; + uint32_t prev_ptr = header_offsets; + uint32_t offset = sizeof(int32_t); + + if (*GetIntPtr(buffer_ptr + offset) != header_offsets) { + ReportError(error_reporter, + "String tensor buffer initial offset must be: %d", + header_offsets); + return false; + } + offset += sizeof(int32_t); + for (int i = 1; i <= num_strings; i++, offset += sizeof(int32_t)) { + int string_offset = *GetIntPtr(buffer_ptr + offset); + if (string_offset < prev_ptr || string_offset > buffer_size) { + ReportError(error_reporter, "String tensor buffer is invalid: index %d", + i); + return false; + } + } + if (*GetIntPtr(buffer_ptr + offset - sizeof(int32_t)) != buffer_size) { + ReportError(error_reporter, "String tensor buffer last offset must be %d", + buffer_size); + return false; + } + return true; +} + +// Verifies numeric tensor has legit buffer. +bool VerifyNumericTensorBuffer(const Tensor& tensor, const Buffer& buffer, + ErrorReporter* error_reporter) { + uint64_t bytes_required = 1; + for (int dim : *tensor.shape()) { + bytes_required *= dim; + if (bytes_required > UINT_MAX) { + ReportError(error_reporter, "Tensor dimension overflow"); + return false; + } + } + switch (tensor.type()) { + case TensorType_FLOAT32: + bytes_required *= sizeof(float); + break; + case TensorType_INT32: + bytes_required *= sizeof(int32_t); + break; + case TensorType_UINT8: + bytes_required *= sizeof(uint8_t); + break; + case TensorType_INT64: + bytes_required *= sizeof(int64_t); + break; + case TensorType_FLOAT16: + // FALLTHROUGH_INTENDED; + default: + ReportError(error_reporter, "Invalid tensor type: %d", tensor.type()); + return false; + } + if (bytes_required > UINT_MAX) { + ReportError(error_reporter, "Tensor dimension overflow"); + return false; + } + + if (bytes_required != buffer.data()->size()) { + ReportError( + error_reporter, + "Tensor requires %d bytes, but is allocated with %d bytes buffer", + bytes_required, buffer.data()->size()); + return false; + } + return true; + + // TODO(yichengfan): verify quantized tensors. +} + +// Verifies tensors have valid properties and legit buffer if set. 
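VerifyStringTensorBuffer above checks the string-tensor wire format from lite/string_util.h: a 32-bit string count, then count + 1 offsets whose first entry equals the header size and whose last entry equals the total buffer size, followed by the concatenated string bytes. A small sketch that packs strings into that layout (PackStringTensor is my own name; the byte values shown assume a little-endian host):

```cpp
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

// Packs strings into: [num_strings][offset_0 = header size]...[offset_n =
// total size][string bytes...], all counts and offsets as 32-bit integers.
std::vector<uint8_t> PackStringTensor(const std::vector<std::string>& strings) {
  const uint32_t num_strings = strings.size();
  const uint32_t header_size = (num_strings + 2) * sizeof(int32_t);
  std::vector<uint32_t> header = {num_strings, header_size};
  uint32_t end = header_size;
  for (const auto& s : strings) {
    end += s.size();
    header.push_back(end);
  }
  std::vector<uint8_t> buffer(end);
  std::memcpy(buffer.data(), header.data(), header_size);
  uint32_t pos = header_size;
  for (const auto& s : strings) {
    std::memcpy(buffer.data() + pos, s.data(), s.size());
    pos += s.size();
  }
  return buffer;
}

int main() {
  // PackStringTensor({"A", "BC"}) yields 2,0,0,0, 16,0,0,0, 17,0,0,0,
  // 19,0,0,0, 'A','B','C' -- the same bytes as the "data" tensor that
  // TestSimpleModel feeds the verifier later in this patch.
  for (uint8_t byte : PackStringTensor({"A", "BC"})) {
    std::cout << static_cast<int>(byte) << " ";
  }
  std::cout << "\n";
  return 0;
}
```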
+bool VerifyTensors(const Model& model, ErrorReporter* error_reporter) { + if (!model.subgraphs()) { + return true; + } + for (const auto& subgraph : *model.subgraphs()) { + if (!subgraph->tensors()) { + return true; + } + for (const auto& tensor : *subgraph->tensors()) { + if (!tensor->buffer()) { + return true; + } + if (tensor->buffer() >= model.buffers()->size()) { + ReportError(error_reporter, "Invalid tensor buffer index: %d", + tensor->buffer()); + return false; + } + auto* buffer = model.buffers()->Get(tensor->buffer()); + if (!buffer || !buffer->data()) { + ReportError(error_reporter, "Tensor buffer %d not set", + tensor->buffer()); + return false; + } + + if (tensor->type() == TensorType_STRING) { + if (!VerifyStringTensorBuffer(*buffer, error_reporter)) { + return false; + } + } else { + if (!VerifyNumericTensorBuffer(*tensor, *buffer, error_reporter)) { + return false; + } + } + } + } + return true; +} + +} // namespace + +bool Verify(const void* buf, size_t len, ErrorReporter* error_reporter) { + const Model* model = VerifyFlatbufferAndGetModel(buf, len); + if (model == nullptr) { + ReportError(error_reporter, "Invalid flatbuffer format"); + return false; + } + if (model->version() != TFLITE_SCHEMA_VERSION) { + ReportError(error_reporter, "Invalid model version %d", model->version()); + return false; + } + if (!VerifyTensors(*model, error_reporter)) { + return false; + } + return true; } } // namespace tflite diff --git a/tensorflow/contrib/lite/tools/verifier.h b/tensorflow/contrib/lite/tools/verifier.h index 03e1f22b7e8..d2bf3c91d54 100644 --- a/tensorflow/contrib/lite/tools/verifier.h +++ b/tensorflow/contrib/lite/tools/verifier.h @@ -18,13 +18,15 @@ limitations under the License. #include +#include "tensorflow/contrib/lite/error_reporter.h" + namespace tflite { // Verifies the integrity of a Tensorflow Lite flatbuffer model file. // Currently, it verifies: // * The file is following a legit flatbuffer schema. // * The model is in supported version. -bool Verify(const void* buf, size_t len); +bool Verify(const void* buf, size_t len, ErrorReporter* error_reporter); } // namespace tflite diff --git a/tensorflow/contrib/lite/tools/verifier_test.cc b/tensorflow/contrib/lite/tools/verifier_test.cc index 0481a55a78e..87f6854e9e6 100644 --- a/tensorflow/contrib/lite/tools/verifier_test.cc +++ b/tensorflow/contrib/lite/tools/verifier_test.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/contrib/lite/tools/verifier.h" +#include +#include + #include "flatbuffers/flatbuffers.h" #include "flatbuffers/util.h" #include @@ -20,7 +22,9 @@ limitations under the License. #include "tensorflow/contrib/lite/error_reporter.h" #include "tensorflow/contrib/lite/schema/schema_generated.h" #include "tensorflow/contrib/lite/testing/util.h" +#include "tensorflow/contrib/lite/tools/verifier.h" #include "tensorflow/contrib/lite/version.h" +#include "tensorflow/core/framework/numeric_types.h" namespace tflite { @@ -28,31 +32,62 @@ using flatbuffers::FlatBufferBuilder; using flatbuffers::Offset; using flatbuffers::Vector; -// Class that abstracts the list of buffers at the end of the TF Lite structure -class DeferredBufferWriter { +// Build single subgraph model. 
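Several of the new negative tests added below in verifier_test.cc exercise the byte-count arithmetic in VerifyNumericTensorBuffer: the product of the dimensions times the element size must both stay under the 32-bit limit and match the buffer length exactly. A standalone restatement of that arithmetic (RequiredBytes is an illustrative name; the verifier itself compares against UINT_MAX, for which the sketch uses UINT32_MAX):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Multiplies out the shape and element size, rejecting anything that would
// overflow a 32-bit byte count, as the verifier does.
bool RequiredBytes(const std::vector<int>& shape, uint64_t element_size,
                   uint64_t* bytes_required) {
  uint64_t bytes = 1;
  for (int dim : shape) {
    bytes *= dim;
    if (bytes > UINT32_MAX) return false;  // "Tensor dimension overflow"
  }
  bytes *= element_size;
  if (bytes > UINT32_MAX) return false;
  *bytes_required = bytes;
  return true;
}

int main() {
  uint64_t bytes = 0;
  // A {2, 3} UINT8 tensor needs 6 bytes, so the 4-byte buffer in
  // TestIntTensorShapeIsGreaterThanBuffer is rejected as too small.
  assert(RequiredBytes({2, 3}, sizeof(uint8_t), &bytes) && bytes == 6);
  // {1024, 2048, 4096} elements already exceeds the 32-bit guard, matching
  // TestIntTensorShapeOverflow.
  assert(!RequiredBytes({1024, 2048, 4096}, sizeof(uint8_t), &bytes));
  return 0;
}
```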
+class TfLiteFlatbufferModelBuilder { public: - DeferredBufferWriter() { - data_.push_back({}); // sentinel empty buffer. + TfLiteFlatbufferModelBuilder() { + buffers_.push_back( + CreateBuffer(builder_, builder_.CreateVector(std::vector{}))); } - Offset>> BuildBuffers(FlatBufferBuilder *builder) { - std::vector> buffer_vector; - for (const auto &vec : data_) { - auto data_buffer = builder->CreateVector(vec.data(), vec.size()); - buffer_vector.push_back(tflite::CreateBuffer(*builder, data_buffer)); + void AddTensor(const std::vector& shape, tflite::TensorType type, + const std::vector& buffer, const char* name) { + int buffer_index = 0; + if (!buffer.empty()) { + buffer_index = buffers_.size(); + buffers_.push_back(CreateBuffer(builder_, builder_.CreateVector(buffer))); } - return builder->CreateVector(buffer_vector); + tensors_.push_back(CreateTensorDirect(builder_, &shape, type, buffer_index, + name, /*quantization=*/0)); } - // Registers a buffer index and takes ownership of the data to write to it. - int Record(std::vector data) { - int buffer_index = data_.size(); - data_.emplace_back(std::move(data)); - return buffer_index; + void AddOperator(const std::vector& inputs, + const std::vector& outputs, + tflite::BuiltinOperator builtin_op, const char* custom_op) { + operator_codes_.push_back( + CreateOperatorCodeDirect(builder_, builtin_op, custom_op)); + operators_.push_back(CreateOperator( + builder_, operator_codes_.size() - 1, builder_.CreateVector(inputs), + builder_.CreateVector(outputs), BuiltinOptions_NONE, + /*builtin_options=*/0, + /*custom_options=*/0, tflite::CustomOptionsFormat_FLEXBUFFERS)); + } + + void FinishModel(const std::vector& inputs, + const std::vector& outputs) { + auto subgraph = std::vector>({CreateSubGraph( + builder_, builder_.CreateVector(tensors_), + builder_.CreateVector(inputs), builder_.CreateVector(outputs), + builder_.CreateVector(operators_), + builder_.CreateString("test_subgraph"))}); + auto result = CreateModel( + builder_, TFLITE_SCHEMA_VERSION, builder_.CreateVector(operator_codes_), + builder_.CreateVector(subgraph), builder_.CreateString("test_model"), + builder_.CreateVector(buffers_)); + tflite::FinishModelBuffer(builder_, result); + } + + bool Verify() { + return tflite::Verify(builder_.GetBufferPointer(), builder_.GetSize(), + DefaultErrorReporter()); } private: - std::vector> data_; + FlatBufferBuilder builder_; + std::vector> operators_; + std::vector> operator_codes_; + std::vector> tensors_; + std::vector> buffers_; }; TEST(VerifyModel, TestEmptyModel) { @@ -62,43 +97,26 @@ TEST(VerifyModel, TestEmptyModel) { /*description=*/0, /*buffers=*/0); ::tflite::FinishModelBuffer(builder, model); - ASSERT_TRUE(Verify(builder.GetBufferPointer(), builder.GetSize())); + ASSERT_TRUE(Verify(builder.GetBufferPointer(), builder.GetSize(), + DefaultErrorReporter())); } TEST(VerifyModel, TestSimpleModel) { - FlatBufferBuilder builder; - auto inputs = builder.CreateVector({0}); - auto outputs = builder.CreateVector({1}); - auto operator_codes = builder.CreateVector(std::vector>{ - CreateOperatorCodeDirect(builder, BuiltinOperator_CUSTOM, "test")}); - auto operators = - builder.CreateVector(std::vector>{CreateOperator( - builder, /*opcode_index=*/0, - /*inputs=*/builder.CreateVector({0}), - /*outputs=*/builder.CreateVector({1}), BuiltinOptions_NONE, - /*builtin_options=*/0, - /*custom_options=*/0, ::tflite::CustomOptionsFormat_FLEXBUFFERS)}); - std::vector shape; - auto tensors = builder.CreateVector(std::vector>{ - CreateTensorDirect(builder, &shape, 
TensorType_INT32, /*buffer=*/0, - "input", /*quantization=*/0), - CreateTensorDirect(builder, &shape, TensorType_INT32, /*buffer=*/0, - "output", /*quantization=*/0)}); - auto subgraph = std::vector>( - {CreateSubGraph(builder, tensors, inputs, outputs, operators, - builder.CreateString("Main"))}); - - auto model = CreateModel(builder, TFLITE_SCHEMA_VERSION, operator_codes, - builder.CreateVector(subgraph), - builder.CreateString("SmartReply"), /*buffers=*/0); - - ::tflite::FinishModelBuffer(builder, model); - ASSERT_TRUE(Verify(builder.GetBufferPointer(), builder.GetSize())); + TfLiteFlatbufferModelBuilder builder; + builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "test"); + builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input"); + builder.AddTensor( + {2}, TensorType_STRING, + {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'}, + "data"); + builder.AddTensor({2, 3}, TensorType_INT32, {}, "output"); + builder.FinishModel({0, 1}, {2}); + ASSERT_TRUE(builder.Verify()); } TEST(VerifyModel, TestCorruptedData) { - string model = "123"; - ASSERT_FALSE(Verify(model.data(), model.size())); + std::string model = "123"; + ASSERT_FALSE(Verify(model.data(), model.size(), /*error_reporter=*/nullptr)); } TEST(VerifyModel, TestUnsupportedVersion) { @@ -106,7 +124,8 @@ TEST(VerifyModel, TestUnsupportedVersion) { auto model = CreateModel(builder, /*version=*/1, /*operator_codes=*/0, /*subgraphs=*/0, /*description=*/0, /*buffers=*/0); ::tflite::FinishModelBuffer(builder, model); - ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize())); + ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(), + DefaultErrorReporter())); } TEST(VerifyModel, TestRandomModificationIsNotAllowed) { @@ -116,20 +135,105 @@ TEST(VerifyModel, TestRandomModificationIsNotAllowed) { /*subgraphs=*/0, /*description=*/0, /*buffers=*/0); ::tflite::FinishModelBuffer(builder, model); - string model_content(reinterpret_cast(builder.GetBufferPointer()), - builder.GetSize()); + std::string model_content(reinterpret_cast(builder.GetBufferPointer()), + builder.GetSize()); for (int i = 0; i < model_content.size(); i++) { model_content[i] = (model_content[i] + 137) % 255; - EXPECT_FALSE(Verify(model_content.data(), model_content.size())) + EXPECT_FALSE(Verify(model_content.data(), model_content.size(), + DefaultErrorReporter())) << "Fail at position: " << i; } } +TEST(VerifyModel, TestIntTensorShapeIsGreaterThanBuffer) { + TfLiteFlatbufferModelBuilder builder; + builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input"); + builder.FinishModel({}, {}); + ASSERT_FALSE(builder.Verify()); +} + +TEST(VerifyModel, TestIntTensorShapeIsSmallerThanBuffer) { + TfLiteFlatbufferModelBuilder builder; + builder.AddTensor({2, 1}, TensorType_UINT8, {1, 2, 3, 4}, "input"); + builder.FinishModel({}, {}); + ASSERT_FALSE(builder.Verify()); +} + +TEST(VerifyModel, TestIntTensorShapeOverflow) { + TfLiteFlatbufferModelBuilder builder; + builder.AddTensor({1024, 2048, 4096}, TensorType_UINT8, {1, 2, 3, 4}, + "input"); + builder.FinishModel({}, {}); + ASSERT_FALSE(builder.Verify()); +} + +TEST(VerifyModel, TensorBufferIsNotValid) { + FlatBufferBuilder builder; + std::vector shape = {2, 3}; + auto tensors = builder.CreateVector(std::vector>{ + CreateTensorDirect(builder, &shape, TensorType_INT32, /*buffer=*/2, + "input", /*quantization=*/0)}); + auto subgraph = std::vector>( + {CreateSubGraph(builder, tensors, /*inputs=*/0, /*outputs=*/0, + /*operators=*/0, builder.CreateString("Main"))}); + + auto 
buffers = builder.CreateVector(std::vector>{ + CreateBuffer(builder, + builder.CreateVector(std::vector{1, 2, 3, 4, 5, 6})), + }); + + auto model = CreateModel(builder, TFLITE_SCHEMA_VERSION, /*operator_codes=*/0, + builder.CreateVector(subgraph), + builder.CreateString("SmartReply"), buffers); + + ::tflite::FinishModelBuffer(builder, model); + ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(), + DefaultErrorReporter())); +} + +TEST(VerifyModel, StringTensorHasInvalidNumString) { + TfLiteFlatbufferModelBuilder builder; + builder.AddTensor( + {2}, TensorType_STRING, + {0x00, 0x00, 0x00, 0x20, 16, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B'}, + "input"); + builder.FinishModel({}, {}); + ASSERT_FALSE(builder.Verify()); +} + +TEST(VerifyModel, StringTensorOffsetTooSmall) { + TfLiteFlatbufferModelBuilder builder; + builder.AddTensor( + {2}, TensorType_STRING, + {2, 0, 0, 0, 12, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B'}, "input"); + builder.FinishModel({}, {}); + ASSERT_FALSE(builder.Verify()); +} + +TEST(VerifyModel, StringTensorOffsetOutOfRange) { + TfLiteFlatbufferModelBuilder builder; + builder.AddTensor( + {2}, TensorType_STRING, + {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 22, 0, 0, 0, 'A', 'B'}, "input"); + builder.FinishModel({}, {}); + ASSERT_FALSE(builder.Verify()); +} + +TEST(VerifyModel, StringTensorIsLargerThanRequired) { + TfLiteFlatbufferModelBuilder builder; + builder.AddTensor( + {2}, TensorType_STRING, + {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B', 'C'}, + "input"); + builder.FinishModel({}, {}); + ASSERT_FALSE(builder.Verify()); +} + // TODO(yichengfan): make up malicious files to test with. } // namespace tflite -int main(int argc, char **argv) { +int main(int argc, char** argv) { ::tflite::LogToStderr(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/tensorflow/contrib/lite/tools/visualize.py b/tensorflow/contrib/lite/tools/visualize.py index d0d78e3afab..f571dd59da0 100644 --- a/tensorflow/contrib/lite/tools/visualize.py +++ b/tensorflow/contrib/lite/tools/visualize.py @@ -198,10 +198,13 @@ class TensorMapper(object): def GenerateGraph(subgraph_idx, g, opcode_mapper): """Produces the HTML required to have a d3 visualization of the dag.""" + def TensorName(idx): - return "t%d"%idx + return "t%d" % idx + def OpName(idx): - return "o%d"%idx + return "o%d" % idx + edges = [] nodes = [] first = {} @@ -210,27 +213,35 @@ def GenerateGraph(subgraph_idx, g, opcode_mapper): for tensor_input_position, tensor_index in enumerate(op["inputs"]): if tensor_index not in first: first[tensor_index] = ( - op_index*pixel_mult, - tensor_input_position*pixel_mult - pixel_mult/2) - edges.append( - {"source": TensorName(tensor_index), "target": OpName(op_index)}) + op_index * pixel_mult, + tensor_input_position * pixel_mult - pixel_mult / 2) + edges.append({ + "source": TensorName(tensor_index), + "target": OpName(op_index) + }) for tensor_index in op["outputs"]: - edges.append( - {"target": TensorName(tensor_index), "source": OpName(op_index)}) - nodes.append({"id": OpName(op_index), - "name": opcode_mapper(op["opcode_index"]), - "group": 2, - "x": pixel_mult, - "y": op_index * pixel_mult}) + edges.append({ + "target": TensorName(tensor_index), + "source": OpName(op_index) + }) + nodes.append({ + "id": OpName(op_index), + "name": opcode_mapper(op["opcode_index"]), + "group": 2, + "x": pixel_mult, + "y": op_index * pixel_mult + }) for tensor_index, tensor in enumerate(g["tensors"]): - initial_y = (first[tensor_index] if tensor_index in 
first - else len(g["operators"])) + initial_y = ( + first[tensor_index] if tensor_index in first else len(g["operators"])) - nodes.append({"id": TensorName(tensor_index), - "name": "%s (%d)" % (tensor["name"], tensor_index), - "group": 1, - "x": 2, - "y": initial_y}) + nodes.append({ + "id": TensorName(tensor_index), + "name": "%s (%d)" % (tensor["name"], tensor_index), + "group": 1, + "x": 2, + "y": initial_y + }) graph_str = json.dumps({"nodes": nodes, "edges": edges}) html = _D3_HTML_TEMPLATE % (graph_str, subgraph_idx) @@ -267,7 +278,7 @@ def GenerateTableHtml(items, keys_to_print, display_index=True): for h, mapper in keys_to_print: val = tensor[h] if h in tensor else None val = val if mapper is None else mapper(val) - html += "%s\n"%val + html += "%s\n" % val html += "\n" html += "\n" @@ -279,18 +290,19 @@ def CreateHtmlFile(tflite_input, html_output): # Convert the model into a JSON flatbuffer using flatc (build if doesn't # exist. - if not os.path.exists(tflite_input): + if not os.path.exists(tflite_input): raise RuntimeError("Invalid filename %r" % tflite_input) if tflite_input.endswith(".tflite") or tflite_input.endswith(".bin"): # Run convert - cmd = (_BINARY + " -t " - "--strict-json --defaults-json -o /tmp {schema} -- {input}".format( - input=tflite_input, schema=_SCHEMA)) + cmd = ( + _BINARY + " -t " + "--strict-json --defaults-json -o /tmp {schema} -- {input}".format( + input=tflite_input, schema=_SCHEMA)) print(cmd) os.system(cmd) - real_output = ("/tmp/"+ os.path.splitext(os.path.split(tflite_input)[-1])[0] - + ".json") + real_output = ("/tmp/" + os.path.splitext( + os.path.split(tflite_input)[-1])[0] + ".json") data = json.load(open(real_output)) elif tflite_input.endswith(".json"): @@ -302,12 +314,13 @@ def CreateHtmlFile(tflite_input, html_output): html += "

TensorFlow Lite Model

" data["filename"] = tflite_input # Avoid special case - toplevel_stuff = [("filename", None), ("version", None), - ("description", None)] + toplevel_stuff = [("filename", None), ("version", None), ("description", + None)] html += "\n" for key, mapping in toplevel_stuff: - if not mapping: mapping = lambda x: x + if not mapping: + mapping = lambda x: x html += "\n" % (key, mapping(data[key])) html += "
%s%s
\n" @@ -320,22 +333,22 @@ def CreateHtmlFile(tflite_input, html_output): html += "
" tensor_mapper = TensorMapper(g) opcode_mapper = OpCodeMapper(data) - op_keys_to_display = [ - ("inputs", tensor_mapper), ("outputs", tensor_mapper), - ("builtin_options", None), ("opcode_index", opcode_mapper)] - tensor_keys_to_display = [ - ("name", None), ("type", None), ("shape", None), ("buffer", None), - ("quantization", None)] + op_keys_to_display = [("inputs", tensor_mapper), ("outputs", tensor_mapper), + ("builtin_options", None), ("opcode_index", + opcode_mapper)] + tensor_keys_to_display = [("name", None), ("type", None), ("shape", None), + ("buffer", None), ("quantization", None)] html += "

Subgraph %d

\n" % subgraph_idx # Inputs and outputs. html += "

Inputs/Outputs

\n" - html += GenerateTableHtml([{"inputs": g["inputs"], - "outputs": g["outputs"]}], - [("inputs", tensor_mapper), - ("outputs", tensor_mapper)], - display_index=False) + html += GenerateTableHtml( + [{ + "inputs": g["inputs"], + "outputs": g["outputs"] + }], [("inputs", tensor_mapper), ("outputs", tensor_mapper)], + display_index=False) # Print the tensors. html += "

Tensors

\n" @@ -357,8 +370,7 @@ def CreateHtmlFile(tflite_input, html_output): # Operator codes html += "

Operator Codes

\n" - html += GenerateTableHtml(data["operator_codes"], - operator_keys_to_display) + html += GenerateTableHtml(data["operator_codes"], operator_keys_to_display) html += "\n" @@ -370,10 +382,10 @@ def main(argv): tflite_input = argv[1] html_output = argv[2] except IndexError: - print ("Usage: %s " % (argv[0])) + print("Usage: %s " % (argv[0])) else: CreateHtmlFile(tflite_input, html_output) + if __name__ == "__main__": main(sys.argv) - diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py index 7c523ad4926..8c3a8afe7a0 100644 --- a/tensorflow/contrib/losses/python/losses/loss_ops.py +++ b/tensorflow/contrib/losses/python/losses/loss_ops.py @@ -30,20 +30,13 @@ from tensorflow.python.ops import nn_ops from tensorflow.python.util.deprecation import deprecated from tensorflow.python.util.deprecation import deprecated_args -__all__ = ["absolute_difference", - "add_loss", - "cosine_distance", - "compute_weighted_loss", - "get_losses", - "get_regularization_losses", - "get_total_loss", - "hinge_loss", - "log_loss", - "mean_pairwise_squared_error", - "mean_squared_error", - "sigmoid_cross_entropy", - "softmax_cross_entropy", - "sparse_softmax_cross_entropy"] +__all__ = [ + "absolute_difference", "add_loss", "cosine_distance", + "compute_weighted_loss", "get_losses", "get_regularization_losses", + "get_total_loss", "hinge_loss", "log_loss", "mean_pairwise_squared_error", + "mean_squared_error", "sigmoid_cross_entropy", "softmax_cross_entropy", + "sparse_softmax_cross_entropy" +] def _scale_losses(losses, weights): @@ -66,8 +59,8 @@ def _scale_losses(losses, weights): # First, compute the sum of the losses over all elements: start_index = max(0, weights.get_shape().ndims) reduction_indices = list(range(start_index, losses.get_shape().ndims)) - reduced_losses = math_ops.reduce_sum(losses, - reduction_indices=reduction_indices) + reduced_losses = math_ops.reduce_sum( + losses, reduction_indices=reduction_indices) reduced_losses = math_ops.multiply(reduced_losses, weights) return math_ops.reduce_sum(reduced_losses) @@ -90,9 +83,10 @@ def _safe_div(numerator, denominator, name="value"): """ return array_ops.where( math_ops.greater(denominator, 0), - math_ops.div(numerator, array_ops.where( - math_ops.equal(denominator, 0), - array_ops.ones_like(denominator), denominator)), + math_ops.div(numerator, + array_ops.where( + math_ops.equal(denominator, 0), + array_ops.ones_like(denominator), denominator)), array_ops.zeros_like(numerator), name=name) @@ -176,14 +170,15 @@ def _num_present(losses, weights, per_batch=False): """ # If weights is a scalar, its easy to compute: if weights.get_shape().ndims == 0: - batch_size = array_ops.reshape(array_ops.slice(array_ops.shape(losses), - [0], [1]), []) - num_per_batch = math_ops.div(math_ops.to_float(array_ops.size(losses)), - math_ops.to_float(batch_size)) - num_per_batch = array_ops.where(math_ops.equal(weights, 0), - 0.0, num_per_batch) - num_per_batch = math_ops.multiply(array_ops.ones( - array_ops.reshape(batch_size, [1])), num_per_batch) + batch_size = array_ops.reshape( + array_ops.slice(array_ops.shape(losses), [0], [1]), []) + num_per_batch = math_ops.div( + math_ops.to_float(array_ops.size(losses)), + math_ops.to_float(batch_size)) + num_per_batch = array_ops.where( + math_ops.equal(weights, 0), 0.0, num_per_batch) + num_per_batch = math_ops.multiply( + array_ops.ones(array_ops.reshape(batch_size, [1])), num_per_batch) return num_per_batch if per_batch else math_ops.reduce_sum(num_per_batch) 
# First, count the number of nonzero weights: @@ -194,8 +189,8 @@ def _num_present(losses, weights, per_batch=False): reduction_indices=reduction_indices) # Next, determine the number of elements that weights would broadcast to: - broadcast_dims = array_ops.slice(array_ops.shape(losses), - [weights.get_shape().ndims], [-1]) + broadcast_dims = array_ops.slice( + array_ops.shape(losses), [weights.get_shape().ndims], [-1]) num_to_broadcast = math_ops.to_float(math_ops.reduce_prod(broadcast_dims)) num_per_batch = math_ops.multiply(num_nonzero_per_batch, num_to_broadcast) @@ -303,8 +298,11 @@ def absolute_difference(predictions, labels=None, weights=1.0, scope=None): @deprecated("2016-12-30", "Use tf.losses.sigmoid_cross_entropy instead. Note that the order " "of the predictions and labels arguments has been changed.") -def sigmoid_cross_entropy( - logits, multi_class_labels, weights=1.0, label_smoothing=0, scope=None): +def sigmoid_cross_entropy(logits, + multi_class_labels, + weights=1.0, + label_smoothing=0, + scope=None): """Creates a cross-entropy loss using tf.nn.sigmoid_cross_entropy_with_logits. `weights` acts as a coefficient for the loss. If a scalar is provided, @@ -340,20 +338,22 @@ def sigmoid_cross_entropy( multi_class_labels = math_ops.cast(multi_class_labels, logits.dtype) if label_smoothing > 0: - multi_class_labels = (multi_class_labels * (1 - label_smoothing) + - 0.5 * label_smoothing) + multi_class_labels = ( + multi_class_labels * (1 - label_smoothing) + 0.5 * label_smoothing) - losses = nn.sigmoid_cross_entropy_with_logits(labels=multi_class_labels, - logits=logits, - name="xentropy") + losses = nn.sigmoid_cross_entropy_with_logits( + labels=multi_class_labels, logits=logits, name="xentropy") return compute_weighted_loss(losses, weights, scope=scope) @deprecated("2016-12-30", "Use tf.losses.softmax_cross_entropy instead. Note that the order " "of the logits and labels arguments has been changed.") -def softmax_cross_entropy( - logits, onehot_labels, weights=1.0, label_smoothing=0, scope=None): +def softmax_cross_entropy(logits, + onehot_labels, + weights=1.0, + label_smoothing=0, + scope=None): """Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits. `weights` acts as a coefficient for the loss. 
If a scalar is provided, @@ -393,9 +393,8 @@ def softmax_cross_entropy( smooth_negatives = label_smoothing / num_classes onehot_labels = onehot_labels * smooth_positives + smooth_negatives - losses = nn.softmax_cross_entropy_with_logits(labels=onehot_labels, - logits=logits, - name="xentropy") + losses = nn.softmax_cross_entropy_with_logits( + labels=onehot_labels, logits=logits, name="xentropy") return compute_weighted_loss(losses, weights, scope=scope) @@ -429,9 +428,8 @@ def sparse_softmax_cross_entropy(logits, labels, weights=1.0, scope=None): [logits, labels, weights]) as scope: labels = array_ops.reshape(labels, shape=[array_ops.shape(labels)[0]]) - losses = nn.sparse_softmax_cross_entropy_with_logits(labels=labels, - logits=logits, - name="xentropy") + losses = nn.sparse_softmax_cross_entropy_with_logits( + labels=labels, logits=logits, name="xentropy") return compute_weighted_loss(losses, weights, scope=scope) @@ -470,8 +468,7 @@ def log_loss(predictions, labels=None, weights=1.0, epsilon=1e-7, scope=None): predictions = math_ops.to_float(predictions) labels = math_ops.to_float(labels) losses = -math_ops.multiply( - labels, - math_ops.log(predictions + epsilon)) - math_ops.multiply( + labels, math_ops.log(predictions + epsilon)) - math_ops.multiply( (1 - labels), math_ops.log(1 - predictions + epsilon)) return compute_weighted_loss(losses, weights, scope=scope) @@ -490,7 +487,8 @@ def hinge_loss(logits, labels=None, scope=None): scope: The scope for the operations performed in computing the loss. Returns: - An unweighted `Tensor` of same shape as `logits` and `labels` representing the + An unweighted `Tensor` of same shape as `logits` and `labels` representing + the loss values across the batch. Raises: @@ -544,8 +542,10 @@ def mean_squared_error(predictions, labels=None, weights=1.0, scope=None): @deprecated("2016-12-30", "Use tf.losses.mean_pairwise_squared_error instead. Note that the " "order of the predictions and labels arguments has been changed.") -def mean_pairwise_squared_error( - predictions, labels=None, weights=1.0, scope=None): +def mean_pairwise_squared_error(predictions, + labels=None, + weights=1.0, + scope=None): """Adds a pairwise-errors-squared loss to the training procedure. 
Unlike `mean_squared_error`, which is a measure of the differences between @@ -602,31 +602,34 @@ def mean_pairwise_squared_error( reduction_indices = list(range(1, diffs.get_shape().ndims)) sum_squares_diff_per_batch = math_ops.reduce_sum( - math_ops.square(diffs), - reduction_indices=reduction_indices) + math_ops.square(diffs), reduction_indices=reduction_indices) num_present_per_batch = _num_present(diffs, weights, per_batch=True) - term1 = 2.0 * _safe_div(sum_squares_diff_per_batch, - num_present_per_batch) + term1 = 2.0 * _safe_div(sum_squares_diff_per_batch, num_present_per_batch) sum_diff = math_ops.reduce_sum(diffs, reduction_indices=reduction_indices) - term2 = 2.0 * _safe_div(math_ops.square(sum_diff), - math_ops.square(num_present_per_batch)) + term2 = 2.0 * _safe_div( + math_ops.square(sum_diff), math_ops.square(num_present_per_batch)) loss = _scale_losses(term1 - term2, weights) - mean_loss = array_ops.where(math_ops.reduce_sum(num_present_per_batch) > 0, - loss, - array_ops.zeros_like(loss), - name="value") + mean_loss = array_ops.where( + math_ops.reduce_sum(num_present_per_batch) > 0, + loss, + array_ops.zeros_like(loss), + name="value") add_loss(mean_loss) return mean_loss @deprecated("2016-12-30", "Use tf.losses.cosine_distance instead.") @deprecated_args(None, "dim is deprecated, use axis instead", "dim") -def cosine_distance( - predictions, labels=None, axis=None, weights=1.0, scope=None, dim=None): +def cosine_distance(predictions, + labels=None, + axis=None, + weights=1.0, + scope=None, + dim=None): """Adds a cosine-distance loss to the training procedure. Note that the function assumes that `predictions` and `labels` are already @@ -662,5 +665,8 @@ def cosine_distance( labels = math_ops.to_float(labels) radial_diffs = math_ops.multiply(predictions, labels) - losses = 1 - math_ops.reduce_sum(radial_diffs, reduction_indices=[axis,]) + losses = 1 - math_ops.reduce_sum( + radial_diffs, reduction_indices=[ + axis, + ]) return compute_weighted_loss(losses, weights, scope=scope) diff --git a/tensorflow/contrib/losses/python/losses/loss_ops_test.py b/tensorflow/contrib/losses/python/losses/loss_ops_test.py index 9d0f95e6f3e..1417772e049 100644 --- a/tensorflow/contrib/losses/python/losses/loss_ops_test.py +++ b/tensorflow/contrib/losses/python/losses/loss_ops_test.py @@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops from tensorflow.python.framework import random_seed +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops @@ -274,6 +275,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase): self.assertAlmostEqual(np.average(weights) * 10.0, loss, 3) +@test_util.with_c_api class SparseSoftmaxCrossEntropyLossTest(test.TestCase): def testNoneWeightRaisesValueError(self): @@ -471,7 +473,11 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase): labels = constant_op.constant([[0, 1], [2, 3]]) weights = constant_op.constant([1.2, 3.4, 5.6, 7.8]) - with self.assertRaises(errors_impl.InvalidArgumentError): + if ops._USE_C_API: + error_type = ValueError + else: + error_type = errors_impl.InvalidArgumentError + with self.assertRaises(error_type): loss_ops.sparse_softmax_cross_entropy( logits, labels, weights=weights).eval() diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile index c50f8ceec0a..81327407d44 100644 
--- a/tensorflow/contrib/makefile/Makefile +++ b/tensorflow/contrib/makefile/Makefile @@ -407,7 +407,7 @@ $(MARCH_OPTION) \ -I$(JETPACK)/cuda/extras/CUPTI/include - LIBS += \ + CUDA_LIBS := \ -ltfcuda \ -lcudart_static \ -lcudnn \ @@ -420,10 +420,10 @@ $(MARCH_OPTION) \ -lculibos \ -lcurand_static - OBJDIR := $(OBJDIR)Tegra/ - LIBDIR := $(LIBDIR)Tegra/ - BINDIR := $(BINDIR)Tegra/ - DEPDIR := $(DEPDIR)Tegra/ + OBJDIR := $(OBJDIR)android_arm64-v8a/ + LIBDIR := $(LIBDIR)android_arm64-v8a/ + BINDIR := $(BINDIR)android_arm64-v8a/ + DEPDIR := $(DEPDIR)android_arm64-v8a/ TEGRA_LIBS := \ -L$(JETPACK)/cuda/targets/aarch64-linux-androideabi/lib \ @@ -606,7 +606,8 @@ $(wildcard tensorflow/core/util/*/*.cc) \ tensorflow/core/util/version_info.cc # Remove duplicates (for version_info.cc) CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS)) -CORE_CC_EXCLUDE_SRCS := \ + +CORE_CC_EXCLUDE_SRCS_NON_GPU := \ $(wildcard tensorflow/core/*/*test.cc) \ $(wildcard tensorflow/core/*/*testutil*) \ $(wildcard tensorflow/core/*/*testlib*) \ @@ -626,49 +627,31 @@ $(wildcard tensorflow/core/lib/jpeg/*) \ $(wildcard tensorflow/core/lib/png/*) \ $(wildcard tensorflow/core/util/events_writer.*) \ $(wildcard tensorflow/core/util/reporter.*) \ -$(wildcard tensorflow/core/platform/default/cuda_libdevice_path.*) \ -$(wildcard tensorflow/core/platform/default/stream_executor.*) \ $(wildcard tensorflow/core/platform/default/test_benchmark.*) \ -$(wildcard tensorflow/core/platform/cuda.h) \ -$(wildcard tensorflow/core/platform/cuda_libdevice_path.*) \ $(wildcard tensorflow/core/platform/cloud/*) \ $(wildcard tensorflow/core/platform/google/*) \ $(wildcard tensorflow/core/platform/google/*/*) \ $(wildcard tensorflow/core/platform/jpeg.*) \ $(wildcard tensorflow/core/platform/png.*) \ $(wildcard tensorflow/core/platform/s3/*) \ -$(wildcard tensorflow/core/platform/stream_executor.*) \ $(wildcard tensorflow/core/platform/windows/*) \ -$(wildcard tensorflow/core/user_ops/*.cu.cc) \ -$(wildcard tensorflow/core/common_runtime/gpu/*) \ -$(wildcard tensorflow/core/common_runtime/gpu_device_factory.*) \ $(wildcard tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.*) \ $(wildcard tensorflow/core/grappler/inputs/file_input_yielder.*) \ -$(wildcard tensorflow/core/grappler/clusters/single_machine.*) +$(wildcard tensorflow/core/grappler/clusters/single_machine.*) \ +tensorflow/core/util/cuda_kernel_helper_test.cu.cc + +CORE_CC_EXCLUDE_SRCS := \ +$(CORE_CC_EXCLUDE_SRCS_NON_GPU) \ +$(wildcard tensorflow/core/platform/stream_executor.*) \ +$(wildcard tensorflow/core/platform/default/cuda_libdevice_path.*) \ +$(wildcard tensorflow/core/platform/cuda.h) \ +$(wildcard tensorflow/core/platform/cuda_libdevice_path.*) \ +$(wildcard tensorflow/core/user_ops/*.cu.cc) \ +$(wildcard tensorflow/core/common_runtime/gpu/*) \ +$(wildcard tensorflow/core/common_runtime/gpu_device_factory.*) ifeq ($(BUILD_FOR_TEGRA),1) -CORE_CC_ALL_SRCS := \ -$(wildcard tensorflow/core/*.cc) \ -$(wildcard tensorflow/core/common_runtime/*.cc) \ -$(wildcard tensorflow/core/common_runtime/gpu/*.cc) \ -$(wildcard tensorflow/core/framework/*.cc) \ -$(wildcard tensorflow/core/graph/*.cc) \ -$(wildcard tensorflow/core/platform/*.cc) \ -$(wildcard tensorflow/core/platform/*/*.cc) \ -$(wildcard tensorflow/core/platform/*/*/*.cc) \ -$(wildcard tensorflow/core/util/*.cc) \ -$(wildcard tensorflow/core/util/*/*.cc) \ -$(wildcard tensorflow/cc/training/*.cc) \ -$(wildcard tensorflow/stream_executor/*.cc) \ -$(wildcard tensorflow/stream_executor/*/*.cc) \ -$(wildcard 
tensorflow/core/grappler/optimizers/*.cc) \ -$(wildcard tensorflow/core/grappler/*.cc) \ -$(wildcard tensorflow/core/grappler/costs/*.cc) \ -$(wildcard tensorflow/core/grappler/clusters/*.cc) \ -$(wildcard tensorflow/core/grappler/utils/*.cc) \ -$(wildcard tensorflow/core/lib/core/*.cc) \ -$(wildcard tensorflow/core/lib/*/*.cc) \ -tensorflow/core/grappler/inputs/utils.cc \ +CORE_CC_ALL_SRCS := $(CORE_CC_ALL_SRCS) \ tensorflow/core/kernels/concat_lib_gpu.cc \ tensorflow/core/kernels/cuda_solvers.cc \ tensorflow/core/kernels/cudnn_pooling_gpu.cc \ @@ -677,28 +660,14 @@ tensorflow/core/kernels/fractional_avg_pool_op.cc \ tensorflow/core/kernels/fractional_max_pool_op.cc \ tensorflow/core/kernels/fractional_pool_common.cc \ tensorflow/core/kernels/pooling_ops_3d.cc \ -tensorflow/core/kernels/sparse_fill_empty_rows_op.cc +tensorflow/core/kernels/sparse_fill_empty_rows_op.cc \ +tensorflow/core/kernels/list_kernels.cc \ +$(wildcard tensorflow/core/common_runtime/gpu/*.cc) \ +$(wildcard tensorflow/stream_executor/*.cc) \ +$(wildcard tensorflow/stream_executor/*/*.cc) CORE_CC_EXCLUDE_SRCS := \ -$(wildcard tensorflow/core/*/*test.cc) \ -$(wildcard tensorflow/core/*/*testutil*) \ -$(wildcard tensorflow/core/*/*testlib*) \ -$(wildcard tensorflow/core/*/*/*test.cc) \ -$(wildcard tensorflow/core/*/*/*testutil*) \ -$(wildcard tensorflow/core/framework/op_gen_lib.cc) \ -$(wildcard tensorflow/core/lib/gif/*) \ -$(wildcard tensorflow/core/lib/jpeg/*) \ -$(wildcard tensorflow/core/lib/png/*) \ -$(wildcard tensorflow/core/lib/db/*) \ -$(wildcard tensorflow/core/platform/jpeg.*) \ -$(wildcard tensorflow/core/platform/png.*) \ -$(wildcard tensorflow/core/platform/cloud/*) \ -$(wildcard tensorflow/core/platform/s3/*) \ -$(wildcard tensorflow/core/platform/windows/*) \ -$(wildcard tensorflow/core/*/*/*testlib*) \ -$(wildcard tensorflow/cc/training/*test.cc) \ -tensorflow/core/lib/io/record_reader.cc \ -tensorflow/core/util/cuda_kernel_helper_test.cu.cc +$(CORE_CC_EXCLUDE_SRCS_NON_GPU) CUDA_CC_SRCS := $(wildcard tensorflow/core/kernels/*.cu.cc) CUDA_CC_OBJS := $(addprefix $(OBJDIR), $(CUDA_CC_SRCS:.cc=.o)) @@ -760,7 +729,7 @@ $(BENCHMARK_NAME): $(BENCHMARK_OBJS) $(LIB_PATH) $(CUDA_LIB_DEPS) @mkdir -p $(dir $@) $(CXX) $(CXXFLAGS) $(INCLUDES) \ -o $(BENCHMARK_NAME) $(BENCHMARK_OBJS) \ - $(LIBFLAGS) $(TEGRA_LIBS) $(LIB_PATH) $(LDFLAGS) $(LIBS) + $(LIBFLAGS) $(TEGRA_LIBS) $(LIB_PATH) $(LDFLAGS) $(LIBS) $(CUDA_LIBS) # NVCC compilation rules for Tegra ifeq ($(BUILD_FOR_TEGRA),1) diff --git a/tensorflow/contrib/makefile/build_all_android.sh b/tensorflow/contrib/makefile/build_all_android.sh index 281c4653c62..f67c5161861 100755 --- a/tensorflow/contrib/makefile/build_all_android.sh +++ b/tensorflow/contrib/makefile/build_all_android.sh @@ -37,7 +37,7 @@ fi ARCH=armeabi-v7a -while getopts "Es:t:Tx:a" opt_name; do +while getopts "Es:t:Tx:a:" opt_name; do case "$opt_name" in E) ENABLE_EXPERIMENTAL_HEXNN_OPS="true";; s) SUB_MAKEFILES="${OPTARG}";; diff --git a/tensorflow/contrib/makefile/build_all_ios.sh b/tensorflow/contrib/makefile/build_all_ios.sh index a18df256f97..2d997918397 100755 --- a/tensorflow/contrib/makefile/build_all_ios.sh +++ b/tensorflow/contrib/makefile/build_all_ios.sh @@ -96,7 +96,7 @@ if [[ "${ONLY_MAKE_TENSORFLOW}" != "true" ]]; then if [[ -z "${BUILD_ARCH}" ]]; then # Compile protobuf for the target iOS device architectures. 
- tensorflow/contrib/makefile/compile_ios_protobuf.sh -a ${DEFAULT_ARCH} + tensorflow/contrib/makefile/compile_ios_protobuf.sh else # Compile protobuf for the target iOS device architectures. tensorflow/contrib/makefile/compile_ios_protobuf.sh -a ${BUILD_ARCH} diff --git a/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in b/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in index d9277ed60cb..3081084ee76 100644 --- a/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in +++ b/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in @@ -54,7 +54,7 @@ $(INFERENCE_SO_PATH): $(LIB_OBJS) $(INFERENCE_OBJS) $(CUDA_LIB_DEPS) -o $@ $(INFERENCE_OBJS) $(LIB_OBJS) $(TEGRA_LIBS) \ $(LIBFLAGS) $(LDFLAGS) \ -shared -Wl,-soname,$(INFERENCE_SO_NAME) \ - $(LIBS) + $(LIBS) $(CUDA_LIBS) $(INFERENCE_SO_NAME): $(INFERENCE_SO_PATH) diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt index 5f275663986..5a812af4e95 100644 --- a/tensorflow/contrib/makefile/tf_op_files.txt +++ b/tensorflow/contrib/makefile/tf_op_files.txt @@ -91,6 +91,7 @@ tensorflow/core/kernels/reduction_ops_max.cc tensorflow/core/kernels/reduction_ops_common.cc tensorflow/core/kernels/reduction_ops_any.cc tensorflow/core/kernels/reduction_ops_all.cc +tensorflow/core/kernels/roll_op.cc tensorflow/core/kernels/queue_ops.cc tensorflow/core/kernels/queue_base.cc tensorflow/core/kernels/pooling_ops_common.cc @@ -270,6 +271,7 @@ tensorflow/core/ops/parsing_ops.cc tensorflow/core/ops/no_op.cc tensorflow/core/ops/nn_ops.cc tensorflow/core/ops/nn_grad.cc +tensorflow/core/ops/manip_ops.cc tensorflow/core/ops/math_ops.cc tensorflow/core/ops/math_grad.cc tensorflow/core/ops/logging_ops.cc @@ -291,3 +293,4 @@ tensorflow/core/kernels/batchtospace_op.cc tensorflow/core/kernels/warn_about_ints.cc tensorflow/core/kernels/segment_reduction_ops.cc tensorflow/core/kernels/batch_util.cc +tensorflow/core/ops/audio_ops.cc diff --git a/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc b/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc index 39c0d5af45b..974fb537499 100644 --- a/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc +++ b/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc @@ -80,9 +80,9 @@ REGISTER_KERNEL_BUILDER(Name("BytesLimit").Device(DEVICE_GPU).HostMemory("out"), BytesLimitOp); #ifdef TENSORFLOW_USE_SYCL -REGISTER_KERNEL_BUILDER(Name("BytesLimit").Device(DEVICE_SYCL).HostMemory("out"), - BytesLimitOp); -#endif // TENSORFLOW_USE_SYCL +REGISTER_KERNEL_BUILDER( + Name("BytesLimit").Device(DEVICE_SYCL).HostMemory("out"), BytesLimitOp); +#endif // TENSORFLOW_USE_SYCL // Op that measures the peak memory in bytes. 
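The BytesLimit and MaxBytesInUse kernel registrations being reflowed above are exposed through Python wrappers in the same contrib package; a hedged usage sketch, assuming tensorflow.contrib.memory_stats exports these ops and a GPU device is present:

import tensorflow as tf
from tensorflow.contrib.memory_stats import BytesLimit, MaxBytesInUse

with tf.device('/device:GPU:0'):
    limit = BytesLimit()      # allocator byte limit for this device
    peak = MaxBytesInUse()    # peak bytes allocated so far on this device

with tf.Session() as sess:
    limit_val, peak_val = sess.run([limit, peak])
    print('limit: %d bytes, peak: %d bytes' % (limit_val, peak_val))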
class MaxBytesInUseOp : public MemoryStatsOp { @@ -107,6 +107,6 @@ REGISTER_KERNEL_BUILDER( REGISTER_KERNEL_BUILDER( Name("MaxBytesInUse").Device(DEVICE_SYCL).HostMemory("out"), MaxBytesInUseOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py index 2932ae1c8df..ff88b4fa841 100644 --- a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py +++ b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py @@ -171,7 +171,14 @@ def _clean_save_and_restore(graph_def, op, removed_op_names): shape_op_value_tensor.tensor_shape.dim[0].size = len(shapes) op.attr['dtypes'].list.type[:] = dtypes + if not name_op.attr['_output_shapes'].list.shape: + name_op.attr['_output_shapes'].list.shape.add() + name_op.attr['_output_shapes'].list.shape[0].dim.add() name_op.attr['_output_shapes'].list.shape[0].dim[0].size = len(names) + + if not shape_op.attr['_output_shapes'].list.shape: + shape_op.attr['_output_shapes'].list.shape.add() + shape_op.attr['_output_shapes'].list.shape[0].dim.add() shape_op.attr['_output_shapes'].list.shape[0].dim[0].size = len(shapes) diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py index 55946c128b1..c2340d03776 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py @@ -739,7 +739,7 @@ def _streaming_confusion_matrix_at_thresholds(predictions, else: for include in includes: if include not in all_includes: - raise ValueError('Invaild key: %s.' % include) + raise ValueError('Invalid key: %s.' % include) predictions, labels, weights = metrics_impl._remove_squeezable_dimensions( # pylint: disable=protected-access predictions, labels, weights) diff --git a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py index d07fece4bc6..6a3b535eb44 100644 --- a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py +++ b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py @@ -58,6 +58,7 @@ def read_cifar10(filename_queue): class CIFAR10Record(object): pass + result = CIFAR10Record() # Dimensions of the images in the CIFAR-10 dataset. @@ -147,8 +148,9 @@ def distorted_inputs(data_dir, batch_size): images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size. labels: Labels. 1D tensor of [batch_size] size. """ - filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i) - for i in xrange(1, 6)] + filenames = [ + os.path.join(data_dir, 'data_batch_%d.bin' % i) for i in xrange(1, 6) + ] for f in filenames: if not tf.gfile.Exists(f): raise ValueError('Failed to find file: ' + f) @@ -174,10 +176,9 @@ def distorted_inputs(data_dir, batch_size): # Because these operations are not commutative, consider randomizing # the order their operation. - distorted_image = tf.image.random_brightness(distorted_image, - max_delta=63) - distorted_image = tf.image.random_contrast(distorted_image, - lower=0.2, upper=1.8) + distorted_image = tf.image.random_brightness(distorted_image, max_delta=63) + distorted_image = tf.image.random_contrast( + distorted_image, lower=0.2, upper=1.8) # Subtract off the mean and divide by the variance of the pixels. 
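The cifar10_input.py reflow above does not change the training-time distortion pipeline; condensed into one sketch for a single image (standard tf.image ops, mirroring the example's preprocessing):

import tensorflow as tf

def distort_for_training(image, height=24, width=24):
    # Random crop, horizontal flip, and photometric jitter, followed by
    # per-image standardization, as in the CIFAR-10 example.
    distorted = tf.random_crop(image, [height, width, 3])
    distorted = tf.image.random_flip_left_right(distorted)
    distorted = tf.image.random_brightness(distorted, max_delta=63)
    distorted = tf.image.random_contrast(distorted, lower=0.2, upper=1.8)
    return tf.image.per_image_standardization(distorted)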
float_image = tf.image.per_image_standardization(distorted_image) @@ -188,15 +189,18 @@ def distorted_inputs(data_dir, batch_size): # Ensure that the random shuffling has good mixing properties. min_fraction_of_examples_in_queue = 0.4 - min_queue_examples = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * - min_fraction_of_examples_in_queue) - print ('Filling queue with %d CIFAR images before starting to train. ' - 'This will take a few minutes.' % min_queue_examples) + min_queue_examples = int( + NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * min_fraction_of_examples_in_queue) + print('Filling queue with %d CIFAR images before starting to train. ' + 'This will take a few minutes.' % min_queue_examples) # Generate a batch of images and labels by building up a queue of examples. - return _generate_image_and_label_batch(float_image, read_input.label, - min_queue_examples, batch_size, - shuffle=True) + return _generate_image_and_label_batch( + float_image, + read_input.label, + min_queue_examples, + batch_size, + shuffle=True) def inputs(eval_data, data_dir, batch_size): @@ -212,8 +216,9 @@ def inputs(eval_data, data_dir, batch_size): labels: Labels. 1D tensor of [batch_size] size. """ if not eval_data: - filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i) - for i in xrange(1, 6)] + filenames = [ + os.path.join(data_dir, 'data_batch_%d.bin' % i) for i in xrange(1, 6) + ] num_examples_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN else: filenames = [os.path.join(data_dir, 'test_batch.bin')] @@ -235,8 +240,8 @@ def inputs(eval_data, data_dir, batch_size): # Image processing for evaluation. # Crop the central [height, width] of the image. - resized_image = tf.image.resize_image_with_crop_or_pad(reshaped_image, - width, height) + resized_image = tf.image.resize_image_with_crop_or_pad( + reshaped_image, width, height) # Subtract off the mean and divide by the variance of the pixels. float_image = tf.image.per_image_standardization(resized_image) @@ -247,10 +252,13 @@ def inputs(eval_data, data_dir, batch_size): # Ensure that the random shuffling has good mixing properties. min_fraction_of_examples_in_queue = 0.4 - min_queue_examples = int(num_examples_per_epoch * - min_fraction_of_examples_in_queue) + min_queue_examples = int( + num_examples_per_epoch * min_fraction_of_examples_in_queue) # Generate a batch of images and labels by building up a queue of examples. - return _generate_image_and_label_batch(float_image, read_input.label, - min_queue_examples, batch_size, - shuffle=False) + return _generate_image_and_label_batch( + float_image, + read_input.label, + min_queue_examples, + batch_size, + shuffle=False) diff --git a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py index 73dd56398c4..660f0168b10 100644 --- a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py +++ b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py @@ -48,7 +48,7 @@ from tensorflow.contrib.model_pruning.python import pruning # Global constants describing the CIFAR-10 data set. 
IMAGE_SIZE = cifar10_input.IMAGE_SIZE NUM_CLASSES = cifar10_input.NUM_CLASSES -NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN +NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN # pylint: disable=line-too-long NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL BATCH_SIZE = 128 DATA_DIR = '/tmp/cifar10_data' diff --git a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc index 8d14a3ef040..6a7f5efecdb 100644 --- a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc +++ b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc @@ -24,11 +24,11 @@ limitations under the License. #include #include -#include "tensorflow/core/distributed_runtime/tensor_coding.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/gpu/gpu_util.h" #include "tensorflow/core/distributed_runtime/session_mgr.h" +#include "tensorflow/core/distributed_runtime/tensor_coding.h" namespace tensorflow { @@ -62,7 +62,6 @@ BaseRemoteRendezvous* MPIRendezvousMgr::Create(int64 step_id, void MPIRemoteRendezvous::RecvFromRemoteAsync( const Rendezvous::ParsedKey& parsed, const Rendezvous::Args& recv_args, DoneCallback done) { - Status s = Status::OK(); MPIRequestTensorCall* rendezvous_call = new MPIRequestTensorCall(); @@ -103,37 +102,37 @@ void MPIRemoteRendezvous::RecvFromRemoteAsync( // Create the function which is called when the Tensor is send by remote const int64 temp1 = step_id_; rendezvous_call->recv_call_ = - [this, parsed, recv_args, done, dst, temp1, rendezvous_call]( - MPIRecvTensorResponse mpi_response) { - Status s; - Device* dst_device; - if (s.ok()) { - s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_device); - CHECK(s.ok()) << "Device lookup failed"; - } + [this, parsed, recv_args, done, dst, temp1, + rendezvous_call](MPIRecvTensorResponse mpi_response) { + Status s; + Device* dst_device; + if (s.ok()) { + s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_device); + CHECK(s.ok()) << "Device lookup failed"; + } - VLOG(3) << "MPI Received tensor " << parsed.FullKey() - << " @ step: " << temp1 - << " single-send: " << mpi_response.singlesend(); + VLOG(3) << "MPI Received tensor " << parsed.FullKey() + << " @ step: " << temp1 + << " single-send: " << mpi_response.singlesend(); - Tensor val; - if (mpi_response.singlesend()) { - dst_device->MakeTensorFromProto(mpi_response.response().tensor(), - recv_args.alloc_attrs, &val); - } else { - TensorResponse tr; - tr.InitAlloc(dst_device, recv_args.alloc_attrs); - tr.InitPartial(mpi_response.response()); - const size_t nBytes = tr.tensor().TotalBytes(); - void* data = const_cast(DMAHelper::base(&tr.tensor())); - MPI_Status status; - MPI_CHECK(MPI_Recv(data, static_cast(nBytes), MPI_BYTE, dst, - TAG_SENDTENSOR2, MPI_COMM_WORLD, &status)); - val = std::move(tr.tensor()); - } + Tensor val; + if (mpi_response.singlesend()) { + dst_device->MakeTensorFromProto(mpi_response.response().tensor(), + recv_args.alloc_attrs, &val); + } else { + TensorResponse tr; + tr.InitAlloc(dst_device, recv_args.alloc_attrs); + tr.InitPartial(mpi_response.response()); + const size_t nBytes = tr.tensor().TotalBytes(); + void* data = const_cast(DMAHelper::base(&tr.tensor())); + MPI_Status status; + MPI_CHECK(MPI_Recv(data, static_cast(nBytes), MPI_BYTE, dst, + TAG_SENDTENSOR2, MPI_COMM_WORLD, &status)); + val = std::move(tr.tensor()); + } - done(s, Args(), recv_args, val, 
mpi_response.response().is_dead()); - }; + done(s, Args(), recv_args, val, mpi_response.response().is_dead()); + }; MPIRendezvousMgr* mgr = reinterpret_cast(this->rendezvous_mgr_); @@ -152,16 +151,18 @@ MPIRemoteRendezvous::~MPIRemoteRendezvous() {} void MPIRendezvousMgr::AddRequest(RecvTensorRequest request, const int mpi_dst) { TF_CHECK_OK(recv_tensor_recent_request_ids_.TrackUnique( - req.request_id(), "RecvTensor (MPIRendezvousMgr)", req)); + request.request_id(), "RecvTensor (MPIRendezvousMgr)", request)); const int64 step_id = request.step_id(); const std::string& key = request.rendezvous_key(); Rendezvous::ParsedKey parsed; TF_CHECK_OK(Rendezvous::ParseKey(key, &parsed)); MPIRecvTensorCallBack send_cb = [this, mpi_dst, parsed]( - const Status& status, const Rendezvous::Args& send_args, - const Rendezvous::Args& recv_args, const Tensor& val, bool is_dead, - MPISendTensorCall* mpi_send_call) { + const Status& status, + const Rendezvous::Args& send_args, + const Rendezvous::Args& recv_args, + const Tensor& val, bool is_dead, + MPISendTensorCall* mpi_send_call) { // TODO(jbedorf) this should be a loop over max size CHECK(mpi_send_call->mRes_.ByteSize() < INT_MAX) << "Buffer too large for single transfer"; @@ -194,74 +195,78 @@ void MPIRendezvousMgr::AddRequest(RecvTensorRequest request, }; // Wrapper around the read callback to place the callback on our queue - Rendezvous::DoneCallback done_cb = [this, parsed, step_id, send_cb]( - const Status& status, const Rendezvous::Args& send_args, - const Rendezvous::Args& recv_args, const Tensor& val, bool is_dead) { - if (!status.ok()) { - CHECK(status.ok()) << "RecvLocalAsync was not ok, key: " - << parsed.FullKey() << " step: " << step_id - << " error message: " << status.error_message(); - return; - } + Rendezvous::DoneCallback done_cb = + [this, parsed, step_id, send_cb]( + const Status& status, const Rendezvous::Args& send_args, + const Rendezvous::Args& recv_args, const Tensor& val, bool is_dead) { + if (!status.ok()) { + CHECK(status.ok()) + << "RecvLocalAsync was not ok, key: " << parsed.FullKey() + << " step: " << step_id + << " error message: " << status.error_message(); + return; + } - VLOG(3) << "MPI Sending tensor " << parsed.FullKey() - << " @ step: " << step_id << std::endl; + VLOG(3) << "MPI Sending tensor " << parsed.FullKey() + << " @ step: " << step_id << std::endl; - auto mpi_send_call = new MPISendTensorCall(); - mpi_send_call->Init(parsed, step_id, is_dead); + auto mpi_send_call = new MPISendTensorCall(); + mpi_send_call->Init(parsed, step_id, is_dead); - Device* src_dev = nullptr; - Status s = this->worker_env_2->device_mgr->LookupDevice(parsed.src_device, - &src_dev); - CHECK(s.ok()) << "src device not found"; + Device* src_dev = nullptr; + Status s = this->worker_env_2->device_mgr->LookupDevice( + parsed.src_device, &src_dev); + CHECK(s.ok()) << "src device not found"; - // Control if shape and data should be send together or if we can optimize - // it in two different transfers, thereby reducing memory copies - bool doOptimalTransfer = true; - if (!DataTypeCanUseMemcpy(val.dtype())) doOptimalTransfer = false; - if (val.TotalBytes() < 1024) doOptimalTransfer = false; + // Control if shape and data should be send together or if we can + // optimize it in two different transfers, thereby reducing memory + // copies + bool doOptimalTransfer = true; + if (!DataTypeCanUseMemcpy(val.dtype())) doOptimalTransfer = false; + if (val.TotalBytes() < 1024) doOptimalTransfer = false; - doOptimalTransfer = doOptimalTransfer && 
use_optimal_transfer_; + doOptimalTransfer = doOptimalTransfer && use_optimal_transfer_; - if (doOptimalTransfer) { - // First send the Tensor description and in a follow up transfer the data - mpi_send_call->mRes_.mutable_response()->mutable_tensor()->set_dtype( - val.dtype()); - val.shape().AsProto(mpi_send_call->mRes_.mutable_response() - ->mutable_tensor() - ->mutable_tensor_shape()); - mpi_send_call->mRes_.set_singlesend(false); - } else { - // Send the Tensor description and data in a single transfer - if (src_dev->tensorflow_gpu_device_info() && - (!send_args.alloc_attrs.on_host())) { - Notification n; - GPUUtil::SetProtoFromGPU( - val, src_dev, send_args.device_context, - mpi_send_call->mRes_.mutable_response()->mutable_tensor(), is_dead, - [&n, &s](const Status& s_) { - s = s_; - n.Notify(); - }); - n.WaitForNotification(); - } else { - val.AsProtoTensorContent( - mpi_send_call->mRes_.mutable_response()->mutable_tensor()); - } - } + if (doOptimalTransfer) { + // First send the Tensor description and in a follow up transfer the + // data + mpi_send_call->mRes_.mutable_response()->mutable_tensor()->set_dtype( + val.dtype()); + val.shape().AsProto(mpi_send_call->mRes_.mutable_response() + ->mutable_tensor() + ->mutable_tensor_shape()); + mpi_send_call->mRes_.set_singlesend(false); + } else { + // Send the Tensor description and data in a single transfer + if (src_dev->tensorflow_gpu_device_info() && + (!send_args.alloc_attrs.on_host())) { + Notification n; + GPUUtil::SetProtoFromGPU( + val, src_dev, send_args.device_context, + mpi_send_call->mRes_.mutable_response()->mutable_tensor(), + is_dead, [&n, &s](const Status& s_) { + s = s_; + n.Notify(); + }); + n.WaitForNotification(); + } else { + val.AsProtoTensorContent( + mpi_send_call->mRes_.mutable_response()->mutable_tensor()); + } + } - std::function res = std::bind( - send_cb, status, send_args, recv_args, val, is_dead, mpi_send_call); + std::function res = std::bind( + send_cb, status, send_args, recv_args, val, is_dead, mpi_send_call); - SendQueueEntry req(parsed.FullKey().ToString().c_str(), std::move(res)); + SendQueueEntry req(parsed.FullKey().ToString().c_str(), std::move(res)); - this->QueueSendRequest(req); + this->QueueSendRequest(req); - // Wait for the notification that indicates the tensor has been - // successfully transmitted to the remote process. Only needed if we - // have not parsed the tensor to proto - if (doOptimalTransfer) mpi_send_call->n_.WaitForNotification(); - }; // done_cb + // Wait for the notification that indicates the tensor has been + // successfully transmitted to the remote process. Only needed if we + // have not parsed the tensor to proto + if (doOptimalTransfer) mpi_send_call->n_.WaitForNotification(); + }; // done_cb worker_env_2->compute_pool->Schedule([this, step_id, parsed, done_cb]() { this->RecvLocalAsync(step_id, parsed, done_cb); @@ -293,9 +298,8 @@ void MPIRendezvousMgr::MPIBackgroundThread() { } // Remove sends that have been completed - active_sends.remove_if([](std::unique_ptr& i) { - return i->IsFinished(); - }); + active_sends.remove_if( + [](std::unique_ptr& i) { return i->IsFinished(); }); // send a Tensor request RequestQueueEntry req; diff --git a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h index ca42ee2f6d2..5596601ddb9 100644 --- a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h +++ b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h @@ -18,12 +18,12 @@ limitations under the License. 
#ifdef TENSORFLOW_USE_MPI -#include -#include #include -#include -#include #include +#include +#include +#include +#include #include #include #include @@ -33,6 +33,7 @@ limitations under the License. #include "tensorflow/contrib/mpi/mpi_msg.pb.h" #include "tensorflow/contrib/mpi/mpi_utils.h" #include "tensorflow/core/distributed_runtime/base_rendezvous_mgr.h" +#include "tensorflow/core/distributed_runtime/recent_request_ids.h" #include "tensorflow/core/distributed_runtime/request_id.h" #include "tensorflow/core/distributed_runtime/worker_env.h" #include "tensorflow/core/protobuf/worker.pb.h" @@ -160,7 +161,8 @@ class MPIRendezvousMgr : public BaseRendezvousMgr { private: typedef std::function MPIRecvTensorCallBack; + const Tensor&, const bool, MPISendTensorCall*)> + MPIRecvTensorCallBack; typedef std::pair> RequestQueueEntry; typedef std::pair> diff --git a/tensorflow/contrib/mpi/mpi_server_lib.cc b/tensorflow/contrib/mpi/mpi_server_lib.cc index d585c0565eb..a31fa9ce0b3 100644 --- a/tensorflow/contrib/mpi/mpi_server_lib.cc +++ b/tensorflow/contrib/mpi/mpi_server_lib.cc @@ -22,8 +22,8 @@ limitations under the License. #include "grpc/support/alloc.h" -#include "tensorflow/core/distributed_runtime/server_lib.h" #include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h" +#include "tensorflow/core/distributed_runtime/server_lib.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/env.h" diff --git a/tensorflow/contrib/mpi/mpi_utils.h b/tensorflow/contrib/mpi/mpi_utils.h index 45e21f2b25a..fa297c28cb4 100644 --- a/tensorflow/contrib/mpi/mpi_utils.h +++ b/tensorflow/contrib/mpi/mpi_utils.h @@ -18,8 +18,8 @@ limitations under the License. #ifdef TENSORFLOW_USE_MPI -#include #include +#include #include #include "tensorflow/core/lib/strings/str_util.h" diff --git a/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc b/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc index 2d5b98022c3..8dca90a1e34 100644 --- a/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc +++ b/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc @@ -35,8 +35,8 @@ limitations under the License. #define OMPI_SKIP_MPICXX #include "third_party/mpi/mpi.h" -#include "tensorflow/contrib/mpi_collectives/mpi_message.pb.h" #include "tensorflow/contrib/mpi_collectives/kernels/ring.h" +#include "tensorflow/contrib/mpi_collectives/mpi_message.pb.h" /* * MPI Allreduce and Allgather Ops for TensorFlow. diff --git a/tensorflow/contrib/mpi_collectives/python/ops/mpi_ops.py b/tensorflow/contrib/mpi_collectives/python/ops/mpi_ops.py index f0a116239d6..2fbefef0d36 100644 --- a/tensorflow/contrib/mpi_collectives/python/ops/mpi_ops.py +++ b/tensorflow/contrib/mpi_collectives/python/ops/mpi_ops.py @@ -26,7 +26,8 @@ from tensorflow.python.framework import ops from tensorflow.python.platform import resource_loader _mpi_ops_so = loader.load_op_library( - resource_loader.get_path_to_datafile("_mpi_ops.so")) + resource_loader.get_path_to_datafile('_mpi_ops.so')) + def size(name=None): """An op which returns the number of MPI processes. 
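A heavily hedged sketch of how the size/allgather wrappers in mpi_ops.py above are intended to be used; the import path and the process-launch details (mpirun, MPI initialization) are assumptions, not shown by this diff:

import tensorflow as tf
from tensorflow.contrib import mpi_collectives as mpi  # assumed export path

# Each process contributes a [2, 3] tensor; allgather concatenates the
# per-process tensors along dimension 0.
local = tf.ones([2, 3])
gathered = mpi.allgather(local)

with tf.Session() as sess:
    print(sess.run(mpi.size()))   # number of MPI processes
    print(sess.run(gathered))     # shape [2 * num_processes, 3]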
@@ -120,15 +121,14 @@ def allgather(tensor, name=None): """ # Specify that first allgather is to collect the tensor gather sizes, # indicated by passing in a scalar (0-D tensor) of value 0 - sizes_flag = tf.constant(0, dtype=tf.int64, name="size_flag_const") - my_size = tf.slice(tf.shape(tensor, out_type=tf.int64), [0], [1], name="size_slice") + sizes_flag = tf.constant(0, dtype=tf.int64, name='size_flag_const') + my_size = tf.slice( + tf.shape(tensor, out_type=tf.int64), [0], [1], name='size_slice') if name is None: - name = "allgather" - sizing_name = "{}_sizing".format(name) + name = 'allgather' + sizing_name = '{}_sizing'.format(name) sizes = gen_mpi_ops.mpi_allgather(my_size, sizes_flag, name=sizing_name) return gen_mpi_ops.mpi_allgather(tensor, sizes, name=name) ops.NotDifferentiable('MPIAllgather') - - diff --git a/tensorflow/contrib/ndlstm/__init__.py b/tensorflow/contrib/ndlstm/__init__.py index 52e83069cb0..da89bb4ab60 100644 --- a/tensorflow/contrib/ndlstm/__init__.py +++ b/tensorflow/contrib/ndlstm/__init__.py @@ -16,3 +16,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function + +from tensorflow.contrib.ndlstm.python import lstm2d +from tensorflow.contrib.ndlstm.python import lstm1d diff --git a/tensorflow/contrib/ndlstm/python/lstm1d.py b/tensorflow/contrib/ndlstm/python/lstm1d.py index b24e332e4ae..2e2e9086c00 100644 --- a/tensorflow/contrib/ndlstm/python/lstm1d.py +++ b/tensorflow/contrib/ndlstm/python/lstm1d.py @@ -88,7 +88,7 @@ def ndlstm_base_dynamic(inputs, noutput, scope=None, reverse=False): if reverse: inputs = array_ops.reverse_v2(inputs, [0]) outputs, _ = rnn.dynamic_rnn( - lstm_cell, inputs, time_major=True, dtype=inputs.dtype) + lstm_cell, inputs, time_major=True, dtype=inputs.dtype) if reverse: outputs = array_ops.reverse_v2(outputs, [0]) return outputs diff --git a/tensorflow/contrib/nearest_neighbor/kernels/heap.h b/tensorflow/contrib/nearest_neighbor/kernels/heap.h index 32925569a82..a2dbb8052bf 100644 --- a/tensorflow/contrib/nearest_neighbor/kernels/heap.h +++ b/tensorflow/contrib/nearest_neighbor/kernels/heap.h @@ -56,7 +56,7 @@ class HeapBase { // This method adds an element at the end of the internal array without // "heapifying" the array afterwards. This is useful for setting up a heap - // where a single call to heapify at the end of the inital insertion + // where a single call to heapify at the end of the initial insertion // operations suffices. void InsertUnsorted(const KeyType& key, const DataType& data) { if (v_.size() == static_cast(num_elements_)) { diff --git a/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc b/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc index 2b412fac9a6..13db6f62f52 100644 --- a/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc +++ b/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc @@ -75,7 +75,8 @@ class HyperplaneLSHProbesOp : public OpKernel { num_hyperplanes_per_table, ".")); OP_REQUIRES(context, num_hyperplanes_per_table <= 30, InvalidArgument("Need num_hyperplanes_per_table <= 30, got ", - num_hyperplanes_per_table, ". " + num_hyperplanes_per_table, + ". 
" "If you need more hyperplanes, change this Op" " to work for larger integer types (int64).")); @@ -88,12 +89,13 @@ class HyperplaneLSHProbesOp : public OpKernel { InvalidArgument("num_probes must be at least 1.")); int expected_num_hyperplanes = num_tables * num_hyperplanes_per_table; - OP_REQUIRES( - context, products_tensor.dim_size(1) == expected_num_hyperplanes, - InvalidArgument("Expected number of hyperplanes is ", - expected_num_hyperplanes, " but received ", - products_tensor.dim_size(1), " inner products per " - "point.")); + OP_REQUIRES(context, + products_tensor.dim_size(1) == expected_num_hyperplanes, + InvalidArgument("Expected number of hyperplanes is ", + expected_num_hyperplanes, " but received ", + products_tensor.dim_size(1), + " inner products per " + "point.")); auto products_eigen_tensor = products_tensor.matrix(); ConstMatrixMap products_matrix(products_eigen_tensor.data(), @@ -116,13 +118,11 @@ class HyperplaneLSHProbesOp : public OpKernel { // lschmidt's workstation. int64 cost_per_unit = 21 * num_hyperplanes_per_table * num_tables; if (num_probes > num_tables) { - cost_per_unit += 110 * num_hyperplanes_per_table - * (num_probes - num_tables); + cost_per_unit += + 110 * num_hyperplanes_per_table * (num_probes - num_tables); } context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor( - batch_size, - cost_per_unit, - [&](int64 start, int64 end) { + batch_size, cost_per_unit, [&](int64 start, int64 end) { HyperplaneMultiprobe multiprobe( num_hyperplanes_per_table, num_tables); diff --git a/tensorflow/contrib/opt/python/training/external_optimizer.py b/tensorflow/contrib/opt/python/training/external_optimizer.py index f243317f1df..82ebca7f203 100644 --- a/tensorflow/contrib/opt/python/training/external_optimizer.py +++ b/tensorflow/contrib/opt/python/training/external_optimizer.py @@ -397,10 +397,6 @@ class ScipyOptimizerInterface(ExternalOptimizerInterface): 'automatically and cannot be injected manually'.format(kwarg)) minimize_kwargs.update(optimizer_kwargs) - if method == 'SLSQP': - # SLSQP doesn't support step callbacks. Obviate associated warning - # message. - del minimize_kwargs['callback'] import scipy.optimize # pylint: disable=g-import-not-at-top result = scipy.optimize.minimize(*minimize_args, **minimize_kwargs) diff --git a/tensorflow/contrib/opt/python/training/external_optimizer_test.py b/tensorflow/contrib/opt/python/training/external_optimizer_test.py index 0f597d0a246..953586ee70c 100644 --- a/tensorflow/contrib/opt/python/training/external_optimizer_test.py +++ b/tensorflow/contrib/opt/python/training/external_optimizer_test.py @@ -299,6 +299,45 @@ class ScipyOptimizerInterfaceTest(TestCase): method = optimizer.optimizer_kwargs.get('method') self.assertEqual('SLSQP', method) + def test_callbacks(self): + vector_val = np.array([7., -2.], dtype=np.float32) + vector = variables.Variable(vector_val, 'vector') + + minimum_location_val = np.arange(2) + minimum_location = constant_op.constant( + minimum_location_val, dtype=dtypes.float32) + + loss = math_ops.reduce_sum(math_ops.square(vector - minimum_location)) / 2. + loss_val_first = ((vector_val - minimum_location_val)**2).sum() / 2. 
+ + optimizer = external_optimizer.ScipyOptimizerInterface(loss, method='SLSQP') + + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + + initial_vector_val = sess.run(vector) + + extra_fetches = [loss] + + step_callback = test.mock.Mock() + loss_callback = test.mock.Mock() + + optimizer.minimize( + sess, + fetches=extra_fetches, + loss_callback=loss_callback, + step_callback=step_callback) + + loss_val_last = sess.run(loss) + + call_first = test.mock.call(loss_val_first) + call_last = test.mock.call(loss_val_last) + loss_calls = [call_first, call_last] + loss_callback.assert_has_calls(loss_calls, any_order=True) + + args, _ = step_callback.call_args + self.assertAllClose(minimum_location_val, args[0]) + if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc index 9cee405cef2..e18923c8aae 100644 --- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc +++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc @@ -14,13 +14,12 @@ // limitations under the License. // ============================================================================= -#include "tensorflow/core/framework/register_types.h" #include "tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h" +#include "tensorflow/core/framework/register_types.h" namespace tensorflow { -REGISTER_KERNEL_BUILDER(Name("PeriodicResample") - .Device(DEVICE_CPU), +REGISTER_KERNEL_BUILDER(Name("PeriodicResample").Device(DEVICE_CPU), PeriodicResampleOp); } // namespace tensorflow diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h index ba410f025d4..3ab588c4588 100644 --- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h +++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h @@ -118,9 +118,9 @@ template #include -#include #include #include #include #include #include #include -#include -#include #include +#include +#include +#include #include #include "tensorflow/core/framework/graph.pb.h" @@ -46,10 +46,10 @@ limitations under the License. // These are all common classes it's handy to reference with no namespace. using tensorflow::Flag; -using tensorflow::Tensor; +using tensorflow::int32; using tensorflow::Status; using tensorflow::string; -using tensorflow::int32; +using tensorflow::Tensor; // Used to store the memory-mapped buffers we use for capture. struct CameraBuffer { diff --git a/tensorflow/contrib/pi_examples/label_image/label_image.cc b/tensorflow/contrib/pi_examples/label_image/label_image.cc index 0b18045789f..c6935a093f7 100644 --- a/tensorflow/contrib/pi_examples/label_image/label_image.cc +++ b/tensorflow/contrib/pi_examples/label_image/label_image.cc @@ -23,9 +23,9 @@ limitations under the License. // // Full build instructions are at tensorflow/contrib/pi_examples/README.md. -#include #include #include +#include #include #include @@ -46,10 +46,10 @@ limitations under the License. // These are all common classes it's handy to reference with no namespace. using tensorflow::Flag; -using tensorflow::Tensor; +using tensorflow::int32; using tensorflow::Status; using tensorflow::string; -using tensorflow::int32; +using tensorflow::Tensor; // Takes a file name, and loads a list of labels from it, one per line, and // returns a vector of the strings. 
It pads with empty strings so the length @@ -77,23 +77,22 @@ Status ReadLabelsFile(string file_name, std::vector* result, // Error handling for JPEG decoding. void CatchError(j_common_ptr cinfo) { (*cinfo->err->output_message)(cinfo); - jmp_buf *jpeg_jmpbuf = reinterpret_cast(cinfo->client_data); + jmp_buf* jpeg_jmpbuf = reinterpret_cast(cinfo->client_data); jpeg_destroy(cinfo); longjmp(*jpeg_jmpbuf, 1); } // Decompresses a JPEG file from disk. Status LoadJpegFile(string file_name, std::vector* data, - int* width, int* height, int* channels) { + int* width, int* height, int* channels) { struct jpeg_decompress_struct cinfo; - FILE * infile; + FILE* infile; JSAMPARRAY buffer; int row_stride; if ((infile = fopen(file_name.c_str(), "rb")) == NULL) { LOG(ERROR) << "Can't open " << file_name; - return tensorflow::errors::NotFound("JPEG file ", file_name, - " not found"); + return tensorflow::errors::NotFound("JPEG file ", file_name, " not found"); } struct jpeg_error_mgr jerr; @@ -116,10 +115,11 @@ Status LoadJpegFile(string file_name, std::vector* data, data->resize((*height) * (*width) * (*channels)); row_stride = cinfo.output_width * cinfo.output_components; - buffer = (*cinfo.mem->alloc_sarray) - ((j_common_ptr) &cinfo, JPOOL_IMAGE, row_stride, 1); + buffer = (*cinfo.mem->alloc_sarray)((j_common_ptr)&cinfo, JPOOL_IMAGE, + row_stride, 1); while (cinfo.output_scanline < cinfo.output_height) { - tensorflow::uint8* row_address = &((*data)[cinfo.output_scanline * row_stride]); + tensorflow::uint8* row_address = + &((*data)[cinfo.output_scanline * row_stride]); jpeg_read_scanlines(&cinfo, buffer, 1); memcpy(row_address, buffer[0], row_stride); } @@ -141,24 +141,25 @@ Status ReadTensorFromImageFile(string file_name, const int wanted_height, int image_height; int image_channels; TF_RETURN_IF_ERROR(LoadJpegFile(file_name, &image_data, &image_width, - &image_height, &image_channels)); - LOG(INFO) << "Loaded JPEG: " << image_width << "x" << image_height - << "x" << image_channels; + &image_height, &image_channels)); + LOG(INFO) << "Loaded JPEG: " << image_width << "x" << image_height << "x" + << image_channels; const int wanted_channels = 3; if (image_channels < wanted_channels) { - return tensorflow::errors::FailedPrecondition("Image needs to have at least ", - wanted_channels, " but only has ", - image_channels); + return tensorflow::errors::FailedPrecondition( + "Image needs to have at least ", wanted_channels, " but only has ", + image_channels); } - // In these loops, we convert the eight-bit data in the image into float, resize - // it using bilinear filtering, and scale it numerically to the float range that - // the model expects (given by input_mean and input_std). + // In these loops, we convert the eight-bit data in the image into float, + // resize it using bilinear filtering, and scale it numerically to the float + // range that the model expects (given by input_mean and input_std). 
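The loop being reformatted here scales each neighbouring pixel by (value - input_mean) / input_std and then interpolates bilinearly; an equivalent NumPy sketch of the per-pixel arithmetic (the function name and signature are illustrative):

import numpy as np

def resize_and_normalize(image, wanted_height, wanted_width, input_mean, input_std):
    # image: uint8 array of shape [height, width, channels].
    height, width, channels = image.shape
    out = np.zeros((wanted_height, wanted_width, channels), np.float32)
    height_scale = height / float(wanted_height)
    width_scale = width / float(wanted_width)
    for y in range(wanted_height):
        in_y = y * height_scale
        top, bottom = int(np.floor(in_y)), min(int(np.ceil(in_y)), height - 1)
        y_lerp = in_y - top
        for x in range(wanted_width):
            in_x = x * width_scale
            left, right = int(np.floor(in_x)), min(int(np.ceil(in_x)), width - 1)
            x_lerp = in_x - left
            # Normalize the four neighbours, then lerp horizontally and vertically.
            tl = (image[top, left] - input_mean) / input_std
            tr = (image[top, right] - input_mean) / input_std
            bl = (image[bottom, left] - input_mean) / input_std
            br = (image[bottom, right] - input_mean) / input_std
            top_row = tl + (tr - tl) * x_lerp
            bottom_row = bl + (br - bl) * x_lerp
            out[y, x] = top_row + (bottom_row - top_row) * y_lerp
    return out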
tensorflow::Tensor image_tensor( - tensorflow::DT_FLOAT, tensorflow::TensorShape( - {1, wanted_height, wanted_width, wanted_channels})); + tensorflow::DT_FLOAT, + tensorflow::TensorShape( + {1, wanted_height, wanted_width, wanted_channels})); auto image_tensor_mapped = image_tensor.tensor(); tensorflow::uint8* in = image_data.data(); - float *out = image_tensor_mapped.data(); + float* out = image_tensor_mapped.data(); const size_t image_rowlen = image_width * image_channels; const float width_scale = static_cast(image_width) / wanted_width; const float height_scale = static_cast(image_height) / wanted_height; @@ -166,35 +167,37 @@ Status ReadTensorFromImageFile(string file_name, const int wanted_height, const float in_y = y * height_scale; const int top_y_index = static_cast(floorf(in_y)); const int bottom_y_index = - std::min(static_cast(ceilf(in_y)), (image_height - 1)); + std::min(static_cast(ceilf(in_y)), (image_height - 1)); const float y_lerp = in_y - top_y_index; tensorflow::uint8* in_top_row = in + (top_y_index * image_rowlen); tensorflow::uint8* in_bottom_row = in + (bottom_y_index * image_rowlen); - float *out_row = out + (y * wanted_width * wanted_channels); + float* out_row = out + (y * wanted_width * wanted_channels); for (int x = 0; x < wanted_width; ++x) { const float in_x = x * width_scale; const int left_x_index = static_cast(floorf(in_x)); const int right_x_index = - std::min(static_cast(ceilf(in_x)), (image_width - 1)); + std::min(static_cast(ceilf(in_x)), (image_width - 1)); tensorflow::uint8* in_top_left_pixel = - in_top_row + (left_x_index * wanted_channels); + in_top_row + (left_x_index * wanted_channels); tensorflow::uint8* in_top_right_pixel = - in_top_row + (right_x_index * wanted_channels); + in_top_row + (right_x_index * wanted_channels); tensorflow::uint8* in_bottom_left_pixel = - in_bottom_row + (left_x_index * wanted_channels); + in_bottom_row + (left_x_index * wanted_channels); tensorflow::uint8* in_bottom_right_pixel = - in_bottom_row + (right_x_index * wanted_channels); + in_bottom_row + (right_x_index * wanted_channels); const float x_lerp = in_x - left_x_index; - float *out_pixel = out_row + (x * wanted_channels); + float* out_pixel = out_row + (x * wanted_channels); for (int c = 0; c < wanted_channels; ++c) { - const float top_left((in_top_left_pixel[c] - input_mean) / input_std); - const float top_right((in_top_right_pixel[c] - input_mean) / input_std); - const float bottom_left((in_bottom_left_pixel[c] - input_mean) / input_std); - const float bottom_right((in_bottom_right_pixel[c] - input_mean) / input_std); - const float top = top_left + (top_right - top_left) * x_lerp; - const float bottom = - bottom_left + (bottom_right - bottom_left) * x_lerp; - out_pixel[c] = top + (bottom - top) * y_lerp; + const float top_left((in_top_left_pixel[c] - input_mean) / input_std); + const float top_right((in_top_right_pixel[c] - input_mean) / input_std); + const float bottom_left((in_bottom_left_pixel[c] - input_mean) / + input_std); + const float bottom_right((in_bottom_right_pixel[c] - input_mean) / + input_std); + const float top = top_left + (top_right - top_left) * x_lerp; + const float bottom = + bottom_left + (bottom_right - bottom_left) * x_lerp; + out_pixel[c] = top + (bottom - top) * y_lerp; } } } @@ -233,10 +236,10 @@ Status GetTopLabels(const std::vector& outputs, int how_many_labels, scores.push_back(std::pair({i, unsorted_scores_flat(i)})); } std::sort(scores.begin(), scores.end(), - [](const std::pair &left, - const std::pair &right) { - return 
left.second > right.second; - }); + [](const std::pair& left, + const std::pair& right) { + return left.second > right.second; + }); scores.resize(how_many_labels); Tensor sorted_indices(tensorflow::DT_INT32, {scores.size()}); Tensor sorted_scores(tensorflow::DT_FLOAT, {scores.size()}); diff --git a/tensorflow/contrib/py2tf/BUILD b/tensorflow/contrib/py2tf/BUILD index d395de986d2..479ea9becae 100644 --- a/tensorflow/contrib/py2tf/BUILD +++ b/tensorflow/contrib/py2tf/BUILD @@ -18,66 +18,13 @@ py_library( name = "py2tf", srcs = [ "__init__.py", - "api.py", - "config.py", - "conversion.py", - "naming.py", ], srcs_version = "PY2AND3", visibility = ["//visibility:public"], deps = [ - "//tensorflow/contrib/py2tf/converters", - "//tensorflow/contrib/py2tf/pyct", - "//tensorflow/contrib/py2tf/pyct/static_analysis", + "//tensorflow/contrib/py2tf/impl", + "//tensorflow/contrib/py2tf/utils", "@gast_archive//:gast", "@six_archive//:six", ], ) - -# Separate target that allows access to internal symbols for testing. -py_library( - name = "py2tf_internal", - srcs = [ - "api.py", - "config.py", - "conversion.py", - "naming.py", - ], - srcs_version = "PY2AND3", - visibility = ["//tensorflow:__subpackages__"], - deps = [ - "//tensorflow/contrib/py2tf/converters", - "//tensorflow/contrib/py2tf/pyct", - "//tensorflow/contrib/py2tf/pyct/static_analysis", - "@gast_archive//:gast", - "@six_archive//:six", - ], -) - -py_test( - name = "api_test", - srcs = ["api_test.py"], - deps = [ - ":py2tf_internal", - "//tensorflow/python:client_testlib", - ], -) - -py_test( - name = "conversion_test", - srcs = ["conversion_test.py"], - deps = [ - ":py2tf_internal", - "//tensorflow/python:client_testlib", - "@gast_archive//:gast", - ], -) - -py_test( - name = "naming_test", - srcs = ["naming_test.py"], - deps = [ - ":py2tf_internal", - "//tensorflow/python:client_testlib", - ], -) diff --git a/tensorflow/contrib/py2tf/__init__.py b/tensorflow/contrib/py2tf/__init__.py index d187da99e06..0d51bf0bf24 100644 --- a/tensorflow/contrib/py2tf/__init__.py +++ b/tensorflow/contrib/py2tf/__init__.py @@ -21,11 +21,13 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.py2tf.api import to_code -from tensorflow.contrib.py2tf.api import to_graph +from tensorflow.contrib.py2tf import utils +from tensorflow.contrib.py2tf.impl.api import convert +from tensorflow.contrib.py2tf.impl.api import graph_ready +from tensorflow.contrib.py2tf.impl.api import to_code +from tensorflow.contrib.py2tf.impl.api import to_graph from tensorflow.python.util.all_util import remove_undocumented - -_allowed_symbols = ['to_graph', 'to_code'] +_allowed_symbols = ['to_graph', 'to_code', 'convert', 'graph_ready', 'utils'] remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/py2tf/converters/BUILD b/tensorflow/contrib/py2tf/converters/BUILD index 2b0a1234e69..03ded49022c 100644 --- a/tensorflow/contrib/py2tf/converters/BUILD +++ b/tensorflow/contrib/py2tf/converters/BUILD @@ -17,6 +17,7 @@ filegroup( py_library( name = "converters", srcs = [ + "asserts.py", "break_canonicalization.py", "builtin_functions.py", "call_trees.py", @@ -45,13 +46,26 @@ py_library( deps = [ ":converters", "//tensorflow/contrib/py2tf/pyct/static_analysis", + "//tensorflow/contrib/py2tf/utils", "@gast_archive//:gast", ], ) +py_test( + name = "asserts_test", + srcs = ["asserts_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":test_lib", + "//tensorflow/contrib/py2tf/pyct", + 
"//tensorflow/python:client_testlib", + ], +) + py_test( name = "break_canonicalization_test", srcs = ["break_canonicalization_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -62,6 +76,18 @@ py_test( py_test( name = "call_trees_test", srcs = ["call_trees_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":test_lib", + "//tensorflow/contrib/py2tf/pyct", + "//tensorflow/python:client_testlib", + ], +) + +py_test( + name = "decorators_test", + srcs = ["decorators_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -72,6 +98,7 @@ py_test( py_test( name = "continue_canonicalization_test", srcs = ["continue_canonicalization_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -82,6 +109,7 @@ py_test( py_test( name = "control_flow_test", srcs = ["control_flow_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -92,6 +120,7 @@ py_test( py_test( name = "builtin_functions_test", srcs = ["builtin_functions_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -102,6 +131,8 @@ py_test( py_test( name = "for_canonicalization_test", srcs = ["for_canonicalization_test.py"], + srcs_version = "PY2AND3", + tags = ["nomac"], deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -112,6 +143,7 @@ py_test( py_test( name = "logical_expressions_test", srcs = ["logical_expressions_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -122,6 +154,7 @@ py_test( py_test( name = "print_functions_test", srcs = ["print_functions_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -133,6 +166,7 @@ py_test( py_test( name = "side_effect_guards_test", srcs = ["side_effect_guards_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", diff --git a/tensorflow/contrib/py2tf/converters/asserts.py b/tensorflow/contrib/py2tf/converters/asserts.py new file mode 100644 index 00000000000..2d6ee1d0982 --- /dev/null +++ b/tensorflow/contrib/py2tf/converters/asserts.py @@ -0,0 +1,53 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Converts Assert statements to their corresponding TF calls.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import gast + +from tensorflow.contrib.py2tf.pyct import templates +from tensorflow.contrib.py2tf.pyct import transformer + + +class AssertsTransformer(transformer.Base): + """Transforms Print nodes to Call so they can be handled as functions.""" + + # pylint:disable=invalid-name + + def visit_Assert(self, node): + self.generic_visit(node) + + # Note: The lone tf.Assert call will be wrapped with control_dependencies + # by side_effect_guards. + template = """ + tf.Assert(test, [tf.constant(msg)]) + """ + + if node.msg is None: + return templates.replace( + template, test=node.test, msg=gast.Str('Assertion error')) + elif isinstance(node.msg, gast.Str): + return templates.replace(template, test=node.test, msg=node.msg) + else: + raise NotImplementedError('Can only convert string messages for now.') + + # pylint:enable=invalid-name + + +def transform(node, context): + return AssertsTransformer(context).visit(node) diff --git a/tensorflow/contrib/py2tf/converters/asserts_test.py b/tensorflow/contrib/py2tf/converters/asserts_test.py new file mode 100644 index 00000000000..6611f2777a9 --- /dev/null +++ b/tensorflow/contrib/py2tf/converters/asserts_test.py @@ -0,0 +1,42 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for asserts module.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import gast + +from tensorflow.contrib.py2tf.converters import asserts +from tensorflow.contrib.py2tf.converters import converter_test_base +from tensorflow.python.platform import test + + +class AssertsTest(converter_test_base.TestCase): + + def test_transform(self): + + def test_fn(a): + assert a > 0 + + node = self.parse_and_analyze(test_fn, {}) + node = asserts.transform(node, self.ctx) + + self.assertTrue(isinstance(node.body[0].body[0].value, gast.Call)) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/py2tf/converters/call_trees.py b/tensorflow/contrib/py2tf/converters/call_trees.py index 0aae030450a..4c238b7fb9e 100644 --- a/tensorflow/contrib/py2tf/converters/call_trees.py +++ b/tensorflow/contrib/py2tf/converters/call_trees.py @@ -29,46 +29,46 @@ import gast from tensorflow.contrib.py2tf.pyct import anno from tensorflow.contrib.py2tf.pyct import parser from tensorflow.contrib.py2tf.pyct import templates +from tensorflow.contrib.py2tf.pyct import transformer +from tensorflow.python.util import tf_inspect class FunctionNamer(object): """Describes the interface for CallTreeTransformer's namer.""" def compiled_function_name(self, - original_name, - live_object=None, + original_fqn, + live_entity=None, owner_type=None): """Generate the name corresponding to the compiled version of a function. Args: - original_name: String - live_object: Callable, the actual target function, if known. + original_fqn: string or tuple(string) + live_entity: Callable, the actual target function, if known. owner_type: Optional object. If present, it indicates that the function is a member of the given type. Returns: - String. + string, bool """ raise NotImplementedError() - def compiled_class_name(self, original_name, live_object=None): + def compiled_class_name(self, original_fqn, live_entity=None): """Generate the name corresponding to the compiled version of a class. Args: - original_name: String - live_object: The actual target class, if known. + original_fqn: string or tuple(string) + live_entity: The actual target class, if known. Returns: - String. 
+ string """ raise NotImplementedError() -class CallTreeTransformer(gast.NodeTransformer): +class CallTreeTransformer(transformer.Base): """Transforms the call tree by renaming transformed symbols.""" - def __init__(self, namer, namespace, uncompiled_modules, - nocompile_decorators): - self.namer = namer - self.namespace = namespace + def __init__(self, context, uncompiled_modules, nocompile_decorators): + super(CallTreeTransformer, self).__init__(context) self.uncompiled_modules = uncompiled_modules self.nocompile_decorators = nocompile_decorators @@ -78,7 +78,7 @@ class CallTreeTransformer(gast.NodeTransformer): if isinstance(node, gast.Call): return self._resolve_name(node.func) if isinstance(node, gast.Name): - return self.namespace.get(node.id) + return self.context.namespace.get(node.id) if isinstance(node, gast.Attribute): parent = self._resolve_name(node.value) if parent is not None: @@ -91,8 +91,12 @@ class CallTreeTransformer(gast.NodeTransformer): if anno.hasanno(node, 'live_val'): return anno.getanno(node, 'live_val') if isinstance(node, gast.Attribute) and anno.hasanno(node, 'type'): - member = getattr(anno.getanno(node, 'type'), node.attr) - return member + owner_type = anno.getanno(node, 'type') + if hasattr(owner_type, node.attr): + return getattr(owner_type, node.attr) + else: + raise ValueError('Type "%s" has not attribute "%s". Is it dynamic?' % + (owner_type, node.attr)) return None def _should_compile(self, node, fqn): @@ -106,14 +110,14 @@ class CallTreeTransformer(gast.NodeTransformer): # The decorators themselves are not to be converted. # If present, the decorators should appear as static functions. - target_obj = self._try_resolve_target(node.func) - if target_obj is not None: + target_entity = self._try_resolve_target(node.func) + if target_entity is not None: # This attribute is set by the decorator itself. # TODO(mdan): This may not play nicely with other wrapping decorators. - if hasattr(target_obj, '__pyct_is_compile_decorator'): + if hasattr(target_entity, '__pyct_is_compile_decorator'): return False - if target_obj in self.nocompile_decorators: + if target_entity in self.nocompile_decorators: return False # Inspect the target function decorators. If any include a @convert @@ -122,7 +126,8 @@ class CallTreeTransformer(gast.NodeTransformer): # To parse and re-analize each function for every call site could be quite # wasteful. Maybe we could cache the parsed AST? try: - target_node = parser.parse_object(target_obj).body[0] + target_node, _ = parser.parse_entity(target_entity) + target_node = target_node.body[0] except TypeError: # Functions whose source we cannot access are compilable (e.g. wrapped # to py_func). @@ -136,48 +141,57 @@ class CallTreeTransformer(gast.NodeTransformer): return True + def _determine_function_owner(self, m): + # TODO(mdan): The parent type should be known at analysis. Use that instead. + if hasattr(m, 'im_class'): # Python 2 + return m.im_class + if hasattr(m, '__qualname__'): # Python 3 + # Object attributes: should be bound to "self". + if hasattr(m, '__self__'): + return type(m.__self__) + + # Class attributes: should have the owner name in their namespace. + qn = m.__qualname__.split('.') + if len(qn) < 2: + return None + owner_name, func_name = qn[-2:] + if func_name != m.__name__: + raise ValueError('Inconsistent names detected ' + '(__qualname__[1] = "%s", __name__ = "%s") for %s.' 
% + (func_name, m.__name__, m)) + if owner_name == '': + return None + if owner_name not in self.context.namespace: + raise ValueError( + 'Could not resolve name "%s" while analyzing %s. Namespace:\n%s' % + (owner_name, m, self.context.namespace)) + return self.context.namespace[owner_name] + return None + def _rename_compilable_function(self, node): assert anno.hasanno(node.func, 'live_val') assert anno.hasanno(node.func, 'fqn') - target_obj = anno.getanno(node.func, 'live_val') + target_entity = anno.getanno(node.func, 'live_val') target_fqn = anno.getanno(node.func, 'fqn') if not self._should_compile(node, target_fqn): return node if anno.hasanno(node, 'is_constructor'): - new_name = self.namer.compiled_class_name( - '__'.join(target_fqn), live_object=target_obj) + new_name = self.context.namer.compiled_class_name( + target_fqn, live_entity=target_entity) + do_rename = True else: - new_name = self.namer.compiled_function_name( - '__'.join(target_fqn), live_object=target_obj) - node.func = gast.Name(new_name, gast.Load(), None) - return node + owner_type = self._determine_function_owner(target_entity) + new_name, do_rename = self.context.namer.compiled_function_name( + target_fqn, live_entity=target_entity, owner_type=owner_type) - def _rename_member_function_of_known_type(self, node): - assert isinstance(node.func, gast.Attribute) - - type_fqn = anno.getanno(node.func, 'type_fqn') - assert anno.hasanno(node.func, 'type') - target_type = anno.getanno(node.func, 'type') - - if not self._should_compile(node, type_fqn): - return node - - # TODO(mdan): We should not assume that the namer only needs the - # member function name. - method_name = node.func.attr - method_object = getattr(target_type, method_name) - new_name = self.namer.compiled_function_name( - method_name, live_object=method_object, owner_type=target_type) - if new_name != node.func.attr: - # If a member function call is renamed, then the new function is no - # longer bound to the target object. We then refactor the call from: - # foo.bar(...) - # to: - # renamed_foo(bar, ...) - # TODO(mdan): This risks causing duplication, if target_type is renamed. - node.args = [node.func.value] + node.args + if do_rename: + if target_entity is not None: + if tf_inspect.ismethod(target_entity): + # The renaming process will transform it into a regular function. + # TODO(mdan): Is this complete? How does it work with nested members? + node.args = [node.func.value] + node.args node.func = gast.Name(new_name, gast.Load(), None) return node @@ -193,7 +207,7 @@ class CallTreeTransformer(gast.NodeTransformer): wrapper_def, call_expr = templates.replace( template, call=node.func, - wrapper=self.namer.compiled_function_name(node.func.id), + wrapper=self.context.namer.compiled_function_name(node.func.id)[0], args=tuple(gast.Name(n, gast.Load(), None) for n in args_scope.used)) anno.setanno(call_expr.value, 'args_scope', args_scope) # TODO(mdan): Rename this annotation to 'graph_ready' @@ -201,15 +215,15 @@ class CallTreeTransformer(gast.NodeTransformer): return (wrapper_def, call_expr) - def _function_is_compilable(self, target_obj): + def _function_is_compilable(self, target_entity): # TODO(mdan): This is just a placeholder. Implement. 
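# An illustrative, stdlib-only version of the owner lookup that
# _determine_function_owner performs above: bound methods expose their owner via
# type(__self__), while functions retrieved from a class are resolved by parsing
# __qualname__ against a namespace dict (standing in for context.namespace).
def owner_of(func, namespace):
    if hasattr(func, '__self__'):          # bound method: the owner is the receiver's type
        return type(func.__self__)
    qn = getattr(func, '__qualname__', '').split('.')
    if len(qn) >= 2:                       # e.g. 'TestClass.test_fn' -> look up 'TestClass'
        return namespace.get(qn[-2])
    return None

class TestClass(object):

    def test_fn(self):
        pass

print(owner_of(TestClass.test_fn, {'TestClass': TestClass}))  # the TestClass class
print(owner_of(TestClass().test_fn, {}))                      # also TestClass, via __self__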
- return not isinstance(target_obj, types.BuiltinFunctionType) + return not isinstance(target_entity, types.BuiltinFunctionType) def visit_Expr(self, node): if isinstance(node.value, gast.Call): if anno.hasanno(node.value.func, 'live_val'): - target_obj = anno.getanno(node.value.func, 'live_val') - if not self._function_is_compilable(target_obj): + target_entity = anno.getanno(node.value.func, 'live_val') + if not self._function_is_compilable(target_entity): if anno.hasanno(node.value.func, 'fqn'): target_fqn = anno.getanno(node.value.func, 'fqn') if not self._should_compile(node.value, target_fqn): @@ -227,8 +241,8 @@ class CallTreeTransformer(gast.NodeTransformer): # If the function is wrapped by one of the marker decorators, # consider it graph ready. if anno.hasanno(node.func, 'live_val'): - target_obj = anno.getanno(node.func, 'live_val') - if target_obj in self.nocompile_decorators: + target_entity = anno.getanno(node.func, 'live_val') + if target_entity in self.nocompile_decorators: if len(node.args) < 1: raise ValueError( 'Found call to decorator function "%s", but it had no arguments. ' @@ -237,28 +251,28 @@ class CallTreeTransformer(gast.NodeTransformer): self.generic_visit(node) if anno.hasanno(node.func, 'live_val'): - target_obj = anno.getanno(node.func, 'live_val') - if self._function_is_compilable(target_obj): + target_entity = anno.getanno(node.func, 'live_val') + if self._function_is_compilable(target_entity): node = self._rename_compilable_function(node) else: raise NotImplementedError('py_func with return values') - elif anno.hasanno(node.func, 'type_fqn'): - node = self._rename_member_function_of_known_type(node) else: - raise NotImplementedError( - 'Member function call (of unknown type): %s.' % node.func.id) + if self.context.recursive: + raise NotImplementedError('Could not resolve target function.') + else: + # TODO(mdan): Double check. Is this reachable code? + pass return node # pylint:enable=invalid-name -def transform(node, namer, namespace, uncompiled_modules, nocompile_decorators): +def transform(node, context, uncompiled_modules, nocompile_decorators): """Transform function call to the compiled counterparts. Args: node: AST to transform. - namer: FunctionNamer-like. - namespace: Dict mapping symbol names to their corresponding live objects. + context: An EntityContext object. uncompiled_modules: set of string tuples, each tuple represents the fully qualified name of a package containing functions that will not be compiled. 
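# A sketch of the namer contract that the refactored call_trees code relies on:
# compiled_function_name takes a fully qualified name (string or tuple) and
# returns a (new_name, do_rename) pair, declining to rename members of classes
# that are converted as a whole. Names below are illustrative only.
class SketchNamer(object):

    def __init__(self, global_namespace):
        self.global_namespace = set(global_namespace)

    def compiled_function_name(self, original_fqn, live_entity=None, owner_type=None):
        if owner_type is not None:
            return None, False             # members are renamed with their class instead
        if isinstance(original_fqn, tuple):
            original_fqn = '__'.join(original_fqn)
        new_name, n = 'tf__%s' % original_fqn, 0
        while new_name in self.global_namespace:   # avoid clobbering existing globals
            n += 1
            new_name = 'tf__%s_%d' % (original_fqn, n)
        self.global_namespace.add(new_name)
        return new_name, True

namer = SketchNamer({'tf__foo'})
print(namer.compiled_function_name(('foo',)))                   # ('tf__foo_1', True)
print(namer.compiled_function_name('bar', owner_type=object))   # (None, False)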
@@ -269,7 +283,6 @@ def transform(node, namer, namespace, uncompiled_modules, nocompile_decorators): node: The transformed AST new_names: set(string), containing any newly-generated names """ - transformer = CallTreeTransformer(namer, namespace, uncompiled_modules, - nocompile_decorators) - node = transformer.visit(node) + t = CallTreeTransformer(context, uncompiled_modules, nocompile_decorators) + node = t.visit(node) return node diff --git a/tensorflow/contrib/py2tf/converters/call_trees_test.py b/tensorflow/contrib/py2tf/converters/call_trees_test.py index 8cb8d7be0f1..e63c10de0fe 100644 --- a/tensorflow/contrib/py2tf/converters/call_trees_test.py +++ b/tensorflow/contrib/py2tf/converters/call_trees_test.py @@ -28,8 +28,13 @@ from tensorflow.python.platform import test class TestNamer(call_trees.FunctionNamer): - def compiled_function_name(self, original_name, live_object=None): - return 'renamed_%s' % original_name + def compiled_function_name(self, + original_fqn, + live_entity=None, + owner_type=None): + if owner_type is not None: + return None, False + return ('renamed_%s' % '_'.join(original_fqn)), True class CallTreesTest(converter_test_base.TestCase): @@ -45,14 +50,35 @@ class CallTreesTest(converter_test_base.TestCase): def test_fn_2(a): return test_fn_1(a) + 1 - node = self.parse_and_analyze(test_fn_2, {'test_fn_1': test_fn_1}) - node = call_trees.transform(node, TestNamer(), {}, (), ()) + node = self.parse_and_analyze( + test_fn_2, {'test_fn_1': test_fn_1}, namer=TestNamer()) + node = call_trees.transform(node, self.ctx, (), ()) result = compiler.ast_to_object(node) # Only test_fn_2 is transformed, so we'll insert renamed_test_fn_1 manually. setattr(result, 'renamed_test_fn_1', renamed_test_fn_1) self.assertEquals(3, result.test_fn_2(1)) + def test_simple_methods(self): + + class TestClass(object): + + def test_fn_1(self, a): + return a + 1 + + def test_fn_2(self, a): + return self.test_fn_1(a) + 1 + + node = self.parse_and_analyze( + TestClass.test_fn_2, {'TestClass': TestClass}, + namer=TestNamer(), + arg_types={'self': (TestClass.__name__, TestClass)}) + node = call_trees.transform(node, self.ctx, (), ()) + result = compiler.ast_to_object(node) + + tc = TestClass() + self.assertEquals(3, result.test_fn_2(tc, 1)) + def test_uncompiled_modules(self): def test_fn(a): @@ -60,11 +86,13 @@ class CallTreesTest(converter_test_base.TestCase): a = math_ops.add(a, constant_op.constant(1)) return a - node = self.parse_and_analyze(test_fn, { - 'math_ops': math_ops, - 'constant_op': constant_op - }) - node = call_trees.transform(node, TestNamer(), {}, + node = self.parse_and_analyze( + test_fn, { + 'math_ops': math_ops, + 'constant_op': constant_op + }, + namer=TestNamer()) + node = call_trees.transform(node, self.ctx, set(((math_ops.__name__,), (constant_op.__name__,))), ()) result = compiler.ast_to_object(node) diff --git a/tensorflow/contrib/py2tf/converters/converter_test_base.py b/tensorflow/contrib/py2tf/converters/converter_test_base.py index ed006bad6d8..6bfa55443c4 100644 --- a/tensorflow/contrib/py2tf/converters/converter_test_base.py +++ b/tensorflow/contrib/py2tf/converters/converter_test_base.py @@ -31,18 +31,23 @@ class TestCase(test.TestCase): def parse_and_analyze(self, test_fn, namespace, + namer=None, arg_types=None, - include_type_analysis=True): + include_type_analysis=True, + recursive=True): + node, source = parser.parse_entity(test_fn) ctx = context.EntityContext( - namer=None, - source_code=None, + namer=namer, + source_code=source, source_file=None, 
namespace=namespace, arg_values=None, - arg_types=arg_types) - node = parser.parse_object(test_fn) - node = access.resolve(node) - node = live_values.resolve(node, namespace, {}) + arg_types=arg_types, + recursive=recursive) + node = access.resolve(node, ctx) + node = live_values.resolve(node, ctx, {}) if include_type_analysis: node = type_info.resolve(node, ctx) + node = live_values.resolve(node, ctx, {}) + self.ctx = ctx return node diff --git a/tensorflow/contrib/py2tf/converters/decorators.py b/tensorflow/contrib/py2tf/converters/decorators.py index a4313bfa510..3f620c1cd2d 100644 --- a/tensorflow/contrib/py2tf/converters/decorators.py +++ b/tensorflow/contrib/py2tf/converters/decorators.py @@ -12,7 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Handles decorators.""" +"""Handles decorators. + +Note: this module only deals with functions whose decorators are still recorded +in the AST. This does not always happen. See the unit test for an example. +""" from __future__ import absolute_import from __future__ import division @@ -34,17 +38,19 @@ class DecoratorsTransformer(gast.NodeTransformer): def visit_FunctionDef(self, node): self.generic_visit(node) + kept_decorators = [] for dec in node.decorator_list: if isinstance(dec, gast.Call): - dec = dec.func - if not anno.hasanno(dec, 'live_val'): + dec_func = dec.func + else: + dec_func = dec + if not anno.hasanno(dec_func, 'live_val'): raise ValueError( - 'Could not resolve decorator: %s' % pretty_printer.fmt(dec)) - dec_value = anno.getanno(dec, 'live_val') - if dec_value in self.remove_decorators: - continue - raise ValueError('Dont know how to convert decorators for now.') - node.decorator_list = [] + 'Could not resolve decorator: %s' % pretty_printer.fmt(dec_func)) + dec_value = anno.getanno(dec_func, 'live_val') + if dec_value not in self.remove_decorators: + kept_decorators.append(dec) + node.decorator_list = kept_decorators return node # pylint:enable=invalid-name diff --git a/tensorflow/contrib/py2tf/converters/decorators_test.py b/tensorflow/contrib/py2tf/converters/decorators_test.py new file mode 100644 index 00000000000..f50d593043a --- /dev/null +++ b/tensorflow/contrib/py2tf/converters/decorators_test.py @@ -0,0 +1,96 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
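# A standalone illustration of the decorator handling introduced above: rather
# than stripping every decorator, only those on a removal list are dropped and
# the rest stay on the FunctionDef. The real converter matches decorators by
# their resolved live values; this sketch matches by name for brevity.
import ast

class FilterDecorators(ast.NodeTransformer):

    def __init__(self, remove_names):
        self.remove_names = set(remove_names)

    def visit_FunctionDef(self, node):
        self.generic_visit(node)
        kept = []
        for dec in node.decorator_list:
            # A decorator applied with arguments appears as a Call node.
            func = dec.func if isinstance(dec, ast.Call) else dec
            name = func.id if isinstance(func, ast.Name) else None
            if name not in self.remove_names:
                kept.append(dec)
        node.decorator_list = kept
        return node

src = "@convert()\n@other_decorator\ndef f(x):\n    return x\n"
tree = FilterDecorators({'convert'}).visit(ast.parse(src))
print(len(tree.body[0].decorator_list))  # 1: only other_decorator is kept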
+# ============================================================================== +"""Tests for decorators module.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import textwrap + +from tensorflow.contrib.py2tf.converters import converter_test_base +from tensorflow.contrib.py2tf.converters import decorators +from tensorflow.contrib.py2tf.pyct import compiler +from tensorflow.python.platform import test +from tensorflow.python.util import tf_inspect + + +class DecoratorsTest(converter_test_base.TestCase): + + def test_function_decorator(self): + + def function_decorator(): + + def decorator(f): + return lambda a: f(a) + 1 + + return decorator + + # The Python parser does capture decorators into the AST. + # However, the interpreter desugars them on load, and refering to the + # decorated function at runtime usually loses any trace of the decorator. + # Below is an example when that doesn't happen. + def static_wrapper(): + + @function_decorator() + def test_fn(a): # pylint:disable=unused-variable + return a + + node = self.parse_and_analyze(static_wrapper, + {'function_decorator': function_decorator}) + node = node.body[0].body[0] + + node = decorators.transform(node, remove_decorators=()) + result = compiler.ast_to_object( + node, + source_prefix=textwrap.dedent(tf_inspect.getsource(function_decorator))) + self.assertEqual(2, result.test_fn(1)) + + node = decorators.transform(node, remove_decorators=(function_decorator,)) + result = compiler.ast_to_object(node) + self.assertEqual(1, result.test_fn(1)) + + def test_simple_decorator(self): + + def simple_decorator(f): + return lambda a: f(a) + 1 + + # The Python parser does capture decorators into the AST. + # However, the interpreter desugars them upon load, and refering to the + # decorated function at runtime usually loses any trace of the decorator. + # Below is an example when that doesn't happen. + def static_wrapper(): + + @simple_decorator + def test_fn(a): # pylint:disable=unused-variable + return a + + node = self.parse_and_analyze(static_wrapper, + {'simple_decorator': simple_decorator}) + node = node.body[0].body[0] + + node = decorators.transform(node, remove_decorators=()) + result = compiler.ast_to_object( + node, + source_prefix=textwrap.dedent(tf_inspect.getsource(simple_decorator))) + self.assertEqual(2, result.test_fn(1)) + + node = decorators.transform(node, remove_decorators=(simple_decorator,)) + result = compiler.ast_to_object(node) + self.assertEqual(1, result.test_fn(1)) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards.py b/tensorflow/contrib/py2tf/converters/side_effect_guards.py index 4df723989d4..1eda8ae630a 100644 --- a/tensorflow/contrib/py2tf/converters/side_effect_guards.py +++ b/tensorflow/contrib/py2tf/converters/side_effect_guards.py @@ -96,6 +96,7 @@ class SideEffectGuardTransformer(gast.NodeTransformer): return node def _gate_symbols(self, guard_statement, guarded_args): + # TODO(mdan): This won't work for variables. template = """ (args,) = (tf.identity(a) for a in (args,)) """ @@ -110,33 +111,22 @@ class SideEffectGuardTransformer(gast.NodeTransformer): # opt.minimize(loss) # or: # tf.py_func(...) - args_scope = anno.getanno(node.value, 'args_scope') - temp_name = self.namer.new_symbol('temp', args_scope.parent.referenced) - # TODO(mdan): Unsafe reference modification! 
- args_scope.mark_write(temp_name) template = """ - temp_result = call - if temp_result is not None: - if not isinstance(temp_result, (list, tuple)): - temp_result = (temp_result,) - ctx = tf.control_dependencies(temp_result) - else: - ctx = contextmanager(lambda: (yield))() - with ctx: - # TODO(mdan): Also insert ops to re-fetch if variables are involved. + with py2tf_utils.control_dependency_on_returns(tf, call): + # TODO(mdan): Also insert ops to re-fetch if variables are involved? pass # Will be removed below. """ # TODO(mdan): This is brittle. Reorganize the mechanism. - statements = templates.replace( - template, call=node.value, temp_result=temp_name) + statements = templates.replace(template, call=node.value) control_deps_guard = statements[-1] control_deps_guard.body = [] # First, attempt to gate future evaluation of args. If that's not # possible, gate all remaining statements (and that may fail too, see # _visit_and_reindent. - guarded_args = tuple( - n for n in args_scope.used if n in args_scope.parent.modified) + args_scope = anno.getanno(node.value, 'args_scope') + guarded_args = tuple(args_scope.used & (args_scope.parent.modified + | args_scope.parent.returned)) if guarded_args: node = tuple(statements[:-1]) + ( self._gate_symbols(control_deps_guard, guarded_args),) diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py b/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py index 5c56973dc2a..452d7ab2be8 100644 --- a/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py +++ b/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.contrib.py2tf import utils from tensorflow.contrib.py2tf.converters import converter_test_base from tensorflow.contrib.py2tf.converters import side_effect_guards from tensorflow.contrib.py2tf.pyct import compiler @@ -46,6 +47,7 @@ class SideEffectGuardsTest(converter_test_base.TestCase): node = side_effect_guards.transform(node, TestNamer()) result = compiler.ast_to_object(node) setattr(result, 'state_ops', state_ops) + setattr(result, 'py2tf_utils', utils) # TODO(mdan): Configure the namespaces instead of doing these hacks. 
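# The template above leans on a py2tf_utils helper for control dependencies; the
# underlying TF 1.x pattern it expands to looks roughly like this: run the
# side-effecting op, then re-read the gated symbols through tf.identity so that
# downstream ops are forced to wait for the side effect.
import tensorflow as tf  # TF 1.x graph-mode API assumed

v = tf.Variable(0)
assign_op = tf.assign(v, 5)           # the guarded, side-effecting call
with tf.control_dependencies([assign_op]):
    gated_v = tf.identity(v)          # gating the symbol, as _gate_symbols does

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(gated_v))          # 5: the read happens after the assignment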
ops.identity = array_ops.identity diff --git a/tensorflow/contrib/py2tf/impl/BUILD b/tensorflow/contrib/py2tf/impl/BUILD new file mode 100644 index 00000000000..22f0c25cabc --- /dev/null +++ b/tensorflow/contrib/py2tf/impl/BUILD @@ -0,0 +1,65 @@ +licenses(["notice"]) # Apache 2.0 + +load("//tensorflow:tensorflow.bzl", "py_test") + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) + +py_library( + name = "impl", + srcs = [ + "api.py", + "config.py", + "conversion.py", + "naming.py", + ], + srcs_version = "PY2AND3", + visibility = ["//tensorflow:__subpackages__"], + deps = [ + "//tensorflow/contrib/py2tf/converters", + "//tensorflow/contrib/py2tf/pyct", + "//tensorflow/contrib/py2tf/pyct/static_analysis", + "@gast_archive//:gast", + "@six_archive//:six", + ], +) + +py_test( + name = "api_test", + srcs = ["api_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":impl", + "//tensorflow/python:client_testlib", + ], +) + +py_test( + name = "conversion_test", + srcs = ["conversion_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":impl", + "//tensorflow/python:client_testlib", + "@gast_archive//:gast", + ], +) + +py_test( + name = "naming_test", + srcs = ["naming_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":impl", + "//tensorflow/python:client_testlib", + ], +) diff --git a/tensorflow/contrib/py2tf/api.py b/tensorflow/contrib/py2tf/impl/api.py similarity index 97% rename from tensorflow/contrib/py2tf/api.py rename to tensorflow/contrib/py2tf/impl/api.py index ca1f4e2645e..85d40f31580 100644 --- a/tensorflow/contrib/py2tf/api.py +++ b/tensorflow/contrib/py2tf/impl/api.py @@ -23,8 +23,8 @@ from functools import wraps import gast import six -from tensorflow.contrib.py2tf import config -from tensorflow.contrib.py2tf import conversion +from tensorflow.contrib.py2tf.impl import config +from tensorflow.contrib.py2tf.impl import conversion from tensorflow.contrib.py2tf.pyct import compiler from tensorflow.contrib.py2tf.pyct import parser from tensorflow.python.util import tf_inspect @@ -86,8 +86,8 @@ def convert_inline(f, *args, **kwargs): def convert(recursive=False, arg_types=None): """Decorator that compiles a function to graph mode. - The decorator is dynamic - invoking compilation whenever the decorated function - is called. This means the parameter values are known at compilation. + The decorator is dynamic - invoking compilation whenever the decorated + function is called. This means the parameter values are known at compilation. 
Args: recursive: Whether to recusrively convert any functions that the decorator diff --git a/tensorflow/contrib/py2tf/api_test.py b/tensorflow/contrib/py2tf/impl/api_test.py similarity index 98% rename from tensorflow/contrib/py2tf/api_test.py rename to tensorflow/contrib/py2tf/impl/api_test.py index 2384447708d..dbd079a3ca6 100644 --- a/tensorflow/contrib/py2tf/api_test.py +++ b/tensorflow/contrib/py2tf/impl/api_test.py @@ -18,8 +18,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.py2tf import api -from tensorflow.contrib.py2tf import config +from tensorflow.contrib.py2tf.impl import api +from tensorflow.contrib.py2tf.impl import config from tensorflow.contrib.py2tf.pyct import parser from tensorflow.python.framework import constant_op from tensorflow.python.ops import math_ops diff --git a/tensorflow/contrib/py2tf/config.py b/tensorflow/contrib/py2tf/impl/config.py similarity index 85% rename from tensorflow/contrib/py2tf/config.py rename to tensorflow/contrib/py2tf/impl/config.py index 8c502a7a9e5..6525806a093 100644 --- a/tensorflow/contrib/py2tf/config.py +++ b/tensorflow/contrib/py2tf/impl/config.py @@ -32,7 +32,9 @@ DEFAULT_UNCOMPILED_MODULES = set(( NO_SIDE_EFFECT_CONSTRUCTORS = set(('tensorflow',)) # TODO(mdan): Also allow controlling the generated names (for testability). +# TODO(mdan): Verify that these names are not hidden by generated code. +# TODO(mdan): Make sure copybara renames the reference below. COMPILED_IMPORT_STATEMENTS = ( - 'from contextlib import contextmanager', 'import tensorflow as tf', -) + 'from tensorflow.contrib.py2tf import utils as ' + 'py2tf_utils') diff --git a/tensorflow/contrib/py2tf/conversion.py b/tensorflow/contrib/py2tf/impl/conversion.py similarity index 91% rename from tensorflow/contrib/py2tf/conversion.py rename to tensorflow/contrib/py2tf/impl/conversion.py index b484eebbd58..ed71ff5c063 100644 --- a/tensorflow/contrib/py2tf/conversion.py +++ b/tensorflow/contrib/py2tf/impl/conversion.py @@ -21,8 +21,7 @@ from __future__ import print_function import gast import six -from tensorflow.contrib.py2tf import config -from tensorflow.contrib.py2tf import naming +from tensorflow.contrib.py2tf.converters import asserts from tensorflow.contrib.py2tf.converters import break_canonicalization from tensorflow.contrib.py2tf.converters import builtin_functions from tensorflow.contrib.py2tf.converters import call_trees @@ -33,6 +32,8 @@ from tensorflow.contrib.py2tf.converters import for_canonicalization from tensorflow.contrib.py2tf.converters import logical_expressions from tensorflow.contrib.py2tf.converters import print_functions from tensorflow.contrib.py2tf.converters import side_effect_guards +from tensorflow.contrib.py2tf.impl import config +from tensorflow.contrib.py2tf.impl import naming from tensorflow.contrib.py2tf.pyct import context from tensorflow.contrib.py2tf.pyct import parser from tensorflow.contrib.py2tf.pyct.static_analysis import access @@ -171,7 +172,8 @@ def class_to_graph(c, conversion_map): def function_to_graph(f, conversion_map, arg_values, arg_types, owner_type=None): """Specialization of `entity_to_graph` for callable functions.""" - node = parser.parse_object(f).body[0] + node, source = parser.parse_entity(f) + node = node.body[0] namespace = six.get_function_globals(f) # This is needed for non-global functions. 
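# A minimal, stdlib-only sketch of what function_to_graph needs from a live
# function: its source (for parsing) and its globals (for name resolution).
# py2tf wraps the same idea behind parse_entity, tf_inspect and EntityContext.
import ast
import inspect
import textwrap

def entity_to_ast(f):
    source = textwrap.dedent(inspect.getsource(f))
    node = ast.parse(source)      # a Module whose body[0] is the FunctionDef
    namespace = f.__globals__     # what six.get_function_globals(f) returns
    return node.body[0], source, namespace

def example(x):
    return x + 1

fn_node, src, ns = entity_to_ast(example)
print(fn_node.name)           # 'example'
print('example' in ns)        # True: module globals are visible to the converter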
@@ -185,28 +187,29 @@ def function_to_graph(f, conversion_map, arg_values, arg_types, namer = conversion_map.new_namer(namespace) ctx = context.EntityContext( namer=namer, - source_code=tf_inspect.getsource(f), - source_file=tf_inspect.getfile(f), + source_code=source, + source_file='', namespace=namespace, arg_values=arg_values, - arg_types=arg_types) + arg_types=arg_types, + recursive=conversion_map.recursive) node = node_to_graph(node, ctx, conversion_map.nocompile_decorators) - # Simulate a rename to ensure the top level is in the name map. This is needed - # for top level functions, and it also helps the consistency verification made - # by update_name_map. - if owner_type is not None: - new_name = namer.compiled_function_name(f.__name__, f, owner_type) - else: - new_name = namer.compiled_function_name(f.__name__, f) + # TODO(mdan): This somewhat duplicates the call rename logic in call_treest.py + new_name, did_rename = namer.compiled_function_name(f.__name__, f, owner_type) + if not did_rename: + new_name = f.__name__ + if node.name != f.__name__: + raise NotImplementedError('Strange corner case. Send us offending code!') + node.name = new_name conversion_map.update_name_map(namer) - return node, conversion_map.name_map[f] + return node, new_name def _static_analysis_pass(node, ctx): - node = access.resolve(node) - node = live_values.resolve(node, ctx.namespace, config.PYTHON_LITERALS) + node = access.resolve(node, ctx) + node = live_values.resolve(node, ctx, config.PYTHON_LITERALS) node = type_info.resolve(node, ctx) return node @@ -243,6 +246,7 @@ def node_to_graph(node, ctx, nocompile_decorators): node = _static_analysis_pass(node, ctx) node = decorators.transform(node, nocompile_decorators) node = break_canonicalization.transform(node, ctx.namer) + node = asserts.transform(node, ctx) # Note: sequencing continue canonicalization before for loop one avoids # dealing with the extra loop increment operation that the for @@ -259,8 +263,7 @@ def node_to_graph(node, ctx, nocompile_decorators): node = _static_analysis_pass(node, ctx) node = print_functions.transform(node) - node = call_trees.transform(node, ctx.namer, ctx.namespace, - config.DEFAULT_UNCOMPILED_MODULES, + node = call_trees.transform(node, ctx, config.DEFAULT_UNCOMPILED_MODULES, nocompile_decorators) node = control_flow.transform(node, ctx.namer) node = logical_expressions.transform(node) diff --git a/tensorflow/contrib/py2tf/conversion_test.py b/tensorflow/contrib/py2tf/impl/conversion_test.py similarity index 97% rename from tensorflow/contrib/py2tf/conversion_test.py rename to tensorflow/contrib/py2tf/impl/conversion_test.py index 26f915f4f46..3888958f19b 100644 --- a/tensorflow/contrib/py2tf/conversion_test.py +++ b/tensorflow/contrib/py2tf/impl/conversion_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import gast -from tensorflow.contrib.py2tf import conversion +from tensorflow.contrib.py2tf.impl import conversion from tensorflow.python.platform import test diff --git a/tensorflow/contrib/py2tf/naming.py b/tensorflow/contrib/py2tf/impl/naming.py similarity index 66% rename from tensorflow/contrib/py2tf/naming.py rename to tensorflow/contrib/py2tf/impl/naming.py index a90758962b8..5c7e4c5f95a 100644 --- a/tensorflow/contrib/py2tf/naming.py +++ b/tensorflow/contrib/py2tf/impl/naming.py @@ -18,8 +18,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.util import tf_inspect - class Namer(object): """Implementation of 
the namer interfaces required by various converters. @@ -45,10 +43,15 @@ class Namer(object): self.generated_names = set() - def compiled_class_name(self, original_name, live_object=None): + def compiled_class_name(self, original_fqn, live_entity=None): """See call_trees.FunctionNamer.compiled_class_name.""" - if live_object is not None and live_object in self.renamed_calls: - return self.renamed_calls[live_object] + if live_entity is not None and live_entity in self.renamed_calls: + return self.renamed_calls[live_entity] + + if isinstance(original_fqn, tuple): + original_name = '__'.join(original_fqn) + else: + original_name = original_fqn new_name_root = 'Tf%s' % original_name new_name = new_name_root @@ -57,41 +60,46 @@ class Namer(object): n += 1 new_name = '%s_%d' % (new_name_root, n) - if live_object is not None: - self.renamed_calls[live_object] = new_name + if live_entity is not None: + self.renamed_calls[live_entity] = new_name self.generated_names.add(new_name) + if live_entity is not None: + self.renamed_calls[live_entity] = new_name return new_name def compiled_function_name(self, - original_name, - live_object=None, + original_fqn, + live_entity=None, owner_type=None): """See call_trees.FunctionNamer.compiled_function_name.""" - if live_object is not None and live_object in self.renamed_calls: - return self.renamed_calls[live_object] if not self.recursive: - new_name = original_name - elif owner_type is None or owner_type in self.partial_types: - # Top level functions: rename - new_name_root = 'tf__%s' % original_name - new_name = new_name_root - n = 0 - while new_name in self.global_namespace: - n += 1 - new_name = '%s_%d' % (new_name_root, n) - else: - if tf_inspect.isclass(owner_type): - # Class members: do not rename (the entire class will be renamed) - new_name = original_name - else: - raise NotImplementedError('Member function "%s" of non-class type: %s' % - (original_name, owner_type)) + return None, False - if live_object is not None: - self.renamed_calls[live_object] = new_name + if owner_type is not None and owner_type not in self.partial_types: + # Members are not renamed when part of an entire converted class. 
+ return None, False + + if isinstance(original_fqn, tuple): + original_name = '__'.join(original_fqn) + else: + original_name = original_fqn + + if live_entity is not None and live_entity in self.renamed_calls: + return self.renamed_calls[live_entity], True + + new_name_root = 'tf__%s' % original_name + new_name = new_name_root + n = 0 + while new_name in self.global_namespace: + n += 1 + new_name = '%s_%d' % (new_name_root, n) + + if live_entity is not None: + self.renamed_calls[live_entity] = new_name self.generated_names.add(new_name) - return new_name + + return new_name, True def new_symbol(self, name_root, reserved_locals): """See control_flow.SymbolNamer.new_symbol.""" diff --git a/tensorflow/contrib/py2tf/naming_test.py b/tensorflow/contrib/py2tf/impl/naming_test.py similarity index 82% rename from tensorflow/contrib/py2tf/naming_test.py rename to tensorflow/contrib/py2tf/impl/naming_test.py index 7bfc9b8733b..beb4e54937b 100644 --- a/tensorflow/contrib/py2tf/naming_test.py +++ b/tensorflow/contrib/py2tf/impl/naming_test.py @@ -18,7 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.py2tf import naming +from tensorflow.contrib.py2tf.impl import naming from tensorflow.python.platform import test @@ -29,8 +29,9 @@ class NamerTest(test.TestCase): pass namer = naming.Namer({}, True, None, ()) - self.assertEqual('tf__foo', namer.compiled_function_name('foo')) - self.assertEqual('tf__bar', namer.compiled_function_name('bar', bar)) + self.assertEqual(('tf__foo', True), namer.compiled_function_name('foo')) + self.assertEqual(('tf__bar', True), namer.compiled_function_name( + 'bar', bar)) self.assertEqual({bar: 'tf__bar'}, namer.renamed_calls) self.assertItemsEqual(('tf__bar', 'tf__foo'), namer.generated_names) @@ -39,15 +40,18 @@ class NamerTest(test.TestCase): pass namer = naming.Namer({}, True, None, ()) - self.assertEqual('tf__foo', namer.compiled_function_name('foo', foo)) - self.assertEqual('tf__foo', namer.compiled_function_name('foo', foo)) + self.assertEqual(('tf__foo', True), namer.compiled_function_name( + 'foo', foo)) + self.assertEqual(('tf__foo', True), namer.compiled_function_name( + 'foo', foo)) def test_compiled_function_name_avoids_global_conflicts(self): def foo(): pass namer = naming.Namer({'tf__foo': 1}, True, None, ()) - self.assertEqual('tf__foo_1', namer.compiled_function_name('foo', foo)) + self.assertEqual(('tf__foo_1', True), + namer.compiled_function_name('foo', foo)) def test_new_symbol_tracks_names(self): namer = naming.Namer({}, True, None, ()) diff --git a/tensorflow/contrib/py2tf/pyct/BUILD b/tensorflow/contrib/py2tf/pyct/BUILD index e0331dbc97c..1b2408ba0ea 100644 --- a/tensorflow/contrib/py2tf/pyct/BUILD +++ b/tensorflow/contrib/py2tf/pyct/BUILD @@ -31,6 +31,7 @@ py_library( deps = [ "@astor_archive//:astor", "@gast_archive//:gast", + "@six_archive//:six", "@termcolor_archive//:termcolor", ], ) @@ -38,6 +39,7 @@ py_library( py_test( name = "anno_test", srcs = ["anno_test.py"], + srcs_version = "PY2AND3", deps = [ ":pyct", "//tensorflow/python:client_testlib", @@ -47,6 +49,7 @@ py_test( py_test( name = "compiler_test", srcs = ["compiler_test.py"], + srcs_version = "PY2AND3", deps = [ ":pyct", "//tensorflow/python:client_testlib", @@ -57,6 +60,7 @@ py_test( py_test( name = "parser_test", srcs = ["parser_test.py"], + srcs_version = "PY2AND3", deps = [ ":pyct", "//tensorflow/python:client_testlib", @@ -66,6 +70,7 @@ py_test( py_test( name = "pretty_printer_test", srcs 
= ["pretty_printer_test.py"], + srcs_version = "PY2AND3", deps = [ ":pyct", "//tensorflow/python:client_testlib", @@ -75,6 +80,7 @@ py_test( py_test( name = "templates_test", srcs = ["templates_test.py"], + srcs_version = "PY2AND3", deps = [ ":pyct", "//tensorflow/python:client_testlib", diff --git a/tensorflow/contrib/py2tf/pyct/compiler.py b/tensorflow/contrib/py2tf/pyct/compiler.py index b09353cc72b..fc71469d1ea 100644 --- a/tensorflow/contrib/py2tf/pyct/compiler.py +++ b/tensorflow/contrib/py2tf/pyct/compiler.py @@ -41,7 +41,7 @@ def ast_to_source(node, indentation): return astor.source_repr.pretty_source(generator.result).lstrip() -def ast_to_object(node, indentation=' '): +def ast_to_object(node, indentation=' ', source_prefix=None): """Return the Python objects represented by given AST. Compiling the AST code this way ensures that the source code is readable by @@ -50,6 +50,7 @@ def ast_to_object(node, indentation=' '): Args: node: The code to compile, as an AST object. indentation: The string to use for indentation. + source_prefix: Optional string to print as-is into the source file. Returns: A module object containing the compiled source code. @@ -58,5 +59,8 @@ def ast_to_object(node, indentation=' '): with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f: module_name = os.path.basename(f.name[:-3]) + if source_prefix: + f.write(source_prefix) + f.write('\n') f.write(source) return imp.load_source(module_name, f.name) diff --git a/tensorflow/contrib/py2tf/pyct/context.py b/tensorflow/contrib/py2tf/pyct/context.py index 73f3613d09d..fef74ebefa2 100644 --- a/tensorflow/contrib/py2tf/pyct/context.py +++ b/tensorflow/contrib/py2tf/pyct/context.py @@ -33,10 +33,11 @@ class EntityContext(object): """ def __init__(self, namer, source_code, source_file, namespace, arg_values, - arg_types): + arg_types, recursive): self.namer = namer self.source_code = source_code self.source_file = source_file self.namespace = namespace self.arg_values = {} if arg_values is None else arg_values self.arg_types = {} if arg_types is None else arg_types + self.recursive = recursive diff --git a/tensorflow/contrib/py2tf/pyct/parser.py b/tensorflow/contrib/py2tf/pyct/parser.py index 3daa69b9cef..dc7df883b34 100644 --- a/tensorflow/contrib/py2tf/pyct/parser.py +++ b/tensorflow/contrib/py2tf/pyct/parser.py @@ -28,11 +28,13 @@ import gast from tensorflow.python.util import tf_inspect -def parse_object(obj): - """Return the AST of given object.""" - return parse_str(tf_inspect.getsource(obj)) +def parse_entity(entity): + """Return the AST of given entity.""" + source = tf_inspect.getsource(entity) + source = textwrap.dedent(source) + return parse_str(source), source def parse_str(src): """Return the AST of given piece of code.""" - return gast.parse(textwrap.dedent(src)) + return gast.parse(src) diff --git a/tensorflow/contrib/py2tf/pyct/parser_test.py b/tensorflow/contrib/py2tf/pyct/parser_test.py index 46f9aa82071..f35dfa04c70 100644 --- a/tensorflow/contrib/py2tf/pyct/parser_test.py +++ b/tensorflow/contrib/py2tf/pyct/parser_test.py @@ -18,6 +18,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import textwrap + from tensorflow.contrib.py2tf.pyct import parser from tensorflow.python.platform import test @@ -28,15 +30,16 @@ def f(x): class ParserTest(test.TestCase): - def test_parse_object(self): - mod = parser.parse_object(f) + def test_parse_entity(self): + mod, _ = parser.parse_entity(f) self.assertEqual('f', 
mod.body[0].name) def test_parse_str(self): - mod = parser.parse_str(""" + mod = parser.parse_str( + textwrap.dedent(""" def f(x): return x + 1 - """) + """)) self.assertEqual('f', mod.body[0].name) diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD b/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD index abaf9536781..32e2954fffc 100644 --- a/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD +++ b/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD @@ -32,6 +32,7 @@ py_library( py_test( name = "access_test", srcs = ["access_test.py"], + srcs_version = "PY2AND3", deps = [ ":static_analysis", "//tensorflow/contrib/py2tf/pyct", @@ -43,6 +44,7 @@ py_test( py_test( name = "live_values_test", srcs = ["live_values_test.py"], + srcs_version = "PY2AND3", deps = [ ":static_analysis", "//tensorflow/contrib/py2tf/pyct", @@ -53,6 +55,7 @@ py_test( py_test( name = "type_info_test", srcs = ["type_info_test.py"], + srcs_version = "PY2AND3", deps = [ ":static_analysis", "//tensorflow/contrib/py2tf/pyct", diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/access.py b/tensorflow/contrib/py2tf/pyct/static_analysis/access.py index 8f3ac48b68c..33629f87d1d 100644 --- a/tensorflow/contrib/py2tf/pyct/static_analysis/access.py +++ b/tensorflow/contrib/py2tf/pyct/static_analysis/access.py @@ -23,6 +23,7 @@ import copy import gast from tensorflow.contrib.py2tf.pyct import anno +from tensorflow.contrib.py2tf.pyct import transformer # TODO(mdan): Add support for PY3 (e.g. Param vs arg). @@ -53,6 +54,8 @@ class Scope(object): self.modified = set() self.created = set() self.used = set() + self.params = set() + self.returned = set() # TODO(mdan): Rename to `locals` @property @@ -69,42 +72,73 @@ class Scope(object): self.modified = copy.copy(other.modified) self.created = copy.copy(other.created) self.used = copy.copy(other.used) + self.params = copy.copy(other.params) + self.returned = copy.copy(other.returned) def merge_from(self, other): self.modified |= other.modified self.created |= other.created self.used |= other.used + self.params |= other.params + self.returned |= other.returned def has(self, name): - if name in self.modified: + if name in self.modified or name in self.params: return True elif self.parent is not None: return self.parent.has(name) return False + def is_modified_since_entry(self, name): + if name in self.modified: + return True + elif self.parent is not None and not self.isolated: + return self.parent.is_modified_since_entry(name) + return False + + def is_param(self, name): + if name in self.params: + return True + elif self.parent is not None and not self.isolated: + return self.parent.is_param(name) + return False + def mark_read(self, name): self.used.add(name) if self.parent is not None and name not in self.created: self.parent.mark_read(name) + def mark_param(self, name): + self.params.add(name) + + def mark_creation(self, name): + self.created.add(name) + def mark_write(self, name): self.modified.add(name) if self.isolated: - self.created.add(name) + self.mark_creation(name) else: if self.parent is None: - self.created.add(name) + self.mark_creation(name) else: if not self.parent.has(name): - self.created.add(name) + self.mark_creation(name) self.parent.mark_write(name) + def mark_returned(self, name): + self.returned.add(name) + if not self.isolated and self.parent is not None: + self.parent.mark_returned(name) -class AccessResolver(gast.NodeTransformer): + +class AccessResolver(transformer.Base): """Annotates nodes with local scope information. 
See Scope.""" - def __init__(self): + def __init__(self, context): + super(AccessResolver, self).__init__(context) self.scope = Scope(None) + self._in_return_statement = False def visit_Name(self, node): # TODO(mdan): This is insufficient for object fields, e.g. hp.learning_rate. @@ -120,10 +154,17 @@ class AccessResolver(gast.NodeTransformer): # TODO(mdan): This bay be incorrect with nested functions. # For nested functions, we'll have to add the notion of hiding args from # the parent scope, not writing to them. - self.scope.mark_write(node.id) + self.scope.mark_creation(node.id) + self.scope.mark_param(node.id) else: raise ValueError('Unknown context %s for node %s.' % (type(node.ctx), node.id)) + anno.setanno(node, 'is_modified_since_entry', + self.scope.is_modified_since_entry(node.id)) + anno.setanno(node, 'is_param', self.scope.is_param(node.id)) + + if self._in_return_statement: + self.scope.mark_returned(node.id) return node def visit_Print(self, node): @@ -138,7 +179,7 @@ class AccessResolver(gast.NodeTransformer): def visit_Call(self, node): current_scope = self.scope - args_scope = Scope(current_scope) + args_scope = Scope(current_scope, isolated=False) self.scope = args_scope for n in node.args: self.visit(n) @@ -200,6 +241,12 @@ class AccessResolver(gast.NodeTransformer): node, ((node.body, 'body'), (node.orelse, 'orelse'))) return node + def visit_Return(self, node): + self._in_return_statement = True + node = self.generic_visit(node) + self._in_return_statement = False + return node -def resolve(node): - return AccessResolver().visit(node) + +def resolve(node, context): + return AccessResolver(context).visit(node) diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/access_test.py b/tensorflow/contrib/py2tf/pyct/static_analysis/access_test.py index 0912ebb4c35..df0283b54d5 100644 --- a/tensorflow/contrib/py2tf/pyct/static_analysis/access_test.py +++ b/tensorflow/contrib/py2tf/pyct/static_analysis/access_test.py @@ -21,6 +21,7 @@ from __future__ import print_function import gast from tensorflow.contrib.py2tf.pyct import anno +from tensorflow.contrib.py2tf.pyct import context from tensorflow.contrib.py2tf.pyct import parser from tensorflow.contrib.py2tf.pyct.static_analysis import access from tensorflow.python.platform import test @@ -95,6 +96,19 @@ class ScopeTest(test.TestCase): class AccessResolverTest(test.TestCase): + def _parse_and_analyze(self, test_fn): + node, source = parser.parse_entity(test_fn) + ctx = context.EntityContext( + namer=None, + source_code=source, + source_file=None, + namespace={}, + arg_values=None, + arg_types=None, + recursive=True) + node = access.resolve(node, ctx) + return node + def test_local_markers(self): def test_fn(a): # pylint:disable=unused-argument @@ -103,9 +117,7 @@ class AccessResolverTest(test.TestCase): b -= 1 return b - node = parser.parse_object(test_fn) - node = access.resolve(node) - + node = self._parse_and_analyze(test_fn) self.assertFalse(anno.getanno(node.body[0].body[0].value, 'is_local')) # c in b = c self.assertTrue(anno.getanno(node.body[0].body[1].test.left, @@ -126,9 +138,7 @@ class AccessResolverTest(test.TestCase): print(a, b) return c - node = parser.parse_object(test_fn) - node = access.resolve(node) - + node = self._parse_and_analyze(test_fn) print_node = node.body[0].body[2] if isinstance(print_node, gast.Print): # Python 2 @@ -151,9 +161,7 @@ class AccessResolverTest(test.TestCase): foo(a, b) # pylint:disable=undefined-variable return c - node = parser.parse_object(test_fn) - node = access.resolve(node) 
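# A toy version of the bookkeeping the access resolver performs: walk a function
# and record which names are parameters, which are written, which are read, and
# which appear inside return statements. The real Scope also tracks nesting and
# isolation, omitted here.
import ast

class NameUsage(ast.NodeVisitor):

    def __init__(self):
        self.params, self.reads, self.writes, self.returned = set(), set(), set(), set()
        self._in_return = False

    def visit_arg(self, node):
        self.params.add(node.arg)

    def visit_Name(self, node):
        if isinstance(node.ctx, ast.Store):
            self.writes.add(node.id)
        else:
            self.reads.add(node.id)
        if self._in_return:
            self.returned.add(node.id)

    def visit_Return(self, node):
        self._in_return = True
        self.generic_visit(node)
        self._in_return = False

usage = NameUsage()
usage.visit(ast.parse("def f(a):\n    b = a + 1\n    return b\n"))
print(usage.params, usage.writes, usage.reads, usage.returned)
# e.g. {'a'} {'b'} {'a', 'b'} {'b'}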
- + node = self._parse_and_analyze(test_fn) call_node = node.body[0].body[2].value # We basically need to detect which variables are captured by the call # arguments. @@ -169,15 +177,13 @@ class AccessResolverTest(test.TestCase): b -= 1 return b, c - node = parser.parse_object(test_fn) - node = access.resolve(node) - + node = self._parse_and_analyze(test_fn) while_node = node.body[0].body[1] self.assertScopeIs( anno.getanno(while_node, 'body_scope'), ('b',), ('b', 'c'), ('c',)) self.assertScopeIs( anno.getanno(while_node, 'body_parent_scope'), ('a', 'b', 'c'), - ('a', 'b', 'c'), ('a', 'b', 'c')) + ('b', 'c'), ('a', 'b', 'c')) def test_for(self): @@ -188,15 +194,13 @@ class AccessResolverTest(test.TestCase): b -= 1 return b, c - node = parser.parse_object(test_fn) - node = access.resolve(node) - + node = self._parse_and_analyze(test_fn) for_node = node.body[0].body[1] self.assertScopeIs( anno.getanno(for_node, 'body_scope'), ('b',), ('b', 'c'), ('c',)) self.assertScopeIs( anno.getanno(for_node, 'body_parent_scope'), ('a', 'b', 'c'), - ('a', 'b', 'c', '_'), ('a', 'b', 'c', '_')) + ('b', 'c', '_'), ('a', 'b', 'c', '_')) def test_if(self): @@ -211,9 +215,7 @@ class AccessResolverTest(test.TestCase): u = -y return z, u - node = parser.parse_object(test_fn) - node = access.resolve(node) - + node = self._parse_and_analyze(test_fn) if_node = node.body[0].body[0] self.assertScopeIs( anno.getanno(if_node, 'body_scope'), ('x', 'y'), ('x', 'y', 'z'), diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py index 242e544b528..5a2903e6b59 100644 --- a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py +++ b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py @@ -26,26 +26,19 @@ from __future__ import print_function import gast from tensorflow.contrib.py2tf.pyct import anno +from tensorflow.contrib.py2tf.pyct import transformer -class LiveValueResolver(gast.NodeTransformer): +class LiveValueResolver(transformer.Base): """Annotates nodes with live values.""" - def __init__(self, namespace, literals): - """Create a new resolver. - - Args: - namespace: A dict representing the namespace visible to the AST in the - intended execution context. - literals: A dict mapping literal lymbol names to their value. An example - literal is "None". - """ - self.namespace = namespace + def __init__(self, context, literals): + super(LiveValueResolver, self).__init__(context) self.literals = literals def visit_ClassDef(self, node): self.generic_visit(node) - anno.setanno(node, 'live_val', self.namespace[node.name]) + anno.setanno(node, 'live_val', self.context.namespace[node.name]) return node def visit_Name(self, node): @@ -53,20 +46,31 @@ class LiveValueResolver(gast.NodeTransformer): if isinstance(node.ctx, gast.Load): assert anno.hasanno(node, 'is_local'), node symbol_is_local = anno.getanno(node, 'is_local') - if not symbol_is_local: + assert anno.hasanno(node, 'is_modified_since_entry'), node + symbol_is_modified = anno.getanno(node, 'is_modified_since_entry') + assert anno.hasanno(node, 'is_param'), node + symbol_is_param = anno.getanno(node, 'is_param') + + if not symbol_is_local and not symbol_is_param: if node.id in self.literals: anno.setanno(node, 'live_val', self.literals[node.id]) # TODO(mdan): Could live values have FQNs? i.e. 
'a'.join() - elif node.id in self.namespace: - obj = self.namespace[node.id] + elif node.id in self.context.namespace: + obj = self.context.namespace[node.id] anno.setanno(node, 'live_val', obj) anno.setanno(node, 'fqn', (obj.__name__,)) else: - raise ValueError('Could not find global symbol %s.' % node.id) + raise ValueError('Could not resolve symbol "%s".' % node.id) else: pass # TODO(mdan): Attempt to trace its value through the local chain. # TODO(mdan): Use type annotations as fallback. + + if not symbol_is_modified: + if node.id in self.context.arg_values: + obj = self.context.arg_values[node.id] + anno.setanno(node, 'live_val', obj) + anno.setanno(node, 'fqn', (obj.__class__.__name__,)) return node def visit_Attribute(self, node): @@ -79,15 +83,25 @@ class LiveValueResolver(gast.NodeTransformer): node.attr)) anno.setanno(node, 'live_val', getattr(parent_object, node.attr)) anno.setanno(node, 'fqn', anno.getanno(node.value, 'fqn') + (node.attr,)) + # TODO(mdan): Investigate the role built-in annotations can play here. + elif anno.hasanno(node.value, 'type'): + parent_type = anno.getanno(node.value, 'type') + if hasattr(parent_type, node.attr): + # This should hold for static members like methods. + # This would not hold for dynamic members like function attributes. + # For the dynamic case, we simply leave the node without an annotation, + # and let downstream consumers figure out what to do. + anno.setanno(node, 'live_val', getattr(parent_type, node.attr)) + anno.setanno(node, 'fqn', + anno.getanno(node.value, 'type_fqn') + (node.attr,)) elif isinstance(node.value, gast.Name): stem_name = node.value # All nonlocal symbols should be fully resolved. assert anno.hasanno(stem_name, 'is_local'), stem_name - assert anno.getanno(stem_name, 'is_local'), stem_name # TODO(mdan): Figure out what to do when calling attribute on local object # Maybe just leave as-is? 
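# A simplified picture of live-value resolution after the changes above: a
# loaded global name is looked up in the namespace and annotated with the live
# object plus its fully qualified name, and an attribute on an annotated value
# resolves through getattr. A plain dict stands in for the anno module.
import ast
import math

def annotate(node, namespace, annotations):
    for child in ast.iter_child_nodes(node):
        annotate(child, namespace, annotations)   # children first, so parents can build on them
    if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load):
        if node.id in namespace:
            annotations[node] = {'live_val': namespace[node.id], 'fqn': (node.id,)}
    elif isinstance(node, ast.Attribute):
        parent = annotations.get(node.value)
        if parent is not None and hasattr(parent['live_val'], node.attr):
            annotations[node] = {'live_val': getattr(parent['live_val'], node.attr),
                                 'fqn': parent['fqn'] + (node.attr,)}

annotations = {}
annotate(ast.parse("y = math.sqrt(x)"), {'math': math}, annotations)
print([a['fqn'] for a in annotations.values()])   # [('math',), ('math', 'sqrt')]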
return node -def resolve(node, namespace, literals): - return LiveValueResolver(namespace, literals).visit(node) +def resolve(node, context, literals): + return LiveValueResolver(context, literals).visit(node) diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py index e77497654a0..f3057b34667 100644 --- a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py +++ b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py @@ -19,24 +19,45 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib.py2tf.pyct import anno +from tensorflow.contrib.py2tf.pyct import context from tensorflow.contrib.py2tf.pyct import parser from tensorflow.contrib.py2tf.pyct.static_analysis import access from tensorflow.contrib.py2tf.pyct.static_analysis import live_values +from tensorflow.contrib.py2tf.pyct.static_analysis import type_info from tensorflow.python.framework import constant_op from tensorflow.python.platform import test class LiveValuesResolverTest(test.TestCase): + def _parse_and_analyze(self, + test_fn, + namespace, + literals=None, + arg_types=None): + literals = literals or {} + arg_types = arg_types or {} + node, source = parser.parse_entity(test_fn) + ctx = context.EntityContext( + namer=None, + source_code=source, + source_file=None, + namespace=namespace, + arg_values=None, + arg_types=arg_types, + recursive=True) + node = access.resolve(node, ctx) + node = live_values.resolve(node, ctx, literals) + node = type_info.resolve(node, ctx) + node = live_values.resolve(node, ctx, literals) + return node + def test_literals(self): def test_fn(): return Foo # pylint: disable=undefined-variable - node = parser.parse_object(test_fn) - node = access.resolve(node) - node = live_values.resolve(node, {}, {'Foo': 'bar'}) - + node = self._parse_and_analyze(test_fn, {}, {'Foo': 'bar'}) retval_node = node.body[0].body[0].value self.assertEquals('bar', anno.getanno(retval_node, 'live_val')) @@ -48,10 +69,7 @@ class LiveValuesResolverTest(test.TestCase): def test_fn(): return foo() - node = parser.parse_object(test_fn) - node = access.resolve(node) - node = live_values.resolve(node, {'foo': foo}, {}) - + node = self._parse_and_analyze(test_fn, {'foo': foo}) func_node = node.body[0].body[0].value.func self.assertEquals(foo, anno.getanno(func_node, 'live_val')) self.assertEquals(('foo',), anno.getanno(func_node, 'fqn')) @@ -61,15 +79,29 @@ class LiveValuesResolverTest(test.TestCase): def test_fn(): return constant_op.constant(0) - node = parser.parse_object(test_fn) - node = access.resolve(node) - node = live_values.resolve(node, {'constant_op': constant_op}, {}) - + node = self._parse_and_analyze(test_fn, {'constant_op': constant_op}) func_node = node.body[0].body[0].value.func self.assertEquals(constant_op.constant, anno.getanno(func_node, 'live_val')) self.assertEquals((constant_op.__name__, 'constant'), anno.getanno(func_node, 'fqn')) + def test_attributes_with_type_hints(self): + + class TestClass(object): + + def member(self): + pass + + def test_fn(self): + return self.member() + + node = self._parse_and_analyze( + TestClass.test_fn, {'constant_op': constant_op}, + arg_types={'self': (TestClass.__name__, TestClass)}) + func_node = node.body[0].body[0].value.func + self.assertEquals(TestClass.member, anno.getanno(func_node, 'live_val')) + self.assertEquals(('TestClass', 'member'), anno.getanno(func_node, 'fqn')) + if __name__ == '__main__': test.main() diff 
--git a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py index 0042aa90ed2..cf74142cbe9 100644 --- a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py +++ b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py @@ -36,8 +36,6 @@ class Scope(object): most recently assigned to the symbol. """ - # TODO(mdan): Should rather use a CFG here? - def __init__(self, parent): """Create a new scope. @@ -117,18 +115,32 @@ class TypeInfoResolver(transformer.Base): node.orelse = self._visit_block(node.orelse) return node + def _process_function_arg(self, arg_name): + if self.function_level == 1 and arg_name in self.context.arg_types: + # Forge a node to hold the type information, so that method calls on + # it can resolve the type. + type_holder = gast.Name(arg_name, gast.Load(), None) + type_string, type_obj = self.context.arg_types[arg_name] + anno.setanno(type_holder, 'type', type_obj) + anno.setanno(type_holder, 'type_fqn', tuple(type_string.split('.'))) + self.scope.setval(arg_name, type_holder) + + def visit_arg(self, node): + self._process_function_arg(node.arg) + return node + def visit_Name(self, node): self.generic_visit(node) if isinstance(node.ctx, gast.Param): - self.scope.setval(node.id, gast.Name(node.id, gast.Load(), None)) - if self.function_level == 1 and node.id in self.context.arg_types: - # Forge a node to hold the type information, so that method calls on - # it can resolve the type. - type_holder = gast.Name(node.id, gast.Load(), None) - type_string, type_obj = self.context.arg_types[node.id] - anno.setanno(type_holder, 'type', type_obj) - anno.setanno(type_holder, 'type_fqn', tuple(type_string.split('.'))) - self.scope.setval(node.id, type_holder) + self._process_function_arg(node.id) + elif isinstance(node.ctx, gast.Load) and self.scope.hasval(node.id): + # E.g. if we had + # a = b + # then for future references to `a` we should have traced_source = `b` + traced_source = self.scope.getval(node.id) + if anno.hasanno(traced_source, 'type'): + anno.setanno(node, 'type', anno.getanno(traced_source, 'type')) + anno.setanno(node, 'type_fqn', anno.getanno(traced_source, 'type_fqn')) return node def _process_variable_assignment(self, source, targets): @@ -172,38 +184,6 @@ class TypeInfoResolver(transformer.Base): self._process_variable_assignment(node.value, node.targets) return node - def visit_Call(self, node): - target = node.func - if not anno.hasanno(target, 'live_val'): - if not isinstance(target, gast.Attribute): - # Suspecting this pattern would reach here: - # foo = bar - # foo() - raise ValueError('Dont know how to handle dynamic functions.') - if not isinstance(target.value, gast.Name): - # Possible example of this kind: - # foo = module.Foo() - # foo.bar.baz() - # TODO(mdan): This should be doable by using the FQN. - raise ValueError('Dont know how to handle object properties yet.') - # In the example below, object_source is 'tr.train.Optimizer()': - # opt = tf.train.Optimizer() - # opt.foo() - if self.scope.hasval(target.value.id): - object_source = self.scope.getval(target.value.id) - if not anno.hasanno(object_source, 'type'): - raise ValueError('Could not determine type of "%s". Is it dynamic?' % - (target.value.id)) - anno.setanno(target, 'type', anno.getanno(object_source, 'type')) - anno.setanno(target, 'type_fqn', anno.getanno(object_source, - 'type_fqn')) - else: - # TODO(mdan): Figure out what could the user do to get past this. - raise ValueError('No info on "%s". 
Is it dynamically built?' % - (target.value.id)) - self.generic_visit(node) - return node - def resolve(node, context): return TypeInfoResolver(context).visit(node) diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py index a491f49ca3b..68fa1ee92a7 100644 --- a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py +++ b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py @@ -21,7 +21,6 @@ from __future__ import print_function from tensorflow.contrib.py2tf.pyct import anno from tensorflow.contrib.py2tf.pyct import context from tensorflow.contrib.py2tf.pyct import parser -from tensorflow.contrib.py2tf.pyct import transformer from tensorflow.contrib.py2tf.pyct.static_analysis import access from tensorflow.contrib.py2tf.pyct.static_analysis import live_values from tensorflow.contrib.py2tf.pyct.static_analysis import type_info @@ -57,17 +56,19 @@ class ScopeTest(test.TestCase): class TypeInfoResolverTest(test.TestCase): def _parse_and_analyze(self, test_fn, namespace, arg_types=None): + node, source = parser.parse_entity(test_fn) ctx = context.EntityContext( namer=None, - source_code=None, + source_code=source, source_file=None, namespace=namespace, arg_values=None, - arg_types=arg_types) - node = parser.parse_object(test_fn) - node = access.resolve(node) - node = live_values.resolve(node, namespace, {}) + arg_types=arg_types, + recursive=True) + node = access.resolve(node, ctx) + node = live_values.resolve(node, ctx, {}) node = type_info.resolve(node, ctx) + node = live_values.resolve(node, ctx, {}) return node def test_constructor_detection(self): @@ -83,16 +84,16 @@ class TypeInfoResolverTest(test.TestCase): self.assertEquals((training.__name__, 'GradientDescentOptimizer'), anno.getanno(call_node, 'type_fqn')) - def test_class_members(self): + def test_class_members_of_detected_constructor(self): def test_fn(): opt = training.GradientDescentOptimizer(0.1) opt.minimize(0) node = self._parse_and_analyze(test_fn, {'training': training}) - attr_call_node = node.body[0].body[1].value.func - self.assertEquals((training.__name__, 'GradientDescentOptimizer'), - anno.getanno(attr_call_node, 'type_fqn')) + method_call = node.body[0].body[1].value.func + self.assertEquals(training.GradientDescentOptimizer.minimize, + anno.getanno(method_call, 'live_val')) def test_class_members_in_with_stmt(self): @@ -106,11 +107,11 @@ class TypeInfoResolverTest(test.TestCase): self.assertEquals((session.__name__, 'Session'), anno.getanno(constructor_call, 'type_fqn')) - member_call = node.body[0].body[0].body[0].value.func - self.assertEquals((session.__name__, 'Session'), - anno.getanno(member_call, 'type_fqn')) + method_call = node.body[0].body[0].body[0].value.func + self.assertEquals(session.Session.run, anno.getanno(method_call, + 'live_val')) - def test_constructor_deta_dependent(self): + def test_constructor_data_dependent(self): def test_fn(x): if x > 0: @@ -119,16 +120,18 @@ class TypeInfoResolverTest(test.TestCase): opt = training.GradientDescentOptimizer(0.01) opt.minimize(0) - with self.assertRaises(transformer.PyFlowParseError): - self._parse_and_analyze(test_fn, {'training': training}) + node = self._parse_and_analyze(test_fn, {'training': training}) + method_call = node.body[0].body[1].value.func + self.assertFalse(anno.hasanno(method_call, 'live_val')) def test_parameter_class_members(self): def test_fn(opt): opt.minimize(0) - with self.assertRaises(transformer.PyFlowParseError): - 
self._parse_and_analyze(test_fn, {'training': training}) + node = self._parse_and_analyze(test_fn, {}) + method_call = node.body[0].body[0].value.func + self.assertFalse(anno.hasanno(method_call, 'live_val')) def test_parameter_class_members_with_value_hints(self): @@ -138,14 +141,13 @@ class TypeInfoResolverTest(test.TestCase): node = self._parse_and_analyze( test_fn, {'training': training}, arg_types={ - 'opt': (('%s.GradientDescentOptimizer' % training.__name__), - training.GradientDescentOptimizer(0.1)) + 'opt': (training.GradientDescentOptimizer.__name__, + training.GradientDescentOptimizer) }) - attr_call_node = node.body[0].body[0].value.func - self.assertEquals( - tuple(training.__name__.split('.')) + ('GradientDescentOptimizer',), - anno.getanno(attr_call_node, 'type_fqn')) + method_call = node.body[0].body[0].value.func + self.assertEquals(training.GradientDescentOptimizer.minimize, + anno.getanno(method_call, 'live_val')) def test_function_variables(self): @@ -156,8 +158,9 @@ class TypeInfoResolverTest(test.TestCase): foo = bar foo() - with self.assertRaises(transformer.PyFlowParseError): - self._parse_and_analyze(test_fn, {'bar': bar}) + node = self._parse_and_analyze(test_fn, {'bar': bar}) + method_call = node.body[0].body[1].value.func + self.assertFalse(anno.hasanno(method_call, 'live_val')) def test_nested_members(self): @@ -165,8 +168,9 @@ class TypeInfoResolverTest(test.TestCase): foo = training.GradientDescentOptimizer(0.1) foo.bar.baz() - with self.assertRaises(transformer.PyFlowParseError): - self._parse_and_analyze(test_fn, {'training': training}) + node = self._parse_and_analyze(test_fn, {'training': training}) + method_call = node.body[0].body[1].value.func + self.assertFalse(anno.hasanno(method_call, 'live_val')) if __name__ == '__main__': diff --git a/tensorflow/contrib/py2tf/pyct/templates.py b/tensorflow/contrib/py2tf/pyct/templates.py index 77c5fbe02a1..6be526f20dd 100644 --- a/tensorflow/contrib/py2tf/pyct/templates.py +++ b/tensorflow/contrib/py2tf/pyct/templates.py @@ -23,6 +23,7 @@ from __future__ import print_function import ast import copy +import textwrap import gast @@ -119,7 +120,7 @@ def replace(template, **replacements): """ if not isinstance(template, str): raise ValueError('Expected string template, got %s' % type(template)) - tree = parser.parse_str(template) + tree = parser.parse_str(textwrap.dedent(template)) for k in replacements: replacements[k] = _strings_to_names(replacements[k]) return ReplaceTransformer(replacements).visit(tree).body diff --git a/tensorflow/contrib/py2tf/pyct/transformer.py b/tensorflow/contrib/py2tf/pyct/transformer.py index d5aa23eaebb..8a836b7c1bf 100644 --- a/tensorflow/contrib/py2tf/pyct/transformer.py +++ b/tensorflow/contrib/py2tf/pyct/transformer.py @@ -18,7 +18,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import sys + import gast +import six from tensorflow.contrib.py2tf.pyct import pretty_printer @@ -48,11 +51,15 @@ class Base(gast.NodeTransformer): self._lineno = node.lineno self._col_offset = node.col_offset return super(Base, self).visit(node) - except ValueError as e: - msg = '%s\nOccurred at node:\n%s' % (str(e), pretty_printer.fmt(node)) + except (ValueError, AttributeError, NotImplementedError) as e: + msg = '%s: %s\nOccurred at node:\n%s' % (e.__class__.__name__, str(e), + pretty_printer.fmt(node)) if source_code: - line = self._source.splitlines()[self._lineno - 1] + line = source_code.splitlines()[self._lineno - 1] else: line = '' - 
raise PyFlowParseError( - msg, (source_file, self._lineno, self._col_offset + 1, line)) + six.reraise(PyFlowParseError, + PyFlowParseError( + msg, + (source_file, self._lineno, self._col_offset + 1, line)), + sys.exc_info()[2]) diff --git a/tensorflow/contrib/py2tf/utils/BUILD b/tensorflow/contrib/py2tf/utils/BUILD new file mode 100644 index 00000000000..01804aa8834 --- /dev/null +++ b/tensorflow/contrib/py2tf/utils/BUILD @@ -0,0 +1,37 @@ +licenses(["notice"]) # Apache 2.0 + +load("//tensorflow:tensorflow.bzl", "py_test") + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) + +py_library( + name = "utils", + srcs = [ + "__init__.py", + "context_managers.py", + ], + srcs_version = "PY2AND3", + visibility = ["//tensorflow:__subpackages__"], + deps = [ + ], +) + +py_test( + name = "context_managers_test", + srcs = ["context_managers_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":utils", + "//tensorflow/python:client_testlib", + ], +) diff --git a/tensorflow/contrib/py2tf/utils/__init__.py b/tensorflow/contrib/py2tf/utils/__init__.py new file mode 100644 index 00000000000..bca33e89e99 --- /dev/null +++ b/tensorflow/contrib/py2tf/utils/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utility module that contains APIs usable in the generated code.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.py2tf.utils.context_managers import control_dependency_on_returns diff --git a/tensorflow/contrib/py2tf/utils/context_managers.py b/tensorflow/contrib/py2tf/utils/context_managers.py new file mode 100644 index 00000000000..47d98399971 --- /dev/null +++ b/tensorflow/contrib/py2tf/utils/context_managers.py @@ -0,0 +1,41 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Various context managers.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import contextlib + + +def control_dependency_on_returns(tf, return_value): + """Create a TF control dependency on the return values of a function. 
+ + If the function had no return value, a no-op context is returned. + + Args: + tf: The TensorFlow module. + return_value: The return value to set as control dependency. + + Returns: + A context manager. + """ + if return_value is None: + return contextlib.contextmanager(lambda: (yield))() + # TODO(mdan): Filter to tensor objects. + if not isinstance(return_value, (list, tuple)): + return_value = (return_value,) + return tf.control_dependencies(return_value) diff --git a/tensorflow/contrib/py2tf/utils/context_managers_test.py b/tensorflow/contrib/py2tf/utils/context_managers_test.py new file mode 100644 index 00000000000..c903f082528 --- /dev/null +++ b/tensorflow/contrib/py2tf/utils/context_managers_test.py @@ -0,0 +1,43 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for context_managers module.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.py2tf.utils import context_managers +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import ops +from tensorflow.python.platform import test + + +class ContextManagersTest(test.TestCase): + + def test_control_dependency_on_returns(self): + # Just dry run them. 
+ with context_managers.control_dependency_on_returns(ops, None): + pass + with context_managers.control_dependency_on_returns( + ops, constant_op.constant(1)): + pass + with context_managers.control_dependency_on_returns( + ops, [constant_op.constant(1), + constant_op.constant(2)]): + pass + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD index 3c5b34a0a6a..b7d525a1fa2 100644 --- a/tensorflow/contrib/quantize/BUILD +++ b/tensorflow/contrib/quantize/BUILD @@ -77,9 +77,13 @@ py_library( "//tensorflow/contrib/graph_editor:graph_editor_py", "//tensorflow/python:array_ops", "//tensorflow/python:framework_ops", + "//tensorflow/python:layers", "//tensorflow/python:math_ops", "//tensorflow/python:nn", "//tensorflow/python:nn_ops", + "//tensorflow/python:ops", + "//tensorflow/python:training", + "//tensorflow/python:variables", ], ) diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py index aa605e6caad..8ec5334a393 100644 --- a/tensorflow/contrib/quantize/python/fold_batch_norms.py +++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py @@ -17,7 +17,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function - import re from tensorflow.contrib import graph_editor from tensorflow.contrib.quantize.python import common @@ -26,14 +25,16 @@ from tensorflow.contrib.quantize.python import input_to_ops from tensorflow.core.framework import attr_value_pb2 from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.layers import utils from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn from tensorflow.python.ops import nn_ops +from tensorflow.python.training import training_util from tensorflow.python.util import compat -def FoldBatchNorms(graph): +def FoldBatchNorms(graph, freeze_batch_norm_delay=None, is_training=True): """Finds batch norm layers and folds them into preceding layers. Folding only affects the following layers: Conv2D, fully connected, depthwise @@ -41,15 +42,25 @@ def FoldBatchNorms(graph): Args: graph: Graph to walk and modify. + freeze_batch_norm_delay: How many steps to wait before freezing + moving mean and variance and using them for batch normalization. This value + is used only when is_training is True. + is_training: Bool, true if training Raises: ValueError: When batch norm folding fails. """ - _FoldFusedBatchNorms(graph) - _FoldUnfusedBatchNorms(graph) + _FoldFusedBatchNorms( + graph, + freeze_batch_norm_delay=freeze_batch_norm_delay, + is_training=is_training) + _FoldUnfusedBatchNorms( + graph, + freeze_batch_norm_delay=freeze_batch_norm_delay, + is_training=is_training) -def _FoldFusedBatchNorms(graph): +def _FoldFusedBatchNorms(graph, freeze_batch_norm_delay, is_training): """Finds fused batch norm layers and folds them into preceding layers. Folding only affects the following layers: Conv2D, fully connected, depthwise @@ -57,6 +68,9 @@ def _FoldFusedBatchNorms(graph): Args: graph: Graph to walk and modify. + freeze_batch_norm_delay: How many steps to wait before freezing + moving mean and variance and using them for batch normalization + is_training: Bool, true if training Raises: ValueError: When batch norm folding fails. @@ -67,8 +81,7 @@ def _FoldFusedBatchNorms(graph): # `bn_op`. The '/' (i.e. 
`sep`) ensures that we reuse the existing scope # named `scope`. Otherwise, TF creates a unique scope whose name starts with # `scope`. - with graph.as_default(), graph.name_scope(scope + sep), ops.device( - match.bn_op.device): + with graph.as_default(), graph.name_scope(scope + sep): with graph.name_scope(scope + sep + 'BatchNorm_Fold' + sep): # new weights = old weights * gamma / sqrt(variance + epsilon) # new biases = -mean * gamma / sqrt(variance + epsilon) + beta @@ -79,9 +92,18 @@ def _FoldFusedBatchNorms(graph): match.mean_tensor * multiplier_tensor, name='bias') + correction_scale, correction_recip, correction_offset = None, None, None + if is_training: + correction_scale, correction_recip, correction_offset = ( + _ComputeBatchNormCorrections( + context='', + match=match, + freeze_batch_norm_delay=freeze_batch_norm_delay, + fused_batch_norm=True)) # The shape of depthwise weights is different, so we need to reshape the # multiplier_tensor to ensure that the scaled_weight_tensor has the # expected shape. + weights = match.weight_tensor if match.layer_op.type == 'DepthwiseConv2dNative': new_shape = [ match.weight_tensor.get_shape().as_list()[2], @@ -90,15 +112,29 @@ def _FoldFusedBatchNorms(graph): multiplier_tensor = array_ops.reshape( multiplier_tensor, new_shape, name='scale_reshape') + if correction_scale is not None: + correction_scale = array_ops.reshape( + correction_scale, new_shape, name='correction_reshape') + + if correction_scale is not None: + weights = math_ops.multiply( + correction_scale, weights, name='correction_mult') + # TODO(suharshs): This naming of the following ops needs to carefully # follow the naming expected by quantize.py. Generalize the quantize code # to not require these delicate naming conventions. scaled_weight_tensor = math_ops.multiply( - match.weight_tensor, multiplier_tensor, name='mul_fold') + weights, multiplier_tensor, name='mul_fold') new_layer_tensor = _CloneWithNewOperands( match.layer_op, match.input_tensor, scaled_weight_tensor) + if correction_recip is not None: + new_layer_tensor = math_ops.multiply( + correction_recip, new_layer_tensor, name='post_conv_mul') + new_layer_tensor = math_ops.add(new_layer_tensor, (correction_offset), + 'correction_add') + bias_add_tensor = math_ops.add( new_layer_tensor, bias_tensor, name='add_fold') @@ -165,6 +201,8 @@ def _FindFusedBatchNorms(graph): mean_pattern = graph_matcher.OpTypePattern('*') variance_pattern = graph_matcher.OpTypePattern('*') + moving_average_pattern = graph_matcher.OpTypePattern('*') + bn_decay_pattern = graph_matcher.OpTypePattern('*') conv_pattern = graph_matcher.OpTypePattern( 'Conv2D|DepthwiseConv2dNative', inputs=[input_pattern, weight_pattern]) # MatMul has a Reshape between it and FusedBatchNorm. 
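The fused batch-norm folding hunk above implements the identity spelled out in its comments: the per-channel multiplier is gamma / sqrt(variance + epsilon), the folded weights are the original weights scaled by that multiplier ('mul_fold'), and the folded bias is beta - mean * multiplier ('add_fold'), with an additional correction path applied when is_training is set. Below is a minimal NumPy sketch of just the folding identity, leaving out the correction terms and the depthwise reshape handling; the function name and variables are illustrative only and are not part of the patch.

import numpy as np

def fold_conv_bn(weights, gamma, beta, mean, variance, epsilon=1e-3):
  """Folds batch-norm parameters into conv weights and a bias (sketch only).

  For Conv2D weights of shape [kh, kw, in_ch, out_ch], broadcasting over the
  last axis applies the per-output-channel scaling:
    folded_weights = weights * gamma / sqrt(variance + epsilon)      # 'mul_fold'
    folded_bias    = beta - mean * gamma / sqrt(variance + epsilon)  # bias of 'add_fold'
  so that conv(x, folded_weights) + folded_bias equals batch_norm(conv(x, weights)).
  """
  multiplier = gamma / np.sqrt(variance + epsilon)
  return weights * multiplier, beta - mean * multiplier

For DepthwiseConv2dNative the multiplier has to be reshaped first (the patch's 'scale_reshape' and 'correction_reshape' ops) because the output channels are spread across the last two weight dimensions.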
@@ -180,6 +218,11 @@ def _FindFusedBatchNorms(graph): conv_pattern, gamma_pattern, beta_pattern, mean_pattern, variance_pattern ]) + conv_moving_average_sub_pattern = graph_matcher.OpTypePattern( + 'Sub', inputs=[moving_average_pattern, conv_batch_norm_pattern]) + # TODO(suharshs): Use a OneofPattern here when available + conv_moving_average_mul_pattern = graph_matcher.OpTypePattern( + 'Mul', inputs=[conv_moving_average_sub_pattern, bn_decay_pattern]) matmul_batch_norm_pattern = graph_matcher.OpTypePattern( 'FusedBatchNorm', inputs=[ @@ -191,8 +234,34 @@ def _FindFusedBatchNorms(graph): inputs=[matmul_batch_norm_pattern, graph_matcher.OpTypePattern('*')]) + matmul_moving_average_sub_pattern = graph_matcher.OpTypePattern( + 'Sub', inputs=[moving_average_pattern, matmul_batch_norm_pattern]) + matmul_moving_average_mul_pattern = graph_matcher.OpTypePattern( + 'Mul', inputs=[matmul_moving_average_sub_pattern, bn_decay_pattern]) + conv_matcher = graph_matcher.GraphMatcher(conv_batch_norm_pattern) matmul_matcher = graph_matcher.GraphMatcher(matmul_bn_output_reshape_pattern) + conv_moving_average_mul_matcher = graph_matcher.GraphMatcher( + conv_moving_average_mul_pattern) + matmul_moving_average_mul_matcher = graph_matcher.GraphMatcher( + matmul_moving_average_mul_pattern) + + def _GetMovingAverageTensors(graph, moving_avg_mul_matcher, + moving_avg_sub_pattern, bn_op): + """Gets the moving mean and variance tensors and the batch norm momentum.""" + for mul_match_result in moving_avg_mul_matcher.match_graph(graph): + sub_op = mul_match_result.get_op(moving_avg_sub_pattern) + + if sub_op.inputs[1].name == bn_op.outputs[1].name: + # During training: Batch Mean is bn_op.outputs[1] + moving_mean_tensor = sub_op.inputs[0] + bn_decay_mean_tensor = mul_match_result.get_tensor(bn_decay_pattern) + if sub_op.inputs[1].name == bn_op.outputs[2].name: + # During training: Batch Var is bn_op.outputs[2] + moving_variance_tensor = sub_op.inputs[0] + bn_decay_var_tensor = mul_match_result.get_tensor(bn_decay_pattern) + return (moving_mean_tensor, bn_decay_mean_tensor, moving_variance_tensor, + bn_decay_var_tensor) def _GetCommonTensors(match_result, bn_op, bn_input_tensor): """Gets tensors needed for FusedBatchNormMatch from match_result.""" @@ -222,10 +291,14 @@ def _FindFusedBatchNorms(graph): # calculation, the variance is corrected by the term N/N-1 (Bessel's # correction). The variance tensor read from FuseBatchNorm has bessel's # correction applied, so we undo it here. 
- n = math_ops.cast( - array_ops.size(bn_input_tensor) / array_ops.size(mean_tensor), - dtypes.float32) - variance_tensor = bn_op.outputs[2] * (n - 1) / n + scope, sep, _ = bn_op.name.rpartition('/') + g = ops.get_default_graph() + with g.as_default(), g.name_scope(scope + sep): + n = math_ops.cast( + array_ops.size(bn_input_tensor) / array_ops.size(mean_tensor), + dtypes.float32) + variance_tensor = math_ops.multiply( + bn_op.outputs[2], (n - 1) / n, name='Undo_Bessel_Correction') else: mean_tensor = match_result.get_tensor(mean_pattern) variance_tensor = match_result.get_tensor(variance_pattern) @@ -233,15 +306,30 @@ def _FindFusedBatchNorms(graph): variance_tensor) for match_result in conv_matcher.match_graph(graph): + moving_mean_tensor = None + moving_variance_tensor = None + bn_decay_mean_tensor = None + bn_decay_var_tensor = None layer_op = match_result.get_op(conv_pattern) layer_tensor = match_result.get_tensor(conv_pattern) bn_op = match_result.get_op(conv_batch_norm_pattern) - # In the case of convolution the output_tensor is the output of bn_op. - output_tensor = bn_op.outputs[0] + if bn_op.get_attr('is_training'): + (moving_mean_tensor, bn_decay_mean_tensor, moving_variance_tensor, + bn_decay_var_tensor) = _GetMovingAverageTensors( + graph, + moving_avg_mul_matcher=conv_moving_average_mul_matcher, + moving_avg_sub_pattern=conv_moving_average_sub_pattern, + bn_op=bn_op) + output_tensor = bn_op.outputs[0] + batch_epsilon_tensor = bn_op.get_attr('epsilon') (input_tensor, weight_tensor, gamma_tensor, beta_tensor, mean_tensor, - variance_tensor) = _GetCommonTensors(match_result, bn_op, layer_tensor) - yield _FusedBatchNormMatch( + variance_tensor) = _GetCommonTensors( + match_result, + bn_op, + layer_tensor, + ) + yield _BatchNormMatch( layer_op=layer_op, bn_op=bn_op, output_tensor=output_tensor, @@ -250,20 +338,38 @@ def _FindFusedBatchNorms(graph): gamma_tensor=gamma_tensor, beta_tensor=beta_tensor, mean_tensor=mean_tensor, - variance_tensor=variance_tensor) + variance_tensor=variance_tensor, + moving_mean_tensor=moving_mean_tensor, + moving_variance_tensor=moving_variance_tensor, + bn_decay_mean_tensor=bn_decay_mean_tensor, + bn_decay_var_tensor=bn_decay_var_tensor, + batch_epsilon_tensor=batch_epsilon_tensor) for match_result in matmul_matcher.match_graph(graph): + moving_mean_tensor = None + moving_variance_tensor = None + bn_decay_mean_tensor = None + bn_decay_var_tensor = None layer_op = match_result.get_op(matmul_pattern) layer_tensor = match_result.get_tensor(matmul_pattern) bn_op = match_result.get_op(matmul_batch_norm_pattern) + if bn_op.get_attr('is_training'): + (moving_mean_tensor, bn_decay_mean_tensor, moving_variance_tensor, + bn_decay_var_tensor) = _GetMovingAverageTensors( + graph, + moving_avg_mul_matcher=matmul_moving_average_mul_matcher, + moving_avg_sub_pattern=matmul_moving_average_sub_pattern, + bn_op=bn_op) + # In the MatMul case, the output of batch norm is reshaped back into a # 2D tensor, so the output_tensor is the output of the Reshape op. 
output_reshape_op = match_result.get_op(matmul_bn_output_reshape_pattern) output_tensor = output_reshape_op.outputs[0] + batch_epsilon_tensor = bn_op.get_attr('epsilon') (input_tensor, weight_tensor, gamma_tensor, beta_tensor, mean_tensor, variance_tensor) = _GetCommonTensors(match_result, bn_op, layer_tensor) - yield _FusedBatchNormMatch( + yield _BatchNormMatch( layer_op=layer_op, bn_op=bn_op, output_tensor=output_tensor, @@ -272,15 +378,21 @@ def _FindFusedBatchNorms(graph): gamma_tensor=gamma_tensor, beta_tensor=beta_tensor, mean_tensor=mean_tensor, - variance_tensor=variance_tensor) + variance_tensor=variance_tensor, + moving_mean_tensor=moving_mean_tensor, + moving_variance_tensor=moving_variance_tensor, + bn_decay_mean_tensor=bn_decay_mean_tensor, + bn_decay_var_tensor=bn_decay_var_tensor, + batch_epsilon_tensor=batch_epsilon_tensor) -class _FusedBatchNormMatch(object): - """Contains all information related to a found FusedBatchNorm.""" +class _BatchNormMatch(object): + """Contains all information related to a found Fused/UnfusedBatchNorm.""" def __init__(self, layer_op, bn_op, output_tensor, input_tensor, weight_tensor, gamma_tensor, beta_tensor, mean_tensor, - variance_tensor): + variance_tensor, moving_mean_tensor, moving_variance_tensor, + bn_decay_mean_tensor, bn_decay_var_tensor, batch_epsilon_tensor): self._layer_op = layer_op self._bn_op = bn_op self._output_tensor = output_tensor @@ -290,6 +402,11 @@ class _FusedBatchNormMatch(object): self._beta_tensor = beta_tensor self._mean_tensor = mean_tensor self._variance_tensor = variance_tensor + self._moving_mean_tensor = moving_mean_tensor + self._moving_variance_tensor = moving_variance_tensor + self._bn_decay_mean_tensor = bn_decay_mean_tensor + self._bn_decay_var_tensor = bn_decay_var_tensor + self._batch_epsilon_tensor = batch_epsilon_tensor @property def layer_op(self): @@ -327,8 +444,28 @@ class _FusedBatchNormMatch(object): def variance_tensor(self): return self._variance_tensor + @property + def moving_mean_tensor(self): + return self._moving_mean_tensor -def _FoldUnfusedBatchNorms(graph): + @property + def moving_variance_tensor(self): + return self._moving_variance_tensor + + @property + def batch_epsilon_tensor(self): + return self._batch_epsilon_tensor + + @property + def bn_decay_mean_tensor(self): + return self._bn_decay_mean_tensor + + @property + def bn_decay_var_tensor(self): + return self._bn_decay_var_tensor + + +def _FoldUnfusedBatchNorms(graph, freeze_batch_norm_delay, is_training): """Finds unfused batch norm layers and folds them into preceding layers. Folding only affects the following layers: Conv2D, fully connected, depthwise @@ -336,6 +473,9 @@ def _FoldUnfusedBatchNorms(graph): Args: graph: Graph to walk and modify. + freeze_batch_norm_delay: How many steps to wait before freezing + moving mean and variance and using them for batch normalization + is_training: Bool, True if training Raises: ValueError: When batch norm folding fails. @@ -346,7 +486,12 @@ def _FoldUnfusedBatchNorms(graph): has_scaling = _HasScaling(graph, input_to_ops_map, bn) # The mangling code intimately depends on BatchNorm node's internals. 
- original_op, folded_op = _CreateFoldedOp(graph, bn, has_scaling=has_scaling) + original_op, folded_op = _CreateFoldedOp( + graph, + bn, + has_scaling=has_scaling, + freeze_batch_norm_delay=freeze_batch_norm_delay, + is_training=is_training) activation = common.GetEndpointActivationOp(graph, bn) if activation: @@ -407,7 +552,186 @@ def _HasScaling(graph, input_to_ops_map, bn): return sum(1 for op in rsqrt_consumers if op.type == 'Mul') == 1 -def _CreateFoldedOp(graph, context, has_scaling): +def _GetBatchNormParams(graph, context, has_scaling): + """Extracts relevant tensors for folding batch norms. + + Args: + graph: Graph to inspect. + context: The scope under which we look for batch norm params + has_scaling: Bool that specifies if scaling is done as part of batch + norm + + Returns: + _BatchNormMatch containing all required batch norm parameters + """ + gamma_tensor = None + batch_mean_tensor = None + batch_variance_tensor = None + moving_mean_tensor = None + moving_variance_tensor = None + batch_epsilon_tensor = None + bn_decay_mean_tensor = None + bn_decay_var_tensor = None + + split_context = context.split('/') + base_context = split_context[-1] + + oplist = graph.get_operations() + op_suffix_gamma = base_context + '/BatchNorm/gamma' + op_suffix_mean = base_context + '/BatchNorm/moments/Squeeze' + op_suffix_variance = base_context + '/BatchNorm/moments/Squeeze_1' + op_suffix_moving_variance = base_context + '/BatchNorm/moving_variance/read' + op_suffix_moving_mean = base_context + '/BatchNorm/moving_mean/read' + op_suffix_epsilon = base_context + '/BatchNorm/batchnorm/add/y' + op_suffix_bn_decay_mean = base_context + '/BatchNorm/AssignMovingAvg/decay' + op_suffix_bn_decay_var = base_context + '/BatchNorm/AssignMovingAvg_1/decay' + + # Parse through list of ops to find relevant ops + for op in oplist: + if op.name.endswith(op_suffix_mean): + # This is an efficient way to check for two things: + # Is batch norm present and is it training mode? 
+ # Batch statistics are computed only during batch norm in training + batch_mean_tensor = graph.get_tensor_by_name(op.name + ':0') + if op.name.endswith(op_suffix_variance): + batch_variance_tensor = graph.get_tensor_by_name(op.name + ':0') + if op.name.endswith(op_suffix_moving_mean): + moving_mean_tensor = graph.get_tensor_by_name(op.name + ':0') + if op.name.endswith(op_suffix_moving_variance): + moving_variance_tensor = graph.get_tensor_by_name(op.name + ':0') + if op.name.endswith(op_suffix_epsilon): + batch_epsilon_tensor = graph.get_tensor_by_name(op.name + ':0') + if op.name.endswith(op_suffix_bn_decay_mean): + bn_decay_mean_tensor = graph.get_tensor_by_name(op.name + ':0') + if op.name.endswith(op_suffix_bn_decay_var): + bn_decay_var_tensor = graph.get_tensor_by_name(op.name + ':0') + if has_scaling: + if op.name.endswith(op_suffix_gamma): + gamma_tensor = graph.get_tensor_by_name(op.name + ':0') + + if not has_scaling: + gamma_tensor = array_ops.ones(batch_mean_tensor.shape) + + return _BatchNormMatch( + layer_op=None, + bn_op=None, + output_tensor=None, + input_tensor=None, + weight_tensor=None, + gamma_tensor=gamma_tensor, + beta_tensor=None, + mean_tensor=batch_mean_tensor, + variance_tensor=batch_variance_tensor, + moving_mean_tensor=moving_mean_tensor, + moving_variance_tensor=moving_variance_tensor, + bn_decay_mean_tensor=bn_decay_mean_tensor, + bn_decay_var_tensor=bn_decay_var_tensor, + batch_epsilon_tensor=batch_epsilon_tensor) + + +def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay, + fused_batch_norm): + """Computes batch norm correction params. + + Before batch normalization is frozen: + We use batch statistics for batch norm. + correction_scale = sigma_b/sigma_mv + correction_recip = 1/correction_scale + correction_offset = 0 + + After batch normalization is frozen: + correction_scale = sigma_b/sigma_mv + correction_recip = 1 + correction_offset = gamma*(mu_b/sigma_b-mu_mv/sigma_mv). + + Batch norm is frozen if global_step > bn_freeze_delay. + The corrections ensure that: + a) The weights are quantized after scaling by gamma/sigma_mv. This enables + smoother training as the scaling on the weights changes slowly, rather than + jump across mini-batches + b) Changing the values of the corrections allows for one to switch between + using batch statistics to using moving mean and average, without requiring + changes to batch_norm + + + Args: + context: The scope under which we look for batch norm params + match: Object containg required batch norm tensors for correction + computation + freeze_batch_norm_delay: Delay in steps at which computation switches + from regular batch norm to frozen mean and variance. 
+ fused_batch_norm: Bool, true if fused batch norm is used + + Returns: + A tuple of correction_scale, correction_recip, correction_offset + """ + + g = ops.get_default_graph() + with g.name_scope(context + 'batch_norm_correction'): + recip_sigma_mv = math_ops.rsqrt( + match.moving_variance_tensor + match.batch_epsilon_tensor) + recip_sigma = math_ops.rsqrt( + match.variance_tensor + match.batch_epsilon_tensor) + correction_scale = math_ops.divide( + recip_sigma_mv, recip_sigma, name='scale_compute') + correction_scale = array_ops.identity( + correction_scale, name='correction_scale') + correction_recip = math_ops.reciprocal( + correction_scale, name='reciprocal_compute') + correction_offset = math_ops.multiply( + match.gamma_tensor, + match.mean_tensor * recip_sigma - + match.moving_mean_tensor * recip_sigma_mv, + name='offset_compute') + + if freeze_batch_norm_delay is not None: + use_mv_avg = math_ops.greater_equal( + training_util.get_or_create_global_step(), + freeze_batch_norm_delay, + name='use_moving_average') + else: + use_mv_avg = False + + bn_decay_zero = 0.0 + bn_decay_mean_consumers = list(match.bn_decay_mean_tensor.consumers()) + bn_decay_var_consumers = list(match.bn_decay_mean_tensor.consumers()) + + bn_decay_mean_out = utils.smart_cond( + use_mv_avg, + lambda: bn_decay_zero, + lambda: match.bn_decay_mean_tensor, + name='freeze_moving_mean') + graph_editor.reroute_ts( + [bn_decay_mean_out], [match.bn_decay_mean_tensor], + can_modify=bn_decay_mean_consumers) + + if fused_batch_norm is False: + bn_decay_var_consumers = list(match.bn_decay_var_tensor.consumers()) + bn_decay_var_out = utils.smart_cond( + use_mv_avg, + lambda: bn_decay_zero, + lambda: match.bn_decay_var_tensor, + name='freeze_moving_var') + graph_editor.reroute_ts( + [bn_decay_var_out], [match.bn_decay_var_tensor], + can_modify=bn_decay_var_consumers) + + correction_recip = utils.smart_cond( + use_mv_avg, + lambda: array_ops.ones(correction_scale.shape), + lambda: correction_recip, + name='correction_recip') + + correction_offset = utils.smart_cond( + use_mv_avg, + lambda: correction_offset, + lambda: array_ops.zeros(correction_offset.shape), + name='correction_offset') + return correction_scale, correction_recip, correction_offset + + +def _CreateFoldedOp(graph, context, has_scaling, freeze_batch_norm_delay, + is_training): """Folds in batch norm layer into preceding convolution or FC layer. Creates 3 new nodes, connects their inputs and adds them to the graph: @@ -419,6 +743,9 @@ def _CreateFoldedOp(graph, context, has_scaling): context: String, batch norm context, i.e. node into which BatchNorm is nested. has_scaling: Whether the batch norm has scaling enabled. + freeze_batch_norm_delay: How many steps to wait before freezing + moving mean and variance and using them for batch normalization + is_training: Bool, true if training Raises: ValueError: When operation type is not supported, or input and output tensor @@ -435,19 +762,43 @@ def _CreateFoldedOp(graph, context, has_scaling): mul_scale_name) op_below = mul_scale.inputs[0].op weights = op_below.inputs[1] - + match = _GetBatchNormParams( + graph=graph, context=context, has_scaling=has_scaling) + correction_scale, correction_recip, correction_offset = None, None, None + if is_training: + correction_scale, correction_recip, correction_offset = ( + _ComputeBatchNormCorrections( + context=context, + match=match, + freeze_batch_norm_delay=freeze_batch_norm_delay, + fused_batch_norm=False)) # Special handling for weights of depthwise convolution. 
if op_below.type == 'DepthwiseConv2dNative': - new_shape = [weights.get_shape().as_list()[2], - weights.get_shape().as_list()[3]] + new_shape = [ + weights.get_shape().as_list()[2], + weights.get_shape().as_list()[3] + ] scale_name = 'mul' if has_scaling else 'Rsqrt' - scale = graph.get_operation_by_name(context + '/BatchNorm/batchnorm/' + - scale_name) + scale = graph.get_operation_by_name( + context + '/BatchNorm/batchnorm/' + scale_name) scale = array_ops.reshape(scale.outputs[0], new_shape, context + '/scale_reshape') - mul_fold = _CloneOp(mul_scale, context + '/mul_fold', - [(0, weights), (1, scale)]) + + if correction_scale is not None: + correction_scale = array_ops.reshape(correction_scale, new_shape, + context + '/correction_reshape') + with ops.device(mul_scale.device): + weights = math_ops.multiply(correction_scale, weights, + context + '/correction_mult') + + mul_fold = _CloneOp(mul_scale, context + '/mul_fold', [(0, weights), + (1, scale)]) elif op_below.type in ['Conv2D', 'MatMul']: + + if correction_scale is not None: + with ops.device(mul_scale.device): + weights = math_ops.multiply(correction_scale, weights, + context + '/correction_mult') mul_fold = _CloneOp(mul_scale, context + '/mul_fold', [(0, weights)]) else: raise ValueError('Cannot handle operation of type: %s' % op_below.op) @@ -456,10 +807,17 @@ def _CreateFoldedOp(graph, context, has_scaling): conv_or_fc_folded = _CloneOp(op_below, op_below.name + '_Fold', [(1, mul_fold.outputs[0])]) - add_shift = graph.get_operation_by_name(context + - '/BatchNorm/batchnorm/add_1') - add_fold = _CloneOp(add_shift, context + '/add_fold', - [(0, conv_or_fc_folded.outputs[0])]) + add_shift = graph.get_operation_by_name( + context + '/BatchNorm/batchnorm/add_1') + + corrected_output = conv_or_fc_folded.outputs[0] + if correction_offset is not None: + with ops.device(conv_or_fc_folded.device): + corrected_output = math_ops.multiply(correction_recip, corrected_output, + context + '/post_conv_mul') + corrected_output = math_ops.add(corrected_output, (correction_offset), + context + '/correction_add') + add_fold = _CloneOp(add_shift, context + '/add_fold', [(0, corrected_output)]) _AssertShapesMatch('add_fold', add_fold.inputs[0], add_fold.outputs[0]) return add_shift, add_fold diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py index ecf321ff573..330bd8a6474 100644 --- a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py +++ b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py @@ -46,26 +46,27 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): def _RunTestOverParameters(self, test_fn): parameters_list = [ - # (relu, relu_op_name, with_bypass, has_scaling, fused_batch_norm) - (nn_ops.relu6, 'Relu6', False, False, False), - (nn_ops.relu, 'Relu', False, False, False), - (nn_ops.relu6, 'Relu6', True, False, False), - (nn_ops.relu, 'Relu', True, False, False), - (nn_ops.relu6, 'Relu6', False, True, False), - (nn_ops.relu, 'Relu', False, True, False), - (nn_ops.relu6, 'Relu6', True, True, False), - (nn_ops.relu, 'Relu', True, True, False), + # (relu, relu_op_name, with_bypass, has_scaling, fused_batch_norm, + # freeze_batch_norm_delay) + (nn_ops.relu6, 'Relu6', False, False, False, 100), + (nn_ops.relu, 'Relu', False, False, False, None), + (nn_ops.relu6, 'Relu6', True, False, False, 100), + (nn_ops.relu, 'Relu', True, False, False, None), + (nn_ops.relu6, 'Relu6', False, True, False, 100), + (nn_ops.relu, 'Relu', False, True, False, 
None), + (nn_ops.relu6, 'Relu6', True, True, False, 100), + (nn_ops.relu, 'Relu', True, True, False, None), # Fused batch norm always has scaling enabled. - (nn_ops.relu6, 'Relu6', False, True, True), - (nn_ops.relu, 'Relu', False, True, True), - (nn_ops.relu6, 'Relu6', True, True, True), - (nn_ops.relu, 'Relu', True, True, True), + (nn_ops.relu6, 'Relu6', False, True, True, None), + (nn_ops.relu, 'Relu', False, True, True, 100), + (nn_ops.relu6, 'Relu6', True, True, True, None), + (nn_ops.relu, 'Relu', True, True, True, 100), ] for params in parameters_list: - test_fn(params[0], params[1], params[2], params[3], params[4]) + test_fn(params[0], params[1], params[2], params[3], params[4], params[5]) def _TestFoldConv2d(self, relu, relu_op_name, with_bypass, has_scaling, - fused_batch_norm): + fused_batch_norm, freeze_batch_norm_delay): """Tests folding cases: inputs -> Conv2d with batch norm -> Relu*. Args: @@ -75,6 +76,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): inputs to just before Relu*. has_scaling: Bool, when true the batch norm has scaling. fused_batch_norm: Bool, when true the batch norm is fused. + freeze_batch_norm_delay: None or the number of steps after which training + switches to using frozen mean and variance """ g = ops.Graph() with g.as_default(): @@ -99,12 +102,13 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): node = math_ops.add(inputs, node, name='test/Add') relu(node, name='test/' + relu_op_name) - fold_batch_norms.FoldBatchNorms(g) + fold_batch_norms.FoldBatchNorms( + g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay) folded_mul = g.get_operation_by_name(scope + '/mul_fold') self.assertEqual(folded_mul.type, 'Mul') self._AssertInputOpsAre(folded_mul, [ - scope + '/weights/read', + scope + '/correction_mult', self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm) ]) self._AssertOutputGoesToOps(folded_mul, g, [scope + '/Conv2D_Fold']) @@ -113,12 +117,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): self.assertEqual(folded_conv.type, 'Conv2D') self._AssertInputOpsAre(folded_conv, [scope + '/mul_fold', inputs.op.name]) - self._AssertOutputGoesToOps(folded_conv, g, [scope + '/add_fold']) + self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul']) folded_add = g.get_operation_by_name(scope + '/add_fold') self.assertEqual(folded_add.type, 'Add') self._AssertInputOpsAre(folded_add, [ - scope + '/Conv2D_Fold', + scope + '/correction_add', self._BathNormBiasName(scope, fused_batch_norm) ]) output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name] @@ -128,7 +132,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): self._RunTestOverParameters(self._TestFoldConv2d) def _TestFoldConv2dUnknownShape(self, relu, relu_op_name, with_bypass, - has_scaling, fused_batch_norm): + has_scaling, fused_batch_norm, + freeze_batch_norm_delay): """Tests folding cases: inputs -> Conv2d with batch norm -> Relu*. Tests that folding works even with an input shape where some dimensions are @@ -141,6 +146,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): inputs to just before Relu*. has_scaling: Bool, when true the batch norm has scaling. fused_batch_norm: Bool, when true the batch norm is fused. 
+ freeze_batch_norm_delay: None or the number of steps after which training + switches to using frozen mean and variance """ g = ops.Graph() with g.as_default(): @@ -164,12 +171,13 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): node = math_ops.add(inputs, node, name='test/Add') relu(node, name='test/' + relu_op_name) - fold_batch_norms.FoldBatchNorms(g) + fold_batch_norms.FoldBatchNorms( + g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay) folded_mul = g.get_operation_by_name(scope + '/mul_fold') self.assertEqual(folded_mul.type, 'Mul') self._AssertInputOpsAre(folded_mul, [ - scope + '/weights/read', + scope + '/correction_mult', self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm) ]) self._AssertOutputGoesToOps(folded_mul, g, [scope + '/Conv2D_Fold']) @@ -177,12 +185,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): folded_conv = g.get_operation_by_name(scope + '/Conv2D_Fold') self.assertEqual(folded_conv.type, 'Conv2D') self._AssertInputOpsAre(folded_conv, [scope + '/mul_fold', inputs.op.name]) - self._AssertOutputGoesToOps(folded_conv, g, [scope + '/add_fold']) + self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul']) folded_add = g.get_operation_by_name(scope + '/add_fold') self.assertEqual(folded_add.type, 'Add') self._AssertInputOpsAre(folded_add, [ - scope + '/Conv2D_Fold', + scope + '/correction_add', self._BathNormBiasName(scope, fused_batch_norm) ]) output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name] @@ -192,7 +200,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): self._RunTestOverParameters(self._TestFoldConv2dUnknownShape) def _TestFoldFullyConnectedLayer(self, relu, relu_op_name, with_bypass, - has_scaling, fused_batch_norm): + has_scaling, fused_batch_norm, + freeze_batch_norm_delay): """Tests folding cases: inputs -> FC with batch norm -> Relu*. Args: @@ -202,6 +211,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): inputs to just before Relu*. has_scaling: Bool, when true the batch norm has scaling. fused_batch_norm: Bool, when true the batch norm is fused. 
+ freeze_batch_norm_delay: None or the number of steps after which training + switches to using frozen mean and variance """ g = ops.Graph() with g.as_default(): @@ -223,12 +234,13 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): node = math_ops.add(inputs, node, name='test/Add') relu(node, name='test/' + relu_op_name) - fold_batch_norms.FoldBatchNorms(g) + fold_batch_norms.FoldBatchNorms( + g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay) folded_mul = g.get_operation_by_name(scope + '/mul_fold') self.assertEqual(folded_mul.type, 'Mul') self._AssertInputOpsAre(folded_mul, [ - scope + '/weights/read', + scope + '/correction_mult', self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm) ]) self._AssertOutputGoesToOps(folded_mul, g, [scope + '/MatMul_Fold']) @@ -237,12 +249,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): self.assertEqual(folded_conv.type, 'MatMul') self._AssertInputOpsAre(folded_conv, [scope + '/mul_fold', inputs.op.name]) - self._AssertOutputGoesToOps(folded_conv, g, [scope + '/add_fold']) + self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul']) folded_add = g.get_operation_by_name(scope + '/add_fold') self.assertEqual(folded_add.type, 'Add') self._AssertInputOpsAre(folded_add, [ - scope + '/MatMul_Fold', + scope + '/correction_add', self._BathNormBiasName(scope, fused_batch_norm) ]) output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name] @@ -252,7 +264,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): self._RunTestOverParameters(self._TestFoldFullyConnectedLayer) def _TestFoldDepthwiseConv2d(self, relu, relu_op_name, with_bypass, - has_scaling, fused_batch_norm): + has_scaling, fused_batch_norm, + freeze_batch_norm_delay): """Tests folding: inputs -> DepthwiseConv2d with batch norm -> Relu*. Args: @@ -262,6 +275,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): inputs to just before Relu*. has_scaling: Bool, when true the batch norm has scaling. fused_batch_norm: Bool, when true the batch norm is fused. 
+ freeze_batch_norm_delay: None or the number of steps after which training + switches to using frozen mean and variance """ g = ops.Graph() with g.as_default(): @@ -286,7 +301,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): node = math_ops.add(inputs, node, name='test/Add') relu(node, name='test/' + relu_op_name) - fold_batch_norms.FoldBatchNorms(g) + fold_batch_norms.FoldBatchNorms( + g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay) folded_mul = g.get_operation_by_name(scope + '/mul_fold') self.assertEqual(folded_mul.type, 'Mul') @@ -295,8 +311,7 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): else: scale_reshape_op_name = scope + '/scale_reshape' self._AssertInputOpsAre(folded_mul, - [scope + '/depthwise_weights/read', - scale_reshape_op_name]) + [scope + '/correction_mult', scale_reshape_op_name]) self._AssertOutputGoesToOps(folded_mul, g, [scope + '/depthwise_Fold']) scale_reshape = g.get_operation_by_name(scale_reshape_op_name) @@ -311,12 +326,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): self.assertEqual(folded_conv.type, 'DepthwiseConv2dNative') self._AssertInputOpsAre(folded_conv, [scope + '/mul_fold', inputs.op.name]) - self._AssertOutputGoesToOps(folded_conv, g, [scope + '/add_fold']) + self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul']) folded_add = g.get_operation_by_name(scope + '/add_fold') self.assertEqual(folded_add.type, 'Add') self._AssertInputOpsAre(folded_add, [ - scope + '/depthwise_Fold', + scope + '/correction_add', self._BathNormBiasName(scope, fused_batch_norm) ]) output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name] @@ -326,7 +341,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): self._RunTestOverParameters(self._TestFoldDepthwiseConv2d) def _TestCompareFoldAndUnfolded(self, relu, relu_op_name, with_bypass, - has_scaling, fused_batch_norm): + has_scaling, fused_batch_norm, + freeze_batch_norm_delay): """Tests that running folded and unfolded BN returns the same results. Args: @@ -336,6 +352,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): inputs to just before Relu*. has_scaling: Bool, when true the batch norm has scaling. fused_batch_norm: Bool, when true the batch norm is fused. + freeze_batch_norm_delay: None or the number of steps after which training + switches to using frozen mean and variance """ random_seed.set_random_seed(1234) unfolded_g = ops.Graph() @@ -361,11 +379,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): if with_bypass: node = math_ops.add(inputs, node, name='test/Add') relu_node = relu(node, name='test/' + relu_op_name) - folded_g = copy_graph.CopyGraph(unfolded_g) with folded_g.as_default(): - fold_batch_norms.FoldBatchNorms(folded_g) - + fold_batch_norms.FoldBatchNorms( + folded_g, + is_training=True, + freeze_batch_norm_delay=freeze_batch_norm_delay) with session.Session(graph=unfolded_g) as sess: sess.run(variables.global_variables_initializer()) grad_node = gradients.gradients(relu_node, inputs) diff --git a/tensorflow/contrib/quantize/python/graph_matcher.py b/tensorflow/contrib/quantize/python/graph_matcher.py index e3581cc5590..b458f039df0 100644 --- a/tensorflow/contrib/quantize/python/graph_matcher.py +++ b/tensorflow/contrib/quantize/python/graph_matcher.py @@ -18,8 +18,19 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import abc -class OpTypePattern(object): + +class Pattern(object): + """The parent class of all patterns (e.g. 
OpTypePattern and OneofPattern).""" + + @abc.abstractmethod + def match(self, op, tensor): + """Returns the result of matching op/tensor against this pattern.""" + raise NotImplementedError('Method "match" not implemented.') + + +class OpTypePattern(Pattern): """A tree pattern that matches TF expressions with certain op types.""" def __init__(self, op_type, name=None, inputs=None): @@ -34,7 +45,7 @@ class OpTypePattern(object): similar TF op types. name: Optional string. The name of the pattern that can be looked up in MatchResult. - inputs: Optional list of `OpTypePattern`s or strings that specify the + inputs: Optional list of `Pattern`s or strings that specify the patterns for the inputs of a matching op. If None, this pattern accepts any inputs of a matching op. """ @@ -43,22 +54,51 @@ class OpTypePattern(object): if inputs is None: inputs = [] self._inputs = [ - input_pattern if isinstance(input_pattern, OpTypePattern) else - OpTypePattern(input_pattern) for input_pattern in inputs + input_pattern + if isinstance(input_pattern, Pattern) else OpTypePattern(input_pattern) + for input_pattern in inputs ] - @property - def op_type(self): - return self._op_type - - @property - def inputs(self): - return self._inputs - @property def name(self): return self._name + def match(self, op, tensor): + if self._op_type != '*': + if op.type not in self._op_type.split('|'): + return None + + match_result = MatchResult() + match_result.add(self, op, tensor) + + if not self._inputs: + # If pattern.inputs is empty, skips the rest and accepts all the inputs. + return match_result + + if len(op.inputs) != len(self._inputs): + return None + + for input_tensor, input_pattern in zip(op.inputs, self._inputs): + input_match_result = input_pattern.match(input_tensor.op, input_tensor) + if input_match_result is None: + return None + match_result.merge_from(input_match_result) + return match_result + + +class OneofPattern(Pattern): + """Matches one of the given sub-patterns.""" + + def __init__(self, sub_patterns): + self._sub_patterns = sub_patterns + + def match(self, op, tensor): + for sub_pattern in self._sub_patterns: + match_result = sub_pattern.match(op, tensor) + if match_result is not None: + return match_result + return None + class MatchResult(object): r"""Encapsulates the result of a match done by GraphMatcher. @@ -102,16 +142,36 @@ class MatchResult(object): return pattern_or_name if isinstance(pattern_or_name, str): + if pattern_or_name not in self._name_to_pattern: + return None return self._name_to_pattern[pattern_or_name] raise ValueError('pattern_or_name has type %s. Expect OpTypePattern or str.' 
% type(pattern_or_name)) + def _get_op_tensor(self, pattern_or_name): + pattern = self._to_pattern(pattern_or_name) + if pattern is None: + return None + + if pattern not in self._pattern_to_op_tensor: + return None + + return self._pattern_to_op_tensor[pattern] + def get_op(self, pattern_or_name): - return self._pattern_to_op_tensor[self._to_pattern(pattern_or_name)][0] + op_tensor = self._get_op_tensor(pattern_or_name) + return op_tensor[0] if op_tensor else None def get_tensor(self, pattern_or_name): - return self._pattern_to_op_tensor[self._to_pattern(pattern_or_name)][1] + op_tensor = self._get_op_tensor(pattern_or_name) + return op_tensor[1] if op_tensor else None + + def merge_from(self, other_match_result): + # pylint: disable=protected-access + self._pattern_to_op_tensor.update(other_match_result._pattern_to_op_tensor) + self._name_to_pattern.update(other_match_result._name_to_pattern) + # pylint: enable=protected-access class GraphMatcher(object): @@ -121,7 +181,7 @@ class GraphMatcher(object): """Initializes a GraphMatcher. Args: - pattern: The `OpTypePattern` against which `GraphMatcher` matches + pattern: The `Pattern` against which `GraphMatcher` matches subgraphs. """ self._pattern = pattern @@ -133,7 +193,7 @@ class GraphMatcher(object): with key `pattern`. Args: - pattern: An `OpTypePattern`. + pattern: An `Pattern`. op: A `tf.Operation` to match against the pattern. tensor: the output `tf.Tensor` of `op` that is used by the matching op of `pattern`'s parent. Can be None if `pattern` is already the root of the @@ -142,20 +202,11 @@ class GraphMatcher(object): Returns: True if an TF expression rooted at `op` matches `pattern`. """ - if pattern.op_type != '*': - if op.type not in pattern.op_type.split('|'): - return False - - self._match_result.add(pattern, op, tensor) - - if not pattern.inputs: - # If pattern.inputs is empty, skips the rest and accepts all the inputs. - return True - - return len(op.inputs) == len(pattern.inputs) and all([ - self._match_pattern(input_pattern, input_tensor.op, input_tensor) - for input_tensor, input_pattern in zip(op.inputs, pattern.inputs) - ]) + match_result = pattern.match(op, tensor) + if match_result is None: + return False + self._match_result.merge_from(match_result) + return True def match_op(self, op): """Matches `op` against `self._pattern`. 
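The graph_matcher refactor above moves matching into a polymorphic Pattern.match method, adds OneofPattern, and makes MatchResult.get_op/get_tensor return None for patterns or names that did not participate in a match. A short, hedged usage sketch written against the interface introduced in this diff follows; the pattern shapes mirror _FindFusedBatchNorms, and `graph` stands in for whatever tf.Graph you want to scan.

from tensorflow.contrib.quantize.python import graph_matcher

conv_pattern = graph_matcher.OpTypePattern(
    'Conv2D|DepthwiseConv2dNative', inputs=['*', '*'])
bn_pattern = graph_matcher.OpTypePattern(
    'FusedBatchNorm', inputs=[conv_pattern, '*', '*', '*', '*'])
# OneofPattern returns the first alternative that matches, so optional
# structure (e.g. a trailing Reshape after a MatMul batch norm) can be
# expressed as a list of alternatives instead of a second matcher.
root_pattern = graph_matcher.OneofPattern([
    graph_matcher.OpTypePattern(
        'Reshape', name='output_reshape', inputs=[bn_pattern, '*']),
    bn_pattern,
])

matcher = graph_matcher.GraphMatcher(root_pattern)
for match_result in matcher.match_graph(graph):
  bn_op = match_result.get_op(bn_pattern)
  # With the new MatchResult behavior this is None, rather than an error,
  # when the bare batch-norm alternative matched and no Reshape was present.
  reshape_op = match_result.get_op('output_reshape')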
diff --git a/tensorflow/contrib/quantize/python/graph_matcher_test.py b/tensorflow/contrib/quantize/python/graph_matcher_test.py index e1572865e42..6d587572181 100644 --- a/tensorflow/contrib/quantize/python/graph_matcher_test.py +++ b/tensorflow/contrib/quantize/python/graph_matcher_test.py @@ -105,7 +105,7 @@ class GraphMatcherTest(test_util.TensorFlowTestCase): self.assertEqual(match_result.get_op(y1_pattern), y1.op) self.assertEqual(match_result.get_tensor(y1_pattern), y1) - def test_oneof_pattern(self): + def test_oneof_type_pattern(self): # - + # / \ / \ # x y z @@ -125,6 +125,44 @@ class GraphMatcherTest(test_util.TensorFlowTestCase): for match_result in matcher.match_graph(g) ], [plus.op, minus.op]) + def test_oneof_pattern(self): + reshape_pattern = graph_matcher.OpTypePattern('Reshape') + transpose_pattern = graph_matcher.OneofPattern([ + graph_matcher.OpTypePattern( + 'Transpose', + name='transpose', + inputs=[ + graph_matcher.OpTypePattern( + 'Slice', name='slice', inputs=[reshape_pattern, '*', '*']), + '*' + ]), + graph_matcher.OpTypePattern( + 'Transpose', name='transpose', inputs=[reshape_pattern, '*']) + ]) + + matcher = graph_matcher.GraphMatcher(transpose_pattern) + + g = ops.Graph() + with g.as_default(): + inputs = array_ops.placeholder(dtypes.float32, shape=[6]) + reshape = array_ops.reshape(inputs, [2, 3]) + transpose = array_ops.transpose(reshape) + [match_result] = list(matcher.match_graph(g)) + self.assertEqual(match_result.get_tensor(reshape_pattern), reshape) + self.assertEqual(match_result.get_tensor('slice'), None) + self.assertEqual(match_result.get_op('transpose'), transpose.op) + + g = ops.Graph() + with g.as_default(): + inputs = array_ops.placeholder(dtypes.float32, shape=[6]) + reshape = array_ops.reshape(inputs, [2, 3]) + slicing = array_ops.slice(reshape, [0, 0], [-1, -1]) + transpose = array_ops.transpose(slicing) + [match_result] = list(matcher.match_graph(g)) + self.assertEqual(match_result.get_tensor(reshape_pattern), reshape) + self.assertEqual(match_result.get_tensor('slice'), slicing) + self.assertEqual(match_result.get_op('transpose'), transpose.op) + if __name__ == '__main__': googletest.main() diff --git a/tensorflow/contrib/quantize/python/quantize_graph.py b/tensorflow/contrib/quantize/python/quantize_graph.py index bbd9743d801..89b744c5591 100644 --- a/tensorflow/contrib/quantize/python/quantize_graph.py +++ b/tensorflow/contrib/quantize/python/quantize_graph.py @@ -52,9 +52,19 @@ def _create_graph(input_graph, """ # TODO(suharshs): Describe the process in more detail in the doc string. g = copy_graph.CopyGraph(input_graph) + if is_training: + # TODO(raghuramank): Need to make freeze_batch_norm_delay + # a function of the batch size. For now setting this to 250 epochs + # This corresponds to 5 million steps at a batch size of 64. 
+ freeze_batch_norm_delay = 5000000 + else: + freeze_batch_norm_delay = None with g.as_default(): with ops.device(device_name_or_function): - fold_batch_norms.FoldBatchNorms(g) + fold_batch_norms.FoldBatchNorms( + g, + freeze_batch_norm_delay=freeze_batch_norm_delay, + is_training=is_training) quantize.Quantize(g, is_training=is_training) if elements is None: return g diff --git a/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py b/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py index 44998b3b659..bc383a80349 100644 --- a/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py +++ b/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py @@ -35,20 +35,34 @@ _VALID_PADDING = ["VALID", b"VALID"] _SAME_PADDING = ["SAME", b"SAME"] -def _stride_size(node): +def _stride_size(node, name_to_node): """Computes stride size given a TF node. Args: node: Tensorflow node (NodeDef proto). + name_to_node: For MaxPoolV2, mapping from variable name Tensorflow node. Returns: stride_x: Stride size for horizontal direction (integer). stride_y: Stride size for vertical direction (integer). + + Raises: + ValueError: If stride input cannot be found in `name_to_node`. """ - strides_attr = node.attr["strides"] - logging.vlog(4, "strides_attr = %s", strides_attr) - stride_y = strides_attr.list.i[1] - stride_x = strides_attr.list.i[2] + if node.op == "MaxPoolV2": + strides_input_name = node.input[2] + if not strides_input_name.endswith("/strides"): + raise ValueError("Strides name does not end with '/strides'") + strides_node = name_to_node[strides_input_name] + value = strides_node.attr["value"] + t = make_ndarray(value.tensor) + stride_y = t[1] + stride_x = t[2] + else: + strides_attr = node.attr["strides"] + logging.vlog(4, "strides_attr = %s", strides_attr) + stride_y = strides_attr.list.i[1] + stride_x = strides_attr.list.i[2] return stride_x, stride_y @@ -144,11 +158,12 @@ def _padding_size_conv_pool(node, kernel_size, stride, input_resolution=None): return total_padding, padding -def _pool_kernel_size(node): +def _pool_kernel_size(node, name_to_node): """Computes kernel size given a TF pooling node. Args: node: Tensorflow node (NodeDef proto). + name_to_node: For MaxPoolV2, mapping from node name to NodeDef. Returns: kernel_size_x: Kernel size for horizontal direction (integer). @@ -157,13 +172,27 @@ def _pool_kernel_size(node): Raises: ValueError: If pooling is invalid. 
""" - ksize = node.attr["ksize"] - kernel_size_y = ksize.list.i[1] - kernel_size_x = ksize.list.i[2] - if ksize.list.i[0] != 1: - raise ValueError("pool ksize for first dim is not 1") - if ksize.list.i[3] != 1: - raise ValueError("pool ksize for last dim is not 1") + if node.op == "MaxPoolV2": + ksize_input_name = node.input[1] + if not ksize_input_name.endswith("/ksize"): + raise ValueError("Kernel size name does not end with '/ksize'") + ksize_node = name_to_node[ksize_input_name] + value = ksize_node.attr["value"] + t = make_ndarray(value.tensor) + kernel_size_y = t[1] + kernel_size_x = t[2] + if t[0] != 1: + raise ValueError("pool ksize for first dim is not 1") + if t[3] != 1: + raise ValueError("pool ksize for last dim is not 1") + else: + ksize = node.attr["ksize"] + kernel_size_y = ksize.list.i[1] + kernel_size_x = ksize.list.i[2] + if ksize.list.i[0] != 1: + raise ValueError("pool ksize for first dim is not 1") + if ksize.list.i[3] != 1: + raise ValueError("pool ksize for last dim is not 1") return kernel_size_x, kernel_size_y @@ -243,7 +272,7 @@ def get_layer_params(node, name_to_node, input_resolution=None, force=False): logging.vlog(3, "node.op = %s", node.op) logging.vlog(4, "node = %s", node) if node.op == "Conv2D" or node.op == "DepthwiseConv2dNative": - stride_x, stride_y = _stride_size(node) + stride_x, stride_y = _stride_size(node, name_to_node) kernel_size_x, kernel_size_y = _conv_kernel_size(node, name_to_node) # Compute the padding for this node separately for each direction. total_padding_x, padding_x = _padding_size_conv_pool( @@ -260,9 +289,9 @@ def get_layer_params(node, name_to_node, input_resolution=None, force=False): stride_y = 1 total_padding_x, padding_x, total_padding_y, padding_y = ( _padding_size_pad_layer(node, name_to_node)) - elif node.op == "MaxPool" or node.op == "AvgPool": - stride_x, stride_y = _stride_size(node) - kernel_size_x, kernel_size_y = _pool_kernel_size(node) + elif node.op == "MaxPool" or node.op == "MaxPoolV2" or node.op == "AvgPool": + stride_x, stride_y = _stride_size(node, name_to_node) + kernel_size_x, kernel_size_y = _pool_kernel_size(node, name_to_node) # Compute the padding for this node separately for each direction. total_padding_x, padding_x = _padding_size_conv_pool( node, kernel_size_x, stride_x, input_resolution[1] diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc index c33804906fc..2def4f3f176 100644 --- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc +++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc @@ -15,8 +15,8 @@ limitations under the License. #define EIGEN_USE_THREADS -#include #include "tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h" +#include #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h index 9bb1724a2c0..d8c0a0631d3 100644 --- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h +++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h @@ -16,10 +16,10 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_ #define TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_ +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_types.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #define Sum(a, b) ((a) + (b)) #define Prod(a, b) ((a) * (b)) @@ -58,11 +58,11 @@ inline T negative_infinity() { } // namespace reduce_functions -#define CALL_ALL_REDUCEOPS(func, ...) \ - func(Sum, functor::reduce_functions::zero, ##__VA_ARGS__) \ - func(Prod, functor::reduce_functions::one, ##__VA_ARGS__) \ - func(Max, functor::reduce_functions::negative_infinity, ##__VA_ARGS__) \ - func(Min, functor::reduce_functions::infinity, ##__VA_ARGS__) +#define CALL_ALL_REDUCEOPS(func, ...) \ + func(Sum, functor::reduce_functions::zero, ##__VA_ARGS__) \ + func(Prod, functor::reduce_functions::one, ##__VA_ARGS__) func( \ + Max, functor::reduce_functions::negative_infinity, ##__VA_ARGS__) \ + func(Min, functor::reduce_functions::infinity, ##__VA_ARGS__) #define ReduceSliceFunctorReduceop(reduceop, dummy) \ template \ diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc index 501cddb8c8f..9f2be03d718 100644 --- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc +++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc @@ -17,10 +17,10 @@ limitations under the License. #define EIGEN_USE_GPU +#include "tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" -#include "tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h" #include "tensorflow/core/util/cuda_kernel_helper.h" namespace tensorflow { diff --git a/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc b/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc index b8b56c0e229..92879ab5356 100644 --- a/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc +++ b/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc @@ -87,9 +87,9 @@ and 'indices' is [[0,1] [1,1] [0,2]], -the the output will be [[ 1, 2, 3] - [ 0, 0, 0] - [41,52,63]]. +the output will be [[ 1, 2, 3] + [ 0, 0, 0] + [41,52,63]]. ``` The data must be at least rank 1. The indices must be of shape (?,2) where the @@ -132,9 +132,9 @@ and 'indices' is [[0,1] [1,1] [0,2]], -the the output will be [[ 1, 2, 3] - [ 1, 1, 1] - [40,100,180]]. +the output will be [[ 1, 2, 3] + [ 1, 1, 1] + [40,100,180]]. ``` The data must be at least rank 1. The indices can be of shape (?,2) where the @@ -189,9 +189,9 @@ and 'indices' is [[0,1] [1,1] [0,2]], -the the output will be [[ 1, 20, 3] - [ -BIG_VALUE, -BIG_VALUE, -BIG_VALUE] - [ 400, 20, 60]]. +the output will be [[ 1, 20, 3] + [ -BIG_VALUE, -BIG_VALUE, -BIG_VALUE] + [ 400, 20, 60]]. ``` The data must be at least rank 1. The indices can be of shape (?,2) where the @@ -246,9 +246,9 @@ and 'indices' is [[0,1] [1,1] [0,2]], -the the output will be [[ 1, 20, 3] - [ +BIG_VALUE, +BIG_VALUE, +BIG_VALUE] - [ 1, 5, 3]]. +the output will be [[ 1, 20, 3] + [ +BIG_VALUE, +BIG_VALUE, +BIG_VALUE] + [ 1, 5, 3]]. ``` The data must be at least rank 1. 
The indices can be of shape (?,2) where the diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops.cc b/tensorflow/contrib/resampler/kernels/resampler_ops.cc index e02c1b6a2bd..63c72836d79 100644 --- a/tensorflow/contrib/resampler/kernels/resampler_ops.cc +++ b/tensorflow/contrib/resampler/kernels/resampler_ops.cc @@ -36,17 +36,12 @@ using GPUDevice = Eigen::GpuDevice; namespace functor { template -struct Resampler2DFunctor{ - void operator ()(::tensorflow::OpKernelContext* ctx, - const CPUDevice& d, - const T* __restrict__ data, - const T* __restrict__ warp, - T* __restrict__ output, - const int batch_size, - const int data_height, - const int data_width, - const int data_channels, - const int num_sampling_points){ +struct Resampler2DFunctor { + void operator()(::tensorflow::OpKernelContext* ctx, const CPUDevice& d, + const T* __restrict__ data, const T* __restrict__ warp, + T* __restrict__ output, const int batch_size, + const int data_height, const int data_width, + const int data_channels, const int num_sampling_points) { const int warp_batch_stride = num_sampling_points * 2; const int data_batch_stride = data_height * data_width * data_channels; const int output_batch_stride = num_sampling_points * data_channels; @@ -59,24 +54,19 @@ struct Resampler2DFunctor{ // The functions take care of performing the relevant pointer // arithmetics abstracting away the low level details in the // main loop over samples. Note that data is stored in NHWC format. - auto set_output = [&](const int sample_id, - const int channel, + auto set_output = [&](const int sample_id, const int channel, const T value) { - output[batch_id * output_batch_stride + - sample_id * data_channels + + output[batch_id * output_batch_stride + sample_id * data_channels + channel] = value; }; - auto get_data_point = [&](const int x, - const int y, - const int chan) { + auto get_data_point = [&](const int x, const int y, const int chan) { const bool point_is_in_range = (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1); return point_is_in_range - ? data[batch_id * data_batch_stride + - data_channels * (y * data_width + x) + - chan] - : zero; + ? data[batch_id * data_batch_stride + + data_channels * (y * data_width + x) + chan] + : zero; }; for (int sample_id = 0; sample_id < num_sampling_points; ++sample_id) { @@ -89,8 +79,7 @@ struct Resampler2DFunctor{ // The effect is that the sampled signal smoothly goes to 0 outside // the original input domain, rather than presenting a jump // discontinuity at the image boundaries. - if (x > static_cast(-1.0) && - y > static_cast(-1.0) && + if (x > static_cast(-1.0) && y > static_cast(-1.0) && x < static_cast(data_width) && y < static_cast(data_height)) { // Precompute floor (f) and ceil (c) values for x and y. 
@@ -103,12 +92,10 @@ struct Resampler2DFunctor{ for (int chan = 0; chan < data_channels; ++chan) { const T img_fxfy = dx * dy * get_data_point(fx, fy, chan); - const T img_cxcy = (one - dx) * (one - dy) * - get_data_point(cx, cy, chan); - const T img_fxcy = dx * (one - dy) * - get_data_point(fx, cy, chan); - const T img_cxfy = (one - dx) * dy * - get_data_point(cx, fy, chan); + const T img_cxcy = + (one - dx) * (one - dy) * get_data_point(cx, cy, chan); + const T img_fxcy = dx * (one - dy) * get_data_point(fx, cy, chan); + const T img_cxfy = (one - dx) * dy * get_data_point(cx, fy, chan); set_output(sample_id, chan, img_fxfy + img_cxcy + img_fxcy + img_cxfy); } @@ -125,8 +112,8 @@ struct Resampler2DFunctor{ // estimate of the cost of each work unit is needed to correctly shard the // workload. Shard assumes each cost unit is 1ns, minimum cost per shard // being 10us. - const int64 cost = static_cast(num_sampling_points) * - data_channels * 1000; + const int64 cost = + static_cast(num_sampling_points) * data_channels * 1000; auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); ::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers, batch_size, cost, resample_batches); @@ -138,8 +125,8 @@ struct Resampler2DFunctor{ template class ResamplerOp : public ::tensorflow::OpKernel { public: - explicit ResamplerOp(::tensorflow::OpKernelConstruction* context) : - ::tensorflow::OpKernel(context) {} + explicit ResamplerOp(::tensorflow::OpKernelConstruction* context) + : ::tensorflow::OpKernel(context) {} void Compute(::tensorflow::OpKernelContext* ctx) override { const ::tensorflow::Tensor& data = ctx->input(0); @@ -158,16 +145,17 @@ class ResamplerOp : public ::tensorflow::OpKernel { ::tensorflow::errors::InvalidArgument( "warp should be at least a matrix, got shape ", warp_shape.DebugString())); - OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims()-1) == 2, + OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims() - 1) == 2, ::tensorflow::errors::Unimplemented( "Only bilinear interpolation is supported, warping " "coordinates must be 2D; warp shape last entry should be " - "2, but shape vector is: ", warp_shape.DebugString())); + "2, but shape vector is: ", + warp_shape.DebugString())); OP_REQUIRES(ctx, data_shape.dim_size(0) == warp_shape.dim_size(0), ::tensorflow::errors::InvalidArgument( "Batch size of data and warp tensor must be the same, but " - "input shapes are: ", data_shape.DebugString(), ", ", - warp_shape.DebugString())); + "input shapes are: ", + data_shape.DebugString(), ", ", warp_shape.DebugString())); const int batch_size = data_shape.dim_size(0); const int data_height = data_shape.dim_size(1); const int data_width = data_shape.dim_size(2); @@ -180,16 +168,10 @@ class ResamplerOp : public ::tensorflow::OpKernel { // Execute kernel only for nonempty output; otherwise Eigen crashes on GPU. 
if (num_sampling_points > 0) { - functor::Resampler2DFunctor()(ctx, - ctx->eigen_device(), - data.flat().data(), - warp.flat().data(), - output->flat().data(), - batch_size, - data_height, - data_width, - data_channels, - num_sampling_points); + functor::Resampler2DFunctor()( + ctx, ctx->eigen_device(), data.flat().data(), + warp.flat().data(), output->flat().data(), batch_size, + data_height, data_width, data_channels, num_sampling_points); } } @@ -197,12 +179,9 @@ class ResamplerOp : public ::tensorflow::OpKernel { TF_DISALLOW_COPY_AND_ASSIGN(ResamplerOp); }; - -#define REGISTER(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Resampler") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ +#define REGISTER(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("Resampler").Device(DEVICE_CPU).TypeConstraint("T"), \ ResamplerOp); TF_CALL_half(REGISTER); @@ -211,40 +190,32 @@ TF_CALL_double(REGISTER); #undef REGISTER #if GOOGLE_CUDA -#define REGISTER(TYPE) \ - REGISTER_KERNEL_BUILDER(Name("Resampler") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T"), \ - ResamplerOp) +#define REGISTER(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("Resampler").Device(DEVICE_GPU).TypeConstraint("T"), \ + ResamplerOp) TF_CALL_float(REGISTER); TF_CALL_double(REGISTER); #undef REGISTER #endif // GOOGLE_CUDA - namespace functor { template -struct ResamplerGrad2DFunctor{ - void operator ()(::tensorflow::OpKernelContext* ctx, - const CPUDevice& d, - const T* __restrict__ data, - const T* __restrict__ warp, - const T* __restrict__ grad_output, - T* __restrict__ grad_data, - T* __restrict__ grad_warp, - const int batch_size, - const int data_height, - const int data_width, - const int data_channels, - const int num_sampling_points){ +struct ResamplerGrad2DFunctor { + void operator()(::tensorflow::OpKernelContext* ctx, const CPUDevice& d, + const T* __restrict__ data, const T* __restrict__ warp, + const T* __restrict__ grad_output, T* __restrict__ grad_data, + T* __restrict__ grad_warp, const int batch_size, + const int data_height, const int data_width, + const int data_channels, const int num_sampling_points) { // Set gradients to 0, because the kernel incrementally updates the // tensor entries by adding partial contributions. - const int resampler_output_size = batch_size * num_sampling_points * - data_channels; + const int resampler_output_size = + batch_size * num_sampling_points * data_channels; const int grad_warp_size = resampler_output_size / data_channels * 2; - const int grad_data_size = data_height * data_width * data_channels * - batch_size; + const int grad_data_size = + data_height * data_width * data_channels * batch_size; memset(grad_data, 0, sizeof(T) * grad_data_size); memset(grad_warp, 0, sizeof(T) * grad_warp_size); @@ -260,35 +231,29 @@ struct ResamplerGrad2DFunctor{ // The functions take care of performing the relevant pointer // arithmetics abstracting away the low level details in the // main loop over samples. Note that data is stored in NHWC format. - auto get_data_point = [&](const int x, - const int y, - const int chan) { + auto get_data_point = [&](const int x, const int y, const int chan) { const bool point_is_in_range = - (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1); + (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1); return point_is_in_range - ? data[batch_id * data_batch_stride + - data_channels * (y * data_width + x) + - chan] - : zero; + ? 
data[batch_id * data_batch_stride + + data_channels * (y * data_width + x) + chan] + : zero; }; auto update_grad_data = [&](const int x, const int y, const int chan, const T value) { const bool point_is_in_range = (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1); - if (point_is_in_range){ + if (point_is_in_range) { grad_data[batch_id * data_batch_stride + - data_channels * (y * data_width + x) + - chan] += value; + data_channels * (y * data_width + x) + chan] += value; } }; - auto update_grad_warp = [&](const int sample_id, - const int channel, + auto update_grad_warp = [&](const int sample_id, const int channel, const T value) { - grad_warp[batch_id * warp_batch_stride + - sample_id * 2 + - channel] += value; + grad_warp[batch_id * warp_batch_stride + sample_id * 2 + channel] += + value; }; for (int sample_id = 0; sample_id < num_sampling_points; ++sample_id) { @@ -301,8 +266,7 @@ struct ResamplerGrad2DFunctor{ // The effect is that the sampled signal smoothly goes to 0 outside // the original input domain, rather than presenting a jump // discontinuity at the image boundaries. - if (x > static_cast(-1.0) && - y > static_cast(-1.0) && + if (x > static_cast(-1.0) && y > static_cast(-1.0) && x < static_cast(data_width) && y < static_cast(data_height)) { // Precompute floor (f) and ceil (c) values for x and y. @@ -316,27 +280,25 @@ struct ResamplerGrad2DFunctor{ for (int chan = 0; chan < data_channels; ++chan) { const T grad_output_value = grad_output[batch_id * output_batch_stride + - sample_id * data_channels + - chan]; + sample_id * data_channels + chan]; const T img_fxfy = get_data_point(fx, fy, chan); const T img_cxcy = get_data_point(cx, cy, chan); const T img_fxcy = get_data_point(fx, cy, chan); const T img_cxfy = get_data_point(cx, fy, chan); // Update partial gradients wrt relevant warp field entries - update_grad_warp(sample_id, 0, - grad_output_value * - ((one - dy) * (img_cxcy - img_fxcy) + - dy * (img_cxfy - img_fxfy))); + update_grad_warp( + sample_id, 0, + grad_output_value * ((one - dy) * (img_cxcy - img_fxcy) + + dy * (img_cxfy - img_fxfy))); - update_grad_warp(sample_id, 1, - grad_output_value * - ((one - dx) * (img_cxcy - img_cxfy) + - dx * (img_fxcy - img_fxfy))); + update_grad_warp( + sample_id, 1, + grad_output_value * ((one - dx) * (img_cxcy - img_cxfy) + + dx * (img_fxcy - img_fxfy))); // Update partial gradients wrt sampled data - update_grad_data(fx, fy, chan, - grad_output_value * dx * dy); + update_grad_data(fx, fy, chan, grad_output_value * dx * dy); update_grad_data(cx, cy, chan, grad_output_value * (one - dx) * (one - dy)); update_grad_data(fx, cy, chan, @@ -355,8 +317,8 @@ struct ResamplerGrad2DFunctor{ // being 10us. // TODO(fviola): Check out if there is a better way of doing this. 
auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); - const int64 cost = static_cast(num_sampling_points) * - data_channels * 1000; + const int64 cost = + static_cast(num_sampling_points) * data_channels * 1000; ::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers, batch_size, cost, update_grads_for_batches); } @@ -364,12 +326,11 @@ struct ResamplerGrad2DFunctor{ } // namespace functor - template class ResamplerGradOp : public ::tensorflow::OpKernel { public: - explicit ResamplerGradOp(::tensorflow::OpKernelConstruction* context) : - ::tensorflow::OpKernel(context) {} + explicit ResamplerGradOp(::tensorflow::OpKernelConstruction* context) + : ::tensorflow::OpKernel(context) {} void Compute(::tensorflow::OpKernelContext* ctx) override { const ::tensorflow::Tensor& data = ctx->input(0); @@ -383,7 +344,7 @@ class ResamplerGradOp : public ::tensorflow::OpKernel { "tensor must be a batch of 2d data; data shape should have " "4 entries corresponding to [batch_size, data_height, " "data_width, data_channels], but is: ", - data_shape.DebugString())); + data_shape.DebugString())); const int batch_size = data_shape.dim_size(0); const int data_height = data_shape.dim_size(1); const int data_width = data_shape.dim_size(2); @@ -394,7 +355,7 @@ class ResamplerGradOp : public ::tensorflow::OpKernel { ::tensorflow::errors::InvalidArgument( "warp should be at least a matrix, got shape ", warp_shape.DebugString())); - OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims()-1) == 2, + OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims() - 1) == 2, ::tensorflow::errors::Unimplemented( "Only bilinear interpolation is supported, warping " "coordinates must be 2D; warp shape last entry should be " @@ -417,18 +378,11 @@ class ResamplerGradOp : public ::tensorflow::OpKernel { OP_REQUIRES_OK(ctx, ctx->allocate_output(1, warp.shape(), &grad_warp)); // Execute kernel only for nonempty output; otherwise Eigen crashes on GPU. 
if (num_sampling_points > 0) { - functor::ResamplerGrad2DFunctor()(ctx, - ctx->eigen_device(), - data.flat().data(), - warp.flat().data(), - grad_output.flat().data(), - grad_data->flat().data(), - grad_warp->flat().data(), - batch_size, - data_height, - data_width, - data_channels, - num_sampling_points); + functor::ResamplerGrad2DFunctor()( + ctx, ctx->eigen_device(), data.flat().data(), + warp.flat().data(), grad_output.flat().data(), + grad_data->flat().data(), grad_warp->flat().data(), batch_size, + data_height, data_width, data_channels, num_sampling_points); } } @@ -436,11 +390,9 @@ class ResamplerGradOp : public ::tensorflow::OpKernel { TF_DISALLOW_COPY_AND_ASSIGN(ResamplerGradOp); }; -#define REGISTER(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("ResamplerGrad") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ +#define REGISTER(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("ResamplerGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ ResamplerGradOp); TF_CALL_half(REGISTER); @@ -449,11 +401,10 @@ TF_CALL_double(REGISTER); #undef REGISTER #if GOOGLE_CUDA -#define REGISTER(TYPE) \ - REGISTER_KERNEL_BUILDER(Name("ResamplerGrad") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T"), \ - ResamplerGradOp) +#define REGISTER(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("ResamplerGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + ResamplerGradOp) // Disable half and double precision since atomicAdds are not supported // TF_CALL_half(REGISTER); // TF_CALL_double(REGISTER); diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops.h b/tensorflow/contrib/resampler/kernels/resampler_ops.h index 85d3676efac..7fe3b9c0df7 100644 --- a/tensorflow/contrib/resampler/kernels/resampler_ops.h +++ b/tensorflow/contrib/resampler/kernels/resampler_ops.h @@ -29,38 +29,25 @@ namespace functor { // Helper functor for the Resampler Op in 2D template -struct Resampler2DFunctor{ - void operator ()(::tensorflow::OpKernelContext* ctx, - const Device& d, - const T* __restrict__ data, - const T* __restrict__ warp, - T* __restrict__ output, - const int batch_size, - const int data_height, - const int data_width, - const int data_channels, - const int num_sampling_points); +struct Resampler2DFunctor { + void operator()(::tensorflow::OpKernelContext* ctx, const Device& d, + const T* __restrict__ data, const T* __restrict__ warp, + T* __restrict__ output, const int batch_size, + const int data_height, const int data_width, + const int data_channels, const int num_sampling_points); }; - // Helper functor for the Resampler Gradient Op in 2D template -struct ResamplerGrad2DFunctor{ - void operator ()(::tensorflow::OpKernelContext* ctx, - const Device& d, - const T* __restrict__ data, - const T* __restrict__ warp, - const T* __restrict__ grad_output, - T* __restrict__ grad_data, - T* __restrict__ grad_warp, - const int batch_size, - const int data_height, - const int data_width, - const int data_channels, - const int num_sampling_points); +struct ResamplerGrad2DFunctor { + void operator()(::tensorflow::OpKernelContext* ctx, const Device& d, + const T* __restrict__ data, const T* __restrict__ warp, + const T* __restrict__ grad_output, T* __restrict__ grad_data, + T* __restrict__ grad_warp, const int batch_size, + const int data_height, const int data_width, + const int data_channels, const int num_sampling_points); }; - } // namespace functor } // namespace tensorflow diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc b/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc index 
636847a212f..3c07051f685 100644 --- a/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc +++ b/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc @@ -31,18 +31,15 @@ using GPUDevice = Eigen::GpuDevice; namespace { -#define GET_DATA_POINT(x, y) \ - data[batch_id * data_batch_stride + \ - data_channels * (y * data_width + x) + \ +#define GET_DATA_POINT(x, y) \ + data[batch_id * data_batch_stride + data_channels * (y * data_width + x) + \ chan] template __global__ void Resampler2DKernel(const T* __restrict__ data, const T* __restrict__ warp, - T* __restrict__ output, - const int batch_size, - const int data_height, - const int data_width, + T* __restrict__ output, const int batch_size, + const int data_height, const int data_width, const int data_channels, const int num_sampling_points) { const int output_data_size = batch_size * num_sampling_points * data_channels; @@ -75,10 +72,8 @@ __global__ void Resampler2DKernel(const T* __restrict__ data, // The effect is that the sampled signal smoothly goes to 0 outside // the original input domain, rather than presenting a jump // discontinuity at the image boundaries. - if (x > static_cast(-1.0) && - y > static_cast(-1.0) && - x < static_cast(data_width) && - y < static_cast(data_height)) { + if (x > static_cast(-1.0) && y > static_cast(-1.0) && + x < static_cast(data_width) && y < static_cast(data_height)) { // Precompute floor (f) and ceil (c) values for x and y. const int fx = std::floor(static_cast(x)); const int fy = std::floor(static_cast(y)); @@ -87,21 +82,20 @@ __global__ void Resampler2DKernel(const T* __restrict__ data, const T dx = static_cast(cx) - x; const T dy = static_cast(cy) - y; - const T img_fxfy = (fx >= 0 && fy >= 0) - ? dx * dy * GET_DATA_POINT(fx, fy) - : zero; + const T img_fxfy = + (fx >= 0 && fy >= 0) ? dx * dy * GET_DATA_POINT(fx, fy) : zero; const T img_cxcy = (cx <= data_width - 1 && cy <= data_height - 1) - ? (one - dx) * (one - dy) * GET_DATA_POINT(cx, cy) - : zero; + ? (one - dx) * (one - dy) * GET_DATA_POINT(cx, cy) + : zero; const T img_fxcy = (fx >= 0 && cy <= data_height - 1) - ? dx * (one - dy) * GET_DATA_POINT(fx, cy) - : zero; + ? dx * (one - dy) * GET_DATA_POINT(fx, cy) + : zero; const T img_cxfy = (cx <= data_width - 1 && fy >= 0) - ? (one - dx) * dy * GET_DATA_POINT(cx, fy) - : zero; + ? 
(one - dx) * dy * GET_DATA_POINT(cx, fy) + : zero; output[out_index] = img_fxfy + img_cxcy + img_fxcy + img_cxfy; } else { @@ -115,24 +109,20 @@ __global__ void Resampler2DKernel(const T* __restrict__ data, namespace functor { template -struct Resampler2DFunctor{ - void operator ()(::tensorflow::OpKernelContext* ctx, - const GPUDevice& d, - const T* __restrict__ data, - const T* __restrict__ warp, - T* __restrict__ output, - const int batch_size, - const int data_height, - const int data_width, - const int data_channels, - const int num_sampling_points) { - const int output_data_size = batch_size * num_sampling_points * data_channels; - ::tensorflow::CudaLaunchConfig config = - ::tensorflow::GetCudaLaunchConfig(output_data_size, d); - Resampler2DKernel - <<>>( - data, warp, output, batch_size, data_height, data_width, - data_channels, num_sampling_points); +struct Resampler2DFunctor { + void operator()(::tensorflow::OpKernelContext* ctx, const GPUDevice& d, + const T* __restrict__ data, const T* __restrict__ warp, + T* __restrict__ output, const int batch_size, + const int data_height, const int data_width, + const int data_channels, const int num_sampling_points) { + const int output_data_size = + batch_size * num_sampling_points * data_channels; + ::tensorflow::CudaLaunchConfig config = + ::tensorflow::GetCudaLaunchConfig(output_data_size, d); + Resampler2DKernel + <<>>( + data, warp, output, batch_size, data_height, data_width, + data_channels, num_sampling_points); } }; @@ -145,26 +135,20 @@ template struct Resampler2DFunctor; namespace { -#define UPDATE_GRAD_DATA_POINT(x, y, v) \ - atomicAdd(grad_data + (batch_id * data_batch_stride + \ - data_channels * (y * data_width + x) + \ - chan), \ +#define UPDATE_GRAD_DATA_POINT(x, y, v) \ + atomicAdd(grad_data + (batch_id * data_batch_stride + \ + data_channels * (y * data_width + x) + chan), \ v) - template -__global__ void ResamplerGrad2DKernel(const T* __restrict__ data, - const T* __restrict__ warp, - const T* __restrict__ grad_output, - T* __restrict__ grad_data, - T* __restrict__ grad_warp, - const int batch_size, - const int data_height, - const int data_width, - const int data_channels, - const int num_sampling_points) { - const int resampler_output_size = batch_size * num_sampling_points * - data_channels; +__global__ void ResamplerGrad2DKernel( + const T* __restrict__ data, const T* __restrict__ warp, + const T* __restrict__ grad_output, T* __restrict__ grad_data, + T* __restrict__ grad_warp, const int batch_size, const int data_height, + const int data_width, const int data_channels, + const int num_sampling_points) { + const int resampler_output_size = + batch_size * num_sampling_points * data_channels; CUDA_1D_KERNEL_LOOP(index, resampler_output_size) { const int out_index = index; @@ -199,10 +183,8 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data, // The effect is that the sampled signal smoothly goes to 0 outside // the original input domain, rather than presenting a jump // discontinuity at the image boundaries. - if (x > static_cast(-1.0) && - y > static_cast(-1.0) && - x < static_cast(data_width) && - y < static_cast(data_height)) { + if (x > static_cast(-1.0) && y > static_cast(-1.0) && + x < static_cast(data_width) && y < static_cast(data_height)) { // Precompute floor (f) and ceil (c) values for x and y. 
const int fx = std::floor(static_cast(x)); const int fy = std::floor(static_cast(y)); @@ -211,21 +193,17 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data, const T dx = static_cast(cx) - x; const T dy = static_cast(cy) - y; - const T img_fxfy = (fx >= 0 && fy >= 0) - ? GET_DATA_POINT(fx, fy) - : zero; + const T img_fxfy = (fx >= 0 && fy >= 0) ? GET_DATA_POINT(fx, fy) : zero; const T img_cxcy = (cx <= data_width - 1 && cy <= data_height - 1) - ? GET_DATA_POINT(cx, cy) - : zero; + ? GET_DATA_POINT(cx, cy) + : zero; - const T img_fxcy = (fx >= 0 && cy <= data_height - 1) - ? GET_DATA_POINT(fx, cy) - : zero; + const T img_fxcy = + (fx >= 0 && cy <= data_height - 1) ? GET_DATA_POINT(fx, cy) : zero; - const T img_cxfy = (cx <= data_width - 1 && fy >= 0) - ? GET_DATA_POINT(cx, fy) - : zero; + const T img_cxfy = + (cx <= data_width - 1 && fy >= 0) ? GET_DATA_POINT(cx, fy) : zero; // Update partial gradients wrt relevant warp field entries atomicAdd(grad_warp + warp_id_x, @@ -241,7 +219,7 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data, } if (cx <= data_width - 1 && cy <= data_height - 1) { UPDATE_GRAD_DATA_POINT(cx, cy, - grad_output_value * (one - dx) * (one - dy)); + grad_output_value * (one - dx) * (one - dy)); } if (fx >= 0 && cy <= data_height - 1) { UPDATE_GRAD_DATA_POINT(fx, cy, grad_output_value * dx * (one - dy)); @@ -261,43 +239,37 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data, namespace functor { template -struct ResamplerGrad2DFunctor{ - void operator ()(::tensorflow::OpKernelContext* ctx, - const GPUDevice& d, - const T* __restrict__ data, - const T* __restrict__ warp, - const T* __restrict__ grad_output, - T* __restrict__ grad_data, - T* __restrict__ grad_warp, - const int batch_size, - const int data_height, - const int data_width, - const int data_channels, - const int num_sampling_points) { - // Set gradients to 0, because the kernel incrementally updates the - // tensor entries by adding partial contributions. - const int grad_warp_size = batch_size * num_sampling_points * 2; - const int grad_data_size = batch_size * data_height * data_width * - data_channels; +struct ResamplerGrad2DFunctor { + void operator()(::tensorflow::OpKernelContext* ctx, const GPUDevice& d, + const T* __restrict__ data, const T* __restrict__ warp, + const T* __restrict__ grad_output, T* __restrict__ grad_data, + T* __restrict__ grad_warp, const int batch_size, + const int data_height, const int data_width, + const int data_channels, const int num_sampling_points) { + // Set gradients to 0, because the kernel incrementally updates the + // tensor entries by adding partial contributions. 
+ const int grad_warp_size = batch_size * num_sampling_points * 2; + const int grad_data_size = + batch_size * data_height * data_width * data_channels; - ::tensorflow::CudaLaunchConfig config = - ::tensorflow::GetCudaLaunchConfig(grad_warp_size, d); - ::tensorflow::SetZero - <<>>( - grad_warp_size, grad_warp); + ::tensorflow::CudaLaunchConfig config = + ::tensorflow::GetCudaLaunchConfig(grad_warp_size, d); + ::tensorflow:: + SetZero<<>>( + grad_warp_size, grad_warp); - config = ::tensorflow::GetCudaLaunchConfig(grad_data_size, d); - ::tensorflow::SetZero - <<>>( - grad_data_size, grad_data); + config = ::tensorflow::GetCudaLaunchConfig(grad_data_size, d); + ::tensorflow:: + SetZero<<>>( + grad_data_size, grad_data); - const int resampler_output_size = batch_size * num_sampling_points * - data_channels; - config = ::tensorflow::GetCudaLaunchConfig(resampler_output_size, d); - ResamplerGrad2DKernel - <<>>( - data, warp, grad_output, grad_data, grad_warp, batch_size, - data_height, data_width, data_channels, num_sampling_points); + const int resampler_output_size = + batch_size * num_sampling_points * data_channels; + config = ::tensorflow::GetCudaLaunchConfig(resampler_output_size, d); + ResamplerGrad2DKernel + <<>>( + data, warp, grad_output, grad_data, grad_warp, batch_size, + data_height, data_width, data_channels, num_sampling_points); } }; diff --git a/tensorflow/contrib/rnn/kernels/blas_gemm.cc b/tensorflow/contrib/rnn/kernels/blas_gemm.cc index e62501e9b10..03006dab323 100644 --- a/tensorflow/contrib/rnn/kernels/blas_gemm.cc +++ b/tensorflow/contrib/rnn/kernels/blas_gemm.cc @@ -36,11 +36,10 @@ perftools::gputools::DeviceMemory AsDeviceMemory(const T* cuda_memory) { namespace functor { template -void TensorCuBlasGemm::operator()(OpKernelContext* ctx, - bool transa, bool transb, uint64 m, - uint64 n, uint64 k, T alpha, const T* a, - int lda, const T* b, int ldb, T beta, T* c, - int ldc) { +void TensorCuBlasGemm::operator()(OpKernelContext* ctx, bool transa, + bool transb, uint64 m, uint64 n, uint64 k, + T alpha, const T* a, int lda, const T* b, + int ldb, T beta, T* c, int ldc) { #if GOOGLE_CUDA perftools::gputools::blas::Transpose trans[] = { perftools::gputools::blas::Transpose::kNoTranspose, diff --git a/tensorflow/contrib/rnn/kernels/gru_ops.cc b/tensorflow/contrib/rnn/kernels/gru_ops.cc index 0796f82b214..bd3d898fb09 100644 --- a/tensorflow/contrib/rnn/kernels/gru_ops.cc +++ b/tensorflow/contrib/rnn/kernels/gru_ops.cc @@ -15,8 +15,8 @@ limitations under the License. #define EIGEN_USE_THREADS -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/contrib/rnn/kernels/gru_ops.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" namespace tensorflow { @@ -61,9 +61,9 @@ class GRUCellBlockOp : public OpKernel { h_prev_tensor->dim_size(0), " vs. ", batch_size)); OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("h_prev.dims(1) != cell_size: ", - h_prev_tensor->dim_size(1), " vs. ", - cell_size)); + errors::InvalidArgument( + "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1), + " vs. ", cell_size)); // Shape of 'w_ru' must be [input_size+cell_size, 2*cell_size] OP_REQUIRES(ctx, w_ru_tensor->dim_size(0) == input_size + cell_size, @@ -82,10 +82,10 @@ class GRUCellBlockOp : public OpKernel { "w_c.dim_size(0) != input_size + cell_size: ", w_c_tensor->dim_size(0), " vs. 
", input_size + cell_size)); - OP_REQUIRES( - ctx, w_c_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("w_c.dim_size(1) != cell_size: ", - w_c_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, w_c_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "w_c.dim_size(1) != cell_size: ", w_c_tensor->dim_size(1), + " vs. ", cell_size)); // Shape of 'b_ru' must be [2*cell_size] OP_REQUIRES(ctx, b_ru_tensor->dim_size(0) == cell_size * 2, @@ -97,10 +97,10 @@ class GRUCellBlockOp : public OpKernel { errors::InvalidArgument("Rank of b_ru must be 1", b_ru_tensor->dims(), " vs. 1", 1)); // Shape of 'b_c' must be [cell_size] - OP_REQUIRES( - ctx, b_c_tensor->dim_size(0) == cell_size, - errors::InvalidArgument("b_c.dim_size(0) != cell_size: ", - b_c_tensor->dim_size(0), " vs. ", cell_size)); + OP_REQUIRES(ctx, b_c_tensor->dim_size(0) == cell_size, + errors::InvalidArgument( + "b_c.dim_size(0) != cell_size: ", b_c_tensor->dim_size(0), + " vs. ", cell_size)); OP_REQUIRES(ctx, b_c_tensor->dims() == 1, errors::InvalidArgument("Rank of b_c must be 1", b_c_tensor->dims(), " vs. 1")); @@ -216,9 +216,9 @@ class GRUBlockCellGradOp : public OpKernel { h_prev_tensor->dim_size(0), " vs. ", batch_size)); OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("h_prev.dims(1) != cell_size: ", - h_prev_tensor->dim_size(1), " vs. ", - cell_size)); + errors::InvalidArgument( + "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1), + " vs. ", cell_size)); // Shape of 'w_ru' must be [input_size+cell_size, 2*cell_size] OP_REQUIRES(ctx, w_ru_tensor->dim_size(0) == input_size + cell_size, @@ -237,10 +237,10 @@ class GRUBlockCellGradOp : public OpKernel { "w_c.dim_size(0) != input_size + cell_size: ", w_c_tensor->dim_size(0), " vs. ", input_size + cell_size)); - OP_REQUIRES( - ctx, w_c_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("w_c.dim_size(1) != cell_size: ", - w_c_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, w_c_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "w_c.dim_size(1) != cell_size: ", w_c_tensor->dim_size(1), + " vs. ", cell_size)); // Shape of 'b_ru' must be [2*cell_size] OP_REQUIRES(ctx, b_ru_tensor->dim_size(0) == cell_size * 2, @@ -253,54 +253,54 @@ class GRUBlockCellGradOp : public OpKernel { b_ru_tensor->dims(), " vs. 1")); // Shape of 'b_c' must be [cell_size] - OP_REQUIRES( - ctx, b_c_tensor->dim_size(0) == cell_size, - errors::InvalidArgument("b_c.dim_size(0) != cell_size: ", - b_c_tensor->dim_size(0), " vs. ", cell_size)); + OP_REQUIRES(ctx, b_c_tensor->dim_size(0) == cell_size, + errors::InvalidArgument( + "b_c.dim_size(0) != cell_size: ", b_c_tensor->dim_size(0), + " vs. ", cell_size)); OP_REQUIRES(ctx, b_c_tensor->dims() == 1, errors::InvalidArgument("Rank of b_c must be 1 ", b_c_tensor->dims(), " vs. 1")); // Shape of 'r' must be [batch_size, cell_size] - OP_REQUIRES( - ctx, r_tensor->dim_size(0) == batch_size, - errors::InvalidArgument("r.dims(0) != batch_size: ", - r_tensor->dim_size(0), " vs. ", batch_size)); - OP_REQUIRES( - ctx, r_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("r.dims(1) != cell_size: ", - r_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, r_tensor->dim_size(0) == batch_size, + errors::InvalidArgument( + "r.dims(0) != batch_size: ", r_tensor->dim_size(0), " vs. ", + batch_size)); + OP_REQUIRES(ctx, r_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "r.dims(1) != cell_size: ", r_tensor->dim_size(1), " vs. 
", + cell_size)); // Shape of 'u' must be [batch_size, cell_size] - OP_REQUIRES( - ctx, u_tensor->dim_size(0) == batch_size, - errors::InvalidArgument("u.dims(0) != batch_size: ", - u_tensor->dim_size(0), " vs. ", batch_size)); - OP_REQUIRES( - ctx, u_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("u.dims(1) != cell_size: ", - u_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, u_tensor->dim_size(0) == batch_size, + errors::InvalidArgument( + "u.dims(0) != batch_size: ", u_tensor->dim_size(0), " vs. ", + batch_size)); + OP_REQUIRES(ctx, u_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "u.dims(1) != cell_size: ", u_tensor->dim_size(1), " vs. ", + cell_size)); // Shape of 'c' must be [batch_size, cell_size] - OP_REQUIRES( - ctx, c_tensor->dim_size(0) == batch_size, - errors::InvalidArgument("c.dims(0) != batch_size: ", - c_tensor->dim_size(0), " vs. ", batch_size)); - OP_REQUIRES( - ctx, c_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("c.dims(1) != cell_size: ", - c_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, c_tensor->dim_size(0) == batch_size, + errors::InvalidArgument( + "c.dims(0) != batch_size: ", c_tensor->dim_size(0), " vs. ", + batch_size)); + OP_REQUIRES(ctx, c_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "c.dims(1) != cell_size: ", c_tensor->dim_size(1), " vs. ", + cell_size)); // Shape of 'd_h' must be [batch_size, cell_size] - OP_REQUIRES( - ctx, d_h_tensor->dim_size(0) == batch_size, - errors::InvalidArgument("d_h.dims(0) != batch_size: ", - d_h_tensor->dim_size(0), " vs. ", batch_size)); - OP_REQUIRES( - ctx, d_h_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("d_h.dims(1) != cell_size: ", - d_h_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, d_h_tensor->dim_size(0) == batch_size, + errors::InvalidArgument( + "d_h.dims(0) != batch_size: ", d_h_tensor->dim_size(0), + " vs. ", batch_size)); + OP_REQUIRES(ctx, d_h_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "d_h.dims(1) != cell_size: ", d_h_tensor->dim_size(1), + " vs. ", cell_size)); // Create output tensors. Tensor* d_x_tensor = nullptr; diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops.cc b/tensorflow/contrib/rnn/kernels/lstm_ops.cc index 941a457fd3a..5e7cf0ce84d 100644 --- a/tensorflow/contrib/rnn/kernels/lstm_ops.cc +++ b/tensorflow/contrib/rnn/kernels/lstm_ops.cc @@ -281,23 +281,23 @@ class LSTMBlockCellOp : public OpKernel { h_prev_tensor->dim_size(0), " vs. ", batch_size)); OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("h_prev.dims(1) != cell_size: ", - h_prev_tensor->dim_size(1), " vs. ", - cell_size)); + errors::InvalidArgument( + "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1), + " vs. ", cell_size)); OP_REQUIRES(ctx, w_tensor->dim_size(0) == input_size + cell_size, errors::InvalidArgument( "w.dim_size(0) != input_size + cell_size: ", w_tensor->dim_size(0), " vs. ", input_size + cell_size)); - OP_REQUIRES( - ctx, w_tensor->dim_size(1) == cell_size * 4, - errors::InvalidArgument("w.dim_size(1) != cell_size * 4: ", - w_tensor->dim_size(1), " vs. ", cell_size * 4)); + OP_REQUIRES(ctx, w_tensor->dim_size(1) == cell_size * 4, + errors::InvalidArgument( + "w.dim_size(1) != cell_size * 4: ", w_tensor->dim_size(1), + " vs. ", cell_size * 4)); - OP_REQUIRES( - ctx, b_tensor->dim_size(0) == cell_size * 4, - errors::InvalidArgument("b.dim_size(0) != cell_size * 4: ", - b_tensor->dim_size(0), " vs. 
", cell_size * 4)); + OP_REQUIRES(ctx, b_tensor->dim_size(0) == cell_size * 4, + errors::InvalidArgument( + "b.dim_size(0) != cell_size * 4: ", b_tensor->dim_size(0), + " vs. ", cell_size * 4)); // Allocate our output tensors. Tensor* i_tensor = nullptr; @@ -484,77 +484,77 @@ class LSTMBlockCellGradOp : public OpKernel { h_prev_tensor->dim_size(0), " vs. ", batch_size)); OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("h_prev.dims(1) != cell_size: ", - h_prev_tensor->dim_size(1), " vs. ", - cell_size)); + errors::InvalidArgument( + "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1), + " vs. ", cell_size)); OP_REQUIRES(ctx, w_tensor->dim_size(0) == input_size + cell_size, errors::InvalidArgument( "w.dim_size(0) != input_size + cell_size: ", w_tensor->dim_size(0), " vs. ", input_size + cell_size)); - OP_REQUIRES( - ctx, w_tensor->dim_size(1) == cell_size * 4, - errors::InvalidArgument("w.dim_size(1) != cell_size * 4: ", - w_tensor->dim_size(1), " vs. ", cell_size * 4)); + OP_REQUIRES(ctx, w_tensor->dim_size(1) == cell_size * 4, + errors::InvalidArgument( + "w.dim_size(1) != cell_size * 4: ", w_tensor->dim_size(1), + " vs. ", cell_size * 4)); - OP_REQUIRES( - ctx, b_tensor->dim_size(0) == cell_size * 4, - errors::InvalidArgument("b.dim_size(0) != cell_size * 4: ", - b_tensor->dim_size(0), " vs. ", cell_size * 4)); + OP_REQUIRES(ctx, b_tensor->dim_size(0) == cell_size * 4, + errors::InvalidArgument( + "b.dim_size(0) != cell_size * 4: ", b_tensor->dim_size(0), + " vs. ", cell_size * 4)); - OP_REQUIRES( - ctx, i_tensor->dim_size(0) == batch_size, - errors::InvalidArgument("i.dim_size(0) != batch_size: ", - i_tensor->dim_size(0), " vs. ", batch_size)); - OP_REQUIRES( - ctx, i_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("i.dim_size(1) != cell_size: ", - i_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, i_tensor->dim_size(0) == batch_size, + errors::InvalidArgument( + "i.dim_size(0) != batch_size: ", i_tensor->dim_size(0), + " vs. ", batch_size)); + OP_REQUIRES(ctx, i_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "i.dim_size(1) != cell_size: ", i_tensor->dim_size(1), + " vs. ", cell_size)); - OP_REQUIRES( - ctx, cs_tensor->dim_size(0) == batch_size, - errors::InvalidArgument("cs.dim_size(0) != batch_size: ", - cs_tensor->dim_size(0), " vs. ", batch_size)); - OP_REQUIRES( - ctx, cs_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("cs.dim_size(1) != cell_size: ", - cs_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, cs_tensor->dim_size(0) == batch_size, + errors::InvalidArgument( + "cs.dim_size(0) != batch_size: ", cs_tensor->dim_size(0), + " vs. ", batch_size)); + OP_REQUIRES(ctx, cs_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "cs.dim_size(1) != cell_size: ", cs_tensor->dim_size(1), + " vs. ", cell_size)); - OP_REQUIRES( - ctx, f_tensor->dim_size(0) == batch_size, - errors::InvalidArgument("f.dim_size(0) != batch_size: ", - f_tensor->dim_size(0), " vs. ", batch_size)); - OP_REQUIRES( - ctx, f_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("i.dim_size(1) != cell_size: ", - f_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, f_tensor->dim_size(0) == batch_size, + errors::InvalidArgument( + "f.dim_size(0) != batch_size: ", f_tensor->dim_size(0), + " vs. ", batch_size)); + OP_REQUIRES(ctx, f_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "i.dim_size(1) != cell_size: ", f_tensor->dim_size(1), + " vs. 
", cell_size)); - OP_REQUIRES( - ctx, o_tensor->dim_size(0) == batch_size, - errors::InvalidArgument("o.dim_size(0) != batch_size: ", - o_tensor->dim_size(0), " vs. ", batch_size)); - OP_REQUIRES( - ctx, o_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("o.dim_size(1) != cell_size: ", - o_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, o_tensor->dim_size(0) == batch_size, + errors::InvalidArgument( + "o.dim_size(0) != batch_size: ", o_tensor->dim_size(0), + " vs. ", batch_size)); + OP_REQUIRES(ctx, o_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "o.dim_size(1) != cell_size: ", o_tensor->dim_size(1), + " vs. ", cell_size)); - OP_REQUIRES( - ctx, ci_tensor->dim_size(0) == batch_size, - errors::InvalidArgument("ci.dim_size(0) != batch_size: ", - ci_tensor->dim_size(0), " vs. ", batch_size)); - OP_REQUIRES( - ctx, ci_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("ci.dim_size(1) != cell_size: ", - ci_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, ci_tensor->dim_size(0) == batch_size, + errors::InvalidArgument( + "ci.dim_size(0) != batch_size: ", ci_tensor->dim_size(0), + " vs. ", batch_size)); + OP_REQUIRES(ctx, ci_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "ci.dim_size(1) != cell_size: ", ci_tensor->dim_size(1), + " vs. ", cell_size)); - OP_REQUIRES( - ctx, co_tensor->dim_size(0) == batch_size, - errors::InvalidArgument("co.dim_size(0) != batch_size: ", - co_tensor->dim_size(0), " vs. ", batch_size)); - OP_REQUIRES( - ctx, co_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("co.dim_size(1) != cell_size: ", - co_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, co_tensor->dim_size(0) == batch_size, + errors::InvalidArgument( + "co.dim_size(0) != batch_size: ", co_tensor->dim_size(0), + " vs. ", batch_size)); + OP_REQUIRES(ctx, co_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "co.dim_size(1) != cell_size: ", co_tensor->dim_size(1), + " vs. ", cell_size)); OP_REQUIRES(ctx, cs_grad_tensor->dim_size(0) == batch_size, errors::InvalidArgument( @@ -860,9 +860,9 @@ class BlockLSTMOp : public OpKernel { h_prev_tensor->dim_size(0), " vs. ", batch_size)); OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("h_prev.dims(1) != cell_size: ", - h_prev_tensor->dim_size(1), " vs. ", - cell_size)); + errors::InvalidArgument( + "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1), + " vs. ", cell_size)); const Tensor* w_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("w", &w_tensor)); @@ -872,46 +872,46 @@ class BlockLSTMOp : public OpKernel { errors::InvalidArgument( "w.dim_size(0) != input_size + cell_size: ", w_tensor->dim_size(0), " vs. ", input_size + cell_size)); - OP_REQUIRES( - ctx, w_tensor->dim_size(1) == cell_size * 4, - errors::InvalidArgument("w.dim_size(1) != cell_size * 4: ", - w_tensor->dim_size(1), " vs. ", cell_size * 4)); + OP_REQUIRES(ctx, w_tensor->dim_size(1) == cell_size * 4, + errors::InvalidArgument( + "w.dim_size(1) != cell_size * 4: ", w_tensor->dim_size(1), + " vs. ", cell_size * 4)); const Tensor* wci_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("wci", &wci_tensor)); OP_REQUIRES(ctx, wci_tensor->dims() == 1, errors::InvalidArgument("wci must be 1D")); - OP_REQUIRES( - ctx, wci_tensor->dim_size(0) == cell_size, - errors::InvalidArgument("wci.dim_size(0) != cell_size: ", - wci_tensor->dim_size(0), " vs. 
", cell_size)); + OP_REQUIRES(ctx, wci_tensor->dim_size(0) == cell_size, + errors::InvalidArgument( + "wci.dim_size(0) != cell_size: ", wci_tensor->dim_size(0), + " vs. ", cell_size)); const Tensor* wcf_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("wcf", &wcf_tensor)); OP_REQUIRES(ctx, wcf_tensor->dims() == 1, errors::InvalidArgument("wcf must be 1D")); - OP_REQUIRES( - ctx, wcf_tensor->dim_size(0) == cell_size, - errors::InvalidArgument("wcf.dim_size(0) != cell_size: ", - wcf_tensor->dim_size(0), " vs. ", cell_size)); + OP_REQUIRES(ctx, wcf_tensor->dim_size(0) == cell_size, + errors::InvalidArgument( + "wcf.dim_size(0) != cell_size: ", wcf_tensor->dim_size(0), + " vs. ", cell_size)); const Tensor* wco_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("wco", &wco_tensor)); OP_REQUIRES(ctx, wco_tensor->dims() == 1, errors::InvalidArgument("wco must be 1D")); - OP_REQUIRES( - ctx, wco_tensor->dim_size(0) == cell_size, - errors::InvalidArgument("wco.dim_size(0) != cell_size: ", - wco_tensor->dim_size(0), " vs. ", cell_size)); + OP_REQUIRES(ctx, wco_tensor->dim_size(0) == cell_size, + errors::InvalidArgument( + "wco.dim_size(0) != cell_size: ", wco_tensor->dim_size(0), + " vs. ", cell_size)); const Tensor* b_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("b", &b_tensor)); OP_REQUIRES(ctx, b_tensor->dims() == 1, errors::InvalidArgument("b must be 1D")); - OP_REQUIRES( - ctx, b_tensor->dim_size(0) == cell_size * 4, - errors::InvalidArgument("b.dim_size(0) != cell_size * 4: ", - b_tensor->dim_size(0), " vs. ", cell_size * 4)); + OP_REQUIRES(ctx, b_tensor->dim_size(0) == cell_size * 4, + errors::InvalidArgument( + "b.dim_size(0) != cell_size * 4: ", b_tensor->dim_size(0), + " vs. ", cell_size * 4)); TensorShape batch_cell_shape({timelen, batch_size, cell_size}); Tensor* i_out; @@ -1065,9 +1065,9 @@ class BlockLSTMGradOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->input("w", &w_tensor)); const int64 cell_size = w_tensor->dim_size(1) / 4; OP_REQUIRES(ctx, input_size + cell_size == w_tensor->dim_size(0), - errors::InvalidArgument("w matrix rows don't match: ", - input_size + cell_size, " vs. ", - w_tensor->dim_size(0))); + errors::InvalidArgument( + "w matrix rows don't match: ", input_size + cell_size, + " vs. ", w_tensor->dim_size(0))); const Tensor* wci_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("wci", &wci_tensor)); @@ -1193,7 +1193,6 @@ class BlockLSTMGradOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), batch_cell_shape, &h_grad_tensor)); - const Device& device = ctx->eigen_device(); functor::TensorZero()(device, cs_grad_tensor.flat()); diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops.h b/tensorflow/contrib/rnn/kernels/lstm_ops.h index bc6b85f3f1a..d23cedc234b 100644 --- a/tensorflow/contrib/rnn/kernels/lstm_ops.h +++ b/tensorflow/contrib/rnn/kernels/lstm_ops.h @@ -92,7 +92,6 @@ struct TensorZeroPadding { } }; - struct LSTMBlockCell { LSTMBlockCell(const int batch_size, const int input_size, const int cell_size) : batch_size_(batch_size), diff --git a/tensorflow/contrib/rnn/ops/lstm_ops_test.cc b/tensorflow/contrib/rnn/ops/lstm_ops_test.cc index 544cd163c50..68184b643e5 100644 --- a/tensorflow/contrib/rnn/ops/lstm_ops_test.cc +++ b/tensorflow/contrib/rnn/ops/lstm_ops_test.cc @@ -149,8 +149,9 @@ TEST_F(LSTMOpsTest, BlockLSTMGrad_ShapeFn) { INFER_ERROR("must be rank 1", op, "?;?;?;?;?;?;?;?;[1,?]" + suffix); // Output with all input knowns makes known rank outputs. 
- INFER_OK(op, JoinedCopies("?", 18), "[?,?,?];" + JoinedCopies("[?,?]", 3) + - ";" + JoinedCopies("[?]", 4)); + INFER_OK( + op, JoinedCopies("?", 18), + "[?,?,?];" + JoinedCopies("[?,?]", 3) + ";" + JoinedCopies("[?]", 4)); // Output with copies input shapes to output. string input = strings::StrCat("?;[?,?,?];", JoinedCopies("[?,?]", 3), ";", diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py index cafeb56ad88..9b84635e85e 100644 --- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py +++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py @@ -42,7 +42,6 @@ from tensorflow.python.platform import test from tensorflow.python.framework import test_util from tensorflow.contrib.rnn.python.ops import rnn_cell as contrib_rnn_cell - # pylint: enable=protected-access Linear = core_rnn_cell._Linear # pylint: disable=invalid-name @@ -84,19 +83,22 @@ class RNNCellTest(test.TestCase): ], [v.name for v in cell.trainable_variables]) self.assertFalse(cell.non_trainable_variables) sess.run([variables_lib.global_variables_initializer()]) - res = sess.run( - [g], {x.name: np.array([[1., 1.]]), - m.name: np.array([[0.1, 0.1]])}) + res = sess.run([g], { + x.name: np.array([[1., 1.]]), + m.name: np.array([[0.1, 0.1]]) + }) self.assertEqual(res[0].shape, (1, 2)) def testBasicRNNCellNotTrainable(self): with self.test_session() as sess: + def not_trainable_getter(getter, *args, **kwargs): kwargs["trainable"] = False return getter(*args, **kwargs) with variable_scope.variable_scope( - "root", initializer=init_ops.constant_initializer(0.5), + "root", + initializer=init_ops.constant_initializer(0.5), custom_getter=not_trainable_getter): x = array_ops.zeros([1, 2]) m = array_ops.zeros([1, 2]) @@ -108,9 +110,10 @@ class RNNCellTest(test.TestCase): "root/basic_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME ], [v.name for v in cell.non_trainable_variables]) sess.run([variables_lib.global_variables_initializer()]) - res = sess.run( - [g], {x.name: np.array([[1., 1.]]), - m.name: np.array([[0.1, 0.1]])}) + res = sess.run([g], { + x.name: np.array([[1., 1.]]), + m.name: np.array([[0.1, 0.1]]) + }) self.assertEqual(res[0].shape, (1, 2)) def testGRUCell(self): @@ -121,9 +124,10 @@ class RNNCellTest(test.TestCase): m = array_ops.zeros([1, 2]) g, _ = rnn_cell_impl.GRUCell(2)(x, m) sess.run([variables_lib.global_variables_initializer()]) - res = sess.run( - [g], {x.name: np.array([[1., 1.]]), - m.name: np.array([[0.1, 0.1]])}) + res = sess.run([g], { + x.name: np.array([[1., 1.]]), + m.name: np.array([[0.1, 0.1]]) + }) # Smoke test self.assertAllClose(res[0], [[0.175991, 0.175991]]) with variable_scope.variable_scope( @@ -133,10 +137,10 @@ class RNNCellTest(test.TestCase): m = array_ops.zeros([1, 2]) g, _ = rnn_cell_impl.GRUCell(2)(x, m) sess.run([variables_lib.global_variables_initializer()]) - res = sess.run( - [g], - {x.name: np.array([[1., 1., 1.]]), - m.name: np.array([[0.1, 0.1]])}) + res = sess.run([g], { + x.name: np.array([[1., 1., 1.]]), + m.name: np.array([[0.1, 0.1]]) + }) # Smoke test self.assertAllClose(res[0], [[0.156736, 0.156736]]) @@ -148,11 +152,26 @@ class RNNCellTest(test.TestCase): m = array_ops.zeros([1, 2]) g, _ = contrib_rnn_cell.SRUCell(2)(x, m) sess.run([variables_lib.global_variables_initializer()]) + res = sess.run([g], { + x.name: np.array([[1., 1.]]), + m.name: np.array([[0.1, 0.1]]) + }) + # Smoke test + self.assertAllClose(res[0], [[0.509682, 0.509682]]) + + def 
testSRUCellWithDiffSize(self): + with self.test_session() as sess: + with variable_scope.variable_scope( + "root", initializer=init_ops.constant_initializer(0.5)): + x = array_ops.zeros([1, 3]) + m = array_ops.zeros([1, 2]) + g, _ = contrib_rnn_cell.SRUCell(2)(x, m) + sess.run([variables_lib.global_variables_initializer()]) res = sess.run( - [g], {x.name: np.array([[1., 1.]]), + [g], {x.name: np.array([[1., 1., 1.]]), m.name: np.array([[0.1, 0.1]])}) # Smoke test - self.assertAllClose(res[0], [[0.509682, 0.509682]]) + self.assertAllClose(res[0], [[0.55255556, 0.55255556]]) def testBasicLSTMCell(self): for dtype in [dtypes.float16, dtypes.float32]: @@ -164,8 +183,7 @@ class RNNCellTest(test.TestCase): m = array_ops.zeros([1, 8], dtype=dtype) cell = rnn_cell_impl.MultiRNNCell( [ - rnn_cell_impl.BasicLSTMCell( - 2, state_is_tuple=False) + rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False) for _ in range(2) ], state_is_tuple=False) @@ -183,22 +201,21 @@ class RNNCellTest(test.TestCase): "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME ] - self.assertEqual( - expected_variable_names, - [v.name for v in cell.trainable_variables]) + self.assertEqual(expected_variable_names, + [v.name for v in cell.trainable_variables]) self.assertFalse(cell.non_trainable_variables) sess.run([variables_lib.global_variables_initializer()]) - res = sess.run( - [g, out_m], - {x.name: np.array([[1., 1.]]), - m.name: 0.1 * np.ones([1, 8])}) + res = sess.run([g, out_m], { + x.name: np.array([[1., 1.]]), + m.name: 0.1 * np.ones([1, 8]) + }) self.assertEqual(len(res), 2) variables = variables_lib.global_variables() self.assertEqual(expected_variable_names, [v.name for v in variables]) # The numbers in results were not calculated, this is just a # smoke test. - self.assertAllClose( - res[0], np.array([[0.240, 0.240]], dtype=np_dtype), 1e-2) + self.assertAllClose(res[0], np.array( + [[0.240, 0.240]], dtype=np_dtype), 1e-2) expected_mem = np.array( [[0.689, 0.689, 0.448, 0.448, 0.398, 0.398, 0.240, 0.240]], dtype=np_dtype) @@ -208,13 +225,13 @@ class RNNCellTest(test.TestCase): # Test BasicLSTMCell with input_size != num_units. 
x = array_ops.zeros([1, 3], dtype=dtype) m = array_ops.zeros([1, 4], dtype=dtype) - g, out_m = rnn_cell_impl.BasicLSTMCell( - 2, state_is_tuple=False)(x, m) + g, out_m = rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)(x, m) sess.run([variables_lib.global_variables_initializer()]) res = sess.run( - [g, out_m], - {x.name: np.array([[1., 1., 1.]], dtype=np_dtype), - m.name: 0.1 * np.ones([1, 4], dtype=np_dtype)}) + [g, out_m], { + x.name: np.array([[1., 1., 1.]], dtype=np_dtype), + m.name: 0.1 * np.ones([1, 4], dtype=np_dtype) + }) self.assertEqual(len(res), 2) def testBasicLSTMCellDimension0Error(self): @@ -232,9 +249,11 @@ class RNNCellTest(test.TestCase): g, out_m = rnn_cell_impl.BasicLSTMCell( num_units, state_is_tuple=False)(x, m) sess.run([variables_lib.global_variables_initializer()]) - sess.run([g, out_m], - {x.name: 1 * np.ones([batch_size, input_size]), - m.name: 0.1 * np.ones([batch_size - 1, state_size])}) + sess.run( + [g, out_m], { + x.name: 1 * np.ones([batch_size, input_size]), + m.name: 0.1 * np.ones([batch_size - 1, state_size]) + }) def testBasicLSTMCellStateSizeError(self): """Tests that state_size must be num_units * 2.""" @@ -251,9 +270,11 @@ class RNNCellTest(test.TestCase): g, out_m = rnn_cell_impl.BasicLSTMCell( num_units, state_is_tuple=False)(x, m) sess.run([variables_lib.global_variables_initializer()]) - sess.run([g, out_m], - {x.name: 1 * np.ones([batch_size, input_size]), - m.name: 0.1 * np.ones([batch_size, state_size])}) + sess.run( + [g, out_m], { + x.name: 1 * np.ones([batch_size, input_size]), + m.name: 0.1 * np.ones([batch_size, state_size]) + }) def testBasicLSTMCellStateTupleType(self): with self.test_session(): @@ -301,11 +322,12 @@ class RNNCellTest(test.TestCase): state_is_tuple=True) g, (out_m0, out_m1) = cell(x, (m0, m1)) sess.run([variables_lib.global_variables_initializer()]) - res = sess.run([g, out_m0, out_m1], { - x.name: np.array([[1., 1.]]), - m0.name: 0.1 * np.ones([1, 4]), - m1.name: 0.1 * np.ones([1, 4]) - }) + res = sess.run( + [g, out_m0, out_m1], { + x.name: np.array([[1., 1.]]), + m0.name: 0.1 * np.ones([1, 4]), + m1.name: 0.1 * np.ones([1, 4]) + }) self.assertEqual(len(res), 3) # The numbers in results were not calculated, this is just a smoke test. # Note, however, these values should match the original @@ -336,10 +358,11 @@ class RNNCellTest(test.TestCase): state_is_tuple=False) output, state = cell(x, m) sess.run([variables_lib.global_variables_initializer()]) - res = sess.run([output, state], { - x.name: np.array([[1., 1.], [2., 2.], [3., 3.]]), - m.name: 0.1 * np.ones((batch_size, state_size)) - }) + res = sess.run( + [output, state], { + x.name: np.array([[1., 1.], [2., 2.], [3., 3.]]), + m.name: 0.1 * np.ones((batch_size, state_size)) + }) self.assertEqual(len(res), 2) # The numbers in results were not calculated, this is mostly just a # smoke test. @@ -442,10 +465,10 @@ class RNNCellTest(test.TestCase): rnn_cell_impl.GRUCell(3), num_proj=3) g, new_m = cell(x, m) sess.run([variables_lib.global_variables_initializer()]) - res = sess.run( - [g, new_m], - {x.name: np.array([[1., 1.]]), - m.name: np.array([[0.1, 0.1, 0.1]])}) + res = sess.run([g, new_m], { + x.name: np.array([[1., 1.]]), + m.name: np.array([[0.1, 0.1, 0.1]]) + }) self.assertEqual(res[1].shape, (1, 3)) # The numbers in results were not calculated, this is just a smoke test. 
self.assertAllClose(res[0], [[0.154605, 0.154605, 0.154605]]) @@ -479,9 +502,11 @@ class RNNCellTest(test.TestCase): base_cell = rnn_cell_impl.GRUCell(3) g, m_new = base_cell(x, m) variable_scope.get_variable_scope().reuse_variables() + def residual_with_slice_fn(inp, out): inp_sliced = array_ops.slice(inp, [0, 0], [-1, 3]) return inp_sliced + out + g_res, m_new_res = rnn_cell_impl.ResidualWrapper( base_cell, residual_with_slice_fn)(x, m) sess.run([variables_lib.global_variables_initializer()]) @@ -551,10 +576,10 @@ class RNNCellTest(test.TestCase): self.assertEqual(embedding_cell.output_size, 2) g, new_m = embedding_cell(x, m) sess.run([variables_lib.global_variables_initializer()]) - res = sess.run( - [g, new_m], - {x.name: np.array([[1]]), - m.name: np.array([[0.1, 0.1]])}) + res = sess.run([g, new_m], { + x.name: np.array([[1]]), + m.name: np.array([[0.1, 0.1]]) + }) self.assertEqual(res[1].shape, (1, 2)) # The numbers in results were not calculated, this is just a smoke test. self.assertAllClose(res[0], [[0.17139, 0.17139]]) @@ -584,8 +609,8 @@ class RNNCellTest(test.TestCase): x = array_ops.zeros([1, 2]) m = array_ops.zeros([1, 4]) _, ml = rnn_cell_impl.MultiRNNCell( - [rnn_cell_impl.GRUCell(2) - for _ in range(2)], state_is_tuple=False)(x, m) + [rnn_cell_impl.GRUCell(2) for _ in range(2)], + state_is_tuple=False)(x, m) sess.run([variables_lib.global_variables_initializer()]) res = sess.run(ml, { x.name: np.array([[1., 1.]]), @@ -605,19 +630,20 @@ class RNNCellTest(test.TestCase): # Test incorrectness of state with self.assertRaisesRegexp(ValueError, "Expected state .* a tuple"): rnn_cell_impl.MultiRNNCell( - [rnn_cell_impl.GRUCell(2) - for _ in range(2)], state_is_tuple=True)(x, m_bad) + [rnn_cell_impl.GRUCell(2) for _ in range(2)], + state_is_tuple=True)(x, m_bad) _, ml = rnn_cell_impl.MultiRNNCell( - [rnn_cell_impl.GRUCell(2) - for _ in range(2)], state_is_tuple=True)(x, m_good) + [rnn_cell_impl.GRUCell(2) for _ in range(2)], + state_is_tuple=True)(x, m_good) sess.run([variables_lib.global_variables_initializer()]) - res = sess.run(ml, { - x.name: np.array([[1., 1.]]), - m_good[0].name: np.array([[0.1, 0.1]]), - m_good[1].name: np.array([[0.1, 0.1]]) - }) + res = sess.run( + ml, { + x.name: np.array([[1., 1.]]), + m_good[0].name: np.array([[0.1, 0.1]]), + m_good[1].name: np.array([[0.1, 0.1]]) + }) # The numbers in results were not calculated, this is just a # smoke test. However, these numbers should match those of @@ -628,8 +654,11 @@ class RNNCellTest(test.TestCase): class DropoutWrapperTest(test.TestCase): - def _testDropoutWrapper(self, batch_size=None, time_steps=None, - parallel_iterations=None, **kwargs): + def _testDropoutWrapper(self, + batch_size=None, + time_steps=None, + parallel_iterations=None, + **kwargs): with self.test_session() as sess: with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): @@ -640,14 +669,14 @@ class DropoutWrapperTest(test.TestCase): x = constant_op.constant( [[[2., 2., 2.]], [[1., 1., 1.]]], dtype=dtypes.float32) m = rnn_cell_impl.LSTMStateTuple( - *[constant_op.constant([[0.1, 0.1, 0.1]], dtype=dtypes.float32) - ] * 2) + *[constant_op.constant([[0.1, 0.1, 0.1]], dtype=dtypes.float32 + )] * 2) else: x = constant_op.constant( np.random.randn(time_steps, batch_size, 3).astype(np.float32)) m = rnn_cell_impl.LSTMStateTuple(*[ - constant_op.constant( - [[0.1, 0.1, 0.1]] * batch_size, dtype=dtypes.float32) + constant_op. 
+ constant([[0.1, 0.1, 0.1]] * batch_size, dtype=dtypes.float32) ] * 2) outputs, final_state = rnn.dynamic_rnn( cell=rnn_cell_impl.DropoutWrapper( @@ -674,8 +703,8 @@ class DropoutWrapperTest(test.TestCase): res = self._testDropoutWrapper( input_keep_prob=keep, output_keep_prob=keep, state_keep_prob=keep) true_full_output = np.array( - [[[0.751109, 0.751109, 0.751109]], - [[0.895509, 0.895509, 0.895509]]], dtype=np.float32) + [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]], + dtype=np.float32) true_full_final_c = np.array( [[1.949385, 1.949385, 1.949385]], dtype=np.float32) self.assertAllClose(true_full_output, res[0]) @@ -687,8 +716,8 @@ class DropoutWrapperTest(test.TestCase): res = self._testDropoutWrapper( input_keep_prob=keep, output_keep_prob=keep, state_keep_prob=keep) true_full_output = np.array( - [[[0.751109, 0.751109, 0.751109]], - [[0.895509, 0.895509, 0.895509]]], dtype=np.float32) + [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]], + dtype=np.float32) true_full_final_c = np.array( [[1.949385, 1.949385, 1.949385]], dtype=np.float32) self.assertAllClose(true_full_output, res[0]) @@ -703,16 +732,20 @@ class DropoutWrapperTest(test.TestCase): ## consistent across both calls. Otherwise the seed may not end ## up being munged consistently across both graphs. res_standard_1 = self._testDropoutWrapper( - input_keep_prob=keep_some, output_keep_prob=keep_some, - state_keep_prob=keep_some, seed=10, + input_keep_prob=keep_some, + output_keep_prob=keep_some, + state_keep_prob=keep_some, + seed=10, parallel_iterations=1) # Clear away the graph and the test session (which keeps variables around) ops.reset_default_graph() self._ClearCachedSession() random_seed.set_random_seed(2) res_standard_2 = self._testDropoutWrapper( - input_keep_prob=keep_some, output_keep_prob=keep_some, - state_keep_prob=keep_some, seed=10, + input_keep_prob=keep_some, + output_keep_prob=keep_some, + state_keep_prob=keep_some, + seed=10, parallel_iterations=1) self.assertAllClose(res_standard_1[0], res_standard_2[0]) self.assertAllClose(res_standard_1[1].c, res_standard_2[1].c) @@ -722,11 +755,12 @@ class DropoutWrapperTest(test.TestCase): keep_all = variable_scope.get_variable("all", initializer=1.0) keep_none = variable_scope.get_variable("none", initializer=1e-10) res = self._testDropoutWrapper( - input_keep_prob=keep_all, output_keep_prob=keep_none, + input_keep_prob=keep_all, + output_keep_prob=keep_none, state_keep_prob=keep_all) true_full_output = np.array( - [[[0.751109, 0.751109, 0.751109]], - [[0.895509, 0.895509, 0.895509]]], dtype=np.float32) + [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]], + dtype=np.float32) true_full_final_c = np.array( [[1.949385, 1.949385, 1.949385]], dtype=np.float32) self.assertAllClose(np.zeros(res[0].shape), res[0]) @@ -739,13 +773,13 @@ class DropoutWrapperTest(test.TestCase): # Even though we dropout state, by default DropoutWrapper never # drops out the memory ("c") term of an LSTMStateTuple. 
res = self._testDropoutWrapper( - input_keep_prob=keep_all, output_keep_prob=keep_all, + input_keep_prob=keep_all, + output_keep_prob=keep_all, state_keep_prob=keep_none) - true_c_state = np.array( - [[1.713925, 1.713925, 1.713925]], dtype=np.float32) + true_c_state = np.array([[1.713925, 1.713925, 1.713925]], dtype=np.float32) true_full_output = np.array( - [[[0.751109, 0.751109, 0.751109]], - [[0.895509, 0.895509, 0.895509]]], dtype=np.float32) + [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]], + dtype=np.float32) self.assertAllClose(true_full_output[0], res[0][0]) # Second output is modified by zero input state self.assertGreater(np.linalg.norm(true_full_output[1] - res[0][1]), 1e-4) @@ -758,13 +792,14 @@ class DropoutWrapperTest(test.TestCase): keep_all = variable_scope.get_variable("all", initializer=1.0) keep_none = variable_scope.get_variable("none", initializer=1e-10) true_full_output = np.array( - [[[0.751109, 0.751109, 0.751109]], - [[0.895509, 0.895509, 0.895509]]], dtype=np.float32) + [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]], + dtype=np.float32) true_full_final_c = np.array( [[1.949385, 1.949385, 1.949385]], dtype=np.float32) # All outputs are different because inputs are zeroed out res = self._testDropoutWrapper( - input_keep_prob=keep_none, output_keep_prob=keep_all, + input_keep_prob=keep_none, + output_keep_prob=keep_all, state_keep_prob=keep_all) self.assertGreater(np.linalg.norm(res[0] - true_full_output), 1e-4) self.assertGreater(np.linalg.norm(res[1].h - true_full_output[1]), 1e-4) @@ -774,9 +809,13 @@ class DropoutWrapperTest(test.TestCase): keep_some = 0.8 keep_all = variable_scope.get_variable("all", initializer=1.0) res = self._testDropoutWrapper( - input_keep_prob=keep_all, output_keep_prob=keep_some, - state_keep_prob=keep_all, variational_recurrent=True, - input_size=3, batch_size=5, time_steps=7) + input_keep_prob=keep_all, + output_keep_prob=keep_some, + state_keep_prob=keep_all, + variational_recurrent=True, + input_size=3, + batch_size=5, + time_steps=7) # Ensure the same dropout pattern for all time steps output_mask = np.abs(res[0]) > 1e-6 for m in output_mask[1:]: @@ -785,9 +824,13 @@ class DropoutWrapperTest(test.TestCase): def testDropoutWrapperRecurrentStateInputAndOutput(self): keep_some = 0.9 res = self._testDropoutWrapper( - input_keep_prob=keep_some, output_keep_prob=keep_some, - state_keep_prob=keep_some, variational_recurrent=True, - input_size=3, batch_size=5, time_steps=7) + input_keep_prob=keep_some, + output_keep_prob=keep_some, + state_keep_prob=keep_some, + variational_recurrent=True, + input_size=3, + batch_size=5, + time_steps=7) # Smoke test for the state/input masks. 
output_mask = np.abs(res[0]) > 1e-6 @@ -811,17 +854,27 @@ class DropoutWrapperTest(test.TestCase): random_seed.set_random_seed(2347) np.random.seed(23487) res0 = self._testDropoutWrapper( - input_keep_prob=keep_some, output_keep_prob=keep_some, - state_keep_prob=keep_some, variational_recurrent=True, - input_size=3, batch_size=5, time_steps=7, seed=-234987) + input_keep_prob=keep_some, + output_keep_prob=keep_some, + state_keep_prob=keep_some, + variational_recurrent=True, + input_size=3, + batch_size=5, + time_steps=7, + seed=-234987) ops.reset_default_graph() self._ClearCachedSession() random_seed.set_random_seed(2347) np.random.seed(23487) res1 = self._testDropoutWrapper( - input_keep_prob=keep_some, output_keep_prob=keep_some, - state_keep_prob=keep_some, variational_recurrent=True, - input_size=3, batch_size=5, time_steps=7, seed=-234987) + input_keep_prob=keep_some, + output_keep_prob=keep_some, + state_keep_prob=keep_some, + variational_recurrent=True, + input_size=3, + batch_size=5, + time_steps=7, + seed=-234987) output_mask = np.abs(res0[0]) > 1e-6 for time_step in output_mask: @@ -858,9 +911,10 @@ class SlimRNNCellTest(test.TestCase): g, _ = rnn_cell_impl._SlimRNNCell(my_cell)(x, m) # pylint: enable=protected-access sess.run([variables_lib.global_variables_initializer()]) - res = sess.run( - [g], {x.name: np.array([[1., 1.]]), - m.name: np.array([[0.1, 0.1]])}) + res = sess.run([g], { + x.name: np.array([[1., 1.]]), + m.name: np.array([[0.1, 0.1]]) + }) self.assertEqual(res[0].shape, (1, 2)) def testBasicRNNCellMatch(self): diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py index 0258d7202df..57521c6a9ba 100644 --- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py +++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py @@ -45,6 +45,7 @@ from tensorflow.python.platform import test from tensorflow.python.platform import tf_logging from tensorflow.python.util import nest + class Plus1RNNCell(rnn_lib.RNNCell): """RNN Cell generating (output, new_state) = (input + 1, state + 1).""" @@ -160,8 +161,7 @@ class RNNTest(test.TestCase): input_size = 5 max_length = 8 # unrolled up to this length inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size)) + array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size)) ] outputs, state = rnn.static_rnn(cell, inputs, dtype=dtypes.float32) self.assertEqual(len(outputs), len(inputs)) @@ -178,10 +178,9 @@ class RNNTest(test.TestCase): self.assertAllClose(v, input_value + 1.0) # Final state - self.assertAllClose( - values[-1], - max_length * np.ones( - (batch_size, input_size), dtype=np.float32)) + self.assertAllClose(values[-1], + max_length * np.ones( + (batch_size, input_size), dtype=np.float32)) def testDropout(self): cell = Plus1RNNCell() @@ -191,8 +190,7 @@ class RNNTest(test.TestCase): input_size = 5 max_length = 8 inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size)) + array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size)) ] with variable_scope.variable_scope("share_scope"): outputs, state = rnn.static_rnn(cell, inputs, dtype=dtypes.float32) @@ -207,8 +205,10 @@ class RNNTest(test.TestCase): with self.test_session(use_gpu=True) as sess: input_value = np.random.randn(batch_size, input_size) values = sess.run(outputs + [state], feed_dict={inputs[0]: input_value}) - full_dropout_values = sess.run(dropped_outputs, - 
feed_dict={inputs[0]: input_value}) + full_dropout_values = sess.run( + dropped_outputs, feed_dict={ + inputs[0]: input_value + }) for v in values[:-1]: self.assertAllClose(v, input_value + 1.0) @@ -222,8 +222,7 @@ class RNNTest(test.TestCase): input_size = 5 max_length = 8 inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size)) + array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size)) ] with variable_scope.variable_scope("drop_scope"): dynamic_outputs, dynamic_state = rnn.static_rnn( @@ -234,12 +233,16 @@ class RNNTest(test.TestCase): input_value = np.random.randn(batch_size, input_size) dynamic_values = sess.run( dynamic_outputs, - feed_dict={inputs[0]: input_value, - sequence_length: [2, 3]}) + feed_dict={ + inputs[0]: input_value, + sequence_length: [2, 3] + }) dynamic_state_value = sess.run( [dynamic_state], - feed_dict={inputs[0]: input_value, - sequence_length: [2, 3]}) + feed_dict={ + inputs[0]: input_value, + sequence_length: [2, 3] + }) # outputs are fully calculated for t = 0, 1 for v in dynamic_values[:2]: @@ -289,8 +292,7 @@ class RNNTest(test.TestCase): input_size = 5 max_length = 8 # unrolled up to this length inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size)) + array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size)) ] return rnn.static_rnn(cell, inputs, dtype=dtypes.float32, scope=scope) @@ -316,8 +318,7 @@ class LSTMTest(test.TestCase): cell = rnn_cell.LSTMCell( num_units, initializer=initializer, state_is_tuple=False) inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size)) + array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size)) ] outputs, _ = rnn.static_rnn(cell, inputs, dtype=dtypes.float32) self.assertEqual(len(outputs), len(inputs)) @@ -343,8 +344,7 @@ class LSTMTest(test.TestCase): initializer=initializer, state_is_tuple=False) inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size)) + array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size)) ] outputs, _ = rnn.static_rnn(cell, inputs, dtype=dtypes.float32) self.assertEqual(len(outputs), len(inputs)) @@ -374,8 +374,7 @@ class LSTMTest(test.TestCase): initializer=initializer, state_is_tuple=False) inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size)) + array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size)) ] with variable_scope.variable_scope("share_scope"): outputs, state = rnn.static_state_saving_rnn( @@ -388,7 +387,9 @@ class LSTMTest(test.TestCase): input_value = np.random.randn(batch_size, input_size) (last_state_value, saved_state_value) = sess.run( [state, state_saver.saved_state["save_lstm"]], - feed_dict={inputs[0]: input_value}) + feed_dict={ + inputs[0]: input_value + }) self.assertAllEqual(last_state_value, saved_state_value) def testNoProjNoShardingTupleStateSaver(self): @@ -406,8 +407,7 @@ class LSTMTest(test.TestCase): initializer=initializer, state_is_tuple=True) inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size)) + array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size)) ] with variable_scope.variable_scope("share_scope"): outputs, state = rnn.static_state_saving_rnn( @@ -420,7 +420,9 @@ class LSTMTest(test.TestCase): input_value = np.random.randn(batch_size, input_size) last_and_saved_states = sess.run( state + (state_saver.saved_state["c"], 
state_saver.saved_state["m"]), - feed_dict={inputs[0]: input_value}) + feed_dict={ + inputs[0]: input_value + }) self.assertEqual(4, len(last_and_saved_states)) self.assertAllEqual(last_and_saved_states[:2], last_and_saved_states[2:]) @@ -432,16 +434,17 @@ class LSTMTest(test.TestCase): with self.test_session(graph=ops_lib.Graph()) as sess: initializer = init_ops.random_uniform_initializer( -0.01, 0.01, seed=self._seed) - state_saver = TestStateSaver(batch_size, { - "c0": num_units, - "m0": num_units, - "c1": num_units + 1, - "m1": num_units + 1, - "c2": num_units + 2, - "m2": num_units + 2, - "c3": num_units + 3, - "m3": num_units + 3 - }) + state_saver = TestStateSaver( + batch_size, { + "c0": num_units, + "m0": num_units, + "c1": num_units + 1, + "m1": num_units + 1, + "c2": num_units + 2, + "m2": num_units + 2, + "c3": num_units + 3, + "m3": num_units + 3 + }) def _cell(i): return rnn_cell.LSTMCell( @@ -459,8 +462,7 @@ class LSTMTest(test.TestCase): self.assertEqual(len(cell.state_size[i]), 2) inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size)) + array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size)) ] state_names = (("c0", "m0"), ("c1", "m1"), ("c2", "m2"), ("c3", "m3")) @@ -475,10 +477,15 @@ class LSTMTest(test.TestCase): variables_lib.global_variables_initializer().run() input_value = np.random.randn(batch_size, input_size) - last_states = sess.run(list(nest.flatten(state)), - feed_dict={inputs[0]: input_value}) - saved_states = sess.run(list(state_saver.saved_state.values()), - feed_dict={inputs[0]: input_value}) + last_states = sess.run( + list(nest.flatten(state)), feed_dict={ + inputs[0]: input_value + }) + saved_states = sess.run( + list(state_saver.saved_state.values()), + feed_dict={ + inputs[0]: input_value + }) self.assertEqual(8, len(last_states)) self.assertEqual(8, len(saved_states)) flat_state_names = nest.flatten(state_names) @@ -499,8 +506,7 @@ class LSTMTest(test.TestCase): initializer = init_ops.random_uniform_initializer( -0.01, 0.01, seed=self._seed) inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(None, input_size)) + array_ops.placeholder(dtypes.float32, shape=(None, input_size)) ] cell = rnn_cell.LSTMCell( num_units, @@ -526,8 +532,7 @@ class LSTMTest(test.TestCase): initializer = init_ops.random_uniform_initializer( -0.01, 0.01, seed=self._seed) inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(None, input_size)) + array_ops.placeholder(dtypes.float32, shape=(None, input_size)) ] cell_notuple = rnn_cell.LSTMCell( num_units, @@ -569,14 +574,20 @@ class LSTMTest(test.TestCase): variables_lib.global_variables_initializer().run() input_value = np.random.randn(batch_size, input_size) - outputs_notuple_v = sess.run(outputs_notuple, - feed_dict={inputs[0]: input_value}) - outputs_tuple_v = sess.run(outputs_tuple, - feed_dict={inputs[0]: input_value}) + outputs_notuple_v = sess.run( + outputs_notuple, feed_dict={ + inputs[0]: input_value + }) + outputs_tuple_v = sess.run( + outputs_tuple, feed_dict={ + inputs[0]: input_value + }) self.assertAllEqual(outputs_notuple_v, outputs_tuple_v) - (state_notuple_v,) = sess.run((state_notuple,), - feed_dict={inputs[0]: input_value}) + (state_notuple_v,) = sess.run( + (state_notuple,), feed_dict={ + inputs[0]: input_value + }) state_tuple_v = sess.run(state_tuple, feed_dict={inputs[0]: input_value}) self.assertAllEqual(state_notuple_v, np.hstack(state_tuple_v)) @@ -593,8 +604,7 @@ class LSTMTest(test.TestCase): -0.01, 
0.01, seed=self._seed) inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(None, input_size)) + array_ops.placeholder(dtypes.float32, shape=(None, input_size)) ] cell = rnn_cell.LSTMCell( @@ -625,8 +635,7 @@ class LSTMTest(test.TestCase): with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess: initializer = init_ops.random_uniform_initializer(-1, 1, seed=self._seed) inputs = max_length * [ - array_ops.placeholder( - dtypes.float64, shape=(None, input_size)) + array_ops.placeholder(dtypes.float64, shape=(None, input_size)) ] cell = rnn_cell.LSTMCell( @@ -661,8 +670,7 @@ class LSTMTest(test.TestCase): max_length = 8 with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess: inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(None, input_size)) + array_ops.placeholder(dtypes.float32, shape=(None, input_size)) ] initializer = init_ops.constant_initializer(0.001) @@ -721,8 +729,7 @@ class LSTMTest(test.TestCase): initializer = init_ops.random_uniform_initializer( -0.01, 0.01, seed=self._seed) inputs = max_length * [ - array_ops.placeholder( - dtypes.float64, shape=(None, input_size)) + array_ops.placeholder(dtypes.float64, shape=(None, input_size)) ] cell = rnn_cell.LSTMCell( @@ -743,16 +750,21 @@ class LSTMTest(test.TestCase): self.assertEqual(len(outputs), len(inputs)) - variables_lib.global_variables_initializer().run( - feed_dict={sequence_length: [2, 3]}) + variables_lib.global_variables_initializer().run(feed_dict={ + sequence_length: [2, 3] + }) input_value = np.asarray( np.random.randn(batch_size, input_size), dtype=np.float64) values = sess.run( - outputs, feed_dict={inputs[0]: input_value, - sequence_length: [2, 3]}) + outputs, feed_dict={ + inputs[0]: input_value, + sequence_length: [2, 3] + }) state_value = sess.run( - [state], feed_dict={inputs[0]: input_value, - sequence_length: [2, 3]}) + [state], feed_dict={ + inputs[0]: input_value, + sequence_length: [2, 3] + }) self.assertEqual(values[0].dtype, input_value.dtype) self.assertEqual(state_value[0].dtype, input_value.dtype) @@ -767,8 +779,7 @@ class LSTMTest(test.TestCase): initializer_d = init_ops.random_uniform_initializer( -1, 1, seed=self._seed + 1) inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(None, input_size)) + array_ops.placeholder(dtypes.float32, shape=(None, input_size)) ] cell = rnn_cell.LSTMCell( num_units, @@ -792,8 +803,10 @@ class LSTMTest(test.TestCase): variables_lib.global_variables_initializer().run() input_value = np.random.randn(batch_size, input_size) - output_values = sess.run(outputs0 + outputs1 + outputs2, - feed_dict={inputs[0]: input_value}) + output_values = sess.run( + outputs0 + outputs1 + outputs2, feed_dict={ + inputs[0]: input_value + }) outputs0_values = output_values[:max_length] outputs1_values = output_values[max_length:2 * max_length] outputs2_values = output_values[2 * max_length:] @@ -814,8 +827,7 @@ class LSTMTest(test.TestCase): with self.test_session(graph=ops_lib.Graph()) as sess: initializer = init_ops.random_uniform_initializer(-1, 1, seed=self._seed) inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(None, input_size)) + array_ops.placeholder(dtypes.float32, shape=(None, input_size)) ] cell = rnn_cell.LSTMCell( num_units, @@ -833,8 +845,10 @@ class LSTMTest(test.TestCase): variables_lib.global_variables_initializer().run() input_value = np.random.randn(batch_size, input_size) - output_values = sess.run(outputs0 + outputs1, - feed_dict={inputs[0]: input_value}) + 
output_values = sess.run( + outputs0 + outputs1, feed_dict={ + inputs[0]: input_value + }) outputs0_values = output_values[:max_length] outputs1_values = output_values[max_length:] self.assertEqual(len(outputs0_values), len(outputs1_values)) @@ -861,8 +875,7 @@ class LSTMTest(test.TestCase): -0.01, 0.01, seed=self._seed) if in_graph_mode: inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(None, input_size)) + array_ops.placeholder(dtypes.float32, shape=(None, input_size)) ] else: inputs = max_length * [ @@ -939,8 +952,7 @@ class LSTMTest(test.TestCase): -0.01, 0.01, seed=self._seed) if in_graph_mode: inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(None, input_size)) + array_ops.placeholder(dtypes.float32, shape=(None, input_size)) ] else: inputs = max_length * [ @@ -1100,8 +1112,8 @@ class LSTMTest(test.TestCase): # Test gradients to inputs and variables w.r.t. outputs & final state static_grad_values = sess.run(static_gradients, feed_dict=feeds) - static_individual_grad_values = sess.run(static_individual_gradients, - feed_dict=feeds) + static_individual_grad_values = sess.run( + static_individual_gradients, feed_dict=feeds) static_individual_var_grad_values = sess.run( static_individual_variable_gradients, feed_dict=feeds) @@ -1148,8 +1160,10 @@ class LSTMTest(test.TestCase): # Generate gradients of several individual outputs w.r.t. inputs dynamic_individual_gradients = nest.flatten([ gradients_impl.gradients(y, [concat_inputs]) - for y in - [split_outputs_dynamic[0], split_outputs_dynamic[-1], state_dynamic] + for y in [ + split_outputs_dynamic[0], split_outputs_dynamic[-1], + state_dynamic + ] ]) # Generate gradients of individual variables w.r.t. inputs @@ -1159,8 +1173,10 @@ class LSTMTest(test.TestCase): "Count of trainable variables: %d" % len(trainable_variables)) dynamic_individual_variable_gradients = nest.flatten([ gradients_impl.gradients(y, trainable_variables) - for y in - [split_outputs_dynamic[0], split_outputs_dynamic[-1], state_dynamic] + for y in [ + split_outputs_dynamic[0], split_outputs_dynamic[-1], + state_dynamic + ] ]) # Test forward pass @@ -1170,8 +1186,8 @@ class LSTMTest(test.TestCase): # Test gradients to inputs and variables w.r.t. 
outputs & final state dynamic_grad_values = sess.run(dynamic_gradients, feed_dict=feeds) - dynamic_individual_grad_values = sess.run(dynamic_individual_gradients, - feed_dict=feeds) + dynamic_individual_grad_values = sess.run( + dynamic_individual_gradients, feed_dict=feeds) dynamic_individual_var_grad_values = sess.run( dynamic_individual_variable_gradients, feed_dict=feeds) @@ -1207,8 +1223,8 @@ class LSTMTest(test.TestCase): for i, (a, b) in enumerate( zip(static_individual_var_grad_values, dynamic_individual_var_grad_values)): - tf_logging.info("Comparing individual variable gradients iteration %d" % - i) + tf_logging.info( + "Comparing individual variable gradients iteration %d" % i) self.assertAllEqual(a, b) @test_util.run_in_graph_and_eager_modes() @@ -1223,10 +1239,7 @@ class BidirectionalRNNTest(test.TestCase): self._seed = 23489 np.random.seed(self._seed) - def _createBidirectionalRNN(self, - use_shape, - use_sequence_length, - scope=None): + def _createBidirectionalRNN(self, use_shape, use_sequence_length, scope=None): num_units = 3 input_size = 5 batch_size = 2 @@ -1270,8 +1283,10 @@ class BidirectionalRNNTest(test.TestCase): # Run with pre-specified sequence length of 2, 3 out, s_fw, s_bw = sess.run( [outputs, state_fw, state_bw], - feed_dict={inputs[0]: input_value, - sequence_length: [2, 3]}) + feed_dict={ + inputs[0]: input_value, + sequence_length: [2, 3] + }) # Since the forward and backward LSTM cells were initialized with the # same parameters, the forward and backward output has to be the same, @@ -1312,8 +1327,10 @@ class BidirectionalRNNTest(test.TestCase): input_value, inputs, outputs, state_fw, state_bw, _ = ( self._createBidirectionalRNN(use_shape, False)) variables_lib.global_variables_initializer().run() - out, s_fw, s_bw = sess.run([outputs, state_fw, state_bw], - feed_dict={inputs[0]: input_value}) + out, s_fw, s_bw = sess.run( + [outputs, state_fw, state_bw], feed_dict={ + inputs[0]: input_value + }) # Since the forward and backward LSTM cells were initialized with the # same parameters, the forward and backward output has to be the same, @@ -1396,13 +1413,11 @@ class BidirectionalRNNTest(test.TestCase): use_time_major, use_sequence_length): with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess: input_value, inputs, outputs, state_fw, state_bw, sequence_length = ( - self._createBidirectionalDynamicRNN(use_shape, - use_state_tuple, use_time_major, - use_sequence_length)) + self._createBidirectionalDynamicRNN( + use_shape, use_state_tuple, use_time_major, use_sequence_length)) variables_lib.global_variables_initializer().run() # Run with pre-specified sequence length of 2, 3 - feed_dict = ( - {sequence_length: [2, 3]} if use_sequence_length else {}) + feed_dict = ({sequence_length: [2, 3]} if use_sequence_length else {}) feed_dict.update({inputs[0]: input_value}) if use_state_tuple: out, c_fw, m_fw, c_bw, m_bw = sess.run( @@ -1538,8 +1553,7 @@ class MultiDimensionalLSTMTest(test.TestCase): sequence_length = [4, 6] with self.test_session(graph=ops_lib.Graph()) as sess: inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(None,) + input_size) + array_ops.placeholder(dtypes.float32, shape=(None,) + input_size) ] inputs_using_dim = max_length * [ array_ops.placeholder( @@ -1585,14 +1599,22 @@ class MultiDimensionalLSTMTest(test.TestCase): input_total_size = (batch_size,) + input_size input_value = np.random.randn(*input_total_size) - outputs_static_v = sess.run(outputs_static, - feed_dict={inputs[0]: input_value}) - 
outputs_dynamic_v = sess.run(outputs_dynamic, - feed_dict={inputs[0]: input_value}) - outputs_bid_v = sess.run(outputs_bid, - feed_dict={inputs_using_dim[0]: input_value}) - outputs_sav_v = sess.run(outputs_sav, - feed_dict={inputs_using_dim[0]: input_value}) + outputs_static_v = sess.run( + outputs_static, feed_dict={ + inputs[0]: input_value + }) + outputs_dynamic_v = sess.run( + outputs_dynamic, feed_dict={ + inputs[0]: input_value + }) + outputs_bid_v = sess.run( + outputs_bid, feed_dict={ + inputs_using_dim[0]: input_value + }) + outputs_sav_v = sess.run( + outputs_sav, feed_dict={ + inputs_using_dim[0]: input_value + }) self.assertAllEqual(outputs_static_v, outputs_dynamic_v) self.assertAllEqual(outputs_static_v, outputs_sav_v) @@ -1602,16 +1624,26 @@ class MultiDimensionalLSTMTest(test.TestCase): outputs_bid_array = np.array(outputs_bid_v) self.assertAllEqual(outputs_static_array_double, outputs_bid_array) - state_static_v = sess.run(state_static, - feed_dict={inputs[0]: input_value}) - state_dynamic_v = sess.run(state_dynamic, - feed_dict={inputs[0]: input_value}) - state_bid_fw_v = sess.run(state_fw, - feed_dict={inputs_using_dim[0]: input_value}) - state_bid_bw_v = sess.run(state_bw, - feed_dict={inputs_using_dim[0]: input_value}) - state_sav_v = sess.run(state_sav, - feed_dict={inputs_using_dim[0]: input_value}) + state_static_v = sess.run( + state_static, feed_dict={ + inputs[0]: input_value + }) + state_dynamic_v = sess.run( + state_dynamic, feed_dict={ + inputs[0]: input_value + }) + state_bid_fw_v = sess.run( + state_fw, feed_dict={ + inputs_using_dim[0]: input_value + }) + state_bid_bw_v = sess.run( + state_bw, feed_dict={ + inputs_using_dim[0]: input_value + }) + state_sav_v = sess.run( + state_sav, feed_dict={ + inputs_using_dim[0]: input_value + }) self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_dynamic_v)) self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_sav_v)) self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_bid_fw_v)) @@ -1633,16 +1665,17 @@ class NestedLSTMTest(test.TestCase): with self.test_session(graph=ops_lib.Graph()) as sess: state_saver = TestStateSaver(batch_size, state_size) single_input = (array_ops.placeholder( - dtypes.float32, shape=(None, input_size)), array_ops.placeholder( - dtypes.float32, shape=(None, input_size))) + dtypes.float32, shape=(None, input_size)), + array_ops.placeholder( + dtypes.float32, shape=(None, input_size))) inputs = max_length * [single_input] inputs_c = (array_ops.stack([input_[0] for input_ in inputs]), array_ops.stack([input_[1] for input_ in inputs])) - single_input_using_dim = ( - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size)), - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size))) + single_input_using_dim = (array_ops.placeholder( + dtypes.float32, shape=(batch_size, input_size)), + array_ops.placeholder( + dtypes.float32, + shape=(batch_size, input_size))) inputs_using_dim = max_length * [single_input_using_dim] # Create a cell for the whole test. 
This is fine because the cell has no @@ -1688,14 +1721,22 @@ class NestedLSTMTest(test.TestCase): input_total_size = (batch_size, input_size) input_value = (np.random.randn(*input_total_size), np.random.randn(*input_total_size)) - outputs_dynamic_v = sess.run(outputs_dynamic, - feed_dict={single_input: input_value}) - outputs_static_v = sess.run(outputs_static, - feed_dict={single_input: input_value}) - outputs_sav_v = sess.run(outputs_sav, - feed_dict={single_input_using_dim: input_value}) - outputs_bid_v = sess.run(outputs_bid, - feed_dict={single_input_using_dim: input_value}) + outputs_dynamic_v = sess.run( + outputs_dynamic, feed_dict={ + single_input: input_value + }) + outputs_static_v = sess.run( + outputs_static, feed_dict={ + single_input: input_value + }) + outputs_sav_v = sess.run( + outputs_sav, feed_dict={ + single_input_using_dim: input_value + }) + outputs_bid_v = sess.run( + outputs_bid, feed_dict={ + single_input_using_dim: input_value + }) self.assertAllEqual(outputs_static_v, np.transpose(outputs_dynamic_v, (1, 0, 2, 3))) @@ -1706,16 +1747,26 @@ class NestedLSTMTest(test.TestCase): outputs_bid_array = np.array(outputs_bid_v) self.assertAllEqual(outputs_static_array_double, outputs_bid_array) - state_dynamic_v = sess.run(state_dynamic, - feed_dict={single_input: input_value}) - state_static_v = sess.run(state_static, - feed_dict={single_input: input_value}) - state_bid_fw_v = sess.run(state_fw, - feed_dict={single_input_using_dim: input_value}) - state_bid_bw_v = sess.run(state_bw, - feed_dict={single_input_using_dim: input_value}) - state_sav_v = sess.run(state_sav, - feed_dict={single_input_using_dim: input_value}) + state_dynamic_v = sess.run( + state_dynamic, feed_dict={ + single_input: input_value + }) + state_static_v = sess.run( + state_static, feed_dict={ + single_input: input_value + }) + state_bid_fw_v = sess.run( + state_fw, feed_dict={ + single_input_using_dim: input_value + }) + state_bid_bw_v = sess.run( + state_bw, feed_dict={ + single_input_using_dim: input_value + }) + state_sav_v = sess.run( + state_sav, feed_dict={ + single_input_using_dim: input_value + }) self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_dynamic_v)) self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_sav_v)) self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_bid_fw_v)) @@ -1764,8 +1815,7 @@ class StateSaverRNNTest(test.TestCase): initializer=initializer, state_is_tuple=False) inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size)) + array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size)) ] return rnn.static_state_saving_rnn( cell, @@ -1931,8 +1981,10 @@ class RawRNNTest(test.TestCase): (outputs_val, outputs_dynamic_rnn_val, final_state_val, final_state_dynamic_rnn_val) = sess.run( [outputs, outputs_dynamic_rnn, final_state, final_state_dynamic_rnn], - feed_dict={inputs: rand_input, - sequence_length: rand_seq_len}) + feed_dict={ + inputs: rand_input, + sequence_length: rand_seq_len + }) self.assertAllClose(outputs_dynamic_rnn_val, outputs_val) self.assertAllClose(final_state_dynamic_rnn_val, final_state_val) @@ -1945,12 +1997,16 @@ class RawRNNTest(test.TestCase): self.assertEqual(len(gradients), len(gradients_dynamic_rnn)) gradients_val = sess.run( gradients, - feed_dict={inputs: rand_input, - sequence_length: rand_seq_len}) + feed_dict={ + inputs: rand_input, + sequence_length: rand_seq_len + }) gradients_dynamic_rnn_val = sess.run( gradients_dynamic_rnn, - feed_dict={inputs: rand_input, - 
sequence_length: rand_seq_len}) + feed_dict={ + inputs: rand_input, + sequence_length: rand_seq_len + }) self.assertEqual(len(gradients_val), len(gradients_dynamic_rnn_val)) input_gradients_val = gradients_val[0] input_gradients_dynamic_rnn_val = gradients_dynamic_rnn_val[0] @@ -2067,14 +2123,13 @@ class RawRNNTest(test.TestCase): def loop_fn(time_, cell_output, cell_state, _): if cell_output is None: - emit_output = (array_ops.zeros( - [2, 3], dtype=dtypes.int32), array_ops.zeros( - [unknown_dim], dtype=dtypes.int64)) + emit_output = (array_ops.zeros([2, 3], dtype=dtypes.int32), + array_ops.zeros([unknown_dim], dtype=dtypes.int64)) next_state = cell.zero_state(batch_size, dtypes.float32) else: - emit_output = (array_ops.ones( - [batch_size, 2, 3], dtype=dtypes.int32), array_ops.ones( - [batch_size, unknown_dim], dtype=dtypes.int64)) + emit_output = (array_ops.ones([batch_size, 2, 3], dtype=dtypes.int32), + array_ops.ones( + [batch_size, unknown_dim], dtype=dtypes.int64)) next_state = cell_state elements_finished = array_ops.tile([time_ >= max_time], [batch_size]) finished = math_ops.reduce_all(elements_finished) @@ -2193,8 +2248,8 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase): cell = rnn_cell.LSTMCell(num_units, use_peepholes=True) gpu_cell = DeviceWrapperCell(cell, cell_device) - inputs = np.random.randn(batch_size, time_steps, - input_size).astype(np.float32) + inputs = np.random.randn(batch_size, time_steps, input_size).astype( + np.float32) sequence_length = np.random.randint(0, time_steps, size=batch_size) if input_device is not None: @@ -2262,8 +2317,7 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase): gpu_dev = test.gpu_device_name() run_metadata = self._execute_rnn_on( - rnn_device="/cpu:0", cell_device="/cpu:0", - input_device=gpu_dev) + rnn_device="/cpu:0", cell_device="/cpu:0", input_device=gpu_dev) cpu_stats, gpu_stats = self._retrieve_cpu_gpu_stats(run_metadata) def _assert_in(op_str, in_stats, out_stats): @@ -2278,8 +2332,7 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase): return # Test requires access to a GPU gpu_dev = test.gpu_device_name() - run_metadata = self._execute_rnn_on( - input_device=gpu_dev) + run_metadata = self._execute_rnn_on(input_device=gpu_dev) cpu_stats, gpu_stats = self._retrieve_cpu_gpu_stats(run_metadata) def _assert_in(op_str, in_stats, out_stats): diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py index 8a3894ef9d7..7b883ebc5d7 100644 --- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py +++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py @@ -1545,97 +1545,6 @@ class BenchmarkLSTMCellXLA(test.Benchmark): ])) -class WeightNormLSTMCellTest(test.TestCase): - """Compared cell output with pre-calculated values.""" - - def _cell_output(self, cell): - """Calculate cell output""" - - with self.test_session() as sess: - init = init_ops.constant_initializer(0.5) - with variable_scope.variable_scope("root", initializer=init): - x = array_ops.zeros([1, 2]) - c0 = array_ops.zeros([1, 2]) - h0 = array_ops.zeros([1, 2]) - - state0 = rnn_cell.LSTMStateTuple(c0, h0) - - xout, sout = cell()(x, state0) - - sess.run([variables.global_variables_initializer()]) - res = sess.run( - [xout, sout], { - x.name: np.array([[1., 1.]]), - c0.name: 0.1 * np.asarray([[0, 1]]), - h0.name: 0.1 * np.asarray([[2, 3]]), - }) - - actual_state_c = res[1].c - actual_state_h = res[1].h - - return actual_state_c, actual_state_h - - def testBasicCell(self): - 
"""Tests cell w/o peepholes and w/o normalisation""" - - def cell(): - return contrib_rnn_cell.WeightNormLSTMCell( - 2, norm=False, use_peepholes=False) - - actual_c, actual_h = self._cell_output(cell) - - expected_c = np.array([[0.65937078, 0.74983585]]) - expected_h = np.array([[0.44923624, 0.49362513]]) - - self.assertAllClose(expected_c, actual_c, 1e-5) - self.assertAllClose(expected_h, actual_h, 1e-5) - - def testNonbasicCell(self): - """Tests cell with peepholes and w/o normalisation""" - - def cell(): - return contrib_rnn_cell.WeightNormLSTMCell( - 2, norm=False, use_peepholes=True) - - actual_c, actual_h = self._cell_output(cell) - - expected_c = np.array([[0.65937084, 0.7574988]]) - expected_h = np.array([[0.4792085, 0.53470564]]) - - self.assertAllClose(expected_c, actual_c, 1e-5) - self.assertAllClose(expected_h, actual_h, 1e-5) - - def testBasicCellWithNorm(self): - """Tests cell w/o peepholes and with normalisation""" - - def cell(): - return contrib_rnn_cell.WeightNormLSTMCell( - 2, norm=True, use_peepholes=False) - - actual_c, actual_h = self._cell_output(cell) - - expected_c = np.array([[0.50125383, 0.58805949]]) - expected_h = np.array([[0.32770363, 0.37397948]]) - - self.assertAllClose(expected_c, actual_c, 1e-5) - self.assertAllClose(expected_h, actual_h, 1e-5) - - def testNonBasicCellWithNorm(self): - """Tests cell with peepholes and with normalisation""" - - def cell(): - return contrib_rnn_cell.WeightNormLSTMCell( - 2, norm=True, use_peepholes=True) - - actual_c, actual_h = self._cell_output(cell) - - expected_c = np.array([[0.50125383, 0.59587258]]) - expected_h = np.array([[0.35041603, 0.40873795]]) - - self.assertAllClose(expected_c, actual_c, 1e-5) - self.assertAllClose(expected_h, actual_h, 1e-5) - - class WeightNormLSTMCellTest(test.TestCase): """Compared cell output with pre-calculated values.""" diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py index 8adf5dce6ec..5fee2e93e4e 100644 --- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py +++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py @@ -2729,25 +2729,9 @@ class SRUCell(rnn_cell_impl._LayerRNNCell): input_depth = inputs_shape[1].value - # Here the contributor believes that the following constraints - # are implied. The reasoning is explained here with reference to - # the paper https://arxiv.org/pdf/1709.02755.pdf upon which this - # implementation is based. - # In section 2.1 Equation 5, specifically: - # h_t = r_t \odot g(c_t) + (1 - r_t) \odot x_t - # the pointwise operation between r_t and x_t means they have - # the same shape (since we are implementing an RNN cell, braodcasting - # does not happen to input of a single timestep); by the same - # reasons, x_t has the same shape as h_t, essentially mandating that - # input_depth = unit_num. 
- if input_depth != self._num_units: - raise ValueError("SRU requires input_depth == num_units, got " - "input_depth = %s, num_units = %s" % (input_depth, - self._num_units)) - self._kernel = self.add_variable( rnn_cell_impl._WEIGHTS_VARIABLE_NAME, - shape=[input_depth, 3 * self._num_units]) + shape=[input_depth, 4 * self._num_units]) self._bias = self.add_variable( rnn_cell_impl._BIAS_VARIABLE_NAME, @@ -2760,8 +2744,8 @@ class SRUCell(rnn_cell_impl._LayerRNNCell): """Simple recurrent unit (SRU) with num_units cells.""" U = math_ops.matmul(inputs, self._kernel) - x_bar, f_intermediate, r_intermediate = array_ops.split( - value=U, num_or_size_splits=3, axis=1) + x_bar, f_intermediate, r_intermediate, x_tx = array_ops.split( + value=U, num_or_size_splits=4, axis=1) f_r = math_ops.sigmoid( nn_ops.bias_add( @@ -2769,7 +2753,7 @@ class SRUCell(rnn_cell_impl._LayerRNNCell): f, r = array_ops.split(value=f_r, num_or_size_splits=2, axis=1) c = f * state + (1.0 - f) * x_bar - h = r * self._activation(c) + (1.0 - r) * inputs + h = r * self._activation(c) + (1.0 - r) * x_tx return h, c diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py index 95dea312f3a..0a53fd66dbe 100644 --- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py +++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py @@ -331,7 +331,7 @@ def _luong_score(query, keys, scale): # batched matmul on: # [batch_size, 1, depth] . [batch_size, depth, max_time] # resulting in an output shape of: - # [batch_time, 1, max_time]. + # [batch_size, 1, max_time]. # we then squeeze out the center singleton dimension. score = math_ops.matmul(query, keys, transpose_b=True) score = array_ops.squeeze(score, [1]) @@ -924,8 +924,7 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism): _monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode, seed=sigmoid_noise_seed) super(LuongMonotonicAttention, self).__init__( - query_layer=layers_core.Dense( - num_units, name="query_layer", use_bias=False, dtype=dtype), + query_layer=None, memory_layer=layers_core.Dense( num_units, name="memory_layer", use_bias=False, dtype=dtype), memory=memory, diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py index 6d8f786223f..3245cc5e721 100644 --- a/tensorflow/contrib/seq2seq/python/ops/helper.py +++ b/tensorflow/contrib/seq2seq/python/ops/helper.py @@ -72,14 +72,6 @@ class Helper(object): """ raise NotImplementedError("batch_size has not been implemented") - @abc.abstractproperty - def input_shape(self): - """Shape of each input element in batch. - - Returns a `TensorShape`. - """ - raise NotImplementedError("input_shape has not been implemented") - @abc.abstractproperty def sample_ids_shape(self): """Shape of tensor returned by `sample`, excluding the batch dimension. 
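The helper.py hunk above drops the abstract `input_shape` property from `Helper`, so the per-step input shape is no longer exposed through the interface. A minimal sketch of one way a caller could still recover it, assuming a batch-major helper instance named `helper` (hypothetical name, not part of this diff) and mirroring the `next_inputs.shape[1:]` logic that the removed code used:

    # Hypothetical illustration only: derive the per-step input shape from the
    # helper's first inputs instead of the removed Helper.input_shape property.
    first_finished, first_inputs = helper.initialize()
    per_step_input_shape = first_inputs.shape[1:]  # static TensorShape, batch dim dropped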
@@ -135,7 +127,6 @@ class CustomHelper(Helper): self._sample_fn = sample_fn self._next_inputs_fn = next_inputs_fn self._batch_size = None - self._input_shape = None self._sample_ids_shape = tensor_shape.TensorShape(sample_ids_shape or []) self._sample_ids_dtype = sample_ids_dtype or dtypes.int32 @@ -158,8 +149,6 @@ class CustomHelper(Helper): (finished, next_inputs) = self._initialize_fn() if self._batch_size is None: self._batch_size = array_ops.size(finished) - if self._input_shape is None: - self._input_shape = next_inputs.shape[1:] return (finished, next_inputs) def sample(self, time, outputs, state, name=None): @@ -211,23 +200,6 @@ class TrainingHelper(Helper): lambda inp: array_ops.zeros_like(inp[0, :]), inputs) self._batch_size = array_ops.size(sequence_length) - self._input_shape = inputs.shape[2:] - - @property - def batch_size(self): - return self._batch_size - - @property - def input_shape(self): - return self._input_shape - - @property - def sample_ids_shape(self): - return tensor_shape.TensorShape([]) - - @property - def sample_ids_dtype(self): - return dtypes.int32 @property def inputs(self): @@ -237,6 +209,18 @@ class TrainingHelper(Helper): def sequence_length(self): return self._sequence_length + @property + def batch_size(self): + return self._batch_size + + @property + def sample_ids_shape(self): + return tensor_shape.TensorShape([]) + + @property + def sample_ids_dtype(self): + return dtypes.int32 + def initialize(self, name=None): with ops.name_scope(name, "TrainingHelperInitialize"): finished = math_ops.equal(0, self._sequence_length) @@ -541,16 +525,11 @@ class GreedyEmbeddingHelper(Helper): if self._end_token.get_shape().ndims != 0: raise ValueError("end_token must be a scalar") self._start_inputs = self._embedding_fn(self._start_tokens) - self._input_shape = self._start_inputs.shape[1:] @property def batch_size(self): return self._batch_size - @property - def input_shape(self): - return self._input_shape - @property def sample_ids_shape(self): return tensor_shape.TensorShape([]) @@ -662,8 +641,6 @@ class InferenceHelper(Helper): self._sample_dtype = sample_dtype self._next_inputs_fn = next_inputs_fn self._batch_size = array_ops.shape(start_inputs)[0] - self._input_shape = start_inputs.shape[1:] - self._start_inputs = ops.convert_to_tensor( start_inputs, name="start_inputs") @@ -671,10 +648,6 @@ class InferenceHelper(Helper): def batch_size(self): return self._batch_size - @property - def input_shape(self): - return self._input_shape - @property def sample_ids_shape(self): return self._sample_shape diff --git a/tensorflow/contrib/session_bundle/bundle_shim.py b/tensorflow/contrib/session_bundle/bundle_shim.py index 3149875e41f..69db594f8ae 100644 --- a/tensorflow/contrib/session_bundle/bundle_shim.py +++ b/tensorflow/contrib/session_bundle/bundle_shim.py @@ -82,7 +82,8 @@ def _convert_default_signature_to_signature_def(signatures): """ default_signature = signatures.default_signature signature_def = meta_graph_pb2.SignatureDef() - if default_signature.WhichOneof("type") == legacy_constants.REGRESSION_SIGNATURE: + if (default_signature.WhichOneof("type") == + legacy_constants.REGRESSION_SIGNATURE): regression_signature = default_signature.regression_signature signature_def.method_name = signature_constants.REGRESS_METHOD_NAME _add_input_to_signature_def(regression_signature.input.tensor_name, @@ -91,7 +92,8 @@ def _convert_default_signature_to_signature_def(signatures): _add_output_to_signature_def(regression_signature.output.tensor_name, 
signature_constants.REGRESS_OUTPUTS, signature_def) - elif default_signature.WhichOneof("type") == legacy_constants.CLASSIFICATION_SIGNATURE: + elif (default_signature.WhichOneof("type") == + legacy_constants.CLASSIFICATION_SIGNATURE): classification_signature = default_signature.classification_signature signature_def.method_name = signature_constants.CLASSIFY_METHOD_NAME _add_input_to_signature_def(classification_signature.input.tensor_name, @@ -132,8 +134,10 @@ def _convert_named_signatures_to_signature_def(signatures): signature_constants.PREDICT_OUTPUTS] # TODO(pdudnik): what if there are other signatures? Mimic cr/140900781 once # it is submitted. - if (input_signature.WhichOneof("type") != legacy_constants.GENERIC_SIGNATURE or - output_signature.WhichOneof("type") != legacy_constants.GENERIC_SIGNATURE): + if (input_signature.WhichOneof("type") != + legacy_constants.GENERIC_SIGNATURE or + output_signature.WhichOneof("type") != + legacy_constants.GENERIC_SIGNATURE): raise RuntimeError("Named input and output signatures can only be " "up-converted if they are generic signature. " "Input signature type is %s, output signature type is " diff --git a/tensorflow/contrib/session_bundle/bundle_shim_test.cc b/tensorflow/contrib/session_bundle/bundle_shim_test.cc index 72f32a0f555..9a1dd9303f4 100644 --- a/tensorflow/contrib/session_bundle/bundle_shim_test.cc +++ b/tensorflow/contrib/session_bundle/bundle_shim_test.cc @@ -493,17 +493,15 @@ TEST(BundleShimTest, DefaultAndNamedSignatureWithPredict) { ASSERT_FALSE( actual_signature_def_predict->second.inputs().find("foo-input") == actual_signature_def_predict->second.inputs().end()); - EXPECT_EQ("foo-input", - actual_signature_def_predict->second.inputs() - .find("foo-input") - ->second.name()); + EXPECT_EQ("foo-input", actual_signature_def_predict->second.inputs() + .find("foo-input") + ->second.name()); ASSERT_FALSE( actual_signature_def_predict->second.outputs().find("foo-output") == actual_signature_def_predict->second.outputs().end()); - EXPECT_EQ("foo-output", - actual_signature_def_predict->second.outputs() - .find("foo-output") - ->second.name()); + EXPECT_EQ("foo-output", actual_signature_def_predict->second.outputs() + .find("foo-output") + ->second.name()); EXPECT_EQ(kPredictMethodName, actual_signature_def_predict->second.method_name()); } diff --git a/tensorflow/contrib/session_bundle/exporter.py b/tensorflow/contrib/session_bundle/exporter.py index f6f663aae76..08983337fcc 100644 --- a/tensorflow/contrib/session_bundle/exporter.py +++ b/tensorflow/contrib/session_bundle/exporter.py @@ -281,11 +281,12 @@ class Exporter(object): tmp_export_dir = compat.as_text(export_dir) + "-tmp" gfile.MakeDirs(tmp_export_dir) - self._saver.save(sess, - os.path.join( - compat.as_text(tmp_export_dir), - compat.as_text(constants.EXPORT_BASE_NAME)), - meta_graph_suffix=constants.EXPORT_SUFFIX_NAME) + self._saver.save( + sess, + os.path.join( + compat.as_text(tmp_export_dir), + compat.as_text(constants.EXPORT_BASE_NAME)), + meta_graph_suffix=constants.EXPORT_SUFFIX_NAME) # Run the asset callback. if self._assets_callback and self._assets_to_copy: @@ -301,12 +302,12 @@ class Exporter(object): if exports_to_keep: # create a simple parser that pulls the export_version from the directory. 
def parser(path): - if os.name == 'nt': - match = re.match("^" + export_dir_base.replace('\\','/') + "/(\\d{8})$", - path.path.replace('\\','/')) + if os.name == "nt": + match = re.match( + "^" + export_dir_base.replace("\\", "/") + "/(\\d{8})$", + path.path.replace("\\", "/")) else: - match = re.match("^" + export_dir_base + "/(\\d{8})$", - path.path) + match = re.match("^" + export_dir_base + "/(\\d{8})$", path.path) if not match: return None return path._replace(export_version=int(match.group(1))) diff --git a/tensorflow/contrib/session_bundle/signature.cc b/tensorflow/contrib/session_bundle/signature.cc index 7133875ad53..ed70a5b91b2 100644 --- a/tensorflow/contrib/session_bundle/signature.cc +++ b/tensorflow/contrib/session_bundle/signature.cc @@ -38,9 +38,9 @@ namespace { Status BatchSizesMatch(const Tensor& input, const Tensor& output) { // Ensure the number of outputs match the number of inputs. if (input.dim_size(0) != output.dim_size(0)) { - return errors::Internal( - strings::StrCat("Input batch size did not match output batch size: ", - input.dim_size(0), " vs. ", output.dim_size(0))); + return errors::Internal(strings::StrCat( + "Input batch size did not match output batch size: ", input.dim_size(0), + " vs. ", output.dim_size(0))); } return Status::OK(); } @@ -100,8 +100,8 @@ Status GetNamedClassificationSignature( const auto& it = signatures.named_signatures().find(name); if (it == signatures.named_signatures().end()) { return errors::NotFound( - strings::StrCat("Missing signature named \"", name, "\" in: ", - DebugStringIfAvailable(signatures))); + strings::StrCat("Missing signature named \"", name, + "\" in: ", DebugStringIfAvailable(signatures))); } if (!it->second.has_classification_signature()) { return errors::FailedPrecondition( @@ -232,8 +232,8 @@ Status GetNamedSignature(const string& name, const auto& it = signatures.named_signatures().find(name); if (it == signatures.named_signatures().end()) { return errors::NotFound( - strings::StrCat("Missing signature named \"", name, "\" in: ", - DebugStringIfAvailable(signatures))); + strings::StrCat("Missing signature named \"", name, + "\" in: ", DebugStringIfAvailable(signatures))); } *signature = it->second; return Status::OK(); diff --git a/tensorflow/contrib/slim/python/slim/learning_test.py b/tensorflow/contrib/slim/python/slim/learning_test.py index 4e816f9b11b..831c6e427ae 100644 --- a/tensorflow/contrib/slim/python/slim/learning_test.py +++ b/tensorflow/contrib/slim/python/slim/learning_test.py @@ -197,9 +197,7 @@ class MultiplyGradientsTest(test.TestCase): gradient = constant_op.constant(self._grad_vec, dtype=dtypes.float32) variable = variables_lib.Variable(array_ops.zeros_like(gradient)) multiplier_flag = variables_lib.Variable(True) - tensor_multiplier = array_ops.where(multiplier_flag, - self._multiplier, - 1.0) + tensor_multiplier = array_ops.where(multiplier_flag, self._multiplier, 1.0) grad_to_var = (gradient, variable) gradient_multipliers = {variable: tensor_multiplier} @@ -212,11 +210,8 @@ class MultiplyGradientsTest(test.TestCase): sess.run(multiplier_flag.assign(False)) gradient_false_flag = sess.run(grad_to_var[0]) np_testing.assert_almost_equal(gradient_true_flag, - self._multiplied_grad_vec, - 5) - np_testing.assert_almost_equal(gradient_false_flag, - self._grad_vec, - 5) + self._multiplied_grad_vec, 5) + np_testing.assert_almost_equal(gradient_false_flag, self._grad_vec, 5) def LogisticClassifier(inputs): @@ -502,6 +497,7 @@ class TrainTest(test.TestCase): purpose. 
""" dump_root = tempfile.mkdtemp() + def dumping_wrapper(sess): # pylint: disable=invalid-name return dumping_wrapper_lib.DumpingDebugWrapperSession(sess, dump_root) @@ -519,16 +515,13 @@ class TrainTest(test.TestCase): train_op = learning.create_train_op(total_loss, optimizer) loss = learning.train( - train_op, - None, - number_of_steps=1, - session_wrapper=dumping_wrapper) + train_op, None, number_of_steps=1, session_wrapper=dumping_wrapper) self.assertIsNotNone(loss) run_root = glob.glob(os.path.join(dump_root, 'run_*'))[-1] dump = debug_data.DebugDumpDir(run_root) - self.assertAllEqual( - 0, dump.get_tensors('global_step', 0, 'DebugIdentity')[0]) + self.assertAllEqual(0, + dump.get_tensors('global_step', 0, 'DebugIdentity')[0]) def testTrainWithTrace(self): logdir = os.path.join( @@ -961,8 +954,8 @@ class TrainTest(test.TestCase): self.assertGreater(losses[0], losses[1]) def testTrainWithEpochLimit(self): - logdir = os.path.join(tempfile.mkdtemp(prefix=self.get_temp_dir()), - 'tmp_logs') + logdir = os.path.join( + tempfile.mkdtemp(prefix=self.get_temp_dir()), 'tmp_logs') with ops.Graph().as_default(): random_seed.set_random_seed(0) tf_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32) @@ -982,7 +975,8 @@ class TrainTest(test.TestCase): self.assertIsNotNone(loss) self.assertLess(loss, .015) self.assertTrue(os.path.isfile('{}/model.ckpt-300.index'.format(logdir))) - self.assertTrue(os.path.isfile('{}/model.ckpt-300.data-00000-of-00001'.format(logdir))) + self.assertTrue( + os.path.isfile('{}/model.ckpt-300.data-00000-of-00001'.format(logdir))) if __name__ == '__main__': diff --git a/tensorflow/contrib/summary/summary_ops.py b/tensorflow/contrib/summary/summary_ops.py index ee661dfdc11..a6968d8b2a6 100644 --- a/tensorflow/contrib/summary/summary_ops.py +++ b/tensorflow/contrib/summary/summary_ops.py @@ -202,7 +202,7 @@ def create_file_writer(logdir, if flush_millis is None: flush_millis = constant_op.constant(2 * 60 * 1000) if filename_suffix is None: - filename_suffix = constant_op.constant("") + filename_suffix = constant_op.constant(".v2") return _make_summary_writer( name, gen_summary_ops.create_summary_file_writer, diff --git a/tensorflow/contrib/tensor_forest/client/random_forest.py b/tensorflow/contrib/tensor_forest/client/random_forest.py index a998ac1e111..4abcc20ed33 100644 --- a/tensorflow/contrib/tensor_forest/client/random_forest.py +++ b/tensorflow/contrib/tensor_forest/client/random_forest.py @@ -18,7 +18,7 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib import layers - +from tensorflow.contrib.learn.python.learn.estimators import constants from tensorflow.contrib.learn.python.learn.estimators import estimator from tensorflow.contrib.learn.python.learn.estimators import head as head_lib from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib @@ -43,8 +43,8 @@ from tensorflow.python.training import training_util KEYS_NAME = 'keys' LOSS_NAME = 'rf_training_loss' TREE_PATHS_PREDICTION_KEY = 'tree_paths' -VARIANCE_PREDICTION_KEY = 'regression_variance' - +VARIANCE_PREDICTION_KEY = 'prediction_variance' +ALL_SERVING_KEY = 'tensorforest_all' EPSILON = 0.000001 @@ -134,7 +134,8 @@ def get_model_fn(params, trainer_id=0, report_feature_importances=False, local_eval=False, - head_scope=None): + head_scope=None, + include_all_in_serving=False): """Return a model function given a way to construct a graph builder.""" if model_head is None: model_head = get_default_head(params, weights_name) @@ 
-238,7 +239,13 @@ def get_model_fn(params, model_ops.predictions[TREE_PATHS_PREDICTION_KEY] = tree_paths model_ops.predictions[VARIANCE_PREDICTION_KEY] = regression_variance - + if include_all_in_serving: + # In order to serve the variance we need to add the prediction dict + # to output_alternatives dict. + if not model_ops.output_alternatives: + model_ops.output_alternatives = {} + model_ops.output_alternatives[ALL_SERVING_KEY] = ( + constants.ProblemType.UNSPECIFIED, model_ops.predictions) return model_ops return _model_fn @@ -293,7 +300,8 @@ class TensorForestEstimator(estimator.Estimator): report_feature_importances=False, local_eval=False, version=None, - head=None): + head=None, + include_all_in_serving=False): """Initializes a TensorForestEstimator instance. Args: @@ -339,6 +347,23 @@ class TensorForestEstimator(estimator.Estimator): version: Unused. head: A heads_lib.Head object that calculates losses and such. If None, one will be automatically created based on params. + include_all_in_serving: if True, allow preparation of the complete + prediction dict including the variance to be exported for serving with + the Servo lib; and it also requires calling export_savedmodel with + default_output_alternative_key=ALL_SERVING_KEY, i.e. + estimator.export_savedmodel(export_dir_base=your_export_dir, + serving_input_fn=your_export_input_fn, + default_output_alternative_key=ALL_SERVING_KEY) + if False, resort to default behavior, i.e. export scores and + probabilities but no variances. In this case + default_output_alternative_key should be None while calling + export_savedmodel(). + Note, that due to backward compatibility we cannot always set + include_all_in_serving to True because in this case calling + export_saved_model() without + default_output_alternative_key=ALL_SERVING_KEY (legacy behavior) the + saved_model_export_utils.get_output_alternatives() would raise + ValueError. Returns: A `TensorForestEstimator` instance. @@ -357,7 +382,9 @@ class TensorForestEstimator(estimator.Estimator): num_trainers=num_trainers, trainer_id=trainer_id, report_feature_importances=report_feature_importances, - local_eval=local_eval), + local_eval=local_eval, + include_all_in_serving=include_all_in_serving, + ), model_dir=model_dir, config=config, feature_engineering_fn=feature_engineering_fn) diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc index 76cfb4c9ca0..cf0db788a41 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc +++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc @@ -99,18 +99,17 @@ class HardRoutingFunction : public OpKernel { const Tensor& tree_biases_tensor = context->input(2); if (input_data.shape().dim_size(0) > 0) { - OP_REQUIRES(context, input_data.shape().dims() == 2, - errors::InvalidArgument( - "input_data should be two-dimensional")); + OP_REQUIRES( + context, input_data.shape().dims() == 2, + errors::InvalidArgument("input_data should be two-dimensional")); } // Check tensor bounds. 
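A minimal usage sketch of the include_all_in_serving flag added above, following the call pattern the new docstring spells out; params, your_export_dir and your_export_input_fn are placeholders rather than names introduced by this change.

    from tensorflow.contrib.tensor_forest.client import random_forest

    # With include_all_in_serving=True the complete prediction dict, including
    # the variance, is exposed under ALL_SERVING_KEY in output_alternatives.
    estimator = random_forest.TensorForestEstimator(
        params, include_all_in_serving=True)

    # The export must then select that alternative explicitly.
    estimator.export_savedmodel(
        export_dir_base=your_export_dir,
        serving_input_fn=your_export_input_fn,
        default_output_alternative_key=random_forest.ALL_SERVING_KEY)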
if (!CheckTensorBounds(context, input_data)) return; - const int32 num_data = static_cast( - input_data.shape().dim_size(0)); - const int32 num_features = static_cast( - input_data.shape().dim_size(1)); + const int32 num_data = static_cast(input_data.shape().dim_size(0)); + const int32 num_features = + static_cast(input_data.shape().dim_size(1)); Tensor* output_probability = nullptr; TensorShape output_probability_shape; @@ -125,9 +124,8 @@ class HardRoutingFunction : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, output_probability_shape, &output_probability)); - OP_REQUIRES_OK(context, - context->allocate_output(1, output_path_shape, - &output_path)); + OP_REQUIRES_OK( + context, context->allocate_output(1, output_path_shape, &output_path)); auto out_probability = output_probability->tensor(); auto out_path = output_path->tensor(); @@ -144,12 +142,11 @@ class HardRoutingFunction : public OpKernel { out_probability(i, 0) = 1.0; out_path(i, 0) = 0; for (int j = 0; j < tree_depth_ - 1; j++) { - float left_prob = LeftProbability(point, - tree_parameters_tensor.Slice(j, j+1), - tree_biases(j), - num_features); + float left_prob = + LeftProbability(point, tree_parameters_tensor.Slice(j, j + 1), + tree_biases(j), num_features); - int32 left_child = 2*node + 1; + int32 left_child = 2 * node + 1; int32 right_child = left_child + 1; float dot_product = 0.0; diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_gradient_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_gradient_op.cc index 28f50f1a32e..f64155fa55a 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_gradient_op.cc +++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_gradient_op.cc @@ -85,12 +85,9 @@ REGISTER_OP("KFeatureGradient") class KFeatureGradient : public OpKernel { public: - explicit KFeatureGradient(OpKernelConstruction* context) - : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("layer_num", - &layer_num_)); - OP_REQUIRES_OK(context, context->GetAttr("random_seed", - &random_seed_)); + explicit KFeatureGradient(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("layer_num", &layer_num_)); + OP_REQUIRES_OK(context, context->GetAttr("random_seed", &random_seed_)); } void Compute(OpKernelContext* context) override { @@ -101,14 +98,14 @@ class KFeatureGradient : public OpKernel { const Tensor& routing_tensor = context->input(3); // Extract dimensions from input tensors. - const int32 num_data = static_cast( - input_data_tensor.shape().dim_size(0)); - const int32 num_features = static_cast( - input_data_tensor.shape().dim_size(1)); - const int32 num_nodes = static_cast( - tree_parameters_tensor.shape().dim_size(0)); - const int32 num_features_per_node = static_cast( - tree_parameters_tensor.shape().dim_size(1)); + const int32 num_data = + static_cast(input_data_tensor.shape().dim_size(0)); + const int32 num_features = + static_cast(input_data_tensor.shape().dim_size(1)); + const int32 num_nodes = + static_cast(tree_parameters_tensor.shape().dim_size(0)); + const int32 num_features_per_node = + static_cast(tree_parameters_tensor.shape().dim_size(1)); // Construct output tensors. 
Tensor* out_routes = nullptr; @@ -127,12 +124,12 @@ class KFeatureGradient : public OpKernel { out_weights_shape.AddDim(num_nodes); out_weights_shape.AddDim(num_features_per_node); - OP_REQUIRES_OK(context, context->allocate_output( - 0, out_routes_shape, &out_routes)); - OP_REQUIRES_OK(context, context->allocate_output( - 1, out_data_shape, &out_data)); - OP_REQUIRES_OK(context, context->allocate_output( - 2, out_weights_shape, &out_weights)); + OP_REQUIRES_OK(context, + context->allocate_output(0, out_routes_shape, &out_routes)); + OP_REQUIRES_OK(context, + context->allocate_output(1, out_data_shape, &out_data)); + OP_REQUIRES_OK( + context, context->allocate_output(2, out_weights_shape, &out_weights)); tensorforest::Initialize(*out_data, 0.0f); @@ -148,18 +145,13 @@ class KFeatureGradient : public OpKernel { std::vector feature_set; for (int i = 0; i < num_data; i++) { - const Tensor point = input_data_tensor.Slice(i, i+1); + const Tensor point = input_data_tensor.Slice(i, i + 1); feature_set.clear(); // Traverse the tree from the bottom up. for (int j = num_nodes - 1; j >= 0; j--) { - tensorforest::GetFeatureSet( - layer_num_, - j, - random_seed_, - num_features, - num_features_per_node, - &feature_set); + tensorforest::GetFeatureSet(layer_num_, j, random_seed_, num_features, + num_features_per_node, &feature_set); // Compute routing gradient. // j is a leaf node. @@ -170,12 +162,8 @@ class KFeatureGradient : public OpKernel { int32 right_child = left_child + 1; float left_prob = LeftProbabilityK( - point, - feature_set, - tree_parameters_tensor.Slice(j, j+1), - tree_biases(j), - num_features, - num_features_per_node); + point, feature_set, tree_parameters_tensor.Slice(j, j + 1), + tree_biases(j), num_features, num_features_per_node); float right_prob = 1.0f - left_prob; diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_routing_function_op.cc index 9bc42eb61fa..e7cafb144da 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_routing_function_op.cc +++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_routing_function_op.cc @@ -43,7 +43,6 @@ using shape_inference::ShapeHandle; using tensorforest::CheckTensorBounds; using tensorforest::LeftProbabilityK; - // The term 'routing function' is synonymous with 'the probability // that an instance is routed to each leaf node.' It is defined in // 'Deep Neural Decision Forests' by Kontschieder et al. @@ -96,10 +95,8 @@ class KFeatureRoutingFunction : public OpKernel { OP_REQUIRES_OK(context, context->GetAttr("max_nodes", &max_nodes_)); OP_REQUIRES_OK(context, context->GetAttr("num_features_per_node", &num_features_per_node_)); - OP_REQUIRES_OK(context, context->GetAttr("layer_num", - &layer_num_)); - OP_REQUIRES_OK(context, context->GetAttr("random_seed", - &random_seed_)); + OP_REQUIRES_OK(context, context->GetAttr("layer_num", &layer_num_)); + OP_REQUIRES_OK(context, context->GetAttr("random_seed", &random_seed_)); } void Compute(OpKernelContext* context) override { @@ -108,27 +105,25 @@ class KFeatureRoutingFunction : public OpKernel { const Tensor& tree_biases_tensor = context->input(2); if (input_data.shape().dim_size(0) > 0) { - OP_REQUIRES(context, input_data.shape().dims() == 2, - errors::InvalidArgument( - "input_data should be two-dimensional")); + OP_REQUIRES( + context, input_data.shape().dims() == 2, + errors::InvalidArgument("input_data should be two-dimensional")); } // Check tensor bounds. 
if (!CheckTensorBounds(context, input_data)) return; - const int32 num_data = static_cast( - input_data.shape().dim_size(0)); - const int32 num_features = static_cast( - input_data.shape().dim_size(1)); + const int32 num_data = static_cast(input_data.shape().dim_size(0)); + const int32 num_features = + static_cast(input_data.shape().dim_size(1)); Tensor* output_probabilities = nullptr; TensorShape output_shape; output_shape.AddDim(num_data); output_shape.AddDim(max_nodes_); - OP_REQUIRES_OK(context, - context->allocate_output(0, output_shape, - &output_probabilities)); + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, + &output_probabilities)); auto out_probs = output_probabilities->tensor(); const auto tree_biases = tree_biases_tensor.tensor(); @@ -136,30 +131,22 @@ class KFeatureRoutingFunction : public OpKernel { // Iteratively compute the probability of reaching each leaf. std::vector feature_set; for (int i = 0; i < num_data; i++) { - const Tensor point = input_data.Slice(i, i+1); + const Tensor point = input_data.Slice(i, i + 1); out_probs(i, 0) = 1.0f; for (int j = 0; j < max_nodes_ / 2; j++) { feature_set.clear(); - tensorforest::GetFeatureSet( - layer_num_, - i, - random_seed_, - num_features, - num_features_per_node_, - &feature_set); + tensorforest::GetFeatureSet(layer_num_, i, random_seed_, num_features, + num_features_per_node_, &feature_set); - int32 left_child = 2*j + 1; + int32 left_child = 2 * j + 1; int32 right_child = left_child + 1; float prob = out_probs(i, j); - float left_prob = LeftProbabilityK(point, - feature_set, - tree_parameters_tensor.Slice(j, j+1), - tree_biases(j), - num_features, - num_features_per_node_); + float left_prob = LeftProbabilityK( + point, feature_set, tree_parameters_tensor.Slice(j, j + 1), + tree_biases(j), num_features, num_features_per_node_); out_probs(i, left_child) = prob * left_prob; out_probs(i, right_child) = prob * (1.0f - left_prob); diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/routing_function_op.cc index 4027e732b3f..0c2eaabe8f3 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/routing_function_op.cc +++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/routing_function_op.cc @@ -90,46 +90,43 @@ class RoutingFunction : public OpKernel { const Tensor& tree_biases_tensor = context->input(2); if (input_data.shape().dim_size(0) > 0) { - OP_REQUIRES(context, input_data.shape().dims() == 2, - errors::InvalidArgument( - "input_data should be two-dimensional")); + OP_REQUIRES( + context, input_data.shape().dims() == 2, + errors::InvalidArgument("input_data should be two-dimensional")); } // Check tensor bounds. 
if (!CheckTensorBounds(context, input_data)) return; - const int32 num_data = static_cast( - input_data.shape().dim_size(0)); - const int32 num_features = static_cast( - input_data.shape().dim_size(1)); + const int32 num_data = static_cast(input_data.shape().dim_size(0)); + const int32 num_features = + static_cast(input_data.shape().dim_size(1)); Tensor* output_probabilities = nullptr; TensorShape output_shape; output_shape.AddDim(num_data); output_shape.AddDim(max_nodes_); - OP_REQUIRES_OK(context, - context->allocate_output(0, output_shape, - &output_probabilities)); + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, + &output_probabilities)); auto out_probs = output_probabilities->tensor(); const auto tree_biases = tree_biases_tensor.tensor(); // Iteratively compute the probability of reaching each leaf. for (int i = 0; i < num_data; i++) { - const Tensor point = input_data.Slice(i, i+1); + const Tensor point = input_data.Slice(i, i + 1); out_probs(i, 0) = 1.0; for (int j = 0; j < max_nodes_ / 2; j++) { - int32 left_child = 2*j + 1; + int32 left_child = 2 * j + 1; int32 right_child = left_child + 1; float prob = out_probs(i, j); - float left_prob = LeftProbability(point, - tree_parameters_tensor.Slice(j, j+1), - tree_biases(j), - num_features); + float left_prob = + LeftProbability(point, tree_parameters_tensor.Slice(j, j + 1), + tree_biases(j), num_features); out_probs(i, left_child) = prob * left_prob; out_probs(i, right_child) = prob * (1.0 - left_prob); diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc index 66aa293dc1c..c9df09bfda4 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc +++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc @@ -96,10 +96,9 @@ class StochasticHardRoutingFunction : public OpKernel { explicit StochasticHardRoutingFunction(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("tree_depth", &tree_depth_)); - OP_REQUIRES_OK(context, context->GetAttr("random_seed", - &random_seed_)); + OP_REQUIRES_OK(context, context->GetAttr("random_seed", &random_seed_)); single_rand_ = std::unique_ptr( - new random::PhiloxRandom(random_seed_)); + new random::PhiloxRandom(random_seed_)); rng_ = std::unique_ptr( new random::SimplePhilox(single_rand_.get())); } @@ -111,20 +110,19 @@ class StochasticHardRoutingFunction : public OpKernel { const Tensor& tree_biases_tensor = context->input(2); if (input_data.shape().dim_size(0) > 0) { - OP_REQUIRES(context, input_data.shape().dims() == 2, - errors::InvalidArgument( - "input_data should be two-dimensional")); + OP_REQUIRES( + context, input_data.shape().dims() == 2, + errors::InvalidArgument("input_data should be two-dimensional")); } // Check tensor bounds. 
if (!CheckTensorBounds(context, input_data)) return; - const int32 num_data = static_cast( - input_data.shape().dim_size(0)); - const int32 num_features = static_cast( - input_data.shape().dim_size(1)); - const int32 num_nodes = static_cast( - tree_parameters_tensor.shape().dim_size(0)); + const int32 num_data = static_cast(input_data.shape().dim_size(0)); + const int32 num_features = + static_cast(input_data.shape().dim_size(1)); + const int32 num_nodes = + static_cast(tree_parameters_tensor.shape().dim_size(0)); Tensor* output_probability = nullptr; TensorShape output_probability_shape; @@ -139,9 +137,8 @@ class StochasticHardRoutingFunction : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, output_probability_shape, &output_probability)); - OP_REQUIRES_OK(context, - context->allocate_output(1, output_path_shape, - &output_path)); + OP_REQUIRES_OK( + context, context->allocate_output(1, output_path_shape, &output_path)); auto out_probability = output_probability->tensor(); auto out_path = output_path->tensor(); @@ -150,19 +147,18 @@ class StochasticHardRoutingFunction : public OpKernel { // Stochastically traverse the tree to a leaf. for (int i = 0; i < num_data; i++) { - const Tensor point = input_data.Slice(i, i+1); + const Tensor point = input_data.Slice(i, i + 1); int32 node = 0; out_probability(i, 0) = 1.0; out_path(i, 0) = 0; for (int j = 0; j < tree_depth_ - 1; j++) { - int32 left_child = 2*node + 1; + int32 left_child = 2 * node + 1; int32 right_child = left_child + 1; - float left_prob = LeftProbability(point, - tree_parameters_tensor.Slice(j, j+1), - tree_biases(j), - num_features); + float left_prob = + LeftProbability(point, tree_parameters_tensor.Slice(j, j + 1), + tree_biases(j), num_features); if (left_prob < rng_->RandFloat()) { CHECK_LT(i, num_data); diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc index 0b5afe464f4..b0d8b832b54 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc +++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc @@ -149,14 +149,14 @@ class StochasticHardRoutingGradient : public OpKernel { TensorShape output_bias_shape; output_bias_shape.AddDim(num_data); - OP_REQUIRES_OK(context, context->allocate_output( - 0, output_routing_shape, &output_routing)); - OP_REQUIRES_OK(context, context->allocate_output( - 1, output_data_shape, &output_data)); - OP_REQUIRES_OK(context, context->allocate_output( - 2, output_parameters_shape, &output_parameters)); - OP_REQUIRES_OK(context, context->allocate_output( - 3, output_bias_shape, &output_bias)); + OP_REQUIRES_OK(context, context->allocate_output(0, output_routing_shape, + &output_routing)); + OP_REQUIRES_OK( + context, context->allocate_output(1, output_data_shape, &output_data)); + OP_REQUIRES_OK(context, context->allocate_output(2, output_parameters_shape, + &output_parameters)); + OP_REQUIRES_OK( + context, context->allocate_output(3, output_bias_shape, &output_bias)); tensorforest::Initialize(*output_routing, 0.0); tensorforest::Initialize(*output_data, 0.0); @@ -178,7 +178,7 @@ class StochasticHardRoutingGradient : public OpKernel { const Tensor point = input_data.Slice(i, i + 1); // Traverses the tree from the bottom up. 
- for (int j = tree_depth_-1; j > -1; j--) { + for (int j = tree_depth_ - 1; j > -1; j--) { int32 node = path(i, j); CHECK_LT(node, num_nodes); diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc index cacad03e274..25825a78a14 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc +++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc @@ -64,8 +64,7 @@ REGISTER_OP("UnpackPath") class UnpackPath : public OpKernel { public: - explicit UnpackPath(OpKernelConstruction* context) - : OpKernel(context) {} + explicit UnpackPath(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { VLOG(1) << "unpack start"; @@ -73,8 +72,8 @@ class UnpackPath : public OpKernel { const Tensor& path_values_tensor = context->input(1); const int32 num_data = static_cast(path_tensor.shape().dim_size(0)); - const int32 tree_depth = static_cast( - path_tensor.shape().dim_size(1)); + const int32 tree_depth = + static_cast(path_tensor.shape().dim_size(1)); const int32 num_nodes = MathUtil::IPow(2, tree_depth) - 1; @@ -107,7 +106,6 @@ class UnpackPath : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("UnpackPath").Device(DEVICE_CPU), - UnpackPath); +REGISTER_KERNEL_BUILDER(Name("UnpackPath").Device(DEVICE_CPU), UnpackPath); } // namespace tensorflow diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.cc index c091a73c4e4..34388fe1aab 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.cc +++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.cc @@ -25,9 +25,7 @@ namespace tensorforest { using tensorflow::Tensor; -float LeftProbability(const Tensor& point, - const Tensor& weight, - float bias, +float LeftProbability(const Tensor& point, const Tensor& weight, float bias, int num_features) { const auto p = point.unaligned_flat(); const auto w = weight.unaligned_flat(); @@ -41,11 +39,8 @@ float LeftProbability(const Tensor& point, return 1.0 / (1.0 + exp(-dot_product + bias)); } -float LeftProbabilityK(const Tensor& point, - std::vector feature_set, - const Tensor& weight, - float bias, - int num_features, +float LeftProbabilityK(const Tensor& point, std::vector feature_set, + const Tensor& weight, float bias, int num_features, int k) { const auto p = point.unaligned_flat(); const auto w = weight.unaligned_flat(); diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h b/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h index c5902184f95..69a0143a4e3 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h +++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h @@ -24,16 +24,11 @@ namespace tensorflow { namespace tensorforest { // Returns the probability that the point falls to the left. 
-float LeftProbability(const Tensor& point, - const Tensor& weight, - float bias, +float LeftProbability(const Tensor& point, const Tensor& weight, float bias, int num_features); -float LeftProbabilityK(const Tensor& point, - std::vector feature_set, - const Tensor& weight, - float bias, - int num_features, +float LeftProbabilityK(const Tensor& point, std::vector feature_set, + const Tensor& weight, float bias, int num_features, int k); // Returns a random set of num_features_to_pick features in the @@ -49,5 +44,3 @@ void GetFeatureSet(int32 tree_num, int32 node_num, int32 random_seed, } // namespace tensorflow #endif // LEARNING_LIB_TENSOR_FOREST_HYBRID_CORE_OPS_UTILS_H_ - - diff --git a/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc b/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc index 47b49a379c4..b21a9179777 100644 --- a/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc +++ b/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc @@ -30,15 +30,13 @@ namespace tensorflow { using tensorforest::CheckTensorBounds; - float Convert(const string& in) { const std::size_t intval = std::hash()(in); return static_cast(intval); } - -void Evaluate(const Tensor& input_data, Tensor output_data, - int32 start, int32 end) { +void Evaluate(const Tensor& input_data, Tensor output_data, int32 start, + int32 end) { auto out_data = output_data.unaligned_flat(); const auto in_data = input_data.unaligned_flat(); @@ -59,9 +57,8 @@ class ReinterpretStringToFloat : public OpKernel { if (!CheckTensorBounds(context, input_data)) return; Tensor* output_data = nullptr; - OP_REQUIRES_OK(context, - context->allocate_output(0, input_data.shape(), - &output_data)); + OP_REQUIRES_OK( + context, context->allocate_output(0, input_data.shape(), &output_data)); // Evaluate input data in parallel. 
const int32 num_data = static_cast(input_data.NumElements()); @@ -73,8 +70,8 @@ class ReinterpretStringToFloat : public OpKernel { auto work = [&input_data, output_data, num_data](int64 start, int64 end) { CHECK(start <= end); CHECK(end <= num_data); - Evaluate(input_data, *output_data, - static_cast(start), static_cast(end)); + Evaluate(input_data, *output_data, static_cast(start), + static_cast(end)); }; Shard(num_threads, worker_threads->workers, num_data, 100, work); } diff --git a/tensorflow/contrib/tensor_forest/kernels/scatter_add_ndim_op.cc b/tensorflow/contrib/tensor_forest/kernels/scatter_add_ndim_op.cc index dd2a98b08cd..60740c2be37 100644 --- a/tensorflow/contrib/tensor_forest/kernels/scatter_add_ndim_op.cc +++ b/tensorflow/contrib/tensor_forest/kernels/scatter_add_ndim_op.cc @@ -22,7 +22,6 @@ #include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/platform/logging.h" - namespace tensorflow { using tensorforest::CheckTensorBounds; @@ -38,20 +37,19 @@ class ScatterAddNdim : public OpKernel { if (indices_tensor.shape().dim_size(0) > 0) { OP_REQUIRES(context, indices_tensor.shape().dims() == 2, - errors::InvalidArgument( - "indices should be two-dimensional")); + errors::InvalidArgument("indices should be two-dimensional")); const int32 delta_dims = deltas_tensor.shape().dims(); OP_REQUIRES( context, indices_tensor.shape().dim_size(1) + delta_dims == - input_tensor.shape().dims() + 1, + input_tensor.shape().dims() + 1, errors::InvalidArgument( "Number of indices dimensions should be the same as input " "rank.")); OP_REQUIRES( context, indices_tensor.shape().dim_size(0) == - deltas_tensor.shape().dim_size(0), + deltas_tensor.shape().dim_size(0), errors::InvalidArgument( "Number of updates should be same as number of indices.")); } else { @@ -68,8 +66,8 @@ class ScatterAddNdim : public OpKernel { const auto indices = indices_tensor.tensor(); const auto deltas = deltas_tensor.unaligned_flat(); - const int32 num_dims = static_cast( - indices_tensor.shape().dim_size(1)); + const int32 num_dims = + static_cast(indices_tensor.shape().dim_size(1)); // Figure out if indices don't specify a complete position in the // input tensor. @@ -80,10 +78,9 @@ class ScatterAddNdim : public OpKernel { // Calculate index multipliers. std::vector multipliers; - OP_REQUIRES( - context, input.size() < std::numeric_limits::max(), - errors::InvalidArgument( - "Input must contain less than 2^31 total elements")); + OP_REQUIRES(context, input.size() < std::numeric_limits::max(), + errors::InvalidArgument( + "Input must contain less than 2^31 total elements")); int32 last_size = static_cast(input.size()); for (int32 j = 0; j < num_dims; j++) { diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc index 94e12cea5a0..44997ec5d6d 100644 --- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc +++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc @@ -65,8 +65,8 @@ void GetTwoBest(int max, const std::function& score_fn, float ClassificationSplitScore( const Eigen::Tensor& splits, - const Eigen::Tensor& rights, - int32 num_classes, int i) { + const Eigen::Tensor& rights, int32 num_classes, + int i) { Eigen::array offsets; // Class counts are stored with the total in [0], so the length of each // count vector is num_classes + 1. 
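As a small hypothetical illustration of the layout this comment describes: with num_classes = 3 each per-split count vector holds four floats, [total, c1, c2, c3], so the slice of length num_classes taken for the Gini score starts one past the stored total.

    num_classes = 3
    counts = [10.0, 4.0, 3.0, 3.0]           # [total, c1, c2, c3] for one split
    per_class = counts[1:1 + num_classes]    # what the (offsets, extents) slice keeps
    assert sum(per_class) == counts[0]       # the leading entry is just the total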
@@ -74,7 +74,7 @@ float ClassificationSplitScore( Eigen::array extents; extents[0] = num_classes; return WeightedGiniImpurity(splits.slice(offsets, extents)) + - WeightedGiniImpurity(rights.slice(offsets, extents)); + WeightedGiniImpurity(rights.slice(offsets, extents)); } void GetTwoBestClassification(const Tensor& total_counts, @@ -90,29 +90,28 @@ void GetTwoBestClassification(const Tensor& total_counts, // in seg faults, so we have to go with flat views of these tensors. However, // it is still pretty efficient because we put off evaluation until the // score is actually returned. - const auto tc = total_counts.Slice( - accumulator, accumulator + 1).unaligned_flat(); + const auto tc = + total_counts.Slice(accumulator, accumulator + 1).unaligned_flat(); // TODO(gilberth): See if we can delay evaluation here by templating the // arguments to ClassificationSplitScore. - const Eigen::Tensor splits = split_counts.Slice( - accumulator, accumulator + 1).unaligned_flat(); + const Eigen::Tensor splits = + split_counts.Slice(accumulator, accumulator + 1).unaligned_flat(); Eigen::array bcast; bcast[0] = num_splits; const Eigen::Tensor rights = tc.broadcast(bcast) - splits; - std::function score_fn = std::bind( - ClassificationSplitScore, splits, rights, num_classes, - std::placeholders::_1); + std::function score_fn = + std::bind(ClassificationSplitScore, splits, rights, num_classes, + std::placeholders::_1); GetTwoBest(num_splits, score_fn, best_score, best_index, second_best_score, second_best_index); } -int32 BestFeatureClassification( - const Tensor& total_counts, const Tensor& split_counts, - int32 accumulator) { +int32 BestFeatureClassification(const Tensor& total_counts, + const Tensor& split_counts, int32 accumulator) { float best_score; float second_best_score; int best_feature_index; @@ -130,8 +129,7 @@ float RegressionSplitScore( const Eigen::Tensor& splits_square, const Eigen::Tensor& right_sums, const Eigen::Tensor& right_squares, - int32 accumulator, - int32 num_regression_dims, int i) { + int32 accumulator, int32 num_regression_dims, int i) { Eigen::array offsets = {i * num_regression_dims + 1}; Eigen::array extents = {num_regression_dims - 1}; float left_count = splits_count_accessor(accumulator, i, 0); @@ -141,15 +139,15 @@ float RegressionSplitScore( // Guard against divide-by-zero. if (left_count > 0) { - score += WeightedVariance( - splits_sum.slice(offsets, extents), - splits_square.slice(offsets, extents), left_count); + score += + WeightedVariance(splits_sum.slice(offsets, extents), + splits_square.slice(offsets, extents), left_count); } if (right_count > 0) { - score += WeightedVariance(right_sums.slice(offsets, extents), - right_squares.slice(offsets, extents), - right_count); + score += + WeightedVariance(right_sums.slice(offsets, extents), + right_squares.slice(offsets, extents), right_count); } return score; } @@ -159,20 +157,20 @@ void GetTwoBestRegression(const Tensor& total_sums, const Tensor& total_squares, int32 accumulator, float* best_score, int* best_index, float* second_best_score, int* second_best_index) { const int32 num_splits = static_cast(split_sums.shape().dim_size(1)); - const int32 num_regression_dims = static_cast( - split_sums.shape().dim_size(2)); + const int32 num_regression_dims = + static_cast(split_sums.shape().dim_size(2)); // Ideally, Eigen::Tensor::chip would be best to use here but it results // in seg faults, so we have to go with flat views of these tensors. 
However, // it is still pretty efficient because we put off evaluation until the // score is actually returned. - const auto tc_sum = total_sums.Slice( - accumulator, accumulator + 1).unaligned_flat(); - const auto tc_square = total_squares.Slice( - accumulator, accumulator + 1).unaligned_flat(); - const auto splits_sum = split_sums.Slice( - accumulator, accumulator + 1).unaligned_flat(); - const auto splits_square = split_squares.Slice( - accumulator, accumulator + 1).unaligned_flat(); + const auto tc_sum = + total_sums.Slice(accumulator, accumulator + 1).unaligned_flat(); + const auto tc_square = + total_squares.Slice(accumulator, accumulator + 1).unaligned_flat(); + const auto splits_sum = + split_sums.Slice(accumulator, accumulator + 1).unaligned_flat(); + const auto splits_square = + split_squares.Slice(accumulator, accumulator + 1).unaligned_flat(); // Eigen is infuriating to work with, usually resulting in all kinds of // unhelpful compiler errors when trying something that seems sane. This // helps us do a simple thing like access the first element (the counts) @@ -193,10 +191,10 @@ void GetTwoBestRegression(const Tensor& total_sums, const Tensor& total_squares, best_score, best_index, second_best_score, second_best_index); } -int32 BestFeatureRegression( - const Tensor& total_sums, const Tensor& total_squares, - const Tensor& split_sums, const Tensor& split_squares, - int32 accumulator) { +int32 BestFeatureRegression(const Tensor& total_sums, + const Tensor& total_squares, + const Tensor& split_sums, + const Tensor& split_squares, int32 accumulator) { float best_score; float second_best_score; int best_feature_index; @@ -207,10 +205,11 @@ int32 BestFeatureRegression( return best_feature_index; } -bool BestSplitDominatesRegression( - const Tensor& total_sums, const Tensor& total_squares, - const Tensor& split_sums, const Tensor& split_squares, - int32 accumulator) { +bool BestSplitDominatesRegression(const Tensor& total_sums, + const Tensor& total_squares, + const Tensor& split_sums, + const Tensor& split_squares, + int32 accumulator) { // TODO(thomaswc): Implement this, probably as part of v3. return false; } @@ -599,7 +598,6 @@ bool Decide(float value, float bias, DataColumnTypes type) { } } - void GetParentWeightedMean(float leaf_sum, const float* leaf_data, float parent_sum, const float* parent_data, float valid_leaf_threshold, int num_outputs, diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h index dad9df48988..edbac670067 100644 --- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h +++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h @@ -45,13 +45,10 @@ const int32 LEAF_NODE = -1; const int32 FREE_NODE = -2; // Used to indicate column types, e.g. categorical vs. float -enum DataColumnTypes { - kDataFloat = 0, - kDataCategorical = 1 -}; +enum DataColumnTypes { kDataFloat = 0, kDataCategorical = 1 }; // Calculates the sum of a tensor. -template +template T Sum(Tensor counts) { Eigen::Tensor count_sum = counts.unaligned_flat().sum(); @@ -97,7 +94,7 @@ float WeightedGiniImpurity(const T& counts) { return RawWeightedGiniImpurity(smoothed); } -template +template float WeightedVariance(const T1& sums, const T2& squares, float count) { const auto e_x = sums / count; const auto e_x2 = squares / count; @@ -120,10 +117,11 @@ int32 BestFeatureRegression(const Tensor& total_sums, // Returns true if the best split's variance is sufficiently smaller than // that of the next best split. 
-bool BestSplitDominatesRegression( - const Tensor& total_sums, const Tensor& total_squares, - const Tensor& split_sums, const Tensor& split_squares, - int32 accumulator); +bool BestSplitDominatesRegression(const Tensor& total_sums, + const Tensor& total_squares, + const Tensor& split_sums, + const Tensor& split_squares, + int32 accumulator); // Performs booststrap_samples bootstrap samples of the best split's class // counts and the second best splits's class counts, and returns true if at @@ -178,10 +176,8 @@ bool DecideNode(const GetFeatureFnType& get_dense, // isn't present in sparse_input_indices. sparse_input_indices is assumed // to be sorted. template -float FindSparseValue( - const T1& sparse_input_indices, - const T2& sparse_input_values, - int32 i, int32 j) { +float FindSparseValue(const T1& sparse_input_indices, + const T2& sparse_input_values, int32 i, int32 j) { int32 low = 0; int32 high = sparse_input_values.dimension(0); while (low < high) { @@ -273,7 +269,6 @@ int32 GetNumSparseFeatures(const T1& indices, int32 input_index, // categorical data, it is value != bias. bool Decide(float value, float bias, DataColumnTypes type = kDataFloat); - // Returns true if all the splits are initialized. Since they get initialized // in order, we can simply infer this from the last split. // This should only be called for a single allocator's candidate features diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils_test.cc b/tensorflow/contrib/tensor_forest/kernels/tree_utils_test.cc index 7485a695dfb..08553545502 100644 --- a/tensorflow/contrib/tensor_forest/kernels/tree_utils_test.cc +++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils_test.cc @@ -44,11 +44,13 @@ TEST(TestWeightedVariance, Basic) { Tensor squares = test::AsTensor({29, 12}, {2}); EXPECT_FLOAT_EQ(WeightedVariance(sums.unaligned_flat(), - squares.unaligned_flat(), 3), 2.0); + squares.unaligned_flat(), 3), + 2.0); Tensor zero = test::AsTensor({0}, {1}); EXPECT_FLOAT_EQ(WeightedVariance(zero.unaligned_flat(), - zero.unaligned_flat(), 1), 0); + zero.unaligned_flat(), 1), + 0); } TEST(TestInitialize, Basic) { @@ -94,17 +96,16 @@ TEST(BestFeatureClassification, Basic) { const int32 num_accumulators = 4; const int32 num_splits = 3; const int32 num_classes = 4; - Tensor totals = test::AsTensor({1, 5, 6, 7, - 0, 0, 0, 0, - 30, 10, 10, 10, // this one - -1, -1, -1, -1}, - {num_accumulators, num_classes}); - Tensor splits = test::AsTensor( - {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 30, 10, 10, 10, 10, 0, 0, 10, 19, 5, 6, 8, // this one - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, - {num_accumulators, num_splits, num_classes}); + Tensor totals = test::AsTensor( + {1, 5, 6, 7, 0, 0, 0, 0, 30, 10, 10, 10, // this one + -1, -1, -1, -1}, + {num_accumulators, num_classes}); + Tensor splits = + test::AsTensor({1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30, 10, + 10, 10, 10, 0, 0, 10, 19, 5, 6, 8, // this one + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {num_accumulators, num_splits, num_classes}); EXPECT_EQ(BestFeatureClassification(totals, splits, 2), 1); } @@ -114,17 +115,16 @@ TEST(BestFeatureClassification, NoWinner) { const int32 num_splits = 3; const int32 num_classes = 4; // When counts are all the same, the most reasonable thing to do is pick 0. 
- Tensor totals = test::AsTensor({1, 5, 6, 7, - 0, 0, 0, 0, - 18, 6, 6, 6, // this one - -1, -1, -1, -1}, - {num_accumulators, num_classes}); - Tensor splits = test::AsTensor( - {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 9, 3, 3, 3, 9, 3, 3, 3, 9, 3, 3, 3, // this one - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, - {num_accumulators, num_splits, num_classes}); + Tensor totals = + test::AsTensor({1, 5, 6, 7, 0, 0, 0, 0, 18, 6, 6, 6, // this one + -1, -1, -1, -1}, + {num_accumulators, num_classes}); + Tensor splits = + test::AsTensor({1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 3, + 3, 3, 9, 3, 3, 3, 9, 3, 3, 3, // this one + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {num_accumulators, num_splits, num_classes}); EXPECT_EQ(BestFeatureClassification(totals, splits, 2), 0); } @@ -133,36 +133,34 @@ TEST(BestFeatureRegression, Basic) { const int32 num_accumulators = 4; const int32 num_splits = 3; const int32 num_classes = 4; - Tensor total_sums = test::AsTensor( - {1, 5, 6, 7, - 0, 0, 0, 0, - 10, 8, 6, 9, // this one - -1, -1, -1, -1}, - {num_accumulators, num_classes}); + Tensor total_sums = + test::AsTensor({1, 5, 6, 7, 0, 0, 0, 0, 10, 8, 6, 9, // this one + -1, -1, -1, -1}, + {num_accumulators, num_classes}); Tensor total_squares = test::AsTensor( - {1, 5, 6, 7, - 0, 0, 0, 0, - 100, 50, 40, 45, // this one + {1, 5, 6, 7, 0, 0, 0, 0, 100, 50, 40, 45, // this one -1, -1, -1, -1}, {num_accumulators, num_classes}); - Tensor split_sums = test::AsTensor( - {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 10, 8, 6, 9, 9, 8, 5, 9, 0, 0, 0, 0, // this one - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, - {num_accumulators, num_splits, num_classes}); + Tensor split_sums = + test::AsTensor({1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 8, + 6, 9, 9, 8, 5, 9, 0, 0, 0, 0, // this one + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {num_accumulators, num_splits, num_classes}); // lower the variance by lowering one of the squares just a little. - Tensor split_squares = test::AsTensor( - {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 100, 50, 40, 45, 100, 50, 40, 43, 0, 0, 0, 0, // this one - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, - {num_accumulators, num_splits, num_classes}); + Tensor split_squares = + test::AsTensor( + {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 100, 50, 40, 45, 100, 50, 40, 43, 0, 0, 0, 0, // this one + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {num_accumulators, num_splits, num_classes}); EXPECT_EQ(BestFeatureRegression(total_sums, total_squares, split_sums, - split_squares, 2), 1); + split_squares, 2), + 1); } TEST(BestFeatureRegression, NoWinner) { @@ -170,37 +168,33 @@ TEST(BestFeatureRegression, NoWinner) { const int32 num_splits = 3; const int32 num_classes = 4; // when counts are all the same, the most reasonable thing to do is pick 0. 
- Tensor total_sums = test::AsTensor( - {1, 5, 6, 7, - 0, 0, 0, 0, - 10, 8, 6, 9, // this one - -1, -1, -1, -1}, - {num_accumulators, num_classes}); + Tensor total_sums = + test::AsTensor({1, 5, 6, 7, 0, 0, 0, 0, 10, 8, 6, 9, // this one + -1, -1, -1, -1}, + {num_accumulators, num_classes}); Tensor total_squares = test::AsTensor( - {1, 5, 6, 7, - 0, 0, 0, 0, - 100, 50, 40, 45, // this one + {1, 5, 6, 7, 0, 0, 0, 0, 100, 50, 40, 45, // this one -1, -1, -1, -1}, {num_accumulators, num_classes}); - Tensor split_sums = test::AsTensor( - {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 10, 8, 6, 9, 10, 8, 6, 9, 10, 8, 6, 9, // this one - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, - {num_accumulators, num_splits, num_classes}); + Tensor split_sums = + test::AsTensor({1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 8, + 6, 9, 10, 8, 6, 9, 10, 8, 6, 9, // this one + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {num_accumulators, num_splits, num_classes}); Tensor split_squares = test::AsTensor( - {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 100, 50, 40, 45, 100, 50, 40, 45, 100, 50, 40, 45, // this one - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 100, 50, 40, 45, 100, 50, 40, 45, 100, 50, 40, 45, // this one + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, {num_accumulators, num_splits, num_classes}); EXPECT_EQ(BestFeatureRegression(total_sums, total_squares, split_sums, - split_squares, 2), 0); + split_squares, 2), + 0); } } // namespace tensorforest } // namespace tensorflow - diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc b/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc index 81e2a1b2a1b..f4a7058ddb8 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc +++ b/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc @@ -14,8 +14,8 @@ // ============================================================================= #include "tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.h" -#include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/env.h" namespace tensorflow { @@ -58,8 +58,7 @@ CandidateGraphRunner::CandidateGraphRunner( // Features don't change, store them in a tensor. 
const auto& oblique = split.inequality_left_child_test().oblique(); const int32 feat_size = oblique.features_size(); - features_.reset( - new Tensor(tensorflow::DT_INT32, TensorShape({feat_size}))); + features_.reset(new Tensor(tensorflow::DT_INT32, TensorShape({feat_size}))); auto feat = features_->flat(); int i = 0; for (const auto& id : oblique.features()) { @@ -67,10 +66,10 @@ CandidateGraphRunner::CandidateGraphRunner( } } -void CandidateGraphRunner::RunOp( - const string& name, const TensorNameValueList& inputs, - const std::vector& output_tensor_names, - std::vector* outputs) { +void CandidateGraphRunner::RunOp(const string& name, + const TensorNameValueList& inputs, + const std::vector& output_tensor_names, + std::vector* outputs) { std::vector op_name; if (name != kNoOp) { op_name.push_back(name); diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h index cced26b9036..328af28725a 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h +++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h @@ -26,7 +26,6 @@ namespace tensorflow { namespace tensorforest { - // Keep a tree ensemble in memory for efficient evaluation and mutation. class DecisionTreeResource : public ResourceBase { public: @@ -35,15 +34,12 @@ class DecisionTreeResource : public ResourceBase { string DebugString() override { return strings::StrCat("DecisionTree[size=", - decision_tree_->decision_tree().nodes_size(), - "]"); + decision_tree_->decision_tree().nodes_size(), "]"); } void MaybeInitialize(); - const decision_trees::Model& decision_tree() const { - return *decision_tree_; - } + const decision_trees::Model& decision_tree() const { return *decision_tree_; } decision_trees::Model* mutable_decision_tree() { return decision_tree_.get(); @@ -59,9 +55,7 @@ class DecisionTreeResource : public ResourceBase { // Resets the resource and frees the proto. // Caller needs to hold the mutex lock while calling this. - void Reset() { - decision_tree_.reset(new decision_trees::Model()); - } + void Reset() { decision_tree_.reset(new decision_trees::Model()); } mutex* get_mutex() { return &mu_; } @@ -84,7 +78,6 @@ class DecisionTreeResource : public ResourceBase { std::vector> node_evaluators_; }; - } // namespace tensorforest } // namespace tensorflow diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h index 85ce7b825b1..bf2b2aaa3c8 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h +++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h @@ -22,7 +22,6 @@ namespace tensorflow { namespace tensorforest { - // Base class for evaluators of decision nodes that effectively copy proto // contents into C++ structures for faster execution. 
class DecisionNodeEvaluator { diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc index 5c49b87443e..af5cf72a3c0 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc +++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc @@ -20,11 +20,11 @@ namespace tensorflow { namespace { +using tensorflow::decision_trees::InequalityTest; +using tensorflow::decision_trees::MatchingValuesTest; using tensorflow::tensorforest::InequalityDecisionNodeEvaluator; using tensorflow::tensorforest::MatchingValuesDecisionNodeEvaluator; using tensorflow::tensorforest::ObliqueInequalityDecisionNodeEvaluator; -using tensorflow::decision_trees::InequalityTest; -using tensorflow::decision_trees::MatchingValuesTest; TEST(InequalityDecisionNodeEvaluatorTest, TestLessOrEqual) { InequalityTest test; @@ -124,4 +124,3 @@ TEST(ObliqueDecisionNodeEvaluatorTest, Basic) { } // namespace } // namespace tensorflow - diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h index 0d6712e9e55..eea0be27caf 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h +++ b/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h @@ -40,9 +40,7 @@ class FertileStatsResource : public ResourceBase { model_op_ = LeafModelOperatorFactory::CreateLeafModelOperator(params_); } - string DebugString() override { - return "FertileStats"; - } + string DebugString() override { return "FertileStats"; } void ExtractFromProto(const FertileStats& stats); @@ -50,8 +48,7 @@ class FertileStatsResource : public ResourceBase { // Resets the resource and frees the proto. // Caller needs to hold the mutex lock while calling this. - void Reset() { - } + void Reset() {} // Reset the stats for a node, but leave the leaf_stats intact. void ResetSplitStats(int32 node_id, int32 depth) { @@ -84,7 +81,6 @@ class FertileStatsResource : public ResourceBase { // was found. 
bool BestSplit(int32 node_id, SplitCandidate* best, int32* depth); - private: mutex mu_; std::shared_ptr model_op_; @@ -94,7 +90,6 @@ class FertileStatsResource : public ResourceBase { void AllocateNode(int32 node_id, int32 depth); }; - } // namespace tensorforest } // namespace tensorflow diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc index 3ce630e3a96..da600d34eac 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc +++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc @@ -20,7 +20,6 @@ #include "tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.h" #include "tensorflow/core/lib/random/distribution_sampler.h" - namespace tensorflow { namespace tensorforest { @@ -454,14 +453,14 @@ void DenseClassificationGrowStats::PackToProto(FertileSlot* slot) const { class_stats->add_value()->set_float_value(total_counts_[i]); } - for (int split_num = 0; split_num < num_splits(); ++split_num) { + for (int split_num = 0; split_num < num_splits(); ++split_num) { auto* cand = slot->add_candidates(); *cand->mutable_split() = splits_[split_num]; auto* left_stats = cand->mutable_left_stats() ->mutable_classification() ->mutable_dense_counts(); for (int i = 0; i < num_outputs_; ++i) { - left_stats->add_value()->set_float_value(left_count(split_num, i)); + left_stats->add_value()->set_float_value(left_count(split_num, i)); } } } @@ -546,7 +545,7 @@ void SparseClassificationGrowStats::PackToProto(FertileSlot* slot) const { (*class_stats)[entry.first] = val; } - for (int split_num = 0; split_num < num_splits(); ++split_num) { + for (int split_num = 0; split_num < num_splits(); ++split_num) { auto* cand = slot->add_candidates(); *cand->mutable_split() = splits_[split_num]; auto* left_stats = cand->mutable_left_stats() @@ -561,8 +560,8 @@ void SparseClassificationGrowStats::PackToProto(FertileSlot* slot) const { } } -float SparseClassificationGrowStats::GiniScore( - int split, float* left_sum, float* right_sum) const { +float SparseClassificationGrowStats::GiniScore(int split, float* left_sum, + float* right_sum) const { float left_square = 0, right_square = 0; *left_sum = 0; *right_sum = 0; @@ -844,12 +843,11 @@ void LeastSquaresRegressionGrowStats::PackToProto(FertileSlot* slot) const { total_squares->add_value()->set_float_value(total_sum_squares_[i]); } - for (int split_num = 0; split_num < num_splits(); ++split_num) { + for (int split_num = 0; split_num < num_splits(); ++split_num) { auto* cand = slot->add_candidates(); *cand->mutable_split() = splits_[split_num]; - auto* sums = cand->mutable_left_stats() - ->mutable_regression() - ->mutable_mean_output(); + auto* sums = + cand->mutable_left_stats()->mutable_regression()->mutable_mean_output(); auto* squares = cand->mutable_left_stats() ->mutable_regression() ->mutable_mean_output_squares(); @@ -891,20 +889,17 @@ float LeastSquaresRegressionGrowStats::SplitVariance(int split) const { float total_variance = 0; for (int i = 0; i < params_.num_outputs(); ++i) { // Left side - const float le_x = - left_sum(split, i) / left_counts_[split]; + const float le_x = left_sum(split, i) / left_counts_[split]; - const float le_x2 = - left_square(split, i) / left_counts_[split]; + const float le_x2 = left_square(split, i) / left_counts_[split]; total_variance += le_x2 - le_x * le_x; // Right side const float re_x = (total_sum_[i] - left_sum(split, i)) / (weight_sum_ - left_counts_[split]); - const float re_x2 = - (total_sum_squares_[i] - left_square(split, i)) 
/ - (weight_sum_ - left_counts_[split]); + const float re_x2 = (total_sum_squares_[i] - left_square(split, i)) / + (weight_sum_ - left_counts_[split]); total_variance += re_x2 - re_x * re_x; } return total_variance; @@ -937,8 +932,7 @@ bool LeastSquaresRegressionGrowStats::BestSplit(SplitCandidate* best) const { left->set_weight_sum(left_counts_[best_index]); auto* left_output_sum = left_reg_stats->mutable_mean_output(); for (int i = 0; i < num_outputs; ++i) { - left_output_sum->add_value()->set_float_value( - left_sum(best_index, i)); + left_output_sum->add_value()->set_float_value(left_sum(best_index, i)); } // Right @@ -947,8 +941,8 @@ bool LeastSquaresRegressionGrowStats::BestSplit(SplitCandidate* best) const { right->set_weight_sum(weight_sum_ - left_counts_[best_index]); auto* right_output_sum = right_reg_stats->mutable_mean_output(); for (int i = 0; i < num_outputs; ++i) { - right_output_sum->add_value()->set_float_value( - total_sum_[i] - left_sum(best_index, i)); + right_output_sum->add_value()->set_float_value(total_sum_[i] - + left_sum(best_index, i)); } return true; } diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h index 02c0fc687ff..04e6b0a7353 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h +++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h @@ -73,21 +73,15 @@ class GrowStats { const InputTarget* target, int example) {} void RemoveSplit(int split_num); - int num_splits() const { - return splits_.size(); - } + int num_splits() const { return splits_.size(); } - float weight_sum() const { - return weight_sum_; - } + float weight_sum() const { return weight_sum_; } virtual bool IsInitialized() const { return weight_sum_ > 0 || splits_.size() == num_splits_to_consider_; } - int32 depth() const { - return depth_; - } + int32 depth() const { return depth_; } protected: GrowStats(const TensorForestParams& params, int32 depth); @@ -206,8 +200,8 @@ class ClassificationStats : public GrowStats { virtual float left_count(int split, int class_num) const = 0; virtual float right_count(int split, int class_num) const = 0; - virtual void ClassificationAddLeftExample( - int split, int64 int_label, float weight) = 0; + virtual void ClassificationAddLeftExample(int split, int64 int_label, + float weight) = 0; virtual void ClassificationAddRightExample(int split, int64 int_label, float weight) { // Does nothing by default, but sub-classes can override. 
@@ -375,9 +369,7 @@ class SparseClassificationGrowStats : public ClassificationStats { SparseClassificationGrowStats(const TensorForestParams& params, int32 depth) : ClassificationStats(params, depth) {} - void Initialize() override { - Clear(); - } + void Initialize() override { Clear(); } void ExtractFromProto(const FertileSlot& slot) override; void PackToProto(FertileSlot* slot) const override; @@ -562,9 +554,9 @@ class LeastSquaresRegressionGrowStats : public GrowStats { } void RemoveSplitStats(int split_num) override { left_sums_.erase(left_sums_.begin() + num_outputs_ * split_num, - left_sums_.begin() + num_outputs_ * (split_num + 1)); + left_sums_.begin() + num_outputs_ * (split_num + 1)); left_squares_.erase(left_squares_.begin() + num_outputs_ * split_num, - left_squares_.begin() + num_outputs_ * (split_num + 1)); + left_squares_.begin() + num_outputs_ * (split_num + 1)); left_counts_.erase(left_counts_.begin() + split_num, left_counts_.begin() + (split_num + 1)); } @@ -605,7 +597,6 @@ class LeastSquaresRegressionGrowStats : public GrowStats { std::vector left_counts_; }; - } // namespace tensorforest } // namespace tensorflow diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc index ceb58d2ead5..26e989928e0 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc +++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc @@ -24,21 +24,21 @@ namespace tensorflow { namespace { -using tensorflow::tensorforest::GrowStats; -using tensorflow::tensorforest::TestableInputTarget; -using tensorflow::tensorforest::FertileSlot; +using tensorflow::decision_trees::BinaryNode; +using tensorflow::decision_trees::FeatureId; +using tensorflow::decision_trees::InequalityTest; using tensorflow::tensorforest::DenseClassificationGrowStats; -using tensorflow::tensorforest::SparseClassificationGrowStats; +using tensorflow::tensorforest::FertileSlot; using tensorflow::tensorforest::FixedSizeClassStats; using tensorflow::tensorforest::FixedSizeSparseClassificationGrowStats; +using tensorflow::tensorforest::GrowStats; using tensorflow::tensorforest::LeastSquaresRegressionGrowStats; -using tensorflow::tensorforest::TensorForestParams; +using tensorflow::tensorforest::SparseClassificationGrowStats; using tensorflow::tensorforest::SPLIT_FINISH_BASIC; using tensorflow::tensorforest::SPLIT_FINISH_DOMINATE_HOEFFDING; using tensorflow::tensorforest::SPLIT_PRUNE_HOEFFDING; -using tensorflow::decision_trees::BinaryNode; -using tensorflow::decision_trees::InequalityTest; -using tensorflow::decision_trees::FeatureId; +using tensorflow::tensorforest::TensorForestParams; +using tensorflow::tensorforest::TestableInputTarget; BinaryNode MakeSplit(const string& feat, float val) { BinaryNode split; @@ -52,8 +52,7 @@ BinaryNode MakeSplit(const string& feat, float val) { return split; } -void RunBatch(GrowStats* stats, - const TestableInputTarget* target) { +void RunBatch(GrowStats* stats, const TestableInputTarget* target) { std::unique_ptr dataset( new tensorflow::tensorforest::TestableDataSet( {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, 2)); @@ -102,18 +101,10 @@ class TestableRunningStats : public DenseClassificationGrowStats { TestableRunningStats(const TensorForestParams& params, int32 depth) : DenseClassificationGrowStats(params, depth) {} - float test_left_sum(int split) { - return get_left_gini()->sum(split); - } - float test_left_square(int split) { - return get_left_gini()->square(split); - } - float 
test_right_sum(int split) { - return get_right_gini()->sum(split); - } - float test_right_square(int split) { - return get_right_gini()->square(split); - } + float test_left_sum(int split) { return get_left_gini()->sum(split); } + float test_left_square(int split) { return get_left_gini()->square(split); } + float test_right_sum(int split) { return get_right_gini()->sum(split); } + float test_right_square(int split) { return get_right_gini()->square(split); } }; TEST(GrowStatsDenseClassificationTest, BasicRunningStats) { @@ -166,9 +157,7 @@ class TestableFinishEarly : public DenseClassificationGrowStats { int num_times_called_; protected: - void CheckFinishEarlyHoeffding() override { - ++num_times_called_; - } + void CheckFinishEarlyHoeffding() override { ++num_times_called_; } }; TEST(GrowStatsDenseClassificationTest, TestFinishEarly) { @@ -212,7 +201,6 @@ TEST(GrowStatsDenseClassificationTest, TestFinishEarly) { ASSERT_EQ(stat->num_times_called_, 9); } - TEST(GrowStatsDenseClassificationTest, TestCheckPruneHoeffding) { TensorForestParams params; params.set_num_outputs(2); @@ -224,7 +212,8 @@ TEST(GrowStatsDenseClassificationTest, TestCheckPruneHoeffding) { finish->set_type(SPLIT_FINISH_BASIC); finish->mutable_check_every_steps()->set_constant_value(100); params.mutable_pruning_type()->set_type(SPLIT_PRUNE_HOEFFDING); - params.mutable_pruning_type()->mutable_prune_every_samples() + params.mutable_pruning_type() + ->mutable_prune_every_samples() ->set_constant_value(1); // On each iteration, we add two examples, one of class 0 and one @@ -234,8 +223,8 @@ TEST(GrowStatsDenseClassificationTest, TestCheckPruneHoeffding) { std::vector weights = {1, 1}; TestableInputTarget target(labels, weights, 1); std::unique_ptr dataset( - new tensorflow::tensorforest::TestableDataSet( - {-1.0, -1.0, 1.0, -1.0}, 2)); + new tensorflow::tensorforest::TestableDataSet({-1.0, -1.0, 1.0, -1.0}, + 2)); DenseClassificationGrowStats stats(params, 1); stats.Initialize(); diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc index bf0fb924504..d43884481af 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc +++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc @@ -109,10 +109,10 @@ void TensorDataSet::set_input_tensors(const Tensor& dense, dense_data_.reset(new DenseStorageType(dense.tensor())); } if (sparse_indices.shape().dims() == 2) { - sparse_indices_.reset(new SparseIndicesStorageType( - sparse_indices.tensor())); - sparse_values_.reset(new SparseValuesStorageType( - sparse_values.tensor())); + sparse_indices_.reset( + new SparseIndicesStorageType(sparse_indices.tensor())); + sparse_values_.reset( + new SparseValuesStorageType(sparse_values.tensor())); sparse_batch_size_ = sparse_shape.tensor()(0); } original_dense_tensor_ = dense; diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h index eafad6b5916..c544a8c75e9 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h +++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h @@ -93,9 +93,7 @@ class TensorDataSet { // an int32 you can avoid the atoi32. 
virtual float GetExampleValue(int example, int32 feature_id) const; - int num_features() { - return available_features_.size(); - } + int num_features() { return available_features_.size(); } const Tensor& original_tensor() const { return original_dense_tensor_; } diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h b/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h index 44ec09c50ef..d4402b6055a 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h +++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h @@ -79,9 +79,7 @@ class TensorInputTarget : public StoredInputTarget { return (*target_)(example_index * num_targets_ + target_index); } - const Tensor& original_tensor() const { - return original_tensor_; - } + const Tensor& original_tensor() const { return original_tensor_; } protected: Tensor original_tensor_; diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc index d43c068e462..83614a25314 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc +++ b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc @@ -160,6 +160,5 @@ void RegressionLeafModelOperator::ExportModel( } } - } // namespace tensorforest } // namespace tensorflow diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc index ffd92c01f9a..ab4191809b6 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc +++ b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc @@ -26,19 +26,19 @@ namespace { using tensorflow::decision_trees::Leaf; using tensorflow::tensorforest::DenseClassificationLeafModelOperator; using tensorflow::tensorforest::LeafModelOperator; -using tensorflow::tensorforest::SparseClassificationLeafModelOperator; -using tensorflow::tensorforest::SparseOrDenseClassificationLeafModelOperator; using tensorflow::tensorforest::LeafStat; using tensorflow::tensorforest::RegressionLeafModelOperator; -using tensorflow::tensorforest::TestableInputTarget; +using tensorflow::tensorforest::SparseClassificationLeafModelOperator; +using tensorflow::tensorforest::SparseOrDenseClassificationLeafModelOperator; using tensorflow::tensorforest::TensorForestParams; +using tensorflow::tensorforest::TestableInputTarget; const int32 kNumClasses = 3; constexpr char kRegressionStatProto[] = - "weight_sum: 3 " - "regression { " - "mean_output { " + "weight_sum: 3 " + "regression { " + "mean_output { " "value { " " float_value: 27 " "} " @@ -48,8 +48,8 @@ constexpr char kRegressionStatProto[] = "value { " " float_value: 10 " "} " - "} " - "mean_output_squares { " + "} " + "mean_output_squares { " "value {" " float_value: 245" "}" @@ -59,8 +59,8 @@ constexpr char kRegressionStatProto[] = "value {" " float_value: 46" "}" - "}" -"}"; + "}" + "}"; void TestClassificationNormalUse(const std::unique_ptr& op) { Leaf l; @@ -83,7 +83,6 @@ void TestClassificationNormalUse(const std::unique_ptr& op) { EXPECT_FLOAT_EQ(op->GetOutputValue(l, 1), 3.4); } - TEST(DenseLeafModelOperatorsTest, NormalUse) { TensorForestParams params; params.set_num_outputs(kNumClasses); @@ -182,7 +181,7 @@ TEST(SparseLeafModelOperatorsTest, InitWithExisting) { std::unique_ptr leaf(new Leaf); - op->ExportModel( *stat, leaf.get()); + op->ExportModel(*stat, leaf.get()); // Make sure it was initialized correctly. 
EXPECT_FLOAT_EQ(op->GetOutputValue(*leaf, 0), 1.1); @@ -194,7 +193,6 @@ TEST(SparseLeafModelOperatorsTest, InitWithExisting) { EXPECT_EQ(leaf->sparse_vector().sparse_value().size(), kNumClasses); } - TEST(RegressionLeafModelOperatorsTest, NormalUse) { TensorForestParams params; params.set_num_outputs(kNumClasses); diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/params.h b/tensorflow/contrib/tensor_forest/kernels/v4/params.h index b0ed9494247..7583e3d0402 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/params.h +++ b/tensorflow/contrib/tensor_forest/kernels/v4/params.h @@ -24,7 +24,6 @@ namespace tensorforest { // Return the value of the given depth-dependent parameter given a leaf's depth. float ResolveParam(const DepthDependentParam& param, int32 depth); - } // namespace tensorforest } // namespace tensorflow diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc index 801881af136..4010a71006d 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc +++ b/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc @@ -71,5 +71,3 @@ TEST(ParamsTest, TestThreshold) { } } // namespace - - diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc index cdb1d80a4bb..b7b60d0ab8c 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc +++ b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc @@ -52,8 +52,8 @@ std::unique_ptr SplitCollectionOperator::CreateGrowStats( new SparseClassificationGrowStats(params_, depth)); case STATS_LEAST_SQUARES_REGRESSION: - return std::unique_ptr(new LeastSquaresRegressionGrowStats( - params_, depth)); + return std::unique_ptr( + new LeastSquaresRegressionGrowStats(params_, depth)); case STATS_FIXED_SIZE_SPARSE_GINI: return std::unique_ptr( @@ -136,8 +136,7 @@ void SplitCollectionOperator::CreateAndInitializeCandidateWithExample( stats_.at(node_id)->AddSplit(split, input_data, target, example); } -bool SplitCollectionOperator::BestSplit(int32 node_id, - SplitCandidate* best, +bool SplitCollectionOperator::BestSplit(int32 node_id, SplitCandidate* best, int32* depth) const { auto* slot = stats_.at(node_id).get(); *depth = slot->depth(); diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h index ad52f89fadd..c606ff98c67 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h +++ b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h @@ -71,9 +71,7 @@ class SplitCollectionOperator { } // Perform any necessary cleanup for any tracked state for the slot. - virtual void ClearSlot(int32 node_id) { - stats_.erase(node_id); - } + virtual void ClearSlot(int32 node_id) { stats_.erase(node_id); } // Return true if slot is fully initialized. 
virtual bool IsInitialized(int32 node_id) const; diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc b/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc index 0bec198e97e..c749fbe69e1 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc +++ b/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc @@ -32,9 +32,9 @@ namespace tensorforest { // smoothed_sum = stats.sum() + #_classes float GiniImpurity(const LeafStat& stats, int32 num_classes) { const float smoothed_sum = num_classes + stats.weight_sum(); - return 1.0 - ( - (stats.classification().gini().square() - + 2 * stats.weight_sum() + num_classes) / (smoothed_sum * smoothed_sum)); + return 1.0 - ((stats.classification().gini().square() + + 2 * stats.weight_sum() + num_classes) / + (smoothed_sum * smoothed_sum)); } float WeightedGiniImpurity(const LeafStat& stats, int32 num_classes) { @@ -46,21 +46,20 @@ void UpdateGini(LeafStat* stats, float old_val, float weight) { // Equivalent to stats->square() - old_val * old_val + new_val * new_val, // (for new_val = old_val + weight), but more numerically stable. stats->mutable_classification()->mutable_gini()->set_square( - stats->classification().gini().square() - + weight * weight + 2 * old_val * weight); + stats->classification().gini().square() + weight * weight + + 2 * old_val * weight); } - float Variance(const LeafStat& stats, int output) { if (stats.weight_sum() == 0) { return 0; } const float e_x = - stats.regression().mean_output().value(output).float_value() - / stats.weight_sum(); + stats.regression().mean_output().value(output).float_value() / + stats.weight_sum(); const auto e_x2 = - stats.regression().mean_output_squares().value(output).float_value() - / stats.weight_sum(); + stats.regression().mean_output_squares().value(output).float_value() / + stats.weight_sum(); return e_x2 - e_x * e_x; } @@ -75,8 +74,7 @@ float TotalVariance(const LeafStat& stats) { float SmoothedGini(float sum, float square, int num_classes) { // See comments for GiniImpurity above. 
const float smoothed_sum = num_classes + sum; - return 1.0 - - (square + 2 * sum + num_classes) / (smoothed_sum * smoothed_sum); + return 1.0 - (square + 2 * sum + num_classes) / (smoothed_sum * smoothed_sum); } float WeightedSmoothedGini(float sum, float square, int num_classes) { diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h b/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h index 289c81e9d51..38deb3e3cd8 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h +++ b/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h @@ -27,9 +27,7 @@ class TestableInputTarget : public StoredInputTarget> { : StoredInputTarget(new std::vector(t), new std::vector(w), num_t) {} - int NumItems() const { - return target_->size(); - } + int NumItems() const { return target_->size(); } int32 GetTargetAsClassIndex(int example_index, int target_index) const override { @@ -51,7 +49,6 @@ class TestableInputTarget : public StoredInputTarget> { } }; - class TestableDataSet : public TensorDataSet { public: TestableDataSet(const std::vector& data, int num_features) diff --git a/tensorflow/contrib/tensorboard/db/BUILD b/tensorflow/contrib/tensorboard/db/BUILD index 6ff5a9e2b18..4175d8adb58 100644 --- a/tensorflow/contrib/tensorboard/db/BUILD +++ b/tensorflow/contrib/tensorboard/db/BUILD @@ -40,7 +40,6 @@ cc_library( hdrs = ["summary_db_writer.h"], copts = tf_copts(), deps = [ - ":schema", ":summary_converter", "//tensorflow/core:framework", "//tensorflow/core:lib", diff --git a/tensorflow/contrib/timeseries/examples/lstm.py b/tensorflow/contrib/timeseries/examples/lstm.py index c7193cef691..c834430b95d 100644 --- a/tensorflow/contrib/timeseries/examples/lstm.py +++ b/tensorflow/contrib/timeseries/examples/lstm.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import functools from os import path import numpy @@ -80,18 +81,19 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel): input_statistics: A math_utils.InputStatistics object. """ super(_LSTMModel, self).initialize_graph(input_statistics=input_statistics) - self._lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=self._num_units) - # Create templates so we don't have to worry about variable reuse. - self._lstm_cell_run = tf.make_template( - name_="lstm_cell", - func_=self._lstm_cell, - create_scope_now_=True) - # Transforms LSTM output into mean predictions. - self._predict_from_lstm_output = tf.make_template( - name_="predict_from_lstm_output", - func_= - lambda inputs: tf.layers.dense(inputs=inputs, units=self.num_features), - create_scope_now_=True) + with tf.variable_scope("", use_resource=True): + # Use ResourceVariables to avoid race conditions. + self._lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=self._num_units) + # Create templates so we don't have to worry about variable reuse. + self._lstm_cell_run = tf.make_template( + name_="lstm_cell", + func_=self._lstm_cell, + create_scope_now_=True) + # Transforms LSTM output into mean predictions. 
+ self._predict_from_lstm_output = tf.make_template( + name_="predict_from_lstm_output", + func_=functools.partial(tf.layers.dense, units=self.num_features), + create_scope_now_=True) def get_start_state(self): """Return initial state for the time series model.""" diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD index 0199313bc8d..a7d54d8a0cc 100644 --- a/tensorflow/contrib/tpu/BUILD +++ b/tensorflow/contrib/tpu/BUILD @@ -43,6 +43,7 @@ py_library( deps = [ ":tpu_lib", ":tpu_py", + "//tensorflow/contrib/summary:summary_ops", "//tensorflow/core:protos_all_py", "//tensorflow/python:array_ops", "//tensorflow/python:control_flow_ops", diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc index 7373d0e17ce..b1ef9fde37f 100644 --- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc +++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/contrib/tpu/profiler/version.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/util/command_line_flags.h" @@ -47,6 +48,19 @@ string GetCurrentTimeStampAsString() { return s; } +Status ValidateHostPortPair(const string& host_port) { + uint32 port; + std::vector parts = str_util::Split(host_port, ':'); + // Must be host:port, port must be a number, host must not contain a '/', + // host also must not be empty. + if (parts.size() != 2 || !strings::safe_strtou32(parts[1], &port) || + parts[0].find("/") != string::npos || parts[0].empty()) { + return errors::InvalidArgument("Could not interpret \"", host_port, + "\" as a host-port pair."); + } + return Status::OK(); +} + ProfileResponse Profile(const string& service_addr, int duration_ms, const ProfileOptions& opts) { ProfileRequest request; @@ -60,11 +74,14 @@ ProfileResponse Profile(const string& service_addr, int duration_ms, ::grpc::ClientContext context; ::grpc::ChannelArguments channel_args; // TODO(ioeric): use `SetMaxReceiveMessageSize` instead once it's available. + // TODO(qiuminxu): use `NewHostPortGrpcChannel` instead once their + // `ValidateHostPortPair` checks for empty host string case. channel_args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits::max()); std::unique_ptr stub = TPUProfiler::NewStub(::grpc::CreateCustomChannel( - service_addr, ::grpc::InsecureChannelCredentials(), channel_args)); + "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(), + channel_args)); ProfileResponse response; TF_QCHECK_OK(FromGrpcStatus(stub->Profile(&context, request, &response))); return response; @@ -78,14 +95,19 @@ int main(int argc, char** argv) { tensorflow::string FLAGS_service_addr; tensorflow::string FLAGS_logdir; int FLAGS_duration_ms = 2000; + int FLAGS_num_tracing_attempts = 3; bool FLAGS_include_dataset_ops = true; std::vector flag_list = { tensorflow::Flag("service_addr", &FLAGS_service_addr, "Address of TPU profiler service e.g. localhost:8466"), tensorflow::Flag("logdir", &FLAGS_logdir, - "Path of TensorBoard log directory e.g. /tmp/tb_log"), + "Path of TensorBoard log directory e.g. /tmp/tb_log, " + "gs://tb_bucket"), tensorflow::Flag("duration_ms", &FLAGS_duration_ms, "Duration of tracing in ms. 
Default is 2000ms."), + tensorflow::Flag("num_tracing_attempts", &FLAGS_num_tracing_attempts, + "Automatically retry N times when no trace event " + "is collected. Default is 3."), tensorflow::Flag("include_dataset_ops", &FLAGS_include_dataset_ops, "Set to false to profile longer TPU device traces."), }; @@ -96,16 +118,46 @@ int main(int argc, char** argv) { tensorflow::string usage = tensorflow::Flags::Usage(argv[0], flag_list); bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list); if (!parse_ok || FLAGS_service_addr.empty() || FLAGS_logdir.empty()) { - std::printf("%s", usage.c_str()); + std::cout << usage.c_str() << std::endl; + return 2; + } + tensorflow::Status status = + tensorflow::tpu::ValidateHostPortPair(FLAGS_service_addr); + if (!status.ok()) { + std::cout << status.error_message() << std::endl; + std::cout << usage.c_str() << std::endl; return 2; } tensorflow::port::InitMain(argv[0], &argc, &argv); - int duration_ms = FLAGS_duration_ms; + // Sets the minimum duration_ms and tracing attempts to one. + int duration_ms = std::max(FLAGS_duration_ms, 1); + int remaining_attempts = std::max(FLAGS_num_tracing_attempts, 1); tensorflow::ProfileOptions opts; opts.set_include_dataset_ops(FLAGS_include_dataset_ops); - tensorflow::ProfileResponse response = - tensorflow::tpu::Profile(FLAGS_service_addr, duration_ms, opts); + tensorflow::ProfileResponse response; + + while (true) { + std::cout << "Starting to profile TPU traces for " << duration_ms << " ms. " + << "Remaining attempt(s): " << remaining_attempts-- << std::endl; + response = tensorflow::tpu::Profile(FLAGS_service_addr, duration_ms, opts); + if (remaining_attempts <= 0 || !response.encoded_trace().empty()) break; + std::cout << "No trace event is collected. Automatically retrying." + << std::endl + << std::endl; + } + + if (response.encoded_trace().empty()) { + std::cout << "No trace event is collected after " + << FLAGS_num_tracing_attempts << " attempt(s). " + << "Perhaps, you want to try again (with more attempts?)." + << std::endl + << "Tip: increase number of attempts with --num_tracing_attempts." + << std::endl; + // Don't dump profile data if no trace is collected. + return 0; + } + // Use the current timestamp as the run name. tensorflow::string run = tensorflow::tpu::GetCurrentTimeStampAsString(); TF_CHECK_OK(tensorflow::tpu::WriteTensorboardTPUProfile( diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc index b842951eb2c..ebd6185faad 100644 --- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc +++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc @@ -151,10 +151,7 @@ Status WriteTensorboardTPUProfile(const string& logdir, const string& run, TF_RETURN_IF_ERROR(Env::Default()->RecursivelyCreateDir(profile_run_dir)); // Ignore computation_graph for now. - const bool empty_trace = response.encoded_trace().empty(); - if (empty_trace) { - *os << "No trace event is collected." 
<< std::endl; - } else { + if (!response.encoded_trace().empty()) { LOG(INFO) << "Converting trace events to TraceViewer JSON."; TF_RETURN_IF_ERROR( DumpTraceToLogDirectory(profile_run_dir, response.encoded_trace(), os)); @@ -165,11 +162,9 @@ Status WriteTensorboardTPUProfile(const string& logdir, const string& run, TF_RETURN_IF_ERROR(DumpOpProfileToLogDirectory(profile_run_dir, response.op_profile(), os)); } - if (!empty_trace && !response.tool_data().empty()) { - for (const auto& tool_data : response.tool_data()) { - TF_RETURN_IF_ERROR( - DumpToolDataToLogDirectory(profile_run_dir, tool_data, os)); - } + for (const auto& tool_data : response.tool_data()) { + TF_RETURN_IF_ERROR( + DumpToolDataToLogDirectory(profile_run_dir, tool_data, os)); } return Status::OK(); diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h index 25b958bcfea..29ef977bacf 100644 --- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h +++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h @@ -27,7 +27,10 @@ namespace tpu { // The following tools are supported: // - Trace viewer // - Op profile -// - HLO computation graph +// - Input pipeline analyzer +// - Overview page +// Note: this function creates a directory even when all fields in +// ProfileResponse are unset/empty. Status WriteTensorboardTPUProfile(const string& logdir, const string& run, const ProfileResponse& response, std::ostream* os); diff --git a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py index 846db133299..78d237e6a20 100644 --- a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py +++ b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py @@ -17,6 +17,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl import flags import os import subprocess @@ -24,13 +25,21 @@ import sys import tensorflow as tf -tf.flags.DEFINE_string('service_addr', '', - 'Address of TPU profiler service e.g. localhost:8466') -tf.flags.DEFINE_string('logdir', '', - 'Path of TensorBoard log directory e.g. /tmp/tb_log') -tf.flags.DEFINE_integer('duration_ms', 2000, 'Duration of tracing in ms.') +flags.DEFINE_string( + 'service_addr', None, 'Address of TPU profiler service e.g. ' + 'localhost:8466') +flags.DEFINE_string( + 'logdir', None, 'Path of TensorBoard log directory e.g. 
/tmp/tb_log, ' + 'gs://tb_bucket') +flags.DEFINE_integer('duration_ms', 2000, 'Duration of tracing in ms.') +flags.DEFINE_integer( + 'num_tracing_attempts', 3, 'Automatically retry N times when no trace ' + 'event is collected.') +flags.DEFINE_boolean( + 'include_dataset_ops', True, 'Set to false to profile longer TPU ' + 'device traces.') -FLAGS = tf.flags.FLAGS +FLAGS = flags.FLAGS EXECUTABLE = 'data/capture_tpu_profile' @@ -47,6 +56,8 @@ def main(unused_argv=None): cmd.append('--logdir='+logdir) cmd.append('--service_addr='+FLAGS.service_addr) cmd.append('--duration_ms='+str(FLAGS.duration_ms)) + cmd.append('--num_tracing_attempts='+str(FLAGS.num_tracing_attempts)) + cmd.append('--include_dataset_ops='+str(FLAGS.include_dataset_ops).lower()) subprocess.call(cmd) diff --git a/tensorflow/contrib/tpu/profiler/pip_package/setup.py b/tensorflow/contrib/tpu/profiler/pip_package/setup.py index 92196638318..3dffebe6680 100644 --- a/tensorflow/contrib/tpu/profiler/pip_package/setup.py +++ b/tensorflow/contrib/tpu/profiler/pip_package/setup.py @@ -20,7 +20,7 @@ from __future__ import print_function from setuptools import setup -_VERSION = '1.4.3-a2' +_VERSION = '1.5.0-rc1' CONSOLE_SCRIPTS = [ 'capture_tpu_profile=cloud_tpu_profiler.main:run_main', diff --git a/tensorflow/contrib/tpu/profiler/version.h b/tensorflow/contrib/tpu/profiler/version.h index 0f645a54929..dc6a9348911 100644 --- a/tensorflow/contrib/tpu/profiler/version.h +++ b/tensorflow/contrib/tpu/profiler/version.h @@ -16,6 +16,6 @@ limitations under the License. #ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_ #define TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_ -#define TPU_PROFILER_VERSION "1.4.3" +#define TPU_PROFILER_VERSION "1.5.0" #endif // TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_ diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py index 8fec379aad8..d5f54ff4fd2 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu.py @@ -153,10 +153,11 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): raise NotImplementedError( "Non-resource Variables are not supported inside TPU computations " "(operator name: %s)" % op.name) - # pylint: enable=protected-access if _TPU_REPLICATE_ATTR in op.node_def.attr: raise ValueError("TPU computations cannot be nested") - op.node_def.attr[_TPU_REPLICATE_ATTR].s = compat.as_bytes(self._name) + op._set_attr(_TPU_REPLICATE_ATTR, + attr_value_pb2.AttrValue(s=compat.as_bytes(self._name))) + # pylint: enable=protected-access op.graph.prevent_feeding(op) op.graph.prevent_fetching(op) diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py index 0c2580211ab..188db6e2f0d 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py @@ -53,7 +53,8 @@ class TPUConfig( num_shards: The number of TPU shards in the system. per_host_input_for_training: If `True`, `input_fn` is invoked Per-Host rather than Per-Core. With Per-Host input pipeline deployment, `input_fn` - is invoked once on each host. To be precise, with a global batch size + is invoked once on each host. With Per-Core input pipeline deployment, it + is invoked once for each core. To be precise, with a global batch size `train_batch_size` in `TPUEstimator` constructor, the batch size for each shard is `train_batch_size` // #hosts. With Per-Core input pipeline deployment, the shard batch size is `train_batch_size` // #cores. 
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py index 2ae3a26a853..56793f11d9c 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py @@ -21,6 +21,7 @@ from __future__ import print_function import collections from contextlib import contextmanager import copy +import signal import threading import time import traceback @@ -29,6 +30,7 @@ import six from six.moves import queue as Queue # pylint: disable=redefined-builtin from six.moves import xrange # pylint: disable=redefined-builtin +from tensorflow.contrib.summary import summary_ops as contrib_summary from tensorflow.contrib.tpu.python.ops import tpu_ops from tensorflow.contrib.tpu.python.tpu import tpu from tensorflow.contrib.tpu.python.tpu import tpu_config @@ -39,11 +41,13 @@ from tensorflow.contrib.tpu.python.tpu import util as util_lib from tensorflow.core.framework.summary_pb2 import Summary from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.estimator import estimator as estimator_lib from tensorflow.python.estimator import model_fn as model_fn_lib from tensorflow.python.estimator import util from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops @@ -59,6 +63,7 @@ from tensorflow.python.training import evaluation from tensorflow.python.training import session_run_hook from tensorflow.python.training import training from tensorflow.python.training import training_util +from tensorflow.python.util import tf_inspect _INITIAL_LOSS = 1e7 _ZERO_LOSS = 0. @@ -68,7 +73,12 @@ _BATCH_SIZE_KEY = 'batch_size' _CROSS_REPLICA_SUM_OP = 'CrossReplicaSum' _RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY] -# TODO(b/65703635): Flip the value and remove all dead code. + +# TODO(b/65703635): Flip the value and remove all dead code. Currently, this is +# only used for per-core based deployments. For per-host based pipelines, if a +# user returns a Dataset instance it will be automatically wrapped in a +# tf.while_loop (This can be disabled by returning features and labels +# explicitly). _WRAP_INPUT_FN_INTO_WHILE_LOOP = False @@ -162,10 +172,12 @@ class _TPUContext(object): ``` """ - def __init__(self, config, train_batch_size, eval_batch_size, use_tpu): + def __init__(self, config, train_batch_size, eval_batch_size, + predict_batch_size, use_tpu): self._config = config self._train_batch_size = train_batch_size self._eval_batch_size = eval_batch_size + self._predict_batch_size = predict_batch_size self._use_tpu = use_tpu self._num_shards_or_none = self._config.tpu_config.num_shards self._mode = None @@ -210,39 +222,66 @@ class _TPUContext(object): return (self._mode == model_fn_lib.ModeKeys.TRAIN and not self._config.tpu_config.per_host_input_for_training) - def is_running_on_cpu(self): - """Determines whether the input_fn and model_fn should be invoked on CPU.""" + def is_running_on_cpu(self, is_export_mode=False): + """Determines whether the input_fn and model_fn should be invoked on CPU. + + Args: + is_export_mode: Indicates whether the current mode is for exporting the + model, when mode == PREDICT. 
Only with this bool, we could + tell whether user is calling the Estimator.predict or + Estimator.export_savedmodel, which are running on TPU and CPU + respectively. Parent class Estimator does not distingush these two. + + Returns: + bool, whether current input_fn or model_fn should be running on CPU. + + Raises: + ValueError: any configuration is invalid. + """ mode = self._assert_mode() - return ((not self._use_tpu) or mode == model_fn_lib.ModeKeys.PREDICT or - (mode == model_fn_lib.ModeKeys.EVAL and - self._eval_batch_size is None)) + + if not self._use_tpu: + return True + + if mode != model_fn_lib.ModeKeys.PREDICT: + return False + + # There are actually 2 use cases when running with mode.PREDICT: prediction + # and saving the model. We run actual predictions on the TPU, but + # model export is run on the CPU. + if is_export_mode: + return True + + if self._predict_batch_size is None: + raise ValueError( + 'predict_batch_size in TPUEstimator constructor should not be ' + '`None` if .predict is running on TPU.') + if self.num_hosts > 1: + raise ValueError( + 'TPUEstimator.predict should be running on single host.') + + return False @property def global_batch_size(self): mode = self._assert_mode() - if mode == model_fn_lib.ModeKeys.EVAL and self._eval_batch_size is None: - raise RuntimeError('Internal error, EVAL on TPU is not enabled, but ' - '`global_batch_size` is called.') - return (self._train_batch_size - if mode == model_fn_lib.ModeKeys.TRAIN else self._eval_batch_size) + if mode == model_fn_lib.ModeKeys.TRAIN: + return self._train_batch_size + elif mode == model_fn_lib.ModeKeys.EVAL: + return self._eval_batch_size + elif mode == model_fn_lib.ModeKeys.PREDICT: + return self._predict_batch_size + else: + return None @property def batch_size_for_input_fn(self): """Returns the shard batch size for `input_fn`.""" - mode = self._assert_mode() - # Special case for eval. - if mode == model_fn_lib.ModeKeys.EVAL and self._eval_batch_size is None: - return None - if self.is_running_on_cpu(): - if mode == model_fn_lib.ModeKeys.TRAIN: - return self._train_batch_size - if mode == model_fn_lib.ModeKeys.EVAL: - return self._eval_batch_size - return None + global_batch_size = self.global_batch_size + + if self.is_running_on_cpu(): + return global_batch_size - global_batch_size = ( - self._train_batch_size - if mode == model_fn_lib.ModeKeys.TRAIN else self._eval_batch_size) # On TPU if self.is_input_sharded_per_core(): return global_batch_size // self.num_cores @@ -252,22 +291,13 @@ class _TPUContext(object): @property def batch_size_for_model_fn(self): """Returns the shard batch size for `model_fn`.""" - mode = self._assert_mode() - # Special case for eval. - if mode == model_fn_lib.ModeKeys.EVAL and self._eval_batch_size is None: - return None + global_batch_size = self.global_batch_size + if self.is_running_on_cpu(): - if mode == model_fn_lib.ModeKeys.TRAIN: - return self._train_batch_size - if mode == model_fn_lib.ModeKeys.EVAL: - return self._eval_batch_size - return None + return global_batch_size # On TPU. always sharded per core. - if mode == model_fn_lib.ModeKeys.TRAIN: - return self._train_batch_size // self.num_cores - else: - return self._eval_batch_size // self.num_cores + return global_batch_size // self.num_cores @property def master_job(self): @@ -384,7 +414,8 @@ class TPUEstimatorSpec( 'train_op', 'eval_metrics', 'export_outputs', - 'scaffold_fn' + 'scaffold_fn', + 'host_call' ])): """Ops and objects returned from a `model_fn` and passed to `TPUEstimator`. 
@@ -410,6 +441,15 @@ class TPUEstimatorSpec( `scaffold_fn` is a function running on CPU to generate the `Scaffold`. This function should not capture any Tensors in `model_fn`. + + `host_call` is a tuple of a `function` and a list or dictionary of `tensors` + to pass to that function and returns a list of Tensors. `host_call` currently + works for train() and evaluate(). The Tensors returned by the function is + executed on the CPU on every step, so there is communication overhead when + sending tensors from TPU to CPU. To reduce the overhead, try reducing the + size of the tensors. The `tensors` are concatenated along their major (batch) + dimension, and so must be >= rank 1. The `host_call` is useful for writing + summaries with @{tf.contrib.summary.create_file_writer}. """ def __new__(cls, @@ -419,10 +459,15 @@ class TPUEstimatorSpec( train_op=None, eval_metrics=None, export_outputs=None, - scaffold_fn=None): + scaffold_fn=None, + host_call=None): """Creates a validated `TPUEstimatorSpec` instance.""" + host_calls = {} if eval_metrics is not None: - _EvalMetrics.validate(eval_metrics) + host_calls['eval_metrics'] = eval_metrics + if host_call is not None: + host_calls['host_call'] = host_call + _OutfeedHostCall.validate(host_calls) return super(TPUEstimatorSpec, cls).__new__( cls, mode=mode, @@ -431,12 +476,23 @@ class TPUEstimatorSpec( train_op=train_op, eval_metrics=eval_metrics, export_outputs=export_outputs, - scaffold_fn=scaffold_fn) + scaffold_fn=scaffold_fn, + host_call=host_call) def as_estimator_spec(self): """Creates an equivalent `EstimatorSpec` used by CPU train/eval.""" - eval_metric_ops = _EvalMetrics.to_metric_metric_ops_for_cpu( - self.eval_metrics) + host_calls = {} + if self.eval_metrics is not None: + host_calls['eval_metrics'] = self.eval_metrics + if self.host_call is not None: + host_calls['host_call'] = wrap_hostcall_with_global_step(self.host_call) + host_call_ret = _OutfeedHostCall.create_cpu_hostcall(host_calls) + eval_metric_ops = None + if self.eval_metrics is not None: + eval_metric_ops = host_call_ret['eval_metrics'] + hooks = None + if self.host_call is not None: + hooks = [_OutfeedHostCallHook(host_call_ret['host_call'])] scaffold = self.scaffold_fn() if self.scaffold_fn else None return model_fn_lib.EstimatorSpec( mode=self.mode, @@ -445,7 +501,10 @@ class TPUEstimatorSpec( train_op=self.train_op, eval_metric_ops=eval_metric_ops, export_outputs=self.export_outputs, - scaffold=scaffold) + scaffold=scaffold, + training_hooks=hooks, + evaluation_hooks=hooks, + prediction_hooks=hooks) class _OpQueueContext(object): @@ -467,12 +526,12 @@ class _OpQueueContext(object): def read_iteration_counts(self): while True: - signal = self._queue.get(block=True) - logging.debug('%s read signal %s', self._name, signal) - if signal == _SIGNAL.STOP: - logging.info('%s received signal, stopping.', self._name) + iterations = self._queue.get(block=True) + logging.debug('%s read iterations %s', self._name, iterations) + if iterations == _SIGNAL.STOP: + logging.info('%s received shutdown signal, stopping.', self._name) return - yield signal + yield iterations def join(self): logging.info('Shutting down %s thread.' % self._name) @@ -480,6 +539,22 @@ class _OpQueueContext(object): self._thread.join() +class _OpSignalOnceQueueContext(_OpQueueContext): + """Manages work queue and thread for a infeed/outfeed thread. + + This subclass only signals once. 
+ """ + + def __init__(self, name, target, args): + super(_OpSignalOnceQueueContext, self).__init__(name, target, args) + self._has_signaled = False + + def send_next_batch_signal(self, iterations): + if not self._has_signaled: + self._queue.put(iterations) + self._has_signaled = True + + class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): """A Session hook setting up the TPU initialization, infeed, and outfeed. @@ -489,12 +564,19 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): dequeue. """ - def __init__(self, ctx, enqueue_ops, dequeue_ops=None): + def __init__(self, + ctx, + enqueue_ops, + dequeue_ops, + run_infeed_loop_on_coordinator=True): self._master_job = ctx.master_job self._enqueue_ops = enqueue_ops self._dequeue_ops = dequeue_ops + + self._run_infeed_loop_on_coordinator = run_infeed_loop_on_coordinator self._initial_infeed_sleep_secs = ( ctx.config.tpu_config.initial_infeed_sleep_secs) + self._session_cancel_timer = None self._feed_error = None @@ -503,8 +585,15 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): def begin(self): logging.info('TPU job name %s', self._master_job) self._iterations_per_loop_var = _create_or_get_iterations_per_loop() - self._init_op = [tpu.initialize_system(job=self._master_job)] - self._finalize_op = [tpu.shutdown_system(job=self._master_job)] + self._init_ops = [tpu.initialize_system(job=self._master_job)] + self._finalize_ops = [tpu.shutdown_system(job=self._master_job)] + + summary_writer_init_ops = contrib_summary.summary_writer_initializer_op() + self._init_ops.extend(summary_writer_init_ops) + # Get all the writer resources from the initializer, so we know what to + # flush. + for op in summary_writer_init_ops: + self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0])) def _log_error(self, session, error): """Log an infeed or outfeed error. @@ -516,8 +605,9 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): emitting a stack trace for the infeed. Args: - session: `tf.Session`, session to be terminated - error: exception that triggered logging. + session: `tf.Session`, session to be terminated error: exception that + triggered logging. + error: the Exception to log. 
""" logging.warning( '\n\n' @@ -569,15 +659,15 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): logging.info('%s thread starting after sleep', self._name) try: - if _WRAP_INPUT_FN_INTO_WHILE_LOOP: - for _ in queue_ctx.read_iteration_counts(): - session.run(self._enqueue_ops) - else: + if self._run_infeed_loop_on_coordinator: for count, steps in enumerate(queue_ctx.read_iteration_counts()): for i in xrange(steps): logging.debug('Infeed enqueue for iteration (%d, %d)', count, i) session.run(self._enqueue_ops) - logging.debug('Infeed thread finished, shutting down.') + else: + for _ in queue_ctx.read_iteration_counts(): + session.run(self._enqueue_ops) + logging.info('Infeed thread finished, shutting down.') except Exception as e: # pylint: disable=broad-except self._log_error(session, e) @@ -588,23 +678,25 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): for i in xrange(steps): logging.debug('Outfeed dequeue for iteration (%d, %d)', count, i) session.run(self._dequeue_ops) + logging.info('Outfeed thread finished, shutting down.') except Exception as e: # pylint: disable=broad-except self._log_error(session, e) + def _create_infeed_controller(self, name, target, args): + return _OpQueueContext(name=name, target=target, args=args) + def after_create_session(self, session, coord): logging.info('Init TPU system') - session.run( - self._init_op, - options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000)) + session.run(self._init_ops, + options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000)) logging.info('Start infeed thread controller') - self._infeed_controller = _OpQueueContext( + self._infeed_controller = self._create_infeed_controller( name='InfeedController', target=self._run_infeed, args=(session,)) - if self._dequeue_ops is not None: - logging.info('Start outfeed thread controller') - self._outfeed_controller = _OpQueueContext( - name='OutfeedController', target=self._run_outfeed, args=(session,)) + logging.info('Start outfeed thread controller') + self._outfeed_controller = _OpQueueContext( + name='OutfeedController', target=self._run_outfeed, args=(session,)) def before_run(self, run_context): if self._feed_error: @@ -617,11 +709,9 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): logging.info('Enqueue next (%d) batch(es) of data to infeed.', iterations) self._infeed_controller.send_next_batch_signal(iterations) - if self._dequeue_ops is not None: - # TODO(xiejw): Refactor the outfeed dequeue into tf.while_loop. 
- logging.info('Dequeue next (%d) batch(es) of data from outfeed.', - iterations) - self._outfeed_controller.send_next_batch_signal(iterations) + logging.info('Dequeue next (%d) batch(es) of data from outfeed.', + iterations) + self._outfeed_controller.send_next_batch_signal(iterations) def end(self, session): if self._session_cancel_timer: @@ -632,12 +722,21 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): logging.info('Stop infeed thread controller') self._infeed_controller.join() - if self._dequeue_ops is not None: - logging.info('Stop output thread controller') - self._outfeed_controller.join() + logging.info('Stop output thread controller') + self._outfeed_controller.join() logging.info('Shutdown TPU system.') - session.run(self._finalize_op) + session.run(self._finalize_ops) + + +class TPUInfeedOutfeedSessionHookForPrediction(TPUInfeedOutfeedSessionHook): + + def __init__(self, ctx, enqueue_ops, dequeue_ops): + super(TPUInfeedOutfeedSessionHookForPrediction, self).__init__( + ctx, enqueue_ops, dequeue_ops, run_infeed_loop_on_coordinator=False) + + def _create_infeed_controller(self, name, target, args): + return _OpSignalOnceQueueContext(name=name, target=target, args=args) class _TPUStopAtStepHook(session_run_hook.SessionRunHook): @@ -727,6 +826,47 @@ class _SetEvalIterationsHook(session_run_hook.SessionRunHook): self._iterations_per_loop_var.load(self._num_steps, session=session) +class _StoppingPredictHook(session_run_hook.SessionRunHook): + """Hook that requests stop according to the stopping signal in prediction.""" + + def __init__(self, scalar_stopping_signal): + self._scalar_stopping_signal = scalar_stopping_signal + + def begin(self): + self._iterations_per_loop_var = _create_or_get_iterations_per_loop() + + def after_create_session(self, session, coord): + # This is not necessary as we do not run infeed enqueue and outfeed dequeue + # in side threads for prediction model. But it makes the + # TPUInfeedOutfeedSessionHook prints nice message. + self._iterations_per_loop_var.load(1, session=session) + + def before_run(self, run_context): + return session_run_hook.SessionRunArgs(self._scalar_stopping_signal) + + def after_run(self, run_context, run_values): + _ = run_context + scalar_stopping_signal = run_values.results + if _StopSignals.should_stop(scalar_stopping_signal): + # NOTE(xiejw): In prediction, stopping signals are inserted for each + # batch. And we append one more batch to signal the system it should stop. + # The data flow might look like + # + # batch 0: images, labels, stop = 0 (user provideded) + # batch 1: images, labels, stop = 0 (user provideded) + # ... + # batch 99: images, labels, stop = 0 (user provideded) + # batch 100: images, labels, stop = 1 (TPUEstimator appended) + # + # where the final batch (id = 100) is appended by TPUEstimator, so we + # should drop it before returning the predictions to user. + # To achieve that, we throw the OutOfRangeError in after_run. 
Once + # Monitored Session sees this error in SessionRunHook.after_run, the + # "current" prediciton, i.e., batch with id=100, will be discarded + # immediately + raise errors.OutOfRangeError(None, None, 'Stopped by stopping signal.') + + def generate_per_core_enqueue_ops_fn_for_host(ctx, input_fn, inputs_structure_recorder): """Generates infeed enqueue ops for per-core input_fn on a single host.""" @@ -738,11 +878,14 @@ def generate_per_core_enqueue_ops_fn_for_host(ctx, input_fn, per_host_sharded_inputs = [] for core_ordinal in range(num_cores_per_host): with ops.name_scope('ordinal_%d' % (core_ordinal)): - inputs = input_fn() - if isinstance(inputs, tuple): - features, labels = inputs - else: - features, labels = inputs, None + inputs = _Inputs.from_input_fn(input_fn()) + if inputs.is_dataset: + raise TypeError( + '`input_fn` returning `Dataset` is not yet supported in ' + 'per-Core input pipeline deployment yet. Please set ' + 'TPUConfig.per_host_input_for_training to True or return ' + '`features` and `labels` from `input_fn`') + features, labels = inputs.features_and_labels() inputs_structure_recorder.validate_and_record_structure( features, labels) @@ -769,18 +912,37 @@ def generate_per_host_enqueue_ops_fn_for_host( """Generates infeed enqueue ops for per-host input_fn on a single host.""" captured_infeed_queue = _CapturedObject() + hooks = [] + + with ops.device(device): + inputs = _Inputs.from_input_fn(input_fn()) + + is_dataset = inputs.is_dataset + if ctx.mode == model_fn_lib.ModeKeys.PREDICT: + if not is_dataset: + raise TypeError( + 'For mode PREDICT, `input_fn` must return `Dataset` instead of ' + '`features` and `labels`.') + inputs = _InputsWithStoppingSignals( + dataset=inputs.dataset, batch_size=ctx.batch_size_for_input_fn) + + if is_dataset: + hooks.append(inputs.dataset_initializer_hook()) + def enqueue_ops_fn(): with ops.device(device): num_cores_per_host = ctx.num_of_cores_per_host - inputs = input_fn() - if isinstance(inputs, tuple): - features, labels = inputs - else: - features, labels = inputs, None - inputs_structure_recorder.validate_and_record_structure(features, labels) + # Convert user input to features and labels. If the user returns a + # dataset, it is initialized and the features and labels extracted via + # `dataset.iterator.get_next()` + features, labels = inputs.features_and_labels() + signals = inputs.signals() + + inputs_structure_recorder.validate_and_record_structure( + features, labels, signals) unsharded_tensor_list = ( inputs_structure_recorder.flatten_features_and_labels( - features, labels)) + features, labels, signals)) infeed_queue = tpu_feed.InfeedQueue( tuple_types=[t.dtype for t in unsharded_tensor_list], @@ -792,9 +954,15 @@ def generate_per_host_enqueue_ops_fn_for_host( per_host_enqueue_ops = ( infeed_queue.split_inputs_and_generate_enqueue_ops( unsharded_tensor_list, placement_function=lambda x: device)) - return per_host_enqueue_ops + if signals is None: + return per_host_enqueue_ops + else: + return { + 'ops': per_host_enqueue_ops, + 'signals': signals, + } - return enqueue_ops_fn, captured_infeed_queue + return enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset class _InputPipeline(object): @@ -834,6 +1002,7 @@ class _InputPipeline(object): self._feature_names = [] self._label_names = [] self._has_labels = False + self._signals_helper = None # Internal state. 
self._initialized = False @@ -841,7 +1010,7 @@ class _InputPipeline(object): def has_labels(self): return self._has_labels - def validate_and_record_structure(self, features, labels): + def validate_and_record_structure(self, features, labels, signals=None): """Validates and records the structure of features` and `labels`.""" def _extract_key_names(tensor_or_dict): @@ -854,6 +1023,10 @@ class _InputPipeline(object): feature_names = _extract_key_names(features) label_names = _extract_key_names(labels) + if signals is not None and self._signals_helper is None: + # Record signals helper. + self._signals_helper = _SignalsHelper(signals) + if self._initialized: # Verify the structure is same. The following should never happen. assert feature_names == self._feature_names, 'feature keys mismatched' @@ -866,7 +1039,7 @@ class _InputPipeline(object): self._label_names = label_names self._has_labels = has_labels - def flatten_features_and_labels(self, features, labels): + def flatten_features_and_labels(self, features, labels, signals=None): """Flattens the `features` and `labels` to a single tensor list.""" flattened_inputs = [] if self._feature_names: @@ -882,6 +1055,9 @@ class _InputPipeline(object): flattened_inputs.extend([labels[name] for name in self._label_names]) else: flattened_inputs.append(labels) + + if signals is not None: + flattened_inputs.extend(_SignalsHelper.as_tensor_list(signals)) return flattened_inputs def unflatten_features_and_labels(self, flattened_inputs): @@ -907,7 +1083,11 @@ class _InputPipeline(object): else: expected_num_labels = 0 - expected_num_tensors = expected_num_features + expected_num_labels + expected_num_signals = ( + self._signals_helper.num_signals if self._signals_helper else 0) + + expected_num_tensors = ( + expected_num_features + expected_num_labels + expected_num_signals) if expected_num_tensors != len(flattened_inputs): raise ValueError( @@ -924,13 +1104,20 @@ class _InputPipeline(object): if expected_num_labels == 0: unflattened_label = None elif self._label_names: - unflattened_label = dict( - zip(self._label_names, flattened_inputs[expected_num_features:])) + label_list = flattened_inputs[ + expected_num_features:expected_num_features + expected_num_labels] + unflattened_label = dict(zip(self._label_names, label_list)) else: # Single tensor case. unflattened_label = flattened_inputs[expected_num_features] - return unflattened_features, unflattened_label + signals = None + if expected_num_signals != 0: + tensor_list_for_signals = flattened_inputs[ + expected_num_features + expected_num_labels:] + signals = self._signals_helper.unflatten(tensor_list_for_signals) + + return _Inputs(unflattened_features, unflattened_label, signals=signals) def __init__(self, input_fn, batch_axis, ctx): """Constructor. @@ -958,7 +1145,8 @@ class _InputPipeline(object): # While tf.while_loop is called, the body function, which invokes # `enqueue_fn` passed in, is called to construct the graph. So, input_fn # structure is recorded. 
- enqueue_ops = self._invoke_input_fn_and_record_structure() + enqueue_ops, all_hooks, run_infeed_loop_on_coordinator = ( + self._invoke_input_fn_and_record_structure()) self._validate_input_pipeline() @@ -969,14 +1157,18 @@ class _InputPipeline(object): return self._inputs_structure_recorder.unflatten_features_and_labels( values) - return (enqueue_ops, dequeue_fn) + return (enqueue_ops, dequeue_fn, all_hooks, run_infeed_loop_on_coordinator) def _invoke_input_fn_and_record_structure(self): """Deploys the input pipeline and record input structure.""" enqueue_ops = [] infeed_queues = [] + all_hooks = [] num_hosts = self._ctx.num_hosts tpu_host_placement_fn = self._ctx.tpu_host_placement_function + + run_infeed_loop_on_coordinator = True + if self._sharded_per_core: # Per-Core input pipeline deployment. # Invoke input pipeline for each core and placed on the corresponding @@ -990,6 +1182,7 @@ class _InputPipeline(object): self._ctx, self._input_fn, self._inputs_structure_recorder)) if _WRAP_INPUT_FN_INTO_WHILE_LOOP: + run_infeed_loop_on_coordinator = False enqueue_ops.append( _wrap_computation_in_while_loop( device=host_device, op_fn=enqueue_ops_fn)) @@ -1003,15 +1196,32 @@ class _InputPipeline(object): host_device = tpu_host_placement_fn(host_id=host_id) with ops.device(host_device): with ops.name_scope('input_pipeline_task%d' % (host_id)): - enqueue_ops_fn, captured_infeed_queue = ( + enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset = ( generate_per_host_enqueue_ops_fn_for_host( self._ctx, self._input_fn, self._inputs_structure_recorder, self._batch_axis, host_device)) + all_hooks.extend(hooks) - if _WRAP_INPUT_FN_INTO_WHILE_LOOP: + # NOTE(xiejw): We dispatch here based on the return type of the + # users `input_fn`. + # + # 1. If input_fn returns a Dataset instance, we initialize the + # iterator outside of tf.while_loop, and call the iterator.get_next + # inside tf.while_loop. This should be always safe. + # + # 2. If input_fn returns (features, labels), it is too late to wrap + # them inside tf.while_loop, as resource initialization cannot be + # handled in TF control flow properly. In this case, we will use + # python loop to enqueue the data into TPU system. This may be + # slow compared to the previous case. + if is_dataset: + run_infeed_loop_on_coordinator = False + wrap_fn = ( + _wrap_computation_in_while_loop + if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else + _wrap_computation_in_while_loop_with_stopping_signals) enqueue_ops.append( - _wrap_computation_in_while_loop( - device=host_device, op_fn=enqueue_ops_fn)) + wrap_fn(device=host_device, op_fn=enqueue_ops_fn)) else: enqueue_ops.append(enqueue_ops_fn()) infeed_queues.append(captured_infeed_queue.get()) @@ -1019,7 +1229,7 @@ class _InputPipeline(object): # dequeue is dtypes and types. So, any one can be used. Here, grab the # first one. self._infeed_queue = infeed_queues[0] - return enqueue_ops + return enqueue_ops, all_hooks, run_infeed_loop_on_coordinator def _validate_input_pipeline(self): # Perform some sanity checks to log user friendly information. We should @@ -1076,29 +1286,38 @@ class _ModelFnWrapper(object): infeed dequeue channel. Returns: - A Fn representing the train step for TPU. + A tuple of train_fn, host_calls, and captured scaffold_fn. The train_fn + representing the train step for TPU. 
""" + host_call = _OutfeedHostCall(self._ctx) captured_scaffold_fn = _CapturedObject() def train_step(loss): """Training step function for use inside a while loop.""" del loss # unused; required in function signature. - features, labels = dequeue_fn() + inputs = dequeue_fn() + features, labels = inputs.features_and_labels() estimator_spec = self._verify_estimator_spec( self._call_model_fn(features, labels)) loss, train_op = estimator_spec.loss, estimator_spec.train_op + host_call_outfeed_ops = [] if isinstance(estimator_spec, TPUEstimatorSpec): captured_scaffold_fn.capture(estimator_spec.scaffold_fn) + if estimator_spec.host_call is not None: + host_call.record({ + 'host_call': wrap_hostcall_with_global_step( + estimator_spec.host_call)}) + host_call_outfeed_ops = host_call.create_enqueue_op() else: captured_scaffold_fn.capture(None) - with ops.control_dependencies([train_op]): + with ops.control_dependencies([train_op] + host_call_outfeed_ops): return array_ops.identity(loss) - return train_step, captured_scaffold_fn + return train_step, host_call, captured_scaffold_fn def convert_to_single_tpu_eval_step(self, dequeue_fn): """Converts user provided model_fn` as a single eval step on TPU. @@ -1123,15 +1342,16 @@ class _ModelFnWrapper(object): infeed dequeue channel. Returns: - A tuple of eval_fn and eval_metrics. The eval_fn representing the eval - step for TPU. and eval_metrics is an `_EvalMetrics` instance. + A tuple of eval_fn, host_calls, and captured scaffold_fn. The eval_fn + representing the eval step for TPU. """ - eval_metrics = _EvalMetrics(self._ctx) + host_calls = _OutfeedHostCall(self._ctx) captured_scaffold_fn = _CapturedObject() def eval_step(total_loss): """Evaluation step function for use inside a while loop.""" - features, labels = dequeue_fn() + inputs = dequeue_fn() + features, labels = inputs.features_and_labels() tpu_estimator_spec = self._call_model_fn(features, labels) if not isinstance(tpu_estimator_spec, TPUEstimatorSpec): @@ -1141,15 +1361,68 @@ class _ModelFnWrapper(object): loss = tpu_estimator_spec.loss captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn) - eval_metrics.record(tpu_estimator_spec) - outfeed_ops = tpu_ops.outfeed_enqueue_tuple(eval_metrics.outfeed_tensors) + to_record = {} + to_record['eval_metrics'] = tpu_estimator_spec.eval_metrics + if tpu_estimator_spec.host_call is not None: + # We assume that evaluate won't update global step, so we don't wrap + # this host_call. + to_record['host_call'] = tpu_estimator_spec.host_call + host_calls.record(to_record) - with ops.control_dependencies([outfeed_ops]): + with ops.control_dependencies(host_calls.create_enqueue_op()): return math_ops.add(total_loss, loss) - return eval_step, eval_metrics, captured_scaffold_fn + return eval_step, host_calls, captured_scaffold_fn - def _call_model_fn(self, features, labels): + def convert_to_single_tpu_predict_step(self, dequeue_fn): + """Converts user provided model_fn` as a single predict step on TPU. + + Args: + dequeue_fn: The function to retrieve inputs, features and labels, from TPU + infeed dequeue channel. + + Returns: + A tuple of predict_fn, host_calls, and captured scaffold_fn. The + predict_fn representing the predict step for TPU. 
+ """ + host_calls = _OutfeedHostCall(self._ctx) + captured_scaffold_fn = _CapturedObject() + + def predict_step(unused_scalar_stopping_signal): + """Evaluation step function for use inside a while loop.""" + inputs = dequeue_fn() + features, labels = inputs.features_and_labels() + stopping_signals = inputs.signals() + + assert stopping_signals is not None, ( + 'Internal Error: `signals` is missing.') + + tpu_estimator_spec = self._call_model_fn( + features, labels, is_export_mode=False) + if not isinstance(tpu_estimator_spec, TPUEstimatorSpec): + raise RuntimeError( + 'estimator_spec used by TPU prediction must have type' + '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec))) + + captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn) + to_record = {} + identity_fn = lambda **kwargs: kwargs + # TODO(xiejw): Adds validation for prediction dictionrary. + # TODO(xiejw): Adds support for single tensor as predictions. + if not isinstance(tpu_estimator_spec.predictions, dict): + raise TypeError('TPUEstimatorSpec.predictions must be dict of Tensors.') + to_record['predictions'] = [identity_fn, tpu_estimator_spec.predictions] + to_record['signals'] = [identity_fn, stopping_signals] + if tpu_estimator_spec.host_call is not None: + to_record['host_call'] = tpu_estimator_spec.host_call + host_calls.record(to_record) + + with ops.control_dependencies(host_calls.create_enqueue_op()): + return _StopSignals.as_scalar_stopping_signal(stopping_signals) + + return predict_step, host_calls, captured_scaffold_fn + + def _call_model_fn(self, features, labels, is_export_mode=True): """Calls the model_fn with required parameters.""" model_fn_args = util.fn_args(self._model_fn) kwargs = {} @@ -1180,7 +1453,7 @@ class _ModelFnWrapper(object): params[_BATCH_SIZE_KEY] = batch_size_for_model_fn estimator_spec = self._model_fn(features=features, **kwargs) - if (self._ctx.is_running_on_cpu() and + if (self._ctx.is_running_on_cpu(is_export_mode) and isinstance(estimator_spec, TPUEstimatorSpec)): # The estimator_spec will be passed to `Estimator` directly, which expects # type `EstimatorSpec`. @@ -1207,158 +1480,241 @@ class _ModelFnWrapper(object): return estimator_spec -class _EvalMetrics(object): - """Class wraps TPUEstimator.eval_metrics.""" +class _OutfeedHostCall(object): + """Support for `eval_metrics` and `host_call` in TPUEstimatorSpec.""" def __init__(self, ctx): self._ctx = ctx - self._metric_fn = None - self._is_dict = False - self._tensor_keys = [] - self._tensors = [] - self._tensor_dtypes = [] - self._tensor_shapes = [] - self._recorded = False + self._names = [] + # All of these are dictionaries of lists keyed on the name. 
+ self._host_fns = {} + self._tensor_keys = collections.defaultdict(list) + self._tensors = collections.defaultdict(list) + self._tensor_dtypes = collections.defaultdict(list) + self._tensor_shapes = collections.defaultdict(list) @staticmethod - def validate(eval_metrics): - """Validates the `eval_metrics` in `TPUEstimatorSpec`.""" + def validate(host_calls): + """Validates the `eval_metrics` and `host_call` in `TPUEstimatorSpec`.""" - if not isinstance(eval_metrics, (tuple, list)): - raise ValueError('eval_metrics should be tuple or list') - if len(eval_metrics) != 2: - raise ValueError('eval_metrics should have two elements.') - if not callable(eval_metrics[0]): - raise TypeError('eval_metrics[0] should be callable.') - if not isinstance(eval_metrics[1], (tuple, list, dict)): - raise ValueError('eval_metrics[1] should be tuple or list, or dict.') + for name, host_call in host_calls.items(): + if not isinstance(host_call, (tuple, list)): + raise ValueError('{} should be tuple or list'.format(name)) + if len(host_call) != 2: + raise ValueError('{} should have two elements.'.format(name)) + if not callable(host_call[0]): + raise TypeError('{}[0] should be callable.'.format(name)) + if not isinstance(host_call[1], (tuple, list, dict)): + raise ValueError('{}[1] should be tuple or list, or dict.'.format(name)) - if isinstance(eval_metrics[1], (tuple, list)): - fn_args = util.fn_args(eval_metrics[0]) - if len(eval_metrics[1]) != len(fn_args): - raise RuntimeError( - 'In TPUEstimatorSpec.eval_metrics, length of tensors does not ' - 'match method args of metric_fn.') + if isinstance(host_call[1], (tuple, list)): + fullargspec = tf_inspect.getfullargspec(host_call[0]) + fn_args = util.fn_args(host_call[0]) + # wrapped_hostcall_with_global_step uses varargs, so we allow that. + if fullargspec.varargs is None and len(host_call[1]) != len(fn_args): + raise RuntimeError( + 'In TPUEstimatorSpec.{}, length of tensors {} does not match ' + 'method args of the function, which takes {}.'.format( + name, len(host_call[1]), len(fn_args))) @staticmethod - def to_metric_metric_ops_for_cpu(eval_metrics): - """Converts `TPUEstimatorSpec.eval_metrics` to `eval_metric_ops` for CPU.""" - if not eval_metrics: - return None + def create_cpu_hostcall(host_calls): + """Runs on the host_call on CPU instead of TPU when use_tpu=False.""" - _EvalMetrics.validate(eval_metrics) + _OutfeedHostCall.validate(host_calls) + ret = {} + for name, host_call in host_calls.items(): + host_fn, tensors = host_call + if isinstance(tensors, (tuple, list)): + ret[name] = host_fn(*tensors) + else: + # Must be dict. + try: + ret[name] = host_fn(**tensors) + except TypeError as e: + logging.warning( + 'Exception while calling %s: %s. It is likely the tensors ' + '(%s[1]) do not match the ' + 'function\'s arguments', name, e, name) + raise e + return ret - metric_fn, tensors = eval_metrics + def record(self, host_calls): + """Records the host_call structure.""" - if isinstance(tensors, (tuple, list)): - return metric_fn(*tensors) - else: - # Must be dict. - try: - return metric_fn(**tensors) - except TypeError as e: - logging.warning( - 'Exception while calling metric_fn for evalution: %s. 
' - 'It is likely the tensors (eval_metrics[1]) do not match the ' - 'metric_fn arguments', e) - raise e + for name, host_call in host_calls.items(): + host_fn, tensor_list_or_dict = host_call + self._names.append(name) + self._host_fns[name] = host_fn - def record(self, spec): - """Records the eval_metrics structure in `spec`.""" - if self._recorded: - raise RuntimeError('Eval metrics have been recorded already.') + if isinstance(tensor_list_or_dict, dict): + for (key, tensor) in six.iteritems(tensor_list_or_dict): + self._tensor_keys[name].append(key) + self._tensors[name].append(tensor) + self._tensor_dtypes[name].append(tensor.dtype) + self._tensor_shapes[name].append(tensor.shape) + else: + # List or tuple. + self._tensor_keys[name] = None + for tensor in tensor_list_or_dict: + self._tensors[name].append(tensor) + self._tensor_dtypes[name].append(tensor.dtype) + self._tensor_shapes[name].append(tensor.shape) - self._metric_fn, tensor_list_or_dict = spec.eval_metrics - - if isinstance(tensor_list_or_dict, dict): - self._is_dict = True - for (key, tensor) in six.iteritems(tensor_list_or_dict): - self._tensor_keys.append(key) - self._tensors.append(tensor) - self._tensor_dtypes.append(tensor.dtype) - self._tensor_shapes.append(tensor.shape) - else: - # List or tuple. - self._is_dict = False - self._tensors = tensor_list_or_dict - for tensor in tensor_list_or_dict: - self._tensor_dtypes.append(tensor.dtype) - self._tensor_shapes.append(tensor.shape) - self._recorded = True - - @property - def outfeed_tensors(self): - if not self._recorded: - raise RuntimeError('Eval metrics have not been recorded yet') - return self._tensors - - def to_metric_metric_ops_for_tpu(self, dummy_update_op): - """Creates the eval_metric_ops now based on the TPU outfeed. - - `eval_metric_ops` is defined in `EstimatorSpec`. From all shards, tensors - are dequeued from outfeed and then concatenated (along batch size dimension) - to form global-like tensors. All global-like tensors are passed to the - metric fn. - - Args: - dummy_update_op: A dummy update op. + def create_enqueue_op(self): + """Create the op to enqueue the recorded host_calls. Returns: - A tuple of (`eval_metric_ops` and `update_ops`), where `update_ops` should - be invoked in Outfeed dequeue thread, which drive the outfeed dequeue and - update the state of metrics. + A list of enqueue ops, which is empty if there are no host calls. + """ + if not self._names: + return [] + + tensors = [] + # TODO(jhseu): Consider deduping tensors. + for name in self._names: + tensors.extend(self._tensors[name]) + return [tpu_ops.outfeed_enqueue_tuple(tensors)] + + def create_tpu_hostcall(self): + """Sends the tensors through outfeed and runs the host_fn on CPU. + + The tensors are concatenated along dimension 0 to form a global tensor + across all shards. The concatenated function is passed to the host_fn and + executed on the first host. + + Returns: + A dictionary mapping name to the return type of the host_call by that + name. Raises: RuntimeError: If outfeed tensor is scalar. """ + if not self._names: + return [] - num_cores = self._ctx.num_cores - + ret = {} # For each i, dequeue_ops[i] is a list containing the tensors from all # shards. This list is concatenated later. 
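    # Illustration (hypothetical shapes): with 2 shards and a recorded tensor of
    # per-shard shape [batch, d], dequeue_ops[i] collects one [batch, d] piece
    # from each shard, and the concat below produces a single [2 * batch, d]
    # tensor that is then passed to the corresponding host_fn.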
dequeue_ops = [] - for i in xrange(len(self._tensors)): - dequeue_ops.append([]) + tensor_dtypes = [] + tensor_shapes = [] + for name in self._names: + for _ in self._tensors[name]: + dequeue_ops.append([]) + for dtype in self._tensor_dtypes[name]: + tensor_dtypes.append(dtype) + for shape in self._tensor_shapes[name]: + tensor_shapes.append(shape) - # Outfeed ops execute on each JF node. + # Outfeed ops execute on each JF node. Note: we must constraint it such that + # we have at most one outfeed dequeue and enqueue. tpu_device_placement_fn = self._ctx.tpu_device_placement_function - for i in xrange(num_cores): + for i in xrange(self._ctx.num_cores): with ops.device(tpu_device_placement_fn(i)): outfeed_tensors = tpu_ops.outfeed_dequeue_tuple( - dtypes=self._tensor_dtypes, shapes=self._tensor_shapes) + dtypes=tensor_dtypes, shapes=tensor_shapes) for j, item in enumerate(outfeed_tensors): dequeue_ops[j].append(item) - # It is assumed evaluation always happends on single host TPU system. So, + # Deconstruct dequeue ops. + dequeue_ops_by_name = {} + pos = 0 + for name in self._names: + dequeue_ops_by_name[name] = dequeue_ops[pos:pos+len(self._tensors[name])] + pos += len(self._tensors[name]) + + # It is assumed evaluation always happens on single host TPU system. So, # place all ops on tpu host if possible. + # + # TODO(jhseu): Evaluate whether this is right for summaries. with ops.device(self._ctx.tpu_host_placement_function(core_id=0)): - for i, item in enumerate(dequeue_ops): - if dequeue_ops[i][0].shape.ndims == 0: - raise RuntimeError( - 'All tensors outfed from TPU should preseve batch size ' - 'dimension, but got scalar {}'.format(dequeue_ops[i][0])) - # TODO(xiejw): Allow users to specify the axis for batch size dimension. - dequeue_ops[i] = array_ops.concat(dequeue_ops[i], axis=0) + for name in self._names: + dequeue_ops = dequeue_ops_by_name[name] + for i, item in enumerate(dequeue_ops): + if dequeue_ops[i][0].shape.ndims == 0: + raise RuntimeError( + 'All tensors outfed from TPU should preserve batch size ' + 'dimension, but got scalar {}'.format(dequeue_ops[i][0])) + # TODO(xiejw): Allow users to specify the axis for batch size + # dimension. + dequeue_ops[i] = array_ops.concat(dequeue_ops[i], axis=0) - if self._is_dict: - dequeue_ops = dict(zip(self._tensor_keys, dequeue_ops)) - try: - eval_metric_ops = self._metric_fn(**dequeue_ops) - except TypeError as e: - logging.warning( - 'Exception while calling metric_fn for evalution: %s. ' - 'It is likely the tensors (eval_metrics[1]) do not match the ' - 'metric_fn arguments', e) - raise e - else: - eval_metric_ops = self._metric_fn(*dequeue_ops) + if self._tensor_keys[name] is not None: + # The user-provided eval_metrics[1] is a dict. + dequeue_ops = dict(zip(self._tensor_keys[name], dequeue_ops)) + try: + ret[name] = self._host_fns[name](**dequeue_ops) + except TypeError as e: + logging.warning( + 'Exception while calling %s: %s. 
It is likely the tensors ' + '(%s[1]) do not match the ' + 'function\'s arguments', name, e, name) + raise e + else: + ret[name] = self._host_fns[name](*dequeue_ops) - eval_update_ops = [] - for k, v in eval_metric_ops.items(): - eval_metric_ops[k] = (v[0], dummy_update_op) - eval_update_ops.append(v[1]) + return ret - return eval_metric_ops, eval_update_ops + +def wrap_hostcall_with_global_step(hostcall): + """Wrap the hostcall so that we update the global step upon every call.""" + if hostcall is None: + return None + host_fn, tensors = hostcall + + def global_step_host_fn(_global_step, *args, **kwargs): # pylint: disable=invalid-name + # Note that we don't have any ordering here, so the graph may see a + # global_step that's off by 1. + state_ops.assign( + training.get_global_step(), + math_ops.cast(_global_step[0], dtypes.int64)) + return host_fn(*args, **kwargs) + # Give the global step tensor a batch dimension. Reshape is not supported for + # int64, so we cast it to int32. + # TODO(jhseu): Remove the cast once int64 is supported. + global_step_tensor = array_ops.reshape( + math_ops.cast(training.get_global_step(), dtypes.int32), [1]) + if isinstance(tensors, dict): + outfeed_tensors = {'_global_step': global_step_tensor} + outfeed_tensors.update(tensors) + return global_step_host_fn, outfeed_tensors + else: + fn_args = util.fn_args(host_fn) + if len(tensors) != len(fn_args): + raise RuntimeError( + 'In TPUEstimatorSpec.host_call, length of tensors {} does not match ' + 'method args of the function, which takes {}.'.format( + len(tensors), len(fn_args))) + return global_step_host_fn, [global_step_tensor] + list(tensors) + + +class _OutfeedHostCallHook(session_run_hook.SessionRunHook): + """Hook to run host calls when use_tpu=False.""" + + def __init__(self, tensors): + self._tensors = tensors + + def begin(self): + # We duplicate this code from the TPUInfeedOutfeedSessionHook rather than + # create a separate hook to guarantee execution order, because summaries + # need to be initialized before the outfeed thread starts. + # TODO(jhseu): Make a wrapper hook instead? + self._init_ops = contrib_summary.summary_writer_initializer_op() + # Get all the writer resources from the initializer, so we know what to + # flush. + self._finalize_ops = [] + for op in self._init_ops: + self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0])) + + def after_create_session(self, session, coord): + session.run(self._init_ops) + + def before_run(self, run_context): + return basic_session_run_hooks.SessionRunArgs(self._tensors) + + def end(self, session): + session.run(self._finalize_ops) class ExamplesPerSecondHook(basic_session_run_hooks.StepCounterHook): @@ -1387,6 +1743,23 @@ class ExamplesPerSecondHook(basic_session_run_hooks.StepCounterHook): logging.info('examples/sec: %g', examples_per_sec) +class InstallSignalHandlerHook(session_run_hook.SessionRunHook): + """Change SIGINT (CTRL^C) handler to force quit the process. + + The default behavior often results in hanging processes. + The original handler is restored after training/evaluation. + """ + + def __init__(self): + self._signal_fn = signal.getsignal(signal.SIGINT) + + def before_run(self, run_context): + signal.signal(signal.SIGINT, signal.SIG_DFL) + + def end(self, session): + signal.signal(signal.SIGINT, self._signal_fn) + + class TPUEstimator(estimator_lib.Estimator): """Estimator with TPU support. 
@@ -1394,30 +1767,28 @@ class TPUEstimator(estimator_lib.Estimator): replicating inputs and models for each core, and returning to host periodically to run hooks. - If `use_tpu` is false, all training, evaluation, and predict are executed on - CPU. - - For training, TPUEstimator transforms a global batch size in params to a - per-shard batch size when calling the `input_fn` and `model_fn`. Users should - specify `train_batch_size` in constructor, and then get the batch size for - each shard in `input_fn` and `model_fn` by `params['batch_size']`. If - `TPUConfig.per_host_input_for_training` is `True`, `input_fn` is invoked per - host rather than per core. In this case, a global batch size is transformed a - per-host batch size in params for `input_fn`, but `model_fn` still gets - per-core batch size. - - For evaluation, if `eval_batch_size` is None, it is executed on CPU, even if - `use_tpu` is `True`. If `eval_batch_size` is not `None`, it is executed on - TPU, which is an experimental feature. In this case, `model_fn` should return - `TPUEstimatorSpec` instead of `EstimatorSpec`, which expects the - `eval_metrics` for TPU evaluation. + TPUEstimator transforms a global batch size in params to a per-shard batch + size when calling the `input_fn` and `model_fn`. Users should specify + global batch size in constructor, and then get the batch size for each shard + in `input_fn` and `model_fn` by `params['batch_size']`. + For training, `model_fn` gets per-core batch size; `input_fn` may get + per-core or per-host batch size depending on + `per_host_input_for_training` in `TPUConfig`. + For evaluation, `model_fn` gets per-core batch size and `input_fn` get + per-host batch size. + `model_fn` should return `TPUEstimatorSpec`, which expects the `eval_metrics` + for TPU evaluation. `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`, where `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. (See `TPUEstimatorSpec` for details). `metric_fn` takes the `tensors` and returns a dict from metric string name to the result of calling a metric function, namely a `(metric_tensor, update_op)` tuple. + One can set `use_tpu` to `False` for testing. All training, evaluation, and + predict will be executed on CPU. `input_fn` and `model_fn` will receive + `train_batch_size` or `eval_batch_size` unmodified as `params['batch_size']`. + Current limitations: 1. TPU evaluation only works on single host. @@ -1472,6 +1843,7 @@ class TPUEstimator(estimator_lib.Estimator): use_tpu=True, train_batch_size=None, eval_batch_size=None, + predict_batch_size=None, batch_axis=None): """Constructs an `TPUEstimator` instance. @@ -1490,18 +1862,17 @@ class TPUEstimator(estimator_lib.Estimator): basic python types. There are reserved keys for `TPUEstimator`, including 'batch_size'. use_tpu: A bool indicating whether TPU support is enabled. Currently, - - TPU training respects this bit. - - If true, see `eval_batch_size` for evaluate support. + - TPU training and evaluation respect this bit. - Predict still happens on CPU. train_batch_size: An int representing the global training batch size. TPUEstimator transforms this global batch size to a per-shard batch size, as params['batch_size'], when calling `input_fn` and `model_fn`. Cannot be `None` if `use_tpu` is `True`. Must be divisible by `config.tpu_config.num_shards`. - eval_batch_size: An int representing the global training batch size. - Currently, if `None`, evaluation is still executed on CPU (even when - `use_tpu` is True). 
In near future, `use_tpu` will be the only option to - switch between TPU/CPU evaluation. + eval_batch_size: An int representing evaluation batch size. + Must be divisible by `config.tpu_config.num_shards`. + predict_batch_size: An int representing the prediction batch size. + Must be divisible by `config.tpu_config.num_shards`. batch_axis: A python tuple of int values describing how each tensor produced by the Estimator `input_fn` should be split across the TPU compute shards. For example, if your input_fn produced (images, labels) @@ -1541,15 +1912,25 @@ class TPUEstimator(estimator_lib.Estimator): .format(train_batch_size, config.tpu_config.num_shards)) if eval_batch_size is not None: - if config.tpu_config.num_shards > 8: - raise NotImplementedError( - 'TPU evaluation is only supported with one host.') - + if not isinstance(eval_batch_size, int): + raise ValueError('`eval_batch_size` must be an int') + if eval_batch_size < 1: + raise ValueError('`eval_batch_size` must be positive') if eval_batch_size % config.tpu_config.num_shards != 0: raise ValueError( 'eval batch size {} must be divisible by number of shards {}' .format(eval_batch_size, config.tpu_config.num_shards)) + if predict_batch_size is not None: + if not isinstance(predict_batch_size, int): + raise ValueError('`predict_batch_size` must be an int') + if predict_batch_size < 1: + raise ValueError('`predict_batch_size` must be positive') + if predict_batch_size % config.tpu_config.num_shards != 0: + raise ValueError( + 'predict batch size {} must be divisible by number of shards {}' + .format(predict_batch_size, config.tpu_config.num_shards)) + # Verifies the model_fn signature according to Estimator framework. estimator_lib._verify_model_fn_args(model_fn, params) # pylint: disable=protected-access # We cannot store config and params in this constructor as parent @@ -1569,7 +1950,7 @@ class TPUEstimator(estimator_lib.Estimator): # All properties passed to _TPUContext are immutable. self._ctx = _TPUContext(self._config, train_batch_size, eval_batch_size, - use_tpu) + predict_batch_size, use_tpu) def _create_global_step(self, graph): """Creates a global step suitable for TPUs. @@ -1617,6 +1998,14 @@ class TPUEstimator(estimator_lib.Estimator): util_lib.check_positive_integer(steps, 'Eval steps') + if self._config.tpu_config.num_shards > 8: + raise NotImplementedError( + 'TPU evaluation is only supported with one host.') + + if self._ctx._eval_batch_size is None: # pylint: disable=protected-access + raise ValueError('`eval_batch_size` cannot be `None`' + 'if evaluate() is called on TPU.') + return [ evaluation._StopAfterNEvalsHook( # pylint: disable=protected-access num_evals=steps), @@ -1657,7 +2046,9 @@ class TPUEstimator(estimator_lib.Estimator): if batch_size_for_input_fn is not None: kwargs['params'][_BATCH_SIZE_KEY] = batch_size_for_input_fn - if ctx.is_running_on_cpu(): + # For export_savedmodel, input_fn is never passed to Estimator. So, + # `is_export_mode` must be False. + if ctx.is_running_on_cpu(is_export_mode=False): with ops.device('/device:CPU:0'): return input_fn(**kwargs) @@ -1684,8 +2075,13 @@ class TPUEstimator(estimator_lib.Estimator): with self._ctx.with_mode(mode) as ctx: model_fn_wrapper = _ModelFnWrapper(model_fn, config, params, ctx) - # TODO(jhseu): Move to PREDICT to TPU. - if ctx.is_running_on_cpu(): + # For export_savedmodel, input_fn is never passed to Estimator. So, + # if features is callable, it means it is the input_fn passed by + # TPUEstimator._call_input_fn. 
Then we can know if the mode == PREDICT, + # it implies, it is the .predict API, not export_savedmodel API. + is_export_mode = not callable(features) + + if ctx.is_running_on_cpu(is_export_mode=is_export_mode): logging.info('Running %s on CPU', mode) return model_fn_wrapper.call_without_tpu(features, labels) @@ -1695,22 +2091,31 @@ class TPUEstimator(estimator_lib.Estimator): input_fn = features input_holders = _InputPipeline(input_fn, batch_axis, ctx) - enqueue_ops, dequeue_fn = ( + enqueue_ops, dequeue_fn, input_hooks, run_infeed_loop_on_coordinator = ( input_holders.generate_infeed_enqueue_ops_and_dequeue_fn()) if mode == model_fn_lib.ModeKeys.TRAIN: - loss, scaffold = ( + loss, host_call, scaffold = ( _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn)) + host_ops = host_call.create_tpu_hostcall() + if host_ops is None: + host_ops = [] hooks = [ - TPUInfeedOutfeedSessionHook(ctx, enqueue_ops), + TPUInfeedOutfeedSessionHook( + ctx, + enqueue_ops, + host_ops, + run_infeed_loop_on_coordinator=( + run_infeed_loop_on_coordinator)), ExamplesPerSecondHook(ctx.global_batch_size), + InstallSignalHandlerHook(), training.LoggingTensorHook( { 'loss': array_ops.identity(loss), 'step': training.get_global_step() }, every_n_secs=30) - ] + ] + input_hooks summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss) with ops.control_dependencies([loss]): update_ops = _sync_variables_ops() @@ -1725,40 +2130,114 @@ class TPUEstimator(estimator_lib.Estimator): train_op=control_flow_ops.group(*update_ops), scaffold=scaffold) - # Now eval. - total_loss, eval_metric_ops, scaffold = _eval_on_tpu_system( + if mode == model_fn_lib.ModeKeys.EVAL: + total_loss, host_calls, scaffold = _eval_on_tpu_system( + ctx, model_fn_wrapper, dequeue_fn) + iterations_per_loop_var = _create_or_get_iterations_per_loop() + mean_loss = math_ops.div(total_loss, + math_ops.cast( + iterations_per_loop_var, + dtype=total_loss.dtype)) + + # Creates a dummy metric update_op for all metrics. Estimator expects + # all metrics in eval_metric_ops have update_op and calls them one by + # one. The real metric update_ops are invoked in a separated thread. + # So, here give Estimator the dummy op for all metrics. 
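        # Illustration (hypothetical metric name): if the user's metric_fn
        # returns {'accuracy': (acc_tensor, acc_update_op)}, the EstimatorSpec
        # built below carries
        #   eval_metric_ops['accuracy'] = (acc_tensor, dummy_update_op)
        # while acc_update_op goes into eval_update_ops and is driven by the
        # outfeed dequeue thread of TPUInfeedOutfeedSessionHook.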
+ with ops.control_dependencies([mean_loss]): + # After TPU evaluation computation is done (the mean_loss tensor), + # reads all variables back from TPU and updates the eval step + # counter properly + internal_ops_to_run = _sync_variables_ops() + internal_ops_to_run.append( + _increase_eval_step_op(iterations_per_loop_var)) + with ops.control_dependencies(internal_ops_to_run): + dummy_update_op = control_flow_ops.no_op() + + host_call_ret = host_calls.create_tpu_hostcall() + eval_metric_ops = {} + eval_update_ops = [] + for k, v in host_call_ret['eval_metrics'].items(): + eval_metric_ops[k] = (v[0], dummy_update_op) + eval_update_ops.append(v[1]) + + if 'host_call' not in host_call_ret: + host_ops = [] + else: + host_ops = host_call_ret['host_call'] + hooks = [ + TPUInfeedOutfeedSessionHook( + ctx, + enqueue_ops, + eval_update_ops + host_ops, + run_infeed_loop_on_coordinator=( + run_infeed_loop_on_coordinator)), + ] + input_hooks + + return model_fn_lib.EstimatorSpec( + mode, + loss=mean_loss, + evaluation_hooks=hooks, + eval_metric_ops=eval_metric_ops, + scaffold=scaffold) + + # Predict + assert mode == model_fn_lib.ModeKeys.PREDICT + + dummy_predict_op, host_calls, scaffold = _predict_on_tpu_system( ctx, model_fn_wrapper, dequeue_fn) - iterations_per_loop_var = _create_or_get_iterations_per_loop() - mean_loss = math_ops.div(total_loss, - math_ops.cast( - iterations_per_loop_var, - dtype=total_loss.dtype)) - - # Creates a dummy metric update_op for all metrics. Estimator expects - # all metrics in eval_metric_ops have update_op and calls them one by - # one. The real metric update_ops are invoked in a separated thread. So, - # here give Estimator the dummy op for all metrics. - with ops.control_dependencies([mean_loss]): - # After TPU evaluation computation is done (the mean_loss tensor), - # reads all variables back from TPU and updates the eval step counter - # properly + with ops.control_dependencies([dummy_predict_op]): internal_ops_to_run = _sync_variables_ops() - internal_ops_to_run.append( - _increase_eval_step_op(iterations_per_loop_var)) with ops.control_dependencies(internal_ops_to_run): - dummy_update_op = control_flow_ops.no_op() + dummy_predict_op = control_flow_ops.no_op() + + # In train and evaluation, the main TPU program is passed to monitored + # training session to run. Infeed enqueue and outfeed dequeue are + # executed in side threads. This is not the configuration for + # prediction mode. + # + # For prediction, the Estimator executes the EstimatorSpec.predictions + # directly and yield the element (via generator) to call site. So, the + # outfeed based prediction must be passed to MonitoredSession directly. + # Other parts of the TPU execution are organized as follows. + # + # 1. All outfeed based Tensors must be grouped with predictions Tensors + # to form a single invocation. This avoid the issue we might trigger + # multiple outfeeds incorrectly. To achieve this, `host_call` is + # placed in control_dependencies of `stopping_signals`, and + # `stopping_signals` is passed into _StoppingPredictHook, which sets + # the `stopping_signals` as SessionRunArgs. MonitoredSession merges + # all SessionRunArgs with the fetch in session.run together. + # + # 2. The TPU program (dummy_predict_op) and enqueue_ops (infeed Enqueue) + # are grouped together. They will be launched once and only once in + # side threads and they quit naturally according to the SAME stopping + # condition. 
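      # From the user's side this wiring is not visible. A minimal usage sketch
      # (hypothetical model_fn/input_fn; for prediction the input_fn must
      # return a Dataset) would look like:
      #   estimator = TPUEstimator(model_fn=my_model_fn, config=run_config,
      #                            train_batch_size=1024, predict_batch_size=8,
      #                            use_tpu=True)
      #   for prediction in estimator.predict(input_fn=my_predict_input_fn):
      #     ...  # one predictions dict per example, read from the outfeed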
+ enqueue_ops.append(dummy_predict_op) + + host_call_ret = host_calls.create_tpu_hostcall() + if 'host_call' not in host_call_ret: + host_ops = [] + else: + host_ops = host_call_ret['host_call'] + + predictions = host_call_ret['predictions'] + stopping_signals = host_call_ret['signals'] + + with ops.control_dependencies(host_ops): + host_ops = [] # Empty, we do do not need it anymore. + scalar_stopping_signal = _StopSignals.as_scalar_stopping_signal( + stopping_signals) - eval_metric_ops, eval_update_ops = ( - eval_metric_ops.to_metric_metric_ops_for_tpu(dummy_update_op)) hooks = [ - TPUInfeedOutfeedSessionHook(ctx, enqueue_ops, eval_update_ops), - ] + _StoppingPredictHook(scalar_stopping_signal), + TPUInfeedOutfeedSessionHookForPrediction(ctx, enqueue_ops, + host_ops), + ] + input_hooks return model_fn_lib.EstimatorSpec( mode, - loss=mean_loss, - evaluation_hooks=hooks, - eval_metric_ops=eval_metric_ops, + prediction_hooks=hooks, + predictions=predictions, scaffold=scaffold) return _model_fn @@ -1769,7 +2248,7 @@ def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn): num_cores = ctx.num_cores iterations_per_loop_var = _create_or_get_iterations_per_loop() - single_tpu_eval_step, eval_metric_ops, captured_scaffold_fn = ( + single_tpu_eval_step, host_calls, captured_scaffold_fn = ( model_fn_wrapper.convert_to_single_tpu_eval_step(dequeue_fn)) def multi_tpu_eval_steps_on_single_shard(): @@ -1785,7 +2264,7 @@ def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn): outputs_from_all_shards=False) scaffold = _get_scaffold(captured_scaffold_fn) - return loss, eval_metric_ops, scaffold + return loss, host_calls, scaffold def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn): @@ -1793,7 +2272,7 @@ def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn): num_cores = ctx.num_cores iterations_per_loop_var = _create_or_get_iterations_per_loop() - single_tpu_train_step, captured_scaffold_fn = ( + single_tpu_train_step, host_call, captured_scaffold_fn = ( model_fn_wrapper.convert_to_single_tpu_train_step(dequeue_fn)) def multi_tpu_train_steps_on_single_shard(): @@ -1809,7 +2288,35 @@ def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn): outputs_from_all_shards=False) scaffold = _get_scaffold(captured_scaffold_fn) - return loss, scaffold + return loss, host_call, scaffold + + +def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn): + """Executes `model_fn_wrapper` multiple times on all TPU shards.""" + num_cores = ctx.num_cores + + single_tpu_predict_step, host_calls, captured_scaffold_fn = ( + model_fn_wrapper.convert_to_single_tpu_predict_step(dequeue_fn)) + + def multi_tpu_predict_steps_on_single_shard(): + + def cond(scalar_stopping_signal): + return math_ops.logical_not( + _StopSignals.should_stop(scalar_stopping_signal)) + + inputs = [_StopSignals.NON_STOPPING_SIGNAL] + outputs = training_loop.while_loop( + cond, single_tpu_predict_step, inputs=inputs, name=b'loop') + return outputs + + (dummy_predict_op,) = tpu.shard( + multi_tpu_predict_steps_on_single_shard, + inputs=[], + num_shards=num_cores, + outputs_from_all_shards=False) + + scaffold = _get_scaffold(captured_scaffold_fn) + return dummy_predict_op, host_calls, scaffold def _wrap_computation_in_while_loop(device, op_fn): @@ -1830,6 +2337,29 @@ def _wrap_computation_in_while_loop(device, op_fn): parallel_iterations=1) +def _wrap_computation_in_while_loop_with_stopping_signals(device, op_fn): + """Wraps the ops generated by `op_fn` in tf.while_loop.""" + + def cond(scalar_stopping_signal): + return 
math_ops.logical_not( + _StopSignals.should_stop(scalar_stopping_signal)) + + def computation(unused_scalar_stopping_signal): + return_value = op_fn() + execute_ops = return_value['ops'] + signals = return_value['signals'] + with ops.control_dependencies(execute_ops): + return _StopSignals.as_scalar_stopping_signal(signals) + + # By setting parallel_iterations=1, the parallel execution in while_loop is + # basically turned off. + with ops.device(device): + return control_flow_ops.while_loop( + cond, + computation, [_StopSignals.NON_STOPPING_SIGNAL], + parallel_iterations=1) + + def _validate_tpu_training_graph(): """Validate graph before running distributed training. @@ -1920,3 +2450,194 @@ class _CapturingContext(control_flow_ops.ControlFlowContext): def __exit__(self, _, __, ___): # pylint: disable=invalid-name self._g._set_control_flow_context(self._old) # pylint: disable=protected-access + + +class _Inputs(object): + """A data structure representing the input_fn returned values. + + This also supports the returned value from input_fn as `Dataset`. + """ + + def __init__(self, features=None, labels=None, dataset=None, signals=None): + if dataset is not None and (features is not None or labels is not None or + signals is not None): + raise RuntimeError('Internal Error: Either (features and labels) or ' + 'dataset should be provided, not both. Please file ' + 'bug') + + self._features = features + self._labels = labels + self._signals = signals + + self._dataset = dataset + self._iterator = None + + @staticmethod + def from_input_fn(return_values): + """Returns an `_Inputs` instance according to `input_fn` return value.""" + if isinstance(return_values, dataset_ops.Dataset): + dataset = return_values + return _Inputs(dataset=dataset) + + features, labels = _Inputs._parse_inputs(return_values) + return _Inputs(features, labels) + + @staticmethod + def _parse_inputs(return_values): + if isinstance(return_values, tuple): + features, labels = return_values + else: + features, labels = return_values, None + return features, labels + + @property + def is_dataset(self): + """Returns True if the return value from input_fn is Dataset.""" + return self._dataset is not None + + def dataset_initializer_hook(self): + """Returns a `SessionRunHook` to initialize this dataset. + + This must be called before `features_and_labels`. + """ + iterator = self._dataset.make_initializable_iterator() + # pylint: disable=protected-access + hook = estimator_lib._DatasetInitializerHook(iterator) + self._iterator = iterator + return hook + + def features_and_labels(self): + """Gets `features` and `labels`.""" + if self.is_dataset: + return _Inputs._parse_inputs(self._iterator.get_next()) + + return (self._features, self._labels) + + def signals(self): + return self._signals + + @property + def dataset(self): + return self._dataset + + +# TODO(xiejw): Extend this to support final partial batch. 
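# An illustrative end-to-end sketch of how these input helpers compose
# (hypothetical user input_fn and tensors; not part of this module):
#
#   def predict_input_fn(params):
#     batch_size = params['batch_size']
#     dataset = tf.data.Dataset.from_tensor_slices({'x': features_array})
#     return dataset.batch(batch_size)
#
#   inputs = _Inputs.from_input_fn(predict_input_fn(params))
#   assert inputs.is_dataset
#   inputs = _InputsWithStoppingSignals(dataset=inputs.dataset,
#                                       batch_size=params['batch_size'])
#   hook = inputs.dataset_initializer_hook()         # initializes the iterator
#   features, labels = inputs.features_and_labels()  # from iterator.get_next()
#   signals = inputs.signals()  # {'stopping': [batch_size, 1] float32 tensor}
#
# The extra batch appended by _InputsWithStoppingSignals carries
# signals['stopping'] == 1.0; _StopSignals.as_scalar_stopping_signal reduces it
# to the scalar used as the tf.while_loop stopping condition, and
# _StoppingPredictHook uses the same signal to end the predict() generator.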
+class _InputsWithStoppingSignals(_Inputs): + """Inputs with `_StopSignals` inserted into the dataset.""" + + def __init__(self, dataset, batch_size): + + assert dataset is not None + + user_provided_dataset = dataset.map( + _InputsWithStoppingSignals.insert_stopping_signal( + stop=False, batch_size=batch_size)) + final_batch_dataset = dataset.take(1).map( + _InputsWithStoppingSignals.insert_stopping_signal( + stop=True, batch_size=batch_size)) + dataset = user_provided_dataset.concatenate(final_batch_dataset).prefetch(2) + + super(_InputsWithStoppingSignals, self).__init__(dataset=dataset) + self._current_inputs = None + + def features_and_labels(self): + if self._current_inputs is not None: + raise RuntimeError( + 'Internal Error: The previous inputs have not been properly ' + 'consumed. First call features_and_labels, then call signals.') + + inputs_with_signals = self._iterator.get_next() + features = inputs_with_signals['features'] + labels = inputs_with_signals.get('labels') + + self._current_inputs = inputs_with_signals + return features, labels + + def signals(self): + """Returns the `Signals` from `_Inputs`.""" + if self._current_inputs is None: + raise RuntimeError( + 'Internal Error: The current inputs have not been properly ' + 'generated. First call features_and_labels, then call signals.') + signals = self._current_inputs['signals'] + self._current_inputs = None + return signals + + @staticmethod + def insert_stopping_signal(stop, batch_size): + """Inserts stopping_signal into dataset via _map_fn. + + Here we change the data structure in the dataset, such that the return value + is a dictionary now and `features`, `labels`, and `signals` are three + distinguished keys in that dict. This provides a better structure, which + eases the process to decompose the inputs (see `features_and_labels`). + + Args: + stop: bool, state of current stopping signals. + batch_size: int, batch size. + + Returns: + A map_fn passed to dataset.map API. 
+ """ + + def _map_fn(*args): + features, labels = _Inputs._parse_inputs(args) + new_input_dict = {} + new_input_dict['features'] = features + if labels is not None: + new_input_dict['labels'] = labels + new_input_dict['signals'] = _StopSignals( + stop=stop, batch_size=batch_size).as_dict() + return new_input_dict + + return _map_fn + + +class _StopSignals(object): + """Signals class holding all logic to handle TPU stopping condition.""" + + NON_STOPPING_SIGNAL = 0.0 + STOPPING_SIGNAL = 1.0 + + def __init__(self, stop, batch_size): + self._stop = stop + self._batch_size = batch_size + + def as_dict(self): + shape = [self._batch_size, 1] + dtype = dtypes.float32 + + if self._stop: + stopping = array_ops.ones(shape=shape, dtype=dtype) + else: + stopping = array_ops.zeros(shape=shape, dtype=dtype) + + return {'stopping': stopping} + + @staticmethod + def as_scalar_stopping_signal(signals): + return array_ops.identity(signals['stopping'][0][0]) + + @staticmethod + def should_stop(scalar_stopping_signal): + return scalar_stopping_signal >= _StopSignals.STOPPING_SIGNAL + + +class _SignalsHelper(object): + """A general helper class to handle common signals manipulation.""" + + def __init__(self, signals): + self._signal_keys = [] + for key in sorted(signals.iterkeys()): + self._signal_keys.append(key) + + @property + def num_signals(self): + return len(self._signal_keys) + + def unflatten(self, tensor_list): + return dict(zip(self._signal_keys, tensor_list)) + + @staticmethod + def as_tensor_list(signals): + return [signals[key] for key in sorted(signals.iterkeys())] diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD index cccaa2b833e..6db373d2d5e 100644 --- a/tensorflow/contrib/training/BUILD +++ b/tensorflow/contrib/training/BUILD @@ -26,6 +26,7 @@ py_library( "python/training/resample.py", "python/training/sampling_ops.py", "python/training/sequence_queueing_state_saver.py", + "python/training/tensor_queue_dataset.py", "python/training/training.py", "python/training/tuner.py", ], @@ -285,6 +286,28 @@ py_test( ], ) +py_test( + name = "tensor_queue_dataset_test", + size = "large", + srcs = ["python/training/tensor_queue_dataset_test.py"], + srcs_version = "PY2AND3", + tags = ["notsan"], + deps = [ + ":training_py", + "//tensorflow/contrib/data/python/kernel_tests:dataset_serialization_test", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:gradients", + "//tensorflow/python:math_ops", + "//tensorflow/python:platform", + "//tensorflow/python:random_seed", + "//tensorflow/python:training", + "//tensorflow/python:variables", + "//tensorflow/python/data", + "//third_party/py/numpy", + ], +) + filegroup( name = "all_files", srcs = glob( diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset.py new file mode 100644 index 00000000000..409aba817c1 --- /dev/null +++ b/tensorflow/contrib/training/python/training/tensor_queue_dataset.py @@ -0,0 +1,200 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Python wrappers for Datasets and Iterators.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.data.util import nest +from tensorflow.python.data.util import sparse +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import tensor_util +from tensorflow.python.ops import gen_dataset_ops +from tensorflow.python.util import nest as tf_nest + + +class _PrependFromQueueAndPaddedBatchDataset(dataset_ops.Dataset): + """A `Dataset` that prepends a queue to another `Dataset`. + + A vector of handles to the queue is returned as the first component of + the associated iterator. This vector can be passed to + `enqueue_in_queue_dataset` to add new elements to the queue. + """ + + def __init__(self, input_dataset, batch_size, padded_shapes, padding_values): + """Initialize `PrependFromQueueAndPaddedBatchDataset`.""" + super(_PrependFromQueueAndPaddedBatchDataset, self).__init__() + if sparse.any_sparse(input_dataset.output_classes): + raise TypeError( + "Batching of padded sparse tensors is not currently supported") + self._input_dataset = input_dataset + self._batch_size = ops.convert_to_tensor( + batch_size, dtype=dtypes.int64, name="batch_size") + # pylint: disable=protected-access + if padded_shapes is None: + self._padded_shapes = nest.map_structure( + dataset_ops._partial_shape_to_tensor, input_dataset.output_shapes) + else: + self._padded_shapes = nest.map_structure_up_to( + input_dataset.output_shapes, dataset_ops._partial_shape_to_tensor, + padded_shapes) + padding_values = ( + padding_values if padding_values is not None else + dataset_ops._default_padding(input_dataset)) + self._padding_values = nest.map_structure_up_to( + input_dataset.output_shapes, dataset_ops._padding_value_to_tensor, + padding_values, input_dataset.output_types) + # pylint: enable=protected-access + + def _as_variant_tensor(self): + # pylint: disable=protected-access + return gen_dataset_ops.prepend_from_queue_and_padded_batch_dataset( + self._input_dataset._as_variant_tensor(), + batch_size=self._batch_size, + padded_shapes=[ + ops.convert_to_tensor(s, dtype=dtypes.int64) + for s in nest.flatten(self._padded_shapes) + ], + padding_values=nest.flatten(self._padding_values), + output_shapes=nest.flatten( + sparse.as_dense_shapes(self.output_shapes, self.output_classes))) + # pylint: enable=protected-access + + @property + def output_classes(self): + return (ops.Tensor, self._input_dataset.output_classes) + + def _as_batch_shape(self, shape_like): + return tensor_shape.vector(None).concatenate( + tensor_util.constant_value_as_shape(shape_like)) + + @property + def output_shapes(self): + # First output is a variant representing the Queue + return (tensor_shape.vector(None), + nest.map_structure(self._as_batch_shape, self._padded_shapes)) + + @property + def 
output_types(self): + # First output is a variant representing the Queue + return (dtypes.variant, self._input_dataset.output_types) + + +def prepend_from_queue_and_padded_batch_dataset(batch_size, + padding_values=None, + padded_shapes=None): + """A transformation that prepends a queue to a `Dataset` and batches results. + + A vector of handles to the queue is returned as the first component of the + associated iterator. This vector can be passed to `enqueue_in_queue_dataset` + to add new elements to the queue. + + Below is an example of how this dataset might be used to split incoming + variable-length sequences into "head" and "rest" parts, where "rest" parts + are re-enqueued back into the dataset. A more realistic example would + perform some calculation on the "head" and modify some components of "rest" + with the result (before re-enqueueing). + + ```python + dataset = tf.data.Dataset.from_tensor_slices([2*x for x in range(10)]) + # Make a dataset of variable-length vectors and their lengths. + dataset = dataset.map(lambda count: (count, tf.ones((count,)))) + # Emit a queue we can prepend to, and counts/values as padded batch. + dataset = dataset.apply( + tf.contrib.training.prepend_from_queue_and_padded_batch_dataset( + batch_size=10)) + dataset = dataset.prefetch(1) + + iterator = dataset.make_one_shot_iterator() + queue, (count, padded_value) = iterator.get_next() + + # Split the padded_value into two pieces: head and rest + rest_indices = tf.squeeze(tf.where(count > 3), axis=1) + bound = tf.minimum(3, tf.reduce_max(count)) + value_head = padded_value[:, :bound] + count_rest = tf.gather(count - 3, rest_indices) + value_rest = tf.gather(padded_value[:, bound:], rest_indices) + queue_rest = tf.gather(queue, rest_indices) + enqueue_rest_op = tf.contrib.training.enqueue_in_queue_dataset( + queue_rest, (count_rest, value_rest)) + with tf.control_dependencies([enqueue_rest_op]): + calculation = fn(value_head) + + while True: # Will raise OutOfRange when finished with all pieces. + session.run(calculation) + ``` + + Args: + batch_size: `int64` scalar tensor. The batch size to use when performing + padded batching. + padding_values: (optional) Nested tuple of scalar tensors. If provided, + the structure and dtypes of padding_values should match that of + incoming dataset's `output_types`. + padded_shapes: (optional) Nested tuple of `int64` vector tensors. + If provided, the structure must match that of the incoming dataset's + `output_types`. If not provided, the incoming dataset's `output_shapes` + is used. Any unknown (`None` or `-1`) dimensions in the shapes are + treated as being unique per-batch: for each batch time, an unknown + dimension is replaced with the maximum given value of this dimension + across all tensors for the given component in the batch. + + Returns: + A `Dataset` transformation function, which can be passed to + @{tf.data.Dataset.apply}. + """ + + def _apply_fn(dataset): + return _PrependFromQueueAndPaddedBatchDataset( + dataset, + batch_size=batch_size, + padding_values=padding_values, + padded_shapes=padded_shapes) + + return _apply_fn + + +def enqueue_in_queue_dataset(queue, components): + """Enqueue components into queue from `PrependFromQueueAndPaddedBatchDataset`. + + The components' dtypes and shapes must be compatible with the `output_shapes` + attribute of the `dataset` created by + `prepend_from_queue_and_padded_batch_dataset`. This operation supports both + non-batched and batched modes. 
+ + For more details, see the example in the docstring for + `prepend_from_queue_and_padded_batch_dataset`. + + Args: + queue: `variant` scalar or vector tensor. + The tensor emitted by the first component of the iterator associated with + `prepend_from_queue_and_padded_batch_dataset`. If this is a scalar, + then the `components` input tensors should not have a prepended batch + dimension. + components: Nested tuple of tensors, each with a leading batch dimension + if `queue` is a vector. The structure, dtypes, and shapes + (excluding batch dimension) must match the nested tuples + `dataset.output_types[1]` and `dataset.output_shapes[1]` (the non-queue + output types and shapes) of the `dataset` emitted by + the original `prepend_from_queue_and_padded_batch_dataset` call. + + Returns: + An `Operation` that enqueues `components` into the dataset(s) associated + with entries of `queue`. + """ + return gen_dataset_ops.enqueue_in_queue_dataset( + queue=queue, components=tf_nest.flatten(components)) diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py new file mode 100644 index 00000000000..0338f409a20 --- /dev/null +++ b/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py @@ -0,0 +1,355 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for TensorQueueDataset.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base +from tensorflow.contrib.training.python.training import tensor_queue_dataset as tqd +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import string_ops +from tensorflow.python.platform import test + + +class PrependFromQueueAndPaddedBatchDatasetTest(test.TestCase): + + def testNoEnqueue(self): + dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2]) + dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1)) + self.assertEqual((dtypes.variant, dtypes.int32), dataset.output_types) + self.assertAllEqual(([None],) * 2, + [x.as_list() for x in dataset.output_shapes]) + iterator = dataset.make_one_shot_iterator() + _, value = iterator.get_next() + self.assertEqual([0], self.evaluate(value)) + self.assertEqual([1], self.evaluate(value)) + self.assertEqual([2], self.evaluate(value)) + with self.assertRaisesOpError("End of sequence"): + self.evaluate(value) + + def testBatchedNoEnqueue(self): + dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2]) + dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=2)) + iterator = dataset.make_one_shot_iterator() + _, value = iterator.get_next() + self.assertAllEqual([0, 1], self.evaluate(value)) + self.assertAllEqual([2], self.evaluate(value)) + with self.assertRaisesOpError("End of sequence"): + self.evaluate(value) + + def testBatchedWithBiggerPaddingNoEnqueue(self): + dataset = dataset_ops.Dataset.from_tensor_slices([[0], [1], [2]]) + dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset( + batch_size=2, padded_shapes=[3])) + iterator = dataset.make_one_shot_iterator() + _, value = iterator.get_next() + self.assertAllEqual([[0, 0, 0], [1, 0, 0]], self.evaluate(value)) + self.assertAllEqual([[2, 0, 0]], self.evaluate(value)) + with self.assertRaisesOpError("End of sequence"): + self.evaluate(value) + + def testBatchedWithBiggerPaddingOneEnqueue(self): + dataset = dataset_ops.Dataset.from_tensor_slices([[0], [1], [2]]) + dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset( + batch_size=1, padded_shapes=[3])) + iterator = dataset.make_one_shot_iterator() + queue_handle, value = iterator.get_next() + enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value) + with self.test_session() as sess: + self.assertAllEqual([[0, 0, 0]], sess.run(value)) + value_1, _ = sess.run([value, enqueue_negative]) + self.assertAllEqual([[1, 0, 0]], value_1) + value_2, _ = sess.run([value, enqueue_negative]) + self.assertAllEqual([[-1, 0, 0]], value_2) + value_3 = sess.run(value) + self.assertAllEqual([[1, 0, 0]], value_3) + value_4, _ = sess.run([value, enqueue_negative]) + self.assertAllEqual([[2, 0, 0]], value_4) + value_5 = sess.run(value) + self.assertAllEqual([[-2, 0, 0]], value_5) + with self.assertRaisesOpError("End of sequence"): + sess.run(value) + + def testOneEnqueue(self): + dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2]) + dataset = 
dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1)) + iterator = dataset.make_one_shot_iterator() + queue_handle, value = iterator.get_next() + enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value) + with self.test_session() as sess: + self.assertEqual([0], sess.run(value)) + value_1, _ = sess.run([value, enqueue_negative]) + self.assertEqual([1], value_1) + value_2, _ = sess.run([value, enqueue_negative]) + self.assertEqual([-1], value_2) + value_3 = sess.run(value) + self.assertEqual([1], value_3) + value_4, _ = sess.run([value, enqueue_negative]) + self.assertEqual([2], value_4) + value_5 = sess.run(value) + self.assertEqual([-2], value_5) + with self.assertRaisesOpError("End of sequence"): + sess.run(value) + + def testBatchedOneEnqueue(self): + dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2]) + dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=2)) + iterator = dataset.make_one_shot_iterator() + queue_handle, value = iterator.get_next() + enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value) + enqueue_zeroth = tqd.enqueue_in_queue_dataset([queue_handle[0]], + array_ops.expand_dims( + value[0], axis=0)) + with self.test_session() as sess: + value_0, _ = sess.run([value, enqueue_negative]) + self.assertAllEqual([0, 1], value_0) + value_1, _ = sess.run([value, enqueue_zeroth]) + self.assertAllEqual([0, -1], value_1) + value_2, _ = sess.run([value, enqueue_negative]) + self.assertAllEqual([0, 2], value_2) + self.assertAllEqual([0, -2], sess.run(value)) + with self.assertRaisesOpError("End of sequence"): + sess.run(value) + + def testManyEnqueue(self): + dataset = dataset_ops.Dataset.from_tensor_slices([0, 1]) + dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1)) + iterator = dataset.make_one_shot_iterator() + queue_handle, value = iterator.get_next() + enqueue_many_more = [ + tqd.enqueue_in_queue_dataset(queue_handle, value + 100 + i) + for i in range(1000) + ] + with self.test_session() as sess: + value_0, _ = sess.run((value, enqueue_many_more)) + self.assertEqual([0], value_0) + rest = [] + for _ in range(1000): + rest.append(sess.run(value)) + self.assertEquals([[100 + i] for i in range(1000)], sorted(rest)) + # Going back to the original input. + value_1, _ = sess.run((value, enqueue_many_more)) + self.assertEqual(1, value_1) + rest = [] + for _ in range(1000): + rest.append(sess.run(value)) + self.assertEquals([[100 + i + 1] for i in range(1000)], sorted(rest)) + with self.assertRaisesOpError("End of sequence"): + sess.run(value) + + def testEnqueueWithPrefetch(self): + dataset = dataset_ops.Dataset.from_tensor_slices([0]) + dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1)) + # Prefetching will request additional values before they are + # available to the queue. 
+ dataset = dataset.prefetch(buffer_size=3) + iterator = dataset.make_one_shot_iterator() + queue_handle, value = iterator.get_next() + enqueue = tqd.enqueue_in_queue_dataset(queue_handle, value + 1) + with self.test_session() as sess: + i = 0 + while i < 4: + received, _ = sess.run((value, enqueue)) + if received.size > 0: + self.assertAllEqual([i], received) + i += 1 + received_last = False + while True: + try: + received = sess.run(value) + if received.size > 0: + self.assertAllEqual([4], received) + received_last = True + except errors.OutOfRangeError: + break + self.assertTrue(received_last) + + def testDatasetWithPaddedShapeSmallerThanInputFails(self): + dataset = dataset_ops.Dataset.from_tensor_slices([[0, 0, 0]]).repeat(None) + dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset( + batch_size=1, padded_shapes=[2])) + iterator = dataset.make_one_shot_iterator() + _, value = iterator.get_next() + with self.test_session() as sess: + with self.assertRaisesOpError( + r"Incompatible input shapes at component 0 between " + r"input dataset this dataset: \[3\] vs. \[2\]"): + sess.run(value) + + def testEnqueueWithIncompatibleInputsFailsWithInformativeError(self): + dataset = dataset_ops.Dataset.from_tensor_slices([0]).repeat(None) + dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1)) + iterator = dataset.make_one_shot_iterator() + queue_handle, value = iterator.get_next() + + enqueue_bad_structure = tqd.enqueue_in_queue_dataset( + queue_handle, (value, value)) + enqueue_bad_dtype = tqd.enqueue_in_queue_dataset(queue_handle, + np.array( + [1.0], + dtype=np.float32)) + enqueue_bad_shape_no_batch_dim = tqd.enqueue_in_queue_dataset( + queue_handle, ([1],)) + enqueue_bad_shape = tqd.enqueue_in_queue_dataset(queue_handle, + np.array( + [[1]], dtype=np.int32)) + + with self.test_session() as sess: + with self.assertRaisesOpError( + "mismatched number of tensors. Queue expects 1 tensors but " + "tried to insert 2"): + sess.run(enqueue_bad_structure) + with self.assertRaisesOpError(r"Expected component 0 to have batched " + r"shape \[1,...\], but saw shape: \[\]"): + sess.run(enqueue_bad_shape_no_batch_dim) + with self.assertRaisesOpError( + r"mismatched shapes at component 0. Attempted to insert tensor " + r"with shape \[1\] but queue expected shape: \[\]"): + sess.run(enqueue_bad_shape) + with self.assertRaisesOpError( + r"mismatched dtypes at component 0. Attempted to insert tensor " + r"of type float but queue expected type: int32"): + sess.run(enqueue_bad_dtype) + + def testEnqueueWithPaddedBatchFailsWithInformativeError(self): + dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2]) + dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1)) + with self.assertRaisesRegexp( + TypeError, r"Unable to create padding for field of type 'variant'"): + dataset.padded_batch(batch_size=10, padded_shapes=[1]) + + def testOneEnqueueWithPadding(self): + dataset = dataset_ops.Dataset.from_tensor_slices([0, 2, 4, 6]) + # Make a dataset of variable-length vectors and their lengths. + dataset = dataset.map( + lambda c: (c, c * array_ops.ones((c,), dtype=c.dtype))) + # Emit a queue we can prepend to, and counts/values as padded + # batch. 
+ dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=3)) + + iterator = dataset.make_one_shot_iterator() + queue, (count, padded_value) = iterator.get_next() + + # Split the padded_value into two pieces: head and rest + rest_indices = array_ops.squeeze(array_ops.where(count > 2), axis=1) + bound = math_ops.minimum(2, math_ops.reduce_max(count)) + value_head = padded_value[:, :bound] + count_rest = array_ops.gather(count - 2, rest_indices) + value_rest = array_ops.gather(padded_value, rest_indices)[:, bound:] + queue_rest = array_ops.gather(queue, rest_indices) + enqueue_rest_op = tqd.enqueue_in_queue_dataset(queue_rest, + (count_rest, value_rest)) + with ops.control_dependencies([enqueue_rest_op]): + calc = array_ops.identity(value_head) + + with self.test_session() as sess: + self.assertAllEqual([[0, 0], [2, 2], [4, 4]], sess.run(calc)) + self.assertAllEqual([[4, 4], [6, 6]], sess.run(calc)) + self.assertAllEqual([[6, 6]], sess.run(calc)) + self.assertAllEqual([[6, 6]], sess.run(calc)) + # Get some final batches due to prefetching. + for _ in range(3): + try: + self.assertAllEqual( + np.empty(shape=(0, 0), dtype=np.int32), sess.run(calc)) + except errors.OutOfRangeError as e: + self.assertTrue(str(e).startswith("End of sequence")) + + def testNonstandardPadding(self): + dataset = dataset_ops.Dataset.from_tensor_slices([0, 2, 4, 6]) + # Make a dataset of variable-length vectors and their lengths. + dataset = dataset.map( + lambda c: (c, c * array_ops.ones((c,), dtype=c.dtype))) + # Emit a queue we can prepend to, and counts/values as padded + # batch. + dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset( + batch_size=3, padding_values=( + 0, + -1, + ))) + + iterator = dataset.make_one_shot_iterator() + _, (unused_count, padded_value) = iterator.get_next() + + with self.test_session() as sess: + self.assertAllEqual([[-1, -1, -1, -1], [2, 2, -1, -1], [4, 4, 4, 4]], + sess.run(padded_value)) + self.assertAllEqual([[6] * 6], sess.run(padded_value)) + with self.assertRaisesOpError("End of sequence"): + sess.run(padded_value) + + +# TODO(ebrevdo): Figure out how to use run_core_tests to test state +# saving of an iterator that's had some tensors enqueued into its queue. 
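For orientation, the enqueue-and-recirculate pattern that the tests above exercise reduces to the following minimal sketch. It is an illustration only (not part of the patch) and assumes TensorFlow 1.5 with the same contrib module imported at the top of this test file:

```python
# Minimal usage sketch of the queue-prepended dataset exercised above.
import tensorflow as tf
from tensorflow.contrib.training.python.training import tensor_queue_dataset as tqd

dataset = tf.data.Dataset.from_tensor_slices([0, 1, 2])
dataset = dataset.apply(
    tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
queue_handle, value = dataset.make_one_shot_iterator().get_next()
# Feed a transformed value back into the queue; it is dequeued ahead of the
# remaining input elements on later get_next() calls.
enqueue = tqd.enqueue_in_queue_dataset(queue_handle, -value)

with tf.Session() as sess:
  print(sess.run(value))             # [0]
  v, _ = sess.run([value, enqueue])  # returns [1] and enqueues [-1]
  print(sess.run(value))             # [-1]: the enqueued element comes first
```

As in `testOneEnqueue`, elements pushed with `enqueue_in_queue_dataset` take priority over the underlying dataset, which is what the serialization tests below must also preserve across checkpoints.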
+class PrependFromQueueAndPaddedBatchDatasetSerializationTest( + dataset_serialization_test_base.DatasetSerializationTestBase): + + def testPrependFromQueueAndPaddedBatch(self): + + def build_dataset(seq_lens): + return dataset_ops.Dataset.from_tensor_slices(seq_lens).map( + lambda x: array_ops.fill([x], x)).apply( + tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=4)) + + seq_lens1 = np.random.randint(1, 20, size=(32,)).astype(np.int32) + seq_lens2 = np.random.randint(21, 40, size=(32,)).astype(np.int32) + self.run_core_tests(lambda: build_dataset(seq_lens1), + lambda: build_dataset(seq_lens2), 8) + + def testPrependFromQueueAndPaddedBatchNonDefaultPadding(self): + + def build_dataset(seq_lens): + + def fill_tuple(x): + filled = array_ops.fill([x], x) + return (filled, string_ops.as_string(filled)) + + padded_shape = [-1] + return dataset_ops.Dataset.from_tensor_slices(seq_lens).map( + fill_tuple).apply( + tqd.prepend_from_queue_and_padded_batch_dataset( + batch_size=4, + padded_shapes=(padded_shape, padded_shape), + padding_values=(-1, ""))) + + seq_lens1 = np.random.randint(1, 20, size=(32,)).astype(np.int32) + seq_lens2 = np.random.randint(21, 40, size=(32,)).astype(np.int32) + self.run_core_tests(lambda: build_dataset(seq_lens1), + lambda: build_dataset(seq_lens2), 8) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc b/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc index 2992a61ea81..9675428e56e 100644 --- a/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc +++ b/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc @@ -142,9 +142,9 @@ Status ConvertConstantsToImmutable(const string& in_graph_filename, const auto load_graph_status = ReadBinaryProto(default_env, in_graph_filename, &graph_def); if (!load_graph_status.ok()) { - return tensorflow::errors::NotFound("Failed to load graph at '", - in_graph_filename, "' : ", - load_graph_status.error_message()); + return tensorflow::errors::NotFound( + "Failed to load graph at '", in_graph_filename, + "' : ", load_graph_status.error_message()); } NodeConverter node_converter; diff --git a/tensorflow/contrib/util/inspect_checkpoint.cc b/tensorflow/contrib/util/inspect_checkpoint.cc index 39088aeaad6..9b578ceb075 100644 --- a/tensorflow/contrib/util/inspect_checkpoint.cc +++ b/tensorflow/contrib/util/inspect_checkpoint.cc @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/util/tensor_slice_reader.h" namespace tensorflow { diff --git a/tensorflow/contrib/verbs/verbs_server_lib.cc b/tensorflow/contrib/verbs/verbs_server_lib.cc index 47ed83f521c..1a0b5028feb 100644 --- a/tensorflow/contrib/verbs/verbs_server_lib.cc +++ b/tensorflow/contrib/verbs/verbs_server_lib.cc @@ -49,8 +49,8 @@ VerbsServer::~VerbsServer() { Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def, GrpcChannelCache** channel_cache) { string name_prefix = - strings::StrCat("/job:", server_def.job_name(), "/replica:0", "/task:", - server_def.task_index()); + strings::StrCat("/job:", server_def.job_name(), "/replica:0", + "/task:", server_def.task_index()); GrpcChannelSpec channel_spec; TF_RETURN_IF_ERROR(ParseChannelSpec(server_def, &channel_spec)); diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 29c515121e7..4ba84b420f2 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -381,7 +381,6 @@ cc_library( srcs = ["platform/stacktrace_handler.cc"], hdrs = ["platform/stacktrace_handler.h"], deps = [ - ":abi", ":lib", ":lib_platform", ], @@ -434,6 +433,7 @@ tf_cuda_library( "framework/common_shape_fns.h", "framework/control_flow.h", # TODO(josh11b): Make internal? "framework/dataset.h", + "framework/dataset_stateful_op_whitelist.h", "framework/device_base.h", "framework/function.h", "framework/graph_def_util.h", @@ -611,6 +611,7 @@ tf_gen_op_libs( "list_ops", "lookup_ops", "logging_ops", + "manip_ops", "math_ops", "nn_ops", "no_op", @@ -693,6 +694,7 @@ cc_library( ":list_ops_op_lib", ":logging_ops_op_lib", ":lookup_ops_op_lib", + ":manip_ops_op_lib", ":math_ops_op_lib", ":nn_ops_op_lib", ":no_op_op_lib", @@ -830,6 +832,7 @@ cc_library( "//tensorflow/core/kernels:list_kernels", "//tensorflow/core/kernels:lookup", "//tensorflow/core/kernels:logging", + "//tensorflow/core/kernels:manip", "//tensorflow/core/kernels:math", "//tensorflow/core/kernels:multinomial_op", "//tensorflow/core/kernels:nn", @@ -1152,6 +1155,7 @@ cc_library( deps = [ ":protos_all_cc_impl", "//third_party/eigen3", + "@nsync//:nsync_cpp", "@protobuf_archive//:protobuf", ], alwayslink = 1, @@ -1352,6 +1356,13 @@ tf_pyclif_proto_library( visibility = ["//visibility:public"], ) +tf_pyclif_proto_library( + name = "protobuf/device_properties_pyclif", + proto_lib = ":protos_all_cc", + proto_srcfile = "protobuf/device_properties.proto", + visibility = ["//visibility:public"], +) + # ----------------------------------------------------------------------------- # Internal targets @@ -1899,6 +1910,7 @@ cc_library( tf_cuda_library( name = "cuda_device_functions", hdrs = ["util/cuda_device_functions.h"], + cuda_deps = ["//third_party_gpus/cuda:cuda_headers"], visibility = ["//visibility:public"], deps = [":framework_lite"], ) @@ -2413,7 +2425,6 @@ cc_library( deps = [ ":lib", ":lib_internal", - ":stacktrace_handler", ":test", # buildcleaner: keep "//tensorflow/core/platform/default/build_config:test_main", ], diff --git a/tensorflow/core/api_def/BUILD b/tensorflow/core/api_def/BUILD index 81187ff6b77..58dbac4e8ed 100644 --- a/tensorflow/core/api_def/BUILD +++ b/tensorflow/core/api_def/BUILD @@ -96,6 +96,7 @@ tf_cc_test( srcs = ["api_test.cc"], data = [ 
":base_api_def", + ":python_api_def", ], deps = [ ":excluded_ops_lib", diff --git a/tensorflow/core/api_def/api_test.cc b/tensorflow/core/api_def/api_test.cc index 112c55ccc3b..477a0b670e4 100644 --- a/tensorflow/core/api_def/api_test.cc +++ b/tensorflow/core/api_def/api_test.cc @@ -41,8 +41,9 @@ namespace tensorflow { namespace { constexpr char kDefaultApiDefDir[] = "tensorflow/core/api_def/base_api"; +constexpr char kPythonApiDefDir[] = + "tensorflow/core/api_def/python_api"; constexpr char kApiDefFilePattern[] = "api_def_*.pbtxt"; -} // namespace // Reads golden ApiDef files and returns a map from file name to ApiDef file // contents. @@ -66,9 +67,93 @@ void GetGoldenApiDefs(Env* env, const string& api_files_dir, } } -class ApiTest : public ::testing::Test { +void TestAllApiDefsHaveCorrespondingOp( + const OpList& ops, const std::unordered_map& api_defs_map) { + std::unordered_set op_names; + for (const auto& op : ops.op()) { + op_names.insert(op.name()); + } + for (const auto& name_and_api_def : api_defs_map) { + ASSERT_TRUE(op_names.find(name_and_api_def.first) != op_names.end()) + << name_and_api_def.first << " op has ApiDef but missing from ops. " + << "Does api_def_" << name_and_api_def.first << " need to be deleted?"; + } +} + +void TestAllApiDefInputArgsAreValid( + const OpList& ops, const std::unordered_map& api_defs_map) { + for (const auto& op : ops.op()) { + const auto api_def_iter = api_defs_map.find(op.name()); + if (api_def_iter == api_defs_map.end()) { + continue; + } + const auto& api_def = api_def_iter->second; + for (const auto& api_def_arg : api_def.in_arg()) { + bool found_arg = false; + for (const auto& op_arg : op.input_arg()) { + if (api_def_arg.name() == op_arg.name()) { + found_arg = true; + break; + } + } + ASSERT_TRUE(found_arg) + << "Input argument " << api_def_arg.name() + << " (overwritten in api_def_" << op.name() + << ".pbtxt) is not defined in OpDef for " << op.name(); + } + } +} + +void TestAllApiDefOutputArgsAreValid( + const OpList& ops, const std::unordered_map& api_defs_map) { + for (const auto& op : ops.op()) { + const auto api_def_iter = api_defs_map.find(op.name()); + if (api_def_iter == api_defs_map.end()) { + continue; + } + const auto& api_def = api_def_iter->second; + for (const auto& api_def_arg : api_def.out_arg()) { + bool found_arg = false; + for (const auto& op_arg : op.output_arg()) { + if (api_def_arg.name() == op_arg.name()) { + found_arg = true; + break; + } + } + ASSERT_TRUE(found_arg) + << "Output argument " << api_def_arg.name() + << " (overwritten in api_def_" << op.name() + << ".pbtxt) is not defined in OpDef for " << op.name(); + } + } +} + +void TestAllApiDefAttributeNamesAreValid( + const OpList& ops, const std::unordered_map& api_defs_map) { + for (const auto& op : ops.op()) { + const auto api_def_iter = api_defs_map.find(op.name()); + if (api_def_iter == api_defs_map.end()) { + continue; + } + const auto& api_def = api_def_iter->second; + for (const auto& api_def_attr : api_def.attr()) { + bool found_attr = false; + for (const auto& op_attr : op.attr()) { + if (api_def_attr.name() == op_attr.name()) { + found_attr = true; + } + } + ASSERT_TRUE(found_attr) + << "Attribute " << api_def_attr.name() << " (overwritten in api_def_" + << op.name() << ".pbtxt) is not defined in OpDef for " << op.name(); + } + } +} +} // namespace + +class BaseApiTest : public ::testing::Test { protected: - ApiTest() { + BaseApiTest() { OpRegistry::Global()->Export(false, &ops_); const std::vector multi_line_fields = {"description"}; @@ -80,7 +165,7 
@@ class ApiTest : public ::testing::Test { }; // Check that all ops have an ApiDef. -TEST_F(ApiTest, AllOpsAreInApiDef) { +TEST_F(BaseApiTest, AllOpsAreInApiDef) { auto* excluded_ops = GetExcludedOps(); for (const auto& op : ops_.op()) { if (excluded_ops->find(op.name()) != excluded_ops->end()) { @@ -94,16 +179,8 @@ TEST_F(ApiTest, AllOpsAreInApiDef) { } // Check that ApiDefs have a corresponding op. -TEST_F(ApiTest, AllApiDefsHaveCorrespondingOp) { - std::unordered_set op_names; - for (const auto& op : ops_.op()) { - op_names.insert(op.name()); - } - for (const auto& name_and_api_def : api_defs_map_) { - ASSERT_TRUE(op_names.find(name_and_api_def.first) != op_names.end()) - << name_and_api_def.first << " op has ApiDef but missing from ops. " - << "Does api_def_" << name_and_api_def.first << " need to be deleted?"; - } +TEST_F(BaseApiTest, AllApiDefsHaveCorrespondingOp) { + TestAllApiDefsHaveCorrespondingOp(ops_, api_defs_map_); } string GetOpDefHasDocStringError(const string& op_name) { @@ -117,7 +194,7 @@ string GetOpDefHasDocStringError(const string& op_name) { // Check that OpDef's do not have descriptions and summaries. // Descriptions and summaries must be in corresponding ApiDefs. -TEST_F(ApiTest, OpDefsShouldNotHaveDocs) { +TEST_F(BaseApiTest, OpDefsShouldNotHaveDocs) { auto* excluded_ops = GetExcludedOps(); for (const auto& op : ops_.op()) { if (excluded_ops->find(op.name()) != excluded_ops->end()) { @@ -143,62 +220,56 @@ TEST_F(ApiTest, OpDefsShouldNotHaveDocs) { // Checks that input arg names in an ApiDef match input // arg names in corresponding OpDef. -TEST_F(ApiTest, AllApiDefInputArgsAreValid) { - for (const auto& op : ops_.op()) { - const auto& api_def = api_defs_map_[op.name()]; - for (const auto& api_def_arg : api_def.in_arg()) { - bool found_arg = false; - for (const auto& op_arg : op.input_arg()) { - if (api_def_arg.name() == op_arg.name()) { - found_arg = true; - break; - } - } - ASSERT_TRUE(found_arg) - << "Input argument " << api_def_arg.name() - << " (overwritten in api_def_" << op.name() - << ".pbtxt) is not defined in OpDef for " << op.name(); - } - } +TEST_F(BaseApiTest, AllApiDefInputArgsAreValid) { + TestAllApiDefInputArgsAreValid(ops_, api_defs_map_); } // Checks that output arg names in an ApiDef match output // arg names in corresponding OpDef. -TEST_F(ApiTest, AllApiDefOutputArgsAreValid) { - for (const auto& op : ops_.op()) { - const auto& api_def = api_defs_map_[op.name()]; - for (const auto& api_def_arg : api_def.out_arg()) { - bool found_arg = false; - for (const auto& op_arg : op.output_arg()) { - if (api_def_arg.name() == op_arg.name()) { - found_arg = true; - break; - } - } - ASSERT_TRUE(found_arg) - << "Output argument " << api_def_arg.name() - << " (overwritten in api_def_" << op.name() - << ".pbtxt) is not defined in OpDef for " << op.name(); - } - } +TEST_F(BaseApiTest, AllApiDefOutputArgsAreValid) { + TestAllApiDefOutputArgsAreValid(ops_, api_defs_map_); } // Checks that attribute names in an ApiDef match attribute // names in corresponding OpDef. 
-TEST_F(ApiTest, AllApiDefAttributeNamesAreValid) { - for (const auto& op : ops_.op()) { - const auto& api_def = api_defs_map_[op.name()]; - for (const auto& api_def_attr : api_def.attr()) { - bool found_attr = false; - for (const auto& op_attr : op.attr()) { - if (api_def_attr.name() == op_attr.name()) { - found_attr = true; - } - } - ASSERT_TRUE(found_attr) - << "Attribute " << api_def_attr.name() << " (overwritten in api_def_" - << op.name() << ".pbtxt) is not defined in OpDef for " << op.name(); - } - } +TEST_F(BaseApiTest, AllApiDefAttributeNamesAreValid) { + TestAllApiDefAttributeNamesAreValid(ops_, api_defs_map_); } + +class PythonApiTest : public ::testing::Test { + protected: + PythonApiTest() { + OpRegistry::Global()->Export(false, &ops_); + const std::vector multi_line_fields = {"description"}; + + Env* env = Env::Default(); + GetGoldenApiDefs(env, kPythonApiDefDir, &api_defs_map_); + } + OpList ops_; + std::unordered_map api_defs_map_; +}; + +// Check that ApiDefs have a corresponding op. +TEST_F(PythonApiTest, AllApiDefsHaveCorrespondingOp) { + TestAllApiDefsHaveCorrespondingOp(ops_, api_defs_map_); +} + +// Checks that input arg names in an ApiDef match input +// arg names in corresponding OpDef. +TEST_F(PythonApiTest, AllApiDefInputArgsAreValid) { + TestAllApiDefInputArgsAreValid(ops_, api_defs_map_); +} + +// Checks that output arg names in an ApiDef match output +// arg names in corresponding OpDef. +TEST_F(PythonApiTest, AllApiDefOutputArgsAreValid) { + TestAllApiDefOutputArgsAreValid(ops_, api_defs_map_); +} + +// Checks that attribute names in an ApiDef match attribute +// names in corresponding OpDef. +TEST_F(PythonApiTest, AllApiDefAttributeNamesAreValid) { + TestAllApiDefAttributeNamesAreValid(ops_, api_defs_map_); +} + } // namespace tensorflow diff --git a/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt index 5d21d7bab69..ac05b54eea9 100644 --- a/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt @@ -20,10 +20,7 @@ END } summary: "Adds a value to the current value of a variable." description: < [3, 4, 0, 1, 2] + +# shifting along multiple dimensions +# 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]] +roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]] + +# shifting along the same axis multiple times +# 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]] +roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]] +``` +END +} diff --git a/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt new file mode 100644 index 00000000000..97c380700a2 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt @@ -0,0 +1,32 @@ +op { + graph_op_name: "UnravelIndex" + in_arg { + name: "indices" + description: <ChunkFromHandle(prev); strings::StrAppend(&dbg, ", prev: ", p->DebugString(a, false)); diff --git a/tensorflow/core/common_runtime/device_set_test.cc b/tensorflow/core/common_runtime/device_set_test.cc index 0507076c8c3..fd9c4222a7a 100644 --- a/tensorflow/core/common_runtime/device_set_test.cc +++ b/tensorflow/core/common_runtime/device_set_test.cc @@ -88,7 +88,9 @@ TEST_F(DeviceSetTest, PrioritizedDeviceTypeList) { // D3 is prioritized below D1. 
AddDevice("d3", "/job:a/replica:0/task:0/device:d3:0"); EXPECT_EQ((std::vector{ - DeviceType("d2"), DeviceType("d1"), DeviceType("d3"), + DeviceType("d2"), + DeviceType("d1"), + DeviceType("d3"), }), types()); } diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc index 20c59ad42b3..df6f4b88773 100644 --- a/tensorflow/core/common_runtime/direct_session.cc +++ b/tensorflow/core/common_runtime/direct_session.cc @@ -61,7 +61,6 @@ limitations under the License. #include "tensorflow/core/util/device_name_utils.h" #include "tensorflow/core/util/env_var.h" - namespace tensorflow { namespace { @@ -472,9 +471,9 @@ Status DirectSession::Run(const RunOptions& run_options, Executor::Args args; args.step_id = step_id_counter_.fetch_add(1); - TF_RETURN_IF_ERROR( - GetOrCreateExecutors(input_tensor_names, output_names, target_nodes, - &executors_and_keys, &run_state_args)); + TF_RETURN_IF_ERROR(GetOrCreateExecutors(input_tensor_names, output_names, + target_nodes, &executors_and_keys, + &run_state_args)); const int64 executor_step_count = executors_and_keys->step_count.fetch_add(1); std::unique_ptr debugger_state; diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc index 99b33e2ef0d..b75a4f76d94 100644 --- a/tensorflow/core/common_runtime/direct_session_test.cc +++ b/tensorflow/core/common_runtime/direct_session_test.cc @@ -436,10 +436,7 @@ TEST(DirectSessionTest, FetchMultipleTimes) { } } -REGISTER_OP("Darth") - .Input("x: float") - .Output("y: float") - .Doc(R"doc( +REGISTER_OP("Darth").Input("x: float").Output("y: float").Doc(R"doc( Darth promises one return value. x: float @@ -972,39 +969,38 @@ static void TestSessionInterOpThreadsImpl(bool use_function_lib, std::atomic num_done(0); // Runs session to compute :0 using inter_op thread pool . 
- auto add_session_run_call = [use_global_pools, &def, &options, &sessions, - &sessions_mu, - &num_done](thread::ThreadPool* tp, Node* node, - int inter_op_pool) { - auto fn = [use_global_pools, &def, &options, &sessions, &sessions_mu, - inter_op_pool, node, &num_done]() { - RunOptions run_options; - run_options.set_inter_op_thread_pool(inter_op_pool); - std::vector outputs; + auto add_session_run_call = + [use_global_pools, &def, &options, &sessions, &sessions_mu, &num_done]( + thread::ThreadPool* tp, Node* node, int inter_op_pool) { + auto fn = [use_global_pools, &def, &options, &sessions, &sessions_mu, + inter_op_pool, node, &num_done]() { + RunOptions run_options; + run_options.set_inter_op_thread_pool(inter_op_pool); + std::vector outputs; - Session* session; - if (use_global_pools) { - std::unique_ptr s(NewSession(options)); - TF_ASSERT_OK(s->Create(def)); - session = s.get(); + Session* session; + if (use_global_pools) { + std::unique_ptr s(NewSession(options)); + TF_ASSERT_OK(s->Create(def)); + session = s.get(); - mutex_lock l(sessions_mu); - sessions.emplace_back(std::move(s)); - } else { - session = sessions[0].get(); - } + mutex_lock l(sessions_mu); + sessions.emplace_back(std::move(s)); + } else { + session = sessions[0].get(); + } - Status s = session->Run(run_options, {} /* inputs */, - {node->name() + ":0"} /* output_names */, {}, - &outputs, nullptr /* run_metadata */); - TF_CHECK_OK(s); - ASSERT_EQ(1, outputs.size()); - auto flat = outputs[0].flat(); - EXPECT_FLOAT_EQ(1.2, flat(0)); - num_done.fetch_add(1); - }; - tp->Schedule(fn); - }; + Status s = session->Run(run_options, {} /* inputs */, + {node->name() + ":0"} /* output_names */, {}, + &outputs, nullptr /* run_metadata */); + TF_CHECK_OK(s); + ASSERT_EQ(1, outputs.size()); + auto flat = outputs[0].flat(); + EXPECT_FLOAT_EQ(1.2, flat(0)); + num_done.fetch_add(1); + }; + tp->Schedule(fn); + }; // For blocking states: // - Starts at 0, BlockingOp::Compute will move to 1. 
diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc index df9cf0c91f1..31fb128f937 100644 --- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc +++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc @@ -161,14 +161,14 @@ static void TestHWAccelerator(bool enableHWTrace) { x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0"); #ifdef TENSORFLOW_USE_SYCL x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0"); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // y = A * x Node* y = test::graph::Matmul(&graph, a, x, false, false); y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0"); #ifdef TENSORFLOW_USE_SYCL -y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0"); -#endif // TENSORFLOW_USE_SYCL + y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0"); +#endif // TENSORFLOW_USE_SYCL Node* y_neg = test::graph::Unary(&graph, "Neg", y); y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0"); @@ -181,7 +181,7 @@ y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0"); (*options.config.mutable_device_count())["GPU"] = 1; #ifdef TENSORFLOW_USE_SYCL (*options.config.mutable_device_count())["SYCL"] = 1; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL options.config.set_allow_soft_placement(true); options.config.mutable_graph_options()->set_build_cost_model(1); std::unique_ptr session(NewSession(options)); diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 9d03caff1e1..e3416da988c 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -1609,7 +1609,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) { auto done = [this, state]() { Device* device = impl_->params_.device; NodeExecStatsWrapper* stats = state->stats; // Shorthand - Entry* first_input = state->first_input; // Shorthand + Entry* first_input = state->first_input; // Shorthand nodestats::SetOpEnd(stats); EntryVector outputs; @@ -1776,6 +1776,19 @@ Status ExecutorState::PrepareInputs(const NodeItem& item, Entry* first_input, entry->ref_mu = nullptr; inp->tensor = entry->val.get(); + // The dtype of entry->ref could have been changed by another operation + // that ran after the operation that "produced" it executed, so + // re-validate that the type of the dereferenced tensor matches the + // expected input type. 
+ if (item.input_type(i) != inp->tensor->dtype()) { + return AttachDef( + errors::InvalidArgument( + i, "-th input expects type ", + DataTypeString(item.input_type(i)), + " but automatically dereferenced input tensor has type ", + DataTypeString(inp->tensor->dtype())), + item.kernel->def()); + } } } } diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc index e9c4328f29e..e1b5404b795 100644 --- a/tensorflow/core/common_runtime/function.cc +++ b/tensorflow/core/common_runtime/function.cc @@ -97,12 +97,11 @@ static Node* AddNoOp(Graph* g) { static Node* AddIdentity(Graph* g, Endpoint input) { DCHECK_LT(0, input.dtype()); - DCHECK_LT(input.dtype(), DT_FLOAT_REF); NodeDef ndef; ndef.set_name(g->NewName(kNodeLabel)); ndef.set_op("Identity"); ndef.add_input(input.name()); - AddNodeAttr("T", input.dtype(), &ndef); + AddNodeAttr("T", BaseType(input.dtype()), &ndef); Status s; Node* ret = g->AddNode(ndef, &s); TF_CHECK_OK(s); @@ -205,7 +204,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime { // The instantiated and transformed function is encoded as a Graph // object, and an executor is created for the graph. struct Item : public core::RefCounted { - const Graph* graph = nullptr; // Owned by exec. + const Graph* graph = nullptr; // Owned by exec. const FunctionLibraryDefinition* overlay_lib = nullptr; // Not owned. FunctionBody* func_graph = nullptr; Executor* exec = nullptr; diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc index 9e4b617d2bd..67caeb3495c 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc @@ -154,8 +154,9 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) { a.DeallocateRaw(t3); a.DeallocateRaw(t4); } - CheckStats(&a, 4097, 0, 1024 * sizeof(float) + 1048576 * sizeof(int64) + - 2048 * sizeof(double) + 10485760 * sizeof(float), + CheckStats(&a, 4097, 0, + 1024 * sizeof(float) + 1048576 * sizeof(int64) + + 2048 * sizeof(double) + 10485760 * sizeof(float), 10485760 * sizeof(float)); // At the end, we should have coalesced all memory into one region diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc index 933d700f604..80a5bdbfff4 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc @@ -762,9 +762,11 @@ int64 MinSystemMemory(int64 available_memory) { // is necessary. 
min_system_memory *= 2; #endif + #if defined(ANDROID_TEGRA) - // 1GB system mem for NVIDIA Tegra devices since they use the same mem for RAM and Video RAM - min_system_memory = 1<<30; + // 1GB system mem for NVIDIA Tegra devices since they use the same mem for RAM + // and Video RAM + min_system_memory = 1 << 30; #endif return min_system_memory; } diff --git a/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc b/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc index 7763a4f2e6f..2500425359c 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc @@ -108,7 +108,8 @@ TEST_F(GpuStreamUtilTest, StreamOverrides) { ops::_Recv(root.WithOpName("input"), DT_FLOAT, "input", "/cpu:0", 0, "/device:GPU:0"); Output n = ops::MatMul(root, {}, {}); - ops::_Send(root.WithOpName("output"), n, "output", "/device:GPU:0", 0, "/cpu:0"); + ops::_Send(root.WithOpName("output"), n, "output", "/device:GPU:0", 0, + "/cpu:0"); Graph g(OpRegistry::Global()); TF_ASSERT_OK(root.ToGraph(&g)); diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc index 995fd1253fb..b195de7cbac 100644 --- a/tensorflow/core/common_runtime/gpu/process_state.cc +++ b/tensorflow/core/common_runtime/gpu/process_state.cc @@ -88,8 +88,8 @@ ProcessState::~ProcessState() { } string ProcessState::MemDesc::DebugString() { - return strings::StrCat((loc == CPU ? "CPU " : "GPU "), dev_index, ", dma: ", - gpu_registered, ", nic: ", nic_registered); + return strings::StrCat((loc == CPU ? "CPU " : "GPU "), dev_index, + ", dma: ", gpu_registered, ", nic: ", nic_registered); } ProcessState::MemDesc ProcessState::PtrType(const void* ptr) { @@ -230,8 +230,24 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) { // TODO(tucker): actually maintain separate CPUAllocators for // different numa_nodes. For now, just one. numa_node = 0; - mutex_lock lock(mu_); + { + // Here we optimize the most common use case where cuda_host_allocators_ + // and cuda_al_ have already been populated and since we're only reading + // these vectors, we can get by with a shared lock. In the slower case, + // we take a unique lock and populate these vectors. + tf_shared_lock lock(mu_); + + if (FLAGS_brain_gpu_record_mem_types && + static_cast(cuda_al_.size()) > 0) { + return cuda_al_[0]; + } + if (static_cast(cuda_host_allocators_.size()) > numa_node) { + return cuda_host_allocators_[0]; + } + } + + mutex_lock lock(mu_); // Find the first valid StreamExecutor to request CUDA host memory // through, since any will work. 
// diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc index 3b309e915cd..33a5d60eb7e 100644 --- a/tensorflow/core/common_runtime/graph_execution_state.cc +++ b/tensorflow/core/common_runtime/graph_execution_state.cc @@ -340,8 +340,11 @@ Status GraphExecutionState::OptimizeGraph( std::unordered_map device_map; Device* cpu_device = nullptr; for (const auto& device : device_set_->devices()) { - device_map[device->name()] = - grappler::GetDeviceInfo(device->parsed_name()); + DeviceProperties props = grappler::GetDeviceInfo(device->parsed_name()); + if (props.type() == "UNKNOWN") { + continue; + } + device_map[device->name()] = props; if (device->parsed_name().id == 0 && StringPiece(device->parsed_name().type) == "CPU" && device->GetAllocator(AllocatorAttributes()) != nullptr) { diff --git a/tensorflow/core/common_runtime/graph_execution_state.h b/tensorflow/core/common_runtime/graph_execution_state.h index db2686ce2c4..2312e1a89fd 100644 --- a/tensorflow/core/common_runtime/graph_execution_state.h +++ b/tensorflow/core/common_runtime/graph_execution_state.h @@ -139,9 +139,7 @@ class GraphExecutionState { // The graph returned by BuildGraph may contain only the pruned // graph, whereas some clients may want access to the full graph. - const Graph* full_graph() { - return graph_; - } + const Graph* full_graph() { return graph_; } // Returns the node with the given name, or null if it does not exist. const Node* get_node_by_name(const string& name) const { diff --git a/tensorflow/core/common_runtime/memory_types.cc b/tensorflow/core/common_runtime/memory_types.cc index 76b926ba400..090a16ebeb1 100644 --- a/tensorflow/core/common_runtime/memory_types.cc +++ b/tensorflow/core/common_runtime/memory_types.cc @@ -47,7 +47,7 @@ struct EndpointEq { static Status ProcessMemoryTypes( const DeviceType& device_type, const Graph* g, const std::function& fn) { - if (device_type != DEVICE_GPU && device_type != DEVICE_SYCL ) { + if (device_type != DEVICE_GPU && device_type != DEVICE_SYCL) { // On non-GPU and non-SYCL devices, HOST_MEMORY and DEVICE_MEMORY are always // compatible. return Status::OK(); diff --git a/tensorflow/core/common_runtime/memory_types_test.cc b/tensorflow/core/common_runtime/memory_types_test.cc index 2a834ddca42..a0935855719 100644 --- a/tensorflow/core/common_runtime/memory_types_test.cc +++ b/tensorflow/core/common_runtime/memory_types_test.cc @@ -36,7 +36,7 @@ TEST(MemoryTypeChecker, Int32OK) { #endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_SYCL, g)); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL delete g; } @@ -64,7 +64,7 @@ TEST(MemoryTypeChecker, Int32NotOk) { // But we can insert _HostSend/_HostRecv to ensure the invariant. TF_EXPECT_OK(EnsureMemoryTypes(DEVICE_SYCL, "/device:SYCL:0", g)); TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_SYCL, g)); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL delete g; } @@ -91,7 +91,7 @@ TEST(MemoryTypeChecker, MemoryTypeForOutput) { TF_EXPECT_OK(MemoryTypeForOutput(DEVICE_SYCL, g, si, 0, &memory_type)); // int Switch's output on GPU has HOST_MEMORY constraint. 
EXPECT_EQ(memory_type, HOST_MEMORY); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL delete g; } diff --git a/tensorflow/core/common_runtime/placer.h b/tensorflow/core/common_runtime/placer.h index c5b76592e1b..75dce7c7feb 100644 --- a/tensorflow/core/common_runtime/placer.h +++ b/tensorflow/core/common_runtime/placer.h @@ -88,9 +88,9 @@ class Placer { void AssignAndLog(int assigned_device, Node* node) const; void LogDeviceAssignment(const Node* node) const; - Graph* const graph_; // Not owned. - const DeviceSet* const devices_; // Not owned. - const SessionOptions* options_; // Not owned. + Graph* const graph_; // Not owned. + const DeviceSet* const devices_; // Not owned. + const SessionOptions* options_; // Not owned. const bool log_device_placement_; TF_DISALLOW_COPY_AND_ASSIGN(Placer); diff --git a/tensorflow/core/common_runtime/placer_test.cc b/tensorflow/core/common_runtime/placer_test.cc index 5d87b1e279a..02c9cd5313e 100644 --- a/tensorflow/core/common_runtime/placer_test.cc +++ b/tensorflow/core/common_runtime/placer_test.cc @@ -619,9 +619,9 @@ TEST_F(PlacerTest, TestReferenceConnectionIgnoreInfeasible) { Node* input = ops::SourceOp( "TestDevice", b.opts().WithName("in").WithDevice("/job:a/task:0/device:fakegpu:0")); - Node* var = ops::SourceOp("TestVariable", - b.opts().WithName("var_0").WithDevice( - "/job:a/task:0/device:fakegpu:0")); + Node* var = + ops::SourceOp("TestVariable", b.opts().WithName("var_0").WithDevice( + "/job:a/task:0/device:fakegpu:0")); // This op is specified on CPU, but in practice will be ignored, // because the reference edges forces it on GPU. diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc index 12947e284a3..dd4bf6a3457 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime.cc +++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc @@ -158,7 +158,7 @@ Status ProcessFunctionLibraryRuntime::GetDeviceContext( } FunctionLibraryRuntime* ProcessFunctionLibraryRuntime::GetFLR( - const string& device_name) { + const string& device_name) const { Device* device = nullptr; if (device_name != kDefaultFLRDevice) { if (!device_mgr_->LookupDevice(device_name, &device).ok()) { diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h index a1adc4b6b35..9c9c92f1ea0 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime.h +++ b/tensorflow/core/common_runtime/process_function_library_runtime.h @@ -85,7 +85,7 @@ class ProcessFunctionLibraryRuntime { static const char kDefaultFLRDevice[]; // Returns the FunctionLibraryRuntime for the corresponding device_name. - FunctionLibraryRuntime* GetFLR(const string& device_name); + FunctionLibraryRuntime* GetFLR(const string& device_name) const; // Returns the device incarnation for the given device_name. 
Status GetDeviceIncarnation(const string& device_name, int64* incarnation); diff --git a/tensorflow/core/common_runtime/session_factory.cc b/tensorflow/core/common_runtime/session_factory.cc index 0234d4c3725..4dbe113e44e 100644 --- a/tensorflow/core/common_runtime/session_factory.cc +++ b/tensorflow/core/common_runtime/session_factory.cc @@ -60,8 +60,8 @@ const string RegisteredFactoriesErrorMessageLocked() { str_util::Join(factory_types, ", "), "}."); } string SessionOptionsToString(const SessionOptions& options) { - return strings::StrCat("target: \"", options.target, "\" config: ", - ProtoShortDebugString(options.config)); + return strings::StrCat("target: \"", options.target, + "\" config: ", ProtoShortDebugString(options.config)); } } // namespace diff --git a/tensorflow/core/common_runtime/step_stats_collector.cc b/tensorflow/core/common_runtime/step_stats_collector.cc index d7e01144c9e..cb900db10af 100644 --- a/tensorflow/core/common_runtime/step_stats_collector.cc +++ b/tensorflow/core/common_runtime/step_stats_collector.cc @@ -226,22 +226,23 @@ void StepStatsCollector::BuildCostModel( if (node) { for (int i = 0; i < stats.output_size(); ++i) { const auto& output = stats.output(i); - cm->RecordMaxMemorySize(node, i, Bytes(output.tensor_description() - .allocation_description() - .allocated_bytes()), + cm->RecordMaxMemorySize(node, i, + Bytes(output.tensor_description() + .allocation_description() + .allocated_bytes()), stats.output(i).tensor_description().shape(), node->output_types()[i]); - cm->RecordAllocationId(node, i, output.tensor_description() - .allocation_description() - .allocation_id()); + cm->RecordAllocationId(node, i, + output.tensor_description() + .allocation_description() + .allocation_id()); } cm->RecordMemoryStats(node, stats.memory_stats()); // Use hardware stats to record the execution time if they're available, // otherwise use the regular (less accurate) stats string node_name = dev_stats.regular_stats->node_stats(i).node_name(); - if (dev_stats.hardware_stats && - name_to_hw_node_stats.find(node_name) != - name_to_hw_node_stats.end()) { + if (dev_stats.hardware_stats && name_to_hw_node_stats.find(node_name) != + name_to_hw_node_stats.end()) { const NodeExecStats& hw_stats = name_to_hw_node_stats[node_name]; cm->RecordMaxExecutionTime( node, Microseconds(hw_stats.op_end_rel_micros())); diff --git a/tensorflow/core/common_runtime/sycl/sycl_allocator.cc b/tensorflow/core/common_runtime/sycl/sycl_allocator.cc index 9094824ee73..02bd8b8f3bc 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_allocator.cc +++ b/tensorflow/core/common_runtime/sycl/sycl_allocator.cc @@ -80,7 +80,7 @@ void SYCLAllocator::ClearStats() override { size_t SYCLAllocator::RequestedSize(void* ptr) { mutex_lock lock(mu_); - if(!sycl_device_) { + if (!sycl_device_) { return 0; } const auto& buffer = sycl_device_->get_sycl_buffer(ptr); diff --git a/tensorflow/core/common_runtime/sycl/sycl_allocator.h b/tensorflow/core/common_runtime/sycl/sycl_allocator.h index cca9f92c62e..550f1933322 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_allocator.h +++ b/tensorflow/core/common_runtime/sycl/sycl_allocator.h @@ -20,10 +20,10 @@ limitations under the License. 
#ifndef TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_ #define TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_ +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/types.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { @@ -56,14 +56,13 @@ class SYCLAllocator : public Allocator { // Clear the SYCL device used by the Allocator void ClearSYCLDevice() { mutex_lock lock(mu_); - if(sycl_device_) { + if (sycl_device_) { delete sycl_device_; sycl_device_ = nullptr; } } private: - mutable mutex mu_; Eigen::SyclDevice* sycl_device_ GUARDED_BY(mu_); // owned AllocatorStats stats_ GUARDED_BY(mu_); diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.h b/tensorflow/core/common_runtime/sycl/sycl_device.h index cc272d156ef..7c09e0b8f19 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_device.h +++ b/tensorflow/core/common_runtime/sycl/sycl_device.h @@ -187,9 +187,9 @@ class GSYCLInterface { type = "Unknown"; } - return strings::StrCat("id: ", device_id, ", type: ", type, ", name: ", - name.c_str(), ", vendor: ", vendor.c_str(), - ", profile: ", profile.c_str()); + return strings::StrCat( + "id: ", device_id, ", type: ", type, ", name: ", name.c_str(), + ", vendor: ", vendor.c_str(), ", profile: ", profile.c_str()); } }; diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc index 19c14770dca..14f7727659d 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc +++ b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc @@ -26,7 +26,6 @@ class SYCLDeviceFactory : public DeviceFactory { public: Status CreateDevices(const SessionOptions &options, const string &name_prefix, std::vector *devices) override { - auto syclInterface = GSYCLInterface::instance(); size_t n = 1; @@ -37,13 +36,11 @@ class SYCLDeviceFactory : public DeviceFactory { for (int i = 0; i < n; i++) { string name = strings::StrCat(name_prefix, "/device:SYCL:", i); - devices->push_back( - new SYCLDevice(options, name, Bytes(256 << 20), DeviceLocality() - , syclInterface->GetShortDeviceDescription(i) - , syclInterface->GetSYCLAllocator(i) - , syclInterface->GetCPUAllocator(i) - , syclInterface->GetSYCLContext(i)) - ); + devices->push_back(new SYCLDevice( + options, name, Bytes(256 << 20), DeviceLocality(), + syclInterface->GetShortDeviceDescription(i), + syclInterface->GetSYCLAllocator(i), syclInterface->GetCPUAllocator(i), + syclInterface->GetSYCLContext(i))); } return Status::OK(); @@ -51,6 +48,6 @@ class SYCLDeviceFactory : public DeviceFactory { }; REGISTER_LOCAL_DEVICE_FACTORY("SYCL", SYCLDeviceFactory, 200); -} +} // namespace tensorflow #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/common_runtime/sycl/sycl_util.h b/tensorflow/core/common_runtime/sycl/sycl_util.h index 83016b706a5..3124ed23c92 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_util.h +++ b/tensorflow/core/common_runtime/sycl/sycl_util.h @@ -20,8 +20,8 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_ #define TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_ -#include "tensorflow/core/common_runtime/device.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/common_runtime/device.h" // For DMA helper #include "tensorflow/core/common_runtime/dma_helper.h" #include "tensorflow/core/framework/tensor.h" diff --git a/tensorflow/core/debug/debug_gateway.cc b/tensorflow/core/debug/debug_gateway.cc index 616ced3d0f3..2e1aabd1cc8 100644 --- a/tensorflow/core/debug/debug_gateway.cc +++ b/tensorflow/core/debug/debug_gateway.cc @@ -24,31 +24,31 @@ limitations under the License. namespace tensorflow { DebugGateway::DebugGateway(DirectSession* session) : session_(session) { - session_->node_outputs_callback_ = [this]( - const string& node_name, const int output_slot, const Tensor* tensor, - const bool is_ref, OpKernelContext* ctx) { - if (comp_cb_ != nullptr && output_slot <= 0) { - // The node completion callback is invoked once for a node regardless - // of whether the node has zero, one or more outputs. - // The output_slot can be negative (-1, or kControlSlot) if - // node_outputs_callback_ is invoked for a node with no output. If that - // is the case, notify the callback that the node in question has no - // output. - comp_cb_(node_name, output_slot == 0); - } + session_->node_outputs_callback_ = + [this](const string& node_name, const int output_slot, + const Tensor* tensor, const bool is_ref, OpKernelContext* ctx) { + if (comp_cb_ != nullptr && output_slot <= 0) { + // The node completion callback is invoked once for a node regardless + // of whether the node has zero, one or more outputs. + // The output_slot can be negative (-1, or kControlSlot) if + // node_outputs_callback_ is invoked for a node with no output. If + // that is the case, notify the callback that the node in question has + // no output. + comp_cb_(node_name, output_slot == 0); + } - // Copy tensor values (e.g., from GPU to host) only if the - // value callback is not nullptr. - if (val_cb_ != nullptr && output_slot >= 0) { - CopyTensor( - node_name, output_slot, tensor, ctx, - [this, node_name, output_slot, is_ref](const Tensor* copied_tensor) { - val_cb_(node_name, output_slot, *copied_tensor, is_ref); - }); - } + // Copy tensor values (e.g., from GPU to host) only if the + // value callback is not nullptr. + if (val_cb_ != nullptr && output_slot >= 0) { + CopyTensor(node_name, output_slot, tensor, ctx, + [this, node_name, output_slot, + is_ref](const Tensor* copied_tensor) { + val_cb_(node_name, output_slot, *copied_tensor, is_ref); + }); + } - return Status::OK(); - }; + return Status::OK(); + }; } DebugGateway::~DebugGateway() { @@ -86,7 +86,8 @@ void DebugGateway::CopyTensor(const string& node_name, const int output_slot, // Determine if the tensor is on device (GPU) or host (CPU). // The second part of the check is necessary because even an OpKernel on // may have output tensors allocated on CPU. - if ((device->name().find("GPU:") != string::npos || device->name().find("SYCL:") != string::npos) && + if ((device->name().find("GPU:") != string::npos || + device->name().find("SYCL:") != string::npos) && !ctx->output_alloc_attr(output_slot).on_host()) { // GPU tensors: Copy it to host (CPU). 
DeviceContext* device_ctxt = ctx->op_device_context(); diff --git a/tensorflow/core/debug/debug_gateway_test.cc b/tensorflow/core/debug/debug_gateway_test.cc index 57583349069..b1bbd3f6980 100644 --- a/tensorflow/core/debug/debug_gateway_test.cc +++ b/tensorflow/core/debug/debug_gateway_test.cc @@ -390,9 +390,9 @@ TEST_F(SessionDebugMinusAXTest, debug_gateway.SetNodeValueCallback( [this, &mu, &val_callback_count, &a_debug_identity_node_name, &x_debug_identity_node_name, &y_debug_identity_node_name, - &debug_identity_tensor_vals, &callbacks_done, &kConcurrentRuns]( - const string& node_name, const int output_slot, - const Tensor& tensor_value, const bool is_ref) { + &debug_identity_tensor_vals, &callbacks_done, + &kConcurrentRuns](const string& node_name, const int output_slot, + const Tensor& tensor_value, const bool is_ref) { mutex_lock l(mu); if (node_name == a_debug_identity_node_name && output_slot == 0) { @@ -560,21 +560,21 @@ TEST_F(SessionDebugOutputSlotWithoutOutgoingEdgeTest, Notification callbacks_done; std::vector debug_identity_tensor_vals; - debug_gateway.SetNodeValueCallback([this, &mu, &callbacks_done, - &debug_identity_node_name, - &debug_identity_tensor_vals]( - const string& node_name, const int output_slot, - const Tensor& tensor_value, const bool is_ref) { - mutex_lock l(mu); + debug_gateway.SetNodeValueCallback( + [this, &mu, &callbacks_done, &debug_identity_node_name, + &debug_identity_tensor_vals]( + const string& node_name, const int output_slot, + const Tensor& tensor_value, const bool is_ref) { + mutex_lock l(mu); - if (node_name == debug_identity_node_name && output_slot == 0) { - debug_identity_tensor_vals.push_back(tensor_value); + if (node_name == debug_identity_node_name && output_slot == 0) { + debug_identity_tensor_vals.push_back(tensor_value); - if (!callbacks_done.HasBeenNotified()) { - callbacks_done.Notify(); - } - } - }); + if (!callbacks_done.HasBeenNotified()) { + callbacks_done.Notify(); + } + } + }); // Add DebugIdentity watch on c:0, which does not have an outgoing edge. 
RunOptions run_opts; diff --git a/tensorflow/core/debug/debug_grpc_testlib.cc b/tensorflow/core/debug/debug_grpc_testlib.cc index a312f789d84..f70931e9265 100644 --- a/tensorflow/core/debug/debug_grpc_testlib.cc +++ b/tensorflow/core/debug/debug_grpc_testlib.cc @@ -30,7 +30,7 @@ namespace test { ::grpc::Status TestEventListenerImpl::SendEvents( ::grpc::ServerContext* context, - ::grpc::ServerReaderWriter< ::tensorflow::EventReply, ::tensorflow::Event>* + ::grpc::ServerReaderWriter<::tensorflow::EventReply, ::tensorflow::Event>* stream) { Event event; diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc index f81445c20bd..baa8c08fdf1 100644 --- a/tensorflow/core/debug/debug_io_utils.cc +++ b/tensorflow/core/debug/debug_io_utils.cc @@ -574,8 +574,6 @@ Status DebugIO::CloseDebugURL(const string& debug_url) { } } -static Status CloseDebugURL(const string& debug_url) { return Status::OK(); } - Status DebugFileIO::DumpTensorToDir(const DebugNodeKey& debug_node_key, const Tensor& tensor, const uint64 wall_time_us, diff --git a/tensorflow/core/debug/debug_io_utils_test.cc b/tensorflow/core/debug/debug_io_utils_test.cc index 2f83c2415b8..0807a85b8b3 100644 --- a/tensorflow/core/debug/debug_io_utils_test.cc +++ b/tensorflow/core/debug/debug_io_utils_test.cc @@ -57,7 +57,8 @@ class DebugIOUtilsTest : public ::testing::Test { TEST_F(DebugIOUtilsTest, ConstructDebugNodeKey) { DebugNodeKey debug_node_key("/job:worker/replica:1/task:0/device:GPU:2", "hidden_1/MatMul", 0, "DebugIdentity"); - EXPECT_EQ("/job:worker/replica:1/task:0/device:GPU:2", debug_node_key.device_name); + EXPECT_EQ("/job:worker/replica:1/task:0/device:GPU:2", + debug_node_key.device_name); EXPECT_EQ("hidden_1/MatMul", debug_node_key.node_name); EXPECT_EQ(0, debug_node_key.output_slot); EXPECT_EQ("DebugIdentity", debug_node_key.debug_op); diff --git a/tensorflow/core/distributed_runtime/graph_mgr.h b/tensorflow/core/distributed_runtime/graph_mgr.h index d0ca2a62577..cc35264b8fe 100644 --- a/tensorflow/core/distributed_runtime/graph_mgr.h +++ b/tensorflow/core/distributed_runtime/graph_mgr.h @@ -140,7 +140,7 @@ class GraphMgr { GraphMgr* graph_mgr; }; - const WorkerEnv* worker_env_; // Not owned. + const WorkerEnv* worker_env_; // Not owned. DeviceMgr* device_mgr_; CostModelManager cost_model_manager_; diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc index d1dc622ce79..1a488303ac7 100644 --- a/tensorflow/core/distributed_runtime/master.cc +++ b/tensorflow/core/distributed_runtime/master.cc @@ -528,8 +528,8 @@ void Master::ListDevices(const ListDevicesRequest* req, auto session = FindMasterSession(req->session_handle()); if (session == nullptr) { done(errors::InvalidArgument( - "Session ", req->session_handle(), - " is not found. Possibly, this master has restarted.")); + "Session ", req->session_handle(), + " is not found. Possibly, this master has restarted.")); return; } core::ScopedUnref ref(session); diff --git a/tensorflow/core/distributed_runtime/master_test.cc b/tensorflow/core/distributed_runtime/master_test.cc index 121c58762f1..f2c1f3489c3 100644 --- a/tensorflow/core/distributed_runtime/master_test.cc +++ b/tensorflow/core/distributed_runtime/master_test.cc @@ -61,7 +61,7 @@ class MasterTest : public ::testing::Test { // rpc calls. 
Status CreateSession(const GraphDef& def, string* handle, - int64* initial_version) { + int64* initial_version) { ::grpc::ClientContext ctx; CreateSessionRequest req; *(req.mutable_graph_def()) = def; @@ -77,7 +77,7 @@ class MasterTest : public ::testing::Test { } Status ExtendSession(const string& handle, const GraphDef& def, - int64 current_version, int64* new_version) { + int64 current_version, int64* new_version) { ::grpc::ClientContext ctx; ExtendSessionRequest req; req.set_session_handle(handle); diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc index ac279937730..b4d18d8607e 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc @@ -185,23 +185,22 @@ class GrpcMasterService : public AsyncServiceInterface { MutableRunStepResponseWrapper* wrapped_response = new NonOwnedProtoRunStepResponse(&call->response); call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); }); - master_impl_->RunStep(call_opts, wrapped_request, wrapped_response, - [call, call_opts, wrapped_request, wrapped_response, - trace](const Status& status) { - call->ClearCancelCallback(); - delete call_opts; - delete wrapped_request; - delete trace; - if (call->request.store_errors_in_response_body() && - !status.ok()) { - call->response.set_status_code(status.code()); - call->response.set_status_error_message( - status.error_message()); - call->SendResponse(ToGrpcStatus(Status::OK())); - } else { - call->SendResponse(ToGrpcStatus(status)); - } - }); + master_impl_->RunStep( + call_opts, wrapped_request, wrapped_response, + [call, call_opts, wrapped_request, wrapped_response, + trace](const Status& status) { + call->ClearCancelCallback(); + delete call_opts; + delete wrapped_request; + delete trace; + if (call->request.store_errors_in_response_body() && !status.ok()) { + call->response.set_status_code(status.code()); + call->response.set_status_error_message(status.error_message()); + call->SendResponse(ToGrpcStatus(Status::OK())); + } else { + call->SendResponse(ToGrpcStatus(status)); + } + }); ENQUEUE_REQUEST(RunStep, true); } diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h index 4e203e260a1..6ae94b74417 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h @@ -89,9 +89,9 @@ class MasterService final { ::grpc::Status ExtendSession(::grpc::ClientContext* context, const ExtendSessionRequest& request, ExtendSessionResponse* response) override; - ::grpc::Status PartialRunSetup( - ::grpc::ClientContext* context, const PartialRunSetupRequest& request, - PartialRunSetupResponse* response) override; + ::grpc::Status PartialRunSetup(::grpc::ClientContext* context, + const PartialRunSetupRequest& request, + PartialRunSetupResponse* response) override; ::grpc::Status RunStep(::grpc::ClientContext* context, const RunStepRequest& request, RunStepResponse* response) override; diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc index 70418f63686..1088e9be66c 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc @@ -69,8 +69,7 @@ class GrpcRemoteMaster : public MasterInterface { 
::grpc::ClientContext ctx; auto trace = TraceRpc("RunStep/Client", &ctx); return Call(&ctx, call_options, &request->ToProto(), - get_proto_from_wrapper(response), - &MasterServiceStub::RunStep); + get_proto_from_wrapper(response), &MasterServiceStub::RunStep); } Status CloseSession(CallOptions* call_options, @@ -114,8 +113,9 @@ class GrpcRemoteMaster : public MasterInterface { template Status Call(::grpc::ClientContext* ctx, CallOptions* call_options, const Request* request, Response* response, - ::grpc::Status (MasterServiceStub::*pfunc)( - ::grpc::ClientContext*, const Request&, Response*)) { + ::grpc::Status (MasterServiceStub::*pfunc)(::grpc::ClientContext*, + const Request&, + Response*)) { ctx->set_fail_fast(false); SetDeadline(ctx, call_options->GetTimeout()); return FromGrpcStatus((stub_.get()->*pfunc)(ctx, *request, response)); diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h b/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h index dd114d39c62..730124c25e9 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h @@ -66,7 +66,7 @@ class GrpcBufferWriter final } // It's dangerous to keep an inlined grpc_slice as the backup slice, since // on a following Next() call, a reference will be returned to this slice - // via GRPC_SLICE_START_PTR, which will not be an adddress held by + // via GRPC_SLICE_START_PTR, which will not be an address held by // slice_buffer_. have_backup_ = backup_slice_.refcount != NULL; byte_count_ -= count; diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_testlib_ops.cc b/tensorflow/core/distributed_runtime/rpc/grpc_testlib_ops.cc index 373eecffcab..5597ee7a76a 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_testlib_ops.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_testlib_ops.cc @@ -21,11 +21,8 @@ namespace tensorflow { namespace test { // ErrorOp::Compute returns an error. -REGISTER_OP("Error") - .Input("in: T") - .Output("out: T") - .Attr("T: type") - .Attr("message: string"); +REGISTER_OP("Error").Input("in: T").Output("out: T").Attr("T: type").Attr( + "message: string"); class ErrorOp : public OpKernel { public: explicit ErrorOp(OpKernelConstruction* ctx) : OpKernel(ctx) { @@ -66,11 +63,8 @@ REGISTER_KERNEL_BUILDER(Name("InvalidRefType").Device(DEVICE_CPU), // DelayOp::AsyncCompute sleeps for "micros"-econd and then returns // its input. -REGISTER_OP("Delay") - .Input("in: T") - .Output("out: T") - .Attr("T: type") - .Attr("micros: int"); +REGISTER_OP("Delay").Input("in: T").Output("out: T").Attr("T: type").Attr( + "micros: int"); class DelayOp : public AsyncOpKernel { public: explicit DelayOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) { diff --git a/tensorflow/core/distributed_runtime/rpcbench_test.cc b/tensorflow/core/distributed_runtime/rpcbench_test.cc index b2668fae25a..d3af7417e61 100644 --- a/tensorflow/core/distributed_runtime/rpcbench_test.cc +++ b/tensorflow/core/distributed_runtime/rpcbench_test.cc @@ -184,8 +184,8 @@ static void BM_Helper(int iters, int width, int num_stages, int tensor_size, testing::SetLabel( strings::StrCat(def.node_size(), " nodes; ", - use_multiple_devices ? "Multi device" : "Single device", - "; tensor bytes/send: ", tensor_size * sizeof(float))); + use_multiple_devices ? 
"Multi device" : "Single device", + "; tensor bytes/send: ", tensor_size * sizeof(float))); std::vector outputs; diff --git a/tensorflow/core/distributed_runtime/scheduler.cc b/tensorflow/core/distributed_runtime/scheduler.cc index 4766f4c33b6..9dae5b3b926 100644 --- a/tensorflow/core/distributed_runtime/scheduler.cc +++ b/tensorflow/core/distributed_runtime/scheduler.cc @@ -17,9 +17,9 @@ limitations under the License. #include -#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/util/util.h" namespace tensorflow { diff --git a/tensorflow/core/distributed_runtime/scheduler.h b/tensorflow/core/distributed_runtime/scheduler.h index eabcaccdd1e..ef87b9834db 100644 --- a/tensorflow/core/distributed_runtime/scheduler.h +++ b/tensorflow/core/distributed_runtime/scheduler.h @@ -16,15 +16,15 @@ limitations under the License. #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SCHEDULER_H_ #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SCHEDULER_H_ -#include #include +#include #include #include #include -#include "tensorflow/core/graph/costmodel.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/graph/costmodel.h" namespace tensorflow { diff --git a/tensorflow/core/distributed_runtime/tensor_coding.cc b/tensorflow/core/distributed_runtime/tensor_coding.cc index fe2d1a12934..34a4013547b 100644 --- a/tensorflow/core/distributed_runtime/tensor_coding.cc +++ b/tensorflow/core/distributed_runtime/tensor_coding.cc @@ -81,7 +81,7 @@ void TensorResponse::InitPartial(const RecvTensorResponse& response) { Status TensorResponse::ParseFrom(Source* source) { if (!on_host_) { protobuf::io::CodedInputStream input(source->contents()); - input.SetTotalBytesLimit(INT_MAX, INT_MAX); // Unlimited + input.SetTotalBytesLimit(INT_MAX); // Unlimited // Pre-parse into local storage, then delegate to device. 
if (!meta_.ParseFromCodedStream(&input) || !input.ConsumedEntireMessage()) { @@ -217,7 +217,7 @@ bool TensorResponse::ParseTensorSubmessage( bool TensorResponse::ParseFast(Source* source) { protobuf::io::CodedInputStream input(source->contents()); - input.SetTotalBytesLimit(INT_MAX, INT_MAX); // Unlimited + input.SetTotalBytesLimit(INT_MAX); // Unlimited while (true) { auto p = input.ReadTagWithCutoff(127); int tag = GetTagFieldNumber(p.first); diff --git a/tensorflow/core/distributed_runtime/worker_cache_logger.cc b/tensorflow/core/distributed_runtime/worker_cache_logger.cc index 702af78c880..95ca3c3b4d1 100644 --- a/tensorflow/core/distributed_runtime/worker_cache_logger.cc +++ b/tensorflow/core/distributed_runtime/worker_cache_logger.cc @@ -97,9 +97,8 @@ void WorkerCacheLogger::RecordDataTransfer(int64 step_id, int64 start_usecs, const string& tensor_name, const string& src_device, const string& dst_device, - int64 bytes, - const string& details, - const string& transfer_method_name){ + int64 bytes, const string& details, + const string& transfer_method_name) { NodeExecStats* ns = new NodeExecStats; ns->set_node_name(transfer_method_name); if (details.empty()) { diff --git a/tensorflow/core/framework/bfloat16.cc b/tensorflow/core/framework/bfloat16.cc index 0efe43fde2d..6025be51704 100644 --- a/tensorflow/core/framework/bfloat16.cc +++ b/tensorflow/core/framework/bfloat16.cc @@ -21,13 +21,13 @@ void FloatToBFloat16(const float* src, bfloat16* dst, int64 size) { const uint16_t* p = reinterpret_cast(src); uint16_t* q = reinterpret_cast(dst); #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - for (; size != 0; p += 2, q++, size--) { - *q = p[0]; - } + for (; size != 0; p += 2, q++, size--) { + *q = p[0]; + } #else - for (; size != 0; p += 2, q++, size--) { - *q = p[1]; - } + for (; size != 0; p += 2, q++, size--) { + *q = p[1]; + } #endif } @@ -35,15 +35,15 @@ void BFloat16ToFloat(const bfloat16* src, float* dst, int64 size) { const uint16_t* p = reinterpret_cast(src); uint16_t* q = reinterpret_cast(dst); #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - for (; size != 0; p++, q += 2, size--) { - q[0] = *p; - q[1] = 0; - } + for (; size != 0; p++, q += 2, size--) { + q[0] = *p; + q[1] = 0; + } #else - for (; size != 0; p++, q += 2, size--) { - q[0] = 0; - q[1] = *p; - } + for (; size != 0; p++, q += 2, size--) { + q[0] = 0; + q[1] = *p; + } #endif } diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc index 7ab8e3ec188..8bb87483e1c 100644 --- a/tensorflow/core/framework/common_shape_fns.cc +++ b/tensorflow/core/framework/common_shape_fns.cc @@ -1356,10 +1356,11 @@ Status ScatterNdUpdateShape(InferenceContext* c) { Status s = c->Merge(prefix_indices, prefix_updates, &unused); if (!s.ok()) { return errors::InvalidArgument( - "The outer ", num_outer_dims, " dimensions of indices.shape=", - c->DebugString(indices_shape), " must match the outer ", - num_outer_dims, " dimensions of updates.shape=", - c->DebugString(updates_shape), ": ", s.error_message()); + "The outer ", num_outer_dims, + " dimensions of indices.shape=", c->DebugString(indices_shape), + " must match the outer ", num_outer_dims, + " dimensions of updates.shape=", c->DebugString(updates_shape), + ": ", s.error_message()); } ShapeHandle input_suffix; diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h index 2c2c7e7c585..96566c285a2 100644 --- a/tensorflow/core/framework/dataset.h +++ b/tensorflow/core/framework/dataset.h @@ -1,4 +1,4 @@ -/* Copyright 
2018 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,64 +12,603 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_DATASET_H_ +#define TENSORFLOW_CORE_FRAMEWORK_DATASET_H_ -#ifndef TENSORFLOW_FRAMEWORK_DATASET_H_ -#define TENSORFLOW_FRAMEWORK_DATASET_H_ +#include + +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/attr_value_util.h" +#include "tensorflow/core/framework/dataset_stateful_op_whitelist.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/framework/variant_encode_decode.h" +#include "tensorflow/core/framework/variant_tensor_data.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/tracing.h" + +// Polymorphic datasets should support all primitive TensorFlow +// types. Use this macro to expand `m(T)` once for each primitive type +// `T`, e.g. to build a `switch` statement. +#define TF_CALL_DATASET_TYPES(m) TF_CALL_ALL_TYPES(m) TF_CALL_QUANTIZED_TYPES(m) namespace tensorflow { -namespace dataset { -// Registry for stateful ops that need to be used in dataset functions. -// See below macro for usage details. -class WhitelistedStatefulOpRegistry { + +// Interface for reading values from a key-value store. +// Used for restoring iterator state. +class IteratorStateReader { public: - Status Add(StringPiece op_name) { - op_names_.insert(op_name); + virtual Status ReadScalar(StringPiece key, int64* val) = 0; + virtual Status ReadScalar(StringPiece key, string* val) = 0; + virtual Status ReadTensor(StringPiece key, Tensor* val) = 0; + virtual bool Contains(StringPiece key) = 0; + + virtual ~IteratorStateReader() {} +}; + +// Interface for writing values to a key-value store. +// Used for saving iterator state. +class IteratorStateWriter { + public: + virtual Status WriteScalar(StringPiece key, const int64 val) = 0; + virtual Status WriteScalar(StringPiece key, const string& val) = 0; + virtual Status WriteTensor(StringPiece key, const Tensor& val) = 0; + + virtual ~IteratorStateWriter() {} +}; + +// Forward declarations to avoid introducing a dependency on headers in +// "tensorflow/core/graph/...". +class GraphDefBuilder; +class GraphDatasetBase; +class Node; + +// Wrapper around GraphDefBuilder. Used to serialize Dataset graph. +class GraphDefBuilderWrapper { + public: + explicit GraphDefBuilderWrapper(GraphDefBuilder* b) : b_(b) {} + + // Adds a Const node with scalar value to the Graph. + // `*output` contains a pointer to the output `Node`. It is guaranteed to be + // non-null if the method returns with an OK status. + // The returned Node pointer is owned by the backing Graph of GraphDefBuilder. 
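These reader/writer interfaces are the key-value surface an iterator uses to checkpoint and restore itself. A minimal sketch of a SaveInternal/RestoreInternal pair built on them, assuming an illustrative int64 member `next_index_` (real iterators usually namespace their keys via DatasetIterator::full_name, which appears further down in this header):

// Illustrative only: `next_index_` and the bare key name are made up for this sketch.
Status SaveInternal(IteratorStateWriter* writer) override {
  return writer->WriteScalar("next_index", next_index_);
}

Status RestoreInternal(IteratorContext* ctx,
                       IteratorStateReader* reader) override {
  return reader->ReadScalar("next_index", &next_index_);
}

int64 next_index_ = 0;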
+ template + Status AddScalar(const T& val, Node** output) { + Tensor val_t = Tensor(DataTypeToEnum::v(), TensorShape({})); + val_t.scalar()() = val; + AddTensorInternal(val_t, output); + if (*output == nullptr) { + return errors::Internal("AddScalar: Failed to build Const op."); + } return Status::OK(); } - bool Contains(StringPiece op_name) { - return op_names_.find(op_name) != op_names_.end(); + // Adds a Const node with vector value to the Graph. + // `*output` contains a pointer to the output `Node`. It is guaranteed to be + // non-null if the method returns with an OK status. + // The returned Node pointer is owned by the backing Graph of GraphDefBuilder. + // TODO(shivaniagrawal): Consider changing to gtl::ArraySlice? + template + Status AddVector(const std::vector& val, Node** output) { + Tensor val_t = Tensor(DataTypeToEnum::v(), + TensorShape({static_cast(val.size())})); + for (int i = 0; i < val.size(); i++) { + val_t.flat()(i) = val[i]; + } + AddTensorInternal(val_t, output); + if (*output == nullptr) { + return errors::Internal("AddVector: Failed to build Const op."); + } + return Status::OK(); } - static WhitelistedStatefulOpRegistry* Global() { - static WhitelistedStatefulOpRegistry* reg = - new WhitelistedStatefulOpRegistry; - return reg; + // Adds a Const node with Tensor value to the Graph. + // `*output` contains a pointer to the output `Node`. It is guaranteed to be + // non-null if the method returns with an OK status. + // The returned Node pointer is owned by the backing Graph of GraphDefBuilder. + Status AddTensor(const Tensor& val, Node** output) { + AddTensorInternal(val, output); + if (*output == nullptr) { + return errors::Internal("AddTensor: Failed to build Const op."); + } + return Status::OK(); + } + + Status AddDataset(const GraphDatasetBase* dataset, + const std::vector& inputs, Node** output) { + return AddDataset(dataset, inputs, {}, output); + } + + // Adds a node corresponding to the `DatasetType` to the Graph. + // Return value of `DatasetType::op_name()` is used as the op type for the + // node. + // Values for the output_types and output_shapes node attributes are also + // written if those attributes are defined in the OpDef. + // `*output` contains a pointer to the output `Node`. It is guaranteed to be + // non-null if the method returns with an OK status. + // The returned Node pointer is owned by the backing Graph of GraphDefBuilder. + Status AddDataset(const GraphDatasetBase* dataset, + const std::vector& inputs, + const std::vector>& attrs, + Node** output) { + std::vector> enumerated_inputs(inputs.size()); + for (int i = 0; i < inputs.size(); i++) { + enumerated_inputs[i] = std::make_pair(i, inputs[i]); + } + return AddDataset(dataset, enumerated_inputs, {}, attrs, output); + } + + Status AddDataset( + const GraphDatasetBase* dataset, + const std::vector>& inputs, + const std::vector>>& list_inputs, + const std::vector>& attrs, + Node** output); + + // Adds a user-defined function with name `function_name` to the graph and + // recursively adds all functions it references. If a function with a matching + // name has already been added, returns with OK status. If a user-defined with + // name `function_name` is not found in the FunctionLibraryDefinition, returns + // an InvalidArgumentError. If the function with name `function_name` or any + // of its dependent functions are stateful, returns an InvalidArgument error. 
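Taken together, these Add* helpers are what a dataset's AsGraphDefInternal implementation uses to re-emit itself as GraphDef nodes. A rough sketch for a hypothetical dataset that wraps one input dataset plus a scalar `count_` member (names are illustrative; DatasetGraphDefBuilder and AddParentDataset are defined further down in this header):

Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
                          Node** output) const override {
  // Serialize the wrapped input dataset first, then the scalar parameter.
  Node* input_node = nullptr;
  TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_node));
  Node* count_node = nullptr;
  TF_RETURN_IF_ERROR(b->AddScalar(count_, &count_node));
  // Emits a node whose op type is this dataset's op_name(), wired to the
  // serialized input dataset and the Const node holding count_.
  return b->AddDataset(this, {input_node, count_node}, output);
}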
+ Status AddFunction(OpKernelContext* ctx, const string& function_name); + + template + void BuildAttrValue(const T& value, AttrValue* attr) { + SetAttrValue(value, attr); } private: - WhitelistedStatefulOpRegistry() {} - WhitelistedStatefulOpRegistry(WhitelistedStatefulOpRegistry const& copy); - WhitelistedStatefulOpRegistry operator=( - WhitelistedStatefulOpRegistry const& copy); - std::set op_names_; + void AddTensorInternal(const Tensor& val, Node** output); + + Status EnsureFunctionIsStateless(OpKernelContext* ctx, + const string& function_name) const { + const FunctionLibraryDefinition* lib_def = + ctx->function_library()->GetFunctionLibraryDefinition(); + const FunctionDef* function_def = lib_def->Find(function_name); + if (!function_def) { + return errors::InvalidArgument("Unable to find FunctionDef for ", + function_name, " in registry."); + } + for (const NodeDef& node_def : function_def->node_def()) { + const OpDef* op_def; + TF_RETURN_IF_ERROR(lib_def->LookUpOpDef(node_def.op(), &op_def)); + // TODO(b/65524810): Hack to allow functions to capture Dataset op + // nodes needed for FlatMap. Currently, source datasets nodes have been + // marked stateful to avoid constant folding since we do not have a + // good way of serializing them. + if (IsOpWhitelisted(op_def)) { + continue; + } + if (op_def->is_stateful()) { + return errors::InvalidArgument( + "Op[name: ", node_def.name(), ", type: ", node_def.op(), "] ", + "in function ", function_name, " is stateful. ", + "Saving stateful functions is not supported yet."); + } + } + return Status::OK(); + } + + // Returns whether an op has been whitelisted for use inside map_fns. + // Uses a heuristic to whitelist source dataset ops which have been + // marked stateful due to b/65524810. + // Also looks up the `op_def->name` in the global + // `WhitelistedStatefulOpRegistry`. + bool IsOpWhitelisted(const OpDef* op_def) const { + return (StringPiece(op_def->name()).ends_with("Dataset") && + op_def->output_arg_size() == 1 && + op_def->output_arg(0).type() == DT_VARIANT) || + dataset::WhitelistedStatefulOpRegistry::Global()->Contains( + op_def->name()); + } + + bool HasAttr(const string& op_type_name, const string& attr_name) const; + + bool HasAttr(const OpDef* op_def, const string& attr_name) const { + for (auto attr : op_def->attr()) { + if (attr.name() == attr_name) { + return true; + } + } + return false; + } + + Status AddAttrFunctions(const AttrValue& attr_value, OpKernelContext* ctx) { + if (attr_value.has_func()) { + TF_RETURN_IF_ERROR(AddFunction(ctx, attr_value.func().name())); + } else if (attr_value.has_list()) { + for (const NameAttrList& name_attr_list : attr_value.list().func()) { + TF_RETURN_IF_ERROR(AddFunction(ctx, name_attr_list.name())); + } + } + return Status::OK(); + } + + GraphDefBuilder* b_; }; -} // namespace dataset +class StatsAggregator; -// Use this macro to whitelist an op that is marked stateful but needs to be -// used inside a map_fn in an input pipeline. This is only needed if you wish -// to be able to checkpoint the state of the input pipeline. We currently -// do not allow stateful ops to be defined inside of map_fns since it is not -// possible to save their state. -// Note that the state of the whitelisted ops inside functions will not be -// saved during checkpointing, hence this should only be used if the op is -// marked stateful for reasons like to avoid constant folding during graph -// optimiztion but is not stateful. -// If possible, try to remove the stateful flag on the op first. 
-// Example usage: +// A cut-down version of OpKernelContext for running computations in +// iterators. Note that we cannot simply use OpKernelContext here +// because we might run computation in an iterator whose lifetime is +// not nested within the lifetime of a single OpKernelContext +// (e.g. asynchronous prefetching). // -// WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS("LegacyStatefulReader"); +// TODO(mrry): We will probably need to support more of +// OpKernelContext here. For example, should allocation be handled by +// the IteratorContext? +// TODO(mrry): We're making some daring assumptions about the lifetime +// of the runner passed in here. A runner will be deleted when the original +// step ends, but all existing runners only close over session-lifetime (or +// longer-lived) state, so we can make a copy of the function. There's nothing +// in the definition of the API from which we took the runner to guarantee that +// what we are doing is safe. We should formalize the properties here. +class IteratorContext { + public: + struct Params { + // Interface to operating system functionality. + Env* env; + + // Function call support. + std::function)> runner = nullptr; + + // A function that returns the current `StatsAggregator` instance to be + // used when recording statistics about the iterator. + // + // NOTE(mrry): This is somewhat awkward, because (i) the `StatsAggregator` + // is a property of the `IteratorResource` (which this class does not know + // about), and (ii) it can change after the `IteratorContext` has been + // created. Better suggestions are welcome! + std::function()> stats_aggregator_getter = + nullptr; + + // The FunctionLibraryRuntime object to be used to make function calls. + FunctionLibraryRuntime* lib = nullptr; + std::shared_ptr function_library = nullptr; + + // The Allocator to be used to allocate the output of an iterator. + Allocator* allocator = nullptr; + }; + + explicit IteratorContext(Params params) : params_(std::move(params)) {} + + Env* env() const { return params_.env; } + + std::function)>* runner() { + return ¶ms_.runner; + } + + std::shared_ptr stats_aggregator() { + if (params_.stats_aggregator_getter) { + return params_.stats_aggregator_getter(); + } else { + return nullptr; + } + } + + std::shared_ptr function_library() { + return params_.function_library; + } + + FunctionLibraryRuntime* lib() { return params_.lib; } + + void set_lib(FunctionLibraryRuntime* lib) { params_.lib = lib; } + + Allocator* allocator(AllocatorAttributes attrs); + + private: + Params params_; +}; + +// Represents the current position in a range of outputs, where the +// range of outputs is typically represented by an `DatasetBase`, +// defined below. +class IteratorBase { + public: + virtual ~IteratorBase() {} + + // Gets the next output from the range that this iterator is traversing. + // + // If at least one output remains in this iterator's range, that + // output will be stored in `*out_tensors` and `false` will be + // stored in `*end_of_sequence`. + // + // If no more outputs remain in this iterator's range, `true` will + // be stored in `*end_of_sequence`, and the content of + // `*out_tensors` will be undefined. + // + // This method is thread-safe. + // + // TODO(mrry): Define `GetNextAsync()` or `GetNextManyAsync()`, and + // potentially remove this method. 
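A minimal sketch of how a caller might assemble an IteratorContext from these Params before pulling an element, assuming `ctx` is the kernel's OpKernelContext and `iterator` an IteratorBase (the helper function name is illustrative):

Status GetOneElement(OpKernelContext* ctx, IteratorBase* iterator,
                     std::vector<Tensor>* out_tensors, bool* end_of_sequence) {
  IteratorContext::Params params;
  params.env = ctx->env();
  params.runner = *(ctx->runner());          // Reuse the kernel's thread-pool runner.
  params.lib = ctx->function_library();      // Needed if the iterator calls functions.
  IteratorContext iter_ctx(std::move(params));
  return iterator->GetNext(&iter_ctx, out_tensors, end_of_sequence);
}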
+ virtual Status GetNext(IteratorContext* ctx, std::vector* out_tensors, + bool* end_of_sequence) = 0; + + // Returns a vector of DataType values, representing the respective + // element types of each tuple component in the outputs of this + // iterator. + virtual const DataTypeVector& output_dtypes() const = 0; + + // Returns a vector of tensor shapes, representing the respective + // (and possibly partially defined) shapes of each tuple component + // in the outputs of this iterator. + virtual const std::vector& output_shapes() const = 0; + + // Saves the state of this iterator. + virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) { + return SaveInternal(writer); + } + + // Restores the state of this iterator. + virtual Status Restore(IteratorContext* ctx, IteratorStateReader* reader) { + return RestoreInternal(ctx, reader); + } + + protected: + // This is needed so that sub-classes of IteratorBase can call + // `SaveInternal` on their parent iterators, e.g., in + // `RepeatDataasetOp::Dataset`. + Status SaveParent(IteratorStateWriter* writer, + const std::unique_ptr& parent) { + return parent->SaveInternal(writer); + } + + // This is needed so that sub-classes of IteratorBase can call + // `RestoreInternal` on their parent iterators, e.g., in + // `RepeatDataasetOp::Dataset`. + Status RestoreParent(IteratorContext* ctx, IteratorStateReader* reader, + const std::unique_ptr& parent) { + return parent->RestoreInternal(ctx, reader); + } + + // Saves the state of this iterator recursively. + virtual Status SaveInternal(IteratorStateWriter* writer) { + return errors::Unimplemented("SaveInternal"); + } + + // Restores the state of this iterator recursively. + virtual Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) { + return errors::Unimplemented("RestoreInternal"); + } +}; + +// Represents a (potentially infinite) range of outputs, where each +// output is a tuple of tensors. +class DatasetBase : public core::RefCounted { + public: + // Returns a new iterator for iterating over the range of elements in + // this dataset. + // + // This method may be called multiple times on the same instance, + // and the resulting iterators will have distinct state. Each + // iterator will traverse all elements in this dataset from the + // start. + // + // Ownership of the created iterator will be transferred to the caller. + // + // The prefix identifies the sequence of iterators leading up to the newly + // created iterator. + virtual std::unique_ptr MakeIterator( + const string& prefix) const = 0; + + // Returns a vector of DataType values, representing the respective + // element types of each tuple component in the outputs of this + // dataset. + virtual const DataTypeVector& output_dtypes() const = 0; + + // Returns a vector of tensor shapes, representing the respective + // (and possibly partially defined) shapes of each tuple component + // in the outputs of this dataset. + virtual const std::vector& output_shapes() const = 0; + + // A human-readable debug string for this dataset. + virtual string DebugString() = 0; + + // Serializes the dataset and writes it to the `writer`. + virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) const { + return errors::Unimplemented("DatasetBase::Save"); + } + + protected: + // TODO(srbs): Ideally all graph related logic should reside in + // GraphDatasetBase. However, that would require Datasets defined in all ops + // to derive from GraphDatasetBase. 
Once that is done we can move + // DatasetGraphDefBuilder and AsGraphDefInternal to GraphDatasetBase. + class DatasetGraphDefBuilder : public GraphDefBuilderWrapper { + public: + DatasetGraphDefBuilder(GraphDefBuilder* b) : GraphDefBuilderWrapper(b) {} + Status AddParentDataset(OpKernelContext* ctx, const DatasetBase* dataset, + Node** output) { + return dataset->AsGraphDefInternal(ctx, this, output); + } + }; + + virtual Status AsGraphDefInternal(OpKernelContext* ctx, + DatasetGraphDefBuilder* b, + Node** node) const { + return AsGraphDefInternal(b, node); + } + + virtual Status AsGraphDefInternal(DatasetGraphDefBuilder* b, + Node** node) const { + return errors::Unimplemented("AsGraphDefInternal"); + } +}; + +// Base-class for datasets that are built by ops. +class GraphDatasetBase : public DatasetBase { + public: + GraphDatasetBase(OpKernelContext* ctx) + : op_name_(ctx->op_kernel().type_string()) {} + + const string op_name() const { return op_name_; } + + Status Save(OpKernelContext* ctx, + IteratorStateWriter* writer) const override { + string serialized_graph_def; + string output_node; + TF_RETURN_IF_ERROR(Serialize(ctx, &serialized_graph_def, &output_node)); + TF_RETURN_IF_ERROR( + writer->WriteScalar(kDatasetGraphKey, serialized_graph_def)); + TF_RETURN_IF_ERROR( + writer->WriteScalar(kDatasetGraphOutputNodeKey, output_node)); + return Status::OK(); + } + + // Key for storing the Dataset graph in the serialized format. + static const char kDatasetGraphKey[]; + + // Key for storing the output node of the Dataset graph in the serialized + // format. + static const char kDatasetGraphOutputNodeKey[]; + + private: + Status Serialize(OpKernelContext* ctx, string* serialized_graph_def, + string* output_node) const; + + const string op_name_; +}; + +// Represents an iterator that is associated with a particular parent dataset. +template +class DatasetIterator : public IteratorBase { + public: + struct Params { + // Owns one reference on the shared dataset resource. + const DatasetType* dataset; + + // Identifies the sequence of iterators leading up to this iterator. + const string prefix; + }; + + explicit DatasetIterator(const Params& params) : params_(params) { + params_.dataset->Ref(); + } + + ~DatasetIterator() override { params_.dataset->Unref(); } + + // The dataset from which this iterator was created. + const DatasetType* dataset() const { return params_.dataset; } + + // The sequence of iterators leading up to this iterator. + const string prefix() const { return params_.prefix; } + + const DataTypeVector& output_dtypes() const override { + return params_.dataset->output_dtypes(); + } + + const std::vector& output_shapes() const override { + return params_.dataset->output_shapes(); + } + + Status GetNext(IteratorContext* ctx, std::vector* out_tensors, + bool* end_of_sequence) final { + port::Tracing::TraceMe activity(params_.prefix); + Status s = GetNextInternal(ctx, out_tensors, end_of_sequence); + if (TF_PREDICT_FALSE(errors::IsOutOfRange(s) && !*end_of_sequence)) { + s = errors::Internal( + "Iterator \"", params_.prefix, + "\" returned OutOfRange without setting `*end_of_sequence`. This " + "indicates that an error may have occurred. Original message: ", + s.error_message()); + LOG(ERROR) << s; + } + return s; + } + + Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) final { + TF_RETURN_IF_ERROR(dataset()->Save(ctx, writer)); + return IteratorBase::Save(ctx, writer); + } + + protected: + // Internal implementation of GetNext that is wrapped in tracing logic. 
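As a concrete illustration of the GetNextInternal contract (either produce one element or set *end_of_sequence), here is a rough sketch of an iterator that yields the integers 0..count_-1; `MyDataset`, `count_` and `i_` are illustrative names, not part of this header, and `count_` is assumed to be visible to its iterator:

class Iterator : public DatasetIterator<MyDataset> {
 public:
  explicit Iterator(const Params& params)
      : DatasetIterator<MyDataset>(params) {}

 protected:
  Status GetNextInternal(IteratorContext* ctx,
                         std::vector<Tensor>* out_tensors,
                         bool* end_of_sequence) override {
    mutex_lock l(mu_);
    if (i_ >= dataset()->count_) {
      *end_of_sequence = true;  // Range exhausted; out_tensors is left empty.
      return Status::OK();
    }
    // Allocate a scalar int64 tensor through the iterator context's allocator.
    Tensor value(ctx->allocator({}), DT_INT64, TensorShape({}));
    value.scalar<int64>()() = i_++;
    out_tensors->emplace_back(std::move(value));
    *end_of_sequence = false;
    return Status::OK();
  }

 private:
  mutex mu_;
  int64 i_ GUARDED_BY(mu_) = 0;
};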
+ virtual Status GetNextInternal(IteratorContext* ctx, + std::vector* out_tensors, + bool* end_of_sequence) = 0; + + string full_name(const string& name) const { + return strings::StrCat(prefix(), ":", name); + } + + private: + Params params_; +}; + +// Encapsulates the work required to plug a DatasetBase into the core TensorFlow +// graph execution engine. +class DatasetOpKernel : public OpKernel { + public: + DatasetOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {} + void Compute(OpKernelContext* ctx) final; + + protected: + // Subclasses should implement this method. It will be called during Compute + // execution. + virtual void MakeDataset(OpKernelContext* ctx, DatasetBase** output) = 0; + + template + Status ParseScalarArgument(OpKernelContext* ctx, + const StringPiece& argument_name, T* output) { + const Tensor* argument_t; + TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t)); + if (!TensorShapeUtils::IsScalar(argument_t->shape())) { + return errors::InvalidArgument(argument_name, " must be a scalar"); + } + *output = argument_t->scalar()(); + return Status::OK(); + } +}; + +// Encapsulates the work required to plug unary Datasets into the core +// TensorFlow graph execution engine. +class UnaryDatasetOpKernel : public DatasetOpKernel { + public: + UnaryDatasetOpKernel(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {} + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final; + virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) = 0; +}; + +// Encapsulates the work required to plug binary Datasets into the core +// TensorFlow graph execution engine. +class BinaryDatasetOpKernel : public DatasetOpKernel { + public: + BinaryDatasetOpKernel(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {} + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final; + virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase* another_input, + DatasetBase** output) = 0; +}; + +// Validates and extracts a `DatasetBase` object from `tensor`. // -#define WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS(name) \ - WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ_HELPER(__COUNTER__, name) -#define WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ_HELPER(ctr, name) \ - WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ(ctr, name) -#define WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ(ctr, name) \ - static ::tensorflow::Status whitelist_op##ctr TF_ATTRIBUTE_UNUSED = \ - ::tensorflow::dataset::WhitelistedStatefulOpRegistry::Global()->Add( \ - name) +// `tensor` must have been written by a call to SetVariantTensorToDataset(). +// +// The retrieved pointer is a borrowed reference to the dataset, which is owned +// by the tensor. The consumer must either acquire its own reference to the +// dataset by calling `(*out_dataset)->Ref()`, or ensure that `tensor` is not +// destroyed or mutated while the retrieved pointer is in use. +Status GetDatasetFromVariantTensor(const Tensor& tensor, + DatasetBase** out_dataset); + +// Stores a `DatasetBase` object in `tensor`. +// +// The ownership of `dataset` is transferred to `tensor`. 
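The op-kernel shims above reduce a dataset op to a single MakeDataset override. A rough sketch of a unary dataset kernel that reads one scalar argument; `TakeNDatasetOp`, the "count" input, and the nested `Dataset` class are illustrative names, not part of this change:

class TakeNDatasetOp : public UnaryDatasetOpKernel {
 public:
  explicit TakeNDatasetOp(OpKernelConstruction* ctx)
      : UnaryDatasetOpKernel(ctx) {}

 protected:
  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                   DatasetBase** output) override {
    int64 count = 0;
    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "count", &count));
    // `Dataset` would be a GraphDatasetBase subclass wrapping `input` and `count`.
    *output = new Dataset(ctx, input, count);
  }
};

// Hypothetical registration for the sketch above.
REGISTER_KERNEL_BUILDER(Name("TakeNDataset").Device(DEVICE_CPU), TakeNDatasetOp);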
+Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor); } // namespace tensorflow -#endif // TENSORFLOW_FRAMEWORK_DATASET_H_ +#endif // TENSORFLOW_CORE_FRAMEWORK_DATASET_H_ diff --git a/tensorflow/core/framework/dataset_stateful_op_whitelist.h b/tensorflow/core/framework/dataset_stateful_op_whitelist.h new file mode 100644 index 00000000000..3b48999edb3 --- /dev/null +++ b/tensorflow/core/framework/dataset_stateful_op_whitelist.h @@ -0,0 +1,77 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_DATASET_STATEFUL_OP_WHITELIST_H_ +#define TENSORFLOW_CORE_FRAMEWORK_DATASET_STATEFUL_OP_WHITELIST_H_ + +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace dataset { +// Registry for stateful ops that need to be used in dataset functions. +// See below macro for usage details. +class WhitelistedStatefulOpRegistry { + public: + Status Add(StringPiece op_name) { + op_names_.insert(op_name); + return Status::OK(); + } + + bool Contains(StringPiece op_name) { + return op_names_.find(op_name) != op_names_.end(); + } + + static WhitelistedStatefulOpRegistry* Global() { + static WhitelistedStatefulOpRegistry* reg = + new WhitelistedStatefulOpRegistry; + return reg; + } + + private: + WhitelistedStatefulOpRegistry() {} + WhitelistedStatefulOpRegistry(WhitelistedStatefulOpRegistry const& copy); + WhitelistedStatefulOpRegistry operator=( + WhitelistedStatefulOpRegistry const& copy); + std::set op_names_; +}; + +} // namespace dataset + +// Use this macro to whitelist an op that is marked stateful but needs to be +// used inside a map_fn in an input pipeline. This is only needed if you wish +// to be able to checkpoint the state of the input pipeline. We currently +// do not allow stateful ops to be defined inside of map_fns since it is not +// possible to save their state. +// Note that the state of the whitelisted ops inside functions will not be +// saved during checkpointing, hence this should only be used if the op is +// marked stateful for reasons like to avoid constant folding during graph +// optimiztion but is not stateful. +// If possible, try to remove the stateful flag on the op first. 
+// Example usage: +// +// WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS("LegacyStatefulReader"); +// +#define WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS(name) \ + WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ_HELPER(__COUNTER__, name) +#define WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ_HELPER(ctr, name) \ + WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ(ctr, name) +#define WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ(ctr, name) \ + static ::tensorflow::Status whitelist_op##ctr TF_ATTRIBUTE_UNUSED = \ + ::tensorflow::dataset::WhitelistedStatefulOpRegistry::Global()->Add( \ + name) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_DATASET_STATEFUL_OP_WHITELIST_H_ diff --git a/tensorflow/core/framework/fake_input.cc b/tensorflow/core/framework/fake_input.cc index ad301a8aa4b..70d1e20a17c 100644 --- a/tensorflow/core/framework/fake_input.cc +++ b/tensorflow/core/framework/fake_input.cc @@ -104,8 +104,8 @@ Status FakeInputImpl::AddInputToBuilder() { Status status = GetNodeAttr(*node_def_, arg_->type_list_attr(), &dts); if (!status.ok()) { return errors::InvalidArgument( - "Could not infer list of types for input '", arg_->name(), "': ", - status.error_message()); + "Could not infer list of types for input '", arg_->name(), + "': ", status.error_message()); } SourceList(dts); return Status::OK(); @@ -131,8 +131,8 @@ Status FakeInputImpl::GetN(int* n) const { Status status = GetNodeAttr(*node_def_, arg_->number_attr(), n); if (!status.ok()) { return errors::InvalidArgument("Could not infer length of input '", - arg_->name(), "': ", - status.error_message()); + arg_->name(), + "': ", status.error_message()); } } return Status::OK(); @@ -153,8 +153,8 @@ Status FakeInputImpl::GetDataType(DataType* dt) const { *dt = attr->default_value().type(); } else { return errors::InvalidArgument("Could not infer type for input '", - arg_->name(), "': ", - status.error_message()); + arg_->name(), + "': ", status.error_message()); } } } else { diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc index 0224f252270..eae8e6c3c10 100644 --- a/tensorflow/core/framework/function.cc +++ b/tensorflow/core/framework/function.cc @@ -1064,26 +1064,36 @@ Status FunctionLibraryDefinition::AddLibrary( return Status::OK(); } -void FunctionLibraryDefinition::RemoveFunction(const string& func) { +Status FunctionLibraryDefinition::RemoveFunction(const string& func) { const auto& i = function_defs_.find(func); - DCHECK(i != function_defs_.end()); + if (i == function_defs_.end()) { + return errors::InvalidArgument("Tried to remove non-existent function ", + func); + } function_defs_.erase(i); + return Status::OK(); } -void FunctionLibraryDefinition::RemoveGradient(const string& func) { +Status FunctionLibraryDefinition::RemoveGradient(const string& func) { const auto& i = func_grad_.find(func); - DCHECK(i != func_grad_.end()); + if (i == func_grad_.end()) { + return errors::InvalidArgument("Tried to remove non-existent gradient ", + func); + } func_grad_.erase(i); + return Status::OK(); } void FunctionLibraryDefinition::Remove( const std::vector& funcs, const std::vector& funcs_with_grads) { for (const string& f : funcs) { - RemoveFunction(f); + Status s = RemoveFunction(f); + DCHECK(s.ok()); } for (const string& f : funcs_with_grads) { - RemoveGradient(f); + Status s = RemoveGradient(f); + DCHECK(s.ok()); } } @@ -1264,8 +1274,8 @@ FunctionDef FunctionDefHelper::Define(const string& name, } for (const string& a : src.arg) { const auto iter = ret_index.find(a); - 
CHECK(iter != ret_index.end()) << "Node input '" << a << "' in '" - << src.ret[0] << "' of " << name; + CHECK(iter != ret_index.end()) + << "Node input '" << a << "' in '" << src.ret[0] << "' of " << name; n->add_input(iter->second); } for (const string& d : src.dep) { diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h index 3bb5638cdf2..7d0e15641d0 100644 --- a/tensorflow/core/framework/function.h +++ b/tensorflow/core/framework/function.h @@ -312,6 +312,14 @@ class FunctionLibraryDefinition : public OpRegistryInterface { // This operation is atomic. Status AddGradientDef(const GradientDef& grad); + // Remove function `func` from the library. Returns non-OK Status unless + // `func` is in the library. + Status RemoveFunction(const string& func); + + // Remove gradient of function `func` from the library. Returns non-OK Status + // unless `func` has a gradient. + Status RemoveGradient(const string& func); + // Adds the functions and gradients in 'other' to this function library. // Duplicate functions and gradients are ignored. // This operation is atomic. @@ -384,13 +392,6 @@ class FunctionLibraryDefinition : public OpRegistryInterface { // attr from. const FunctionDef* GetAttrImpl(const NodeDef& ndef) const; - // Remove function `func` from the library. `func` must be in the library. - void RemoveFunction(const string& func); - - // Remove gradient of function `func` from the library. `func` must have - // a gradient. - void RemoveGradient(const string& func); - // Remove all functions in `funcs` and all gradients of // functions in `funcs_with_grads` from this library. void Remove(const std::vector& funcs, @@ -656,7 +657,7 @@ bool RegisterOp(const string& op, Creator func); // Returns OK the gradient creator for the "op" is found (may be // nullptr if REGISTER_OP_NO_GRADIENT is used. 
Status GetOpGradientCreator(const string& op, Creator* creator); -}; +}; // namespace gradient // Declare explicit instantiations of GetAttr #define GET_ATTR(T) \ diff --git a/tensorflow/core/framework/graph_def_util.cc b/tensorflow/core/framework/graph_def_util.cc index bd018b72438..1f670535d57 100644 --- a/tensorflow/core/framework/graph_def_util.cc +++ b/tensorflow/core/framework/graph_def_util.cc @@ -35,8 +35,8 @@ namespace tensorflow { string SummarizeGraphDef(const GraphDef& graph_def) { string ret; - strings::StrAppend(&ret, "versions = ", - ProtoShortDebugString(graph_def.versions()), ";\n"); + strings::StrAppend( + &ret, "versions = ", ProtoShortDebugString(graph_def.versions()), ";\n"); for (const NodeDef& node : graph_def.node()) { strings::StrAppend(&ret, SummarizeNodeDef(node), ";\n"); } @@ -90,9 +90,9 @@ static Status RemoveNewDefaultAttrsFromNodeDef( FindAttr(attr.first, *producer_op_def); if (producer_attr_def == nullptr) { return errors::InvalidArgument( - "Attr '", attr.first, "' missing in producer's OpDef: ", - SummarizeOpDef(*producer_op_def), " but found in node: ", - SummarizeNodeDef(*node_def)); + "Attr '", attr.first, + "' missing in producer's OpDef: ", SummarizeOpDef(*producer_op_def), + " but found in node: ", SummarizeNodeDef(*node_def)); } // ...and it has the same value as the default in producer, if (producer_attr_def->has_default_value() && diff --git a/tensorflow/core/framework/numeric_types.h b/tensorflow/core/framework/numeric_types.h index 99a5d0a054e..8249059c294 100644 --- a/tensorflow/core/framework/numeric_types.h +++ b/tensorflow/core/framework/numeric_types.h @@ -44,7 +44,7 @@ typedef Eigen::QUInt16 quint16; } // namespace tensorflow namespace Eigen { -// TOOD(xpan): We probably need to overwrite more methods to have correct eigen +// TODO(xpan): We probably need to overwrite more methods to have correct eigen // behavior. E.g. loest(), is_integer, etc. See NumTraits.h in eigen. template <> struct NumTraits diff --git a/tensorflow/core/framework/op_def_util.cc b/tensorflow/core/framework/op_def_util.cc index a4e8add6c49..2d035ab90d0 100644 --- a/tensorflow/core/framework/op_def_util.cc +++ b/tensorflow/core/framework/op_def_util.cc @@ -170,20 +170,20 @@ const OpDef::ArgDef* FindInputArg(StringPiece name, const OpDef& op_def) { return nullptr; } -#define VALIDATE(EXPR, ...) \ - do { \ - if (!(EXPR)) { \ - return errors::InvalidArgument(__VA_ARGS__, "; in OpDef: ", \ - ProtoShortDebugString(op_def)); \ - } \ +#define VALIDATE(EXPR, ...) \ + do { \ + if (!(EXPR)) { \ + return errors::InvalidArgument( \ + __VA_ARGS__, "; in OpDef: ", ProtoShortDebugString(op_def)); \ + } \ } while (false) static Status ValidateArg(const OpDef::ArgDef& arg, const OpDef& op_def, bool output, std::set* names) { const string suffix = strings::StrCat( output ? 
" for output '" : " for input '", arg.name(), "'"); - VALIDATE(gtl::InsertIfNotPresent(names, arg.name()), "Duplicate name: ", - arg.name()); + VALIDATE(gtl::InsertIfNotPresent(names, arg.name()), + "Duplicate name: ", arg.name()); VALIDATE(HasAttrStyleType(arg), "Missing type", suffix); if (!arg.number_attr().empty()) { @@ -250,8 +250,8 @@ Status ValidateOpDef(const OpDef& op_def) { std::set names; // for detecting duplicate names for (const auto& attr : op_def.attr()) { // Validate name - VALIDATE(gtl::InsertIfNotPresent(&names, attr.name()), "Duplicate name: ", - attr.name()); + VALIDATE(gtl::InsertIfNotPresent(&names, attr.name()), + "Duplicate name: ", attr.name()); DataType dt; VALIDATE(!DataTypeFromString(attr.name(), &dt), "Attr can't have name ", attr.name(), " that matches a data type"); @@ -680,8 +680,8 @@ Status OpDefAddedDefaultsUnchanged(const OpDef& old_op, if (!penultimate_attr.has_default_value() || !new_attr->has_default_value()) { return errors::InvalidArgument("Missing default for attr '", - penultimate_attr.name(), "' in op: ", - SummarizeOpDef(new_op)); + penultimate_attr.name(), + "' in op: ", SummarizeOpDef(new_op)); } // Actually test that the attr's default value hasn't changed. diff --git a/tensorflow/core/framework/op_def_util_test.cc b/tensorflow/core/framework/op_def_util_test.cc index 28809c11c58..2b9812d4fcb 100644 --- a/tensorflow/core/framework/op_def_util_test.cc +++ b/tensorflow/core/framework/op_def_util_test.cc @@ -200,10 +200,11 @@ TEST_F(ValidateOpDefTest, BadAttrDefault) { "default_value { list { s: ['foo'] } } }"), "Length for attr 'a' of 1 must be at least minimum 2\n\t in Op " "'BadAttrDef'"); - ExpectFailure(TestBuilder(OpDefBuilder("GoodAttrDef") - .Attr("a: list(type) >=2 = [DT_STRING]")), - "Length for attr 'a' of 1 must be at least minimum 2\n\t in Op " - "'GoodAttrDef'"); + ExpectFailure( + TestBuilder( + OpDefBuilder("GoodAttrDef").Attr("a: list(type) >=2 = [DT_STRING]")), + "Length for attr 'a' of 1 must be at least minimum 2\n\t in Op " + "'GoodAttrDef'"); } TEST_F(ValidateOpDefTest, NoRefTypes) { @@ -213,9 +214,10 @@ TEST_F(ValidateOpDefTest, NoRefTypes) { ExpectFailure( TestBuilder(OpDefBuilder("BadAttrDef").Attr("T: type = DT_INT32_REF")), "AttrValue must not have reference type value of int32_ref"); - ExpectFailure(TestBuilder(OpDefBuilder("BadAttrDef") - .Attr("T: list(type) = [DT_STRING_REF]")), - "AttrValue must not have reference type value of string_ref"); + ExpectFailure( + TestBuilder( + OpDefBuilder("BadAttrDef").Attr("T: list(type) = [DT_STRING_REF]")), + "AttrValue must not have reference type value of string_ref"); } TEST_F(ValidateOpDefTest, BadAttrMin) { @@ -245,9 +247,10 @@ TEST_F(ValidateOpDefTest, BadAttrAllowed) { TF_EXPECT_OK(TestBuilder( OpDefBuilder("GoodAttrtude").Attr("x: numbertype = DT_INT32"))); // Not in list of allowed types. - ExpectFailure(TestBuilder(OpDefBuilder("BadAttrtude") - .Attr("x: numbertype = DT_STRING")), - "attr 'x' of string is not in the list of allowed values"); + ExpectFailure( + TestBuilder( + OpDefBuilder("BadAttrtude").Attr("x: numbertype = DT_STRING")), + "attr 'x' of string is not in the list of allowed values"); ExpectFailure( TestBuilder(OpDefBuilder("BadAttrtude") .Attr("x: list(realnumbertype) = [DT_COMPLEX64]")), @@ -260,9 +263,10 @@ TEST_F(ValidateOpDefTest, BadAttrAllowed) { TF_EXPECT_OK(TestBuilder( OpDefBuilder("GoodAttrtude").Attr("x: {'foo', 'bar'} = 'bar'"))); // Not in list of allowed strings. 
- ExpectFailure(TestBuilder(OpDefBuilder("BadAttrtude") - .Attr("x: {'foo', 'bar'} = 'baz'")), - "attr 'x' of \"baz\" is not in the list of allowed values"); + ExpectFailure( + TestBuilder( + OpDefBuilder("BadAttrtude").Attr("x: {'foo', 'bar'} = 'baz'")), + "attr 'x' of \"baz\" is not in the list of allowed values"); ExpectFailure(TestBuilder(OpDefBuilder("BadAttrtude") .Attr("x: list({'foo', 'bar'}) = ['baz']")), "attr 'x' of \"baz\" is not in the list of allowed values"); diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc index 870bbb141b8..5f2eb9d99ab 100644 --- a/tensorflow/core/framework/op_gen_lib.cc +++ b/tensorflow/core/framework/op_gen_lib.cc @@ -296,7 +296,6 @@ static void RenameInDocs(const string& from, const string& to, } } - namespace { // Initializes given ApiDef with data in OpDef. diff --git a/tensorflow/core/framework/op_gen_lib.h b/tensorflow/core/framework/op_gen_lib.h index 94fe194a1a5..ff38e4b2214 100644 --- a/tensorflow/core/framework/op_gen_lib.h +++ b/tensorflow/core/framework/op_gen_lib.h @@ -47,7 +47,6 @@ string PBTxtToMultiline(StringPiece pbtxt, const std::vector& multi_line_fields); string PBTxtFromMultiline(StringPiece multiline_pbtxt); - // Takes a list of files with ApiDefs text protos, and allows you to // look up the specific ApiDef for any given op. class ApiDefMap { diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc index 16bf5c256f9..fd2d06be989 100644 --- a/tensorflow/core/framework/op_kernel.cc +++ b/tensorflow/core/framework/op_kernel.cc @@ -101,7 +101,8 @@ OpKernel::OpKernel(OpKernelConstruction* context) // Kernels executing on GPU/SYCL tie very few resources on the CPU where the // scheduler runs: we consider them as inexpensive. 
- expensive_ = context->device_type() != DeviceType(DEVICE_GPU) && context->device_type() != DeviceType(DEVICE_SYCL); + expensive_ = context->device_type() != DeviceType(DEVICE_GPU) && + context->device_type() != DeviceType(DEVICE_SYCL); } OpKernel::~OpKernel() {} diff --git a/tensorflow/core/framework/op_kernel_test.cc b/tensorflow/core/framework/op_kernel_test.cc index 94a9d1335a7..b53b877f28d 100644 --- a/tensorflow/core/framework/op_kernel_test.cc +++ b/tensorflow/core/framework/op_kernel_test.cc @@ -510,10 +510,9 @@ TEST_F(OpKernelBuilderTest, BuilderBoth) { } REGISTER_OP("BuildTypeAttr").Attr("T: type"); -REGISTER_KERNEL_BUILDER(Name("BuildTypeAttr") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - DummyKernel); +REGISTER_KERNEL_BUILDER( + Name("BuildTypeAttr").Device(DEVICE_CPU).TypeConstraint("T"), + DummyKernel); TEST_F(OpKernelBuilderTest, BuilderTypeAttr) { ExpectSuccess("BuildTypeAttr", DEVICE_CPU, {"T|type|DT_FLOAT"}); @@ -525,10 +524,9 @@ TEST_F(OpKernelBuilderTest, BuilderTypeAttr) { } REGISTER_OP("BuildTypeListAttr").Attr("T: list(type)"); -REGISTER_KERNEL_BUILDER(Name("BuildTypeListAttr") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - DummyKernel); +REGISTER_KERNEL_BUILDER( + Name("BuildTypeListAttr").Device(DEVICE_CPU).TypeConstraint("T"), + DummyKernel); TEST_F(OpKernelBuilderTest, BuilderTypeListAttr) { ExpectSuccess("BuildTypeListAttr", DEVICE_CPU, {"T|list(type)|[]"}); @@ -574,14 +572,12 @@ TEST_F(OpKernelBuilderTest, DuplicateKernel) { } REGISTER_OP("DuplicateKernelForT").Attr("T: type"); -REGISTER_KERNEL_BUILDER(Name("DuplicateKernelForT") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - DummyKernel); -REGISTER_KERNEL_BUILDER(Name("DuplicateKernelForT") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - DummyKernel); +REGISTER_KERNEL_BUILDER( + Name("DuplicateKernelForT").Device(DEVICE_CPU).TypeConstraint("T"), + DummyKernel); +REGISTER_KERNEL_BUILDER( + Name("DuplicateKernelForT").Device(DEVICE_CPU).TypeConstraint("T"), + DummyKernel); TEST_F(OpKernelBuilderTest, DuplicateKernelForT) { const NodeDef ndef = diff --git a/tensorflow/core/framework/reader_base.cc b/tensorflow/core/framework/reader_base.cc index b8c771a0a19..f84ef0f953c 100644 --- a/tensorflow/core/framework/reader_base.cc +++ b/tensorflow/core/framework/reader_base.cc @@ -178,9 +178,9 @@ void ReaderBase::Read(QueueInterface* queue, string* key, string* value, " must set *at_end=true, *produced=true, or return an error."); } if (!status.ok() && produced) { - status = errors::Internal("ReadLocked() for ", name(), - " set *produced=true *and* returned an error: ", - status.ToString()); + status = errors::Internal( + "ReadLocked() for ", name(), + " set *produced=true *and* returned an error: ", status.ToString()); } if (status.ok() && at_end) { status = OnWorkFinishedLocked(); diff --git a/tensorflow/core/framework/register_types.h b/tensorflow/core/framework/register_types.h index e062adffe82..e90596980f8 100644 --- a/tensorflow/core/framework/register_types.h +++ b/tensorflow/core/framework/register_types.h @@ -179,7 +179,7 @@ limitations under the License. // Call "m" on all types. #define TF_CALL_ALL_TYPES(m) \ - TF_CALL_POD_TYPES(m) TF_CALL_string(m) TF_CALL_resource(m) + TF_CALL_POD_TYPES(m) TF_CALL_string(m) TF_CALL_resource(m) TF_CALL_variant(m) // Call "m" on POD and string types. #define TF_CALL_POD_STRING_TYPES(m) TF_CALL_POD_TYPES(m) TF_CALL_string(m) @@ -211,14 +211,12 @@ limitations under the License. 
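The TF_CALL_* macros in this header are X-macros: each one expands a caller-supplied macro m(T) once per type, which is how type-dispatch switches and kernel registrations are generated. A small sketch of the pattern, with `HANDLE_TYPE` and the helper function as illustrative names:

// Dispatch on a DataType by expanding one case per supported numeric type.
#define HANDLE_TYPE(T)           \
  case DataTypeToEnum<T>::value: \
    return sizeof(T);

size_t ElementSizeOrZero(DataType dtype) {
  switch (dtype) {
    TF_CALL_REAL_NUMBER_TYPES(HANDLE_TYPE)
    default:
      return 0;
  }
}
#undef HANDLE_TYPE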
#define TF_CALL_SYCL_double(m) #else // TENSORFLOW_SYCL_NO_DOUBLE #define TF_CALL_SYCL_double(m) TF_CALL_double(m) -#endif // TENSORFLOW_SYCL_NO_DOUBLE +#endif // TENSORFLOW_SYCL_NO_DOUBLE #ifdef __ANDROID_TYPES_SLIM__ -#define TF_CALL_SYCL_NUMBER_TYPES(m) TF_CALL_float(m) +#define TF_CALL_SYCL_NUMBER_TYPES(m) TF_CALL_float(m) #else // __ANDROID_TYPES_SLIM__ -#define TF_CALL_SYCL_NUMBER_TYPES(m) \ - TF_CALL_float(m) \ - TF_CALL_SYCL_double(m) -#endif // __ANDROID_TYPES_SLIM__ +#define TF_CALL_SYCL_NUMBER_TYPES(m) TF_CALL_float(m) TF_CALL_SYCL_double(m) +#endif // __ANDROID_TYPES_SLIM__ #endif // TENSORFLOW_FRAMEWORK_REGISTER_TYPES_H_ diff --git a/tensorflow/core/framework/register_types_traits.h b/tensorflow/core/framework/register_types_traits.h index c1fe5517c69..ab35c2f0951 100644 --- a/tensorflow/core/framework/register_types_traits.h +++ b/tensorflow/core/framework/register_types_traits.h @@ -23,7 +23,7 @@ typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #include "tensorflow/core/framework/numeric_types.h" #include "tensorflow/core/platform/types.h" @@ -79,7 +79,7 @@ template <> struct proxy_type_pod { typedef float type; }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL /// If POD we use proxy_type_pod, otherwise this maps to identiy. template @@ -99,7 +99,7 @@ struct proxy_type { #ifdef TENSORFLOW_USE_SYCL #define TF_CALL_SYCL_PROXY_TYPES(m) \ TF_CALL_double(m) TF_CALL_float(m) TF_CALL_int32(m) -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow #endif // TENSORFLOW_FRAMEWORK_REGISTER_TYPES_TRAITS_H_ diff --git a/tensorflow/core/framework/rendezvous_test.cc b/tensorflow/core/framework/rendezvous_test.cc index 32b8ad784d5..de148f0bd34 100644 --- a/tensorflow/core/framework/rendezvous_test.cc +++ b/tensorflow/core/framework/rendezvous_test.cc @@ -69,9 +69,7 @@ class LocalRendezvousTest : public ::testing::Test { rendez_ = NewLocalRendezvous(); } - ~LocalRendezvousTest() override { - rendez_->Unref(); - } + ~LocalRendezvousTest() override { rendez_->Unref(); } void SchedClosure(std::function fn) { threads_.Schedule(std::move(fn)); @@ -99,8 +97,8 @@ string V(const Tensor& tensor) { Rendezvous::ParsedKey MakeKey(const string& name) { string s = Rendezvous::CreateKey("/job:mnist/replica:1/task:2/CPU:0", 7890, - "/job:mnist/replica:1/task:2/device:GPU:0", name, - FrameAndIter(0, 0)); + "/job:mnist/replica:1/task:2/device:GPU:0", + name, FrameAndIter(0, 0)); Rendezvous::ParsedKey k; TF_EXPECT_OK(Rendezvous::ParseKey(s, &k)); return k; diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h index d552ec1693f..e3cc848a169 100644 --- a/tensorflow/core/framework/shape_inference.h +++ b/tensorflow/core/framework/shape_inference.h @@ -32,7 +32,7 @@ class ShapeRefinerTest; namespace grappler { class GraphProperties; class SymbolicShapeManager; -} +} // namespace grappler namespace shape_inference { diff --git a/tensorflow/core/framework/shape_inference_test.cc b/tensorflow/core/framework/shape_inference_test.cc index a9b63ca60e4..f48a7b9c47d 100644 --- a/tensorflow/core/framework/shape_inference_test.cc +++ b/tensorflow/core/framework/shape_inference_test.cc @@ -760,7 +760,10 @@ TEST_F(ShapeInferenceTest, MergePrefix) { NodeDef def; InferenceContext c(kVersion, &def, MakeOpDef(4, 2), { - Unknown(), S({-1, 2}), S({1, -1, 3}), S({2, 4}), + Unknown(), + S({-1, 2}), + S({1, -1, 3}), + S({2, 
4}), }, {}, {}, {}); diff --git a/tensorflow/core/framework/tensor_shape_test.cc b/tensorflow/core/framework/tensor_shape_test.cc index d8a9c0bac5b..d7517bb311d 100644 --- a/tensorflow/core/framework/tensor_shape_test.cc +++ b/tensorflow/core/framework/tensor_shape_test.cc @@ -582,7 +582,8 @@ TEST(TensorShapeTest, Large) { TEST(TensorShapeTest, Overflow) { int64 one = 1; std::vector> overflows = { - {1 << 30, 1 << 30, 1 << 30}, {1 << 5, (one << 60) + 1}, + {1 << 30, 1 << 30, 1 << 30}, + {1 << 5, (one << 60) + 1}, }; for (const auto& overflow : overflows) { TensorShapeProto proto; diff --git a/tensorflow/core/framework/tensor_testutil.cc b/tensorflow/core/framework/tensor_testutil.cc index a8d14123009..8f480d65f25 100644 --- a/tensorflow/core/framework/tensor_testutil.cc +++ b/tensorflow/core/framework/tensor_testutil.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include #include "tensorflow/core/framework/tensor_testutil.h" +#include namespace tensorflow { namespace test { diff --git a/tensorflow/core/framework/tensor_types.h b/tensorflow/core/framework/tensor_types.h index 921f88dc0ba..a5c1a56bfc0 100644 --- a/tensorflow/core/framework/tensor_types.h +++ b/tensorflow/core/framework/tensor_types.h @@ -25,7 +25,8 @@ template struct TTypes { // Rank- tensor of scalar type T. typedef Eigen::TensorMap, - Eigen::Aligned> Tensor; + Eigen::Aligned> + Tensor; typedef Eigen::TensorMap< Eigen::Tensor, Eigen::Aligned> ConstTensor; @@ -33,35 +34,42 @@ struct TTypes { // Unaligned Rank- tensor of scalar type T. typedef Eigen::TensorMap > UnalignedTensor; - typedef Eigen::TensorMap > UnalignedConstTensor; + typedef Eigen::TensorMap< + Eigen::Tensor > + UnalignedConstTensor; typedef Eigen::TensorMap, - Eigen::Aligned> Tensor32Bit; + Eigen::Aligned> + Tensor32Bit; // Scalar tensor (implemented as a rank-0 tensor) of scalar type T. typedef Eigen::TensorMap< Eigen::TensorFixedSize, Eigen::RowMajor, IndexType>, - Eigen::Aligned> Scalar; + Eigen::Aligned> + Scalar; typedef Eigen::TensorMap, Eigen::RowMajor, IndexType>, - Eigen::Aligned> ConstScalar; + Eigen::Aligned> + ConstScalar; // Unaligned Scalar tensor of scalar type T. - typedef Eigen::TensorMap, Eigen::RowMajor, IndexType> > UnalignedScalar; + typedef Eigen::TensorMap< + Eigen::TensorFixedSize, Eigen::RowMajor, IndexType> > + UnalignedScalar; typedef Eigen::TensorMap, Eigen::RowMajor, IndexType> > UnalignedConstScalar; // Rank-1 tensor (vector) of scalar type T. typedef Eigen::TensorMap, - Eigen::Aligned> Flat; + Eigen::Aligned> + Flat; typedef Eigen::TensorMap< Eigen::Tensor, Eigen::Aligned> ConstFlat; typedef Eigen::TensorMap, - Eigen::Aligned> Vec; + Eigen::Aligned> + Vec; typedef Eigen::TensorMap< Eigen::Tensor, Eigen::Aligned> ConstVec; @@ -69,16 +77,19 @@ struct TTypes { // Unaligned Rank-1 tensor (vector) of scalar type T. typedef Eigen::TensorMap > UnalignedFlat; - typedef Eigen::TensorMap > UnalignedConstFlat; + typedef Eigen::TensorMap< + Eigen::Tensor > + UnalignedConstFlat; typedef Eigen::TensorMap > UnalignedVec; typedef Eigen::TensorMap< - Eigen::Tensor > UnalignedConstVec; + Eigen::Tensor > + UnalignedConstVec; // Rank-2 tensor (matrix) of scalar type T. 
typedef Eigen::TensorMap, - Eigen::Aligned> Matrix; + Eigen::Aligned> + Matrix; typedef Eigen::TensorMap< Eigen::Tensor, Eigen::Aligned> ConstMatrix; @@ -86,8 +97,9 @@ struct TTypes { // Unaligned Rank-2 tensor (matrix) of scalar type T. typedef Eigen::TensorMap > UnalignedMatrix; - typedef Eigen::TensorMap > UnalignedConstMatrix; + typedef Eigen::TensorMap< + Eigen::Tensor > + UnalignedConstMatrix; }; typedef typename TTypes::Tensor32Bit::Index Index32; diff --git a/tensorflow/core/framework/types_test.cc b/tensorflow/core/framework/types_test.cc index 5ddc9865633..60f2b4135a6 100644 --- a/tensorflow/core/framework/types_test.cc +++ b/tensorflow/core/framework/types_test.cc @@ -70,8 +70,8 @@ TEST(TypesTest, kDataTypeRefOffset) { << "Extra reference enum " << enum_descriptor->FindValueByNumber(e_ref)->name() << " without corresponding base enum with value " << e; - ASSERT_LT(DataType_MAX, e_ref) << "Gap in reference types, missing value for " - << e_ref; + ASSERT_LT(DataType_MAX, e_ref) + << "Gap in reference types, missing value for " << e_ref; // Make sure there are no enums defined after the last regular type before // the first reference type. diff --git a/tensorflow/core/graph/costmodel.cc b/tensorflow/core/graph/costmodel.cc index 4118f14f8bf..4f3a6ec38cb 100644 --- a/tensorflow/core/graph/costmodel.cc +++ b/tensorflow/core/graph/costmodel.cc @@ -158,8 +158,8 @@ void CostModel::SetNumOutputs(const Node* node, int num_outputs) { Ensure(id, 0); auto perslot = &slot_bytes_[id]; if (!perslot->empty()) { - CHECK_EQ(num_outputs, perslot->size()) << "Cannot resize slot_bytes, node=" - << node->name(); + CHECK_EQ(num_outputs, perslot->size()) + << "Cannot resize slot_bytes, node=" << node->name(); } Ensure(id, num_outputs); } @@ -252,9 +252,12 @@ void CostModel::RecordMaxMemorySize(const Node* node, int output_slot, const DataType& dtype) { const int id = Id(node); if (id < 0) return; - CHECK_LT(output_slot, node->num_outputs()) - << "Unexpected output slot for node " << node->DebugString() << ". Got " - << output_slot << " but its num_outputs is " << node->num_outputs(); + if (output_slot >= node->num_outputs()) { + LOG(ERROR) << "Unexpected output slot for node " << node->DebugString() + << ". Got " << output_slot << " but its num_outputs is " + << node->num_outputs(); + return; + } Ensure(id, node->num_outputs()); auto& current_max = max_mem_usage_[id].output_port_mem[output_slot]; // If the memory allocator doesn't track memory usage, let's infer a lower diff --git a/tensorflow/core/graph/costmodel.h b/tensorflow/core/graph/costmodel.h index c60a946c2cc..9b703e46938 100644 --- a/tensorflow/core/graph/costmodel.h +++ b/tensorflow/core/graph/costmodel.h @@ -198,7 +198,7 @@ class CostModel { // Cumulative execution time. std::vector time_; // Cumulative Bytes output on each channel. 
- std::vector > slot_bytes_; + std::vector> slot_bytes_; // Maximum execution time std::vector max_exec_time_; @@ -217,7 +217,7 @@ class CostModel { }; std::vector max_mem_usage_; - std::vector > output_port_alloc_ids_; + std::vector> output_port_alloc_ids_; std::set persistent_alloc_ids_; std::map> persistent_alloc_ids_by_devices_; diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h index b620127d907..93d8dd6f110 100644 --- a/tensorflow/core/graph/graph.h +++ b/tensorflow/core/graph/graph.h @@ -62,8 +62,8 @@ class Node; class VersionDef; class WhileContext; -class NeighborIter; // Declared below -class NodeIter; // Declared below +class NeighborIter; // Declared below +class NodeIter; // Declared below class NodeProperties; // Defined in .cc class Node { diff --git a/tensorflow/core/graph/graph_def_builder_test.cc b/tensorflow/core/graph/graph_def_builder_test.cc index e85de71ef79..e928c81b453 100644 --- a/tensorflow/core/graph/graph_def_builder_test.cc +++ b/tensorflow/core/graph/graph_def_builder_test.cc @@ -26,7 +26,6 @@ namespace tensorflow { namespace { TEST(GraphDefBuilderTest, Version) { - // Verify that our assertions will be nontrivial ASSERT_LT(0, TF_GRAPH_DEF_VERSION); diff --git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h index 3df981437af..1b99d54e8e3 100644 --- a/tensorflow/core/graph/mkl_graph_util.h +++ b/tensorflow/core/graph/mkl_graph_util.h @@ -21,102 +21,101 @@ limitations under the License. #include "tensorflow/core/framework/op_kernel.h" namespace tensorflow { - // Since our ops are going to produce and also consume N addition tensors - // (Mkl) for N Tensorflow tensors, we can have following different - // orderings among these 2N tensors. - // - // E.g., for Tensorflow tensors A, B, and C, our ops will produce and - // consume A_m, B_m, and C_m additionally. - // - // INTERLEAVED: in this case 2N tensors are interleaved. So for above - // example, the ordering looks like: A, A_m, B, B_m, C, C_m. - // - // CONTIGUOUS: in thi case N Tensorflow tensors are contiguous followed - // by N Mkl tensors. So for above example, the ordering looks - // like: A, B, C, A_m, B_m, C_m - // - // Following APIs map index of original Tensorflow tensors to their - // appropriate position based on selected ordering. For contiguous ordering, - // we need to know the total number of tensors (parameter total). - // - typedef enum { TENSORS_INTERLEAVED, TENSORS_CONTIGUOUS } MklTfTensorOrdering; - // NOTE: Currently, we use contiguous ordering. If you change this, then you - // would need to change Mkl op definitions in nn_ops.cc. - static MklTfTensorOrdering kTensorOrdering = TENSORS_CONTIGUOUS; +// Since our ops are going to produce and also consume N addition tensors +// (Mkl) for N Tensorflow tensors, we can have following different +// orderings among these 2N tensors. +// +// E.g., for Tensorflow tensors A, B, and C, our ops will produce and +// consume A_m, B_m, and C_m additionally. +// +// INTERLEAVED: in this case 2N tensors are interleaved. So for above +// example, the ordering looks like: A, A_m, B, B_m, C, C_m. +// +// CONTIGUOUS: in thi case N Tensorflow tensors are contiguous followed +// by N Mkl tensors. So for above example, the ordering looks +// like: A, B, C, A_m, B_m, C_m +// +// Following APIs map index of original Tensorflow tensors to their +// appropriate position based on selected ordering. For contiguous ordering, +// we need to know the total number of tensors (parameter total). 
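For reference, with the contiguous ordering selected below, three TensorFlow tensors A, B, C plus their Mkl metadata tensors are laid out as A, B, C, A_m, B_m, C_m. The following is a minimal sketch of what the index helpers defined further down are expected to return for that layout; it is illustrative only (not part of this patch) and assumes an INTEL_MKL build with the default TENSORS_CONTIGUOUS ordering.

#include <cassert>
#include "tensorflow/core/graph/mkl_graph_util.h"

int main() {
  const int total_tensors = 6;  // A, B, C plus A_m, B_m, C_m
  for (int n = 0; n < total_tensors / 2; ++n) {
    // Data tensors keep their original index n in the contiguous layout ...
    assert(tensorflow::GetTensorDataIndex(n, total_tensors) == n);
    // ... and the matching Mkl metadata tensor sits total_tensors/2 slots later.
    assert(tensorflow::GetTensorMetaDataIndex(n, total_tensors) ==
           n + total_tensors / 2);
  }
  return 0;
}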
+// +typedef enum { TENSORS_INTERLEAVED, TENSORS_CONTIGUOUS } MklTfTensorOrdering; +// NOTE: Currently, we use contiguous ordering. If you change this, then you +// would need to change Mkl op definitions in nn_ops.cc. +static MklTfTensorOrdering kTensorOrdering = TENSORS_CONTIGUOUS; - // Get index of MetaData tensor from index 'n' of Data tensor. - inline int DataIndexToMetaDataIndex(int n, int total_tensors) { - if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) { - // For interleaved ordering, Mkl tensor follows immediately after - // Tensorflow tensor. - return n + 1; - } else { - CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); - // For contiguous ordering, Mkl tensor is n+total_tensors / 2 away. - return n + total_tensors / 2; - } +// Get index of MetaData tensor from index 'n' of Data tensor. +inline int DataIndexToMetaDataIndex(int n, int total_tensors) { + if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) { + // For interleaved ordering, Mkl tensor follows immediately after + // Tensorflow tensor. + return n + 1; + } else { + CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); + // For contiguous ordering, Mkl tensor is n+total_tensors / 2 away. + return n + total_tensors / 2; } +} - int inline GetTensorDataIndex(int n, int total_tensors) { - if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) { - return 2 * n; // index corresponding to nth input/output tensor - } else { - CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); - return n; - } - } +int inline GetTensorDataIndex(int n, int total_tensors) { + if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) { + return 2 * n; // index corresponding to nth input/output tensor + } else { + CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); + return n; + } +} - int inline GetTensorMetaDataIndex(int n, int total_tensors) { - // Get index for TensorData first and then use mapping function - // to get TensorMetaData index from TensorData index. - int tidx = GetTensorDataIndex(n, total_tensors); - return DataIndexToMetaDataIndex(tidx, total_tensors); - } +int inline GetTensorMetaDataIndex(int n, int total_tensors) { + // Get index for TensorData first and then use mapping function + // to get TensorMetaData index from TensorData index. + int tidx = GetTensorDataIndex(n, total_tensors); + return DataIndexToMetaDataIndex(tidx, total_tensors); +} namespace mkl_op_registry { - static const char* kMklOpLabel = "MklOp"; - static const char* kMklOpLabelPattern = "label='MklOp'"; - // Prefix that we add to Tensorflow op name to construct Mkl op name. - static const char* const kMklOpPrefix = "_Mkl"; +static const char* kMklOpLabel = "MklOp"; +static const char* kMklOpLabelPattern = "label='MklOp'"; +// Prefix that we add to Tensorflow op name to construct Mkl op name. +static const char* const kMklOpPrefix = "_Mkl"; - // Get the name of Mkl op from original TensorFlow op - // We prefix 'Mkl' to the original op to get Mkl op. - inline string GetMklOpName(const string& name) { - return string(kMklOpPrefix) + name; +// Get the name of Mkl op from original TensorFlow op +// We prefix 'Mkl' to the original op to get Mkl op. +inline string GetMklOpName(const string& name) { + return string(kMklOpPrefix) + name; +} + +// Check whether opname with type T is registered as MKL-compliant. 
+// +// @input: name of the op +// @input: T datatype to be used for checking op +// @return: true if opname is registered as Mkl op; false otherwise +static inline bool IsMklOp(const std::string& op_name, DataType T) { + string kernel = KernelsRegisteredForOp(op_name); + bool result = + kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT); + return result; +} + +// Check whether opname with type T is registered as MKL-compliant and +// is element-wise. +// +// @input: name of the op +// @input: T datatype to be used for checking op +// @return: true if opname is registered as element-wise Mkl op; +// false otherwise +static inline bool IsMklElementWiseOp(const std::string& op_name, DataType T) { + if (!IsMklOp(op_name, T)) { + return false; } + bool result = (0 == op_name.compare(GetMklOpName("Add")) || + 0 == op_name.compare(GetMklOpName("Sub")) || + 0 == op_name.compare(GetMklOpName("Mul")) || + 0 == op_name.compare(GetMklOpName("Maximum")) || + 0 == op_name.compare(GetMklOpName("SquaredDifference"))); - // Check whether opname with type T is registered as MKL-compliant. - // - // @input: name of the op - // @input: T datatype to be used for checking op - // @return: true if opname is registered as Mkl op; false otherwise - static inline bool IsMklOp(const std::string& op_name, DataType T) { - string kernel = KernelsRegisteredForOp(op_name); - bool result = - kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT); - return result; - } - - // Check whether opname with type T is registered as MKL-compliant and - // is element-wise. - // - // @input: name of the op - // @input: T datatype to be used for checking op - // @return: true if opname is registered as element-wise Mkl op; - // false otherwise - static inline bool IsMklElementWiseOp(const std::string& op_name, - DataType T) { - if (!IsMklOp(op_name, T)) { - return false; - } - bool result = (0 == op_name.compare(GetMklOpName("Add")) || - 0 == op_name.compare(GetMklOpName("Sub")) || - 0 == op_name.compare(GetMklOpName("Mul")) || - 0 == op_name.compare(GetMklOpName("Maximum")) || - 0 == op_name.compare(GetMklOpName("SquaredDifference"))); - - return result; - } + return result; +} } // namespace mkl_op_registry } // namespace tensorflow #endif // INTEL_MKL diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index 55bc401b9d6..0e8a1cb26ce 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -37,12 +37,12 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/util/tensor_format.h" -#include "tensorflow/core/graph/mkl_layout_pass.h" #include "tensorflow/core/graph/mkl_graph_util.h" +#include "tensorflow/core/graph/mkl_layout_pass.h" namespace tensorflow { -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML // This pass implements rewriting of graph to support following scenarios: // (A) Merging nodes in the graph @@ -281,7 +281,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { csinfo_.mkl_conv2d_grad_filter = "_MklConv2DBackpropFilter"; csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias"; csinfo_.mkl_conv2d_with_bias_backprop_bias = - "_MklConv2DWithBiasBackpropBias"; + "_MklConv2DWithBiasBackpropBias"; csinfo_.relu = "Relu"; csinfo_.relu_grad = "ReluGrad"; csinfo_.reshape = "Reshape"; @@ -297,10 +297,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // End - element-wise ops. See note above. // NOTE: names are alphabetically sorted. 
- rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn), CopyAttrsAddN, - AddNRewrite, nullptr}); - rinfo_.push_back({csinfo_.add, - mkl_op_registry::GetMklOpName(csinfo_.add), + rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn), + CopyAttrsAddN, AddNRewrite, nullptr}); + rinfo_.push_back({csinfo_.add, mkl_op_registry::GetMklOpName(csinfo_.add), CopyAttrsDataType, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.avg_pool, mkl_op_registry::GetMklOpName(csinfo_.avg_pool), @@ -337,14 +336,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back({csinfo_.fused_batch_norm, mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm), CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr}); - rinfo_.push_back({csinfo_.fused_batch_norm_grad, - mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad), - CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr}); + rinfo_.push_back( + {csinfo_.fused_batch_norm_grad, + mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad), + CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.identity, mkl_op_registry::GetMklOpName(csinfo_.identity), CopyAttrsIdentity, AlwaysRewrite, nullptr}); - rinfo_.push_back({csinfo_.lrn, - mkl_op_registry::GetMklOpName(csinfo_.lrn), + rinfo_.push_back({csinfo_.lrn, mkl_op_registry::GetMklOpName(csinfo_.lrn), CopyAttrsLRN, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.lrn_grad, mkl_op_registry::GetMklOpName(csinfo_.lrn_grad), @@ -358,11 +357,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back({csinfo_.maximum, mkl_op_registry::GetMklOpName(csinfo_.maximum), CopyAttrsDataType, AlwaysRewrite, nullptr}); - rinfo_.push_back({csinfo_.mul, - mkl_op_registry::GetMklOpName(csinfo_.mul), + rinfo_.push_back({csinfo_.mul, mkl_op_registry::GetMklOpName(csinfo_.mul), CopyAttrsDataType, AlwaysRewrite, nullptr}); - rinfo_.push_back({csinfo_.relu, - mkl_op_registry::GetMklOpName(csinfo_.relu), + rinfo_.push_back({csinfo_.relu, mkl_op_registry::GetMklOpName(csinfo_.relu), CopyAttrsDataType, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.relu_grad, mkl_op_registry::GetMklOpName(csinfo_.relu_grad), @@ -373,8 +370,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back({csinfo_.squared_difference, mkl_op_registry::GetMklOpName(csinfo_.squared_difference), CopyAttrsDataType, AlwaysRewrite, nullptr}); - rinfo_.push_back({csinfo_.sub, - mkl_op_registry::GetMklOpName(csinfo_.sub), + rinfo_.push_back({csinfo_.sub, mkl_op_registry::GetMklOpName(csinfo_.sub), CopyAttrsDataType, AlwaysRewrite, nullptr}); // Add info about which ops to add workspace edge to and the slots. 
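Most rinfo_ entries above pair a TensorFlow op name with its Mkl counterpart, and the counterpart is obtained from mkl_op_registry::GetMklOpName(), which simply prepends the "_Mkl" prefix declared in mkl_graph_util.h. A small sketch of that naming convention, illustrative only (not part of this patch) and assuming an INTEL_MKL build:

#include <cassert>
#include "tensorflow/core/graph/mkl_graph_util.h"

int main() {
  using tensorflow::mkl_op_registry::GetMklOpName;
  // "Conv2D" is rewritten to "_MklConv2D", "Relu" to "_MklRelu", and so on.
  assert(GetMklOpName("Conv2D") == "_MklConv2D");
  assert(GetMklOpName("Relu") == "_MklRelu");
  return 0;
}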
@@ -388,9 +384,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass { biasaddgrad_matmul_context_ = {csinfo_.bias_add_grad, csinfo_.matmul, IsBiasAddGradInMatMulContext}; - biasaddgrad_conv2dwithbias_context_ = {csinfo_.bias_add_grad, - csinfo_.mkl_conv2d_with_bias, - IsBiasAddGradInConv2DWithBiasContext}; + biasaddgrad_conv2dwithbias_context_ = { + csinfo_.bias_add_grad, csinfo_.mkl_conv2d_with_bias, + IsBiasAddGradInConv2DWithBiasContext}; cinfo_.push_back(&biasaddgrad_matmul_context_); cinfo_.push_back(&biasaddgrad_conv2dwithbias_context_); @@ -410,9 +406,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass { /// Structure to specify the context information used in a node rewrite rule typedef struct { - string node; // Name of the node to be rewritten - string fwd; // Name of the node in the forward pass that this node - // corresponds to + string node; // Name of the node to be rewritten + string fwd; // Name of the node in the forward pass that this node + // corresponds to std::function context_match_fn; } ContextInfo; @@ -615,14 +611,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass { std::vector ksize, strides; CHECK_EQ(GetNodeAttr(n->def(), "ksize", &ksize).ok(), true); CHECK_EQ(GetNodeAttr(n->def(), "strides", &strides).ok(), true); - CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(), - true); + CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(), true); CHECK_EQ(FormatFromString(data_format_str, &data_format), true); // Condition that specifies non-batch-wise and non-depth-wise pooling. - if (GetTensorDim(ksize, data_format, 'N') == 1 && + if (GetTensorDim(ksize, data_format, 'N') == 1 && GetTensorDim(strides, data_format, 'N') == 1 && - GetTensorDim(ksize, data_format, 'C') == 1 && + GetTensorDim(ksize, data_format, 'C') == 1 && GetTensorDim(strides, data_format, 'C') == 1) { return true; } @@ -785,8 +780,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { for (const Edge* fe : first_inp_of_filter->out_edges()) { if (fe->dst()->type_string() == csinfo_.mkl_conv2d_with_bias && fe->dst_input() == 0) { - VLOG(1) << "MklLayoutRewritePass: found " - << fe->dst()->DebugString() + VLOG(1) << "MklLayoutRewritePass: found " << fe->dst()->DebugString() << " as the forward node for matching context, backward" << " node is: " << n->DebugString(); *fwd_node = fe->dst(); @@ -803,13 +797,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // // @return - true (if BiasAddGrad is associated with MatMul); // false otherwise. - static bool IsBiasAddGradInMatMulContext(const Node* n, - const Node** fwd_node, + static bool IsBiasAddGradInMatMulContext(const Node* n, const Node** fwd_node, void* ci) { return (!IsBiasAddGradInConv2DWithBiasContext(n, fwd_node, ci)); } - // Rewrite rule that uses context-information for matching, // used in scenario 2. // @@ -880,10 +872,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // @output output_nodes - the list of new nodes creating Mkl tensors // // @return None - void GetNodesProducingMklTensorList(std::unique_ptr* g, - Node* orig_node, const gtl::InlinedVector, 4>& inputs, - int* input_idx, int list_length, - std::vector* output_nodes); + void GetNodesProducingMklTensorList( + std::unique_ptr* g, Node* orig_node, + const gtl::InlinedVector, 4>& inputs, + int* input_idx, int list_length, + std::vector* output_nodes); // Get a node that will feed an Mkl tensor to the new // node that we are constructing. 
The output node could be (1) 'n' @@ -900,7 +893,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // will feed the tensor // @return None void GetNodeProducingMklTensor(std::unique_ptr* g, Node* orig_node, - Node* n, int n_output_slot, Node** mkl_node, int* mkl_node_output_slot); + Node* n, int n_output_slot, Node** mkl_node, + int* mkl_node_output_slot); // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb' // in graph 'g'. Original node is input in 'old_node'. Inputs to 'nb' are @@ -970,9 +964,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass { MklLayoutRewritePass::ConstStringsInfo MklLayoutRewritePass::csinfo_; MklLayoutRewritePass::ContextInfo - MklLayoutRewritePass::biasaddgrad_conv2dwithbias_context_; + MklLayoutRewritePass::biasaddgrad_conv2dwithbias_context_; MklLayoutRewritePass::ContextInfo - MklLayoutRewritePass::biasaddgrad_matmul_context_; + MklLayoutRewritePass::biasaddgrad_matmul_context_; std::vector MklLayoutRewritePass::cinfo_; // We register Mkl rewrite pass for phase 1 in post partitioning group. @@ -1041,13 +1035,13 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr* g, TensorShape dummy_shape({8}); dummy_shape.AsProto(proto.mutable_tensor_shape()); TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const") - .Attr("value", proto) - .Attr("dtype", dt) - .Device(orig_node->def().device()) // We place this node on - // the same device as the - // device of the original - // node. - .Finalize(&**g, out)); + .Attr("value", proto) + .Attr("dtype", dt) + .Device(orig_node->def().device()) // We place this node on + // the same device as the + // device of the original + // node. + .Finalize(&**g, out)); // If number of inputs to the original node is > 0, then we add // control dependency between 1st input (index 0) of the original node and @@ -1060,8 +1054,8 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr* g, // the same frame. if (orig_node->num_inputs() > 0) { Node* orig_input0 = nullptr; - TF_CHECK_OK(orig_node->input_node(0, - const_cast(&orig_input0))); + TF_CHECK_OK( + orig_node->input_node(0, const_cast(&orig_input0))); CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out)); } @@ -1069,11 +1063,9 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr* g, } void MklLayoutRewritePass::GetNodesProducingMklTensorList( - std::unique_ptr* g, - Node* orig_node, - const gtl::InlinedVector, 4>& inputs, - int* input_idx, int list_length, - std::vector* output_nodes) { + std::unique_ptr* g, Node* orig_node, + const gtl::InlinedVector, 4>& inputs, int* input_idx, + int list_length, std::vector* output_nodes) { CHECK_LT(*input_idx, inputs.size()); CHECK_GT(list_length, 0); CHECK_NOTNULL(output_nodes); @@ -1090,8 +1082,8 @@ void MklLayoutRewritePass::GetNodesProducingMklTensorList( int mkl_node_output_slot = 0; GetNodeProducingMklTensor(g, orig_node, n, slot, &mkl_node, &mkl_node_output_slot); - output_nodes->push_back(NodeBuilder::NodeOut(mkl_node, - mkl_node_output_slot)); + output_nodes->push_back( + NodeBuilder::NodeOut(mkl_node, mkl_node_output_slot)); (*input_idx)++; list_length--; } @@ -1101,9 +1093,9 @@ void MklLayoutRewritePass::GetNodesProducingMklTensorList( // node that we are constructing. An input node could be (1) 'n' // if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor // if 'n' is not an Mkl layer. 
-void MklLayoutRewritePass::GetNodeProducingMklTensor(std::unique_ptr* g, - Node* orig_node, Node* n, - int n_output_slot, Node** mkl_node, int* mkl_node_output_slot) { +void MklLayoutRewritePass::GetNodeProducingMklTensor( + std::unique_ptr* g, Node* orig_node, Node* n, int n_output_slot, + Node** mkl_node, int* mkl_node_output_slot) { CHECK_NOTNULL(n); CHECK_NOTNULL(mkl_node); CHECK_NOTNULL(mkl_node_output_slot); @@ -1234,8 +1226,8 @@ int MklLayoutRewritePass::SetUpContiguousInputs( if (ArgIsList(arg)) { std::vector new_node_inputs; int N = GetTensorListLength(arg, old_node); - GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx, - N, &new_node_inputs); + GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx, N, + &new_node_inputs); nb->Input(new_node_inputs); nn_slot_idx++; } else { @@ -1336,13 +1328,13 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode( TensorShape dummy_shape({1}); dummy_shape.AsProto(proto.mutable_tensor_shape()); TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const") - .Attr("value", proto) - .Attr("dtype", dt) - .Device(orig_node->def().device()) // We place this node on - // same the device as the - // device of the original - // node. - .Finalize(&**g, out)); + .Attr("value", proto) + .Attr("dtype", dt) + .Device(orig_node->def().device()) // We place this node on + // same the device as the + // device of the original + // node. + .Finalize(&**g, out)); // If number of inputs to the original node is > 0, then we add // control dependency between 1st input (index 0) of the original node and @@ -1355,8 +1347,8 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode( // the same frame. if (orig_node->num_inputs() > 0) { Node* orig_input0 = nullptr; - TF_CHECK_OK(orig_node->input_node(0, - const_cast(&orig_input0))); + TF_CHECK_OK( + orig_node->input_node(0, const_cast(&orig_input0))); CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out)); } @@ -1374,7 +1366,8 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded( TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); for (auto ws : wsinfo_) { if (orig_node->type_string() == ws.fwd_op && - mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(orig_node->type_string()), T)) { + mkl_op_registry::IsMklOp( + mkl_op_registry::GetMklOpName(orig_node->type_string()), T)) { // If this op is a fwd op, then we need to check if there is an // edge from this node's fwd_slot to bwdop's bwd_slot. If there is // an edge, then we just add an attribute on this node for setting @@ -1400,8 +1393,9 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded( nb->Attr("workspace_enabled", false); } } else if (orig_node->type_string() == ws.bwd_op && - mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(orig_node->type_string()), - T)) { + mkl_op_registry::IsMklOp( + mkl_op_registry::GetMklOpName(orig_node->type_string()), + T)) { // If this op is a bwd op, then we need to add workspace edge and // it's Mkl tensor edge between its corresponding fwd op and this // op. Corresponding fwd op is specified in 'fwd_op' field of @@ -1416,7 +1410,8 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded( if (e->src_output() == ws.fwd_slot && // We would have rewritten the forward op, so we need to use // GetMklOpName call to get its Mkl name. 
- e->src()->type_string() == mkl_op_registry::GetMklOpName(ws.fwd_op) && + e->src()->type_string() == + mkl_op_registry::GetMklOpName(ws.fwd_op) && e->dst_input() == ws.bwd_slot) { nb->Attr("workspace_enabled", true); CHECK_NOTNULL(ws_tensors); @@ -1593,7 +1588,7 @@ void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node, } void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node, - NodeBuilder* nb) { + NodeBuilder* nb) { DataType T; DataType Tshape; @@ -1869,8 +1864,8 @@ Status MklLayoutRewritePass::MergeNode(std::unique_ptr* g, Node* succ, if (e->IsControlEdge()) { CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst())); } else { - CHECK_NOTNULL((*g)->AddEdge(new_node, e->src_output(), e->dst(), - e->dst_input())); + CHECK_NOTNULL( + (*g)->AddEdge(new_node, e->src_output(), e->dst(), e->dst_input())); } } @@ -1941,9 +1936,9 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr* g, // and leave BiasAddGrad as it is. But we check for this condition // when we check for node rewrite rule. So we should not even come // here for MatMul. So we will fail now. - return Status( - error::Code::INVALID_ARGUMENT, - "No rewrite is required for BiasAddGrad for MatMul context."); + return Status( + error::Code::INVALID_ARGUMENT, + "No rewrite is required for BiasAddGrad for MatMul context."); } } @@ -2012,9 +2007,10 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr* g, if (e->IsControlEdge()) { CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst())); } else { - CHECK_NOTNULL((*g)->AddEdge(new_node, GetTensorDataIndex(e->src_output(), - e->src()->num_outputs()), - e->dst(), e->dst_input())); + CHECK_NOTNULL((*g)->AddEdge( + new_node, + GetTensorDataIndex(e->src_output(), e->src()->num_outputs()), + e->dst(), e->dst_input())); } } @@ -2070,7 +2066,8 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const { // BiasAddGrad is not an Mkl layer, so we make an exception for it. 
if (n->type_string() != csinfo_.bias_add_grad) { - if (!mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()), T)) { + if (!mkl_op_registry::IsMklOp( + mkl_op_registry::GetMklOpName(n->type_string()), T)) { return nullptr; } } @@ -2186,8 +2183,7 @@ bool RunMklLayoutRewritePass(std::unique_ptr* g) { return MklLayoutRewritePass().RunPass(g); } -Status MklLayoutRewritePass::Run( - const GraphOptimizationPassOptions& options) { +Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) { if (options.graph == nullptr && options.partition_graphs == nullptr) { return Status::OK(); } @@ -2215,7 +2211,7 @@ Status MklLayoutRewritePass::Run( return Status::OK(); } -#else // INTEL_MKL_DNN +#else // INTEL_MKL_ML // This pass implements rewriting of graph to support following scenarios: // (A) Merging nodes in the graph @@ -2421,7 +2417,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { csinfo_.conv2d_grad_input = "Conv2DBackpropInput"; csinfo_.conv2d_grad_filter = "Conv2DBackpropFilter"; csinfo_.conv2d_grad_filter_with_bias = - "__MklDummyConv2DBackpropFilterWithBias"; + "__MklDummyConv2DBackpropFilterWithBias"; csinfo_.fused_batch_norm = "FusedBatchNorm"; csinfo_.fused_batch_norm_grad = "FusedBatchNormGrad"; csinfo_.identity = "Identity"; @@ -2435,11 +2431,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass { csinfo_.mkl_conv2d_grad_filter = "_MklConv2DBackpropFilter"; csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias"; csinfo_.mkl_conv2d_grad_filter_with_bias = - "_MklConv2DBackpropFilterWithBias"; + "_MklConv2DBackpropFilterWithBias"; csinfo_.relu = "Relu"; csinfo_.relu_grad = "ReluGrad"; - csinfo_.tanh = "Tanh"; - csinfo_.tanh_grad = "TanhGrad"; + csinfo_.tanh = "Tanh"; + csinfo_.tanh_grad = "TanhGrad"; csinfo_.reshape = "Reshape"; csinfo_.softmax = "Softmax"; csinfo_.split = "Split"; @@ -2456,9 +2452,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // NOTE: names are alphabetically sorted. 
rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn), CopyAttrsAddN, AddNRewrite}); - /* rinfo_.push_back({csinfo_.add, + rinfo_.push_back({csinfo_.add, mkl_op_registry::GetMklOpName(csinfo_.add), - CopyAttrsDataType, AlwaysRewrite}); */ + CopyAttrsDataType, AlwaysRewrite}); rinfo_.push_back({csinfo_.avg_pool, mkl_op_registry::GetMklOpName(csinfo_.avg_pool), CopyAttrsPooling, AlwaysRewrite}); @@ -2474,29 +2470,28 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back({csinfo_.conv2d, mkl_op_registry::GetMklOpName(csinfo_.conv2d), CopyAttrsConv2D, AlwaysRewrite}); - rinfo_.push_back({csinfo_.conv2d_with_bias, - csinfo_.mkl_conv2d_with_bias, + rinfo_.push_back({csinfo_.conv2d_with_bias, csinfo_.mkl_conv2d_with_bias, CopyAttrsConv2D, AlwaysRewrite}); rinfo_.push_back({csinfo_.conv2d_grad_filter, mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_filter), CopyAttrsConv2D, AlwaysRewrite}); rinfo_.push_back({csinfo_.conv2d_grad_filter_with_bias, - csinfo_.mkl_conv2d_grad_filter_with_bias, - CopyAttrsConv2D, AlwaysRewrite}); + csinfo_.mkl_conv2d_grad_filter_with_bias, CopyAttrsConv2D, + AlwaysRewrite}); rinfo_.push_back({csinfo_.conv2d_grad_input, mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_input), CopyAttrsConv2D, AlwaysRewrite}); rinfo_.push_back({csinfo_.fused_batch_norm, mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm), CopyAttrsFusedBatchNorm, AlwaysRewrite}); - rinfo_.push_back({csinfo_.fused_batch_norm_grad, - mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad), - CopyAttrsFusedBatchNorm, AlwaysRewrite}); + rinfo_.push_back( + {csinfo_.fused_batch_norm_grad, + mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad), + CopyAttrsFusedBatchNorm, AlwaysRewrite}); rinfo_.push_back({csinfo_.identity, mkl_op_registry::GetMklOpName(csinfo_.identity), CopyAttrsDataType, AlwaysRewrite}); - rinfo_.push_back({csinfo_.lrn, - mkl_op_registry::GetMklOpName(csinfo_.lrn), + rinfo_.push_back({csinfo_.lrn, mkl_op_registry::GetMklOpName(csinfo_.lrn), CopyAttrsLRN, AlwaysRewrite}); rinfo_.push_back({csinfo_.lrn_grad, mkl_op_registry::GetMklOpName(csinfo_.lrn_grad), @@ -2507,14 +2502,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back({csinfo_.max_pool_grad, mkl_op_registry::GetMklOpName(csinfo_.max_pool_grad), CopyAttrsPooling, AlwaysRewrite}); - /* + rinfo_.push_back({csinfo_.maximum, mkl_op_registry::GetMklOpName(csinfo_.maximum), CopyAttrsDataType, AlwaysRewrite}); rinfo_.push_back({csinfo_.mul, mkl_op_registry::GetMklOpName(csinfo_.mul), CopyAttrsDataType, AlwaysRewrite}); - */ rinfo_.push_back({csinfo_.relu, mkl_op_registry::GetMklOpName(csinfo_.relu), CopyAttrsDataType, AlwaysRewrite}); @@ -2535,14 +2529,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back({csinfo_.softmax, mkl_op_registry::GetMklOpName(csinfo_.softmax), CopyAttrsDataType, AlwaysRewrite}); - /* + rinfo_.push_back({csinfo_.squared_difference, mkl_op_registry::GetMklOpName(csinfo_.squared_difference), CopyAttrsDataType, AlwaysRewrite}); rinfo_.push_back({csinfo_.sub, mkl_op_registry::GetMklOpName(csinfo_.sub), CopyAttrsDataType, AlwaysRewrite}); - */ + // Add info about which ops to add workspace edge to and the slots. 
wsinfo_.push_back({csinfo_.lrn, csinfo_.lrn_grad, 0, 2, 1, 3}); @@ -2550,8 +2544,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // Add a rule for merging nodes minfo_.push_back({csinfo_.conv2d, csinfo_.bias_add, - csinfo_.conv2d_with_bias, - GetConv2DOrBiasAdd}); + csinfo_.conv2d_with_bias, GetConv2DOrBiasAdd}); minfo_.push_back({csinfo_.conv2d_grad_filter, csinfo_.bias_add_grad, csinfo_.conv2d_grad_filter_with_bias, @@ -2846,9 +2839,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // Default rewrite rule to be used in scenario 1 for rewrite. // @return - true (since we want to always rewrite) - static bool AlwaysRewrite(const Node* n) { - return true; - } + static bool AlwaysRewrite(const Node* n) { return true; } // Check if we are performing pooling on depth or batch. If it is, then we // do not rewrite MaxPool node to Mkl version. @@ -2862,14 +2853,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass { std::vector ksize, strides; CHECK_EQ(GetNodeAttr(n->def(), "ksize", &ksize).ok(), true); CHECK_EQ(GetNodeAttr(n->def(), "strides", &strides).ok(), true); - CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(), - true); + CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(), true); CHECK_EQ(FormatFromString(data_format_str, &data_format), true); // Condition that specifies non-batch-wise and non-depth-wise pooling. - if (GetTensorDim(ksize, data_format, 'N') == 1 && + if (GetTensorDim(ksize, data_format, 'N') == 1 && GetTensorDim(strides, data_format, 'N') == 1 && - GetTensorDim(ksize, data_format, 'C') == 1 && + GetTensorDim(ksize, data_format, 'C') == 1 && GetTensorDim(strides, data_format, 'C') == 1) { return true; } @@ -2941,10 +2931,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // @output output_nodes - the list of new nodes creating Mkl tensors // // @return None - void GetNodesProducingMklTensorList(std::unique_ptr* g, - Node* orig_node, const gtl::InlinedVector, 4>& inputs, - int* input_idx, int list_length, - std::vector* output_nodes); + void GetNodesProducingMklTensorList( + std::unique_ptr* g, Node* orig_node, + const gtl::InlinedVector, 4>& inputs, + int* input_idx, int list_length, + std::vector* output_nodes); // Get a node that will feed an Mkl tensor to the new // node that we are constructing. The output node could be (1) 'n' @@ -2961,7 +2952,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // will feed the tensor // @return None void GetNodeProducingMklTensor(std::unique_ptr* g, Node* orig_node, - Node* n, int n_output_slot, Node** mkl_node, int* mkl_node_output_slot); + Node* n, int n_output_slot, Node** mkl_node, + int* mkl_node_output_slot); // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb' // in graph 'g'. Original node is input in 'old_node'. Inputs to 'nb' are @@ -3096,13 +3088,13 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr* g, TensorShape dummy_shape({8}); dummy_shape.AsProto(proto.mutable_tensor_shape()); TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const") - .Attr("value", proto) - .Attr("dtype", dt) - .Device(orig_node->def().device()) // We place this node on - // the same device as the - // device of the original - // node. - .Finalize(&**g, out)); + .Attr("value", proto) + .Attr("dtype", dt) + .Device(orig_node->def().device()) // We place this node on + // the same device as the + // device of the original + // node. 
+ .Finalize(&**g, out)); // If number of inputs to the original node is > 0, then we add // control dependency between 1st input (index 0) of the original node and @@ -3115,8 +3107,8 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr* g, // the same frame. if (orig_node->num_inputs() > 0) { Node* orig_input0 = nullptr; - TF_CHECK_OK(orig_node->input_node(0, - const_cast(&orig_input0))); + TF_CHECK_OK( + orig_node->input_node(0, const_cast(&orig_input0))); // Allow duplicate while adding control edge as it would fail (return // NULL) if we try to add duplicate edge. CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out, true)); @@ -3126,11 +3118,9 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr* g, } void MklLayoutRewritePass::GetNodesProducingMklTensorList( - std::unique_ptr* g, - Node* orig_node, - const gtl::InlinedVector, 4>& inputs, - int* input_idx, int list_length, - std::vector* output_nodes) { + std::unique_ptr* g, Node* orig_node, + const gtl::InlinedVector, 4>& inputs, int* input_idx, + int list_length, std::vector* output_nodes) { CHECK_LT(*input_idx, inputs.size()); CHECK_GT(list_length, 0); CHECK_NOTNULL(output_nodes); @@ -3147,8 +3137,8 @@ void MklLayoutRewritePass::GetNodesProducingMklTensorList( int mkl_node_output_slot = 0; GetNodeProducingMklTensor(g, orig_node, n, slot, &mkl_node, &mkl_node_output_slot); - output_nodes->push_back(NodeBuilder::NodeOut(mkl_node, - mkl_node_output_slot)); + output_nodes->push_back( + NodeBuilder::NodeOut(mkl_node, mkl_node_output_slot)); (*input_idx)++; list_length--; } @@ -3158,9 +3148,9 @@ void MklLayoutRewritePass::GetNodesProducingMklTensorList( // node that we are constructing. An input node could be (1) 'n' // if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor // if 'n' is not an Mkl layer. -void MklLayoutRewritePass::GetNodeProducingMklTensor(std::unique_ptr* g, - Node* orig_node, Node* n, - int n_output_slot, Node** mkl_node, int* mkl_node_output_slot) { +void MklLayoutRewritePass::GetNodeProducingMklTensor( + std::unique_ptr* g, Node* orig_node, Node* n, int n_output_slot, + Node** mkl_node, int* mkl_node_output_slot) { CHECK_NOTNULL(n); CHECK_NOTNULL(mkl_node); CHECK_NOTNULL(mkl_node_output_slot); @@ -3292,8 +3282,8 @@ int MklLayoutRewritePass::SetUpContiguousInputs( if (ArgIsList(arg)) { std::vector new_node_inputs; int N = GetTensorListLength(arg, old_node); - GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx, - N, &new_node_inputs); + GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx, N, + &new_node_inputs); nb->Input(new_node_inputs); nn_slot_idx++; } else { @@ -3394,13 +3384,13 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode( TensorShape dummy_shape({1}); dummy_shape.AsProto(proto.mutable_tensor_shape()); TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const") - .Attr("value", proto) - .Attr("dtype", dt) - .Device(orig_node->def().device()) // We place this node on - // same the device as the - // device of the original - // node. - .Finalize(&**g, out)); + .Attr("value", proto) + .Attr("dtype", dt) + .Device(orig_node->def().device()) // We place this node on + // same the device as the + // device of the original + // node. + .Finalize(&**g, out)); // If number of inputs to the original node is > 0, then we add // control dependency between 1st input (index 0) of the original node and @@ -3413,8 +3403,8 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode( // the same frame. 
if (orig_node->num_inputs() > 0) { Node* orig_input0 = nullptr; - TF_CHECK_OK(orig_node->input_node(0, - const_cast(&orig_input0))); + TF_CHECK_OK( + orig_node->input_node(0, const_cast(&orig_input0))); // Allow duplicate while adding control edge as it would fail (return // NULL) if we try to add duplicate edge. CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out, true)); @@ -3434,8 +3424,8 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded( TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); for (auto ws : wsinfo_) { if (orig_node->type_string() == ws.fwd_op && - mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName( - orig_node->type_string()), T)) { + mkl_op_registry::IsMklOp( + mkl_op_registry::GetMklOpName(orig_node->type_string()), T)) { // If this op is a fwd op, then we need to check if there is an // edge from this node's fwd_slot to bwdop's bwd_slot. If there is // an edge, then we just add an attribute on this node for setting @@ -3461,8 +3451,9 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded( nb->Attr("workspace_enabled", false); } } else if (orig_node->type_string() == ws.bwd_op && - mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName( - orig_node->type_string()), T)) { + mkl_op_registry::IsMklOp( + mkl_op_registry::GetMklOpName(orig_node->type_string()), + T)) { // If this op is a bwd op, then we need to add workspace edge and // it's Mkl tensor edge between its corresponding fwd op and this // op. Corresponding fwd op is specified in 'fwd_op' field of @@ -3477,8 +3468,8 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded( if (e->src_output() == ws.fwd_slot && // We would have rewritten the forward op, so we need to use // GetMklOpName call to get its Mkl name. - e->src()->type_string() == mkl_op_registry::GetMklOpName( - ws.fwd_op) && + e->src()->type_string() == + mkl_op_registry::GetMklOpName(ws.fwd_op) && e->dst_input() == ws.bwd_slot) { nb->Attr("workspace_enabled", true); CHECK_NOTNULL(ws_tensors); @@ -3645,7 +3636,7 @@ void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node, } void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node, - NodeBuilder* nb) { + NodeBuilder* nb) { DataType T; DataType Tshape; @@ -3776,8 +3767,9 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr* g, Node* m, Node* n) { CHECK_EQ(((m->type_string() == csinfo_.bias_add && n->type_string() == csinfo_.conv2d)) || - ((n->type_string() == csinfo_.bias_add && - m->type_string() == csinfo_.conv2d)), true); + ((n->type_string() == csinfo_.bias_add && + m->type_string() == csinfo_.conv2d)), + true); // If 'm' is BiasAdd, then 'n' is Conv2D. Since Conv2D feeds BiasAdd, // BiasAdd is successor node, and Conv2D predecessor node. @@ -3796,8 +3788,7 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr* g, TF_CHECK_OK(GetNodeAttr(pred->def(), "strides", &strides)); TF_CHECK_OK(GetNodeAttr(pred->def(), "data_format", &data_format_pred)); TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ)); - TF_CHECK_OK( - GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu)); + TF_CHECK_OK(GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu)); // We check to ensure that data formats of both succ and pred are same. // We expect them to be same, so we can enforce this as assert. // But assert can be too strict, so we enforce this as a check. 
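The comment above expresses the same preference that motivates the costmodel.cc change earlier in this patch: conditions that depend on incoming graph data are validated and reported rather than enforced with CHECK/assert, so a malformed graph fails gracefully instead of crashing the process. A generic sketch of that pattern follows; the helper name and message are hypothetical placeholders, not code from this patch.

#include <string>
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"

namespace tensorflow {

// Hypothetical helper: return an error Status instead of CHECK-failing when
// the checked condition depends on user-provided graph data.
Status CheckDataFormatsMatch(const std::string& pred_format,
                             const std::string& succ_format) {
  if (pred_format != succ_format) {
    return errors::InvalidArgument(
        "data_format of Conv2D (", pred_format,
        ") does not match data_format of BiasAdd (", succ_format, ")");
  }
  return Status::OK();
}

}  // namespace tensorflow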
@@ -3900,8 +3891,8 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr* g, // BiasAdd has only 1 output (at slot 0) and merged node also has only 1 // output (at slot 0). const int kConv2DWithBiasOutputSlot = 0; - CHECK_NOTNULL((*g)->AddEdge(new_node, kConv2DWithBiasOutputSlot, - e->dst(), e->dst_input())); + CHECK_NOTNULL((*g)->AddEdge(new_node, kConv2DWithBiasOutputSlot, e->dst(), + e->dst_input())); } } @@ -3924,8 +3915,9 @@ Status MklLayoutRewritePass::MergeConv2DBackpropFilterWithBiasAddGrad( std::unique_ptr* g, Node* m, Node* n) { CHECK_EQ(((m->type_string() == csinfo_.bias_add_grad && n->type_string() == csinfo_.conv2d_grad_filter)) || - ((n->type_string() == csinfo_.bias_add_grad && - m->type_string() == csinfo_.conv2d_grad_filter)), true); + ((n->type_string() == csinfo_.bias_add_grad && + m->type_string() == csinfo_.conv2d_grad_filter)), + true); // If 'm' is BiasAddGrad, then 'n' is BackpropFilter. Node* badd = m->type_string() == csinfo_.bias_add_grad ? m : n; @@ -4132,9 +4124,10 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr* g, // NULL) if we try to add duplicate edge. CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst(), true)); } else { - CHECK_NOTNULL((*g)->AddEdge(new_node, GetTensorDataIndex(e->src_output(), - e->src()->num_outputs()), - e->dst(), e->dst_input())); + CHECK_NOTNULL((*g)->AddEdge( + new_node, + GetTensorDataIndex(e->src_output(), e->src()->num_outputs()), + e->dst(), e->dst_input())); } } @@ -4166,9 +4159,9 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const { // names. if (n->type_string() != csinfo_.conv2d_with_bias && n->type_string() != csinfo_.conv2d_grad_filter_with_bias && - !mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName( - n->type_string()), T)) { - return nullptr; + !mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()), + T)) { + return nullptr; } // For elementwise node, we reuse the Eigen implementation and pass the MKL @@ -4184,29 +4177,30 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const { // eigen code to reduce cross-library dependency. 
VLOG(1) << "ELEMENTWISE: checking op: " << n->type_string(); if (mkl_op_registry::IsMklElementWiseOp( - mkl_op_registry::GetMklOpName(n->type_string()), T) || + mkl_op_registry::GetMklOpName(n->type_string()), T) || n->type_string().find("Identity") != string::npos) { VLOG(1) << "ELEMENTWISE: op is elementwise: " << n->type_string(); bool incoming_mkl_edge = false; int num_parent = 0; for (auto parent : n->in_edges()) { if (mkl_op_registry::IsMklOp(parent->src()->type_string(), T)) { - VLOG(1) << "ELEMENTWISE: parent " << num_parent++ << " is MKL op: " - << parent->src()->type_string(); + VLOG(1) << "ELEMENTWISE: parent " << num_parent++ + << " is MKL op: " << parent->src()->type_string(); incoming_mkl_edge = true; break; } else { - VLOG(1) << "ELEMENTWISE: parent " << num_parent++ << " is NON-MKL op: " - << parent->src()->type_string(); + VLOG(1) << "ELEMENTWISE: parent " << num_parent++ + << " is NON-MKL op: " << parent->src()->type_string(); } } if (incoming_mkl_edge == false) { - VLOG(1) << "ELEMENTWISE: Skipping replacement of elementwise node which has no MKL " + VLOG(1) << "ELEMENTWISE: Skipping replacement of elementwise node which " + "has no MKL " "parents."; return nullptr; } else { - VLOG(1) << "ELEMENTWISE: Replacing elementwise node " << n->type_string() << - " which has MKL parents"; + VLOG(1) << "ELEMENTWISE: Replacing elementwise node " << n->type_string() + << " which has MKL parents"; } } @@ -4214,8 +4208,7 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const { // for this op, then we rewrite it to Mkl op. // Find matching RewriteInfo and then check that rewrite rule applies. for (auto ri = rinfo_.cbegin(); ri != rinfo_.cend(); ++ri) { - if (n->type_string().compare(ri->name) == 0 && - ri->rewrite_rule(n)) { + if (n->type_string().compare(ri->name) == 0 && ri->rewrite_rule(n)) { return &*ri; } } @@ -4297,8 +4290,7 @@ bool RunMklLayoutRewritePass(std::unique_ptr* g) { return MklLayoutRewritePass().RunPass(g); } -Status MklLayoutRewritePass::Run( - const GraphOptimizationPassOptions& options) { +Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) { if (options.graph == nullptr && options.partition_graphs == nullptr) { return Status::OK(); } @@ -4325,7 +4317,7 @@ Status MklLayoutRewritePass::Run( return Status::OK(); } -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML } // namespace tensorflow #endif diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index 75f7ca2d4d7..5e2a465e22c 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -38,7 +38,7 @@ limitations under the License. 
namespace tensorflow { -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML namespace { @@ -125,8 +125,10 @@ REGISTER_OP("InputList").Output("o: N * float").Attr("N: int").SetIsStateful(); REGISTER_OP("HalfInput").Output("o: half").SetIsStateful(); REGISTER_OP("Int32Input").Output("o: int32").SetIsStateful(); REGISTER_OP("_MklInput").Output("o: uint8").SetIsStateful(); -REGISTER_OP("_MklInput2").Output("o: uint8") - .Output("o1: uint8").SetIsStateful(); +REGISTER_OP("_MklInput2") + .Output("o: uint8") + .Output("o1: uint8") + .SetIsStateful(); ///////////////////////////////////////////////////////////////////// // Unit tests related to node merge optiimization @@ -498,7 +500,6 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative2) { "M->I:3;N->D:4;N->G:4;N->I:4;O->D:5;O->G:5;O->I:5"); } - // BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter only TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Positive) { InitGraph( @@ -874,11 +875,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Basic) { " input: ['A', 'B:0', 'B:1']}" "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['C', 'D'] }"); - EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);" - "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;" - "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;" - "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5"); + EXPECT_EQ( + DoMklLayoutOptimizationPass(), + "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);" + "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;" + "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;" + "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5"); } // Concat with 2 Mkl layers feeding it @@ -1273,7 +1275,8 @@ TEST_F(MklLayoutPassTest, MaxPoolLRN_Positive) { "node { name: 'H' op: 'Input'}" "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['H', 'G'] }"); - EXPECT_EQ(DoMklLayoutOptimizationPass(), + EXPECT_EQ( + DoMklLayoutOptimizationPass(), "A(Input);B(_MklLRN);C(_MklMaxPool);D(Input);DMT/_0(Const);DMT/_1(Const);" "DMT/_2(Const);E(_MklMaxPoolGrad);F(Input);G(_MklLRNGrad);H(Input);" "I(Zeta)|A->B;A:control->DMT/_0:control;B->C;B->E;B->G:2;B:1->G:3;" @@ -1640,7 +1643,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_DeviceTest) { " attr { key: 'padding' value { s: 'SAME' } }" " input: ['A', 'B']}" "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['B', 'C'] }", kGPUDevice); + " input: ['B', 'C'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Conv2D);D(Zeta)|A->C;B->C:1;B->D;C->D:1"); } @@ -1666,7 +1670,8 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) { "node { name: 'F' op: 'BiasAddGrad'" " attr { key: 'T' value { type: DT_FLOAT } }" " attr { key: 'data_format' value { s: 'NCHW' } }" - " input: ['E'] }", kGPUDevice); + " input: ['E'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);" "E(Zeta);F(BiasAddGrad);M(_MklInput);N(_MklInput);" @@ -1687,7 +1692,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) { " attr { key: 'padding' value { s: 'SAME' } }" " input: ['A', 'B', 'C']}" "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'D'] }", kGPUDevice); + " input: ['A', 'D'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), 
"A(Input);B(Int32Input);C(Input);D(Conv2DBackpropFilter);E(Zeta)|" "A->D;A->E;B->D:1;C->D:2;D->E:1"); @@ -1700,7 +1706,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Relu_DeviceTest) { " attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A'] }" "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'B'] }", kGPUDevice); + " input: ['A', 'B'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Relu);C(Zeta)|A->B;A->C;B->C:1"); } @@ -1713,7 +1720,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_DeviceTest) { " attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }" "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'C'] }", kGPUDevice); + " input: ['A', 'C'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(ReluGrad);D(Zeta)|A->C;A->D;B->C:1;C->D:1"); } @@ -1729,7 +1737,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_MaxPool_DeviceTest) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " input: ['A'] }" "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'B'] }", kGPUDevice); + " input: ['A', 'B'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1"); } @@ -1745,7 +1754,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_DeviceTest) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " input: ['A'] }" "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'B'] }", kGPUDevice); + " input: ['A', 'B'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(AvgPool);C(Zeta)|A->B;A->C;B->C:1"); } @@ -1766,7 +1776,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_DeviceTest) { " attr { key: 'N' value { i: 2 } }" " input: ['A', 'B:0', 'B:1']}" "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['C', 'D'] }", kGPUDevice); + " input: ['C', 'D'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Const);B(InputList);C(Input);D(Concat);E(Zeta)|A->D;" "B->D:1;B:1->D:2;C->E;D->E:1"); @@ -1788,7 +1799,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_DeviceTest) { " attr { key: 'N' value { i: 2 } }" " input: ['B:0', 'B:1', 'A']}" "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['C', 'D'] }", kGPUDevice); + " input: ['C', 'D'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Const);B(InputList);C(Input);D(ConcatV2);E(Zeta)|" "A->D:2;B->D;B:1->D:1;C->E;D->E:1"); @@ -1808,7 +1820,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_DeviceTest) { " attr { key: 'is_training' value { b: true } }" " input: ['A', 'B', 'C', 'D', 'E'] }" "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'F'] }", kGPUDevice); + " input: ['A', 'F'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(Input);E(Input);" "F(FusedBatchNorm);G(Zeta)|A->F;A->G;B->F:1;C->F:2;D->F:3;" @@ -1837,7 +1850,8 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) { "node { name: 'Y' op: 'Input'}" "node { name: 'Z' op: 'Zeta'" " attr {key: 'T' value { type: DT_FLOAT } }" - " input: ['E', 'Y']}", kGPUDevice); + " input: ['E', 'Y']}", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(_MklConv2D);D(Input);E(BiasAdd);" "M(_MklInput);N(_MklInput);Y(Input);Z(Zeta)|A->C;" @@ -1885,7 +1899,7 @@ 
BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000); } // namespace -#else // INTEL_MKL_DNN +#else // INTEL_MKL_ML namespace { @@ -1972,8 +1986,10 @@ REGISTER_OP("InputList").Output("o: N * float").Attr("N: int").SetIsStateful(); REGISTER_OP("HalfInput").Output("o: half").SetIsStateful(); REGISTER_OP("Int32Input").Output("o: int32").SetIsStateful(); REGISTER_OP("_MklInput").Output("o: uint8").SetIsStateful(); -REGISTER_OP("_MklInput2").Output("o: uint8") - .Output("o1: uint8").SetIsStateful(); +REGISTER_OP("_MklInput2") + .Output("o: uint8") + .Output("o1: uint8") + .SetIsStateful(); ///////////////////////////////////////////////////////////////////// // Unit tests related to node merge optiimization @@ -2492,11 +2508,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Basic) { " input: ['A', 'B:0', 'B:1']}" "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['C', 'D'] }"); - EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);" - "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;" - "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;" - "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5"); + EXPECT_EQ( + DoMklLayoutOptimizationPass(), + "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);" + "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;" + "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;" + "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5"); } // Concat with 2 Mkl layers feeding it @@ -2891,7 +2908,8 @@ TEST_F(MklLayoutPassTest, MaxPoolLRN_Positive) { "node { name: 'H' op: 'Input'}" "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['H', 'G'] }"); - EXPECT_EQ(DoMklLayoutOptimizationPass(), + EXPECT_EQ( + DoMklLayoutOptimizationPass(), "A(Input);B(_MklLRN);C(_MklMaxPool);D(Input);DMT/_0(Const);DMT/_1(Const);" "DMT/_2(Const);E(_MklMaxPoolGrad);F(Input);G(_MklLRNGrad);H(Input);" "I(Zeta)|A->B;A:control->DMT/_0:control;B->C;B->E;B->G:2;B:1->G:3;" @@ -3258,7 +3276,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_DeviceTest) { " attr { key: 'padding' value { s: 'SAME' } }" " input: ['A', 'B']}" "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['B', 'C'] }", kGPUDevice); + " input: ['B', 'C'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Conv2D);D(Zeta)|A->C;B->C:1;B->D;C->D:1"); } @@ -3284,7 +3303,8 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) { "node { name: 'F' op: 'BiasAddGrad'" " attr { key: 'T' value { type: DT_FLOAT } }" " attr { key: 'data_format' value { s: 'NCHW' } }" - " input: ['E'] }", kGPUDevice); + " input: ['E'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);" "E(Zeta);F(BiasAddGrad);M(_MklInput);N(_MklInput);" @@ -3305,7 +3325,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) { " attr { key: 'padding' value { s: 'SAME' } }" " input: ['A', 'B', 'C']}" "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'D'] }", kGPUDevice); + " input: ['A', 'D'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Int32Input);C(Input);D(Conv2DBackpropFilter);E(Zeta)|" "A->D;A->E;B->D:1;C->D:2;D->E:1"); @@ -3318,7 +3339,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Relu_DeviceTest) { " attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A'] }" "node { name: 'C' op: 
'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'B'] }", kGPUDevice); + " input: ['A', 'B'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Relu);C(Zeta)|A->B;A->C;B->C:1"); } @@ -3331,7 +3353,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_DeviceTest) { " attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }" "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'C'] }", kGPUDevice); + " input: ['A', 'C'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(ReluGrad);D(Zeta)|A->C;A->D;B->C:1;C->D:1"); } @@ -3347,7 +3370,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_MaxPool_DeviceTest) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " input: ['A'] }" "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'B'] }", kGPUDevice); + " input: ['A', 'B'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1"); } @@ -3363,7 +3387,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_DeviceTest) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " input: ['A'] }" "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'B'] }", kGPUDevice); + " input: ['A', 'B'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(AvgPool);C(Zeta)|A->B;A->C;B->C:1"); } @@ -3384,7 +3409,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_DeviceTest) { " attr { key: 'N' value { i: 2 } }" " input: ['A', 'B:0', 'B:1']}" "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['C', 'D'] }", kGPUDevice); + " input: ['C', 'D'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Const);B(InputList);C(Input);D(Concat);E(Zeta)|A->D;" "B->D:1;B:1->D:2;C->E;D->E:1"); @@ -3406,7 +3432,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_DeviceTest) { " attr { key: 'N' value { i: 2 } }" " input: ['B:0', 'B:1', 'A']}" "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['C', 'D'] }", kGPUDevice); + " input: ['C', 'D'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Const);B(InputList);C(Input);D(ConcatV2);E(Zeta)|" "A->D:2;B->D;B:1->D:1;C->E;D->E:1"); @@ -3426,7 +3453,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_DeviceTest) { " attr { key: 'is_training' value { b: true } }" " input: ['A', 'B', 'C', 'D', 'E'] }" "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'F'] }", kGPUDevice); + " input: ['A', 'F'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(Input);E(Input);" "F(FusedBatchNorm);G(Zeta)|A->F;A->G;B->F:1;C->F:2;D->F:3;" @@ -3455,7 +3483,8 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) { "node { name: 'Y' op: 'Input'}" "node { name: 'Z' op: 'Zeta'" " attr {key: 'T' value { type: DT_FLOAT } }" - " input: ['E', 'Y']}", kGPUDevice); + " input: ['E', 'Y']}", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(_MklConv2D);D(Input);E(BiasAdd);" "M(_MklInput);N(_MklInput);Y(Input);Z(Zeta)|A->C;" @@ -3503,7 +3532,7 @@ BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000); } // namespace -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML } // namespace tensorflow diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.cc b/tensorflow/core/graph/mkl_tfconversion_pass.cc index 599bb88f015..5343e6802d1 100644 --- 
a/tensorflow/core/graph/mkl_tfconversion_pass.cc +++ b/tensorflow/core/graph/mkl_tfconversion_pass.cc @@ -33,8 +33,8 @@ limitations under the License. #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/graph/mkl_tfconversion_pass.h" #include "tensorflow/core/graph/mkl_graph_util.h" +#include "tensorflow/core/graph/mkl_tfconversion_pass.h" namespace tensorflow { @@ -152,12 +152,12 @@ Status MklToTfConversionPass::InsertConversionNodeOnEdge( string data_format; TF_CHECK_OK(GetNodeAttr(src->def(), "T", &src_datatype)); - bool dst_dtype_found = GetNodeAttr(dst->def(), "T", &dst_datatype) == - Status::OK(); + bool dst_dtype_found = + GetNodeAttr(dst->def(), "T", &dst_datatype) == Status::OK(); // We compare source and destination datatypes only when both are found. if (dst_dtype_found && (src_datatype != dst_datatype)) { - string err_msg = "T attribute of " + src->name() + " and " + - dst->name() + " do not match. Will not insert" + + string err_msg = "T attribute of " + src->name() + " and " + dst->name() + + " do not match. Will not insert" + " MklToTf node in such case."; return Status(error::Code::INVALID_ARGUMENT, err_msg.c_str()); } @@ -325,12 +325,12 @@ bool MklToTfConversionPass::RunPass(std::unique_ptr* g) { // may not be Mkl node. DataType src_datatype; DataType dst_datatype; - bool src_is_mkl_op = (GetNodeAttr(src->def(), "T", &src_datatype) == - Status::OK() && - IsMklSupportedOp(src->type_string(), src_datatype)); - bool dst_is_mkl_op = (GetNodeAttr(dst->def(), "T", &dst_datatype) == - Status::OK() && - IsMklSupportedOp(dst->type_string(), dst_datatype)); + bool src_is_mkl_op = + (GetNodeAttr(src->def(), "T", &src_datatype) == Status::OK() && + IsMklSupportedOp(src->type_string(), src_datatype)); + bool dst_is_mkl_op = + (GetNodeAttr(dst->def(), "T", &dst_datatype) == Status::OK() && + IsMklSupportedOp(dst->type_string(), dst_datatype)); // Check if src with is Mkl-compliant, while dst is not Mkl-compliant. if (src_is_mkl_op && !dst_is_mkl_op) { diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc index 172471e34bc..0d88d1ff723 100644 --- a/tensorflow/core/graph/testlib.cc +++ b/tensorflow/core/graph/testlib.cc @@ -40,7 +40,7 @@ REGISTER_KERNEL_BUILDER( #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER( Name("HostConst").Device(DEVICE_SYCL).HostMemory("output"), HostConstantOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Register the HostConst Op // Returns a constant tensor on the host. Useful for writing C++ tests @@ -273,6 +273,16 @@ Node* Reverse(Graph* g, Node* tensor, Node* axis) { return Binary(g, "ReverseV2", tensor, axis); } +Node* Roll(Graph* g, Node* input, Node* shift, Node* axis) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Roll", g->op_registry()) + .Input(input) + .Input(shift) + .Input(axis) + .Finalize(g, &ret)); + return ret; +} + Node* Error(Graph* g, Node* input, const string& errmsg) { Node* ret; TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Error") diff --git a/tensorflow/core/graph/testlib.h b/tensorflow/core/graph/testlib.h index 06597778bb2..eb9038d619e 100644 --- a/tensorflow/core/graph/testlib.h +++ b/tensorflow/core/graph/testlib.h @@ -117,6 +117,10 @@ Node* RandomGamma(Graph* g, Node* shape, Node* alpha); // Output dtype determined by lam. Node* RandomPoisson(Graph* g, Node* shape, Node* lam); +// Rolls tensor by an offset of along the corresponding +// dimensions. 
+Node* Roll(Graph* g, Node* input, Node* shift, Node* axis); + // Generates random parameters from the truncated standard normal distribution // of the nput shape Node* TruncatedNormal(Graph* g, Node* input, DataType dtype); diff --git a/tensorflow/core/grappler/clusters/cluster.cc b/tensorflow/core/grappler/clusters/cluster.cc index 01a618ed777..39bfca244ed 100644 --- a/tensorflow/core/grappler/clusters/cluster.cc +++ b/tensorflow/core/grappler/clusters/cluster.cc @@ -23,8 +23,7 @@ Cluster::Cluster(int timeout_s) : timeout_s_(timeout_s) { DisableDetailedStats(false); } -Cluster::~Cluster() { -} +Cluster::~Cluster() {} void Cluster::AllowSoftPlacement(bool soft_placement_state) { options_.config.set_allow_soft_placement(soft_placement_state); diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc index 2712c5b6791..862ce4ae888 100644 --- a/tensorflow/core/grappler/clusters/single_machine.cc +++ b/tensorflow/core/grappler/clusters/single_machine.cc @@ -36,10 +36,7 @@ namespace grappler { static std::atomic already_provisioned(false); SingleMachine::SingleMachine(int timeout_s, int num_cpu_cores, int num_gpus) - : Cluster(timeout_s), - num_gpus_(num_gpus), - expected_init_time_s_(0), - closing_(false) { + : Cluster(timeout_s), expected_init_time_s_(0), closing_(false) { VLOG(1) << "Number of CPU cores: " << num_cpu_cores << " Number of GPUs: " << num_gpus; thread_pool_.reset(new thread::ThreadPool( diff --git a/tensorflow/core/grappler/clusters/single_machine.h b/tensorflow/core/grappler/clusters/single_machine.h index a254f72f0c7..90d6a04cab6 100644 --- a/tensorflow/core/grappler/clusters/single_machine.h +++ b/tensorflow/core/grappler/clusters/single_machine.h @@ -64,7 +64,6 @@ class SingleMachine : public Cluster { Status ClearAllocatorStats() const; - const int num_gpus_; std::unique_ptr session_; std::vector queue_runner_defs_; string last_graph_id_; diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc index 1c2c1713834..f2419224711 100644 --- a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc +++ b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc @@ -102,7 +102,7 @@ TEST_F(AnalyticalCostEstimatorTest, SimpleTest) { Costs summary; TF_ASSERT_OK(estimator.PredictCosts(item.graph, &cost_graph, &summary)); - EXPECT_EQ(Costs::NanoSeconds(9150), summary.execution_time); + EXPECT_EQ(Costs::NanoSeconds(9151), summary.execution_time); // Make this estimate accurate: // TODO(http://b/70031255): Accurate estimator for RandomUniform op needed diff --git a/tensorflow/core/grappler/costs/cost_estimator.h b/tensorflow/core/grappler/costs/cost_estimator.h index b7eaf8dc637..9e01ec5ff5b 100644 --- a/tensorflow/core/grappler/costs/cost_estimator.h +++ b/tensorflow/core/grappler/costs/cost_estimator.h @@ -78,6 +78,9 @@ struct Costs { MilliSeconds asMilliSeconds() const { return std::chrono::duration_cast(*this); } + static NanoSeconds infinity() { + return NanoSeconds(std::chrono::nanoseconds::max()); + } }; // We store all our times in nanoseconds. If needs be, we can always switch to @@ -97,6 +100,8 @@ struct Costs { // requirements of a graph. For example, it might assume that all activations // are live for all of a graph's execution. int64 max_memory; // Maximum main memory requirement in bytes over all ops. + int64 persistent_memory; + int64 temporary_memory; // These fields are used for TPU-related estimations. 
They are per-op // maximums, so each op is evaluated independently, but we want the maximum of @@ -129,6 +134,8 @@ Costs::Costs() { compute_time = Duration::zero(); memory_time = Duration::zero(); max_memory = kMemoryUnknown; + persistent_memory = kMemoryUnknown; + temporary_memory = kMemoryUnknown; max_per_op_buffers = kMemoryUnknown; max_per_op_streaming = kMemoryUnknown; } @@ -139,6 +146,8 @@ Costs Costs::ZeroCosts() { costs.compute_time = Duration::zero(); costs.memory_time = Duration::zero(); costs.max_memory = kZeroMemory; + costs.persistent_memory = kZeroMemory; + costs.temporary_memory = kZeroMemory; costs.max_per_op_buffers = kZeroMemory; costs.max_per_op_streaming = kZeroMemory; return costs; diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc index 6bc136a3f89..5600267f6a2 100644 --- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc +++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc @@ -47,6 +47,8 @@ constexpr char kSize[] = "Size"; constexpr char kStopGradient[] = "StopGradient"; constexpr char kPreventGradient[] = "PreventGradient"; +static const Costs::Duration kMinComputeTime(1); + namespace { string GetDataFormat(const OpInfo& op_features) { @@ -163,18 +165,20 @@ OpLevelCostEstimator::OpLevelCostEstimator() { {kSparseMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)}, {kBatchMatMul, wrap(&OpLevelCostEstimator::PredictBatchMatMul)}, - {kPlaceholder, wrap(&OpLevelCostEstimator::PredictNoOp)}, - {kIdentity, wrap(&OpLevelCostEstimator::PredictNoOp)}, - {kRefIdentity, wrap(&OpLevelCostEstimator::PredictNoOp)}, - {kStopGradient, wrap(&OpLevelCostEstimator::PredictNoOp)}, - {kPreventGradient, wrap(&OpLevelCostEstimator::PredictNoOp)}, {kNoOp, wrap(&OpLevelCostEstimator::PredictNoOp)}, - {kReshape, wrap(&OpLevelCostEstimator::PredictNoOp)}, - {kRecv, wrap(&OpLevelCostEstimator::PredictNoOp)}, - {kSend, wrap(&OpLevelCostEstimator::PredictNoOp)}, - {kConst, wrap(&OpLevelCostEstimator::PredictNoOp)}, - {kVariable, wrap(&OpLevelCostEstimator::PredictNoOp)}, - {kVariableV2, wrap(&OpLevelCostEstimator::PredictNoOp)}, + + {kPlaceholder, wrap(&OpLevelCostEstimator::PredictIdentity)}, + {kIdentity, wrap(&OpLevelCostEstimator::PredictIdentity)}, + {kRefIdentity, wrap(&OpLevelCostEstimator::PredictIdentity)}, + {kStopGradient, wrap(&OpLevelCostEstimator::PredictIdentity)}, + {kPreventGradient, wrap(&OpLevelCostEstimator::PredictIdentity)}, + {kReshape, wrap(&OpLevelCostEstimator::PredictIdentity)}, + {kRecv, wrap(&OpLevelCostEstimator::PredictIdentity)}, + {kSend, wrap(&OpLevelCostEstimator::PredictIdentity)}, + + {kConst, wrap(&OpLevelCostEstimator::PredictVariable)}, + {kVariable, wrap(&OpLevelCostEstimator::PredictVariable)}, + {kVariableV2, wrap(&OpLevelCostEstimator::PredictVariable)}, {kRank, wrap(&OpLevelCostEstimator::PredictMetadata)}, {kShape, wrap(&OpLevelCostEstimator::PredictMetadata)}, @@ -349,6 +353,9 @@ OpLevelCostEstimator::DeviceInfo OpLevelCostEstimator::GetDeviceInfo( VLOG(1) << "Device: " << device.type() << " gflops: " << gflops << " gb_per_sec: " << gb_per_sec; + DCHECK_LT(0, gflops) << device.DebugString(); + DCHECK_LT(0, gb_per_sec) << device.DebugString(); + return {gflops, gb_per_sec}; } @@ -404,6 +411,7 @@ Costs OpLevelCostEstimator::PredictCostOfAnUnknownOp( Costs OpLevelCostEstimator::PredictOpCountBasedCost( double operations, const OpInfo& op_features) const { DeviceInfo device_perf = GetDeviceInfo(op_features.device()); + Costs::NanoSeconds 
compute_cost(std::ceil(operations / device_perf.gigaops)); VLOG(1) << "Op:" << op_features.op() << " GOps:" << operations / 1e9 << " Execution Time (ns):" << compute_cost.count(); @@ -429,6 +437,7 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost( costs.execution_time = compute_cost + memory_cost; } costs.inaccurate = found_unknown_shapes; + costs.max_memory = total_output_size; return costs; } @@ -885,6 +894,30 @@ Costs OpLevelCostEstimator::PredictNoOp(const OpContext& op_context) const { return Costs::ZeroCosts(); } +Costs OpLevelCostEstimator::PredictIdentity(const OpContext& op_context) const { + const auto& op_features = op_context.op_info; + VLOG(1) << "Op:" << op_features.op() << " Execution Time 0 (ns)"; + Costs result = Costs::ZeroCosts(); + result.max_memory = CalculateOutputSize(op_features, &result.inaccurate); + // Assign the minimum amount of time we can represent to the identity op since + // it tends to be really cheap. + result.compute_time = kMinComputeTime; + result.execution_time = result.compute_time; + return result; +} + +Costs OpLevelCostEstimator::PredictVariable(const OpContext& op_context) const { + const auto& op_features = op_context.op_info; + VLOG(1) << "Op:" << op_features.op() << " Execution Time 0 (ns)"; + Costs result = Costs::ZeroCosts(); + result.persistent_memory = + CalculateOutputSize(op_features, &result.inaccurate); + + result.compute_time = kMinComputeTime; + result.execution_time = result.execution_time; + return result; +} + Costs OpLevelCostEstimator::PredictBatchMatMul( const OpContext& op_context) const { const auto& op_features = op_context.op_info; @@ -898,13 +931,12 @@ Costs OpLevelCostEstimator::PredictBatchMatMul( Costs OpLevelCostEstimator::PredictMetadata(const OpContext& op_context) const { const auto& op_features = op_context.op_info; - Costs costs; + Costs costs = Costs::ZeroCosts(); costs.max_memory = CalculateOutputSize(op_features, &costs.inaccurate); // Metadata operations are so cheap we assume they take the minimum amount of // time we can represent (1 ns). - costs.execution_time = 1; - costs.compute_time = 1; - costs.memory_time = 0; + costs.compute_time = kMinComputeTime; + costs.execution_time = costs.compute_time; return costs; } diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h index 5f541ccf04d..a292e5e97fe 100644 --- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h +++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h @@ -132,6 +132,8 @@ class OpLevelCostEstimator { Costs PredictConv2DBackpropFilter(const OpContext& op_context) const; Costs PredictMatMul(const OpContext& op_context) const; Costs PredictNoOp(const OpContext& op_context) const; + Costs PredictIdentity(const OpContext& op_context) const; + Costs PredictVariable(const OpContext& op_context) const; Costs PredictBatchMatMul(const OpContext& op_context) const; Costs PredictMetadata(const OpContext& op_context) const; diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc index d7d07ee7a55..020492a3e9e 100644 --- a/tensorflow/core/grappler/costs/virtual_scheduler.cc +++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc @@ -323,8 +323,13 @@ Status VirtualScheduler::Init() { } // Get the nodes that would run to output fetch_nodes. 
+ bool ill_formed = false; std::vector nodes = - ComputeTransitiveFanin(graph, fetch_nodes); + ComputeTransitiveFanin(graph, fetch_nodes, &ill_formed); + if (ill_formed) { + return errors::InvalidArgument( + "Ill formed graph or invalid set of fetch nodes specified"); + } // TODO(dyoon): this is a bit inefficient as name_to_node is already built in // ComputeTransitiveFanin(). diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h index 9db6d462667..5116c8183cb 100644 --- a/tensorflow/core/grappler/costs/virtual_scheduler.h +++ b/tensorflow/core/grappler/costs/virtual_scheduler.h @@ -325,7 +325,7 @@ class VirtualScheduler { // Boolean field for whether the cost is accurate. std::map> op_costs_; - Costs graph_costs_; // Graph cost. + Costs graph_costs_; // Graph cost. std::map op_to_cost_; // Per-op cost. // Auxilliary data structures for constructing NodeState and DeviceState. diff --git a/tensorflow/core/grappler/graph_view.h b/tensorflow/core/grappler/graph_view.h index f4e2de75a60..173ce9c09c2 100644 --- a/tensorflow/core/grappler/graph_view.h +++ b/tensorflow/core/grappler/graph_view.h @@ -46,6 +46,7 @@ class GraphView { }; explicit GraphView(GraphDef* graph); + GraphDef* GetGraph() const { return graph_; } NodeDef* GetNode(const string& node_name) const; // Get the specified input port. Note that the special '-1' port_id can be // used to access the controlling nodes (i.e. the nodes connected to node_name diff --git a/tensorflow/core/grappler/inputs/file_input_yielder.h b/tensorflow/core/grappler/inputs/file_input_yielder.h index a17e1c9ff2a..b5973192610 100644 --- a/tensorflow/core/grappler/inputs/file_input_yielder.h +++ b/tensorflow/core/grappler/inputs/file_input_yielder.h @@ -18,8 +18,8 @@ limitations under the License. // that may be stored in the checkpoint are not restored in order to speedup the // initialization. -#ifndef LEARNING_BRAIN_EXPERIMENTAL_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_ -#define LEARNING_BRAIN_EXPERIMENTAL_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_ +#ifndef TENSORFLOW_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_ +#define TENSORFLOW_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_ #include #include @@ -53,4 +53,4 @@ class FileInputYielder : public InputYielder { } // end namespace grappler } // end namespace tensorflow -#endif // LEARNING_BRAIN_EXPERIMENTAL_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_ +#endif // TENSORFLOW_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_ diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index 791ad34bbed..8b9885e4c16 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -285,9 +285,11 @@ cc_library( "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler:op_types", "//tensorflow/core/grappler:utils", + "//tensorflow/core/grappler/clusters:virtual_cluster", "//tensorflow/core/grappler/costs:graph_memory", "//tensorflow/core/grappler/costs:graph_properties", "//tensorflow/core/grappler/utils:topological_sort", + "//tensorflow/core/grappler/utils:traversal", ], ) diff --git a/tensorflow/core/grappler/optimizers/auto_parallel.h b/tensorflow/core/grappler/optimizers/auto_parallel.h index c5d2d47782f..8d1098d8775 100644 --- a/tensorflow/core/grappler/optimizers/auto_parallel.h +++ b/tensorflow/core/grappler/optimizers/auto_parallel.h @@ -16,8 +16,8 @@ limitations under the License. 
#ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_ #define TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_ -#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" #include "tensorflow/core/framework/variable.pb.h" +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc index d2da125236a..7b4ca1496cc 100644 --- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc @@ -36,20 +36,20 @@ namespace grappler { namespace { -int RemoveInput(NodeDef* node, const string& input, NodeMap* node_map) { - int num_removed = 0; +bool RemoveInput(NodeDef* node, const string& input, NodeMap* node_map) { + bool removed_input = false; int pos = 0; while (pos < node->input_size()) { if (node->input(pos) == input) { node->mutable_input()->SwapElements(pos, node->input_size() - 1); node->mutable_input()->RemoveLast(); node_map->RemoveOutput(NodeName(input), node->name()); + removed_input = true; } else { ++pos; } - ++num_removed; } - return num_removed; + return removed_input; } // Remove duplicate control inputs. @@ -71,6 +71,48 @@ void PruneControlInputs(NodeDef* node) { } // namespace +bool DependencyOptimizer::SafeToRemoveIdentity(const NodeDef& node) { + if (!IsIdentity(node)) { + return true; + } + if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) { + return false; + } + if (!fetch_nodes_known_) { + // The output values of this node may be needed. + return false; + } + const NodeDef* input = node_map_->GetNode(NodeName(node.input(0))); + CHECK(input != nullptr) << "node = " << node.name() + << " input = " << node.input(0); + // Don't remove Identity nodes corresponding to Variable reads or following + // Recv. + if (IsVariable(*input) || IsRecv(*input)) { + return false; + } else if (IsSwitch(*input)) { + // Don't turn Identity nodes following Switch into NoOp or remove them + // if it requires anchoring a control dependencies the Switch node, which + // is not valid. + if (StringPiece(node.name()).starts_with(kConstantFoldingCtrl)) { + // TODO(rmlarsen): Try to remove this artificial contraint. + return false; + } + } + for (auto consumer : node_map_->GetOutputs(node.name())) { + if (node.input_size() > 1 && IsMerge(*consumer)) { + return false; + } + if (IsSwitch(*input)) { + for (const string& consumer_input : consumer->input()) { + if (consumer_input == AsControlDependency(node.name())) { + return false; + } + } + } + } + return true; +} + bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) { if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) { return false; @@ -100,18 +142,8 @@ bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) { return false; } - // Don't turn Identity nodes inserted by Grappler after Switch into NoOp, - // since we cannot anchor control dependencies on Switch nodes. - // Don't remove Identity nodes corresponding to Variable reads. 
- if (IsIdentity(node)) { - const NodeDef* input = node_map_->GetNode(NodeName(node.input(0))); - if (input != nullptr) { - if (IsVariable(*input) || - (StringPiece(node.name()).starts_with(kConstantFoldingCtrl) && - IsSwitch(*input))) { - return false; - } - } + if (!SafeToRemoveIdentity(node)) { + return false; } const std::unordered_set do_not_rewrite_ops{ @@ -124,19 +156,22 @@ bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) { void DependencyOptimizer::OptimizeNode(int node_idx, SetVector* nodes_to_simplify, std::set* nodes_to_delete) { + const bool is_aggressive = opt_level_ == RewriterConfig::AGGRESSIVE; NodeDef* node = optimized_graph_->mutable_node(node_idx); - + const bool is_noop = IsNoOp(*node); + const bool is_identity = IsIdentity(*node); + const string node_name = node->name(); // Constant nodes with no input control dependency are always executed early, // so we can prune all their output control dependencies. if (IsConstant(*node) && node->input_size() == 0) { - const std::set output_nodes = node_map_->GetOutputs(node->name()); + const std::set output_nodes = node_map_->GetOutputs(node_name); for (NodeDef* fanout : output_nodes) { bool optimize_fanout = false; bool data_connection = false; for (int i = fanout->input_size() - 1; i >= 0; --i) { int pos; string input_name = ParseNodeName(fanout->input(i), &pos); - if (input_name == node->name()) { + if (input_name == node_name) { if (pos < 0) { fanout->mutable_input()->SwapElements(i, fanout->input_size() - 1); fanout->mutable_input()->RemoveLast(); @@ -149,22 +184,21 @@ void DependencyOptimizer::OptimizeNode(int node_idx, if (optimize_fanout) { nodes_to_simplify->PushBack(node_to_idx_[fanout]); if (!data_connection) { - node_map_->RemoveOutput(node->name(), fanout->name()); + node_map_->RemoveOutput(node_name, fanout->name()); } } } - if (node_map_->GetOutputs(node->name()).empty() && fetch_nodes_known_ && - nodes_to_preserve_.find(node->name()) == nodes_to_preserve_.end()) { + if (node_map_->GetOutputs(node_name).empty() && fetch_nodes_known_ && + nodes_to_preserve_.find(node_name) == nodes_to_preserve_.end()) { // Mark the node for deletion. nodes_to_delete->insert(node_to_idx_[node]); } - return; } // Change ops that only have control dependencies as outputs to NoOps. - if (node->op() != "NoOp" && SafeToConvertToNoOp(*node)) { - VLOG(1) << "***** Replacing " << node->name() << " (" << node->op() + if (!is_noop && SafeToConvertToNoOp(*node)) { + VLOG(1) << "***** Replacing " << node_name << " (" << node->op() << ") with NoOp."; // The outputs of this node are not consumed. Replace its inputs with // control dependencies and replace the op itself with the NoOp op. @@ -186,7 +220,7 @@ void DependencyOptimizer::OptimizeNode(int node_idx, old_input, optimized_graph_, node_map_.get()); if (ctrl_inputs.insert(ctrl_input).second) { node->set_input(pos, ctrl_input); - node_map_->UpdateInput(node->name(), old_input, ctrl_input); + node_map_->UpdateInput(node_name, old_input, ctrl_input); const NodeDef* old_input_node = node_map_->GetNode(old_input); nodes_to_simplify->PushBack(node_to_idx_[old_input_node]); } @@ -194,6 +228,8 @@ void DependencyOptimizer::OptimizeNode(int node_idx, } node->set_op("NoOp"); node->clear_attr(); + nodes_to_simplify->PushBack(node_to_idx_[node]); + return; } // Remove NoOp nodes if the product of their fan-in and fan-out is less than @@ -222,9 +258,30 @@ void DependencyOptimizer::OptimizeNode(int node_idx, // a and x, respectively, are on the same device. 
Control edges across device // boundaries require inter-device communication (Send/Recv pairs to be // inserted in the graph), which is very costly. + // + // We also remove identity nodes, subject to the same constraints on number of + // resulting control edges and device boundary crossings: + // + // Case a) + // +----------+ ---> a +---+ ---> a + // x --> | Identity | --^> b ==> | x | --^> b + // | | ... | | ... + // +----------+ --^> c +---+ --^> c + // + // Case b) + // x ---> +----------+ ---> a x ---> +---+ + // y --^> | Identity | ==> y --^> | a | + // ... | | ... | | + // z --^> +----------+ z --^> +---+ + // + // Case c) + // +----------+ x ---> +---+ + // x ---> | Identity | ---> a ==> \--^> | a | + // y --^> | | --^> b /\ +---+ + // +----------+ y --^> b - if (node->op() == "NoOp") { - const auto& output_node_set = node_map_->GetOutputs(node->name()); + if (is_noop || (is_identity && is_aggressive)) { + const auto& output_node_set = node_map_->GetOutputs(node_name); const std::vector output_nodes(output_node_set.begin(), output_node_set.end()); const int num_outputs = output_nodes.size(); @@ -233,15 +290,14 @@ void DependencyOptimizer::OptimizeNode(int node_idx, if (num_inputs * num_outputs > num_inputs + num_outputs) { return; } - VLOG(1) << "***** Rerouting input around " << node->name(); std::vector input_nodes; for (int i = 0; i < num_inputs; ++i) { - NodeDef* tmp = node_map_->GetNode(node->input(i)); - CHECK_NE(tmp, nullptr); - input_nodes.push_back(tmp); + NodeDef* input_node = node_map_->GetNode(node->input(i)); + CHECK_NE(input_node, nullptr); + input_nodes.push_back(input_node); } - // Make sure that we don't increase the number of control edges that cross + // Make sure that we don't increase the number of edges that cross // device boundaries. if ((num_inputs == 1 && num_outputs > 1 && input_nodes[0]->device() != node->device()) || @@ -266,40 +322,75 @@ void DependencyOptimizer::OptimizeNode(int node_idx, if (num_cross_after > num_cross_before) { return; } + // To avoid potentially removing Identity nodes following _Recv nodes, + // we require that no device crossings occur in that case. + // TODO(rmlarsen): See if we can relax this condition. + if (is_identity && (num_cross_after > 0 || num_cross_before > 0)) { + return; + } } + if (is_identity && !SafeToRemoveIdentity(*node)) { + return; + } + + VLOG(1) << "***** Rerouting input around\n" << node->DebugString(); + // Now remove the node and re-wire its inputs to its outputs. for (auto consumer : output_nodes) { bool updated_consumer = false; - VLOG(1) << "***** Considering consumer " << consumer->name() << "\n" - << consumer->DebugString(); + VLOG(1) << "consumer before:\n" << consumer->DebugString(); for (int i = 0; i < num_inputs; ++i) { const NodeDef* input = input_nodes[i]; // Forward dependency from input to consumer if it doesn't already // depend on it. - if (node_map_->GetOutputs(input->name()).count(consumer) == 0) { - consumer->add_input(AsControlDependency(input->name())); + if (is_identity && i == 0) { + // Replace regular input from Identity node. 
+ bool found_input = false; + string new_input; + const string& input_to_forward = node->input(0); + CHECK(!IsControlInput(input_to_forward)); + for (int j = 0; j < consumer->input_size(); ++j) { + const string& old_input = consumer->input(j); + if (old_input == node_name) { + new_input = input_to_forward; + node_map_->UpdateInput(consumer->name(), old_input, new_input); + consumer->set_input(j, new_input); + found_input = true; + } else if (old_input == AsControlDependency(NodeName(node_name))) { + new_input = AsControlDependency(NodeName(input_to_forward)); + node_map_->UpdateInput(consumer->name(), old_input, new_input); + consumer->set_input(j, new_input); + found_input = true; + } + } + CHECK(found_input); updated_consumer = true; - node_map_->AddOutput(input->name(), consumer->name()); - nodes_to_simplify->PushBack(node_to_idx_[input]); + } else { + // Forward dependency from input to consumer if it doesn't already + // depend on it. + if (node_map_->GetOutputs(input->name()).count(consumer) == 0) { + consumer->add_input(AsControlDependency(input->name())); + node_map_->AddOutput(input->name(), consumer->name()); + nodes_to_simplify->PushBack(node_to_idx_[input]); + updated_consumer = true; + } } } // Remove dependency on node from consumer. - updated_consumer |= RemoveInput( - consumer, AsControlDependency(node->name()), node_map_.get()); + updated_consumer |= RemoveInput(consumer, AsControlDependency(node_name), + node_map_.get()); if (updated_consumer) { - VLOG(1) << "***** Updated consumer " << consumer->name() << " (" - << consumer->op() << ")"; nodes_to_simplify->PushBack(node_to_idx_[consumer]); } + VLOG(1) << "consumer after:\n" << consumer->DebugString(); } - - node_map_->RemoveOutputs(node->name()); + node_map_->RemoveOutputs(node_name); if (fetch_nodes_known_ && - nodes_to_preserve_.find(node->name()) == nodes_to_preserve_.end()) { + nodes_to_preserve_.find(node_name) == nodes_to_preserve_.end()) { // Mark the node for deletion. nodes_to_delete->insert(node_idx); - // Unconnect the node from its inputs to enable further optimizations. - node_map_->RemoveInputs(node->name()); + // Disconnect the node from its inputs to enable further optimizations. + node_map_->RemoveInputs(node_name); node->clear_input(); } } @@ -330,13 +421,18 @@ Status DependencyOptimizer::OptimizeDependencies() { std::set nodes_to_delete; for (int i = 0; i < optimized_graph_->node_size(); ++i) { const NodeDef& node = optimized_graph_->node(i); - if (node.op() == "NoOp" || IsConstant(node) || SafeToConvertToNoOp(node)) { + if (IsNoOp(node) || IsIdentity(node) || IsConstant(node) || + SafeToConvertToNoOp(node)) { nodes_to_simplify.PushBack(i); } } while (!nodes_to_simplify.Empty()) { - OptimizeNode(nodes_to_simplify.PopBack(), &nodes_to_simplify, - &nodes_to_delete); + int node_to_simplify = nodes_to_simplify.PopBack(); + // Discard nodes that were marked for deletion already. 
+ while (nodes_to_delete.find(node_to_simplify) != nodes_to_delete.end()) { + node_to_simplify = nodes_to_simplify.PopBack(); + } + OptimizeNode(node_to_simplify, &nodes_to_simplify, &nodes_to_delete); } if (fetch_nodes_known_) { @@ -431,9 +527,10 @@ Status DependencyOptimizer::TransitiveReduction() { if (longest_distance[target] > 1) { const int input_slot = control_output.second; control_edges_to_remove[target].emplace(input_slot, source); - VLOG(1) << "Removing edge from:\n" - << optimized_graph_->node(source).DebugString() << "\n\nto:\n\n" - << optimized_graph_->node(target).DebugString(); + // VLOG(1) << "Removing edge from:\n" + // << optimized_graph_->node(source).DebugString() << + // "\n\nto:\n\n" + // << optimized_graph_->node(target).DebugString(); } } } @@ -473,8 +570,8 @@ Status DependencyOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, *optimized_graph_ = item.graph; nodes_to_preserve_ = item.NodesToPreserve(); fetch_nodes_known_ = !item.fetch.empty(); - CleanControlInputs(); + const int num_iterations = 2; for (int iteration = 0; iteration < num_iterations; ++iteration) { Status topo_sort_status; @@ -491,9 +588,12 @@ Status DependencyOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, } else { LOG(ERROR) << topo_sort_status.error_message(); } - - // Turn nodes with only control outputs into NoOps, prune NoOps. + // Turn nodes with only control outputs into NoOps, prune NoOp and Identity + // nodes. TF_RETURN_IF_ERROR(OptimizeDependencies()); + + // Dedup control inputs. + CleanControlInputs(); } return Status::OK(); diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.h b/tensorflow/core/grappler/optimizers/dependency_optimizer.h index 02d8a0f32a9..0f47528a043 100644 --- a/tensorflow/core/grappler/optimizers/dependency_optimizer.h +++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.h @@ -43,6 +43,9 @@ class DependencyOptimizer : public GraphOptimizer { const GraphDef& optimized_graph, double result) override; private: + // Returns true if node is not an Identity node or if it is an Identity + // that is safe to remove. + bool SafeToRemoveIdentity(const NodeDef& node); // Returns true if it is safe to convert node to NoOp. bool SafeToConvertToNoOp(const NodeDef& node); // Removes all duplicate control dependencies. 
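As a minimal illustration of the identity-removal rewrite sketched in the comments above (cases a through c), the following standalone C++ sketch shows the core input-forwarding step: a consumer's data input that named the Identity node is replaced by the Identity's own data input, and a control input on the Identity becomes a control input on the forwarded node. The helper names here are hypothetical and independent of the NodeDef/NodeMap API the patch actually uses; the real OptimizeNode additionally guards the rewrite with SafeToRemoveIdentity and the device-crossing checks shown in the diff.

#include <iostream>
#include <string>
#include <vector>

// Mirror of AsControlDependency: control inputs are spelled "^name".
std::string AsControlDependency(const std::string& name) {
  return "^" + name;
}

// Rewrites one consumer's input list after bypassing an Identity node:
// a plain input "id" becomes "x" and a control input "^id" becomes "^x",
// where x is the Identity node's forwarded data input.
void ForwardIdentityInput(const std::string& identity_name,
                          const std::string& forwarded_input,
                          std::vector<std::string>* consumer_inputs) {
  for (std::string& input : *consumer_inputs) {
    if (input == identity_name) {
      input = forwarded_input;                       // data edge (case a)
    } else if (input == AsControlDependency(identity_name)) {
      input = AsControlDependency(forwarded_input);  // control edge
    }
  }
}

int main() {
  // Consumer a_c from the RemoveIdentity test below: data input from z,
  // control input on id_a, which itself reads x.
  std::vector<std::string> inputs = {"z", "^id_a"};
  ForwardIdentityInput("id_a", "x", &inputs);
  for (const std::string& input : inputs) std::cout << input << " ";
  std::cout << "\n";  // prints: z ^x
  return 0;
}

This mirrors the expectation in the new RemoveIdentity test, where a_c ends up with inputs "z" and "^x" after id_a is removed; the sketch omits the bookkeeping (node_map_ updates, nodes_to_simplify) that the optimizer performs alongside the rewrite.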
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc index f5027a4a99e..b8facb9deae 100644 --- a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc +++ b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc @@ -167,14 +167,16 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop_SwitchIdentity) { ops::Const(scope.WithOpName("c2").WithControlDependencies(ctrl_dep_id), {1.0f, 2.0f}, {1, 2}); Output neg1 = ops::Neg(scope.WithOpName("neg1"), s.output_false); + Output neg2 = ops::Neg(scope.WithOpName("neg2"), ctrl_dep_id); GrapplerItem item; TF_CHECK_OK(scope.ToGraphDef(&item.graph)); item.fetch.push_back("c1"); item.fetch.push_back("c2"); item.fetch.push_back("neg1"); + item.fetch.push_back("neg2"); - DependencyOptimizer optimizer; + DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE); GraphDef output; Status status = optimizer.Optimize(nullptr, item, &output); TF_EXPECT_OK(status); @@ -323,25 +325,148 @@ TEST_F(DependencyOptimizerTest, RemoveNoOps_SingleInputOrOutput) { } } +TEST_F(DependencyOptimizerTest, RemoveIdentity) { + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + Output x = ops::RandomUniform(s.WithOpName("x"), {1, 2}, DT_FLOAT); + Output y = ops::RandomUniform(s.WithOpName("y"), {1, 2}, DT_FLOAT); + Output z = ops::RandomUniform(s.WithOpName("z"), {1, 2}, DT_FLOAT); + + // Identity nodes to be removed. + // Case a) with a single input- and multiple outputs. + auto id_a = ops::Identity(s.WithOpName("id_a"), x); + // Case b) with multiple inputs and a single output. + auto id_b = ops::Identity( + s.WithOpName("id_b").WithControlDependencies(y).WithControlDependencies( + z), + x); + // Case c) with two inputs and two outputs. + auto id_c = ops::Identity(s.WithOpName("id_c").WithControlDependencies(y), x); + + // Output for Case a. + Output a_a = ops::Identity(s.WithOpName("a_a"), id_a); + Output a_b = ops::Identity(s.WithOpName("a_b"), id_a); + Output a_c = + ops::Identity(s.WithOpName("a_c").WithControlDependencies(id_a), z); + Output a_d = + ops::Identity(s.WithOpName("a_d").WithControlDependencies(id_a), z); + // Output for Case b. + Output b_a = ops::Identity(s.WithOpName("b_a"), id_b); + // Output for Case c. 
+ Output c_a = ops::Identity(s.WithOpName("c_a"), id_c); + Output c_b = + ops::Identity(s.WithOpName("c_b").WithControlDependencies(id_c), z); + + GrapplerItem item; + TF_CHECK_OK(s.ToGraphDef(&item.graph)); + item.fetch = {"a_a", "a_b", "a_c", "a_d", "b_a", "c_a", "c_b"}; + + DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE); + GraphDef output; + Status status = optimizer.Optimize(nullptr, item, &output); + TF_EXPECT_OK(status); + + EXPECT_EQ(item.graph.node_size() - 3, output.node_size()); + for (const NodeDef& node : output.node()) { + EXPECT_NE("id_a", node.name()); + EXPECT_NE("id_b", node.name()); + EXPECT_NE("id_c", node.name()); + if (node.name() == "a_a" || node.name() == "a_b") { + EXPECT_EQ(1, node.input_size()); + EXPECT_EQ("x", node.input(0)); + } + if (node.name() == "a_c" || node.name() == "a_d") { + EXPECT_EQ(2, node.input_size()); + EXPECT_EQ("z", node.input(0)); + EXPECT_EQ("^x", node.input(1)); + } + if (node.name() == "b_a") { + EXPECT_EQ(3, node.input_size()); + EXPECT_EQ("x", node.input(0)); + EXPECT_EQ("^y", node.input(1)); + EXPECT_EQ("^z", node.input(2)); + } + if (node.name() == "c_a") { + EXPECT_EQ(2, node.input_size()); + EXPECT_EQ("x", node.input(0)); + EXPECT_EQ("^y", node.input(1)); + } + if (node.name() == "c_b") { + EXPECT_EQ(3, node.input_size()); + EXPECT_EQ("z", node.input(0)); + EXPECT_EQ("^x", node.input(1)); + EXPECT_EQ("^y", node.input(2)); + } + } +} + +TEST_F(DependencyOptimizerTest, RemoveIdentity_RepeatedInputs) { + // Corner cases with repeated inputs. + tensorflow::Scope scope = tensorflow::Scope::NewRootScope(); + ops::Variable x(scope.WithOpName("x"), {}, DT_BOOL); + ops::Variable y(scope.WithOpName("y"), {}, DT_BOOL); + ops::Switch sw(scope.WithOpName("switch"), x, x); + // id0 should be removed. + Output id0 = ops::Identity(scope.WithOpName("id0"), sw.output_true); + // id1 should not be removed, since it would anchor a control dependency + // on the switch. + Output id1 = ops::Identity(scope.WithOpName("id1"), sw.output_false); + Output or0 = ops::LogicalOr(scope.WithOpName("or0"), id0, id0); + Output or1 = ops::LogicalOr(scope.WithOpName("or1"), id0, y); + Output or2 = ops::LogicalOr( + scope.WithOpName("or2").WithControlDependencies(id1), y, y); + + GrapplerItem item; + TF_CHECK_OK(scope.ToGraphDef(&item.graph)); + item.fetch.push_back("or0"); + item.fetch.push_back("or1"); + item.fetch.push_back("or2"); + DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE); + GraphDef output; + Status status = optimizer.Optimize(nullptr, item, &output); + TF_EXPECT_OK(status); + + EXPECT_EQ(item.graph.node_size() - 1, output.node_size()); + for (const NodeDef& node : output.node()) { + EXPECT_NE("id0", node.name()); + if (node.name() == "or0") { + EXPECT_EQ(2, node.input_size()); + EXPECT_EQ("switch:1", node.input(0)); + EXPECT_EQ("switch:1", node.input(1)); + } + if (node.name() == "or1") { + EXPECT_EQ(2, node.input_size()); + EXPECT_EQ("switch:1", node.input(0)); + EXPECT_EQ("y", node.input(1)); + } + if (node.name() == "or2") { + // or1 should be unchanged. 
+ EXPECT_EQ(3, node.input_size()); + EXPECT_EQ("y", node.input(0)); + EXPECT_EQ("y", node.input(1)); + EXPECT_EQ("^id1", node.input(2)); + } + } +} + TEST_F(DependencyOptimizerTest, Transitive_Reduction_Simple) { tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output c = ops::Const(s.WithOpName("c"), {1.0f, 2.0f}, {1, 2}); Output x = ops::Square(s.WithOpName("x"), c); - Output id1 = ops::Identity(s.WithOpName("id1"), x); - Output id2 = - ops::Identity(s.WithOpName("id2").WithControlDependencies({x}), id1); + Output neg1 = ops::Neg(s.WithOpName("neg1"), x); + Output neg2 = + ops::Neg(s.WithOpName("neg2").WithControlDependencies({x}), neg1); GrapplerItem item; TF_CHECK_OK(s.ToGraphDef(&item.graph)); - item.fetch.push_back("id2"); - DependencyOptimizer optimizer; + item.fetch.push_back("neg2"); + DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE); GraphDef output; Status status = optimizer.Optimize(nullptr, item, &output); TF_EXPECT_OK(status); EXPECT_EQ(4, output.node_size()); - EXPECT_EQ("id2", output.node(3).name()); + EXPECT_EQ("neg2", output.node(3).name()); EXPECT_EQ(1, output.node(3).input_size()); - EXPECT_EQ("id1", output.node(3).input(0)); + EXPECT_EQ("neg1", output.node(3).input(0)); } TEST_F(DependencyOptimizerTest, ChangeToNoop_Identity) { @@ -356,20 +481,21 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop_Identity) { Output grappler_added_id = ops::Identity( scope.WithOpName("ConstantFoldingCtrl/switch_1"), s.output_true); Output c1 = ops::Const(scope.WithOpName("c1") - .WithControlDependencies(id0) .WithControlDependencies(id_after_var) .WithControlDependencies(grappler_added_id), {1.0f, 2.0f}, {1, 2}); Output id1 = ops::Identity(scope.WithOpName("id1"), c1); + Output id2 = ops::Identity(scope.WithOpName("id2"), id0); Output fetch = ops::Identity(scope.WithOpName("fetch").WithControlDependencies(id1), c1); GrapplerItem item; TF_CHECK_OK(scope.ToGraphDef(&item.graph)); item.fetch.push_back("c1"); + item.fetch.push_back("id2"); item.fetch.push_back("fetch"); - DependencyOptimizer optimizer; + DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE); GraphDef output; Status status = optimizer.Optimize(nullptr, item, &output); TF_EXPECT_OK(status); @@ -377,8 +503,8 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop_Identity) { EXPECT_EQ(item.graph.node_size() - 2, output.node_size()); for (int i = 0; i < output.node_size(); ++i) { const NodeDef& node = output.node(i); - // "id0" and "id1" but neither "ConstantFoldingCtrl/switch_1" nor - // "id_after_var" should be eliminated. + // "id0" and "id1" but neither "ConstantFoldingCtrl/switch_1", + // "id_after_var, nor "id2"" should be eliminated. 
EXPECT_NE("id0", node.name()); EXPECT_NE("id1", node.name()); if (node.name() == "c1") { diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc index 735d78e7eee..433b3564fe5 100644 --- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc @@ -2041,17 +2041,6 @@ class DataLayoutOptimizer : GraphProcessor { const LayoutOptimizer::TuningConfig& config_; }; -int GetNumTranspose(const GraphDef& graph) { - int number = 0; - for (const auto& node : graph.node()) { - if (IsTranspose(node)) { - number++; - } - } - VLOG(1) << "Number of Transpose nodes: " << number; - return number; -} - int GetNumGPUs(const Cluster& cluster) { auto devices = cluster.GetDevices(); int num_gpus = 0; diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc index f537ecc41b9..ffa03db2623 100644 --- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/grappler/clusters/virtual_cluster.h" #include "tensorflow/core/grappler/costs/graph_memory.h" #include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/grappler/graph_view.h" @@ -34,6 +35,7 @@ limitations under the License. #include "tensorflow/core/grappler/optimizers/static_schedule.h" #include "tensorflow/core/grappler/utils.h" #include "tensorflow/core/grappler/utils/topological_sort.h" +#include "tensorflow/core/grappler/utils/traversal.h" #include "tensorflow/core/protobuf/rewriter_config.pb.h" namespace tensorflow { @@ -496,7 +498,7 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) { if (!IsAddN(node)) { continue; } - // There is nothing to gain by optimizing nodes with 2 inputs of fewer. + // There is nothing to gain by optimizing nodes with 2 or fewer inputs. if (view.NumFanins(node, false) <= 2) { continue; } @@ -558,6 +560,54 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) { VLOG(1) << "Missing properties for " << node->name(); continue; } + + // Compute a topological ordering for the node fanin. + std::unordered_map topo_order; + ReverseDfs(view, {node}, nullptr, + [&topo_order](NodeDef* n) { + int topo_index = topo_order.size(); + topo_order[n] = topo_index; + }, + nullptr); + + std::vector input_topo_index; + + for (int i = 0; i < node->input_size(); ++i) { + const string& input = node->input(i); + const string node_name = NodeName(input); + NodeDef* node = view.GetNode(node_name); + input_topo_index.push_back(topo_order.at(node)); + } + int min_input_topo_index = INT_MAX; + int min_input_id = -1; + for (int i = 0; i < node->input_size(); ++i) { + if (IsControlInput(node->input(i))) { + // control inputs are always last. + break; + } + const int current = input_topo_index[i]; + if (current < min_input_topo_index) { + min_input_topo_index = current; + min_input_id = i; + } + } + CHECK_LE(0, min_input_id); + std::vector pre_ctrl_deps; + std::vector post_ctrl_deps; + for (int i = node->input_size() - 1; i >= 0; --i) { + if (!IsControlInput(node->input(i))) { + // control inputs are always last. 
+ break; + } + if (input_topo_index[i] < min_input_topo_index) { + // These control dependencies can be executed before the node. + pre_ctrl_deps.push_back(node->input(i)); + } else { + // These control dependencies should be executed after the node. + post_ctrl_deps.push_back(node->input(i)); + } + } + const TensorShapeProto& shape = properties.GetOutputProperties(node->name())[0].shape(); DataType dtype = node->attr().at("T").type(); @@ -572,13 +622,19 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) { *(*tmp_var->mutable_attr())["shape"].mutable_shape() = shape; (*tmp_var->mutable_attr())["var_name"].set_s(tmp_var->name()); + for (const string& ctrl_dep : pre_ctrl_deps) { + *tmp_var->add_input() = ctrl_dep; + } + *tmp_var->add_input() = + AsControlDependency(NodeName(node->input(min_input_id))); + // Initialize it to zero NodeDef* zeros = item->graph.add_node(); zeros->set_name(strings::StrCat(node->name(), "/tmp_var_zeros")); zeros->set_op("ZerosLike"); zeros->set_device(device); (*zeros->mutable_attr())["T"].set_type(dtype); - *zeros->add_input() = node->input(0); + *zeros->add_input() = node->input(min_input_id); NodeDef* initialize = item->graph.add_node(); initialize->set_name(strings::StrCat(node->name(), "/tmp_var_initializer")); @@ -592,9 +648,7 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) { std::vector accumulates; for (int i = 0; i < node->input_size(); ++i) { const string& input = node->input(i); - if (IsControlInput(input)) { - *zeros->add_input() = input; - } else { + if (!IsControlInput(input)) { NodeDef* accumulate = item->graph.add_node(); accumulate->set_name( strings::StrCat(node->name(), "/tmp_var_accum_", i)); @@ -617,6 +671,10 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) { for (const NodeDef* accum : accumulates) { *node->add_input() = AsControlDependency(accum->name()); } + for (const string& ctrl_dep : post_ctrl_deps) { + *node->add_input() = ctrl_dep; + } + updated_graph = true; } @@ -828,8 +886,7 @@ static NodeDef* FindSwapOutTrigger( const std::unordered_set& fanout = view.GetFanout(generator); NodeDef* trigger = nullptr; - Costs::NanoSeconds earliest_fanout( - static_cast(std::numeric_limits::max() >> 2)); + Costs::NanoSeconds earliest_fanout(Costs::NanoSeconds::infinity()); for (const auto& port : fanout) { if (port.node == node) { @@ -861,6 +918,15 @@ static bool IsSwappable(GraphView::InputPort input) { return !IsRefType(dtype); } +struct MemInfo { + GraphView::OutputPort port; + int64 memory_used; + std::vector uses_left; + double fitness; + + bool operator<(const MemInfo& other) const { return fitness < other.fitness; } +}; + static bool IdentifySwappingCandidates( Cluster* cluster, GrapplerItem* item, std::unordered_set* skip_list, std::unordered_map* nodes_to_swap) { @@ -890,31 +956,56 @@ static bool IdentifySwappingCandidates( continue; } int64 required_savings = mem_usage.used_memory - prop.memory_size(); - // TODO(bsteiner): sort the tensors by how long they're live. 
- std::unordered_map execution_times; + std::unordered_map op_completion_times; { - std::unordered_map - tmp_execution_times; - if (!EstimateEarliestExecutionTimes(*item, cluster, &tmp_execution_times) - .ok()) { + VirtualCluster vcluster(cluster->GetDevices()); + if (!vcluster.Provision().ok()) { return false; } - for (const auto& exec_time : tmp_execution_times) { - execution_times.emplace(exec_time.first->name(), exec_time.second); + if (!vcluster.Initialize(*item).ok()) { + return false; + } + RunMetadata metadata; + Status s = vcluster.Run(item->graph, item->feed, item->fetch, &metadata); + if (!s.ok() && s.code() != error::RESOURCE_EXHAUSTED) { + return false; + } + + for (const auto& dev_stats : metadata.step_stats().dev_stats()) { + for (const auto& node_stats : dev_stats.node_stats()) { + Costs::NanoSeconds exec_time = + Costs::NanoSeconds(1) + + Costs::MicroSeconds(node_stats.all_start_micros() + + node_stats.op_end_rel_micros()); + op_completion_times.emplace(node_stats.node_name(), exec_time); + } } } + Costs::Duration peak_time = -1; + for (const auto& live_tensor : mem_usage.live_tensors) { + if (live_tensor.allocation_time > peak_time) { + peak_time = live_tensor.allocation_time; + } + } + + std::vector mem_state; + GraphView graph(&item->graph); for (const auto& live_tensor : mem_usage.live_tensors) { + if (live_tensor.memory_used <= 1024) { + // Don't bother with small tensors. + continue; + } if (live_tensor.deallocation_time - live_tensor.allocation_time <= Costs::Duration(1e6)) { // Not enough time to swap. VLOG(1) << "Not enough time to swap: skipping " << live_tensor.node; continue; } - if (live_tensor.memory_used <= 1024) { - // Don't bother with small tensors. + + if (skip_list->find(live_tensor.node) != skip_list->end()) { continue; } GraphView::OutputPort port = @@ -922,56 +1013,77 @@ static bool IdentifySwappingCandidates( if (!IsSwappable(graph, port)) { continue; } - Costs::NanoSeconds execution_time(-1); - GraphView::InputPort fanout_to_swap; + MemInfo mem_info; + mem_info.port = port; + mem_info.memory_used = live_tensor.memory_used; + Costs::Duration allocation_time = live_tensor.allocation_time; + Costs::Duration earliest_use(Costs::Duration::infinity()); + bool valid = true; for (GraphView::InputPort input : graph.GetFanout(port)) { - if (skip_list->find(input.node->name()) != skip_list->end()) { + // Get execution time. + auto it = op_completion_times.find(input.node->name()); + if (it == op_completion_times.end()) { + valid = false; + break; + } + if (it->second <= peak_time) { continue; } + + if (skip_list->find(input.node->name()) != skip_list->end()) { + valid = false; + break; + } string input_name = strings::StrCat(input.node->name(), ":", input.port_id); if (skip_list->find(input_name) != skip_list->end()) { - continue; - } - if (!IsSwappable(input)) { - continue; - } - auto it = execution_times.find(input.node->name()); - if (it != execution_times.end()) { - if (it->second > execution_time) { - fanout_to_swap = input; - execution_time = it->second; - } - } - } - // Annotate the fanout to request the tensor to be swapped if it's not - // already been done. 
- bool found = false; - if (!fanout_to_swap.node) { - continue; - } - auto it = fanout_to_swap.node->attr().find("_swap_to_host"); - if (it != fanout_to_swap.node->attr().end()) { - const AttrValue& val = it->second; - for (int port_id : val.list().i()) { - if (port_id == fanout_to_swap.port_id) { - found = true; - break; - } - } - } - if (!found) { - (*nodes_to_swap)[fanout_to_swap.node].inputs_to_swap.push_back( - fanout_to_swap.port_id); - required_savings -= live_tensor.memory_used; - updated_graph = true; - if (required_savings < 0) { + valid = false; break; } + if (!IsSwappable(input)) { + valid = false; + break; + } + + // Set earliest use time that's after peak. + mem_info.uses_left.emplace_back(input); + earliest_use = std::min(earliest_use, it->second); + } + if (valid && !mem_info.uses_left.empty()) { + // Compute the fitness: we need the tensor to be generated way away of + // the time of peak memory usage (to ensure there is enough time to swap + // it out). We also need to ensure it's used way after the peak time, to + // ensure that swapping the tensor back in won't recreate the memory + // bottleneck. Last but not least, we want the tensor to have as few + // remaining uses as possible. + mem_info.fitness = std::pow((earliest_use - peak_time).count(), 2); + mem_info.fitness /= std::pow(mem_info.uses_left.size(), 2); + mem_info.fitness += std::pow((allocation_time - peak_time).count(), 2); + mem_info.fitness = -mem_info.fitness; + mem_state.push_back(mem_info); + } + } + + // Sort by fitness + std::sort(mem_state.begin(), mem_state.end()); + + for (const MemInfo& mem_info : mem_state) { + for (const GraphView::InputPort fanout_to_swap : mem_info.uses_left) { + VLOG(1) << "Will swap fanout " << fanout_to_swap.node->name() << ":" + << fanout_to_swap.port_id << " of tensor " + << mem_info.port.node->name() << ":" << mem_info.port.port_id + << " of size " << mem_info.memory_used; + + (*nodes_to_swap)[fanout_to_swap.node].inputs_to_swap.push_back( + fanout_to_swap.port_id); + } + required_savings -= mem_info.memory_used; + updated_graph = true; + if (required_savings < 0) { + break; } } } - return updated_graph; } @@ -1011,7 +1123,7 @@ bool SwappingPass(RewriterConfig::MemOptType optimization_level, } for (auto& swap : nodes_to_swap) { const NodeDef* node = swap.first; - std::vector props = + const std::vector& props = properties.GetInputProperties(node->name()); SwapInfo& swap_info = swap.second; int64 bytes_to_swap = 0; diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc index dd2d20d8d68..f5d9c879926 100644 --- a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc +++ b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc @@ -337,8 +337,9 @@ TEST_F(MemoryOptimizerTest, UnswappableInputs) { for (const auto& node : output.node()) { if (node.name() == "e") { // The d node isn't swappable. 
- EXPECT_EQ(4, node.input_size()); + EXPECT_EQ(5, node.input_size()); EXPECT_EQ("d", node.input(2)); + EXPECT_EQ("^swap_out_d_2", node.input(4)); } } } diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD index 534f7a063fe..137d51790d3 100644 --- a/tensorflow/core/grappler/utils/BUILD +++ b/tensorflow/core/grappler/utils/BUILD @@ -99,3 +99,29 @@ tf_cc_test( "//tensorflow/core:test_main", ], ) + +cc_library( + name = "traversal", + srcs = ["traversal.cc"], + hdrs = ["traversal.h"], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/grappler:graph_view", + "//tensorflow/core/grappler:op_types", + "//tensorflow/core/grappler:utils", + ], +) + +tf_cc_test( + name = "traversal_test", + srcs = ["traversal_test.cc"], + deps = [ + ":traversal", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) diff --git a/tensorflow/core/grappler/utils/traversal.cc b/tensorflow/core/grappler/utils/traversal.cc new file mode 100644 index 00000000000..f44f53c4e63 --- /dev/null +++ b/tensorflow/core/grappler/utils/traversal.cc @@ -0,0 +1,80 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/grappler/utils/traversal.h" +#include "tensorflow/core/framework/node_def.pb.h" + +namespace tensorflow { +namespace grappler { + +void ReverseDfs(const GraphView& graph_view, const std::vector& from, + const std::function& pre_order, + const std::function& post_order, + const std::function& on_back_edge) { + // Stack of work to do. + struct StackElem { + NodeDef* node; + bool children_visited; + NodeDef* src; + }; + std::vector stack; + + stack.reserve(from.size()); + for (NodeDef* node : from) { + stack.push_back(StackElem{node, false}); + } + + enum NodeState { NOT_VISITED = 0, VISITING = 1, DONE = 2 }; + std::unordered_map node_state; + while (!stack.empty()) { + StackElem w = stack.back(); + stack.pop_back(); + + if (w.children_visited) { + // We've processed all the children of this node + node_state[w.node] = DONE; + if (post_order) { + post_order(w.node); + } + continue; + } + + auto& rslt = node_state[w.node]; + if (rslt == DONE) { + continue; + } else if (rslt == VISITING) { + // Loop detected + if (on_back_edge) { + on_back_edge(w.src, w.node); + } + continue; + } + rslt = VISITING; + if (pre_order) { + pre_order(w.node); + } + + // Enqueue the node again with the children_visited flag set to true. + stack.push_back(StackElem{w.node, true, w.src}); + + // Now enqueu the node children. 
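// Re-pushing the node with children_visited set to true before pushing its
// fanins is the standard trick for an iterative DFS: the node sits beneath its
// fanins on the stack, so it is popped a second time (triggering post_order)
// only after every fanin has been fully processed.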
+ for (const auto fanin : graph_view.GetFanins(*w.node, true)) { + stack.push_back(StackElem{fanin.node, false, w.node}); + } + } +} + +} // namespace grappler +} // namespace tensorflow diff --git a/tensorflow/core/grappler/utils/traversal.h b/tensorflow/core/grappler/utils/traversal.h new file mode 100644 index 00000000000..bb3fa090e8f --- /dev/null +++ b/tensorflow/core/grappler/utils/traversal.h @@ -0,0 +1,39 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_TRAVERSAL_H_ +#define TENSORFLOW_CORE_GRAPPLER_UTILS_TRAVERSAL_H_ + +#include +#include "tensorflow/core/grappler/graph_view.h" + +namespace tensorflow { +namespace grappler { + +// Traverse the graph in reverse dfs order, starting from the list of nodes +// specified in the 'from' argument. The pre_order and post_order functors will +// be called on each reachable node (including the 'from' nodes) in pre and post +// order. If loops are found, the on_back_edge functor will be called on the +// corresponding back edges. Moreover, the pre and post order will assume that +// these back edges will be cut. +void ReverseDfs(const GraphView& graph_view, const std::vector& from, + const std::function& pre_order, + const std::function& post_order, + const std::function& on_back_edge); + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_UTILS_TRAVERSAL_H_ diff --git a/tensorflow/core/grappler/utils/traversal_test.cc b/tensorflow/core/grappler/utils/traversal_test.cc new file mode 100644 index 00000000000..cc68bd1a963 --- /dev/null +++ b/tensorflow/core/grappler/utils/traversal_test.cc @@ -0,0 +1,101 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/grappler/utils/traversal.h" +//#include "tensorflow/core/framework/node_def.pb.h" +//#include "tensorflow/core/lib/core/status_test_util.h" +//#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace grappler { +namespace { + +class TraversalTest : public ::testing::Test { + protected: + static NodeDef CreateNode(const string& name, + const std::vector& inputs) { + return CreateNode(name, "", inputs); + } + static NodeDef CreateNode(const string& name, const string& op, + const std::vector& inputs) { + NodeDef node; + node.set_name(name); + if (!op.empty()) { + node.set_op(op); + } + for (const string& input : inputs) { + node.add_input(input); + } + return node; + } +}; + +TEST_F(TraversalTest, ReverseDfsNoLoop) { + GraphDef graph; + *graph.add_node() = CreateNode("2", {"5"}); + *graph.add_node() = CreateNode("0", {"5", "4"}); + *graph.add_node() = CreateNode("1", {"4", "3"}); + *graph.add_node() = CreateNode("3", {"2"}); + *graph.add_node() = CreateNode("5", {}); + *graph.add_node() = CreateNode("4", {}); + + std::vector start_nodes = {graph.mutable_node(1), + graph.mutable_node(2)}; + std::vector pre_order; + std::vector post_order; + bool found_back_edge = false; + ReverseDfs( + GraphView(&graph), start_nodes, + [&pre_order](NodeDef* n) { pre_order.push_back(n->name()); }, + [&post_order](NodeDef* n) { post_order.push_back(n->name()); }, + [&found_back_edge](NodeDef*, NodeDef*) { found_back_edge = true; }); + + EXPECT_EQ(std::vector({"1", "4", "3", "2", "5", "0"}), pre_order); + EXPECT_EQ(std::vector({"4", "5", "2", "3", "1", "0"}), post_order); + EXPECT_FALSE(found_back_edge); +} + +TEST_F(TraversalTest, ReverseDfsWithLoop) { + GraphDef graph; + // Create a loop + *graph.add_node() = CreateNode("2", "Merge", {"1", "5"}); + *graph.add_node() = CreateNode("3", "Switch", {"2"}); + *graph.add_node() = CreateNode("4", "Identity", {"3"}); + *graph.add_node() = CreateNode("5", "NextIteration", {"4"}); + *graph.add_node() = CreateNode("1", "Enter", {}); + *graph.add_node() = CreateNode("6", "Exit", {"3"}); + + std::vector start_nodes = {graph.mutable_node(5)}; + std::vector pre_order; + std::vector post_order; + std::vector back_edges; + ReverseDfs( + GraphView(&graph), start_nodes, + [&pre_order](NodeDef* n) { pre_order.push_back(n->name()); }, + [&post_order](NodeDef* n) { post_order.push_back(n->name()); }, + [&back_edges](NodeDef* src, NodeDef* dst) { + back_edges.push_back(strings::StrCat(src->name(), "->", dst->name())); + }); + + EXPECT_EQ(std::vector({"6", "3", "2", "1", "5", "4"}), pre_order); + EXPECT_EQ(std::vector({"1", "4", "5", "2", "3", "6"}), post_order); + EXPECT_EQ(std::vector({"4->3"}), back_edges); +} + +} // namespace +} // namespace grappler +} // namespace tensorflow diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index fd99409c9b3..e7192ec42fe 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -629,6 +629,7 @@ cc_library( ":transpose_op", ":unique_op", ":unpack_op", + ":unravel_index_op", ":where_op", ], ) @@ -883,6 +884,12 @@ tf_kernel_library( deps = ARRAY_DEPS + [":split_lib"], ) +tf_kernel_library( + name = "unravel_index_op", + prefix = "unravel_index_op", + deps = ARRAY_DEPS, +) + tf_kernel_library( name = "where_op", srcs = ["where_op.cc"], @@ -2582,6 +2589,45 @@ tf_cc_tests( ], 
) +cc_library( + name = "manip", + deps = [ + ":roll_op", + ], +) + +MANIP_DEPS = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:manip_ops_op_lib", + "//third_party/eigen3", +] + +tf_kernel_library( + name = "roll_op", + prefix = "roll_op", + deps = MANIP_DEPS, +) + +tf_cc_test( + name = "roll_op_test", + size = "small", + srcs = ["roll_op_test.cc"], + deps = [ + ":ops_testutil", + ":ops_util", + ":roll_op", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + MATH_DEPS = [ ":bounds_check", ":fill_functor", diff --git a/tensorflow/core/kernels/adjust_contrast_op.cc b/tensorflow/core/kernels/adjust_contrast_op.cc index 37976f71837..72155fd0373 100644 --- a/tensorflow/core/kernels/adjust_contrast_op.cc +++ b/tensorflow/core/kernels/adjust_contrast_op.cc @@ -40,8 +40,8 @@ typedef Eigen::SyclDevice SYCLDevice; template class AdjustContrastOp : public OpKernel { public: - explicit AdjustContrastOp(OpKernelConstruction* context) : OpKernel(context) { - } + explicit AdjustContrastOp(OpKernelConstruction* context) + : OpKernel(context) {} void Compute(OpKernelContext* context) override { const Tensor& input = context->input(0); diff --git a/tensorflow/core/kernels/adjust_contrast_op_test.cc b/tensorflow/core/kernels/adjust_contrast_op_test.cc index 0fc03b5a236..7522b320400 100644 --- a/tensorflow/core/kernels/adjust_contrast_op_test.cc +++ b/tensorflow/core/kernels/adjust_contrast_op_test.cc @@ -29,8 +29,7 @@ limitations under the License. namespace tensorflow { -class AdjustContrastOpTest : public OpsTestBase { -}; +class AdjustContrastOpTest : public OpsTestBase {}; TEST_F(AdjustContrastOpTest, Simple_1113) { TF_EXPECT_OK(NodeDefBuilder("adjust_contrast_op", "AdjustContrastv2") diff --git a/tensorflow/core/kernels/adjust_saturation_op.cc b/tensorflow/core/kernels/adjust_saturation_op.cc index 4643d4e6efd..f0c6ae499d4 100644 --- a/tensorflow/core/kernels/adjust_saturation_op.cc +++ b/tensorflow/core/kernels/adjust_saturation_op.cc @@ -192,8 +192,9 @@ class AdjustSaturationOp : public AdjustSaturationOpBase { const DeviceBase::CpuWorkerThreads& worker_threads = *context->device()->tensorflow_cpu_worker_threads(); Shard(worker_threads.num_threads, worker_threads.workers, channel_count, - kCostPerChannel, [channel_count, &input_data, &output_data, scale_h]( - int64 start_channel, int64 end_channel) { + kCostPerChannel, + [channel_count, &input_data, &output_data, scale_h]( + int64 start_channel, int64 end_channel) { const float* p = input_data.data() + start_channel * kChannelSize; float* q = output_data.data() + start_channel * kChannelSize; for (int i = start_channel; i < end_channel; i++) { diff --git a/tensorflow/core/kernels/aggregate_ops_cpu.h b/tensorflow/core/kernels/aggregate_ops_cpu.h index dfa3fe585e3..aa1cead928a 100644 --- a/tensorflow/core/kernels/aggregate_ops_cpu.h +++ b/tensorflow/core/kernels/aggregate_ops_cpu.h @@ -25,7 +25,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL namespace tensorflow { @@ -201,7 +201,7 @@ struct Add7Functor { typename TTypes::ConstFlat in6, typename TTypes::ConstFlat in7) { Add7EigenImpl::Compute(d, out, in1, in2, in3, in4, in5, in6, - in7); + in7); } }; @@ -214,7 +214,7 @@ struct 
Add8Functor { typename TTypes::ConstFlat in5, typename TTypes::ConstFlat in6, typename TTypes::ConstFlat in7, typename TTypes::ConstFlat in8) { Add8EigenImpl::Compute(d, out, in1, in2, in3, in4, in5, in6, - in7, in8); + in7, in8); } }; @@ -227,7 +227,7 @@ struct Add8pFunctor { typename TTypes::ConstFlat in5, typename TTypes::ConstFlat in6, typename TTypes::ConstFlat in7, typename TTypes::ConstFlat in8) { Add8pEigenImpl::Compute(d, out, in1, in2, in3, in4, in5, in6, - in7, in8); + in7, in8); } }; @@ -241,10 +241,10 @@ struct Add9Functor { typename TTypes::ConstFlat in7, typename TTypes::ConstFlat in8, typename TTypes::ConstFlat in9) { Add9EigenImpl::Compute(d, out, in1, in2, in3, in4, in5, in6, - in7, in8, in9); + in7, in8, in9); } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor diff --git a/tensorflow/core/kernels/attention_ops.cc b/tensorflow/core/kernels/attention_ops.cc index cc8f122cab3..ce2fce92e4e 100644 --- a/tensorflow/core/kernels/attention_ops.cc +++ b/tensorflow/core/kernels/attention_ops.cc @@ -52,8 +52,9 @@ class ExtractGlimpseOp : public OpKernel { const int64 batch_size = input_shape.dim_size(0); const Tensor& window_size = context->input(1); - OP_REQUIRES(context, (window_size.shape().dims() == 1) && - window_size.shape().dim_size(0) == 2, + OP_REQUIRES(context, + (window_size.shape().dims() == 1) && + window_size.shape().dim_size(0) == 2, errors::InvalidArgument( "input must be a vector of size 2 (height, width)", window_size.shape().DebugString())); diff --git a/tensorflow/core/kernels/avgpooling_op.h b/tensorflow/core/kernels/avgpooling_op.h index dea2683184a..f5e81dbc093 100644 --- a/tensorflow/core/kernels/avgpooling_op.h +++ b/tensorflow/core/kernels/avgpooling_op.h @@ -48,9 +48,8 @@ struct SpatialAvgPooling { typedef Eigen::GpuDevice GPUDevice; -// Launch a custom GPU kernels from Yanqing for the avgpooling backward operation -// that works NHWC data formats. -// Arguments: +// Launch a custom GPU kernels from Yanqing for the avgpooling backward +// operation that works NHWC data formats. 
Arguments: // top_diff: backprop to the output of the pooling layer // num: number of input batches // height: input height diff --git a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc index 2be330d1427..6537b42f1ed 100644 --- a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc +++ b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc @@ -71,8 +71,8 @@ __global__ void AvePoolBackwardNHWC(const int nthreads, hstart = max(hstart, 0); wstart = max(wstart, 0); int pool_size = (hend - hstart) * (wend - wstart); - gradient += - top_diff_slice[(ph * pooled_width + pw) * channels] / dtype(pool_size); + gradient += top_diff_slice[(ph * pooled_width + pw) * channels] / + dtype(pool_size); } } bottom_diff[index] = gradient; @@ -90,11 +90,11 @@ bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num, const GPUDevice& d) { int x_size = num * height * width * channels; CudaLaunchConfig config = GetCudaLaunchConfig(x_size, d); - AvePoolBackwardNHWC< - T><<>>( - config.virtual_thread_count, top_diff, num, height, width, channels, - pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w, - pad_t, pad_t, bottom_diff); + AvePoolBackwardNHWC + <<>>( + config.virtual_thread_count, top_diff, num, height, width, channels, + pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w, + pad_t, pad_t, bottom_diff); return d.ok(); } diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc index d0bbea9fe27..944564dfba6 100644 --- a/tensorflow/core/kernels/barrier_ops.cc +++ b/tensorflow/core/kernels/barrier_ops.cc @@ -111,13 +111,14 @@ class Barrier : public ResourceBase { mutex_lock lock(mu_); if (closed_) { OP_REQUIRES_ASYNC( - ctx, !cancel_pending_enqueues_ && - (num_inserted == 0 || !incomplete_.empty()), + ctx, + !cancel_pending_enqueues_ && + (num_inserted == 0 || !incomplete_.empty()), errors::Cancelled( "Barrier ", name_, " is closed. Pending enqueues cancelled: ", - cancel_pending_enqueues_, ". Number of new insertions: ", - num_inserted, ". Number of incomplete keys: ", - incomplete_.size(), "."), + cancel_pending_enqueues_, + ". Number of new insertions: ", num_inserted, + ". Number of incomplete keys: ", incomplete_.size(), "."), callback); } @@ -128,9 +129,10 @@ class Barrier : public ResourceBase { for (int i = 0; i < num_inserted; ++i) { OP_REQUIRES_OK_ASYNC( - ctx, InsertOneLocked(ctx, keys, values, element_shape, - component_index, i, &ready_tuples, - &new_elements), + ctx, + InsertOneLocked(ctx, keys, values, element_shape, + component_index, i, &ready_tuples, + &new_elements), callback); } @@ -317,8 +319,9 @@ class Barrier : public ResourceBase { return errors::Cancelled( "Barrier ", name_, " is closed, but attempted to insert a brand new key: ", - keys_vec(i), ". Pending enqueues cancelled: ", - cancel_pending_enqueues_, ". Insertion index: ", i, + keys_vec(i), + ". Pending enqueues cancelled: ", cancel_pending_enqueues_, + ". Insertion index: ", i, ". 
Number of incomplete keys: ", incomplete_.size(), "."); } } else { @@ -532,13 +535,14 @@ class InsertManyOp : public BarrierOpKernel { OP_REQUIRES_ASYNC( ctx, component_index_ < barrier->num_components(), errors::InvalidArgument("The component ID is out of range ", - component_index_, " > num_components", " (= ", - barrier->num_components(), ")"), + component_index_, " > num_components", + " (= ", barrier->num_components(), ")"), callback); OP_REQUIRES_OK_ASYNC( - ctx, ctx->MatchSignature({DT_STRING_REF, DT_STRING, - barrier->component_type(component_index_)}, - {}), + ctx, + ctx->MatchSignature({DT_STRING_REF, DT_STRING, + barrier->component_type(component_index_)}, + {}), callback); const Tensor* keys; diff --git a/tensorflow/core/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc index 5b4e1a809fa..c447db842d3 100644 --- a/tensorflow/core/kernels/batch_kernels.cc +++ b/tensorflow/core/kernels/batch_kernels.cc @@ -13,22 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ - #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_util.h" #include "tensorflow/core/framework/types.h" -#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h" #include "tensorflow/core/kernels/batching_util/periodic_function.h" +#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h" #include "tensorflow/core/kernels/concat_lib.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/kernels/split_lib.h" #include "tensorflow/core/lib/random/random.h" #include "tensorflow/core/platform/macros.h" - namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h index 93c39183198..43e716c542a 100644 --- a/tensorflow/core/kernels/batch_matmul_op_impl.h +++ b/tensorflow/core/kernels/batch_matmul_op_impl.h @@ -41,7 +41,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL namespace { @@ -429,14 +429,13 @@ template struct LaunchBatchMatMul { static void Launch(OpKernelContext* context, const Tensor& in_x, const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) { - - // Number of matrix multiplies i.e. size of the batch. - const int64 batch_size = in_x.dim_size(0); - ParallelMatMulKernelSYCL::Run(context, in_x, in_y, adj_x, adj_y, out, - 0, batch_size); + // Number of matrix multiplies i.e. size of the batch. 
+ const int64 batch_size = in_x.dim_size(0); + ParallelMatMulKernelSYCL::Run(context, in_x, in_y, adj_x, adj_y, + out, 0, batch_size); } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template class BatchMatMul : public OpKernel { @@ -462,10 +461,10 @@ class BatchMatMul : public OpKernel { TensorShape out_shape; for (int i = 0; i < ndims - 2; ++i) { OP_REQUIRES(ctx, in0.dim_size(i) == in1.dim_size(i), - errors::InvalidArgument("In[0].dim(", i, ") and In[1].dim(", - i, ") must be the same: ", - in0.shape().DebugString(), " vs ", - in1.shape().DebugString())); + errors::InvalidArgument( + "In[0].dim(", i, ") and In[1].dim(", i, + ") must be the same: ", in0.shape().DebugString(), " vs ", + in1.shape().DebugString())); out_shape.AddDim(in0.dim_size(i)); } auto n = (ndims == 2) ? 1 : out_shape.num_elements(); @@ -507,12 +506,12 @@ class BatchMatMul : public OpKernel { bool adj_y_; }; -#define REGISTER_BATCH_MATMUL_CPU(TYPE) \ +#define REGISTER_BATCH_MATMUL_CPU(TYPE) \ REGISTER_KERNEL_BUILDER( \ Name("BatchMatMul").Device(DEVICE_CPU).TypeConstraint("T"), \ BatchMatMul) -#define REGISTER_BATCH_MATMUL_GPU(TYPE) \ +#define REGISTER_BATCH_MATMUL_GPU(TYPE) \ REGISTER_KERNEL_BUILDER( \ Name("BatchMatMul").Device(DEVICE_GPU).TypeConstraint("T"), \ BatchMatMul) @@ -522,5 +521,5 @@ class BatchMatMul : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("BatchMatMul").Device(DEVICE_SYCL).TypeConstraint("T"), \ BatchMatMul) -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // end namespace tensorflow diff --git a/tensorflow/core/kernels/batch_matmul_op_real.cc b/tensorflow/core/kernels/batch_matmul_op_real.cc index 8d155ca62b2..7e1e2aa4ec1 100644 --- a/tensorflow/core/kernels/batch_matmul_op_real.cc +++ b/tensorflow/core/kernels/batch_matmul_op_real.cc @@ -35,5 +35,5 @@ TF_CALL_half(REGISTER_BATCH_MATMUL_GPU); #ifdef TENSORFLOW_USE_SYCL TF_CALL_float(REGISTER_BATCH_MATMUL_SYCL); TF_CALL_double(REGISTER_BATCH_MATMUL_SYCL); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/batch_matmul_op_test.cc b/tensorflow/core/kernels/batch_matmul_op_test.cc index 7923f34155b..c3932cd7b90 100644 --- a/tensorflow/core/kernels/batch_matmul_op_test.cc +++ b/tensorflow/core/kernels/batch_matmul_op_test.cc @@ -53,9 +53,10 @@ static Graph* BatchMatmul(int b, int m, int k, int n, bool adjoint_a, /* Uncomment to enable benchmarks for double & complex types: */ // BM_BatchMatmulDev(B, M, K, N, TA, TB, std::complex, DT_COMPLEX64, // gpu); -// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, cpu); \ -// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex, DT_COMPLEX128, cpu); \ -// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, gpu); \ +// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, cpu); \ +// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex, DT_COMPLEX128, cpu); +// \ +// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, gpu); \ // BM_BatchMatmulDev(M, K, N, TA, TB, std::complex, DT_COMPLEX128, gpu); // Typical fully connected layers diff --git a/tensorflow/core/kernels/batch_norm_op.cc b/tensorflow/core/kernels/batch_norm_op.cc index d3ed617f713..c34ea14bf60 100644 --- a/tensorflow/core/kernels/batch_norm_op.cc +++ b/tensorflow/core/kernels/batch_norm_op.cc @@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template class BatchNormOp 
: public OpKernel { diff --git a/tensorflow/core/kernels/batch_norm_op_test.cc b/tensorflow/core/kernels/batch_norm_op_test.cc index 5e3fcd2114a..45ddc853295 100644 --- a/tensorflow/core/kernels/batch_norm_op_test.cc +++ b/tensorflow/core/kernels/batch_norm_op_test.cc @@ -54,7 +54,7 @@ TEST_F(BatchNormOpTest, Simple) { Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2})); test::FillValues( &expected, {-17.86f, -22.00f, -15.87f, -20.59f, -13.87f, -19.18f, -21.86f, - -33.31f, -23.85f, -34.72f, -25.85f, -36.13f }); + -33.31f, -23.85f, -34.72f, -25.85f, -36.13f}); test::ExpectTensorNear(expected, *GetOutput(0), 0.01); } diff --git a/tensorflow/core/kernels/batch_util.cc b/tensorflow/core/kernels/batch_util.cc index 7f2df95e2d5..1a45212ad29 100644 --- a/tensorflow/core/kernels/batch_util.cc +++ b/tensorflow/core/kernels/batch_util.cc @@ -19,6 +19,8 @@ limitations under the License. #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/errors.h" +#define TF_CALL_DATASET_TYPES(m) TF_CALL_ALL_TYPES(m) TF_CALL_QUANTIZED_TYPES(m) + namespace tensorflow { namespace batch_util { @@ -61,6 +63,21 @@ Status HandleElementToSlice(Tensor element, Tensor* parent, int64 index, return Status::OK(); } +template <> +Status HandleElementToSlice(Tensor element, Tensor* parent, + int64 index, bool can_move) { + auto parent_as_matrix = parent->flat_outer_dims(); + auto element_flat = element.flat(); + if (can_move) { + for (int64 i = 0; i < element.NumElements(); ++i) { + parent_as_matrix(index, i) = std::move(element_flat(i)); + } + } else { + parent_as_matrix.chip(index, 0) = element_flat; + } + return Status::OK(); +} + // TODO(jsimsa): Add HandleElementToSlice specialization that moves // the data when possible. @@ -87,7 +104,6 @@ Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index) { switch (element.dtype()) { TF_CALL_ALL_TYPES(HANDLE_TYPE); TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE); - TF_CALL_variant(HANDLE_TYPE); #undef HANDLE_TYPE default: return errors::Unimplemented("CopyElementToSlice Unhandled data type: ", @@ -107,7 +123,6 @@ Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index) { switch (parent.dtype()) { TF_CALL_ALL_TYPES(HANDLE_TYPE); TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE); - TF_CALL_variant(HANDLE_TYPE); #undef HANDLE_TYPE default: return errors::Unimplemented("CopySliceToElement Unhandled data type: ", @@ -115,5 +130,101 @@ Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index) { } } +// The following five functions are copied from padding_fifo_queue.cc. +// TODO(mrry): Reconcile these functions with the similar methods in the +// queue implementation. +Status ValidateElementToLargerSlice(const Tensor& element, Tensor* parent) { + DCHECK_NE(parent->dim_size(0), 0); + if (element.NumElements() > (parent->NumElements() / parent->dim_size(0))) { + TensorShape chip_shape = parent->shape(); + chip_shape.RemoveDim(0); + return errors::Internal( + "HandleElementToLargerSlice Cannot copy slice: number of entries in " + "element is greater than number of elements in parent slice. 
", + "Shapes are: [element]: ", element.shape().DebugString(), + ", [parent slice]: ", chip_shape.DebugString()); + } + return Status::OK(); +} + +template +Status HandleElementToLargerSlice(const Tensor& element, Tensor* parent, + int index) { + TF_RETURN_IF_ERROR(ValidateElementToLargerSlice(element, parent)); + if (element.NumElements() == 0) { + return Status::OK(); + } + auto element_t = element.tensor(); + auto parent_t = parent->tensor(); + Eigen::DSizes slice_indices; + slice_indices[0] = index; + Eigen::DSizes slice_size; + slice_size[0] = 1; + for (size_t i = 1; i < slice_size.size(); ++i) { + slice_size[i] = element_t.dimension(i - 1); + } + parent_t.slice(slice_indices, slice_size) = element_t.reshape(slice_size); + return Status::OK(); +} + +template +Status HandleElementToLargerSliceWithRank(const Tensor& element, Tensor* parent, + int index) { +#define HANDLE_TYPE(T) \ + case DataTypeToEnum::value: { \ + return HandleElementToLargerSlice(element, parent, index); \ + } + + switch (element.dtype()) { + TF_CALL_DATASET_TYPES(HANDLE_TYPE); +#undef HANDLE_TYPE + default: + return errors::Unimplemented( + "HandleElementToLargerSliceWithRank Unhandled data type: ", + element.dtype()); + } +} + +Status CopyElementToLargerSlice(const Tensor& element, Tensor* parent, + int index) { + if (parent->dims() != element.dims() + 1) { + return errors::Internal( + "Mismatched ranks. Element's rank is: ", element.dims(), + " but element is meant to be a slice in output Tensor having rank: ", + parent->dims(), " (should be: ", element.dims() + 1, ")"); + } + +#define HANDLE_DIMS(NDIMS) \ + case NDIMS: { \ + TF_RETURN_IF_ERROR( \ + HandleElementToLargerSliceWithRank(element, parent, index)); \ + return Status::OK(); \ + } + + switch (element.dims()) { + HANDLE_DIMS(0); + HANDLE_DIMS(1); + HANDLE_DIMS(2); + HANDLE_DIMS(3); + HANDLE_DIMS(4); +#undef HANDLE_DIMS + default: + return errors::Unimplemented("CopyElementToLargerSlice Unhandled rank: ", + element.dims()); + } +} + +Status SetElementZero(Tensor* element, const Tensor& padding) { +#define HANDLE_TYPE(T) \ + if (element->dtype() == DataTypeToEnum::value) { \ + element->flat().setConstant(padding.scalar()()); \ + return Status::OK(); \ + } + TF_CALL_DATASET_TYPES(HANDLE_TYPE); +#undef HANDLE_TYPE + return errors::Unimplemented("SetElementZero Unhandled data type: ", + element->dtype()); +} + } // namespace batch_util } // namespace tensorflow diff --git a/tensorflow/core/kernels/batch_util.h b/tensorflow/core/kernels/batch_util.h index 0d634ae7b07..a47bf1935db 100644 --- a/tensorflow/core/kernels/batch_util.h +++ b/tensorflow/core/kernels/batch_util.h @@ -32,6 +32,16 @@ Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index); // Copies the index^th slice of parent (in the 0th dimension) into element. Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index); +// Zero-initializes the tensor `element` using the scalar stored in `padding`. +// Both `element` and `padding` must have matching `dtype`. +Status SetElementZero(Tensor* element, const Tensor& padding); + +// Copies `element` into a (0th dimension) slice of `parent`, assuming +// the shape of `element` is strictly not larger along any axis than a +// slice. 
+Status CopyElementToLargerSlice(const Tensor& element, Tensor* parent, + int index); + } // namespace batch_util } // namespace tensorflow diff --git a/tensorflow/core/kernels/batching_util/periodic_function.h b/tensorflow/core/kernels/batching_util/periodic_function.h index dbf1733dcc3..36a4019002a 100644 --- a/tensorflow/core/kernels/batching_util/periodic_function.h +++ b/tensorflow/core/kernels/batching_util/periodic_function.h @@ -114,7 +114,7 @@ class PeriodicFunction { void RunLoop(int64 start) LOCKS_EXCLUDED(mutex_); const std::function function_; // Actual client function - const int64 interval_micros_; // Interval between calls. + const int64 interval_micros_; // Interval between calls. const Options options_; // Protects state below. diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc b/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc index d73dcf0fa0e..d5ea2b648f3 100644 --- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc +++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc @@ -55,15 +55,14 @@ Status ScheduleTask(size_t task_size, BatchScheduler* scheduler) { // use the clock to be destroyed. std::unique_ptr CreateFakeClockAdvancerThread( test_util::FakeClockEnv* env, Notification* start, Notification* stop) { - return std::unique_ptr( - Env::Default()->StartThread({}, "FakeClockAdvancerThread", - [env, start, stop] { - start->WaitForNotification(); - while (!stop->HasBeenNotified()) { - env->AdvanceByMicroseconds(10); - Env::Default()->SleepForMicroseconds(10); - } - })); + return std::unique_ptr(Env::Default()->StartThread( + {}, "FakeClockAdvancerThread", [env, start, stop] { + start->WaitForNotification(); + while (!stop->HasBeenNotified()) { + env->AdvanceByMicroseconds(10); + Env::Default()->SleepForMicroseconds(10); + } + })); } TEST(SharedBatchSchedulerTest, Basic) { @@ -258,7 +257,7 @@ TEST(SharedBatchSchedulerTest, ObeysTimeout) { TEST(SharedBatchSchedulerTest, ObeysTimeoutWithRealClock) { Notification first_batch_processed, second_batch_processed; auto callback = [&first_batch_processed, &second_batch_processed]( - std::unique_ptr> batch) { + std::unique_ptr> batch) { ASSERT_TRUE(batch->IsClosed()); if (batch->size() == 1) { first_batch_processed.Notify(); @@ -301,7 +300,7 @@ TEST(SharedBatchSchedulerTest, { Notification first_batch_processed, second_batch_processed; auto callback = [&first_batch_processed, &second_batch_processed]( - std::unique_ptr> batch) { + std::unique_ptr> batch) { ASSERT_TRUE(batch->IsClosed()); if (batch->size() == 1) { first_batch_processed.Notify(); @@ -349,7 +348,7 @@ TEST(SharedBatchSchedulerTest, Fairness) { auto queue_0_callback = [&queue_0_first_batch_scheduled, &queue_0_first_batch_proceed, &queue_0_second_batch_scheduled]( - std::unique_ptr> batch) { + std::unique_ptr> batch) { if (!queue_0_first_batch_scheduled.HasBeenNotified()) { queue_0_first_batch_scheduled.Notify(); queue_0_first_batch_proceed.WaitForNotification(); @@ -467,7 +466,7 @@ TEST(SharedBatchSchedulerTest, ConstMethods) { TEST(SharedBatchSchedulerTest, OneFullQueueDoesntBlockOtherQueues) { Notification queue_0_processing, queue_0_proceed; auto queue_0_callback = [&queue_0_processing, &queue_0_proceed]( - std::unique_ptr> batch) { + std::unique_ptr> batch) { if (!queue_0_processing.HasBeenNotified()) { queue_0_processing.Notify(); queue_0_proceed.WaitForNotification(); diff --git a/tensorflow/core/kernels/batchtospace_op.cc b/tensorflow/core/kernels/batchtospace_op.cc index 
c1c0d6d3292..b07c5fd718d 100644 --- a/tensorflow/core/kernels/batchtospace_op.cc +++ b/tensorflow/core/kernels/batchtospace_op.cc @@ -56,9 +56,10 @@ static void BatchToSpaceOpCompute(OpKernelContext* context, errors::InvalidArgument("input rank should be >= ", 1 + block_dims, " instead of ", orig_input_tensor.dims())); - OP_REQUIRES(context, TensorShapeUtils::IsMatrix(orig_crops.shape()) && - block_dims == orig_crops.dim_size(0) && - 2 == orig_crops.dim_size(1), + OP_REQUIRES(context, + TensorShapeUtils::IsMatrix(orig_crops.shape()) && + block_dims == orig_crops.dim_size(0) && + 2 == orig_crops.dim_size(1), errors::InvalidArgument("crops should have shape [", block_dims, ", 2] instead of ", orig_crops.shape().DebugString())); diff --git a/tensorflow/core/kernels/bcast_ops.cc b/tensorflow/core/kernels/bcast_ops.cc index 7fc4b1762d0..8e4f08e4730 100644 --- a/tensorflow/core/kernels/bcast_ops.cc +++ b/tensorflow/core/kernels/bcast_ops.cc @@ -13,11 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/util/bcast.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/bcast.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc index 2ca194a77f2..754b93b073a 100644 --- a/tensorflow/core/kernels/bias_op_gpu.cu.cc +++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc @@ -77,14 +77,14 @@ void BiasGPU::compute(const GPUDevice& d, const T* input, const T* bias, } CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); if (data_format == FORMAT_NHWC) { - BiasNHWCKernel< - T><<>>( - config.virtual_thread_count, input, bias, output, bias_size); + BiasNHWCKernel + <<>>( + config.virtual_thread_count, input, bias, output, bias_size); } else { - BiasNCHWKernel< - T><<>>( - config.virtual_thread_count, input, bias, output, bias_size, - image_size); + BiasNCHWKernel + <<>>( + config.virtual_thread_count, input, bias, output, bias_size, + image_size); } } @@ -206,10 +206,10 @@ void BiasGradGPU::compute(const GPUDevice& d, const T* output_backprop, // Check if we have enough shared memory. if (shared_memory_size <= max_shared_memory_size) { if (data_format == FORMAT_NHWC) { - BiasGradNHWC_SharedAtomics< - T><<>>(total_count, output_backprop, bias_backprop, - bias_size); + BiasGradNHWC_SharedAtomics + <<>>(total_count, output_backprop, bias_backprop, + bias_size); } else { // Round up the block count to multiple of bias_size. int group_size = (config.block_count + bias_size - 1) / bias_size; @@ -217,23 +217,24 @@ void BiasGradGPU::compute(const GPUDevice& d, const T* output_backprop, if (config.thread_per_block < kWarpSize) { config.thread_per_block = kWarpSize; } - BiasGradNCHW_SharedAtomics< - T><<>>( - output_backprop, bias_backprop, batch, bias_size, image_size, - group_size); + BiasGradNCHW_SharedAtomics + <<>>( + output_backprop, bias_backprop, batch, bias_size, image_size, + group_size); } } else { // Note that even if we don't have enough shared memory to fit the entire // output block, it is possible to process one group of elements at a time. // But for now, we simply fall back to the naive implementation. 
if (data_format == FORMAT_NHWC) { - BiasGradNHWC_Naive< - T><<>>( - total_count, output_backprop, bias_backprop, bias_size); + BiasGradNHWC_Naive + <<>>( + total_count, output_backprop, bias_backprop, bias_size); } else { - BiasGradNCHW_Naive< - T><<>>( - total_count, output_backprop, bias_backprop, bias_size, image_size); + BiasGradNCHW_Naive + <<>>( + total_count, output_backprop, bias_backprop, bias_size, + image_size); } } } diff --git a/tensorflow/core/kernels/bounds_check.h b/tensorflow/core/kernels/bounds_check.h index e35f42ad417..c8c60c55241 100644 --- a/tensorflow/core/kernels/bounds_check.h +++ b/tensorflow/core/kernels/bounds_check.h @@ -48,7 +48,7 @@ EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC const T SubtleMustCopy(const T &x) { auto *to_x = reinterpret_cast(&x); return *to_x; } -} // namespace tensorflow::internal +} // namespace internal } // namespace tensorflow #endif // TENSORFLOW_UTIL_BOUNDS_CHECK_H_ diff --git a/tensorflow/core/kernels/candidate_sampler_ops.cc b/tensorflow/core/kernels/candidate_sampler_ops.cc index e937c4f11ba..654d99301af 100644 --- a/tensorflow/core/kernels/candidate_sampler_ops.cc +++ b/tensorflow/core/kernels/candidate_sampler_ops.cc @@ -126,13 +126,13 @@ REGISTER_KERNEL_BUILDER(Name("UniformCandidateSampler").Device(DEVICE_CPU), REGISTER_KERNEL_BUILDER(Name("LogUniformCandidateSampler").Device(DEVICE_CPU), SimpleCandidateSamplerOp); -REGISTER_KERNEL_BUILDER(Name("LearnedUnigramCandidateSampler") - .Device(DEVICE_CPU), - SimpleCandidateSamplerOp); +REGISTER_KERNEL_BUILDER( + Name("LearnedUnigramCandidateSampler").Device(DEVICE_CPU), + SimpleCandidateSamplerOp); -REGISTER_KERNEL_BUILDER(Name("ThreadUnsafeUnigramCandidateSampler") - .Device(DEVICE_CPU), - SimpleCandidateSamplerOp); +REGISTER_KERNEL_BUILDER( + Name("ThreadUnsafeUnigramCandidateSampler").Device(DEVICE_CPU), + SimpleCandidateSamplerOp); class AllCandidateSamplerOp : public BaseCandidateSamplerOp { public: @@ -197,8 +197,9 @@ class ComputeAccidentalHitsOp : public OpKernel { void Compute(OpKernelContext* context) override { const Tensor& in_true_candidates = context->input(0); const TensorShape& in_true_candidates_shape = in_true_candidates.shape(); - OP_REQUIRES(context, TensorShapeUtils::IsMatrix(in_true_candidates_shape) && - in_true_candidates_shape.dim_size(1) == num_true_, + OP_REQUIRES(context, + TensorShapeUtils::IsMatrix(in_true_candidates_shape) && + in_true_candidates_shape.dim_size(1) == num_true_, errors::InvalidArgument( "true_candidates must be a batch_size * num_true matrix")); diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc index f16abb2b79f..626db9131ae 100644 --- a/tensorflow/core/kernels/cast_op.cc +++ b/tensorflow/core/kernels/cast_op.cc @@ -36,7 +36,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #define CURRY_TYPES2(FN, arg0) \ FN(arg0, bool); \ @@ -223,11 +223,11 @@ class SyclCastOp : public CastOpBase { } }; -#define REGISTER_CAST_SYCL(srctype, dsttype) \ - REGISTER_KERNEL_BUILDER(Name("Cast") \ - .TypeConstraint("SrcT") \ - .TypeConstraint("DstT") \ - .Device(DEVICE_SYCL), \ +#define REGISTER_CAST_SYCL(srctype, dsttype) \ + REGISTER_KERNEL_BUILDER(Name("Cast") \ + .TypeConstraint("SrcT") \ + .TypeConstraint("DstT") \ + .Device(DEVICE_SYCL), \ SyclCastOp) CURRY_TYPES2(REGISTER_CAST_SYCL, bool); CURRY_TYPES2(REGISTER_CAST_SYCL, int32); @@ -237,7 +237,7 @@ 
CURRY_TYPES2(REGISTER_CAST_SYCL, double); #undef REGISTER_CAST_SYCL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #undef CURRY_TYPES2 @@ -250,6 +250,5 @@ REGISTER_KERNEL_BUILDER( REGISTER_KERNEL_BUILDER( Name("_HostCast").Device(DEVICE_SYCL).HostMemory("x").HostMemory("y"), CpuCastOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // end namespace tensorflow - diff --git a/tensorflow/core/kernels/cast_op.h b/tensorflow/core/kernels/cast_op.h index 8fedf2c271c..fd4e75d26f0 100644 --- a/tensorflow/core/kernels/cast_op.h +++ b/tensorflow/core/kernels/cast_op.h @@ -131,7 +131,8 @@ struct scalar_cast_op<::tensorflow::bfloat16, float> { p[0] = a.value; p[1] = 0; #else - static_assert(::tensorflow::port::kLittleEndian, "Not a little endian system!"); + static_assert(::tensorflow::port::kLittleEndian, + "Not a little endian system!"); p[0] = 0; p[1] = a.value; #endif diff --git a/tensorflow/core/kernels/cast_op_impl.h b/tensorflow/core/kernels/cast_op_impl.h index 470e9e08041..3ae9f2ab4d9 100644 --- a/tensorflow/core/kernels/cast_op_impl.h +++ b/tensorflow/core/kernels/cast_op_impl.h @@ -41,25 +41,25 @@ struct CastFunctor { o.device(d) = i.template cast(); } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor -#define CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \ - FN(arg0, arg1, bool); \ - FN(arg0, arg1, uint8); \ - FN(arg0, arg1, int8); \ - FN(arg0, arg1, uint16); \ - FN(arg0, arg1, int16); \ - FN(arg0, arg1, int32); \ - FN(arg0, arg1, int64); \ - FN(arg0, arg1, float); \ - FN(arg0, arg1, double); \ - FN(arg0, arg1, std::complex); \ +#define CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \ + FN(arg0, arg1, bool); \ + FN(arg0, arg1, uint8); \ + FN(arg0, arg1, int8); \ + FN(arg0, arg1, uint16); \ + FN(arg0, arg1, int16); \ + FN(arg0, arg1, int32); \ + FN(arg0, arg1, int64); \ + FN(arg0, arg1, float); \ + FN(arg0, arg1, double); \ + FN(arg0, arg1, std::complex); \ FN(arg0, arg1, std::complex) -#define CURRY_TYPES3(FN, arg0, arg1) \ - CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \ +#define CURRY_TYPES3(FN, arg0, arg1) \ + CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \ FN(arg0, arg1, Eigen::half); #define CAST_CASE(DEVICE, IN, OUT) \ diff --git a/tensorflow/core/kernels/cast_op_test.cc b/tensorflow/core/kernels/cast_op_test.cc index a106f287c18..057e209a719 100644 --- a/tensorflow/core/kernels/cast_op_test.cc +++ b/tensorflow/core/kernels/cast_op_test.cc @@ -107,10 +107,10 @@ static void BM_gpu_float_int64(int iters, int num) { testing::UseRealTime(); #if GOOGLE_CUDA test::Benchmark("gpu", Cast(num)).Run(iters); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL test::Benchmark("sycl", Cast(num)).Run(iters); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20); @@ -130,10 +130,10 @@ static void BM_gpu_bool_float(int iters, int num) { testing::UseRealTime(); #if GOOGLE_CUDA test::Benchmark("gpu", Cast(num)).Run(iters); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL test::Benchmark("sycl", Cast(num)).Run(iters); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20); @@ -180,7 +180,7 @@ static void BM_gpu_float_half(int iters, int num) { testing::UseRealTime(); #if GOOGLE_CUDA test::Benchmark("gpu", Cast(num)).Run(iters); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA } BENCHMARK(BM_gpu_float_half)->Arg(64 << 10)->Arg(32 << 20); @@ -191,7 +191,7 @@ static void BM_gpu_half_float(int 
iters, int num) { testing::UseRealTime(); #if GOOGLE_CUDA test::Benchmark("gpu", Cast(num)).Run(iters); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA } BENCHMARK(BM_gpu_half_float)->Arg(64 << 10)->Arg(32 << 20); diff --git a/tensorflow/core/kernels/colorspace_op.cc b/tensorflow/core/kernels/colorspace_op.cc index ba100b32e7d..9cc2e67bbe1 100644 --- a/tensorflow/core/kernels/colorspace_op.cc +++ b/tensorflow/core/kernels/colorspace_op.cc @@ -107,14 +107,14 @@ class HSVToRGBOp : public OpKernel { } }; -#define REGISTER_CPU(T) \ - REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - RGBToHSVOp); \ - template class RGBToHSVOp; \ - REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - HSVToRGBOp); \ +#define REGISTER_CPU(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("RGBToHSV").Device(DEVICE_CPU).TypeConstraint("T"), \ + RGBToHSVOp); \ + template class RGBToHSVOp; \ + REGISTER_KERNEL_BUILDER( \ + Name("HSVToRGB").Device(DEVICE_CPU).TypeConstraint("T"), \ + HSVToRGBOp); \ template class HSVToRGBOp; TF_CALL_float(REGISTER_CPU); TF_CALL_double(REGISTER_CPU); @@ -123,40 +123,39 @@ TF_CALL_double(REGISTER_CPU); // Forward declarations of the function specializations for GPU (to prevent // building the GPU versions here, they will be built compiling _gpu.cu.cc). namespace functor { -#define DECLARE_GPU(T) \ - template <> \ - void RGBToHSV::operator()(const GPUDevice& d, \ - TTypes::ConstTensor input_data, \ - TTypes::Tensor range, \ - TTypes::Tensor output_data); \ - extern template struct RGBToHSV; \ - template <> \ - void HSVToRGB::operator()(const GPUDevice& d, \ - TTypes::ConstTensor input_data, \ - TTypes::Tensor output_data); \ +#define DECLARE_GPU(T) \ + template <> \ + void RGBToHSV::operator()( \ + const GPUDevice& d, TTypes::ConstTensor input_data, \ + TTypes::Tensor range, TTypes::Tensor output_data); \ + extern template struct RGBToHSV; \ + template <> \ + void HSVToRGB::operator()( \ + const GPUDevice& d, TTypes::ConstTensor input_data, \ + TTypes::Tensor output_data); \ extern template struct HSVToRGB; TF_CALL_float(DECLARE_GPU); TF_CALL_double(DECLARE_GPU); } // namespace functor -#define REGISTER_GPU(T) \ - REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_GPU) \ - .TypeConstraint("T"), \ - RGBToHSVOp); \ - REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_GPU) \ - .TypeConstraint("T"), \ - HSVToRGBOp); +#define REGISTER_GPU(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("RGBToHSV").Device(DEVICE_GPU).TypeConstraint("T"), \ + RGBToHSVOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("HSVToRGB").Device(DEVICE_GPU).TypeConstraint("T"), \ + HSVToRGBOp); TF_CALL_float(REGISTER_GPU); TF_CALL_double(REGISTER_GPU); #endif #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL(T) \ - REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - RGBToHSVOp); \ - REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - HSVToRGBOp); +#define REGISTER_SYCL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("RGBToHSV").Device(DEVICE_SYCL).TypeConstraint("T"), \ + RGBToHSVOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("HSVToRGB").Device(DEVICE_SYCL).TypeConstraint("T"), \ + HSVToRGBOp); TF_CALL_float(REGISTER_SYCL); TF_CALL_double(REGISTER_SYCL); #endif diff --git a/tensorflow/core/kernels/colorspace_op.h b/tensorflow/core/kernels/colorspace_op.h index c5721ef6dd0..90bfce14194 100644 --- a/tensorflow/core/kernels/colorspace_op.h +++ b/tensorflow/core/kernels/colorspace_op.h @@ -54,10 +54,9 
@@ struct RGBToHSV { // TODO(wicke): all these assignments are only necessary because a combined // expression is larger than kernel parameter space. A custom kernel is // probably in order. - H.device(d) = (R == V).select(norm * (G - B), - (G == V).select( - norm * (B - R) + T(2) / T(6), - norm * (R - G) + T(4) / T(6))); + H.device(d) = (R == V).select( + norm * (G - B), (G == V).select(norm * (B - R) + T(2) / T(6), + norm * (R - G) + T(4) / T(6))); H.device(d) = (range > T(0)).select(H, H.constant(T(0))); H.device(d) = (H < T(0)).select(H + T(1), H); } diff --git a/tensorflow/core/kernels/colorspace_op_gpu.cu.cc b/tensorflow/core/kernels/colorspace_op_gpu.cu.cc index e19d0b14d5d..61f9ba44c46 100644 --- a/tensorflow/core/kernels/colorspace_op_gpu.cu.cc +++ b/tensorflow/core/kernels/colorspace_op_gpu.cu.cc @@ -17,8 +17,8 @@ limitations under the License. #define EIGEN_USE_GPU -#include "tensorflow/core/kernels/colorspace_op.h" #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/colorspace_op.h" namespace tensorflow { @@ -29,6 +29,6 @@ typedef Eigen::GpuDevice GPUDevice; template class functor::HSVToRGB; TF_CALL_float(INSTANTIATE_GPU); TF_CALL_double(INSTANTIATE_GPU); -} +} // namespace tensorflow #endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/colorspace_op_test.cc b/tensorflow/core/kernels/colorspace_op_test.cc index 8c6fb732abf..bd82826770f 100644 --- a/tensorflow/core/kernels/colorspace_op_test.cc +++ b/tensorflow/core/kernels/colorspace_op_test.cc @@ -224,34 +224,34 @@ class HSVToRGBOpTest : public OpsTestBase { } }; -#define TEST_COLORSPACE(test, dt) \ - TEST_F(test, CheckBlack) { \ - MakeOp(dt); \ - CheckBlack(dt); \ - } \ - TEST_F(test, CheckGray) { \ - MakeOp(dt); \ - CheckGray(dt); \ - } \ - TEST_F(test, CheckWhite) { \ - MakeOp(dt); \ - CheckWhite(dt); \ - } \ - TEST_F(test, CheckRedMax) { \ - MakeOp(dt); \ - CheckRedMax(dt); \ - } \ - TEST_F(test, CheckGreenMax) { \ - MakeOp(dt); \ - CheckGreenMax(dt); \ - } \ - TEST_F(test, CheckBlueMax) { \ - MakeOp(dt); \ - CheckBlueMax(dt); \ - } \ - TEST_F(test, CheckNegativeDifference) { \ - MakeOp(dt); \ - CheckNegativeDifference(dt); \ +#define TEST_COLORSPACE(test, dt) \ + TEST_F(test, CheckBlack) { \ + MakeOp(dt); \ + CheckBlack(dt); \ + } \ + TEST_F(test, CheckGray) { \ + MakeOp(dt); \ + CheckGray(dt); \ + } \ + TEST_F(test, CheckWhite) { \ + MakeOp(dt); \ + CheckWhite(dt); \ + } \ + TEST_F(test, CheckRedMax) { \ + MakeOp(dt); \ + CheckRedMax(dt); \ + } \ + TEST_F(test, CheckGreenMax) { \ + MakeOp(dt); \ + CheckGreenMax(dt); \ + } \ + TEST_F(test, CheckBlueMax) { \ + MakeOp(dt); \ + CheckBlueMax(dt); \ + } \ + TEST_F(test, CheckNegativeDifference) { \ + MakeOp(dt); \ + CheckNegativeDifference(dt); \ } typedef RGBToHSVOpTest rgb_to_hsv_float; diff --git a/tensorflow/core/kernels/compare_and_bitpack_op.cc b/tensorflow/core/kernels/compare_and_bitpack_op.cc index 9f626a274a4..39e4f24ed51 100644 --- a/tensorflow/core/kernels/compare_and_bitpack_op.cc +++ b/tensorflow/core/kernels/compare_and_bitpack_op.cc @@ -110,7 +110,20 @@ struct ComputeShard::ConstMatrix input, typename TTypes::Matrix output, bool /*thresh*/, int64 start, int64 limit) { - // NOTE(ebrevdo): This assumes memory is little-endian. 
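// Each of the eight input bytes holds a 0/1 comparison result, and the output
// byte packs them MSB first (element 0 lands in bit 7). The big-endian branch
// added below differs from the little-endian one only in which byte of the
// 64-bit `block` each element occupies, so every mask/shift pair moves that
// byte's low bit to the same output bit position.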
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + for (int64 i = start; i < limit; ++i) { + uint8* out = output.data() + i; + const int64 block = *reinterpret_cast(input.data() + 8 * i); + *out = + ((((block & (1LL << (7 * 8))) >> (7 * 8 - 7))) | + (((block & (1LL << (6 * 8))) >> (6 * 8 - 6))) | + (((block & (1LL << (5 * 8))) >> (5 * 8 - 5))) | + (((block & (1LL << (4 * 8))) >> (4 * 8 - 4))) | + (((block & (1LL << (3 * 8))) >> (3 * 8 - 3))) | + (((block & (1LL << (2 * 8))) >> (2 * 8 - 2))) | + (((block & (1LL << 8)) >> (1 * 8 - 1))) | (((block & (1LL))))); + } +#else for (int64 i = start; i < limit; ++i) { uint8* out = output.data() + i; const int64 block = *reinterpret_cast(input.data() + 8 * i); @@ -123,6 +136,7 @@ struct ComputeShard> (2 * 8 - 5))) | (((block & (1LL << 8)) >> (1 * 8 - 6))) | (((block & (1LL)) << 7))); } +#endif } }; diff --git a/tensorflow/core/kernels/concat_lib.h b/tensorflow/core/kernels/concat_lib.h index 526f9420d72..16784c4770e 100644 --- a/tensorflow/core/kernels/concat_lib.h +++ b/tensorflow/core/kernels/concat_lib.h @@ -41,10 +41,11 @@ namespace tensorflow { // Assumes all inputs are nonempty template -void ConcatCPU(DeviceBase* d, - const std::vector< - std::unique_ptr::ConstMatrix>>& inputs, - typename TTypes::Matrix* output); +void ConcatCPU( + DeviceBase* d, + const std::vector::ConstMatrix>>& + inputs, + typename TTypes::Matrix* output); #if GOOGLE_CUDA template void ConcatGPU( @@ -57,11 +58,12 @@ void ConcatGPU( #ifdef TENSORFLOW_USE_SYCL template -void ConcatSYCL(const Eigen::SyclDevice& d, - const std::vector< - std::unique_ptr::ConstMatrix>>& inputs, - typename TTypes::Matrix* output); -#endif // TENSORFLOW_USE_SYCL +void ConcatSYCL( + const Eigen::SyclDevice& d, + const std::vector::ConstMatrix>>& + inputs, + typename TTypes::Matrix* output); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow #endif // TENSORFLOW_KERNELS_CONCAT_LIB_H_ diff --git a/tensorflow/core/kernels/concat_lib_cpu.cc b/tensorflow/core/kernels/concat_lib_cpu.cc index 43731114c0b..547a7b40b92 100644 --- a/tensorflow/core/kernels/concat_lib_cpu.cc +++ b/tensorflow/core/kernels/concat_lib_cpu.cc @@ -48,10 +48,11 @@ struct MemCpyCopier { } // namespace template -void ConcatCPU(DeviceBase* d, - const std::vector< - std::unique_ptr::ConstMatrix>>& inputs, - typename TTypes::Matrix* output) { +void ConcatCPU( + DeviceBase* d, + const std::vector::ConstMatrix>>& + inputs, + typename TTypes::Matrix* output) { if (std::is_same::value) { // use a large cost here to force strings to be handled by separate threads ConcatCPUImpl(d, inputs, 100000, MemCpyCopier(), output); @@ -72,7 +73,6 @@ REGISTER(qint8) REGISTER(quint16) REGISTER(qint16) REGISTER(qint32) -TF_CALL_variant(REGISTER) #if defined(IS_MOBILE_PLATFORM) && !defined(SUPPORT_SELECTIVE_REGISTRATION) && \ !defined(__ANDROID_TYPES_FULL__) @@ -86,21 +86,22 @@ TF_CALL_variant(REGISTER) #ifdef TENSORFLOW_USE_SYCL template -void ConcatSYCL(const Eigen::SyclDevice& d, - const std::vector< - std::unique_ptr::ConstMatrix>>& inputs, - typename TTypes::Matrix* output) { +void ConcatSYCL( + const Eigen::SyclDevice& d, + const std::vector::ConstMatrix>>& + inputs, + typename TTypes::Matrix* output) { ConcatSYCLImpl(d, inputs, sizeof(T) /* cost_per_unit */, MemCpyCopier(), - output); + output); } -#define REGISTER_SYCL(T) \ - template void ConcatSYCL( \ - const Eigen::SyclDevice&, \ - const std::vector::ConstMatrix>>&, \ - typename TTypes::Matrix* output); +#define REGISTER_SYCL(T) \ + template void ConcatSYCL( \ + const Eigen::SyclDevice&, \ + 
const std::vector::ConstMatrix>>&, \ + typename TTypes::Matrix* output); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL) #undef REGISTER_SYCL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/concat_lib_cpu.h b/tensorflow/core/kernels/concat_lib_cpu.h index 6a933efde4b..720b5065377 100644 --- a/tensorflow/core/kernels/concat_lib_cpu.h +++ b/tensorflow/core/kernels/concat_lib_cpu.h @@ -15,9 +15,9 @@ limitations under the License. #define EIGEN_USE_THREADS -#include "tensorflow/core/kernels/concat_lib.h" #include #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/concat_lib.h" #include "tensorflow/core/util/work_sharder.h" namespace tensorflow { @@ -73,7 +73,7 @@ void ConcatCPUImpl( // Sharded mode. auto work = [&row_size, &sizes, &inputs, &output, &copier, &num_inputs]( - int64 start, int64 end) { + int64 start, int64 end) { int64 skipped_rows = start / row_size; T* out = output->data() + skipped_rows * row_size; T* out_start = output->data() + start; @@ -160,5 +160,5 @@ void ConcatSYCLImpl( } } } -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc index ae1b5da32ea..7011550f7e1 100644 --- a/tensorflow/core/kernels/concat_op.cc +++ b/tensorflow/core/kernels/concat_op.cc @@ -37,7 +37,7 @@ typedef Eigen::GpuDevice GPUDevice; #endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM }; @@ -71,8 +71,9 @@ class ConcatBaseOp : public OpKernel { const TensorShape& input_shape = values[0].shape(); int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim; - OP_REQUIRES(c, (0 <= axis && axis < input_dims) || - (allow_legacy_scalars() && concat_dim == 0), + OP_REQUIRES(c, + (0 <= axis && axis < input_dims) || + (allow_legacy_scalars() && concat_dim == 0), errors::InvalidArgument( "ConcatOp : Expected concatenating dimensions in the range " "[", @@ -97,8 +98,8 @@ class ConcatBaseOp : public OpKernel { c, in.dims() == input_dims || (input_is_scalar && in_is_scalar), errors::InvalidArgument( "ConcatOp : Ranks of all input tensors should match: shape[0] = ", - input_shape.DebugString(), " vs. shape[", i, "] = ", - in.shape().DebugString())); + input_shape.DebugString(), " vs. shape[", i, + "] = ", in.shape().DebugString())); for (int j = 0; j < input_dims; ++j) { if (j == axis) { continue; @@ -107,8 +108,8 @@ class ConcatBaseOp : public OpKernel { c, in.dim_size(j) == input_shape.dim_size(j), errors::InvalidArgument( "ConcatOp : Dimensions of inputs should match: shape[0] = ", - input_shape.DebugString(), " vs. shape[", i, "] = ", - in.shape().DebugString())); + input_shape.DebugString(), " vs. 
shape[", i, + "] = ", in.shape().DebugString())); } if (in.NumElements() > 0) { int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0; @@ -142,7 +143,7 @@ class ConcatBaseOp : public OpKernel { ConcatSYCL(c->eigen_sycl_device(), inputs_flat, &output_flat); return; } -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL ConcatCPU(c->device(), inputs_flat, &output_flat); } } @@ -252,7 +253,7 @@ REGISTER_KERNEL_BUILDER(Name("ConcatV2") ConcatV2Op); #undef REGISTER_SYCL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class ConcatOffsetOp : public OpKernel { public: @@ -347,5 +348,5 @@ REGISTER_KERNEL_BUILDER(Name("ConcatOffset") .HostMemory("shape") .HostMemory("offset"), ConcatOffsetOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/concat_op_test.cc b/tensorflow/core/kernels/concat_op_test.cc index c5bded9dafc..e3ba8ae9f69 100644 --- a/tensorflow/core/kernels/concat_op_test.cc +++ b/tensorflow/core/kernels/concat_op_test.cc @@ -157,7 +157,8 @@ BENCHMARK(BM_MemcpyAlternativeDim0)->Arg(1000)->Arg(100000)->Arg(1000000); BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000); typedef Eigen::TensorMap, - Eigen::Unaligned> EigenMap; + Eigen::Unaligned> + EigenMap; static void MemcpyManyAlternative1(int iters, int dim2) { testing::StopTiming(); diff --git a/tensorflow/core/kernels/conditional_accumulator_base.h b/tensorflow/core/kernels/conditional_accumulator_base.h index 794ac6fa6de..c7c7c983691 100644 --- a/tensorflow/core/kernels/conditional_accumulator_base.h +++ b/tensorflow/core/kernels/conditional_accumulator_base.h @@ -160,7 +160,7 @@ class ConditionalAccumulatorBase : public ResourceBase { * Modifications to convenience macros defined in core/framework/op_kernel.h. * The below macros return a boolean if the test fails, so that the calling * function can get an indication that a failure has occurred. 
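 * For example (an illustrative helper, not one defined in this file), a
 * validation routine that participates in this scheme would itself return a
 * bool and use the macro like so:
 *   bool CheckIsVector(OpKernelContext* ctx, const Tensor& t) {
 *     OP_REQUIRES_BOOLEAN(ctx, TensorShapeUtils::IsVector(t.shape()),
 *                         errors::InvalidArgument("expected a vector"));
 *     return true;  // only reached when the check above passed
 *   }
 * On failure the macro records the error status on the context and returns
 * false from the enclosing function, so the caller can stop processing
 * immediately.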
-*/ + */ #define OP_REQUIRES_BOOLEAN(CTX, EXP, STATUS) \ do { \ if (!TF_PREDICT_TRUE(EXP)) { \ diff --git a/tensorflow/core/kernels/conditional_accumulator_op.cc b/tensorflow/core/kernels/conditional_accumulator_op.cc index fa37916eaba..e13bf8a4c63 100644 --- a/tensorflow/core/kernels/conditional_accumulator_op.cc +++ b/tensorflow/core/kernels/conditional_accumulator_op.cc @@ -99,9 +99,10 @@ class AccumulatorTakeGradientOp ConditionalAccumulatorBase* accumulator, DoneCallback callback) override { // Check signature - OP_REQUIRES_OK_ASYNC(ctx, ctx->MatchSignature({DT_STRING_REF, DT_INT32}, - {accumulator->dtype()}), - callback); + OP_REQUIRES_OK_ASYNC( + ctx, + ctx->MatchSignature({DT_STRING_REF, DT_INT32}, {accumulator->dtype()}), + callback); } private: @@ -111,5 +112,4 @@ class AccumulatorTakeGradientOp REGISTER_KERNEL_BUILDER(Name("AccumulatorTakeGradient").Device(DEVICE_CPU), AccumulatorTakeGradientOp); - } // namespace tensorflow diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc index 59f9f69315e..920cd87858a 100644 --- a/tensorflow/core/kernels/constant_op.cc +++ b/tensorflow/core/kernels/constant_op.cc @@ -146,7 +146,6 @@ typedef Eigen::GpuDevice GPUDevice; typedef Eigen::SyclDevice SYCLDevice; #endif // TENSORFLOW_USE_SYCL - template class FillOp : public OpKernel { public: diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc index 8fe82d118a7..7d5d54e5bec 100644 --- a/tensorflow/core/kernels/control_flow_ops.cc +++ b/tensorflow/core/kernels/control_flow_ops.cc @@ -113,47 +113,47 @@ REGISTER_GPU_HOST_REF_KERNEL(string); #undef REGISTER_GPU_HOST_REF_KERNEL #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_SWITCH(type) \ - REGISTER_KERNEL_BUILDER(Name("Switch") \ - .Device(DEVICE_SYCL) \ - .HostMemory("pred") \ - .TypeConstraint("T"),\ +#define REGISTER_SYCL_SWITCH(type) \ + REGISTER_KERNEL_BUILDER(Name("Switch") \ + .Device(DEVICE_SYCL) \ + .HostMemory("pred") \ + .TypeConstraint("T"), \ SwitchOp) TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_SWITCH); -#define REGISTER_SYCL_REF_SWITCH(type) \ - REGISTER_KERNEL_BUILDER(Name("RefSwitch") \ - .Device(DEVICE_SYCL) \ - .HostMemory("pred") \ - .TypeConstraint("T"), \ +#define REGISTER_SYCL_REF_SWITCH(type) \ + REGISTER_KERNEL_BUILDER(Name("RefSwitch") \ + .Device(DEVICE_SYCL) \ + .HostMemory("pred") \ + .TypeConstraint("T"), \ SwitchOp) TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_SWITCH); #undef REGISTER_SYCL_SWITCH #undef REGISTER_SYCL_REF_SWITCH -#define REGISTER_SYCL_HOST_KERNEL(type) \ - REGISTER_KERNEL_BUILDER(Name("Switch") \ - .Device(DEVICE_SYCL) \ - .HostMemory("data") \ - .HostMemory("pred") \ - .HostMemory("output_false")\ - .HostMemory("output_true") \ - .TypeConstraint("T"),\ +#define REGISTER_SYCL_HOST_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Switch") \ + .Device(DEVICE_SYCL) \ + .HostMemory("data") \ + .HostMemory("pred") \ + .HostMemory("output_false") \ + .HostMemory("output_true") \ + .TypeConstraint("T"), \ SwitchOp) REGISTER_SYCL_HOST_KERNEL(bool); REGISTER_SYCL_HOST_KERNEL(string); REGISTER_SYCL_HOST_KERNEL(int32); -#define REGISTER_SYCL_HOST_REF_KERNEL(type) \ - REGISTER_KERNEL_BUILDER(Name("RefSwitch") \ - .Device(DEVICE_SYCL) \ - .HostMemory("data") \ - .HostMemory("pred") \ - .HostMemory("output_false") \ - .HostMemory("output_true") \ - .TypeConstraint("T"), \ +#define REGISTER_SYCL_HOST_REF_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("RefSwitch") \ + .Device(DEVICE_SYCL) \ + .HostMemory("data") \ + 
.HostMemory("pred") \ + .HostMemory("output_false") \ + .HostMemory("output_true") \ + .TypeConstraint("T"), \ SwitchOp) REGISTER_SYCL_HOST_REF_KERNEL(int32); @@ -162,7 +162,7 @@ REGISTER_SYCL_HOST_REF_KERNEL(string); #undef REGISTER_SYCL_HOST_KERNEL #undef REGISTER_SYCL_HOST_REF_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class RefSelectOp : public OpKernel { public: @@ -282,7 +282,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL); #undef REGISTER_SYCL_KERNEL #undef REGISTER_SYCL_REF_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Special GPU kernels for int32 and string. // TODO(b/25387198): Also enable int32 in device memory. This kernel @@ -331,7 +331,7 @@ REGISTER_SYCL_HOST_KERNEL(string); REGISTER_SYCL_HOST_KERNEL(ResourceHandle); #undef REGISTER_SYCL_HOST_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL void EnterOp::Compute(OpKernelContext* context) { if (IsRefType(context->input_dtype(0))) { @@ -360,14 +360,14 @@ REGISTER_GPU_REF_KERNEL(bool); #undef REGISTER_GPU_REF_KERNEL #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ Name("Enter").Device(DEVICE_SYCL).TypeConstraint("T"), EnterOp) REGISTER_SYCL_KERNEL(bool); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); -#define REGISTER_SYCL_REF_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ +#define REGISTER_SYCL_REF_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ Name("RefEnter").Device(DEVICE_SYCL).TypeConstraint("T"), EnterOp) REGISTER_SYCL_REF_KERNEL(bool); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL); @@ -398,7 +398,7 @@ REGISTER_SYCL_HOST_KERNEL(ResourceHandle); #undef REGISTER_SYCL_HOST_KERNEL #undef REGISTER_SYCL_HOST_REF_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Special GPU kernels for int32 and string. // TODO(b/25387198): Also enable int32 in device memory. This kernel @@ -455,10 +455,10 @@ REGISTER_GPU_REF_KERNEL(bool); #undef REGISTER_GPU_REF_KERNEL #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("Exit").Device(DEVICE_SYCL).TypeConstraint("T"), ExitOp); \ - REGISTER_KERNEL_BUILDER( \ +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Exit").Device(DEVICE_SYCL).TypeConstraint("T"), ExitOp); \ + REGISTER_KERNEL_BUILDER( \ Name("RefExit").Device(DEVICE_SYCL).TypeConstraint("T"), ExitOp); REGISTER_SYCL_KERNEL(bool); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); @@ -483,7 +483,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); REGISTER_SYCL_HOST_KERNEL(int32); REGISTER_SYCL_HOST_KERNEL(string); #undef REGISTER_SYCL_HOST_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Special GPU kernels for int32 and string. // TODO(b/25387198): Also enable int32 in device memory. 
This kernel @@ -556,12 +556,12 @@ REGISTER_GPU_HOST_KERNEL(string); #undef REGISTER_GPU_HOST_KERNEL #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint("T"), \ - NextIterationOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("RefNextIteration").Device(DEVICE_SYCL).TypeConstraint("T"),\ +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint("T"), \ + NextIterationOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("RefNextIteration").Device(DEVICE_SYCL).TypeConstraint("T"), \ NextIterationOp) REGISTER_SYCL_KERNEL(bool); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); @@ -585,7 +585,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); REGISTER_SYCL_HOST_KERNEL(int32); REGISTER_SYCL_HOST_KERNEL(string); #undef REGISTER_SYCL_HOST_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // A LoopCond op has one input and one output. The input is a boolean // scalar representing the taken branches of the "pivot" Switch that @@ -619,7 +619,7 @@ REGISTER_KERNEL_BUILDER(Name("LoopCond") .HostMemory("input") .HostMemory("output"), LoopCondOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // ControlTrigger kernels REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_CPU), @@ -631,7 +631,7 @@ REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_GPU), #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_SYCL), ControlTriggerOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // When called, abort op will abort the current process. This can be used to // abort remote PSs when needed. diff --git a/tensorflow/core/kernels/control_flow_ops_test.cc b/tensorflow/core/kernels/control_flow_ops_test.cc index affa0e8ca6b..a2f7bd40692 100644 --- a/tensorflow/core/kernels/control_flow_ops_test.cc +++ b/tensorflow/core/kernels/control_flow_ops_test.cc @@ -91,6 +91,7 @@ class KilledBySignal { public: explicit KilledBySignal(int signum) : signum_(signum) {} bool operator()(int exit_status) const { return exit_status == signum_; } + private: const int signum_; }; diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc index 985586d6262..dbddaf3dc64 100644 --- a/tensorflow/core/kernels/conv_ops.cc +++ b/tensorflow/core/kernels/conv_ops.cc @@ -688,7 +688,7 @@ void LaunchConv2DOp::operator()( static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit( // default value is in bytes despite the name of the environment variable "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB - ); + ); int device_id = stream->parent()->device_ordinal(); DataType dtype = input.dtype(); diff --git a/tensorflow/core/kernels/conv_ops_fused.cc b/tensorflow/core/kernels/conv_ops_fused.cc index 291ebf22987..1b40ad81f41 100644 --- a/tensorflow/core/kernels/conv_ops_fused.cc +++ b/tensorflow/core/kernels/conv_ops_fused.cc @@ -679,8 +679,9 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel { const int dims = resized_shape.dims(); OP_REQUIRES( - context, TensorShapeUtils::IsMatrix(paddings.shape()) && - paddings.dim_size(1) == 2, + context, + TensorShapeUtils::IsMatrix(paddings.shape()) && + paddings.dim_size(1) == 2, errors::InvalidArgument("paddings must be a matrix with 2 columns: ", paddings.shape().DebugString())); const int fixed_dims = @@ -715,20 +716,22 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel { const int32 after = paddings_matrix(d, 1); 
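      // paddings was checked above to be a matrix with 2 columns: row d holds
      // the (pad_before, pad_after) amounts for dimension d of the resized
      // input. As a purely illustrative example, paddings =
      // {{0, 0}, {1, 1}, {2, 2}, {0, 0}} on an NHWC tensor would add one row
      // of padding above and below and two columns of padding on the left and
      // right, leaving the batch and depth dimensions alone.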
// Pad after existing elements. OP_REQUIRES(context, before >= 0 && after >= 0, - errors::InvalidArgument("paddings must be non-negative: ", - before, " ", after)); + errors::InvalidArgument( + "paddings must be non-negative: ", before, " ", after)); if (offset_ == 0) { // SYMMETRIC mode. OP_REQUIRES( - context, before <= resized_shape.dim_size(d) && - after <= resized_shape.dim_size(d), + context, + before <= resized_shape.dim_size(d) && + after <= resized_shape.dim_size(d), errors::InvalidArgument("paddings must be no greater " "than the dimension size: ", before, ", ", after, " greater than ", resized_shape.dim_size(d))); } else if (offset_ == 1) { // REFLECT mode. OP_REQUIRES( - context, before < resized_shape.dim_size(d) && - after < resized_shape.dim_size(d), + context, + before < resized_shape.dim_size(d) && + after < resized_shape.dim_size(d), errors::InvalidArgument("paddings must be less than" " the dimension size: ", before, ", ", after, " not less than ", @@ -767,18 +770,19 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel { // We only check the first three dims, since the depth is accessed as an // int64 below. for (int i = 0; i < 3; i++) { - OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i), - std::numeric_limits::max()), - errors::InvalidArgument("filter too large")); + OP_REQUIRES( + context, + FastBoundsCheck(filter.dim_size(i), std::numeric_limits::max()), + errors::InvalidArgument("filter too large")); } // The last dimension for input is in_depth. It must be the same as the // filter's in_depth. const int64 in_depth = padded_shape.dim_size(3); - OP_REQUIRES( - context, in_depth == filter.dim_size(2), - errors::InvalidArgument("input and filter must have the same depth: ", - in_depth, " vs ", filter.dim_size(2))); + OP_REQUIRES(context, in_depth == filter.dim_size(2), + errors::InvalidArgument( + "input and filter must have the same depth: ", in_depth, + " vs ", filter.dim_size(2))); // The last dimension for filter is out_depth. const int out_depth = static_cast(filter.dim_size(3)); @@ -786,9 +790,10 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel { // The second dimension for input is rows/height. // The first dimension for filter is rows/height. const int64 padded_rows_raw = padded_shape.dim_size(1); - OP_REQUIRES(context, FastBoundsCheck(padded_rows_raw, - std::numeric_limits::max()), - errors::InvalidArgument("Input rows too large")); + OP_REQUIRES( + context, + FastBoundsCheck(padded_rows_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input rows too large")); const int padded_rows = static_cast(padded_rows_raw); const int filter_rows = static_cast(filter.dim_size(0)); const int resized_rows = static_cast(resized_shape.dim_size(1)); @@ -796,9 +801,10 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel { // The third dimension for input is columns/width. // The second dimension for filter is columns/width. 
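    // Taken together, these dimension comments and checks assume an NHWC
    // input layout, padded_shape = [batch, rows, cols, in_depth], and an HWIO
    // filter layout, filter = [filter_rows, filter_cols, in_depth, out_depth].
    // For instance (illustrative numbers only), a 3x3 convolution over RGB
    // input producing 16 output channels would use a [3, 3, 3, 16] filter.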
const int64 padded_cols_raw = padded_shape.dim_size(2); - OP_REQUIRES(context, FastBoundsCheck(padded_cols_raw, - std::numeric_limits::max()), - errors::InvalidArgument("Input cols too large")); + OP_REQUIRES( + context, + FastBoundsCheck(padded_cols_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input cols too large")); const int padded_cols = static_cast(padded_cols_raw); const int filter_cols = static_cast(filter.dim_size(1)); const int resized_cols = static_cast(resized_shape.dim_size(2)); @@ -864,24 +870,26 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel { TF_DISALLOW_COPY_AND_ASSIGN(FusedResizeConv2DUsingGemmOp); }; -#define REGISTER_FUSED(T) \ - REGISTER_KERNEL_BUILDER( \ - Name("FusedResizeAndPadConv2D") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - FusedResizeConv2DUsingGemmOp< \ - T, FusedResizeAndPadConvFunctor, \ - BILINEAR>, \ +#define REGISTER_FUSED(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("FusedResizeAndPadConv2D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T"), \ + FusedResizeConv2DUsingGemmOp< \ + T, \ + FusedResizeAndPadConvFunctor, \ + BILINEAR>, \ true>); TF_CALL_float(REGISTER_FUSED); -#define REGISTER_PAD_ONLY_FUSED(T) \ - REGISTER_KERNEL_BUILDER( \ - Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint("T"), \ - FusedResizeConv2DUsingGemmOp< \ - T, FusedResizeAndPadConvFunctor, \ - NEAREST>, \ +#define REGISTER_PAD_ONLY_FUSED(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint("T"), \ + FusedResizeConv2DUsingGemmOp< \ + T, \ + FusedResizeAndPadConvFunctor, \ + NEAREST>, \ false>); TF_CALL_float(REGISTER_PAD_ONLY_FUSED); diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h index 57e196c67cf..f0085be3a53 100644 --- a/tensorflow/core/kernels/conv_ops_gpu.h +++ b/tensorflow/core/kernels/conv_ops_gpu.h @@ -27,7 +27,6 @@ limitations under the License. namespace tensorflow { - // Get the Cudnn workspace limit from the environment variable, which is in MB. // Return the workspace memory limit in bytes. If no value is set, return the // default value. diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc index af6013c9747..a376534badc 100644 --- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc +++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc @@ -25,9 +25,9 @@ limitations under the License. 
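// A rough sketch of the environment handling that the GetCudnnWorkspaceLimit
// comment above describes (illustrative only, not the actual implementation;
// variable names here are made up): the environment variable is specified in
// megabytes and converted to bytes, while the compiled-in default
// (1LL << 32 in conv_ops.cc above) is already expressed in bytes, which is
// why the comment there calls out the mismatch with the "_IN_MB" suffix.
//   int64 limit_in_bytes = default_value_in_bytes;
//   const char* limit_str = getenv("TF_CUDNN_WORKSPACE_LIMIT_IN_MB");
//   if (limit_str != nullptr) {
//     int64 limit_in_mb = -1;
//     if (strings::safe_strto64(limit_str, &limit_in_mb)) {
//       limit_in_bytes = limit_in_mb * (1LL << 20);
//     }
//   }
//   return limit_in_bytes;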
#include "cuda/include/cuda.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/kernels/conv_2d.h" +#include "tensorflow/core/lib/math/math_util.h" #include "tensorflow/core/util/cuda_kernel_helper.h" #include "tensorflow/core/util/tensor_format.h" -#include "tensorflow/core/lib/math/math_util.h" namespace tensorflow { @@ -252,11 +252,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles( int x = threadIdx.x; Dimension<3> output_dims = { - input_dims[0], input_dims[2], input_dims[1], + input_dims[0], + input_dims[2], + input_dims[1], }; Dimension<3> input_dims_in_tiles = { - input_dims[0], (input_dims[1] + TileSizeI - 1) / TileSizeI, + input_dims[0], + (input_dims[1] + TileSizeI - 1) / TileSizeI, (input_dims[2] + TileSizeJ - 1) / TileSizeJ, }; @@ -264,7 +267,8 @@ __global__ void SwapDimension1And2InTensor3UsingTiles( FlatToTensorIndex(blockIdx.x, input_dims_in_tiles); Index<3> input_tile_origin = { - input_tile_index[0], input_tile_index[1] * TileSizeI, + input_tile_index[0], + input_tile_index[1] * TileSizeI, input_tile_index[2] * TileSizeJ, }; @@ -322,11 +326,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles( __syncthreads(); Index<3> output_tile_index = { - input_tile_index[0], input_tile_index[2], input_tile_index[1], + input_tile_index[0], + input_tile_index[2], + input_tile_index[1], }; Index<3> output_tile_origin = { - output_tile_index[0], output_tile_index[1] * TileSizeJ, + output_tile_index[0], + output_tile_index[1] * TileSizeJ, output_tile_index[2] * TileSizeI, }; @@ -641,8 +648,9 @@ struct BatchNarrowMatrixTransposeDispatcher { static_assert( (TileLongSide & (TileLongSide - 1)) == 0, "The length of the longer side of the tile is always a power of 2."); - bool request_satisfied = max(tile_size_i, tile_size_j) <= TileLongSide && - min(tile_size_i, tile_size_j) <= TileShortSide; + bool request_satisfied = + std::max(tile_size_i, tile_size_j) <= TileLongSide && + std::min(tile_size_i, tile_size_j) <= TileShortSide; if (request_satisfied) { LaunchBatchNarrowMatrixTransposeKernel( @@ -655,7 +663,7 @@ struct BatchNarrowMatrixTransposeDispatcher { // determine whether it is the long side or the short side that falls short // of the request and increase that parameter accordingly. const bool long_side_request_not_satisfied = - max(tile_size_i, tile_size_j) > TileLongSide; + std::max(tile_size_i, tile_size_j) > TileLongSide; if (long_side_request_not_satisfied) { BatchNarrowMatrixTransposeDispatcher< @@ -683,8 +691,9 @@ struct BatchNarrowMatrixTransposeDispatcher< static_assert( (TileLongSide & (TileLongSide - 1)) == 0, "The length of the longer side of the tile is always a power of 2."); - bool request_satisfied = max(tile_size_i, tile_size_j) <= TileLongSide && - min(tile_size_i, tile_size_j) <= TileShortSide; + bool request_satisfied = + std::max(tile_size_i, tile_size_j) <= TileLongSide && + std::min(tile_size_i, tile_size_j) <= TileShortSide; if (request_satisfied) { LaunchBatchNarrowMatrixTransposeKernel( @@ -799,7 +808,7 @@ struct TransposeElemType<16> { // A helper function to make RunSwapDimension1And2InTensor3 concise. This // helper function looks at the data type and input matrix sizes and decides // the thread numbers and tile sizes to use. 
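// Both spellings of the tile-count computation in this file, the explicit
// (n + TileSize - 1) / TileSize form above and MathUtil::CeilOfRatio(n,
// kTileSize) below, are integer ceiling division: the number of fixed-size
// tiles needed to cover n elements. As a quick illustrative check, covering a
// dimension of 70 elements with 32-wide tiles needs (70 + 31) / 32 = 3 tiles,
// the last of which is only partially filled.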
-template +template void SwapDimension1And2InTensor3WithNarrowMatrices( const GPUDevice& d, const T* input, const Dimension<3>& input_dims, T* output, const int kMinDimensionToUseTiles) { @@ -809,7 +818,7 @@ void SwapDimension1And2InTensor3WithNarrowMatrices( int tile_long_side_len = 0; int tile_short_side_len = 0; float lowest_cost = std::numeric_limits::max(); - int data_long_side = max(input_dims[1], input_dims[2]); + int data_long_side = std::max(input_dims[1], input_dims[2]); for (auto tile_size_pair : tile_spec) { int proposed_tile_long_side_len = tile_size_pair.first; @@ -854,12 +863,14 @@ void SwapDimension1And2InTensor3WithNarrowMatrices( // Truncate the shorter size requested according to the manual limit set in // tile_spec to make sure that we do not launch configurations violating // hardware limits. - requested_tile_size_i = requested_tile_size_i == tile_long_side_len - ? tile_long_side_len - : min(requested_tile_size_i, tile_short_side_len); - requested_tile_size_j = requested_tile_size_j == tile_long_side_len - ? tile_long_side_len - : min(requested_tile_size_j, tile_short_side_len); + requested_tile_size_i = + requested_tile_size_i == tile_long_side_len + ? tile_long_side_len + : std::min(requested_tile_size_i, tile_short_side_len); + requested_tile_size_j = + requested_tile_size_j == tile_long_side_len + ? tile_long_side_len + : std::min(requested_tile_size_j, tile_short_side_len); Dimension<3> input_dims_in_tiles = { input_dims[0], @@ -902,19 +913,21 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input, constexpr int kNumThreads = 256; Dimension<3> input_dims_in_tiles = { - input_dims[0], MathUtil::CeilOfRatio(input_dims[1], kTileSize), + input_dims[0], + MathUtil::CeilOfRatio(input_dims[1], kTileSize), MathUtil::CeilOfRatio(input_dims[2], kTileSize), }; int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] * input_dims_in_tiles[2]; - SwapDimension1And2InTensor3UsingTiles + SwapDimension1And2InTensor3UsingTiles <<>>(input, input_dims, output); } else if (narrow_matrix) { - SwapDimension1And2InTensor3WithNarrowMatrices(d, input, input_dims, output, - kMinDimensionToUseTiles); + SwapDimension1And2InTensor3WithNarrowMatrices( + d, input, input_dims, output, kMinDimensionToUseTiles); } else { int total_element_count = input_dims[0] * input_dims[1] * input_dims[2]; CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d); diff --git a/tensorflow/core/kernels/conv_ops_using_gemm.cc b/tensorflow/core/kernels/conv_ops_using_gemm.cc index 20da77c36f6..af0a9fa82ee 100644 --- a/tensorflow/core/kernels/conv_ops_using_gemm.cc +++ b/tensorflow/core/kernels/conv_ops_using_gemm.cc @@ -468,18 +468,19 @@ class Conv2DUsingGemmOp : public BinaryOp { filter.shape().DebugString())); for (int i = 0; i < 3; i++) { - OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i), - std::numeric_limits::max()), - errors::InvalidArgument("filter too large")); + OP_REQUIRES( + context, + FastBoundsCheck(filter.dim_size(i), std::numeric_limits::max()), + errors::InvalidArgument("filter too large")); } // The last dimension for input is in_depth. It must be the same as the // filter's in_depth. 
const int64 in_depth = GetTensorDim(input, data_format_, 'C'); - OP_REQUIRES( - context, in_depth == filter.dim_size(2), - errors::InvalidArgument("input and filter must have the same depth: ", - in_depth, " vs ", filter.dim_size(2))); + OP_REQUIRES(context, in_depth == filter.dim_size(2), + errors::InvalidArgument( + "input and filter must have the same depth: ", in_depth, + " vs ", filter.dim_size(2))); // The last dimension for filter is out_depth. const int out_depth = static_cast(filter.dim_size(3)); @@ -487,18 +488,20 @@ class Conv2DUsingGemmOp : public BinaryOp { // The second dimension for input is rows/height. // The first dimension for filter is rows/height. const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H'); - OP_REQUIRES(context, FastBoundsCheck(input_rows_raw, - std::numeric_limits::max()), - errors::InvalidArgument("Input rows too large")); + OP_REQUIRES( + context, + FastBoundsCheck(input_rows_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input rows too large")); const int input_rows = static_cast(input_rows_raw); const int filter_rows = static_cast(filter.dim_size(0)); // The third dimension for input is columns/width. // The second dimension for filter is columns/width. const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W'); - OP_REQUIRES(context, FastBoundsCheck(input_cols_raw, - std::numeric_limits::max()), - errors::InvalidArgument("Input cols too large")); + OP_REQUIRES( + context, + FastBoundsCheck(input_cols_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input cols too large")); const int input_cols = static_cast(input_cols_raw); const int filter_cols = static_cast(filter.dim_size(1)); diff --git a/tensorflow/core/kernels/cross_op_gpu.cu.cc b/tensorflow/core/kernels/cross_op_gpu.cu.cc index 7ea0b3be0ca..4a37f6cfbbc 100644 --- a/tensorflow/core/kernels/cross_op_gpu.cu.cc +++ b/tensorflow/core/kernels/cross_op_gpu.cu.cc @@ -17,8 +17,8 @@ limitations under the License. #define EIGEN_USE_GPU -#include "tensorflow/core/kernels/cross_op.h" #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/cross_op.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/ctc_decoder_ops.cc b/tensorflow/core/kernels/ctc_decoder_ops.cc index 73ee3106048..96bdb6a241b 100644 --- a/tensorflow/core/kernels/ctc_decoder_ops.cc +++ b/tensorflow/core/kernels/ctc_decoder_ops.cc @@ -19,13 +19,13 @@ limitations under the License. #include -#include "tensorflow/core/util/ctc/ctc_beam_search.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/util/ctc/ctc_beam_search.h" #include "tensorflow/core/util/sparse/sparse_tensor.h" namespace tensorflow { @@ -80,16 +80,17 @@ class CTCDecodeHelper { if (!(batch_size == (*seq_len)->dim_size(0))) { return errors::FailedPrecondition( - "len(sequence_length) != batch_size. ", "len(sequence_length): ", - (*seq_len)->dim_size(0), " batch_size: ", batch_size); + "len(sequence_length) != batch_size. 
", + "len(sequence_length): ", (*seq_len)->dim_size(0), + " batch_size: ", batch_size); } auto seq_len_t = (*seq_len)->vec(); for (int b = 0; b < batch_size; ++b) { if (!(seq_len_t(b) <= max_time)) { - return errors::FailedPrecondition("sequence_length(", b, ") <= ", - max_time); + return errors::FailedPrecondition("sequence_length(", b, + ") <= ", max_time); } } diff --git a/tensorflow/core/kernels/ctc_loss_op.cc b/tensorflow/core/kernels/ctc_loss_op.cc index fb03adb7a53..b38d838bf1e 100644 --- a/tensorflow/core/kernels/ctc_loss_op.cc +++ b/tensorflow/core/kernels/ctc_loss_op.cc @@ -113,8 +113,8 @@ class CTCLossOp : public OpKernel { const int64 batch_indices = g.group()[0]; OP_REQUIRES(ctx, FastBoundsCheck(batch_indices, batch_size), errors::InvalidArgument("labels batch index must be between ", - 0, " and ", batch_size, " but saw: ", - batch_indices)); + 0, " and ", batch_size, + " but saw: ", batch_indices)); auto values = g.values(); std::vector* b_values = &labels_t[batch_indices]; diff --git a/tensorflow/core/kernels/cwise_op_abs.cc b/tensorflow/core/kernels/cwise_op_abs.cc index 5fd38d9dc25..1466f24202f 100644 --- a/tensorflow/core/kernels/cwise_op_abs.cc +++ b/tensorflow/core/kernels/cwise_op_abs.cc @@ -45,5 +45,5 @@ REGISTER_KERNEL_BUILDER(Name("Abs") .HostMemory("y") .TypeConstraint("T"), UnaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_acos.cc b/tensorflow/core/kernels/cwise_op_acos.cc index 12cc6c8bdd4..49191226074 100644 --- a/tensorflow/core/kernels/cwise_op_acos.cc +++ b/tensorflow/core/kernels/cwise_op_acos.cc @@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Acos", functor::acos, float, double); #if TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Acos", functor::acos, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_acosh.cc b/tensorflow/core/kernels/cwise_op_acosh.cc index 39c88140733..c2b355ab7f4 100644 --- a/tensorflow/core/kernels/cwise_op_acosh.cc +++ b/tensorflow/core/kernels/cwise_op_acosh.cc @@ -17,12 +17,12 @@ limitations under the License. 
#include "tensorflow/core/kernels/cwise_ops_gradients.h" namespace tensorflow { -REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double, - complex64, complex128); +REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double, complex64, + complex128); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Acosh", functor::acosh, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA REGISTER2(UnaryOp, GPU, "Acosh", functor::acosh, float, double); diff --git a/tensorflow/core/kernels/cwise_op_add_1.cc b/tensorflow/core/kernels/cwise_op_add_1.cc index 608a6dce3d2..bf32c8a54b3 100644 --- a/tensorflow/core/kernels/cwise_op_add_1.cc +++ b/tensorflow/core/kernels/cwise_op_add_1.cc @@ -44,7 +44,6 @@ REGISTER_KERNEL_BUILDER(Name("AddV2") BinaryOp>); #endif - #if TENSORFLOW_USE_SYCL #define REGISTER_KERNEL(type) \ REGISTER(BinaryOp, SYCL, "Add", functor::add, type); \ @@ -66,5 +65,5 @@ REGISTER_KERNEL_BUILDER(Name("AddV2") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_add_2.cc b/tensorflow/core/kernels/cwise_op_add_2.cc index ac21ca06c92..e8acbac2853 100644 --- a/tensorflow/core/kernels/cwise_op_add_2.cc +++ b/tensorflow/core/kernels/cwise_op_add_2.cc @@ -22,8 +22,8 @@ namespace tensorflow { // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__. #if !defined(__ANDROID_TYPES_SLIM__) -REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64, - uint8, complex128, string); +REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64, uint8, + complex128, string); // Notice: String is excluded to allow marking AddV2 is_commutative and // is_aggregate. REGISTER5(BinaryOp, CPU, "AddV2", functor::add, int8, int16, complex64, uint8, diff --git a/tensorflow/core/kernels/cwise_op_asin.cc b/tensorflow/core/kernels/cwise_op_asin.cc index c28e27d95ae..fe8dfea1173 100644 --- a/tensorflow/core/kernels/cwise_op_asin.cc +++ b/tensorflow/core/kernels/cwise_op_asin.cc @@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Asin", functor::asin, float, double); #if TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Asin", functor::asin, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_asinh.cc b/tensorflow/core/kernels/cwise_op_asinh.cc index 0aec6aac344..7cf0405f524 100644 --- a/tensorflow/core/kernels/cwise_op_asinh.cc +++ b/tensorflow/core/kernels/cwise_op_asinh.cc @@ -1,10 +1,10 @@ - /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 +http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -17,8 +17,8 @@ limitations under the License. 
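// The REGISTERn(...) helpers used throughout these cwise kernels are
// shorthand for registering the same functor once per listed element type.
// Conceptually (a sketch of the expansion, not its literal text),
//   REGISTER2(UnaryOp, SYCL, "Asinh", functor::asinh, float, double);
// behaves like two REGISTER_KERNEL_BUILDER calls, one constrained to
// TypeConstraint<float>("T") and one to TypeConstraint<double>("T"), both
// binding the "Asinh" op on DEVICE_SYCL to the corresponding
// UnaryOp<SYCLDevice, functor::asinh<T>> kernel.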
#include "tensorflow/core/kernels/cwise_ops_gradients.h" namespace tensorflow { -REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double, - complex64, complex128); +REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double, complex64, + complex128); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Asinh", functor::asinh, float, double); diff --git a/tensorflow/core/kernels/cwise_op_atan.cc b/tensorflow/core/kernels/cwise_op_atan.cc index 7d73de48102..09f0448874f 100644 --- a/tensorflow/core/kernels/cwise_op_atan.cc +++ b/tensorflow/core/kernels/cwise_op_atan.cc @@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Atan", functor::atan, float, double); #if TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Atan", functor::atan, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_atanh.cc b/tensorflow/core/kernels/cwise_op_atanh.cc index 7b688db4c58..6170683fa64 100644 --- a/tensorflow/core/kernels/cwise_op_atanh.cc +++ b/tensorflow/core/kernels/cwise_op_atanh.cc @@ -17,8 +17,8 @@ limitations under the License. #include "tensorflow/core/kernels/cwise_ops_gradients.h" namespace tensorflow { -REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double, - complex64, complex128); +REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double, complex64, + complex128); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Atanh", functor::atanh, float, double); diff --git a/tensorflow/core/kernels/cwise_op_ceil.cc b/tensorflow/core/kernels/cwise_op_ceil.cc index 0111e9d5fd1..816eadc80eb 100644 --- a/tensorflow/core/kernels/cwise_op_ceil.cc +++ b/tensorflow/core/kernels/cwise_op_ceil.cc @@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "Ceil", functor::ceil, float, Eigen::half, double); #if TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Ceil", functor::ceil, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_cos.cc b/tensorflow/core/kernels/cwise_op_cos.cc index d4b3b0e3935..71ad0ff0dc2 100644 --- a/tensorflow/core/kernels/cwise_op_cos.cc +++ b/tensorflow/core/kernels/cwise_op_cos.cc @@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Cos", functor::cos, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Cos", functor::cos, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_cosh.cc b/tensorflow/core/kernels/cwise_op_cosh.cc index bca99a4f897..31b4bb3cadd 100644 --- a/tensorflow/core/kernels/cwise_op_cosh.cc +++ b/tensorflow/core/kernels/cwise_op_cosh.cc @@ -16,20 +16,18 @@ limitations under the License. 
#include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double, - complex64, complex128); +REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double, complex64, + complex128); #if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Cosh") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); +#define REGISTER_SYCL_KERNEL(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("Cosh").Device(DEVICE_SYCL).TypeConstraint("T"), \ + UnaryOp>); REGISTER_SYCL_KERNEL(float); REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA REGISTER2(UnaryOp, GPU, "Cosh", functor::cosh, float, double); diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc index d44c1bf473e..c71c756e446 100644 --- a/tensorflow/core/kernels/cwise_op_div.cc +++ b/tensorflow/core/kernels/cwise_op_div.cc @@ -54,5 +54,5 @@ REGISTER_KERNEL_BUILDER(Name("Div") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_exp.cc b/tensorflow/core/kernels/cwise_op_exp.cc index 66d7b7d22eb..8f4ac98016c 100644 --- a/tensorflow/core/kernels/cwise_op_exp.cc +++ b/tensorflow/core/kernels/cwise_op_exp.cc @@ -26,5 +26,5 @@ REGISTER5(UnaryOp, GPU, "Exp", functor::exp, float, Eigen::half, double, #if TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Exp", functor::exp, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_expm1.cc b/tensorflow/core/kernels/cwise_op_expm1.cc index 4f723080060..ce03ad5de62 100644 --- a/tensorflow/core/kernels/cwise_op_expm1.cc +++ b/tensorflow/core/kernels/cwise_op_expm1.cc @@ -23,5 +23,5 @@ REGISTER3(UnaryOp, GPU, "Expm1", functor::expm1, float, Eigen::half, double); #endif #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Expm1", functor::expm1, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_floor.cc b/tensorflow/core/kernels/cwise_op_floor.cc index 5a142b9ce9f..d554d41c412 100644 --- a/tensorflow/core/kernels/cwise_op_floor.cc +++ b/tensorflow/core/kernels/cwise_op_floor.cc @@ -23,5 +23,5 @@ REGISTER3(UnaryOp, GPU, "Floor", functor::floor, float, Eigen::half, double); #endif #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Floor", functor::floor, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_floor_div.cc b/tensorflow/core/kernels/cwise_op_floor_div.cc index fa81ef0872d..fecbf859897 100644 --- a/tensorflow/core/kernels/cwise_op_floor_div.cc +++ b/tensorflow/core/kernels/cwise_op_floor_div.cc @@ -49,5 +49,5 @@ REGISTER_KERNEL_BUILDER(Name("FloorDiv") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_floor_mod.cc b/tensorflow/core/kernels/cwise_op_floor_mod.cc index 55f8a30461f..29340b88506 100644 --- a/tensorflow/core/kernels/cwise_op_floor_mod.cc +++ b/tensorflow/core/kernels/cwise_op_floor_mod.cc @@ -40,5 +40,5 @@ REGISTER_KERNEL_BUILDER(Name("FloorMod") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // 
TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc index e7dff5d0ac5..77723b3169f 100644 --- a/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc +++ b/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc @@ -19,8 +19,8 @@ limitations under the License. namespace tensorflow { namespace functor { - DEFINE_UNARY1(conj, complex64); - DEFINE_UNARY1(conj, complex128); +DEFINE_UNARY1(conj, complex64); +DEFINE_UNARY1(conj, complex128); } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc index 3675398126f..26748ef0e72 100644 --- a/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc +++ b/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc @@ -20,7 +20,7 @@ limitations under the License. namespace tensorflow { namespace functor { DEFINE_BINARY10(equal_to, float, Eigen::half, double, uint8, int8, int16, int64, - complex64, complex128, bool); + complex64, complex128, bool); DEFINE_APPROXIMATE_EQUAL2(float, double); } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc index a54dbdfc247..627ecc8c802 100644 --- a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc +++ b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc @@ -15,8 +15,10 @@ limitations under the License. #if GOOGLE_CUDA -#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" +#define EIGEN_USE_GPU + #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" namespace tensorflow { namespace functor { @@ -38,19 +40,17 @@ struct SelectScalarFunctor { typename TTypes::ConstScalar cond, typename TTypes::ConstFlat then_flat, typename TTypes::ConstFlat else_flat) { - #if !defined(EIGEN_HAS_INDEX_LIST) - Eigen::array rank1{1}; + Eigen::array rank1{1}; #else - Eigen::IndexList> rank1; + Eigen::IndexList > rank1; #endif - const int size = then_flat.dimension(0); - Eigen::array broadcast_dims{size}; - - To32Bit(out).device(d) = cond.reshape(rank1) - .broadcast(broadcast_dims) - .select(then_flat, else_flat); + const int size = then_flat.dimension(0); + Eigen::array broadcast_dims{size}; + To32Bit(out).device(d) = cond.reshape(rank1) + .broadcast(broadcast_dims) + .select(then_flat, else_flat); } }; @@ -89,8 +89,8 @@ struct BatchSelectFunctor { } }; -#define SELECT_FUNCTOR(T) \ - template struct SelectFunctor; \ +#define SELECT_FUNCTOR(T) \ + template struct SelectFunctor; \ template struct SelectScalarFunctor; \ template struct BatchSelectFunctor; diff --git a/tensorflow/core/kernels/cwise_op_greater.cc b/tensorflow/core/kernels/cwise_op_greater.cc index ba89899fb32..a4ea4088369 100644 --- a/tensorflow/core/kernels/cwise_op_greater.cc +++ b/tensorflow/core/kernels/cwise_op_greater.cc @@ -43,5 +43,5 @@ REGISTER_KERNEL_BUILDER(Name("Greater") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_greater_equal.cc b/tensorflow/core/kernels/cwise_op_greater_equal.cc index 8f0c483aecd..3f34d6269ef 100644 --- a/tensorflow/core/kernels/cwise_op_greater_equal.cc +++ b/tensorflow/core/kernels/cwise_op_greater_equal.cc @@ -35,7 +35,8 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual") #endif #ifdef TENSORFLOW_USE_SYCL 
-REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float, double); +REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float, + double); REGISTER_KERNEL_BUILDER(Name("GreaterEqual") .Device(DEVICE_SYCL) @@ -44,5 +45,5 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_invert.cc b/tensorflow/core/kernels/cwise_op_invert.cc index df2c02e42e1..f5cafcc7809 100644 --- a/tensorflow/core/kernels/cwise_op_invert.cc +++ b/tensorflow/core/kernels/cwise_op_invert.cc @@ -21,7 +21,7 @@ REGISTER6(UnaryOp, CPU, "Invert", functor::invert, int8, int16, int32, int64, #ifdef TENSORFLOW_USE_SYCL REGISTER6(UnaryOp, SYCL, "Invert", functor::invert, int8, int16, int32, int64, - uint8, uint16); + uint8, uint16); #endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_isfinite.cc b/tensorflow/core/kernels/cwise_op_isfinite.cc index 53ec1c1c63f..ae1e590d242 100644 --- a/tensorflow/core/kernels/cwise_op_isfinite.cc +++ b/tensorflow/core/kernels/cwise_op_isfinite.cc @@ -26,5 +26,5 @@ REGISTER3(UnaryOp, GPU, "IsFinite", functor::isfinite, float, Eigen::half, #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "IsFinite", functor::isfinite, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_isinf.cc b/tensorflow/core/kernels/cwise_op_isinf.cc index 4b34744304f..f22ca21e1ca 100644 --- a/tensorflow/core/kernels/cwise_op_isinf.cc +++ b/tensorflow/core/kernels/cwise_op_isinf.cc @@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "IsInf", functor::isinf, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "IsInf", functor::isinf, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_isnan.cc b/tensorflow/core/kernels/cwise_op_isnan.cc index ad2dd3f722c..aa180c247e7 100644 --- a/tensorflow/core/kernels/cwise_op_isnan.cc +++ b/tensorflow/core/kernels/cwise_op_isnan.cc @@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "IsNan", functor::isnan, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "IsNan", functor::isnan, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_less.cc b/tensorflow/core/kernels/cwise_op_less.cc index 136c3666dfc..00cdecdbd18 100644 --- a/tensorflow/core/kernels/cwise_op_less.cc +++ b/tensorflow/core/kernels/cwise_op_less.cc @@ -42,5 +42,5 @@ REGISTER_KERNEL_BUILDER(Name("Less") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_less_equal.cc b/tensorflow/core/kernels/cwise_op_less_equal.cc index 97a2508d129..11806c5fc77 100644 --- a/tensorflow/core/kernels/cwise_op_less_equal.cc +++ b/tensorflow/core/kernels/cwise_op_less_equal.cc @@ -44,5 +44,5 @@ REGISTER_KERNEL_BUILDER(Name("LessEqual") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_log.cc b/tensorflow/core/kernels/cwise_op_log.cc index 7fdfdff0e38..98936e0f960 100644 --- 
a/tensorflow/core/kernels/cwise_op_log.cc +++ b/tensorflow/core/kernels/cwise_op_log.cc @@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Log", functor::log, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Log", functor::log, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_log1p.cc b/tensorflow/core/kernels/cwise_op_log1p.cc index 25ad7b24bb1..162ca9e07cd 100644 --- a/tensorflow/core/kernels/cwise_op_log1p.cc +++ b/tensorflow/core/kernels/cwise_op_log1p.cc @@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Log1p", functor::log1p, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Log1p", functor::log1p, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_maximum.cc b/tensorflow/core/kernels/cwise_op_maximum.cc index 87d54e380b4..8c54f22f108 100644 --- a/tensorflow/core/kernels/cwise_op_maximum.cc +++ b/tensorflow/core/kernels/cwise_op_maximum.cc @@ -43,5 +43,5 @@ REGISTER_KERNEL_BUILDER(Name("Maximum") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_minimum.cc b/tensorflow/core/kernels/cwise_op_minimum.cc index 442171193bf..dff83df828f 100644 --- a/tensorflow/core/kernels/cwise_op_minimum.cc +++ b/tensorflow/core/kernels/cwise_op_minimum.cc @@ -43,6 +43,6 @@ REGISTER_KERNEL_BUILDER(Name("Minimum") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_mul_1.cc b/tensorflow/core/kernels/cwise_op_mul_1.cc index 023eb07ca3f..0e8d2e37350 100644 --- a/tensorflow/core/kernels/cwise_op_mul_1.cc +++ b/tensorflow/core/kernels/cwise_op_mul_1.cc @@ -17,8 +17,8 @@ limitations under the License. namespace tensorflow { -REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, - uint8, int32); +REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, uint8, + int32); #if defined(__ANDROID_TYPES_SLIM__) // We only register the first type when we have multi-argument calls in the // case where we're trying to reduce executable size, but it turns out that the @@ -28,7 +28,7 @@ REGISTER(BinaryOp, CPU, "Mul", functor::mul, int32); #if GOOGLE_CUDA REGISTER4(BinaryOp, GPU, "Mul", functor::mul, float, Eigen::half, double, - uint8); + uint8); // A special GPU kernel for int32. // TODO(b/25387198): Also enable int32 in device memory. This kernel // registration requires all int32 inputs and outputs to be in host memory. @@ -50,5 +50,5 @@ REGISTER_KERNEL_BUILDER(Name("Mul") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_mul_2.cc b/tensorflow/core/kernels/cwise_op_mul_2.cc index 7be5857cc06..6aa8f883640 100644 --- a/tensorflow/core/kernels/cwise_op_mul_2.cc +++ b/tensorflow/core/kernels/cwise_op_mul_2.cc @@ -22,11 +22,11 @@ namespace tensorflow { // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__. 
#if !defined(__ANDROID_TYPES_SLIM__) -REGISTER6(BinaryOp, CPU, "Mul", functor::mul, - int8, uint16, int16, int64, complex64, complex128); +REGISTER6(BinaryOp, CPU, "Mul", functor::mul, int8, uint16, int16, int64, + complex64, complex128); #if GOOGLE_CUDA REGISTER6(BinaryOp, GPU, "Mul", functor::mul, int8, uint16, int16, int64, - complex64, complex128); + complex64, complex128); #endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_neg.cc b/tensorflow/core/kernels/cwise_op_neg.cc index 536891b548f..a136769b912 100644 --- a/tensorflow/core/kernels/cwise_op_neg.cc +++ b/tensorflow/core/kernels/cwise_op_neg.cc @@ -27,7 +27,7 @@ REGISTER_KERNEL_BUILDER(Name("Neg") .HostMemory("y") .TypeConstraint("T"), UnaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA REGISTER6(UnaryOp, GPU, "Neg", functor::neg, float, Eigen::half, double, int64, diff --git a/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc b/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc index 7bd81ee1271..02cd2987457 100644 --- a/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc +++ b/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc @@ -17,7 +17,7 @@ limitations under the License. namespace tensorflow { REGISTER6(BinaryOp, CPU, "NotEqual", functor::not_equal_to, float, Eigen::half, - double, uint8, int8, int16); + double, uint8, int8, int16); #if GOOGLE_CUDA REGISTER4(BinaryOp, GPU, "NotEqual", functor::not_equal_to, float, Eigen::half, double, uint8); diff --git a/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc b/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc index 7d4ecec59f1..05bdea66367 100644 --- a/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc +++ b/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc @@ -30,5 +30,5 @@ REGISTER6(BinaryOp, GPU, "NotEqual", functor::not_equal_to, int8, int16, int64, #endif // GOOGLE_CUDA -#endif // !defined(__ANDROID_TYPES_SLIM__) +#endif // !defined(__ANDROID_TYPES_SLIM__) } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_reciprocal.cc b/tensorflow/core/kernels/cwise_op_reciprocal.cc index 8c0e21f9cf3..aee25747b86 100644 --- a/tensorflow/core/kernels/cwise_op_reciprocal.cc +++ b/tensorflow/core/kernels/cwise_op_reciprocal.cc @@ -38,7 +38,7 @@ REGISTER4(UnaryOp, GPU, "Reciprocal", functor::inverse, float, Eigen::half, #endif #ifdef TENSORFLOW_USE_SYCL REGISTER(UnaryOp, SYCL, "Reciprocal", functor::inverse, float); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL REGISTER5(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float, Eigen::half, double, complex64, complex128); @@ -48,5 +48,5 @@ REGISTER3(SimpleBinaryOp, GPU, "ReciprocalGrad", functor::inverse_grad, float, #endif #ifdef TENSORFLOW_USE_SYCL REGISTER(SimpleBinaryOp, SYCL, "ReciprocalGrad", functor::inverse_grad, float); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc index 3dd9de8d897..e259daaba47 100644 --- a/tensorflow/core/kernels/cwise_op_select.cc +++ b/tensorflow/core/kernels/cwise_op_select.cc @@ -30,7 +30,7 @@ typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template class SelectOp : public OpKernel { @@ -185,7 +185,7 @@ REGISTER_SELECT_SYCL(double); REGISTER_SELECT_SYCL(int32); REGISTER_SELECT_SYCL(int64); #undef REGISTER_SELECT_SYCL -#endif // TENSORFLOW_USE_SYCL +#endif 
// TENSORFLOW_USE_SYCL namespace functor { @@ -201,13 +201,11 @@ struct SelectFunctorBase { }; template -struct SelectFunctor - : SelectFunctorBase {}; +struct SelectFunctor : SelectFunctorBase {}; #ifdef TENSORFLOW_USE_SYCL template -struct SelectFunctor - : SelectFunctorBase {}; -#endif // TENSORFLOW_USE_SYCL +struct SelectFunctor : SelectFunctorBase {}; +#endif // TENSORFLOW_USE_SYCL template struct SelectScalarFunctorBase { @@ -222,12 +220,12 @@ struct SelectScalarFunctorBase { // CPU Specializations of Select functors with scalar template struct SelectScalarFunctor - : SelectScalarFunctorBase {}; + : SelectScalarFunctorBase {}; #ifdef TENSORFLOW_USE_SYCL template struct SelectScalarFunctor - : SelectScalarFunctorBase {}; -#endif // TENSORFLOW_USE_SYCL + : SelectScalarFunctorBase {}; +#endif // TENSORFLOW_USE_SYCL template struct BatchSelectFunctorBase { @@ -240,8 +238,8 @@ struct BatchSelectFunctorBase { const Eigen::DenseIndex all_but_batch = then_flat_outer_dims.dimension(1); #if !defined(EIGEN_HAS_INDEX_LIST) - Eigen::array broadcast_dims{{ 1, all_but_batch }}; - Eigen::Tensor::Dimensions reshape_dims{{ batch, 1 }}; + Eigen::array broadcast_dims{{1, all_but_batch}}; + Eigen::Tensor::Dimensions reshape_dims{{batch, 1}}; #else Eigen::IndexList, Eigen::DenseIndex> broadcast_dims; broadcast_dims.set(1, all_but_batch); @@ -257,13 +255,13 @@ struct BatchSelectFunctorBase { }; template -struct BatchSelectFunctor - : BatchSelectFunctorBase {}; +struct BatchSelectFunctor : BatchSelectFunctorBase { +}; #ifdef TENSORFLOW_USE_SYCL template struct BatchSelectFunctor - : BatchSelectFunctorBase {}; -#endif // TENSORFLOW_USE_SYCL + : BatchSelectFunctorBase {}; +#endif // TENSORFLOW_USE_SYCL } // namespace functor diff --git a/tensorflow/core/kernels/cwise_op_sigmoid.cc b/tensorflow/core/kernels/cwise_op_sigmoid.cc index a76a088ac8f..c132fdb63f2 100644 --- a/tensorflow/core/kernels/cwise_op_sigmoid.cc +++ b/tensorflow/core/kernels/cwise_op_sigmoid.cc @@ -25,7 +25,7 @@ REGISTER3(UnaryOp, GPU, "Sigmoid", functor::sigmoid, float, Eigen::half, #endif #ifdef TENSORFLOW_USE_SYCL REGISTER(UnaryOp, SYCL, "Sigmoid", functor::sigmoid, float); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL REGISTER5(SimpleBinaryOp, CPU, "SigmoidGrad", functor::sigmoid_grad, float, Eigen::half, double, complex64, complex128); @@ -35,6 +35,6 @@ REGISTER3(SimpleBinaryOp, GPU, "SigmoidGrad", functor::sigmoid_grad, float, #endif #ifdef TENSORFLOW_USE_SYCL REGISTER(SimpleBinaryOp, SYCL, "SigmoidGrad", functor::sigmoid_grad, float); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_sign.cc b/tensorflow/core/kernels/cwise_op_sign.cc index a4084d5ad17..02915ff4ce4 100644 --- a/tensorflow/core/kernels/cwise_op_sign.cc +++ b/tensorflow/core/kernels/cwise_op_sign.cc @@ -41,6 +41,6 @@ REGISTER_KERNEL_BUILDER(Name("Sign") .HostMemory("y") .TypeConstraint("T"), UnaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_sin.cc b/tensorflow/core/kernels/cwise_op_sin.cc index b91ff1ac30b..16c60578640 100644 --- a/tensorflow/core/kernels/cwise_op_sin.cc +++ b/tensorflow/core/kernels/cwise_op_sin.cc @@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Sin", functor::sin, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Sin", functor::sin, float, double); -#endif // TENSORFLOW_USE_SYC +#endif // TENSORFLOW_USE_SYC } // namespace tensorflow 
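// Relating to the BatchSelectFunctorBase code in cwise_op_select.cc above:
// the condition vector holds one bool per batch entry, so it is reshaped to
// [batch, 1] and broadcast across the remaining flattened dimensions before
// the element-wise select. With illustrative shapes only, batch = 2 and
// all_but_batch = 3, a condition of {true, false} selects the entire first
// row from the "then" tensor and the entire second row from the "else"
// tensor, both of shape [2, 3].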
diff --git a/tensorflow/core/kernels/cwise_op_sinh.cc b/tensorflow/core/kernels/cwise_op_sinh.cc index 055f0b12e14..26b7a940aa8 100644 --- a/tensorflow/core/kernels/cwise_op_sinh.cc +++ b/tensorflow/core/kernels/cwise_op_sinh.cc @@ -16,20 +16,18 @@ limitations under the License. #include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double, - complex64, complex128); +REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double, complex64, + complex128); #if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Sinh") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); +#define REGISTER_SYCL_KERNEL(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("Sinh").Device(DEVICE_SYCL).TypeConstraint("T"), \ + UnaryOp>); REGISTER_SYCL_KERNEL(float); REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYC +#endif // TENSORFLOW_USE_SYC #if GOOGLE_CUDA REGISTER2(UnaryOp, GPU, "Sinh", functor::sinh, float, double); diff --git a/tensorflow/core/kernels/cwise_op_sqrt.cc b/tensorflow/core/kernels/cwise_op_sqrt.cc index 00efbb00f15..497756133d0 100644 --- a/tensorflow/core/kernels/cwise_op_sqrt.cc +++ b/tensorflow/core/kernels/cwise_op_sqrt.cc @@ -25,7 +25,7 @@ REGISTER3(UnaryOp, GPU, "Sqrt", functor::sqrt, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Sqrt", functor::sqrt, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL REGISTER5(SimpleBinaryOp, CPU, "SqrtGrad", functor::sqrt_grad, float, Eigen::half, double, complex64, complex128); @@ -36,5 +36,5 @@ REGISTER3(SimpleBinaryOp, GPU, "SqrtGrad", functor::sqrt_grad, float, #ifdef TENSORFLOW_USE_SYCL REGISTER2(SimpleBinaryOp, SYCL, "SqrtGrad", functor::sqrt_grad, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_square.cc b/tensorflow/core/kernels/cwise_op_square.cc index 07a4b0b084d..7fc2f6bf08b 100644 --- a/tensorflow/core/kernels/cwise_op_square.cc +++ b/tensorflow/core/kernels/cwise_op_square.cc @@ -42,5 +42,5 @@ REGISTER_KERNEL_BUILDER(Name("Square") .HostMemory("y") .TypeConstraint("T"), UnaryOp>); -#endif // TENSORFLOW_USE_SYC +#endif // TENSORFLOW_USE_SYC } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_sub.cc b/tensorflow/core/kernels/cwise_op_sub.cc index 6adaecba04b..025041946ac 100644 --- a/tensorflow/core/kernels/cwise_op_sub.cc +++ b/tensorflow/core/kernels/cwise_op_sub.cc @@ -53,5 +53,5 @@ REGISTER_KERNEL_BUILDER(Name("Sub") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_tan.cc b/tensorflow/core/kernels/cwise_op_tan.cc index 7891b1183dd..c1a25767d31 100644 --- a/tensorflow/core/kernels/cwise_op_tan.cc +++ b/tensorflow/core/kernels/cwise_op_tan.cc @@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Tan", functor::tan, float, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Tan", functor::tan, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_tanh.cc b/tensorflow/core/kernels/cwise_op_tanh.cc index 8b3900892c3..c5005f5ea8a 100644 --- a/tensorflow/core/kernels/cwise_op_tanh.cc +++ b/tensorflow/core/kernels/cwise_op_tanh.cc @@ -26,7 +26,7 @@ REGISTER3(UnaryOp, GPU, "Tanh", 
functor::tanh, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Tanh", functor::tanh, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL REGISTER5(SimpleBinaryOp, CPU, "TanhGrad", functor::tanh_grad, float, Eigen::half, double, complex64, complex128); diff --git a/tensorflow/core/kernels/cwise_ops_common.cc b/tensorflow/core/kernels/cwise_ops_common.cc index e561e59cf5a..980edffceb3 100644 --- a/tensorflow/core/kernels/cwise_ops_common.cc +++ b/tensorflow/core/kernels/cwise_ops_common.cc @@ -57,9 +57,9 @@ BinaryOpShared::BinaryOpState::BinaryOpState(OpKernelContext* ctx) in1(ctx->input(1)), bcast(BCast::FromShape(in0.shape()), BCast::FromShape(in1.shape())) { if (!bcast.IsValid()) { - ctx->SetStatus(errors::InvalidArgument("Incompatible shapes: ", - in0.shape().DebugString(), " vs. ", - in1.shape().DebugString())); + ctx->SetStatus(errors::InvalidArgument( + "Incompatible shapes: ", in0.shape().DebugString(), " vs. ", + in1.shape().DebugString())); return; } const TensorShape output_shape = BCast::ToShape(bcast.output_shape()); diff --git a/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h b/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h index 43947707089..e81b840a509 100644 --- a/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h +++ b/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h @@ -50,16 +50,16 @@ struct SimpleBinaryFunctor { // Macros to explicitly instantiate kernels on GPU for multiple types // (T0, T1, etc.) for SimpleBiaryFunctor (e.g., functor::tanh_grad). -#define DEFINE_SIMPLE_BINARY1(F, T) \ +#define DEFINE_SIMPLE_BINARY1(F, T) \ template struct SimpleBinaryFunctor > -#define DEFINE_SIMPLE_BINARY2(F, T0, T1) \ - DEFINE_SIMPLE_BINARY1(F, T0); \ +#define DEFINE_SIMPLE_BINARY2(F, T0, T1) \ + DEFINE_SIMPLE_BINARY1(F, T0); \ DEFINE_SIMPLE_BINARY1(F, T1) -#define DEFINE_SIMPLE_BINARY3(F, T0, T1, T2) \ - DEFINE_SIMPLE_BINARY2(F, T0, T1); \ +#define DEFINE_SIMPLE_BINARY3(F, T0, T1, T2) \ + DEFINE_SIMPLE_BINARY2(F, T0, T1); \ DEFINE_SIMPLE_BINARY1(F, T2) -#define DEFINE_SIMPLE_BINARY4(F, T0, T1, T2, T3) \ - DEFINE_SIMPLE_BINARY2(F, T0, T1); \ +#define DEFINE_SIMPLE_BINARY4(F, T0, T1, T2, T3) \ + DEFINE_SIMPLE_BINARY2(F, T0, T1); \ DEFINE_SIMPLE_BINARY2(F, T2, T3) #define DEFINE_SIMPLE_BINARY5(F, T0, T1, T2, T3, T4) \ DEFINE_SIMPLE_BINARY2(F, T0, T1); \ diff --git a/tensorflow/core/kernels/cwise_ops_gradients.h b/tensorflow/core/kernels/cwise_ops_gradients.h index 77b330f5899..82cdae9a348 100644 --- a/tensorflow/core/kernels/cwise_ops_gradients.h +++ b/tensorflow/core/kernels/cwise_ops_gradients.h @@ -171,7 +171,6 @@ struct SimpleBinaryFunctor { } }; - #ifdef TENSORFLOW_USE_SYCL // Partial specialization of BinaryFunctor for SYCL devices typedef Eigen::SyclDevice SYCLDevice; @@ -184,7 +183,7 @@ struct SimpleBinaryFunctor { } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template struct tanh_grad : base> {}; diff --git a/tensorflow/core/kernels/cwise_ops_sycl_common.h b/tensorflow/core/kernels/cwise_ops_sycl_common.h index 3f6ff7303d6..3e107cee04c 100644 --- a/tensorflow/core/kernels/cwise_ops_sycl_common.h +++ b/tensorflow/core/kernels/cwise_ops_sycl_common.h @@ -51,7 +51,8 @@ struct BinaryFunctor { void operator()(const SYCLDevice& d, typename Functor::tout_type out, typename Functor::tin_type in0, typename Functor::tin_type in1, bool* error) { - To32Bit(out).device(d) = To32Bit(in0).binaryExpr(To32Bit(in1), typename Functor::func()); + To32Bit(out).device(d) = + 
To32Bit(in0).binaryExpr(To32Bit(in1), typename Functor::func()); } void Left(const SYCLDevice& d, typename Functor::tout_type out, @@ -61,7 +62,9 @@ struct BinaryFunctor { constexpr int NumDims = Functor::tin_type::NumDimensions; static_assert(NumDims == 1, "Unexpected size"); Eigen::Sizes<1> scalar_dim; - out.device(d) = scalar.reshape(scalar_dim).broadcast(in.dimensions()).binaryExpr(in, Binary()); + out.device(d) = scalar.reshape(scalar_dim) + .broadcast(in.dimensions()) + .binaryExpr(in, Binary()); } void Right(const SYCLDevice& d, typename Functor::tout_type out, @@ -71,7 +74,8 @@ struct BinaryFunctor { constexpr int NumDims = Functor::tin_type::NumDimensions; static_assert(NumDims == 1, "Unexpected size"); Eigen::Sizes<1> scalar_dim; - out.device(d) = in.binaryExpr(scalar.reshape(scalar_dim).broadcast(in.dimensions()), Binary()); + out.device(d) = in.binaryExpr( + scalar.reshape(scalar_dim).broadcast(in.dimensions()), Binary()); } void BCast(const SYCLDevice& d, diff --git a/tensorflow/core/kernels/cwise_ops_test.cc b/tensorflow/core/kernels/cwise_ops_test.cc index bca0f1004d5..39f497e7161 100644 --- a/tensorflow/core/kernels/cwise_ops_test.cc +++ b/tensorflow/core/kernels/cwise_ops_test.cc @@ -54,36 +54,36 @@ int ColsFromArg(int arg) { return (arg % kRows); } BM_UNARY(cpu, Floor, float, DT_FLOAT); #if GOOGLE_CUDA BM_UNARY(gpu, Floor, float, DT_FLOAT); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL BM_UNARY(sycl, Floor, float, DT_FLOAT); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL BM_UNARY(cpu, Floor, double, DT_DOUBLE); #if GOOGLE_CUDA BM_UNARY(gpu, Floor, double, DT_DOUBLE); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL BM_UNARY(sycl, Floor, double, DT_DOUBLE); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL BM_UNARY(cpu, Conj, std::complex, DT_COMPLEX64); #if GOOGLE_CUDA BM_UNARY(gpu, Conj, std::complex, DT_COMPLEX64); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA BM_UNARY(cpu, Conj, std::complex, DT_COMPLEX128); #if GOOGLE_CUDA BM_UNARY(gpu, Conj, std::complex, DT_COMPLEX128); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA BM_UNARY(cpu, Rint, double, DT_DOUBLE); #if GOOGLE_CUDA BM_UNARY(gpu, Rint, double, DT_DOUBLE); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA BM_UNARY(cpu, Rint, float, DT_FLOAT); #if GOOGLE_CUDA BM_UNARY(gpu, Rint, float, DT_FLOAT); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA // data func scalar. 
Graph* BinaryScalar(int num, const string& func) { @@ -113,18 +113,18 @@ Graph* BinaryScalar(int num, const string& func) { BM_BINARY_SCALAR(cpu, Less); #if GOOGLE_CUDA BM_BINARY_SCALAR(gpu, Less); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL BM_BINARY_SCALAR(sycl, Less); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL BM_BINARY_SCALAR(cpu, Add); #if GOOGLE_CUDA BM_BINARY_SCALAR(gpu, Add); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL BM_BINARY_SCALAR(sycl, Add); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #undef BM_BINARY_SCALAR template @@ -163,11 +163,11 @@ using Eigen::half; BM_BIAS_ADD_ALL(cpu, float, DT_FLOAT); #if GOOGLE_CUDA BM_BIAS_ADD_ALL(gpu, float, DT_FLOAT); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA BM_BIAS_ADD_ALL(cpu, half, DT_HALF); #if GOOGLE_CUDA BM_BIAS_ADD_ALL(gpu, half, DT_HALF); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #undef BM_BIAS_ADD_ALL #undef BM_BIAS_ADD @@ -217,15 +217,15 @@ using Eigen::half; #if GOOGLE_CUDA BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, float, DT_FLOAT); BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, half, DT_HALF); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, float, DT_FLOAT); #if GOOGLE_CUDA BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, float, DT_FLOAT); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, half, DT_HALF); #if GOOGLE_CUDA BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, half, DT_HALF); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #undef BM_BIAS_ADD_GRAD_ALL #undef BM_BIAS_ADD_GRAD @@ -265,10 +265,10 @@ Graph* BcastAdd(int rows, int cols, int dim) { BM_BCAST_ADD_ROW_ALL(cpu); #if GOOGLE_CUDA BM_BCAST_ADD_ROW_ALL(gpu); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL BM_BCAST_ADD_ROW_ALL(sycl); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #undef BM_BCAST_ADD_ROW_ALL #undef BM_BCAST_ADD_ROW @@ -291,10 +291,10 @@ BM_BCAST_ADD_ROW_ALL(sycl); BM_BCAST_ADD_COL_ALL(cpu); #if GOOGLE_CUDA BM_BCAST_ADD_COL_ALL(gpu); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL BM_BCAST_ADD_COL_ALL(sycl); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #undef BM_BCAST_ADD_COL_ALL #undef BM_BCAST_ADD_COL diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD index 500ee7b43f2..c4e21257ffc 100644 --- a/tensorflow/core/kernels/data/BUILD +++ b/tensorflow/core/kernels/data/BUILD @@ -49,6 +49,7 @@ cc_library( srcs = ["dataset.cc"], hdrs = ["dataset.h"], deps = [ + "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:graph", "//tensorflow/core:lib", @@ -81,9 +82,7 @@ cc_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core:proto_text", "//tensorflow/core:protos_all_cc", - "//tensorflow/core:session_options", "//tensorflow/core/kernels:variable_ops", ], ) @@ -122,6 +121,7 @@ tf_kernel_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core/kernels:batch_util", ], ) @@ -402,6 +402,19 @@ tf_kernel_library( ], ) +tf_kernel_library( + name = "tensor_queue_dataset_op", + srcs = ["tensor_queue_dataset_op.cc"], + deps = [ + ":dataset", + "//tensorflow/core:dataset_ops_op_lib", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core/kernels:batch_util", + ], +) + tf_kernel_library( name = "tensor_slice_dataset_op", srcs = 
["tensor_slice_dataset_op.cc"], @@ -540,6 +553,7 @@ tf_kernel_library( ":stats_dataset_ops", ":take_dataset_op", ":tensor_dataset_op", + ":tensor_queue_dataset_op", ":tensor_slice_dataset_op", ":unique_dataset_op", ":zip_dataset_op", diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc index 2d6e06398f6..7fa67efb9e2 100644 --- a/tensorflow/core/kernels/data/batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/batch_dataset_op.cc @@ -92,7 +92,6 @@ class BatchDatasetOp : public UnaryDatasetOpKernel { } private: - class Iterator : public DatasetIterator { public: explicit Iterator(const Params& params) @@ -145,7 +144,7 @@ class BatchDatasetOp : public UnaryDatasetOpKernel { const Tensor& first_element = batch_elements[0][component_index]; TensorShape batch_component_shape({num_batch_elements}); batch_component_shape.AppendShape(first_element.shape()); - Tensor batch_component(cpu_allocator(), first_element.dtype(), + Tensor batch_component(ctx->allocator({}), first_element.dtype(), batch_component_shape); // Build the output tuple component by copying one slice // from each input element in the batch. diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc index 1f6d32f8df3..f3e4f1cd3fd 100644 --- a/tensorflow/core/kernels/data/captured_function.cc +++ b/tensorflow/core/kernels/data/captured_function.cc @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/core/lib/random/random.h" #include "tensorflow/core/platform/notification.h" - namespace tensorflow { /* static */ @@ -185,8 +184,7 @@ Status CapturedFunction::MaybeInstantiate( return Status::OK(); } -Status CapturedFunction::Run(IteratorContext* ctx, - std::vector&& args, +Status CapturedFunction::Run(IteratorContext* ctx, std::vector&& args, std::vector* rets) { FunctionLibraryRuntime::Handle handle; TF_RETURN_IF_ERROR(MaybeInstantiate(ctx, &handle)); diff --git a/tensorflow/core/kernels/data/dataset.cc b/tensorflow/core/kernels/data/dataset.cc index 2ea68755676..d18cb160189 100644 --- a/tensorflow/core/kernels/data/dataset.cc +++ b/tensorflow/core/kernels/data/dataset.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include "tensorflow/core/kernels/data/dataset.h" +#include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/graph/graph_def_builder.h" #include "tensorflow/core/graph/node_builder.h" @@ -264,6 +265,10 @@ void BinaryDatasetOpKernel::MakeDataset(OpKernelContext* ctx, MakeDataset(ctx, input, another_input, output); } +Allocator* IteratorContext::allocator(AllocatorAttributes attrs) { + return params_.lib->device()->GetAllocator(attrs); +} + const char GraphDatasetBase::kDatasetGraphKey[] = "_DATASET_GRAPH"; const char GraphDatasetBase::kDatasetGraphOutputNodeKey[] = "_DATASET_GRAPH_OUTPUT_NODE"; diff --git a/tensorflow/core/kernels/data/dataset.h b/tensorflow/core/kernels/data/dataset.h index 2ef31ddfaaa..2c6fc8d5b4f 100644 --- a/tensorflow/core/kernels/data/dataset.h +++ b/tensorflow/core/kernels/data/dataset.h @@ -15,595 +15,6 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_KERNELS_DATA_DATASET_H_ #define TENSORFLOW_CORE_KERNELS_DATA_DATASET_H_ -#include - -#include "tensorflow/core/framework/attr_value.pb.h" -#include "tensorflow/core/framework/attr_value_util.h" #include "tensorflow/core/framework/dataset.h" -#include "tensorflow/core/framework/function.h" -#include "tensorflow/core/framework/graph.pb.h" -#include "tensorflow/core/framework/node_def.pb.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/framework/variant_encode_decode.h" -#include "tensorflow/core/framework/variant_tensor_data.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/platform/tracing.h" - -// Polymorphic datasets should support all primitive TensorFlow -// types. Use this macro to expand `m(T)` once for each primitive type -// `T`, e.g. to build a `switch` statement. -#define TF_CALL_DATASET_TYPES(m) TF_CALL_ALL_TYPES(m) TF_CALL_QUANTIZED_TYPES(m) - -namespace tensorflow { - -// Interface for reading values from a key-value store. -// Used for restoring iterator state. -class IteratorStateReader { - public: - virtual Status ReadScalar(StringPiece key, int64* val) = 0; - virtual Status ReadScalar(StringPiece key, string* val) = 0; - virtual Status ReadTensor(StringPiece key, Tensor* val) = 0; - virtual bool Contains(StringPiece key) = 0; - - virtual ~IteratorStateReader() {} -}; - -// Interface for writing values to a key-value store. -// Used for saving iterator state. -class IteratorStateWriter { - public: - virtual Status WriteScalar(StringPiece key, const int64 val) = 0; - virtual Status WriteScalar(StringPiece key, const string& val) = 0; - virtual Status WriteTensor(StringPiece key, const Tensor& val) = 0; - - virtual ~IteratorStateWriter() {} -}; - -// Forward declarations to avoid introducing a dependency on headers in -// "tensorflow/core/graph/...". -class GraphDefBuilder; -class GraphDatasetBase; -class Node; - -// Wrapper around GraphDefBuilder. Used to serialize Dataset graph. -class GraphDefBuilderWrapper { - public: - explicit GraphDefBuilderWrapper(GraphDefBuilder* b) : b_(b) {} - - // Adds a Const node with scalar value to the Graph. - // `*output` contains a pointer to the output `Node`. It is guaranteed to be - // non-null if the method returns with an OK status. - // The returned Node pointer is owned by the backing Graph of GraphDefBuilder. - template - Status AddScalar(const T& val, Node** output) { - Tensor val_t = Tensor(DataTypeToEnum::v(), TensorShape({})); - val_t.scalar()() = val; - AddTensorInternal(val_t, output); - if (*output == nullptr) { - return errors::Internal("AddScalar: Failed to build Const op."); - } - return Status::OK(); - } - - // Adds a Const node with vector value to the Graph. - // `*output` contains a pointer to the output `Node`. It is guaranteed to be - // non-null if the method returns with an OK status. - // The returned Node pointer is owned by the backing Graph of GraphDefBuilder. - // TODO(shivaniagrawal): Consider changing to gtl::ArraySlice? 
- template - Status AddVector(const std::vector& val, Node** output) { - Tensor val_t = Tensor(DataTypeToEnum::v(), - TensorShape({static_cast(val.size())})); - for (int i = 0; i < val.size(); i++) { - val_t.flat()(i) = val[i]; - } - AddTensorInternal(val_t, output); - if (*output == nullptr) { - return errors::Internal("AddVector: Failed to build Const op."); - } - return Status::OK(); - } - - // Adds a Const node with Tensor value to the Graph. - // `*output` contains a pointer to the output `Node`. It is guaranteed to be - // non-null if the method returns with an OK status. - // The returned Node pointer is owned by the backing Graph of GraphDefBuilder. - Status AddTensor(const Tensor& val, Node** output) { - AddTensorInternal(val, output); - if (*output == nullptr) { - return errors::Internal("AddTensor: Failed to build Const op."); - } - return Status::OK(); - } - - Status AddDataset(const GraphDatasetBase* dataset, - const std::vector& inputs, Node** output) { - return AddDataset(dataset, inputs, {}, output); - } - - // Adds a node corresponding to the `DatasetType` to the Graph. - // Return value of `DatasetType::op_name()` is used as the op type for the - // node. - // Values for the output_types and output_shapes node attributes are also - // written if those attributes are defined in the OpDef. - // `*output` contains a pointer to the output `Node`. It is guaranteed to be - // non-null if the method returns with an OK status. - // The returned Node pointer is owned by the backing Graph of GraphDefBuilder. - Status AddDataset(const GraphDatasetBase* dataset, - const std::vector& inputs, - const std::vector>& attrs, - Node** output) { - std::vector> enumerated_inputs(inputs.size()); - for (int i = 0; i < inputs.size(); i++) { - enumerated_inputs[i] = std::make_pair(i, inputs[i]); - } - return AddDataset(dataset, enumerated_inputs, {}, attrs, output); - } - - Status AddDataset( - const GraphDatasetBase* dataset, - const std::vector>& inputs, - const std::vector>>& list_inputs, - const std::vector>& attrs, - Node** output); - - // Adds a user-defined function with name `function_name` to the graph and - // recursively adds all functions it references. If a function with a matching - // name has already been added, returns with OK status. If a user-defined with - // name `function_name` is not found in the FunctionLibraryDefinition, returns - // an InvalidArgumentError. If the function with name `function_name` or any - // of its dependent functions are stateful, returns an InvalidArgument error. - Status AddFunction(OpKernelContext* ctx, const string& function_name); - - template - void BuildAttrValue(const T& value, AttrValue* attr) { - SetAttrValue(value, attr); - } - - private: - void AddTensorInternal(const Tensor& val, Node** output); - - Status EnsureFunctionIsStateless(OpKernelContext* ctx, - const string& function_name) const { - const FunctionLibraryDefinition* lib_def = - ctx->function_library()->GetFunctionLibraryDefinition(); - const FunctionDef* function_def = lib_def->Find(function_name); - if (!function_def) { - return errors::InvalidArgument("Unable to find FunctionDef for ", - function_name, " in registry."); - } - for (const NodeDef& node_def : function_def->node_def()) { - const OpDef* op_def; - TF_RETURN_IF_ERROR(lib_def->LookUpOpDef(node_def.op(), &op_def)); - // TODO(b/65524810): Hack to allow functions to capture Dataset op - // nodes needed for FlatMap. 
Currently, source datasets nodes have been - // marked stateful to avoid constant folding since we do not have a - // good way of serializing them. - if (IsOpWhitelisted(op_def)) { - continue; - } - if (op_def->is_stateful()) { - return errors::InvalidArgument( - "Op[name: ", node_def.name(), ", type: ", node_def.op(), "] ", - "in function ", function_name, " is stateful. ", - "Saving stateful functions is not supported yet."); - } - } - return Status::OK(); - } - - // Returns whether an op has been whitelisted for use inside map_fns. - // Uses a heuristic to whitelist source dataset ops which have been - // marked stateful due to b/65524810. - // Also looks up the `op_def->name` in the global - // `WhitelistedStatefulOpRegistry`. - bool IsOpWhitelisted(const OpDef* op_def) const { - return (StringPiece(op_def->name()).ends_with("Dataset") && - op_def->output_arg_size() == 1 && - op_def->output_arg(0).type() == DT_VARIANT) || - dataset::WhitelistedStatefulOpRegistry::Global()->Contains( - op_def->name()); - } - - bool HasAttr(const string& op_type_name, const string& attr_name) const; - - bool HasAttr(const OpDef* op_def, const string& attr_name) const { - for (auto attr : op_def->attr()) { - if (attr.name() == attr_name) { - return true; - } - } - return false; - } - - Status AddAttrFunctions(const AttrValue& attr_value, OpKernelContext* ctx) { - if (attr_value.has_func()) { - TF_RETURN_IF_ERROR(AddFunction(ctx, attr_value.func().name())); - } else if (attr_value.has_list()) { - for (const NameAttrList& name_attr_list : attr_value.list().func()) { - TF_RETURN_IF_ERROR(AddFunction(ctx, name_attr_list.name())); - } - } - return Status::OK(); - } - - GraphDefBuilder* b_; -}; - -class StatsAggregator; - -// A cut-down version of OpKernelContext for running computations in -// iterators. Note that we cannot simply use OpKernelContext here -// because we might run computation in an iterator whose lifetime is -// not nested within the lifetime of a single OpKernelContext -// (e.g. asynchronous prefetching). -// -// TODO(mrry): We will probably need to support more of -// OpKernelContext here. For example, should allocation be handled by -// the IteratorContext? -// TODO(mrry): We're making some daring assumptions about the lifetime -// of the runner passed in here. A runner will be deleted when the original -// step ends, but all existing runners only close over session-lifetime (or -// longer-lived) state, so we can make a copy of the function. There's nothing -// in the definition of the API from which we took the runner to guarantee that -// what we are doing is safe. We should formalize the properties here. -class IteratorContext { - public: - struct Params { - // Interface to operating system functionality. - Env* env; - - // Function call support. - std::function)> runner = nullptr; - - // A function that returns the current `StatsAggregator` instance to be - // used when recording statistics about the iterator. - // - // NOTE(mrry): This is somewhat awkward, because (i) the `StatsAggregator` - // is a property of the `IteratorResource` (which this class does not know - // about), and (ii) it can change after the `IteratorContext` has been - // created. Better suggestions are welcome! - std::function()> stats_aggregator_getter = - nullptr; - - // The FunctionLibraryRuntime object to be used to make function calls. 
- FunctionLibraryRuntime* lib = nullptr; - std::shared_ptr function_library = nullptr; - }; - - explicit IteratorContext(Params params) : params_(std::move(params)) {} - - Env* env() const { return params_.env; } - - std::function)>* runner() { - return ¶ms_.runner; - } - - std::shared_ptr stats_aggregator() { - if (params_.stats_aggregator_getter) { - return params_.stats_aggregator_getter(); - } else { - return nullptr; - } - } - - std::shared_ptr function_library() { - return params_.function_library; - } - - FunctionLibraryRuntime* lib() { return params_.lib; } - - void set_lib(FunctionLibraryRuntime* lib) { params_.lib = lib; } - - private: - Params params_; -}; - -// Represents the current position in a range of outputs, where the -// range of outputs is typically represented by an `DatasetBase`, -// defined below. -class IteratorBase { - public: - virtual ~IteratorBase() {} - - // Gets the next output from the range that this iterator is traversing. - // - // If at least one output remains in this iterator's range, that - // output will be stored in `*out_tensors` and `false` will be - // stored in `*end_of_sequence`. - // - // If no more outputs remain in this iterator's range, `true` will - // be stored in `*end_of_sequence`, and the content of - // `*out_tensors` will be undefined. - // - // This method is thread-safe. - // - // TODO(mrry): Define `GetNextAsync()` or `GetNextManyAsync()`, and - // potentially remove this method. - virtual Status GetNext(IteratorContext* ctx, std::vector* out_tensors, - bool* end_of_sequence) = 0; - - // Returns a vector of DataType values, representing the respective - // element types of each tuple component in the outputs of this - // iterator. - virtual const DataTypeVector& output_dtypes() const = 0; - - // Returns a vector of tensor shapes, representing the respective - // (and possibly partially defined) shapes of each tuple component - // in the outputs of this iterator. - virtual const std::vector& output_shapes() const = 0; - - // Saves the state of this iterator. - virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) { - return SaveInternal(writer); - } - - // Restores the state of this iterator. - virtual Status Restore(IteratorContext* ctx, IteratorStateReader* reader) { - return RestoreInternal(ctx, reader); - } - - protected: - // This is needed so that sub-classes of IteratorBase can call - // `SaveInternal` on their parent iterators, e.g., in - // `RepeatDataasetOp::Dataset`. - Status SaveParent(IteratorStateWriter* writer, - const std::unique_ptr& parent) { - return parent->SaveInternal(writer); - } - - // This is needed so that sub-classes of IteratorBase can call - // `RestoreInternal` on their parent iterators, e.g., in - // `RepeatDataasetOp::Dataset`. - Status RestoreParent(IteratorContext* ctx, IteratorStateReader* reader, - const std::unique_ptr& parent) { - return parent->RestoreInternal(ctx, reader); - } - - // Saves the state of this iterator recursively. - virtual Status SaveInternal(IteratorStateWriter* writer) { - return errors::Unimplemented("SaveInternal"); - } - - // Restores the state of this iterator recursively. - virtual Status RestoreInternal(IteratorContext* ctx, - IteratorStateReader* reader) { - return errors::Unimplemented("RestoreInternal"); - } -}; - -// Represents a (potentially infinite) range of outputs, where each -// output is a tuple of tensors. 
-class DatasetBase : public core::RefCounted { - public: - // Returns a new iterator for iterating over the range of elements in - // this dataset. - // - // This method may be called multiple times on the same instance, - // and the resulting iterators will have distinct state. Each - // iterator will traverse all elements in this dataset from the - // start. - // - // Ownership of the created iterator will be transferred to the caller. - // - // The prefix identifies the sequence of iterators leading up to the newly - // created iterator. - virtual std::unique_ptr MakeIterator( - const string& prefix) const = 0; - - // Returns a vector of DataType values, representing the respective - // element types of each tuple component in the outputs of this - // dataset. - virtual const DataTypeVector& output_dtypes() const = 0; - - // Returns a vector of tensor shapes, representing the respective - // (and possibly partially defined) shapes of each tuple component - // in the outputs of this dataset. - virtual const std::vector& output_shapes() const = 0; - - // A human-readable debug string for this dataset. - virtual string DebugString() = 0; - - // Serializes the dataset and writes it to the `writer`. - virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) const { - return errors::Unimplemented("DatasetBase::Save"); - } - - protected: - // TODO(srbs): Ideally all graph related logic should reside in - // GraphDatasetBase. However, that would require Datasets defined in all ops - // to derive from GraphDatasetBase. Once that is done we can move - // DatasetGraphDefBuilder and AsGraphDefInternal to GraphDatasetBase. - class DatasetGraphDefBuilder : public GraphDefBuilderWrapper { - public: - DatasetGraphDefBuilder(GraphDefBuilder* b) : GraphDefBuilderWrapper(b) {} - Status AddParentDataset(OpKernelContext* ctx, const DatasetBase* dataset, - Node** output) { - return dataset->AsGraphDefInternal(ctx, this, output); - } - }; - - virtual Status AsGraphDefInternal(OpKernelContext* ctx, - DatasetGraphDefBuilder* b, - Node** node) const { - return AsGraphDefInternal(b, node); - } - - virtual Status AsGraphDefInternal(DatasetGraphDefBuilder* b, - Node** node) const { - return errors::Unimplemented("AsGraphDefInternal"); - } -}; - -// Base-class for datasets that are built by ops. -class GraphDatasetBase : public DatasetBase { - public: - GraphDatasetBase(OpKernelContext* ctx) - : op_name_(ctx->op_kernel().type_string()) {} - - const string op_name() const { return op_name_; } - - Status Save(OpKernelContext* ctx, - IteratorStateWriter* writer) const override { - string serialized_graph_def; - string output_node; - TF_RETURN_IF_ERROR(Serialize(ctx, &serialized_graph_def, &output_node)); - TF_RETURN_IF_ERROR( - writer->WriteScalar(kDatasetGraphKey, serialized_graph_def)); - TF_RETURN_IF_ERROR( - writer->WriteScalar(kDatasetGraphOutputNodeKey, output_node)); - return Status::OK(); - } - - // Key for storing the Dataset graph in the serialized format. - static const char kDatasetGraphKey[]; - - // Key for storing the output node of the Dataset graph in the serialized - // format. - static const char kDatasetGraphOutputNodeKey[]; - - private: - Status Serialize(OpKernelContext* ctx, string* serialized_graph_def, - string* output_node) const; - - const string op_name_; -}; - -// Represents an iterator that is associated with a particular parent dataset. -template -class DatasetIterator : public IteratorBase { - public: - struct Params { - // Owns one reference on the shared dataset resource. 
- const DatasetType* dataset; - - // Identifies the sequence of iterators leading up to this iterator. - const string prefix; - }; - - explicit DatasetIterator(const Params& params) : params_(params) { - params_.dataset->Ref(); - } - - ~DatasetIterator() override { params_.dataset->Unref(); } - - // The dataset from which this iterator was created. - const DatasetType* dataset() const { return params_.dataset; } - - // The sequence of iterators leading up to this iterator. - const string prefix() const { return params_.prefix; } - - const DataTypeVector& output_dtypes() const override { - return params_.dataset->output_dtypes(); - } - - const std::vector& output_shapes() const override { - return params_.dataset->output_shapes(); - } - - Status GetNext(IteratorContext* ctx, std::vector* out_tensors, - bool* end_of_sequence) final { - port::Tracing::TraceMe activity(params_.prefix); - Status s = GetNextInternal(ctx, out_tensors, end_of_sequence); - if (TF_PREDICT_FALSE(errors::IsOutOfRange(s) && !*end_of_sequence)) { - s = errors::Internal( - "Iterator \"", params_.prefix, - "\" returned OutOfRange without setting `*end_of_sequence`. This " - "indicates that an error may have occurred. Original message: ", - s.error_message()); - LOG(ERROR) << s; - } - return s; - } - - Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) final { - TF_RETURN_IF_ERROR(dataset()->Save(ctx, writer)); - return IteratorBase::Save(ctx, writer); - } - - protected: - // Internal implementation of GetNext that is wrapped in tracing logic. - virtual Status GetNextInternal(IteratorContext* ctx, - std::vector* out_tensors, - bool* end_of_sequence) = 0; - - string full_name(const string& name) const { - return strings::StrCat(prefix(), ":", name); - } - - private: - Params params_; -}; - -// Encapsulates the work required to plug a DatasetBase into the core TensorFlow -// graph execution engine. -class DatasetOpKernel : public OpKernel { - public: - DatasetOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {} - void Compute(OpKernelContext* ctx) final; - - protected: - // Subclasses should implement this method. It will be called during Compute - // execution. - virtual void MakeDataset(OpKernelContext* ctx, DatasetBase** output) = 0; - - template - Status ParseScalarArgument(OpKernelContext* ctx, - const StringPiece& argument_name, T* output) { - const Tensor* argument_t; - TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t)); - if (!TensorShapeUtils::IsScalar(argument_t->shape())) { - return errors::InvalidArgument(argument_name, " must be a scalar"); - } - *output = argument_t->scalar()(); - return Status::OK(); - } -}; - -// Encapsulates the work required to plug unary Datasets into the core -// TensorFlow graph execution engine. -class UnaryDatasetOpKernel : public DatasetOpKernel { - public: - UnaryDatasetOpKernel(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {} - - protected: - void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final; - virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input, - DatasetBase** output) = 0; -}; - -// Encapsulates the work required to plug binary Datasets into the core -// TensorFlow graph execution engine. 
-class BinaryDatasetOpKernel : public DatasetOpKernel { - public: - BinaryDatasetOpKernel(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {} - - protected: - void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final; - virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input, - DatasetBase* another_input, - DatasetBase** output) = 0; -}; - -// Validates and extracts a `DatasetBase` object from `tensor`. -// -// `tensor` must have been written by a call to SetVariantTensorToDataset(). -// -// The retrieved pointer is a borrowed reference to the dataset, which is owned -// by the tensor. The consumer must either acquire its own reference to the -// dataset by calling `(*out_dataset)->Ref()`, or ensure that `tensor` is not -// destroyed or mutated while the retrieved pointer is in use. -Status GetDatasetFromVariantTensor(const Tensor& tensor, - DatasetBase** out_dataset); - -// Stores a `DatasetBase` object in `tensor`. -// -// The ownership of `dataset` is transferred to `tensor`. -Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor); - -} // namespace tensorflow #endif // TENSORFLOW_CORE_KERNELS_DATA_DATASET_H_ diff --git a/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc b/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc index e7224bb547f..132808a5f14 100644 --- a/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc @@ -155,7 +155,7 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel { // Determine the size of the output tensors: // * dense_shape will be [`row_shape + 1`]. - Tensor dense_shape(cpu_allocator(), DT_INT64, {row_ndims + 1}); + Tensor dense_shape(ctx->allocator({}), DT_INT64, {row_ndims + 1}); auto dense_shape_vec = dense_shape.vec(); for (size_t i = 0; i < row_ndims; ++i) { if (row_shape.dim_size(i) == -1) { @@ -215,10 +215,10 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel { // * indices will be [`total_elements`, `row_shape + 1`]. // * values will be [`total_elements`]. - Tensor indices(cpu_allocator(), DT_INT64, + Tensor indices(ctx->allocator({}), DT_INT64, {total_elements, row_ndims + 1}); Tensor values( - cpu_allocator(), + ctx->allocator({}), DatasetIterator>::dataset()->input_->output_dtypes()[0], {total_elements}); auto indices_matrix = indices.matrix(); diff --git a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc index eb047e10ecf..834c06bb930 100644 --- a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc +++ b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc @@ -23,7 +23,6 @@ limitations under the License. #include "tensorflow/core/lib/random/random.h" namespace tensorflow { - namespace { // See documentation in ../ops/dataset_ops.cc for a high-level @@ -510,10 +509,6 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { return Status::OK(); } - // A resource name for the temporary window dataset that is - // created as the input to the reduce function. 
- static constexpr const char* kWindowResourceName = "__window_dataset"; - const DatasetBase* const input_; const NameAttrList key_func_; const NameAttrList reduce_func_; @@ -537,5 +532,4 @@ REGISTER_KERNEL_BUILDER(Name("GroupByWindowDataset").Device(DEVICE_CPU), GroupByWindowDatasetOp); } // namespace - } // namespace tensorflow diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc index 56044a3d41a..dd5f4a4554b 100644 --- a/tensorflow/core/kernels/data/iterator_ops.cc +++ b/tensorflow/core/kernels/data/iterator_ops.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/graph_runner.h" +#include "tensorflow/core/common_runtime/renamed_device.h" #include "tensorflow/core/common_runtime/threadpool_device.h" #include "tensorflow/core/framework/iterator.pb.h" #include "tensorflow/core/framework/partial_tensor_shape.h" @@ -82,7 +83,7 @@ class IteratorResource : public ResourceBase { public: IteratorResource(const DataTypeVector& output_dtypes, const std::vector& output_shapes, - const int graph_def_version, + const int /*unused: graph_def_version*/, std::unique_ptr device_mgr, std::unique_ptr flib_def, std::unique_ptr pflr, @@ -93,8 +94,7 @@ class IteratorResource : public ResourceBase { lib_(lib), iterator_(nullptr), output_dtypes_(output_dtypes), - output_shapes_(output_shapes), - graph_def_version_(graph_def_version) {} + output_shapes_(output_shapes) {} Status GetNext(IteratorContext* ctx, std::vector* out_tensors, bool* end_of_sequence) { @@ -223,7 +223,6 @@ class IteratorResource : public ResourceBase { std::shared_ptr lib_def_ GUARDED_BY(mu_); const DataTypeVector output_dtypes_; const std::vector output_shapes_; - const int graph_def_version_; }; // Helper class for reading data from a VariantTensorData object. @@ -430,13 +429,10 @@ class IteratorStateVariant { REGISTER_UNARY_VARIANT_DECODE_FUNCTION(IteratorStateVariant, kIteratorVariantTypeName); -// TODO(mrry): Can we simply use the template kernel here? class IteratorHandleOp : public OpKernel { public: explicit IteratorHandleOp(OpKernelConstruction* ctx) : OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) { - OP_REQUIRES_OK(ctx, ctx->allocate_persistent(DT_STRING, TensorShape({2}), - &handle_, nullptr)); OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_)); @@ -460,56 +456,51 @@ class IteratorHandleOp : public OpKernel { } void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) { - mutex_lock l(mu_); - FunctionLibraryRuntime* lib = context->function_library(); - std::unique_ptr device_mgr(nullptr); - std::unique_ptr flib_def(nullptr); - std::unique_ptr pflr(nullptr); - // If the iterator is shared then we construct a new FLR, and pass that in. - // NOTE(mrry,rohanj): In this case it is not possible to call remote - // functions from the iterator. We may add this functionality if there - // is sufficient demand, but it will require a significant refactoring. 
- if (!name_.empty()) { - lib = CreateFLR(context, &device_mgr, &flib_def, &pflr); - } + { + mutex_lock l(mu_); + if (resource_ == nullptr) { + FunctionLibraryRuntime* lib = context->function_library(); + std::unique_ptr device_mgr(nullptr); + std::unique_ptr flib_def(nullptr); + std::unique_ptr pflr(nullptr); + // If the iterator is shared then we construct a new FLR, and pass that + // in. NOTE(mrry,rohanj): In this case it is not possible to call remote + // functions from the iterator. We may add this functionality if there + // is sufficient demand, but it will require a significant refactoring. + if (!name_.empty()) { + lib = CreatePrivateFLR(context, &device_mgr, &flib_def, &pflr); + } - if (resource_ == nullptr) { - ResourceMgr* mgr = context->resource_manager(); - OP_REQUIRES_OK(context, cinfo_.Init(mgr, def())); + ResourceMgr* mgr = context->resource_manager(); + OP_REQUIRES_OK(context, cinfo_.Init(mgr, def())); - IteratorResource* resource; - OP_REQUIRES_OK( - context, - mgr->LookupOrCreate( - cinfo_.container(), cinfo_.name(), &resource, - [lib, &device_mgr, &flib_def, &pflr, this](IteratorResource** ret) - EXCLUSIVE_LOCKS_REQUIRED(mu_) { - *ret = new IteratorResource( - output_dtypes_, output_shapes_, graph_def_version_, - std::move(device_mgr), std::move(flib_def), - std::move(pflr), lib); - return Status::OK(); - })); + IteratorResource* resource; + OP_REQUIRES_OK( + context, + mgr->LookupOrCreate( + cinfo_.container(), cinfo_.name(), &resource, + [lib, &device_mgr, &flib_def, &pflr, + this](IteratorResource** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + *ret = new IteratorResource( + output_dtypes_, output_shapes_, graph_def_version_, + std::move(device_mgr), std::move(flib_def), + std::move(pflr), lib); + return Status::OK(); + })); - Status s = VerifyResource(resource); - if (TF_PREDICT_FALSE(!s.ok())) { - resource->Unref(); - context->SetStatus(s); - return; + Status s = VerifyResource(resource); + if (TF_PREDICT_FALSE(!s.ok())) { + resource->Unref(); + context->SetStatus(s); + return; + } + + resource_ = resource; } - - auto h = handle_.AccessTensor(context)->template flat(); - h(0) = cinfo_.container(); - h(1) = cinfo_.name(); - resource_ = resource; - } - if (context->expected_output_dtype(0) == DT_RESOURCE) { - OP_REQUIRES_OK(context, MakeResourceHandleToOutput( - context, 0, cinfo_.container(), cinfo_.name(), - MakeTypeIndex())); - } else { - context->set_output_ref(0, &mu_, handle_.AccessTensor(context)); } + OP_REQUIRES_OK(context, MakeResourceHandleToOutput( + context, 0, cinfo_.container(), cinfo_.name(), + MakeTypeIndex())); } private: @@ -526,15 +517,32 @@ class IteratorHandleOp : public OpKernel { return Status::OK(); } - FunctionLibraryRuntime* CreateFLR( + template // use like this: down_cast(foo); + static inline To down_cast(From* f) { // so we only accept pointers + static_assert( + (std::is_base_of::type>::value), + "target type not derived from source type"); + + // We skip the assert and hence the dynamic_cast if RTTI is disabled. +#if !defined(__GNUC__) || defined(__GXX_RTTI) + // Uses RTTI in dbg and fastbuild. asserts are disabled in opt builds. 
+ assert(f == nullptr || dynamic_cast(f) != nullptr); +#endif // !defined(__GNUC__) || defined(__GXX_RTTI) + return static_cast(f); + } + + FunctionLibraryRuntime* CreatePrivateFLR( OpKernelContext* ctx, std::unique_ptr* device_mgr, std::unique_ptr* flib_def, std::unique_ptr* pflr) { - Device* device = new ThreadPoolDevice( - SessionOptions(), ctx->device()->attributes().name(), Bytes(256 << 20), - DeviceLocality(), cpu_allocator()); - - device_mgr->reset(new DeviceMgr({device})); + // Wrap the existing device in order to see any captured resources + // in its resource manager. The existing device will outlive the + // IteratorResource, because we are storing the IteratorResource + // in that device's resource manager. + Device* wrapped_device = RenamedDevice::NewRenamedDevice( + ctx->device()->name(), down_cast(ctx->device()), + false /* owns_underlying */, false /* isolate_session_state */); + device_mgr->reset(new DeviceMgr({wrapped_device})); flib_def->reset(new FunctionLibraryDefinition( *ctx->function_library()->GetFunctionLibraryDefinition())); pflr->reset(new ProcessFunctionLibraryRuntime( @@ -542,13 +550,12 @@ class IteratorHandleOp : public OpKernel { {} /* TODO(mrry): OptimizerOptions? */, nullptr /* TODO(mrry): ClusterFLR */)); - return (*pflr)->GetFLR(device->name()); + return (*pflr)->GetFLR(ctx->device()->name()); } mutex mu_; - ContainerInfo cinfo_ GUARDED_BY(mu_); + ContainerInfo cinfo_; // Written once under mu_ then constant afterwards. IteratorResource* resource_ GUARDED_BY(mu_) = nullptr; - PersistentTensor handle_ GUARDED_BY(mu_); DataTypeVector output_dtypes_; std::vector output_shapes_; const int graph_def_version_; diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc index c529f671f2b..9ce263732f6 100644 --- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc @@ -183,7 +183,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { TensorShape component_shape( batch_results_[current_batch_index_].output[i].shape()); component_shape.set_dim(0, num_elements); - Tensor component(cpu_allocator(), output[i].dtype(), + Tensor component(ctx->allocator({}), output[i].dtype(), component_shape); TF_RETURN_IF_ERROR( CopyPartialBatch(&component, output[i], num_elements)); @@ -244,7 +244,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { return Status::OK(); } - void EnsureOutputAllocated(BatchResult* batch_result, + void EnsureOutputAllocated(IteratorContext* ctx, + BatchResult* batch_result, const std::vector& return_values) { mutex_lock l(batch_result->mu); if (batch_result->output_allocated) { @@ -254,7 +255,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { for (size_t i = 0; i < num_components; ++i) { TensorShape component_shape({dataset()->batch_size_}); component_shape.AppendShape(return_values[i].shape()); - Tensor component(cpu_allocator(), return_values[i].dtype(), + Tensor component(ctx->allocator({}), return_values[i].dtype(), component_shape); batch_result->output.emplace_back(std::move(component)); } @@ -285,10 +286,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { dataset()->captured_func_->RunAsync( ctx, std::move(input_element), &result->return_values, [this, ctx, result, batch_result, offset](Status ret_status) { - delete ctx; result->status.Update(ret_status); if (ret_status.ok()) { - EnsureOutputAllocated(batch_result, + EnsureOutputAllocated(ctx, batch_result,
result->return_values); const size_t num_components = result->return_values.size(); @@ -318,6 +318,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { } } } + delete ctx; // NOTE(mrry): We clear the return values here to release // any memory associated with them and to paralellize the // destruction of the tensors (which can be surprisingly diff --git a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc index 346eca0bb2a..cfb4efda9a5 100644 --- a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/core/framework/partial_tensor_shape.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_util.h" +#include "tensorflow/core/kernels/batch_util.h" #include "tensorflow/core/kernels/data/dataset.h" namespace tensorflow { @@ -24,102 +25,6 @@ namespace { // See documentation in ../ops/dataset_ops.cc for a high-level // description of the following op. -// The following five functions are copied from padding_fifo_queue.cc. -// TODO(mrry): Reconcile these functions with the similar methods in the -// queue implementation. -Status ValidateElementToLargerSlice(const Tensor& element, Tensor* parent) { - DCHECK_NE(parent->dim_size(0), 0); - if (element.NumElements() > (parent->NumElements() / parent->dim_size(0))) { - TensorShape chip_shape = parent->shape(); - chip_shape.RemoveDim(0); - return errors::Internal( - "HandleElementToLargerSlice Cannot copy slice: number of entries in " - "element is greater than number of elements in parent slice. ", - "Shapes are: [element]: ", element.shape().DebugString(), - ", [parent slice]: ", chip_shape.DebugString()); - } - return Status::OK(); -} - -template -Status HandleElementToLargerSlice(const Tensor& element, Tensor* parent, - int index) { - TF_RETURN_IF_ERROR(ValidateElementToLargerSlice(element, parent)); - if (element.NumElements() == 0) { - return Status::OK(); - } - auto element_t = element.tensor(); - auto parent_t = parent->tensor(); - Eigen::DSizes slice_indices; - slice_indices[0] = index; - Eigen::DSizes slice_size; - slice_size[0] = 1; - for (size_t i = 1; i < slice_size.size(); ++i) { - slice_size[i] = element_t.dimension(i - 1); - } - parent_t.slice(slice_indices, slice_size) = element_t.reshape(slice_size); - return Status::OK(); -} - -template -Status HandleElementToLargerSliceWithRank(const Tensor& element, Tensor* parent, - int index) { -#define HANDLE_TYPE(T) \ - case DataTypeToEnum::value: { \ - return HandleElementToLargerSlice(element, parent, index); \ - } - - switch (element.dtype()) { - TF_CALL_DATASET_TYPES(HANDLE_TYPE); -#undef HANDLE_TYPE - default: - return errors::Unimplemented( - "HandleElementToLargerSliceWithRank Unhandled data type: ", - element.dtype()); - } -} - -Status CopyElementToLargerSlice(const Tensor& element, Tensor* parent, - int index) { - if (parent->dims() != element.dims() + 1) { - return errors::Internal( - "Mismatched ranks. 
Element's rank is: ", element.dims(), - " but element is meant to be a slice in output Tensor having rank: ", - parent->dims(), " (should be: ", element.dims() + 1, ")"); - } - -#define HANDLE_DIMS(NDIMS) \ - case NDIMS: { \ - TF_RETURN_IF_ERROR( \ - HandleElementToLargerSliceWithRank(element, parent, index)); \ - return Status::OK(); \ - } - - switch (element.dims()) { - HANDLE_DIMS(0); - HANDLE_DIMS(1); - HANDLE_DIMS(2); - HANDLE_DIMS(3); - HANDLE_DIMS(4); -#undef HANDLE_DIMS - default: - return errors::Unimplemented("CopyElementToLargerSlice Unhandled rank: ", - element.dims()); - } -} - -Status SetElementZero(Tensor* element, const Tensor& padding) { -#define HANDLE_TYPE(T) \ - if (element->dtype() == DataTypeToEnum::value) { \ - element->flat().setConstant(padding.scalar()()); \ - return Status::OK(); \ - } - TF_CALL_DATASET_TYPES(HANDLE_TYPE); -#undef HANDLE_TYPE - return errors::Unimplemented("SetElementZero Unhandled data type: ", - element->dtype()); -} - class PaddedBatchDatasetOp : public UnaryDatasetOpKernel { public: explicit PaddedBatchDatasetOp(OpKernelConstruction* ctx) @@ -376,20 +281,27 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel { // 2. Copy each batch element to the appropriate location in // the output component tensor. - Tensor batch_component(cpu_allocator(), + Tensor batch_component(ctx->allocator({}), output_dtypes()[component_index], batch_component_shape); - TF_RETURN_IF_ERROR(SetElementZero( + TF_RETURN_IF_ERROR(batch_util::SetElementZero( &batch_component, dataset()->padding_values_[component_index])); // Build the output tuple component by copying one slice // from each input element in the batch. + TensorShape component_shape({}); + for (int i = 1; i < batch_component_shape.dims(); ++i) { + component_shape.AddDim(batch_component_shape.dim_size(i)); + } for (int64 i = 0; i < num_batch_elements; ++i) { - TF_RETURN_IF_ERROR(ValidateElementToLargerSlice( - batch_elements[i][component_index], &batch_component)); - - TF_RETURN_IF_ERROR(CopyElementToLargerSlice( - batch_elements[i][component_index], &batch_component, i)); + // Take the fast path if possible. 
+ if (batch_elements[i][component_index].shape() == component_shape) { + TF_RETURN_IF_ERROR(batch_util::CopyElementToSlice( + batch_elements[i][component_index], &batch_component, i)); + } else { + TF_RETURN_IF_ERROR(batch_util::CopyElementToLargerSlice( + batch_elements[i][component_index], &batch_component, i)); + } } out_tensors->push_back(std::move(batch_component)); } diff --git a/tensorflow/core/kernels/data/random_dataset_op.cc b/tensorflow/core/kernels/data/random_dataset_op.cc index bc638864b01..210b9ad1b84 100644 --- a/tensorflow/core/kernels/data/random_dataset_op.cc +++ b/tensorflow/core/kernels/data/random_dataset_op.cc @@ -99,7 +99,7 @@ class RandomDatasetOp : public DatasetOpKernel { std::vector* out_tensors, bool* end_of_sequence) override { mutex_lock l(mu_); - Tensor value_tensor(cpu_allocator(), DT_INT64, {}); + Tensor value_tensor(ctx->allocator({}), DT_INT64, {}); value_tensor.scalar()() = Random(); out_tensors->emplace_back(std::move(value_tensor)); *end_of_sequence = false; diff --git a/tensorflow/core/kernels/data/range_dataset_op.cc b/tensorflow/core/kernels/data/range_dataset_op.cc index d0bc61acd99..b57518e678e 100644 --- a/tensorflow/core/kernels/data/range_dataset_op.cc +++ b/tensorflow/core/kernels/data/range_dataset_op.cc @@ -100,7 +100,7 @@ class RangeDatasetOp : public DatasetOpKernel { *end_of_sequence = true; return Status::OK(); } - Tensor value_tensor(cpu_allocator(), DT_INT64, {}); + Tensor value_tensor(ctx->allocator({}), DT_INT64, {}); value_tensor.scalar()() = next_; out_tensors->emplace_back(std::move(value_tensor)); *end_of_sequence = false; diff --git a/tensorflow/core/kernels/data/reader_dataset_ops.cc b/tensorflow/core/kernels/data/reader_dataset_ops.cc index aa39fffc2e3..34d7d9f914d 100644 --- a/tensorflow/core/kernels/data/reader_dataset_ops.cc +++ b/tensorflow/core/kernels/data/reader_dataset_ops.cc @@ -141,7 +141,7 @@ class TextLineDatasetOp : public DatasetOpKernel { if (s.ok()) { // Produce the line as output. - Tensor line_tensor(cpu_allocator(), DT_STRING, {}); + Tensor line_tensor(ctx->allocator({}), DT_STRING, {}); line_tensor.scalar()() = line_contents; out_tensors->emplace_back(std::move(line_tensor)); *end_of_sequence = false; @@ -384,7 +384,7 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel { TF_RETURN_IF_ERROR( input_buffer_->ReadNBytes(dataset()->record_bytes_, &record)); // Produce the record as output. - Tensor record_tensor(cpu_allocator(), DT_STRING, {}); + Tensor record_tensor(ctx->allocator({}), DT_STRING, {}); record_tensor.scalar()() = record; out_tensors->emplace_back(std::move(record_tensor)); *end_of_sequence = false; @@ -589,7 +589,7 @@ class TFRecordDatasetOp : public DatasetOpKernel { do { // We are currently processing a file, so try to read the next record. if (reader_) { - Tensor result_tensor(cpu_allocator(), DT_STRING, {}); + Tensor result_tensor(ctx->allocator({}), DT_STRING, {}); Status s = reader_->ReadRecord(&result_tensor.scalar()()); if (s.ok()) { out_tensors->emplace_back(std::move(result_tensor)); diff --git a/tensorflow/core/kernels/data/skip_dataset_op.cc b/tensorflow/core/kernels/data/skip_dataset_op.cc index 13c2501bbbd..d636c37afe2 100644 --- a/tensorflow/core/kernels/data/skip_dataset_op.cc +++ b/tensorflow/core/kernels/data/skip_dataset_op.cc @@ -128,8 +128,8 @@ class SkipDatasetOp : public UnaryDatasetOpKernel { while (i_ < dataset()->count_) { // Fetch and throw away Tensors. 
std::vector dummy_out_tensors; - TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &dummy_out_tensors, - end_of_sequence)); + TF_RETURN_IF_ERROR( + input_impl_->GetNext(ctx, &dummy_out_tensors, end_of_sequence)); if (*end_of_sequence) { // We reached the end before the count was reached. input_impl_.reset(); @@ -140,8 +140,8 @@ class SkipDatasetOp : public UnaryDatasetOpKernel { } // Return GetNext() on the underlying iterator. - TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, out_tensors, - end_of_sequence)); + TF_RETURN_IF_ERROR( + input_impl_->GetNext(ctx, out_tensors, end_of_sequence)); if (*end_of_sequence) { input_impl_.reset(); } @@ -184,8 +184,7 @@ class SkipDatasetOp : public UnaryDatasetOpKernel { }; }; -REGISTER_KERNEL_BUILDER(Name("SkipDataset").Device(DEVICE_CPU), - SkipDatasetOp); +REGISTER_KERNEL_BUILDER(Name("SkipDataset").Device(DEVICE_CPU), SkipDatasetOp); } // namespace diff --git a/tensorflow/core/kernels/data/sql/BUILD b/tensorflow/core/kernels/data/sql/BUILD index 0286825af3e..f4698bdaf7a 100644 --- a/tensorflow/core/kernels/data/sql/BUILD +++ b/tensorflow/core/kernels/data/sql/BUILD @@ -33,6 +33,7 @@ cc_library( deps = [ "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core/kernels/data:dataset", "//tensorflow/core/lib/db:sqlite", ], ) diff --git a/tensorflow/core/kernels/data/sql/query_connection.h b/tensorflow/core/kernels/data/sql/query_connection.h index f31017bd198..e9ffca202ff 100644 --- a/tensorflow/core/kernels/data/sql/query_connection.h +++ b/tensorflow/core/kernels/data/sql/query_connection.h @@ -19,6 +19,8 @@ limitations under the License. namespace tensorflow { +class IteratorContext; + namespace sql { // This interface allows a user to connect to a database, execute a query, and // iterate over the result set, putting the results into an output tensor. @@ -56,7 +58,7 @@ class QueryConnection { // If there are no more rows in the result set, then instead `true` will be // stored in `*end_of_sequence`, and the content of `*out_tensors` will be // undefined. - virtual Status GetNext(std::vector* out_tensors, + virtual Status GetNext(IteratorContext* ctx, std::vector* out_tensors, bool* end_of_sequence) = 0; }; diff --git a/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc b/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc index 029a0aab972..7cd07bd8eca 100644 --- a/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc +++ b/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/core/kernels/data/sql/sqlite_query_connection.h" #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/data/dataset.h" #include "tensorflow/core/lib/strings/stringprintf.h" namespace tensorflow { @@ -48,14 +49,16 @@ Status SqliteQueryConnection::Close() { return Status::OK(); } -Status SqliteQueryConnection::GetNext(std::vector* out_tensors, +Status SqliteQueryConnection::GetNext(IteratorContext* ctx, + std::vector* out_tensors, bool* end_of_sequence) { if (!stmt_) TF_RETURN_IF_ERROR(PrepareQuery()); TF_RETURN_IF_ERROR(stmt_.Step(end_of_sequence)); if (!*end_of_sequence) { for (int i = 0; i < column_count_; i++) { DataType dt = output_types_[i]; - Tensor tensor(cpu_allocator(), dt, {}); + // TODO(mrry): Pass in the `IteratorContext::allocator()`. 
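// Throughout this patch, per-element output tensors switch from the global
// cpu_allocator() to the allocator carried by the per-call context
// (ctx->allocator({})), which is why GetNext now receives an IteratorContext*.
// A minimal standalone sketch of that pattern with placeholder types (not the
// real TensorFlow classes); the caller owns the returned buffer:
#include <cstddef>
#include <cstdlib>

struct Allocator {                      // Placeholder allocator interface.
  virtual ~Allocator() = default;
  virtual void* Allocate(std::size_t bytes) = 0;
};

struct MallocAllocator : Allocator {
  void* Allocate(std::size_t bytes) override { return std::malloc(bytes); }
};

struct Context {                        // Stand-in for IteratorContext.
  Allocator* allocator = nullptr;
};

// GetNext-style call: allocation is routed through the caller-supplied
// context instead of a process-wide default.
void* ProduceNextBuffer(Context* ctx, std::size_t bytes) {
  return ctx->allocator->Allocate(bytes);
}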
+ Tensor tensor(ctx->allocator({}), dt, {}); FillTensorWithResultSetEntry(dt, i, &tensor); out_tensors->emplace_back(std::move(tensor)); } diff --git a/tensorflow/core/kernels/data/sql/sqlite_query_connection.h b/tensorflow/core/kernels/data/sql/sqlite_query_connection.h index 787c17d6c00..81b19530b7d 100644 --- a/tensorflow/core/kernels/data/sql/sqlite_query_connection.h +++ b/tensorflow/core/kernels/data/sql/sqlite_query_connection.h @@ -32,7 +32,7 @@ class SqliteQueryConnection : public QueryConnection { Status Open(const string& data_source_name, const string& query, const DataTypeVector& output_types) override; Status Close() override; - Status GetNext(std::vector* out_tensors, + Status GetNext(IteratorContext* ctx, std::vector* out_tensors, bool* end_of_sequence) override; private: diff --git a/tensorflow/core/kernels/data/sql_dataset_ops.cc b/tensorflow/core/kernels/data/sql_dataset_ops.cc index 72302190802..d50e9c9cf97 100644 --- a/tensorflow/core/kernels/data/sql_dataset_ops.cc +++ b/tensorflow/core/kernels/data/sql_dataset_ops.cc @@ -116,7 +116,7 @@ class SqlDatasetOp : public DatasetOpKernel { } } - Status GetNextInternal(IteratorContext* /*ctx*/, + Status GetNextInternal(IteratorContext* ctx, std::vector* out_tensors, bool* end_of_sequence) override { mutex_lock l(mu_); @@ -132,7 +132,7 @@ class SqlDatasetOp : public DatasetOpKernel { return s; } } - return query_connection_->GetNext(out_tensors, end_of_sequence); + return query_connection_->GetNext(ctx, out_tensors, end_of_sequence); } private: diff --git a/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc b/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc new file mode 100644 index 00000000000..ff412a4671b --- /dev/null +++ b/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc @@ -0,0 +1,646 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/core/framework/partial_tensor_shape.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/variant.h" +#include "tensorflow/core/framework/variant_encode_decode.h" +#include "tensorflow/core/kernels/batch_util.h" +#include "tensorflow/core/kernels/data/dataset.h" + +namespace tensorflow { + +namespace { + +bool IsGreaterEqualToOrCompatibleWith(const PartialTensorShape& a, + const PartialTensorShape& b) { + // Returns true if dims[a] >= dims[b], or are compatible. + if (a.unknown_rank()) return true; + if (a.dims() != b.dims()) return false; + for (int d = 0; d < a.dims(); ++d) { + if (a.dim_size(d) == -1 || b.dim_size(d) == -1) continue; + if (a.dim_size(d) < b.dim_size(d)) return false; + } + return true; +} + +DataTypeVector PrependQueueType(const DataTypeVector& dtypes) { + DataTypeVector out; + out.reserve(dtypes.size() + 1); + out.push_back(DT_VARIANT); // The queue component. 
+ for (const DataType& d : dtypes) out.push_back(d); + return out; +} + +std::vector PrependQueueShapeWithBatch( + const std::vector& shapes) { + std::vector out; + out.reserve(shapes.size() + 1); + out.emplace_back(PartialTensorShape({-1})); // The queue component. + for (PartialTensorShape s : shapes) { + s.InsertDim(0, -1); // Unknown batch size. + out.push_back(std::move(s)); + } + return out; +} + +class EnqueueInQueueDatasetOp; + +class PrependFromQueueAndPaddedBatchDataset : public GraphDatasetBase { + public: + PrependFromQueueAndPaddedBatchDataset( + OpKernelContext* ctx, const int64 batch_size, const DatasetBase* input, + const DataTypeVector& dtypes, + const std::vector& shapes, + std::vector padding_values) + : GraphDatasetBase(ctx), + batch_size_(batch_size), + input_(input), + dtypes_(dtypes), + shapes_(shapes), + padding_values_(std::move(padding_values)), + dtypes_with_queue_(PrependQueueType(dtypes)), + batched_shapes_with_queue_(PrependQueueShapeWithBatch(shapes)) { + input_->Ref(); + } + + ~PrependFromQueueAndPaddedBatchDataset() override { input_->Unref(); } + + std::unique_ptr MakeIterator( + const string& prefix) const override { + return std::unique_ptr(new Iterator( + {this, strings::StrCat(prefix, "::PrependFromQueueAndPaddedBatch")})); + } + + const DataTypeVector& output_dtypes() const override { + return dtypes_with_queue_; + } + const std::vector& output_shapes() const override { + return batched_shapes_with_queue_; + } + + string DebugString() override { + return "PrependFromQueueAndPaddedBatchDatasetOp::Dataset"; + } + + protected: + Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b, + Node** output) const override { + Node* input_graph = nullptr; + TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph)); + Node* batch_size = nullptr; + TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size)); + + std::vector padded_shapes; + padded_shapes.reserve(shapes_.size()); + for (int i = 0; i < shapes_.size(); i++) { + Node* node; + Tensor t(DT_INT64, TensorShape({shapes_[i].dims()})); + for (int j = 0; j < shapes_[i].dims(); j++) { + t.vec()(j) = shapes_[i].dim_size(j); + } + TF_RETURN_IF_ERROR(b->AddTensor(t, &node)); + padded_shapes.emplace_back(node); + } + + std::vector padding_values; + padding_values.reserve(padding_values_.size()); + for (const Tensor& t : padding_values_) { + Node* node; + TF_RETURN_IF_ERROR(b->AddTensor(t, &node)); + padding_values.emplace_back(node); + } + + AttrValue output_types; + b->BuildAttrValue(dtypes_, &output_types); + + AttrValue output_shapes; + b->BuildAttrValue(batched_shapes_with_queue_, &output_shapes); + + AttrValue N; + b->BuildAttrValue(shapes_.size(), &N); + + TF_RETURN_IF_ERROR(b->AddDataset(this, {{0, input_graph}, {1, batch_size}}, + {{2, padded_shapes}, {3, padding_values}}, + {{"Toutput_types", output_types}, + {"output_shapes", output_shapes}, + {"N", N}}, + output)); + + return Status::OK(); + } + + private: + friend class EnqueueInQueueDatasetOp; + + class Iterator + : public DatasetIterator { + public: + explicit Iterator(const Params& params) + : DatasetIterator(params), + queue_(new TensorQueue(/*input_impl*/ + params.dataset->input_->MakeIterator( + params.prefix), + params.dataset->dtypes_, + params.dataset->shapes_)) {} + + ~Iterator() override { queue_->Unref(); } + + Status GetNextInternal(IteratorContext* ctx, + std::vector* out_tensors, + bool* end_of_sequence) override { + std::vector> batch; + TF_RETURN_IF_ERROR(queue_->GetNext(ctx, dataset()->batch_size_, &batch, + 
end_of_sequence)); + const auto& dtypes = dataset()->dtypes_; + const auto& shapes = dataset()->shapes_; + const auto& input_shapes = dataset()->input_->output_shapes(); + const auto& padding_values = dataset()->padding_values_; + const int64 batch_size = batch.size(); + out_tensors->reserve(dtypes.size()); + + std::vector max_shapes; // Of non-queue components. + for (int i = 0; i < dtypes.size(); ++i) { + const PartialTensorShape& shape = shapes[i]; + TensorShape out_shape({batch_size}); + for (int r = 0; r < shape.dims(); ++r) { + if (shape.dim_size(r) >= 0) { + // padded_shape[r] is known. + out_shape.AddDim(shape.dim_size(r)); + } else { + // padded_shape[r] is unknown, find the maximum across + // the batch. + int64 dim = 0; + for (int b = 0; b < batch.size(); ++b) { + dim = std::max(dim, batch[b][i].dim_size(r)); + } + out_shape.AddDim(dim); + } + } + max_shapes.push_back(std::move(out_shape)); + } + + Tensor queues_t(cpu_allocator(), DT_VARIANT, TensorShape({batch_size})); + if (!batch.empty()) { + auto queues = queues_t.flat(); + Variant& queue_inserter = queues(0); + queue_inserter = TensorQueueInserter(); + queue_inserter.get()->set_queue(queue_); + for (int b = 1; b < batch.size(); ++b) { + // Copy the TensorQueueInserter. Each copy increments the + // Ref on the queue_. + queues(b) = queues(0); + } + } + out_tensors->push_back(std::move(queues_t)); + + for (int i = 0; i < max_shapes.size(); ++i) { + Tensor component(cpu_allocator(), dtypes[i], max_shapes[i]); + // Try hard to take the fast path. + if (shapes[i].IsFullyDefined() && + shapes[i].IsIdenticalTo(input_shapes[i])) { + // Take the fast path if we know all the shapes statically. + for (int64 b = 0; b < batch.size(); ++b) { + TF_RETURN_IF_ERROR( + batch_util::CopyElementToSlice(batch[b][i], &component, b)); + } + } else { + TF_RETURN_IF_ERROR( + batch_util::SetElementZero(&component, padding_values[i])); + for (int64 b = 0; b < batch.size(); ++b) { + if (batch[b][i].shape() == max_shapes[i]) { + TF_RETURN_IF_ERROR( + batch_util::CopyElementToSlice(batch[b][i], &component, b)); + } else { + TF_RETURN_IF_ERROR(batch_util::CopyElementToLargerSlice( + batch[b][i], &component, b)); + } + } + } + out_tensors->push_back(std::move(component)); + } + + // end_of_sequence was set before we populated out_tensors, so + // it's ok to return now. + return Status::OK(); + } + + protected: + // Work around bug in MSVC that disallows access to protected + // members of Iterator from within TensorQueue. + class TensorQueue; + friend class TensorQueue; + + class TensorQueue : public core::RefCounted { + public: + TensorQueue(std::unique_ptr input_impl, + const DataTypeVector& dtypes, + const std::vector& shapes) + : dtypes_(dtypes), + shapes_(shapes), + input_impl_(std::move(input_impl)) {} + + void MaybeWaitForNotificationLocked(mutex_lock* lock) + EXCLUSIVE_LOCKS_REQUIRED(mu_) { + // This essentially just releases the lock and immediately relocks. + cv_.wait_for(*lock, std::chrono::milliseconds(0)); + } + + void NotifyLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) { cv_.notify_all(); } + + Status GetNext(IteratorContext* ctx, const int64 batch_size, + std::vector>* batch, + bool* end_of_sequence) { + mutex_lock lock(mu_); + + *end_of_sequence = false; + + for (int64 b = 0; b < batch_size;) { + if (!entries_.empty()) { + batch->push_back(std::move(entries_.front())); + entries_.pop_front(); + ++b; + continue; + } else { + if (input_impl_) { + // There's still input coming in. 
+ std::vector tensors; + bool input_end; + TF_RETURN_IF_ERROR( + input_impl_->GetNext(ctx, &tensors, &input_end)); + if (!input_end) { + batch->push_back(std::move(tensors)); + ++b; + continue; + } else { + input_impl_.reset(); + } + } + if (!input_impl_) { + // There's no more input coming in. + if (RefCountIsOne()) { + // No TensorQueueInserters in the wild. + if (batch->empty()) { + *end_of_sequence = true; + } + break; + } else { + MaybeWaitForNotificationLocked(&lock); + // If there's data available, try to add entries again. + // Otherwise return a smaller batch and hope the next + // iterator request has a non-empty or unused queue_. + if (entries_.empty()) { + break; + } + } + } + } + } // for (int64 b = ... batch_size) + return Status::OK(); + } + + Status Insert(const std::vector& tensors) { + if (tensors.size() != dtypes_.size()) { + return errors::InvalidArgument( + "TensorQueue::Insert: mismatched number of tensors. Queue " + "expects ", + dtypes_.size(), " tensors but tried to insert ", tensors.size()); + } + for (int i = 0; i < tensors.size(); ++i) { + if (tensors[i].dtype() != dtypes_[i]) { + return errors::InvalidArgument( + "TensorQueue::Insert: mismatched dtypes at component ", i, + ". Attempted " + "to insert tensor of type ", + DataTypeString(tensors[i].dtype()), + " but queue expected type: ", DataTypeString(dtypes_[i])); + } + if (!shapes_[i].IsCompatibleWith(tensors[i].shape())) { + return errors::InvalidArgument( + "TensorQueue::Insert: mismatched shapes at component ", i, + ". Attempted " + "to insert tensor with shape ", + tensors[i].shape().DebugString(), + " but queue expected shape: ", shapes_[i].DebugString()); + } + } + mutex_lock lock(mu_); + entries_.push_back(tensors); + NotifyLocked(); + return Status::OK(); + } + + Status Save(Iterator* iter, IteratorStateWriter* writer) { + mutex_lock lock(mu_); + if (input_impl_) { + TF_RETURN_IF_ERROR(iter->SaveParent(writer, input_impl_)); + } else { + TF_RETURN_IF_ERROR( + writer->WriteScalar(iter->full_name("input_exhausted"), "")); + } + TF_RETURN_IF_ERROR(writer->WriteScalar(iter->full_name("entries_size"), + entries_.size())); + for (int64 b = 0; b < entries_.size(); ++b) { + for (int i = 0; i < dtypes_.size(); ++i) { + TF_RETURN_IF_ERROR( + writer->WriteTensor(strings::StrCat(iter->full_name("entries"), + "[", b, "][", i, "]"), + entries_[b][i])); + } + } + return Status::OK(); + } + + Status Restore(Iterator* iter, IteratorContext* ctx, + IteratorStateReader* reader) { + mutex_lock l(mu_); + if (reader->Contains(iter->full_name("input_exhausted"))) { + input_impl_.reset(); + } else { + input_impl_ = iter->dataset_input()->MakeIterator(iter->prefix()); + TF_RETURN_IF_ERROR(iter->RestoreParent(ctx, reader, input_impl_)); + } + entries_.clear(); + int64 entries_size = -1; + TF_RETURN_IF_ERROR( + reader->ReadScalar(iter->full_name("entries_size"), &entries_size)); + if (entries_size < 0) { + return errors::DataLoss( + "Expected entries_size key '", iter->full_name("entries_size"), + "' to have nonnegative value, but saw: ", entries_size); + } + for (int64 b = 0; b < entries_size; ++b) { + std::vector entry; + for (int i = 0; i < dtypes_.size(); ++i) { + Tensor value; + TF_RETURN_IF_ERROR( + reader->ReadTensor(strings::StrCat(iter->full_name("entries"), + "[", b, "][", i, "]"), + &value)); + entry.push_back(std::move(value)); + } + entries_.push_back(std::move(entry)); + } + return Status::OK(); + } + + mutex* mu() { return &mu_; } + + private: + DataTypeVector dtypes_; + std::vector shapes_; + + mutex mu_; + 
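// A simplified, standalone model of the batching loop in TensorQueue::GetNext
// above: explicitly enqueued entries are consumed first, then the wrapped
// input iterator, and a short (possibly empty) batch marks end of sequence.
// The real implementation additionally waits on cv_ while live inserters may
// still add entries. Placeholder types; Element is assumed default-constructible.
#include <deque>
#include <functional>
#include <vector>

template <typename Element>
std::vector<Element> NextBatch(
    std::deque<Element>* queued,
    const std::function<bool(Element*)>& next_from_input, int batch_size,
    bool* end_of_sequence) {
  std::vector<Element> batch;
  while (static_cast<int>(batch.size()) < batch_size) {
    if (!queued->empty()) {                        // Prefer enqueued entries.
      batch.push_back(std::move(queued->front()));
      queued->pop_front();
      continue;
    }
    Element e;
    if (next_from_input && next_from_input(&e)) {  // Fall back to the input.
      batch.push_back(std::move(e));
      continue;
    }
    break;  // Both sources exhausted: return whatever was collected.
  }
  *end_of_sequence = batch.empty();
  return batch;
}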
std::unique_ptr input_impl_ GUARDED_BY(mu_); + std::deque> entries_ GUARDED_BY(mu_); + condition_variable cv_ GUARDED_BY(mu_); + }; + + const DatasetBase* dataset_input() const { return dataset()->input_; } + + Status SaveInternal(IteratorStateWriter* writer) override { + return queue_->Save(this, writer); + } + + Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) override { + return queue_->Restore(this, ctx, reader); + } + + public: + class TensorQueueInserter { + public: + TensorQueueInserter() : queue_(nullptr) {} + + void set_queue(TensorQueue* queue) { + queue_ = queue; + queue_->Ref(); + } + + TensorQueueInserter(const TensorQueueInserter& rhs) { + queue_ = rhs.queue_; + queue_->Ref(); + }; + + TensorQueueInserter(TensorQueueInserter&& rhs) { + queue_ = rhs.queue_; + rhs.queue_ = nullptr; + } + + TensorQueueInserter& operator=(const TensorQueueInserter& rhs) = delete; + + string TypeName() const { return "tensorflow::TensorQueueInserter"; } + string DebugString() const { return TypeName(); } + + void Encode(VariantTensorData*) const {} + bool Decode(const VariantTensorData&) { return false; } + + ~TensorQueueInserter() { + if (queue_) { + mutex_lock lock(*queue_->mu()); + queue_->Unref(); + queue_->NotifyLocked(); + queue_ = nullptr; + } + } + + Status Insert(const std::vector& tensors) const { + CHECK(queue_); + return queue_->Insert(tensors); + } + + private: + mutable TensorQueue* queue_; + }; + + private: + TensorQueue* const queue_; + }; + + private: + const int64 batch_size_; + const DatasetBase* input_; + const DataTypeVector dtypes_; + const std::vector shapes_; + const std::vector padding_values_; + const DataTypeVector dtypes_with_queue_; + const std::vector batched_shapes_with_queue_; +}; + +class PrependFromQueueAndPaddedBatchDatasetOp : public UnaryDatasetOpKernel { + public: + explicit PrependFromQueueAndPaddedBatchDatasetOp(OpKernelConstruction* ctx) + : UnaryDatasetOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("Toutput_types", &output_types_)); + } + + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override { + int64 batch_size = 0; + OP_REQUIRES_OK(ctx, + ParseScalarArgument(ctx, "batch_size", &batch_size)); + OP_REQUIRES( + ctx, batch_size > 0, + errors::InvalidArgument("Batch size must be greater than zero.")); + + OpInputList padded_shape_tensors; + OP_REQUIRES_OK(ctx, + ctx->input_list("padded_shapes", &padded_shape_tensors)); + std::vector padded_shapes; + padded_shapes.reserve(padded_shape_tensors.size()); + OP_REQUIRES(ctx, + padded_shape_tensors.size() == input->output_shapes().size(), + errors::InvalidArgument("Number of padded shapes (", + padded_shape_tensors.size(), + ") must match the number of components " + "in the input dataset's elements (", + input->output_shapes().size(), ")")); + for (const Tensor& padded_shape_t : padded_shape_tensors) { + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(padded_shape_t.shape()), + errors::InvalidArgument("All padded shapes must be vectors")); + PartialTensorShape padded_shape; + OP_REQUIRES_OK(ctx, PartialTensorShape::MakePartialShape( + padded_shape_t.vec().data(), + padded_shape_t.NumElements(), &padded_shape)); + padded_shapes.push_back(std::move(padded_shape)); + } + + OP_REQUIRES( + ctx, input->output_dtypes() == output_types_, + errors::InvalidArgument("Input dataset and this dataset " + "have different output_types: ", + DataTypeVectorString(input->output_dtypes()), + " and ", DataTypeVectorString(output_types_))); + + for (int i = 0; i < 
input->output_shapes().size(); ++i) { + // Exclude the queue from the tensor_shapes calculation. + const PartialTensorShape& tensor_shape = padded_shapes[i]; + OP_REQUIRES( + ctx, + IsGreaterEqualToOrCompatibleWith(tensor_shape, + input->output_shapes()[i]), + errors::InvalidArgument("Incompatible input shapes at component ", i, + " between input dataset this dataset: ", + input->output_shapes()[i].DebugString(), + " vs. ", tensor_shape.DebugString())); + } + + OpInputList padding_values_list; + OP_REQUIRES_OK(ctx, + ctx->input_list("padding_values", &padding_values_list)); + std::vector padding_values; + OP_REQUIRES(ctx, + padding_values_list.size() == input->output_shapes().size(), + errors::InvalidArgument( + "Number of padding values (", padding_values_list.size(), + ") must match the number of components in the input " + "dataset's elements (", + input->output_shapes().size(), ")")); + for (int i = 0; i < padding_values_list.size(); ++i) { + const Tensor& padding_value_t = padding_values_list[i]; + OP_REQUIRES( + ctx, TensorShapeUtils::IsScalar(padding_value_t.shape()), + errors::InvalidArgument( + "All padding values must be scalars; but at component ", i, + " saw shape: ", padding_value_t.shape().DebugString())); + OP_REQUIRES(ctx, padding_value_t.dtype() == input->output_dtypes()[i], + errors::InvalidArgument( + "Mismatched type between padding value ", i, + " and input dataset's component ", i, ": ", + DataTypeString(padding_value_t.dtype()), " vs. ", + DataTypeString(input->output_dtypes()[i]))); + padding_values.push_back(padding_value_t); + } + + *output = new PrependFromQueueAndPaddedBatchDataset( + ctx, batch_size, input, output_types_, padded_shapes, + std::move(padding_values)); + } + + private: + DataTypeVector output_types_; +}; + +REGISTER_KERNEL_BUILDER( + Name("PrependFromQueueAndPaddedBatchDataset").Device(DEVICE_CPU), + PrependFromQueueAndPaddedBatchDatasetOp); + +class EnqueueInQueueDatasetOp : public OpKernel { + public: + explicit EnqueueInQueueDatasetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + void Compute(OpKernelContext* ctx) override { + using TensorQueueInserter = + PrependFromQueueAndPaddedBatchDataset::Iterator::TensorQueueInserter; + + // TODO(ebrevdo): accept list of sequence lengths to do proper + // sub-slicing of tensors for placement into the queue? + const Tensor& tensor_queue_t = ctx->input(0); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(tensor_queue_t.shape()), + errors::InvalidArgument("queue must be a vector, saw shape: ", + tensor_queue_t.shape().DebugString())); + std::vector inserters; + const int64 batch_size = tensor_queue_t.NumElements(); + inserters.reserve(batch_size); + const Variant* variants = tensor_queue_t.flat().data(); + for (int i = 0; i < batch_size; ++i) { + const auto* inserter = variants[i].get(); + OP_REQUIRES(ctx, inserter != nullptr, + errors::InvalidArgument( + "Could not access TensorQueueInserter from queue[", i, + "]. 
Received variant: ", variants[i].DebugString())); + inserters.push_back(inserter); + } + + OpInputList components; + OP_REQUIRES_OK(ctx, ctx->input_list("components", &components)); + for (int i = 0; i < components.size(); ++i) { + OP_REQUIRES( + ctx, + components[i].dims() > 0 && components[i].dim_size(0) == batch_size, + errors::InvalidArgument( + "Expected component ", i, " to have batched shape [", batch_size, + ",...], but saw shape: ", components[i].shape().DebugString())); + } + std::vector element_shapes; + for (int i = 0; i < components.size(); ++i) { + TensorShape element_shape = components[i].shape(); + element_shape.RemoveDim(0); + element_shapes.push_back(std::move(element_shape)); + } + for (int64 b = 0; b < batch_size; ++b) { + std::vector tensors; + tensors.reserve(components.size()); + for (int i = 0; i < components.size(); ++i) { + Tensor t(components[i].dtype(), element_shapes[i]); + OP_REQUIRES_OK(ctx, + batch_util::CopySliceToElement(components[i], &t, b)); + tensors.push_back(std::move(t)); + } + // TODO(ebrevdo): Acquire the lock once for all inserters with + // the same underlying queue? Add InsertLocked? + OP_REQUIRES_OK(ctx, inserters[b]->Insert(tensors)); + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("EnqueueInQueueDataset").Device(DEVICE_CPU), + EnqueueInQueueDatasetOp); + +} // namespace + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc index 18adae1ea32..d5be4c77807 100644 --- a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc +++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc @@ -117,7 +117,7 @@ class TensorSliceDatasetOp : public DatasetOpKernel { out_tensors->reserve(dataset()->tensors_.size()); for (int i = 0; i < dataset()->tensors_.size(); ++i) { const Tensor& t = dataset()->tensors_[i]; - Tensor t_slice(cpu_allocator(), t.dtype(), + Tensor t_slice(ctx->allocator({}), t.dtype(), TensorShape(dataset()->shapes_[i].dim_sizes())); TF_RETURN_IF_ERROR(batch_util::CopySliceToElement(t, &t_slice, i_)); out_tensors->emplace_back(std::move(t_slice)); diff --git a/tensorflow/core/kernels/debug_ops.cc b/tensorflow/core/kernels/debug_ops.cc index 965a60c7e05..1b94ea05440 100644 --- a/tensorflow/core/kernels/debug_ops.cc +++ b/tensorflow/core/kernels/debug_ops.cc @@ -46,7 +46,7 @@ REGISTER_KERNEL_BUILDER(Name("CopyHost") .HostMemory("input") .HostMemory("output"), CopyOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Register debug identity (non-ref and ref) ops. REGISTER_KERNEL_BUILDER(Name("DebugIdentity").Device(DEVICE_CPU), @@ -66,7 +66,7 @@ REGISTER_KERNEL_BUILDER(Name("DebugIdentity") .HostMemory("input") .HostMemory("output"), DebugIdentityOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Register debug NaN-counter (non-ref and ref) ops. #define REGISTER_DEBUG_NAN_COUNT(type) \ @@ -98,7 +98,7 @@ REGISTER_GPU_DEBUG_NAN_COUNT(double); DebugNanCountOp); REGISTER_GPU_DEBUG_NAN_COUNT(float); REGISTER_GPU_DEBUG_NAN_COUNT(double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Register debug numeric summary ops. #define REGISTER_DEBUG_NUMERIC_SUMMARY_COUNT(type) \ diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h index 381add3fb3b..53a23b13060 100644 --- a/tensorflow/core/kernels/debug_ops.h +++ b/tensorflow/core/kernels/debug_ops.h @@ -21,7 +21,7 @@ limitations under the License. 
#endif #ifdef TENSORFLOW_USE_SYCL #include "tensorflow/core/common_runtime/sycl/sycl_util.h" -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #include "tensorflow/core/debug/debug_io_utils.h" #include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/framework/op_kernel.h" @@ -91,7 +91,7 @@ class CopyOp : public OpKernel { Device* device = static_cast(context->device()); // Determine if the input tensor is not on CPU (e.g., on GPU). const bool off_host_input = device->device_type() == DEVICE_SYCL && - !context->input_alloc_attr(0).on_host(); + !context->input_alloc_attr(0).on_host(); if (off_host_input) { SYCLmemcpy(context->eigen_sycl_device(), src_tensor, copied_tensor); diff --git a/tensorflow/core/kernels/decode_csv_op.cc b/tensorflow/core/kernels/decode_csv_op.cc index c4555db453b..0c42f632521 100644 --- a/tensorflow/core/kernels/decode_csv_op.cc +++ b/tensorflow/core/kernels/decode_csv_op.cc @@ -91,9 +91,9 @@ class DecodeCSVOp : public OpKernel { } else { int32 value; OP_REQUIRES(ctx, strings::safe_strto32(fields[f], &value), - errors::InvalidArgument("Field ", f, " in record ", i, - " is not a valid int32: ", - fields[f])); + errors::InvalidArgument( + "Field ", f, " in record ", i, + " is not a valid int32: ", fields[f])); output[f]->flat()(i) = value; } break; @@ -111,9 +111,9 @@ class DecodeCSVOp : public OpKernel { } else { int64 value; OP_REQUIRES(ctx, strings::safe_strto64(fields[f], &value), - errors::InvalidArgument("Field ", f, " in record ", i, - " is not a valid int64: ", - fields[f])); + errors::InvalidArgument( + "Field ", f, " in record ", i, + " is not a valid int64: ", fields[f])); output[f]->flat()(i) = value; } break; @@ -130,9 +130,9 @@ class DecodeCSVOp : public OpKernel { } else { float value; OP_REQUIRES(ctx, strings::safe_strtof(fields[f].c_str(), &value), - errors::InvalidArgument("Field ", f, " in record ", i, - " is not a valid float: ", - fields[f])); + errors::InvalidArgument( + "Field ", f, " in record ", i, + " is not a valid float: ", fields[f])); output[f]->flat()(i) = value; } break; @@ -150,9 +150,9 @@ class DecodeCSVOp : public OpKernel { } else { double value; OP_REQUIRES(ctx, strings::safe_strtod(fields[f].c_str(), &value), - errors::InvalidArgument("Field ", f, " in record ", i, - " is not a valid double: ", - fields[f])); + errors::InvalidArgument( + "Field ", f, " in record ", i, + " is not a valid double: ", fields[f])); output[f]->flat()(i) = value; } break; @@ -208,9 +208,10 @@ class DecodeCSVOp : public OpKernel { if (!quoted) { while (static_cast(current_idx) < input.size() && input[current_idx] != delim_) { - OP_REQUIRES(ctx, (!use_quote_delim_ || input[current_idx] != '"') && - input[current_idx] != '\n' && - input[current_idx] != '\r', + OP_REQUIRES(ctx, + (!use_quote_delim_ || input[current_idx] != '"') && + input[current_idx] != '\n' && + input[current_idx] != '\r', errors::InvalidArgument( "Unquoted fields cannot have quotes/CRLFs inside")); field += input[current_idx]; @@ -238,10 +239,11 @@ class DecodeCSVOp : public OpKernel { } OP_REQUIRES( - ctx, (static_cast(current_idx) < input.size() && - input[current_idx] == '"' && - (static_cast(current_idx) == input.size() - 1 || - input[current_idx + 1] == delim_)), + ctx, + (static_cast(current_idx) < input.size() && + input[current_idx] == '"' && + (static_cast(current_idx) == input.size() - 1 || + input[current_idx + 1] == delim_)), errors::InvalidArgument("Quoted field has to end with quote " "followed by delim or end")); diff --git 
a/tensorflow/core/kernels/decode_image_op.cc b/tensorflow/core/kernels/decode_image_op.cc index 44dcbf834ce..912d04c1536 100644 --- a/tensorflow/core/kernels/decode_image_op.cc +++ b/tensorflow/core/kernels/decode_image_op.cc @@ -87,10 +87,11 @@ class DecodeImageOp : public OpKernel { channels_ = 3; } else { OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_)); - OP_REQUIRES(context, channels_ == 0 || channels_ == 1 || channels_ == 3 || - channels_ == 4, - errors::InvalidArgument( - "channels must be 0, 1, 3, or 4, got ", channels_)); + OP_REQUIRES( + context, + channels_ == 0 || channels_ == 1 || channels_ == 3 || channels_ == 4, + errors::InvalidArgument("channels must be 0, 1, 3, or 4, got ", + channels_)); } flags_.components = channels_; @@ -114,8 +115,9 @@ class DecodeImageOp : public OpKernel { if (format_ == kJpgFormat) { OP_REQUIRES_OK(context, context->GetAttr("ratio", &flags_.ratio)); - OP_REQUIRES(context, flags_.ratio == 1 || flags_.ratio == 2 || - flags_.ratio == 4 || flags_.ratio == 8, + OP_REQUIRES(context, + flags_.ratio == 1 || flags_.ratio == 2 || flags_.ratio == 4 || + flags_.ratio == 8, errors::InvalidArgument("ratio must be 1, 2, 4, or 8, got ", flags_.ratio)); OP_REQUIRES_OK(context, context->GetAttr("fancy_upscaling", @@ -130,8 +132,9 @@ class DecodeImageOp : public OpKernel { string dct_method; OP_REQUIRES_OK(context, context->GetAttr("dct_method", &dct_method)); OP_REQUIRES( - context, (dct_method.empty() || dct_method == "INTEGER_FAST" || - dct_method == "INTEGER_ACCURATE"), + context, + (dct_method.empty() || dct_method == "INTEGER_FAST" || + dct_method == "INTEGER_ACCURATE"), errors::InvalidArgument("dct_method must be one of " "{'', 'INTEGER_FAST', 'INTEGER_ACCURATE'}")); if (dct_method == "INTEGER_FAST") { @@ -157,9 +160,9 @@ class DecodeImageOp : public OpKernel { errors::InvalidArgument("Expected image (JPEG, PNG, or GIF), got ", FileFormatString(magic, input))); OP_REQUIRES(context, input.size() <= std::numeric_limits::max(), - errors::InvalidArgument(FileFormatString(magic, input), - " contents are too large for int: ", - input.size())); + errors::InvalidArgument( + FileFormatString(magic, input), + " contents are too large for int: ", input.size())); OP_REQUIRES(context, magic == kPngFormat || channel_bits_ == 8, errors::InvalidArgument(FileFormatString(magic, input), " does not support uint16 output")); @@ -212,9 +215,10 @@ class DecodeImageOp : public OpKernel { input.data(), input.size(), flags, nullptr /* nwarn */, [=, &output](int width, int height, int channels) -> uint8* { Status status(context->allocate_output( - 0, format_ == kGifFormat - ? TensorShape({1, height, width, channels}) - : TensorShape({height, width, channels}), + 0, + format_ == kGifFormat + ? 
TensorShape({1, height, width, channels}) + : TensorShape({height, width, channels}), &output)); if (!status.ok()) { VLOG(1) << status; diff --git a/tensorflow/core/kernels/deep_conv2d.cc b/tensorflow/core/kernels/deep_conv2d.cc index 8e9b8a7e2e7..829155fb313 100644 --- a/tensorflow/core/kernels/deep_conv2d.cc +++ b/tensorflow/core/kernels/deep_conv2d.cc @@ -120,9 +120,9 @@ bool CanUseDeepConv2D(int stride_rows, int stride_cols, int filter_rows, VLOG(2) << "CanUseDeepConv2D" << " deep_conv_cost: " << deep_conv_cost - << " direct_conv_cost: " << direct_conv_cost - << " deep_direct_ratio: " << (static_cast(deep_conv_cost) / - static_cast(direct_conv_cost)) + << " direct_conv_cost: " << direct_conv_cost << " deep_direct_ratio: " + << (static_cast(deep_conv_cost) / + static_cast(direct_conv_cost)) << " use_deep_conv: " << (deep_conv_cost < direct_conv_cost); return deep_conv_cost < direct_conv_cost; } diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc index 6d44a92fa3c..6497c8f3719 100644 --- a/tensorflow/core/kernels/dense_update_ops.cc +++ b/tensorflow/core/kernels/dense_update_ops.cc @@ -89,7 +89,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #define REGISTER_KERNELS(type) \ REGISTER_KERNEL_BUILDER( \ @@ -113,14 +113,14 @@ TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS); #endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNELS(type) \ -REGISTER_KERNEL_BUILDER( \ - Name("Assign").Device(DEVICE_SYCL).TypeConstraint("T"), \ - AssignOpT); +#define REGISTER_SYCL_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Assign").Device(DEVICE_SYCL).TypeConstraint("T"), \ + AssignOpT); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS); #undef REGISTER_SYCL_KERNELS -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #define REGISTER_KERNELS(type) \ REGISTER_KERNEL_BUILDER( \ @@ -146,7 +146,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); #endif // end GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNELS(type) \ +#define REGISTER_SYCL_KERNELS(type) \ REGISTER_KERNEL_BUILDER( \ Name("AssignAdd").Device(DEVICE_SYCL).TypeConstraint("T"), \ DenseUpdateOp); \ @@ -156,5 +156,5 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS); #undef REGISTER_SYCL_KERNELS -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc index 9347978d515..91a9587174b 100644 --- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc +++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc @@ -400,7 +400,7 @@ struct LaunchDepthwiseConvBackpropInputOp { // Computes one shard of depthwise conv2d backprop input. auto shard = [&ctx, &args, &out_backprop, &filter_data, &in_backprop]( - int64 start, int64 limit) { + int64 start, int64 limit) { static const int64 kPacketSize = (sizeof(Packet) / sizeof(T)); const int64 input_image_size = @@ -750,7 +750,7 @@ struct LaunchDepthwiseConvBackpropFilterOp { // Computes one shard of depthwise conv2d backprop filter. 
auto shard = [&ctx, &args, &out_backprop, &input, &output_buffer_data]( - int64 start, int64 limit) { + int64 start, int64 limit) { static const int64 kPacketSize = (sizeof(Packet) / sizeof(T)); const int64 filter_spatial_size = args.filter_rows * args.filter_cols; const int64 padded_out_depth_size = diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc index a5fd07fbe17..c060b2e14d2 100644 --- a/tensorflow/core/kernels/depthwise_conv_op.cc +++ b/tensorflow/core/kernels/depthwise_conv_op.cc @@ -308,10 +308,10 @@ class DepthwiseConv2dNativeOp : public BinaryOp { // in_depth for input and filter must match. const int64 in_depth = GetTensorDim(input, data_format_, 'C'); - OP_REQUIRES( - context, in_depth == filter.dim_size(2), - errors::InvalidArgument("input and filter must have the same depth: ", - in_depth, " vs ", filter.dim_size(2))); + OP_REQUIRES(context, in_depth == filter.dim_size(2), + errors::InvalidArgument( + "input and filter must have the same depth: ", in_depth, + " vs ", filter.dim_size(2))); // The last dimension for filter is depth multiplier. const int32 depth_multiplier = filter.dim_size(3); @@ -430,9 +430,10 @@ TF_CALL_double(REGISTER_CPU_KERNEL); #endif #if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER( - Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint("T"), - DepthwiseConv2dNativeOp); +REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative") + .Device(DEVICE_GPU) + .TypeConstraint("T"), + DepthwiseConv2dNativeOp); REGISTER_KERNEL_BUILDER( Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint("T"), diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc index 5493e335328..126b64f73df 100644 --- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc +++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc @@ -17,12 +17,12 @@ limitations under the License. #define EIGEN_USE_GPU #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "external/cub_archive/cub/util_ptx.cuh" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/kernels/depthwise_conv_op.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/cuda_kernel_helper.h" #include "tensorflow/core/util/tensor_format.h" -#include "external/cub_archive/cub/util_ptx.cuh" #if !defined(_MSC_VER) #define UNROLL _Pragma("unroll") @@ -1021,7 +1021,7 @@ __global__ void __launch_bounds__(640, 2) // Device function to compute sub-warp sum reduction for a power-of-two group of // neighboring threads. -template +template __device__ __forceinline__ T WarpSumReduce(T val) { // support only power-of-two widths. assert(__popc(kWidth) == 1); diff --git a/tensorflow/core/kernels/diag_op.cc b/tensorflow/core/kernels/diag_op.cc index 86fa7dce36a..d228153d4c7 100644 --- a/tensorflow/core/kernels/diag_op.cc +++ b/tensorflow/core/kernels/diag_op.cc @@ -29,8 +29,8 @@ limitations under the License. 
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_types.h" -#include "tensorflow/core/platform/types.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/work_sharder.h" namespace tensorflow { @@ -47,8 +47,9 @@ class DiagOp : public OpKernel { void Compute(OpKernelContext* context) override { const Tensor& diagonal = context->input(0); const int num_dims = diagonal.dims(); - OP_REQUIRES(context, 0 != num_dims, errors::InvalidArgument( - "Input must be at least rank 1, got 0")); + OP_REQUIRES( + context, 0 != num_dims, + errors::InvalidArgument("Input must be at least rank 1, got 0")); TensorShape out_shape; for (int i = 0; i < num_dims; ++i) { out_shape.AddDim(diagonal.dim_size(i)); @@ -60,10 +61,9 @@ class DiagOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output_tensor)); functor::DiagFunctor diagFunc; - Status s = diagFunc(context, - diagonal.NumElements(), - diagonal.flat().data(), - output_tensor->flat().data()); + Status s = + diagFunc(context, diagonal.NumElements(), diagonal.flat().data(), + output_tensor->flat().data()); OP_REQUIRES_OK(context, s); } }; @@ -82,12 +82,12 @@ class DiagPartOp : public OpKernel { errors::InvalidArgument("The rank of the tensor should be \ even and positive, got shape ", tensor.shape().DebugString())); - for (int i = 0; i < out_dims; i++){ - OP_REQUIRES(context, tensor.dim_size(i) == tensor.dim_size(i + out_dims), - errors::InvalidArgument( - "Invalid shape ", tensor.shape().DebugString(), - ": dimensions ", i, " and ", i + out_dims, " do not match.") - ); + for (int i = 0; i < out_dims; i++) { + OP_REQUIRES( + context, tensor.dim_size(i) == tensor.dim_size(i + out_dims), + errors::InvalidArgument("Invalid shape ", + tensor.shape().DebugString(), ": dimensions ", + i, " and ", i + out_dims, " do not match.")); } TensorShape out_shape; @@ -96,13 +96,10 @@ class DiagPartOp : public OpKernel { } Tensor* output = nullptr; - OP_REQUIRES_OK(context, - context->allocate_output(0, out_shape, &output)); + OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); functor::DiagPartFunctor diagPartFunc; - Status s = diagPartFunc(context, - out_shape.num_elements(), - tensor.flat().data(), - output->flat().data()); + Status s = diagPartFunc(context, out_shape.num_elements(), + tensor.flat().data(), output->flat().data()); OP_REQUIRES_OK(context, s); } }; @@ -129,9 +126,8 @@ class DiagPartOp : public OpKernel { namespace functor { template struct DiagFunctor { - EIGEN_ALWAYS_INLINE Status - operator() (OpKernelContext* context, const int64 size, - const T* in, T* out) { + EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context, + const int64 size, const T* in, T* out) { // This subprocess is responsible for writing values in index range // [start*size, limit*size) auto subDiag = [in, out, size](int64 start, int64 limit) { @@ -143,17 +139,16 @@ struct DiagFunctor { // Here, 5 is a empirical factor of cost_per_unit. 
auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); - Shard(worker_threads.num_threads, worker_threads.workers, size, - 5 * size, subDiag); + Shard(worker_threads.num_threads, worker_threads.workers, size, 5 * size, + subDiag); return Status::OK(); } }; template struct DiagPartFunctor { - EIGEN_ALWAYS_INLINE Status - operator() (OpKernelContext* context, const int64 size, - const T* in, T* out) { + EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context, + const int64 size, const T* in, T* out) { // This subprocess is responsible for extracting values in index range // [start, limit) auto subDiagPart = [in, out, size](int64 start, int64 limit) { @@ -164,14 +159,13 @@ struct DiagPartFunctor { // Here, 5 is a empirical factor of cost_per_unit. auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); - Shard(worker_threads.num_threads, worker_threads.workers, size, - 5, subDiagPart); + Shard(worker_threads.num_threads, worker_threads.workers, size, 5, + subDiagPart); return Status::OK(); } }; } // namespace functor - // Register the CPU kernels. #define REGISTER_DIAGOP(T) \ REGISTER_KERNEL_BUILDER( \ @@ -250,6 +244,4 @@ TF_CALL_complex128(REGISTER_DIAGPARTOP_GPU); #endif // GOOGLE_CUDA - } // namespace tensorflow - diff --git a/tensorflow/core/kernels/diag_op.h b/tensorflow/core/kernels/diag_op.h index c6ca6a20474..baf16ddb4b9 100644 --- a/tensorflow/core/kernels/diag_op.h +++ b/tensorflow/core/kernels/diag_op.h @@ -26,14 +26,14 @@ namespace functor { template struct DiagFunctor { - Status operator() (OpKernelContext* context, const int64 size, - const T* in, T* out); + Status operator()(OpKernelContext* context, const int64 size, const T* in, + T* out); }; template struct DiagPartFunctor { - Status operator() (OpKernelContext* context, const int64 size, - const T* in, T* out); + Status operator()(OpKernelContext* context, const int64 size, const T* in, + T* out); }; } // namespace functor diff --git a/tensorflow/core/kernels/diag_op_gpu.cu.cc b/tensorflow/core/kernels/diag_op_gpu.cu.cc index d3c529d784e..910f3093b23 100644 --- a/tensorflow/core/kernels/diag_op_gpu.cu.cc +++ b/tensorflow/core/kernels/diag_op_gpu.cu.cc @@ -19,8 +19,8 @@ limitations under the License. #include #include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/util/cuda_kernel_helper.h" #include "tensorflow/core/kernels/diag_op.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" namespace tensorflow { namespace functor { @@ -28,10 +28,8 @@ namespace functor { typedef Eigen::GpuDevice GPUDevice; template -__global__ void DiagCudaKernel(const int num_threads, - const int64 size, - const T* in, - T* out) { +__global__ void DiagCudaKernel(const int num_threads, const int64 size, + const T* in, T* out) { CUDA_1D_KERNEL_LOOP(index, num_threads) { // Fill the diagonal elements or set to zero in other place. if (index % (1 + size) == 0) { @@ -44,9 +42,8 @@ __global__ void DiagCudaKernel(const int num_threads, template struct DiagFunctor { - EIGEN_ALWAYS_INLINE Status - operator() (OpKernelContext* context, const int64 size, - const T* in, T* out) { + EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context, + const int64 size, const T* in, T* out) { // Empty tensor couldn't launch the kernel. if (size == 0) { return Status::OK(); @@ -56,25 +53,22 @@ struct DiagFunctor { // so this may overflow for `size*size` in extreme cases, // here is checking the multiplication overflow for integer. 
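// One conventional overflow-safe form of the size * size guard used just
// below, shown standalone: for a non-negative size, the square fits in int
// exactly when size <= INT_MAX / size.
#include <limits>

inline bool SquareFitsInInt(long long size) {
  return size == 0 || size <= std::numeric_limits<int>::max() / size;
}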
if (size && (int(size * size) / size) != size) { - return errors::Internal( - "DiagOp got input size too large."); + return errors::Internal("DiagOp got input size too large."); } int virtual_thread_count = int(size * size); // Launch the GPU kernel. const GPUDevice& device = context->eigen_device(); - CudaLaunchConfig diag_config = GetCudaLaunchConfig( - virtual_thread_count, device); - DiagCudaKernel<<>>( - diag_config.virtual_thread_count, size, in, out); + CudaLaunchConfig diag_config = + GetCudaLaunchConfig(virtual_thread_count, device); + DiagCudaKernel<<>>(diag_config.virtual_thread_count, size, + in, out); auto err = cudaGetLastError(); if (err != cudaSuccess) { return errors::Internal( - "Could not launch DiagOp kernel: ", - cudaGetErrorString(err), "."); + "Could not launch DiagOp kernel: ", cudaGetErrorString(err), "."); } return Status::OK(); } @@ -87,12 +81,9 @@ template struct DiagFunctor; template struct DiagFunctor; template struct DiagFunctor; - template -__global__ void DiagPartCudaKernel(const int num_threads, - const int64 size, - const T* in, - T* out) { +__global__ void DiagPartCudaKernel(const int num_threads, const int64 size, + const T* in, T* out) { CUDA_1D_KERNEL_LOOP(index, num_threads) { out[index] = in[(1 + size) * index]; } @@ -100,9 +91,8 @@ __global__ void DiagPartCudaKernel(const int num_threads, template struct DiagPartFunctor { - EIGEN_ALWAYS_INLINE Status - operator() (OpKernelContext* context, const int64 size, - const T* in, T* out) { + EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context, + const int64 size, const T* in, T* out) { // Empty tensor couldn't launch the kernel. if (size == 0) { return Status::OK(); @@ -111,16 +101,14 @@ struct DiagPartFunctor { // Extract the diagonal elements. CudaLaunchConfig diag_config = GetCudaLaunchConfig(size, device); - DiagPartCudaKernel<<>>( - diag_config.virtual_thread_count, size, in, out); + DiagPartCudaKernel<<>>(diag_config.virtual_thread_count, + size, in, out); auto err = cudaGetLastError(); if (err != cudaSuccess) { return errors::Internal( - "Could not launch DiagPartOp kernel: ", - cudaGetErrorString(err), "."); + "Could not launch DiagPartOp kernel: ", cudaGetErrorString(err), "."); } return Status::OK(); } diff --git a/tensorflow/core/kernels/diag_op_test.cc b/tensorflow/core/kernels/diag_op_test.cc index 2d1417854cc..a708e53dd01 100644 --- a/tensorflow/core/kernels/diag_op_test.cc +++ b/tensorflow/core/kernels/diag_op_test.cc @@ -30,8 +30,8 @@ static Graph* Diag(int n, DataType type) { return g; } -#define BM_DiagDev(N, T, TFTYPE, DEVICE) \ - static void BM_Diag##_##N##_##TFTYPE##_##DEVICE(int iters) { \ +#define BM_DiagDev(N, T, TFTYPE, DEVICE) \ + static void BM_Diag##_##N##_##TFTYPE##_##DEVICE(int iters) { \ testing::UseRealTime(); \ testing::ItemsProcessed(static_cast(iters) * N * N); \ test::Benchmark(#DEVICE, Diag(N, TFTYPE)).Run(iters); \ @@ -51,4 +51,3 @@ BM_Diag(128); BM_Diag(512); } // end namespace tensorflow - diff --git a/tensorflow/core/kernels/dilation_ops.cc b/tensorflow/core/kernels/dilation_ops.cc index 6f5c0e91569..441a63465c8 100644 --- a/tensorflow/core/kernels/dilation_ops.cc +++ b/tensorflow/core/kernels/dilation_ops.cc @@ -91,10 +91,10 @@ void ParseSizes(OpKernelContext* context, const std::vector& strides, filter.shape().DebugString())); const int filter_rows = filter.dim_size(0); const int filter_cols = filter.dim_size(1); - OP_REQUIRES( - context, depth == filter.dim_size(2), - errors::InvalidArgument("input and filter must have the same depth: ", - depth, " vs 
", filter.dim_size(2))); + OP_REQUIRES(context, depth == filter.dim_size(2), + errors::InvalidArgument( + "input and filter must have the same depth: ", depth, " vs ", + filter.dim_size(2))); // Effective filter size, after introducing rate - 1 zeros between each // non-zero filter element. @@ -234,10 +234,11 @@ class DilationBackpropInputOp : public OpKernel { // [ batch, out_rows, out_cols, depth ] const int batch = input.dim_size(0); const int depth = input.dim_size(3); - OP_REQUIRES(context, batch == out_backprop.dim_size(0) && - out_rows == out_backprop.dim_size(1) && - out_cols == out_backprop.dim_size(2) && - depth == out_backprop.dim_size(3), + OP_REQUIRES(context, + batch == out_backprop.dim_size(0) && + out_rows == out_backprop.dim_size(1) && + out_cols == out_backprop.dim_size(2) && + depth == out_backprop.dim_size(3), errors::InvalidArgument("out_backprop has incompatible size.")); // The computed in_backprop has the same dimensions as the input: @@ -353,10 +354,11 @@ class DilationBackpropFilterOp : public OpKernel { // [ batch, out_rows, out_cols, depth ] const int batch = input.dim_size(0); const int depth = input.dim_size(3); - OP_REQUIRES(context, batch == out_backprop.dim_size(0) && - out_rows == out_backprop.dim_size(1) && - out_cols == out_backprop.dim_size(2) && - depth == out_backprop.dim_size(3), + OP_REQUIRES(context, + batch == out_backprop.dim_size(0) && + out_rows == out_backprop.dim_size(1) && + out_cols == out_backprop.dim_size(2) && + depth == out_backprop.dim_size(3), errors::InvalidArgument("out_backprop has incompatible size.")); // The computed filter_backprop has the same dimensions as the filter: diff --git a/tensorflow/core/kernels/dilation_ops_gpu.cu.cc b/tensorflow/core/kernels/dilation_ops_gpu.cu.cc index ac0775fbefe..c63806a7f68 100644 --- a/tensorflow/core/kernels/dilation_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/dilation_ops_gpu.cu.cc @@ -61,9 +61,8 @@ __global__ void DilationKernel(const int32 nthreads, const T* input_ptr, const int w_in = w_beg + w * rate_cols; if (w_in >= 0 && w_in < input_cols) { const T val = - input_ptr[d + - depth * - (w_in + input_cols * (h_in + input_rows * b))] + + input_ptr[d + depth * (w_in + + input_cols * (h_in + input_rows * b))] + filter_ptr[d + depth * (w + filter_cols * h)]; if (val > cur_val) { cur_val = val; @@ -106,9 +105,8 @@ __global__ void DilationBackpropInputKernel( const int w_in = w_beg + w * rate_cols; if (w_in >= 0 && w_in < input_cols) { const T val = - input_ptr[d + - depth * - (w_in + input_cols * (h_in + input_rows * b))] + + input_ptr[d + depth * (w_in + + input_cols * (h_in + input_rows * b))] + filter_ptr[d + depth * (w + filter_cols * h)]; if (val > cur_val) { cur_val = val; @@ -156,9 +154,8 @@ __global__ void DilationBackpropFilterKernel( const int w_in = w_beg + w * rate_cols; if (w_in >= 0 && w_in < input_cols) { const T val = - input_ptr[d + - depth * - (w_in + input_cols * (h_in + input_rows * b))] + + input_ptr[d + depth * (w_in + + input_cols * (h_in + input_rows * b))] + filter_ptr[d + depth * (w + filter_cols * h)]; if (val > cur_val) { cur_val = val; diff --git a/tensorflow/core/kernels/draw_bounding_box_op.cc b/tensorflow/core/kernels/draw_bounding_box_op.cc index a8818b7385d..b5d5b880bbb 100644 --- a/tensorflow/core/kernels/draw_bounding_box_op.cc +++ b/tensorflow/core/kernels/draw_bounding_box_op.cc @@ -29,8 +29,7 @@ template class DrawBoundingBoxesOp : public OpKernel { public: explicit DrawBoundingBoxesOp(OpKernelConstruction* context) - : OpKernel(context) { - } + : 
OpKernel(context) {} void Compute(OpKernelContext* context) override { const Tensor& images = context->input(0); @@ -94,35 +93,28 @@ class DrawBoundingBoxesOp : public OpKernel { int64 color_index = bb % color_table_length; const int64 min_box_row = static_cast(tboxes(b, bb, 0)) * (height - 1); - const int64 min_box_row_clamp = - std::max(min_box_row, 0); + const int64 min_box_row_clamp = std::max(min_box_row, 0); const int64 max_box_row = static_cast(tboxes(b, bb, 2)) * (height - 1); const int64 max_box_row_clamp = std::min(max_box_row, height - 1); const int64 min_box_col = static_cast(tboxes(b, bb, 1)) * (width - 1); - const int64 min_box_col_clamp = - std::max(min_box_col, 0); + const int64 min_box_col_clamp = std::max(min_box_col, 0); const int64 max_box_col = static_cast(tboxes(b, bb, 3)) * (width - 1); - const int64 max_box_col_clamp = - std::min(max_box_col, width - 1); + const int64 max_box_col_clamp = std::min(max_box_col, width - 1); if (min_box_row > max_box_row || min_box_col > max_box_col) { - LOG(WARNING) << "Bounding box (" << min_box_row - << "," << min_box_col - << "," << max_box_row - << "," << max_box_col + LOG(WARNING) << "Bounding box (" << min_box_row << "," << min_box_col + << "," << max_box_row << "," << max_box_col << ") is inverted and will not be drawn."; continue; } - if (min_box_row >= height || max_box_row < 0 || - min_box_col >= width || max_box_col < 0) { - LOG(WARNING) << "Bounding box (" << min_box_row - << "," << min_box_col - << "," << max_box_row - << "," << max_box_col + if (min_box_row >= height || max_box_row < 0 || min_box_col >= width || + max_box_col < 0) { + LOG(WARNING) << "Bounding box (" << min_box_row << "," << min_box_col + << "," << max_box_row << "," << max_box_col << ") is completely outside the image" << " and will not be drawn."; continue; diff --git a/tensorflow/core/kernels/dynamic_partition_op.cc b/tensorflow/core/kernels/dynamic_partition_op.cc index 861e16b2fd0..3c988db5e61 100644 --- a/tensorflow/core/kernels/dynamic_partition_op.cc +++ b/tensorflow/core/kernels/dynamic_partition_op.cc @@ -103,7 +103,8 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared { // Walk through data and copy the data to the appropriate output tensor const auto data_flat = data->flat(); std::vector, - Eigen::Aligned> > out_vec; + Eigen::Aligned> > + out_vec; out_vec.reserve(num_partitions_); for (int p = 0; p < num_partitions_; p++) { out_vec.push_back(outputs[p]->vec()); @@ -124,7 +125,8 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared { } else { // If data has extra dimensions, use Eigen slices std::vector, - Eigen::Aligned> > out_flat; + Eigen::Aligned> > + out_flat; out_flat.reserve(num_partitions_); for (int p = 0; p < num_partitions_; p++) { out_flat.push_back(outputs[p]->flat_outer_dims()); diff --git a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc index 9bb58b13f38..9dfeccff0e8 100644 --- a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc +++ b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc @@ -79,9 +79,9 @@ template void RangeInit(const GPUDevice& d, const T start, const T delta, const int32 size, typename TTypes::Flat out) { CudaLaunchConfig config = GetCudaLaunchConfig(size, d); - RangeInitKernel< - T><<>>( - start, delta, size, out.data()); + RangeInitKernel + <<>>( + start, delta, size, out.data()); } // Given *num_runs pairs (key, value), this function moves the value @@ -103,11 +103,10 @@ void CallGatherKernel(const GPUDevice& d, const 
T* params, const int32* indices, T* out, int64 gather_dim_size, int64 indices_size, int64 slice_size, int64 out_size) { CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d); - GatherOpKernel< - T, int32, - true><<>>( - params, indices, out, gather_dim_size, indices_size, slice_size, - out_size); + GatherOpKernel + <<>>( + params, indices, out, gather_dim_size, indices_size, slice_size, + out_size); } struct IdentityOp { @@ -231,10 +230,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel { OP_REQUIRES_ASYNC( c, TensorShapeUtils::StartsWith(data.shape(), partitions.shape()), - errors::InvalidArgument("data.shape must start with partitions.shape, ", - "got data.shape = ", data.shape().DebugString(), - ", partitions.shape = ", - partitions.shape().DebugString()), + errors::InvalidArgument( + "data.shape must start with partitions.shape, ", + "got data.shape = ", data.shape().DebugString(), + ", partitions.shape = ", partitions.shape().DebugString()), done); Tensor partition_count; @@ -245,8 +244,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel { AllocatorAttributes alloc_attr; alloc_attr.set_on_host(true); OP_REQUIRES_OK_ASYNC( - c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), - &partition_count, alloc_attr), + c, + c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), + &partition_count, alloc_attr), done); auto e_part_count = partition_count.flat(); for (int i = 0; i < num_partitions_; i++) e_part_count(i) = 0; @@ -259,8 +259,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel { // Prepare for counting. OP_REQUIRES_OK_ASYNC( - c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), - &partition_count), + c, + c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), + &partition_count), done); Tensor indices_out; // Count how many times each partition index occurs. @@ -280,8 +281,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel { alloc_attr.set_on_host(true); alloc_attr.set_gpu_compatible(true); OP_REQUIRES_OK_ASYNC( - c, c->allocate_temp(partition_count.dtype(), partition_count.shape(), - &cpu_tensor, alloc_attr), + c, + c->allocate_temp(partition_count.dtype(), partition_count.shape(), + &cpu_tensor, alloc_attr), done); perftools::gputools::DeviceMemoryBase wrapped( partition_count.flat().data(), num_partitions_ * sizeof(int32)); @@ -340,9 +342,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel { indices_in_ptr, indices_out_ptr, N, 0, sizeof(int32) * 8, cu_stream); // Allocate temporary storage. OP_REQUIRES_OK_ASYNC( - c, c->allocate_temp( - DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), - &cub_temp_storage), + c, + c->allocate_temp(DT_INT8, + TensorShape({static_cast(temp_storage_bytes)}), + &cub_temp_storage), done); // Radix-sort the partition information. cub::DeviceRadixSort::SortPairs( @@ -376,8 +379,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel { zero_functor(device, partition_count->flat()); // Allocate memory for aggregates_out. OP_REQUIRES_OK_ASYNC( - c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), - &aggregates_out), + c, + c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), + &aggregates_out), done); // Obtain the pointers to inner buffers. int32* keys_in_ptr = partitions_out.flat().data(); @@ -408,9 +412,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel { num_runs_ptr, reduction_op, N, cu_stream); // Allocate temporary storage. 
OP_REQUIRES_OK_ASYNC( - c, c->allocate_temp( - DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), - &cub_temp_storage), + c, + c->allocate_temp(DT_INT8, + TensorShape({static_cast(temp_storage_bytes)}), + &cub_temp_storage), done); // Run reduce-by-key. The effect is that we count how many times // each index appears in partitions. The distinct indices are stored diff --git a/tensorflow/core/kernels/eigen_activations.h b/tensorflow/core/kernels/eigen_activations.h index 99b4b2abe66..302033e47c5 100644 --- a/tensorflow/core/kernels/eigen_activations.h +++ b/tensorflow/core/kernels/eigen_activations.h @@ -21,13 +21,13 @@ limitations under the License. namespace Eigen { /** scalar_sigmoid_fast_derivative_op - * \ingroup CXX11_NeuralNetworks_Module - * \brief Template functor to compute the fast derivative of a sigmoid - * - * Input should be the backpropagated gradient. - * - * \sa class CwiseUnaryOp, Cwise::sigmoid_fast_derivative() - */ + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to compute the fast derivative of a sigmoid + * + * Input should be the backpropagated gradient. + * + * \sa class CwiseUnaryOp, Cwise::sigmoid_fast_derivative() + */ template struct scalar_sigmoid_fast_derivative_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_fast_derivative_op) @@ -55,13 +55,13 @@ struct functor_traits > { } // namespace internal /** scalar_tanh_fast_derivative_op - * \ingroup CXX11_NeuralNetworks_Module - * \brief Template functor to compute the fast derivative of a tanh - * - * Input should be the backpropagated gradient. - * - * \sa class CwiseUnaryOp, Cwise::tanh_fast_derivative() - */ + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to compute the fast derivative of a tanh + * + * Input should be the backpropagated gradient. + * + * \sa class CwiseUnaryOp, Cwise::tanh_fast_derivative() + */ template struct scalar_tanh_fast_derivative_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_fast_derivative_op) @@ -89,11 +89,11 @@ struct functor_traits > { } // namespace internal /** - * \ingroup CXX11_NeuralNetworks_Module - * \brief Template functor to clip the magnitude of the first scalar. - * - * \sa class CwiseBinaryOp, MatrixBase::Clip - */ + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to clip the magnitude of the first scalar. + * + * \sa class CwiseBinaryOp, MatrixBase::Clip + */ template struct scalar_clip_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_clip_op) diff --git a/tensorflow/core/kernels/eigen_activations_test.cc b/tensorflow/core/kernels/eigen_activations_test.cc index 907233103d8..34952f5abb8 100644 --- a/tensorflow/core/kernels/eigen_activations_test.cc +++ b/tensorflow/core/kernels/eigen_activations_test.cc @@ -23,7 +23,7 @@ namespace { void EigenApprox(float a, float b) { ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3); } -} +} // namespace TEST(EigenBackwardSpatialConvolutionsTest, SigmoidFastDerivative) { const ptrdiff_t depth = 3; diff --git a/tensorflow/core/kernels/eigen_attention.h b/tensorflow/core/kernels/eigen_attention.h index 3a94b8c9933..4d86f9deb99 100644 --- a/tensorflow/core/kernels/eigen_attention.h +++ b/tensorflow/core/kernels/eigen_attention.h @@ -21,35 +21,47 @@ limitations under the License. namespace Eigen { /** ExtractGlimpses - * \ingroup CXX11_NeuralNetworks_Module - * - * \brief Extract glimpses from an input tensor. - * - * The input parameter is expected to be a col-major tensor with a rank of 4 (depth, x, y, and batch). 
- * The width and height parameters specify the extension of the returned glimpses. - * The offsets parameter specifies the x, y locations of the center of the glimpses relative to the center of the input image. The vector is expected to contain one IndexPair for each image in the batch dimension. - * The normalized boolean indicates if incoming coordinates are normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each height and width dimension. - * The centered boolean indicates if incoming coordinates are centered relative to the image, in which case -1.0 and 1.0 correspond to minimum and maximum of each dimension while 0.0 corresponds to the center. - * - * The result can be assigned to a tensor of rank equal to that of the input. The result will be laid out in col-major order (depth, x, y, batch). - * The dimensions of the result will be equal to the dimensions of the input except for width and height which will be equal to the requested glimpse size. - */ + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Extract glimpses from an input tensor. + * + * The input parameter is expected to be a col-major tensor with a rank of 4 + * (depth, x, y, and batch). The width and height parameters specify the + * extension of the returned glimpses. The offsets parameter specifies the x, y + * locations of the center of the glimpses relative to the center of the input + * image. The vector is expected to contain one IndexPair for each image in the + * batch dimension. The normalized boolean indicates if incoming coordinates are + * normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each + * height and width dimension. The centered boolean indicates if incoming + * coordinates are centered relative to the image, in which case -1.0 and 1.0 + * correspond to minimum and maximum of each dimension while 0.0 corresponds to + * the center. + * + * The result can be assigned to a tensor of rank equal to that of the input. + * The result will be laid out in col-major order (depth, x, y, batch). The + * dimensions of the result will be equal to the dimensions of the input except + * for width and height which will be equal to the requested glimpse size. 
+ */ namespace { template struct GlimpseExtractionOp { GlimpseExtractionOp(const Index width, const Index height, const std::vector >& offsets, - const bool normalized, - const bool centered, - const bool uniform_noise) : - width_(width), height_(height), offsets_(offsets), - normalized_(normalized), centered_(centered), uniform_noise_(uniform_noise) { } + const bool normalized, const bool centered, + const bool uniform_noise) + : width_(width), + height_(height), + offsets_(offsets), + normalized_(normalized), + centered_(centered), + uniform_noise_(uniform_noise) {} template DSizes dimensions(const Input& input) const { typedef typename internal::traits::Index IndexType; typedef TensorRef::Scalar, 4, - internal::traits::Layout, IndexType> > Ref; + internal::traits::Layout, IndexType> > + Ref; Ref in(input); DSizes dims = in.dimensions(); @@ -62,12 +74,12 @@ struct GlimpseExtractionOp { } template - EIGEN_DEVICE_FUNC - void eval(const Input& input, Output& output, const Device& device) const - { + EIGEN_DEVICE_FUNC void eval(const Input& input, Output& output, + const Device& device) const { typedef typename internal::traits::Index IndexType; typedef TensorRef::Scalar, 4, - internal::traits::Layout, IndexType> > Ref; + internal::traits::Layout, IndexType> > + Ref; Ref in(input); const Index num_channels = in.dimension(0); const Index input_width = in.dimension(1); @@ -97,8 +109,8 @@ struct GlimpseExtractionOp { x -= width_ / 2.0f; y -= height_ / 2.0f; - const Index offset_x = (Index) x; - const Index offset_y = (Index) y; + const Index offset_x = (Index)x; + const Index offset_y = (Index)y; Index glimpse_width = width_; Index glimpse_height = height_; bool partial_overlap = false; @@ -135,7 +147,7 @@ struct GlimpseExtractionOp { if (uniform_noise_) { // Initialize the glimpse with uniform noise. 
typedef typename internal::remove_const< - typename internal::traits::Scalar>::type Scalar; + typename internal::traits::Scalar>::type Scalar; TensorFixedSize > mini; mini.device(device) = input.template chip<3>(i).minimum(); TensorFixedSize > range; @@ -215,21 +227,22 @@ struct GlimpseExtractionOp { const bool centered_; const bool uniform_noise_; }; -} - +} // namespace template -EIGEN_ALWAYS_INLINE -static const TensorCustomUnaryOp::Index>, const Input> +EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp< + const GlimpseExtractionOp::Index>, + const Input> ExtractGlimpses(const Input& input, const typename internal::traits::Index width, const typename internal::traits::Index height, const std::vector >& offsets, const bool normalized = true, const bool centered = true, - const bool uniform_noise = true) -{ - EIGEN_STATIC_ASSERT(internal::traits::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + const bool uniform_noise = true) { + EIGEN_STATIC_ASSERT(internal::traits::Layout == ColMajor, + YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 4, + YOU_MADE_A_PROGRAMMING_MISTAKE); typedef typename internal::traits::Index Index; const GlimpseExtractionOp op(width, height, offsets, normalized, @@ -237,6 +250,6 @@ ExtractGlimpses(const Input& input, return input.customOp(op); } -} // end namespace Eigen +} // end namespace Eigen #endif // TENSORFLOW_CORE_KERNELS_EIGEN_ATTENTION_H_ diff --git a/tensorflow/core/kernels/eigen_attention_test.cc b/tensorflow/core/kernels/eigen_attention_test.cc index 3a2eeb05959..08f61877182 100644 --- a/tensorflow/core/kernels/eigen_attention_test.cc +++ b/tensorflow/core/kernels/eigen_attention_test.cc @@ -23,7 +23,7 @@ namespace { void EigenApprox(float a, float b) { ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3); } -} +} // namespace TEST(EigenAttentionTest, Simple) { const ptrdiff_t depth = 3; diff --git a/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h b/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h index aec76978102..099696105b6 100644 --- a/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h +++ b/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h @@ -21,29 +21,29 @@ limitations under the License. namespace Eigen { /** SpatialConvolutionBackwardInput - * \ingroup CXX11_NeuralNetworks_Module - * - * \brief Computes the backprop for the input of a 2D convolution. - * - * The output_backward parameter is expected to be a tensor with a rank of 3 or + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the input of a 2D convolution. + * + * The output_backward parameter is expected to be a tensor with a rank of 3 or * more (channels, height, width, and optionally others) - * The kernel parameter is expected to be a 4D tensor (filters, channels, + * The kernel parameter is expected to be a 4D tensor (filters, channels, * kernel_height, kernel_width) - * The output_backward and the kernel must both be in col-major layout. The + * The output_backward and the kernel must both be in col-major layout. The * result will also be in col-major layout. - * - * If row_in_stride, col_in_stride > 1, then applies convolution with holes + * + * If row_in_stride, col_in_stride > 1, then applies convolution with holes * (aka atrous convolution), sampling every row_in_stride, col_in_stride input * pixels. 
- * - * The result can be assigned to a tensor of rank equal to the rank of the + * + * The result can be assigned to a tensor of rank equal to the rank of the * output_backward. The dimensions of the result will be filters, height, width * (and others if applicable). - * - * It is possible to swap the order of the width and height dimensions provided + * + * It is possible to swap the order of the width and height dimensions provided * that the same order is used in the input, the kernel, and the output. - * - */ + * + */ #ifdef EIGEN_HAS_INDEX_LIST typedef IndexList, type2index<0>, type2index<1>, type2index<1> > ReverseColMajor; @@ -293,29 +293,29 @@ SpatialConvolutionBackwardInput( } /** SpatialConvolutionBackwardKernel - * \ingroup CXX11_NeuralNetworks_Module - * - * \brief Computes the backprop for the filter of a 2D convolution. - * - * The output_backward parameter is expected to be a tensor with a rank of 3 or + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the filter of a 2D convolution. + * + * The output_backward parameter is expected to be a tensor with a rank of 3 or * more (channels, height, width, and optionally others) - * The kernel parameter is expected to be a 4D tensor (filters, channels, + * The kernel parameter is expected to be a 4D tensor (filters, channels, * kernel_height, kernel_width) - * The output_backward and the kernel must both be in col-major layout. The + * The output_backward and the kernel must both be in col-major layout. The * result will also be in col-major layout. - * - * If row_in_stride, col_stride > 1, then applies convolution with holes (aka + * + * If row_in_stride, col_stride > 1, then applies convolution with holes (aka * atrous convolution), sampling every row_in_stride, col_in_stride input * pixels. - * - * The result can be assigned to a tensor of rank equal to the rank of the + * + * The result can be assigned to a tensor of rank equal to the rank of the * output_backward. The dimensions of the result will be filters, height, width * (and others if applicable). - * - * It is possible to swap the order of the width and height dimensions provided + * + * It is possible to swap the order of the width and height dimensions provided * that the same order is used in the input, the kernel, and the output. 
- * - */ + * + */ template EIGEN_ALWAYS_INLINE static const typename internal::conditional< diff --git a/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc b/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc index 1758067829e..2229ec96594 100644 --- a/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc +++ b/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc @@ -25,7 +25,7 @@ void EigenApprox(float a, float b) { ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3); } static int ceil_div(int a, int b) { return (a + b - 1) / b; } -} +} // namespace TEST(EigenBackwardSpatialConvolutionsTest, test_simple_spatial_convolution_backward_input_valid) { diff --git a/tensorflow/core/kernels/eigen_pooling.h b/tensorflow/core/kernels/eigen_pooling.h index 972036833ff..896c9957616 100644 --- a/tensorflow/core/kernels/eigen_pooling.h +++ b/tensorflow/core/kernels/eigen_pooling.h @@ -309,10 +309,10 @@ struct AvgPoolMeanReducer { _mm512_castsi512_ps( \ _mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a, b, _CMP_EQ_UQ), -1)) -// The ternarylogic function immediate determines the values in the result -// In the case below, 0xd8 implies (false_mask) ? (b) : (a) -// For details, refer to the vpternlogd instruction table at -// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-2c-manual.pdf + // The ternarylogic function immediate determines the values in the result + // In the case below, 0xd8 implies (false_mask) ? (b) : (a) + // For details, refer to the vpternlogd instruction table at + // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-2c-manual.pdf #define psel(a, b, false_mask) \ _mm512_castsi512_ps(_mm512_ternarylogic_epi32( \ diff --git a/tensorflow/core/kernels/eigen_pooling_test.cc b/tensorflow/core/kernels/eigen_pooling_test.cc index 9383972b9ff..47b6665e680 100644 --- a/tensorflow/core/kernels/eigen_pooling_test.cc +++ b/tensorflow/core/kernels/eigen_pooling_test.cc @@ -23,7 +23,7 @@ namespace { void EigenApprox(float a, float b) { ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3); } -} +} // namespace TEST(EigenPoolingTest, Simple) { const int depth = 10; diff --git a/tensorflow/core/kernels/eigen_softmax.h b/tensorflow/core/kernels/eigen_softmax.h index a2930a726f9..12148c54b36 100644 --- a/tensorflow/core/kernels/eigen_softmax.h +++ b/tensorflow/core/kernels/eigen_softmax.h @@ -21,19 +21,21 @@ limitations under the License. namespace Eigen { /** SoftMax - * \ingroup CXX11_NeuralNetworks_Module - * - * \brief Applies a softmax - * - * The input parameter is expected to be a col-major tensor with a rank of 2 (depth and other). - * - * The result can be assigned to a tensor of rank and dimensions equal to that of the input. The result will be laid out in col-major order. - * -*/ + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a softmax + * + * The input parameter is expected to be a col-major tensor with a rank of 2 + * (depth and other). + * + * The result can be assigned to a tensor of rank and dimensions equal to that + * of the input. The result will be laid out in col-major order. 
+ * + */ namespace { struct SoftmaxOp { - SoftmaxOp(const float beta) : beta_(beta) { } + SoftmaxOp(const float beta) : beta_(beta) {} template typename Input::Dimensions dimensions(const Input& input) const { @@ -41,8 +43,7 @@ struct SoftmaxOp { } template - void eval(const Input& input, Output& output, const Device& device) const - { + void eval(const Input& input, Output& output, const Device& device) const { #if !defined(EIGEN_HAS_INDEX_LIST) // nvcc doesn't support cxx11 Eigen::array::Index, 1> depth_dim; @@ -56,35 +57,43 @@ struct SoftmaxOp { #else // Take advantage of cxx11 to give the compiler information it can use to // optimize the code. - Eigen::IndexList> depth_dim; - Eigen::IndexList> bcast; + Eigen::IndexList > depth_dim; + Eigen::IndexList > bcast; bcast.set(0, dimensions(input)[0]); - Eigen::IndexList, typename internal::traits::Index> dims2d; + Eigen::IndexList, + typename internal::traits::Index> + dims2d; dims2d.set(1, dimensions(input)[1]); #endif - output.device(device) = ((input - input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) * beta_).exp(); - output.device(device) = output / (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); + output.device(device) = + ((input - + input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) * + beta_) + .exp(); + output.device(device) = + output / + (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); } private: const float beta_; }; -} - +} // namespace template -EIGEN_ALWAYS_INLINE -static const TensorCustomUnaryOp -SoftMax(const Input& input, const float beta) -{ - EIGEN_STATIC_ASSERT(internal::traits::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 2, YOU_MADE_A_PROGRAMMING_MISTAKE); +EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp +SoftMax(const Input& input, const float beta) { + EIGEN_STATIC_ASSERT(internal::traits::Layout == ColMajor, + YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 2, + YOU_MADE_A_PROGRAMMING_MISTAKE); const SoftmaxOp op(beta); return input.customOp(op); } -} // end namespace Eigen +} // end namespace Eigen #endif // TENSORFLOW_CORE_KERNELS_EIGEN_SOFTMAX_H_ diff --git a/tensorflow/core/kernels/eigen_softmax_test.cc b/tensorflow/core/kernels/eigen_softmax_test.cc index ba681d68ab0..7f985d71366 100644 --- a/tensorflow/core/kernels/eigen_softmax_test.cc +++ b/tensorflow/core/kernels/eigen_softmax_test.cc @@ -23,7 +23,7 @@ namespace { void EigenApprox(float a, float b) { ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3); } -} +} // namespace TEST(EigenSoftmaxTest, Simple) { const int depth = 1024; diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h index 2fe64cd72ac..1acbe3a6580 100644 --- a/tensorflow/core/kernels/eigen_spatial_convolutions.h +++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h @@ -877,29 +877,29 @@ struct gemm_pack_rhs< } // end namespace internal /** SpatialConvolution - * \ingroup CXX11_NeuralNetworks_Module - * - * \brief Applies a 2D convolution over a multichannel input image. - * - * The input parameter is expected to be a tensor with a rank of 3 or more + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a 2D convolution over a multichannel input image. 
+ * + * The input parameter is expected to be a tensor with a rank of 3 or more * (channels, height, width, and optionally others) - * The kernel parameter is expected to be a 4D tensor (filters, channels, + * The kernel parameter is expected to be a 4D tensor (filters, channels, * kernel_height, kernel_width) - * The input and the kernel must both be in col-major layout. The result will + * The input and the kernel must both be in col-major layout. The result will * also be in col-major layout. - * - * If col_in_stride, row_in_stride > 1, then applies convolution with holes + * + * If col_in_stride, row_in_stride > 1, then applies convolution with holes * (aka atrous convolution), sampling every col_in_stride, row_in_stride input * pixels. - * - * The result can be assigned to a tensor of rank equal to the rank of the + * + * The result can be assigned to a tensor of rank equal to the rank of the * input. The dimensions of the result will be filters, height, width (and * others if applicable). - * - * It is possible to swap the order of the width and height dimensions provided + * + * It is possible to swap the order of the width and height dimensions provided * that the same order is used in the input, the kernel, and the output. - * - */ + * + */ template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static const typename internal::conditional< @@ -993,7 +993,7 @@ EIGEN_DEVICE_FUNC default: // Initialize unused variables to avoid a compiler warning out_height = 0; - out_width = 0; + out_width = 0; eigen_assert(false && "unexpected padding"); } diff --git a/tensorflow/core/kernels/encode_jpeg_op.cc b/tensorflow/core/kernels/encode_jpeg_op.cc index 4fcae25aa6e..1a5b0f2b675 100644 --- a/tensorflow/core/kernels/encode_jpeg_op.cc +++ b/tensorflow/core/kernels/encode_jpeg_op.cc @@ -80,10 +80,11 @@ class EncodeJpegOp : public OpKernel { errors::InvalidArgument("image must be 3-dimensional", image.shape().DebugString())); - OP_REQUIRES(context, FastBoundsCheck(image.NumElements(), - std::numeric_limits::max()), - errors::InvalidArgument( - "Cannot encode images with >= max int32 elements")); + OP_REQUIRES( + context, + FastBoundsCheck(image.NumElements(), std::numeric_limits::max()), + errors::InvalidArgument( + "Cannot encode images with >= max int32 elements")); const int32 dim_size0 = static_cast(image.dim_size(0)); const int32 dim_size1 = static_cast(image.dim_size(1)); @@ -100,9 +101,10 @@ class EncodeJpegOp : public OpKernel { } else if (channels == 3) { adjusted_flags.format = jpeg::FORMAT_RGB; } else { - OP_REQUIRES(context, false, errors::InvalidArgument( - "image must have 1 or 3 channels, got ", - image.shape().DebugString())); + OP_REQUIRES( + context, false, + errors::InvalidArgument("image must have 1 or 3 channels, got ", + image.shape().DebugString())); } } else { if (flags_.format == jpeg::FORMAT_GRAYSCALE) { diff --git a/tensorflow/core/kernels/example_parsing_ops.cc b/tensorflow/core/kernels/example_parsing_ops.cc index 268a059275a..83cd0e9b47e 100644 --- a/tensorflow/core/kernels/example_parsing_ops.cc +++ b/tensorflow/core/kernels/example_parsing_ops.cc @@ -346,8 +346,9 @@ class SingleSequenceExampleParserOp : public OpKernel { feature_list_sparse_keys[di].scalar()(); } OP_REQUIRES( - ctx, TensorShapeUtils::IsVector( - feature_list_dense_missing_assumed_empty->shape()), + ctx, + TensorShapeUtils::IsVector( + feature_list_dense_missing_assumed_empty->shape()), errors::InvalidArgument( "Expected feature_list_dense_missing_assumed_empty ", "to be a vector, got shape: ", @@ -386,12 
+387,12 @@ class SingleSequenceExampleParserOp : public OpKernel { required[d] = (def_value.NumElements() == 0); // No default provided. if (def_value.NumElements() > 0) { - OP_REQUIRES( - ctx, def_value.shape() == attrs_.context_dense_shapes[d], - errors::InvalidArgument( - "def_value[", d, "].shape() == ", - def_value.shape().DebugString(), " != context_dense_shapes_[", - d, "] == ", attrs_.context_dense_shapes[d].DebugString())); + OP_REQUIRES(ctx, def_value.shape() == attrs_.context_dense_shapes[d], + errors::InvalidArgument( + "def_value[", d, + "].shape() == ", def_value.shape().DebugString(), + " != context_dense_shapes_[", d, + "] == ", attrs_.context_dense_shapes[d].DebugString())); OP_REQUIRES( ctx, def_value.dtype() == attrs_.context_dense_types[d], errors::InvalidArgument( @@ -576,12 +577,12 @@ class SingleSequenceExampleParserOp : public OpKernel { const Feature& f = fl.feature(t); bool types_match; OP_REQUIRES_OK(ctx, CheckTypesMatch(f, dtype, &types_match)); - OP_REQUIRES( - ctx, types_match, - errors::InvalidArgument( - "Name: ", name, ", Feature list: ", key, ", Index: ", t, - ". Data types don't match. ", "Expected type: ", - DataTypeString(dtype), " Feature is: ", ProtoDebugString(f))); + OP_REQUIRES(ctx, types_match, + errors::InvalidArgument( + "Name: ", name, ", Feature list: ", key, ", Index: ", t, + ". Data types don't match. ", + "Expected type: ", DataTypeString(dtype), + " Feature is: ", ProtoDebugString(f))); OP_REQUIRES_OK(ctx, FeatureDenseCopy(t, name, key, dtype, shape, f, feature_list_dense_values[d])); } diff --git a/tensorflow/core/kernels/fact_op.cc b/tensorflow/core/kernels/fact_op.cc index 4fbf76d2d0d..4a1aa433bc9 100644 --- a/tensorflow/core/kernels/fact_op.cc +++ b/tensorflow/core/kernels/fact_op.cc @@ -122,13 +122,9 @@ static string D(const char* s) { return ret; } -REGISTER_KERNEL_BUILDER(Name("Fact") - .Device(DEVICE_CPU) - .Label(D("Yoxmos").c_str()), - FactOpKernel2); -REGISTER_KERNEL_BUILDER(Name("Fact") - .Device(DEVICE_CPU) - .Label(D("yoxmos").c_str()), - FactOpKernel2); +REGISTER_KERNEL_BUILDER( + Name("Fact").Device(DEVICE_CPU).Label(D("Yoxmos").c_str()), FactOpKernel2); +REGISTER_KERNEL_BUILDER( + Name("Fact").Device(DEVICE_CPU).Label(D("yoxmos").c_str()), FactOpKernel2); } // namespace tensorflow diff --git a/tensorflow/core/kernels/fake_quant_ops_test.cc b/tensorflow/core/kernels/fake_quant_ops_test.cc index 5953db14768..af3a42135d1 100644 --- a/tensorflow/core/kernels/fake_quant_ops_test.cc +++ b/tensorflow/core/kernels/fake_quant_ops_test.cc @@ -378,9 +378,8 @@ TEST_F(QuantOpsTest, WithArgsGradient_RegularRange) { Tensor* output = GetOutput(0); auto input_flat = GetInput(0).flat(); Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3})); - FillValues(&expected, - {0.0f, input_flat(1), input_flat(2), - input_flat(3), input_flat(4), 0.0f}); + FillValues(&expected, {0.0f, input_flat(1), input_flat(2), + input_flat(3), input_flat(4), 0.0f}); ExpectClose(expected, *output); } @@ -2167,21 +2166,19 @@ TEST_F(QuantOpsTest, Tensor* output_bprop_wrt_input = GetOutput(0); Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3})); auto grad_flat = GetInput(0).flat(); - FillValues(&expected_bprop_wrt_input, - {0.0f, grad_flat(1), grad_flat(2), - grad_flat(3), grad_flat(4), 0.0f}); + FillValues( + &expected_bprop_wrt_input, + {0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f}); ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input); Tensor* output_bprop_wrt_min = GetOutput(1); Tensor 
expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3})); - FillValues(&expected_bprop_wrt_min, - {grad_flat(0), 0.0f, 0.0f}); + FillValues(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f}); ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min); Tensor* output_bprop_wrt_max = GetOutput(2); Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3})); - FillValues(&expected_bprop_wrt_max, - {0.0f, 0.0f, grad_flat(5)}); + FillValues(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)}); ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max); } @@ -2215,21 +2212,19 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedUp_4Bits_NarrowRange) { Tensor* output_bprop_wrt_input = GetOutput(0); Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3})); auto grad_flat = GetInput(0).flat(); - FillValues(&expected_bprop_wrt_input, - {0.0f, grad_flat(1), grad_flat(2), - grad_flat(3), grad_flat(4), 0.0f}); + FillValues( + &expected_bprop_wrt_input, + {0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f}); ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input); Tensor* output_bprop_wrt_min = GetOutput(1); Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3})); - FillValues(&expected_bprop_wrt_min, - {grad_flat(0), 0.0f, 0.0f}); + FillValues(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f}); ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min); Tensor* output_bprop_wrt_max = GetOutput(2); Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3})); - FillValues(&expected_bprop_wrt_max, - {0.0f, 0.0f, grad_flat(5)}); + FillValues(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)}); ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max); } @@ -2270,14 +2265,13 @@ TEST_F(QuantOpsTest, Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({1, 2, 3, 4})); auto grad_flat = GetInput(0).flat(); - FillValues( - &expected_bprop_wrt_input, - {0.0f, grad_flat(1), grad_flat(2), 0.0f, - 0.0f, grad_flat(5), grad_flat(6), 0.0f, - 0.0f, grad_flat(9), grad_flat(10), 0.0f, - 0.0f, grad_flat(13), grad_flat(14), 0.0f, - 0.0f, grad_flat(17), grad_flat(18), 0.0f, - 0.0f, grad_flat(21), grad_flat(22), 0.0f}); + FillValues(&expected_bprop_wrt_input, + {0.0f, grad_flat(1), grad_flat(2), 0.0f, + 0.0f, grad_flat(5), grad_flat(6), 0.0f, + 0.0f, grad_flat(9), grad_flat(10), 0.0f, + 0.0f, grad_flat(13), grad_flat(14), 0.0f, + 0.0f, grad_flat(17), grad_flat(18), 0.0f, + 0.0f, grad_flat(21), grad_flat(22), 0.0f}); ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input); Tensor* output_bprop_wrt_min = GetOutput(1); diff --git a/tensorflow/core/kernels/fifo_queue.cc b/tensorflow/core/kernels/fifo_queue.cc index 82ec8791198..479f7be4b50 100644 --- a/tensorflow/core/kernels/fifo_queue.cc +++ b/tensorflow/core/kernels/fifo_queue.cc @@ -255,97 +255,96 @@ void FIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx, // TODO(josh11b): This makes two copies of callback, avoid this if possible. dequeue_attempts_.emplace_back( num_elements, [callback]() { callback(Tuple()); }, ctx, cm, token, - [callback, allow_small_batch, this](Attempt* attempt) - EXCLUSIVE_LOCKS_REQUIRED(mu_) { - int64 queue_size = queues_[0].size(); + [callback, allow_small_batch, + this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + int64 queue_size = queues_[0].size(); - if (closed_ && queue_size < attempt->elements_requested) { - // If we don't have enough for a full dequeue, we have - // to reset the attempt tuple. 
- if (!attempt->tuple.empty()) { - // Restore already-dequeued elements to the front of the - // queue. - for (int64 i = attempt->tuple[0].dim_size(0) - - attempt->elements_requested - 1; - i >= 0; --i) { - for (int j = 0; j < num_components(); ++j) { - PersistentTensor element; - Status s = GetElementComponentFromBatch( - attempt->tuple, i, j, attempt->context, &element); - if (!s.ok()) { - attempt->context->SetStatus( - errors::DataLoss("Failed to restore element from " - "partially-dequeued batch " - "to FIFOQueue: ", - s.error_message())); - } - queues_[j].push_front(element); - } - } - } - if (allow_small_batch && !queues_[0].empty()) { - // Request all remaining elements in the queue. - queue_size = queues_[0].size(); - attempt->tuple.clear(); - attempt->elements_requested = queue_size; - } else { - if (allow_small_batch) { - // There may be some other attempts containing - // values. If so, we'll yield and wait for them - // to add elements to the queue. - if (!enqueue_attempts_.empty()) return kProgress; - } - if (attempt->context->status().ok()) { - attempt->context->SetStatus(errors::OutOfRange( - "FIFOQueue '", name_, "' is closed and has ", - "insufficient elements (requested ", - attempt->elements_requested, ", current size ", - queue_size, ")")); - } - return kComplete; - } - } - - RunResult result = kNoProgress; - for (; queue_size > 0; --queue_size) { - if (attempt->tuple.empty()) { - // Only allocate tuple when we have something to dequeue - // so we don't use excessive memory when there are many - // blocked dequeue attempts waiting. - attempt->tuple.reserve(num_components()); - for (int i = 0; i < num_components(); ++i) { - const TensorShape shape = - ManyOutShape(i, attempt->elements_requested); - Tensor element; + if (closed_ && queue_size < attempt->elements_requested) { + // If we don't have enough for a full dequeue, we have + // to reset the attempt tuple. + if (!attempt->tuple.empty()) { + // Restore already-dequeued elements to the front of the + // queue. + for (int64 i = attempt->tuple[0].dim_size(0) - + attempt->elements_requested - 1; + i >= 0; --i) { + for (int j = 0; j < num_components(); ++j) { + PersistentTensor element; + Status s = GetElementComponentFromBatch( + attempt->tuple, i, j, attempt->context, &element); + if (!s.ok()) { attempt->context->SetStatus( - attempt->context->allocate_temp(component_dtypes_[i], - shape, &element)); - if (!attempt->context->status().ok()) return kComplete; - attempt->tuple.emplace_back(element); + errors::DataLoss("Failed to restore element from " + "partially-dequeued batch " + "to FIFOQueue: ", + s.error_message())); } - } - result = kProgress; - Tuple tuple; - DequeueLocked(attempt->context, &tuple); - const int64 index = attempt->tuple[0].dim_size(0) - - attempt->elements_requested; - for (int i = 0; i < num_components(); ++i) { - attempt->context->SetStatus(batch_util::CopyElementToSlice( - std::move(tuple[i]), &attempt->tuple[i], index)); - if (!attempt->context->status().ok()) return kComplete; - } - tuple.clear(); - --attempt->elements_requested; - if (attempt->elements_requested == 0) { - tuple = attempt->tuple; - attempt->done_callback = [callback, tuple]() { - callback(tuple); - }; - return kComplete; + queues_[j].push_front(element); } } - return result; - }); + } + if (allow_small_batch && !queues_[0].empty()) { + // Request all remaining elements in the queue. 
+ queue_size = queues_[0].size(); + attempt->tuple.clear(); + attempt->elements_requested = queue_size; + } else { + if (allow_small_batch) { + // There may be some other attempts containing + // values. If so, we'll yield and wait for them + // to add elements to the queue. + if (!enqueue_attempts_.empty()) return kProgress; + } + if (attempt->context->status().ok()) { + attempt->context->SetStatus(errors::OutOfRange( + "FIFOQueue '", name_, "' is closed and has ", + "insufficient elements (requested ", + attempt->elements_requested, ", current size ", + queue_size, ")")); + } + return kComplete; + } + } + + RunResult result = kNoProgress; + for (; queue_size > 0; --queue_size) { + if (attempt->tuple.empty()) { + // Only allocate tuple when we have something to dequeue + // so we don't use excessive memory when there are many + // blocked dequeue attempts waiting. + attempt->tuple.reserve(num_components()); + for (int i = 0; i < num_components(); ++i) { + const TensorShape shape = + ManyOutShape(i, attempt->elements_requested); + Tensor element; + attempt->context->SetStatus(attempt->context->allocate_temp( + component_dtypes_[i], shape, &element)); + if (!attempt->context->status().ok()) return kComplete; + attempt->tuple.emplace_back(element); + } + } + result = kProgress; + Tuple tuple; + DequeueLocked(attempt->context, &tuple); + const int64 index = + attempt->tuple[0].dim_size(0) - attempt->elements_requested; + for (int i = 0; i < num_components(); ++i) { + attempt->context->SetStatus(batch_util::CopyElementToSlice( + std::move(tuple[i]), &attempt->tuple[i], index)); + if (!attempt->context->status().ok()) return kComplete; + } + tuple.clear(); + --attempt->elements_requested; + if (attempt->elements_requested == 0) { + tuple = attempt->tuple; + attempt->done_callback = [callback, tuple]() { + callback(tuple); + }; + return kComplete; + } + } + return result; + }); } } if (!already_cancelled) { diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc index bde39770dee..7090417dfdb 100644 --- a/tensorflow/core/kernels/fill_functor.cc +++ b/tensorflow/core/kernels/fill_functor.cc @@ -18,8 +18,8 @@ limitations under the License. 
#define EIGEN_USE_THREADS #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/variant_encode_decode.h" @@ -60,7 +60,7 @@ DEFINE_SETZERO_CPU(Variant); template void SetZeroFunctor::operator()( const Eigen::SyclDevice& d, typename TTypes::Flat out) { - To32Bit(out).device(d) = To32Bit(out).constant(T(0)); + To32Bit(out).device(d) = To32Bit(out).constant(T(0)); } #define DEFINE_SETZERO_SYCL(T) \ @@ -118,7 +118,8 @@ DEFINE_SETONE_SYCL(double); template struct FillFunctor { - void operator()(const Eigen::ThreadPoolDevice& d, typename TTypes::Flat out, + void operator()(const Eigen::ThreadPoolDevice& d, + typename TTypes::Flat out, typename TTypes::ConstScalar in) { out.device(d) = out.constant(in()); } @@ -150,8 +151,7 @@ struct FillFunctor { } }; -#define DEFINE_FILL_SYCL(T) \ - template struct FillFunctor; +#define DEFINE_FILL_SYCL(T) template struct FillFunctor; DEFINE_FILL_SYCL(float); DEFINE_FILL_SYCL(double); TF_CALL_INTEGRAL_TYPES(DEFINE_FILL_SYCL) diff --git a/tensorflow/core/kernels/fractional_avg_pool_op.cc b/tensorflow/core/kernels/fractional_avg_pool_op.cc index 47f4189c30f..135d0023458 100644 --- a/tensorflow/core/kernels/fractional_avg_pool_op.cc +++ b/tensorflow/core/kernels/fractional_avg_pool_op.cc @@ -232,8 +232,9 @@ class FractionalAvgPoolGradOp : public OpKernel { // Grab the inputs. const Tensor& orig_input_tensor_shape = context->input(0); - OP_REQUIRES(context, orig_input_tensor_shape.dims() == 1 && - orig_input_tensor_shape.NumElements() == 4, + OP_REQUIRES(context, + orig_input_tensor_shape.dims() == 1 && + orig_input_tensor_shape.NumElements() == 4, errors::InvalidArgument("original input tensor shape must be" "1-dimensional and 4 elements")); const Tensor& out_backprop = context->input(1); diff --git a/tensorflow/core/kernels/fractional_pool_common.h b/tensorflow/core/kernels/fractional_pool_common.h index df0bbbfa066..2d7a230fc00 100644 --- a/tensorflow/core/kernels/fractional_pool_common.h +++ b/tensorflow/core/kernels/fractional_pool_common.h @@ -57,7 +57,7 @@ static inline void RandomShuffle(Iter first, Iter last, const Random& uniform) { // * sum(generated_diff_pooling_sequence) = input_length // * Let's define floor(input_length / output_length) = K, then // K <= generated_diff_pooling_sequence[i] <= K+1 -// For example, when input_length = 10, output_length = 6, the followings are +// For example, when input_length = 10, output_length = 6, the following are // valid pooling sequence: // * [1, 2, 2, 1, 2, 2] // * [1, 1, 2, 2, 2, 2] diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc index ef9e8484132..9d4bc35ba89 100644 --- a/tensorflow/core/kernels/function_ops.cc +++ b/tensorflow/core/kernels/function_ops.cc @@ -253,22 +253,21 @@ class SymbolicGradientOp : public AsyncOpKernel { args.push_back(ctx->input(i)); } std::vector* rets = new std::vector; - lib->Run( - opts, handle, args, rets, [ctx, done, rets](const Status& status) { - if (!status.ok()) { - ctx->SetStatus(status); - } else if (rets->size() != ctx->num_outputs()) { - ctx->SetStatus(errors::InvalidArgument( - "SymGrad expects to return ", ctx->num_outputs(), - " tensor(s), but get ", rets->size(), " tensor(s) instead.")); - } else { - for (size_t i = 0; i < rets->size(); ++i) { - ctx->set_output(i, (*rets)[i]); - 
} - } - delete rets; - done(); - }); + lib->Run(opts, handle, args, rets, [ctx, done, rets](const Status& status) { + if (!status.ok()) { + ctx->SetStatus(status); + } else if (rets->size() != ctx->num_outputs()) { + ctx->SetStatus(errors::InvalidArgument( + "SymGrad expects to return ", ctx->num_outputs(), + " tensor(s), but get ", rets->size(), " tensor(s) instead.")); + } else { + for (size_t i = 0; i < rets->size(); ++i) { + ctx->set_output(i, (*rets)[i]); + } + } + delete rets; + done(); + }); } private: diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc index a8484390b92..4a67b2b3a30 100644 --- a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc +++ b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc @@ -68,7 +68,8 @@ void InvVarianceToVariance::operator()(const Eigen::GpuDevice& d, template void SetNanFunctor::operator()(const Eigen::GpuDevice& d, typename TTypes::Flat out) { - To32Bit(out).device(d) = To32Bit(out).constant(Eigen::NumTraits::quiet_NaN()); + To32Bit(out).device(d) = + To32Bit(out).constant(Eigen::NumTraits::quiet_NaN()); } template class VarianceToInvVariance; diff --git a/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc index 6d4a9dfdef4..37edd1ce0f9 100644 --- a/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" namespace tensorflow { namespace fuzzing { diff --git a/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc index b084a972049..f3b24b2341e 100644 --- a/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" namespace tensorflow { namespace fuzzing { diff --git a/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc index 9dd795b94e8..e9ffad17861 100644 --- a/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" namespace tensorflow { namespace fuzzing { diff --git a/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc index 4a68a5b5803..020f18b1895 100644 --- a/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" namespace tensorflow { namespace fuzzing { diff --git a/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc b/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc index 2d6c82826cf..a8f07f4bad3 100644 --- a/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" namespace tensorflow { namespace fuzzing { diff --git a/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc b/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc index 81b6e491248..f5dd47a052c 100644 --- a/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" namespace tensorflow { namespace fuzzing { diff --git a/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc b/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc index d91a351c596..4d736a21602 100644 --- a/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" namespace tensorflow { namespace fuzzing { diff --git a/tensorflow/core/kernels/fuzzing/fuzz_session.h b/tensorflow/core/kernels/fuzzing/fuzz_session.h index 0c0e548a909..f1f3f199df1 100644 --- a/tensorflow/core/kernels/fuzzing/fuzz_session.h +++ b/tensorflow/core/kernels/fuzzing/fuzz_session.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef LEARNING_BRAIN_KERNELS_FUZZING_FUZZ_SESSION_H_ -#define LEARNING_BRAIN_KERNELS_FUZZING_FUZZ_SESSION_H_ +#ifndef TENSORFLOW_CORE_KERNELS_FUZZING_FUZZ_SESSION_H_ +#define TENSORFLOW_CORE_KERNELS_FUZZING_FUZZ_SESSION_H_ #include "tensorflow/cc/framework/scope.h" #include "tensorflow/core/graph/graph.h" @@ -153,4 +153,4 @@ class FuzzStringInputOp : public FuzzSession { } // end namespace fuzzing } // end namespace tensorflow -#endif // LEARNING_BRAIN_KERNELS_FUZZING_FUZZ_SESSION_H_ +#endif // TENSORFLOW_CORE_KERNELS_FUZZING_FUZZ_SESSION_H_ diff --git a/tensorflow/core/kernels/fuzzing/identity_fuzz.cc b/tensorflow/core/kernels/fuzzing/identity_fuzz.cc index ac3a12aa399..5c3fc4a2795 100644 --- a/tensorflow/core/kernels/fuzzing/identity_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/identity_fuzz.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" namespace tensorflow { namespace fuzzing { diff --git a/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc b/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc index 978fcd10282..c90ad2cfeb7 100644 --- a/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" namespace tensorflow { namespace fuzzing { diff --git a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc index 7d1aa1fbf3a..738d78e99a0 100644 --- a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" namespace tensorflow { namespace fuzzing { diff --git a/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc b/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc index 94255d215e5..e98363ffbf1 100644 --- a/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" namespace tensorflow { namespace fuzzing { diff --git a/tensorflow/core/kernels/gather_functor.cc b/tensorflow/core/kernels/gather_functor.cc index dde08b37eac..e6fefe643b7 100644 --- a/tensorflow/core/kernels/gather_functor.cc +++ b/tensorflow/core/kernels/gather_functor.cc @@ -25,12 +25,12 @@ typedef Eigen::GpuDevice GPUDevice; namespace functor { // Forward declarations of the functor specializations for GPU. -#define DECLARE_GPU_SPECS_INDEX(T, Index) \ - template <> \ - int64 GatherFunctor::operator()( \ +#define DECLARE_GPU_SPECS_INDEX(T, Index) \ + template <> \ + int64 GatherFunctor::operator()( \ OpKernelContext* ctx, typename TTypes::ConstTensor Tparams, \ - typename TTypes::ConstFlat Tindices, \ - typename TTypes::Tensor Tout); \ + typename TTypes::ConstFlat Tindices, \ + typename TTypes::Tensor Tout); \ extern template struct GatherFunctor; #define DECLARE_GPU_SPECS(T) \ diff --git a/tensorflow/core/kernels/gather_functor.h b/tensorflow/core/kernels/gather_functor.h index 1e429a037e8..16ccb03b850 100644 --- a/tensorflow/core/kernels/gather_functor.h +++ b/tensorflow/core/kernels/gather_functor.h @@ -18,12 +18,12 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/type_traits.h" #include "tensorflow/core/kernels/bounds_check.h" #include "tensorflow/core/platform/prefetch.h" #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/util/work_sharder.h" namespace tensorflow { @@ -52,21 +52,23 @@ SliceIndex HandleCopies(OpKernelContext* ctx, const size_t slice_bytes = slice_elems * sizeof(T); auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); mutex mu; - // Store the value of invalidate index for printing error information, it's a shared variable. + // Store the value of invalidate index for printing error information, it's a + // shared variable. SliceIndex result = -1; - auto work = [&] (int64 start, int64 end) { + auto work = [&](int64 start, int64 end) { SliceIndex batch_idx = static_cast(start / indices_size); SliceIndex indices_idx = static_cast(start % indices_size); SliceIndex batch_idx_end = static_cast(end / indices_size); SliceIndex indices_idx_end = static_cast(end % indices_size); while ((batch_idx < batch_idx_end) || - (batch_idx == batch_idx_end && indices_idx < indices_idx_end)) { + (batch_idx == batch_idx_end && indices_idx < indices_idx_end)) { SliceIndex i_next = indices_idx + 1; SliceIndex b_next = batch_idx + 1; if ((batch_idx == batch_idx_end && i_next < indices_idx_end) || - (i_next < indices_size)) { - port::prefetch(¶ms(batch_idx, indices(i_next), 0)); + (i_next < indices_size)) { + port::prefetch( + ¶ms(batch_idx, indices(i_next), 0)); port::prefetch(&out(batch_idx, i_next, 0)); b_next = batch_idx; } else if (b_next <= batch_idx_end) { @@ -85,11 +87,12 @@ SliceIndex HandleCopies(OpKernelContext* ctx, // ahead-of-time compilation binary size). if (is_simple_type::value) { // Avoid auto-promotion to Index from SliceIndex by casting. 
- memcpy(out_base + (batch_idx * indices_size + indices_idx) * slice_elems, - params_base + (batch_idx * static_cast(limit) + - static_cast(index)) * - slice_elems, - slice_bytes); + memcpy( + out_base + (batch_idx * indices_size + indices_idx) * slice_elems, + params_base + (batch_idx * static_cast(limit) + + static_cast(index)) * + slice_elems, + slice_bytes); } else { // For non-"simple" types (e.g. strings). out.template chip<1>(indices_idx) = params.template chip<1>(index); @@ -99,8 +102,8 @@ SliceIndex HandleCopies(OpKernelContext* ctx, } }; - Shard(worker_threads->num_threads, worker_threads->workers, batch_size*indices_size, - slice_elems * sizeof(T), work); + Shard(worker_threads->num_threads, worker_threads->workers, + batch_size * indices_size, slice_elems * sizeof(T), work); return result; } @@ -117,16 +120,16 @@ struct GatherFunctorCPU { bool use_large = (slice_size > std::numeric_limits::max() || params.size() > std::numeric_limits::max() || N > std::numeric_limits::max()); -#define CALL(elems) \ - do { \ - if (use_large) { \ - bad_i = HandleCopies(ctx, params, indices, \ - slice_size, out); \ - } else { \ - const int32 small_slice = static_cast(slice_size); \ - bad_i = HandleCopies(ctx, params, indices, \ - small_slice, out); \ - } \ +#define CALL(elems) \ + do { \ + if (use_large) { \ + bad_i = HandleCopies(ctx, params, indices, \ + slice_size, out); \ + } else { \ + const int32 small_slice = static_cast(slice_size); \ + bad_i = HandleCopies(ctx, params, indices, \ + small_slice, out); \ + } \ } while (0) if (slice_size == 10) @@ -143,7 +146,8 @@ struct GatherFunctorCPU { template struct GatherFunctor { - int64 operator()(OpKernelContext* ctx, typename TTypes::ConstTensor params, + int64 operator()(OpKernelContext* ctx, + typename TTypes::ConstTensor params, typename TTypes::ConstFlat indices, typename TTypes::Tensor out); }; diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc index 239d5d2e990..08adf4badbc 100644 --- a/tensorflow/core/kernels/gather_op.cc +++ b/tensorflow/core/kernels/gather_op.cc @@ -18,6 +18,8 @@ limitations under the License. 
#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/variant.h" +#include "tensorflow/core/framework/variant_encode_decode.h" #include "tensorflow/core/kernels/bounds_check.h" #include "tensorflow/core/kernels/gather_functor.h" #include "tensorflow/core/platform/mem.h" @@ -106,8 +108,7 @@ class GatherOp : public OpKernel { auto out_flat = out->shaped({outer_size, N, inner_size}); functor::GatherFunctor functor; - int64 bad_i = functor(c, params_flat, - indices_flat, out_flat); + int64 bad_i = functor(c, params_flat, indices_flat, out_flat); OP_REQUIRES( c, bad_i < 0, diff --git a/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc b/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc index f0d7c670a62..4040bf52bff 100644 --- a/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc +++ b/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc @@ -46,7 +46,7 @@ GraphTransferUtils::GetTopNFloatResults(const float* const data, GetTopNFloatResults(data, labels, element_count); LOG(INFO) << "=== Dump ranking ==="; for (int i = 0; i < top_n; ++i) { - const std::tuple &entry = queue.top(); + const std::tuple& entry = queue.top(); LOG(INFO) << i << ": " << std::get<1>(entry) << ", " << std::get<2>(entry) << ", " << std::get<0>(entry); queue.pop(); diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.h b/tensorflow/core/kernels/hexagon/graph_transferer.h index a360d188cc2..0d43d028cdb 100644 --- a/tensorflow/core/kernels/hexagon/graph_transferer.h +++ b/tensorflow/core/kernels/hexagon/graph_transferer.h @@ -181,8 +181,8 @@ class GraphTransferer { void AppendNodeInputParams(const int id, const Node& node, const std::vector& extra_inputs); - void AppendNodeOutputParams(const ShapeRefiner& shape_refiner, - const int id, const Node& node); + void AppendNodeOutputParams(const ShapeRefiner& shape_refiner, const int id, + const Node& node); static std::array BuildShapeArray( const shape_inference::ShapeHandle& shape_handle, diff --git a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc index 536d295506c..20b09f144ba 100644 --- a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc +++ b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc @@ -42,8 +42,7 @@ constexpr float VALUE_TOLERANCE_FLOAT = 1e-8f; class GraphTransfererTest : public ::testing::Test { protected: - void SetUp() final { - } + void SetUp() final {} GraphTransferer gt_; }; @@ -61,7 +60,7 @@ class TestGraphTransferOpsDefinitions : public IRemoteFusedGraphOpsDefinitions { } } return -1; -} + } private: const std::vector op_types_{"INPUT", "OUTPUT", "Conv2D", diff --git a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc index 71bc4187b74..3f794dfb1a0 100644 --- a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc +++ b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc @@ -420,7 +420,7 @@ TEST(GraphTransferer, false, // is_text_proto false, // shape_inference_for_unknown_shape true // dry_run_for_unknown_shape - ); + ); ASSERT_TRUE(status.ok()) << status; prof.Stop(); prof.DumpStatistics("LoadGraphFromProtoFile"); @@ -487,7 +487,7 @@ TEST(GraphTransferer, false, // is_text_proto true, // shape_inference_for_unknown_shape false // dry_run_for_unknown_shape - ); + ); ASSERT_TRUE(status.ok()) << status; prof.Stop(); 
prof.DumpStatistics("LoadGraphFromProtoFile"); @@ -556,7 +556,7 @@ TEST(GraphTransferer, DISABLED_CheckShapeInferencePerformance) { false, // is_text_proto false, // shape_inference_for_unknown_shape true // dry_run_for_unknown_shape - ); + ); const GraphTransferInfo& gfi0 = gt0.GetGraphTransferInfo(); ASSERT_TRUE(status.ok()); @@ -576,7 +576,7 @@ TEST(GraphTransferer, DISABLED_CheckShapeInferencePerformance) { false, // is_text_proto true, // shape_inference_for_unknown_shape false // dry_run_for_unknown_shape - ); + ); const GraphTransferInfo& gfi1 = gt1.GetGraphTransferInfo(); ASSERT_TRUE(status.ok()); diff --git a/tensorflow/core/kernels/hinge-loss.h b/tensorflow/core/kernels/hinge-loss.h index 789a7ce7a3d..d303e9c877e 100644 --- a/tensorflow/core/kernels/hinge-loss.h +++ b/tensorflow/core/kernels/hinge-loss.h @@ -50,9 +50,8 @@ class HingeLossUpdater : public DualLossUpdater { // valid value for new dual = 0 // c. new optimal value > 1.0. Then new optimal value should be set to 1.0. const double candidate_optimal_dual = - current_dual + - (label - wx) / - (num_loss_partitions * example_weight * weighted_example_norm); + current_dual + (label - wx) / (num_loss_partitions * example_weight * + weighted_example_norm); if (label * candidate_optimal_dual < 0) { return 0.0; } diff --git a/tensorflow/core/kernels/histogram_op_gpu.cu.cc b/tensorflow/core/kernels/histogram_op_gpu.cu.cc index c2bb958be8b..a88e9b0ddcd 100644 --- a/tensorflow/core/kernels/histogram_op_gpu.cu.cc +++ b/tensorflow/core/kernels/histogram_op_gpu.cu.cc @@ -17,16 +17,16 @@ limitations under the License. #define EIGEN_USE_GPU -#include "tensorflow/core/kernels/histogram_op.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "external/cub_archive/cub/device/device_histogram.cuh" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/histogram_op.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/cuda_kernel_helper.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { @@ -104,8 +104,8 @@ struct HistogramFixedWidthFunctor { /* num_samples */ num_samples, /* stream */ stream); if (err != cudaSuccess) { - return errors::Internal("Could not launch HistogramRange: ", - cudaGetErrorString(err), "."); + return errors::Internal( + "Could not launch HistogramRange: ", cudaGetErrorString(err), "."); } return Status::OK(); diff --git a/tensorflow/core/kernels/identity_op.cc b/tensorflow/core/kernels/identity_op.cc index 1db9263e5d3..a18a72c66dc 100644 --- a/tensorflow/core/kernels/identity_op.cc +++ b/tensorflow/core/kernels/identity_op.cc @@ -128,6 +128,7 @@ REGISTER_GPU_KERNEL(Variant); REGISTER_GPU_HOST_KERNEL(int32); REGISTER_GPU_HOST_KERNEL(bool); +REGISTER_GPU_HOST_KERNEL(string); #undef REGISTER_GPU_HOST_KERNEL diff --git a/tensorflow/core/kernels/image_resizer_state.h b/tensorflow/core/kernels/image_resizer_state.h index f088315ff53..faf997be05c 100644 --- a/tensorflow/core/kernels/image_resizer_state.h +++ b/tensorflow/core/kernels/image_resizer_state.h @@ -109,8 +109,9 @@ struct ImageResizerState { ValidateAndCalculateOutputSize(context, input); if (!context->status().ok()) return; OP_REQUIRES_OK(context, context->allocate_output( - 0, TensorShape({input.dim_size(0), out_height, - out_width, input.dim_size(3)}), + 0, + 
TensorShape({input.dim_size(0), out_height, + out_width, input.dim_size(3)}), &output)); } @@ -168,8 +169,9 @@ struct ImageResizerGradientState { CalculateResizeScale(original_width, resized_width, align_corners_); output = nullptr; OP_REQUIRES_OK(context, context->allocate_output( - 0, TensorShape({batch_size, original_height, - original_width, channels}), + 0, + TensorShape({batch_size, original_height, + original_width, channels}), &output)); } diff --git a/tensorflow/core/kernels/in_topk_op.cc b/tensorflow/core/kernels/in_topk_op.cc index e2861ae090c..c37055239c2 100644 --- a/tensorflow/core/kernels/in_topk_op.cc +++ b/tensorflow/core/kernels/in_topk_op.cc @@ -17,11 +17,11 @@ limitations under the License. #define EIGEN_USE_THREADS +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/kernels/bounds_check.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { @@ -98,36 +98,36 @@ class InTopK : public OpKernel { int k_; }; -REGISTER_KERNEL_BUILDER( - Name("InTopK").Device(DEVICE_CPU) - .HostMemory("predictions") - .HostMemory("targets") - .HostMemory("precision") - .TypeConstraint("T"), - InTopK); -REGISTER_KERNEL_BUILDER( - Name("InTopK").Device(DEVICE_CPU) - .HostMemory("predictions") - .HostMemory("targets") - .HostMemory("precision") - .TypeConstraint("T"), - InTopK); +REGISTER_KERNEL_BUILDER(Name("InTopK") + .Device(DEVICE_CPU) + .HostMemory("predictions") + .HostMemory("targets") + .HostMemory("precision") + .TypeConstraint("T"), + InTopK); +REGISTER_KERNEL_BUILDER(Name("InTopK") + .Device(DEVICE_CPU) + .HostMemory("predictions") + .HostMemory("targets") + .HostMemory("precision") + .TypeConstraint("T"), + InTopK); -REGISTER_KERNEL_BUILDER( - Name("InTopKV2").Device(DEVICE_CPU) - .HostMemory("predictions") - .HostMemory("targets") - .HostMemory("k") - .HostMemory("precision") - .TypeConstraint("T"), - InTopK); -REGISTER_KERNEL_BUILDER( - Name("InTopKV2").Device(DEVICE_CPU) - .HostMemory("predictions") - .HostMemory("targets") - .HostMemory("k") - .HostMemory("precision") - .TypeConstraint("T"), - InTopK); +REGISTER_KERNEL_BUILDER(Name("InTopKV2") + .Device(DEVICE_CPU) + .HostMemory("predictions") + .HostMemory("targets") + .HostMemory("k") + .HostMemory("precision") + .TypeConstraint("T"), + InTopK); +REGISTER_KERNEL_BUILDER(Name("InTopKV2") + .Device(DEVICE_CPU) + .HostMemory("predictions") + .HostMemory("targets") + .HostMemory("k") + .HostMemory("precision") + .TypeConstraint("T"), + InTopK); } // namespace tensorflow diff --git a/tensorflow/core/kernels/inplace_ops.cc b/tensorflow/core/kernels/inplace_ops.cc index 7728ba850c9..a71d047ed1a 100644 --- a/tensorflow/core/kernels/inplace_ops.cc +++ b/tensorflow/core/kernels/inplace_ops.cc @@ -27,13 +27,13 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SyclDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL namespace functor { template -Status DoParallelConcatUpdate(const Device& d, const Tensor& value, - int32 loc, Tensor* output) { +Status DoParallelConcatUpdate(const Device& d, const Tensor& value, int32 loc, + Tensor* output) { auto Tvalue = value.shaped({1, value.NumElements()}); auto Toutput = output->flat_outer_dims(); auto nrows = Toutput.dimension(0); @@ -74,7 +74,7 @@ Status DoParallelConcat(const SyclDevice& d, const 
Tensor& value, int32 loc, return errors::InvalidArgument("Unsupported data type: ", value.dtype()); } } -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // end namespace functor @@ -207,7 +207,7 @@ REGISTER_KERNEL_BUILDER(Name("_ParallelConcatUpdate") .HostMemory("output") .TypeConstraint("T"), ParallelConcatUpdate); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA diff --git a/tensorflow/core/kernels/l2loss_op.cc b/tensorflow/core/kernels/l2loss_op.cc index f8ed9351579..f561287f7a1 100644 --- a/tensorflow/core/kernels/l2loss_op.cc +++ b/tensorflow/core/kernels/l2loss_op.cc @@ -17,8 +17,8 @@ limitations under the License. #define EIGEN_USE_THREADS -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/kernels/l2loss_op.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" diff --git a/tensorflow/core/kernels/linalg_ops_common.cc b/tensorflow/core/kernels/linalg_ops_common.cc index 36907fb5716..b58bcf58348 100644 --- a/tensorflow/core/kernels/linalg_ops_common.cc +++ b/tensorflow/core/kernels/linalg_ops_common.cc @@ -108,7 +108,6 @@ void LinearAlgebraOp::Compute(OpKernelContext* context) { auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); Shard(worker_threads.num_threads, worker_threads.workers, batch_shape.num_elements(), GetCostPerUnit(input_matrix_shapes), shard); - } template diff --git a/tensorflow/core/kernels/lmdb_reader_op.cc b/tensorflow/core/kernels/lmdb_reader_op.cc index 31a427f2c90..2474fe4d564 100755 --- a/tensorflow/core/kernels/lmdb_reader_op.cc +++ b/tensorflow/core/kernels/lmdb_reader_op.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/core/framework/reader_op_kernel.h" #include "tensorflow/core/framework/reader_base.h" +#include "tensorflow/core/framework/reader_op_kernel.h" #include "tensorflow/core/lib/core/errors.h" #include @@ -26,9 +26,8 @@ namespace tensorflow { class LMDBReader : public ReaderBase { public: - LMDBReader(const string& node_name, Env* env) + LMDBReader(const string& node_name, Env* /*unused*/) : ReaderBase(strings::StrCat("LMDBReader '", node_name, "'")), - env_(env), mdb_env_(nullptr), mdb_dbi_(0), mdb_txn_(nullptr), @@ -77,15 +76,13 @@ class LMDBReader : public ReaderBase { *at_end = true; return Status::OK(); } - } - else { + } else { if (Seek(MDB_NEXT) == false) { *at_end = true; return Status::OK(); } } - *key = string(static_cast(mdb_key_.mv_data), - mdb_key_.mv_size); + *key = string(static_cast(mdb_key_.mv_data), mdb_key_.mv_size); *value = string(static_cast(mdb_value_.mv_data), mdb_value_.mv_size); *produced = true; @@ -109,7 +106,6 @@ class LMDBReader : public ReaderBase { } } - Env* const env_; MDB_env* mdb_env_; MDB_dbi mdb_dbi_; @@ -123,13 +119,10 @@ class LMDBReaderOp : public ReaderOpKernel { explicit LMDBReaderOp(OpKernelConstruction* context) : ReaderOpKernel(context) { Env* env = context->env(); - SetReaderFactory([this, env]() { - return new LMDBReader(name(), env); - }); + SetReaderFactory([this, env]() { return new LMDBReader(name(), env); }); } }; -REGISTER_KERNEL_BUILDER(Name("LMDBReader").Device(DEVICE_CPU), - LMDBReaderOp); +REGISTER_KERNEL_BUILDER(Name("LMDBReader").Device(DEVICE_CPU), LMDBReaderOp); } // namespace tensorflow diff --git a/tensorflow/core/kernels/logistic-loss.h b/tensorflow/core/kernels/logistic-loss.h index 2765f42bbdc..6479e6f5dc3 100644 --- a/tensorflow/core/kernels/logistic-loss.h +++ b/tensorflow/core/kernels/logistic-loss.h @@ -122,10 +122,9 @@ class LogisticLossUpdater : public DualLossUpdater { num_loss_partitions * weighted_example_norm * example_weight * (0.5 * (1 + tanhx) / label - current_dual); - const double denominator = -2 * label - - num_loss_partitions * weighted_example_norm * - example_weight * (1 - tanhx * tanhx) * 0.5 / - label; + const double denominator = + -2 * label - num_loss_partitions * weighted_example_norm * + example_weight * (1 - tanhx * tanhx) * 0.5 / label; return x - numerator / denominator; } }; diff --git a/tensorflow/core/kernels/loss_test.cc b/tensorflow/core/kernels/loss_test.cc index 89f0677e1f5..460d65c5c27 100644 --- a/tensorflow/core/kernels/loss_test.cc +++ b/tensorflow/core/kernels/loss_test.cc @@ -32,14 +32,17 @@ namespace { TEST(LogisticLoss, ComputePrimalLoss) { LogisticLossUpdater loss_updater; - EXPECT_NEAR(0.693147, loss_updater.ComputePrimalLoss( - 0 /* wx */, 1 /* label */, 1 /* example weight */), + EXPECT_NEAR(0.693147, + loss_updater.ComputePrimalLoss(0 /* wx */, 1 /* label */, + 1 /* example weight */), 1e-3); - EXPECT_NEAR(0.0, loss_updater.ComputePrimalLoss(70 /* wx */, 1 /* label */, - 1 /* example weight */), + EXPECT_NEAR(0.0, + loss_updater.ComputePrimalLoss(70 /* wx */, 1 /* label */, + 1 /* example weight */), 1e-3); - EXPECT_NEAR(0.0, loss_updater.ComputePrimalLoss(-70 /* wx */, -1 /* label */, - 1 /* example weight */), + EXPECT_NEAR(0.0, + loss_updater.ComputePrimalLoss(-70 /* wx */, -1 /* label */, + 1 /* example weight */), 1e-3); } @@ -53,31 +56,35 @@ TEST(LogisticLoss, ComputeDualLoss) { loss_updater.ComputeDualLoss(1 /* current dual */, 1 /* label */, 1 /* example weight */), 
1e-3); - EXPECT_NEAR(-0.693147, loss_updater.ComputeDualLoss(0.5 /* current dual */, - 1 /* label */, - 1 /* example weight */), - 1e-3); + EXPECT_NEAR( + -0.693147, + loss_updater.ComputeDualLoss(0.5 /* current dual */, 1 /* label */, + 1 /* example weight */), + 1e-3); } TEST(LogisticLoss, ComputeUpdatedDual) { LogisticLossUpdater loss_updater; - EXPECT_NEAR(0.479, loss_updater.ComputeUpdatedDual( - 1 /* num partitions */, 1.0 /* label */, - 1.0 /* example weight */, 0.5 /* current_dual */, - 0.3 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(0.479, + loss_updater.ComputeUpdatedDual( + 1 /* num partitions */, 1.0 /* label */, + 1.0 /* example weight */, 0.5 /* current_dual */, + 0.3 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); - EXPECT_NEAR(-0.031, loss_updater.ComputeUpdatedDual( - 2 /* num partitions */, -1.0 /* label */, - 1.0 /* example weight */, 0.1 /* current_dual */, - -0.8 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(-0.031, + loss_updater.ComputeUpdatedDual( + 2 /* num partitions */, -1.0 /* label */, + 1.0 /* example weight */, 0.1 /* current_dual */, + -0.8 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); } TEST(SquaredLoss, ComputePrimalLoss) { SquaredLossUpdater loss_updater; - EXPECT_NEAR(0.5, loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */, - 1.0 /* example weight */), + EXPECT_NEAR(0.5, + loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */, + 1.0 /* example weight */), 1e-3); EXPECT_NEAR(40.5, loss_updater.ComputePrimalLoss(10.0 /* wx */, 1.0 /* label */, @@ -95,43 +102,50 @@ TEST(SquaredLoss, ComputePrimalLoss) { TEST(SquaredLoss, ComputeDualLoss) { SquaredLossUpdater loss_updater; - EXPECT_NEAR(0.0, loss_updater.ComputeDualLoss(0.0 /* current dual */, - -1.0 /* label */, - 1.0 /* example weight */), - 1e-3); - EXPECT_NEAR(0.66, loss_updater.ComputeDualLoss(0.2 /* current dual */, - -1.0 /* label */, - 3.0 /* example weight */), - 1e-3); - EXPECT_NEAR(-0.375, loss_updater.ComputeDualLoss(1.5 /* current dual */, - 1.0 /* label */, - 1.0 /* example weight */), - 1e-3); - EXPECT_NEAR(-1.125, loss_updater.ComputeDualLoss(0.5 /* current dual */, - 1.0 /* label */, - 3.0 /* example weight */), - 1e-3); + EXPECT_NEAR( + 0.0, + loss_updater.ComputeDualLoss(0.0 /* current dual */, -1.0 /* label */, + 1.0 /* example weight */), + 1e-3); + EXPECT_NEAR( + 0.66, + loss_updater.ComputeDualLoss(0.2 /* current dual */, -1.0 /* label */, + 3.0 /* example weight */), + 1e-3); + EXPECT_NEAR( + -0.375, + loss_updater.ComputeDualLoss(1.5 /* current dual */, 1.0 /* label */, + 1.0 /* example weight */), + 1e-3); + EXPECT_NEAR( + -1.125, + loss_updater.ComputeDualLoss(0.5 /* current dual */, 1.0 /* label */, + 3.0 /* example weight */), + 1e-3); } TEST(SquaredLoss, ComputeUpdatedDual) { SquaredLossUpdater loss_updater; - EXPECT_NEAR(0.336, loss_updater.ComputeUpdatedDual( - 1 /* num partitions */, 1.0 /* label */, - 1.0 /* example weight */, 0.3 /* current_dual */, - 0.3 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(0.336, + loss_updater.ComputeUpdatedDual( + 1 /* num partitions */, 1.0 /* label */, + 1.0 /* example weight */, 0.3 /* current_dual */, + 0.3 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); - EXPECT_NEAR(-0.427, loss_updater.ComputeUpdatedDual( - 5 /* num partitions */, -1.0 /* label */, - 1.0 /* example weight */, -0.4 /* current_dual */, - 0.8 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(-0.427, + loss_updater.ComputeUpdatedDual( + 5 /* num partitions */, -1.0 /* label */, + 1.0 /* example 
weight */, -0.4 /* current_dual */, + 0.8 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); } TEST(HingeLoss, ComputePrimalLoss) { HingeLossUpdater loss_updater; - EXPECT_NEAR(1.0, loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */, - 1.0 /* example weight */), + EXPECT_NEAR(1.0, + loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */, + 1.0 /* example weight */), 1e-3); EXPECT_NEAR(0.0, loss_updater.ComputePrimalLoss(10.0 /* wx */, 1.0 /* label */, @@ -149,10 +163,11 @@ TEST(HingeLoss, ComputePrimalLoss) { TEST(HingeLoss, ComputeDualLoss) { HingeLossUpdater loss_updater; - EXPECT_NEAR(0.0, loss_updater.ComputeDualLoss(0.0 /* current dual */, - -1.0 /* label */, - 1.0 /* example weight */), - 1e-3); + EXPECT_NEAR( + 0.0, + loss_updater.ComputeDualLoss(0.0 /* current dual */, -1.0 /* label */, + 1.0 /* example weight */), + 1e-3); EXPECT_NEAR( std::numeric_limits::max(), loss_updater.ComputeDualLoss(0.2 /* current dual */, -1.0 /* label */, @@ -163,10 +178,11 @@ TEST(HingeLoss, ComputeDualLoss) { loss_updater.ComputeDualLoss(1.5 /* current dual */, 1.0 /* label */, 1.0 /* example weight */), 1e-3); - EXPECT_NEAR(-1.5, loss_updater.ComputeDualLoss(0.5 /* current dual */, - 1.0 /* label */, - 3.0 /* example weight */), - 1e-3); + EXPECT_NEAR( + -1.5, + loss_updater.ComputeDualLoss(0.5 /* current dual */, 1.0 /* label */, + 3.0 /* example weight */), + 1e-3); } TEST(HingeLoss, ConvertLabel) { @@ -195,28 +211,31 @@ TEST(HingeLoss, ComputeUpdatedDual) { // weighted_example_norm=100.0, it turns out that the optimal value to update // the dual to is 0.507 which is within the permitted range and thus should be // the value returned. - EXPECT_NEAR(0.507, loss_updater.ComputeUpdatedDual( - 1 /* num partitions */, 1.0 /* label */, - 1.0 /* example weight */, 0.5 /* current_dual */, - 0.3 /* wx */, 100.0 /* weighted_example_norm */), + EXPECT_NEAR(0.507, + loss_updater.ComputeUpdatedDual( + 1 /* num partitions */, 1.0 /* label */, + 1.0 /* example weight */, 0.5 /* current_dual */, + 0.3 /* wx */, 100.0 /* weighted_example_norm */), 1e-3); // When label=-1.0, example_weight=1.0, current_dual=0.4, wx=0.6, // weighted_example_norm=10.0 and num_loss_partitions=10, it turns out that // the optimal value to update the dual to is 0.384 which is within the // permitted range and thus should be the value returned. - EXPECT_NEAR(-0.416, loss_updater.ComputeUpdatedDual( - 10 /* num partitions */, -1.0 /* label */, - 1.0 /* example weight */, -0.4 /* current_dual */, - 0.6 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(-0.416, + loss_updater.ComputeUpdatedDual( + 10 /* num partitions */, -1.0 /* label */, + 1.0 /* example weight */, -0.4 /* current_dual */, + 0.6 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); // When label=1.0, example_weight=1.0, current_dual=-0.5, wx=0.3 and // weighted_example_norm=10.0, it turns out that the optimal value to update // the dual to is -0.43. However, this is outside the allowed [0.0, 1.0] range // and hence the closest permitted value (0.0) should be returned instead. 
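// ----------------------------------------------------------------------------
// Editor's note: illustrative sketch only, NOT part of this patch or of the
// TensorFlow sources. The expected values in the ComputeUpdatedDual tests
// above and below follow from the dual update shown in the hinge-loss.h hunk
// earlier in this section; the clamping step is inferred from the test
// comments, and the function name is hypothetical.
#include <cstdio>

double UpdatedHingeDualSketch(int num_loss_partitions, double label,
                              double example_weight, double current_dual,
                              double wx, double weighted_example_norm) {
  // Unconstrained optimum of the dual variable for the hinge loss.
  const double candidate =
      current_dual + (label - wx) / (num_loss_partitions * example_weight *
                                     weighted_example_norm);
  // Project back so that label * dual stays within [0, 1].
  if (label * candidate < 0.0) return 0.0;
  if (label * candidate > 1.0) return label;
  return candidate;
}

int main() {
  // 0.5 + (1.0 - 0.3) / (1 * 1.0 * 100.0) = 0.507, matching the
  // EXPECT_NEAR(0.507, ...) case above.
  std::printf("%.3f\n", UpdatedHingeDualSketch(1, 1.0, 1.0, 0.5, 0.3, 100.0));
  // Candidate -0.43 falls outside the permitted range, so 0.0 is returned,
  // matching the EXPECT_NEAR(0.0, ...) case below.
  std::printf("%.3f\n", UpdatedHingeDualSketch(1, 1.0, 1.0, -0.5, 0.3, 10.0));
  return 0;
}
// ----------------------------------------------------------------------------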
- EXPECT_NEAR(0.0, loss_updater.ComputeUpdatedDual( - 1 /* num partitions */, 1.0 /* label */, - 1.0 /* example weight */, -0.5 /* current_dual */, - 0.3 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(0.0, + loss_updater.ComputeUpdatedDual( + 1 /* num partitions */, 1.0 /* label */, + 1.0 /* example weight */, -0.5 /* current_dual */, + 0.3 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); // When label=-1.0, example_weight=2.0, current_dual=-1.0, wx=0.3 and @@ -224,17 +243,19 @@ TEST(HingeLoss, ComputeUpdatedDual) { // the dual to is -1.065. However, this is outside the allowed [-1.0, 0.0] // range and hence the closest permitted value (-1.0) should be returned // instead. - EXPECT_NEAR(-1.0, loss_updater.ComputeUpdatedDual( - 1 /* num partitions */, -1.0 /* label */, - 2.0 /* example weight */, -1.0 /* current_dual */, - 0.3 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(-1.0, + loss_updater.ComputeUpdatedDual( + 1 /* num partitions */, -1.0 /* label */, + 2.0 /* example weight */, -1.0 /* current_dual */, + 0.3 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); } TEST(SmoothHingeLoss, ComputePrimalLoss) { SmoothHingeLossUpdater loss_updater; - EXPECT_NEAR(0.5, loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */, - 1.0 /* example weight */), + EXPECT_NEAR(0.5, + loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */, + 1.0 /* example weight */), 1e-3); EXPECT_NEAR(0.0, loss_updater.ComputePrimalLoss(10.0 /* wx */, 1.0 /* label */, @@ -252,10 +273,11 @@ TEST(SmoothHingeLoss, ComputePrimalLoss) { TEST(SmoothHingeLoss, ComputeDualLoss) { SmoothHingeLossUpdater loss_updater; - EXPECT_NEAR(0.0, loss_updater.ComputeDualLoss(0.0 /* current dual */, - -1.0 /* label */, - 1.0 /* example weight */), - 1e-3); + EXPECT_NEAR( + 0.0, + loss_updater.ComputeDualLoss(0.0 /* current dual */, -1.0 /* label */, + 1.0 /* example weight */), + 1e-3); EXPECT_NEAR( std::numeric_limits::max(), loss_updater.ComputeDualLoss(0.2 /* current dual */, -1.0 /* label */, @@ -266,24 +288,27 @@ TEST(SmoothHingeLoss, ComputeDualLoss) { loss_updater.ComputeDualLoss(1.5 /* current dual */, 1.0 /* label */, 1.0 /* example weight */), 1e-3); - EXPECT_NEAR(-1.125, loss_updater.ComputeDualLoss(0.5 /* current dual */, - 1.0 /* label */, - 3.0 /* example weight */), - 1e-3); + EXPECT_NEAR( + -1.125, + loss_updater.ComputeDualLoss(0.5 /* current dual */, 1.0 /* label */, + 3.0 /* example weight */), + 1e-3); } TEST(SmoothHingeLoss, ComputeUpdatedDual) { SmoothHingeLossUpdater loss_updater; - EXPECT_NEAR(0.336, loss_updater.ComputeUpdatedDual( - 1 /* num partitions */, 1.0 /* label */, - 1.0 /* example weight */, 0.3 /* current_dual */, - 0.3 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(0.336, + loss_updater.ComputeUpdatedDual( + 1 /* num partitions */, 1.0 /* label */, + 1.0 /* example weight */, 0.3 /* current_dual */, + 0.3 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); - EXPECT_NEAR(-0.427, loss_updater.ComputeUpdatedDual( - 5 /* num partitions */, -1.0 /* label */, - 1.0 /* example weight */, -0.4 /* current_dual */, - 0.8 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(-0.427, + loss_updater.ComputeUpdatedDual( + 5 /* num partitions */, -1.0 /* label */, + 1.0 /* example weight */, -0.4 /* current_dual */, + 0.8 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); } diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc index c905ebc84a6..c3a59c95762 100644 --- a/tensorflow/core/kernels/lrn_op.cc +++ 
b/tensorflow/core/kernels/lrn_op.cc @@ -229,10 +229,11 @@ class LRNOp : public OpKernel { explicit LRNOp(OpKernelConstruction* context) : OpKernel(context) { int64 depth_radius64; OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64)); - OP_REQUIRES(context, FastBoundsCheck(depth_radius64, - std::numeric_limits::max()), - errors::InvalidArgument("depth_radius = ", depth_radius64, - " larger than int max")); + OP_REQUIRES( + context, + FastBoundsCheck(depth_radius64, std::numeric_limits::max()), + errors::InvalidArgument("depth_radius = ", depth_radius64, + " larger than int max")); depth_radius_ = static_cast(depth_radius64); float tmp; OP_REQUIRES_OK(context, context->GetAttr("bias", &tmp)); @@ -247,9 +248,10 @@ class LRNOp : public OpKernel { const Tensor& in = context->input(0); OP_REQUIRES(context, in.dims() == 4, errors::InvalidArgument("in must be 4-dimensional")); - OP_REQUIRES(context, FastBoundsCheck(in.NumElements(), - std::numeric_limits::max()), - errors::InvalidArgument("argument to LRN too large")); + OP_REQUIRES( + context, + FastBoundsCheck(in.NumElements(), std::numeric_limits::max()), + errors::InvalidArgument("argument to LRN too large")); // Cast to platform-specific int to avoid conversion warnings. const int batch = static_cast(in.dim_size(0)); const int rows = static_cast(in.dim_size(1)); @@ -448,10 +450,11 @@ class LRNGradOp : public OpKernel { explicit LRNGradOp(OpKernelConstruction* context) : OpKernel(context) { int64 depth_radius64; OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64)); - OP_REQUIRES(context, FastBoundsCheck(depth_radius64, - std::numeric_limits::max()), - errors::InvalidArgument("depth_radius = ", depth_radius64, - " larger than int max")); + OP_REQUIRES( + context, + FastBoundsCheck(depth_radius64, std::numeric_limits::max()), + errors::InvalidArgument("depth_radius = ", depth_radius64, + " larger than int max")); depth_radius_ = static_cast(depth_radius64); float tmp; OP_REQUIRES_OK(context, context->GetAttr("bias", &tmp)); diff --git a/tensorflow/core/kernels/matching_files_op.cc b/tensorflow/core/kernels/matching_files_op.cc index 5eb060f6641..cdff7bad5fe 100644 --- a/tensorflow/core/kernels/matching_files_op.cc +++ b/tensorflow/core/kernels/matching_files_op.cc @@ -45,15 +45,14 @@ class MatchingFilesOp : public OpKernel { int num_files = 0; std::vector> all_fnames(num_patterns); for (int i = 0; i < num_patterns; i++) { - OP_REQUIRES_OK( - context, - context->env()->GetMatchingPaths(patterns(i), &all_fnames[i])); + OP_REQUIRES_OK(context, context->env()->GetMatchingPaths(patterns(i), + &all_fnames[i])); num_files += all_fnames[i].size(); } Tensor* output_t = nullptr; - OP_REQUIRES_OK(context, - context->allocate_output( - "filenames", TensorShape({num_files}), &output_t)); + OP_REQUIRES_OK( + context, context->allocate_output("filenames", TensorShape({num_files}), + &output_t)); auto output = output_t->vec(); int index = 0; for (int i = 0; i < num_patterns; ++i) { diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc index cb68690f284..f499ce6519d 100644 --- a/tensorflow/core/kernels/matmul_op.cc +++ b/tensorflow/core/kernels/matmul_op.cc @@ -261,12 +261,12 @@ struct LaunchMatMul { std::vector* algorithms, bool use_autotune, Tensor* out) { using perftools::gputools::blas::AlgorithmConfig; using perftools::gputools::blas::ComputationType; - using perftools::gputools::blas::ProfileResult; - using perftools::gputools::blas::Transpose; using 
perftools::gputools::blas::kDefaultAlgorithm; using perftools::gputools::blas::kDefaultBlasGemm; using perftools::gputools::blas::kDefaultBlasGemv; using perftools::gputools::blas::kNoAlgorithm; + using perftools::gputools::blas::ProfileResult; + using perftools::gputools::blas::Transpose; Transpose trans[] = {Transpose::kNoTranspose, Transpose::kTranspose}; const uint64 m = a.dim_size(1 - dim_pair[0].first); const uint64 k = a.dim_size(dim_pair[0].first); diff --git a/tensorflow/core/kernels/matmul_op.h b/tensorflow/core/kernels/matmul_op.h index 6398da2fb95..628895ca86f 100644 --- a/tensorflow/core/kernels/matmul_op.h +++ b/tensorflow/core/kernels/matmul_op.h @@ -30,7 +30,8 @@ struct MatMulTypes { typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> out_type; typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor>, - Eigen::Aligned> in_type; + Eigen::Aligned> + in_type; }; template <typename Device, typename In0, typename In1, typename Out, diff --git a/tensorflow/core/kernels/matrix_band_part_op.cc b/tensorflow/core/kernels/matrix_band_part_op.cc --- a/tensorflow/core/kernels/matrix_band_part_op.cc +++ b/tensorflow/core/kernels/matrix_band_part_op.cc @@ ... @@ class MatrixBandPartOp : public OpKernel { - const int64 num_lower = num_lower_in.scalar<int64>()(); + + auto as_int64_scalar = [](const Tensor& tensor) -> int64 { + if (tensor.dtype() == DT_INT32) { + return tensor.scalar<int32>()(); + } else { + return tensor.scalar<int64>()(); + } + }; + const int64 num_lower = as_int64_scalar(num_lower_in); OP_REQUIRES( context, num_lower <= input_reshaped.dimension(1), errors::InvalidArgument( @@ -73,7 +81,7 @@ class MatrixBandPartOp : public OpKernel { OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_upper_in.shape()), errors::InvalidArgument("num_upper must be scalar, got shape ", num_upper_in.shape().DebugString())); - const int64 num_upper = num_upper_in.scalar<int64>()(); + const int64 num_upper = as_int64_scalar(num_upper_in); OP_REQUIRES(context, num_upper <= input_reshaped.dimension(2), errors::InvalidArgument("num_upper must be negative or less or " "equal to number of columns (", diff --git a/tensorflow/core/kernels/matrix_exponential_op.cc b/tensorflow/core/kernels/matrix_exponential_op.cc index 4cc3f32f7e4..99db8983013 100644 --- a/tensorflow/core/kernels/matrix_exponential_op.cc +++ b/tensorflow/core/kernels/matrix_exponential_op.cc @@ -26,7 +26,6 @@ limitations under the License. #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" - namespace tensorflow { template <class Scalar> @@ -40,7 +39,8 @@ class MatrixExponentialOp : public LinearAlgebraOp<Scalar> { MatrixMaps* outputs) final { const ConstMatrixMap& input = inputs[0]; if (input.rows() == 0) return; - using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>; + using Matrix = + Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>; Matrix tmp = input; outputs->at(0) = tmp.exp(); } @@ -51,9 +51,9 @@ class MatrixExponentialOp : public LinearAlgebraOp<Scalar> { REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<float>), float); REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<double>), double); -REGISTER_LINALG_OP("MatrixExponential", - (MatrixExponentialOp<complex64>), complex64); -REGISTER_LINALG_OP("MatrixExponential", - (MatrixExponentialOp<complex128>), complex128); +REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<complex64>), + complex64); +REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<complex128>), + complex128); } // namespace tensorflow diff --git a/tensorflow/core/kernels/matrix_logarithm_op.cc b/tensorflow/core/kernels/matrix_logarithm_op.cc index cf0007b5b67..22ca094e243 100644 --- a/tensorflow/core/kernels/matrix_logarithm_op.cc +++ b/tensorflow/core/kernels/matrix_logarithm_op.cc @@ -26,7 +26,6 @@ limitations under the License.
#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" - namespace tensorflow { template @@ -40,7 +39,8 @@ class MatrixLogarithmOp : public LinearAlgebraOp { MatrixMaps* outputs) final { const ConstMatrixMap& input = inputs[0]; if (input.rows() == 0) return; - using Matrix = Eigen::Matrix; + using Matrix = + Eigen::Matrix; Matrix tmp = input; outputs->at(0) = tmp.log(); } @@ -53,9 +53,9 @@ class MatrixLogarithmOp : public LinearAlgebraOp { // logarithm. If all eigenvalues are positive, then this returns the correct // logarithm, however checking for positive definiteness adds significant // overhead. Therefore at present we only register this Op for complex types. -REGISTER_LINALG_OP("MatrixLogarithm", - (MatrixLogarithmOp), complex64); -REGISTER_LINALG_OP("MatrixLogarithm", - (MatrixLogarithmOp), complex128); +REGISTER_LINALG_OP("MatrixLogarithm", (MatrixLogarithmOp), + complex64); +REGISTER_LINALG_OP("MatrixLogarithm", (MatrixLogarithmOp), + complex128); } // namespace tensorflow diff --git a/tensorflow/core/kernels/matrix_set_diag_op.cc b/tensorflow/core/kernels/matrix_set_diag_op.cc index 9dd665392bc..502d593474e 100644 --- a/tensorflow/core/kernels/matrix_set_diag_op.cc +++ b/tensorflow/core/kernels/matrix_set_diag_op.cc @@ -69,8 +69,8 @@ class MatrixSetDiagOp : public OpKernel { errors::InvalidArgument( "must have diagonal.shape == input.shape[:-2] + " "min(input.shape[-2:]), but received input shape: ", - input_shape.DebugString(), " and diagonal shape: ", - diag_shape.DebugString())); + input_shape.DebugString(), + " and diagonal shape: ", diag_shape.DebugString())); if (input.NumElements() == 0) { // This is a no-op. diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc index 2eefadad494..9be7408012b 100644 --- a/tensorflow/core/kernels/maxpooling_op.cc +++ b/tensorflow/core/kernels/maxpooling_op.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/core/kernels/maxpooling_op.h" #include +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -37,7 +38,6 @@ limitations under the License. #include "tensorflow/core/util/padding.h" #include "tensorflow/core/util/tensor_format.h" #include "tensorflow/core/util/use_cudnn.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #if GOOGLE_CUDA #include "tensorflow/core/kernels/maxpooling_op_gpu.h" @@ -89,7 +89,6 @@ static void SpatialMaxPoolWithArgMaxHelper( // max value. auto shard = [¶ms, &in_mat, &out_mat, &out_arg_max_mat, &input_backprop, &output_arg_max, &out_backprop](int64 start, int64 limit) { - const int32 depth = params.depth; const int32 in_rows = params.tensor_in_rows; const int32 in_cols = params.tensor_in_cols; @@ -180,7 +179,6 @@ static void SpatialMaxPoolWithArgMaxHelper( input_backprop_flat(input_backprop_index) += out_backprop_flat(index); } } - }; const int64 shard_cost = params.tensor_in_rows * params.tensor_in_cols * @@ -567,7 +565,7 @@ class MaxPoolingGradGradOp : public OpKernel { // tensor_out_as_matrix with the corresponding values in // top_diff_as_matrix. 
auto shard = [¶ms, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat]( - int64 start, int64 limit) { + int64 start, int64 limit) { const int32 depth = params.depth; const int32 in_rows = params.tensor_in_rows; const int32 in_cols = params.tensor_in_cols; diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc index f8daaca4c94..0c7a236b2ff 100644 --- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc +++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc @@ -450,10 +450,10 @@ bool MaxPoolBackwardWithArgmax::operator()( T* bottom_diff, const Eigen::GpuDevice& d) { const int kThreadsPerBlock = 1024; SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock, - kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff); + kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff); MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, d.stream()>>>( - output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff); + output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff); return d.ok(); } diff --git a/tensorflow/core/kernels/meta_support.cc b/tensorflow/core/kernels/meta_support.cc index 9fed01189fc..39e60c9fcef 100644 --- a/tensorflow/core/kernels/meta_support.cc +++ b/tensorflow/core/kernels/meta_support.cc @@ -98,9 +98,9 @@ typedef gemmlowp::meta::SimpleContext LocalContext; template void MultiThreadGemm(Context* context, const Params& params) { if (params.m <= 4) { - gemmlowp::meta::MultiThreadGemm< - Context, gemmlowp::meta::GemmExecutorPackLHSCacheFriendly<>, Params, - 1, 8, 8>(context, params); + gemmlowp::meta::MultiThreadGemm< + Context, gemmlowp::meta::GemmExecutorPackLHSCacheFriendly<>, Params, 1, + 8, 8>(context, params); } else { if (params.m >= params.n) { gemmlowp::meta::MultiThreadGemm< diff --git a/tensorflow/core/kernels/mfcc.cc b/tensorflow/core/kernels/mfcc.cc index 2793005aa26..8c755e0df87 100644 --- a/tensorflow/core/kernels/mfcc.cc +++ b/tensorflow/core/kernels/mfcc.cc @@ -27,21 +27,19 @@ const double kFilterbankFloor = 1e-12; const int kDefaultFilterbankChannelCount = 40; const int kDefaultDCTCoefficientCount = 13; -Mfcc::Mfcc() : initialized_(false), - lower_frequency_limit_(kDefaultLowerFrequencyLimit), - upper_frequency_limit_(kDefaultUpperFrequencyLimit), - filterbank_channel_count_(kDefaultFilterbankChannelCount), - dct_coefficient_count_(kDefaultDCTCoefficientCount) { } +Mfcc::Mfcc() + : initialized_(false), + lower_frequency_limit_(kDefaultLowerFrequencyLimit), + upper_frequency_limit_(kDefaultUpperFrequencyLimit), + filterbank_channel_count_(kDefaultFilterbankChannelCount), + dct_coefficient_count_(kDefaultDCTCoefficientCount) {} -bool Mfcc::Initialize(int input_length, - double input_sample_rate) { - bool initialized = mel_filterbank_.Initialize(input_length, - input_sample_rate, - filterbank_channel_count_, - lower_frequency_limit_, - upper_frequency_limit_); - initialized &= dct_.Initialize(filterbank_channel_count_, - dct_coefficient_count_); +bool Mfcc::Initialize(int input_length, double input_sample_rate) { + bool initialized = mel_filterbank_.Initialize( + input_length, input_sample_rate, filterbank_channel_count_, + lower_frequency_limit_, upper_frequency_limit_); + initialized &= + dct_.Initialize(filterbank_channel_count_, dct_coefficient_count_); initialized_ = initialized; return initialized; } diff --git a/tensorflow/core/kernels/mfcc.h b/tensorflow/core/kernels/mfcc.h index 8268f472034..8eee76f7f0c 100644 --- 
a/tensorflow/core/kernels/mfcc.h +++ b/tensorflow/core/kernels/mfcc.h @@ -20,18 +20,17 @@ limitations under the License. #include +#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/kernels/mfcc_dct.h" #include "tensorflow/core/kernels/mfcc_mel_filterbank.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/framework/op_kernel.h" namespace tensorflow { class Mfcc { public: Mfcc(); - bool Initialize(int input_length, - double input_sample_rate); + bool Initialize(int input_length, double input_sample_rate); // Input is a single squared-magnitude spectrogram frame. The input spectrum // is converted to linear magnitude and weighted into bands using a diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank.cc b/tensorflow/core/kernels/mfcc_mel_filterbank.cc index 630de8a5a33..3db3b51e8b6 100644 --- a/tensorflow/core/kernels/mfcc_mel_filterbank.cc +++ b/tensorflow/core/kernels/mfcc_mel_filterbank.cc @@ -38,13 +38,12 @@ namespace tensorflow { MfccMelFilterbank::MfccMelFilterbank() : initialized_(false) {} -bool MfccMelFilterbank::Initialize(int input_length, - double input_sample_rate, - int output_channel_count, - double lower_frequency_limit, - double upper_frequency_limit) { +bool MfccMelFilterbank::Initialize(int input_length, double input_sample_rate, + int output_channel_count, + double lower_frequency_limit, + double upper_frequency_limit) { num_channels_ = output_channel_count; - sample_rate_ = input_sample_rate; + sample_rate_ = input_sample_rate; input_length_ = input_length; if (num_channels_ < 1) { @@ -85,10 +84,9 @@ bool MfccMelFilterbank::Initialize(int input_length, } // Always exclude DC; emulate HTK. - const double hz_per_sbin = 0.5 * sample_rate_ / - static_cast(input_length_ - 1); - start_index_ = static_cast(1.5 + (lower_frequency_limit / - hz_per_sbin)); + const double hz_per_sbin = + 0.5 * sample_rate_ / static_cast(input_length_ - 1); + start_index_ = static_cast(1.5 + (lower_frequency_limit / hz_per_sbin)); end_index_ = static_cast(upper_frequency_limit / hz_per_sbin); // Maps the input spectrum bin indices to filter bank channels/indices. For @@ -121,12 +119,12 @@ bool MfccMelFilterbank::Initialize(int input_length, weights_[i] = 0.0; } else { if (channel >= 0) { - weights_[i] = (center_frequencies_[channel + 1] - - FreqToMel(i * hz_per_sbin)) / + weights_[i] = + (center_frequencies_[channel + 1] - FreqToMel(i * hz_per_sbin)) / (center_frequencies_[channel + 1] - center_frequencies_[channel]); } else { weights_[i] = (center_frequencies_[0] - FreqToMel(i * hz_per_sbin)) / - (center_frequencies_[0] - mel_low); + (center_frequencies_[0] - mel_low); } } } @@ -152,16 +150,16 @@ bool MfccMelFilterbank::Initialize(int input_length, } } if (!bad_channels.empty()) { - LOG(ERROR) << "Missing " << bad_channels.size() << " bands " << - " starting at " << bad_channels[0] << - " in mel-frequency design. " << - "Perhaps too many channels or " << - "not enough frequency resolution in spectrum. (" << - "input_length: " << input_length << - " input_sample_rate: " << input_sample_rate << - " output_channel_count: " << output_channel_count << - " lower_frequency_limit: " << lower_frequency_limit << - " upper_frequency_limit: " << upper_frequency_limit; + LOG(ERROR) << "Missing " << bad_channels.size() << " bands " + << " starting at " << bad_channels[0] + << " in mel-frequency design. " + << "Perhaps too many channels or " + << "not enough frequency resolution in spectrum. 
(" + << "input_length: " << input_length + << " input_sample_rate: " << input_sample_rate + << " output_channel_count: " << output_channel_count + << " lower_frequency_limit: " << lower_frequency_limit + << " upper_frequency_limit: " << upper_frequency_limit; } initialized_ = true; return true; @@ -171,7 +169,7 @@ bool MfccMelFilterbank::Initialize(int input_length, // square root, then summing FFT magnitudes under triangular integration windows // whose widths increase with frequency. void MfccMelFilterbank::Compute(const std::vector &input, - std::vector *output) const { + std::vector *output) const { if (!initialized_) { LOG(ERROR) << "Mel Filterbank not initialized."; return; diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank.h b/tensorflow/core/kernels/mfcc_mel_filterbank.h index 1bdc2dc93b8..37c3936e80d 100644 --- a/tensorflow/core/kernels/mfcc_mel_filterbank.h +++ b/tensorflow/core/kernels/mfcc_mel_filterbank.h @@ -27,10 +27,8 @@ class MfccMelFilterbank { public: MfccMelFilterbank(); bool Initialize(int input_length, // Number of unique FFT bins fftsize/2+1. - double input_sample_rate, - int output_channel_count, - double lower_frequency_limit, - double upper_frequency_limit); + double input_sample_rate, int output_channel_count, + double lower_frequency_limit, double upper_frequency_limit); // Takes a squared-magnitude spectrogram slice as input, computes a // triangular-mel-weighted linear-magnitude filterbank, and places the result @@ -56,7 +54,7 @@ class MfccMelFilterbank { // FFT bin i contributes to the upper side of mel channel band_mapper_[i] std::vector band_mapper_; int start_index_; // Lowest FFT bin used to calculate mel spectrum. - int end_index_; // Highest FFT bin used to calculate mel spectrum. + int end_index_; // Highest FFT bin used to calculate mel spectrum. TF_DISALLOW_COPY_AND_ASSIGN(MfccMelFilterbank); }; diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc b/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc index 602dfeb4e54..54f31e1699e 100644 --- a/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc +++ b/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc @@ -34,11 +34,9 @@ TEST(MfccMelFilterbankTest, AgreesWithPythonGoldenValues) { input.push_back(i + 1); } const int kChannelCount = 20; - filterbank.Initialize(input.size(), - 22050 /* sample rate */, - kChannelCount /* channels */, - 20.0 /* lower frequency limit */, - 4000.0 /* upper frequency limit */); + filterbank.Initialize( + input.size(), 22050 /* sample rate */, kChannelCount /* channels */, + 20.0 /* lower frequency limit */, 4000.0 /* upper frequency limit */); std::vector output; filterbank.Compute(input, &output); @@ -65,13 +63,10 @@ TEST(MfccMelFilterbankTest, IgnoresExistingContentOfOutputVector) { std::vector input; std::vector output; - filterbank.Initialize(kSampleCount, - 22050 /* sample rate */, - 20 /* channels */, - 20.0 /* lower frequency limit */, + filterbank.Initialize(kSampleCount, 22050 /* sample rate */, + 20 /* channels */, 20.0 /* lower frequency limit */, 4000.0 /* upper frequency limit */); - // First call with nonzero input value, and an empty output vector, // will resize the output and fill it with the correct, nonzero outputs. 
input.assign(kSampleCount, 1.0); diff --git a/tensorflow/core/kernels/mfcc_test.cc b/tensorflow/core/kernels/mfcc_test.cc index cb32df8811e..72c1d331d6e 100644 --- a/tensorflow/core/kernels/mfcc_test.cc +++ b/tensorflow/core/kernels/mfcc_test.cc @@ -36,11 +36,10 @@ TEST(MfccTest, AgreesWithPythonGoldenValues) { std::vector output; mfcc.Compute(input, &output); - std::vector expected = {29.13970072, -6.41568601, -0.61903012, - -0.96778652, -0.26819878, -0.40907028, - -0.15614748, -0.23203119, -0.10481487, - -0.1543029, -0.0769791, -0.10806114, - -0.06047613}; + std::vector expected = { + 29.13970072, -6.41568601, -0.61903012, -0.96778652, -0.26819878, + -0.40907028, -0.15614748, -0.23203119, -0.10481487, -0.1543029, + -0.0769791, -0.10806114, -0.06047613}; ASSERT_EQ(expected.size(), output.size()); for (int i = 0; i < output.size(); ++i) { diff --git a/tensorflow/core/kernels/mirror_pad_op.cc b/tensorflow/core/kernels/mirror_pad_op.cc index fbdeaf43ebb..26e1082989f 100644 --- a/tensorflow/core/kernels/mirror_pad_op.cc +++ b/tensorflow/core/kernels/mirror_pad_op.cc @@ -87,8 +87,8 @@ class MirrorPadOp : public OpKernel { const Tpaddings before = paddings(d, 0); // Pad before existing elements. const Tpaddings after = paddings(d, 1); // Pad after existing elements. OP_REQUIRES(context, before >= 0 && after >= 0, - errors::InvalidArgument("paddings must be non-negative: ", - before, " ", after)); + errors::InvalidArgument( + "paddings must be non-negative: ", before, " ", after)); if (offset_ == 0) { // SYMMETRIC mode. OP_REQUIRES(context, before <= in0.dim_size(d) && after <= in0.dim_size(d), @@ -296,8 +296,8 @@ class MirrorPadGradOp : public OpKernel { const Tpaddings before = paddings(d, 0); // Pad before existing elements. const Tpaddings after = paddings(d, 1); // Pad after existing elements. OP_REQUIRES(context, before >= 0 && after >= 0, - errors::InvalidArgument("Paddings must be non-negative: ", - before, ", ", after)); + errors::InvalidArgument( + "Paddings must be non-negative: ", before, ", ", after)); const int64 out_size = in0.dim_size(d) - (before + after); if (offset_ == 0) { // SYMMETRIC mode. diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc index 89d37d2f874..ef724f0a296 100644 --- a/tensorflow/core/kernels/mkl_aggregate_ops.cc +++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc @@ -28,7 +28,7 @@ limitations under the License. #include "mkl_dnn_types.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::stream; using mkldnn::sum; @@ -37,7 +37,7 @@ using mkldnn::sum; namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template class MklAddNOp : public OpKernel { @@ -285,7 +285,7 @@ class MklAddNOp : public OpKernel { } MklAddNOpContext; }; -#else // INTEL_MKL_DNN +#else // INTEL_MKL_ML template class MklAddNOp : public OpKernel { public: @@ -317,8 +317,11 @@ class MklAddNOp : public OpKernel { : src2_tensor.dims(); // if the shapes of two tensors are not same raise op error TensorShape src1_shape, src2_shape; - src1_shape = src1_tensor.shape(); - src2_shape = src2_tensor.shape(); + src1_shape = input1_in_mkl_format ? src1_mkl_shape.GetTfShape() + : src1_tensor.shape(); + src2_shape = input2_in_mkl_format ? 
src2_mkl_shape.GetTfShape() + : src2_tensor.shape(); + if (!src1_shape.IsSameSize(src2_shape)) { ctx->SetStatus(errors::InvalidArgument( "Inputs to operation ", this->name(), " of type ", diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc index d751a70fc86..cff1bd18a74 100644 --- a/tensorflow/core/kernels/mkl_avgpooling_op.cc +++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc @@ -24,24 +24,23 @@ #include "tensorflow/core/kernels/mkl_pooling_ops_common.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" -using mkldnn::memory; -using mkldnn::error; -using mkldnn::pooling_forward; -using mkldnn::pooling_backward; -using mkldnn::padding_kind; -using mkldnn::engine; -using mkldnn::prop_kind; using mkldnn::algorithm; +using mkldnn::engine; +using mkldnn::error; +using mkldnn::memory; +using mkldnn::padding_kind; +using mkldnn::pooling_backward; +using mkldnn::pooling_forward; +using mkldnn::prop_kind; #endif namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -// For now, MKL-ML is default. So making MKL-DNN not a default choice. -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template class MklAvgPoolingOp : public OpKernel { @@ -358,10 +357,11 @@ class MklAvgPoolingGradOp : public OpKernel { if (!outbackprop_in_mkl_format) { // For avgpooling, tensor_in_shape should have 1 dimension, and 4 // elements. - OP_REQUIRES(context, tensor_in_shape.dims() == 1 && - tensor_in_shape.NumElements() == 4, - errors::InvalidArgument("original input shape must be " - "1-dimensional and 4 elements")); + OP_REQUIRES( + context, + tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4, + errors::InvalidArgument("original input shape must be " + "1-dimensional and 4 elements")); // For avgpooling, out_backprop should have 4 dimensions. OP_REQUIRES(context, out_backprop.dims() == 4, @@ -429,13 +429,15 @@ class MklAvgPoolingGradOp : public OpKernel { }; // MklAvgPoolingGradOp -#else // INTEL_MKL_DNN is defined + +#else + template class MklAvgPoolingOp : public MklPoolingForwardOpBase { public: explicit MklAvgPoolingOp(OpKernelConstruction* context) - : MklPoolingForwardOpBase(context) { + : MklPoolingForwardOpBase(context) { // Workspace is an MKLDNN construct that is only used in Max Pooling. // So set workspace_enabled_ to false. 
this->workspace_enabled_ = false; @@ -444,8 +446,8 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase { void Compute(OpKernelContext* context) override { try { auto cpu_engine = engine(engine::cpu, 0); - const Tensor& input_tensor = MklGetInput(context, - this->kInputTensorIndexInput); + const Tensor& input_tensor = + MklGetInput(context, this->kInputTensorIndexInput); MklDnnShape dnn_shape_input; GetMklShape(context, this->kInputTensorIndexInput, &dnn_shape_input); this->SanityCheckInput(context, input_tensor, dnn_shape_input); @@ -457,9 +459,8 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase { // initialize variables for the pooling op MklPoolParameters pool_params; // Get the input tensor and initialize the pooling parameters - this->ConfigureInput(context, dnn_shape_input, - input_tensor, &pool_params, - &dnn_data_input); + this->ConfigureInput(context, dnn_shape_input, input_tensor, &pool_params, + &dnn_data_input); OP_REQUIRES_OK(context, context->status()); // Declare output tensor @@ -467,59 +468,77 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase { memory::dims output_dims_mkl_order; this->GetOutputDims(pool_params, &output_dims_mkl_order); + // If input is an empty tensor, allocate an empty output tensor and return + if (input_tensor.NumElements() == 0) { + MklDnnShape output_mkl_shape; + output_mkl_shape.SetMklTensor(false); + TensorShape output_tf_shape; + if (pool_params.data_format == TensorFormat::FORMAT_NCHW) { + output_tf_shape = MklDnnDimsToTFShape(output_dims_mkl_order); + } else { + memory::dims output_dims_NHWC_order; + output_dims_NHWC_order = {pool_params.tensor_in_batch, + static_cast(pool_params.out_height), + static_cast(pool_params.out_width), + pool_params.out_depth}; + output_tf_shape = MklDnnDimsToTFShape(output_dims_NHWC_order); + } + const int kOutputIndex = 0; + AllocateOutputSetMklShape(context, kOutputIndex, &output_tensor, + output_tf_shape, output_mkl_shape); + CHECK_NOTNULL(output_tensor); + return; + } + // If input is in Mkl layout, then just get the memory format from it // directly, instead of using input data_format to AvgPool. if (dnn_shape_input.IsMklTensor()) { - dnn_data_output.SetUsrMem(output_dims_mkl_order, - static_cast(dnn_data_input.GetUsrMemDesc() - .data.format)); + dnn_data_output.SetUsrMem( + output_dims_mkl_order, + static_cast( + dnn_data_input.GetUsrMemDesc().data.format)); } else { - dnn_data_output.SetUsrMem(output_dims_mkl_order, - this->data_format_mkldnn_); + dnn_data_output.SetUsrMem(output_dims_mkl_order, + this->data_format_mkldnn_); } - // describe the memory layout + // describe the memory layout dnn_data_output.SetOpMemDesc(output_dims_mkl_order, memory::format::any); // 3. 
create a pooling primitive descriptor - auto pool_desc = pooling_forward::desc(prop_kind::forward, - algorithm::pooling_avg_exclude_padding, - dnn_data_input.GetUsrMemDesc(), - dnn_data_output.GetUsrMemDesc(), - memory::dims({ pool_params.row_stride, - pool_params.col_stride}), - memory::dims({ pool_params.window_rows, - pool_params.window_cols}), - memory::dims({ static_cast(pool_params.pad_top), - static_cast(pool_params.pad_left)}), - memory::dims({ static_cast(pool_params.pad_bottom), - static_cast(pool_params.pad_right)}), - TFPaddingToMklDnnPadding(this->padding_)); - auto pool_prim_desc = pooling_forward::primitive_desc(pool_desc, - cpu_engine); + auto pool_desc = pooling_forward::desc( + prop_kind::forward, algorithm::pooling_avg_exclude_padding, + dnn_data_input.GetUsrMemDesc(), dnn_data_output.GetUsrMemDesc(), + memory::dims({pool_params.row_stride, pool_params.col_stride}), + memory::dims({pool_params.window_rows, pool_params.window_cols}), + memory::dims({static_cast(pool_params.pad_top), + static_cast(pool_params.pad_left)}), + memory::dims({static_cast(pool_params.pad_bottom), + static_cast(pool_params.pad_right)}), + TFPaddingToMklDnnPadding(this->padding_)); + auto pool_prim_desc = + pooling_forward::primitive_desc(pool_desc, cpu_engine); this->AllocateOutputTensor(context, pool_prim_desc, output_dims_mkl_order, - this->data_format_mkldnn_, &output_tensor); + this->data_format_mkldnn_, &output_tensor); CHECK_NOTNULL(output_tensor); OP_REQUIRES_OK(context, context->status()); dnn_data_output.SetUsrMemDataHandle(output_tensor); - this->PrepareAndExecuteNet(pool_prim_desc, - &dnn_data_input, - &dnn_data_output); - } catch (mkldnn::error &e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + this->PrepareAndExecuteNet(pool_prim_desc, &dnn_data_input, + &dnn_data_output); + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } // Compute -}; // MklAvgPoolingOp +}; // MklAvgPoolingOp //----------------------------------------------------------------------------- @@ -527,27 +546,23 @@ template class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase { public: explicit MklAvgPoolingGradOp(OpKernelConstruction* context) - : MklPoolingBackwardOpBase(context) { - } + : MklPoolingBackwardOpBase(context) {} void Compute(OpKernelContext* context) override { try { auto cpu_engine = engine(engine::cpu, 0); MklDnnShape original_input_mkl_shape, input_gradient_mkl_shape; - const Tensor& tensor_in_shape = MklGetInput(context, - kInputTensorIndexInputShape); - const Tensor& input_gradient_tensor = MklGetInput(context, - kInputTensorIndexInputGradient); + const Tensor& tensor_in_shape = + MklGetInput(context, kInputTensorIndexInputShape); + const Tensor& input_gradient_tensor = + MklGetInput(context, kInputTensorIndexInputGradient); GetMklShape(context, kInputTensorIndexInputShape, - &original_input_mkl_shape); + &original_input_mkl_shape); GetMklShape(context, kInputTensorIndexInputGradient, - &input_gradient_mkl_shape); + &input_gradient_mkl_shape); - - SanityCheckInputs(context, tensor_in_shape, - input_gradient_tensor, - 
original_input_mkl_shape, - input_gradient_mkl_shape); + SanityCheckInputs(context, tensor_in_shape, input_gradient_tensor, + original_input_mkl_shape, input_gradient_mkl_shape); if (!context->status().ok()) return; // Used to allocate output_diff_src/diff_src @@ -562,90 +577,70 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase { MklPoolParameters pool_params; memory::dims output_dims_mkl_order, original_input_dims_nchw; // Configure the original input memory descriptor - memory::desc original_input_md = ConfigureOriginalInput(context, - tensor_in_shape, - original_input_mkl_shape, - &original_input_dims_nchw, - &pool_params, - &original_input_shape); + memory::desc original_input_md = ConfigureOriginalInput( + context, tensor_in_shape, original_input_mkl_shape, + &original_input_dims_nchw, &pool_params, &original_input_shape); // configure the original output memory descriptor // by definition, the shape of the original output is the same // as the shape of the gradient diff_dst memory::desc original_output_md = this->ConfigureOriginalOutput( - pool_params, input_gradient_mkl_shape, output_dims_mkl_order); + pool_params, input_gradient_mkl_shape, output_dims_mkl_order); memory::desc target_diff_dst_md = this->ConfigureInputGradient( - input_gradient_mkl_shape, - input_gradient_tensor, - &input_gradient_diff_dst, - original_output_md); + input_gradient_mkl_shape, input_gradient_tensor, + &input_gradient_diff_dst, original_output_md); // The shape of the output diff src needs to be the same shape as the // original input. But we will set its format to be same as the format of // input gradient. We won't use format of original input since it will // always be in Tensorflow layout (given that AvgPoolGrad gets shape of // the input rather than actual input). 
- output_diff_src.SetUsrMem(original_input_dims_nchw, - static_cast( - target_diff_dst_md.data.format)); + output_diff_src.SetUsrMem( + original_input_dims_nchw, + static_cast(target_diff_dst_md.data.format)); // Create the forward pooling primitive descriptor so we can reference it // in the backward pooling primitive descriptor - auto pool_fwd_desc = pooling_forward::desc(prop_kind::forward, - algorithm::pooling_avg_exclude_padding, - original_input_md, - original_output_md, - memory::dims({ pool_params.row_stride, - pool_params.col_stride}), - memory::dims({ pool_params.window_rows, - pool_params.window_cols}), - memory::dims({ static_cast(pool_params.pad_top), - static_cast(pool_params.pad_left)}), - memory::dims({ static_cast(pool_params.pad_bottom), - static_cast(pool_params.pad_right)}), - TFPaddingToMklDnnPadding(this->padding_)); - auto pool_fwd_prim_desc - = pooling_forward::primitive_desc(pool_fwd_desc, - cpu_engine); + auto pool_fwd_desc = pooling_forward::desc( + prop_kind::forward, algorithm::pooling_avg_exclude_padding, + original_input_md, original_output_md, + memory::dims({pool_params.row_stride, pool_params.col_stride}), + memory::dims({pool_params.window_rows, pool_params.window_cols}), + memory::dims({static_cast(pool_params.pad_top), + static_cast(pool_params.pad_left)}), + memory::dims({static_cast(pool_params.pad_bottom), + static_cast(pool_params.pad_right)}), + TFPaddingToMklDnnPadding(this->padding_)); + auto pool_fwd_prim_desc = + pooling_forward::primitive_desc(pool_fwd_desc, cpu_engine); auto pool_bkwd_desc = pooling_backward::desc( - algorithm::pooling_avg_exclude_padding, - output_diff_src.GetUsrMemDesc(), - target_diff_dst_md, - memory::dims({ pool_params.row_stride, - pool_params.col_stride}), - memory::dims({ pool_params.window_rows, - pool_params.window_cols}), - memory::dims({ static_cast(pool_params.pad_top), - static_cast(pool_params.pad_left)}), - memory::dims({ static_cast(pool_params.pad_bottom), - static_cast(pool_params.pad_right)}), - TFPaddingToMklDnnPadding(this->padding_)); - auto pool_bkwd_prim_desc - = pooling_backward::primitive_desc(pool_bkwd_desc, - cpu_engine, - pool_fwd_prim_desc); - this->AllocateOutputTensor(context, pool_bkwd_prim_desc, - original_input_dims_nchw, - this->data_format_mkldnn_, - &output_tensor_diff_src); + algorithm::pooling_avg_exclude_padding, + output_diff_src.GetUsrMemDesc(), target_diff_dst_md, + memory::dims({pool_params.row_stride, pool_params.col_stride}), + memory::dims({pool_params.window_rows, pool_params.window_cols}), + memory::dims({static_cast(pool_params.pad_top), + static_cast(pool_params.pad_left)}), + memory::dims({static_cast(pool_params.pad_bottom), + static_cast(pool_params.pad_right)}), + TFPaddingToMklDnnPadding(this->padding_)); + auto pool_bkwd_prim_desc = pooling_backward::primitive_desc( + pool_bkwd_desc, cpu_engine, pool_fwd_prim_desc); + this->AllocateOutputTensor( + context, pool_bkwd_prim_desc, original_input_dims_nchw, + this->data_format_mkldnn_, &output_tensor_diff_src); output_diff_src.SetUsrMemDataHandle(output_tensor_diff_src); - this->PrepareAndExecuteNet(pool_bkwd_prim_desc, - &input_gradient_diff_dst, - &output_diff_src, - memory::primitive_desc( - target_diff_dst_md, - cpu_engine)); - } catch (mkldnn::error &e) { + this->PrepareAndExecuteNet( + pool_bkwd_prim_desc, &input_gradient_diff_dst, &output_diff_src, + memory::primitive_desc(target_diff_dst_md, cpu_engine)); + } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + 
string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Compute received an exception:", - error_msg)); + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:", + error_msg)); } } // Compute @@ -655,12 +650,11 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase { const int kInputTensorIndexInputShape = 0; const int kInputTensorIndexInputGradient = 1; - memory::desc ConfigureOriginalInput(OpKernelContext* context, - const Tensor& tensor_original_input_shape, - const MklDnnShape& original_input_mkl_shape, - memory::dims* original_input_dims_mkl_order, - MklPoolParameters* pool_params, - TensorShape* input_tensor_shape) { + memory::desc ConfigureOriginalInput( + OpKernelContext* context, const Tensor& tensor_original_input_shape, + const MklDnnShape& original_input_mkl_shape, + memory::dims* original_input_dims_mkl_order, + MklPoolParameters* pool_params, TensorShape* input_tensor_shape) { CHECK_NOTNULL(original_input_dims_mkl_order); CHECK_NOTNULL(pool_params); CHECK_NOTNULL(input_tensor_shape); @@ -672,47 +666,47 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase { } return MklPoolingBackwardOpBase::ConfigureOriginalInput( - context, - tensor_original_input_shape, - original_input_mkl_shape, - original_input_dims_mkl_order, - pool_params, - *input_tensor_shape); -} + context, tensor_original_input_shape, original_input_mkl_shape, + original_input_dims_mkl_order, pool_params, *input_tensor_shape); + } void SanityCheckInputs(OpKernelContext* context, - const Tensor& tensor_in_shape, - const Tensor& input_gradient_tensor, - const MklDnnShape& original_input_mkl_shape, - const MklDnnShape& input_gradient_mkl_shape) { + const Tensor& tensor_in_shape, + const Tensor& input_gradient_tensor, + const MklDnnShape& original_input_mkl_shape, + const MklDnnShape& input_gradient_mkl_shape) { if (!original_input_mkl_shape.IsMklTensor()) { - OP_REQUIRES(context, tensor_in_shape.dims() == 1 && - tensor_in_shape.NumElements() == 4, + OP_REQUIRES( + context, + tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4, errors::InvalidArgument("original input shape must be " - "1-dimensional and 4 elements")); + "1-dimensional and 4 elements")); } else { - OP_REQUIRES(context, original_input_mkl_shape.GetDimension() == 1 && - original_input_mkl_shape.DimSize(0) == 4, - errors::InvalidArgument("original input shape must be " - "1-dimensional and 4 elements")); + OP_REQUIRES(context, + original_input_mkl_shape.GetDimension() == 1 && + original_input_mkl_shape.DimSize(0) == 4, + errors::InvalidArgument("original input shape must be " + "1-dimensional and 4 elements")); } if (!input_gradient_mkl_shape.IsMklTensor()) { // For avgpooling, input_gradient_diff_dst should have 4 dimensions. 
OP_REQUIRES(context, input_gradient_tensor.dims() == 4, - errors::InvalidArgument("Gradient shape must be " - "4-dimensional")); + errors::InvalidArgument("Gradient shape must be " + "4-dimensional")); } else { OP_REQUIRES(context, input_gradient_mkl_shape.GetDimension() == 4, - errors::InvalidArgument("Gradient shape must be " - "4-dimensional")); + errors::InvalidArgument("Gradient shape must be " + "4-dimensional")); } } }; // MklAvgPoolingGradOp -#endif // INTEL_MKL_DNN + +#endif // INTEL_MKL_ML + REGISTER_KERNEL_BUILDER(Name("_MklAvgPool") .Device(DEVICE_CPU) @@ -728,4 +722,3 @@ REGISTER_KERNEL_BUILDER(Name("_MklAvgPoolGrad") } // namespace tensorflow #endif // INTEL_MKL - diff --git a/tensorflow/core/kernels/mkl_batch_matmul_op.cc b/tensorflow/core/kernels/mkl_batch_matmul_op.cc index 9fee94f9465..d9713075be6 100644 --- a/tensorflow/core/kernels/mkl_batch_matmul_op.cc +++ b/tensorflow/core/kernels/mkl_batch_matmul_op.cc @@ -40,7 +40,6 @@ limitations under the License. #include "tensorflow/core/kernels/fill_functor.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #define MKL_Complex8 tensorflow::complex64 #define MKL_Complex16 tensorflow::complex128 diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc index d109bb6bcfe..f1f267e849a 100644 --- a/tensorflow/core/kernels/mkl_concat_op.cc +++ b/tensorflow/core/kernels/mkl_concat_op.cc @@ -30,11 +30,11 @@ limitations under the License. #include "mkl_dnn_types.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" -using mkldnn::stream; using mkldnn::concat; +using mkldnn::stream; #endif namespace tensorflow { @@ -45,7 +45,6 @@ typedef std::vector TensorShapeList; enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM }; - // TODO(intelft) Check if we can reuse existing EigenConcatOp using Mutable // reference inputs. // -------------------------------------------------------------------------- @@ -63,7 +62,7 @@ class EigenConcatBaseOp : public OpKernel { // we need to have empty Compute because Compute is pure virtual function. void Compute(OpKernelContext* c) {} -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML void Compute(OpKernelContext* c, const std::vector& values) { const Tensor* concat_dim_tensor; @@ -152,8 +151,8 @@ class EigenConcatBaseOp : public OpKernel { #else // MKL_DNN -void Compute(OpKernelContext* c, const std::vector& values, - const TensorShapeList& input_shapes) { + void Compute(OpKernelContext* c, const std::vector& values, + const TensorShapeList& input_shapes) { const Tensor* concat_dim_tensor; const char* axis_attribute_name = AxisArgName == NAME_IS_AXIS @@ -197,7 +196,8 @@ void Compute(OpKernelContext* c, const std::vector& values, const auto in = values[i]; const bool in_is_scalar = IsLegacyScalar(input_shapes[i]); OP_REQUIRES( - c, (input_shapes[i].dims() == input_dims) || + c, + (input_shapes[i].dims() == input_dims) || (input_is_scalar && in_is_scalar), errors::InvalidArgument( "ConcatOp : Ranks of all input tensors should match: shape[0] = ", @@ -208,8 +208,8 @@ void Compute(OpKernelContext* c, const std::vector& values, inputs_flat.emplace_back(new typename TTypes::ConstMatrix( in.shaped({inputs_flat_dim0, inputs_flat_dim1}))); } - output_concat_dim += input_shapes[i].dims() > 0 ? - input_shapes[i].dim_size(axis) : 1; + output_concat_dim += + input_shapes[i].dims() > 0 ? 
input_shapes[i].dim_size(axis) : 1; } TensorShape output_shape(input_shape); @@ -230,7 +230,7 @@ void Compute(OpKernelContext* c, const std::vector& values, #endif }; -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML // -------------------------------------------------------------------------- // Mkl Concat Op @@ -418,7 +418,6 @@ class MklConcatOp : public OpKernel { OP_REQUIRES_OK(context, context->status()); } - private: typedef struct { TensorFormat data_format; @@ -590,39 +589,45 @@ class MklConcatOp : public OpKernel { GetMklShapeList(context, "values", &input_shapes); const Tensor& concat_dim_tensor = (AxisArgName == NAME_IS_CONCAT_DIM) - ? MklGetInput(context, 0) : MklGetInput(context, N); + ? MklGetInput(context, 0) + : MklGetInput(context, N); // Sanity checks - OP_REQUIRES(context, IsLegacyScalar(concat_dim_tensor.shape()), - errors::InvalidArgument( - "Concat dim tensor should be a scalar integer, but got shape ", - concat_dim_tensor.shape().DebugString())); - int32 concat_dim = internal::SubtleMustCopy( - concat_dim_tensor.scalar()()); + OP_REQUIRES( + context, IsLegacyScalar(concat_dim_tensor.shape()), + errors::InvalidArgument( + "Concat dim tensor should be a scalar integer, but got shape ", + concat_dim_tensor.shape().DebugString())); + int32 concat_dim = + internal::SubtleMustCopy(concat_dim_tensor.scalar()()); // check that ranks of all tensors match // and that their shapes match except for concat_dim. int i = 0; bool invoke_eigen = false; bool are_all_mkl_inputs = true, are_all_tf_inputs = true; - const TensorShape expected_shape = input_shapes[0].IsMklTensor() ? - input_shapes[0].GetTfShape() : - input_tensors[0].shape(); + const TensorShape expected_shape = input_shapes[0].IsMklTensor() + ? input_shapes[0].GetTfShape() + : input_tensors[0].shape(); size_t expected_dims = expected_shape.dims(); if (concat_dim < 0) concat_dim = expected_dims + concat_dim; for (auto& s : input_shapes) { - if (s == expected_shape) {++i; continue;} + if (s == expected_shape) { + ++i; + continue; + } - TensorShape s_shape = s.IsMklTensor() ? s.GetTfShape() : - input_tensors[i].shape(); + TensorShape s_shape = + s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape(); size_t s_dims = s_shape.dims(); - OP_REQUIRES(context, s_dims == expected_dims, - errors::InvalidArgument( - "_MklConcatOp : Ranks of all input tensors should match:" - " input dimensions = ", - s_dims, " vs. expected rank = ", expected_dims)); + OP_REQUIRES( + context, s_dims == expected_dims, + errors::InvalidArgument( + "_MklConcatOp : Ranks of all input tensors should match:" + " input dimensions = ", + s_dims, " vs. expected rank = ", expected_dims)); for (int d = 0; d < expected_dims; ++d) { if (d == concat_dim) continue; @@ -630,10 +635,11 @@ class MklConcatOp : public OpKernel { size_t expected_size = expected_shape.dim_size(d); size_t s_size = s_shape.dim_size(d); OP_REQUIRES( - context, expected_size == s_size, - errors::InvalidArgument("_MklConcatOp : Dimensions of inputs " - "should match: shape[0][", d, "]= ", expected_size, - " vs. shape[", i, "][", d, "] = ", s_size)); + context, expected_size == s_size, + errors::InvalidArgument("_MklConcatOp : Dimensions of inputs " + "should match: shape[0][", + d, "]= ", expected_size, " vs. shape[", i, + "][", d, "] = ", s_size)); } if (s.IsMklTensor()) @@ -657,8 +663,8 @@ class MklConcatOp : public OpKernel { TensorShapeList tf_input_shapes; i = 0; for (auto& s : input_shapes) { - TensorShape s_shape = s.IsMklTensor() ? 
s.GetTfShape() : - input_tensors[i].shape(); + TensorShape s_shape = + s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape(); tf_input_shapes.push_back(s_shape); ++i; } @@ -678,21 +684,22 @@ class MklConcatOp : public OpKernel { std::vector srcs_pd; std::vector> srcs(N, MklDnnData(&cpu_engine)); int64 dst_concat_dim_size = 0; - for (int k =0; k < N; k++) { + for (int k = 0; k < N; k++) { bool is_mkl_tensor = input_shapes[k].IsMklTensor(); memory::dims src_dims; // Same comment as dst_dims for src_dims. - src_dims = (is_mkl_tensor) ? - TFShapeToMklDnnDims(input_shapes[k].GetTfShape()) : - TFShapeToMklDnnDims(input_tensors[k].shape()); + src_dims = (is_mkl_tensor) + ? TFShapeToMklDnnDims(input_shapes[k].GetTfShape()) + : TFShapeToMklDnnDims(input_tensors[k].shape()); dst_concat_dim_size += src_dims[concat_dim]; - auto src_md = is_mkl_tensor ? input_shapes[k].GetMklLayout() : - // It does not matter what data format we use here (NHWC or NCHW). - // We just need to ensure that output of Concat uses same data format - // as input. - memory::desc(src_dims, MklDnnType(), memory::format::nchw); + auto src_md = + is_mkl_tensor ? input_shapes[k].GetMklLayout() : + // It does not matter what data format we use here + // (NHWC or NCHW). We just need to ensure that output + // of Concat uses same data format as input. + memory::desc(src_dims, MklDnnType(), memory::format::nchw); srcs[k].SetUsrMem(src_md, &input_tensors[k]); auto src_mpd = srcs[k].GetUsrMemPrimDesc(); @@ -707,14 +714,15 @@ class MklConcatOp : public OpKernel { // Since we are passing a specific format for destination, // we need to have dst_dims in MklDnn order (NCHW). auto orig_tf_format = input_shapes[0].GetTfDataFormat(); - dst_dims_in_nchw = MklDnnDimsInNCHW(dst_dims, - MklDnnDataFormatToTFDataFormat(orig_tf_format)); + dst_dims_in_nchw = MklDnnDimsInNCHW( + dst_dims, MklDnnDataFormatToTFDataFormat(orig_tf_format)); // We will set the output in the same format as input to avoid layout // conversions. // Currently we are setting dst format same as input format. // See if we can make this choice in a better way. - dst_md = memory::desc(dst_dims_in_nchw, MklDnnType(), - (memory::format) input_shapes[0].GetMklLayout().data.format); + dst_md = memory::desc( + dst_dims_in_nchw, MklDnnType(), + (memory::format)input_shapes[0].GetMklLayout().data.format); } else { // Again, format does not matter here. We just need to make it same as // input format. @@ -722,7 +730,7 @@ class MklConcatOp : public OpKernel { } std::vector inputs; - for (int k=0; k < input_tensors.size(); k++) + for (int k = 0; k < input_tensors.size(); k++) inputs.push_back(srcs[k].GetOpMem()); // If all inputs are in MKL format, then meaning of concat_dim needs to @@ -732,8 +740,7 @@ class MklConcatOp : public OpKernel { // But ifinput tensors are in NHWC order, then semantics need to change. // E.g., if we are concatinating over Channel (dimension 3 for NHWC), // then since MklDnn order is NCHW, concat_dim needs to be 1. 
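// Illustrative aside (not part of this patch): what the TfDimIdx() remap on
// the next line amounts to when the Tensorflow shapes are NHWC but the
// MKL-DNN dims are kept in NCHW order. The helper below is hypothetical and
// only spells out the index mapping described in the comment above.
inline int NhwcAxisToNchwAxis(int nhwc_axis) {
  static const int kMap[4] = {0 /*N*/, 2 /*H*/, 3 /*W*/, 1 /*C*/};
  return kMap[nhwc_axis];  // e.g. channel axis 3 (NHWC) maps to axis 1 (NCHW)
}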
- if (are_all_mkl_inputs) - concat_dim = input_shapes[0].TfDimIdx(concat_dim); + if (are_all_mkl_inputs) concat_dim = input_shapes[0].TfDimIdx(concat_dim); auto concat_pd = concat::primitive_desc(dst_md, concat_dim, srcs_pd); @@ -752,24 +759,25 @@ class MklConcatOp : public OpKernel { dnn_shape_dst.SetMklTensor(false); tf_shape_dst = MklDnnDimsToTFShape(dst_dims); } - AllocateOutputSetMklShape(context, 0, &dst_tensor, - tf_shape_dst, dnn_shape_dst); + AllocateOutputSetMklShape(context, 0, &dst_tensor, tf_shape_dst, + dnn_shape_dst); CHECK_NOTNULL(dst_tensor); - dst_md = dnn_shape_dst.IsMklTensor() ? - dnn_shape_dst.GetMklLayout() : dst_md; + dst_md = + dnn_shape_dst.IsMklTensor() ? dnn_shape_dst.GetMklLayout() : dst_md; dst.SetUsrMem(dst_md, dst_tensor); auto concat_op = concat(concat_pd, inputs, dst.GetOpMem()); std::vector net; net.push_back(concat_op); stream(stream::kind::eager).submit(net).wait(); - } catch (mkldnn::error &e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); - OP_REQUIRES_OK(context, errors::Aborted( - "Operation received an exception:", error_msg)); + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } @@ -790,11 +798,9 @@ class MklConcatOp : public OpKernel { dnn_shape_output.SetDimensions(4); Tensor* output_tensor = nullptr; TensorShape tf_shape_output; - tf_shape_output.AddDim( - dnn_shape_output.GetSerializeBufferSize()); - context->allocate_output( - GetTensorMetaDataIndex(0, context->num_outputs()), - tf_shape_output, &output_tensor); + tf_shape_output.AddDim(dnn_shape_output.GetSerializeBufferSize()); + context->allocate_output(GetTensorMetaDataIndex(0, context->num_outputs()), + tf_shape_output, &output_tensor); dnn_shape_output.SerializeMklDnnShape( output_tensor->flat().data(), output_tensor->flat().size() * sizeof(uint8)); diff --git a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc index 0f1a218fe62..25c25737412 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc @@ -38,9 +38,9 @@ limitations under the License. #include "tensorflow/core/util/use_cudnn.h" #include "tensorflow/core/util/work_sharder.h" -#include "tensorflow/core/util/mkl_util.h" #include "mkl_dnn.h" #include "mkl_dnn_types.h" +#include "tensorflow/core/util/mkl_util.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc index 54d4916d494..1401bc65a45 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc @@ -38,24 +38,24 @@ limitations under the License. 
#include "tensorflow/core/util/use_cudnn.h" #include "tensorflow/core/util/work_sharder.h" -#include "tensorflow/core/util/mkl_util.h" #include "mkl_dnn.h" #include "mkl_dnn_types.h" +#include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" -using mkldnn::stream; -using mkldnn::prop_kind; using mkldnn::convolution_backward_weights; using mkldnn::memory; +using mkldnn::prop_kind; +using mkldnn::stream; #endif namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template class MklConv2DCustomBackpropFilterOp : public OpKernel { @@ -360,8 +360,8 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel { (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input; const Tensor& out_backprop = MklGetInput(context, 2); - void* mkl_buf_out_backprop = const_cast(static_cast( - out_backprop.flat().data())); + void* mkl_buf_out_backprop = const_cast( + static_cast(out_backprop.flat().data())); CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_out_backprop, prim_conv_bwdfilter, @@ -371,10 +371,11 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel { !dnnLayoutCompare_F32(mkl_lt_internal_out_backprop, lt_out_backprop); if (mkl_convert_out_backprop) { CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_out_backprop, - lt_out_backprop, mkl_lt_internal_out_backprop), + lt_out_backprop, + mkl_lt_internal_out_backprop), E_SUCCESS); AllocTmpBuffer(context, mkl_tmp_out_backprop_buf_tensor, - lt_out_backprop, &mkl_buf_convert_out_backprop); + lt_out_backprop, &mkl_buf_convert_out_backprop); CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_out_backprop, mkl_buf_out_backprop, mkl_buf_convert_out_backprop), @@ -428,18 +429,18 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .Label(mkl_op_registry::kMklOpLabel), \ - MklConv2DCustomBackpropFilterOp); + MklConv2DCustomBackpropFilterOp); TF_CALL_float(REGISTER_MKL_FILTER_KERNELS); #undef REGISTER_MKL_FILTER_KERNELS #else template -class MklConv2DCustomBackpropFilterOp : - public MklConv2DBackpropCommonOp { +class MklConv2DCustomBackpropFilterOp + : public MklConv2DBackpropCommonOp { public: explicit MklConv2DCustomBackpropFilterOp(OpKernelConstruction* context) - : MklConv2DBackpropCommonOp(context) { } + : MklConv2DBackpropCommonOp(context) {} ~MklConv2DCustomBackpropFilterOp() {} private: @@ -447,7 +448,7 @@ class MklConv2DCustomBackpropFilterOp : const MklDnnShape& filter_mkl_shape, const MklDnnShape& obp_mkl_shape) { CHECK(!filter_mkl_shape.IsMklTensor()) - << "Conv2DBackpropFilter: filter should not be in MKL Layout"; + << "Conv2DBackpropFilter: filter should not be in MKL Layout"; } size_t GetInputTensorIndexWithSizes() { return 1; /* filter index */ } @@ -462,8 +463,10 @@ class MklConv2DCustomBackpropFilterOp : const Tensor& filter_tensor) { TensorShape filter_tf_shape; CHECK_EQ(TensorShapeUtils::IsVector(filter_tensor.shape()), true); - CHECK_EQ(TensorShapeUtils::MakeShape( - filter_tensor.vec(), &filter_tf_shape).ok(), true); + CHECK_EQ(TensorShapeUtils::MakeShape(filter_tensor.vec(), + &filter_tf_shape) + .ok(), + true); return filter_tf_shape; } @@ -485,16 +488,13 @@ class MklConv2DCustomBackpropFilterOp : return memory::format::hwio; } - void CreatePrimitive(OpKernelContext* context, - const engine& cpu_engine, + void CreatePrimitive(OpKernelContext* context, const engine& cpu_engine, const convolution_forward::primitive_desc& conv_fwd_pd, MklDnnData* input, MklDnnData* 
filter, MklDnnData* outbackprop, MklDnnData* output, - Tensor** output_tensor, - const memory::dims& strides, + Tensor** output_tensor, const memory::dims& strides, const memory::dims& padding_l, - const memory::dims& padding_r, - padding_kind padding, + const memory::dims& padding_r, padding_kind padding, const memory::dims& bwd_output_dims, memory::format bwd_output_format) { CHECK_NOTNULL(context); @@ -508,34 +508,35 @@ class MklConv2DCustomBackpropFilterOp : int depth = 0; if (biasEnabled) { // Data structure for bias_grad - bias_grad = new MklDnnData (&cpu_engine); + bias_grad = new MklDnnData(&cpu_engine); TensorShape obp_tf_shape = GetTfShape(context, 2); - depth = (MklConv2DBackpropCommonOp::GetTFDataFormat() - == FORMAT_NCHW) ? - obp_tf_shape.dim_size(1) : obp_tf_shape.dim_size(3); + depth = (MklConv2DBackpropCommonOp::GetTFDataFormat() == + FORMAT_NCHW) + ? obp_tf_shape.dim_size(1) + : obp_tf_shape.dim_size(3); memory::dims bias_grad_dims = {depth}; bias_grad->SetOpMemDesc(bias_grad_dims, memory::format::x); } // Create convolution backward weights primitive. - auto bwd_desc = (biasEnabled && (bias_grad != nullptr))? - convolution_backward_weights::desc(convolution_direct, - input->GetOpMemDesc(), output->GetOpMemDesc(), - bias_grad->GetOpMemDesc(), - outbackprop->GetOpMemDesc(), strides, padding_l, - padding_r, padding) : - convolution_backward_weights::desc(convolution_direct, - input->GetOpMemDesc(), output->GetOpMemDesc(), - outbackprop->GetOpMemDesc(), strides, padding_l, - padding_r, padding); + auto bwd_desc = + (biasEnabled && (bias_grad != nullptr)) + ? convolution_backward_weights::desc( + convolution_direct, input->GetOpMemDesc(), + output->GetOpMemDesc(), bias_grad->GetOpMemDesc(), + outbackprop->GetOpMemDesc(), strides, padding_l, padding_r, + padding) + : convolution_backward_weights::desc( + convolution_direct, input->GetOpMemDesc(), + output->GetOpMemDesc(), outbackprop->GetOpMemDesc(), strides, + padding_l, padding_r, padding); - auto bwd_pd = convolution_backward_weights::primitive_desc(bwd_desc, - cpu_engine, - conv_fwd_pd); + auto bwd_pd = convolution_backward_weights::primitive_desc( + bwd_desc, cpu_engine, conv_fwd_pd); // Allocate output tensor. - AllocateOutputTensor(context, bwd_pd, bwd_output_dims, - bwd_output_format, output_tensor); + AllocateOutputTensor(context, bwd_pd, bwd_output_dims, bwd_output_format, + output_tensor); CHECK_NOTNULL(*output_tensor); // Set buffer handle using allocated output tensor. @@ -548,8 +549,8 @@ class MklConv2DCustomBackpropFilterOp : AllocateBiasGradTensor(context, bias_grad_shape, &bias_grad_tensor); memory::dims bias_grad_dims = {depth}; // Since Bias is 1D, we use format::x from MKLDNN to represent it. - auto bias_grad_md = memory::desc({bias_grad_dims}, MklDnnType(), - memory::format::x); + auto bias_grad_md = + memory::desc({bias_grad_dims}, MklDnnType(), memory::format::x); bias_grad->SetUsrMem(bias_grad_md, bias_grad_tensor); bias_grad->SetUsrMemDataHandle(bias_grad_tensor); } @@ -562,28 +563,29 @@ class MklConv2DCustomBackpropFilterOp : } // Allocate output tensor. 
- void AllocateOutputTensor(OpKernelContext* context, - const convolution_backward_weights::primitive_desc& conv_pd, - const memory::dims& output_dims_mkl_order, - memory::format output_tf_format, Tensor** output_tensor) { - CHECK_NOTNULL(output_tensor); + void AllocateOutputTensor( + OpKernelContext* context, + const convolution_backward_weights::primitive_desc& conv_pd, + const memory::dims& output_dims_mkl_order, + memory::format output_tf_format, Tensor** output_tensor) { + CHECK_NOTNULL(output_tensor); - // For BackpropFilter, we convert the output tensor back in Tensorflow - // layout. Because typically, BackpropFilter is the last operator in the - // graph that emit filter gradient that is provided to ApplyGradient - // method to update the filter. But it may be possible to eliminate this - // by forwarding filter in MKL layout if we support ApplyGradient method - // for MKL layout propagation. - MklDnnShape output_mkl_shape; - output_mkl_shape.SetMklTensor(false); - // output_dims_mkl_order is in OIHW format. - // Allocate shape of TF tensor in HWIO format. - TensorShape output_tf_shape({output_dims_mkl_order[MklDnnDims::Dim_H], - output_dims_mkl_order[MklDnnDims::Dim_W], - output_dims_mkl_order[MklDnnDims::Dim_I], - output_dims_mkl_order[MklDnnDims::Dim_O]}); - AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape, - output_mkl_shape); + // For BackpropFilter, we convert the output tensor back in Tensorflow + // layout. Because typically, BackpropFilter is the last operator in the + // graph that emit filter gradient that is provided to ApplyGradient + // method to update the filter. But it may be possible to eliminate this + // by forwarding filter in MKL layout if we support ApplyGradient method + // for MKL layout propagation. + MklDnnShape output_mkl_shape; + output_mkl_shape.SetMklTensor(false); + // output_dims_mkl_order is in OIHW format. + // Allocate shape of TF tensor in HWIO format. + TensorShape output_tf_shape({output_dims_mkl_order[MklDnnDims::Dim_H], + output_dims_mkl_order[MklDnnDims::Dim_W], + output_dims_mkl_order[MklDnnDims::Dim_I], + output_dims_mkl_order[MklDnnDims::Dim_O]}); + AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape, + output_mkl_shape); } // Allocate tensor for bias grad @@ -600,9 +602,9 @@ class MklConv2DCustomBackpropFilterOp : // Prepare and execute net - checks for input and output reorders. void PrepareAndExecutePrimitive( - const convolution_backward_weights::primitive_desc& conv_pd, - MklDnnData* input, MklDnnData* obp, - MklDnnData* output, MklDnnData* bias_grad = nullptr) { + const convolution_backward_weights::primitive_desc& conv_pd, + MklDnnData* input, MklDnnData* obp, MklDnnData* output, + MklDnnData* bias_grad = nullptr) { // Create reorders between user layout and MKL layout if it is needed and // add it to the net before convolution. std::vector net; @@ -612,15 +614,15 @@ class MklConv2DCustomBackpropFilterOp : // For BackpropFilter, we convert the output tensor back in Tensorflow // layout. 
bool output_reorder_required = output->PrepareReorderToUserMemIfReq( - conv_pd.diff_weights_primitive_desc()); + conv_pd.diff_weights_primitive_desc()); if (biasEnabled && (bias_grad != nullptr)) { - net.push_back(convolution_backward_weights(conv_pd, input->GetOpMem(), - obp->GetOpMem(), output->GetOpMem(), - bias_grad->GetOpMem())); + net.push_back(convolution_backward_weights( + conv_pd, input->GetOpMem(), obp->GetOpMem(), output->GetOpMem(), + bias_grad->GetOpMem())); } else { - net.push_back(convolution_backward_weights(conv_pd, input->GetOpMem(), - obp->GetOpMem(), output->GetOpMem())); + net.push_back(convolution_backward_weights( + conv_pd, input->GetOpMem(), obp->GetOpMem(), output->GetOpMem())); } if (output_reorder_required) { @@ -631,27 +633,29 @@ class MklConv2DCustomBackpropFilterOp : } }; -#define REGISTER_MKL_FILTER_KERNELS(T) \ - REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklOpLabel), \ - MklConv2DCustomBackpropFilterOp);\ - REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilterWithBias") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklOpLabel), \ - MklConv2DCustomBackpropFilterOp); \ - REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklOpLabel), \ - MklDummyOp); +#define REGISTER_MKL_FILTER_KERNELS(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklConv2DBackpropFilter") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklConv2DCustomBackpropFilterOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklConv2DBackpropFilterWithBias") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklConv2DCustomBackpropFilterOp); \ + REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklDummyOp); TF_CALL_float(REGISTER_MKL_FILTER_KERNELS); #undef REGISTER_MKL_FILTER_KERNELS -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML } // namespace tensorflow diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc index ef6db58d31f..eeed0095310 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc @@ -23,6 +23,8 @@ limitations under the License. #define EIGEN_USE_THREADS #include #include +#include "mkl_dnn.h" +#include "mkl_dnn_types.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -41,22 +43,20 @@ limitations under the License. 
#include "tensorflow/core/util/tensor_format.h" #include "tensorflow/core/util/use_cudnn.h" #include "tensorflow/core/util/work_sharder.h" -#include "mkl_dnn.h" -#include "mkl_dnn_types.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" -using mkldnn::stream; -using mkldnn::prop_kind; using mkldnn::convolution_backward_data; +using mkldnn::prop_kind; +using mkldnn::stream; #endif namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template class MklConv2DCustomBackpropInputOp : public OpKernel { @@ -359,16 +359,15 @@ class MklConv2DCustomBackpropInputOp : public OpKernel { #else template -class MklConv2DCustomBackpropInputOp : - public MklConv2DBackpropCommonOp { +class MklConv2DCustomBackpropInputOp + : public MklConv2DBackpropCommonOp { public: explicit MklConv2DCustomBackpropInputOp(OpKernelConstruction* context) - : MklConv2DBackpropCommonOp(context) { } + : MklConv2DBackpropCommonOp(context) {} ~MklConv2DCustomBackpropInputOp() {} private: - const int kInputIndex_Filter = 1, - kInputIndex_InputSizes = 0, + const int kInputIndex_Filter = 1, kInputIndex_InputSizes = 0, kInputIndex_OutBackProp = 2; void ValidateMklShapes(const MklDnnShape& input_mkl_shape, const MklDnnShape& filter_mkl_shape, @@ -377,7 +376,7 @@ class MklConv2DCustomBackpropInputOp : // of the Tensor and never an actual tensor. So it will never be in MKL // layout. CHECK(!input_mkl_shape.IsMklTensor()) - << "Conv2DBackpropInput: input should not be in MKL Layout"; + << "Conv2DBackpropInput: input should not be in MKL Layout"; } size_t GetInputTensorIndexWithSizes() { return kInputIndex_InputSizes; } @@ -386,8 +385,10 @@ class MklConv2DCustomBackpropInputOp : const Tensor& input_tensor) { TensorShape input_tf_shape; CHECK_EQ(TensorShapeUtils::IsVector(input_tensor.shape()), true); - CHECK_EQ(TensorShapeUtils::MakeShape(input_tensor.vec(), - &input_tf_shape).ok(), true); + CHECK_EQ( + TensorShapeUtils::MakeShape(input_tensor.vec(), &input_tf_shape) + .ok(), + true); return input_tf_shape; } @@ -414,16 +415,13 @@ class MklConv2DCustomBackpropInputOp : return data_format; } - void CreatePrimitive(OpKernelContext* context, - const engine& cpu_engine, + void CreatePrimitive(OpKernelContext* context, const engine& cpu_engine, const convolution_forward::primitive_desc& conv_fwd_pd, MklDnnData* input, MklDnnData* filter, MklDnnData* outbackprop, MklDnnData* output, - Tensor** output_tensor, - const memory::dims& strides, + Tensor** output_tensor, const memory::dims& strides, const memory::dims& padding_l, - const memory::dims& padding_r, - padding_kind padding, + const memory::dims& padding_r, padding_kind padding, const memory::dims& bwd_output_dims, memory::format bwd_output_format) { CHECK_NOTNULL(context); @@ -434,19 +432,16 @@ class MklConv2DCustomBackpropInputOp : CHECK_NOTNULL(output_tensor); // Create convolution backward data primitive. 
- auto bwd_desc = convolution_backward_data::desc(convolution_direct, - output->GetOpMemDesc(), filter->GetOpMemDesc(), - outbackprop->GetOpMemDesc(), strides, padding_l, - padding_r, padding); - - auto bwd_pd = convolution_backward_data::primitive_desc(bwd_desc, - cpu_engine, - conv_fwd_pd); + auto bwd_desc = convolution_backward_data::desc( + convolution_direct, output->GetOpMemDesc(), filter->GetOpMemDesc(), + outbackprop->GetOpMemDesc(), strides, padding_l, padding_r, padding); + auto bwd_pd = convolution_backward_data::primitive_desc( + bwd_desc, cpu_engine, conv_fwd_pd); // Allocate output tensor in TensorFlow and MKL layout. - AllocateOutputTensor(context, bwd_pd, bwd_output_dims, - bwd_output_format, output_tensor); + AllocateOutputTensor(context, bwd_pd, bwd_output_dims, bwd_output_format, + output_tensor); CHECK_NOTNULL(*output_tensor); // Set buffer handle using allocated output tensor. output->SetUsrMemDataHandle(*output_tensor); @@ -455,50 +450,50 @@ class MklConv2DCustomBackpropInputOp : } // Allocate output tensor. - void AllocateOutputTensor(OpKernelContext* context, - const convolution_backward_data::primitive_desc& conv_pd, - const memory::dims& output_dims_mkl_order, - memory::format output_tf_format, Tensor** output_tensor) { - CHECK_NOTNULL(output_tensor); + void AllocateOutputTensor( + OpKernelContext* context, + const convolution_backward_data::primitive_desc& conv_pd, + const memory::dims& output_dims_mkl_order, + memory::format output_tf_format, Tensor** output_tensor) { + CHECK_NOTNULL(output_tensor); - // Output primitive descriptor for backward data is diff_src. - auto dst_pd = conv_pd.diff_src_primitive_desc(); + // Output primitive descriptor for backward data is diff_src. + auto dst_pd = conv_pd.diff_src_primitive_desc(); - // Allocate shape of Mkl tensor. - MklDnnShape output_mkl_shape; - output_mkl_shape.SetMklTensor(true); - output_mkl_shape.SetMklLayout(&dst_pd); - output_mkl_shape.SetElemType(MklDnnType()); - output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), - output_dims_mkl_order, output_tf_format); + // Allocate shape of Mkl tensor. + MklDnnShape output_mkl_shape; + output_mkl_shape.SetMklTensor(true); + output_mkl_shape.SetMklLayout(&dst_pd); + output_mkl_shape.SetElemType(MklDnnType()); + output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), + output_dims_mkl_order, output_tf_format); - // Allocate shape of TF tensor. - TensorShape output_tf_shape; - output_tf_shape.AddDim(dst_pd.get_size() / sizeof(T)); + // Allocate shape of TF tensor. + TensorShape output_tf_shape; + output_tf_shape.AddDim(dst_pd.get_size() / sizeof(T)); - AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape, - output_mkl_shape); + AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape, + output_mkl_shape); } // Prepare and execute net - checks for input and output reorders. void PrepareAndExecutePrimitive( - const convolution_backward_data::primitive_desc& conv_pd, - MklDnnData* filter, MklDnnData* obp, - MklDnnData* output) { + const convolution_backward_data::primitive_desc& conv_pd, + MklDnnData* filter, MklDnnData* obp, MklDnnData* output) { // Create reorders between user layout and MKL layout if it is needed and // add it to the net before convolution. 
std::vector net; filter->CheckReorderToOpMem(conv_pd.weights_primitive_desc(), &net); obp->CheckReorderToOpMem(conv_pd.diff_dst_primitive_desc(), &net); - net.push_back(convolution_backward_data(conv_pd, obp->GetOpMem(), - filter->GetOpMem(), output->GetOpMem())); + net.push_back(convolution_backward_data( + conv_pd, obp->GetOpMem(), filter->GetOpMem(), output->GetOpMem())); stream(stream::kind::eager).submit(net).wait(); } }; -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML #define REGISTER_MKL_CPU_KERNELS(T) \ REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput") \ diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index 0e77b45993c..cbda12689f8 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -18,8 +18,8 @@ limitations under the License. #include #include -#include #include +#include #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -42,14 +42,16 @@ limitations under the License. #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN + +#ifndef INTEL_MKL_ML + #include "mkldnn.hpp" -using mkldnn::stream; using mkldnn::prop_kind; +using mkldnn::stream; -using mkldnn::convolution_forward; using mkldnn::convolution_direct; +using mkldnn::convolution_forward; #else #include "mkl_dnn.h" #include "mkl_dnn_types.h" @@ -59,8 +61,8 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -// For now, MKL-ML is default. So making MKL-DNN not a default choice. -#ifndef INTEL_MKL_DNN +// MKL-DNN is now default. MKL-ML must be specified explicitly. +#ifdef INTEL_MKL_ML template class MklConv2DOp : public OpKernel { @@ -116,18 +118,19 @@ class MklConv2DOp : public OpKernel { filter.shape().DebugString())); for (int i = 0; i < 3; i++) { - OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i), - std::numeric_limits::max()), - errors::InvalidArgument("filter too large")); + OP_REQUIRES( + context, + FastBoundsCheck(filter.dim_size(i), std::numeric_limits::max()), + errors::InvalidArgument("filter too large")); } const int64 input_depth = input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'C') : GetTensorDim(input, data_format_, 'C'); - OP_REQUIRES( - context, input_depth == filter.dim_size(2), - errors::InvalidArgument("input and filter must have the same depth: ", - input_depth, " vs ", filter.dim_size(2))); + OP_REQUIRES(context, input_depth == filter.dim_size(2), + errors::InvalidArgument( + "input and filter must have the same depth: ", input_depth, + " vs ", filter.dim_size(2))); // The last dimension for filter is out_depth. const int out_depth = static_cast(filter.dim_size(3)); @@ -136,9 +139,10 @@ class MklConv2DOp : public OpKernel { const int64 input_rows_raw = input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'H') : GetTensorDim(input, data_format_, 'H'); - OP_REQUIRES(context, FastBoundsCheck(input_rows_raw, - std::numeric_limits::max()), - errors::InvalidArgument("Input rows too large")); + OP_REQUIRES( + context, + FastBoundsCheck(input_rows_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input rows too large")); const int input_rows = static_cast(input_rows_raw); const int filter_rows = static_cast(filter.dim_size(0)); @@ -147,9 +151,10 @@ class MklConv2DOp : public OpKernel { const int64 input_cols_raw = input_in_mkl_format ? 
GetMklTensorDim(mkl_context.input_shape, 'W') : GetTensorDim(input, data_format_, 'W'); - OP_REQUIRES(context, FastBoundsCheck(input_cols_raw, - std::numeric_limits::max()), - errors::InvalidArgument("Input cols too large")); + OP_REQUIRES( + context, + FastBoundsCheck(input_cols_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input cols too large")); const int input_cols = static_cast(input_cols_raw); const int filter_cols = static_cast(filter.dim_size(1)); @@ -157,9 +162,10 @@ class MklConv2DOp : public OpKernel { const int64 input_batch_raw = input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'N') : GetTensorDim(input, data_format_, 'N'); - OP_REQUIRES(context, FastBoundsCheck(input_batch_raw, - std::numeric_limits::max()), - errors::InvalidArgument("batch is too large")); + OP_REQUIRES( + context, + FastBoundsCheck(input_batch_raw, std::numeric_limits::max()), + errors::InvalidArgument("batch is too large")); const int batch = static_cast(input_batch_raw); // For now we take the stride from the second and third dimensions only (we @@ -313,8 +319,7 @@ class MklConv2DOp : public OpKernel { // Temp tensor used to allocate tmp buffers Tensor mkl_tmp_input_buf_tensor, mkl_tmp_filter_buf_tensor, mkl_tmp_bias_buf_tensor; - mkl_context.MklPrepareConvolutionInputs(context, - &mkl_tmp_input_buf_tensor, + mkl_context.MklPrepareConvolutionInputs(context, &mkl_tmp_input_buf_tensor, &mkl_tmp_filter_buf_tensor, &mkl_tmp_bias_buf_tensor); @@ -398,8 +403,9 @@ class MklConv2DOp : public OpKernel { mkl_convert_input = !dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input); if (mkl_convert_input) { - CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, - lt_input, mkl_lt_internal_input), E_SUCCESS); + CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, lt_input, + mkl_lt_internal_input), + E_SUCCESS); AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input, &mkl_buf_convert_input); CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input, @@ -517,8 +523,8 @@ class MklConv2DOp : public OpKernel { GetMklShape(context, kInputIndex_Src, &src_mkl_shape); GetMklShape(context, kInputIndex_Filter, &filter_mkl_shape); OP_REQUIRES(context, filter_mkl_shape.IsMklTensor() == false, - errors::InvalidArgument("Filter should not be in " - "Mkl Layout")); + errors::InvalidArgument("Filter should not be in " + "Mkl Layout")); MklDnnData src(&cpu_engine); MklDnnData filter(&cpu_engine); @@ -531,11 +537,10 @@ class MklConv2DOp : public OpKernel { MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_); auto src_tf_shape = GetTfShape(context, kInputIndex_Src); auto filter_tf_shape = GetTfShape(context, kInputIndex_Filter); - conv_utl.GetConvFwdSizesInMklOrder(src_tf_shape, filter_tf_shape, - &src_dims, &filter_dims, &strides, - &output_dims_tf_order, - &output_dims_mkl_order, &padding_l, - &padding_r); + conv_utl.GetConvFwdSizesInMklOrder( + src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides, + &output_dims_tf_order, &output_dims_mkl_order, &padding_l, + &padding_r); if (!context->status().ok()) return; // Check for corner case - if there is nothing to compute, return. @@ -543,21 +548,20 @@ class MklConv2DOp : public OpKernel { // Corner cases: output with 0 elements and 0 batch size. 
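// Illustrative aside (not part of this patch): the sizes computed by
// GetConvFwdSizesInMklOrder come down to per-dimension arithmetic of the
// general shape below; a zero-element output (handled just after this)
// arises when this arithmetic yields 0 for a spatial dimension or the batch
// size is 0. These helpers are a sketch of the usual VALID/SAME formulas,
// not code from this file.
inline int64 ConvOutSizeValid(int64 in, int64 window, int64 stride) {
  // valid for window <= in
  return (in - window + stride) / stride;  // == ceil((in - window + 1) / stride)
}
inline int64 ConvOutSizeSame(int64 in, int64 stride) {
  return (in + stride - 1) / stride;  // == ceil(in / stride)
}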
Tensor* output_tensor = nullptr; - if (output_tf_shape.num_elements() == 0 || - output_dims_tf_order[0] == 0) { + if (output_tf_shape.num_elements() == 0 || output_dims_tf_order[0] == 0) { // TODO(jbobba): Verify correctness here // Need semantics for Null MKL tensor MklDnnShape output_mkl_shape; output_mkl_shape.SetMklTensor(false); AllocateOutputSetMklShape(context, kOutputIndex_Dst, &output_tensor, - src_tf_shape, output_mkl_shape); + src_tf_shape, output_mkl_shape); // MklConv2D also outputs converted filter as 2nd output of Conv2D. filter_mkl_shape.SetMklTensor(false); Tensor* output_filter_tensor = nullptr; AllocateOutputSetMklShape(context, kOutputIndex_Filter, - &output_filter_tensor, - filter_tf_shape, filter_mkl_shape); + &output_filter_tensor, filter_tf_shape, + filter_mkl_shape); return; } @@ -570,14 +574,15 @@ class MklConv2DOp : public OpKernel { // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's // layout (NHWC or NCHW depending on data format). auto src_md = src_mkl_shape.IsMklTensor() - ? src_mkl_shape.GetMklLayout() - : memory::desc(src_dims, MklDnnType(), tf_fmt); + ? src_mkl_shape.GetMklLayout() + : memory::desc(src_dims, MklDnnType(), tf_fmt); src.SetUsrMem(src_md, &src_tensor); // Although filter shape (filter_dims) required is in MKL-DNN order, // the layout is Tensorflow's layout (HWIO). auto filter_md = filter_mkl_shape.IsMklTensor() // Should NEVER be true - ? filter_mkl_shape.GetMklLayout() - : memory::desc(filter_dims, MklDnnType(), memory::format::hwio); + ? filter_mkl_shape.GetMklLayout() + : memory::desc(filter_dims, MklDnnType(), + memory::format::hwio); filter.SetUsrMem(filter_md, &filter_tensor); // Set output shape (output_dims) required in MKL-DNN order. @@ -601,34 +606,13 @@ class MklConv2DOp : public OpKernel { bias.SetOpMemDesc(bias_size, memory::format::any); // Create convolution primitive with Bias. - auto conv_desc = convolution_forward::desc(prop_kind::forward, - convolution_direct, src.GetOpMemDesc(), filter.GetOpMemDesc(), - bias.GetOpMemDesc(), output.GetOpMemDesc(), strides, - padding_l, padding_r, TFPaddingToMklDnnPadding(padding_)); + auto conv_desc = convolution_forward::desc( + prop_kind::forward, convolution_direct, src.GetOpMemDesc(), + filter.GetOpMemDesc(), bias.GetOpMemDesc(), output.GetOpMemDesc(), + strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_)); - auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc, - cpu_engine); - AllocateOutputTensor(context, conv_prim_desc, - output_dims_mkl_order, tf_fmt, &output_tensor); - // Set data handle for output. - output.SetUsrMemDataHandle(output_tensor); - - Tensor* filter_out_tensor = nullptr; - AllocateFilterOutputTensor(context, conv_prim_desc, - TFShapeToMklDnnDims(filter_tf_shape), - &filter_out_tensor); - - PrepareAndExecuteNet(conv_prim_desc, &src, &filter, - &bias, &output, filter_out_tensor); - } else { - // Create convolution primitive without Bias. - auto conv_desc = convolution_forward::desc(prop_kind::forward, - convolution_direct, src.GetOpMemDesc(), filter.GetOpMemDesc(), - output.GetOpMemDesc(), strides, padding_l, padding_r, - TFPaddingToMklDnnPadding(padding_)); - - auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc, - cpu_engine); + auto conv_prim_desc = + convolution_forward::primitive_desc(conv_desc, cpu_engine); AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order, tf_fmt, &output_tensor); // Set data handle for output. 
@@ -636,18 +620,39 @@ class MklConv2DOp : public OpKernel { Tensor* filter_out_tensor = nullptr; AllocateFilterOutputTensor(context, conv_prim_desc, - TFShapeToMklDnnDims(filter_tf_shape), - &filter_out_tensor); - PrepareAndExecuteNet(conv_prim_desc, &src, &filter, - nullptr, &output, filter_out_tensor); + TFShapeToMklDnnDims(filter_tf_shape), + &filter_out_tensor); + + PrepareAndExecuteNet(conv_prim_desc, &src, &filter, &bias, &output, + filter_out_tensor); + } else { + // Create convolution primitive without Bias. + auto conv_desc = convolution_forward::desc( + prop_kind::forward, convolution_direct, src.GetOpMemDesc(), + filter.GetOpMemDesc(), output.GetOpMemDesc(), strides, padding_l, + padding_r, TFPaddingToMklDnnPadding(padding_)); + + auto conv_prim_desc = + convolution_forward::primitive_desc(conv_desc, cpu_engine); + AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order, + tf_fmt, &output_tensor); + // Set data handle for output. + output.SetUsrMemDataHandle(output_tensor); + + Tensor* filter_out_tensor = nullptr; + AllocateFilterOutputTensor(context, conv_prim_desc, + TFShapeToMklDnnDims(filter_tf_shape), + &filter_out_tensor); + PrepareAndExecuteNet(conv_prim_desc, &src, &filter, nullptr, &output, + filter_out_tensor); } - } catch (mkldnn::error &e) { + } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + std::string(e.message) + - ", in file " + std::string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", error_msg)); + ", message: " + std::string(e.message) + ", in file " + + std::string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } @@ -655,71 +660,67 @@ class MklConv2DOp : public OpKernel { std::vector strides_; Padding padding_; TensorFormat data_format_; - const int kInputIndex_Src = 0, - kInputIndex_Filter = 1, - kInputIndex_Bias = 2; + const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2; const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1; // Allocate output tensor. void AllocateOutputTensor( - OpKernelContext* context, - const convolution_forward::primitive_desc& conv_prim_desc, - const memory::dims& output_dims_mkl_order, - memory::format output_tf_format, Tensor** output_tensor) { - CHECK_NOTNULL(output_tensor); - auto dst_pd = conv_prim_desc.dst_primitive_desc(); + OpKernelContext* context, + const convolution_forward::primitive_desc& conv_prim_desc, + const memory::dims& output_dims_mkl_order, + memory::format output_tf_format, Tensor** output_tensor) { + CHECK_NOTNULL(output_tensor); + auto dst_pd = conv_prim_desc.dst_primitive_desc(); - // Allocate shape of Mkl tensor. - MklDnnShape output_mkl_shape; - output_mkl_shape.SetMklTensor(true); - output_mkl_shape.SetMklLayout(&dst_pd); - output_mkl_shape.SetElemType(MklDnnType()); - output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), - output_dims_mkl_order, output_tf_format); + // Allocate shape of Mkl tensor. + MklDnnShape output_mkl_shape; + output_mkl_shape.SetMklTensor(true); + output_mkl_shape.SetMklLayout(&dst_pd); + output_mkl_shape.SetElemType(MklDnnType()); + output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), + output_dims_mkl_order, output_tf_format); - // Allocate shape of TF tensor. - TensorShape output_tf_shape; - output_tf_shape.AddDim((dst_pd.get_size() / sizeof(T))); + // Allocate shape of TF tensor. 
+ TensorShape output_tf_shape; + output_tf_shape.AddDim((dst_pd.get_size() / sizeof(T))); - AllocateOutputSetMklShape(context, kOutputIndex_Dst, output_tensor, - output_tf_shape, output_mkl_shape); + AllocateOutputSetMklShape(context, kOutputIndex_Dst, output_tensor, + output_tf_shape, output_mkl_shape); } // Allocate output tensor. void AllocateFilterOutputTensor( - OpKernelContext* context, - const convolution_forward::primitive_desc& conv_prim_desc, - const memory::dims& filter_dims_tf_order, - Tensor** filter_tensor) { - CHECK_NOTNULL(filter_tensor); - auto filter_pd = conv_prim_desc.weights_primitive_desc(); + OpKernelContext* context, + const convolution_forward::primitive_desc& conv_prim_desc, + const memory::dims& filter_dims_tf_order, Tensor** filter_tensor) { + CHECK_NOTNULL(filter_tensor); + auto filter_pd = conv_prim_desc.weights_primitive_desc(); - // Allocate shape of Mkl tensor. - MklDnnShape filter_mkl_shape; - filter_mkl_shape.SetMklTensor(true); - filter_mkl_shape.SetMklLayout(&filter_pd); - filter_mkl_shape.SetElemType(MklDnnType()); + // Allocate shape of Mkl tensor. + MklDnnShape filter_mkl_shape; + filter_mkl_shape.SetMklTensor(true); + filter_mkl_shape.SetMklLayout(&filter_pd); + filter_mkl_shape.SetElemType(MklDnnType()); - // The format of the filter is actually OIhw8i8o, but TF doesn't support - // this format. Just use format::blocked for now because the layout - // is stored in the MKL data. - filter_mkl_shape.SetTfLayout(filter_dims_tf_order.size(), - filter_dims_tf_order, memory::format::blocked); + // The format of the filter is actually OIhw8i8o, but TF doesn't support + // this format. Just use format::blocked for now because the layout + // is stored in the MKL data. + filter_mkl_shape.SetTfLayout(filter_dims_tf_order.size(), + filter_dims_tf_order, memory::format::blocked); - // Allocate the data space for the filter to propagate as TF tensor. - TensorShape filter_tf_shape; - filter_tf_shape.AddDim((filter_pd.get_size() / sizeof(T))); + // Allocate the data space for the filter to propagate as TF tensor. + TensorShape filter_tf_shape; + filter_tf_shape.AddDim((filter_pd.get_size() / sizeof(T))); - AllocateOutputSetMklShape(context, kOutputIndex_Filter, filter_tensor, - filter_tf_shape, filter_mkl_shape); + AllocateOutputSetMklShape(context, kOutputIndex_Filter, filter_tensor, + filter_tf_shape, filter_mkl_shape); } // Prepare and execute net - checks for input and output reorders. void PrepareAndExecuteNet( - const convolution_forward::primitive_desc& conv_prim_desc, - MklDnnData* src, MklDnnData* filter, - MklDnnData* bias, MklDnnData* output, - Tensor* filter_out_tensor) { + const convolution_forward::primitive_desc& conv_prim_desc, + MklDnnData* src, MklDnnData* filter, MklDnnData* bias, + MklDnnData* output, Tensor* filter_out_tensor) { CHECK_NOTNULL(filter_out_tensor); // Create reorders between user layout and MKL layout if it is needed and @@ -731,18 +732,20 @@ class MklConv2DOp : public OpKernel { // rather than re-order to a temp buffer, reorder directly to the // filter output tensor filter->CheckReorderToOpMem(conv_prim_desc.weights_primitive_desc(), - filter->GetTensorBuffer(filter_out_tensor), &net); + filter->GetTensorBuffer(filter_out_tensor), + &net); // Create convolution primitive and add it to net. 
if (bias) { CHECK_EQ(biasEnabled, true); net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(), - filter->GetOpMem(), bias->GetOpMem(), - output->GetOpMem())); + filter->GetOpMem(), bias->GetOpMem(), + output->GetOpMem())); } else { CHECK_EQ(biasEnabled, false); net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(), - filter->GetOpMem(), output->GetOpMem())); + filter->GetOpMem(), + output->GetOpMem())); } stream(stream::kind::eager).submit(net).wait(); diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h index c6456bd5c33..9dd88221a84 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.h +++ b/tensorflow/core/kernels/mkl_conv_ops.h @@ -16,9 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_ #define TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_ -#include #include #include +#include #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -27,8 +27,8 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_slice.h" #include "tensorflow/core/kernels/bounds_check.h" -#include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/kernels/conv_grad_ops.h" +#include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/strings/numbers.h" @@ -40,19 +40,19 @@ limitations under the License. #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" -using mkldnn::stream; using mkldnn::prop_kind; +using mkldnn::stream; -using mkldnn::convolution_forward; using mkldnn::convolution_direct; +using mkldnn::convolution_forward; #endif namespace tensorflow { -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML class MklDnnConvUtil { protected: @@ -63,13 +63,13 @@ class MklDnnConvUtil { public: MklDnnConvUtil(OpKernelContext* context, const std::vector& strides, - Padding pad, TensorFormat fm) : context_(context), - strides_(strides), padding_(pad), data_format_(fm) {} + Padding pad, TensorFormat fm) + : context_(context), strides_(strides), padding_(pad), data_format_(fm) {} virtual ~MklDnnConvUtil() { context_ = nullptr; } // Calculate Convolution strides - virtual inline void GetStridesInMklOrder(memory::dims *strides) { + virtual inline void GetStridesInMklOrder(memory::dims* strides) { // For now we take the stride from the second and third dimensions only // (we do not support striding on the batch or depth dimension). CHECK_NOTNULL(strides); @@ -82,14 +82,14 @@ class MklDnnConvUtil { // requires input in NCHW format. Function does not return anything. // But errors arising from sanity checks are returned in context's // status. 
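// For example (illustrative shapes only): a TF NHWC input of
// [8, 224, 224, 3] comes back from this helper in MKL-DNN's NCHW order as
// input_dims = {8, 3, 224, 224}. Batch and spatial extents are also
// range-checked via the CHECK_BOUNDS macro defined below before the cast
// to int.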
- virtual inline void - GetInputSizeInMklOrder(const TensorShape& input_shape, - memory::dims *input_dims) { - #define CHECK_BOUNDS(val, err_msg) do { \ - OP_REQUIRES(context_, FastBoundsCheck(val, \ - std::numeric_limits::max()), \ - errors::InvalidArgument(err_msg)); \ - }while(0) + virtual inline void GetInputSizeInMklOrder(const TensorShape& input_shape, + memory::dims* input_dims) { +#define CHECK_BOUNDS(val, err_msg) \ + do { \ + OP_REQUIRES(context_, \ + FastBoundsCheck(val, std::numeric_limits::max()), \ + errors::InvalidArgument(err_msg)); \ + } while (0) CHECK_NOTNULL(input_dims); @@ -112,7 +112,7 @@ class MklDnnConvUtil { CHECK_BOUNDS(input_batch_raw, "Input batch too large"); int input_batch = static_cast(input_batch_raw); - #undef CHECK_BOUNDS +#undef CHECK_BOUNDS // MKL-DNN always requires input in NCHW format. std::vector mkldnn_sizes(4, -1); @@ -138,10 +138,9 @@ class MklDnnConvUtil { // forward gets actual tensor as input). // // TODO(nhasabni): Add similar function for input and filter in MklShape. - virtual inline void - GetFilterSizeInMklOrder(const TensorShape& input_shape, - const TensorShape& filter_shape, - memory::dims *filter_dims) { + virtual inline void GetFilterSizeInMklOrder(const TensorShape& input_shape, + const TensorShape& filter_shape, + memory::dims* filter_dims) { CHECK_NOTNULL(filter_dims); OP_REQUIRES(context_, filter_shape.dims() == 4, @@ -149,17 +148,18 @@ class MklDnnConvUtil { filter_shape.DebugString())); for (int i = 0; i < 3; i++) { - OP_REQUIRES(context_, FastBoundsCheck(filter_shape.dim_size(i), - std::numeric_limits::max()), - errors::InvalidArgument("filter too large")); + OP_REQUIRES(context_, + FastBoundsCheck(filter_shape.dim_size(i), + std::numeric_limits::max()), + errors::InvalidArgument("filter too large")); } int input_depth = GetTensorDim(input_shape, data_format_, 'C'); - OP_REQUIRES( - context_, input_depth == filter_shape.dim_size(2), - errors::InvalidArgument("input and filter must have the same depth: ", - input_depth, " vs ", filter_shape.dim_size(2))); + OP_REQUIRES(context_, input_depth == filter_shape.dim_size(2), + errors::InvalidArgument( + "input and filter must have the same depth: ", input_depth, + " vs ", filter_shape.dim_size(2))); // TF filter is always in (rows, cols, in_depth, out_depth) order. int filter_rows = static_cast(filter_shape.dim_size(0)); @@ -182,25 +182,24 @@ class MklDnnConvUtil { // requires filter in OIHW format. Function does not return anything. // But errors arising from sanity checks are returned in context's // status. - virtual inline void - GetFilterSizeInMklOrder(size_t src_index, size_t filter_index, - memory::dims *filter_dims) { + virtual inline void GetFilterSizeInMklOrder(size_t src_index, + size_t filter_index, + memory::dims* filter_dims) { CHECK_NOTNULL(filter_dims); GetFilterSizeInMklOrder(GetTfShape(context_, src_index), - GetTfShape(context_, filter_index), - filter_dims); + GetTfShape(context_, filter_index), filter_dims); } // Calculate Bias size for 2D Convolution. Function does not return // anything, but sets error in context status. 
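// The bias is expected to be 1-D with one value per output channel, so for
// a convolution producing 64 output channels (illustrative value) this
// simply yields *bias_dims = {64}.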
- virtual inline void - GetBiasSizeInMklOrder(size_t bias_index, memory::dims *bias_dims) { + virtual inline void GetBiasSizeInMklOrder(size_t bias_index, + memory::dims* bias_dims) { const Tensor& bias = MklGetInput(context_, bias_index); OP_REQUIRES(context_, bias.dims() == 1, errors::InvalidArgument("bias must be 1-dimensional: ", bias.shape().DebugString())); - *bias_dims = { static_cast(bias.dim_size(0)) }; + *bias_dims = {static_cast(bias.dim_size(0))}; } // Function to calculate output and padding size for 2D convolution. @@ -212,13 +211,11 @@ class MklDnnConvUtil { // status is returned via context status. // // TODO(nhasabni): Add similar function for input and filter in MklShape. - virtual inline void - GetOutputAndPadSizeInMklOrder(const TensorShape& input_shape, - const TensorShape& filter_shape, - const memory::dims& strides, - memory::dims *output_dims_tf_order, - memory::dims *output_dims_mkl_order, - memory::dims *pad_l, memory::dims *pad_r) { + virtual inline void GetOutputAndPadSizeInMklOrder( + const TensorShape& input_shape, const TensorShape& filter_shape, + const memory::dims& strides, memory::dims* output_dims_tf_order, + memory::dims* output_dims_mkl_order, memory::dims* pad_l, + memory::dims* pad_r) { CHECK_NOTNULL(output_dims_tf_order); CHECK_NOTNULL(output_dims_mkl_order); CHECK_NOTNULL(pad_l); @@ -244,16 +241,16 @@ class MklDnnConvUtil { int64 out_rows = 0, out_cols = 0; int64 pad_top = 0, pad_bottom = 0, pad_left, pad_right; - OP_REQUIRES_OK(context_, - GetWindowedOutputSizeVerbose(input_rows, filter_rows, stride_rows, - padding_, &out_rows, &pad_top, &pad_bottom)); - OP_REQUIRES_OK(context_, - GetWindowedOutputSizeVerbose(input_cols, filter_cols, stride_cols, - padding_, &out_cols, &pad_left, &pad_right)); + OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose( + input_rows, filter_rows, stride_rows, padding_, + &out_rows, &pad_top, &pad_bottom)); + OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose( + input_cols, filter_cols, stride_cols, padding_, + &out_cols, &pad_left, &pad_right)); // Tensorflow output is in data_format order. (NHWC or NCHW) - TensorShape out_shape = ShapeFromFormat(data_format_, out_batch, - out_rows, out_cols, out_depth); + TensorShape out_shape = + ShapeFromFormat(data_format_, out_batch, out_rows, out_cols, out_depth); *output_dims_tf_order = TFShapeToMklDnnDims(out_shape); // MKL-DNN always needs output in NCHW format. @@ -273,12 +270,10 @@ class MklDnnConvUtil { // See comment on GetConvOutputAndPadSizeInMklOrder for parameters. // // Function does not return anything, but sets error in context status. 
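// As a reminder of what GetWindowedOutputSizeVerbose computes (standard
// TensorFlow sizing, restated here only for context):
//   VALID : out = ceil((in - filter + 1) / stride), no padding;
//   SAME  : out = ceil(in / stride), with the required padding split
//           between pad_top/pad_bottom and pad_left/pad_right.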
- inline void - GetOutputAndPadSizeInMklOrder(size_t src_index, size_t filter_index, - const memory::dims& strides, - memory::dims *output_dims_tf_order, - memory::dims *output_dims_mkl_order, - memory::dims *pad_l, memory::dims *pad_r) { + inline void GetOutputAndPadSizeInMklOrder( + size_t src_index, size_t filter_index, const memory::dims& strides, + memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order, + memory::dims* pad_l, memory::dims* pad_r) { CHECK_NOTNULL(output_dims_tf_order); CHECK_NOTNULL(output_dims_mkl_order); CHECK_NOTNULL(pad_l); @@ -289,11 +284,11 @@ class MklDnnConvUtil { OP_REQUIRES(context_, input_tf_shape.dims() == 4, errors::InvalidArgument("input must be 4-dimensional", - input_tf_shape.DebugString())); + input_tf_shape.DebugString())); - GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape, - strides, output_dims_tf_order, - output_dims_mkl_order, pad_l, pad_r); + GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape, strides, + output_dims_tf_order, output_dims_mkl_order, + pad_l, pad_r); } // Wrapper function to calculate input, filter, and output sizes of @@ -302,15 +297,12 @@ class MklDnnConvUtil { // also calculates strides and paddings for 2D Convolution. // // Function does not return anything, but sets error in context status. - inline void GetConvFwdSizesInMklOrder(const TensorShape& input_shape, - const TensorShape& filter_shape, - memory::dims *input_dims, - memory::dims *filter_dims, - memory::dims *strides, - memory::dims *output_dims_tf_order, - memory::dims *output_dims_mkl_order, - memory::dims *pad_l, - memory::dims *pad_r) { + inline void GetConvFwdSizesInMklOrder( + const TensorShape& input_shape, const TensorShape& filter_shape, + memory::dims* input_dims, memory::dims* filter_dims, + memory::dims* strides, memory::dims* output_dims_tf_order, + memory::dims* output_dims_mkl_order, memory::dims* pad_l, + memory::dims* pad_r) { CHECK_NOTNULL(input_dims); CHECK_NOTNULL(filter_dims); CHECK_NOTNULL(strides); @@ -325,8 +317,7 @@ class MklDnnConvUtil { if (!context_->status().ok()) return; GetStridesInMklOrder(strides); GetOutputAndPadSizeInMklOrder(input_shape, filter_shape, *strides, - output_dims_tf_order, - output_dims_mkl_order, + output_dims_tf_order, output_dims_mkl_order, pad_l, pad_r); if (!context_->status().ok()) return; } @@ -337,7 +328,7 @@ class MklDnnConvUtil { ///////////////////////////////////////////////////////////////////// template -class MklConv2DBackpropCommonOp : public OpKernel { +class MklConv2DBackpropCommonOp : public OpKernel { public: ~MklConv2DBackpropCommonOp() {} explicit MklConv2DBackpropCommonOp(OpKernelConstruction* context) @@ -397,12 +388,11 @@ class MklConv2DBackpropCommonOp : public OpKernel { outbprop_tf_shape.num_elements() == 0) { MklDnnShape output_mkl_shape; output_mkl_shape.SetMklTensor(false); - TensorShape output_tf_shape = GetOutputTfShape(input_tf_shape, - filter_tf_shape, - outbprop_tf_shape); + TensorShape output_tf_shape = GetOutputTfShape( + input_tf_shape, filter_tf_shape, outbprop_tf_shape); const int kOutputIdx = 0; AllocateOutputSetMklShape(context, kOutputIdx, &output_tensor, - output_tf_shape, output_mkl_shape); + output_tf_shape, output_mkl_shape); CHECK_NOTNULL(output_tensor); // if output tensor has more than 0 elements, we need to 0 them out. @@ -421,12 +411,10 @@ class MklConv2DBackpropCommonOp : public OpKernel { // Get forward convolution parameters. 
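// conv_utl below computes, in one call, the MKL-order input and filter
// dims, the strides, the output dims in both TF and MKL order, and the
// left/right padding. The backward ops need all of these because, as noted
// further down, MKL-DNN's backward primitives are constructed from the
// corresponding forward descriptor.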
MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_); - conv_utl.GetConvFwdSizesInMklOrder(input_tf_shape, filter_tf_shape, - &fwd_input_dims, &fwd_filter_dims, - &strides, - &fwd_output_dims_tf_order, - &fwd_output_dims, - &padding_l, &padding_r); + conv_utl.GetConvFwdSizesInMklOrder( + input_tf_shape, filter_tf_shape, &fwd_input_dims, &fwd_filter_dims, + &strides, &fwd_output_dims_tf_order, &fwd_output_dims, &padding_l, + &padding_r); if (!context->status().ok()) return; // Create Convolution forward descriptor since Convolution backward @@ -437,20 +425,22 @@ class MklConv2DBackpropCommonOp : public OpKernel { // construct input TF layout. For TF layout, although input shape // required is in MKL-DNN order, the layout is Tensorflow's layout // (NHWC or NCHW depending on data format). - auto fwd_input_md = input_mkl_shape.IsMklTensor() ? - input_mkl_shape.GetMklLayout() : - memory::desc(fwd_input_dims, MklDnnType(), tf_fmt); + auto fwd_input_md = + input_mkl_shape.IsMklTensor() + ? input_mkl_shape.GetMklLayout() + : memory::desc(fwd_input_dims, MklDnnType(), tf_fmt); // If filter is in MKL layout, then simply grab filter layout; otherwise // construct filter in TF layout. For TF layout, filter is in HWIO format. - auto fwd_filter_md = filter_mkl_shape.IsMklTensor() ? - filter_mkl_shape.GetMklLayout() : - memory::desc(fwd_filter_dims, MklDnnType(), - memory::format::hwio); + auto fwd_filter_md = filter_mkl_shape.IsMklTensor() + ? filter_mkl_shape.GetMklLayout() + : memory::desc(fwd_filter_dims, MklDnnType(), + memory::format::hwio); // Tensorflow Output of Conv2D is in data_format order. auto fwd_out_md = memory::desc(fwd_output_dims, MklDnnType(), tf_fmt); - auto fwd_desc = convolution_forward::desc(prop_kind::forward, - convolution_direct, fwd_input_md, fwd_filter_md, fwd_out_md, - strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_)); + auto fwd_desc = convolution_forward::desc( + prop_kind::forward, convolution_direct, fwd_input_md, fwd_filter_md, + fwd_out_md, strides, padding_l, padding_r, + TFPaddingToMklDnnPadding(padding_)); auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine); // Create memory for user data. Describe how the inputs and outputs of @@ -495,17 +485,16 @@ class MklConv2DBackpropCommonOp : public OpKernel { // Operator-specific call to create and execute primitive. CreatePrimitive(context, cpu_engine, fwd_pd, &input, &filter, - &outbackprop, &output, &output_tensor, - strides, padding_l, padding_r, - TFPaddingToMklDnnPadding(padding_), + &outbackprop, &output, &output_tensor, strides, padding_l, + padding_r, TFPaddingToMklDnnPadding(padding_), bwd_output_dims, bwd_output_format); - } catch (mkldnn::error &e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, errors::Aborted("Operation received an exception:", - error_msg)); + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } @@ -523,11 +512,11 @@ class MklConv2DBackpropCommonOp : public OpKernel { /// Get TensorFlow shape of input tensor. 
virtual TensorShape MakeInputTfShape(OpKernelContext* context, - const Tensor& input_tensor) = 0; + const Tensor& input_tensor) = 0; /// Get TensorFlow shape of filter tensor. virtual TensorShape MakeFilterTfShape(OpKernelContext* context, - const Tensor& filter_tensor) = 0; + const Tensor& filter_tensor) = 0; /// Get the TensorFlow shape of output tensor. virtual TensorShape GetOutputTfShape(const TensorShape& input_shape, @@ -536,9 +525,9 @@ class MklConv2DBackpropCommonOp : public OpKernel { /// Get shape of output in MKL-DNN order. Computes shape of output from /// input shape (fwd_input_dims) and filter shape (fwd_filter_dims). - virtual - const memory::dims& GetOutputDims(const memory::dims& fwd_input_dims, - const memory::dims& fwd_filter_dims) = 0; + virtual const memory::dims& GetOutputDims( + const memory::dims& fwd_input_dims, + const memory::dims& fwd_filter_dims) = 0; /// Get data_format of output in MKL-DNN order. If output data format is /// same as input data format, then it simply returns value of data_format @@ -546,24 +535,25 @@ class MklConv2DBackpropCommonOp : public OpKernel { virtual memory::format GetOutputFormat(const memory::format data_format) = 0; /// Create and execute the primitive storing output in the output_tensor. - virtual void CreatePrimitive(OpKernelContext* context, - const engine& cpu_engine, - const convolution_forward::primitive_desc& conv_fwd_pd, - MklDnnData* input, MklDnnData* filter, MklDnnData* outbackprop, - MklDnnData* output, Tensor** output_tensor, const memory::dims& strides, - const memory::dims& padding_l, const memory::dims& padding_r, - padding_kind padding, const memory::dims& bwd_output_dims, - memory::format bwd_output_format) = 0; + virtual void CreatePrimitive( + OpKernelContext* context, const engine& cpu_engine, + const convolution_forward::primitive_desc& conv_fwd_pd, + MklDnnData* input, MklDnnData* filter, MklDnnData* outbackprop, + MklDnnData* output, Tensor** output_tensor, + const memory::dims& strides, const memory::dims& padding_l, + const memory::dims& padding_r, padding_kind padding, + const memory::dims& bwd_output_dims, + memory::format bwd_output_format) = 0; // Get the data_format {NCHW, NHWC} - TensorFormat GetTFDataFormat () { return data_format_; } + TensorFormat GetTFDataFormat() { return data_format_; } private: std::vector strides_; Padding padding_; TensorFormat data_format_; }; -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML ///////////////////////////////////////////////////////////////////// /// Dummy Mkl op that is just used for operators that are intermediate @@ -575,12 +565,12 @@ class MklDummyOp : public OpKernel { public: ~MklDummyOp() {} - explicit MklDummyOp(OpKernelConstruction* context) : - OpKernel(context) {} + explicit MklDummyOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { - TF_CHECK_OK(errors::Unimplemented("This is a dummy op." - "It should not have been invoked.")); + TF_CHECK_OK( + errors::Unimplemented("This is a dummy op." + "It should not have been invoked.")); } }; diff --git a/tensorflow/core/kernels/mkl_cwise_ops_common.cc b/tensorflow/core/kernels/mkl_cwise_ops_common.cc index c065724e0db..58f0c30f32b 100644 --- a/tensorflow/core/kernels/mkl_cwise_ops_common.cc +++ b/tensorflow/core/kernels/mkl_cwise_ops_common.cc @@ -1,4 +1,4 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0(the "License"); you may not use this file except in compliance with the License. diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc index 8340a91d059..8313224d7fe 100644 --- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc +++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc @@ -25,15 +25,15 @@ limitations under the License. #include "mkl_dnn_types.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" -using mkldnn::stream; -using mkldnn::prop_kind; -using mkldnn::use_scale_shift; -using mkldnn::use_global_stats; -using mkldnn::batch_normalization_forward; using mkldnn::batch_normalization_backward; +using mkldnn::batch_normalization_forward; +using mkldnn::prop_kind; +using mkldnn::stream; +using mkldnn::use_global_stats; +using mkldnn::use_scale_shift; #endif // TODO(inteltf) Address comments from PR 8968. @@ -41,7 +41,7 @@ using mkldnn::batch_normalization_backward; namespace tensorflow { using CPUDevice = Eigen::ThreadPoolDevice; -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template class MklFusedBatchNormOp : public OpKernel { @@ -601,7 +601,7 @@ class MklFusedBatchNormGradOp : public OpKernel { mkl_res_batchnorm_bwd[dnnResourceSrc] = (mkl_convert_input) ? mkl_buf_converted_input : mkl_buf_input; - bool mkl_convert_out_backprop; + bool mkl_convert_out_backprop; dnnPrimitive_t mkl_prim_convert_out_backprop = nullptr; dnnLayout_t mkl_lt_internal_out_backprop = nullptr; void* mkl_buf_converted_out_backprop = nullptr; @@ -683,7 +683,7 @@ class MklFusedBatchNormGradOp : public OpKernel { }; #endif -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML template class MklFusedBatchNormOp : public OpKernel { @@ -709,12 +709,11 @@ class MklFusedBatchNormOp : public OpKernel { const size_t kMeanIndex = 3; // index of est_mean tensor const size_t kVarianceIndex = 4; // index of est_variance tensor - const Tensor& src_tensor = MklGetInput(context, kSrcIndex); - const Tensor& scale_tensor = MklGetInput(context, kScaleIndex); - const Tensor& shift_tensor = MklGetInput(context, kShiftIndex); - const Tensor& est_mean_tensor = MklGetInput(context, kMeanIndex); - const Tensor& est_variance_tensor = MklGetInput(context, - kVarianceIndex); + const Tensor& src_tensor = MklGetInput(context, kSrcIndex); + const Tensor& scale_tensor = MklGetInput(context, kScaleIndex); + const Tensor& shift_tensor = MklGetInput(context, kShiftIndex); + const Tensor& est_mean_tensor = MklGetInput(context, kMeanIndex); + const Tensor& est_variance_tensor = MklGetInput(context, kVarianceIndex); TensorShape tf_shape_src; MklDnnShape dnn_shape_src; @@ -723,37 +722,34 @@ class MklFusedBatchNormOp : public OpKernel { if (dnn_shape_src.IsMklTensor()) { tf_shape_src = dnn_shape_src.GetTfShape(); OP_REQUIRES(context, dnn_shape_src.GetDimension() == 4, - errors::InvalidArgument( - "input must be 4-dimensional", - src_tensor.shape().DebugString())); + errors::InvalidArgument("input must be 4-dimensional", + src_tensor.shape().DebugString())); } else { tf_shape_src = src_tensor.shape(); OP_REQUIRES(context, src_tensor.dims() == 4, - errors::InvalidArgument( - "input must be 4-dimensional", - src_tensor.shape().DebugString())); + errors::InvalidArgument("input must be 4-dimensional", + src_tensor.shape().DebugString())); } OP_REQUIRES(context, scale_tensor.dims() == 1, - errors::InvalidArgument( - "scale must be 1-dimensional", - scale_tensor.shape().DebugString())); + 
errors::InvalidArgument("scale must be 1-dimensional", + scale_tensor.shape().DebugString())); OP_REQUIRES(context, shift_tensor.dims() == 1, errors::InvalidArgument("offset must be 1-dimensional", - shift_tensor.shape().DebugString())); - OP_REQUIRES(context, est_mean_tensor.dims() == 1, - errors::InvalidArgument( - "estimated_mean must be 1-dimensional", - est_mean_tensor.shape().DebugString())); - OP_REQUIRES(context, est_variance_tensor.dims() == 1, - errors::InvalidArgument( - "estimated_variance must be 1-dimensional", - est_variance_tensor.shape().DebugString())); + shift_tensor.shape().DebugString())); + OP_REQUIRES( + context, est_mean_tensor.dims() == 1, + errors::InvalidArgument("estimated_mean must be 1-dimensional", + est_mean_tensor.shape().DebugString())); + OP_REQUIRES( + context, est_variance_tensor.dims() == 1, + errors::InvalidArgument("estimated_variance must be 1-dimensional", + est_variance_tensor.shape().DebugString())); if (is_training_) { - OP_REQUIRES(context, est_mean_tensor.dim_size(0) == 0, - errors::InvalidArgument( - "estimated_mean must be empty for training", - est_mean_tensor.shape().DebugString())); + OP_REQUIRES( + context, est_mean_tensor.dim_size(0) == 0, + errors::InvalidArgument("estimated_mean must be empty for training", + est_mean_tensor.shape().DebugString())); OP_REQUIRES(context, est_variance_tensor.dim_size(0) == 0, errors::InvalidArgument( "estimated_variance must be empty for training", @@ -763,11 +759,9 @@ class MklFusedBatchNormOp : public OpKernel { // special case: input with 0 element and 0 batch size Tensor* dst_tensor = nullptr; if (tf_shape_src.num_elements() == 0) { - HandleEmptyInput(context, - tf_shape_src, - scale_tensor.shape(), - &dst_tensor); - return; + HandleEmptyInput(context, tf_shape_src, scale_tensor.shape(), + &dst_tensor); + return; } if (dnn_shape_src.IsMklTensor()) @@ -783,11 +777,8 @@ class MklFusedBatchNormOp : public OpKernel { Tensor* batch_variance_tensor = nullptr; Tensor* saved_mean_tensor = nullptr; Tensor* saved_variance_tensor = nullptr; - AllocateTFOutputs(context, - scale_tensor.shape(), - &batch_mean_tensor, - &batch_variance_tensor, - &saved_mean_tensor, + AllocateTFOutputs(context, scale_tensor.shape(), &batch_mean_tensor, + &batch_variance_tensor, &saved_mean_tensor, &saved_variance_tensor); if (is_training_) @@ -815,69 +806,63 @@ class MklFusedBatchNormOp : public OpKernel { src_dims = TFShapeToMklDnnDimsInNCHW(dnn_shape_src.GetTfShape(), tensor_format_); } else { - src_dims = TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), - tensor_format_); + src_dims = + TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), tensor_format_); } auto src_md = dnn_shape_src.IsMklTensor() - ? dnn_shape_src.GetMklLayout() - : memory::desc(src_dims, MklDnnType(), format_m); + ? dnn_shape_src.GetMklLayout() + : memory::desc(src_dims, MklDnnType(), format_m); src.SetUsrMem(src_md, &src_tensor); // set weights primitive // MKL-DNN packs scale & shift as "weights": // ...... 
- auto weights_desc = memory::desc({2, depth_}, - MklDnnType(), - memory::format::nc); + auto weights_desc = + memory::desc({2, depth_}, MklDnnType(), memory::format::nc); auto weights_pd = memory::primitive_desc(weights_desc, cpu_engine); auto weights_m = memory(weights_pd); - T* weights_data = reinterpret_cast( - weights_m.get_data_handle()); - T* scale_tf = reinterpret_cast( - const_cast(scale_tensor.flat().data())); - T* shift_tf = reinterpret_cast( - const_cast(shift_tensor.flat().data())); + T* weights_data = reinterpret_cast(weights_m.get_data_handle()); + T* scale_tf = + reinterpret_cast(const_cast(scale_tensor.flat().data())); + T* shift_tf = + reinterpret_cast(const_cast(shift_tensor.flat().data())); - for (int k=0; k < depth_; k++) { + for (int k = 0; k < depth_; k++) { weights_data[k] = scale_tf[k]; weights_data[k + depth_] = shift_tf[k]; } // set mean primitive - auto mean_desc = memory::desc({1, depth_}, - MklDnnType(), - memory::format::nc); + auto mean_desc = + memory::desc({1, depth_}, MklDnnType(), memory::format::nc); auto mean_pd = memory::primitive_desc(mean_desc, cpu_engine); - char* saved_mean_data_tf = reinterpret_cast - (saved_mean_tensor->flat().data()); - std::memcpy(saved_mean_data_tf, - reinterpret_cast(mean_values_), - depth_*sizeof(T)); - auto mean_m = memory(mean_pd, - reinterpret_cast(saved_mean_data_tf)); + char* saved_mean_data_tf = + reinterpret_cast(saved_mean_tensor->flat().data()); + std::memcpy(saved_mean_data_tf, reinterpret_cast(mean_values_), + depth_ * sizeof(T)); + auto mean_m = + memory(mean_pd, reinterpret_cast(saved_mean_data_tf)); // set variance primitive - auto variance_desc = memory::desc({1, depth_}, - MklDnnType(), - memory::format::nc); + auto variance_desc = + memory::desc({1, depth_}, MklDnnType(), memory::format::nc); auto variance_pd = memory::primitive_desc(variance_desc, cpu_engine); - char* saved_variance_data_tf = reinterpret_cast - (saved_variance_tensor->flat().data()); + char* saved_variance_data_tf = + reinterpret_cast(saved_variance_tensor->flat().data()); std::memcpy(saved_variance_data_tf, reinterpret_cast(variance_values_), - depth_*sizeof(T)); + depth_ * sizeof(T)); auto variance_m = memory(variance_pd, saved_variance_data_tf); - prop_kind pk = (is_training_) ? - prop_kind::forward_training : - prop_kind::forward_scoring; + prop_kind pk = (is_training_) ? prop_kind::forward_training + : prop_kind::forward_scoring; auto bnrm_fwd_desc = batch_normalization_forward::desc( - pk, src.GetUsrMemDesc(), epsilon_, - is_training_ ? use_scale_shift : - (use_scale_shift | use_global_stats)); + pk, src.GetUsrMemDesc(), epsilon_, + is_training_ ? 
use_scale_shift + : (use_scale_shift | use_global_stats)); auto bnrm_fwd_pd = batch_normalization_forward::primitive_desc( - bnrm_fwd_desc, cpu_engine); + bnrm_fwd_desc, cpu_engine); // allocate dst tensor MklDnnShape dnn_shape_dst; @@ -887,47 +872,39 @@ class MklFusedBatchNormOp : public OpKernel { auto dst_pd = bnrm_fwd_pd.dst_primitive_desc(); dnn_shape_dst.SetMklLayout(&dst_pd); dnn_shape_dst.SetElemType(MklDnnType()); - dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(), - src_dims, format_m); - tf_shape_dst.AddDim(dst_pd.get_size()/sizeof(T)); + dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(), src_dims, + format_m); + tf_shape_dst.AddDim(dst_pd.get_size() / sizeof(T)); } else { dnn_shape_dst.SetMklTensor(false); tf_shape_dst = src_tensor.shape(); } - AllocateOutputSetMklShape(context, kDstIndex, &dst_tensor, - tf_shape_dst, dnn_shape_dst); + AllocateOutputSetMklShape(context, kDstIndex, &dst_tensor, tf_shape_dst, + dnn_shape_dst); // Output of batchnorm has same shape as input. dst.SetUsrMem(src_md, dst_tensor); primitive bnrm_fwd_op; if (is_training_) { - bnrm_fwd_op = batch_normalization_forward( - bnrm_fwd_pd, - src.GetOpMem(), - weights_m, - dst.GetOpMem(), - mean_m, - variance_m); + bnrm_fwd_op = + batch_normalization_forward(bnrm_fwd_pd, src.GetOpMem(), weights_m, + dst.GetOpMem(), mean_m, variance_m); } else { bnrm_fwd_op = batch_normalization_forward( - bnrm_fwd_pd, - src.GetOpMem(), - mean_m, - variance_m, - (const primitive::at) weights_m, - dst.GetOpMem()); + bnrm_fwd_pd, src.GetOpMem(), mean_m, variance_m, + (const primitive::at)weights_m, dst.GetOpMem()); } std::vector net; net.push_back(bnrm_fwd_op); stream(stream::kind::eager).submit(net).wait(); // copy batch_mean data - T* batch_mean_data_tf = reinterpret_cast( - batch_mean_tensor->flat().data()); + T* batch_mean_data_tf = + reinterpret_cast(batch_mean_tensor->flat().data()); std::memcpy(reinterpret_cast(batch_mean_data_tf), reinterpret_cast(mean_m.get_data_handle()), - depth_*sizeof(T)); + depth_ * sizeof(T)); // copy batch_variance data with Bessel's correction // if training mode is on @@ -937,18 +914,17 @@ class MklFusedBatchNormOp : public OpKernel { size_t adjust_size = orig_size - 1; adjust_factor = (static_cast(orig_size)) / adjust_size; } - for (int k=0; k < depth_; k++) + for (int k = 0; k < depth_; k++) batch_variance_tensor->flat().data()[k] = - (reinterpret_cast(variance_m.get_data_handle()))[k] - * adjust_factor; - } catch (mkldnn::error &e) { + (reinterpret_cast(variance_m.get_data_handle()))[k] * + adjust_factor; + } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } @@ -958,7 +934,7 @@ class MklFusedBatchNormOp : public OpKernel { bool is_training_; T* mean_values_; T* variance_values_; - size_t depth_; // batch normalization is done for per channel. + size_t depth_; // batch normalization is done for per channel. 
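// Note on the variance adjustment in Compute() above: batch_variance is
// rescaled by adjust_factor = N / (N - 1) (Bessel's correction), where N is
// the per-channel element count. E.g. with N = 18 the factor is
// 18 / 17 ~= 1.059, turning the biased batch variance into the unbiased
// estimate reported as batch_variance.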
void ExtractParams(OpKernelContext* context) { const Tensor& input = MklGetInput(context, 0); @@ -966,23 +942,20 @@ class MklFusedBatchNormOp : public OpKernel { } void SetMeanVariance(const Tensor& mean, const Tensor& variance) { - mean_values_ = reinterpret_cast( - const_cast(mean.flat().data())); - variance_values_ = reinterpret_cast( - const_cast(variance.flat().data())); + mean_values_ = reinterpret_cast(const_cast(mean.flat().data())); + variance_values_ = + reinterpret_cast(const_cast(variance.flat().data())); } - void HandleEmptyInput(OpKernelContext* context, - TensorShape tf_shape_src, - TensorShape tf_shape_scale, - Tensor** dst_tensor) { + void HandleEmptyInput(OpKernelContext* context, TensorShape tf_shape_src, + TensorShape tf_shape_scale, Tensor** dst_tensor) { CHECK_NOTNULL(dst_tensor); const size_t kDstIndex = 0; MklDnnShape dnn_shape_dst; dnn_shape_dst.SetMklTensor(false); - AllocateOutputSetMklShape(context, kDstIndex, dst_tensor, - tf_shape_src, dnn_shape_dst); + AllocateOutputSetMklShape(context, kDstIndex, dst_tensor, tf_shape_src, + dnn_shape_dst); CHECK_NOTNULL(*dst_tensor); memset(const_cast((*dst_tensor)->tensor_data().data()), 0, (*dst_tensor)->tensor_data().size()); @@ -991,15 +964,12 @@ class MklFusedBatchNormOp : public OpKernel { Tensor* batch_variance_tensor = nullptr; Tensor* saved_mean_tensor = nullptr; Tensor* saved_variance_tensor = nullptr; - AllocateTFOutputs(context, tf_shape_scale, - &batch_mean_tensor, - &batch_variance_tensor, - &saved_mean_tensor, + AllocateTFOutputs(context, tf_shape_scale, &batch_mean_tensor, + &batch_variance_tensor, &saved_mean_tensor, &saved_variance_tensor); } - void AllocateTFOutputs(OpKernelContext* context, - TensorShape tf_shape_scale, + void AllocateTFOutputs(OpKernelContext* context, TensorShape tf_shape_scale, Tensor** batch_mean_tensor, Tensor** batch_variance_tensor, Tensor** saved_mean_tensor, @@ -1017,51 +987,43 @@ class MklFusedBatchNormOp : public OpKernel { // allocate batch mean output tensor MklDnnShape mkl_shape_batch_mean; mkl_shape_batch_mean.SetMklTensor(false); - AllocateOutputSetMklShape(context, - kBatchMeanIndex, - batch_mean_tensor, - tf_shape_scale, - mkl_shape_batch_mean); + AllocateOutputSetMklShape(context, kBatchMeanIndex, batch_mean_tensor, + tf_shape_scale, mkl_shape_batch_mean); CHECK_NOTNULL(*batch_mean_tensor); // set NAN mean value in case of empty input tensor - for (int k=0; k < tf_shape_scale.num_elements(); k++) + for (int k = 0; k < tf_shape_scale.num_elements(); k++) (*batch_mean_tensor)->flat().data()[k] = NAN; // allocate batch variance output tensor MklDnnShape mkl_shape_batch_variance; mkl_shape_batch_variance.SetMklTensor(false); - AllocateOutputSetMklShape(context, - kBatchVarianceIndex, - batch_variance_tensor, - tf_shape_scale, + AllocateOutputSetMklShape(context, kBatchVarianceIndex, + batch_variance_tensor, tf_shape_scale, mkl_shape_batch_variance); CHECK_NOTNULL(*batch_variance_tensor); // set NAN variance value in case of empty input tensor - for (int k=0; k < tf_shape_scale.num_elements(); k++) + for (int k = 0; k < tf_shape_scale.num_elements(); k++) (*batch_variance_tensor)->flat().data()[k] = NAN; // Mean and variance (without Bessel's correction) saved for backward // computation to serve as pre-computed mean and variance. 
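// The op therefore exposes two pairs of statistics: batch_mean and
// batch_variance (the latter Bessel-corrected, typically used to update the
// moving averages) versus saved_mean and saved_variance (raw batch
// statistics, consumed again by the gradient kernel further below so they
// need not be recomputed).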
MklDnnShape mkl_shape_saved_mean; mkl_shape_saved_mean.SetMklTensor(false); - AllocateOutputSetMklShape(context, kSavedMeanIndex, - saved_mean_tensor, - tf_shape_scale, - mkl_shape_saved_mean); + AllocateOutputSetMklShape(context, kSavedMeanIndex, saved_mean_tensor, + tf_shape_scale, mkl_shape_saved_mean); CHECK_NOTNULL(*saved_mean_tensor); // set NAN mean value in case of empty input tensor - for (int k=0; k < tf_shape_scale.num_elements(); k++) + for (int k = 0; k < tf_shape_scale.num_elements(); k++) (*saved_mean_tensor)->flat().data()[k] = NAN; MklDnnShape mkl_shape_saved_variance; mkl_shape_saved_variance.SetMklTensor(false); AllocateOutputSetMklShape(context, kSavedVarianceIndex, - saved_variance_tensor, - tf_shape_scale, + saved_variance_tensor, tf_shape_scale, mkl_shape_saved_variance); CHECK_NOTNULL(*saved_variance_tensor); // set NAN variance value in case of empty input tensor - for (int k=0; k < tf_shape_scale.num_elements(); k++) + for (int k = 0; k < tf_shape_scale.num_elements(); k++) (*saved_variance_tensor)->flat().data()[k] = NAN; } }; @@ -1093,8 +1055,8 @@ class MklFusedBatchNormGradOp : public OpKernel { const Tensor& src_tensor = MklGetInput(context, kSrcIndex); const Tensor& scale_tensor = MklGetInput(context, kScaleIndex); const Tensor& saved_mean_tensor = MklGetInput(context, kMeanIndex); - const Tensor& saved_variance_tensor = MklGetInput(context, - kVarianceIndex); + const Tensor& saved_variance_tensor = + MklGetInput(context, kVarianceIndex); MklDnnShape dnn_shape_src, dnn_shape_diff_dst; GetMklShape(context, kSrcIndex, &dnn_shape_src); @@ -1103,53 +1065,49 @@ class MklFusedBatchNormGradOp : public OpKernel { if (dnn_shape_diff_dst.IsMklTensor()) { tf_shape_diff_dst = dnn_shape_diff_dst.GetTfShape(); - OP_REQUIRES(context, dnn_shape_diff_dst.GetDimension() == 4, - errors::InvalidArgument( - "input must be 4-dimensional", - diff_dst_tensor.shape().DebugString())); + OP_REQUIRES( + context, dnn_shape_diff_dst.GetDimension() == 4, + errors::InvalidArgument("input must be 4-dimensional", + diff_dst_tensor.shape().DebugString())); } else { tf_shape_diff_dst = diff_dst_tensor.shape(); - OP_REQUIRES(context, diff_dst_tensor.dims() == 4, - errors::InvalidArgument( - "input must be 4-dimensional", - diff_dst_tensor.shape().DebugString())); + OP_REQUIRES( + context, diff_dst_tensor.dims() == 4, + errors::InvalidArgument("input must be 4-dimensional", + diff_dst_tensor.shape().DebugString())); } if (dnn_shape_src.IsMklTensor()) { tf_shape_src = dnn_shape_src.GetTfShape(); OP_REQUIRES(context, dnn_shape_src.GetDimension() == 4, - errors::InvalidArgument( - "input must be 4-dimensional", - src_tensor.shape().DebugString())); + errors::InvalidArgument("input must be 4-dimensional", + src_tensor.shape().DebugString())); } else { tf_shape_src = src_tensor.shape(); OP_REQUIRES(context, src_tensor.dims() == 4, - errors::InvalidArgument( - "input must be 4-dimensional", - src_tensor.shape().DebugString())); + errors::InvalidArgument("input must be 4-dimensional", + src_tensor.shape().DebugString())); } OP_REQUIRES(context, scale_tensor.dims() == 1, - errors::InvalidArgument( - "scale must be 1-dimensional", - scale_tensor.shape().DebugString())); - OP_REQUIRES(context, saved_mean_tensor.dims() == 1, - errors::InvalidArgument( - "saved mean must be 1-dimensional", - saved_mean_tensor.shape().DebugString())); + errors::InvalidArgument("scale must be 1-dimensional", + scale_tensor.shape().DebugString())); + OP_REQUIRES( + context, saved_mean_tensor.dims() == 1, + 
errors::InvalidArgument("saved mean must be 1-dimensional", + saved_mean_tensor.shape().DebugString())); - OP_REQUIRES(context, saved_variance_tensor.dims() == 1, - errors::InvalidArgument( - "saved variance must be 1-dimensional", - saved_variance_tensor.shape().DebugString())); + OP_REQUIRES( + context, saved_variance_tensor.dims() == 1, + errors::InvalidArgument("saved variance must be 1-dimensional", + saved_variance_tensor.shape().DebugString())); Tensor* diff_src_tensor = nullptr; if (tf_shape_src.num_elements() == 0 || tf_shape_diff_dst.num_elements() == 0) { - HandleEmptyInput(context, tf_shape_src, - scale_tensor.shape(), - &diff_src_tensor); - return; + HandleEmptyInput(context, tf_shape_src, scale_tensor.shape(), + &diff_src_tensor); + return; } if (dnn_shape_src.IsMklTensor()) @@ -1175,20 +1133,18 @@ class MklFusedBatchNormGradOp : public OpKernel { memory::dims src_dims, diff_dst_dims; if (dnn_shape_src.IsMklTensor()) - src_dims = TFShapeToMklDnnDimsInNCHW( - dnn_shape_src.GetTfShape(), tensor_format_); + src_dims = TFShapeToMklDnnDimsInNCHW(dnn_shape_src.GetTfShape(), + tensor_format_); else - src_dims = TFShapeToMklDnnDimsInNCHW( - src_tensor.shape(), tensor_format_); + src_dims = + TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), tensor_format_); if (dnn_shape_diff_dst.IsMklTensor()) diff_dst_dims = TFShapeToMklDnnDimsInNCHW( - dnn_shape_diff_dst.GetTfShape(), - tensor_format_); + dnn_shape_diff_dst.GetTfShape(), tensor_format_); else - diff_dst_dims = TFShapeToMklDnnDimsInNCHW( - diff_dst_tensor.shape(), - tensor_format_); + diff_dst_dims = + TFShapeToMklDnnDimsInNCHW(diff_dst_tensor.shape(), tensor_format_); // set src and diff_dst primitives memory::desc src_md({}, memory::data_undef, memory::format_undef); @@ -1202,7 +1158,7 @@ class MklFusedBatchNormGradOp : public OpKernel { src_md = diff_dst_md; } } else { - src_md = memory::desc(src_dims, MklDnnType(), format_m); + src_md = memory::desc(src_dims, MklDnnType(), format_m); diff_dst_md = src_md; } src.SetUsrMem(src_md, &src_tensor); @@ -1210,55 +1166,47 @@ class MklFusedBatchNormGradOp : public OpKernel { // weights -- DNN packs scales/shifts as weights in order of // scale, ..., scale, shift, ..., shift - auto weights_desc = memory::desc({2, depth_}, - MklDnnType(), - memory::format::nc); + auto weights_desc = + memory::desc({2, depth_}, MklDnnType(), memory::format::nc); auto weights_pd = memory::primitive_desc(weights_desc, cpu_engine); auto weights_m = memory(weights_pd); T* weights_data = reinterpret_cast(weights_m.get_data_handle()); - T* scale_tf = reinterpret_cast(const_cast - (scale_tensor.flat().data())); - for (int k=0; k < depth_; k++) { + T* scale_tf = + reinterpret_cast(const_cast(scale_tensor.flat().data())); + for (int k = 0; k < depth_; k++) { weights_data[k] = scale_tf[k]; weights_data[k + depth_] = 0; } // set mean primitive memory::dims mv_dims = GetMeanVarianceDims(); - mean.SetUsrMem(mv_dims, - memory::format::nc, - const_cast(static_cast - (saved_mean_tensor.flat().data()))); + mean.SetUsrMem(mv_dims, memory::format::nc, + const_cast(static_cast( + saved_mean_tensor.flat().data()))); mean.SetOpMemDesc(mv_dims, memory::format::nc); // set variance primitive - variance.SetUsrMem(mv_dims, memory::format::nc, - const_cast(static_cast - (saved_variance_tensor.flat().data()))); + variance.SetUsrMem(mv_dims, memory::format::nc, + const_cast(static_cast( + saved_variance_tensor.flat().data()))); variance.SetOpMemDesc(mv_dims, memory::format::nc); // set diff_weight primitive - auto diff_weights_desc = 
memory::desc( - {2, depth_}, - MklDnnType(), - memory::format::nc); - auto diff_weights_pd = memory::primitive_desc( - diff_weights_desc, - cpu_engine); + auto diff_weights_desc = + memory::desc({2, depth_}, MklDnnType(), memory::format::nc); + auto diff_weights_pd = + memory::primitive_desc(diff_weights_desc, cpu_engine); auto diff_weights_m = memory(diff_weights_pd); auto bnrm_fwd_desc = batch_normalization_forward::desc( - prop_kind::forward_training, - src.GetUsrMemDesc(), - epsilon_, - is_training_ ? use_scale_shift : - (use_scale_shift | use_global_stats)); + prop_kind::forward_training, src.GetUsrMemDesc(), epsilon_, + is_training_ ? use_scale_shift + : (use_scale_shift | use_global_stats)); auto bnrm_fwd_pd = batch_normalization_forward::primitive_desc( - bnrm_fwd_desc, - cpu_engine); + bnrm_fwd_desc, cpu_engine); // Indices of output tensors - const size_t kDiffSrcIndex = 0; // index of diff_src tensor + const size_t kDiffSrcIndex = 0; // index of diff_src tensor // allocate diff_src tensor MklDnnShape dnn_shape_diff_src; @@ -1268,14 +1216,11 @@ class MklFusedBatchNormGradOp : public OpKernel { auto diff_src_pd = bnrm_fwd_pd.dst_primitive_desc(); dnn_shape_diff_src.SetMklLayout(&diff_src_pd); dnn_shape_diff_src.SetElemType(MklDnnType()); - dnn_shape_diff_src.SetTfLayout( - dnn_shape_src.GetDimension(), - src_dims, - format_m); - dnn_shape_diff_src.SetTfDimOrder( - dnn_shape_src.GetDimension(), - tensor_format_); - tf_shape_diff_src.AddDim(diff_src_pd.get_size()/sizeof(T)); + dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(), src_dims, + format_m); + dnn_shape_diff_src.SetTfDimOrder(dnn_shape_src.GetDimension(), + tensor_format_); + tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T)); } else { dnn_shape_diff_src.SetMklTensor(false); tf_shape_diff_src = src_tensor.shape(); @@ -1287,33 +1232,22 @@ class MklFusedBatchNormGradOp : public OpKernel { prop_kind pk = prop_kind::backward; auto bnrm_bwd_desc = batch_normalization_backward::desc( - pk, - diff_src.GetUsrMemDesc(), - src.GetUsrMemDesc(), - epsilon_, - /* for inference, specify use_global_stats - 1. on fwd prop, use mean and variance - provided as inputs - 2. on bwd prop, mean and variance are - considered as constants. Thus, - reduce the amout of MKL computations - */ - is_training_ ? use_scale_shift : - (use_scale_shift | use_global_stats)); + pk, diff_src.GetUsrMemDesc(), src.GetUsrMemDesc(), epsilon_, + /* for inference, specify use_global_stats + 1. on fwd prop, use mean and variance + provided as inputs + 2. on bwd prop, mean and variance are + considered as constants. Thus, + reduce the amout of MKL computations + */ + is_training_ ? 
use_scale_shift + : (use_scale_shift | use_global_stats)); auto bnrm_bwd_pd = batch_normalization_backward::primitive_desc( - bnrm_bwd_desc, - cpu_engine, - bnrm_fwd_pd); + bnrm_bwd_desc, cpu_engine, bnrm_fwd_pd); auto bnrm_bwd_op = batch_normalization_backward( - bnrm_bwd_pd, - src.GetOpMem(), - mean.GetOpMem(), - variance.GetOpMem(), - diff_dst.GetOpMem(), - weights_m, - diff_src.GetOpMem(), - diff_weights_m); + bnrm_bwd_pd, src.GetOpMem(), mean.GetOpMem(), variance.GetOpMem(), + diff_dst.GetOpMem(), weights_m, diff_src.GetOpMem(), diff_weights_m); std::vector net; net.push_back(bnrm_bwd_op); @@ -1322,43 +1256,39 @@ class MklFusedBatchNormGradOp : public OpKernel { // allocate 4 output TF tensors Tensor* diff_scale_tensor = nullptr; Tensor* diff_shift_tensor = nullptr; - AllocateTFOutputs(context, scale_tensor.shape(), - &diff_scale_tensor, + AllocateTFOutputs(context, scale_tensor.shape(), &diff_scale_tensor, &diff_shift_tensor); // copy data: diff_scale and diff_shift - T* diff_weights_data_dnn = reinterpret_cast - (diff_weights_m.get_data_handle()); + T* diff_weights_data_dnn = + reinterpret_cast(diff_weights_m.get_data_handle()); for (int i = 0; i < depth_; i++) { - diff_scale_tensor->flat().data()[i] = - diff_weights_data_dnn[i]; + diff_scale_tensor->flat().data()[i] = diff_weights_data_dnn[i]; diff_shift_tensor->flat().data()[i] = - diff_weights_data_dnn[i + depth_]; + diff_weights_data_dnn[i + depth_]; } - } catch (mkldnn::error &e) { + } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } private: T epsilon_; TensorFormat tensor_format_; - int depth_; // batch normalization is done for per channel. + int depth_; // batch normalization is done for per channel. 
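// diff_weights_m above uses the same 2 x depth_ packing as the forward
// pass: the first depth_ entries hold d(scale) and the next depth_ entries
// hold d(shift), which is exactly how the copy loop splits them into
// diff_scale_tensor and diff_shift_tensor.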
bool is_training_; void ExtractParams(OpKernelContext* context) { - const Tensor& input = MklGetInput(context, 0); - depth_ = static_cast(GetTensorDim(input, tensor_format_, 'C')); + const Tensor& input = MklGetInput(context, 0); + depth_ = static_cast(GetTensorDim(input, tensor_format_, 'C')); } - void HandleEmptyInput(OpKernelContext* context, - TensorShape tf_shape_src, + void HandleEmptyInput(OpKernelContext* context, TensorShape tf_shape_src, TensorShape tf_shape_scale_shift, Tensor** diff_src_tensor) { const size_t kDiffSrcIndex = 0; @@ -1366,22 +1296,20 @@ class MklFusedBatchNormGradOp : public OpKernel { MklDnnShape dnn_shape_diff_src; dnn_shape_diff_src.SetMklTensor(false); AllocateOutputSetMklShape(context, kDiffSrcIndex, diff_src_tensor, - tf_shape_src, dnn_shape_diff_src); - for (size_t i=0; i < (*diff_src_tensor)->shape().num_elements(); i++) - (*diff_src_tensor)->flat().data()[i] = 0; + tf_shape_src, dnn_shape_diff_src); + for (size_t i = 0; i < (*diff_src_tensor)->shape().num_elements(); i++) + (*diff_src_tensor)->flat().data()[i] = 0; Tensor* diff_scale_tensor = nullptr; Tensor* diff_shift_tensor = nullptr; - AllocateTFOutputs(context, - tf_shape_scale_shift, - &diff_scale_tensor, + AllocateTFOutputs(context, tf_shape_scale_shift, &diff_scale_tensor, &diff_shift_tensor); } void AllocateTFOutputs(OpKernelContext* context, - TensorShape tf_shape_scale_shift, - Tensor** diff_scale_tensor, - Tensor** diff_shift_tensor) { + TensorShape tf_shape_scale_shift, + Tensor** diff_scale_tensor, + Tensor** diff_shift_tensor) { CHECK_NOTNULL(diff_scale_tensor); CHECK_NOTNULL(diff_shift_tensor); @@ -1396,31 +1324,29 @@ class MklFusedBatchNormGradOp : public OpKernel { AllocateOutputSetMklShape(context, kDiffScaleIndex, diff_scale_tensor, tf_shape_scale_shift, mkl_shape_diff_scale); CHECK_NOTNULL(*diff_scale_tensor); - for (size_t i=0; i < (*diff_scale_tensor)->shape().num_elements(); i++) - (*diff_scale_tensor)->flat().data()[i] = 0; + for (size_t i = 0; i < (*diff_scale_tensor)->shape().num_elements(); i++) + (*diff_scale_tensor)->flat().data()[i] = 0; MklDnnShape mkl_shape_diff_shift; mkl_shape_diff_shift.SetMklTensor(false); AllocateOutputSetMklShape(context, kDiffShiftIndex, diff_shift_tensor, tf_shape_scale_shift, mkl_shape_diff_shift); CHECK_NOTNULL(*diff_shift_tensor); - for (size_t i=0; i < (*diff_shift_tensor)->shape().num_elements(); i++) - (*diff_shift_tensor)->flat().data()[i] = 0; + for (size_t i = 0; i < (*diff_shift_tensor)->shape().num_elements(); i++) + (*diff_shift_tensor)->flat().data()[i] = 0; // Placeholders for estimated_mean and estimated_variance, which are // used for inference and thus not needed here for gradient computation. 
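// They are still allocated below, as dummy scalar-shaped tensors, purely so
// the op's output arity matches what callers expect; nothing on the
// gradient path reads them.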
- Tensor* p1_tensor = nullptr, *p2_tensor = nullptr; + Tensor *p1_tensor = nullptr, *p2_tensor = nullptr; MklDnnShape mkl_shape_p; mkl_shape_p.SetMklTensor(false); - AllocateOutputSetMklShape(context, kP1Index, &p1_tensor, - TensorShape({}), mkl_shape_p); - AllocateOutputSetMklShape(context, kP2Index, &p2_tensor, - TensorShape({}), mkl_shape_p); + AllocateOutputSetMklShape(context, kP1Index, &p1_tensor, TensorShape({}), + mkl_shape_p); + AllocateOutputSetMklShape(context, kP2Index, &p2_tensor, TensorShape({}), + mkl_shape_p); } - memory::dims GetMeanVarianceDims() { - return memory::dims({1, depth_}); - } + memory::dims GetMeanVarianceDims() { return memory::dims({1, depth_}); } }; #endif diff --git a/tensorflow/core/kernels/mkl_identity_op.cc b/tensorflow/core/kernels/mkl_identity_op.cc index 9ee27ee21c8..6c027f8e728 100644 --- a/tensorflow/core/kernels/mkl_identity_op.cc +++ b/tensorflow/core/kernels/mkl_identity_op.cc @@ -28,14 +28,14 @@ limitations under the License. #include "mkl_dnn_types.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" #endif namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template class MklIdentityOp : public OpKernel { diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc index 4b5f7b83100..acb0db57b38 100644 --- a/tensorflow/core/kernels/mkl_input_conversion_op.cc +++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc @@ -31,7 +31,7 @@ limitations under the License. #include "tensorflow/core/kernels/mkl_tfconv_op.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::stream; @@ -59,7 +59,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; // convert the TF format input to MKL format /////////////////////////////////////////////////////////// -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template class MklInputConversionOp : public OpKernel { public: @@ -271,8 +271,8 @@ class MklInputConversionOp : public OpKernel { MklDnnShape input_shape_1; GetMklShape(context, 1, &input_shape_1); - bool tf_shapes_are_same = context->input(0).shape() == - context->input(1).shape(); + bool tf_shapes_are_same = + context->input(0).shape() == context->input(1).shape(); VLOG(1) << "MklInputConversionOp: Input shapes are " << (tf_shapes_are_same ? 
"*same*" : "*different*") << ": " @@ -293,14 +293,56 @@ class MklInputConversionOp : public OpKernel { // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // If both inputs are in MKL format if (input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) { - // If both have the same shape, pass them through if (tf_shapes_are_same) { - VLOG(1) << "MklInputConversionOp: No conversion needed, " - << "copying MKL inputs with identical shapes to output"; + auto input0_md = input_shape_0.GetMklLayout(); + auto input1_md = input_shape_1.GetMklLayout(); + + // If both have the same shape and same format, pass them through + if ( input0_md.data.format == input1_md.data.format) { + VLOG(1) << "MklInputConversionOp: No conversion needed, " + << "copying MKL inputs with identical shapes to output"; - ForwardMklTensorInToOut(context, 0, 0); - ForwardMklTensorInToOut(context, 1, 1); - return; + ForwardMklTensorInToOut(context, 0, 0); + ForwardMklTensorInToOut(context, 1, 1); + return; + } else { + VLOG(1) << "MklInputConversionOp: Shape is same, but format is different, " + << "need to convert to same format"; + + // Convert input0, and keep input1 unchanged + // Create MklDnnShape for output mkl tensor based on input0 + Tensor* tensor_out; + MklDnnShape mkl_output_mkl_shape; + mkl_output_mkl_shape.SetMklTensor(true); + mkl_output_mkl_shape.SetElemType(MklDnnType()); + mkl_output_mkl_shape.SetTfLayout(input_shape_0.GetDimension(), + input_shape_0.GetSizesAsMklDnnDims(), + input_shape_0.GetTfDataFormat()); + + // Get MKL layout from input1 as destination layout + mkl_output_mkl_shape.SetMklLayout(&input1_md); + + // Create output Mkl tensor for index 0 + AllocateOutputSetMklShape(context, 0, &tensor_out, + input_tensor_0.shape(), mkl_output_mkl_shape); + + // Create MklDnnData object for input0 tesnsor + auto cpu_engine = engine(engine::cpu, 0); + MklDnnData input(&cpu_engine); + input.SetUsrMem(input0_md, &input_tensor_0); + + // Create reorder from input0's layout to input1's layout + std::vector net; + CHECK_EQ(input.CheckReorderToOpMem(memory::primitive_desc( + input1_md, cpu_engine), + tensor_out, &net), + true); + stream(stream::kind::eager).submit(net).wait(); + + // Input1 will be passed through + ForwardMklTensorInToOut(context, 1, 1); + return; + } } // Sanity check @@ -400,9 +442,9 @@ class MklInputConversionOp : public OpKernel { // Create reorder between tensorflow layout and Mkl layout. std::vector net; - CHECK_EQ(tf_input.CheckReorderToOpMem(memory::primitive_desc( - output_mkl_md, cpu_engine), - tensor_out, &net), + CHECK_EQ(tf_input.CheckReorderToOpMem( + memory::primitive_desc(output_mkl_md, cpu_engine), + tensor_out, &net), true); stream(stream::kind::eager).submit(net).wait(); diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc index 95e0404ba8a..5f0a12a1fb9 100644 --- a/tensorflow/core/kernels/mkl_lrn_op.cc +++ b/tensorflow/core/kernels/mkl_lrn_op.cc @@ -22,6 +22,9 @@ limitations under the License. #define EIGEN_USE_THREADS #include +#include "mkl_dnn.h" +#include "mkl_dnn_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" @@ -30,20 +33,17 @@ limitations under the License. 
#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/util/mkl_util.h" #include "tensorflow/core/util/tensor_format.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "mkl_dnn.h" -#include "mkl_dnn_types.h" #if !defined(IS_MOBILE_PLATFORM) #include "tensorflow/core/util/work_sharder.h" #endif -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" -using mkldnn::lrn_forward; -using mkldnn::lrn_backward; -using mkldnn::prop_kind; using mkldnn::lrn_across_channels; +using mkldnn::lrn_backward; +using mkldnn::lrn_forward; +using mkldnn::prop_kind; using mkldnn::stream; #endif @@ -67,7 +67,7 @@ void GetBandMatrix(int depth, int depth_radius, } // namespace -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template class MklLRNOp : public OpKernel { @@ -77,10 +77,11 @@ class MklLRNOp : public OpKernel { explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) { int64 depth_radius64; OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64)); - OP_REQUIRES(context, FastBoundsCheck(depth_radius64, - std::numeric_limits::max()), - errors::InvalidArgument("depth_radius = ", depth_radius64, - " larger than int max")); + OP_REQUIRES( + context, + FastBoundsCheck(depth_radius64, std::numeric_limits::max()), + errors::InvalidArgument("depth_radius = ", depth_radius64, + " larger than int max")); depth_radius_ = static_cast(depth_radius64); OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_)); @@ -103,9 +104,10 @@ class MklLRNOp : public OpKernel { : input.dims(); OP_REQUIRES(context, mkl_context.in_dims == 4, errors::InvalidArgument("input must be 4-dimensional")); - OP_REQUIRES(context, FastBoundsCheck(input.NumElements(), - std::numeric_limits::max()), - errors::InvalidArgument("argument to LRN too large")); + OP_REQUIRES( + context, + FastBoundsCheck(input.NumElements(), std::numeric_limits::max()), + errors::InvalidArgument("argument to LRN too large")); if (!input_in_mkl_format) { mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_, @@ -339,17 +341,17 @@ class MklLRNOp : public OpKernel { float beta_; }; - template class MklLRNGradOp : public OpKernel { public: explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) { int64 depth_radius64; OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64)); - OP_REQUIRES(context, FastBoundsCheck(depth_radius64, - std::numeric_limits::max()), - errors::InvalidArgument("depth_radius = ", depth_radius64, - " larger than int max")); + OP_REQUIRES( + context, + FastBoundsCheck(depth_radius64, std::numeric_limits::max()), + errors::InvalidArgument("depth_radius = ", depth_radius64, + " larger than int max")); depth_radius_ = static_cast(depth_radius64); OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_)); OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_)); @@ -740,10 +742,11 @@ class MklLRNOp : public OpKernel { explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) { int64 depth_radius64; OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64)); - OP_REQUIRES(context, FastBoundsCheck(depth_radius64, - std::numeric_limits::max()), - errors::InvalidArgument("depth_radius = ", depth_radius64, - " larger than int max")); + OP_REQUIRES( + context, + FastBoundsCheck(depth_radius64, std::numeric_limits::max()), + errors::InvalidArgument("depth_radius = ", depth_radius64, + " larger than int max")); depth_radius_ = static_cast(depth_radius64); OP_REQUIRES_OK(context, context->GetAttr("bias", 
&bias_)); @@ -773,10 +776,10 @@ class MklLRNOp : public OpKernel { if (!src_dnn_shape.IsMklTensor()) { MklDefaultToEigen(context, src_tensor); return; - } else if (!src_dnn_shape.IsMklChannelDim( - src_dnn_shape.GetDimension() - 1) ) { + } else if (!src_dnn_shape.IsMklChannelDim(src_dnn_shape.GetDimension() - + 1)) { Tensor converted_tensor = - ConvertMklToTF(context, src_tensor, src_dnn_shape); + ConvertMklToTF(context, src_tensor, src_dnn_shape); MklDefaultToEigen(context, converted_tensor); return; } @@ -807,18 +810,16 @@ class MklLRNOp : public OpKernel { // Create LRN primitive descriptor. // Tensorflow's normalization semantics is across channels. // MKL-DNN also supports normalization within channel. - auto lrn_desc = lrn_forward::desc(prop_kind::forward, - lrn_across_channels, + auto lrn_desc = lrn_forward::desc(prop_kind::forward, lrn_across_channels, src_dnn_data.GetUsrMemDesc(), - kernel_size, - new_alpha, beta_, bias_); + kernel_size, new_alpha, beta_, bias_); auto lrn_prim_desc = lrn_forward::primitive_desc(lrn_desc, cpu_engine); // Allocate output_dnn_data tensor. Tensor* output_tensor = nullptr; memory::format input_format = src_dnn_shape.GetTfDataFormat(); - AllocateOutputTensor(context, lrn_prim_desc, input_dims, - input_format, &output_tensor); + AllocateOutputTensor(context, lrn_prim_desc, input_dims, input_format, + &output_tensor); OP_REQUIRES_OK(context, context->status()); CHECK_NOTNULL(output_tensor); dst_dnn_data.SetUsrMemDataHandle(output_tensor); @@ -827,25 +828,23 @@ class MklLRNOp : public OpKernel { AllocateWorkspaceTensor(context, lrn_prim_desc, &workspace_dnn_data); OP_REQUIRES_OK(context, context->status()); - PrepareAndExecuteNet(lrn_prim_desc, &src_dnn_data, - &dst_dnn_data, &workspace_dnn_data); - } catch (mkldnn::error &e) { + PrepareAndExecuteNet(lrn_prim_desc, &src_dnn_data, &dst_dnn_data, + &workspace_dnn_data); + } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } private: - void PrepareAndExecuteNet( - const lrn_forward::primitive_desc& lrn_fwd_desc, - MklDnnData* src_dnn_data, - MklDnnData* dst_dnn_data, - MklDnnData* wksp_dnn_data = nullptr) { + void PrepareAndExecuteNet(const lrn_forward::primitive_desc& lrn_fwd_desc, + MklDnnData* src_dnn_data, + MklDnnData* dst_dnn_data, + MklDnnData* wksp_dnn_data = nullptr) { std::vector net; // Check for input reorder @@ -853,23 +852,21 @@ class MklLRNOp : public OpKernel { // Create pooling primitive and add it to net if (wksp_dnn_data != nullptr) { - net.push_back(lrn_forward(lrn_fwd_desc, - src_dnn_data->GetOpMem(), - wksp_dnn_data->GetOpMem(), - dst_dnn_data->GetOpMem())); + net.push_back(lrn_forward(lrn_fwd_desc, src_dnn_data->GetOpMem(), + wksp_dnn_data->GetOpMem(), + dst_dnn_data->GetOpMem())); } else { - net.push_back(lrn_forward(lrn_fwd_desc, - src_dnn_data->GetOpMem(), - dst_dnn_data->GetOpMem())); + net.push_back(lrn_forward(lrn_fwd_desc, src_dnn_data->GetOpMem(), + dst_dnn_data->GetOpMem())); } stream(stream::kind::eager).submit(net).wait(); } - void AllocateOutputTensor(OpKernelContext* context, - const lrn_forward::primitive_desc& lrn_fwd_prim_desc, - const 
memory::dims output_dims_mkl_order, - const memory::format& output_tf_format, - Tensor** output_tensor) { + void AllocateOutputTensor( + OpKernelContext* context, + const lrn_forward::primitive_desc& lrn_fwd_prim_desc, + const memory::dims output_dims_mkl_order, + const memory::format& output_tf_format, Tensor** output_tensor) { CHECK_NOTNULL(output_tensor); memory::primitive_desc dst_pd = lrn_fwd_prim_desc.dst_primitive_desc(); @@ -880,111 +877,106 @@ class MklLRNOp : public OpKernel { output_mkl_shape.SetMklLayout(&dst_pd); output_mkl_shape.SetElemType(MklDnnType()); output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), - output_dims_mkl_order, - output_tf_format); + output_dims_mkl_order, output_tf_format); TensorShape output_tf_shape; // only allocate enough space for the elements we need. size_t num_bytes = dst_pd.get_size(); CHECK_EQ(num_bytes % sizeof(T), 0); output_tf_shape.AddDim(num_bytes / sizeof(T)); - AllocateOutputSetMklShape(context, kIdxOutput, - output_tensor, - output_tf_shape, output_mkl_shape); - } + AllocateOutputSetMklShape(context, kIdxOutput, output_tensor, + output_tf_shape, output_mkl_shape); + } - // Fallback implementation - Taken from lrn_op.cc - // TODO(inteltf) Check if we can use EigenLRNOp directly instead of making a - // copy. - void MklDefaultToEigen(OpKernelContext* context, - const Tensor& input) { - const int batch = static_cast(input.dim_size(0)); - const int rows = static_cast(input.dim_size(1)); - const int cols = static_cast(input.dim_size(2)); - const int depth = static_cast(input.dim_size(3)); - const int nodes = cols * rows; + // Fallback implementation - Taken from lrn_op.cc + // TODO(inteltf) Check if we can use EigenLRNOp directly instead of making a + // copy. + void MklDefaultToEigen(OpKernelContext* context, const Tensor& input) { + const int batch = static_cast(input.dim_size(0)); + const int rows = static_cast(input.dim_size(1)); + const int cols = static_cast(input.dim_size(2)); + const int depth = static_cast(input.dim_size(3)); + const int nodes = cols * rows; - auto in_shaped = input.shaped({nodes * batch, depth}); - // Multiplying the input with the band matrix has the effect of reducing - // the - // correct patch along the depth. - Eigen::Tensor multiplier(depth, depth); - GetBandMatrix(depth, depth_radius_, &multiplier); + auto in_shaped = input.shaped({nodes * batch, depth}); + // Multiplying the input with the band matrix has the effect of reducing + // the + // correct patch along the depth. 
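// The Eigen fallback above implements the standard across-channel LRN,
// out(i, j) = in(i, j) / (bias + alpha * sum_k in(i, k)^2)^beta, where k ranges over a
// depth_radius window around channel j. A scalar sketch of the same formula, assuming
// float data and the attribute names used in this file (not code from the patch):

#include <algorithm>
#include <cmath>

float LrnAcrossChannels(const float* in, int depth, int j, int depth_radius,
                        float bias, float alpha, float beta) {
  int lo = std::max(0, j - depth_radius);
  int hi = std::min(depth, j + depth_radius + 1);
  float sum_sq = 0.0f;
  for (int k = lo; k < hi; ++k) sum_sq += in[k] * in[k];  // squared sum over the window
  return in[j] / std::pow(bias + alpha * sum_sq, beta);   // normalize the center element
}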
+ Eigen::Tensor multiplier(depth, depth); + GetBandMatrix(depth, depth_radius_, &multiplier); - Tensor *output_dnn_data = nullptr; - MklDnnShape mkl_output_mkl_shape; - mkl_output_mkl_shape.SetMklTensor(false); - mkl_output_mkl_shape.SetDimensions(4); - AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data, - input.shape(), mkl_output_mkl_shape); - CHECK_NOTNULL(output_dnn_data); + Tensor* output_dnn_data = nullptr; + MklDnnShape mkl_output_mkl_shape; + mkl_output_mkl_shape.SetMklTensor(false); + mkl_output_mkl_shape.SetDimensions(4); + AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data, + input.shape(), mkl_output_mkl_shape); + CHECK_NOTNULL(output_dnn_data); - Tensor* workspace_tensor = nullptr; - MklDnnShape workspace_mkl_shape; - workspace_mkl_shape.SetMklTensor(false); - TensorShape workspace_tf_shape; - workspace_tf_shape.AddDim(0); - AllocateOutputSetMklShape(context, kIdxWorkspace, - &workspace_tensor, + Tensor* workspace_tensor = nullptr; + MklDnnShape workspace_mkl_shape; + workspace_mkl_shape.SetMklTensor(false); + TensorShape workspace_tf_shape; + workspace_tf_shape.AddDim(0); + AllocateOutputSetMklShape(context, kIdxWorkspace, &workspace_tensor, workspace_tf_shape, workspace_mkl_shape); - CHECK_NOTNULL(workspace_tensor); + CHECK_NOTNULL(workspace_tensor); - auto out_shaped = output_dnn_data->shaped({nodes * batch, depth}); - Eigen::array dims = {{DimPair(1, 0)}}; - auto tmp = in_shaped.square().contract(multiplier, dims) * alpha_ + bias_; - if (beta_ == T(1)) { - out_shaped.device(context->eigen_cpu_device()) = - in_shaped * tmp.inverse(); - } else if (beta_ == T(0.5)) { - out_shaped.device(context->eigen_cpu_device()) = - in_shaped * tmp.rsqrt(); - } else { - out_shaped.device(context->eigen_cpu_device()) = - in_shaped * (tmp.log() * -beta_).exp(); - } + auto out_shaped = output_dnn_data->shaped({nodes * batch, depth}); + Eigen::array dims = {{DimPair(1, 0)}}; + auto tmp = in_shaped.square().contract(multiplier, dims) * alpha_ + bias_; + if (beta_ == T(1)) { + out_shaped.device(context->eigen_cpu_device()) = + in_shaped * tmp.inverse(); + } else if (beta_ == T(0.5)) { + out_shaped.device(context->eigen_cpu_device()) = in_shaped * tmp.rsqrt(); + } else { + out_shaped.device(context->eigen_cpu_device()) = + in_shaped * (tmp.log() * -beta_).exp(); } + } - void AllocateWorkspaceTensor(OpKernelContext* context, - const lrn_forward::primitive_desc& lrn_fwd_prim_desc, - MklDnnData* dnn_data_wksp) { - CHECK_NOTNULL(dnn_data_wksp); - Tensor* workspace_tensor = nullptr; - memory::primitive_desc workspace_pd - = lrn_fwd_prim_desc.workspace_primitive_desc(); - size_t workspace_bytes = workspace_pd.get_size(); - MklDnnShape workspace_mkl_shape; - // the workspace tensor is a uint8 tensor that has - // exactly the number of bytes necessary - workspace_mkl_shape.SetMklTensor(false); - TensorShape workspace_tf_shape; - workspace_tf_shape.AddDim(workspace_bytes); - AllocateOutputSetMklShape(context, kIdxWorkspace, - &workspace_tensor, + void AllocateWorkspaceTensor( + OpKernelContext* context, + const lrn_forward::primitive_desc& lrn_fwd_prim_desc, + MklDnnData* dnn_data_wksp) { + CHECK_NOTNULL(dnn_data_wksp); + Tensor* workspace_tensor = nullptr; + memory::primitive_desc workspace_pd = + lrn_fwd_prim_desc.workspace_primitive_desc(); + size_t workspace_bytes = workspace_pd.get_size(); + MklDnnShape workspace_mkl_shape; + // the workspace tensor is a uint8 tensor that has + // exactly the number of bytes necessary + workspace_mkl_shape.SetMklTensor(false); + TensorShape 
workspace_tf_shape; + workspace_tf_shape.AddDim(workspace_bytes); + AllocateOutputSetMklShape(context, kIdxWorkspace, &workspace_tensor, workspace_tf_shape, workspace_mkl_shape); - CHECK_NOTNULL(workspace_tensor); - dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor); - } + CHECK_NOTNULL(workspace_tensor); + dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor); + } void SanityCheckInputs(OpKernelContext* context) { const Tensor& src_tensor = MklGetInput(context, kIdxInput); MklDnnShape src_dnn_shape; GetMklShape(context, kIdxInput, &src_dnn_shape); if (src_dnn_shape.IsMklTensor()) { - OP_REQUIRES(context, src_dnn_shape.GetDimension() == 4, - errors::InvalidArgument("input must be 4-dimensional")); - OP_REQUIRES(context, FastBoundsCheck(src_tensor.NumElements(), - std::numeric_limits::max()), - errors::InvalidArgument("argument to LRN too large")); + OP_REQUIRES(context, src_dnn_shape.GetDimension() == 4, + errors::InvalidArgument("input must be 4-dimensional")); + OP_REQUIRES(context, + FastBoundsCheck(src_tensor.NumElements(), + std::numeric_limits::max()), + errors::InvalidArgument("argument to LRN too large")); } else { - OP_REQUIRES(context, src_tensor.dims() == 4, - errors::InvalidArgument("input must be 4-dimensional")); - OP_REQUIRES(context, FastBoundsCheck(src_tensor.NumElements(), - std::numeric_limits::max()), - errors::InvalidArgument("argument to LRN too large")); + OP_REQUIRES(context, src_tensor.dims() == 4, + errors::InvalidArgument("input must be 4-dimensional")); + OP_REQUIRES(context, + FastBoundsCheck(src_tensor.NumElements(), + std::numeric_limits::max()), + errors::InvalidArgument("argument to LRN too large")); } } - const int kIdxInput = 0, - kIdxOutput = 0, - kIdxWorkspace = 1; + const int kIdxInput = 0, kIdxOutput = 0, kIdxWorkspace = 1; typedef typename Eigen::Tensor::DimensionPair DimPair; bool workspace_enabled_; @@ -994,17 +986,17 @@ class MklLRNOp : public OpKernel { float beta_; }; - template class MklLRNGradOp : public OpKernel { public: explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) { int64 depth_radius64; OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64)); - OP_REQUIRES(context, FastBoundsCheck(depth_radius64, - std::numeric_limits::max()), - errors::InvalidArgument("depth_radius = ", depth_radius64, - " larger than int max")); + OP_REQUIRES( + context, + FastBoundsCheck(depth_radius64, std::numeric_limits::max()), + errors::InvalidArgument("depth_radius = ", depth_radius64, + " larger than int max")); depth_radius_ = static_cast(depth_radius64); OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_)); OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_)); @@ -1025,7 +1017,7 @@ class MklLRNGradOp : public OpKernel { MklDnnData output_dnn_data(&cpu_engine); MklDnnShape input_grad_dnn_shape, orig_input_dnn_shape, - orig_output_dnn_shape; + orig_output_dnn_shape; GetMklShape(context, kIdxGradient, &input_grad_dnn_shape); GetMklShape(context, kIdxOrigInput, &orig_input_dnn_shape); GetMklShape(context, kIdxOrigOutput, &orig_output_dnn_shape); @@ -1037,16 +1029,16 @@ class MklLRNGradOp : public OpKernel { orig_input_dnn_shape.IsMklTensor() && orig_output_dnn_shape.IsMklTensor() && input_grad_dnn_shape.IsMklChannelDim( - input_grad_dnn_shape.GetDimension() - 1) && + input_grad_dnn_shape.GetDimension() - 1) && orig_input_dnn_shape.IsMklChannelDim( - orig_input_dnn_shape.GetDimension() - 1) && + orig_input_dnn_shape.GetDimension() - 1) && orig_output_dnn_shape.IsMklChannelDim( - 
orig_output_dnn_shape.GetDimension() - 1); + orig_output_dnn_shape.GetDimension() - 1); if (!can_use_mkldnn) { - // Fallback to eigen - MklDefaultToEigen(context); - return; + // Fallback to eigen + MklDefaultToEigen(context); + return; } // At this point, we have the all clear to use MklDnn constructs // Naming: diff_dst is input_gradient_tensor; src is orig_input_tensor. @@ -1059,13 +1051,11 @@ class MklLRNGradOp : public OpKernel { // NHWC format. memory::desc original_output_md = orig_output_dnn_shape.GetCurLayout(); memory::desc target_diff_dst_md = ConfigureInputGradient( - input_grad_tensor, - input_grad_dnn_shape, - &input_grad_dnn_data); + input_grad_tensor, input_grad_dnn_shape, &input_grad_dnn_data); memory::desc orig_input_md = orig_input_dnn_shape.GetCurLayout(); memory::dims orig_input_dims = - orig_input_dnn_shape.GetSizesAsMklDnnDims(); + orig_input_dnn_shape.GetSizesAsMklDnnDims(); orig_input_dnn_data.SetUsrMem(orig_input_md, &orig_input_tensor); orig_input_dnn_data.SetOpMemDesc(orig_input_dims, memory::format::nhwc); @@ -1079,27 +1069,21 @@ class MklLRNGradOp : public OpKernel { // Create LRN backward primitive descriptor. It requires LRN forward // primitive descriptor also. - auto lrn_fwd_desc = lrn_forward::desc(prop_kind::forward, - lrn_across_channels, - orig_input_md, - kernel_size, - new_alpha, beta_, bias_); - auto lrn_fwd_prim_desc = lrn_forward::primitive_desc(lrn_fwd_desc, - cpu_engine); - auto lrn_bwd_desc = lrn_backward::desc(lrn_across_channels, - original_output_md, - target_diff_dst_md, - kernel_size, - new_alpha, beta_, bias_); - auto lrn_bwd_prim_desc = lrn_backward::primitive_desc(lrn_bwd_desc, - cpu_engine, - lrn_fwd_prim_desc); + auto lrn_fwd_desc = lrn_forward::desc( + prop_kind::forward, lrn_across_channels, orig_input_md, kernel_size, + new_alpha, beta_, bias_); + auto lrn_fwd_prim_desc = + lrn_forward::primitive_desc(lrn_fwd_desc, cpu_engine); + auto lrn_bwd_desc = lrn_backward::desc( + lrn_across_channels, original_output_md, target_diff_dst_md, + kernel_size, new_alpha, beta_, bias_); + auto lrn_bwd_prim_desc = lrn_backward::primitive_desc( + lrn_bwd_desc, cpu_engine, lrn_fwd_prim_desc); Tensor* output_tensor = nullptr; - memory::format orig_input_format - = orig_input_dnn_shape.GetTfDataFormat(); - AllocateOutputTensor(context, lrn_bwd_prim_desc, - orig_input_dims, orig_input_format, &output_tensor); + memory::format orig_input_format = orig_input_dnn_shape.GetTfDataFormat(); + AllocateOutputTensor(context, lrn_bwd_prim_desc, orig_input_dims, + orig_input_format, &output_tensor); OP_REQUIRES_OK(context, context->status()); CHECK_NOTNULL(output_tensor); output_dnn_data.SetUsrMemDataHandle(output_tensor); @@ -1110,35 +1094,32 @@ class MklLRNGradOp : public OpKernel { const Tensor& workspace_tensor = MklGetInput(context, kIdxWorkspace); MklDnnData workspace_dnn_data(&cpu_engine); ConfigureWorkspace(workspace_tensor, - lrn_fwd_prim_desc.workspace_primitive_desc(), - &workspace_dnn_data); + lrn_fwd_prim_desc.workspace_primitive_desc(), + &workspace_dnn_data); - PrepareAndExecuteNet(lrn_bwd_prim_desc, - lrn_fwd_prim_desc, - &orig_input_dnn_data, - &input_grad_dnn_data, - &output_dnn_data, - memory::primitive_desc(target_diff_dst_md, cpu_engine), - &workspace_dnn_data); - } catch (mkldnn::error &e) { + PrepareAndExecuteNet( + lrn_bwd_prim_desc, lrn_fwd_prim_desc, &orig_input_dnn_data, + &input_grad_dnn_data, &output_dnn_data, + memory::primitive_desc(target_diff_dst_md, cpu_engine), + &workspace_dnn_data); + } catch (mkldnn::error& e) { string 
error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } - void AllocateOutputTensor(OpKernelContext* context, - const lrn_backward::primitive_desc& lrn_bkwd_prim_desc, - const memory::dims output_dims_mkl_order, - const memory::format& output_tf_format, - Tensor** output_tensor) { + void AllocateOutputTensor( + OpKernelContext* context, + const lrn_backward::primitive_desc& lrn_bkwd_prim_desc, + const memory::dims output_dims_mkl_order, + const memory::format& output_tf_format, Tensor** output_tensor) { CHECK_NOTNULL(output_tensor); - memory::primitive_desc dst_pd - = lrn_bkwd_prim_desc.diff_src_primitive_desc(); + memory::primitive_desc dst_pd = + lrn_bkwd_prim_desc.diff_src_primitive_desc(); MklDnnShape output_mkl_shape; // We assume that all outputs at this point are MKL Tensors @@ -1146,170 +1127,153 @@ class MklLRNGradOp : public OpKernel { output_mkl_shape.SetMklLayout(&dst_pd); output_mkl_shape.SetElemType(MklDnnType()); output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), - output_dims_mkl_order, - output_tf_format); + output_dims_mkl_order, output_tf_format); TensorShape output_tf_shape; size_t num_bytes = dst_pd.get_size(); CHECK_EQ(num_bytes % sizeof(T), 0); output_tf_shape.AddDim(num_bytes / sizeof(T)); - AllocateOutputSetMklShape(context, kIdxOutput, - output_tensor, - output_tf_shape, output_mkl_shape); + AllocateOutputSetMklShape(context, kIdxOutput, output_tensor, + output_tf_shape, output_mkl_shape); } memory::desc ConfigureInputGradient(const Tensor& input_grad_tensor, - const MklDnnShape& input_grad_dnn_shape, - MklDnnData *input_grad_dnn_data) { + const MklDnnShape& input_grad_dnn_shape, + MklDnnData* input_grad_dnn_data) { CHECK_NOTNULL(input_grad_dnn_data); // This shouldn't be necessary at this point, but just in case CHECK_EQ(input_grad_dnn_shape.IsMklTensor(), true); memory::desc input_grad_md = input_grad_dnn_shape.GetCurLayout(); - memory::dims orig_input_dims = - input_grad_dnn_shape.GetSizesAsMklDnnDims(); + memory::dims orig_input_dims = input_grad_dnn_shape.GetSizesAsMklDnnDims(); input_grad_dnn_data->SetUsrMem(input_grad_md, &input_grad_tensor); input_grad_dnn_data->SetOpMemDesc(orig_input_dims, memory::format::nhwc); return input_grad_md; } void PrepareAndExecuteNet( - const lrn_backward::primitive_desc& lrn_bkwd_desc, - const lrn_forward::primitive_desc& lrn_fwd_desc, - MklDnnData* src_dnn_data, - MklDnnData* input_gradient_diff_dst, - MklDnnData* output_diff_src, - const memory::primitive_desc& target_diff_dst_pd, - const MklDnnData* workspace_dnn_data = nullptr) { + const lrn_backward::primitive_desc& lrn_bkwd_desc, + const lrn_forward::primitive_desc& lrn_fwd_desc, + MklDnnData* src_dnn_data, MklDnnData* input_gradient_diff_dst, + MklDnnData* output_diff_src, + const memory::primitive_desc& target_diff_dst_pd, + const MklDnnData* workspace_dnn_data = nullptr) { std::vector net; // Check for input reordering on the diff dst input input_gradient_diff_dst->CheckReorderToOpMem( - lrn_bkwd_desc.diff_dst_primitive_desc(), &net); + lrn_bkwd_desc.diff_dst_primitive_desc(), &net); // Check for input reordering on the original input - 
src_dnn_data->CheckReorderToOpMem(lrn_fwd_desc.src_primitive_desc(), - &net); + src_dnn_data->CheckReorderToOpMem(lrn_fwd_desc.src_primitive_desc(), &net); // Create pooling primitive and add it to net if (nullptr == workspace_dnn_data) { - net.push_back(lrn_backward(lrn_bkwd_desc, - src_dnn_data->GetOpMem(), - input_gradient_diff_dst->GetOpMem(), - output_diff_src->GetOpMem())); + net.push_back(lrn_backward(lrn_bkwd_desc, src_dnn_data->GetOpMem(), + input_gradient_diff_dst->GetOpMem(), + output_diff_src->GetOpMem())); } else { - net.push_back(lrn_backward(lrn_bkwd_desc, - src_dnn_data->GetOpMem(), - input_gradient_diff_dst->GetOpMem(), - workspace_dnn_data->GetOpMem(), - output_diff_src->GetOpMem())); + net.push_back(lrn_backward(lrn_bkwd_desc, src_dnn_data->GetOpMem(), + input_gradient_diff_dst->GetOpMem(), + workspace_dnn_data->GetOpMem(), + output_diff_src->GetOpMem())); } stream(stream::kind::eager).submit(net).wait(); } void ConfigureWorkspace(const Tensor& workspace_tensor, - memory::primitive_desc workspace_pd, - MklDnnData *workspace_dnn_data) { + memory::primitive_desc workspace_pd, + MklDnnData* workspace_dnn_data) { CHECK_NOTNULL(workspace_dnn_data); workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor); } - // Fallback implementation - Taken from lrn_op.cc - // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a - // copy. - void MklDefaultToEigen(OpKernelContext* context) { - Tensor input_gradient_tensor; - Tensor orig_input_tensor; - Tensor orig_output_tensor; + // Fallback implementation - Taken from lrn_op.cc + // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a + // copy. + void MklDefaultToEigen(OpKernelContext* context) { + Tensor input_gradient_tensor; + Tensor orig_input_tensor; + Tensor orig_output_tensor; - MklDnnShape input_grad_dnn_shape, orig_input_dnn_shape, - orig_output_dnn_shape; - GetMklShape(context, kIdxGradient, &input_grad_dnn_shape); - GetMklShape(context, kIdxOrigInput, &orig_input_dnn_shape); - GetMklShape(context, kIdxOrigOutput, &orig_output_dnn_shape); + MklDnnShape input_grad_dnn_shape, orig_input_dnn_shape, + orig_output_dnn_shape; + GetMklShape(context, kIdxGradient, &input_grad_dnn_shape); + GetMklShape(context, kIdxOrigInput, &orig_input_dnn_shape); + GetMklShape(context, kIdxOrigOutput, &orig_output_dnn_shape); - if (input_grad_dnn_shape.IsMklTensor()) { - input_gradient_tensor = - ConvertMklToTF(context, - MklGetInput(context, kIdxGradient), - input_grad_dnn_shape); - } else { - input_gradient_tensor = MklGetInput(context, kIdxGradient); - } + if (input_grad_dnn_shape.IsMklTensor()) { + input_gradient_tensor = ConvertMklToTF( + context, MklGetInput(context, kIdxGradient), input_grad_dnn_shape); + } else { + input_gradient_tensor = MklGetInput(context, kIdxGradient); + } - if (orig_input_dnn_shape.IsMklTensor()) { - orig_input_tensor = - ConvertMklToTF(context, - MklGetInput(context, kIdxOrigInput), - orig_input_dnn_shape); - } else { - orig_input_tensor = MklGetInput(context, kIdxOrigInput); - } + if (orig_input_dnn_shape.IsMklTensor()) { + orig_input_tensor = ConvertMklToTF( + context, MklGetInput(context, kIdxOrigInput), orig_input_dnn_shape); + } else { + orig_input_tensor = MklGetInput(context, kIdxOrigInput); + } - if (orig_output_dnn_shape.IsMklTensor()) { - orig_output_tensor = - ConvertMklToTF(context, - MklGetInput(context, kIdxOrigOutput), - orig_output_dnn_shape); - } else { - orig_output_tensor = MklGetInput(context, kIdxOrigOutput); - } + if 
(orig_output_dnn_shape.IsMklTensor()) { + orig_output_tensor = ConvertMklToTF( + context, MklGetInput(context, kIdxOrigOutput), orig_output_dnn_shape); + } else { + orig_output_tensor = MklGetInput(context, kIdxOrigOutput); + } - const int64 batch = static_cast(input_gradient_tensor.dim_size(0)); - const int64 rows = static_cast(input_gradient_tensor.dim_size(1)); - const int64 cols = static_cast(input_gradient_tensor.dim_size(2)); - const int64 depth = static_cast(input_gradient_tensor.dim_size(3)); - const auto nodes = cols * rows; + const int64 batch = static_cast(input_gradient_tensor.dim_size(0)); + const int64 rows = static_cast(input_gradient_tensor.dim_size(1)); + const int64 cols = static_cast(input_gradient_tensor.dim_size(2)); + const int64 depth = static_cast(input_gradient_tensor.dim_size(3)); + const auto nodes = cols * rows; - auto grads_shaped = - input_gradient_tensor.shaped({nodes * batch, depth}); + auto grads_shaped = + input_gradient_tensor.shaped({nodes * batch, depth}); - auto in_shaped = orig_input_tensor.shaped({nodes * batch, depth}); - auto activations = - orig_output_tensor.shaped({nodes * batch, depth}); + auto in_shaped = orig_input_tensor.shaped({nodes * batch, depth}); + auto activations = orig_output_tensor.shaped({nodes * batch, depth}); - Tensor* output_dnn_data; - MklShape mkl_output_mkl_shape; - mkl_output_mkl_shape.SetMklTensor(false); - mkl_output_mkl_shape.SetDimensions(4); - AllocateOutputSetMklShape(context, kIdxOutput, - &output_dnn_data, - input_gradient_tensor.shape(), - mkl_output_mkl_shape); + Tensor* output_dnn_data; + MklShape mkl_output_mkl_shape; + mkl_output_mkl_shape.SetMklTensor(false); + mkl_output_mkl_shape.SetDimensions(4); + AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data, + input_gradient_tensor.shape(), + mkl_output_mkl_shape); - auto out_shaped = output_dnn_data->shaped({nodes * batch, depth}); - out_shaped.setZero(); - auto shard = [this, activations, in_shaped, grads_shaped, out_shaped, - depth](int64 begin, int64 end) { - for (int64 i = begin; i < end; ++i) { - for (int64 j = 0; j < depth; ++j) { - int64 depth_begin = std::max(0, j - depth_radius_); - int64 depth_end = std::min(depth, j + depth_radius_ + 1); + auto out_shaped = output_dnn_data->shaped({nodes * batch, depth}); + out_shaped.setZero(); + auto shard = [this, activations, in_shaped, grads_shaped, out_shaped, + depth](int64 begin, int64 end) { + for (int64 i = begin; i < end; ++i) { + for (int64 j = 0; j < depth; ++j) { + int64 depth_begin = std::max(0, j - depth_radius_); + int64 depth_end = std::min(depth, j + depth_radius_ + 1); - T norm(0); - for (int64 k = depth_begin; k < depth_end; ++k) { - norm += in_shaped(i, k) * in_shaped(i, k); - } - norm = alpha_ * norm + bias_; - DCHECK_GT(norm, T(1e-6)); - for (int64 k = depth_begin; k < depth_end; ++k) { - T dyi = T(-2) * alpha_ * beta_ * in_shaped(i, k) * - activations(i, j) / norm; - if (k == j) { - dyi += Eigen::numext::pow(norm, -beta_); - } - dyi *= grads_shaped(i, j); - const_cast::Tensor&>(out_shaped)(i, k) += - dyi; + T norm(0); + for (int64 k = depth_begin; k < depth_end; ++k) { + norm += in_shaped(i, k) * in_shaped(i, k); + } + norm = alpha_ * norm + bias_; + DCHECK_GT(norm, T(1e-6)); + for (int64 k = depth_begin; k < depth_end; ++k) { + T dyi = T(-2) * alpha_ * beta_ * in_shaped(i, k) * + activations(i, j) / norm; + if (k == j) { + dyi += Eigen::numext::pow(norm, -beta_); } + dyi *= grads_shaped(i, j); + const_cast::Tensor&>(out_shaped)(i, k) += dyi; } } - }; - auto worker_threads = - 
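// The gradient lambda above is handed to Shard() from work_sharder.h just below, which
// splits [begin, end) ranges across the CPU thread pool. The general shape of that
// dispatch inside a TF kernel, with `total` and `cost_per_item` as hypothetical
// placeholders for the work size and per-item cycle estimate:
//
//   auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
//   auto shard = [&](int64 begin, int64 end) {
//     for (int64 i = begin; i < end; ++i) {
//       // process work item i
//     }
//   };
//   Shard(worker_threads.num_threads, worker_threads.workers,
//         total, cost_per_item, shard);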
*(context->device()->tensorflow_cpu_worker_threads()); - Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch, - depth * depth, shard); - } + } + }; + auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch, + depth * depth, shard); + } void SanityCheckInputs(OpKernelContext* context) { const Tensor& input_gradient_tensor = MklGetInput(context, kIdxGradient); @@ -1317,59 +1281,59 @@ class MklLRNGradOp : public OpKernel { const Tensor& orig_output_tensor = MklGetInput(context, kIdxOrigOutput); const Tensor& workspace_tensor = MklGetInput(context, kIdxWorkspace); MklDnnShape in_grads_dnn_shape, in_image_dnn_shape, out_image_dnn_shape, - workspace_dnn_shape; + workspace_dnn_shape; GetMklShape(context, kIdxGradient, &in_grads_dnn_shape); GetMklShape(context, kIdxOrigInput, &in_image_dnn_shape); GetMklShape(context, kIdxOrigOutput, &out_image_dnn_shape); GetMklShape(context, kIdxWorkspace, &workspace_dnn_shape); if (in_grads_dnn_shape.IsMklTensor()) { OP_REQUIRES(context, in_grads_dnn_shape.GetDimension() == 4, - errors::InvalidArgument("Input gradient must be " - "4-dimensional")); + errors::InvalidArgument("Input gradient must be " + "4-dimensional")); } else { - OP_REQUIRES(context, input_gradient_tensor.dims() == 4, - errors::InvalidArgument("input gradient must be 4-dimensional")); + OP_REQUIRES( + context, input_gradient_tensor.dims() == 4, + errors::InvalidArgument("input gradient must be 4-dimensional")); } if (in_image_dnn_shape.IsMklTensor()) { OP_REQUIRES(context, in_image_dnn_shape.GetDimension() == 4, - errors::InvalidArgument("input images must be " - "4-dimensional")); + errors::InvalidArgument("input images must be " + "4-dimensional")); } else { OP_REQUIRES(context, orig_input_tensor.dims() == 4, errors::InvalidArgument("input images must be " - "4-dimensional")); + "4-dimensional")); } if (out_image_dnn_shape.IsMklTensor()) { OP_REQUIRES(context, out_image_dnn_shape.GetDimension() == 4, - errors::InvalidArgument("Output image must be " - "4-dimensional")); + errors::InvalidArgument("Output image must be " + "4-dimensional")); } else { - OP_REQUIRES(context, orig_output_tensor.dims() == 4, - errors::InvalidArgument("Output image must be 4-dimensional")); + OP_REQUIRES( + context, orig_output_tensor.dims() == 4, + errors::InvalidArgument("Output image must be 4-dimensional")); } if (workspace_enabled_) { if (workspace_dnn_shape.IsMklTensor()) { - OP_REQUIRES(context, workspace_dnn_shape.IsMklTensor() == false, - errors::InvalidArgument("Workspace should not be MKL Tensor.")); + OP_REQUIRES( + context, workspace_dnn_shape.IsMklTensor() == false, + errors::InvalidArgument("Workspace should not be MKL Tensor.")); } else { OP_REQUIRES(context, workspace_tensor.dims() == 1, - errors::InvalidArgument("Workspace must be 1-dimensional")); + errors::InvalidArgument("Workspace must be 1-dimensional")); } } } -// Input("input_grads: T") -// Input("input_image: T") -// Input("output_image: T") -// Input("workspace: uint8") - const int kIdxGradient = 0, - kIdxOrigInput = 1, - kIdxOrigOutput = 2, - kIdxWorkspace = 3, - kIdxOutput = 0; + // Input("input_grads: T") + // Input("input_image: T") + // Input("output_image: T") + // Input("workspace: uint8") + const int kIdxGradient = 0, kIdxOrigInput = 1, kIdxOrigOutput = 2, + kIdxWorkspace = 3, kIdxOutput = 0; typedef typename Eigen::Tensor::DimensionPair DimPair; bool workspace_enabled_; @@ -1379,7 +1343,7 @@ class MklLRNGradOp : 
public OpKernel { float beta_; }; -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML #define REGISTER_MKL_LRN_CPU(T) \ REGISTER_KERNEL_BUILDER(Name("_MklLRN") \ @@ -1393,7 +1357,6 @@ class MklLRNGradOp : public OpKernel { .Label(mkl_op_registry::kMklOpLabel), \ MklLRNGradOp); - TF_CALL_float(REGISTER_MKL_LRN_CPU); } // namespace tensorflow diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc index 82c5229bab0..14607f26e0c 100644 --- a/tensorflow/core/kernels/mkl_maxpooling_op.cc +++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc @@ -22,25 +22,25 @@ limitations under the License. #include "tensorflow/core/util/mkl_util.h" #include "tensorflow/core/util/padding.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include #include "mkldnn.hpp" -using mkldnn::memory; -using mkldnn::error; -using mkldnn::pooling_forward; -using mkldnn::pooling_backward; -using mkldnn::padding_kind; -using mkldnn::engine; -using mkldnn::prop_kind; using mkldnn::algorithm; +using mkldnn::engine; +using mkldnn::error; +using mkldnn::memory; +using mkldnn::padding_kind; +using mkldnn::pooling_backward; +using mkldnn::pooling_forward; +using mkldnn::prop_kind; #endif namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -// For now, MKL-ML is default. So making MKL-DNN not a default choice. -#ifndef INTEL_MKL_DNN +// MKL-DNN is now default. MKL-ML must be specified explicitly. +#ifdef INTEL_MKL_ML // An implementation of MaxPooling (forward). template @@ -397,18 +397,19 @@ class MklMaxPoolingGradOp : public OpKernel { if (workspace_enabled == false) { if (convert_input != nullptr) { if (input_in_mkl_format == false) { - CHECK_EQ( - dnnConversionExecute_F32( - convert_input, const_cast(static_cast( - tensor_in.flat().data())), - input_buf), - E_SUCCESS); + CHECK_EQ(dnnConversionExecute_F32( + convert_input, + const_cast(static_cast( + tensor_in.flat().data())), + input_buf), + E_SUCCESS); CHECK_EQ(dnnDelete_F32(convert_input), E_SUCCESS); convert_input = nullptr; } else { input_shape.GetConvertedFlatData( - lt_input_prim, const_cast(static_cast( - tensor_in.flat().data())), + lt_input_prim, + const_cast( + static_cast(tensor_in.flat().data())), input_buf); } pooling_resfwd[dnnResourceSrc] = input_buf; @@ -453,8 +454,9 @@ class MklMaxPoolingGradOp : public OpKernel { CHECK_EQ(dnnDelete_F32(convert_outbackprop), E_SUCCESS); } else { output_backprop_shape.GetConvertedFlatData( - lt_outbackprop_prim, const_cast(static_cast( - out_backprop.flat().data())), + lt_outbackprop_prim, + const_cast( + static_cast(out_backprop.flat().data())), outbackprop_buf); } pooling_res[dnnResourceDiffDst] = outbackprop_buf; @@ -492,14 +494,14 @@ class MklMaxPoolingGradOp : public OpKernel { bool workspace_enabled_; }; // MklMaxPoolingGradOp -#else // INTEL_MKL_DNN is defined +#else // An implementation of MaxPooling (forward). template class MklMaxPoolingOp : public MklPoolingForwardOpBase { public: explicit MklMaxPoolingOp(OpKernelConstruction* context) - : MklPoolingForwardOpBase(context) { + : MklPoolingForwardOpBase(context) { // In Max Pooling, MKLDNN does not allow passing workspace as NULL. // So we set workspace_enabled_ to true. 
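// The forward pooling descriptor assembled just below takes strides, kernel, and both
// paddings as memory::dims in NCHW order. Stripped of the TF plumbing, and assuming
// hypothetical src_md/dst_md memory descriptors, a cpu_engine, and 2x2/stride-2 max
// pooling with no padding (none of these names come from the patch), it reduces to:
//
//   auto pool_desc = pooling_forward::desc(
//       prop_kind::forward, algorithm::pooling_max, src_md, dst_md,
//       memory::dims({2, 2}),   // strides
//       memory::dims({2, 2}),   // kernel
//       memory::dims({0, 0}),   // padding (top, left)
//       memory::dims({0, 0}),   // padding (bottom, right)
//       padding_kind::zero);
//   auto pool_fwd_pd = pooling_forward::primitive_desc(pool_desc, cpu_engine);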
this->workspace_enabled_ = true; @@ -508,8 +510,8 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { void Compute(OpKernelContext* context) override { try { auto cpu_engine = engine(engine::cpu, 0); - const Tensor& input_tensor = MklGetInput(context, - this->kInputTensorIndexInput); + const Tensor& input_tensor = + MklGetInput(context, this->kInputTensorIndexInput); MklDnnShape dnn_shape_input; GetMklShape(context, this->kInputTensorIndexInput, &dnn_shape_input); this->SanityCheckInput(context, input_tensor, dnn_shape_input); @@ -522,9 +524,8 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { // initialize variables for the pooling op MklPoolParameters pool_params; // Get the input tensor and initialize the pooling parameters - this->ConfigureInput(context, dnn_shape_input, - input_tensor, &pool_params, - &dnn_data_input); + this->ConfigureInput(context, dnn_shape_input, input_tensor, &pool_params, + &dnn_data_input); OP_REQUIRES_OK(context, context->status()); // Declare output tensor @@ -535,9 +536,10 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { // If input is in Mkl layout, then just get the memory format from it // directly, instead of using input data_format to MaxPool. if (dnn_shape_input.IsMklTensor()) { - dnn_data_output.SetUsrMem(output_dims_mkl_order, - static_cast( - dnn_data_input.GetUsrMemDesc().data.format)); + dnn_data_output.SetUsrMem( + output_dims_mkl_order, + static_cast( + dnn_data_input.GetUsrMemDesc().data.format)); } else { dnn_data_output.SetUsrMem(output_dims_mkl_order, this->data_format_mkldnn_); @@ -546,24 +548,21 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { // describe the memory layout; let mkl-dnn choose the best for the op dnn_data_output.SetOpMemDesc(output_dims_mkl_order, memory::format::any); - auto pool_desc = pooling_forward::desc(prop_kind::forward, - algorithm::pooling_max, - dnn_data_input.GetUsrMemDesc(), - dnn_data_output.GetUsrMemDesc(), - memory::dims({ pool_params.row_stride, - pool_params.col_stride}), - memory::dims({ pool_params.window_rows, - pool_params.window_cols}), - memory::dims({ static_cast(pool_params.pad_top), - static_cast(pool_params.pad_left)}), - memory::dims({ static_cast(pool_params.pad_bottom), - static_cast(pool_params.pad_right)}), - TFPaddingToMklDnnPadding(this->padding_)); - auto pool_fwd_desc = pooling_forward::primitive_desc(pool_desc, - cpu_engine); + auto pool_desc = pooling_forward::desc( + prop_kind::forward, algorithm::pooling_max, + dnn_data_input.GetUsrMemDesc(), dnn_data_output.GetUsrMemDesc(), + memory::dims({pool_params.row_stride, pool_params.col_stride}), + memory::dims({pool_params.window_rows, pool_params.window_cols}), + memory::dims({static_cast(pool_params.pad_top), + static_cast(pool_params.pad_left)}), + memory::dims({static_cast(pool_params.pad_bottom), + static_cast(pool_params.pad_right)}), + TFPaddingToMklDnnPadding(this->padding_)); + auto pool_fwd_desc = + pooling_forward::primitive_desc(pool_desc, cpu_engine); this->AllocateOutputTensor(context, pool_fwd_desc, output_dims_mkl_order, - this->data_format_mkldnn_, &output_tensor); + this->data_format_mkldnn_, &output_tensor); OP_REQUIRES_OK(context, context->status()); dnn_data_output.SetUsrMemDataHandle(output_tensor); @@ -571,39 +570,38 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { OP_REQUIRES_OK(context, context->status()); this->PrepareAndExecuteNet(pool_fwd_desc, &dnn_data_input, - &dnn_data_output, &dnn_data_wksp); - } catch (mkldnn::error &e) { - string error_msg = "Status: " + 
std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Compute received an exception:", - error_msg)); + &dnn_data_output, &dnn_data_wksp); + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:", + error_msg)); } } // Compute private: - const int kOutputTensorIndexWorkspace = 1; + const int kOutputTensorIndexWorkspace = 1; - void AllocateWorkspaceTensor(OpKernelContext* context, - const pooling_forward::primitive_desc& pool_fwd_prim_desc, - MklDnnData* dnn_data_wksp) { - CHECK_NOTNULL(dnn_data_wksp); - Tensor* workspace_tensor = nullptr; - memory::primitive_desc workspace_pd - = pool_fwd_prim_desc.workspace_primitive_desc(); - size_t workspace_bytes = workspace_pd.get_size(); - MklDnnShape workspace_mkl_shape; - workspace_mkl_shape.SetMklTensor(false); - TensorShape workspace_tf_shape; - workspace_tf_shape.AddDim(workspace_bytes); - AllocateOutputSetMklShape(context, kOutputTensorIndexWorkspace, - &workspace_tensor, - workspace_tf_shape, workspace_mkl_shape); - CHECK_NOTNULL(workspace_tensor); - dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor); - } + void AllocateWorkspaceTensor( + OpKernelContext* context, + const pooling_forward::primitive_desc& pool_fwd_prim_desc, + MklDnnData* dnn_data_wksp) { + CHECK_NOTNULL(dnn_data_wksp); + Tensor* workspace_tensor = nullptr; + memory::primitive_desc workspace_pd = + pool_fwd_prim_desc.workspace_primitive_desc(); + size_t workspace_bytes = workspace_pd.get_size(); + MklDnnShape workspace_mkl_shape; + workspace_mkl_shape.SetMklTensor(false); + TensorShape workspace_tf_shape; + workspace_tf_shape.AddDim(workspace_bytes); + AllocateOutputSetMklShape(context, kOutputTensorIndexWorkspace, + &workspace_tensor, workspace_tf_shape, + workspace_mkl_shape); + CHECK_NOTNULL(workspace_tensor); + dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor); + } }; // The operation to compute MaxPool gradients. 
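// The gradient op that follows needs two extra ingredients on top of the forward case:
// the forward primitive_desc, which MKL-DNN requires as a hint when building the backward
// primitive_desc, and the workspace tensor produced by the forward pass. A sketch with the
// same hypothetical geometry as above, assuming diff_src_md/diff_dst_md descriptors and a
// pool_fwd_pd built as in the forward sketch (names not from the patch):
//
//   auto pool_bwd_desc = pooling_backward::desc(
//       algorithm::pooling_max, diff_src_md, diff_dst_md,
//       memory::dims({2, 2}), memory::dims({2, 2}),
//       memory::dims({0, 0}), memory::dims({0, 0}), padding_kind::zero);
//   auto pool_bwd_pd =
//       pooling_backward::primitive_desc(pool_bwd_desc, cpu_engine, pool_fwd_pd);
//
//   // The backward primitive built from pool_bwd_pd then consumes the forward workspace
//   // in addition to the incoming gradient, which is why MaxPoolGrad reads a fourth input.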
@@ -616,221 +614,186 @@ template class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase { public: explicit MklMaxPoolingGradOp(OpKernelConstruction* context) - : MklPoolingBackwardOpBase(context) { - } + : MklPoolingBackwardOpBase(context) {} void Compute(OpKernelContext* context) override { try { - auto cpu_engine = engine(engine::cpu, 0); - const Tensor& orig_input_tensor = MklGetInput(context, - kInputTensorIndexOrigInput); - const Tensor& orig_output_tensor = MklGetInput(context, - kInputTensorIndexOrigOutput); - const Tensor& grad_tensor = MklGetInput(context, - kInputTensorIndexGradient); - const Tensor& workspace_tensor = MklGetInput(context, - kInputTensorIndexWorkspace); - MklDnnShape orig_input_mkl_shape, - orig_output_mkl_shape, - grad_mkl_shape, - workspace_mkl_shape; - GetMklShape(context, kInputTensorIndexOrigInput, - &orig_input_mkl_shape); - GetMklShape(context, kInputTensorIndexOrigOutput, - &orig_output_mkl_shape); - GetMklShape(context, kInputTensorIndexGradient, - &grad_mkl_shape); - GetMklShape(context, kInputTensorIndexWorkspace, - &workspace_mkl_shape); + auto cpu_engine = engine(engine::cpu, 0); + const Tensor& orig_input_tensor = + MklGetInput(context, kInputTensorIndexOrigInput); + const Tensor& orig_output_tensor = + MklGetInput(context, kInputTensorIndexOrigOutput); + const Tensor& grad_tensor = + MklGetInput(context, kInputTensorIndexGradient); + const Tensor& workspace_tensor = + MklGetInput(context, kInputTensorIndexWorkspace); + MklDnnShape orig_input_mkl_shape, orig_output_mkl_shape, grad_mkl_shape, + workspace_mkl_shape; + GetMklShape(context, kInputTensorIndexOrigInput, &orig_input_mkl_shape); + GetMklShape(context, kInputTensorIndexOrigOutput, &orig_output_mkl_shape); + GetMklShape(context, kInputTensorIndexGradient, &grad_mkl_shape); + GetMklShape(context, kInputTensorIndexWorkspace, &workspace_mkl_shape); - SanityCheckInputs(context, - orig_input_tensor, orig_output_tensor, - grad_tensor, workspace_tensor, - orig_input_mkl_shape, orig_output_mkl_shape, - grad_mkl_shape, workspace_mkl_shape); - if (!context->status().ok()) return; + SanityCheckInputs(context, orig_input_tensor, orig_output_tensor, + grad_tensor, workspace_tensor, orig_input_mkl_shape, + orig_output_mkl_shape, grad_mkl_shape, + workspace_mkl_shape); + if (!context->status().ok()) return; - MklDnnData grad_dnn_data(&cpu_engine); - MklDnnData workspace_dnn_data(&cpu_engine); - MklDnnData output_dnn_data(&cpu_engine); - Tensor* output_tensor = nullptr; - MklPoolParameters pool_params; - TensorShape orig_input_shape; - memory::dims output_dims_mkl_order, orig_input_dims_mkl_order; - memory::desc original_input_md = ConfigureOriginalInput(context, - orig_input_tensor, - orig_input_mkl_shape, - &orig_input_dims_mkl_order, - &pool_params, - &orig_input_shape); + MklDnnData grad_dnn_data(&cpu_engine); + MklDnnData workspace_dnn_data(&cpu_engine); + MklDnnData output_dnn_data(&cpu_engine); + Tensor* output_tensor = nullptr; + MklPoolParameters pool_params; + TensorShape orig_input_shape; + memory::dims output_dims_mkl_order, orig_input_dims_mkl_order; + memory::desc original_input_md = ConfigureOriginalInput( + context, orig_input_tensor, orig_input_mkl_shape, + &orig_input_dims_mkl_order, &pool_params, &orig_input_shape); - memory::desc original_output_md = this->ConfigureOriginalOutput( - pool_params, - orig_output_mkl_shape, - output_dims_mkl_order); + memory::desc original_output_md = this->ConfigureOriginalOutput( + pool_params, orig_output_mkl_shape, output_dims_mkl_order); - 
memory::desc target_diff_dst_md = this->ConfigureInputGradient( - grad_mkl_shape, - grad_tensor, - &grad_dnn_data, - original_output_md); + memory::desc target_diff_dst_md = this->ConfigureInputGradient( + grad_mkl_shape, grad_tensor, &grad_dnn_data, original_output_md); - output_dnn_data.SetUsrMem(original_input_md); + output_dnn_data.SetUsrMem(original_input_md); - // Create the forward pooling primitive descriptor so we can - // pass it as a hint to the backward pooling primitive descriptor - auto pool_fwd_desc = pooling_forward::desc(prop_kind::forward, - algorithm::pooling_max, - original_input_md, - original_output_md, - memory::dims({ pool_params.row_stride, - pool_params.col_stride}), - memory::dims({ pool_params.window_rows, - pool_params.window_cols}), - memory::dims({ static_cast(pool_params.pad_top), - static_cast(pool_params.pad_left)}), - memory::dims({ static_cast(pool_params.pad_bottom), - static_cast(pool_params.pad_right)}), - TFPaddingToMklDnnPadding(this->padding_)); - auto pool_fwd_prim_desc - = pooling_forward::primitive_desc(pool_fwd_desc, - cpu_engine); + // Create the forward pooling primitive descriptor so we can + // pass it as a hint to the backward pooling primitive descriptor + auto pool_fwd_desc = pooling_forward::desc( + prop_kind::forward, algorithm::pooling_max, original_input_md, + original_output_md, + memory::dims({pool_params.row_stride, pool_params.col_stride}), + memory::dims({pool_params.window_rows, pool_params.window_cols}), + memory::dims({static_cast(pool_params.pad_top), + static_cast(pool_params.pad_left)}), + memory::dims({static_cast(pool_params.pad_bottom), + static_cast(pool_params.pad_right)}), + TFPaddingToMklDnnPadding(this->padding_)); + auto pool_fwd_prim_desc = + pooling_forward::primitive_desc(pool_fwd_desc, cpu_engine); - auto pool_bkwd_desc = pooling_backward::desc( - algorithm::pooling_max, - output_dnn_data.GetUsrMemDesc(), - target_diff_dst_md, - memory::dims({ pool_params.row_stride, - pool_params.col_stride}), - memory::dims({ pool_params.window_rows, - pool_params.window_cols}), - memory::dims({ static_cast(pool_params.pad_top), - static_cast(pool_params.pad_left)}), - memory::dims({ static_cast(pool_params.pad_bottom), - static_cast(pool_params.pad_right)}), - TFPaddingToMklDnnPadding(this->padding_)); - auto pool_bkwd_prim_desc - = pooling_backward::primitive_desc(pool_bkwd_desc, - cpu_engine, - pool_fwd_prim_desc); + auto pool_bkwd_desc = pooling_backward::desc( + algorithm::pooling_max, output_dnn_data.GetUsrMemDesc(), + target_diff_dst_md, + memory::dims({pool_params.row_stride, pool_params.col_stride}), + memory::dims({pool_params.window_rows, pool_params.window_cols}), + memory::dims({static_cast(pool_params.pad_top), + static_cast(pool_params.pad_left)}), + memory::dims({static_cast(pool_params.pad_bottom), + static_cast(pool_params.pad_right)}), + TFPaddingToMklDnnPadding(this->padding_)); + auto pool_bkwd_prim_desc = pooling_backward::primitive_desc( + pool_bkwd_desc, cpu_engine, pool_fwd_prim_desc); - this->AllocateOutputTensor(context, pool_bkwd_prim_desc, - orig_input_dims_mkl_order, - this->data_format_mkldnn_, - &output_tensor); - output_dnn_data.SetUsrMemDataHandle(output_tensor); + this->AllocateOutputTensor(context, pool_bkwd_prim_desc, + orig_input_dims_mkl_order, + this->data_format_mkldnn_, &output_tensor); + output_dnn_data.SetUsrMemDataHandle(output_tensor); - ConfigureWorkspace(workspace_tensor, - pool_fwd_prim_desc.workspace_primitive_desc(), - &workspace_dnn_data); - 
this->PrepareAndExecuteNet(pool_bkwd_prim_desc, - &grad_dnn_data, - &output_dnn_data, - memory::primitive_desc( - target_diff_dst_md, - cpu_engine), - &workspace_dnn_data); - } catch (mkldnn::error &e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Compute received an exception:", - error_msg)); + ConfigureWorkspace(workspace_tensor, + pool_fwd_prim_desc.workspace_primitive_desc(), + &workspace_dnn_data); + this->PrepareAndExecuteNet( + pool_bkwd_prim_desc, &grad_dnn_data, &output_dnn_data, + memory::primitive_desc(target_diff_dst_md, cpu_engine), + &workspace_dnn_data); + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:", + error_msg)); } } // Compute private: - // .Input("orig_input: T") - // .Input("orig_output: T") - // .Input("grad: T") - // .Input("workspace: T") - const int kInputTensorIndexOrigInput = 0; - const int kInputTensorIndexOrigOutput = 1; - const int kInputTensorIndexGradient = 2; - const int kInputTensorIndexWorkspace = 3; - // Output("output: T") in Base Class + // .Input("orig_input: T") + // .Input("orig_output: T") + // .Input("grad: T") + // .Input("workspace: T") + const int kInputTensorIndexOrigInput = 0; + const int kInputTensorIndexOrigOutput = 1; + const int kInputTensorIndexGradient = 2; + const int kInputTensorIndexWorkspace = 3; + // Output("output: T") in Base Class - memory::desc ConfigureOriginalInput(OpKernelContext* context, - const Tensor& tensor_original_input, - const MklDnnShape& original_input_mkl_shape, - memory::dims* original_input_dims_mkl_order, - MklPoolParameters* pool_params, - TensorShape* input_tensor_shape) { - *input_tensor_shape = tensor_original_input.shape(); - return MklPoolingBackwardOpBase::ConfigureOriginalInput( - context, - tensor_original_input, - original_input_mkl_shape, - original_input_dims_mkl_order, - pool_params, - *input_tensor_shape); + memory::desc ConfigureOriginalInput( + OpKernelContext* context, const Tensor& tensor_original_input, + const MklDnnShape& original_input_mkl_shape, + memory::dims* original_input_dims_mkl_order, + MklPoolParameters* pool_params, TensorShape* input_tensor_shape) { + *input_tensor_shape = tensor_original_input.shape(); + return MklPoolingBackwardOpBase::ConfigureOriginalInput( + context, tensor_original_input, original_input_mkl_shape, + original_input_dims_mkl_order, pool_params, *input_tensor_shape); + } + + void ConfigureWorkspace(const Tensor& workspace_tensor, + memory::primitive_desc workspace_pd, + MklDnnData* workspace_dnn_data) { + CHECK_NOTNULL(workspace_dnn_data); + + workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor); + } + + void SanityCheckInputs(OpKernelContext* context, + const Tensor& orig_input_tensor, + const Tensor& orig_output_tensor, + const Tensor& grad_tensor, + const Tensor& workspace_tensor, + const MklDnnShape& orig_input_mkl_shape, + const MklDnnShape& orig_output_mkl_shape, + const MklDnnShape& grad_mkl_shape, + const MklDnnShape& workspace_mkl_shape) { + if (!orig_input_mkl_shape.IsMklTensor()) { + OP_REQUIRES(context, orig_input_tensor.dims() == 4, + errors::InvalidArgument("Original input shape must be " + "4-dimensional")); + } else { + OP_REQUIRES(context, 
orig_input_mkl_shape.GetDimension() == 4, + errors::InvalidArgument("Original input shape must be " + "4-dimensional")); } - - void ConfigureWorkspace(const Tensor& workspace_tensor, - memory::primitive_desc workspace_pd, - MklDnnData *workspace_dnn_data) { - CHECK_NOTNULL(workspace_dnn_data); - - workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor); + if (!orig_output_mkl_shape.IsMklTensor()) { + OP_REQUIRES(context, orig_output_tensor.dims() == 4, + errors::InvalidArgument("Original output must be " + "4-dimensional")); + } else { + OP_REQUIRES(context, orig_output_mkl_shape.GetDimension() == 4, + errors::InvalidArgument("Original output must be " + "4-dimensional")); } - - void SanityCheckInputs(OpKernelContext* context, - const Tensor& orig_input_tensor, - const Tensor& orig_output_tensor, - const Tensor& grad_tensor, - const Tensor& workspace_tensor, - const MklDnnShape& orig_input_mkl_shape, - const MklDnnShape& orig_output_mkl_shape, - const MklDnnShape& grad_mkl_shape, - const MklDnnShape& workspace_mkl_shape) { - if (!orig_input_mkl_shape.IsMklTensor()) { - OP_REQUIRES(context, orig_input_tensor.dims() == 4, - errors::InvalidArgument("Original input shape must be " - "4-dimensional")); - } else { - OP_REQUIRES(context, orig_input_mkl_shape.GetDimension() == 4, - errors::InvalidArgument("Original input shape must be " - "4-dimensional")); - } - if (!orig_output_mkl_shape.IsMklTensor()) { - OP_REQUIRES(context, orig_output_tensor.dims() == 4, - errors::InvalidArgument("Original output must be " - "4-dimensional")); - } else { - OP_REQUIRES(context, orig_output_mkl_shape.GetDimension() == 4, - errors::InvalidArgument("Original output must be " - "4-dimensional")); - } - if (!grad_mkl_shape.IsMklTensor()) { - OP_REQUIRES(context, grad_tensor.dims() == 4, - errors::InvalidArgument("Gradient must be 4-dimensional")); - } else { - OP_REQUIRES(context, grad_mkl_shape.GetDimension() == 4, - errors::InvalidArgument("Gradient must be " - "4-dimensional")); - } - if (this->workspace_enabled_) { - // The workspace should not be an MKL tensor - OP_REQUIRES(context, workspace_mkl_shape.IsMklTensor() == false, - errors::InvalidArgument("Workspace tensor should not" - " be an MKL Tensor.")); - // It should only have one dimension - OP_REQUIRES(context, workspace_tensor.dims() == 1, - errors::InvalidArgument("Workspace tensor must be " - "1-dimensional")); - } else { - OP_REQUIRES(context, this->workspace_enabled_, - errors::Unimplemented("MKL-DNN Max Pooling does not " + if (!grad_mkl_shape.IsMklTensor()) { + OP_REQUIRES(context, grad_tensor.dims() == 4, + errors::InvalidArgument("Gradient must be 4-dimensional")); + } else { + OP_REQUIRES(context, grad_mkl_shape.GetDimension() == 4, + errors::InvalidArgument("Gradient must be " + "4-dimensional")); + } + if (this->workspace_enabled_) { + // The workspace should not be an MKL tensor + OP_REQUIRES(context, workspace_mkl_shape.IsMklTensor() == false, + errors::InvalidArgument("Workspace tensor should not" + " be an MKL Tensor.")); + // It should only have one dimension + OP_REQUIRES(context, workspace_tensor.dims() == 1, + errors::InvalidArgument("Workspace tensor must be " + "1-dimensional")); + } else { + OP_REQUIRES( + context, this->workspace_enabled_, + errors::Unimplemented("MKL-DNN Max Pooling does not " "yet support the use case " "where MaxPoolGrad is called without first" " calling MaxPool.")); - } } + } }; // MklMaxPoolingGradOp -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML REGISTER_KERNEL_BUILDER(Name("_MklMaxPool") 
.Device(DEVICE_CPU) diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc index f7cadffd39c..5ef6ce2a578 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc @@ -15,9 +15,9 @@ limitations under the License. #ifdef INTEL_MKL -#include -#include #include "tensorflow/core/kernels/mkl_pooling_ops_common.h" +#include +#include #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/kernels/bounds_check.h" @@ -42,7 +42,7 @@ void MklPoolParameters::Init(OpKernelContext* context, Init(context, ksize, stride, padding, data_format); } -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML // Initialization for MKL format void MklPoolParameters::Init(OpKernelContext* context, const std::vector& ksize, @@ -72,7 +72,7 @@ void MklPoolParameters::Init(OpKernelContext* context, Init(context, ksize, stride, padding, data_format); } -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML // Common Initialization for TensorFlow and MKL formats void MklPoolParameters::Init(OpKernelContext* context, const std::vector& ksize, @@ -107,21 +107,21 @@ void MklPoolParameters::Init(OpKernelContext* context, OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose( tensor_in_cols, window_cols, col_stride, padding, &out_width, &pad_left, &pad_right)); -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML // TF can work with int64, but mkldnn only supports int32 // Fail if the height or width are greater than MAX_INT - OP_REQUIRES(context, FastBoundsCheck(out_height, - std::numeric_limits::max()), + OP_REQUIRES(context, + FastBoundsCheck(out_height, std::numeric_limits::max()), errors::InvalidArgument("output height is too large")); - OP_REQUIRES(context, FastBoundsCheck(out_width, - std::numeric_limits::max()), + OP_REQUIRES(context, + FastBoundsCheck(out_width, std::numeric_limits::max()), errors::InvalidArgument("output width is too large")); #endif out_depth = depth; // output will have the same depth as the input - } else { // we are pooling in the depth dimension + } else { // we are pooling in the depth dimension // Our current version of depthwise max pooling does not support // any padding, and expects the depth_window to equal the depth // stride (no overlapping). diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h index b974b2c59af..279167aba24 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.h +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h @@ -17,16 +17,16 @@ limitations under the License. 
#define TENSORFLOW_CORE_KERNELS_MKL_POOLING_OPS_COMMON_H_ #ifdef INTEL_MKL -#include #include +#include #include "tensorflow/core/util/mkl_util.h" #include "tensorflow/core/util/padding.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::memory; -using mkldnn::pooling_forward; using mkldnn::pooling_backward; +using mkldnn::pooling_forward; using mkldnn::stream; #endif @@ -61,19 +61,31 @@ struct MklPoolParameters { TensorFormat data_format; MklPoolParameters() - : depth(0) - , tensor_in_cols(0), tensor_in_rows(0), tensor_in_batch(0) - , window_rows(0), window_cols(0), depth_window(0) - , row_stride(0), col_stride(0), depth_stride(0) - , out_height(0), out_width(0), out_depth(0) - , pad_left(0), pad_right(0), pad_top(0), pad_bottom(0), pad_depth(0) - , data_format(TensorFormat::FORMAT_NCHW) {} + : depth(0), + tensor_in_cols(0), + tensor_in_rows(0), + tensor_in_batch(0), + window_rows(0), + window_cols(0), + depth_window(0), + row_stride(0), + col_stride(0), + depth_stride(0), + out_height(0), + out_width(0), + out_depth(0), + pad_left(0), + pad_right(0), + pad_top(0), + pad_bottom(0), + pad_depth(0), + data_format(TensorFormat::FORMAT_NCHW) {} // Updates context->status if there is an invalid input. void Init(OpKernelContext* context, const std::vector& ksize, const std::vector& stride, Padding padding, TensorFormat data_format, const TensorShape& tensor_in_shape); -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML void Init(OpKernelContext* context, const std::vector& ksize, const std::vector& stride, Padding padding, TensorFormat data_format, const MklShape* mkl_in_shape); @@ -90,39 +102,37 @@ struct MklPoolParameters { TensorFormat data_format); }; -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML template class MklPoolingOpBase : public OpKernel { public: explicit MklPoolingOpBase(OpKernelConstruction* context) - : OpKernel(context) - , workspace_enabled_(false) { - string data_format; - OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); - OP_REQUIRES(context, - FormatFromString(data_format, &this->data_format_tf_), - errors::InvalidArgument("Invalid data format")); - this->data_format_mkldnn_ - = TFDataFormatToMklDnnDataFormat(this->data_format_tf_); - OP_REQUIRES_OK(context, context->GetAttr("ksize", &this->ksize_)); - OP_REQUIRES(context, this->ksize_.size() == 4, - errors::InvalidArgument("Sliding window ksize field must " - "specify 4 dimensions")); - OP_REQUIRES_OK(context, context->GetAttr("strides", &this->stride_)); - OP_REQUIRES(context, this->stride_.size() == 4, - errors::InvalidArgument("Sliding window strides field must " - "specify 4 dimensions")); - OP_REQUIRES_OK(context, context->GetAttr("padding", &this->padding_)); - OP_REQUIRES(context, this->ksize_[0] == 1 && this->stride_[0] == 1, - errors::Unimplemented("Pooling is not yet supported on the " - "batch dimension.")); + : OpKernel(context), workspace_enabled_(false) { + string data_format; + OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); + OP_REQUIRES(context, FormatFromString(data_format, &this->data_format_tf_), + errors::InvalidArgument("Invalid data format")); + this->data_format_mkldnn_ = + TFDataFormatToMklDnnDataFormat(this->data_format_tf_); + OP_REQUIRES_OK(context, context->GetAttr("ksize", &this->ksize_)); + OP_REQUIRES(context, this->ksize_.size() == 4, + errors::InvalidArgument("Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &this->stride_)); + OP_REQUIRES(context, 
this->stride_.size() == 4, + errors::InvalidArgument("Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &this->padding_)); + OP_REQUIRES(context, this->ksize_[0] == 1 && this->stride_[0] == 1, + errors::Unimplemented("Pooling is not yet supported on the " + "batch dimension.")); - // We may not get this attribute for this node if it does not go through - // graph rewrite pass. So we do not check for error while retrieving this - // attribute value. - context->GetAttr("workspace_enabled", &this->workspace_enabled_); - } + // We may not get this attribute for this node if it does not go through + // graph rewrite pass. So we do not check for error while retrieving this + // attribute value. + context->GetAttr("workspace_enabled", &this->workspace_enabled_); + } void Compute(OpKernelContext* context) override = 0; protected: @@ -132,24 +142,24 @@ class MklPoolingOpBase : public OpKernel { // output height and output width to have already been int32 // bounds-checked void GetOutputDims(const MklPoolParameters& mkl_pool_params, - memory::dims* output_dims_mkl_order) { + memory::dims* output_dims_mkl_order) { // MKL-DNN always needs output in NCHW format. - *output_dims_mkl_order = { mkl_pool_params.tensor_in_batch, + *output_dims_mkl_order = {mkl_pool_params.tensor_in_batch, mkl_pool_params.out_depth, static_cast(mkl_pool_params.out_height), static_cast(mkl_pool_params.out_width)}; } void InitMklPoolParameters(OpKernelContext* context, - MklPoolParameters* pool_params, - const MklDnnShape& original_input_mkl_shape, - const TensorShape& input_tensor_shape) { + MklPoolParameters* pool_params, + const MklDnnShape& original_input_mkl_shape, + const TensorShape& input_tensor_shape) { if (!original_input_mkl_shape.IsMklTensor()) { pool_params->Init(context, this->ksize_, this->stride_, this->padding_, - this->data_format_tf_, input_tensor_shape); + this->data_format_tf_, input_tensor_shape); } else { pool_params->Init(context, this->ksize_, this->stride_, this->padding_, - this->data_format_tf_, &original_input_mkl_shape); + this->data_format_tf_, &original_input_mkl_shape); } } @@ -159,13 +169,12 @@ class MklPoolingOpBase : public OpKernel { size_t GetNumTElements(const memory::primitive_desc& pd) { size_t num_bytes = pd.get_size(); size_t ret_val = num_bytes / sizeof(T); - if ( num_bytes % sizeof(T) != 0 ) { - ret_val++; + if (num_bytes % sizeof(T) != 0) { + ret_val++; } return ret_val; } - std::vector ksize_; std::vector stride_; Padding padding_; @@ -183,30 +192,29 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase { protected: void ConfigureInput(OpKernelContext* context, - const MklDnnShape& input_mkl_shape, - const Tensor& input_tensor, - MklPoolParameters* pool_params, - MklDnnData* dnn_data_input) { + const MklDnnShape& input_mkl_shape, + const Tensor& input_tensor, + MklPoolParameters* pool_params, + MklDnnData* dnn_data_input) { CHECK_NOTNULL(pool_params); CHECK_NOTNULL(dnn_data_input); TensorShape input_tensor_shape = input_tensor.shape(); - memory::desc input_md = input_mkl_shape.IsMklTensor() - ? input_mkl_shape.GetMklLayout() - : memory::desc( - TFShapeToMklDnnDimsInNCHW( - input_tensor_shape, this->data_format_tf_), - MklDnnType(), - this->data_format_mkldnn_); + memory::desc input_md = + input_mkl_shape.IsMklTensor() + ? 
input_mkl_shape.GetMklLayout() + : memory::desc(TFShapeToMklDnnDimsInNCHW(input_tensor_shape, + this->data_format_tf_), + MklDnnType(), this->data_format_mkldnn_); dnn_data_input->SetUsrMem(input_md, &input_tensor); - this->InitMklPoolParameters(context, pool_params, - input_mkl_shape, input_tensor_shape); + this->InitMklPoolParameters(context, pool_params, input_mkl_shape, + input_tensor_shape); } - void AllocateOutputTensor(OpKernelContext* context, - const pooling_forward::primitive_desc& pool_fwd_prim_desc, - const memory::dims output_dims_mkl_order, - const memory::format& output_tf_format, - Tensor** output_tensor) { + void AllocateOutputTensor( + OpKernelContext* context, + const pooling_forward::primitive_desc& pool_fwd_prim_desc, + const memory::dims output_dims_mkl_order, + const memory::format& output_tf_format, Tensor** output_tensor) { CHECK_NOTNULL(output_tensor); memory::primitive_desc dst_pd = pool_fwd_prim_desc.dst_primitive_desc(); @@ -215,50 +223,42 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase { output_mkl_shape.SetMklLayout(&dst_pd); output_mkl_shape.SetElemType(MklDnnType()); output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), - output_dims_mkl_order, - output_tf_format); + output_dims_mkl_order, output_tf_format); TensorShape output_tf_shape; // only allocate enough space for the elements we need. output_tf_shape.AddDim(this->GetNumTElements(dst_pd)); - AllocateOutputSetMklShape(context, kOutputTensorIndexOutput, - output_tensor, - output_tf_shape, output_mkl_shape); + AllocateOutputSetMklShape(context, kOutputTensorIndexOutput, output_tensor, + output_tf_shape, output_mkl_shape); CHECK_NOTNULL(*output_tensor); } void PrepareAndExecuteNet( - const pooling_forward::primitive_desc& pool_fwd_desc, - const MklDnnData* src, - MklDnnData* dst, - MklDnnData* wksp = nullptr) { + const pooling_forward::primitive_desc& pool_fwd_desc, + const MklDnnData* src, MklDnnData* dst, + MklDnnData* wksp = nullptr) { std::vector net; // Create pooling primitive and add it to net if (wksp != nullptr) { - net.push_back(pooling_forward(pool_fwd_desc, - src->GetOpMem(), - dst->GetOpMem(), - wksp->GetOpMem())); + net.push_back(pooling_forward(pool_fwd_desc, src->GetOpMem(), + dst->GetOpMem(), wksp->GetOpMem())); } else { - net.push_back(pooling_forward(pool_fwd_desc, - src->GetOpMem(), - dst->GetOpMem())); + net.push_back( + pooling_forward(pool_fwd_desc, src->GetOpMem(), dst->GetOpMem())); } stream(stream::kind::eager).submit(net).wait(); } - - void SanityCheckInput(OpKernelContext* context, - const Tensor& input_tensor, - const MklDnnShape& input_mkl_shape) { + void SanityCheckInput(OpKernelContext* context, const Tensor& input_tensor, + const MklDnnShape& input_mkl_shape) { if (!input_mkl_shape.IsMklTensor()) { OP_REQUIRES(context, input_tensor.dims() == 4, - errors::InvalidArgument("Input must be 4-dimensional")); + errors::InvalidArgument("Input must be 4-dimensional")); } else { - OP_REQUIRES(context, input_mkl_shape.GetDimension() == 4, - errors::InvalidArgument("Input shape must be " - "4-dimensional")); + OP_REQUIRES(context, input_mkl_shape.GetDimension() == 4, + errors::InvalidArgument("Input shape must be " + "4-dimensional")); } } // .Input("value: T") @@ -267,66 +267,58 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase { const int kOutputTensorIndexOutput = 0; }; // MklPoolingForwardBaseOp - template class MklPoolingBackwardOpBase : public MklPoolingOpBase { public: explicit MklPoolingBackwardOpBase(OpKernelConstruction* context) - : 
MklPoolingOpBase(context) { } + : MklPoolingOpBase(context) {} void Compute(OpKernelContext* context) override = 0; protected: const int kOutputTensorIndexOutput = 0; - void AllocateOutputTensor(OpKernelContext* context, - const pooling_backward::primitive_desc& pool_bkwd_prim_desc, - const memory::dims output_dims_mkl_order, - const memory::format& output_tf_format, - Tensor** output_tensor) { + void AllocateOutputTensor( + OpKernelContext* context, + const pooling_backward::primitive_desc& pool_bkwd_prim_desc, + const memory::dims output_dims_mkl_order, + const memory::format& output_tf_format, Tensor** output_tensor) { CHECK_NOTNULL(output_tensor); - memory::primitive_desc dst_pd - = pool_bkwd_prim_desc.diff_src_primitive_desc(); + memory::primitive_desc dst_pd = + pool_bkwd_prim_desc.diff_src_primitive_desc(); MklDnnShape output_mkl_shape; output_mkl_shape.SetMklTensor(true); output_mkl_shape.SetMklLayout(&dst_pd); output_mkl_shape.SetElemType(MklDnnType()); output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), - output_dims_mkl_order, - output_tf_format); + output_dims_mkl_order, output_tf_format); TensorShape output_tf_shape; output_tf_shape.AddDim(this->GetNumTElements(dst_pd)); - AllocateOutputSetMklShape(context, kOutputTensorIndexOutput, - output_tensor, - output_tf_shape, output_mkl_shape); + AllocateOutputSetMklShape(context, kOutputTensorIndexOutput, output_tensor, + output_tf_shape, output_mkl_shape); CHECK_NOTNULL(*output_tensor); } void PrepareAndExecuteNet( - const pooling_backward::primitive_desc& pool_bkwd_desc, - MklDnnData* input_gradient_diff_dst, - MklDnnData* output_diff_src, - const memory::primitive_desc& target_diff_dst_pd, - const MklDnnData* workspace = nullptr) { - + const pooling_backward::primitive_desc& pool_bkwd_desc, + MklDnnData* input_gradient_diff_dst, MklDnnData* output_diff_src, + const memory::primitive_desc& target_diff_dst_pd, + const MklDnnData* workspace = nullptr) { std::vector net; // If the input gradient isn't in the same format as the output // reorder it to the same format as the output - input_gradient_diff_dst->CheckReorderToOpMem( - target_diff_dst_pd, - &net); + input_gradient_diff_dst->CheckReorderToOpMem(target_diff_dst_pd, &net); // Create pooling primitive and add it to net if (nullptr == workspace) { net.push_back(pooling_backward(pool_bkwd_desc, - input_gradient_diff_dst->GetOpMem(), - output_diff_src->GetOpMem())); + input_gradient_diff_dst->GetOpMem(), + output_diff_src->GetOpMem())); } else { - net.push_back(pooling_backward(pool_bkwd_desc, - input_gradient_diff_dst->GetOpMem(), - workspace->GetOpMem(), - output_diff_src->GetOpMem())); + net.push_back( + pooling_backward(pool_bkwd_desc, input_gradient_diff_dst->GetOpMem(), + workspace->GetOpMem(), output_diff_src->GetOpMem())); } stream(stream::kind::eager).submit(net).wait(); } @@ -334,80 +326,76 @@ class MklPoolingBackwardOpBase : public MklPoolingOpBase { // Max Pooling and Avg Pooling have slightly different implementations // Takes the Tensor containing original input data and the original // mkl Dnn Shape and populates other data - memory::desc ConfigureOriginalInput(OpKernelContext* context, - const Tensor& tensor_original_input_shape, - const MklDnnShape& original_input_mkl_shape, - memory::dims* original_input_dims_nchw, - MklPoolParameters* pool_params, - const TensorShape& input_tensor_shape) { + memory::desc ConfigureOriginalInput( + OpKernelContext* context, const Tensor& tensor_original_input_shape, + const MklDnnShape& original_input_mkl_shape, + 
memory::dims* original_input_dims_nchw, MklPoolParameters* pool_params, + const TensorShape& input_tensor_shape) { CHECK_NOTNULL(original_input_dims_nchw); CHECK_NOTNULL(pool_params); - this->InitMklPoolParameters(context, pool_params, - original_input_mkl_shape, - input_tensor_shape); + this->InitMklPoolParameters(context, pool_params, original_input_mkl_shape, + input_tensor_shape); - *original_input_dims_nchw - = original_input_mkl_shape.IsMklTensor() - ? original_input_mkl_shape.GetSizesAsMklDnnDims() - : TFShapeToMklDnnDimsInNCHW(input_tensor_shape, - this->data_format_tf_); + *original_input_dims_nchw = + original_input_mkl_shape.IsMklTensor() + ? original_input_mkl_shape.GetSizesAsMklDnnDims() + : TFShapeToMklDnnDimsInNCHW(input_tensor_shape, + this->data_format_tf_); - return original_input_mkl_shape.IsMklTensor() - ? original_input_mkl_shape.GetMklLayout() - : memory::desc(*original_input_dims_nchw, - MklDnnType(), - this->data_format_mkldnn_); + return original_input_mkl_shape.IsMklTensor() + ? original_input_mkl_shape.GetMklLayout() + : memory::desc(*original_input_dims_nchw, MklDnnType(), + this->data_format_mkldnn_); } - memory::desc ConfigureOriginalOutput(const MklPoolParameters& pool_params, - const MklDnnShape& original_output_mkl_shape, - memory::dims output_dims_mkl_order) { + memory::desc ConfigureOriginalOutput( + const MklPoolParameters& pool_params, + const MklDnnShape& original_output_mkl_shape, + memory::dims output_dims_mkl_order) { this->GetOutputDims(pool_params, &output_dims_mkl_order); return original_output_mkl_shape.IsMklTensor() - ? original_output_mkl_shape.GetMklLayout() - : memory::desc(output_dims_mkl_order, - MklDnnType(), - this->data_format_mkldnn_); + ? original_output_mkl_shape.GetMklLayout() + : memory::desc(output_dims_mkl_order, MklDnnType(), + this->data_format_mkldnn_); } memory::desc ConfigureInputGradient( - const MklDnnShape& input_gradient_mkl_shape, - const Tensor& input_gradient_tensor, - MklDnnData* input_gradient_dnn_data, - const memory::desc& original_output_md) { + const MklDnnShape& input_gradient_mkl_shape, + const Tensor& input_gradient_tensor, + MklDnnData* input_gradient_dnn_data, + const memory::desc& original_output_md) { // Configure the gradient as is - memory::desc original_input_grad_md - = input_gradient_mkl_shape.IsMklTensor() - ? input_gradient_mkl_shape.GetMklLayout() - : memory::desc(TFShapeToMklDnnDimsInNCHW( - input_gradient_tensor.shape(), - this->data_format_tf_), - MklDnnType(), this->data_format_mkldnn_); + memory::desc original_input_grad_md = + input_gradient_mkl_shape.IsMklTensor() + ? input_gradient_mkl_shape.GetMklLayout() + : memory::desc( + TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(), + this->data_format_tf_), + MklDnnType(), this->data_format_mkldnn_); input_gradient_dnn_data->SetUsrMem(original_input_grad_md, - &input_gradient_tensor); + &input_gradient_tensor); // Check to see if input grad diff dst is in the right format // Create a new memory descriptor with the same shape as the // original, but the format of the other tensors. memory::format original_output_format = - static_cast(original_output_md.data.format); - bool grad_reorder_needed = input_gradient_dnn_data->IsReorderNeeded( - original_output_format); - memory::dims diff_dst_dims = input_gradient_mkl_shape.IsMklTensor() - ? 
input_gradient_mkl_shape.GetSizesAsMklDnnDims() - : TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(), - this->data_format_tf_); - memory::desc target_diff_dst_md = memory::desc(diff_dst_dims, - MklDnnType(), original_output_format); + static_cast(original_output_md.data.format); + bool grad_reorder_needed = + input_gradient_dnn_data->IsReorderNeeded(original_output_format); + memory::dims diff_dst_dims = + input_gradient_mkl_shape.IsMklTensor() + ? input_gradient_mkl_shape.GetSizesAsMklDnnDims() + : TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(), + this->data_format_tf_); + memory::desc target_diff_dst_md = + memory::desc(diff_dst_dims, MklDnnType(), original_output_format); - return grad_reorder_needed - ? target_diff_dst_md - : original_input_grad_md; + return grad_reorder_needed ? target_diff_dst_md : original_input_grad_md; } }; -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML //------------------------------------------------------------------- // Utility functions diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc index dc899d8c7ee..0be8355afa4 100644 --- a/tensorflow/core/kernels/mkl_relu_op.cc +++ b/tensorflow/core/kernels/mkl_relu_op.cc @@ -16,29 +16,29 @@ limitations under the License. // See docs in ../ops/nn_ops.cc. #ifdef INTEL_MKL +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/errors.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "tensorflow/core/platform/default/logging.h" -#include "tensorflow/core/util/mkl_util.h" #include "mkl_dnn.h" #include "mkl_dnn_types.h" +#include "tensorflow/core/platform/default/logging.h" +#include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" -using mkldnn::stream; -using mkldnn::prop_kind; using mkldnn::algorithm; -using mkldnn::relu_forward; -using mkldnn::relu_backward; -using mkldnn::eltwise_relu; using mkldnn::eltwise_elu; +using mkldnn::eltwise_relu; using mkldnn::eltwise_tanh; +using mkldnn::prop_kind; +using mkldnn::relu_backward; +using mkldnn::relu_forward; +using mkldnn::stream; #endif namespace tensorflow { @@ -58,7 +58,7 @@ struct MklReluHelpers { } }; -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template class MklReluOp : public OpKernel { @@ -180,7 +180,6 @@ class MklReluOp : public OpKernel { } MklReluOpContext; }; - template class MklReluGradOp : public OpKernel { public: @@ -214,10 +213,11 @@ class MklReluGradOp : public OpKernel { if (!dnnLayoutCompare_F32(lt_input, lt_grad)) { AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_grad, &mkl_buffer_convert); - CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input, - lt_grad), E_SUCCESS); + CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input, lt_grad), + E_SUCCESS); CHECK_EQ(dnnConversionExecute_F32(cv_input_to_grad, buf_input, - mkl_buffer_convert), E_SUCCESS); + mkl_buffer_convert), + E_SUCCESS); relu_res[dnnResourceSrc] = mkl_buffer_convert; dnnDelete_F32(cv_input_to_grad); } else { @@ -325,7 +325,8 @@ void MklReluGradOp::Compute(OpKernelContext* context) { float negative_slope = 0.0; CHECK_EQ(dnnReLUCreateBackward_F32(&mkl_context.prim_relu_bwd, NULL, mkl_context.lt_grad, mkl_context.lt_grad, - negative_slope), E_SUCCESS); + negative_slope), + E_SUCCESS); Tensor 
mkl_tmp_input_buf_tensor; mkl_context.MklPrepareReluGradInputs(context, &mkl_tmp_input_buf_tensor); @@ -348,7 +349,8 @@ void MklReluGradOp::Compute(OpKernelContext* context) { } tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast( - mkl_context.output_shape.GetMklLayout())) / sizeof(T)); + mkl_context.output_shape.GetMklLayout())) / + sizeof(T)); AllocateOutputSetMklShape(context, 0, &output, tf_shape, mkl_context.output_shape); } else { @@ -361,22 +363,22 @@ void MklReluGradOp::Compute(OpKernelContext* context) { mkl_context.relu_res[dnnResourceDiffSrc] = static_cast(output->flat().data()); - CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd, - mkl_context.relu_res), - E_SUCCESS); + CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd, mkl_context.relu_res), + E_SUCCESS); mkl_context.MklCleanup(); } -#else // INTEL_MKL_DNN + +#else // INTEL_MKL_ML + template class MklReluOpBase : public OpKernel { public: ~MklReluOpBase() {} - explicit MklReluOpBase(OpKernelConstruction* context) : OpKernel(context) { - } + explicit MklReluOpBase(OpKernelConstruction* context) : OpKernel(context) {} virtual void Compute_Scalar(OpKernelContext* context) = 0; @@ -413,12 +415,12 @@ class MklReluOpBase : public OpKernel { T alpha = 0, beta = 0; std::shared_ptr relu_fwd_pd; - auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training, + auto relu_fwd_desc = relu_forward::desc( + prop_kind::forward_training, // Operator memory descriptor is same as user memory descriptor. - alg_kind, src.GetUsrMemDesc(), - alpha, beta); - relu_fwd_pd.reset(new relu_forward::primitive_desc(relu_fwd_desc, - cpu_engine)); + alg_kind, src.GetUsrMemDesc(), alpha, beta); + relu_fwd_pd.reset( + new relu_forward::primitive_desc(relu_fwd_desc, cpu_engine)); // allocate dst tensor MklDnnShape dnn_shape_dst; @@ -431,7 +433,7 @@ class MklReluOpBase : public OpKernel { dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(), dnn_shape_src.GetSizesAsMklDnnDims(), dnn_shape_src.GetTfDataFormat()); - tf_shape_dst.AddDim(dst_pd.get_size()/sizeof(T)); + tf_shape_dst.AddDim(dst_pd.get_size() / sizeof(T)); } else { dnn_shape_dst.SetMklTensor(false); tf_shape_dst = src_tensor.shape(); @@ -445,34 +447,32 @@ class MklReluOpBase : public OpKernel { // execute net std::vector net; - auto relu_fwd = relu_forward(*relu_fwd_pd, src.GetOpMem(), - dst.GetOpMem()); + auto relu_fwd = + relu_forward(*relu_fwd_pd, src.GetOpMem(), dst.GetOpMem()); net.push_back(relu_fwd); stream(stream::kind::eager).submit(net).wait(); - } catch (mkldnn::error &e) { + } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } }; - template class MklReluGradOpBase : public OpKernel { public: ~MklReluGradOpBase() {} - explicit MklReluGradOpBase(OpKernelConstruction* context) : - OpKernel(context) {} + explicit MklReluGradOpBase(OpKernelConstruction* context) + : OpKernel(context) {} virtual void Compute_Scalar(OpKernelContext* context) = 0; - void Compute(OpKernelContext* context) { + void Compute(OpKernelContext* context) { try { auto cpu_engine = engine(engine::cpu, 0); MklDnnData src(&cpu_engine); @@ -483,9 +483,9 @@ class MklReluGradOpBase 
: public OpKernel { const size_t src_index = 1; // index of src input tensor const size_t diff_src_index = 0; // index of diff_src output tensor - const Tensor& src_tensor = MklGetInput(context, src_index); + const Tensor& src_tensor = MklGetInput(context, src_index); const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index); - Tensor* diff_src_tensor = nullptr; + Tensor* diff_src_tensor = nullptr; MklDnnShape dnn_shape_src, dnn_shape_diff_dst; GetMklShape(context, src_index, &dnn_shape_src); @@ -526,25 +526,25 @@ class MklReluGradOpBase : public OpKernel { src_md = dnn_shape_src.GetMklLayout(); memory::format src_mkl_data_format = dnn_shape_src.GetTfDataFormat(); - auto src_tf_data_format = MklDnnDataFormatToTFDataFormat( - src_mkl_data_format); + auto src_tf_data_format = + MklDnnDataFormatToTFDataFormat(src_mkl_data_format); auto diff_dst_dims = TFShapeToMklDnnDimsInNCHW(diff_dst_tensor.shape(), src_tf_data_format); - diff_dst_md = memory::desc(diff_dst_dims, MklDnnType(), - src_mkl_data_format); + diff_dst_md = + memory::desc(diff_dst_dims, MklDnnType(), src_mkl_data_format); } else if (!dnn_shape_src.IsMklTensor() && - dnn_shape_diff_dst.IsMklTensor()) { + dnn_shape_diff_dst.IsMklTensor()) { // Same comment as above. diff_dst_md = dnn_shape_diff_dst.GetMklLayout(); memory::format diff_dst_mkl_data_format = - dnn_shape_diff_dst.GetTfDataFormat(); - auto diff_dst_tf_data_format = MklDnnDataFormatToTFDataFormat( - diff_dst_mkl_data_format); + dnn_shape_diff_dst.GetTfDataFormat(); + auto diff_dst_tf_data_format = + MklDnnDataFormatToTFDataFormat(diff_dst_mkl_data_format); auto src_dims = TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), diff_dst_tf_data_format); - src_md = memory::desc(src_dims, MklDnnType(), - diff_dst_mkl_data_format); + src_md = + memory::desc(src_dims, MklDnnType(), diff_dst_mkl_data_format); } else { // If both the inputs are in MKL format, we use Mkl layout of the input // tensors. @@ -572,12 +572,12 @@ class MklReluGradOpBase : public OpKernel { std::shared_ptr relu_fwd_pd; auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training, alg_kind, src_md, alpha, beta); - relu_fwd_pd.reset(new relu_forward::primitive_desc(relu_fwd_desc, - cpu_engine)); - auto relu_bwd_desc = relu_backward::desc(alg_kind, common_md, common_md, - alpha, beta); - auto relu_bwd_pd = relu_backward::primitive_desc(relu_bwd_desc, - cpu_engine, *relu_fwd_pd); + relu_fwd_pd.reset( + new relu_forward::primitive_desc(relu_fwd_desc, cpu_engine)); + auto relu_bwd_desc = + relu_backward::desc(alg_kind, common_md, common_md, alpha, beta); + auto relu_bwd_pd = relu_backward::primitive_desc( + relu_bwd_desc, cpu_engine, *relu_fwd_pd); // allocate diff_src tensor MklDnnShape dnn_shape_diff_src; @@ -590,33 +590,32 @@ class MklReluGradOpBase : public OpKernel { dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(), dnn_shape_src.GetSizesAsMklDnnDims(), dnn_shape_src.GetTfDataFormat()); - tf_shape_diff_src.AddDim(diff_src_pd.get_size()/sizeof(T)); + tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T)); } else { dnn_shape_diff_src.SetMklTensor(false); tf_shape_diff_src = src_tensor.shape(); } AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor, - tf_shape_diff_src, dnn_shape_diff_src); + tf_shape_diff_src, dnn_shape_diff_src); // diff_src memory descriptor is same as memory descriptor for both // inputs. 
diff_src.SetUsrMem(common_md, diff_src_tensor); PrepareAndExecuteNet(relu_bwd_pd, &src, &diff_src, &diff_dst); - } catch (mkldnn::error &e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } void PrepareAndExecuteNet(const relu_backward::primitive_desc& relu_prim_desc, - MklDnnData* src, MklDnnData* diff_src, MklDnnData* - diff_dst) { + MklDnnData* src, MklDnnData* diff_src, + MklDnnData* diff_dst) { std::vector net; // Check if we need to reorder original input tensors into common_md layout @@ -632,14 +631,13 @@ class MklReluGradOpBase : public OpKernel { } }; - template class MklReluOp : public MklReluOpBase { public: ~MklReluOp() {} - explicit MklReluOp(OpKernelConstruction* context) : - MklReluOpBase(context) {} + explicit MklReluOp(OpKernelConstruction* context) + : MklReluOpBase(context) {} virtual void Compute_Scalar(OpKernelContext* context) { const size_t src_index = 0; // index of src input tensor @@ -649,15 +647,15 @@ class MklReluOp : public MklReluOpBase { GetMklShape(context, src_index, &dnn_shape_src); Tensor* dst_tensor = nullptr; - void* user_i = static_cast(const_cast( - src_tensor.flat().data())); + void* user_i = + static_cast(const_cast(src_tensor.flat().data())); MklDnnShape dnn_shape_dst; dnn_shape_dst.SetMklTensor(false); AllocateOutputSetMklShape(context, dst_index, &dst_tensor, src_tensor.shape(), dnn_shape_dst); void* out_o = static_cast(dst_tensor->flat().data()); (static_cast(out_o))[0] = - std::max((static_cast(user_i))[0], static_cast(0)); + std::max((static_cast(user_i))[0], static_cast(0)); return; } }; @@ -667,14 +665,14 @@ class MklReluGradOp : public MklReluGradOpBase { public: ~MklReluGradOp() {} - explicit MklReluGradOp(OpKernelConstruction* context) : - MklReluGradOpBase(context) {} + explicit MklReluGradOp(OpKernelConstruction* context) + : MklReluGradOpBase(context) {} virtual void Compute_Scalar(OpKernelContext* context) { const size_t diff_dst_index = 0; // index of diff_dst input tensor const size_t src_index = 1; // index of src input tensor const size_t diff_src_index = 0; // index of diff_src output tensor - const Tensor& src_tensor = MklGetInput(context, src_index); + const Tensor& src_tensor = MklGetInput(context, src_index); const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index); Tensor* diff_src_tensor = nullptr; @@ -687,11 +685,11 @@ class MklReluGradOp : public MklReluGradOpBase { diff_dst_tensor.shape(), dnn_shape_diff_src); void* out_o = static_cast(diff_src_tensor->flat().data()); void* user_i = - static_cast(const_cast(src_tensor.flat().data())); + static_cast(const_cast(src_tensor.flat().data())); void* user_g = - static_cast(const_cast(diff_dst_tensor.flat().data())); - (static_cast(out_o))[0] = (static_cast(user_g))[0] * - ((static_cast(user_i))[0] > 0); + static_cast(const_cast(diff_dst_tensor.flat().data())); + (static_cast(out_o))[0] = + (static_cast(user_g))[0] * ((static_cast(user_i))[0] > 0); return; } }; @@ -701,8 +699,8 @@ class MklEluOp : public MklReluOpBase { public: ~MklEluOp() {} - explicit 
MklEluOp(OpKernelConstruction* context) : - MklReluOpBase(context) {} + explicit MklEluOp(OpKernelConstruction* context) + : MklReluOpBase(context) {} virtual void Compute_Scalar(OpKernelContext* context) { const size_t src_index = 0; // index of src input tensor @@ -712,8 +710,8 @@ class MklEluOp : public MklReluOpBase { GetMklShape(context, src_index, &dnn_shape_src); Tensor* dst_tensor = nullptr; - void* user_i = static_cast(const_cast( - src_tensor.flat().data())); + void* user_i = + static_cast(const_cast(src_tensor.flat().data())); MklDnnShape dnn_shape_dst; dnn_shape_dst.SetMklTensor(false); AllocateOutputSetMklShape(context, dst_index, &dst_tensor, @@ -734,14 +732,14 @@ class MklEluGradOp : public MklReluGradOpBase { public: ~MklEluGradOp() {} - explicit MklEluGradOp(OpKernelConstruction* context) : - MklReluGradOpBase(context) {} + explicit MklEluGradOp(OpKernelConstruction* context) + : MklReluGradOpBase(context) {} virtual void Compute_Scalar(OpKernelContext* context) { const size_t diff_dst_index = 0; // index of diff_dst input tensor const size_t src_index = 1; // index of src input tensor const size_t diff_src_index = 0; // index of diff_src output tensor - const Tensor& src_tensor = MklGetInput(context, src_index); + const Tensor& src_tensor = MklGetInput(context, src_index); const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index); Tensor* diff_src_tensor = nullptr; @@ -754,9 +752,9 @@ class MklEluGradOp : public MklReluGradOpBase { diff_dst_tensor.shape(), dnn_shape_diff_src); void* out_o = static_cast(diff_src_tensor->flat().data()); void* user_i = - static_cast(const_cast(src_tensor.flat().data())); + static_cast(const_cast(src_tensor.flat().data())); void* user_g = - static_cast(const_cast(diff_dst_tensor.flat().data())); + static_cast(const_cast(diff_dst_tensor.flat().data())); // gradient of elu(x) = 1 if x > 0; elu(x) + 1 otherwise T feature = (static_cast(user_i))[0]; if (feature > 0) { @@ -773,8 +771,8 @@ class MklTanhOp : public MklReluOpBase { public: ~MklTanhOp() {} - explicit MklTanhOp(OpKernelConstruction* context) : - MklReluOpBase(context) {} + explicit MklTanhOp(OpKernelConstruction* context) + : MklReluOpBase(context) {} virtual void Compute_Scalar(OpKernelContext* context) { const size_t src_index = 0; // index of src input tensor @@ -784,8 +782,8 @@ class MklTanhOp : public MklReluOpBase { GetMklShape(context, src_index, &dnn_shape_src); Tensor* dst_tensor = nullptr; - void* user_i = static_cast(const_cast( - src_tensor.flat().data())); + void* user_i = + static_cast(const_cast(src_tensor.flat().data())); MklDnnShape dnn_shape_dst; dnn_shape_dst.SetMklTensor(false); AllocateOutputSetMklShape(context, dst_index, &dst_tensor, @@ -795,7 +793,7 @@ class MklTanhOp : public MklReluOpBase { T feature = (static_cast(user_i))[0]; T e1 = std::exp(feature); T e2 = std::exp(-feature); - (static_cast(out_o))[0] = (e1 - e2)/(e1 + e2); + (static_cast(out_o))[0] = (e1 - e2) / (e1 + e2); return; } }; @@ -805,14 +803,14 @@ class MklTanhGradOp : public MklReluGradOpBase { public: ~MklTanhGradOp() {} - explicit MklTanhGradOp(OpKernelConstruction* context) : - MklReluGradOpBase(context) {} + explicit MklTanhGradOp(OpKernelConstruction* context) + : MklReluGradOpBase(context) {} virtual void Compute_Scalar(OpKernelContext* context) { const size_t diff_dst_index = 0; // index of diff_dst input tensor const size_t src_index = 1; // index of src input tensor const size_t diff_src_index = 0; // index of diff_src output tensor - const Tensor& src_tensor = 
MklGetInput(context, src_index); + const Tensor& src_tensor = MklGetInput(context, src_index); const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index); Tensor* diff_src_tensor = nullptr; @@ -825,16 +823,16 @@ class MklTanhGradOp : public MklReluGradOpBase { diff_dst_tensor.shape(), dnn_shape_diff_src); void* out_o = static_cast(diff_src_tensor->flat().data()); void* user_i = - static_cast(const_cast(src_tensor.flat().data())); + static_cast(const_cast(src_tensor.flat().data())); // gradient of tanh(x) = 1 - tanh(x)^2 T feature = (static_cast(user_i))[0]; T e1 = std::exp(feature); T e2 = std::exp(-feature); - T tanh = (e1 - e2)/(e1 + e2); + T tanh = (e1 - e2) / (e1 + e2); void* user_g = - static_cast(const_cast(diff_dst_tensor.flat().data())); - (static_cast(out_o))[0] = (static_cast(user_g))[0] * - (1 - tanh * tanh); + static_cast(const_cast(diff_dst_tensor.flat().data())); + (static_cast(out_o))[0] = + (static_cast(user_g))[0] * (1 - tanh * tanh); } }; @@ -854,16 +852,16 @@ class MklTanhGradOp : public MklReluGradOpBase { MklReluGradOp); TF_CALL_float(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES); -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML // register dnn kernels for supported operations and supported types -#define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type) \ - REGISTER_KERNEL_BUILDER(Name("_MklElu") \ +#define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type) \ + REGISTER_KERNEL_BUILDER(Name("_MklElu") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .Label(mkl_op_registry::kMklOpLabel), \ - MklEluOp); \ - REGISTER_KERNEL_BUILDER(Name("_MklEluGrad") \ + MklEluOp); \ + REGISTER_KERNEL_BUILDER(Name("_MklEluGrad") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .Label(mkl_op_registry::kMklOpLabel), \ @@ -888,4 +886,3 @@ TF_CALL_float(REGISTER_TANH_MKL_SUPPORTED_KERNELS_TYPES); } // namespace tensorflow #endif // INTEL_MKL - diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc index b41e529357b..5dbc4a2709e 100644 --- a/tensorflow/core/kernels/mkl_reshape_op.cc +++ b/tensorflow/core/kernels/mkl_reshape_op.cc @@ -28,7 +28,7 @@ limitations under the License. #include "mkl_dnn_types.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::stream; #endif @@ -40,7 +40,7 @@ class MklReshapeOp : public OpKernel { public: explicit MklReshapeOp(OpKernelConstruction* context) : OpKernel(context) {} -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML void Compute(OpKernelContext* context) override { const Tensor& input = MklGetInput(context, 0); const Tensor& sizes = MklGetInput(context, 1); @@ -166,9 +166,9 @@ class MklReshapeOp : public OpKernel { MklDnnShape mkl_shape_input; GetMklShape(context, kInputSlotIdx, &mkl_shape_input); bool input_in_mkl_format = mkl_shape_input.IsMklTensor(); - const int64 nelems = input_in_mkl_format ? - mkl_shape_input.GetTfShape().num_elements() - : input_tensor.NumElements(); + const int64 nelems = input_in_mkl_format + ? mkl_shape_input.GetTfShape().num_elements() + : input_tensor.NumElements(); // Preliminary validation of sizes. 
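The Compute_Scalar fallbacks in the mkl_relu_op.cc hunks above apply the activation gradients one element at a time; restated as plain float functions for illustration (the kernels are templated on T and read and write the raw Tensor buffers):

#include <cmath>

// gradient of relu(x): pass the incoming gradient through only where x > 0.
inline float ReluGradScalar(float x, float grad) {
  return x > 0.0f ? grad : 0.0f;
}

// gradient of elu(x): 1 for x > 0, elu(x) + 1 = exp(x) otherwise.
inline float EluGradScalar(float x, float grad) {
  return x > 0.0f ? grad : grad * std::exp(x);
}

// gradient of tanh(x): 1 - tanh(x)^2, with tanh written as in the kernel.
inline float TanhGradScalar(float x, float grad) {
  const float e1 = std::exp(x);
  const float e2 = std::exp(-x);
  const float t = (e1 - e2) / (e1 + e2);
  return grad * (1.0f - t * t);
}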
OP_REQUIRES(context, IsLegacyVector(sizes.shape()), @@ -210,11 +210,11 @@ class MklReshapeOp : public OpKernel { product)); shape.set_dim(unknown_index, missing); } - OP_REQUIRES(context, shape.num_elements() == nelems, - errors::InvalidArgument("Input to reshape is a tensor with ", - nelems, - " values, but the requested shape has ", - shape.num_elements())); + OP_REQUIRES( + context, shape.num_elements() == nelems, + errors::InvalidArgument("Input to reshape is a tensor with ", nelems, + " values, but the requested shape has ", + shape.num_elements())); if (input_in_mkl_format) { TensorShape& shape_to = shape; @@ -237,38 +237,38 @@ class MklReshapeOp : public OpKernel { // need to update MklDnnShape object associated with the input // tensor to reflect the shape change expected by reshape. if (!SkipReorder(mkl_shape_input, shape_to)) { - // If dimensions that are being expanded or collapsed are not - // maintained contiguously by MKLDNN, then we use reorder. + // If dimensions that are being expanded or collapsed are not + // maintained contiguously by MKLDNN, then we use reorder. - // Get Mkl layout of input tensor. - auto input_mkl_md = mkl_shape_input.GetMklLayout(); - // Set input Mkl layout as the user layout. - dnn_data_input.SetUsrMem(input_mkl_md, &input_tensor); - // Get expected Tensorflow layout of input tensor. - auto output_tf_md = mkl_shape_input.GetTfLayout(); - auto output_tf_pd = memory::primitive_desc(output_tf_md, - cpu_engine); + // Get Mkl layout of input tensor. + auto input_mkl_md = mkl_shape_input.GetMklLayout(); + // Set input Mkl layout as the user layout. + dnn_data_input.SetUsrMem(input_mkl_md, &input_tensor); + // Get expected Tensorflow layout of input tensor. + auto output_tf_md = mkl_shape_input.GetTfLayout(); + auto output_tf_pd = + memory::primitive_desc(output_tf_md, cpu_engine); - Tensor* output_tensor = nullptr; - MklShape mkl_shape_output; - mkl_shape_output.SetMklTensor(false); - // We allocate output tensor in the shape expected by Reshape. - AllocateOutputSetMklShape(context, kOutputSlotIdx, &output_tensor, - shape_to, mkl_shape_output); + Tensor* output_tensor = nullptr; + MklShape mkl_shape_output; + mkl_shape_output.SetMklTensor(false); + // We allocate output tensor in the shape expected by Reshape. + AllocateOutputSetMklShape(context, kOutputSlotIdx, &output_tensor, + shape_to, mkl_shape_output); - // Insert reorder between Mkl layout and TensorFlow layout if - // needed. If reorder is not needed but reshape is needed (since - // shape_from != shape_to), then we just copy input tensor to - // output tensor with target shape (we cannot forward Mkl layout - // in such case because shape has changed.) - std::vector net; - if (dnn_data_input.CheckReorderToOpMem(output_tf_pd, - output_tensor, &net)) { - stream(stream::kind::eager).submit(net).wait(); - } else { - output_tensor->CopyFrom(input_tensor, shape_to); - } - return; + // Insert reorder between Mkl layout and TensorFlow layout if + // needed. If reorder is not needed but reshape is needed (since + // shape_from != shape_to), then we just copy input tensor to + // output tensor with target shape (we cannot forward Mkl layout + // in such case because shape has changed.) 
+ std::vector net; + if (dnn_data_input.CheckReorderToOpMem(output_tf_pd, output_tensor, + &net)) { + stream(stream::kind::eager).submit(net).wait(); + } else { + output_tensor->CopyFrom(input_tensor, shape_to); + } + return; } else { // If dimensions that are being expanded or collapsed are // maintained contiguously by MKLDNN, then we skip reorder, just @@ -276,10 +276,10 @@ class MklReshapeOp : public OpKernel { // Tensorflow tensor as it is to the output. auto output_dims = TFShapeToMklDnnDims(shape_to); auto output_strides = CalculateTFStrides(output_dims); - auto output_tf_md = MklDnnData::CreateBlockedMemDesc(output_dims, - output_strides); - auto output_tf_pd = memory::primitive_desc(output_tf_md, - cpu_engine); + auto output_tf_md = MklDnnData::CreateBlockedMemDesc( + output_dims, output_strides); + auto output_tf_pd = + memory::primitive_desc(output_tf_md, cpu_engine); // Set MklDnnShape MklDnnShape mkl_shape_output; @@ -291,18 +291,17 @@ class MklReshapeOp : public OpKernel { // We now simply forward input Mkl tensor to output and change its // output MklDnnShape object. - ForwardMklTensorInToOutWithMklShape(context, kInputSlotIdx, - kOutputSlotIdx, mkl_shape_output); + ForwardMklTensorInToOutWithMklShape( + context, kInputSlotIdx, kOutputSlotIdx, mkl_shape_output); return; } - } catch (mkldnn::error &e) { + } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } } else { @@ -313,7 +312,7 @@ class MklReshapeOp : public OpKernel { } } -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML private: const int kInputSlotIdx = 0; diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc index c46eabdde10..aceef1e234e 100644 --- a/tensorflow/core/kernels/mkl_softmax_op.cc +++ b/tensorflow/core/kernels/mkl_softmax_op.cc @@ -15,7 +15,7 @@ limitations under the License. // See docs in ../ops/nn_ops.cc. #ifdef INTEL_MKL -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/numeric_op.h" @@ -156,5 +156,5 @@ TF_CALL_float(REGISTER_SOFTMAX_MKL_SUPPORTED_KERNELS_TYPES); } // namespace tensorflow -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML #endif // INTEL_MKL diff --git a/tensorflow/core/kernels/mkl_tfconv_op.h b/tensorflow/core/kernels/mkl_tfconv_op.h index c4d5a45d3ca..5fafa14b5db 100644 --- a/tensorflow/core/kernels/mkl_tfconv_op.h +++ b/tensorflow/core/kernels/mkl_tfconv_op.h @@ -35,7 +35,7 @@ limitations under the License. 
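In the no-reorder branch of MklReshapeOp above, CalculateTFStrides only derives dense row-major strides for the target shape before a blocked memory descriptor is built. A sketch of that stride computation over plain std::vector (the real helper works on mkldnn memory::dims; RowMajorStrides is a hypothetical name):

#include <cstdint>
#include <vector>

// Row-major strides: the innermost dimension has stride 1; each outer stride
// is the product of all inner dimension sizes.
inline std::vector<int64_t> RowMajorStrides(const std::vector<int64_t>& dims) {
  std::vector<int64_t> strides(dims.size(), 1);
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * dims[i + 1];
  }
  return strides;
}
// Example: dims {2, 3, 4} gives strides {12, 4, 1}.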
#include "mkl_dnn_types.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML using mkldnn::stream; #endif @@ -61,7 +61,7 @@ class MklToTfOp : public OpKernel { VLOG(1) << "MKLToTFConversion complete successfully."; } -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML static void ConvertMklToTf(OpKernel* op_kernel, OpKernelContext* context, string data_format_str, DataType op_data_type, bool has_avx512f, uint input_number) { diff --git a/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc b/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc index 17f2af550f2..0e820bbb620 100644 --- a/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc +++ b/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc @@ -71,10 +71,10 @@ class NeonDepthwiseConv2dNativeOp : public BinaryOp { filter.shape().DebugString())); const int32 in_depth = input.dim_size(3); - OP_REQUIRES( - context, in_depth == filter.dim_size(2), - errors::InvalidArgument("input and filter must have the same depth: ", - in_depth, " vs ", filter.dim_size(2))); + OP_REQUIRES(context, in_depth == filter.dim_size(2), + errors::InvalidArgument( + "input and filter must have the same depth: ", in_depth, + " vs ", filter.dim_size(2))); const int32 batch = input.dim_size(0); const int32 input_rows = input.dim_size(1); const int32 input_cols = input.dim_size(2); diff --git a/tensorflow/core/kernels/non_max_suppression_op.cc b/tensorflow/core/kernels/non_max_suppression_op.cc index 64bdef0008f..5d28b87e6bb 100644 --- a/tensorflow/core/kernels/non_max_suppression_op.cc +++ b/tensorflow/core/kernels/non_max_suppression_op.cc @@ -92,13 +92,11 @@ static inline bool IOUGreaterThanThreshold( return iou > iou_threshold; } -void DoNonMaxSuppressionOp(OpKernelContext* context, - const Tensor& boxes, - const Tensor& scores, - const Tensor& max_output_size, +void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& boxes, + const Tensor& scores, const Tensor& max_output_size, const float iou_threshold) { OP_REQUIRES(context, iou_threshold >= 0 && iou_threshold <= 1, - errors::InvalidArgument("iou_threshold must be in [0, 1]")); + errors::InvalidArgument("iou_threshold must be in [0, 1]")); int num_boxes = 0; ParseAndCheckBoxSizes(context, boxes, scores, &num_boxes); @@ -106,10 +104,8 @@ void DoNonMaxSuppressionOp(OpKernelContext* context, return; } - const int output_size = - std::min(max_output_size.scalar()(), num_boxes); - typename TTypes::ConstTensor boxes_data = - boxes.tensor(); + const int output_size = std::min(max_output_size.scalar()(), num_boxes); + typename TTypes::ConstTensor boxes_data = boxes.tensor(); std::vector scores_data(num_boxes); std::copy_n(scores.flat().data(), num_boxes, scores_data.begin()); @@ -181,8 +177,7 @@ template class NonMaxSuppressionV2Op : public OpKernel { public: explicit NonMaxSuppressionV2Op(OpKernelConstruction* context) - : OpKernel(context) { - } + : OpKernel(context) {} void Compute(OpKernelContext* context) override { // boxes: [num_boxes, 4] @@ -197,10 +192,9 @@ class NonMaxSuppressionV2Op : public OpKernel { max_output_size.shape().DebugString())); // iou_threshold: scalar const Tensor& iou_threshold = context->input(3); - OP_REQUIRES( - context, TensorShapeUtils::IsScalar(iou_threshold.shape()), - errors::InvalidArgument("iou_threshold must be 0-D, got shape ", - iou_threshold.shape().DebugString())); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()), + errors::InvalidArgument("iou_threshold must be 0-D, got shape ", + 
iou_threshold.shape().DebugString())); const float iou_threshold_val = iou_threshold.scalar()(); diff --git a/tensorflow/core/kernels/non_max_suppression_op_test.cc b/tensorflow/core/kernels/non_max_suppression_op_test.cc index fdbcf05b89d..67d9217b950 100644 --- a/tensorflow/core/kernels/non_max_suppression_op_test.cc +++ b/tensorflow/core/kernels/non_max_suppression_op_test.cc @@ -43,9 +43,10 @@ class NonMaxSuppressionOpTest : public OpsTestBase { TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClusters) { MakeOp(.5); - AddInputFromArray(TensorShape({6, 4}), - {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, - 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); + AddInputFromArray( + TensorShape({6, 4}), + {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, + 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); AddInputFromArray(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f}); AddInputFromArray(TensorShape({}), {3}); TF_ASSERT_OK(RunOpKernel()); @@ -58,7 +59,7 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClusters) { TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClustersFlippedCoordinates) { MakeOp(.5); AddInputFromArray(TensorShape({6, 4}), - {1, 1, 0, 0, 0, 0.1f, 1, 1.1f, 0, .9f, 1, -0.1f, + {1, 1, 0, 0, 0, 0.1f, 1, 1.1f, 0, .9f, 1, -0.1f, 0, 10, 1, 11, 1, 10.1f, 0, 11.1f, 1, 101, 0, 100}); AddInputFromArray(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f}); AddInputFromArray(TensorShape({}), {3}); @@ -71,9 +72,10 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClustersFlippedCoordinates) { TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostTwoBoxesFromThreeClusters) { MakeOp(.5); - AddInputFromArray(TensorShape({6, 4}), - {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, - 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); + AddInputFromArray( + TensorShape({6, 4}), + {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, + 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); AddInputFromArray(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f}); AddInputFromArray(TensorShape({}), {2}); TF_ASSERT_OK(RunOpKernel()); @@ -85,9 +87,10 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostTwoBoxesFromThreeClusters) { TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostThirtyBoxesFromThreeClusters) { MakeOp(.5); - AddInputFromArray(TensorShape({6, 4}), - {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, - 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); + AddInputFromArray( + TensorShape({6, 4}), + {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, + 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); AddInputFromArray(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f}); AddInputFromArray(TensorShape({}), {30}); TF_ASSERT_OK(RunOpKernel()); @@ -134,9 +137,10 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromTenIdenticalBoxes) { TEST_F(NonMaxSuppressionOpTest, TestInconsistentBoxAndScoreShapes) { MakeOp(.5); - AddInputFromArray(TensorShape({6, 4}), - {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, - 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); + AddInputFromArray( + TensorShape({6, 4}), + {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, + 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); AddInputFromArray(TensorShape({5}), {.9f, .75f, .6f, .95f, .5f}); AddInputFromArray(TensorShape({}), {30}); Status s = RunOpKernel(); diff --git a/tensorflow/core/kernels/nth_element_op.cc b/tensorflow/core/kernels/nth_element_op.cc index da825e408c2..7f12eb953a3 100644 --- a/tensorflow/core/kernels/nth_element_op.cc +++ b/tensorflow/core/kernels/nth_element_op.cc @@ -16,15 +16,15 @@ 
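Stepping back from the formatting changes in non_max_suppression_op.cc above: DoNonMaxSuppressionOp visits boxes in descending score order and keeps a box only while fewer than max_output_size boxes are selected and its IoU with every previously kept box does not exceed iou_threshold. A self-contained sketch of that greedy loop with standard containers (the kernel itself validates shapes and works on Tensors; GreedyNms and IoU below are illustrative names, not the op's code):

#include <algorithm>
#include <array>
#include <numeric>
#include <vector>

using Box = std::array<float, 4>;  // {y1, x1, y2, x2}, either corner order.

// Intersection-over-union of two axis-aligned boxes.
inline float IoU(const Box& a, const Box& b) {
  const float a_ymin = std::min(a[0], a[2]), a_ymax = std::max(a[0], a[2]);
  const float a_xmin = std::min(a[1], a[3]), a_xmax = std::max(a[1], a[3]);
  const float b_ymin = std::min(b[0], b[2]), b_ymax = std::max(b[0], b[2]);
  const float b_xmin = std::min(b[1], b[3]), b_xmax = std::max(b[1], b[3]);
  const float area_a = (a_ymax - a_ymin) * (a_xmax - a_xmin);
  const float area_b = (b_ymax - b_ymin) * (b_xmax - b_xmin);
  if (area_a <= 0.0f || area_b <= 0.0f) return 0.0f;
  const float ih =
      std::max(0.0f, std::min(a_ymax, b_ymax) - std::max(a_ymin, b_ymin));
  const float iw =
      std::max(0.0f, std::min(a_xmax, b_xmax) - std::max(a_xmin, b_xmin));
  const float inter = ih * iw;
  return inter / (area_a + area_b - inter);
}

// Greedy NMS: highest score first, reject anything overlapping a kept box by
// more than iou_threshold, stop after max_output boxes.
inline std::vector<int> GreedyNms(const std::vector<Box>& boxes,
                                  const std::vector<float>& scores,
                                  int max_output, float iou_threshold) {
  std::vector<int> order(boxes.size());
  std::iota(order.begin(), order.end(), 0);
  std::sort(order.begin(), order.end(),
            [&scores](int i, int j) { return scores[i] > scores[j]; });
  std::vector<int> keep;
  for (int idx : order) {
    if (static_cast<int>(keep.size()) >= max_output) break;
    bool overlaps = false;
    for (int k : keep) {
      if (IoU(boxes[idx], boxes[k]) > iou_threshold) {
        overlaps = true;
        break;
      }
    }
    if (!overlaps) keep.push_back(idx);
  }
  return keep;
}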
limitations under the License. // See docs in ../ops/nn_ops.cc. #include "tensorflow/core/kernels/nth_element_op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/util/work_sharder.h" -#include #include #include +#include +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/work_sharder.h" namespace tensorflow { @@ -54,8 +54,9 @@ class NthElementOp : public OpKernel { errors::InvalidArgument("Input must be >= 1-D, got shape ", input_in.shape().DebugString())); // The last dimension of input tensor must be greater than N. - OP_REQUIRES(context, input_in.dim_size(num_dims-1) > n, - errors::InvalidArgument("Input must have at least n+1 columns")); + OP_REQUIRES( + context, input_in.dim_size(num_dims - 1) > n, + errors::InvalidArgument("Input must have at least n+1 columns")); // std::nth_element only support the nth-smallest selection. if (reverse_) { @@ -64,7 +65,7 @@ class NthElementOp : public OpKernel { // Assume input_shape is [d1,d2,...dk], and output_shape is [d1,d2...dk-1]. TensorShape out_shape; - for (int i = 0; i < num_dims-1; ++i) { + for (int i = 0; i < num_dims - 1; ++i) { out_shape.AddDim(input_in.dim_size(i)); } Tensor* output_tensor = nullptr; @@ -83,32 +84,28 @@ namespace functor { template struct NthElementFunctor { - void operator() (OpKernelContext* context, - const Tensor& input_tensor, - Tensor& output_tensor, - int n, - bool reverse) { + void operator()(OpKernelContext* context, const Tensor& input_tensor, + Tensor& output_tensor, int n, bool reverse) { const T* input = input_tensor.flat().data(); T* output = output_tensor.flat().data(); // Assume input_shape is [d1,d2,...dk], and output_shape is [d1,d2...dk-1], // then num_rows = d1*d2...dk-1, last_dim = dk. const int num_rows = output_tensor.NumElements(); - const int last_dim = input_tensor.dim_size(input_tensor.dims()-1); + const int last_dim = input_tensor.dim_size(input_tensor.dims() - 1); // Allocate each row to different shard. - auto SubNthElement = [&, input, output, last_dim, n](int start, - int limit) { + auto SubNthElement = [&, input, output, last_dim, n](int start, int limit) { // std::nth_element would rearrange the array, so we need a new buffer. std::vector buf(last_dim); for (int b = start; b < limit; ++b) { // Copy from one row of elements to buffer const T* input_start = input + b * last_dim; - const T* input_end = input + (b+1) * last_dim; + const T* input_end = input + (b + 1) * last_dim; std::copy(input_start, input_end, buf.begin()); - std::nth_element(buf.begin(), buf.begin()+n, buf.end()); + std::nth_element(buf.begin(), buf.begin() + n, buf.end()); // The element placed in the nth position is exactly the element that // would occur in this position if the range was fully sorted. output[b] = buf[n]; @@ -116,9 +113,9 @@ struct NthElementFunctor { }; auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); - // The average time complexity of partition-based nth_element (BFPRT) is O(n), - // althought the worst time complexity could be O(n^2). - // Here, 20 is a empirical factor of cost_per_unit. 
+ // The average time complexity of partition-based nth_element (BFPRT) is + // O(n), although the worst time complexity could be O(n^2). Here, 20 is an + // empirical factor of cost_per_unit. Shard(worker_threads.num_threads, worker_threads.workers, num_rows, 20 * last_dim, SubNthElement); } @@ -126,7 +123,6 @@ struct NthElementFunctor { } // namespace functor - #define REGISTER_NTHOP(T) \ REGISTER_KERNEL_BUILDER( \ Name("NthElement").Device(DEVICE_CPU).TypeConstraint("T"), \ @@ -136,4 +132,3 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_NTHOP); #undef REGISTER_NTHOP } // end namespace tensorflow - diff --git a/tensorflow/core/kernels/nth_element_op.h b/tensorflow/core/kernels/nth_element_op.h index 11a6c996b09..e7d25daecc7 100644 --- a/tensorflow/core/kernels/nth_element_op.h +++ b/tensorflow/core/kernels/nth_element_op.h @@ -26,10 +26,8 @@ namespace functor { template struct NthElementFunctor { - void operator() (OpKernelContext* context, - const Tensor& input_tensor, - Tensor& output_tensor, - int n); + void operator()(OpKernelContext* context, const Tensor& input_tensor, + Tensor& output_tensor, int n); }; } // namespace functor diff --git a/tensorflow/core/kernels/one_hot_op_gpu.cu.cc b/tensorflow/core/kernels/one_hot_op_gpu.cu.cc index 49fd4bdebad..647515ae38a 100644 --- a/tensorflow/core/kernels/one_hot_op_gpu.cu.cc +++ b/tensorflow/core/kernels/one_hot_op_gpu.cu.cc @@ -19,16 +19,16 @@ limitations under the License. #define EIGEN_USE_GPU -#include "tensorflow/core/kernels/one_hot_op.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/one_hot_op.h" namespace tensorflow { typedef Eigen::GpuDevice GPUDevice; -#define DEFINE_GPU_SPEC_INDEX(T, TI) \ - template class generator::OneGenerator; \ +#define DEFINE_GPU_SPEC_INDEX(T, TI) \ + template class generator::OneGenerator; \ template struct functor::OneHot; #define DEFINE_GPU_SPEC(T) \ diff --git a/tensorflow/core/kernels/ops_util_test.cc b/tensorflow/core/kernels/ops_util_test.cc index 9d53882deef..13427d71ff6 100644 --- a/tensorflow/core/kernels/ops_util_test.cc +++ b/tensorflow/core/kernels/ops_util_test.cc @@ -218,7 +218,8 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_1_2) { // in_size = 3, ksize = 3, stride = 2, pad_size = 0 TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_0) { bcast_struct bcast[] = { - {{0, 3, 3, 2, 0}, {0, 3}}, {{1, 3, 3, 2, 0}, {2, 1}}, + {{0, 3, 3, 2, 0}, {0, 3}}, + {{1, 3, 3, 2, 0}, {2, 1}}, }; for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { VerifyBcastValues(bcast[i]); } } @@ -228,7 +229,8 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_0) { // in_size = 3, ksize = 3, stride = 2, pad_size = 1 TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_1) { bcast_struct bcast[] = { - {{0, 3, 3, 2, 1}, {0, 2}}, {{1, 3, 3, 2, 1}, {1, 2}}, + {{0, 3, 3, 2, 1}, {0, 2}}, + {{1, 3, 3, 2, 1}, {1, 2}}, }; for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { VerifyBcastValues(bcast[i]); } } @@ -258,7 +260,8 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_0) { // in_size = 3, ksize = 3, stride = 3, pad_size = 1 TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_1) { bcast_struct bcast[] = { - {{0, 3, 3, 3, 1}, {0, 2}}, {{1, 3, 3, 3, 1}, {2, 1}}, + {{0, 3, 3, 3, 1}, {0, 2}}, + {{1, 3, 3, 3, 1}, {2, 1}}, }; for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { VerifyBcastValues(bcast[i]); } } @@ -348,8 +351,8 @@ TEST_F(OpsUtilTest, Misaligned1DSlice) { TEST_F(OpsUtilTest, Aligned2DSliceOfDim0) { #if EIGEN_MAX_ALIGN_BYTES == 0 - // When EIGEN_MAX_ALIGN_BYTES is 0 
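The sharded loop in NthElementFunctor above reduces each row independently; stripped of the Shard() threading helper it is essentially the following (float only; for reverse_ the op first converts n to the equivalent smallest-index selection, which is not shown in this hunk):

#include <algorithm>
#include <vector>

// For each row of length last_dim, write the n-th smallest element of that
// row to output[row]. std::nth_element rearranges its buffer, so every row is
// first copied into scratch space, exactly as the kernel does.
inline void NthElementPerRow(const float* input, float* output, int num_rows,
                             int last_dim, int n) {
  std::vector<float> buf(last_dim);
  for (int b = 0; b < num_rows; ++b) {
    const float* row = input + b * last_dim;
    std::copy(row, row + last_dim, buf.begin());
    std::nth_element(buf.begin(), buf.begin() + n, buf.end());
    output[b] = buf[n];  // the element a full sort would place at index n
  }
}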
and the size of the first dimension is nonzero, - // a multidimensional tensor is always aligned. + // When EIGEN_MAX_ALIGN_BYTES is 0 and the size of the first dimension is + // nonzero, a multidimensional tensor is always aligned. Tensor t(DT_FLOAT, TensorShape({3, 4})); int64 start = 1; int64 end = 2; diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc index 2033fbf5dc3..5645275cfa9 100644 --- a/tensorflow/core/kernels/pack_op.cc +++ b/tensorflow/core/kernels/pack_op.cc @@ -36,7 +36,7 @@ typedef Eigen::GpuDevice GPUDevice; #endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // -------------------------------------------------------------------------- template @@ -123,7 +123,7 @@ class PackOp : public OpKernel { ConcatSYCL(c->eigen_sycl_device(), inputs_flat, &output_flat); return; } -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL ConcatCPU(c->device(), inputs_flat, &output_flat); } } @@ -139,7 +139,6 @@ class PackOp : public OpKernel { TF_CALL_ALL_TYPES(REGISTER_PACK); TF_CALL_QUANTIZED_TYPES(REGISTER_PACK); -TF_CALL_variant(REGISTER_PACK); #if defined(IS_MOBILE_PLATFORM) && !defined(SUPPORT_SELECTIVE_REGISTRATION) // Primarily used for SavedModel support on mobile. diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc index b232ba16a76..0ab9ff9f650 100644 --- a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc +++ b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc @@ -95,9 +95,10 @@ struct TruncatedNormalFunctor { int64 sample = b * samples_per_batch; // On GPU, this check will just fill samples with NAN if it fails. - OP_REQUIRES(ctx, stddev > T(0) && minval < maxval && - (Eigen::numext::isfinite(minval) || - Eigen::numext::isfinite(maxval)), + OP_REQUIRES(ctx, + stddev > T(0) && minval < maxval && + (Eigen::numext::isfinite(minval) || + Eigen::numext::isfinite(maxval)), errors::InvalidArgument("Invalid parameters")); int numIterations = 0; @@ -118,8 +119,9 @@ struct TruncatedNormalFunctor { // Determine the method to use. const T sqrtFactor = Eigen::numext::sqrt((normMin * normMin) + T(4)); const T cutoff = - T(2) * Eigen::numext::exp( - T(0.5) + (normMin * (normMin - sqrtFactor)) / T(4)) / + T(2) * + Eigen::numext::exp(T(0.5) + + (normMin * (normMin - sqrtFactor)) / T(4)) / (normMin + sqrtFactor); const T diff = normMax - normMin; if (diff < cutoff) { @@ -309,30 +311,34 @@ class ParameterizedTruncatedNormalOp : public OpKernel { } else { // Parameters must be broadcastable to the shape [num_batches]. 
OP_REQUIRES( - ctx, TensorShapeUtils::IsScalar(means_tensor.shape()) || - means_tensor.dim_size(0) == 1 || - means_tensor.dim_size(0) == num_batches, + ctx, + TensorShapeUtils::IsScalar(means_tensor.shape()) || + means_tensor.dim_size(0) == 1 || + means_tensor.dim_size(0) == num_batches, errors::InvalidArgument( "Input means should have length 1 or shape[0], got shape: ", means_tensor.shape().DebugString())); OP_REQUIRES( - ctx, TensorShapeUtils::IsScalar(stddevs_tensor.shape()) || - stddevs_tensor.dim_size(0) == 1 || - stddevs_tensor.dim_size(0) == num_batches, + ctx, + TensorShapeUtils::IsScalar(stddevs_tensor.shape()) || + stddevs_tensor.dim_size(0) == 1 || + stddevs_tensor.dim_size(0) == num_batches, errors::InvalidArgument( "Input stddevs should have length 1 or shape[0], got shape: ", stddevs_tensor.shape().DebugString())); OP_REQUIRES( - ctx, TensorShapeUtils::IsScalar(minvals_tensor.shape()) || - minvals_tensor.dim_size(0) == 1 || - minvals_tensor.dim_size(0) == num_batches, + ctx, + TensorShapeUtils::IsScalar(minvals_tensor.shape()) || + minvals_tensor.dim_size(0) == 1 || + minvals_tensor.dim_size(0) == num_batches, errors::InvalidArgument( "Input minvals should have length 1 or shape[0], got shape: ", minvals_tensor.shape().DebugString())); OP_REQUIRES( - ctx, TensorShapeUtils::IsScalar(maxvals_tensor.shape()) || - maxvals_tensor.dim_size(0) == 1 || - maxvals_tensor.dim_size(0) == num_batches, + ctx, + TensorShapeUtils::IsScalar(maxvals_tensor.shape()) || + maxvals_tensor.dim_size(0) == 1 || + maxvals_tensor.dim_size(0) == num_batches, errors::InvalidArgument( "Input maxvals should have length 1 or shape[0], got shape: ", maxvals_tensor.shape().DebugString())); diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc index 933de65c15a..ddfeb1bb790 100644 --- a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc +++ b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc @@ -202,12 +202,13 @@ struct TruncatedNormalFunctor { typename TTypes::Flat output) { const auto config = GetCudaLaunchConfig(num_elements, d); - TruncatedNormalKernel< - T><<>>( - gen, output.data(), num_batches, samples_per_batch, num_elements, - means.data(), means.dimension(0) == 1, stddevs.data(), - stddevs.dimension(0) == 1, minvals.data(), minvals.dimension(0) == 1, - maxvals.data(), maxvals.dimension(0) == 1, kMaxIterations); + TruncatedNormalKernel + <<>>( + gen, output.data(), num_batches, samples_per_batch, num_elements, + means.data(), means.dimension(0) == 1, stddevs.data(), + stddevs.dimension(0) == 1, minvals.data(), + minvals.dimension(0) == 1, maxvals.data(), + maxvals.dimension(0) == 1, kMaxIterations); }; }; diff --git a/tensorflow/core/kernels/parse_tensor_op.cc b/tensorflow/core/kernels/parse_tensor_op.cc index 6b599612ad7..8e175fe8d4b 100644 --- a/tensorflow/core/kernels/parse_tensor_op.cc +++ b/tensorflow/core/kernels/parse_tensor_op.cc @@ -22,7 +22,6 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/framework/register_types.h" namespace tensorflow { @@ -92,7 +91,6 @@ class SerializeTensorOp : public OpKernel { Name("SerializeTensor").Device(DEVICE_CPU).TypeConstraint("T"), \ SerializeTensorOp); TF_CALL_ALL_TYPES(REGISTER) -TF_CALL_variant(REGISTER) #undef REGISTER } // namespace tensorflow diff --git a/tensorflow/core/kernels/pooling_ops_3d.cc b/tensorflow/core/kernels/pooling_ops_3d.cc index a406317213f..01bcfede1e8 100644 --- a/tensorflow/core/kernels/pooling_ops_3d.cc +++ b/tensorflow/core/kernels/pooling_ops_3d.cc @@ -258,7 +258,7 @@ struct LaunchMaxPooling3dGradOp { Eigen::array bcast = {1, csize, rsize, psize, 1}; #else Eigen::IndexList, int, int, int, - Eigen::type2index<1> > + Eigen::type2index<1>> bcast; bcast.set(1, csize); bcast.set(2, rsize); @@ -431,7 +431,7 @@ struct LaunchAvgPooling3dGradOp { Eigen::array bcast = {1, csize, rsize, psize, 1}; #else Eigen::IndexList, int, int, int, - Eigen::type2index<1> > + Eigen::type2index<1>> bcast; bcast.set(1, csize); bcast.set(2, rsize); @@ -833,7 +833,7 @@ TF_CALL_float(REGISTER_GPU_KERNELS) TF_CALL_half(REGISTER_GPU_KERNELS) #ifdef TENSORFLOW_USE_SYCL #define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T) -TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS) + TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS) #undef REGISTER_SYCL_KERNELS #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/pooling_ops_3d_sycl.h b/tensorflow/core/kernels/pooling_ops_3d_sycl.h index c1bc5af4986..b4bead2456d 100644 --- a/tensorflow/core/kernels/pooling_ops_3d_sycl.h +++ b/tensorflow/core/kernels/pooling_ops_3d_sycl.h @@ -281,12 +281,11 @@ class MaxPool3DGradSYCL { const T* input_data_n = input_data + n * p_.in_planes_ * p_.in_cols_ * p_.in_rows_ * p_.depth_; - const T* output_data_n = - output_data + - n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_; - const T* input_backprop_n = - input_backprop + - n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_; + const T* output_data_n = output_data + n * p_.out_planes_ * p_.out_cols_ * + p_.out_rows_ * p_.depth_; + const T* input_backprop_n = input_backprop + n * p_.out_planes_ * + p_.out_cols_ * + p_.out_rows_ * p_.depth_; for (int poolp = poolpstart; poolp < poolpend; ++poolp) { int pstart = poolp * p_.stride_planes_ - p_.pad_planes_; const int pend = std::min(pstart + p_.window_planes_, p_.in_planes_); @@ -678,9 +677,9 @@ class AvgPool3DGradSYCL { n /= p_.in_planes_; T gradient = T(0); - const T* input_backprop_n = - input_backprop + - n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_; + const T* input_backprop_n = input_backprop + n * p_.out_planes_ * + p_.out_cols_ * + p_.out_rows_ * p_.depth_; for (int poolp = poolpstart; poolp < poolpend; ++poolp) { int pstart = poolp * p_.stride_planes_ - p_.pad_planes_; const int pend = std::min(pstart + p_.window_planes_, p_.in_planes_); diff --git a/tensorflow/core/kernels/pooling_ops_common.h b/tensorflow/core/kernels/pooling_ops_common.h index e3131b804f2..fc7cb437b8f 100644 --- a/tensorflow/core/kernels/pooling_ops_common.h +++ b/tensorflow/core/kernels/pooling_ops_common.h @@ -195,7 +195,6 @@ class MaxPoolingOp : public OpKernel { // and updates the corresponding column(s) in output_as_matrix with the // max value. 
auto shard = [¶ms, &in_mat, &out_mat](int64 start, int64 limit) { - const int32 in_rows = params.tensor_in_rows; const int32 in_cols = params.tensor_in_cols; const int32 pad_rows = params.pad_rows; @@ -443,7 +442,6 @@ class MaxPoolingV2Op : public OpKernel { // and updates the corresponding column(s) in output_as_matrix with the // max value. auto shard = [¶ms, &in_mat, &out_mat](int64 start, int64 limit) { - const int32 in_rows = params.tensor_in_rows; const int32 in_cols = params.tensor_in_cols; const int32 pad_rows = params.pad_rows; diff --git a/tensorflow/core/kernels/quantization_utils_test.cc b/tensorflow/core/kernels/quantization_utils_test.cc index d148c9f78d6..176720c22cc 100644 --- a/tensorflow/core/kernels/quantization_utils_test.cc +++ b/tensorflow/core/kernels/quantization_utils_test.cc @@ -385,8 +385,12 @@ void TestQuantizedToFloatInPlaceUsingEigen( // These are the float values we're going to test the conversions on. typedef std::pair FPair; for (FPair min_and_max : std::vector{ - FPair(-255.0f, 255.0f), FPair(-1.0f, 1.0f), FPair(-1.0f, 255.0f), - FPair(0.0f, 1e6), FPair(0.0f, 1.0f), FPair(-31.0f, 13.0f), + FPair(-255.0f, 255.0f), + FPair(-1.0f, 1.0f), + FPair(-1.0f, 255.0f), + FPair(0.0f, 1e6), + FPair(0.0f, 1.0f), + FPair(-31.0f, 13.0f), FPair(-5.89505e+08, 5.89505e+08), }) { const float f_min = min_and_max.first; diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op.h b/tensorflow/core/kernels/quantize_and_dequantize_op.h index 1363c7e325b..3b09ea2527d 100644 --- a/tensorflow/core/kernels/quantize_and_dequantize_op.h +++ b/tensorflow/core/kernels/quantize_and_dequantize_op.h @@ -71,7 +71,8 @@ struct QuantizeAndDequantizeOneScaleImpl { out.device(d) = ((input.cwiseMin(max_range).cwiseMax(min_range) - min_range) * scale + - T(0.5)).floor() * + T(0.5)) + .floor() * inverse_scale + min_range; } else { diff --git a/tensorflow/core/kernels/quantize_op_test.cc b/tensorflow/core/kernels/quantize_op_test.cc index d2cc55a94dd..57982bdf76e 100644 --- a/tensorflow/core/kernels/quantize_op_test.cc +++ b/tensorflow/core/kernels/quantize_op_test.cc @@ -250,7 +250,8 @@ TEST_F(QuantizedOpTest, QuantizeV2_32Bit) { Tensor expected(allocator(), DT_QINT32, TensorShape({element_count})); test::FillValues(&expected, { - std::numeric_limits::min(), 0, + std::numeric_limits::min(), + 0, static_cast(1.0f * (1 << 23)), static_cast(1.25f * (1 << 23)), static_cast(1.75f * (1 << 23)), diff --git a/tensorflow/core/kernels/quantized_batch_norm_op.cc b/tensorflow/core/kernels/quantized_batch_norm_op.cc index 18d83b41494..b03da7ad17f 100644 --- a/tensorflow/core/kernels/quantized_batch_norm_op.cc +++ b/tensorflow/core/kernels/quantized_batch_norm_op.cc @@ -16,11 +16,11 @@ limitations under the License. 
#define EIGEN_USE_THREADS #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "tensorflow/core/kernels/quantization_utils.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/kernels/quantization_utils.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/quantized_concat_op.cc b/tensorflow/core/kernels/quantized_concat_op.cc index d67f1ab3ec2..b03ac8e87da 100644 --- a/tensorflow/core/kernels/quantized_concat_op.cc +++ b/tensorflow/core/kernels/quantized_concat_op.cc @@ -135,8 +135,8 @@ class QuantizedConcatOp : public OpKernel { context, in.dims() == input_dims || (input_is_scalar && in_is_scalar), errors::InvalidArgument( "ConcatOp : Ranks of all input tensors should match: shape[0] = ", - input_shape.DebugString(), " vs. shape[", i, "] = ", - in.shape().DebugString())); + input_shape.DebugString(), " vs. shape[", i, + "] = ", in.shape().DebugString())); for (int j = 0; j < input_dims; ++j) { if (j == concat_dim) { continue; @@ -145,8 +145,8 @@ class QuantizedConcatOp : public OpKernel { context, in.dim_size(j) == input_shape.dim_size(j), errors::InvalidArgument( "ConcatOp : Dimensions of inputs should match: shape[0] = ", - input_shape.DebugString(), " vs. shape[", i, "] = ", - in.shape().DebugString())); + input_shape.DebugString(), " vs. shape[", i, + "] = ", in.shape().DebugString())); } if (in.NumElements() > 0) { int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0; diff --git a/tensorflow/core/kernels/quantized_conv_ops.cc b/tensorflow/core/kernels/quantized_conv_ops.cc index 1921b83d12c..5b3570edff5 100644 --- a/tensorflow/core/kernels/quantized_conv_ops.cc +++ b/tensorflow/core/kernels/quantized_conv_ops.cc @@ -278,10 +278,9 @@ class Im2ColConvFunctor { *resource = new Im2ColBufferResource(); return Status::OK(); }; - OP_REQUIRES_OK( - context, - context->resource_manager()->LookupOrCreate( - "Conv2d", "im2col_buffer", &im2col_buffer_resource, creator)); + OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate( + "Conv2d", "im2col_buffer", + &im2col_buffer_resource, creator)); // This means that multiple ops can't be run simultaneously on different // threads, because we have a single shared resource. 
The platforms this is // aimed at have intra-op parallelism as their focus though, so it shouldn't diff --git a/tensorflow/core/kernels/quantized_instance_norm.cc b/tensorflow/core/kernels/quantized_instance_norm.cc index c29f534f31b..d62094cc9fa 100644 --- a/tensorflow/core/kernels/quantized_instance_norm.cc +++ b/tensorflow/core/kernels/quantized_instance_norm.cc @@ -278,10 +278,10 @@ class QuantizedInstanceNorm : public OpKernel { float input_max = context->input(2).flat()(0); float input_scale = (input_max - input_min) / 255.0f; - OP_REQUIRES( - context, input_min < input_max, - errors::InvalidArgument("input_min must be less than input_max : ", - input_min, " >= ", input_max)); + OP_REQUIRES(context, input_min < input_max, + errors::InvalidArgument( + "input_min must be less than input_max : ", input_min, + " >= ", input_max)); auto input_tensor = input.tensor(); auto N = input_tensor.dimension(0); diff --git a/tensorflow/core/kernels/quantized_matmul_op.cc b/tensorflow/core/kernels/quantized_matmul_op.cc index afb30d5f627..da8c46dc516 100644 --- a/tensorflow/core/kernels/quantized_matmul_op.cc +++ b/tensorflow/core/kernels/quantized_matmul_op.cc @@ -104,9 +104,9 @@ class QuantizedMatMulOp : public OpKernel { OP_REQUIRES(context, a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second), - errors::InvalidArgument("Matrix size-compatible: In[0]: ", - a.shape().DebugString(), ", In[1]: ", - b.shape().DebugString())); + errors::InvalidArgument( + "Matrix size-compatible: In[0]: ", a.shape().DebugString(), + ", In[1]: ", b.shape().DebugString())); OP_REQUIRES(context, ((shift_c >= 0) && (shift_c <= 31)), errors::InvalidArgument("shift_c must be between 0 and 31, " diff --git a/tensorflow/core/kernels/quantized_matmul_op_test.cc b/tensorflow/core/kernels/quantized_matmul_op_test.cc index 535b5115c34..c9f05dbc10b 100644 --- a/tensorflow/core/kernels/quantized_matmul_op_test.cc +++ b/tensorflow/core/kernels/quantized_matmul_op_test.cc @@ -206,17 +206,32 @@ TEST_F(QuantizedMatMulTest, Small_WithParams) { // We have set the transpose_a flag to true, so the matrix is transposed, and // for filling the values the in-memory storage order is effectively // column major, rather than the default row-major. 
- AddInputFromArray(TensorShape({a_rows, a_cols}), - { - 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - }); + AddInputFromArray(TensorShape({a_rows, a_cols}), { + 11, + 10, + 9, + 8, + 7, + 6, + 5, + 4, + 3, + 2, + 1, + 0, + }); // The B matrix is: // | 1 | 4| // | 2 | 5| // | 3 | 6| AddInputFromArray(TensorShape({b_rows, b_cols}), { - 1, 4, 2, 5, 3, 6, + 1, + 4, + 2, + 5, + 3, + 6, }); AddInputFromArray(TensorShape({1}), {-12.0f}); AddInputFromArray(TensorShape({1}), {243.0f}); @@ -238,10 +253,16 @@ TEST_F(QuantizedMatMulTest, Small_WithParams) { // | -50 | -113 | // | -56 | -128 | Tensor expected(allocator(), DT_QINT32, TensorShape({a_cols, b_cols})); - test::FillValues(&expected, - { - -38, -83, -44, -98, -50, -113, -56, -128, - }); + test::FillValues(&expected, { + -38, + -83, + -44, + -98, + -50, + -113, + -56, + -128, + }); test::ExpectTensorEqual(expected, *GetOutput(0)); } diff --git a/tensorflow/core/kernels/quantized_mul_op.cc b/tensorflow/core/kernels/quantized_mul_op.cc index eaa5e667f7d..3c7536e0373 100644 --- a/tensorflow/core/kernels/quantized_mul_op.cc +++ b/tensorflow/core/kernels/quantized_mul_op.cc @@ -298,9 +298,8 @@ class QuantizedMulOp : public OpKernel { return; } Tensor* z; - OP_REQUIRES_OK( - context, - context->allocate_output(0, BCast::ToShape(bcast.output_shape()), &z)); + OP_REQUIRES_OK(context, context->allocate_output( + 0, BCast::ToShape(bcast.output_shape()), &z)); // Make sure that we have valid quantization ranges for the input buffers. // If the difference between the min and max is negative or zero, it makes diff --git a/tensorflow/core/kernels/quantized_mul_op_test.cc b/tensorflow/core/kernels/quantized_mul_op_test.cc index b0550c8260c..a4e407c7a94 100644 --- a/tensorflow/core/kernels/quantized_mul_op_test.cc +++ b/tensorflow/core/kernels/quantized_mul_op_test.cc @@ -188,11 +188,12 @@ void TestManualScalar() { 10.0f, {1}, {10.0f}, -100.0f, 100.0f, {10}, {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f}, 3.0f); - TestMul({1}, {10.0f}, -100.0f, 100.0f, {10}, - {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f}, 0.0f, - 10.0f, {10}, {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, - 90.0f, 100.0f}, - 3.0f); + TestMul( + {1}, {10.0f}, -100.0f, 100.0f, {10}, + {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f}, 0.0f, + 10.0f, {10}, + {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f}, + 3.0f); } void TestScalar() { diff --git a/tensorflow/core/kernels/queue_base.cc b/tensorflow/core/kernels/queue_base.cc index 330d161c32b..de495c19cba 100644 --- a/tensorflow/core/kernels/queue_base.cc +++ b/tensorflow/core/kernels/queue_base.cc @@ -39,8 +39,8 @@ Status HandleSliceToElement(const Tensor& parent, Tensor* element, return errors::Internal( "HandleSliceToElement Cannot copy slice: number of elements does not " "match. 
Shapes are: [element]: ", - element->shape().DebugString(), ", [parent slice]: ", - chip_shape.DebugString()); + element->shape().DebugString(), + ", [parent slice]: ", chip_shape.DebugString()); } auto parent_as_matrix = parent.flat_outer_dims(); element->flat() = parent_as_matrix.chip(index, 0); diff --git a/tensorflow/core/kernels/queue_ops.cc b/tensorflow/core/kernels/queue_ops.cc index 17831b74370..46a02854d73 100644 --- a/tensorflow/core/kernels/queue_ops.cc +++ b/tensorflow/core/kernels/queue_ops.cc @@ -428,13 +428,14 @@ REGISTER_KERNEL_BUILDER(Name("QueueSizeV2").Device(DEVICE_CPU), QueueSizeOp); class QueueIsClosedOp : public QueueOpKernel { public: explicit QueueIsClosedOp(OpKernelConstruction* context) - : QueueOpKernel(context) {} + : QueueOpKernel(context) {} protected: void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue, DoneCallback callback) override { Tensor* Tqueue_is_closed = nullptr; - OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &Tqueue_is_closed)); + OP_REQUIRES_OK(ctx, + ctx->allocate_output(0, TensorShape({}), &Tqueue_is_closed)); Tqueue_is_closed->flat().setConstant(queue->is_closed()); callback(); } @@ -443,8 +444,10 @@ class QueueIsClosedOp : public QueueOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(QueueIsClosedOp); }; -REGISTER_KERNEL_BUILDER(Name("QueueIsClosed").Device(DEVICE_CPU), QueueIsClosedOp); -REGISTER_KERNEL_BUILDER(Name("QueueIsClosedV2").Device(DEVICE_CPU), QueueIsClosedOp); +REGISTER_KERNEL_BUILDER(Name("QueueIsClosed").Device(DEVICE_CPU), + QueueIsClosedOp); +REGISTER_KERNEL_BUILDER(Name("QueueIsClosedV2").Device(DEVICE_CPU), + QueueIsClosedOp); class FakeQueueOp : public OpKernel { public: diff --git a/tensorflow/core/kernels/random_crop_op.cc b/tensorflow/core/kernels/random_crop_op.cc index ba94d6be5ca..554909760aa 100644 --- a/tensorflow/core/kernels/random_crop_op.cc +++ b/tensorflow/core/kernels/random_crop_op.cc @@ -68,10 +68,10 @@ class RandomCropOp : public OpKernel { // Edge case. The target dimensions are larger then the image, so // zero-pad the image. This guarantees that the image will *always* // be [target_height, target_width] in size. 
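The hunk that follows only re-wraps the precondition messages; the contract they enforce is that a random crop needs width >= target_width and height >= target_height, after which the crop origin can be drawn uniformly from the remaining margin. A minimal sketch of that offset choice, assuming a std::mt19937 generator rather than the kernel's own RNG:

#include <cstdint>
#include <random>

int main() {
  std::mt19937 rng(42);
  const int64_t width = 640, target_width = 512;  // hypothetical sizes
  std::uniform_int_distribution<int64_t> dist(0, width - target_width);
  const int64_t offset_x = dist(rng);  // left edge of the crop window, in [0, 128]
  (void)offset_x;
  return 0;
}
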
- OP_REQUIRES( - context, width >= target_width, - errors::FailedPrecondition("width must be >= target_width: width = ", - width, ", target_width = ", target_width)); + OP_REQUIRES(context, width >= target_width, + errors::FailedPrecondition( + "width must be >= target_width: width = ", width, + ", target_width = ", target_width)); OP_REQUIRES(context, height >= target_height, errors::FailedPrecondition( "height must be >= target_height: height = ", height, diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc index 55a8b9c9b67..78ff7948fbf 100644 --- a/tensorflow/core/kernels/random_op.cc +++ b/tensorflow/core/kernels/random_op.cc @@ -50,7 +50,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL namespace functor { using random::PhiloxRandom; @@ -271,9 +271,10 @@ class RandomGammaOp : public OpKernel { const Tensor& shape_t = ctx->input(0); const Tensor& alpha_t = ctx->input(1); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(shape_t.shape()) && - (shape_t.dtype() == DataType::DT_INT32 || - shape_t.dtype() == DataType::DT_INT64), + OP_REQUIRES(ctx, + TensorShapeUtils::IsVector(shape_t.shape()) && + (shape_t.dtype() == DataType::DT_INT32 || + shape_t.dtype() == DataType::DT_INT64), errors::InvalidArgument( "shape must be a vector of {int32,int64}, got shape: ", shape_t.DebugString())); @@ -325,7 +326,7 @@ class RandomGammaOp : public OpKernel { // avoid a couple flops which can be done on a per-alpha basis. auto DoWork = [num_samples, num_alphas, &rng, samples_flat, alpha_flat]( - int start_output, int limit_output) { + int start_output, int limit_output) { using Eigen::numext::exp; using Eigen::numext::log; using Eigen::numext::pow; @@ -448,40 +449,40 @@ class RandomGammaOp : public OpKernel { } // namespace -#define REGISTER(TYPE) \ - template struct functor::FillPhiloxRandom< \ - CPUDevice, random::UniformDistribution >; \ - template struct functor::FillPhiloxRandom< \ - CPUDevice, random::NormalDistribution >; \ - template struct functor::FillPhiloxRandom< \ - CPUDevice, \ - random::TruncatedNormalDistribution< \ - random::SingleSampleAdapter, TYPE> >; \ - REGISTER_KERNEL_BUILDER( \ - Name("RandomUniform") \ - .Device(DEVICE_CPU) \ - .HostMemory("shape") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp >); \ - REGISTER_KERNEL_BUILDER( \ - Name("RandomStandardNormal") \ - .Device(DEVICE_CPU) \ - .HostMemory("shape") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp >); \ - REGISTER_KERNEL_BUILDER( \ - Name("TruncatedNormal") \ - .Device(DEVICE_CPU) \ - .HostMemory("shape") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp< \ - CPUDevice, \ - random::TruncatedNormalDistribution< \ - random::SingleSampleAdapter, TYPE> >); \ - REGISTER_KERNEL_BUILDER( \ - Name("RandomGamma").Device(DEVICE_CPU).TypeConstraint("T"), \ +#define REGISTER(TYPE) \ + template struct functor::FillPhiloxRandom< \ + CPUDevice, random::UniformDistribution>; \ + template struct functor::FillPhiloxRandom< \ + CPUDevice, random::NormalDistribution>; \ + template struct functor::FillPhiloxRandom< \ + CPUDevice, \ + random::TruncatedNormalDistribution< \ + random::SingleSampleAdapter, TYPE>>; \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomUniform") \ + .Device(DEVICE_CPU) \ + .HostMemory("shape") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp>); \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomStandardNormal") \ + .Device(DEVICE_CPU) \ + 
.HostMemory("shape") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp>); \ + REGISTER_KERNEL_BUILDER( \ + Name("TruncatedNormal") \ + .Device(DEVICE_CPU) \ + .HostMemory("shape") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp< \ + CPUDevice, \ + random::TruncatedNormalDistribution< \ + random::SingleSampleAdapter, TYPE>>); \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomGamma").Device(DEVICE_CPU).TypeConstraint("T"), \ RandomGammaOp) #define REGISTER_INT(IntType) \ @@ -504,33 +505,33 @@ TF_CALL_int64(REGISTER_INT); #if GOOGLE_CUDA -#define REGISTER(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("RandomUniform") \ - .Device(DEVICE_GPU) \ - .HostMemory("shape") \ - .TypeConstraint("T") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp >); \ - REGISTER_KERNEL_BUILDER( \ - Name("RandomStandardNormal") \ - .Device(DEVICE_GPU) \ - .HostMemory("shape") \ - .TypeConstraint("T") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp >); \ - REGISTER_KERNEL_BUILDER( \ - Name("TruncatedNormal") \ - .Device(DEVICE_GPU) \ - .HostMemory("shape") \ - .TypeConstraint("T") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp< \ - GPUDevice, \ - random::TruncatedNormalDistribution< \ - random::SingleSampleAdapter, TYPE> >); +#define REGISTER(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomUniform") \ + .Device(DEVICE_GPU) \ + .HostMemory("shape") \ + .TypeConstraint("T") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp>); \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomStandardNormal") \ + .Device(DEVICE_GPU) \ + .HostMemory("shape") \ + .TypeConstraint("T") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp>); \ + REGISTER_KERNEL_BUILDER( \ + Name("TruncatedNormal") \ + .Device(DEVICE_GPU) \ + .HostMemory("shape") \ + .TypeConstraint("T") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp< \ + GPUDevice, \ + random::TruncatedNormalDistribution< \ + random::SingleSampleAdapter, TYPE>>); #define REGISTER_INT(IntType) \ REGISTER_KERNEL_BUILDER(Name("RandomUniformInt") \ @@ -565,13 +566,12 @@ struct FillPhiloxRandomKernel; template struct FillPhiloxRandomKernel { typedef typename Distribution::ResultElementType T; - using write_accessor = sycl::accessor; + using write_accessor = sycl::accessor; - FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen, Distribution& dist) - : data_(data), - gen_(gen), - dist_(dist) { - } + FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen, + Distribution& dist) + : data_(data), gen_(gen), dist_(dist) {} void operator()(sycl::nd_item<1> item) { const size_t kGroupSize = Distribution::kResultElementCount; @@ -597,7 +597,7 @@ struct FillPhiloxRandomKernel { const typename Distribution::ResultType samples = dist_(&gen_); for (size_t i = 0; i < kGroupSize; ++i) { if (offset >= size) { - return; + return; } data[offset] = samples[i]; ++offset; @@ -610,17 +610,15 @@ struct FillPhiloxRandomKernel { Distribution dist_; }; - template struct FillPhiloxRandomKernel { typedef typename Distribution::ResultElementType T; - using write_accessor = sycl::accessor; + using write_accessor = sycl::accessor; - FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen, Distribution& dist) - : data_(data), - gen_(gen), - dist_(dist) { - } + FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen, + Distribution& dist) + : data_(data), gen_(gen), dist_(dist) {} void operator()(sycl::nd_item<1> item) { using random::PhiloxRandom; @@ -628,9 +626,9 @@ struct FillPhiloxRandomKernel { const size_t kReservedSamplesPerOutput = 256; const size_t kGroupSize = 
Distribution::kResultElementCount; - const size_t kGeneratorSkipPerOutputGroup = kGroupSize * - kReservedSamplesPerOutput / - PhiloxRandom::kResultElementCount; + const size_t kGeneratorSkipPerOutputGroup = + kGroupSize * kReservedSamplesPerOutput / + PhiloxRandom::kResultElementCount; const size_t item_id = item.get_global(0); const size_t total_item_count = item.get_global_range(); @@ -674,10 +672,9 @@ class FillRandomKernel; // It splits the work into several tasks and run them in parallel template void FillPhiloxRandom::operator()( - OpKernelContext* context, const SYCLDevice& device, random::PhiloxRandom gen, - typename Distribution::ResultElementType* data, int64 size, - Distribution dist) { - + OpKernelContext* context, const SYCLDevice& device, + random::PhiloxRandom gen, typename Distribution::ResultElementType* data, + int64 size, Distribution dist) { const size_t group_size = device.maxSyclThreadsPerBlock(); const size_t group_count = (size + group_size - 1) / group_size; @@ -686,50 +683,52 @@ void FillPhiloxRandom::operator()( device.sycl_queue().submit([&](sycl::handler& cgh) { auto access = buffer.template get_access(cgh); - FillPhiloxRandomKernel task(access, gen, dist); + FillPhiloxRandomKernel + task(access, gen, dist); cgh.parallel_for>( - sycl::nd_range<1>(sycl::range<1>(group_count * group_size), sycl::range<1>(group_size)), - task - ); + sycl::nd_range<1>(sycl::range<1>(group_count * group_size), + sycl::range<1>(group_size)), + task); }); } -} +} // namespace functor -#define REGISTER(TYPE) \ - template struct functor::FillPhiloxRandom< \ - SYCLDevice, random::UniformDistribution >; \ - REGISTER_KERNEL_BUILDER( \ - Name("RandomUniform") \ - .Device(DEVICE_SYCL) \ - .HostMemory("shape") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp >); \ - REGISTER_KERNEL_BUILDER( \ - Name("RandomStandardNormal") \ - .Device(DEVICE_SYCL) \ - .HostMemory("shape") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp >); \ - REGISTER_KERNEL_BUILDER( \ - Name("TruncatedNormal") \ - .Device(DEVICE_SYCL) \ - .HostMemory("shape") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp< \ - SYCLDevice, \ - random::TruncatedNormalDistribution< \ - random::SingleSampleAdapter, TYPE> >); +#define REGISTER(TYPE) \ + template struct functor::FillPhiloxRandom< \ + SYCLDevice, random::UniformDistribution>; \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomUniform") \ + .Device(DEVICE_SYCL) \ + .HostMemory("shape") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp>); \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomStandardNormal") \ + .Device(DEVICE_SYCL) \ + .HostMemory("shape") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp>); \ + REGISTER_KERNEL_BUILDER( \ + Name("TruncatedNormal") \ + .Device(DEVICE_SYCL) \ + .HostMemory("shape") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp< \ + SYCLDevice, \ + random::TruncatedNormalDistribution< \ + random::SingleSampleAdapter, TYPE>>); -#define REGISTER_INT(IntType) \ - REGISTER_KERNEL_BUILDER(Name("RandomUniformInt") \ - .Device(DEVICE_SYCL) \ - .HostMemory("shape") \ - .HostMemory("minval") \ - .HostMemory("maxval") \ - .TypeConstraint("Tout"), \ +#define REGISTER_INT(IntType) \ + REGISTER_KERNEL_BUILDER(Name("RandomUniformInt") \ + .Device(DEVICE_SYCL) \ + .HostMemory("shape") \ + .HostMemory("minval") \ + .HostMemory("maxval") \ + .TypeConstraint("Tout"), \ RandomUniformIntOp); TF_CALL_float(REGISTER); @@ -740,6 +739,6 @@ TF_CALL_int64(REGISTER_INT); #undef REGISTER #undef REGISTER_INT -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // end namespace 
tensorflow diff --git a/tensorflow/core/kernels/random_op_gpu.cu.cc b/tensorflow/core/kernels/random_op_gpu.cu.cc index 7afa6974c6a..3393b39faf4 100644 --- a/tensorflow/core/kernels/random_op_gpu.cu.cc +++ b/tensorflow/core/kernels/random_op_gpu.cu.cc @@ -222,9 +222,8 @@ void FillPhiloxRandom::operator()( (d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) / block_size; - FillPhiloxRandomKernelLaunch< - Distribution><<>>(gen, data, size, - dist); + FillPhiloxRandomKernelLaunch + <<>>(gen, data, size, dist); }; // Explicit instantiation of the GPU distributions functors diff --git a/tensorflow/core/kernels/random_poisson_op.cc b/tensorflow/core/kernels/random_poisson_op.cc index bf1d83ec751..64fb4a5c228 100644 --- a/tensorflow/core/kernels/random_poisson_op.cc +++ b/tensorflow/core/kernels/random_poisson_op.cc @@ -103,7 +103,7 @@ struct PoissonFunctor { typedef random::UniformDistribution Uniform; auto DoWork = [num_samples, num_rate, &rng, samples_flat, rate_flat]( - int start_output, int limit_output) { + int start_output, int limit_output) { // Capturing "rng" by value would only make a copy for the _shared_ // lambda. Since we want to let each worker have its own copy, we pass // "rng" by reference and explicitly do a copy assignment. diff --git a/tensorflow/core/kernels/random_shuffle_queue_op.cc b/tensorflow/core/kernels/random_shuffle_queue_op.cc index e9695cfde30..87fc9433316 100644 --- a/tensorflow/core/kernels/random_shuffle_queue_op.cc +++ b/tensorflow/core/kernels/random_shuffle_queue_op.cc @@ -334,96 +334,95 @@ void RandomShuffleQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx, // TODO(josh11b): This makes two copies of callback, avoid this if possible. dequeue_attempts_.emplace_back( num_elements, [callback]() { callback(Tuple()); }, ctx, cm, token, - [callback, allow_small_batch, this](Attempt* attempt) - EXCLUSIVE_LOCKS_REQUIRED(mu_) { - int32 queue_size = queues_[0].size(); - if (closed_ && queue_size < attempt->elements_requested) { - // If we don't have enough for a full dequeue, we have - // to reset the attempt tuple. - if (!attempt->tuple.empty()) { - // Restore already-dequeued elements to the queue. - for (int64 i = attempt->tuple[0].dim_size(0) - - attempt->elements_requested - 1; - i >= 0; --i) { - for (int j = 0; j < num_components(); ++j) { - PersistentTensor element; - Status s = GetElementComponentFromBatch( - attempt->tuple, i, j, attempt->context, &element); - if (!s.ok()) { - attempt->context->SetStatus( - errors::DataLoss("Failed to restore element from " - "partially-dequeued batch " - "to RandomShuffleQueue: ", - s.error_message())); - } - queues_[j].push_back(element); - } - } - } - if (allow_small_batch && !queues_[0].empty()) { - // Request all remaining elements in the queue. - queue_size = queues_[0].size(); - attempt->tuple.clear(); - attempt->elements_requested = queue_size; - } else { - if (allow_small_batch) { - // There may be some other attempts containing - // values. If so, we'll yield and wait for them - // to add elements to the queue. 
- if (!enqueue_attempts_.empty()) return kProgress; - } - if (attempt->context->status().ok()) { - attempt->context->SetStatus(errors::OutOfRange( - "RandomShuffleQueue '", name_, "' is closed and has ", - "insufficient elements (requested ", - attempt->elements_requested, ", current size ", - queue_size, ")")); - } - return kComplete; - } - } - - RunResult result = kNoProgress; - if (!closed_) queue_size -= min_after_dequeue_; - for (; queue_size > 0; --queue_size) { - if (attempt->tuple.empty()) { - // Only allocate tuple when we have something to dequeue - // so we don't use excessive memory when there are many - // blocked dequeue attempts waiting. - attempt->tuple.reserve(num_components()); - for (int i = 0; i < num_components(); ++i) { - const TensorShape shape = - ManyOutShape(i, attempt->elements_requested); - Tensor element; + [callback, allow_small_batch, + this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + int32 queue_size = queues_[0].size(); + if (closed_ && queue_size < attempt->elements_requested) { + // If we don't have enough for a full dequeue, we have + // to reset the attempt tuple. + if (!attempt->tuple.empty()) { + // Restore already-dequeued elements to the queue. + for (int64 i = attempt->tuple[0].dim_size(0) - + attempt->elements_requested - 1; + i >= 0; --i) { + for (int j = 0; j < num_components(); ++j) { + PersistentTensor element; + Status s = GetElementComponentFromBatch( + attempt->tuple, i, j, attempt->context, &element); + if (!s.ok()) { attempt->context->SetStatus( - attempt->context->allocate_temp(component_dtypes_[i], - shape, &element)); - if (!attempt->context->status().ok()) return kComplete; - attempt->tuple.emplace_back(element); + errors::DataLoss("Failed to restore element from " + "partially-dequeued batch " + "to RandomShuffleQueue: ", + s.error_message())); } - } - result = kProgress; - Tuple tuple; - DequeueLocked(attempt->context, &tuple); - const int index = attempt->tuple[0].dim_size(0) - - attempt->elements_requested; - for (int i = 0; i < num_components(); ++i) { - attempt->context->SetStatus(batch_util::CopyElementToSlice( - std::move(tuple[i]), &attempt->tuple[i], index)); - if (!attempt->context->status().ok()) return kComplete; - } - tuple.clear(); - --attempt->elements_requested; - if (attempt->elements_requested == 0) { - tuple = attempt->tuple; - attempt->done_callback = [callback, tuple]() { - callback(tuple); - }; - return kComplete; + queues_[j].push_back(element); } } - return result; - }); + } + if (allow_small_batch && !queues_[0].empty()) { + // Request all remaining elements in the queue. + queue_size = queues_[0].size(); + attempt->tuple.clear(); + attempt->elements_requested = queue_size; + } else { + if (allow_small_batch) { + // There may be some other attempts containing + // values. If so, we'll yield and wait for them + // to add elements to the queue. + if (!enqueue_attempts_.empty()) return kProgress; + } + if (attempt->context->status().ok()) { + attempt->context->SetStatus(errors::OutOfRange( + "RandomShuffleQueue '", name_, "' is closed and has ", + "insufficient elements (requested ", + attempt->elements_requested, ", current size ", + queue_size, ")")); + } + return kComplete; + } + } + + RunResult result = kNoProgress; + if (!closed_) queue_size -= min_after_dequeue_; + for (; queue_size > 0; --queue_size) { + if (attempt->tuple.empty()) { + // Only allocate tuple when we have something to dequeue + // so we don't use excessive memory when there are many + // blocked dequeue attempts waiting. 
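The re-indented block below keeps the original behaviour: the attempt's output tuple is only allocated once at least one element can actually be dequeued, so blocked dequeue attempts do not hold large buffers. A minimal sketch of that lazy-allocation idea, with hypothetical stand-in types rather than the queue's PersistentTensor machinery:

#include <vector>

int main() {
  std::vector<float> tuple;                // stands in for attempt->tuple
  const int available = 3, requested = 8;  // fewer elements ready than requested
  for (int i = 0; i < available; ++i) {
    if (tuple.empty()) tuple.reserve(requested);  // allocate only once progress is made
    tuple.push_back(static_cast<float>(i));
  }
  return 0;
}
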
+ attempt->tuple.reserve(num_components()); + for (int i = 0; i < num_components(); ++i) { + const TensorShape shape = + ManyOutShape(i, attempt->elements_requested); + Tensor element; + attempt->context->SetStatus(attempt->context->allocate_temp( + component_dtypes_[i], shape, &element)); + if (!attempt->context->status().ok()) return kComplete; + attempt->tuple.emplace_back(element); + } + } + result = kProgress; + Tuple tuple; + DequeueLocked(attempt->context, &tuple); + const int index = + attempt->tuple[0].dim_size(0) - attempt->elements_requested; + for (int i = 0; i < num_components(); ++i) { + attempt->context->SetStatus(batch_util::CopyElementToSlice( + std::move(tuple[i]), &attempt->tuple[i], index)); + if (!attempt->context->status().ok()) return kComplete; + } + tuple.clear(); + --attempt->elements_requested; + if (attempt->elements_requested == 0) { + tuple = attempt->tuple; + attempt->done_callback = [callback, tuple]() { + callback(tuple); + }; + return kComplete; + } + } + return result; + }); } } if (!already_cancelled) { diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h index 36ca7f834f7..15ae4c1fc53 100644 --- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h +++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h @@ -312,8 +312,7 @@ __global__ void ColumnReduceKernel( int col = blockIdx.x * 32 + threadIdx.x; value_type sum = initVal; - if (row < num_rows && col < num_cols) - sum = in[row * num_cols + col]; + if (row < num_rows && col < num_cols) sum = in[row * num_cols + col]; // 1D array necessary due to bug in CUDA 9 compiler. // TODO(nluehr) revert to 2D array when compiler is ready. @@ -366,8 +365,7 @@ __global__ void CleanupSegments( const int tid = threadIdx.x + blockIdx.x * blockDim.x; value_type val = initVal; - if (tid < segment_size * num_cols) - val = partial_sums[tid]; + if (tid < segment_size * num_cols) val = partial_sums[tid]; typedef cub::WarpReduce WarpReduce; diff --git a/tensorflow/core/kernels/relu_op.cc b/tensorflow/core/kernels/relu_op.cc index afad288cc00..d52358737fd 100644 --- a/tensorflow/core/kernels/relu_op.cc +++ b/tensorflow/core/kernels/relu_op.cc @@ -31,7 +31,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #define REGISTER_RELU_KERNELS(type) \ REGISTER_KERNEL_BUILDER( \ @@ -113,8 +113,7 @@ namespace functor { \ template <> \ void Selu::operator()( \ - const GPUDevice& d, \ - typename TTypes::ConstTensor features, \ + const GPUDevice& d, typename TTypes::ConstTensor features, \ typename TTypes::Tensor activations); \ extern template struct Selu; \ \ @@ -125,8 +124,6 @@ namespace functor { typename TTypes::Tensor backprops); \ extern template struct SeluGrad; - - TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); } // namespace functor @@ -157,8 +154,6 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); Name("SeluGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ SeluGradOp) - - TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); #undef REGISTER_GPU_KERNELS @@ -192,10 +187,8 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); Name("SeluGrad").Device(DEVICE_SYCL).TypeConstraint("T"), \ SeluGradOp) - - TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS); #undef REGISTER_SYCL_KERNELS -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/relu_op_functor.h 
b/tensorflow/core/kernels/relu_op_functor.h index 24b789c5437..3bc5ba8a50d 100644 --- a/tensorflow/core/kernels/relu_op_functor.h +++ b/tensorflow/core/kernels/relu_op_functor.h @@ -85,10 +85,9 @@ struct Relu6Grad { // make sure not to propagate the associated gradient // value. This allows "features" to be either the input or the output of // the relu6. - backprops.device(d) = - gradients * - ((features > static_cast(0)) * (features < static_cast(6))) - .template cast(); + backprops.device(d) = gradients * ((features > static_cast(0)) * + (features < static_cast(6))) + .template cast(); } }; @@ -161,8 +160,8 @@ struct SeluGrad { const auto scale = static_cast(1.0507009873554804934193349852946); const auto scale_alpha = static_cast(1.7580993408473768599402175208123); backprops.device(d) = - (activations < static_cast(0)).select( - gradients * (activations + scale_alpha), gradients * scale); + (activations < static_cast(0)) + .select(gradients * (activations + scale_alpha), gradients * scale); } }; diff --git a/tensorflow/core/kernels/resize_bicubic_op.cc b/tensorflow/core/kernels/resize_bicubic_op.cc index 1a9cf4c6406..86e61bbcefc 100644 --- a/tensorflow/core/kernels/resize_bicubic_op.cc +++ b/tensorflow/core/kernels/resize_bicubic_op.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" @@ -28,7 +29,6 @@ limitations under the License. #include "tensorflow/core/kernels/image_resizer_state.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { namespace { diff --git a/tensorflow/core/kernels/resize_bicubic_op_test.cc b/tensorflow/core/kernels/resize_bicubic_op_test.cc index 9e10fec4232..25a37d5e1af 100644 --- a/tensorflow/core/kernels/resize_bicubic_op_test.cc +++ b/tensorflow/core/kernels/resize_bicubic_op_test.cc @@ -286,13 +286,14 @@ BM_ResizeBicubicDev(32, 128, 3); BM_ResizeBicubicDev(32, 512, 3); BM_ResizeBicubicDev(32, 1024, 3); -#define BM_ResizeBicubicExpand(BATCH, SIZE, CHANNELS) \ - static void BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS(int iters) { \ - testing::ItemsProcessed(static_cast(iters) * BATCH * SIZE * SIZE * \ - CHANNELS * 8 * 8); \ - test::Benchmark("cpu", ResizeBicubic(BATCH, SIZE, CHANNELS, 8, 8)) \ - .Run(iters); \ - } \ +#define BM_ResizeBicubicExpand(BATCH, SIZE, CHANNELS) \ + static void BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS( \ + int iters) { \ + testing::ItemsProcessed(static_cast(iters) * BATCH * SIZE * SIZE * \ + CHANNELS * 8 * 8); \ + test::Benchmark("cpu", ResizeBicubic(BATCH, SIZE, CHANNELS, 8, 8)) \ + .Run(iters); \ + } \ BENCHMARK(BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS); BM_ResizeBicubicExpand(12, 48, 1); diff --git a/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc b/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc index a7da7a0777d..f82c3fcd9ff 100644 --- a/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc +++ b/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc @@ -164,11 +164,11 @@ struct ResizeBilinear { if (total_count == 0) return; CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); - ResizeBilinearKernel< - T><<>>( - config.virtual_thread_count, images.data(), height_scale, width_scale, - batch, in_height, in_width, channels, 
out_height, out_width, - output.data()); + ResizeBilinearKernel + <<>>( + config.virtual_thread_count, images.data(), height_scale, + width_scale, batch, in_height, in_width, channels, out_height, + out_width, output.data()); } }; @@ -200,11 +200,11 @@ struct ResizeBilinearGrad { // Accumulate. total_count = batch * resized_height * resized_width * channels; config = GetCudaLaunchConfig(total_count, d); - ResizeBilinearGradKernel< - T><<>>( - config.virtual_thread_count, input_grad.data(), height_scale, - width_scale, batch, original_height, original_width, channels, - resized_height, resized_width, output_grad.data()); + ResizeBilinearGradKernel + <<>>( + config.virtual_thread_count, input_grad.data(), height_scale, + width_scale, batch, original_height, original_width, channels, + resized_height, resized_width, output_grad.data()); } }; diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc index 9cc8e03e3ac..5b4aad3cdd8 100644 --- a/tensorflow/core/kernels/resource_variable_ops.cc +++ b/tensorflow/core/kernels/resource_variable_ops.cc @@ -387,7 +387,6 @@ class AssignVariableOp : public OpKernel { TF_CALL_ALL_TYPES(REGISTER_KERNELS); TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS); -TF_CALL_variant(REGISTER_KERNELS); #undef REGISTER_KERNELS #if GOOGLE_CUDA @@ -635,6 +634,9 @@ class ResourceScatterUpdateOp : public OpKernel { TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ARITHEMTIC_CPU); +REGISTER_SCATTER_KERNEL(string, CPU, "ResourceScatterUpdate", + scatter_op::UpdateOp::ASSIGN); + // Registers GPU kernels. #if GOOGLE_CUDA #define REGISTER_SCATTER_ARITHEMTIC_GPU(type) \ diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc index 8f82784d936..bb96c42f10c 100644 --- a/tensorflow/core/kernels/reverse_op.cc +++ b/tensorflow/core/kernels/reverse_op.cc @@ -269,10 +269,10 @@ class ReverseV2Op : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, input.shape(), &output)); -// TODO(cwhipkey): we can do dimension folding to reduce, e.g., a reverse of -// a single dimension to the dims=3 or dims=2 case, regardless of the number -// of dimensions in the tensor. This would let some ops use faster -// lower-dimension code (and use optimized versions). + // TODO(cwhipkey): we can do dimension folding to reduce, e.g., a reverse + // of a single dimension to the dims=3 or dims=2 case, regardless of the + // number of dimensions in the tensor. This would let some ops use faster + // lower-dimension code (and use optimized versions). 
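The TODO above refers to dimension folding: reversing one axis of an N-D tensor touches the same elements as reshaping it to (outer, axis, inner) and reversing the middle axis of that 3-D view. A minimal index-level sketch of the folding, with hypothetical shapes:

#include <cassert>
#include <cstdint>

int main() {
  const int64_t dims[4] = {2, 3, 4, 5};  // reverse axis 2 of a (2, 3, 4, 5) tensor
  const int axis = 2;
  int64_t outer = 1, inner = 1;
  for (int i = 0; i < axis; ++i) outer *= dims[i];      // product of leading dims: 6
  for (int i = axis + 1; i < 4; ++i) inner *= dims[i];  // product of trailing dims: 5
  // The folded view has shape (outer, dims[axis], inner) = (6, 4, 5); reversing its
  // middle axis is the dims=3 case the TODO mentions.
  assert(outer == 6 && inner == 5);
  return 0;
}
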
#define HANDLE_REVERSE(NDIMS) \ case NDIMS: \ diff --git a/tensorflow/core/kernels/reverse_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_op_gpu.cu.cc index b05a7c55504..3ee49db669f 100644 --- a/tensorflow/core/kernels/reverse_op_gpu.cu.cc +++ b/tensorflow/core/kernels/reverse_op_gpu.cu.cc @@ -28,14 +28,14 @@ typedef Eigen::GpuDevice GPUDevice; #define DEFINE_REVERSE(T, DIM) \ template struct functor::Reverse; #define DEFINE_REVERSE_ALL_DIMS(T) \ - DEFINE_REVERSE(T, 0) \ - DEFINE_REVERSE(T, 1) \ - DEFINE_REVERSE(T, 2) \ - DEFINE_REVERSE(T, 3) \ - DEFINE_REVERSE(T, 4) \ - DEFINE_REVERSE(T, 5) \ - DEFINE_REVERSE(T, 6) \ - DEFINE_REVERSE(T, 7) \ + DEFINE_REVERSE(T, 0) \ + DEFINE_REVERSE(T, 1) \ + DEFINE_REVERSE(T, 2) \ + DEFINE_REVERSE(T, 3) \ + DEFINE_REVERSE(T, 4) \ + DEFINE_REVERSE(T, 5) \ + DEFINE_REVERSE(T, 6) \ + DEFINE_REVERSE(T, 7) \ DEFINE_REVERSE(T, 8) TF_CALL_uint8(DEFINE_REVERSE_ALL_DIMS); diff --git a/tensorflow/core/kernels/reverse_sequence_op.cc b/tensorflow/core/kernels/reverse_sequence_op.cc index d1980d4b652..15a707a9c66 100644 --- a/tensorflow/core/kernels/reverse_sequence_op.cc +++ b/tensorflow/core/kernels/reverse_sequence_op.cc @@ -51,8 +51,7 @@ void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) { // Copy seq_len info down for validity checks context->eigen_device().memcpyDeviceToHost( - seq_lens_vec.data(), seq_lens_t.data(), - sizeof(Tlen) * seq_lens_t.size()); + seq_lens_vec.data(), seq_lens_t.data(), sizeof(Tlen) * seq_lens_t.size()); OP_REQUIRES(context, batch_dim != seq_dim, errors::InvalidArgument("batch_dim == seq_dim == ", seq_dim)); @@ -76,8 +75,7 @@ void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) { } } -void CheckErrorsGPU(OpKernelContext* context, int batch_dim, - int seq_dim) { +void CheckErrorsGPU(OpKernelContext* context, int batch_dim, int seq_dim) { const Tensor& input = context->input(0); const Tensor& seq_lens = context->input(1); @@ -98,13 +96,13 @@ void CheckErrorsGPU(OpKernelContext* context, int batch_dim, template <> void CheckErrors(OpKernelContext* context, int batch_dim, - int seq_dim) { + int seq_dim) { CheckErrorsGPU(context, batch_dim, seq_dim); } template <> void CheckErrors(OpKernelContext* context, int batch_dim, - int seq_dim) { + int seq_dim) { CheckErrorsGPU(context, batch_dim, seq_dim); } @@ -164,14 +162,15 @@ class ReverseSequenceOp : public OpKernel { TF_DISALLOW_COPY_AND_ASSIGN(ReverseSequenceOp); }; -#define REGISTER_REVERSE_SEQUENCE(type, len_type) \ - REGISTER_KERNEL_BUILDER( \ - Name("ReverseSequence").Device(DEVICE_CPU).TypeConstraint("T"). \ - TypeConstraint("Tlen"), \ - ReverseSequenceOp); +#define REGISTER_REVERSE_SEQUENCE(type, len_type) \ + REGISTER_KERNEL_BUILDER(Name("ReverseSequence") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tlen"), \ + ReverseSequenceOp); -#define REGISTER_REVERSE_SEQUENCE_LEN(type) \ - REGISTER_REVERSE_SEQUENCE(type, int32); \ +#define REGISTER_REVERSE_SEQUENCE_LEN(type) \ + REGISTER_REVERSE_SEQUENCE(type, int32); \ REGISTER_REVERSE_SEQUENCE(type, int64); TF_CALL_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE_LEN); @@ -181,23 +180,23 @@ TF_CALL_bool(REGISTER_REVERSE_SEQUENCE_LEN); // Forward declarations of the functor specializations for GPU. 
namespace functor { -#define DECLARE_GPU_SPEC(T, Tlen, Dims) \ - template <> \ - void ReverseSequence::Compute( \ - const GPUDevice& d, typename TTypes::ConstTensor input, \ - int32 batch_dim, int32 seq_dim, \ - typename TTypes::ConstVec seq_lens, \ - typename TTypes::Tensor output); \ +#define DECLARE_GPU_SPEC(T, Tlen, Dims) \ + template <> \ + void ReverseSequence::Compute( \ + const GPUDevice& d, typename TTypes::ConstTensor input, \ + int32 batch_dim, int32 seq_dim, \ + typename TTypes::ConstVec seq_lens, \ + typename TTypes::Tensor output); \ extern template struct ReverseSequence; -#define DECLARE_GPU_SPEC_LEN(T, Dims) \ - DECLARE_GPU_SPEC(T, int32, Dims); \ +#define DECLARE_GPU_SPEC_LEN(T, Dims) \ + DECLARE_GPU_SPEC(T, int32, Dims); \ DECLARE_GPU_SPEC(T, int64, Dims); -#define DECLARE_GPU_SPECS(T) \ - DECLARE_GPU_SPEC_LEN(T, 2); \ - DECLARE_GPU_SPEC_LEN(T, 3); \ - DECLARE_GPU_SPEC_LEN(T, 4); \ +#define DECLARE_GPU_SPECS(T) \ + DECLARE_GPU_SPEC_LEN(T, 2); \ + DECLARE_GPU_SPEC_LEN(T, 3); \ + DECLARE_GPU_SPEC_LEN(T, 4); \ DECLARE_GPU_SPEC_LEN(T, 5); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); @@ -206,14 +205,15 @@ TF_CALL_bool(DECLARE_GPU_SPECS); } // namespace functor // Registration of the GPU implementations. -#define REGISTER_REVERSE_SEQUENCE_GPU(type, len_type) \ - REGISTER_KERNEL_BUILDER( \ - Name("ReverseSequence").Device(DEVICE_GPU).TypeConstraint("T"). \ - TypeConstraint("Tlen"), \ - ReverseSequenceOp); +#define REGISTER_REVERSE_SEQUENCE_GPU(type, len_type) \ + REGISTER_KERNEL_BUILDER(Name("ReverseSequence") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tlen"), \ + ReverseSequenceOp); -#define REGISTER_REVERSE_SEQUENCE_GPU_LEN(type) \ - REGISTER_REVERSE_SEQUENCE_GPU(type, int32); \ +#define REGISTER_REVERSE_SEQUENCE_GPU_LEN(type) \ + REGISTER_REVERSE_SEQUENCE_GPU(type, int32); \ REGISTER_REVERSE_SEQUENCE_GPU(type, int64); TF_CALL_GPU_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE_GPU_LEN); diff --git a/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc index cb49f14525a..4a2136a2cd3 100644 --- a/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc +++ b/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc @@ -28,14 +28,14 @@ typedef Eigen::GpuDevice GPUDevice; template class generator::ReverseGenerator; \ template struct functor::ReverseSequence; -#define DEFINE_GPU_SPEC_LEN(T, dims) \ - DEFINE_GPU_SPEC(T, int32, dims); \ +#define DEFINE_GPU_SPEC_LEN(T, dims) \ + DEFINE_GPU_SPEC(T, int32, dims); \ DEFINE_GPU_SPEC(T, int64, dims); -#define DEFINE_GPU_SPECS(T) \ - DEFINE_GPU_SPEC_LEN(T, 2); \ - DEFINE_GPU_SPEC_LEN(T, 3); \ - DEFINE_GPU_SPEC_LEN(T, 4); \ +#define DEFINE_GPU_SPECS(T) \ + DEFINE_GPU_SPEC_LEN(T, 2); \ + DEFINE_GPU_SPEC_LEN(T, 3); \ + DEFINE_GPU_SPEC_LEN(T, 4); \ DEFINE_GPU_SPEC_LEN(T, 5); TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); diff --git a/tensorflow/core/kernels/roll_op.cc b/tensorflow/core/kernels/roll_op.cc new file mode 100644 index 00000000000..bcbdbee058b --- /dev/null +++ b/tensorflow/core/kernels/roll_op.cc @@ -0,0 +1,334 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/register_types_traits.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +#define EIGEN_USE_THREADS +using CPUDevice = Eigen::ThreadPoolDevice; + +// dim_size - the size of each dimension +// dim_range - the number of indices over in the flattened tensor +// you need to skip in order to make it over from one side of a dimension +// to the other. Used to make the shifts wrap around after a threshold. +// threshold - the index for each dimension that the roll starts to wrap +// back to the front +template +void DoRoll(OpKernelContext* context, const int64 num_elements, + const int num_dims, const gtl::ArraySlice& dim_size, + const T* input, T* output, const gtl::ArraySlice& threshold, + const gtl::ArraySlice& dim_range) { + auto work = [input, output, num_dims, &dim_size, &threshold, &dim_range]( + int64 start, int64 end) { + // array of indices for each dimension + gtl::InlinedVector indices(num_dims); + int offset = 0; // the shift along the flattened tensor for current element + // initialize indices and offset + for (int i = 0; i < num_dims; i++) { + // stride is the number of indices over in the flattened tensor + // you need to skip in order to make it over to an adjacent element + // along a dimension. dim_size[i] != 0 because we set it to max(dim, 1) + const int64 stride = dim_range[i] / dim_size[i]; + const int shift = dim_size[i] - threshold[i]; + const int indx = (start / stride) % dim_size[i]; + indices[i] = indx; + // calculate dimension index after the shift + const int shifted_indx = (indx + shift) % dim_size[i]; + offset += (shifted_indx - indx) * stride; + } + + for (int64 i = start; i < end; i++) { + output[i + offset] = input[i]; + // create next combination of indices + // while at it adjust offset if needed + for (int j = num_dims - 1; j >= 0; j--) { + const int indx = (indices[j] + 1) % dim_size[j]; + indices[j] = indx; + if (indx != 0) { + if (indx == threshold[j]) { // we've reached the threshold + // dim_range[j] = threshold[j] + shift[j] + // offset = shift[j] + ... other offsets + // offset - dim_range[j] = -threshold[j] + ... 
other offsets + // thus we undo our previous offset as well as add a new offset of + // -threshold[j] in one operation + offset -= dim_range[j]; // now wraps around + } + break; // indx != 0 don't need to carry + } else if (threshold[j] != 0) { // if threshold is 0 shift is 0 + offset += dim_range[j]; // indx became 0 so reverse wrap around + } + } + } + }; + // Shard + auto worker_threads = context->device()->tensorflow_cpu_worker_threads(); + // 15 - expiramentally determined with float and bool types + const int cost_per_element = 15 * sizeof(T); // rough esitmate + Shard(worker_threads->num_threads, worker_threads->workers, num_elements, + cost_per_element, std::move(work)); +} + +// dim_size - the size of each dimension +// dim_range - the number of indices over in the flattened tensor +// you need to skip in order to make it over from one side of a dimension +// to the other. Used to make the shifts wrap around after a threshold. +// threshold - the index for each dimension that the roll starts to wrap +// back to the front +// isd - inner shift dimension +template +// Use memcpy to copy memory in groups when the data type supports memcpy +void DoRollWithMemcpy(OpKernelContext* context, const int64 num_elements, + const int num_dims, const gtl::ArraySlice& dim_size, + const T* input, T* output, + const gtl::ArraySlice& threshold, + const gtl::ArraySlice& dim_range, + const int64 isd) { + auto work = [input, output, num_dims, &dim_size, &threshold, &dim_range, isd]( + int64 start, int64 end) { + // the number of indices over in the flattened tensor you need to skip in + // order to make it over from one side of the isd to the other + const int64 isd_range = std::max(dim_range[isd], 1); + // the distance along the flattend tensor to the next element in the isd + const int64 isd_stride = isd_range / std::max(dim_size[isd], 1); + + // start and end represent the i-th group currently so we will convert + // them into numbers representing the i-th elements. + // there are 2 groups per isd one for all elements before threshold[isd] + // and another for all elements after threshold[isd]. + const int64 start_remainder = (start % 2) * threshold[isd] * isd_stride; + const int64 end_remainder = (end % 2) * threshold[isd] * isd_stride; + start = (start / 2) * isd_range + start_remainder; + end = (end / 2) * isd_range + end_remainder; + + const T* in_ptr = &input[0]; + T* out_ptr = &output[0]; + in_ptr += start; + out_ptr += start; + + // array of indices for each dimension + // indicies = [i, j, k, l, m, n] + gtl::InlinedVector indicies(num_dims); + // the offset needed to make all inner non-shifting dimensions become 0 + int64 remainder_offset = 0; + // initialize indicies + for (int i = 0; i < num_dims; i++) { + // stride is the number of indices over in the flattened tensor + // you need to skip in order to make it over to an adjacent element + // along a dimension. 
dim_size[i] != 0 because we set it to max(dim, 1) + const int64 stride = dim_range[i] / dim_size[i]; + const int shift = dim_size[i] - threshold[i]; + const int indx = (start / stride) % dim_size[i]; + indicies[i] = indx; + // calculate dimension index after the shift + int out_indx = (indx + shift) % dim_size[i]; + if (i > isd) { + // trailing zeroes for indices after the inner shifted dimension + out_indx = 0; + remainder_offset += (out_indx - indx) * stride; + } + out_ptr += (out_indx - indx) * stride; + } + // set trailing zeroes for indices after the inner shifted dimension + for (int i = num_dims - 1; i > isd; i--) indicies[i] = 0; + + // the number of indices in the isd dimension the next group will skip + // to make it to the next threshold or end point + int isd_indx_skip = 0; + // the size of the next group + int64 group_size = 0; + // initialize isd_indx_skip and group_size + if (indicies[isd] < threshold[isd]) { + isd_indx_skip = threshold[isd] - indicies[isd]; + group_size = isd_indx_skip * isd_stride + remainder_offset; + } else { + isd_indx_skip = dim_size[isd] - indicies[isd]; + group_size = isd_indx_skip * isd_stride + remainder_offset; + } + + int64 i = start; + while (i < end) { + // copy group of elements + memcpy(out_ptr, in_ptr, group_size * sizeof(T)); + + // shift i and the pointers over to the next group position + i += group_size; + out_ptr += group_size; + in_ptr += group_size; + + // produce next combination of indices and adjust the out_ptr position + // to fix the offset if necessary + // the isd (inner shift dim) should skip to next threshold or endpoint + // all dimensions to the left increment by 1 when a digit is carried + // all dimensions to the right remain set to 0 + // +1 +1 +1 +isd_indx_skip + // indicies = [i, j, k, l, 0, 0] + // ^isd + for (int j = isd; j >= 0; j--) { + int inc = 1; + if (j == isd) inc = isd_indx_skip; + const int indx = (indicies[j] + inc) % dim_size[j]; + indicies[j] = indx; + if (indx != 0) { + if (indx == threshold[j]) { + out_ptr -= dim_range[j]; // now wraps around + } + break; // indx != 0 don't need to carry + } else if (threshold[j] != 0) { // if threshold is 0 shift is 0 + out_ptr += dim_range[j]; // indx became 0 so reverse wrap around + } + } + + // set isd_indx_skip and group_size for next iteration + if (indicies[isd] < threshold[isd]) { + isd_indx_skip = threshold[isd] - indicies[isd]; + group_size = isd_indx_skip * isd_stride; + } else { + isd_indx_skip = dim_size[isd] - indicies[isd]; + group_size = isd_indx_skip * isd_stride; + } + } + }; + // Shard + auto worker_threads = context->device()->tensorflow_cpu_worker_threads(); + const int64 ave_group_size = dim_range[isd] / 2; + const int total_work = 2 * num_elements / std::max(dim_range[isd], 1); + // 25000 - expiramentally determined with float and bool types + const int cost_per_group = 25000 * sizeof(T) * ave_group_size; + Shard(worker_threads->num_threads, worker_threads->workers, total_work, + cost_per_group, std::move(work)); +} + +template +class RollOp : public OpKernel { + public: + explicit RollOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& input = context->input(0); + const Tensor& shift = context->input(1); + const Tensor& axis = context->input(2); + + auto shift_flat = shift.flat(); + auto axis_flat = axis.flat(); + + OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(input.shape()), + errors::InvalidArgument("input must be 1-D or higher")); 
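+    // Element i of axis names the dimension rotated by element i of shift.
+    // A positive shift moves entries toward higher indices with wrap-around,
+    // e.g. roll([0, 1, 2, 3, 4], shift=2, axis=0) -> [3, 4, 0, 1, 2];
+    // a negative shift rolls toward the front instead.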
+ OP_REQUIRES(context, shift.shape().dims() <= 1, + errors::InvalidArgument( + "shift must be a scalar or a 1-D vector. Found: ", + shift.shape().DebugString())); + OP_REQUIRES(context, axis.shape().dims() <= 1, + errors::InvalidArgument( + "axis must be a scalar or a 1-D vector. Found: ", + axis.shape().DebugString())); + OP_REQUIRES( + context, shift.shape() == axis.shape(), + errors::InvalidArgument("shift and axis must have the same size")); + const int64 num_elements = input.NumElements(); + const int num_shifts = static_cast(shift_flat.size()); + const int num_dims = input.dims(); + + // if there are any duplicate axes, shift_mod_sum will have the + // total modulo sum of shifts for each dimension + gtl::InlinedVector shift_mod_sum(num_dims, 0); + for (int i = 0; i < num_shifts; i++) { + const int axis = axis_flat(i); + OP_REQUIRES(context, axis < num_dims, + errors::InvalidArgument("axis ", axis, " is out of range")); + const int ds = std::max(static_cast(input.dim_size(axis)), 1); + const int sum = shift_mod_sum[axis] + static_cast(shift_flat(i)); + // modulo that works with negatives: ((x % y) + y) % y + shift_mod_sum[axis] = (sum % ds + ds) % ds; + } + // the size of each dimension + gtl::InlinedVector dim_size(num_dims); + // threshold[i] is the index that the roll starts to wrap back to the front + gtl::InlinedVector threshold(num_dims); + // dim_range is the number of indices over in the flattened tensor + // you need to skip in order to make it over from one side of a dimension + // to the other. Used to make the shifts wrap around after a threshold. + gtl::InlinedVector dim_range(num_dims); + int64 dim_size_prod = 1; // dimension size product + // inner shift dimension (inner most shifted dimension) + int64 isd = 0; + for (int i = num_dims - 1; i >= 0; i--) { + if (isd == 0 && shift_mod_sum[i] != 0) isd = i; + const int ds = std::max(static_cast(input.dim_size(i)), 1); + dim_size[i] = ds; + threshold[i] = (ds - shift_mod_sum[i]) % ds; + dim_size_prod *= static_cast(input.dim_size(i)); + dim_range[i] = dim_size_prod; + } + + Tensor* output = NULL; + OP_REQUIRES_OK(context, + context->allocate_output(0, input.shape(), &output)); + auto input_flat = input.flat().data(); + auto output_flat = output->flat().data(); + + if (std::is_same::value) { + if (DataTypeCanUseMemcpy(DataTypeToEnum::v())) { + // V2 copies memory in groups instead of element by element + DoRollWithMemcpy(context, num_elements, num_dims, dim_size, + input_flat, output_flat, threshold, dim_range, isd); + } else { + // incase memcpy does not work for current data type + DoRoll(context, num_elements, num_dims, dim_size, input_flat, + output_flat, threshold, dim_range); + } + } + } +}; + +// Register the CPU kernels. 
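+// Each REGISTER_KERNEL_BUILDER below pins one combination of the Tshift and
+// Taxis index types, and TF_CALL_ALL_TYPES then stamps out the registration
+// for every supported element type T.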
+#define REGISTER_CPU(type) \ + REGISTER_KERNEL_BUILDER(Name("Roll") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tshift") \ + .TypeConstraint("Taxis"), \ + RollOp) \ + REGISTER_KERNEL_BUILDER(Name("Roll") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tshift") \ + .TypeConstraint("Taxis"), \ + RollOp) \ + REGISTER_KERNEL_BUILDER(Name("Roll") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tshift") \ + .TypeConstraint("Taxis"), \ + RollOp) \ + REGISTER_KERNEL_BUILDER(Name("Roll") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tshift") \ + .TypeConstraint("Taxis"), \ + RollOp) + +TF_CALL_ALL_TYPES(REGISTER_CPU); +#undef REGISTER_CPU +} // namespace tensorflow diff --git a/tensorflow/core/kernels/roll_op_test.cc b/tensorflow/core/kernels/roll_op_test.cc new file mode 100644 index 00000000000..90b6f8d0f30 --- /dev/null +++ b/tensorflow/core/kernels/roll_op_test.cc @@ -0,0 +1,484 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" + +namespace tensorflow { +namespace { + +class RollOpTest : public OpsTestBase { + protected: + void MakeOp(DataType data_type, DataType index_type) { + TF_ASSERT_OK(NodeDefBuilder("myop", "Roll") + .Input(FakeInput(data_type)) + .Input(FakeInput(index_type)) + .Input(FakeInput(index_type)) + .Finalize(node_def())); + TF_ASSERT_OK(InitOp()); + } +}; + +TEST_F(RollOpTest, ScalarIndices) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({5}), {0, 1, 2, 3, 4}); + AddInputFromArray(TensorShape({}), {3}); + AddInputFromArray(TensorShape({}), {0}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. 
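+  // Rolling [0, 1, 2, 3, 4] by 3 along axis 0 moves every element three
+  // places to the right with wrap-around, giving [2, 3, 4, 0, 1].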
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({5})); + test::FillValues(&expected, {2, 3, 4, 0, 1}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, ScalarIndices_NoMemcpy) { + MakeOp(DT_STRING, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({5}), {"a", "b", "c", "d", "e"}); + AddInputFromArray(TensorShape({}), {3}); + AddInputFromArray(TensorShape({}), {0}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_STRING, TensorShape({5})); + test::FillValues(&expected, {"c", "d", "e", "a", "b"}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, ScalarIndices_Complex) { + MakeOp(DT_COMPLEX64, DT_INT32); + + // Feed and run + AddInputFromArray>( + TensorShape({5}), {std::complex(0, 10), std::complex(1, 11), + std::complex(2, 12), std::complex(3, 13), + std::complex(4, 14)}); + AddInputFromArray(TensorShape({}), {3}); + AddInputFromArray(TensorShape({}), {0}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_COMPLEX64, TensorShape({5})); + test::FillValues>( + &expected, {std::complex(2, 12), std::complex(3, 13), + std::complex(4, 14), std::complex(0, 10), + std::complex(1, 11)}); + test::ExpectTensorEqual>(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_TwoD32) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({3, 5}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + AddInputFromArray(TensorShape({2}), {2, -1}); + AddInputFromArray(TensorShape({2}), {0, 1}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({3, 5})); + test::FillValues(&expected, + {6, 7, 8, 9, 5, 11, 12, 13, 14, 10, 1, 2, 3, 4, 0}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_TwoD32_NoMemcpy) { + MakeOp(DT_STRING, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({3, 5}), + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", + "k", "l", "m", "n", "o"}); + AddInputFromArray(TensorShape({2}), {2, -1}); + AddInputFromArray(TensorShape({2}), {0, 1}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_STRING, TensorShape({3, 5})); + test::FillValues(&expected, {"g", "h", "i", "j", "f", "l", "m", "n", + "o", "k", "b", "c", "d", "e", "a"}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_ThreeD32) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({2, 2, 3}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + AddInputFromArray(TensorShape({3}), {1, -1, -1}); + AddInputFromArray(TensorShape({3}), {0, 1, 2}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 3})); + test::FillValues(&expected, {10, 11, 9, 7, 8, 6, 4, 5, 3, 1, 2, 0}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_ThreeD32_NoMemcpy) { + MakeOp(DT_STRING, DT_INT32); + + // Feed and run + AddInputFromArray( + TensorShape({2, 2, 3}), + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"}); + AddInputFromArray(TensorShape({3}), {1, -1, -1}); + AddInputFromArray(TensorShape({3}), {0, 1, 2}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. 
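+  // Each axis is rolled independently: the shift of 1 swaps the two outer
+  // blocks, and the -1 shifts rotate the inner axes toward the front.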
+ Tensor expected(allocator(), DT_STRING, TensorShape({2, 2, 3})); + test::FillValues( + &expected, {"k", "l", "j", "h", "i", "g", "e", "f", "d", "b", "c", "a"}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_TwoD64) { + MakeOp(DT_FLOAT, DT_INT64); + + // Feed and run + AddInputFromArray(TensorShape({5, 3}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + AddInputFromArray(TensorShape({2}), {-1, 4}); + AddInputFromArray(TensorShape({2}), {0, 1}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 3})); + test::FillValues(&expected, + {5, 3, 4, 8, 6, 7, 11, 9, 10, 14, 12, 13, 2, 0, 1}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_TwoD64_NoMemcpy) { + MakeOp(DT_STRING, DT_INT64); + + // Feed and run + AddInputFromArray(TensorShape({5, 3}), + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", + "k", "l", "m", "n", "o"}); + AddInputFromArray(TensorShape({2}), {-1, 4}); + AddInputFromArray(TensorShape({2}), {0, 1}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_STRING, TensorShape({5, 3})); + test::FillValues(&expected, {"f", "d", "e", "i", "g", "h", "l", "j", + "k", "o", "m", "n", "c", "a", "b"}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_ThreeD64) { + MakeOp(DT_FLOAT, DT_INT64); + + // Feed and run + AddInputFromArray(TensorShape({4, 1, 3}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + AddInputFromArray(TensorShape({3}), {4, 3, 2}); + AddInputFromArray(TensorShape({3}), {0, 1, 2}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({4, 1, 3})); + test::FillValues(&expected, {1, 2, 0, 4, 5, 3, 7, 8, 6, 10, 11, 9}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_ThreeD64_NoMemcpy) { + MakeOp(DT_STRING, DT_INT64); + + // Feed and run + AddInputFromArray( + TensorShape({4, 1, 3}), + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"}); + AddInputFromArray(TensorShape({3}), {4, 3, 2}); + AddInputFromArray(TensorShape({3}), {0, 1, 2}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_STRING, TensorShape({4, 1, 3})); + test::FillValues( + &expected, {"b", "c", "a", "e", "f", "d", "h", "i", "g", "k", "l", "j"}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, ZeroShift_ThreeD32) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({2, 2, 3}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + AddInputFromArray(TensorShape({3}), {0, 0, 0}); + AddInputFromArray(TensorShape({3}), {0, 1, 2}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 3})); + test::FillValues(&expected, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, ZeroShift_ThreeD32_NoMemcpy) { + MakeOp(DT_STRING, DT_INT32); + + // Feed and run + AddInputFromArray( + TensorShape({2, 2, 3}), + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"}); + AddInputFromArray(TensorShape({3}), {0, 0, 0}); + AddInputFromArray(TensorShape({3}), {0, 1, 2}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. 
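+  // A shift of zero along every axis must leave the tensor unchanged.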
+ Tensor expected(allocator(), DT_STRING, TensorShape({2, 2, 3})); + test::FillValues( + &expected, {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, ZeroSize_ThreeD32) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({5, 0, 0}), {}); + AddInputFromArray(TensorShape({}), {1}); + AddInputFromArray(TensorShape({}), {0}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 0, 0})); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, ZeroSize_ThreeD32_NoMemcpy) { + MakeOp(DT_STRING, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({5, 0, 0}), {}); + AddInputFromArray(TensorShape({}), {1}); + AddInputFromArray(TensorShape({}), {0}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_STRING, TensorShape({5, 0, 0})); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, OneSize_ThreeD32) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({1, 1, 1}), {5}); + AddInputFromArray(TensorShape({}), {1}); + AddInputFromArray(TensorShape({}), {0}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1})); + test::FillValues(&expected, {5}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, OneSize_ThreeD32_NoMemcpy) { + MakeOp(DT_STRING, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({1, 1, 1}), {"a"}); + AddInputFromArray(TensorShape({}), {1}); + AddInputFromArray(TensorShape({}), {0}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_STRING, TensorShape({1, 1, 1})); + test::FillValues(&expected, {"a"}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, MultiShifts_TwoD32) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({3, 5}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + AddInputFromArray(TensorShape({4}), {-2, 2, -1, 1}); + AddInputFromArray(TensorShape({4}), {1, 0, 0, 1}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({3, 5})); + test::FillValues(&expected, + {11, 12, 13, 14, 10, 1, 2, 3, 4, 0, 6, 7, 8, 9, 5}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, MultiShifts_TwoD32_NoMemcpy) { + MakeOp(DT_STRING, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({3, 5}), + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", + "k", "l", "m", "n", "o"}); + AddInputFromArray(TensorShape({4}), {-2, 2, -1, 1}); + AddInputFromArray(TensorShape({4}), {1, 0, 0, 1}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. 
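+  // Shifts for a repeated axis accumulate: axis 0 receives 2 + (-1) = 1 and
+  // axis 1 receives -2 + 1 = -1 before the roll is applied.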
+ Tensor expected(allocator(), DT_STRING, TensorShape({3, 5})); + test::FillValues(&expected, {"l", "m", "n", "o", "k", "b", "c", "d", + "e", "a", "g", "h", "i", "j", "f"}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Error_InputMustBeVectorOrHigher) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({}), {7}); + AddInputFromArray(TensorShape({}), {1}); + AddInputFromArray(TensorShape({}), {0}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()).contains("input must be 1-D or higher")) + << s; +} + +TEST_F(RollOpTest, Error_AxisMustBeScalarOrVector) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({2, 2}), {1, 2, 3, 4}); + AddInputFromArray(TensorShape({}), {1}); + AddInputFromArray(TensorShape({1, 2}), {0, 1}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()) + .contains("axis must be a scalar or a 1-D vector")) + << s; +} + +TEST_F(RollOpTest, Error_ShiftMustBeScalarOrVector) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({2, 2}), {1, 2, 3, 4}); + AddInputFromArray(TensorShape({1, 2}), {0, 1}); + AddInputFromArray(TensorShape({}), {1}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()) + .contains("shift must be a scalar or a 1-D vector")) + << s; +} + +TEST_F(RollOpTest, Error_ShiftAndAxisMustBeSameSize) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({2, 2}), {1, 2, 3, 4}); + AddInputFromArray(TensorShape({1}), {1}); + AddInputFromArray(TensorShape({2}), {0, 1}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()) + .contains("shift and axis must have the same size")) + << s; +} + +TEST_F(RollOpTest, Error_AxisOutOfRange) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({4}), {1, 2, 3, 4}); + AddInputFromArray(TensorShape({}), {1}); + AddInputFromArray(TensorShape({}), {1}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()).contains("is out of range")) << s; +} + +// isd - (inner shift dimension) The inner most dimension to be shifted. +// All outer dimensions will also be shifted for testing. +static Graph* RollGraph(const TensorShape& shape, int isd) { + Graph* g = new Graph(OpRegistry::Global()); + Tensor input(DT_FLOAT, shape); + input.flat().setRandom(); + const int dims = static_cast(input.dims()); + Tensor shift(DT_INT32, TensorShape({dims})); + for (int i = 0; i < dims; i++) { + // shift the inner shift dimension and all outer dimensions + shift.flat()(i) = (i <= isd) ? 
2 : 0; + } + Tensor axis(DT_INT32, TensorShape({dims})); + for (int i = 0; i < dims; i++) { + axis.flat()(i) = i; + } + test::graph::Roll(g, test::graph::Constant(g, input), + test::graph::Constant(g, shift), + test::graph::Constant(g, axis)); + return g; +} + +#define BM_ROLL_OUTER(DEVICE) \ + static void BM_##DEVICE##_roll_outer(int iters, int rows, int columns) { \ + TensorShape shape{rows, columns}; \ + const int64 num_items = static_cast(iters) * shape.num_elements(); \ + testing::ItemsProcessed(num_items); \ + testing::BytesProcessed(num_items * sizeof(float)); \ + testing::UseRealTime(); \ + test::Benchmark(#DEVICE, RollGraph(shape, 0)).Run(iters); \ + } \ + BENCHMARK(BM_##DEVICE##_roll_outer) \ + ->ArgPair(256, 256) \ + ->ArgPair(512, 512) \ + ->ArgPair(1024, 1024) \ + ->ArgPair(2048, 2048) + +#define BM_ROLL_ALL(DEVICE) \ + static void BM_##DEVICE##_roll_all(int iters, int rows, int columns) { \ + TensorShape shape{rows, columns}; \ + const int64 num_items = static_cast(iters) * shape.num_elements(); \ + testing::ItemsProcessed(num_items); \ + testing::BytesProcessed(num_items * sizeof(float)); \ + testing::UseRealTime(); \ + test::Benchmark(#DEVICE, RollGraph(shape, 1)).Run(iters); \ + } \ + BENCHMARK(BM_##DEVICE##_roll_all) \ + ->ArgPair(256, 256) \ + ->ArgPair(512, 512) \ + ->ArgPair(1024, 1024) \ + ->ArgPair(2048, 2048) + +BM_ROLL_OUTER(cpu); +BM_ROLL_ALL(cpu); +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/save_restore_tensor.cc b/tensorflow/core/kernels/save_restore_tensor.cc index df60eda7597..990bd2bff94 100644 --- a/tensorflow/core/kernels/save_restore_tensor.cc +++ b/tensorflow/core/kernels/save_restore_tensor.cc @@ -106,11 +106,11 @@ void SaveTensors( OP_REQUIRES_OK(context, checkpoint::ParseShapeAndSlice( shape_spec, &shape, &slice, &slice_shape)); OP_REQUIRES(context, slice_shape.IsSameSize(input.shape()), - errors::InvalidArgument("Slice in shape_and_slice " - "specification does not match the " - "shape of the tensor to save: ", - shape_spec, ", tensor: ", - input.shape().DebugString())); + errors::InvalidArgument( + "Slice in shape_and_slice " + "specification does not match the " + "shape of the tensor to save: ", + shape_spec, ", tensor: ", input.shape().DebugString())); } #define WRITER_ADD(T) \ diff --git a/tensorflow/core/kernels/scatter_functor.h b/tensorflow/core/kernels/scatter_functor.h index c6e35fe329e..079f15e1013 100644 --- a/tensorflow/core/kernels/scatter_functor.h +++ b/tensorflow/core/kernels/scatter_functor.h @@ -29,7 +29,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL namespace scatter_op { @@ -117,7 +117,7 @@ struct AssignSYCL { p.device(d) = p / u; } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace internal } // namespace scatter_op @@ -156,7 +156,7 @@ struct ScatterFunctorBase { #ifdef TENSORFLOW_USE_SYCL template -struct ScatterFunctorBase { +struct ScatterFunctorBase { Index operator()(OpKernelContext* c, const SYCLDevice& d, typename TTypes::Matrix params, typename TTypes::ConstMatrix updates, @@ -171,13 +171,13 @@ struct ScatterFunctorBase { const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i)); if (!FastBoundsCheck(index, limit)) return i; // Copy last Ndim-1 dimensions of updates[i] to params[index] - scatter_op::internal::AssignSYCL::Run(d, params.template chip<0>(index), - updates.template 
chip<0>(i)); + scatter_op::internal::AssignSYCL::Run( + d, params.template chip<0>(index), updates.template chip<0>(i)); } return -1; } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template struct ScatterFunctorBase { @@ -217,7 +217,7 @@ struct ScatterFunctorBase { template struct ScatterFunctor - : ScatterFunctorBase{}; + : ScatterFunctorBase {}; #ifdef TENSORFLOW_USE_SYCL template @@ -239,7 +239,7 @@ struct ScatterFunctorSYCL { return -1; } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/scatter_functor_gpu.cu.h b/tensorflow/core/kernels/scatter_functor_gpu.cu.h index e116077d3cf..be18658543e 100644 --- a/tensorflow/core/kernels/scatter_functor_gpu.cu.h +++ b/tensorflow/core/kernels/scatter_functor_gpu.cu.h @@ -30,9 +30,10 @@ namespace tensorflow { typedef Eigen::GpuDevice GPUDevice; template -__global__ void ScatterOpCustomKernel( - T* params, const T* updates, const Index* indices, - Index first_dim_size, Index updates_size, Index indices_size) { +__global__ void ScatterOpCustomKernel(T* params, const T* updates, + const Index* indices, + Index first_dim_size, Index updates_size, + Index indices_size) { Index update_block = updates_size / indices_size; CUDA_1D_KERNEL_LOOP(i, updates_size) { int indices_i = i / update_block; @@ -85,8 +86,8 @@ struct ScatterFunctor { CudaLaunchConfig config = GetCudaLaunchConfig(updates_size, d); ScatterOpCustomKernel <<>>( - params.data(), updates.data(), indices.data(), - first_dim_size, updates_size, indices_size); + params.data(), updates.data(), indices.data(), first_dim_size, + updates_size, indices_size); return -1; } }; diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h index c6c9d4e6588..e82660dcc1d 100644 --- a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h +++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h @@ -40,7 +40,7 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class OpKernelContext; @@ -251,7 +251,7 @@ REGISTER_SCATTER_ND_MATH_SYCL(int32); #undef REGISTER_SCATTER_ND_INDEX_SYCL #undef REGISTER_SCATTER_ND_FULL_SYCL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor diff --git a/tensorflow/core/kernels/scatter_op.cc b/tensorflow/core/kernels/scatter_op.cc index 8607c7f95af..282165349f3 100644 --- a/tensorflow/core/kernels/scatter_op.cc +++ b/tensorflow/core/kernels/scatter_op.cc @@ -25,7 +25,7 @@ limitations under the License. 
#ifdef TENSORFLOW_USE_SYCL #include "tensorflow/core/common_runtime/sycl/sycl_util.h" -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL namespace tensorflow { @@ -33,7 +33,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Check whether updates.shape = indices.shape + params.shape[1:] static bool ValidShapes(const Tensor& params, const Tensor& updates, @@ -102,11 +102,12 @@ class ScatterUpdateOp : public OpKernel { // Check that we have enough index space const int64 N_big = indices.NumElements(); - OP_REQUIRES(c, N_big <= std::numeric_limits::max(), - errors::InvalidArgument( - "indices has too many elements for ", - DataTypeString(DataTypeToEnum::v()), " indexing: ", - N_big, " > ", std::numeric_limits::max())); + OP_REQUIRES( + c, N_big <= std::numeric_limits::max(), + errors::InvalidArgument("indices has too many elements for ", + DataTypeString(DataTypeToEnum::v()), + " indexing: ", N_big, " > ", + std::numeric_limits::max())); const Index N = static_cast(indices.NumElements()); OP_REQUIRES( c, params.dim_size(0) <= std::numeric_limits::max(), @@ -137,7 +138,7 @@ class ScatterUpdateOp : public OpKernel { #ifdef TENSORFLOW_USE_SYCL template -class ScatterUpdateOp : public OpKernel { +class ScatterUpdateOp : public OpKernel { public: explicit ScatterUpdateOp(OpKernelConstruction* c) : OpKernel(c) { OP_REQUIRES_OK(c, c->GetAttr("use_locking", &use_exclusive_lock_)); @@ -165,11 +166,12 @@ class ScatterUpdateOp : public OpKernel { // Check that we have enough index space const int64 N_big = indices.NumElements(); - OP_REQUIRES(c, N_big <= std::numeric_limits::max(), - errors::InvalidArgument( - "indices has too many elements for ", - DataTypeString(DataTypeToEnum::v()), " indexing: ", - N_big, " > ", std::numeric_limits::max())); + OP_REQUIRES( + c, N_big <= std::numeric_limits::max(), + errors::InvalidArgument("indices has too many elements for ", + DataTypeString(DataTypeToEnum::v()), + " indexing: ", N_big, " > ", + std::numeric_limits::max())); const Index N = static_cast(indices.NumElements()); OP_REQUIRES( c, params.dim_size(0) <= std::numeric_limits::max(), @@ -206,7 +208,7 @@ class ScatterUpdateOp : public OpKernel { } } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #define REGISTER_SCATTER_KERNEL_INDEX(type, index_type, dev, name, op) \ REGISTER_KERNEL_BUILDER(Name(name) \ diff --git a/tensorflow/core/kernels/sdca_internal.cc b/tensorflow/core/kernels/sdca_internal.cc index 863c123b43f..066a4b80a2b 100644 --- a/tensorflow/core/kernels/sdca_internal.cc +++ b/tensorflow/core/kernels/sdca_internal.cc @@ -37,9 +37,8 @@ void FeatureWeightsDenseStorage::UpdateDenseDeltaWeights( const size_t num_weight_vectors = normalized_bounded_dual_delta.size(); if (num_weight_vectors == 1) { deltas_.device(device) = - deltas_ + - dense_vector.RowAsMatrix() * - deltas_.constant(normalized_bounded_dual_delta[0]); + deltas_ + dense_vector.RowAsMatrix() * + deltas_.constant(normalized_bounded_dual_delta[0]); } else { // Transform the dual vector into a column matrix. const Eigen::TensorMap> @@ -61,9 +60,8 @@ void FeatureWeightsSparseStorage::UpdateSparseDeltaWeights( const Example::SparseFeatures& sparse_features, const std::vector& normalized_bounded_dual_delta) { for (int64 k = 0; k < sparse_features.indices->size(); ++k) { - const double feature_value = sparse_features.values == nullptr - ? 
1.0 - : (*sparse_features.values)(k); + const double feature_value = + sparse_features.values == nullptr ? 1.0 : (*sparse_features.values)(k); auto it = indices_to_id_.find((*sparse_features.indices)(k)); for (size_t l = 0; l < normalized_bounded_dual_delta.size(); ++l) { deltas_(l, it->second) += @@ -122,23 +120,24 @@ Status ModelWeights::Initialize(OpKernelContext* const context) { } // Reads in the weights, and allocates and initializes the delta weights. - const auto initialize_weights = [&]( - const OpInputList& weight_inputs, OpOutputList* const weight_outputs, - std::vector* const feature_weights) { - for (int i = 0; i < weight_inputs.size(); ++i) { - Tensor* delta_t; - TF_RETURN_IF_ERROR( - weight_outputs->allocate(i, weight_inputs[i].shape(), &delta_t)); - // Convert the input vector to a row matrix in internal representation. - auto deltas = delta_t->shaped({1, delta_t->NumElements()}); - deltas.setZero(); - feature_weights->emplace_back( - FeatureWeightsDenseStorage{weight_inputs[i].shaped( - {1, weight_inputs[i].NumElements()}), - deltas}); - } - return Status::OK(); - }; + const auto initialize_weights = + [&](const OpInputList& weight_inputs, OpOutputList* const weight_outputs, + std::vector* const feature_weights) { + for (int i = 0; i < weight_inputs.size(); ++i) { + Tensor* delta_t; + TF_RETURN_IF_ERROR( + weight_outputs->allocate(i, weight_inputs[i].shape(), &delta_t)); + // Convert the input vector to a row matrix in internal + // representation. + auto deltas = delta_t->shaped({1, delta_t->NumElements()}); + deltas.setZero(); + feature_weights->emplace_back(FeatureWeightsDenseStorage{ + weight_inputs[i].shaped( + {1, weight_inputs[i].NumElements()}), + deltas}); + } + return Status::OK(); + }; return initialize_weights(dense_weights_inputs, &dense_weights_outputs, &dense_weights_); diff --git a/tensorflow/core/kernels/sdca_internal.h b/tensorflow/core/kernels/sdca_internal.h index 9f072700754..45915693ac6 100644 --- a/tensorflow/core/kernels/sdca_internal.h +++ b/tensorflow/core/kernels/sdca_internal.h @@ -149,7 +149,8 @@ class Example { // 1.0f. struct SparseFeatures { std::unique_ptr::UnalignedConstVec> indices; - std::unique_ptr::UnalignedConstVec> values; // nullptr encodes optional. + std::unique_ptr::UnalignedConstVec> + values; // nullptr encodes optional. }; // A dense vector which is a row-slice of the underlying matrix. 
diff --git a/tensorflow/core/kernels/sdca_ops.cc b/tensorflow/core/kernels/sdca_ops.cc index 0f5c2424b38..dbe0177dda3 100644 --- a/tensorflow/core/kernels/sdca_ops.cc +++ b/tensorflow/core/kernels/sdca_ops.cc @@ -57,11 +57,11 @@ namespace tensorflow { namespace { -using sdca::Regularizations; using sdca::Example; using sdca::Examples; using sdca::ExampleStatistics; using sdca::ModelWeights; +using sdca::Regularizations; struct ComputeOptions { explicit ComputeOptions(OpKernelConstruction* const context) { @@ -76,8 +76,9 @@ struct ComputeOptions { } else if (loss_type == "smooth_hinge_loss") { loss_updater.reset(new SmoothHingeLossUpdater); } else { - OP_REQUIRES(context, false, errors::InvalidArgument( - "Unsupported loss type: ", loss_type)); + OP_REQUIRES( + context, false, + errors::InvalidArgument("Unsupported loss type: ", loss_type)); } OP_REQUIRES_OK(context, context->GetAttr("adaptative", &adaptative)); OP_REQUIRES_OK( @@ -90,9 +91,10 @@ struct ComputeOptions { context, num_sparse_features + num_dense_features > 0, errors::InvalidArgument("Requires at least one feature to train.")); - OP_REQUIRES(context, static_cast(num_sparse_features) + - static_cast(num_dense_features) <= - std::numeric_limits::max(), + OP_REQUIRES(context, + static_cast(num_sparse_features) + + static_cast(num_dense_features) <= + std::numeric_limits::max(), errors::InvalidArgument( strings::Printf("Too many feature groups: %lld > %d", static_cast(num_sparse_features) + diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc index 3ef1cd1e062..27b8081eb88 100644 --- a/tensorflow/core/kernels/segment_reduction_ops.cc +++ b/tensorflow/core/kernels/segment_reduction_ops.cc @@ -115,7 +115,7 @@ class SegmentReductionOp : public OpKernel { Eigen::DSizes dims_to_reduce; dims_to_reduce[0] = 0; #else - Eigen::IndexList> dims_to_reduce; + Eigen::IndexList > dims_to_reduce; #endif Index start = 0, end = 1; @@ -359,7 +359,8 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_SORTED_KERNELS_ALL); namespace functor { // UnsortedSegmentSumFunctor implementation for CPUDevice. -// todo: Remove duplicate code in UnsortedSegmentSumFunctor and UnsortedSegmentMaxFunctor. +// todo: Remove duplicate code in UnsortedSegmentSumFunctor and +// UnsortedSegmentMaxFunctor. 
template struct UnsortedSegmentSumFunctor : UnsortedSegmentBaseFunctor { @@ -461,9 +462,10 @@ class UnsortedSegmentBaseOp : public OpKernel { auto data_ptr = data.template flat().data(); reduction_functor_(context, context->template eigen_device(), - output_rows, segment_ids.shape(), segment_flat, - data.NumElements(), data_ptr, output_flat); + output_rows, segment_ids.shape(), segment_flat, + data.NumElements(), data_ptr, output_flat); } + private: functor::UnsortedSegmentBaseFunctor& reduction_functor_; }; @@ -472,22 +474,20 @@ template class UnsortedSegmentSumOp : public UnsortedSegmentBaseOp { public: explicit UnsortedSegmentSumOp(OpKernelConstruction* context) - : UnsortedSegmentBaseOp( - context, - sum_functor_) {} + : UnsortedSegmentBaseOp(context, sum_functor_) {} + private: - functor::UnsortedSegmentSumFunctor sum_functor_; + functor::UnsortedSegmentSumFunctor sum_functor_; }; template class UnsortedSegmentMaxOp : public UnsortedSegmentBaseOp { public: explicit UnsortedSegmentMaxOp(OpKernelConstruction* context) - : UnsortedSegmentBaseOp( - context, - max_functor_) {} + : UnsortedSegmentBaseOp(context, max_functor_) {} + private: - functor::UnsortedSegmentMaxFunctor max_functor_; + functor::UnsortedSegmentMaxFunctor max_functor_; }; #define REGISTER_REAL_CPU_UNSORTED_KERNELS(type, index_type) \ @@ -663,9 +663,9 @@ class SparseSegmentReductionOpBase : public OpKernel { Reduce(input_flat, indices_vec, start, end - start, out); OP_REQUIRES(context, bad_offset < 0, errors::InvalidArgument( - "Bad: indices[", start + bad_offset, "] == ", - indices_vec(start + bad_offset), " out of range [0, ", - input_flat.dimension(0), ")")); + "Bad: indices[", start + bad_offset, + "] == ", indices_vec(start + bad_offset), + " out of range [0, ", input_flat.dimension(0), ")")); start = end; ++end; diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h index bcdd42c80c1..5c9cfe09065 100644 --- a/tensorflow/core/kernels/segment_reduction_ops.h +++ b/tensorflow/core/kernels/segment_reduction_ops.h @@ -51,13 +51,14 @@ struct SegmentSumFunctor { // BaseFunctor for definition of UnsorteSegmentReductionOp // for usage without templates. template -struct UnsortedSegmentBaseFunctor{ - virtual ~UnsortedSegmentBaseFunctor(){} +struct UnsortedSegmentBaseFunctor { + virtual ~UnsortedSegmentBaseFunctor() {} virtual void operator()(OpKernelContext* ctx, const Device& d, - const Index output_rows, const TensorShape& segment_ids_shape, - typename TTypes::ConstFlat segment_ids, - const Index data_size, const T* data, - typename TTypes::Tensor output){}; + const Index output_rows, + const TensorShape& segment_ids_shape, + typename TTypes::ConstFlat segment_ids, + const Index data_size, const T* data, + typename TTypes::Tensor output){}; }; // Functor for UnsortedSegmentSumOp. @@ -70,7 +71,8 @@ struct UnsortedSegmentBaseFunctor{ // data: input data tensor. 
// output: output reshaped to {output_rows, output.size/output_rows} template -struct UnsortedSegmentSumFunctor: public UnsortedSegmentBaseFunctor { +struct UnsortedSegmentSumFunctor + : public UnsortedSegmentBaseFunctor { void operator()(OpKernelContext* ctx, const Device& d, const Index output_rows, const TensorShape& segment_ids_shape, typename TTypes::ConstFlat segment_ids, @@ -88,7 +90,8 @@ struct UnsortedSegmentSumFunctor: public UnsortedSegmentBaseFunctor -struct UnsortedSegmentMaxFunctor: public UnsortedSegmentBaseFunctor { +struct UnsortedSegmentMaxFunctor + : public UnsortedSegmentBaseFunctor { void operator()(OpKernelContext* ctx, const Device& d, const Index output_rows, const TensorShape& segment_ids_shape, typename TTypes::ConstFlat segment_ids, diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc index 159fada621b..39d520698e1 100644 --- a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc @@ -194,7 +194,8 @@ void SegmentSumFunctor::operator()( // UnsortedSegmentSumFunctor implementation for GPUDevice. template -struct UnsortedSegmentSumFunctor: UnsortedSegmentBaseFunctor { +struct UnsortedSegmentSumFunctor + : UnsortedSegmentBaseFunctor { void operator()(OpKernelContext* ctx, const GPUDevice& d, const Index output_rows, const TensorShape& segment_ids_shape, typename TTypes::ConstFlat segment_ids, @@ -221,11 +222,10 @@ struct UnsortedSegmentSumFunctor: UnsortedSegmentBaseFuncto const Index input_inner_dim_size = input_total_size / input_outer_dim_size; config = GetCudaLaunchConfig(input_total_size, d); - UnsortedSegmentSumCustomKernel< - T, - Index><<>>( - input_outer_dim_size, input_inner_dim_size, output_rows, - segment_ids.data(), data, output.data()); + UnsortedSegmentSumCustomKernel + <<>>( + input_outer_dim_size, input_inner_dim_size, output_rows, + segment_ids.data(), data, output.data()); } }; diff --git a/tensorflow/core/kernels/self_adjoint_eig_op.cc b/tensorflow/core/kernels/self_adjoint_eig_op.cc index 97657807268..bcd88773902 100644 --- a/tensorflow/core/kernels/self_adjoint_eig_op.cc +++ b/tensorflow/core/kernels/self_adjoint_eig_op.cc @@ -25,7 +25,6 @@ limitations under the License. 
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" - namespace tensorflow { template diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc index 206fd40fa68..688e61fcadc 100644 --- a/tensorflow/core/kernels/sendrecv_ops.cc +++ b/tensorflow/core/kernels/sendrecv_ops.cc @@ -114,7 +114,7 @@ REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_GPU), SendOp); REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_SYCL), SendOp); REGISTER_KERNEL_BUILDER( Name("_HostSend").Device(DEVICE_SYCL).HostMemory("tensor"), SendOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("_HostSend").Device(DEVICE_CPU), SendOp); REGISTER_KERNEL_BUILDER( @@ -198,7 +198,7 @@ REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_GPU), RecvOp); #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_SYCL), RecvOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("_HostRecv").Device(DEVICE_CPU), RecvOp); REGISTER_KERNEL_BUILDER( @@ -207,6 +207,6 @@ REGISTER_KERNEL_BUILDER( #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER( Name("_HostRecv").Device(DEVICE_SYCL).HostMemory("tensor"), RecvOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // end namespace tensorflow diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc index e2e3758d87e..9db0bd4d98b 100644 --- a/tensorflow/core/kernels/sequence_ops.cc +++ b/tensorflow/core/kernels/sequence_ops.cc @@ -53,13 +53,13 @@ class RangeOp : public OpKernel { if (delta > 0) { OP_REQUIRES( context, start <= limit, - errors::InvalidArgument("Requires start <= limit when delta > 0: ", - start, "/", limit)); + errors::InvalidArgument( + "Requires start <= limit when delta > 0: ", start, "/", limit)); } else { OP_REQUIRES( context, start >= limit, - errors::InvalidArgument("Requires start >= limit when delta < 0: ", - start, "/", limit)); + errors::InvalidArgument( + "Requires start >= limit when delta < 0: ", start, "/", limit)); } int64 size = (std::is_integral::value ? 
((std::abs(limit - start) + std::abs(delta) - 1) / diff --git a/tensorflow/core/kernels/serialize_sparse_op.cc b/tensorflow/core/kernels/serialize_sparse_op.cc index 61e40caef99..799c574d154 100644 --- a/tensorflow/core/kernels/serialize_sparse_op.cc +++ b/tensorflow/core/kernels/serialize_sparse_op.cc @@ -426,7 +426,6 @@ class DeserializeSparseOp : public OpKernel { switch (dtype_) { TF_CALL_ALL_TYPES(HANDLE_TYPE); TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE); - TF_CALL_variant(HANDLE_TYPE); #undef HANDLE_TYPE default: OP_REQUIRES(context, false, diff --git a/tensorflow/core/kernels/session_ops.cc b/tensorflow/core/kernels/session_ops.cc index 185c5b248fc..f2dd2812b53 100644 --- a/tensorflow/core/kernels/session_ops.cc +++ b/tensorflow/core/kernels/session_ops.cc @@ -144,7 +144,7 @@ REGISTER_GPU_KERNEL(bool); TF_CALL_NUMBER_TYPES(REGISTER_SYCL_KERNEL); REGISTER_SYCL_KERNEL(bool); #undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class DeleteSessionTensorOp : public OpKernel { public: diff --git a/tensorflow/core/kernels/shape_ops.h b/tensorflow/core/kernels/shape_ops.h index 8d9d0ea8461..55be308901b 100644 --- a/tensorflow/core/kernels/shape_ops.h +++ b/tensorflow/core/kernels/shape_ops.h @@ -235,10 +235,10 @@ class SqueezeOp : public OpKernel { if (!wrapped_squeeze_dims.empty()) { if (wrapped_squeeze_dims.count(i) > 0) { OP_REQUIRES(ctx, existing_dim == 1, - errors::InvalidArgument("Tried to explicitly squeeze " - "dimension ", - i, " but dimension was not 1: ", - existing_dim)); + errors::InvalidArgument( + "Tried to explicitly squeeze " + "dimension ", + i, " but dimension was not 1: ", existing_dim)); } else { // This dimension is not being squeezed. new_shape.push_back(existing_dim); diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc index 82595de7794..79369fd4a9c 100644 --- a/tensorflow/core/kernels/slice_op.cc +++ b/tensorflow/core/kernels/slice_op.cc @@ -58,7 +58,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Shared code that is not dependent on the type of T. We do this to reduce // code size by not duplicating all this for all T (float, double, int32, etc.) 
@@ -72,10 +72,11 @@ static void SharedValidation(OpKernelContext* context, const Tensor& size_tensor = context->input(2); OP_REQUIRES( - context, context->op_kernel().IsLegacyVector(begin_tensor.shape()) && - context->op_kernel().IsLegacyVector(size_tensor.shape()) && - begin_tensor.NumElements() == input.dims() && - size_tensor.NumElements() == input.dims(), + context, + context->op_kernel().IsLegacyVector(begin_tensor.shape()) && + context->op_kernel().IsLegacyVector(size_tensor.shape()) && + begin_tensor.NumElements() == input.dims() && + size_tensor.NumElements() == input.dims(), errors::InvalidArgument( "Expected begin and size arguments to be 1-D tensors of size ", input.dims(), ", but got shapes ", begin_tensor.shape().DebugString(), @@ -125,8 +126,7 @@ static void SharedSliceCommonCases(OpKernelContext* context, TensorShape* output_shape, gtl::InlinedVector* begin, gtl::InlinedVector* size, - Tensor** result, - bool* done) { + Tensor** result, bool* done) { bool is_identity = true; bool slice_dim0 = true; *done = false; @@ -142,8 +142,8 @@ static void SharedSliceCommonCases(OpKernelContext* context, return; } - if (slice_dim0 && IsDim0SliceAligned(input.shape(), (*begin)[0], - (*size)[0])) { + if (slice_dim0 && + IsDim0SliceAligned(input.shape(), (*begin)[0], (*size)[0])) { VLOG(1) << "Slice dim 0: " << input.shape().DebugString(); CHECK_GE(input.dims(), 1); // Otherwise, is_identity should be true. context->set_output(0, input.Slice((*begin)[0], (*begin)[0] + (*size)[0])); @@ -154,7 +154,6 @@ static void SharedSliceCommonCases(OpKernelContext* context, OP_REQUIRES_OK(context, context->allocate_output(0, *output_shape, result)); } - template class SliceOp : public OpKernel { public: @@ -206,8 +205,9 @@ class SliceOp : public OpKernel { #undef HANDLE_DIM - OP_REQUIRES(context, false, errors::Unimplemented( - "SliceOp : Unhandled input dimensions")); + OP_REQUIRES( + context, false, + errors::Unimplemented("SliceOp : Unhandled input dimensions")); } } @@ -280,8 +280,9 @@ class MklSliceOp : public OpKernel { #undef HANDLE_DIM - OP_REQUIRES(context, false, errors::Unimplemented( - "SliceOp : Unhandled input dimensions")); + OP_REQUIRES( + context, false, + errors::Unimplemented("SliceOp : Unhandled input dimensions")); } } @@ -292,9 +293,9 @@ class MklSliceOp : public OpKernel { // as the sizes of all the dimensions of the input except slice_dim, then // returns True. Otherwise, returns False. bool DoesSliceShapeDifferInOnly1DHelper(const TensorShape& input_shape, - const gtl::ArraySlice& begin, - const gtl::ArraySlice& size, - int slice_dim) { + const gtl::ArraySlice& begin, + const gtl::ArraySlice& size, + int slice_dim) { for (int dim = 0; dim < 4; dim++) { if (dim != slice_dim && (begin[dim] != 0 || size[dim] != input_shape.dim_size(dim))) { @@ -316,9 +317,9 @@ class MklSliceOp : public OpKernel { // Returns True if Slicing over a single dimension, and sets slice_dim // to the number of the dimension that satisfies criteria. 
bool DoesSliceShapeDifferInOnly1D(const TensorShape& input_shape, - const gtl::ArraySlice& begin, - const gtl::ArraySlice& size, - int* slice_dim) { + const gtl::ArraySlice& begin, + const gtl::ArraySlice& size, + int* slice_dim) { for (int dim = 0; dim < 4; dim++) { if (DoesSliceShapeDifferInOnly1DHelper(input_shape, begin, size, dim)) { *slice_dim = dim; @@ -329,8 +330,7 @@ class MklSliceOp : public OpKernel { } template - void HandleCase(OpKernelContext* context, - const gtl::ArraySlice& begin, + void HandleCase(OpKernelContext* context, const gtl::ArraySlice& begin, const gtl::ArraySlice& size, Tensor* result) { int slice_dim = -1; TensorShape in_shape = context->input(0).shape(); @@ -340,67 +340,63 @@ class MklSliceOp : public OpKernel { // format over channel dimension. if (NDIM == 4 && DoesSliceShapeDifferInOnly1D(in_shape, begin, size, &slice_dim)) { - size_t in_strides[4] = { (size_t) in_shape.dim_size(1) * - in_shape.dim_size(2) * - in_shape.dim_size(3), - (size_t) in_shape.dim_size(2) * - in_shape.dim_size(3), - (size_t) in_shape.dim_size(3), - (size_t) 1 - }; + size_t in_strides[4] = { + (size_t)in_shape.dim_size(1) * in_shape.dim_size(2) * + in_shape.dim_size(3), + (size_t)in_shape.dim_size(2) * in_shape.dim_size(3), + (size_t)in_shape.dim_size(3), (size_t)1}; - size_t out_strides[4] = { (size_t) size[1] * size[2] * size[3], - (size_t) size[2] * size[3], - (size_t) size[3], - (size_t) 1 }; + size_t out_strides[4] = {(size_t)size[1] * size[2] * size[3], + (size_t)size[2] * size[3], (size_t)size[3], + (size_t)1}; - T *in_buf = const_cast(const_cast( - context->input(0).flat().data())); - T *op_buf = result->flat().data(); + T* in_buf = const_cast( + const_cast(context->input(0).flat().data())); + T* op_buf = result->flat().data(); - if (slice_dim == 1) { - /* data format = NCHW */ + if (slice_dim == 1) { + /* data format = NCHW */ - #pragma omp parallel for - for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) { - T *ip = in_buf + (d0 * in_strides[0]); - T *op = op_buf + ((d0 - begin[0]) * out_strides[0]); - #pragma omp parallel for - for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) { - T *ip1 = ip + (d1 * in_strides[1]); - T *op1 = op + ((d1 - begin[1]) * out_strides[1]); - // For NCHW, H and W will be contiguous. So we can copy - // both with one memcpy. - memcpy(static_cast(op1), static_cast(ip1), - sizeof(T) * in_strides[1]); - } +#pragma omp parallel for + for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) { + T* ip = in_buf + (d0 * in_strides[0]); + T* op = op_buf + ((d0 - begin[0]) * out_strides[0]); +#pragma omp parallel for + for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) { + T* ip1 = ip + (d1 * in_strides[1]); + T* op1 = op + ((d1 - begin[1]) * out_strides[1]); + // For NCHW, H and W will be contiguous. So we can copy + // both with one memcpy. 
+ memcpy(static_cast(op1), static_cast(ip1), + sizeof(T) * in_strides[1]); } - return; - } else if (slice_dim == 3) { - /* data_format = NHWC */ - - #pragma omp parallel for - for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) { - T *ip = in_buf + (d0 * in_strides[0]); - T *op = op_buf + ((d0 - begin[0]) * out_strides[0]); - #pragma omp parallel for - for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) { - T *ip1 = ip + (d1 * in_strides[1]); - T *op1 = op + ((d1 - begin[1]) * out_strides[1]); - #pragma omp parallel for - for (size_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) { - T *ip2 = ip1 + (d2 * in_strides[2]); - T *ip3 = ip2 + begin[3]; - T *op2 = op1 + ((d2 - begin[2]) * out_strides[2]); - T *op3 = op2; - memcpy(static_cast(op3), static_cast(ip3), - sizeof(T) * size[3]); - } - } - } - return; } - // slice_dim is not 1 or 3, then we fallback to Eigen implementation. + return; + } else if (slice_dim == 3) { + /* data_format = NHWC */ + +#pragma omp parallel for + for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) { + T* ip = in_buf + (d0 * in_strides[0]); + T* op = op_buf + ((d0 - begin[0]) * out_strides[0]); +#pragma omp parallel for + for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) { + T* ip1 = ip + (d1 * in_strides[1]); + T* op1 = op + ((d1 - begin[1]) * out_strides[1]); +#pragma omp parallel for + for (size_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) { + T* ip2 = ip1 + (d2 * in_strides[2]); + T* ip3 = ip2 + begin[3]; + T* op2 = op1 + ((d2 - begin[2]) * out_strides[2]); + T* op3 = op2; + memcpy(static_cast(op3), static_cast(ip3), + sizeof(T) * size[3]); + } + } + } + return; + } + // slice_dim is not 1 or 3, then we fallback to Eigen implementation. } Eigen::DSizes indices; @@ -535,13 +531,13 @@ REGISTER_KERNEL_BUILDER(Name("Slice") #ifdef TENSORFLOW_USE_SYCL // Forward declarations of the functor specializations for SYCL. namespace functor { -#define DECLARE_SYCL_SPEC(T, NDIM) \ - template <> \ - void Slice::operator()( \ - const SYCLDevice& d, typename TTypes::Tensor output,\ - typename TTypes::ConstTensor input, \ - const Eigen::DSizes& indices, \ - const Eigen::DSizes& sizes); \ +#define DECLARE_SYCL_SPEC(T, NDIM) \ + template <> \ + void Slice::operator()( \ + const SYCLDevice& d, typename TTypes::Tensor output, \ + typename TTypes::ConstTensor input, \ + const Eigen::DSizes& indices, \ + const Eigen::DSizes& sizes); \ extern template struct Slice; #define DECLARE_FOR_N(T) \ diff --git a/tensorflow/core/kernels/slice_op.h b/tensorflow/core/kernels/slice_op.h index 0362a021336..db7eded745e 100644 --- a/tensorflow/core/kernels/slice_op.h +++ b/tensorflow/core/kernels/slice_op.h @@ -24,7 +24,6 @@ limitations under the License. 
namespace tensorflow { namespace functor { - template struct Slice { void operator()(const Device& d, typename TTypes::Tensor output, diff --git a/tensorflow/core/kernels/slice_op_cpu_impl.h b/tensorflow/core/kernels/slice_op_cpu_impl.h index 47f1d5342a9..64b6948190a 100644 --- a/tensorflow/core/kernels/slice_op_cpu_impl.h +++ b/tensorflow/core/kernels/slice_op_cpu_impl.h @@ -43,7 +43,7 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_SYCL_KERNELS); DEFINE_SYCL_KERNELS(int32); #undef DEFINE_SYCL_KERNELS -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/softmax_op.cc b/tensorflow/core/kernels/softmax_op.cc index 590f01c4691..e1712ac239d 100644 --- a/tensorflow/core/kernels/softmax_op.cc +++ b/tensorflow/core/kernels/softmax_op.cc @@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Partial specialization for a CPUDevice, that uses the Eigen implementation // from SoftmaxEigenImpl. @@ -48,7 +48,7 @@ struct SoftmaxFunctor : SoftmaxFunctorBase {}; #ifdef TENSORFLOW_USE_SYCL template struct SoftmaxFunctor : SoftmaxFunctorBase {}; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor template @@ -100,5 +100,5 @@ REGISTER_KERNEL_BUILDER( REGISTER_KERNEL_BUILDER( Name("Softmax").Device(DEVICE_SYCL).TypeConstraint("T"), SoftmaxOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/spacetobatch_benchmark_test.cc b/tensorflow/core/kernels/spacetobatch_benchmark_test.cc index c25ce2d8bb5..92ddf8edbfb 100644 --- a/tensorflow/core/kernels/spacetobatch_benchmark_test.cc +++ b/tensorflow/core/kernels/spacetobatch_benchmark_test.cc @@ -70,7 +70,7 @@ static Graph* ConstructSpaceToBatchGraph( } \ BENCHMARK( \ BM_##OP##_##DEVICE##_##DTYPE##_##B##_##H##_##W##_##D##_bs##BS##_pad##P00##_##P01##_##P10##_##P11); -#define BM_SpaceToBatch(OP, ...) \ +#define BM_SpaceToBatch(OP, ...) 
\ BM_Expand(BM_SpaceToBatchDev(OP, cpu, DT_FLOAT, __VA_ARGS__)); \ BM_Expand(BM_SpaceToBatchDev(OP, gpu, DT_FLOAT, __VA_ARGS__)); \ BM_Expand(BM_SpaceToBatchDev(OP, cpu, DT_HALF, __VA_ARGS__)); \ diff --git a/tensorflow/core/kernels/spacetobatch_functor.cc b/tensorflow/core/kernels/spacetobatch_functor.cc index 23d8a5f9ed4..4c374b8d994 100644 --- a/tensorflow/core/kernels/spacetobatch_functor.cc +++ b/tensorflow/core/kernels/spacetobatch_functor.cc @@ -154,7 +154,7 @@ struct SpaceToBatchFunctor { #define INSTANTIATE(NUM_BLOCK_DIMS, T) \ template struct SpaceToBatchFunctor; \ template struct SpaceToBatchFunctor; \ -/**/ + /**/ #define INSTANTIATE_FOR_T(T) \ TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS(INSTANTIATE, T) diff --git a/tensorflow/core/kernels/spacetobatch_functor.h b/tensorflow/core/kernels/spacetobatch_functor.h index 06813650c08..f46a84da1e9 100644 --- a/tensorflow/core/kernels/spacetobatch_functor.h +++ b/tensorflow/core/kernels/spacetobatch_functor.h @@ -44,7 +44,7 @@ constexpr int kMaxSpaceToBatchBlockDims = 4; MACRO(2 /**/, ##__VA_ARGS__) \ MACRO(3 /**/, ##__VA_ARGS__) \ MACRO(4 /**/, ##__VA_ARGS__) \ -/**/ + /**/ namespace internal { namespace spacetobatch { diff --git a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc index db8d419c38f..5687141c9ea 100644 --- a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc @@ -141,10 +141,10 @@ struct SpaceToBatchFunctor { } CudaLaunchConfig config = GetCudaLaunchConfig(static_cast(total_count), d); - S2B<<>>( - config.virtual_thread_count, const_cast(space_tensor.data()), args, - const_cast(batch_tensor.data())); + S2B + <<>>( + config.virtual_thread_count, const_cast(space_tensor.data()), + args, const_cast(batch_tensor.data())); return Status::OK(); } }; @@ -153,7 +153,7 @@ struct SpaceToBatchFunctor { #define INSTANTIATE(NUM_BLOCK_DIMS, T) \ template struct SpaceToBatchFunctor; \ template struct SpaceToBatchFunctor; \ -/**/ + /**/ #define INSTANTIATE_FOR_T(T) \ TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS(INSTANTIATE, T) diff --git a/tensorflow/core/kernels/spacetobatch_op.cc b/tensorflow/core/kernels/spacetobatch_op.cc index 95c1f5e7e8c..fdc08ec8e3b 100644 --- a/tensorflow/core/kernels/spacetobatch_op.cc +++ b/tensorflow/core/kernels/spacetobatch_op.cc @@ -58,9 +58,10 @@ void SpaceToBatchOpCompute(OpKernelContext* context, errors::InvalidArgument("input rank should be >= ", 1 + block_dims, " instead of ", orig_input_tensor.dims())); - OP_REQUIRES(context, TensorShapeUtils::IsMatrix(orig_paddings.shape()) && - block_dims == orig_paddings.dim_size(0) && - 2 == orig_paddings.dim_size(1), + OP_REQUIRES(context, + TensorShapeUtils::IsMatrix(orig_paddings.shape()) && + block_dims == orig_paddings.dim_size(0) && + 2 == orig_paddings.dim_size(1), errors::InvalidArgument("paddings should have shape [", block_dims, ", 2] instead of ", orig_paddings.shape().DebugString())); diff --git a/tensorflow/core/kernels/sparse_add_grad_op.cc b/tensorflow/core/kernels/sparse_add_grad_op.cc index d8ed0c6f0c2..8597f3a8f73 100644 --- a/tensorflow/core/kernels/sparse_add_grad_op.cc +++ b/tensorflow/core/kernels/sparse_add_grad_op.cc @@ -35,9 +35,10 @@ class SparseAddGradOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->input("b_indices", &b_indices)); OP_REQUIRES_OK(ctx, ctx->input("sum_indices", &sum_indices)); - OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_indices->shape()) && - TensorShapeUtils::IsMatrix(b_indices->shape()) && - 
TensorShapeUtils::IsMatrix(sum_indices->shape()), + OP_REQUIRES(ctx, + TensorShapeUtils::IsMatrix(a_indices->shape()) && + TensorShapeUtils::IsMatrix(b_indices->shape()) && + TensorShapeUtils::IsMatrix(sum_indices->shape()), errors::InvalidArgument( "Input indices should be matrices but received shapes: ", a_indices->shape().DebugString(), " and ", @@ -49,8 +50,9 @@ class SparseAddGradOp : public OpKernel { "Input backprop_val_grad should be a vector but received shape: ", backprop_val_grad->shape().DebugString())); OP_REQUIRES( - ctx, a_indices->dim_size(1) == b_indices->dim_size(1) && - b_indices->dim_size(1) == sum_indices->dim_size(1), + ctx, + a_indices->dim_size(1) == b_indices->dim_size(1) && + b_indices->dim_size(1) == sum_indices->dim_size(1), errors::InvalidArgument("The densified operands should have the same " "ndims; for A, B, sum got: ", a_indices->dim_size(1), b_indices->dim_size(1), diff --git a/tensorflow/core/kernels/sparse_add_op.cc b/tensorflow/core/kernels/sparse_add_op.cc index bd91dfdce64..d16317af671 100644 --- a/tensorflow/core/kernels/sparse_add_op.cc +++ b/tensorflow/core/kernels/sparse_add_op.cc @@ -34,8 +34,9 @@ class SparseAddOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->input("a_indices", &a_indices)); OP_REQUIRES_OK(ctx, ctx->input("b_indices", &b_indices)); - OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_indices->shape()) && - TensorShapeUtils::IsMatrix(b_indices->shape()), + OP_REQUIRES(ctx, + TensorShapeUtils::IsMatrix(a_indices->shape()) && + TensorShapeUtils::IsMatrix(b_indices->shape()), errors::InvalidArgument( "Input indices should be matrices but received shapes: ", a_indices->shape().DebugString(), " and ", @@ -46,8 +47,9 @@ class SparseAddOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->input("a_values", &a_values_t)); OP_REQUIRES_OK(ctx, ctx->input("b_values", &b_values_t)); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_values_t->shape()) && - TensorShapeUtils::IsVector(b_values_t->shape()), + OP_REQUIRES(ctx, + TensorShapeUtils::IsVector(a_values_t->shape()) && + TensorShapeUtils::IsVector(b_values_t->shape()), errors::InvalidArgument( "Input values should be vectors but received shapes: ", a_values_t->shape().DebugString(), " and ", @@ -62,8 +64,9 @@ class SparseAddOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->input("a_shape", &a_shape)); OP_REQUIRES_OK(ctx, ctx->input("b_shape", &b_shape)); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_shape->shape()) && - TensorShapeUtils::IsVector(b_shape->shape()), + OP_REQUIRES(ctx, + TensorShapeUtils::IsVector(a_shape->shape()) && + TensorShapeUtils::IsVector(b_shape->shape()), errors::InvalidArgument( "Input shapes should be a vector but received shapes ", a_shape->shape().DebugString(), " and ", diff --git a/tensorflow/core/kernels/sparse_add_op_test.cc b/tensorflow/core/kernels/sparse_add_op_test.cc index 4cad02bbee8..1f08e6c5ce2 100644 --- a/tensorflow/core/kernels/sparse_add_op_test.cc +++ b/tensorflow/core/kernels/sparse_add_op_test.cc @@ -61,9 +61,9 @@ TEST_F(SparseAddOpTest, TwoD_AddSparseTensorWithSelf) { // [3 4] const auto indices_shape = TensorShape({4, 2}); - std::initializer_list in{ 0, 1, 1, 0, 2, 0, 2, 1 }; + std::initializer_list in{0, 1, 1, 0, 2, 0, 2, 1}; const gtl::ArraySlice indices(in); - std::initializer_list sh{ 3, 2 }; + std::initializer_list sh{3, 2}; const gtl::ArraySlice shape(sh); #define ADD_TENSOR_INPUT() \ diff --git a/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc b/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc index 
c122616cf15..80bc1f19344 100644 --- a/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc +++ b/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc @@ -103,8 +103,9 @@ class SparseAccumulatorTakeGradientOp DoneCallback callback) override { // Check signature OP_REQUIRES_OK_ASYNC( - ctx, ctx->MatchSignature({DT_STRING_REF, DT_INT32}, - {DT_INT64, accumulator->dtype(), DT_INT64}), + ctx, + ctx->MatchSignature({DT_STRING_REF, DT_INT32}, + {DT_INT64, accumulator->dtype(), DT_INT64}), callback); } diff --git a/tensorflow/core/kernels/sparse_cross_op.cc b/tensorflow/core/kernels/sparse_cross_op.cc index 07d935d55fe..7cd4532ad63 100644 --- a/tensorflow/core/kernels/sparse_cross_op.cc +++ b/tensorflow/core/kernels/sparse_cross_op.cc @@ -288,8 +288,7 @@ struct CrossTraits { template class SparseCrossOp : public OpKernel { public: - explicit SparseCrossOp(OpKernelConstruction* context) - : OpKernel(context) { + explicit SparseCrossOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("num_buckets", &num_buckets_)); // Read signed_hash_key_ as int64 since uint64 attributes are not // supported by REGISTER_OP. @@ -316,8 +315,8 @@ class SparseCrossOp : public OpKernel { GenerateColumnsFromInput(indices_list_in, values_list_in, shapes_list_in, dense_list_in); - typename CrossTraits::Crosser - crosser(columns, num_buckets_, hash_key_); + typename CrossTraits::Crosser crosser( + columns, num_buckets_, hash_key_); Tensor* indices_out; Tensor* values_out; Tensor* shape_out; @@ -326,8 +325,8 @@ class SparseCrossOp : public OpKernel { CreateOutputTensors(columns, batch_size, context, &indices_out, &values_out, &shape_out, &output_start_indices); - typename CrossTraits::Updater - updater(output_start_indices, indices_out, values_out); + typename CrossTraits::Updater updater( + output_start_indices, indices_out, values_out); auto do_work = [this, &columns, crosser, updater](int64 begin, int64 end) { for (int b = begin; b < end; b++) { ProductIterator product_iterator(columns, b); @@ -381,8 +380,9 @@ class SparseCrossOp : public OpKernel { "Input values should be a std::vector but received shape ", values_list_in[i].shape().DebugString(), " at position ", i)); OP_REQUIRES( - context, indices_list_in[i].shape().dim_size(0) == - values_list_in[i].shape().dim_size(0), + context, + indices_list_in[i].shape().dim_size(0) == + values_list_in[i].shape().dim_size(0), errors::InvalidArgument( "Expected size of values to be ", indices_list_in[i].shape().dim_size(0), " got ", diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc index cc0f86ce05e..ac48202ada2 100644 --- a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc +++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc @@ -70,8 +70,9 @@ class SparseDenseBinaryOpShared : public OpKernel { errors::InvalidArgument( "Input sp_indices should be a matrix but received shape: ", indices_t->shape().DebugString())); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(values_t->shape()) && - TensorShapeUtils::IsVector(shape_t->shape()), + OP_REQUIRES(ctx, + TensorShapeUtils::IsVector(values_t->shape()) && + TensorShapeUtils::IsVector(shape_t->shape()), errors::InvalidArgument( "Inputs sp_values and sp_shape should be vectors " "but received shapes: ", @@ -150,8 +151,9 @@ class SparseDenseBinaryOpShared : public OpKernel { CASE(4); CASE(5); default: - OP_REQUIRES(ctx, false, errors::InvalidArgument( - "Only tensors with ranks between 1 
and 5 " + OP_REQUIRES( + ctx, false, + errors::InvalidArgument("Only tensors with ranks between 1 and 5 " "are currently supported. Tensor rank: ", ndims)); #undef CASE diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc index eaf1884243e..fe198af7e6c 100644 --- a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc +++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc @@ -96,9 +96,9 @@ TEST_F(SparseDenseCDivTest, SameShape) { // [2 ] cdiv [dense: same shape, all 1's] // [3 4] const auto indices_shape = TensorShape({4, 2}); - std::initializer_list in{ 0, 1, 1, 0, 2, 0, 2, 1 }; + std::initializer_list in{0, 1, 1, 0, 2, 0, 2, 1}; const gtl::ArraySlice indices(in); - std::initializer_list sh{ 3, 2 }; + std::initializer_list sh{3, 2}; const gtl::ArraySlice shape(sh); // Tensor dense(DT_FLOAT, TensorShape({3, 1})); @@ -125,9 +125,9 @@ TEST_F(SparseDenseCDivTest, BroadcastDenseSameDims) { // [2 ] cdiv [dense: shape [3,1], all 1's] // [3 4] const auto indices_shape = TensorShape({4, 2}); - std::initializer_list in{ 0, 1, 1, 0, 2, 0, 2, 1 }; + std::initializer_list in{0, 1, 1, 0, 2, 0, 2, 1}; const gtl::ArraySlice indices(in); - std::initializer_list sh{ 3, 2 }; + std::initializer_list sh{3, 2}; const gtl::ArraySlice shape(sh); Tensor dense(DT_FLOAT, TensorShape({3, 1})); @@ -152,9 +152,9 @@ TEST_F(SparseDenseCDivTest, BroadcastDenseFewerDims) { // [2 ] cdiv [dense: shape [2]] // [3 4] const auto indices_shape = TensorShape({4, 2}); - std::initializer_list in{ 0, 1, 1, 0, 2, 0, 2, 1 }; + std::initializer_list in{0, 1, 1, 0, 2, 0, 2, 1}; const gtl::ArraySlice indices(in); - std::initializer_list sh{ 3, 2 }; + std::initializer_list sh{3, 2}; const gtl::ArraySlice shape(sh); Tensor dense(DT_FLOAT, TensorShape({2})); @@ -184,9 +184,9 @@ TEST_F(SparseDenseCMulTest, BroadcastDense) { // [1 ?] where ? remains implicitly zero. // [1.5 0] const auto indices_shape = TensorShape({4, 2}); - std::initializer_list in{ 0, 1, 1, 0, 2, 0, 2, 1 }; + std::initializer_list in{0, 1, 1, 0, 2, 0, 2, 1}; const gtl::ArraySlice indices(in); - std::initializer_list sh{ 3, 2 }; + std::initializer_list sh{3, 2}; const gtl::ArraySlice shape(sh); Tensor dense(DT_FLOAT, TensorShape({2})); diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc index 8ab23b64d3d..a1f9667b783 100644 --- a/tensorflow/core/kernels/sparse_matmul_op.cc +++ b/tensorflow/core/kernels/sparse_matmul_op.cc @@ -159,8 +159,8 @@ struct SparseSlice { template template -void SparseSlice::Initialize(const typename SparseSlice::ConstMatrixMap& mat, - int col_offset) { +void SparseSlice::Initialize( + const typename SparseSlice::ConstMatrixMap& mat, int col_offset) { const int mat_rows = Transpose ? mat.dimension(1) : mat.dimension(0); const int mat_cols = Transpose ? mat.dimension(0) : mat.dimension(1); DCHECK_LE(num_rows, mat_rows); @@ -278,9 +278,9 @@ ALWAYS_INLINE float ConvertBfloat16ToFloat(const bfloat16* src) { float out = 0; auto tmp = reinterpret_cast(&out); #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - tmp[0] = *src; + tmp[0] = *src; #else - tmp[1] = *src; + tmp[1] = *src; #endif return out; } @@ -970,9 +970,9 @@ class SparseMatMulOp : public OpKernel { const int k2 = transpose_b_ ? 
b.dim_size(1) : b.dim_size(0); OP_REQUIRES(ctx, k == k2, - errors::InvalidArgument("Matrix size incompatible: a: ", - a.shape().DebugString(), ", b: ", - b.shape().DebugString())); + errors::InvalidArgument( + "Matrix size incompatible: a: ", a.shape().DebugString(), + ", b: ", b.shape().DebugString())); Tensor* output = nullptr; OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({m, n}), &output)); @@ -1224,8 +1224,9 @@ ALWAYS_INLINE void CopyAndMayBeInterleave(void* dst, const void* src, template inline BlockingCounter* SparseMatMul::ShuffleMatrix( - const typename SparseMatMul::ConstMatrixMapR& mat, int slice_row_start, - int slice_num_rows, int slice_col_start, int slice_num_cols, const int N, + const typename SparseMatMul::ConstMatrixMapR& mat, + int slice_row_start, int slice_num_rows, int slice_col_start, + int slice_num_cols, const int N, const DeviceBase::CpuWorkerThreads* thread_pool, MatrixR* buffer) { DCHECK_EQ(N % 2, 0); DCHECK_LE(kNumOperands * sizeof(float) / sizeof(TR), N); @@ -1306,8 +1307,9 @@ inline std::unique_ptr SparseMatMul::CreateDenseSlices( template inline void SparseMatMul::ComputeBlockSizes( const typename SparseMatMul::ConstMatrixMapL& left, - const typename SparseMatMul::ConstMatrixMapR& right, bool transpose_left, - int num_threads, int* KR, int* NR, int* KL, int* JB, int* IB) { + const typename SparseMatMul::ConstMatrixMapR& right, + bool transpose_left, int num_threads, int* KR, int* NR, int* KL, int* JB, + int* IB) { // Heuristics for calculating block sizes // Assume two hyperthreads per core. const int est_num_cores = std::max(1, (num_threads + 1) / 2); diff --git a/tensorflow/core/kernels/sparse_matmul_op.h b/tensorflow/core/kernels/sparse_matmul_op.h index cca52558ae2..14ef2ed7044 100644 --- a/tensorflow/core/kernels/sparse_matmul_op.h +++ b/tensorflow/core/kernels/sparse_matmul_op.h @@ -159,25 +159,25 @@ EIGEN_STRONG_INLINE Packet4f pload2bf16(const float* from) { // Return a packet with the first value of the input Packet replicated template <> EIGEN_STRONG_INLINE Packet4f pbroadcast_first(const Packet4f& a) { - return vec_splat (a, 0); + return vec_splat(a, 0); } // Return a packet with the second value of the input Packet replicated template <> EIGEN_STRONG_INLINE Packet4f pbroadcast_second(const Packet4f& a) { - return vec_splat (a, 1); + return vec_splat(a, 1); } // Return a packet with the third value of the input Packet replicated template <> EIGEN_STRONG_INLINE Packet4f pbroadcast_third(const Packet4f& a) { - return vec_splat (a, 2); + return vec_splat(a, 2); } // Return a packet with the fourth value of the input Packet replicated template <> EIGEN_STRONG_INLINE Packet4f pbroadcast_fourth(const Packet4f& a) { - return vec_splat (a, 3); + return vec_splat(a, 3); } #endif diff --git a/tensorflow/core/kernels/sparse_matmul_op_test.cc b/tensorflow/core/kernels/sparse_matmul_op_test.cc index f815ca9e344..ebc6d8fa4ec 100644 --- a/tensorflow/core/kernels/sparse_matmul_op_test.cc +++ b/tensorflow/core/kernels/sparse_matmul_op_test.cc @@ -284,11 +284,11 @@ class SparseMatmulOpTest : public ::testing::Test { uint16_t* data3_bfloat16_p = reinterpret_cast(data3_bfloat16) + i; #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - data3_p[1] = 0; - data3_bfloat16_p[0] = data3_p[0]; + data3_p[1] = 0; + data3_bfloat16_p[0] = data3_p[0]; #else - data3_p[0] = 0; - data3_bfloat16_p[0] = data3_p[1]; + data3_p[0] = 0; + data3_bfloat16_p[0] = data3_p[1]; #endif } } diff --git a/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc 
b/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc index 110376be425..96246c7a712 100644 --- a/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc +++ b/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc @@ -51,9 +51,9 @@ TEST_F(SparseReduceSumOpTest, SimpleReduce) { // [3 4] const auto indices_shape = TensorShape({4, 2}); - std::initializer_list in{ 0, 1, 1, 0, 2, 0, 2, 1 }; + std::initializer_list in{0, 1, 1, 0, 2, 0, 2, 1}; const gtl::ArraySlice indices(in); - std::initializer_list sh{ 3, 2 }; + std::initializer_list sh{3, 2}; const gtl::ArraySlice shape(sh); AddInputFromArray(indices_shape, indices); @@ -93,9 +93,9 @@ TEST_F(SparseReduceSumSparseOpTest, SimpleReduce) { // [3 4] const auto indices_shape = TensorShape({4, 2}); - std::initializer_list in{ 0, 1, 1, 0, 2, 0, 2, 1 }; + std::initializer_list in{0, 1, 1, 0, 2, 0, 2, 1}; const gtl::ArraySlice indices(in); - std::initializer_list sh{ 3, 2 }; + std::initializer_list sh{3, 2}; const gtl::ArraySlice shape(sh); AddInputFromArray(indices_shape, indices); diff --git a/tensorflow/core/kernels/sparse_softmax_op.cc b/tensorflow/core/kernels/sparse_softmax_op.cc index 327a94b8a12..444a5f657a9 100644 --- a/tensorflow/core/kernels/sparse_softmax_op.cc +++ b/tensorflow/core/kernels/sparse_softmax_op.cc @@ -50,8 +50,9 @@ class SparseSoftmaxOp : public OpKernel { errors::InvalidArgument( "Input sp_indices should be a matrix but received shape: ", indices_t->shape().DebugString())); - OP_REQUIRES(context, TensorShapeUtils::IsVector(values_t->shape()) && - TensorShapeUtils::IsVector(shape_t->shape()), + OP_REQUIRES(context, + TensorShapeUtils::IsVector(values_t->shape()) && + TensorShapeUtils::IsVector(shape_t->shape()), errors::InvalidArgument( "Inputs sp_values and sp_shape should be vectors " "but received shapes: ", diff --git a/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc b/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc index b027adba6b3..09cb2a6a71c 100644 --- a/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc +++ b/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc @@ -132,14 +132,16 @@ class SparseSparseBinaryOpShared : public OpKernel { // Validations. 
OP_REQUIRES( - ctx, TensorShapeUtils::IsMatrix(a_indices_t->shape()) && - TensorShapeUtils::IsMatrix(b_indices_t->shape()), + ctx, + TensorShapeUtils::IsMatrix(a_indices_t->shape()) && + TensorShapeUtils::IsMatrix(b_indices_t->shape()), errors::InvalidArgument("Inputs a_indices and b_indices should be " "matrices but received shapes: ", a_indices_t->shape().DebugString(), ", ", b_indices_t->shape().DebugString())); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_values_t->shape()) && - TensorShapeUtils::IsVector(b_values_t->shape()), + OP_REQUIRES(ctx, + TensorShapeUtils::IsVector(a_values_t->shape()) && + TensorShapeUtils::IsVector(b_values_t->shape()), errors::InvalidArgument( "Inputs a_values and b_values should be vectors " "but received shapes: ", @@ -157,8 +159,9 @@ class SparseSparseBinaryOpShared : public OpKernel { " non-empty input values, got ", a_values.size(), " and ", b_values.size())); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_shape_t->shape()) && - TensorShapeUtils::IsVector(b_shape_t->shape()), + OP_REQUIRES(ctx, + TensorShapeUtils::IsVector(a_shape_t->shape()) && + TensorShapeUtils::IsVector(b_shape_t->shape()), errors::InvalidArgument( "Input shapes should be a vector but received shapes ", a_shape_t->shape().DebugString(), " and ", diff --git a/tensorflow/core/kernels/sparse_split_op.cc b/tensorflow/core/kernels/sparse_split_op.cc index 6171b532aa2..67dcf05a6ce 100644 --- a/tensorflow/core/kernels/sparse_split_op.cc +++ b/tensorflow/core/kernels/sparse_split_op.cc @@ -48,18 +48,20 @@ class SparseSplitOp : public OpKernel { "Input shape should be a vector but received shape ", input_shape.shape().DebugString())); - OP_REQUIRES(context, input_shape.dim_size(0) && - split_dim < input_shape.vec().size(), - errors::InvalidArgument( - "Input split_dim should be between 0 and rank (", - input_shape.vec().size(), "), got ", split_dim)); + OP_REQUIRES( + context, + input_shape.dim_size(0) && split_dim < input_shape.vec().size(), + errors::InvalidArgument( + "Input split_dim should be between 0 and rank (", + input_shape.vec().size(), "), got ", split_dim)); - OP_REQUIRES(context, num_split_ >= 1 && - num_split_ <= input_shape.vec()(split_dim), - errors::InvalidArgument("Input num_split should be between 1 " - "and the splitting dimension size (", - input_shape.vec()(split_dim), - "), got ", num_split_)); + OP_REQUIRES( + context, + num_split_ >= 1 && num_split_ <= input_shape.vec()(split_dim), + errors::InvalidArgument("Input num_split should be between 1 " + "and the splitting dimension size (", + input_shape.vec()(split_dim), "), got ", + num_split_)); sparse::SparseTensor sparse_tensor(input_indices, input_values, TensorShape(input_shape.vec())); diff --git a/tensorflow/core/kernels/sparse_to_dense_op.cc b/tensorflow/core/kernels/sparse_to_dense_op.cc index 6a6cc3d8138..ba3da21a433 100644 --- a/tensorflow/core/kernels/sparse_to_dense_op.cc +++ b/tensorflow/core/kernels/sparse_to_dense_op.cc @@ -73,8 +73,9 @@ class SparseToDense : public OpKernel { // sparse_values const Tensor& sparse_values = c->input(2); const int64 num_values = sparse_values.NumElements(); - OP_REQUIRES(c, sparse_values.dims() == 0 || - (sparse_values.dims() == 1 && num_values == num_elems), + OP_REQUIRES(c, + sparse_values.dims() == 0 || + (sparse_values.dims() == 1 && num_values == num_elems), errors::InvalidArgument("sparse_values has incorrect shape ", sparse_values.shape().DebugString(), ", should be [] or [", num_elems, "]")); diff --git a/tensorflow/core/kernels/sparse_to_dense_op_test.cc 
b/tensorflow/core/kernels/sparse_to_dense_op_test.cc index f0d19da8046..d8b0f930824 100644 --- a/tensorflow/core/kernels/sparse_to_dense_op_test.cc +++ b/tensorflow/core/kernels/sparse_to_dense_op_test.cc @@ -38,7 +38,6 @@ namespace { class SparseToDenseTest : public OpsTestBase { protected: - void MakeOp(int dim, DataType index_type, DataType value_type) { TF_ASSERT_OK(NodeDefBuilder("sparsetodense", "SparseToDense") .Input(FakeInput(index_type)) diff --git a/tensorflow/core/kernels/sparse_xent_op.cc b/tensorflow/core/kernels/sparse_xent_op.cc index c35ba42db29..f84ffd53238 100644 --- a/tensorflow/core/kernels/sparse_xent_op.cc +++ b/tensorflow/core/kernels/sparse_xent_op.cc @@ -39,10 +39,10 @@ Status CheckInvalidLabelIndex(const Tensor& labels, int64 max_index) { if (*min_max_dim_value.first < 0 || *min_max_dim_value.second >= max_index) { bad_index = (*min_max_dim_value.first < 0) ? *min_max_dim_value.first : *min_max_dim_value.second; - return errors::InvalidArgument("Received a label value of ", bad_index, - " which is outside the valid range of [0, ", - max_index, "). Label values: ", - labels.SummarizeValue(labels.NumElements())); + return errors::InvalidArgument( + "Received a label value of ", bad_index, + " which is outside the valid range of [0, ", max_index, + "). Label values: ", labels.SummarizeValue(labels.NumElements())); } return Status::OK(); } diff --git a/tensorflow/core/kernels/sparse_xent_op_test.cc b/tensorflow/core/kernels/sparse_xent_op_test.cc index b8ea0d2d7e2..afb0bf76267 100644 --- a/tensorflow/core/kernels/sparse_xent_op_test.cc +++ b/tensorflow/core/kernels/sparse_xent_op_test.cc @@ -41,10 +41,10 @@ static Graph* SparseXent(int batch_size, int num_classes) { return g; } -#define BM_SparseXentDev(BATCH, CLASS, DEVICE) \ - static void BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE(int iters) { \ +#define BM_SparseXentDev(BATCH, CLASS, DEVICE) \ + static void BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE(int iters) { \ testing::ItemsProcessed(static_cast(iters) * BATCH * CLASS); \ - test::Benchmark(#DEVICE, SparseXent(BATCH, CLASS)).Run(iters); \ + test::Benchmark(#DEVICE, SparseXent(BATCH, CLASS)).Run(iters); \ } \ BENCHMARK(BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE); diff --git a/tensorflow/core/kernels/split_lib.h b/tensorflow/core/kernels/split_lib.h index ff92ffeeb38..a08949e626c 100644 --- a/tensorflow/core/kernels/split_lib.h +++ b/tensorflow/core/kernels/split_lib.h @@ -57,7 +57,7 @@ struct Split { const Eigen::DSizes& slice_indices, const Eigen::DSizes& slice_sizes); }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/split_lib_cpu.cc b/tensorflow/core/kernels/split_lib_cpu.cc index 25026208d1e..771c633b156 100644 --- a/tensorflow/core/kernels/split_lib_cpu.cc +++ b/tensorflow/core/kernels/split_lib_cpu.cc @@ -49,13 +49,13 @@ void Split::operator()( typename TTypes::ConstTensor input, const Eigen::DSizes& slice_indices, const Eigen::DSizes& slice_sizes) { - output.device(d) = input.slice(slice_indices, slice_sizes); + output.device(d) = input.slice(slice_indices, slice_sizes); } #define DEFINE_SYCL_KERNELS(T) template struct Split; TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DEFINE_SYCL_KERNELS); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc index 78badde27e5..85f529326db 100644 --- 
a/tensorflow/core/kernels/split_op.cc +++ b/tensorflow/core/kernels/split_op.cc @@ -39,7 +39,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template class SplitOpBase : public OpKernel { @@ -142,8 +142,9 @@ class SplitOpCPU : public SplitOpBase { // Android also uses int32 indexing, so check here also. OP_REQUIRES( - context, FastBoundsCheck(input.NumElements(), - std::numeric_limits::max()), + context, + FastBoundsCheck(input.NumElements(), + std::numeric_limits::max()), errors::InvalidArgument("Split requires input size < ", std::numeric_limits::max())); @@ -245,10 +246,11 @@ class SplitOpGPU : public SplitOpBase { const int32 split_dim = split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig; const int32 num_split = Base::num_outputs(); - OP_REQUIRES(context, FastBoundsCheck(input.NumElements(), - std::numeric_limits::max()), - errors::InvalidArgument("Split on GPU requires input size " - "< max int32")); + OP_REQUIRES( + context, + FastBoundsCheck(input.NumElements(), std::numeric_limits::max()), + errors::InvalidArgument("Split on GPU requires input size " + "< max int32")); int32 prefix_dim_size; int32 split_dim_size; int32 suffix_dim_size; @@ -304,8 +306,9 @@ class SplitOpSYCL : public SplitOpBase { // Android also uses int32 indexing, so check here also. OP_REQUIRES( - context, FastBoundsCheck(input.NumElements(), - std::numeric_limits::max()), + context, + FastBoundsCheck(input.NumElements(), + std::numeric_limits::max()), errors::InvalidArgument("Split requires input size < ", std::numeric_limits::max())); @@ -342,14 +345,14 @@ class SplitOpSYCL : public SplitOpBase { {prefix_dim_size, split_dim_output_size, suffix_dim_size}); functor::Split()(context->eigen_device(), - result_shaped, input_reshaped, - slice_indices, slice_sizes); + result_shaped, input_reshaped, + slice_indices, slice_sizes); } indices[1] += split_dim_output_size; } } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #define REGISTER_SPLIT(type) \ REGISTER_KERNEL_BUILDER(Name("Split") \ @@ -381,11 +384,11 @@ REGISTER_GPU(bfloat16); #endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL(type) \ - REGISTER_KERNEL_BUILDER(Name("Split") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T") \ - .HostMemory("split_dim"), \ +#define REGISTER_SYCL(type) \ + REGISTER_KERNEL_BUILDER(Name("Split") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .HostMemory("split_dim"), \ SplitOpSYCL) TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL); diff --git a/tensorflow/core/kernels/split_v_op.cc b/tensorflow/core/kernels/split_v_op.cc index f1078ac349c..7ff5df47d70 100644 --- a/tensorflow/core/kernels/split_v_op.cc +++ b/tensorflow/core/kernels/split_v_op.cc @@ -197,8 +197,9 @@ class SplitVOpCPU : public SplitVOpBase { // Android also uses int32 indexing, so check here also. OP_REQUIRES( - context, FastBoundsCheck(input.NumElements(), - std::numeric_limits::max()), + context, + FastBoundsCheck(input.NumElements(), + std::numeric_limits::max()), errors::InvalidArgument("Split requires input size < ", std::numeric_limits::max())); @@ -305,10 +306,11 @@ class SplitVOpGPU : public SplitVOpBase { const int32 split_dim_orig = context->input(2).flat()(0); const int32 split_dim = split_dim_orig < 0 ? 
split_dim_orig + input.dims() : split_dim_orig; - OP_REQUIRES(context, FastBoundsCheck(input.NumElements(), - std::numeric_limits::max()), - errors::InvalidArgument("Split on GPU requires input size " - "< max int32")); + OP_REQUIRES( + context, + FastBoundsCheck(input.NumElements(), std::numeric_limits::max()), + errors::InvalidArgument("Split on GPU requires input size " + "< max int32")); int32 prefix_dim_size; int32 split_dim_size; diff --git a/tensorflow/core/kernels/stack_ops.cc b/tensorflow/core/kernels/stack_ops.cc index affe81a5556..65296f61fd1 100644 --- a/tensorflow/core/kernels/stack_ops.cc +++ b/tensorflow/core/kernels/stack_ops.cc @@ -42,7 +42,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class Stack : public ResourceBase { public: @@ -242,7 +242,7 @@ REGISTER_KERNEL_BUILDER(Name("StackV2") .HostMemory("max_size") .HostMemory("handle"), StackOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template class StackPushOp : public AsyncOpKernel { @@ -274,11 +274,11 @@ class StackPushOp : public AsyncOpKernel { static constexpr int kCopyThreshold = 2048; static constexpr double kOccupancy = 0.7; if (swap_memory_ && !alloc_attrs.on_host() && - ( std::is_same::value + (std::is_same::value #ifdef TENSORFLOW_USE_SYCL - || std::is_same::value -#endif // TENSORFLOW_USE_SYCL - ) && + || std::is_same::value +#endif // TENSORFLOW_USE_SYCL + ) && tensor.TotalBytes() > kCopyThreshold && stack->IsUsefulToSwap(tensor)) { DeviceContext* device_ctxt = ctx->op_device_context(); auto device = static_cast(ctx->device()); @@ -391,7 +391,7 @@ REGISTER_SYCL_HOST_KERNEL(int32); REGISTER_SYCL_HOST_KERNEL(bool); #undef REGISTER_SYCL_KERNEL #undef REGISTER_SYCL_HOST_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class StackPopOp : public AsyncOpKernel { public: @@ -498,7 +498,7 @@ REGISTER_SYCL_HOST_KERNEL(bool); #undef REGISTER_SYCL_KERNEL #undef REGISTER_SYCL_HOST_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class StackCloseOp : public OpKernel { public: @@ -526,6 +526,6 @@ REGISTER_KERNEL_BUILDER( REGISTER_KERNEL_BUILDER( Name("StackCloseV2").Device(DEVICE_SYCL).HostMemory("handle"), StackCloseOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc index 0fae46dea61..03fc4467a1d 100644 --- a/tensorflow/core/kernels/stage_op.cc +++ b/tensorflow/core/kernels/stage_op.cc @@ -70,12 +70,11 @@ class Buffer : public ResourceBase { return bytes + current_bytes_ > memory_limit_; } - std::size_t GetTupleBytes(const Tuple & tuple) - { + std::size_t GetTupleBytes(const Tuple& tuple) { return std::accumulate(tuple.begin(), tuple.end(), 0, - [](const std::size_t & lhs, const Tensor & rhs) { - return lhs + rhs.TotalBytes(); - }); + [](const std::size_t& lhs, const Tensor& rhs) { + return lhs + rhs.TotalBytes(); + }); } public: @@ -90,19 +89,22 @@ class Buffer : public ResourceBase { std::size_t tuple_bytes = GetTupleBytes(*tuple); // Sanity check so that we don't block for ever below - if(memory_limit_ > 0 && tuple_bytes > memory_limit_) { - return Status(errors::ResourceExhausted("Attempted to insert " - "tensors with combined size of '", tuple_bytes, "' bytes into " - "Staging Area with a memory limit of '", memory_limit_, "'.")); + if (memory_limit_ > 0 && tuple_bytes > 
memory_limit_) { + return Status( + errors::ResourceExhausted("Attempted to insert " + "tensors with combined size of '", + tuple_bytes, + "' bytes into " + "Staging Area with a memory limit of '", + memory_limit_, "'.")); } - // If buffer capacity is bounded wait until elements have been removed - if(IsBounded()) { + if (IsBounded()) { full_cond_var_.wait(lock, [tuple_bytes, this]() { // If there's a memory limit, check if there's space for insertion - bool memory_limit_valid = memory_limit_ > 0 ? - !WouldExceedMemoryLimit(tuple_bytes) : true; + bool memory_limit_valid = + memory_limit_ > 0 ? !WouldExceedMemoryLimit(tuple_bytes) : true; // If we're configured for capacity check if there's space for insertion bool capacity_valid = capacity_ > 0 ? !IsCapacityFull() : true; @@ -186,8 +188,7 @@ Status GetBuffer(OpKernelContext* ctx, const NodeDef& ndef, Buffer** buf) { ContainerInfo cinfo; // Lambda for creating the Staging Area - auto create_fn = [&ndef](Buffer** ret) -> Status - { + auto create_fn = [&ndef](Buffer** ret) -> Status { int64 capacity; int64 memory_limit; TF_RETURN_IF_ERROR(GetNodeAttr(ndef, "capacity", &capacity)); @@ -196,7 +197,6 @@ Status GetBuffer(OpKernelContext* ctx, const NodeDef& ndef, Buffer** buf) { return Status::OK(); }; - TF_RETURN_IF_ERROR(cinfo.Init(rm, ndef, true /* use name() */)); TF_RETURN_IF_ERROR(rm->LookupOrCreate(cinfo.container(), cinfo.name(), buf, create_fn)); @@ -228,7 +228,7 @@ REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_GPU), StageOp); #endif #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_SYCL), StageOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class UnstageOp : public OpKernel { public: @@ -244,7 +244,8 @@ class UnstageOp : public OpKernel { buf->Get(&tuple); - OP_REQUIRES(ctx, tuple.size() == (size_t)ctx->num_outputs(), + OP_REQUIRES( + ctx, tuple.size() == (size_t)ctx->num_outputs(), errors::InvalidArgument("Mismatch stage/unstage: ", tuple.size(), " vs. ", ctx->num_outputs())); @@ -260,7 +261,7 @@ REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_GPU), UnstageOp); #endif #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_SYCL), UnstageOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class StagePeekOp : public OpKernel { public: @@ -278,7 +279,8 @@ class StagePeekOp : public OpKernel { OP_REQUIRES_OK(ctx, buf->Peek(index, &tuple)); - OP_REQUIRES(ctx, tuple.size() == (size_t)ctx->num_outputs(), + OP_REQUIRES( + ctx, tuple.size() == (size_t)ctx->num_outputs(), errors::InvalidArgument("Mismatch stage/unstage: ", tuple.size(), " vs. ", ctx->num_outputs())); @@ -288,17 +290,15 @@ class StagePeekOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("StagePeek").Device(DEVICE_CPU), - StagePeekOp); +REGISTER_KERNEL_BUILDER(Name("StagePeek").Device(DEVICE_CPU), StagePeekOp); #if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER(Name("StagePeek").HostMemory("index"). 
- Device(DEVICE_GPU), StagePeekOp); +REGISTER_KERNEL_BUILDER( + Name("StagePeek").HostMemory("index").Device(DEVICE_GPU), StagePeekOp); #endif #ifdef TENSORFLOW_USE_SYCL -REGISTER_KERNEL_BUILDER(Name("StagePeek").HostMemory("index") - .Device(DEVICE_SYCL), StagePeekOp); -#endif // TENSORFLOW_USE_SYCL - +REGISTER_KERNEL_BUILDER( + Name("StagePeek").HostMemory("index").Device(DEVICE_SYCL), StagePeekOp); +#endif // TENSORFLOW_USE_SYCL class StageSizeOp : public OpKernel { public: @@ -312,9 +312,8 @@ class StageSizeOp : public OpKernel { core::ScopedUnref scope(buf); // Allocate size output tensor - Tensor * size = nullptr; - OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), - &size)); + Tensor* size = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &size)); // Set it to the actual size size->scalar().setConstant(buf->Size()); @@ -323,13 +322,13 @@ class StageSizeOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("StageSize").Device(DEVICE_CPU), StageSizeOp); #if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size") - .Device(DEVICE_GPU), StageSizeOp); +REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size").Device(DEVICE_GPU), + StageSizeOp); #endif #ifdef TENSORFLOW_USE_SYCL -REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size") - .Device(DEVICE_SYCL), StageSizeOp); -#endif // TENSORFLOW_USE_SYCL +REGISTER_KERNEL_BUILDER( + Name("StageSize").HostMemory("size").Device(DEVICE_SYCL), StageSizeOp); +#endif // TENSORFLOW_USE_SYCL class StageClearOp : public OpKernel { public: @@ -352,7 +351,6 @@ REGISTER_KERNEL_BUILDER(Name("StageClear").Device(DEVICE_GPU), StageClearOp); #endif #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("StageClear").Device(DEVICE_SYCL), StageClearOp); -#endif // TENSORFLOW_USE_SYCL - +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc index 7c213e14d21..7745effe2ab 100644 --- a/tensorflow/core/kernels/strided_slice_op.cc +++ b/tensorflow/core/kernels/strided_slice_op.cc @@ -294,6 +294,11 @@ class StridedSliceAssignOp : public OpKernel { OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0), &v)); old_lhs = *v->tensor(); + OP_REQUIRES(context, old_lhs.dtype() == DataTypeToEnum::value, + errors::InvalidArgument( + "l-value dtype ", DataTypeString(old_lhs.dtype()), + " does not match r-value dtype ", + DataTypeString(DataTypeToEnum::value))); } else { context->forward_ref_input_to_ref_output(0, 0); old_lhs = context->mutable_input(0, true); @@ -541,5 +546,5 @@ REGISTER_KERNEL_BUILDER(Name("ResourceStridedSliceAssign") .HostMemory("strides"), StridedSliceAssignOp) #undef REGISTER_SYCL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/strided_slice_op.h b/tensorflow/core/kernels/strided_slice_op.h index 0f72c4b7710..2b586322986 100644 --- a/tensorflow/core/kernels/strided_slice_op.h +++ b/tensorflow/core/kernels/strided_slice_op.h @@ -21,6 +21,7 @@ limitations under the License. 
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/resource_handle.h" #include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/variant_encode_decode.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h index a84ba38ef41..1c4472bb1ab 100644 --- a/tensorflow/core/kernels/strided_slice_op_impl.h +++ b/tensorflow/core/kernels/strided_slice_op_impl.h @@ -26,6 +26,8 @@ limitations under the License. #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types_traits.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/variant.h" +#include "tensorflow/core/framework/variant_encode_decode.h" #include "tensorflow/core/kernels/bounds_check.h" #include "tensorflow/core/kernels/dense_update_functor.h" #include "tensorflow/core/kernels/ops_util.h" @@ -302,7 +304,7 @@ DECLARE_FOR_N_SYCL(int32); DECLARE_FOR_N_SYCL(int64); #undef DECLARE_FOR_N_SYCL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #undef INSTANTIATE #undef DECLARE_FOR_N_CPU diff --git a/tensorflow/core/kernels/string_join_op.cc b/tensorflow/core/kernels/string_join_op.cc index 721702bec68..28cca9f4484 100644 --- a/tensorflow/core/kernels/string_join_op.cc +++ b/tensorflow/core/kernels/string_join_op.cc @@ -50,9 +50,9 @@ class StringJoinOp : public OpKernel { } else { OP_REQUIRES( context, input_shape == input.shape(), - errors::InvalidArgument("Input shapes do not match: ", - input_shape.DebugString(), " vs. ", - input.shape().DebugString())); + errors::InvalidArgument( + "Input shapes do not match: ", input_shape.DebugString(), + " vs. ", input.shape().DebugString())); } } } diff --git a/tensorflow/core/kernels/substr_op.cc b/tensorflow/core/kernels/substr_op.cc index 743f1131504..e29f67297f9 100644 --- a/tensorflow/core/kernels/substr_op.cc +++ b/tensorflow/core/kernels/substr_op.cc @@ -95,9 +95,9 @@ class SubstrOp : public OpKernel { // Create BCast helper with shape of input and pos/len BCast bcast(BCast::FromShape(input_shape), BCast::FromShape(pos_shape)); OP_REQUIRES(context, bcast.IsValid(), - errors::InvalidArgument("Incompatible shapes: ", - input_shape.DebugString(), " vs. ", - pos_shape.DebugString())); + errors::InvalidArgument( + "Incompatible shapes: ", input_shape.DebugString(), + " vs. 
", pos_shape.DebugString())); TensorShape output_shape = BCast::ToShape(bcast.result_shape()); int ndims = output_shape.dims(); Tensor* output_tensor = nullptr; diff --git a/tensorflow/core/kernels/summary_image_op.cc b/tensorflow/core/kernels/summary_image_op.cc index 233b824bcc3..29b21ee7353 100644 --- a/tensorflow/core/kernels/summary_image_op.cc +++ b/tensorflow/core/kernels/summary_image_op.cc @@ -54,18 +54,20 @@ class SummaryImageOp : public OpKernel { const Tensor& tensor = c->input(1); OP_REQUIRES(c, IsLegacyScalar(tags.shape()), errors::InvalidArgument("Tags must be a scalar")); - OP_REQUIRES(c, tensor.dims() == 4 && - (tensor.dim_size(3) == 1 || tensor.dim_size(3) == 3 || - tensor.dim_size(3) == 4), + OP_REQUIRES(c, + tensor.dims() == 4 && + (tensor.dim_size(3) == 1 || tensor.dim_size(3) == 3 || + tensor.dim_size(3) == 4), errors::InvalidArgument( "Tensor must be 4-D with last dim 1, 3, or 4, not ", tensor.shape().DebugString())); const string& base_tag = tags.scalar()(); - OP_REQUIRES(c, tensor.dim_size(0) < (1LL << 31) && - tensor.dim_size(1) < (1LL << 31) && - tensor.dim_size(2) < (1LL << 31) && - (tensor.dim_size(1) * tensor.dim_size(2)) < (1LL << 29), + OP_REQUIRES(c, + tensor.dim_size(0) < (1LL << 31) && + tensor.dim_size(1) < (1LL << 31) && + tensor.dim_size(2) < (1LL << 31) && + (tensor.dim_size(1) * tensor.dim_size(2)) < (1LL << 29), errors::InvalidArgument("Tensor too large for summary ", tensor.shape().DebugString())); diff --git a/tensorflow/core/kernels/summary_op.cc b/tensorflow/core/kernels/summary_op.cc index b818724ec2e..1f4e3418f48 100644 --- a/tensorflow/core/kernels/summary_op.cc +++ b/tensorflow/core/kernels/summary_op.cc @@ -41,11 +41,12 @@ class SummaryScalarOp : public OpKernel { const Tensor& values = c->input(1); OP_REQUIRES( - c, tags.IsSameSize(values) || - (IsLegacyScalar(tags.shape()) && IsLegacyScalar(values.shape())), - errors::InvalidArgument("tags and values not the same shape: ", - tags.shape().DebugString(), " != ", - values.shape().DebugString(), SingleTag(tags))); + c, + tags.IsSameSize(values) || + (IsLegacyScalar(tags.shape()) && IsLegacyScalar(values.shape())), + errors::InvalidArgument( + "tags and values not the same shape: ", tags.shape().DebugString(), + " != ", values.shape().DebugString(), SingleTag(tags))); auto Ttags = tags.flat(); auto Tvalues = values.flat(); Summary s; diff --git a/tensorflow/core/kernels/tile_functor_cpu.cc b/tensorflow/core/kernels/tile_functor_cpu.cc index b2fd669541d..f8144867014 100644 --- a/tensorflow/core/kernels/tile_functor_cpu.cc +++ b/tensorflow/core/kernels/tile_functor_cpu.cc @@ -15,10 +15,10 @@ limitations under the License. 
#define EIGEN_USE_THREADS -#include "tensorflow/core/kernels/tile_functor.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/tile_functor.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/tile_ops_cpu_impl.h b/tensorflow/core/kernels/tile_ops_cpu_impl.h index 054b31ef9e0..df6a666cd44 100644 --- a/tensorflow/core/kernels/tile_ops_cpu_impl.h +++ b/tensorflow/core/kernels/tile_ops_cpu_impl.h @@ -63,7 +63,7 @@ TF_CALL_int64(DEFINE_TYPE); #undef DEFINE_DIM #undef DEFINE_TYPE -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // end namespace functor } // end namespace tensorflow diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc index 38e77ab60fb..07befa27bc5 100644 --- a/tensorflow/core/kernels/training_ops.cc +++ b/tensorflow/core/kernels/training_ops.cc @@ -3279,7 +3279,6 @@ REGISTER_KERNELS(double, int64); #undef REGISTER_KERNELS - template class ApplyAddSignOp : public OpKernel { public: @@ -3362,17 +3361,15 @@ TF_CALL_double(REGISTER_CPU_KERNELS); #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. namespace functor { -#define DECLARE_GPU_SPEC(T) \ - template <> \ - void ApplyAddSign::operator()( \ - const GPUDevice& d, \ - typename TTypes::Flat var, \ - typename TTypes::Flat m, \ - typename TTypes::ConstScalar lr, \ - typename TTypes::ConstScalar alpha, \ - typename TTypes::ConstScalar sign_decay, \ - typename TTypes::ConstScalar beta, \ - typename TTypes::ConstFlat grad); \ +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void ApplyAddSign::operator()( \ + const GPUDevice& d, typename TTypes::Flat var, \ + typename TTypes::Flat m, typename TTypes::ConstScalar lr, \ + typename TTypes::ConstScalar alpha, \ + typename TTypes::ConstScalar sign_decay, \ + typename TTypes::ConstScalar beta, \ + typename TTypes::ConstFlat grad); \ extern template struct ApplyAddSign; DECLARE_GPU_SPEC(Eigen::half); DECLARE_GPU_SPEC(float); @@ -3387,7 +3384,6 @@ REGISTER_KERNELS(GPU, double); #undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS - template class ApplyPowerSignOp : public OpKernel { public: @@ -3470,17 +3466,15 @@ TF_CALL_double(REGISTER_CPU_KERNELS); #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. namespace functor { -#define DECLARE_GPU_SPEC(T) \ - template <> \ - void ApplyPowerSign::operator()( \ - const GPUDevice& d, \ - typename TTypes::Flat var, \ - typename TTypes::Flat m, \ - typename TTypes::ConstScalar lr, \ - typename TTypes::ConstScalar logbase, \ - typename TTypes::ConstScalar sign_decay, \ - typename TTypes::ConstScalar beta, \ - typename TTypes::ConstFlat grad); \ +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void ApplyPowerSign::operator()( \ + const GPUDevice& d, typename TTypes::Flat var, \ + typename TTypes::Flat m, typename TTypes::ConstScalar lr, \ + typename TTypes::ConstScalar logbase, \ + typename TTypes::ConstScalar sign_decay, \ + typename TTypes::ConstScalar beta, \ + typename TTypes::ConstFlat grad); \ extern template struct ApplyPowerSign; DECLARE_GPU_SPEC(Eigen::half); DECLARE_GPU_SPEC(float); diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc index d443a6b3c1d..0376a3b2c60 100644 --- a/tensorflow/core/kernels/training_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc @@ -17,8 +17,8 @@ limitations under the License. 
#define EIGEN_USE_GPU -#include "tensorflow/core/kernels/training_ops.h" #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/training_ops.h" namespace tensorflow { @@ -115,13 +115,11 @@ struct ApplyAdam { Eigen::Sizes<1> single; const auto one = static_cast(1.0); m.device(d) = - m + - (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) * - (grad - m); + m + (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) * + (grad - m); v.device(d) = - v + - (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) * - (grad.square() - v); + v + (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) * + (grad.square() - v); if (use_nesterov) { var.device(d) -= @@ -157,9 +155,9 @@ struct ApplyRMSProp { bcast[0] = grad.dimension(0); Eigen::Sizes<1> single; const auto one = static_cast(1.0); - ms.device(d) = ms + - (rho.constant(one) - rho).reshape(single).broadcast(bcast) * - (grad.square() - ms); + ms.device(d) = + ms + (rho.constant(one) - rho).reshape(single).broadcast(bcast) * + (grad.square() - ms); mom.device(d) = mom * momentum.reshape(single).broadcast(bcast) + lr.reshape(single).broadcast(bcast) * grad / @@ -212,7 +210,7 @@ struct ApplyAddSign { auto beta_bcast = beta.reshape(single).broadcast(bcast); auto one_minus_beta = (beta.constant(one) - beta).reshape(single).broadcast(bcast); - m.device(d) = m * beta_bcast + grad * one_minus_beta; + m.device(d) = m * beta_bcast + grad * one_minus_beta; // The following is the GPU equivalent of the CPU version: // var.device(d) -= lr() * (alpha() + sign_decay() * sign_gm) * grad; @@ -244,7 +242,7 @@ struct ApplyPowerSign { auto beta_bcast = beta.reshape(single).broadcast(bcast); auto one_minus_beta = (beta.constant(one) - beta).reshape(single).broadcast(bcast); - m.device(d) = m * beta_bcast + grad * one_minus_beta; + m.device(d) = m * beta_bcast + grad * one_minus_beta; // The following is the GPU equivalent of the CPU version: // auto grad_scale = (logbase() * sign_decay() * sign_gm).exp(); @@ -253,7 +251,7 @@ struct ApplyPowerSign { auto lr_bcast = lr.reshape(single).broadcast(bcast); auto logbase_bcast = logbase.reshape(single).broadcast(bcast); auto sign_decay_bcast = sign_decay.reshape(single).broadcast(bcast); - auto grad_scale = (logbase_bcast * sign_decay_bcast * sign_gm).exp(); + auto grad_scale = (logbase_bcast * sign_decay_bcast * sign_gm).exp(); var.device(d) -= lr_bcast * grad_scale * grad; } }; diff --git a/tensorflow/core/kernels/training_ops_test.cc b/tensorflow/core/kernels/training_ops_test.cc index ffa7f87c9ef..2dcc4a500e6 100644 --- a/tensorflow/core/kernels/training_ops_test.cc +++ b/tensorflow/core/kernels/training_ops_test.cc @@ -176,8 +176,9 @@ static void Adam(int32 n, Graph** init_g, Graph** train_g) { auto beta2 = Scalar(g, 0.99); auto epsilon = Scalar(g, 1e-8); auto grad = Random(g, n); - test::graph::Multi(g, "ApplyAdam", {var, m, v, beta1_power, beta2_power, lr, - beta1, beta2, epsilon, grad}); + test::graph::Multi( + g, "ApplyAdam", + {var, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad}); *train_g = g; } } diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc index 2e0d18b634a..7177ad78884 100644 --- a/tensorflow/core/kernels/transpose_op.cc +++ b/tensorflow/core/kernels/transpose_op.cc @@ -176,9 +176,10 @@ void TransposeOp::Compute(OpKernelContext* ctx) { } } for (int i = 0; i < dims; ++i) { - OP_REQUIRES(ctx, bits[i], errors::InvalidArgument( - i, " is missing from {", - 
str_util::Join(permutation, ","), "}.")); + OP_REQUIRES( + ctx, bits[i], + errors::InvalidArgument(i, " is missing from {", + str_util::Join(permutation, ","), "}.")); } // 0-D, 1-D, and identity transposes do nothing. diff --git a/tensorflow/core/kernels/typed_queue.h b/tensorflow/core/kernels/typed_queue.h index 0d608d9b879..43dcb4cef74 100644 --- a/tensorflow/core/kernels/typed_queue.h +++ b/tensorflow/core/kernels/typed_queue.h @@ -58,9 +58,9 @@ Status TypedQueue::Initialize() { if (!component_shapes_.empty() && component_dtypes_.size() != component_shapes_.size()) { return errors::InvalidArgument( - "Different number of component types. ", "Types: ", - DataTypeSliceString(component_dtypes_), ", Shapes: ", - ShapeListString(component_shapes_)); + "Different number of component types. ", + "Types: ", DataTypeSliceString(component_dtypes_), + ", Shapes: ", ShapeListString(component_shapes_)); } mutex_lock lock(mu_); diff --git a/tensorflow/core/kernels/unpack_op.cc b/tensorflow/core/kernels/unpack_op.cc index 397bdd56708..764b6a252ad 100644 --- a/tensorflow/core/kernels/unpack_op.cc +++ b/tensorflow/core/kernels/unpack_op.cc @@ -34,7 +34,7 @@ typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template class UnpackOp : public OpKernel { @@ -65,8 +65,9 @@ class UnpackOp : public OpKernel { output_shape.RemoveDim(axis); const int64 output_size = output_shape.num_elements(); OP_REQUIRES( - context, FastBoundsCheck(output_size, - std::numeric_limits::max()), + context, + FastBoundsCheck(output_size, + std::numeric_limits::max()), errors::InvalidArgument("output size must fit in Eigen DenseIndex")); // This optimization is currently not applicable for SYCL devices diff --git a/tensorflow/core/kernels/unravel_index_op.cc b/tensorflow/core/kernels/unravel_index_op.cc new file mode 100644 index 00000000000..da9ab01e8db --- /dev/null +++ b/tensorflow/core/kernels/unravel_index_op.cc @@ -0,0 +1,120 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +namespace { +template +struct mod_op { + const T operator()(const T& a, const T& b) const { return a % b; } +}; +} // namespace + +typedef Eigen::ThreadPoolDevice CPUDevice; + +template +class UnravelIndexOp : public OpKernel { + public: + explicit UnravelIndexOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& indices_tensor = ctx->input(0); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices_tensor.shape()) || + TensorShapeUtils::IsScalar(indices_tensor.shape()), + errors::InvalidArgument( + "The indices can only be scalar or vector, got \"", + indices_tensor.shape().DebugString(), "\"")); + + const Tensor& dims_tensor = ctx->input(1); + OP_REQUIRES( + ctx, TensorShapeUtils::IsVector(dims_tensor.shape()), + errors::InvalidArgument("The indices can only be 1-D, got \"", + dims_tensor.shape().DebugString(), "\"")); + + auto dims = dims_tensor.vec(); + + Eigen::array reverse({true}); + + Tensor strides_tensor; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum::value, + TensorShape({dims_tensor.NumElements()}), + &strides_tensor)); + + auto strides = strides_tensor.vec(); + strides = dims.reverse(reverse) + .scan(0, Eigen::internal::ProdReducer(), false) + .reverse(reverse); + + Tensor strides_shifted_tensor; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum::value, + TensorShape({dims_tensor.NumElements()}), + &strides_shifted_tensor)); + + auto strides_shifted = strides_shifted_tensor.vec(); + strides_shifted = dims.reverse(reverse) + .scan(0, Eigen::internal::ProdReducer(), true) + .reverse(reverse); + + Tensor* output_tensor = nullptr; + if (TensorShapeUtils::IsScalar(indices_tensor.shape())) { + OP_REQUIRES_OK( + ctx, ctx->allocate_output(0, TensorShape({dims_tensor.NumElements()}), + &output_tensor)); + + auto output = output_tensor->vec(); + + output = output.constant(indices_tensor.scalar()()); + output = output.binaryExpr(strides, mod_op()) / strides_shifted; + } else { + OP_REQUIRES_OK(ctx, ctx->allocate_output( + 0, TensorShape({dims_tensor.NumElements(), + indices_tensor.NumElements()}), + &output_tensor)); + + auto output = output_tensor->matrix(); + + Eigen::array reshape{{dims_tensor.NumElements(), 1}}; + Eigen::array bcast({1, indices_tensor.NumElements()}); + Eigen::array indices_reshape{{1, indices_tensor.NumElements()}}; + Eigen::array indices_bcast({dims_tensor.NumElements(), 1}); + + output = indices_tensor.vec() + .reshape(indices_reshape) + .broadcast(indices_bcast); + output = output.binaryExpr(strides.reshape(reshape).broadcast(bcast), + mod_op()) / + strides_shifted.reshape(reshape).broadcast(bcast); + } + } +}; + +#define REGISTER_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("UnravelIndex").Device(DEVICE_CPU).TypeConstraint("Tidx"), \ + UnravelIndexOp); +TF_CALL_int32(REGISTER_KERNEL) TF_CALL_int64(REGISTER_KERNEL) +#undef REGISTER_KERNEL + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/word2vec_kernels.cc b/tensorflow/core/kernels/word2vec_kernels.cc index 2d05d72bff1..3477445197a 100644 --- a/tensorflow/core/kernels/word2vec_kernels.cc +++ 
b/tensorflow/core/kernels/word2vec_kernels.cc @@ -188,9 +188,9 @@ class SkipgramOp : public OpKernel { ++corpus_size_; } if (corpus_size_ < window_size_ * 10) { - return errors::InvalidArgument("The text file ", filename, - " contains too little data: ", - corpus_size_, " words"); + return errors::InvalidArgument( + "The text file ", filename, + " contains too little data: ", corpus_size_, " words"); } typedef std::pair WordFreq; std::vector ordered; diff --git a/tensorflow/core/kernels/xent_op.cc b/tensorflow/core/kernels/xent_op.cc index 0f8d027caad..a6a71fdfaf1 100644 --- a/tensorflow/core/kernels/xent_op.cc +++ b/tensorflow/core/kernels/xent_op.cc @@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template class SoftmaxXentWithLogitsOp : public OpKernel { @@ -44,8 +44,8 @@ class SoftmaxXentWithLogitsOp : public OpKernel { OP_REQUIRES(context, logits_in.IsSameSize(labels_in), errors::InvalidArgument( "logits and labels must be same size: logits_size=", - logits_in.shape().DebugString(), " labels_size=", - labels_in.shape().DebugString())); + logits_in.shape().DebugString(), + " labels_size=", labels_in.shape().DebugString())); OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()), errors::InvalidArgument("logits must be 2-dimensional")); // As we already tested that both inputs have the same shape no need to @@ -72,7 +72,7 @@ class SoftmaxXentWithLogitsOp : public OpKernel { functor(context->eigen_device(), logits_in.matrix(), labels_in.matrix(), scratch.matrix(), loss_out->vec(), back_out->matrix()); - } + } } }; @@ -87,7 +87,7 @@ struct XentFunctorBase { typename TTypes::Vec loss, typename TTypes::Matrix backprop) { XentEigenImpl::Compute(d, logits, labels, scratch, loss, - backprop); + backprop); } }; @@ -97,7 +97,7 @@ struct XentFunctor : XentFunctorBase {}; #ifdef TENSORFLOW_USE_SYCL template struct XentFunctor : XentFunctorBase {}; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor #define REGISTER_CPU(T) \ @@ -129,6 +129,6 @@ REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits") .Device(DEVICE_SYCL) .TypeConstraint("T"), SoftmaxXentWithLogitsOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/xsmm_conv2d_test.cc b/tensorflow/core/kernels/xsmm_conv2d_test.cc index e2947012467..481f3b7ba46 100644 --- a/tensorflow/core/kernels/xsmm_conv2d_test.cc +++ b/tensorflow/core/kernels/xsmm_conv2d_test.cc @@ -13,18 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/core/kernels/conv_ops.h" -#include "tensorflow/core/platform/test.h" -#include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/graph/node_builder.h" -#include "tensorflow/core/kernels/ops_testutil.h" #include "include/libxsmm.h" #include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/kernels/conv_ops.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/platform/test.h" namespace tensorflow { namespace { - typedef struct { int nImg; int nIfm; @@ -49,45 +48,41 @@ typedef struct { int stride_w; } naive_conv_t; - -LIBXSMM_INLINE void naive_copy_NCHW_to_NHWC(const float* nchw, Tensor &nhwc, int N, int H, int W, int C) -{ - LIBXSMM_VLA_DECL(4, const float, input, nchw, C, H, W); +LIBXSMM_INLINE void naive_copy_NCHW_to_NHWC(const float* nchw, Tensor& nhwc, + int N, int H, int W, int C) { + LIBXSMM_VLA_DECL(4, const float, input, nchw, C, H, W); int n, h, w, c; - auto output = nhwc.flat(); - for ( n = 0; n < N; n++ ) { - for ( h = 0; h < H; h++ ) { - for ( w = 0; w < W; w++ ) { - for ( c = 0; c < C; c++ ) { - output(n*H*W*C + h*W*C +w*C + c) = - LIBXSMM_VLA_ACCESS(4, input, n, c, h, w, C, H, W); + auto output = nhwc.flat(); + for (n = 0; n < N; n++) { + for (h = 0; h < H; h++) { + for (w = 0; w < W; w++) { + for (c = 0; c < C; c++) { + output(n * H * W * C + h * W * C + w * C + c) = + LIBXSMM_VLA_ACCESS(4, input, n, c, h, w, C, H, W); } } } } } - -LIBXSMM_INLINE void naive_copy_KCRS_to_RSCK(const float* kcrs, Tensor &rsck, int R, int S, int C, int K) -{ - LIBXSMM_VLA_DECL(4, const float, input, kcrs, C, R, S); +LIBXSMM_INLINE void naive_copy_KCRS_to_RSCK(const float* kcrs, Tensor& rsck, + int R, int S, int C, int K) { + LIBXSMM_VLA_DECL(4, const float, input, kcrs, C, R, S); int r, s, c, k; - auto output = rsck.flat(); + auto output = rsck.flat(); - for ( r = 0; r < R; r++ ) { - for ( s = 0; s < S; s++ ) { - for ( c = 0; c < C; c++ ) { - for ( k = 0; k < K; k++ ) { - output(r*S*C*K + s*C*K + c*K + k) = - LIBXSMM_VLA_ACCESS(4, input, k, c, r, s, C, R, S); + for (r = 0; r < R; r++) { + for (s = 0; s < S; s++) { + for (c = 0; c < C; c++) { + for (k = 0; k < K; k++) { + output(r * S * C * K + s * C * K + c * K + k) = + LIBXSMM_VLA_ACCESS(4, input, k, c, r, s, C, R, S); } } } } } - - LIBXSMM_INLINE void zero_buf(float* buf, long size) { int i; for (i = 0; i < size; ++i) { @@ -95,52 +90,53 @@ LIBXSMM_INLINE void zero_buf(float* buf, long size) { } } -LIBXSMM_INLINE void copy_buf(Tensor &dst,float *src,long size) { - long i; - auto output = dst.flat(); - for (i = 0; i < size; ++i) - output(i) = src[i]; +LIBXSMM_INLINE void copy_buf(Tensor& dst, float* src, long size) { + long i; + auto output = dst.flat(); + for (i = 0; i < size; ++i) output(i) = src[i]; } -LIBXSMM_INLINE void init_buf(float* buf, long size, int initPos, int initOne) -{ +LIBXSMM_INLINE void init_buf(float* buf, long size, int initPos, int initOne) { int i; zero_buf(buf, size); for (i = 0; i < size; ++i) { - buf[i] = (float)((initOne != 0) ? 1.0 : ((initPos != 0) ? drand48() : (0.05 - drand48()/10.0))); + buf[i] = + (float)((initOne != 0) + ? 1.0 + : ((initPos != 0) ? 
drand48() : (0.05 - drand48() / 10.0))); } } - - -LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float* output, const float* filter) -{ - int nImg = param->nImg; - int nIfm = param->nIfm; - int nOfm = param->nOfm; - int ifhp = param->ifhp; - int ifwp = param->ifwp; - int ofhp = param->ofhp; - int ofwp = param->ofwp; - int ifh = param->ifh; - int ifw = param->ifw; - int ofh = param->ofh; - int ofw = param->ofw; - int pad_h = param->pad_h; - int pad_w = param->pad_w; - int pad_h_in = param->pad_h_in; - int pad_w_in = param->pad_w_in; +LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, + float* output, const float* filter) { + int nImg = param->nImg; + int nIfm = param->nIfm; + int nOfm = param->nOfm; + int ifhp = param->ifhp; + int ifwp = param->ifwp; + int ofhp = param->ofhp; + int ofwp = param->ofwp; + int ifh = param->ifh; + int ifw = param->ifw; + int ofh = param->ofh; + int ofw = param->ofw; + int pad_h = param->pad_h; + int pad_w = param->pad_w; + int pad_h_in = param->pad_h_in; + int pad_w_in = param->pad_w_in; int pad_h_out = param->pad_h_out; int pad_w_out = param->pad_w_out; - int kh = param->kh; - int kw = param->kw; - int stride_h = param->stride_h; - int stride_w = param->stride_w; + int kh = param->kh; + int kw = param->kw; + int stride_h = param->stride_h; + int stride_w = param->stride_w; /* loop counters */ int img, ofm, ifm, oj, oi, ij, ii, kj, ki; - LIBXSMM_VLA_DECL(4, float, output_t, output + (pad_w_out * ofwp + pad_h_out), nOfm, ofhp, ofwp); - LIBXSMM_VLA_DECL(4, const float, input_t, input + (pad_w_in * ifwp + pad_h_in), nIfm, ifhp, ifwp); + LIBXSMM_VLA_DECL(4, float, output_t, output + (pad_w_out * ofwp + pad_h_out), + nOfm, ofhp, ofwp); + LIBXSMM_VLA_DECL(4, const float, input_t, + input + (pad_w_in * ifwp + pad_h_in), nIfm, ifhp, ifwp); LIBXSMM_VLA_DECL(4, const float, filter_t, filter, nIfm, kh, kw); for (img = 0; img < nImg; ++img) { @@ -151,12 +147,15 @@ LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float for (oi = 0; oi < ofw; ++oi) { ii = oi * stride_w - pad_w; for (kj = 0; kj < kh; ++kj) { - if(ij+kj < 0 || ij+kj >= ifh) continue; + if (ij + kj < 0 || ij + kj >= ifh) continue; for (ki = 0; ki < kw; ++ki) { - if(ii+ki < 0 || ii+ki >= ifw) continue; - LIBXSMM_VLA_ACCESS( 4, output_t, img, ofm, oj, oi, nOfm, ofhp, ofwp) += - LIBXSMM_VLA_ACCESS(4, input_t, img, ifm, ij + kj, ii + ki, nIfm, ifhp, ifwp) - * LIBXSMM_VLA_ACCESS(4, filter_t, ofm, ifm, kj, ki, nIfm, kh, kw); + if (ii + ki < 0 || ii + ki >= ifw) continue; + LIBXSMM_VLA_ACCESS(4, output_t, img, ofm, oj, oi, nOfm, ofhp, + ofwp) += + LIBXSMM_VLA_ACCESS(4, input_t, img, ifm, ij + kj, ii + ki, + nIfm, ifhp, ifwp) * + LIBXSMM_VLA_ACCESS(4, filter_t, ofm, ifm, kj, ki, nIfm, kh, + kw); } } } @@ -168,134 +167,118 @@ LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float void RunXsmmVsGeneric() {} - class XsmmConv2DTest : public OpsTestBase { protected: void MakeOp(int stride) { - TF_CHECK_OK(NodeDefBuilder("xsmm", "Conv2D") - .Input(FakeInput(DT_FLOAT)) - .Input(FakeInput(DT_FLOAT)) - .Attr("strides", {1, stride,stride, 1}) - .Attr("padding", "VALID" ) - .Finalize(node_def())); - + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Attr("strides", {1, stride, stride, 1}) + .Attr("padding", "VALID") + .Finalize(node_def())); TF_ASSERT_OK(InitOp()); } }; TEST_F(XsmmConv2DTest, Basic) { - MakeOp(1); + MakeOp(1); - // setup scoped allocator, which uses cpu_allocator() for this scope - const 
libxsmm_tf_allocator tf_allocator; + // setup scoped allocator, which uses cpu_allocator() for this scope + const libxsmm_tf_allocator tf_allocator; - int ifw = 14; /* input width, "W" */ - int ifh = 14; /* input height, "H" */ - int nImg = 32; /* mini-batch size, "N" */ - int nIfm = 64; /* number of input feature maps, "C" */ - int nOfm = 64; /* number of output feature maps, "K" */ - int kh = 3; /* filter height, "R" */ - int kw = 3; /* filter width, "S" */ - int pad = 0; /* padding in output */ - int stride = 1; /* stride when accessing inputs */ + int ifw = 14; /* input width, "W" */ + int ifh = 14; /* input height, "H" */ + int nImg = 32; /* mini-batch size, "N" */ + int nIfm = 64; /* number of input feature maps, "C" */ + int nOfm = 64; /* number of output feature maps, "K" */ + int kh = 3; /* filter height, "R" */ + int kw = 3; /* filter width, "S" */ + int pad = 0; /* padding in output */ + int stride = 1; /* stride when accessing inputs */ + int stride_w = stride; + int stride_h = stride; + int pad_h = pad; + int pad_w = pad; - int stride_w = stride; - int stride_h = stride; - int pad_h = pad; - int pad_w = pad; + int pad_h_in = pad_h; + int pad_w_in = pad_w; - int pad_h_in = pad_h; - int pad_w_in = pad_w; - - int pad_h_out = 0; - int pad_w_out = 0; + int pad_h_out = 0; + int pad_w_out = 0; /* deriving some values for naive code */ - int ofh = (ifh + 2 * pad_h - kh) / stride_h + 1; - int ofw = (ifw + 2 * pad_w - kw) / stride_w + 1; - int ifhp = ifh + 2 * pad_h_in; - int ifwp = ifw + 2 * pad_w_in; - int ofhp = ofh + 2 * pad_h_out; - int ofwp = ofw + 2 * pad_w_out; + int ofh = (ifh + 2 * pad_h - kh) / stride_h + 1; + int ofw = (ifw + 2 * pad_w - kw) / stride_w + 1; + int ifhp = ifh + 2 * pad_h_in; + int ifwp = ifw + 2 * pad_w_in; + int ofhp = ofh + 2 * pad_h_out; + int ofwp = ofw + 2 * pad_w_out; + // Initialization of Filter and Image - //Initialization of Filter and Image + /* allocate data */ + float* naive_input = (float*)libxsmm_aligned_scratch( + nImg * nIfm * ifhp * ifwp * sizeof(float), 2097152); + float* naive_output = (float*)libxsmm_aligned_scratch( + nImg * nOfm * ofhp * ofwp * sizeof(float), 2097152); + float* naive_filter = (float*)libxsmm_aligned_scratch( + nOfm * nIfm * kh * kw * sizeof(float), 2097152); + /* initialize data */ + init_buf(naive_input, nImg * nIfm * ifhp * ifwp, 0, 0); + zero_buf(naive_output, nImg * nOfm * ofhp * ofwp); + init_buf(naive_filter, nOfm * nIfm * kh * kw, 0, 0); - /* allocate data */ - float *naive_input = (float*)libxsmm_aligned_scratch( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152); - float *naive_output = (float*)libxsmm_aligned_scratch( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152); - float *naive_filter = (float*)libxsmm_aligned_scratch( nOfm*nIfm*kh*kw* sizeof(float), 2097152); - /* initialize data */ - init_buf(naive_input, nImg*nIfm*ifhp*ifwp, 0, 0); - zero_buf(naive_output, nImg*nOfm*ofhp*ofwp); - init_buf(naive_filter, nOfm*nIfm*kh*kw, 0, 0); + Tensor image(DT_FLOAT, {nImg, ifhp, ifwp, nIfm}); + Tensor filter(DT_FLOAT, {kh, kw, nIfm, nOfm}); - Tensor image(DT_FLOAT, - {nImg, ifhp, ifwp, nIfm}); + naive_copy_NCHW_to_NHWC(naive_input, image, nImg, ifhp, ifwp, nIfm); + naive_copy_KCRS_to_RSCK(naive_filter, filter, kh, kw, nIfm, nOfm); + // Run naive convolution - Tensor filter(DT_FLOAT, {kh,kw,nIfm,nOfm}); + naive_conv_t naive_param; + naive_param.nImg = nImg; + naive_param.nIfm = nIfm; + naive_param.nOfm = nOfm; + naive_param.ifhp = ifhp; + naive_param.ifwp = ifwp; + naive_param.ofhp = ofhp; + naive_param.ofwp = ofwp; + 
naive_param.ifh = ifh; + naive_param.ifw = ifw; + naive_param.ofh = ofh; + naive_param.ofw = ofw; + naive_param.pad_h = pad_h; + naive_param.pad_w = pad_w; + naive_param.pad_h_in = pad_h_in; + naive_param.pad_w_in = pad_w_in; + naive_param.pad_h_out = pad_h_out; + naive_param.pad_w_out = pad_w_out; + naive_param.kh = kh; + naive_param.kw = kw; + naive_param.stride_h = stride_h; + naive_param.stride_w = stride_w; - naive_copy_NCHW_to_NHWC(naive_input, image, nImg, ifhp, ifwp, nIfm); - naive_copy_KCRS_to_RSCK(naive_filter, filter, kh, kw, nIfm, nOfm); + naive_conv_fp(&naive_param, naive_input, naive_output, naive_filter); + AddInputFromArray(image.shape(), image.flat()); + AddInputFromArray(filter.shape(), filter.flat()); - //Run naive convolution - - naive_conv_t naive_param; - - naive_param.nImg = nImg; - naive_param.nIfm = nIfm; - naive_param.nOfm = nOfm; - naive_param.ifhp = ifhp; - naive_param.ifwp = ifwp; - naive_param.ofhp = ofhp; - naive_param.ofwp = ofwp; - naive_param.ifh = ifh; - naive_param.ifw = ifw; - naive_param.ofh = ofh; - naive_param.ofw = ofw; - naive_param.pad_h = pad_h; - naive_param.pad_w = pad_w; - naive_param.pad_h_in = pad_h_in; - naive_param.pad_w_in = pad_w_in; - naive_param.pad_h_out = pad_h_out; - naive_param.pad_w_out = pad_w_out; - naive_param.kh = kh; - naive_param.kw = kw; - naive_param.stride_h = stride_h; - naive_param.stride_w = stride_w; - - - naive_conv_fp(&naive_param, naive_input, naive_output, naive_filter); - - - - AddInputFromArray(image.shape(), image.flat()); - AddInputFromArray(filter.shape(), filter.flat()); - - - - //Run Op (TF) - TF_ASSERT_OK(RunOpKernel()); - - // Check the output. - Tensor expected(DT_FLOAT, {nImg,ofhp,ofwp, nOfm}); - naive_copy_NCHW_to_NHWC(naive_output, expected, nImg, ofhp, ofwp, nOfm); - - - test::ExpectTensorNear(expected, *GetOutput(0), 1e-5); - libxsmm_free(naive_input); - libxsmm_free(naive_output); - libxsmm_free(naive_filter); - + // Run Op (TF) + TF_ASSERT_OK(RunOpKernel()); + // Check the output. + Tensor expected(DT_FLOAT, {nImg, ofhp, ofwp, nOfm}); + naive_copy_NCHW_to_NHWC(naive_output, expected, nImg, ofhp, ofwp, nOfm); + test::ExpectTensorNear(expected, *GetOutput(0), 1e-5); + libxsmm_free(naive_input); + libxsmm_free(naive_output); + libxsmm_free(naive_filter); } /* @@ -325,7 +308,8 @@ TEST(XsmmConv2DTest, Basic) { desc.threads = num_threads; desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT; desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC; - desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;//LIBXSMM_DNN_TENSOR_FORMAT_RSCK; + desc.filter_format = +LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;//LIBXSMM_DNN_TENSOR_FORMAT_RSCK; desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE; desc.options = LIBXSMM_DNN_CONV_OPTION_NONE; desc.datatype = LIBXSMM_DNN_DATATYPE_F32; diff --git a/tensorflow/core/lib/core/status.h b/tensorflow/core/lib/core/status.h index 58a50a70c26..49f74ff47fb 100644 --- a/tensorflow/core/lib/core/status.h +++ b/tensorflow/core/lib/core/status.h @@ -131,7 +131,7 @@ inline tensorflow::string* TfCheckOpHelper(::tensorflow::Status v, while (auto _result = ::tensorflow::TfCheckOpHelper(val, #val)) \ LOG(level) << *(_result) -#define TF_CHECK_OK(val) TF_DO_CHECK_OK(val, FATAL) +#define TF_CHECK_OK(val) TF_DO_CHECK_OK(val, FATAL) #define TF_QCHECK_OK(val) TF_DO_CHECK_OK(val, QFATAL) // DEBUG only version of TF_CHECK_OK. 
Compiler still parses 'val' even in opt diff --git a/tensorflow/core/lib/core/threadpool.cc b/tensorflow/core/lib/core/threadpool.cc index 2b10ebeaf7c..e55ed79d36c 100644 --- a/tensorflow/core/lib/core/threadpool.cc +++ b/tensorflow/core/lib/core/threadpool.cc @@ -66,7 +66,9 @@ struct EigenEnvironment { } return Task{ std::unique_ptr(new TaskImpl{ - std::move(f), Context(ContextKind::kThread), id, + std::move(f), + Context(ContextKind::kThread), + id, }), }; } diff --git a/tensorflow/core/lib/core/threadpool_test.cc b/tensorflow/core/lib/core/threadpool_test.cc index 49ddb16645c..627ef5a892a 100644 --- a/tensorflow/core/lib/core/threadpool_test.cc +++ b/tensorflow/core/lib/core/threadpool_test.cc @@ -97,8 +97,8 @@ TEST(ThreadPool, ParallelForWithWorkerId) { } pool.ParallelForWithWorkerId( kWorkItems, kHugeCost, - [&threads_running, &work, num_threads]( - int64 begin, int64 end, int64 id) { + [&threads_running, &work, num_threads](int64 begin, int64 end, + int64 id) { // Store true for the current thread, and assert that another thread // is not running with the same id. ASSERT_LE(0, id); diff --git a/tensorflow/core/lib/db/sqlite.h b/tensorflow/core/lib/db/sqlite.h index 0faa458f1d6..efe97f78d25 100644 --- a/tensorflow/core/lib/db/sqlite.h +++ b/tensorflow/core/lib/db/sqlite.h @@ -18,12 +18,12 @@ limitations under the License. #include #include "sqlite3.h" +#include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/lib/core/refcount.h" /// TensorFlow SQLite Veneer /// @@ -121,10 +121,7 @@ class LOCKABLE Sqlite : public core::RefCounted { Sqlite(sqlite3* db, sqlite3_stmt* begin, sqlite3_stmt* commit, sqlite3_stmt* rollback) noexcept - : db_(db), - begin_(begin), - commit_(commit), - rollback_(rollback) {} + : db_(db), begin_(begin), commit_(commit), rollback_(rollback) {} sqlite3* const db_; sqlite3_stmt* const begin_; @@ -233,7 +230,8 @@ class SqliteStatement { /// freed until this statement is Reset() or finalized. void BindText(int parameter, const StringPiece& text) { Update(sqlite3_bind_text64(stmt_, parameter, text.data(), text.size(), - SQLITE_TRANSIENT, SQLITE_UTF8), parameter); + SQLITE_TRANSIENT, SQLITE_UTF8), + parameter); size_ += text.size(); } void BindText(const char* parameter, const StringPiece& text) { @@ -241,7 +239,8 @@ class SqliteStatement { } void BindTextUnsafe(int parameter, const StringPiece& text) { Update(sqlite3_bind_text64(stmt_, parameter, text.data(), text.size(), - SQLITE_STATIC, SQLITE_UTF8), parameter); + SQLITE_STATIC, SQLITE_UTF8), + parameter); size_ += text.size(); } void BindTextUnsafe(const char* parameter, const StringPiece& text) { @@ -254,7 +253,8 @@ class SqliteStatement { /// freed until this statement is Reset() or finalized. 
void BindBlob(int parameter, const StringPiece& blob) { Update(sqlite3_bind_blob64(stmt_, parameter, blob.data(), blob.size(), - SQLITE_TRANSIENT), parameter); + SQLITE_TRANSIENT), + parameter); size_ += blob.size(); } void BindBlob(const char* parameter, const StringPiece& blob) { @@ -262,7 +262,8 @@ class SqliteStatement { } void BindBlobUnsafe(int parameter, const StringPiece& blob) { Update(sqlite3_bind_blob64(stmt_, parameter, blob.data(), blob.size(), - SQLITE_STATIC), parameter); + SQLITE_STATIC), + parameter); size_ += blob.size(); } void BindBlobUnsafe(const char* parameter, const StringPiece& text) { @@ -320,9 +321,7 @@ class SqliteStatement { /// \brief Move constructor, after which is reset to empty. SqliteStatement(SqliteStatement&& other) noexcept - : db_(other.db_), - stmt_(other.stmt_), - bind_error_(other.bind_error_) { + : db_(other.db_), stmt_(other.stmt_), bind_error_(other.bind_error_) { other.db_ = nullptr; other.stmt_ = nullptr; other.bind_error_ = SQLITE_OK; diff --git a/tensorflow/core/lib/db/sqlite_test.cc b/tensorflow/core/lib/db/sqlite_test.cc index c9c76ea5f2c..1e88323d017 100644 --- a/tensorflow/core/lib/db/sqlite_test.cc +++ b/tensorflow/core/lib/db/sqlite_test.cc @@ -33,9 +33,7 @@ class SqliteTest : public ::testing::Test { db_->PrepareOrDie("CREATE TABLE T (a BLOB, b BLOB)").StepAndResetOrDie(); } - void TearDown() override { - db_->Unref(); - } + void TearDown() override { db_->Unref(); } Sqlite* db_; bool is_done_; @@ -213,7 +211,7 @@ TEST_F(SqliteTest, BindFailed) { Status s = stmt.StepOnce(); EXPECT_NE(string::npos, s.error_message().find("INSERT INTO T (a) VALUES (123)")) - << s.error_message(); + << s.error_message(); } TEST_F(SqliteTest, SnappyExtension) { @@ -226,7 +224,7 @@ TEST_F(SqliteTest, SnappyBinaryCompatibility) { EXPECT_EQ( "today is the end of the republic", db_->PrepareOrDie("SELECT UNSNAP(X'03207C746F6461792069732074686520656E64" - "206F66207468652072657075626C6963')") + "206F66207468652072657075626C6963')") .StepOnceOrDie() .ColumnString(0)); } diff --git a/tensorflow/core/lib/gif/gif_io.cc b/tensorflow/core/lib/gif/gif_io.cc index 0f6999c88fc..e5deb2b873e 100644 --- a/tensorflow/core/lib/gif/gif_io.cc +++ b/tensorflow/core/lib/gif/gif_io.cc @@ -44,6 +44,14 @@ int input_callback(GifFileType* gif_file, GifByteType* buf, int size) { return 0; } +static const char* GifErrorStringNonNull(int error_code) { + const char* error_string = GifErrorString(error_code); + if (error_string == nullptr) { + return "Unknown error"; + } + return error_string; +} + uint8* Decode(const void* srcdata, int datasize, const std::function& allocate_output, string* error_string) { @@ -55,17 +63,17 @@ uint8* Decode(const void* srcdata, int datasize, int error_code = D_GIF_SUCCEEDED; if (gif_file && DGifCloseFile(gif_file, &error_code) != GIF_OK) { LOG(WARNING) << "Fail to close gif file, reason: " - << GifErrorString(error_code); + << GifErrorStringNonNull(error_code); } }); if (error_code != D_GIF_SUCCEEDED) { *error_string = strings::StrCat("failed to open gif file: ", - GifErrorString(error_code)); + GifErrorStringNonNull(error_code)); return nullptr; } if (DGifSlurp(gif_file) != GIF_OK) { *error_string = strings::StrCat("failed to slurp gif file: ", - GifErrorString(gif_file->Error)); + GifErrorStringNonNull(gif_file->Error)); return nullptr; } if (gif_file->ImageCount <= 0) { diff --git a/tensorflow/core/lib/gtl/cleanup.h b/tensorflow/core/lib/gtl/cleanup.h index 6053e986402..6bd60ca4824 100644 --- a/tensorflow/core/lib/gtl/cleanup.h +++ 
b/tensorflow/core/lib/gtl/cleanup.h @@ -55,22 +55,21 @@ namespace gtl { template class Cleanup { public: - Cleanup() - : released_(true), f_() {} + Cleanup() : released_(true), f_() {} template - explicit Cleanup(G&& f) // NOLINT + explicit Cleanup(G&& f) // NOLINT : f_(std::forward(f)) {} // NOLINT(build/c++11) Cleanup(Cleanup&& src) // NOLINT - : released_(src.is_released()), f_(src.release()) { } + : released_(src.is_released()), f_(src.release()) {} // Implicitly move-constructible from any compatible Cleanup. // The source will be released as if src.release() were called. // A moved-from Cleanup can be safely destroyed or reassigned. template Cleanup(Cleanup&& src) // NOLINT - : released_(src.is_released()), f_(src.release()) { } + : released_(src.is_released()), f_(src.release()) {} // Assignment to a Cleanup object behaves like destroying it // and making a new one in its place, analogous to unique_ptr @@ -102,8 +101,8 @@ class Cleanup { F f_; }; -template ::type> +template ::type> TF_MUST_USE_RESULT Cleanup MakeCleanup(F&& f) { return Cleanup(std::forward(f)); } diff --git a/tensorflow/core/lib/gtl/cleanup_test.cc b/tensorflow/core/lib/gtl/cleanup_test.cc index bd151cb2ab1..a86ffd5fe28 100644 --- a/tensorflow/core/lib/gtl/cleanup_test.cc +++ b/tensorflow/core/lib/gtl/cleanup_test.cc @@ -65,15 +65,14 @@ TEST(CleanupTest, Release) { TEST(FinallyTest, TypeErasedWithoutFactory) { string s = "active"; { - AnyCleanup s_cleaner([&s]{ s.append(" clean"); }); + AnyCleanup s_cleaner([&s] { s.append(" clean"); }); EXPECT_EQ("active", s); } EXPECT_EQ("active clean", s); } struct Appender { - Appender(string* s, const string& msg) - : s_(s), msg_(msg) {} + Appender(string* s, const string& msg) : s_(s), msg_(msg) {} void operator()() const { s_->append(msg_); } string* s_; string msg_; @@ -163,7 +162,12 @@ class CleanupReferenceTest : public ::testing::Test { int* i; F(int* cp, int* i) : cp(cp), i(i) {} F(const F& o) : cp(o.cp), i(o.i) { ++*cp; } - F& operator=(const F& o) { cp = o.cp; i = o.i; ++*cp; return *this; } + F& operator=(const F& o) { + cp = o.cp; + i = o.i; + ++*cp; + return *this; + } F(F&&) = default; F& operator=(F&&) = default; void operator()() const { ++*i; } @@ -279,7 +283,7 @@ BENCHMARK(BM_AnyCleanup); void BM_AnyCleanupNoFactory(int iters) { while (iters--) { - AnyCleanup fin([]{Incr();}); + AnyCleanup fin([] { Incr(); }); } } BENCHMARK(BM_AnyCleanupNoFactory); diff --git a/tensorflow/core/lib/gtl/inlined_vector.h b/tensorflow/core/lib/gtl/inlined_vector.h index d6e5d9effa7..6e3cb2206d9 100644 --- a/tensorflow/core/lib/gtl/inlined_vector.h +++ b/tensorflow/core/lib/gtl/inlined_vector.h @@ -31,12 +31,12 @@ limitations under the License. #ifndef TENSORFLOW_LIB_GTL_INLINED_VECTOR_H_ #define TENSORFLOW_LIB_GTL_INLINED_VECTOR_H_ -#include #include #include #include #include #include +#include #include #include #include @@ -407,7 +407,7 @@ class InlinedVector { }; // 2) Construct a T with args at not-yet-initialized memory pointed by dst. struct Construct { - template + template void operator()(T* dst, Args&&... 
args) const { new (dst) T(std::forward(args)...); } diff --git a/tensorflow/core/lib/gtl/int_type.h b/tensorflow/core/lib/gtl/int_type.h index 647fc81aa7e..af3e50ad78f 100644 --- a/tensorflow/core/lib/gtl/int_type.h +++ b/tensorflow/core/lib/gtl/int_type.h @@ -255,13 +255,13 @@ class IntType { value_ op arg_value; \ return *this; \ } - INT_TYPE_ASSIGNMENT_OP(+= ); - INT_TYPE_ASSIGNMENT_OP(-= ); - INT_TYPE_ASSIGNMENT_OP(*= ); - INT_TYPE_ASSIGNMENT_OP(/= ); - INT_TYPE_ASSIGNMENT_OP(<<= ); // NOLINT - INT_TYPE_ASSIGNMENT_OP(>>= ); // NOLINT - INT_TYPE_ASSIGNMENT_OP(%= ); + INT_TYPE_ASSIGNMENT_OP(+=); + INT_TYPE_ASSIGNMENT_OP(-=); + INT_TYPE_ASSIGNMENT_OP(*=); + INT_TYPE_ASSIGNMENT_OP(/=); + INT_TYPE_ASSIGNMENT_OP(<<=); // NOLINT + INT_TYPE_ASSIGNMENT_OP(>>=); // NOLINT + INT_TYPE_ASSIGNMENT_OP(%=); #undef INT_TYPE_ASSIGNMENT_OP ThisType& operator=(ValueType arg_value) { @@ -314,10 +314,10 @@ std::ostream& operator<<(std::ostream& os, // NOLINT INT_TYPE_ARITHMETIC_OP(+); INT_TYPE_ARITHMETIC_OP(-); INT_TYPE_ARITHMETIC_OP(*); -INT_TYPE_ARITHMETIC_OP(/ ); -INT_TYPE_ARITHMETIC_OP(<< ); // NOLINT -INT_TYPE_ARITHMETIC_OP(>> ); // NOLINT -INT_TYPE_ARITHMETIC_OP(% ); +INT_TYPE_ARITHMETIC_OP(/); +INT_TYPE_ARITHMETIC_OP(<<); // NOLINT +INT_TYPE_ARITHMETIC_OP(>>); // NOLINT +INT_TYPE_ARITHMETIC_OP(%); #undef INT_TYPE_ARITHMETIC_OP // -- NON-MEMBER COMPARISON OPERATORS ------------------------------------------ @@ -345,12 +345,12 @@ INT_TYPE_ARITHMETIC_OP(% ); IntType id) { \ return val op id.value(); \ } -INT_TYPE_COMPARISON_OP(== ); // NOLINT -INT_TYPE_COMPARISON_OP(!= ); // NOLINT -INT_TYPE_COMPARISON_OP(< ); // NOLINT -INT_TYPE_COMPARISON_OP(<= ); // NOLINT -INT_TYPE_COMPARISON_OP(> ); // NOLINT -INT_TYPE_COMPARISON_OP(>= ); // NOLINT +INT_TYPE_COMPARISON_OP(==); // NOLINT +INT_TYPE_COMPARISON_OP(!=); // NOLINT +INT_TYPE_COMPARISON_OP(<); // NOLINT +INT_TYPE_COMPARISON_OP(<=); // NOLINT +INT_TYPE_COMPARISON_OP(>); // NOLINT +INT_TYPE_COMPARISON_OP(>=); // NOLINT #undef INT_TYPE_COMPARISON_OP } // namespace gtl diff --git a/tensorflow/core/lib/gtl/int_type_test.cc b/tensorflow/core/lib/gtl/int_type_test.cc index d3c405d9acd..61d364017cb 100644 --- a/tensorflow/core/lib/gtl/int_type_test.cc +++ b/tensorflow/core/lib/gtl/int_type_test.cc @@ -42,7 +42,8 @@ class IntTypeTest : public ::testing::Test { // All tests below will be executed on all supported IntTypes. 
typedef ::testing::Types SupportedIntTypes; + Int64_IT, UInt64_IT, Long_IT> + SupportedIntTypes; TYPED_TEST_CASE(IntTypeTest, SupportedIntTypes); @@ -232,7 +233,8 @@ TYPED_TEST(IntTypeTest, TestOperators) { TYPED_TEST(IntTypeTest, TestHashFunctor) { std::unordered_map map; + typename TestFixture::T::Hasher> + map; typename TestFixture::T a(0); map[a] = 'c'; EXPECT_EQ('c', map[a]); diff --git a/tensorflow/core/lib/gtl/optional.h b/tensorflow/core/lib/gtl/optional.h index 2ff8b9c7d1a..fa33c24c0c0 100644 --- a/tensorflow/core/lib/gtl/optional.h +++ b/tensorflow/core/lib/gtl/optional.h @@ -593,12 +593,12 @@ class optional : private internal_optional::optional_data, assert(this->engaged_); return this->pointer(); } - constexpr const T& operator*() const & { return reference(); } + constexpr const T& operator*() const& { return reference(); } T& operator*() & { assert(this->engaged_); return reference(); } - constexpr const T&& operator*() const && { return std::move(reference()); } + constexpr const T&& operator*() const&& { return std::move(reference()); } T&& operator*() && { assert(this->engaged_); return std::move(reference()); @@ -621,7 +621,7 @@ class optional : private internal_optional::optional_data, // Use `opt.value()` to get a reference to underlying value. The constness // and lvalue/rvalue-ness of `opt` is preserved to the view of the T // subobject. - const T& value() const & { + const T& value() const& { CHECK(*this) << "Bad optional access"; return reference(); } @@ -633,7 +633,7 @@ class optional : private internal_optional::optional_data, CHECK(*this) << "Bad optional access"; return std::move(reference()); } - const T&& value() const && { // NOLINT(build/c++11) + const T&& value() const&& { // NOLINT(build/c++11) CHECK(*this) << "Bad optional access"; return std::move(reference()); } @@ -641,7 +641,7 @@ class optional : private internal_optional::optional_data, // Use `opt.value_or(val)` to get either the value of T or the given default // `val` in the empty case. template - constexpr T value_or(U&& v) const & { + constexpr T value_or(U&& v) const& { return static_cast(*this) ? **this : static_cast(std::forward(v)); } @@ -656,8 +656,8 @@ class optional : private internal_optional::optional_data, constexpr const T& reference() const { return *this->pointer(); } T& reference() { return *(this->pointer()); } - // T constraint checks. You can't have an optional of nullopt_t, in_place_t or - // a reference. + // T constraint checks. You can't have an optional of nullopt_t, in_place_t + // or a reference. static_assert( !std::is_same::type>::value, "optional is not allowed."); diff --git a/tensorflow/core/lib/gtl/optional_test.cc b/tensorflow/core/lib/gtl/optional_test.cc index 547bee7b75f..12b5bbc60be 100644 --- a/tensorflow/core/lib/gtl/optional_test.cc +++ b/tensorflow/core/lib/gtl/optional_test.cc @@ -24,17 +24,29 @@ limitations under the License. 
namespace tensorflow { namespace { -using tensorflow::gtl::optional; -using tensorflow::gtl::nullopt; -using tensorflow::gtl::nullopt_t; using tensorflow::gtl::in_place; using tensorflow::gtl::in_place_t; using tensorflow::gtl::make_optional; +using tensorflow::gtl::nullopt; +using tensorflow::gtl::nullopt_t; +using tensorflow::gtl::optional; -template string TypeQuals(T&) { return "&"; } -template string TypeQuals(T&&) { return "&&"; } -template string TypeQuals(const T&) { return "c&"; } -template string TypeQuals(const T&&) { return "c&&"; } +template +string TypeQuals(T&) { + return "&"; +} +template +string TypeQuals(T&&) { + return "&&"; +} +template +string TypeQuals(const T&) { + return "c&"; +} +template +string TypeQuals(const T&&) { + return "c&&"; +} struct StructorListener { int construct0 = 0; diff --git a/tensorflow/core/lib/gtl/top_n_test.cc b/tensorflow/core/lib/gtl/top_n_test.cc index fae85570dc0..ba30c072a90 100644 --- a/tensorflow/core/lib/gtl/top_n_test.cc +++ b/tensorflow/core/lib/gtl/top_n_test.cc @@ -28,10 +28,10 @@ limitations under the License. namespace { +using tensorflow::string; using tensorflow::gtl::TopN; using tensorflow::random::PhiloxRandom; using tensorflow::random::SimplePhilox; -using tensorflow::string; // Move the contents from an owned raw pointer, returning by value. // Objects are easier to manage by value. diff --git a/tensorflow/core/lib/io/compression.cc b/tensorflow/core/lib/io/compression.cc index c12de98e401..0d25bca9ecc 100644 --- a/tensorflow/core/lib/io/compression.cc +++ b/tensorflow/core/lib/io/compression.cc @@ -22,6 +22,6 @@ namespace compression { const char kNone[] = ""; const char kGzip[] = "GZIP"; -} -} -} +} // namespace compression +} // namespace io +} // namespace tensorflow diff --git a/tensorflow/core/lib/io/compression.h b/tensorflow/core/lib/io/compression.h index ef90c60a3a4..4d8e7788cad 100644 --- a/tensorflow/core/lib/io/compression.h +++ b/tensorflow/core/lib/io/compression.h @@ -23,8 +23,8 @@ namespace compression { extern const char kNone[]; extern const char kGzip[]; -} -} -} +} // namespace compression +} // namespace io +} // namespace tensorflow #endif // TENSORFLOW_CORE_LIB_IO_COMPRESSION_H_ diff --git a/tensorflow/core/lib/io/record_reader.cc b/tensorflow/core/lib/io/record_reader.cc index 403c82818ef..9cc6c4034f4 100644 --- a/tensorflow/core/lib/io/record_reader.cc +++ b/tensorflow/core/lib/io/record_reader.cc @@ -207,7 +207,7 @@ Status RecordReader::SkipNBytes(uint64 offset) { } } return Status::OK(); -} +} // namespace io SequentialRecordReader::SequentialRecordReader( RandomAccessFile* file, const RecordReaderOptions& options) diff --git a/tensorflow/core/lib/io/recordio_test.cc b/tensorflow/core/lib/io/recordio_test.cc index 507c26a63ff..b7e51256a22 100644 --- a/tensorflow/core/lib/io/recordio_test.cc +++ b/tensorflow/core/lib/io/recordio_test.cc @@ -218,8 +218,8 @@ TEST_F(RecordioTest, RandomRead) { // Tests of all the error paths in log_reader.cc follow: static void AssertHasSubstr(StringPiece s, StringPiece expected) { - EXPECT_TRUE(StringPiece(s).contains(expected)) << s << " does not contain " - << expected; + EXPECT_TRUE(StringPiece(s).contains(expected)) + << s << " does not contain " << expected; } TEST_F(RecordioTest, ReadError) { diff --git a/tensorflow/core/lib/png/png_io.cc b/tensorflow/core/lib/png/png_io.cc index 354c819b090..77a3414442c 100644 --- a/tensorflow/core/lib/png/png_io.cc +++ b/tensorflow/core/lib/png/png_io.cc @@ -197,8 +197,8 @@ bool CommonInitDecode(StringPiece png_string, int 
desired_channels, int desired_channel_bits, DecodeContext* context) { CHECK(desired_channel_bits == 8 || desired_channel_bits == 16) << "desired_channel_bits = " << desired_channel_bits; - CHECK(0 <= desired_channels && desired_channels <= 4) << "desired_channels = " - << desired_channels; + CHECK(0 <= desired_channels && desired_channels <= 4) + << "desired_channels = " << desired_channels; context->error_condition = false; context->channels = desired_channels; context->png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, context, diff --git a/tensorflow/core/lib/random/philox_random_test_utils.h b/tensorflow/core/lib/random/philox_random_test_utils.h index f4bb087e107..6c29ae6b6a2 100644 --- a/tensorflow/core/lib/random/philox_random_test_utils.h +++ b/tensorflow/core/lib/random/philox_random_test_utils.h @@ -35,8 +35,8 @@ void FillRandoms(PhiloxRandom gen, typename Distribution::ResultElementType* p, int64 size) { const int granularity = Distribution::kResultElementCount; - CHECK(size % granularity == 0) << " size: " << size - << " granularity: " << granularity; + CHECK(size % granularity == 0) + << " size: " << size << " granularity: " << granularity; Distribution dist; for (int i = 0; i < size; i += granularity) { diff --git a/tensorflow/core/lib/random/random_distributions.h b/tensorflow/core/lib/random/random_distributions.h index 0e281403f87..3fe1f9bc6cf 100644 --- a/tensorflow/core/lib/random/random_distributions.h +++ b/tensorflow/core/lib/random/random_distributions.h @@ -17,8 +17,8 @@ limitations under the License. #define TENSORFLOW_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_ #define _USE_MATH_DEFINES -#include #include +#include #undef _USE_MATH_DEFINES #include @@ -27,7 +27,6 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/lib/random/philox_random.h" - namespace tensorflow { namespace random { diff --git a/tensorflow/core/lib/random/random_distributions_test.cc b/tensorflow/core/lib/random/random_distributions_test.cc index 90d0dba4a77..85d68f456e1 100644 --- a/tensorflow/core/lib/random/random_distributions_test.cc +++ b/tensorflow/core/lib/random/random_distributions_test.cc @@ -45,8 +45,8 @@ void FillRandomsWithSingles(PhiloxRandom gen, int64 size) { int granularity = Distribution::kResultElementCount; - CHECK(size % granularity == 0) << " size: " << size - << " granularity: " << granularity; + CHECK(size % granularity == 0) + << " size: " << size << " granularity: " << granularity; SingleSampleAdapter single_samples(&gen); diff --git a/tensorflow/core/lib/strings/ordered_code.cc b/tensorflow/core/lib/strings/ordered_code.cc index af9a1512594..ef90050b4f6 100644 --- a/tensorflow/core/lib/strings/ordered_code.cc +++ b/tensorflow/core/lib/strings/ordered_code.cc @@ -472,7 +472,8 @@ void OrderedCode::WriteSignedNumIncreasing(string* dest, int64 val) { // buf = val in network byte order, sign extended to 10 bytes const char sign_byte = val < 0 ? 
'\xff' : '\0'; char buf[10] = { - sign_byte, sign_byte, + sign_byte, + sign_byte, }; StoreBigEndian64(buf + 2, val); static_assert(sizeof(buf) == kMaxSigned64Length, "max length size mismatch"); diff --git a/tensorflow/core/lib/strings/strcat.h b/tensorflow/core/lib/strings/strcat.h index 5835b0101d9..2bc14945cd0 100644 --- a/tensorflow/core/lib/strings/strcat.h +++ b/tensorflow/core/lib/strings/strcat.h @@ -126,7 +126,7 @@ class AlphaNum { : piece_(digits_, strlen(DoubleToBuffer(f, digits_))) {} AlphaNum(const Eigen::half &f); // NOLINT(runtime/explicit) - AlphaNum(Hex hex); // NOLINT(runtime/explicit) + AlphaNum(Hex hex); // NOLINT(runtime/explicit) AlphaNum(const char *c_str) : piece_(c_str) {} // NOLINT(runtime/explicit) AlphaNum(const StringPiece &pc) : piece_(pc) {} // NOLINT(runtime/explicit) diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc index 279a5876f96..267ce884400 100644 --- a/tensorflow/core/ops/array_ops.cc +++ b/tensorflow/core/ops/array_ops.cc @@ -335,6 +335,13 @@ REGISTER_OP("Unpack") return Status::OK(); }); +REGISTER_OP("UnravelIndex") + .Input("indices: Tidx") + .Input("dims: Tidx") + .Output("output: Tidx") + .Attr("Tidx: {int32, int64} = DT_INT32") + .SetShapeFn([](InferenceContext* c) { return Status::OK(); }); + // -------------------------------------------------------------------------- // TODO(josh11b): Remove the >= 2 constraint, once we can rewrite the graph // in the N == 1 case to remove the node. @@ -701,10 +708,11 @@ REGISTER_OP("MatrixDiagPart") // -------------------------------------------------------------------------- REGISTER_OP("MatrixBandPart") .Input("input: T") - .Input("num_lower: int64") - .Input("num_upper: int64") + .Input("num_lower: Tindex") + .Input("num_upper: Tindex") .Output("band: T") .Attr("T: type") + .Attr("Tindex: {int32, int64} = DT_INT64") .SetShapeFn(shape_inference::UnchangedShape); // -------------------------------------------------------------------------- @@ -977,8 +985,8 @@ REGISTER_OP("GatherNd") if (c->Value(r_dim) > c->Rank(params)) { return errors::InvalidArgument( "indices.shape[-1] must be <= params.rank, but saw indices shape: ", - c->DebugString(indices), " and params shape: ", - c->DebugString(params)); + c->DebugString(indices), + " and params shape: ", c->DebugString(params)); } // Remove r_dim from indices to get output. @@ -1252,12 +1260,12 @@ REGISTER_OP("ReverseSequence") // Validate batch_dim and seq_dim against input. const int32 input_rank = c->Rank(input); if (batch_dim >= input_rank) { - return errors::InvalidArgument("batch_dim must be < input rank: ", - batch_dim, " vs. ", input_rank); + return errors::InvalidArgument( + "batch_dim must be < input rank: ", batch_dim, " vs. ", input_rank); } if (seq_dim >= input_rank) { - return errors::InvalidArgument("seq_dim must be < input rank: ", - seq_dim, " vs. ", input_rank); + return errors::InvalidArgument( + "seq_dim must be < input rank: ", seq_dim, " vs. 
", input_rank); } DimensionHandle batch_dim_dim = c->Dim(input, batch_dim); @@ -2638,8 +2646,9 @@ Status ScatterNdShape(InferenceContext* c) { Status s = c->Merge(prefix_indices, prefix_updates, &unused); if (!s.ok()) { return errors::InvalidArgument( - "The outer ", outer_dims, " dimensions of indices.shape=", - c->DebugString(indices_shape), " must match the outer ", outer_dims, + "The outer ", outer_dims, + " dimensions of indices.shape=", c->DebugString(indices_shape), + " must match the outer ", outer_dims, " dimensions of updates.shape=", c->DebugString(updates_shape), ": ", s.error_message()); } diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc index a182fd1c475..86d64635f4c 100644 --- a/tensorflow/core/ops/array_ops_test.cc +++ b/tensorflow/core/ops/array_ops_test.cc @@ -142,8 +142,13 @@ TEST(ArrayOpsTest, Const_ShapeFn) { TEST(ArrayOpsTest, UnchangedShapes_ShapeFn) { for (const char* op_name : { - "CheckNumerics", "Identity", "RefIdentity", "QuantizeAndDequantize", - "StopGradient", "ZerosLike", "OnesLike", + "CheckNumerics", + "Identity", + "RefIdentity", + "QuantizeAndDequantize", + "StopGradient", + "ZerosLike", + "OnesLike", }) { ShapeInferenceTestOp op(op_name); INFER_OK(op, "?", "in0"); diff --git a/tensorflow/core/ops/candidate_sampling_ops_test.cc b/tensorflow/core/ops/candidate_sampling_ops_test.cc index c79b4439148..f3673716040 100644 --- a/tensorflow/core/ops/candidate_sampling_ops_test.cc +++ b/tensorflow/core/ops/candidate_sampling_ops_test.cc @@ -23,9 +23,12 @@ namespace tensorflow { TEST(CandidateSamplerOpsTest, CandidateSampler_ShapeFn) { for (const char* op_name : { - "AllCandidateSampler", "FixedUnigramCandidateSampler", - "LearnedUnigramCandidateSampler", "LogUniformCandidateSampler", - "ThreadUnsafeUnigramCandidateSampler", "UniformCandidateSampler", + "AllCandidateSampler", + "FixedUnigramCandidateSampler", + "LearnedUnigramCandidateSampler", + "LogUniformCandidateSampler", + "ThreadUnsafeUnigramCandidateSampler", + "UniformCandidateSampler", }) { ShapeInferenceTestOp op(op_name); TF_ASSERT_OK(NodeDefBuilder("test", op.name) diff --git a/tensorflow/core/ops/compat/backwards_compatibility_test.cc b/tensorflow/core/ops/compat/backwards_compatibility_test.cc index add05d6610a..6e05ae4be4f 100644 --- a/tensorflow/core/ops/compat/backwards_compatibility_test.cc +++ b/tensorflow/core/ops/compat/backwards_compatibility_test.cc @@ -25,8 +25,9 @@ namespace tensorflow { namespace { TEST(BackwardsCompatibilityTest, IsCompatible) { - OpCompatibilityLib compatibility( - "tensorflow/core/ops", strings::StrCat("v", TF_MAJOR_VERSION), nullptr); + OpCompatibilityLib compatibility("tensorflow/core/ops", + strings::StrCat("v", TF_MAJOR_VERSION), + nullptr); Env* env = Env::Default(); int changed_ops = 0; diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt index 65ab81931ad..2580eaf987b 100644 --- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt @@ -17136,6 +17136,24 @@ op { type: DT_STRING } } +op { + name: "EnqueueInQueueDataset" + input_arg { + name: "queue" + type: DT_VARIANT + } + input_arg { + name: "components" + type_list_attr: "Tcomponents" + } + attr { + name: "Tcomponents" + type: "list(type)" + has_minimum: true + minimum: 1 + } + is_stateful: true +} op { name: "Enter" input_arg { @@ -24840,6 +24858,42 @@ op { type: "type" } } +op { + name: "MatrixBandPart" + input_arg { + name: "input" + type_attr: "T" + } + 
input_arg { + name: "num_lower" + type_attr: "Tindex" + } + input_arg { + name: "num_upper" + type_attr: "Tindex" + } + output_arg { + name: "band" + type_attr: "T" + } + attr { + name: "T" + type: "type" + } + attr { + name: "Tindex" + type: "type" + default_value { + type: DT_INT64 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } +} op { name: "MatrixDeterminant" input_arg { @@ -32096,6 +32150,48 @@ op { minimum: 1 } } +op { + name: "PrependFromQueueAndPaddedBatchDataset" + input_arg { + name: "input_dataset" + type: DT_VARIANT + } + input_arg { + name: "batch_size" + type: DT_INT64 + } + input_arg { + name: "padded_shapes" + type: DT_INT64 + number_attr: "N" + } + input_arg { + name: "padding_values" + type_list_attr: "Toutput_types" + } + output_arg { + name: "handle" + type: DT_VARIANT + } + attr { + name: "Toutput_types" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "output_shapes" + type: "list(shape)" + has_minimum: true + minimum: 1 + } + attr { + name: "N" + type: "int" + has_minimum: true + minimum: 1 + } +} op { name: "PreventGradient" input_arg { @@ -42820,6 +42916,36 @@ op { } is_stateful: true } +op { + name: "ResourceScatterUpdate" + input_arg { + name: "resource" + type: DT_RESOURCE + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + input_arg { + name: "updates" + type_attr: "dtype" + } + attr { + name: "dtype" + type: "type" + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + is_stateful: true +} op { name: "ResourceSparseApplyAdadelta" input_arg { diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc index 2cae814eab1..3c8e9a8a5f2 100644 --- a/tensorflow/core/ops/dataset_ops.cc +++ b/tensorflow/core/ops/dataset_ops.cc @@ -491,4 +491,29 @@ REGISTER_OP("StatsAggregatorSummary") .Output("summary: string") .SetShapeFn(shape_inference::ScalarShape); +REGISTER_OP("PrependFromQueueAndPaddedBatchDataset") + .Input("input_dataset: variant") + .Input("batch_size: int64") + .Input("padded_shapes: N * int64") + .Input("padding_values: Toutput_types") + .Output("handle: variant") + .Attr("Toutput_types: list(type) >= 1") + .Attr("output_shapes: list(shape) >= 1") + .Attr("N: int >= 1") + // TODO(ebrevdo): Validate that `padded_shapes` are all vectors, the lengths + // of `Toutput_types` and `output_shapes` are `N`, that the + // length of `output_types` is `N`, the `output_shapes` are + // (as far as possible to tell statically) compatible with `padded_shapes`, + // and that `padding_values` are all scalars. + .SetShapeFn(shape_inference::ScalarShape); + +REGISTER_OP("EnqueueInQueueDataset") + .Input("queue: variant") + .Input("components: Tcomponents") + .Attr("Tcomponents: list(type) >= 1") + .SetIsStateful() // To avoid CSE on multiple calls to Enqueue. + // TODO(ebrevdo): SetShapeFn to test input dtypes and shapes by + // reading from queue handle (is that even possible?). + .SetShapeFn(shape_inference::NoOutputs); + } // namespace tensorflow diff --git a/tensorflow/core/ops/functional_grad.cc b/tensorflow/core/ops/functional_grad.cc index 6df3536795c..eeccb72da65 100644 --- a/tensorflow/core/ops/functional_grad.cc +++ b/tensorflow/core/ops/functional_grad.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/core/framework/function.h" #include +#include "tensorflow/core/framework/function.h" #include "tensorflow/core/lib/core/errors.h" namespace tensorflow { diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc index ef2ac267cc9..a62e2d782b8 100644 --- a/tensorflow/core/ops/image_ops.cc +++ b/tensorflow/core/ops/image_ops.cc @@ -586,6 +586,17 @@ REGISTER_OP("NonMaxSuppression") .Output("selected_indices: int32") .Attr("iou_threshold: float = 0.5") .SetShapeFn([](InferenceContext* c) { + // Get inputs and validate ranks. + ShapeHandle boxes; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &boxes)); + ShapeHandle scores; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &scores)); + ShapeHandle max_output_size; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size)); + // The boxes is a 2-D float Tensor of shape [num_boxes, 4]. + DimensionHandle unused; + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused)); + c->set_output(0, c->Vector(c->UnknownDim())); return Status::OK(); }); @@ -597,6 +608,19 @@ REGISTER_OP("NonMaxSuppressionV2") .Input("iou_threshold: float") .Output("selected_indices: int32") .SetShapeFn([](InferenceContext* c) { + // Get inputs and validate ranks. + ShapeHandle boxes; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &boxes)); + ShapeHandle scores; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &scores)); + ShapeHandle max_output_size; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size)); + ShapeHandle iou_threshold; + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &iou_threshold)); + // The boxes is a 2-D float Tensor of shape [num_boxes, 4]. + DimensionHandle unused; + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused)); + c->set_output(0, c->Vector(c->UnknownDim())); return Status::OK(); }); diff --git a/tensorflow/core/ops/lookup_ops.cc b/tensorflow/core/ops/lookup_ops.cc index a67267418d6..444aa8b9544 100644 --- a/tensorflow/core/ops/lookup_ops.cc +++ b/tensorflow/core/ops/lookup_ops.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/dataset_stateful_op_whitelist.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_def_builder.h" #include "tensorflow/core/framework/shape_inference.h" @@ -102,6 +103,8 @@ REGISTER_OP("LookupTableFindV2") c->set_output(0, c->UnknownShape()); return Status::OK(); }); +WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS("LookupTableFindV2"); +// TODO(b/72710477): Update this. REGISTER_OP("LookupTableInsert") .Input("table_handle: Ref(string)") diff --git a/tensorflow/core/ops/manip_ops.cc b/tensorflow/core/ops/manip_ops.cc new file mode 100644 index 00000000000..95b4774fe6e --- /dev/null +++ b/tensorflow/core/ops/manip_ops.cc @@ -0,0 +1,33 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +// -------------------------------------------------------------------------- +REGISTER_OP("Roll") + .Input("input: T") + .Input("shift: Tshift") + .Input("axis: Taxis") + .Output("output: T") + .Attr("T: type") + .Attr("Tshift: {int32,int64}") + .Attr("Taxis: {int32,int64}") + .SetShapeFn(shape_inference::UnchangedShape); + +} // namespace tensorflow diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index dd484c3ee75..872ebe98c1f 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -1172,12 +1172,12 @@ Status RangeSize(const Tensor* start_t, const Tensor* limit_t, T limit = limit_t->scalar()(); T delta = delta_t->scalar()(); if (start > limit && delta > 0) { - return errors::InvalidArgument("Requires start <= limit when delta > 0: ", - start, "/", limit); + return errors::InvalidArgument( + "Requires start <= limit when delta > 0: ", start, "/", limit); } if (start < limit && delta < 0) { - return errors::InvalidArgument("Requires start >= limit when delta < 0: ", - start, "/", limit); + return errors::InvalidArgument( + "Requires start >= limit when delta < 0: ", start, "/", limit); } if (delta == 0) { return errors::InvalidArgument("Requires delta != 0"); diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index 3f72b415699..67481fd202b 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -1155,9 +1155,9 @@ Status TopKShapeFn(InferenceContext* c) { DimensionHandle last_dim = c->Dim(input, -1); if (c->ValueKnown(last_dim) && c->ValueKnown(k_dim) && c->Value(last_dim) < c->Value(k_dim)) { - return errors::InvalidArgument("input must have last dimension >= k = ", - c->Value(k_dim), " but is ", - c->Value(last_dim)); + return errors::InvalidArgument( + "input must have last dimension >= k = ", c->Value(k_dim), " but is ", + c->Value(last_dim)); } // Replace last_dim with k_dim. 
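(Illustrative note, not part of the patch.) The new UnravelIndex kernel added earlier in this patch converts a flat index into per-dimension coordinates using an inclusive and an exclusive suffix product over `dims`. The following is a minimal standalone C++ sketch of that arithmetic for a single scalar index; the helper name, the use of `std::vector`, and fixing the index type to `int64_t` are assumptions made purely for illustration:

#include <cstdint>
#include <vector>

// out[i] = (flat_index % strides[i]) / strides_shifted[i], where
//   strides[i]         = dims[i] * dims[i+1] * ... * dims[n-1]
//   strides_shifted[i] =           dims[i+1] * ... * dims[n-1]
std::vector<int64_t> UnravelIndexSketch(int64_t flat_index,
                                        const std::vector<int64_t>& dims) {
  const int n = static_cast<int>(dims.size());
  std::vector<int64_t> strides(n), strides_shifted(n), out(n);
  int64_t prod = 1;
  for (int i = n - 1; i >= 0; --i) {
    strides_shifted[i] = prod;  // exclusive suffix product
    prod *= dims[i];
    strides[i] = prod;          // inclusive suffix product
  }
  for (int i = 0; i < n; ++i) {
    out[i] = (flat_index % strides[i]) / strides_shifted[i];
  }
  return out;
}

// For example, UnravelIndexSketch(37, {3, 4, 5}) yields {1, 3, 2},
// since 1 * 20 + 3 * 5 + 2 = 37.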
@@ -1211,9 +1211,9 @@ REGISTER_OP("NthElement") DimensionHandle last_dim = c->Dim(input, -1); if (c->ValueKnown(last_dim) && c->ValueKnown(n_dim) && c->Value(last_dim) <= c->Value(n_dim)) { - return errors::InvalidArgument("Input must have last dimension > n = ", - c->Value(n_dim), " but is ", - c->Value(last_dim)); + return errors::InvalidArgument( + "Input must have last dimension > n = ", c->Value(n_dim), + " but is ", c->Value(last_dim)); } // Reduce last_dim for output tensor @@ -1818,7 +1818,7 @@ REGISTER_OP("_MklMaxPool") .Input("input: T") .Input("mkl_input: uint8") .Output("output: T") -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML .Output("workspace: T") #else .Output("workspace: uint8") @@ -1844,7 +1844,7 @@ REGISTER_OP("_MklMaxPoolGrad") .Input("orig_input: T") .Input("orig_output: T") .Input("grad: T") -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML .Input("workspace: T") #else .Input("workspace: uint8") @@ -1916,7 +1916,7 @@ REGISTER_OP("_MklLRN") .Input("input: T") .Input("mkl_input: uint8") .Output("output: T") -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML .Output("workspace: T") #else .Output("workspace: uint8") @@ -1944,7 +1944,7 @@ REGISTER_OP("_MklLRNGrad") .Input("input_grads: T") .Input("input_image: T") .Input("output_image: T") -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML .Input("workspace: T") #else .Input("workspace: uint8") diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index b57206c9c4f..8df126735b5 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -7644,6 +7644,24 @@ op { type: DT_STRING } } +op { + name: "EnqueueInQueueDataset" + input_arg { + name: "queue" + type: DT_VARIANT + } + input_arg { + name: "components" + type_list_attr: "Tcomponents" + } + attr { + name: "Tcomponents" + type: "list(type)" + has_minimum: true + minimum: 1 + } + is_stateful: true +} op { name: "Enter" input_arg { @@ -12330,11 +12348,11 @@ op { } input_arg { name: "num_lower" - type: DT_INT64 + type_attr: "Tindex" } input_arg { name: "num_upper" - type: DT_INT64 + type_attr: "Tindex" } output_arg { name: "band" @@ -12344,6 +12362,19 @@ op { name: "T" type: "type" } + attr { + name: "Tindex" + type: "type" + default_value { + type: DT_INT64 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } } op { name: "MatrixDeterminant" @@ -15926,6 +15957,48 @@ op { minimum: 1 } } +op { + name: "PrependFromQueueAndPaddedBatchDataset" + input_arg { + name: "input_dataset" + type: DT_VARIANT + } + input_arg { + name: "batch_size" + type: DT_INT64 + } + input_arg { + name: "padded_shapes" + type: DT_INT64 + number_attr: "N" + } + input_arg { + name: "padding_values" + type_list_attr: "Toutput_types" + } + output_arg { + name: "handle" + type: DT_VARIANT + } + attr { + name: "Toutput_types" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "output_shapes" + type: "list(shape)" + has_minimum: true + minimum: 1 + } + attr { + name: "N" + type: "int" + has_minimum: true + minimum: 1 + } +} op { name: "PreventGradient" input_arg { @@ -20925,27 +20998,6 @@ op { attr { name: "dtype" type: "type" - allowed_values { - list { - type: DT_FLOAT - type: DT_DOUBLE - type: DT_INT32 - type: DT_UINT8 - type: DT_INT16 - type: DT_INT8 - type: DT_COMPLEX64 - type: DT_INT64 - type: DT_QINT8 - type: DT_QUINT8 - type: DT_QINT32 - type: DT_BFLOAT16 - type: DT_UINT16 - type: DT_COMPLEX128 - type: DT_HALF - type: DT_UINT32 - type: DT_UINT64 - } - } } attr { name: "Tindices" diff --git 
a/tensorflow/core/ops/resource_variable_ops.cc b/tensorflow/core/ops/resource_variable_ops.cc index f6cfbf873a0..8dae7e1ff5f 100644 --- a/tensorflow/core/ops/resource_variable_ops.cc +++ b/tensorflow/core/ops/resource_variable_ops.cc @@ -193,7 +193,7 @@ REGISTER_OP("ResourceScatterUpdate") .Input("resource: resource") .Input("indices: Tindices") .Input("updates: dtype") - .Attr("dtype: numbertype") + .Attr("dtype: type") .Attr("Tindices: {int32, int64}") .SetShapeFn([](InferenceContext* c) { ShapeAndType handle_shape_and_type; diff --git a/tensorflow/core/ops/sdca_ops.cc b/tensorflow/core/ops/sdca_ops.cc index e67d95fa8cb..4025070adb2 100644 --- a/tensorflow/core/ops/sdca_ops.cc +++ b/tensorflow/core/ops/sdca_ops.cc @@ -19,8 +19,8 @@ limitations under the License. namespace tensorflow { -using shape_inference::ShapeHandle; using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; // -------------------------------------------------------------------------- static Status ApplySdcaOptimizerShapeFn(InferenceContext* c) { diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc index 8beb28de0a2..e4c5bcfb540 100644 --- a/tensorflow/core/ops/string_ops.cc +++ b/tensorflow/core/ops/string_ops.cc @@ -137,9 +137,9 @@ REGISTER_OP("Substr") DimensionHandle pos_dim = c->Dim(pos_shape, i); DimensionHandle len_dim = c->Dim(len_shape, i); if (c->Value(pos_dim) != c->Value(len_dim)) { - return errors::InvalidArgument("pos and len shapes must match: ", - c->DebugString(pos_shape), " vs. ", - c->DebugString(len_shape)); + return errors::InvalidArgument( + "pos and len shapes must match: ", c->DebugString(pos_shape), + " vs. ", c->DebugString(len_shape)); } } // c->input(0) is the ShapeHandle to input strings diff --git a/tensorflow/core/ops/training_ops_test.cc b/tensorflow/core/ops/training_ops_test.cc index de4e3cd9e70..0f309c1f4e9 100644 --- a/tensorflow/core/ops/training_ops_test.cc +++ b/tensorflow/core/ops/training_ops_test.cc @@ -24,7 +24,7 @@ static void TestGradAndIndicesErrorHandling(const ShapeInferenceTestOp& op, string shape_spec_middle, const string& shape_spec_end = "") { auto shape_spec = [&shape_spec_middle, shape_spec_end]( - const char* var_spec, const char* grad_indices_spec) { + const char* var_spec, const char* grad_indices_spec) { return strings::StrCat(var_spec, ";", shape_spec_middle, ";", grad_indices_spec, shape_spec_end); }; diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD index 07aecf84832..9ba25dea4fb 100644 --- a/tensorflow/core/platform/cloud/BUILD +++ b/tensorflow/core/platform/cloud/BUILD @@ -57,6 +57,17 @@ cc_library( ], ) +cc_library( + name = "gcs_throttle", + srcs = ["gcs_throttle.cc"], + hdrs = ["gcs_throttle.h"], + copts = tf_copts(), + visibility = ["//tensorflow:__subpackages__"], + deps = [ + "//tensorflow/core:lib", + ], +) + cc_library( name = "gcs_file_system", srcs = ["gcs_file_system.cc"], @@ -69,6 +80,7 @@ cc_library( ":expiring_lru_cache", ":file_block_cache", ":gcs_dns_cache", + ":gcs_throttle", ":google_auth_provider", ":http_request", ":retrying_file_system", @@ -271,6 +283,19 @@ tf_cc_test( ], ) +tf_cc_test( + name = "gcs_throttle_test", + size = "small", + srcs = ["gcs_throttle_test.cc"], + linkopts = if_windows(["-DEFAULTLIB:ws2_32.lib"]), + deps = [ + ":gcs_throttle", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + tf_cc_test( name = "curl_http_request_test", size = "small", diff --git 
a/tensorflow/core/platform/cloud/file_block_cache.cc b/tensorflow/core/platform/cloud/file_block_cache.cc index 0375af516b0..6add1142a15 100644 --- a/tensorflow/core/platform/cloud/file_block_cache.cc +++ b/tensorflow/core/platform/cloud/file_block_cache.cc @@ -131,6 +131,7 @@ Status FileBlockCache::MaybeFetch(const Key& key, block->mu.lock(); // Reacquire the lock immediately afterwards if (status.ok()) { block->data.resize(bytes_transferred, 0); + block->data.shrink_to_fit(); downloaded_block = true; block->state = FetchState::FINISHED; } else { diff --git a/tensorflow/core/platform/cloud/gcs_dns_cache.cc b/tensorflow/core/platform/cloud/gcs_dns_cache.cc index 2b0e55bf371..4d9aff4d24f 100644 --- a/tensorflow/core/platform/cloud/gcs_dns_cache.cc +++ b/tensorflow/core/platform/cloud/gcs_dns_cache.cc @@ -18,9 +18,9 @@ limitations under the License. #include #include #else +#include #include #include -#include #endif #include diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc index 520720372d9..01ca0d76bab 100644 --- a/tensorflow/core/platform/cloud/gcs_file_system.cc +++ b/tensorflow/core/platform/cloud/gcs_file_system.cc @@ -50,7 +50,6 @@ limitations under the License. #endif namespace tensorflow { - namespace { constexpr char kGcsUriBase[] = "https://www.googleapis.com/storage/v1/"; @@ -59,9 +58,6 @@ constexpr char kGcsUploadUriBase[] = constexpr char kStorageHost[] = "storage.googleapis.com"; constexpr size_t kReadAppendableFileBufferSize = 1024 * 1024; // In bytes. constexpr int kGetChildrenDefaultPageSize = 1000; -// Initial delay before retrying a GCS upload. -// Subsequent delays can be larger due to exponential back-off. -constexpr uint64 kUploadRetryDelayMicros = 1000000L; // The HTTP response code "308 Resume Incomplete". constexpr uint64 HTTP_CODE_RESUME_INCOMPLETE = 308; // The environment variable that overrides the size of the readahead buffer. @@ -120,6 +116,15 @@ constexpr char kWriteRequestTimeout[] = "GCS_WRITE_REQUEST_TIMEOUT_SECS"; // The environment variable to configure an additional header to send with // all requests to GCS (format HEADERNAME:HEADERCONTENT) constexpr char kAdditionalRequestHeader[] = "GCS_ADDITIONAL_REQUEST_HEADER"; +// The environment variable to configure the throttle (format: ) +constexpr char kThrottleRate[] = "GCS_THROTTLE_TOKEN_RATE"; +// The environment variable to configure the token bucket size (format: ) +constexpr char kThrottleBucket[] = "GCS_THROTTLE_BUCKET_SIZE"; +// The environment variable that controls the number of tokens per request. 
+// (format: ) +constexpr char kTokensPerRequest[] = "GCS_TOKENS_PER_REQUEST"; +// The environment variable to configure the initial tokens (format: ) +constexpr char kInitialTokens[] = "GCS_INITIAL_TOKENS"; // TODO: DO NOT use a hardcoded path Status GetTmpFilename(string* filename) { @@ -725,6 +730,26 @@ GcsFileSystem::GcsFileSystem() if (GetEnvVar(kWriteRequestTimeout, strings::safe_strtou32, &timeout_value)) { timeouts_.write = timeout_value; } + + int64 token_value; + if (GetEnvVar(kThrottleRate, strings::safe_strto64, &token_value)) { + GcsThrottleConfig config; + config.enabled = true; + config.token_rate = token_value; + + if (GetEnvVar(kThrottleBucket, strings::safe_strto64, &token_value)) { + config.bucket_size = token_value; + } + + if (GetEnvVar(kTokensPerRequest, strings::safe_strto64, &token_value)) { + config.tokens_per_request = token_value; + } + + if (GetEnvVar(kInitialTokens, strings::safe_strto64, &token_value)) { + config.initial_tokens = token_value; + } + throttle_.SetConfig(config); + } } GcsFileSystem::GcsFileSystem( @@ -778,7 +803,9 @@ Status GcsFileSystem::LoadBufferFromGCS(const string& filename, size_t offset, TF_RETURN_IF_ERROR(ParseGcsPath(filename, false, &bucket, &object)); std::unique_ptr request; - TF_RETURN_IF_ERROR(CreateHttpRequest(&request)); + TF_RETURN_WITH_CONTEXT_IF_ERROR(CreateHttpRequest(&request), + "when reading gs://", bucket, "/", object); + request->SetUri(strings::StrCat("https://", kStorageHost, "/", bucket, "/", request->EscapeString(object))); request->SetRange(offset, offset + n - 1); @@ -793,6 +820,8 @@ Status GcsFileSystem::LoadBufferFromGCS(const string& filename, size_t offset, VLOG(1) << "Successful read of gs://" << bucket << "/" << object << " @ " << offset << " of size: " << bytes_read; + throttle_.RecordResponse(bytes_read); + if (bytes_read < block_size()) { // Check stat cache to see if we encountered an interrupted read. FileStatistics stat; @@ -930,41 +959,43 @@ Status GcsFileSystem::StatForObject(const string& fname, const string& bucket, "'object' must be a non-empty string. (File: %s)", fname.c_str())); } - StatCache::ComputeFunc compute_func = - [this, &bucket, &object](const string& fname, FileStatistics* stat) { - std::vector output_buffer; - std::unique_ptr request; - TF_RETURN_IF_ERROR(CreateHttpRequest(&request)); - request->SetUri(strings::StrCat(kGcsUriBase, "b/", bucket, "/o/", - request->EscapeString(object), - "?fields=size%2Cupdated")); - request->SetResultBuffer(&output_buffer); - request->SetTimeouts(timeouts_.connect, timeouts_.idle, - timeouts_.metadata); + StatCache::ComputeFunc compute_func = [this, &bucket, &object]( + const string& fname, + FileStatistics* stat) { + std::vector output_buffer; + std::unique_ptr request; + TF_RETURN_WITH_CONTEXT_IF_ERROR(CreateHttpRequest(&request), + " when reading metadata of gs://", bucket, + "/", object); - TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), - " when reading metadata of gs://", - bucket, "/", object); + request->SetUri(strings::StrCat(kGcsUriBase, "b/", bucket, "/o/", + request->EscapeString(object), + "?fields=size%2Cupdated")); + request->SetResultBuffer(&output_buffer); + request->SetTimeouts(timeouts_.connect, timeouts_.idle, timeouts_.metadata); - Json::Value root; - TF_RETURN_IF_ERROR(ParseJson(output_buffer, &root)); + TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), + " when reading metadata of gs://", bucket, + "/", object); - // Parse file size. 
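The constants and constructor changes above wire four environment variables (GCS_THROTTLE_TOKEN_RATE, GCS_THROTTLE_BUCKET_SIZE, GCS_TOKENS_PER_REQUEST, GCS_INITIAL_TOKENS) into a GcsThrottleConfig. A minimal standalone sketch of that mapping follows; std::getenv/std::stoll stand in for GetEnvVar/safe_strto64, the struct is a stand-in for tensorflow::GcsThrottleConfig with its defaults copied from gcs_throttle.h, and this is an illustration only, not the constructor itself.

#include <cstdlib>
#include <string>

// Stand-in for tensorflow::GcsThrottleConfig; defaults mirror gcs_throttle.h.
struct ThrottleConfigSketch {
  bool enabled = false;
  long long token_rate = 100000;
  long long bucket_size = 10000000;
  long long tokens_per_request = 100;
  long long initial_tokens = 0;
};

// Throttling only turns on when GCS_THROTTLE_TOKEN_RATE is set; the other
// variables are consulted only in that case, matching the constructor above.
ThrottleConfigSketch ThrottleConfigFromEnv() {
  ThrottleConfigSketch config;
  if (const char* rate = std::getenv("GCS_THROTTLE_TOKEN_RATE")) {
    config.enabled = true;
    config.token_rate = std::stoll(rate);
    if (const char* bucket = std::getenv("GCS_THROTTLE_BUCKET_SIZE"))
      config.bucket_size = std::stoll(bucket);
    if (const char* per_request = std::getenv("GCS_TOKENS_PER_REQUEST"))
      config.tokens_per_request = std::stoll(per_request);
    if (const char* initial = std::getenv("GCS_INITIAL_TOKENS"))
      config.initial_tokens = std::stoll(initial);
  }
  return config;
}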
- TF_RETURN_IF_ERROR(GetInt64Value(root, "size", &stat->length)); + Json::Value root; + TF_RETURN_IF_ERROR(ParseJson(output_buffer, &root)); - // Parse file modification time. - string updated; - TF_RETURN_IF_ERROR(GetStringValue(root, "updated", &updated)); - TF_RETURN_IF_ERROR(ParseRfc3339Time(updated, &(stat->mtime_nsec))); + // Parse file size. + TF_RETURN_IF_ERROR(GetInt64Value(root, "size", &stat->length)); - VLOG(1) << "Stat of: gs://" << bucket << "/" << object << " -- " - << " length: " << stat->length - << "; mtime_nsec: " << stat->mtime_nsec - << "; updated: " << updated; + // Parse file modification time. + string updated; + TF_RETURN_IF_ERROR(GetStringValue(root, "updated", &updated)); + TF_RETURN_IF_ERROR(ParseRfc3339Time(updated, &(stat->mtime_nsec))); - stat->is_directory = false; - return Status::OK(); - }; + VLOG(1) << "Stat of: gs://" << bucket << "/" << object << " -- " + << " length: " << stat->length + << "; mtime_nsec: " << stat->mtime_nsec << "; updated: " << updated; + + stat->is_directory = false; + return Status::OK(); + }; TF_RETURN_IF_ERROR(stat_cache_->LookupOrCompute(fname, stat, compute_func)); if (stat->is_directory) { @@ -1442,6 +1473,10 @@ Status GcsFileSystem::CreateHttpRequest(std::unique_ptr* request) { additional_header_->second); } + if (!throttle_.AdmitRequest()) { + return errors::Unavailable("Request throttled"); + } + *request = std::move(new_request); return Status::OK(); } diff --git a/tensorflow/core/platform/cloud/gcs_file_system.h b/tensorflow/core/platform/cloud/gcs_file_system.h index 2eae39608e3..e8edde8a445 100644 --- a/tensorflow/core/platform/cloud/gcs_file_system.h +++ b/tensorflow/core/platform/cloud/gcs_file_system.h @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/core/platform/cloud/expiring_lru_cache.h" #include "tensorflow/core/platform/cloud/file_block_cache.h" #include "tensorflow/core/platform/cloud/gcs_dns_cache.h" +#include "tensorflow/core/platform/cloud/gcs_throttle.h" #include "tensorflow/core/platform/cloud/http_request.h" #include "tensorflow/core/platform/cloud/retrying_file_system.h" #include "tensorflow/core/platform/file_system.h" @@ -194,6 +195,7 @@ class GcsFileSystem : public FileSystem { std::unique_ptr http_request_factory_; std::unique_ptr file_block_cache_; std::unique_ptr dns_cache_; + GcsThrottle throttle_; using StatCache = ExpiringLRUCache; std::unique_ptr stat_cache_; diff --git a/tensorflow/core/platform/cloud/gcs_throttle.cc b/tensorflow/core/platform/cloud/gcs_throttle.cc new file mode 100644 index 00000000000..eb5f8958a37 --- /dev/null +++ b/tensorflow/core/platform/cloud/gcs_throttle.cc @@ -0,0 +1,62 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/platform/cloud/gcs_throttle.h" + +#include + +namespace tensorflow { + +GcsThrottle::GcsThrottle(EnvTime* env_time) + : last_updated_secs_(env_time->NowSeconds()), + available_tokens_(0), + env_time_(env_time) {} + +bool GcsThrottle::AdmitRequest() { + mutex_lock l(mu_); + if (!config_.enabled) return true; + UpdateState(); + if (available_tokens_ < config_.tokens_per_request) { + return false; + } + available_tokens_ -= config_.tokens_per_request; + return true; +} + +void GcsThrottle::RecordResponse(size_t num_bytes) { + mutex_lock l(mu_); + if (!config_.enabled) return; + UpdateState(); + available_tokens_ -= request_bytes_to_tokens(num_bytes); +} + +void GcsThrottle::SetConfig(GcsThrottleConfig config) { + mutex_lock l(mu_); + config_ = config; + available_tokens_ = config.initial_tokens; + last_updated_secs_ = env_time_->NowSeconds(); +} + +void GcsThrottle::UpdateState() { + // TODO(b/72643279): Switch to a monotonic clock. + int64 now = env_time_->NowSeconds(); + uint64 delta_secs = + std::max(0LL, now - static_cast(last_updated_secs_)); + available_tokens_ += delta_secs * config_.token_rate; + available_tokens_ = std::min(available_tokens_, config_.bucket_size); + last_updated_secs_ = now; +} + +} // namespace tensorflow diff --git a/tensorflow/core/platform/cloud/gcs_throttle.h b/tensorflow/core/platform/cloud/gcs_throttle.h new file mode 100644 index 00000000000..1a89daef084 --- /dev/null +++ b/tensorflow/core/platform/cloud/gcs_throttle.h @@ -0,0 +1,156 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_THROTTLE_H_ +#define TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_THROTTLE_H_ + +#include "tensorflow/core/platform/env.h" + +namespace tensorflow { + +/** + * GcsThrottleConfig is used to configure the GcsThrottle. + */ +struct GcsThrottleConfig { + /** + * enabled is true if GcsThrottle should throttle requests, false otherwise. + */ + bool enabled = false; + + /** + * token_rate is the number of tokens accrued every second that can be used + * for making requests to the GCS service. + */ + int64 token_rate = 100000; // Approximately 800 MBits/second bandwidth-only. + + /** + * bucket_size is the maximum number of available tokens the GcsThrottle can + * accrue. + */ + int64 bucket_size = 10000000; // 10 million tokens total + + /** + * tokens_per_request determines the number of tokens consumed for every + * request. + * + * Note: tokens are also consumed in proportion to the response size. + */ + int64 tokens_per_request = 100; + + /** + * initial_tokens determines how many tokens should be available immediately + * after the GcsThrottle is constructed. + */ + int64 initial_tokens = 0; +}; + +/** + * GcsThrottle is used to ensure fair use of the available GCS capacity. 
+ * + * GcsThrottle operates around a concept of tokens. Tokens are consumed when + * making requests to the GCS service. Tokens are consumed both based on the + * number of requests made, as well as the bandwidth consumed (response sizes). + * + * GcsThrottle is thread safe and can be used from multiple threads. + */ +class GcsThrottle { + public: + /** + * Constructs a GcsThrottle. + */ + explicit GcsThrottle(EnvTime* env_time = EnvTime::Default()); + + /** + * AdmitRequest updates the GcsThrottle to record a request will be made. + * + * AdmitRequest should be called before any request is made. AdmitRequest + * returns false if the request should be denied. If AdmitRequest + * returns false, no tokens are consumed. If true is returned, the configured + * number of tokens are consumed. + */ + bool AdmitRequest(); + + /** + * RecordResponse updates the GcsThrottle to record a request has been made. + * + * RecordResponse should be called after the response has been received. + * RecordResponse will update the internal state based on the number of bytes + * in the response. + * + * Note: we split up the request and the response in this fashion in order to + * avoid penalizing consumers who are using large readahead buffers at higher + * layers of the I/O stack. + */ + void RecordResponse(size_t num_bytes); + + /** + * SetConfig sets the configuration for GcsThrottle and re-initializes state. + * + * After calling this, the token pool will be config.initial_tokens. + */ + void SetConfig(GcsThrottleConfig config); + + /** + * available_tokens gives a snapshot of how many tokens are available. + * + * The returned value should not be used to make admission decisions. The + * purpose of this function is to make available to monitoring or other + * instrumentation the number of available tokens in the pool. + */ + inline int64 available_tokens() { + mutex_lock l(mu_); + if (!config_.enabled) return 0; + UpdateState(); + return available_tokens_; + } + + private: + /** + * UpdateState updates the available_tokens_ and last_updated_secs_ variables. + * + * UpdateState should be called in order to mark the passage of time, and + * therefore add tokens to the availble_tokens_ pool. + */ + void UpdateState() EXCLUSIVE_LOCKS_REQUIRED(mu_); + + inline uint64 request_bytes_to_tokens(size_t num_bytes) { + return num_bytes >> 10; + } + + mutex mu_; + + /** + * last_updated_secs_ records the number of seconds since the Unix epoch that + * the internal state of the GcsThrottle was updated. This is important when + * determining the number of tokens to add to the available_tokens_ pool. + */ + uint64 last_updated_secs_ GUARDED_BY(mu_) = 0; + + /** + * available_tokens_ records how many tokens are available to be consumed. + * + * Note: it is possible for available_tokens_ to become negative. If a + * response comes back that consumes more than the available tokens, the count + * will go negative, and block future requests until we have available tokens. + */ + int64 available_tokens_ GUARDED_BY(mu_) = 0; + + EnvTime* const env_time_; + GcsThrottleConfig config_ GUARDED_BY(mu_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_THROTTLE_H_ diff --git a/tensorflow/core/platform/cloud/gcs_throttle_test.cc b/tensorflow/core/platform/cloud/gcs_throttle_test.cc new file mode 100644 index 00000000000..694756022e3 --- /dev/null +++ b/tensorflow/core/platform/cloud/gcs_throttle_test.cc @@ -0,0 +1,101 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/platform/cloud/gcs_throttle.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { + +namespace { + +class TestTime : public EnvTime { + public: + uint64 NowMicros() override { return now_; } + + void SetTime(uint64 now_micros) { now_ = now_micros; } + + void AdvanceSeconds(int64 secs) { now_ += secs * 1000000L; } + + private: + uint64 now_ = 1234567890000000ULL; +}; + +class GcsThrottleTest : public ::testing::Test { + protected: + GcsThrottleTest() : throttle_(&time_) { + config_.enabled = true; + throttle_.SetConfig(config_); + } + + GcsThrottleConfig config_; + TestTime time_; + GcsThrottle throttle_; +}; + +TEST_F(GcsThrottleTest, ReplenishTokens) { + EXPECT_EQ(0, throttle_.available_tokens()); + time_.AdvanceSeconds(1); + EXPECT_EQ(100000, throttle_.available_tokens()); + time_.AdvanceSeconds(2); + EXPECT_EQ(300000, throttle_.available_tokens()); +} + +TEST_F(GcsThrottleTest, RejectRequest) { + EXPECT_EQ(0, throttle_.available_tokens()); + time_.AdvanceSeconds(1); + EXPECT_TRUE(throttle_.AdmitRequest()); + EXPECT_EQ(99900, throttle_.available_tokens()); + for (int i = 1; i < 1000; i++) { + EXPECT_TRUE(throttle_.AdmitRequest()); + } + EXPECT_FALSE(throttle_.AdmitRequest()); +} + +TEST_F(GcsThrottleTest, MarkResponses) { + time_.AdvanceSeconds(1); + EXPECT_TRUE(throttle_.AdmitRequest()); + throttle_.RecordResponse(128000000); // 128 MB response + EXPECT_EQ(-25100, throttle_.available_tokens()); + EXPECT_FALSE(throttle_.AdmitRequest()); + time_.AdvanceSeconds(1); + EXPECT_TRUE(throttle_.AdmitRequest()) + << "Available tokens: " << throttle_.available_tokens(); +} + +TEST_F(GcsThrottleTest, Skippingtime_) { + EXPECT_EQ(0, throttle_.available_tokens()); + time_.AdvanceSeconds(90); + EXPECT_EQ(9000000, throttle_.available_tokens()); +} + +TEST_F(GcsThrottleTest, BucketLimit) { + time_.AdvanceSeconds(120); + EXPECT_EQ(10000000, throttle_.available_tokens()); +} + +TEST_F(GcsThrottleTest, ReverseTime) { + time_.AdvanceSeconds(1); + EXPECT_EQ(100000, throttle_.available_tokens()); + time_.AdvanceSeconds(-3600); + EXPECT_EQ(100000, throttle_.available_tokens()); + time_.AdvanceSeconds(1); + EXPECT_EQ(200000, throttle_.available_tokens()); +} + +} // namespace + +} // namespace tensorflow diff --git a/tensorflow/core/platform/cloud/http_request_fake.h b/tensorflow/core/platform/cloud/http_request_fake.h index 682b97f6ec6..7711eaceb29 100644 --- a/tensorflow/core/platform/cloud/http_request_fake.h +++ b/tensorflow/core/platform/cloud/http_request_fake.h @@ -38,8 +38,7 @@ class FakeHttpRequest : public CurlHttpRequest { public: /// Return the response for the given request. 
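To make the test expectations above easier to check by hand: RecordResponse charges one token per KiB of payload (num_bytes >> 10), AdmitRequest charges a flat tokens_per_request, and UpdateState adds token_rate tokens per elapsed second, capped at bucket_size and never subtracted for backwards time (hence the ReverseTime test). With the defaults from gcs_throttle.h, token_rate = 100000 tokens/s is about 100000 KiB/s, roughly 820 Mbit/s, which is where the "approximately 800 MBits/second" comment comes from. The standalone snippet below reproduces the MarkResponses and BucketLimit numbers; it is an illustration of the arithmetic, not code from the patch.

#include <cassert>
#include <cstdint>

int main() {
  // Defaults from GcsThrottleConfig.
  const int64_t token_rate = 100000;        // tokens replenished per second
  const int64_t tokens_per_request = 100;   // flat cost of AdmitRequest()
  const int64_t bucket_size = 10000000;     // cap on accumulated tokens

  int64_t available = 0;                    // initial_tokens defaults to 0
  available += 1 * token_rate;              // time_.AdvanceSeconds(1)
  available -= tokens_per_request;          // AdmitRequest()          -> 99900
  available -= int64_t{128000000} >> 10;    // RecordResponse(128 MB)   = 125000 tokens
  assert(available == -25100);              // matches the MarkResponses test

  // BucketLimit: 120 s of replenishment is clamped to bucket_size.
  int64_t refilled = 120 * token_rate;
  if (refilled > bucket_size) refilled = bucket_size;
  assert(refilled == 10000000);
  return 0;
}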
FakeHttpRequest(const string& request, const string& response) - : FakeHttpRequest(request, response, Status::OK(), nullptr, {}, 200) { - } + : FakeHttpRequest(request, response, Status::OK(), nullptr, {}, 200) {} /// Return the response with headers for the given request. FakeHttpRequest(const string& request, const string& response, diff --git a/tensorflow/core/platform/cloud/oauth_client_test.cc b/tensorflow/core/platform/cloud/oauth_client_test.cc index 236259dbc16..ad569758cc6 100644 --- a/tensorflow/core/platform/cloud/oauth_client_test.cc +++ b/tensorflow/core/platform/cloud/oauth_client_test.cc @@ -160,12 +160,12 @@ TEST(OAuthClientTest, GetTokenFromServiceAccountJson) { ASSERT_EQ(1, EVP_DigestVerifyInit(md_ctx, nullptr, md, nullptr, key)); ASSERT_EQ(1, EVP_DigestVerifyUpdate(md_ctx, header_dot_claim.c_str(), header_dot_claim.size())); - ASSERT_EQ( - 1, - EVP_DigestVerifyFinal( - md_ctx, const_cast( - reinterpret_cast(signature.data())), - signature.size())); + ASSERT_EQ(1, + EVP_DigestVerifyFinal( + md_ctx, + const_cast( + reinterpret_cast(signature.data())), + signature.size())); EVP_MD_CTX_cleanup(md_ctx); // Free all the crypto-related resources. diff --git a/tensorflow/core/platform/cloud/retrying_file_system.cc b/tensorflow/core/platform/cloud/retrying_file_system.cc index c3b68313613..be9ebe67b18 100644 --- a/tensorflow/core/platform/cloud/retrying_file_system.cc +++ b/tensorflow/core/platform/cloud/retrying_file_system.cc @@ -25,7 +25,6 @@ namespace tensorflow { namespace { - class RetryingRandomAccessFile : public RandomAccessFile { public: RetryingRandomAccessFile(std::unique_ptr base_file, @@ -203,4 +202,6 @@ Status RetryingFileSystem::DeleteRecursively(const string& dirname, initial_delay_microseconds_); } +void RetryingFileSystem::FlushCaches() { base_file_system_->FlushCaches(); } + } // namespace tensorflow diff --git a/tensorflow/core/platform/cloud/retrying_file_system.h b/tensorflow/core/platform/cloud/retrying_file_system.h index d9d8ea6b004..a262a5fd940 100644 --- a/tensorflow/core/platform/cloud/retrying_file_system.h +++ b/tensorflow/core/platform/cloud/retrying_file_system.h @@ -69,6 +69,8 @@ class RetryingFileSystem : public FileSystem { Status DeleteRecursively(const string& dirname, int64* undeleted_files, int64* undeleted_dirs) override; + void FlushCaches() override; + private: std::unique_ptr base_file_system_; const int64 initial_delay_microseconds_; diff --git a/tensorflow/core/platform/cloud/retrying_file_system_test.cc b/tensorflow/core/platform/cloud/retrying_file_system_test.cc index 232dcb3e71a..d3f763bb3c8 100644 --- a/tensorflow/core/platform/cloud/retrying_file_system_test.cc +++ b/tensorflow/core/platform/cloud/retrying_file_system_test.cc @@ -84,7 +84,8 @@ class MockWritableFile : public WritableFile { class MockFileSystem : public FileSystem { public: - explicit MockFileSystem(const ExpectedCalls& calls) : calls_(calls) {} + explicit MockFileSystem(const ExpectedCalls& calls, bool* flushed = nullptr) + : calls_(calls), flushed_(flushed) {} Status NewRandomAccessFile( const string& fname, std::unique_ptr* result) override { @@ -156,11 +157,18 @@ class MockFileSystem : public FileSystem { return calls_.ConsumeNextCall("DeleteRecursively"); } + void FlushCaches() override { + if (flushed_) { + *flushed_ = true; + } + } + std::unique_ptr writable_file_to_return; std::unique_ptr random_access_file_to_return; private: MockCallSequence calls_; + bool* flushed_ = nullptr; }; TEST(RetryingFileSystemTest, NewRandomAccessFile_ImmediateSuccess) { @@ 
-702,5 +710,14 @@ TEST(RetryingFileSystemTest, DeleteRecursively_AllRetriesFailed) { << status; } +TEST(RetryingFileSystemTest, FlushCaches) { + ExpectedCalls none; + bool flushed = false; + std::unique_ptr base_fs(new MockFileSystem(none, &flushed)); + RetryingFileSystem fs(std::move(base_fs), 0); + fs.FlushCaches(); + EXPECT_TRUE(flushed); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/core/platform/cpu_feature_guard.cc b/tensorflow/core/platform/cpu_feature_guard.cc index b0d7b3a67ae..7caf9d4db64 100644 --- a/tensorflow/core/platform/cpu_feature_guard.cc +++ b/tensorflow/core/platform/cpu_feature_guard.cc @@ -97,14 +97,17 @@ std::once_flag g_cpu_feature_guard_warn_once_flag; void InfoAboutUnusedCPUFeatures() { std::call_once(g_cpu_feature_guard_warn_once_flag, [] { string missing_instructions; -#ifdef PLATFORM_WINDOWS +#if defined(_MSC_VER) && !defined(__clang__) + #ifndef __AVX__ CheckIfFeatureUnused(CPUFeature::AVX, "AVX", missing_instructions); #endif // __AVX__ #ifndef __AVX2__ CheckIfFeatureUnused(CPUFeature::AVX2, "AVX2", missing_instructions); #endif // __AVX2__ -#else // ifdef platform windows + +#else // if defined(_MSC_VER) && !defined(__clang__) + #ifndef __SSE__ CheckIfFeatureUnused(CPUFeature::SSE, "SSE", missing_instructions); #endif // __SSE__ @@ -132,7 +135,7 @@ void InfoAboutUnusedCPUFeatures() { #ifndef __FMA__ CheckIfFeatureUnused(CPUFeature::FMA, "FMA", missing_instructions); #endif // __FMA__ -#endif // else of ifdef platform windows +#endif // else of if defined(_MSC_VER) && !defined(__clang__) if (!missing_instructions.empty()) { LOG(INFO) << "Your CPU supports instructions that this TensorFlow " << "binary was not compiled to use:" << missing_instructions; diff --git a/tensorflow/core/platform/cuda_libdevice_path_test.cc b/tensorflow/core/platform/cuda_libdevice_path_test.cc index 639f6804ea2..2d34239a995 100644 --- a/tensorflow/core/platform/cuda_libdevice_path_test.cc +++ b/tensorflow/core/platform/cuda_libdevice_path_test.cc @@ -27,8 +27,7 @@ TEST(CudaLibdevicePathTest, LibdevicePath) { VLOG(2) << "Libdevice root = " << LibdeviceRoot(); std::vector libdevice_files; TF_EXPECT_OK(Env::Default()->GetMatchingPaths( - io::JoinPath(LibdeviceRoot(), "libdevice.*.bc"), - &libdevice_files)); + io::JoinPath(LibdeviceRoot(), "libdevice.*.bc"), &libdevice_files)); EXPECT_LT(0, libdevice_files.size()); } #endif diff --git a/tensorflow/core/platform/default/device_tracer.cc b/tensorflow/core/platform/default/device_tracer.cc index f4b0f16393d..8e60a7f0910 100644 --- a/tensorflow/core/platform/default/device_tracer.cc +++ b/tensorflow/core/platform/default/device_tracer.cc @@ -579,8 +579,10 @@ Status DeviceTracerImpl::Collect(StepStatsCollector *collector) { // TODO(pbar) Handle device IDs and prefix properly. 
const string prefix = ""; const int id = 0; - const string stream_device = strings::StrCat(prefix, "/device:GPU:", id, "/stream:"); - const string memcpy_device = strings::StrCat(prefix, "/device:GPU:", id, "/memcpy"); + const string stream_device = + strings::StrCat(prefix, "/device:GPU:", id, "/stream:"); + const string memcpy_device = + strings::StrCat(prefix, "/device:GPU:", id, "/memcpy"); mutex_lock l2(trace_mu_); for (const auto &rec : kernel_records_) { diff --git a/tensorflow/core/platform/default/logging.cc b/tensorflow/core/platform/default/logging.cc index 82bd69f9ca4..2b874da1981 100644 --- a/tensorflow/core/platform/default/logging.cc +++ b/tensorflow/core/platform/default/logging.cc @@ -83,15 +83,14 @@ void LogMessage::GenerateLogMessage() { const size_t time_buffer_size = 30; char time_buffer[time_buffer_size]; strftime(time_buffer, time_buffer_size, "%Y-%m-%d %H:%M:%S", - localtime(&now_seconds)); + localtime(&now_seconds)); // TODO(jeff,sanjay): Replace this with something that logs through the env. fprintf(stderr, "%s.%06d: %c %s:%d] %s\n", time_buffer, micros_remainder, - "IWEF"[severity_], fname_, line_, str().c_str()); + "IWEF"[severity_], fname_, line_, str().c_str()); } #endif - namespace { // Parse log level (int64) from environment variable (char*) diff --git a/tensorflow/core/platform/default/logging.h b/tensorflow/core/platform/default/logging.h index 40c260f2366..f0efa31d557 100644 --- a/tensorflow/core/platform/default/logging.h +++ b/tensorflow/core/platform/default/logging.h @@ -19,8 +19,8 @@ limitations under the License. // IWYU pragma: private, include "third_party/tensorflow/core/platform/logging.h" // IWYU pragma: friend third_party/tensorflow/core/platform/logging.h -#include #include +#include #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -205,16 +205,18 @@ string* MakeCheckOpString(const T1& v1, const T2& v2, const char* exprtext) { inline string* name##Impl(int v1, int v2, const char* exprtext) { \ return name##Impl(v1, v2, exprtext); \ } \ - inline string* name##Impl(const size_t v1, const int v2, const char* exprtext) { \ + inline string* name##Impl(const size_t v1, const int v2, \ + const char* exprtext) { \ if (TF_PREDICT_FALSE(v2 < 0)) { \ - return ::tensorflow::internal::MakeCheckOpString(v1, v2, exprtext);\ + return ::tensorflow::internal::MakeCheckOpString(v1, v2, exprtext); \ } \ const size_t uval = (size_t)((unsigned)v1); \ return name##Impl(uval, v2, exprtext); \ } \ - inline string* name##Impl(const int v1, const size_t v2, const char* exprtext) { \ - if (TF_PREDICT_FALSE(v2 >= std::numeric_limits::max())) { \ - return ::tensorflow::internal::MakeCheckOpString(v1, v2, exprtext);\ + inline string* name##Impl(const int v1, const size_t v2, \ + const char* exprtext) { \ + if (TF_PREDICT_FALSE(v2 >= std::numeric_limits::max())) { \ + return ::tensorflow::internal::MakeCheckOpString(v1, v2, exprtext); \ } \ const size_t uval = (size_t)((unsigned)v2); \ return name##Impl(v1, uval, exprtext); \ @@ -225,12 +227,12 @@ string* MakeCheckOpString(const T1& v1, const T2& v2, const char* exprtext) { // This happens if, for example, those are used as token names in a // yacc grammar. TF_DEFINE_CHECK_OP_IMPL(Check_EQ, - == ) // Compilation error with CHECK_EQ(NULL, x)? -TF_DEFINE_CHECK_OP_IMPL(Check_NE, != ) // Use CHECK(x == NULL) instead. 
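The size_t/int overloads being re-wrapped here guard against the usual arithmetic conversions: comparing a size_t with a negative int would otherwise convert the int to a huge unsigned value and the check would silently hold. The standalone snippet below illustrates the failure mode and what the guard effectively does; it is an illustration, not code from the patch.

#include <cstddef>
#include <cstdio>

int main() {
  size_t n = 5;
  int limit = -1;
  // Implicit conversion: -1 becomes SIZE_MAX, so the naive comparison "holds".
  std::printf("naive   n <= limit: %d\n", n <= static_cast<size_t>(limit));  // prints 1
  // What the guarded overload above effectively does instead: a negative
  // right-hand side makes the check fail outright.
  bool check_holds = (limit < 0) ? false : (n <= static_cast<size_t>(limit));
  std::printf("guarded n <= limit: %d\n", check_holds);  // prints 0 -> CHECK_LE fails
  return 0;
}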
-TF_DEFINE_CHECK_OP_IMPL(Check_LE, <= ) -TF_DEFINE_CHECK_OP_IMPL(Check_LT, < ) -TF_DEFINE_CHECK_OP_IMPL(Check_GE, >= ) -TF_DEFINE_CHECK_OP_IMPL(Check_GT, > ) + ==) // Compilation error with CHECK_EQ(NULL, x)? +TF_DEFINE_CHECK_OP_IMPL(Check_NE, !=) // Use CHECK(x == NULL) instead. +TF_DEFINE_CHECK_OP_IMPL(Check_LE, <=) +TF_DEFINE_CHECK_OP_IMPL(Check_LT, <) +TF_DEFINE_CHECK_OP_IMPL(Check_GE, >=) +TF_DEFINE_CHECK_OP_IMPL(Check_GT, >) #undef TF_DEFINE_CHECK_OP_IMPL // In optimized mode, use CheckOpString to hint to compiler that diff --git a/tensorflow/core/platform/denormal.cc b/tensorflow/core/platform/denormal.cc index f13b0af2a79..e00dbdb4ae5 100644 --- a/tensorflow/core/platform/denormal.cc +++ b/tensorflow/core/platform/denormal.cc @@ -41,8 +41,8 @@ namespace tensorflow { namespace port { ScopedFlushDenormal::ScopedFlushDenormal() { -// For now, we flush denormals only on SSE 3. Other architectures such as ARM -// can be added as needed. + // For now, we flush denormals only on SSE 3. Other architectures such as ARM + // can be added as needed. #ifdef DENORM_USE_INTRINSICS if (TestCPUFeature(SSE3)) { diff --git a/tensorflow/core/platform/device_tracer_test.cc b/tensorflow/core/platform/device_tracer_test.cc index c0c08dabacb..89f14e905af 100644 --- a/tensorflow/core/platform/device_tracer_test.cc +++ b/tensorflow/core/platform/device_tracer_test.cc @@ -77,7 +77,8 @@ class DeviceTracerTest : public ::testing::Test { Node* y_neg = test::graph::Unary(&graph, "Neg", i); y_neg_ = y_neg->name(); - y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0"); + y_neg->set_assigned_device_name( + "/job:localhost/replica:0/task:0/device:GPU:0"); test::graph::ToGraphDef(&graph, &def_); } diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h index 557bfa87e50..34aaf3f78ba 100644 --- a/tensorflow/core/platform/env.h +++ b/tensorflow/core/platform/env.h @@ -286,7 +286,7 @@ class Env { // "version" should be the version of the library or NULL // returns the name that LoadLibrary() can use virtual string FormatLibraryFileName(const string& name, - const string& version) = 0; + const string& version) = 0; private: // Returns a possible list of local temporary directories. @@ -353,6 +353,7 @@ class EnvWrapper : public Env { const string& version) override { return target_->FormatLibraryFileName(name, version); } + private: Env* target_; }; diff --git a/tensorflow/core/platform/file_system.cc b/tensorflow/core/platform/file_system.cc index 14755891fa2..b9866cf641a 100644 --- a/tensorflow/core/platform/file_system.cc +++ b/tensorflow/core/platform/file_system.cc @@ -131,18 +131,19 @@ Status FileSystem::GetMatchingPaths(const string& pattern, if (children.empty()) continue; // This IsDirectory call can be expensive for some FS. Parallelizing it. children_dir_status.resize(children.size()); - ForEach(0, children.size(), [this, ¤t_dir, &children, &fixed_prefix, - &children_dir_status](int i) { - const string child_path = io::JoinPath(current_dir, children[i]); - // In case the child_path doesn't start with the fixed_prefix then - // we don't need to explore this path. 
- if (!StringPiece(child_path).starts_with(fixed_prefix)) { - children_dir_status[i] = - Status(tensorflow::error::CANCELLED, "Operation not needed"); - } else { - children_dir_status[i] = IsDirectory(child_path); - } - }); + ForEach(0, children.size(), + [this, ¤t_dir, &children, &fixed_prefix, + &children_dir_status](int i) { + const string child_path = io::JoinPath(current_dir, children[i]); + // In case the child_path doesn't start with the fixed_prefix then + // we don't need to explore this path. + if (!StringPiece(child_path).starts_with(fixed_prefix)) { + children_dir_status[i] = Status(tensorflow::error::CANCELLED, + "Operation not needed"); + } else { + children_dir_status[i] = IsDirectory(child_path); + } + }); for (int i = 0; i < children.size(); ++i) { const string child_path = io::JoinPath(current_dir, children[i]); // If the IsDirectory call was cancelled we bail. diff --git a/tensorflow/core/platform/gif.h b/tensorflow/core/platform/gif.h index 9c72d34ff51..ab095a35c93 100644 --- a/tensorflow/core/platform/gif.h +++ b/tensorflow/core/platform/gif.h @@ -20,7 +20,8 @@ limitations under the License. #if defined(PLATFORM_GOOGLE) #include "tensorflow/core/platform/google/build_config/gif.h" -#elif defined(PLATFORM_POSIX)|| defined(PLATFORM_WINDOWS) ||defined(PLATFORM_POSIX_ANDROID) +#elif defined(PLATFORM_POSIX) || defined(PLATFORM_WINDOWS) || \ + defined(PLATFORM_POSIX_ANDROID) #include #else #error Define the appropriate PLATFORM_ macro for this platform diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc index 0baeac09841..74863293a32 100644 --- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc +++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc @@ -164,8 +164,9 @@ Status HadoopFileSystem::Connect(StringPiece fname, hdfsFS* fs) { } else { hdfs_->hdfsBuilderSetNameNode(builder, nn.c_str()); } - // KERB_TICKET_CACHE_PATH will be deleted in the future, Because KRB5CCNAME is the build in - // environment variable of Kerberos, so KERB_TICKET_CACHE_PATH and related code are unnecessary. + // KERB_TICKET_CACHE_PATH will be deleted in the future, Because KRB5CCNAME is + // the build in environment variable of Kerberos, so KERB_TICKET_CACHE_PATH + // and related code are unnecessary. char* ticket_cache_path = getenv("KERB_TICKET_CACHE_PATH"); if (ticket_cache_path != nullptr) { hdfs_->hdfsBuilderSetKerbTicketCachePath(builder, ticket_cache_path); diff --git a/tensorflow/core/platform/jpeg.h b/tensorflow/core/platform/jpeg.h index edbcbd960a7..1b5e633f0aa 100644 --- a/tensorflow/core/platform/jpeg.h +++ b/tensorflow/core/platform/jpeg.h @@ -20,7 +20,8 @@ limitations under the License. #if defined(PLATFORM_GOOGLE) #include "tensorflow/core/platform/google/build_config/jpeg.h" -#elif defined(PLATFORM_POSIX)|| defined(PLATFORM_WINDOWS) ||defined(PLATFORM_POSIX_ANDROID) +#elif defined(PLATFORM_POSIX) || defined(PLATFORM_WINDOWS) || \ + defined(PLATFORM_POSIX_ANDROID) #include #include #include diff --git a/tensorflow/core/platform/png.h b/tensorflow/core/platform/png.h index 5b0203c343e..dad18d72195 100644 --- a/tensorflow/core/platform/png.h +++ b/tensorflow/core/platform/png.h @@ -20,7 +20,8 @@ limitations under the License. 
#if defined(PLATFORM_GOOGLE) #include "tensorflow/core/platform/google/build_config/png.h" -#elif defined(PLATFORM_POSIX)|| defined(PLATFORM_WINDOWS) ||defined(PLATFORM_POSIX_ANDROID) +#elif defined(PLATFORM_POSIX) || defined(PLATFORM_WINDOWS) || \ + defined(PLATFORM_POSIX_ANDROID) #include #else #error Define the appropriate PLATFORM_ macro for this platform diff --git a/tensorflow/core/platform/posix/error.cc b/tensorflow/core/platform/posix/error.cc index cda6d7d8f9d..2bb9443fb3c 100644 --- a/tensorflow/core/platform/posix/error.cc +++ b/tensorflow/core/platform/posix/error.cc @@ -73,19 +73,19 @@ error::Code ErrnoToCode(int err_number) { case ECHILD: // No child processes case EISCONN: // Socket is connected #if !defined(_WIN32) && !defined(__HAIKU__) - case ENOTBLK: // Block device required + case ENOTBLK: // Block device required #endif - case ENOTCONN: // The socket is not connected - case EPIPE: // Broken pipe + case ENOTCONN: // The socket is not connected + case EPIPE: // Broken pipe #if !defined(_WIN32) - case ESHUTDOWN: // Cannot send after transport endpoint shutdown + case ESHUTDOWN: // Cannot send after transport endpoint shutdown #endif - case ETXTBSY: // Text file busy + case ETXTBSY: // Text file busy code = error::FAILED_PRECONDITION; break; - case ENOSPC: // No space left on device + case ENOSPC: // No space left on device #if !defined(_WIN32) - case EDQUOT: // Disk quota exceeded + case EDQUOT: // Disk quota exceeded #endif case EMFILE: // Too many open files case EMLINK: // Too many links @@ -95,7 +95,7 @@ error::Code ErrnoToCode(int err_number) { case ENOMEM: // Not enough space case ENOSR: // No STREAM resources #if !defined(_WIN32) && !defined(__HAIKU__) - case EUSERS: // Too many users + case EUSERS: // Too many users #endif code = error::RESOURCE_EXHAUSTED; break; @@ -104,17 +104,17 @@ error::Code ErrnoToCode(int err_number) { case ERANGE: // Result too large code = error::OUT_OF_RANGE; break; - case ENOSYS: // Function not implemented - case ENOTSUP: // Operation not supported - case EAFNOSUPPORT: // Address family not supported + case ENOSYS: // Function not implemented + case ENOTSUP: // Operation not supported + case EAFNOSUPPORT: // Address family not supported #if !defined(_WIN32) - case EPFNOSUPPORT: // Protocol family not supported + case EPFNOSUPPORT: // Protocol family not supported #endif case EPROTONOSUPPORT: // Protocol not supported #if !defined(_WIN32) && !defined(__HAIKU__) case ESOCKTNOSUPPORT: // Socket type not supported #endif - case EXDEV: // Improper link + case EXDEV: // Improper link code = error::UNIMPLEMENTED; break; case EAGAIN: // Resource temporarily unavailable @@ -123,7 +123,7 @@ error::Code ErrnoToCode(int err_number) { case ECONNRESET: // Connection reset case EINTR: // Interrupted function call #if !defined(_WIN32) - case EHOSTDOWN: // Host is down + case EHOSTDOWN: // Host is down #endif case EHOSTUNREACH: // Host is unreachable case ENETDOWN: // Network is down @@ -139,7 +139,7 @@ error::Code ErrnoToCode(int err_number) { break; case EDEADLK: // Resource deadlock avoided #if !defined(_WIN32) - case ESTALE: // Stale file handle + case ESTALE: // Stale file handle #endif code = error::ABORTED; break; @@ -158,7 +158,7 @@ error::Code ErrnoToCode(int err_number) { case ENOMSG: // No message of the desired type case EPROTO: // Protocol error #if !defined(_WIN32) && !defined(__HAIKU__) - case EREMOTE: // Object is remote + case EREMOTE: // Object is remote #endif code = error::UNKNOWN; break; diff --git 
a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h index 8604b01c53e..ce2069b0044 100644 --- a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h +++ b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h @@ -58,8 +58,8 @@ class AndroidArmV7ACpuUtilsHelper : public ICpuUtilsHelper { TF_DISALLOW_COPY_AND_ASSIGN(AndroidArmV7ACpuUtilsHelper); }; -} // profile_utils -} // tensorflow +} // namespace profile_utils +} // namespace tensorflow #endif // defined(__ANDROID__) && (__ANDROID_API__ >= 21) && // (defined(__ARM_ARCH_7A__) || defined(__aarch64__)) diff --git a/tensorflow/core/platform/profile_utils/cpu_utils.cc b/tensorflow/core/platform/profile_utils/cpu_utils.cc index d3362690d7e..02de7d1362b 100644 --- a/tensorflow/core/platform/profile_utils/cpu_utils.cc +++ b/tensorflow/core/platform/profile_utils/cpu_utils.cc @@ -28,15 +28,17 @@ namespace profile_utils { static ICpuUtilsHelper* cpu_utils_helper_instance_ = nullptr; -#if (defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || (defined(__s390x__)) - /* static */ uint64 CpuUtils::GetCycleCounterFrequency() { - static const uint64 cpu_frequency = GetCycleCounterFrequencyImpl(); - return cpu_frequency; +#if (defined(__powerpc__) || \ + defined(__ppc__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \ + (defined(__s390x__)) +/* static */ uint64 CpuUtils::GetCycleCounterFrequency() { + static const uint64 cpu_frequency = GetCycleCounterFrequencyImpl(); + return cpu_frequency; } #else - /* static */ int64 CpuUtils::GetCycleCounterFrequency() { - static const int64 cpu_frequency = GetCycleCounterFrequencyImpl(); - return cpu_frequency; +/* static */ int64 CpuUtils::GetCycleCounterFrequency() { + static const int64 cpu_frequency = GetCycleCounterFrequencyImpl(); + return cpu_frequency; } #endif diff --git a/tensorflow/core/platform/profile_utils/cpu_utils.h b/tensorflow/core/platform/profile_utils/cpu_utils.h index e95843b80a5..7b580c8bf60 100644 --- a/tensorflow/core/platform/profile_utils/cpu_utils.h +++ b/tensorflow/core/platform/profile_utils/cpu_utils.h @@ -94,14 +94,16 @@ class CpuUtils { #endif } - // Return cycle counter frequency. - // As this method caches the cpu frequency internally, - // the first call will incur overhead, but not subsequent calls. - #if (defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || (defined(__s390x__)) - static uint64 GetCycleCounterFrequency(); - #else - static int64 GetCycleCounterFrequency(); - #endif +// Return cycle counter frequency. +// As this method caches the cpu frequency internally, +// the first call will incur overhead, but not subsequent calls. 
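The caching this comment refers to is the function-local static in cpu_utils.cc: the Impl probe runs once and every later call returns the saved value. A minimal sketch of that idiom is below; ExpensiveProbe and its 2.4 GHz return value are hypothetical stand-ins for GetCycleCounterFrequencyImpl, not part of the patch.

#include <cstdint>
#include <cstdio>

static int probe_calls = 0;
int64_t ExpensiveProbe() {        // hypothetical stand-in for the real probe
  ++probe_calls;
  return 2400000000LL;            // made-up 2.4 GHz value for illustration
}

int64_t CachedCycleCounterFrequency() {
  // The function-local static is initialized exactly once (thread-safe since
  // C++11), so only the first caller pays for the probe.
  static const int64_t cpu_frequency = ExpensiveProbe();
  return cpu_frequency;
}

int main() {
  CachedCycleCounterFrequency();
  CachedCycleCounterFrequency();
  std::printf("probe ran %d time(s)\n", probe_calls);  // prints 1
  return 0;
}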
+#if (defined(__powerpc__) || \ + defined(__ppc__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \ + (defined(__s390x__)) + static uint64 GetCycleCounterFrequency(); +#else + static int64 GetCycleCounterFrequency(); +#endif // Return micro second per each clock // As this method caches the cpu frequency internally, diff --git a/tensorflow/core/platform/profile_utils/cpu_utils_test.cc b/tensorflow/core/platform/profile_utils/cpu_utils_test.cc index 5b11b684dd9..eb8161fbfd5 100644 --- a/tensorflow/core/platform/profile_utils/cpu_utils_test.cc +++ b/tensorflow/core/platform/profile_utils/cpu_utils_test.cc @@ -53,15 +53,17 @@ TEST_F(CpuUtilsTest, CheckGetCurrentClockCycle) { } TEST_F(CpuUtilsTest, CheckCycleCounterFrequency) { - #if (defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || (defined(__s390x__)) - const uint64 cpu_frequency = CpuUtils::GetCycleCounterFrequency(); - CHECK_GT(cpu_frequency, 0); - CHECK_NE(cpu_frequency, unsigned(CpuUtils::INVALID_FREQUENCY)); - #else - const int64 cpu_frequency = CpuUtils::GetCycleCounterFrequency(); - CHECK_GT(cpu_frequency, 0); - CHECK_NE(cpu_frequency, CpuUtils::INVALID_FREQUENCY); - #endif +#if (defined(__powerpc__) || \ + defined(__ppc__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \ + (defined(__s390x__)) + const uint64 cpu_frequency = CpuUtils::GetCycleCounterFrequency(); + CHECK_GT(cpu_frequency, 0); + CHECK_NE(cpu_frequency, unsigned(CpuUtils::INVALID_FREQUENCY)); +#else + const int64 cpu_frequency = CpuUtils::GetCycleCounterFrequency(); + CHECK_GT(cpu_frequency, 0); + CHECK_NE(cpu_frequency, CpuUtils::INVALID_FREQUENCY); +#endif if (DBG) { LOG(INFO) << "Cpu frequency = " << cpu_frequency; } diff --git a/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h b/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h index 51c54d50d1d..11b739c0096 100644 --- a/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h +++ b/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h @@ -47,7 +47,7 @@ class ICpuUtilsHelper { TF_DISALLOW_COPY_AND_ASSIGN(ICpuUtilsHelper); }; -} // profile_utils -} // tensorflow +} // namespace profile_utils +} // namespace tensorflow #endif // TENSORFLOW_PLATFORM_PROFILEUTILS_I_CPU_UTILS_HELPER_H__ diff --git a/tensorflow/core/platform/protobuf_internal.h b/tensorflow/core/platform/protobuf_internal.h index 7d6e8f57a62..2f151a5aee6 100644 --- a/tensorflow/core/platform/protobuf_internal.h +++ b/tensorflow/core/platform/protobuf_internal.h @@ -45,8 +45,8 @@ Status ParseAny(const google::protobuf::Any& any, T* message, #ifdef TENSORFLOW_LITE_PROTOS if (any.type_url() != strings::StrCat("type.googleapis.com/", type_name)) { return errors::FailedPrecondition( - "Expected Any type_url for: ", type_name, ". Got: ", - string(any.type_url().data(), any.type_url().size()), "."); + "Expected Any type_url for: ", type_name, + ". 
Got: ", string(any.type_url().data(), any.type_url().size()), "."); } if (!message->ParseFromString(any.value())) { return errors::FailedPrecondition("Failed to unpack: ", diff --git a/tensorflow/core/platform/s3/aws_logging.cc b/tensorflow/core/platform/s3/aws_logging.cc index fbca0acc36b..44317f1a3e4 100644 --- a/tensorflow/core/platform/s3/aws_logging.cc +++ b/tensorflow/core/platform/s3/aws_logging.cc @@ -96,7 +96,7 @@ Aws::Utils::Logging::LogLevel ParseLogLevelFromEnv() { return log_level; } -} +} // namespace static bool initialized = false; static mutex s3_logging_mutex(LINKER_INITIALIZED); diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc index 1e89fa77c10..4862fd85be0 100644 --- a/tensorflow/core/platform/s3/s3_file_system.cc +++ b/tensorflow/core/platform/s3/s3_file_system.cc @@ -306,8 +306,15 @@ std::shared_ptr S3FileSystem::GetS3Client() { }; Aws::InitAPI(options); - this->s3_client_ = std::shared_ptr( - new Aws::S3::S3Client(GetDefaultClientConfig())); + // The creation of S3Client disables virtual addressing: + // S3Client(clientConfiguration, signPayloads, useVirtualAdressing = true) + // The purpose is to address the issue encountered when there is an `.` + // in the bucket name. Due to TLS hostname validation or DNS rules, + // the bucket may not be resolved. Disabling of virtual addressing + // should address the issue. See GitHub issue 16397 for details. + this->s3_client_ = std::shared_ptr(new Aws::S3::S3Client( + GetDefaultClientConfig(), + Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, false)); } return this->s3_client_; diff --git a/tensorflow/core/platform/s3/s3_file_system.h b/tensorflow/core/platform/s3/s3_file_system.h index 168b8007f3b..8177e48dba5 100644 --- a/tensorflow/core/platform/s3/s3_file_system.h +++ b/tensorflow/core/platform/s3/s3_file_system.h @@ -57,6 +57,18 @@ class S3FileSystem : public FileSystem { Status RenameFile(const string& src, const string& target) override; private: // Returns the member S3 client, initializing as-needed. + // When the client tries to access the object in S3, e.g., + // s3://bucket-name/path/to/object + // the behavior could be controlled by various environmental + // variables. + // By default S3 access regional endpoint, with region + // controlled by `AWS_REGION`. The endpoint could be overridden + // explicitly with `S3_ENDPOINT`. S3 uses HTTPS by default. + // If S3_USE_HTTPS=0 is specified, HTTP is used. Also, + // S3_VERIFY_SSL=0 could disable SSL verification in case + // HTTPS is used. + // This S3 Client does not support Virtual Hosted–Style Method + // for a bucket. std::shared_ptr GetS3Client(); std::shared_ptr s3_client_; diff --git a/tensorflow/core/platform/setround.cc b/tensorflow/core/platform/setround.cc index 0c66da09bb9..592626bfa17 100644 --- a/tensorflow/core/platform/setround.cc +++ b/tensorflow/core/platform/setround.cc @@ -15,7 +15,6 @@ limitations under the License. 
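Two points from the S3 comments above can be made concrete. Disabling virtual addressing means requests use the path-style form https://s3.<region>.amazonaws.com/bucket-name/key rather than https://bucket-name.s3.<region>.amazonaws.com/key, which is why a `.` inside the bucket name stops tripping over the wildcard TLS certificate. And the documented environment variables can be pictured as the standalone sketch below; the mapping is an assumed reading of the comment (the real interpretation lives in GetDefaultClientConfig), and the us-east-1 fallback region is hypothetical.

#include <cstdlib>
#include <string>

struct S3ClientSettingsSketch {
  std::string region = "us-east-1";  // hypothetical fallback region
  std::string endpoint_override;     // empty => use the regional endpoint
  bool use_https = true;
  bool verify_ssl = true;
};

S3ClientSettingsSketch S3SettingsFromEnv() {
  S3ClientSettingsSketch s;
  if (const char* region = std::getenv("AWS_REGION")) s.region = region;
  if (const char* endpoint = std::getenv("S3_ENDPOINT")) s.endpoint_override = endpoint;
  if (const char* https = std::getenv("S3_USE_HTTPS")) s.use_https = std::string(https) != "0";
  if (const char* verify = std::getenv("S3_VERIFY_SSL")) s.verify_ssl = std::string(verify) != "0";
  return s;
}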
#include "tensorflow/core/platform/setround.h" - namespace tensorflow { namespace port { diff --git a/tensorflow/core/platform/test_benchmark.h b/tensorflow/core/platform/test_benchmark.h index a6636225ccb..327237dba93 100644 --- a/tensorflow/core/platform/test_benchmark.h +++ b/tensorflow/core/platform/test_benchmark.h @@ -60,7 +60,7 @@ class Benchmark { private: string name_; int num_args_; - std::vector> args_; + std::vector > args_; void (*fn0_)(int) = nullptr; void (*fn1_)(int, int) = nullptr; void (*fn2_)(int, int, int) = nullptr; diff --git a/tensorflow/core/platform/windows/cpu_info.h b/tensorflow/core/platform/windows/cpu_info.h index d6e78dbc8f9..f20939d3c0f 100644 --- a/tensorflow/core/platform/windows/cpu_info.h +++ b/tensorflow/core/platform/windows/cpu_info.h @@ -22,8 +22,10 @@ limitations under the License. // Byte order defines provided by gcc. MSVC doesn't define those so // we define them here. // We assume that all windows platform out there are little endian. +#if defined(_MSC_VER) && !defined(__clang__) #define __ORDER_LITTLE_ENDIAN__ 0x4d2 #define __ORDER_BIG_ENDIAN__ 0x10e1 #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ +#endif #endif // TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_ diff --git a/tensorflow/core/platform/windows/env.cc b/tensorflow/core/platform/windows/env.cc index 788a4bf4b1a..41b26441707 100644 --- a/tensorflow/core/platform/windows/env.cc +++ b/tensorflow/core/platform/windows/env.cc @@ -24,9 +24,9 @@ limitations under the License. #undef LoadLibrary #undef ERROR +#include #include #include -#include #include "tensorflow/core/lib/core/error_codes.pb.h" #include "tensorflow/core/platform/load_library.h" @@ -53,8 +53,7 @@ class StdThread : public Thread { class WindowsEnv : public Env { public: - WindowsEnv() - : GetSystemTimePreciseAsFileTime_(NULL) { + WindowsEnv() : GetSystemTimePreciseAsFileTime_(NULL) { // GetSystemTimePreciseAsFileTime function is only available in the latest // versions of Windows. 
For that reason, we try to look it up in // kernel32.dll at runtime and use an alternative option if the function @@ -72,8 +71,8 @@ class WindowsEnv : public Env { } bool MatchPath(const string& path, const string& pattern) override { - std::wstring ws_path(WindowsFileSystem::Utf8ToWideChar(path)); - std::wstring ws_pattern(WindowsFileSystem::Utf8ToWideChar(pattern)); + std::wstring ws_path(WindowsFileSystem::Utf8ToWideChar(path)); + std::wstring ws_pattern(WindowsFileSystem::Utf8ToWideChar(pattern)); return PathMatchSpecW(ws_path.c_str(), ws_pattern.c_str()) == TRUE; } @@ -122,14 +121,14 @@ class WindowsEnv : public Env { SetThreadpoolTimer(timer, &FileDueTime, 0, 0); } - Status LoadLibrary(const char *library_filename, void** handle) override { + Status LoadLibrary(const char* library_filename, void** handle) override { std::string file_name = library_filename; std::replace(file_name.begin(), file_name.end(), '/', '\\'); std::wstring ws_file_name(WindowsFileSystem::Utf8ToWideChar(file_name)); HMODULE hModule = LoadLibraryExW(ws_file_name.c_str(), NULL, - LOAD_WITH_ALTERED_SEARCH_PATH); + LOAD_WITH_ALTERED_SEARCH_PATH); if (!hModule) { return errors::NotFound(file_name + " not found"); } @@ -138,31 +137,30 @@ class WindowsEnv : public Env { } Status GetSymbolFromLibrary(void* handle, const char* symbol_name, - void** symbol) override { + void** symbol) override { FARPROC found_symbol; found_symbol = GetProcAddress((HMODULE)handle, symbol_name); if (found_symbol == NULL) { return errors::NotFound(std::string(symbol_name) + " not found"); } - *symbol = (void **)found_symbol; + *symbol = (void**)found_symbol; return Status::OK(); } - string FormatLibraryFileName(const string& name, const string& version) - override { + string FormatLibraryFileName(const string& name, + const string& version) override { string filename; if (version.size() == 0) { filename = name + ".dll"; - } - else { + } else { filename = name + version + ".dll"; } return filename; } private: - typedef VOID(WINAPI * FnGetSystemTimePreciseAsFileTime)(LPFILETIME); + typedef VOID(WINAPI* FnGetSystemTimePreciseAsFileTime)(LPFILETIME); FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_; }; diff --git a/tensorflow/core/platform/windows/error.cc b/tensorflow/core/platform/windows/error.cc index 39e941a3834..291fc5003fb 100644 --- a/tensorflow/core/platform/windows/error.cc +++ b/tensorflow/core/platform/windows/error.cc @@ -21,7 +21,7 @@ namespace internal { std::string GetWindowsErrorMessage(DWORD err) { LPSTR buffer = NULL; DWORD flags = FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | - FORMAT_MESSAGE_IGNORE_INSERTS; + FORMAT_MESSAGE_IGNORE_INSERTS; FormatMessageA(flags, NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), reinterpret_cast(&buffer), 0, NULL); std::string message = buffer; diff --git a/tensorflow/core/platform/windows/error.h b/tensorflow/core/platform/windows/error.h index 026e0d5aa94..ba643a0fa8f 100644 --- a/tensorflow/core/platform/windows/error.h +++ b/tensorflow/core/platform/windows/error.h @@ -24,9 +24,7 @@ namespace tensorflow { namespace internal { std::string GetWindowsErrorMessage(DWORD err); - -} } +} // namespace tensorflow #endif // TENSORFLOW_CORE_PLATFORM_WINDOWS_ERROR_H_ - diff --git a/tensorflow/core/platform/windows/integral_types.h b/tensorflow/core/platform/windows/integral_types.h index 4970b8ca6a1..46338a536db 100644 --- a/tensorflow/core/platform/windows/integral_types.h +++ b/tensorflow/core/platform/windows/integral_types.h @@ -1,18 +1,18 @@ - /* Copyright 
2017 The TensorFlow Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - ==============================================================================*/ - +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + #ifndef TENSORFLOW_PLATFORM_WINDOWS_INTEGRAL_TYPES_H_ #define TENSORFLOW_PLATFORM_WINDOWS_INTEGRAL_TYPES_H_ diff --git a/tensorflow/core/platform/windows/net.cc b/tensorflow/core/platform/windows/net.cc index 46eb072d425..2ab558ab95c 100644 --- a/tensorflow/core/platform/windows/net.cc +++ b/tensorflow/core/platform/windows/net.cc @@ -26,7 +26,7 @@ limitations under the License. #undef ERROR -#pragma comment(lib,"Ws2_32.lib") +#pragma comment(lib, "Ws2_32.lib") namespace tensorflow { namespace internal { @@ -44,8 +44,8 @@ bool IsPortAvailable(int* port, bool is_tcp) { CHECK_GE(*port, 0); CHECK_LE(*port, 65535); if (sock == INVALID_SOCKET) { - LOG(ERROR) << "socket() failed: " << - GetWindowsErrorMessage(WSAGetLastError()); + LOG(ERROR) << "socket() failed: " + << GetWindowsErrorMessage(WSAGetLastError()); return false; } @@ -54,8 +54,8 @@ bool IsPortAvailable(int* port, bool is_tcp) { int result = setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, reinterpret_cast(&one), sizeof(one)); if (result == SOCKET_ERROR) { - LOG(ERROR) << "setsockopt() failed: " << - GetWindowsErrorMessage(WSAGetLastError()); + LOG(ERROR) << "setsockopt() failed: " + << GetWindowsErrorMessage(WSAGetLastError()); closesocket(sock); return false; } @@ -66,8 +66,8 @@ bool IsPortAvailable(int* port, bool is_tcp) { addr.sin_port = htons((uint16_t)*port); result = bind(sock, (struct sockaddr*)&addr, sizeof(addr)); if (result == SOCKET_ERROR) { - LOG(WARNING) << "bind(port=" << *port << ") failed: " << - GetWindowsErrorMessage(WSAGetLastError()); + LOG(WARNING) << "bind(port=" << *port + << ") failed: " << GetWindowsErrorMessage(WSAGetLastError()); closesocket(sock); return false; } @@ -75,8 +75,8 @@ bool IsPortAvailable(int* port, bool is_tcp) { // Get the bound port number. 
result = getsockname(sock, (struct sockaddr*)&addr, &addr_len); if (result == SOCKET_ERROR) { - LOG(WARNING) << "getsockname() failed: " << - GetWindowsErrorMessage(WSAGetLastError()); + LOG(WARNING) << "getsockname() failed: " + << GetWindowsErrorMessage(WSAGetLastError()); closesocket(sock); return false; } diff --git a/tensorflow/core/platform/windows/subprocess.h b/tensorflow/core/platform/windows/subprocess.h index b65313363ed..66ec44885d5 100644 --- a/tensorflow/core/platform/windows/subprocess.h +++ b/tensorflow/core/platform/windows/subprocess.h @@ -19,8 +19,7 @@ limitations under the License. namespace tensorflow { // SubProcess is not yet implemented for Windows. -class SubProcess { -}; +class SubProcess {}; } // namespace tensorflow diff --git a/tensorflow/core/platform/windows/test.cc b/tensorflow/core/platform/windows/test.cc index 0ffd02ff148..584acad91b2 100644 --- a/tensorflow/core/platform/windows/test.cc +++ b/tensorflow/core/platform/windows/test.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/platform/net.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/net.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" diff --git a/tensorflow/core/platform/windows/windows_file_system.cc b/tensorflow/core/platform/windows/windows_file_system.cc index 604348fe03a..b6b3722caae 100644 --- a/tensorflow/core/platform/windows/windows_file_system.cc +++ b/tensorflow/core/platform/windows/windows_file_system.cc @@ -13,12 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include #include #include #include -#include #undef StrCat #include #include @@ -75,16 +75,16 @@ SSIZE_T pread(HANDLE hfile, char* src, size_t num_bytes, uint64_t offset) { if (TRUE == read_result) { result = bytes_read; } else if ((FALSE == read_result) && - ((last_error = GetLastError()) != ERROR_IO_PENDING)) { + ((last_error = GetLastError()) != ERROR_IO_PENDING)) { result = (last_error == ERROR_HANDLE_EOF) ? 0 : -1; } else { - if (ERROR_IO_PENDING == last_error) { // Otherwise bytes_read already has the result. - BOOL overlapped_result = ::GetOverlappedResult(hfile, &overlapped, - &bytes_read, TRUE); + if (ERROR_IO_PENDING == + last_error) { // Otherwise bytes_read already has the result. + BOOL overlapped_result = + ::GetOverlappedResult(hfile, &overlapped, &bytes_read, TRUE); if (FALSE == overlapped_result) { result = (::GetLastError() == ERROR_HANDLE_EOF) ? 
0 : -1; - } - else { + } else { result = bytes_read; } } @@ -151,11 +151,11 @@ class WindowsWritableFile : public WritableFile { Status Append(const StringPiece& data) override { DWORD bytes_written = 0; DWORD data_size = static_cast(data.size()); - BOOL write_result = ::WriteFile(hfile_, data.data(), data_size, - &bytes_written, NULL); + BOOL write_result = + ::WriteFile(hfile_, data.data(), data_size, &bytes_written, NULL); if (FALSE == write_result) { - return IOErrorFromWindowsError( - "Failed to WriteFile: " + filename_, ::GetLastError()); + return IOErrorFromWindowsError("Failed to WriteFile: " + filename_, + ::GetLastError()); } assert(size_t(bytes_written) == data.size()); @@ -171,8 +171,8 @@ class WindowsWritableFile : public WritableFile { } if (FALSE == ::CloseHandle(hfile_)) { - return IOErrorFromWindowsError( - "CloseHandle failed for: " + filename_, ::GetLastError()); + return IOErrorFromWindowsError("CloseHandle failed for: " + filename_, + ::GetLastError()); } hfile_ = INVALID_HANDLE_VALUE; @@ -187,9 +187,7 @@ class WindowsWritableFile : public WritableFile { return Status::OK(); } - Status Sync() override { - return Flush(); - } + Status Sync() override { return Flush(); } }; class WinReadOnlyMemoryRegion : public ReadOnlyMemoryRegion { @@ -204,7 +202,10 @@ class WinReadOnlyMemoryRegion : public ReadOnlyMemoryRegion { public: WinReadOnlyMemoryRegion(const std::string& filename, HANDLE hfile, HANDLE hmap, const void* address, uint64 length) - : filename_(filename), hfile_(hfile), hmap_(hmap), address_(address), + : filename_(filename), + hfile_(hfile), + hmap_(hmap), + address_(address), length_(length) {} ~WinReadOnlyMemoryRegion() { @@ -238,9 +239,9 @@ Status WindowsFileSystem::NewRandomAccessFile( // almost all tests would work with a possible exception of fault_injection. 
DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE; - HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_READ, - share_mode, NULL, OPEN_EXISTING, file_flags, - NULL); + HANDLE hfile = + ::CreateFileW(ws_translated_fname.c_str(), GENERIC_READ, share_mode, NULL, + OPEN_EXISTING, file_flags, NULL); if (INVALID_HANDLE_VALUE == hfile) { string context = "NewRandomAccessFile failed to Create/Open: " + fname; @@ -258,9 +259,9 @@ Status WindowsFileSystem::NewWritableFile( result->reset(); DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE; - HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_WRITE, - share_mode, NULL, CREATE_ALWAYS, - FILE_ATTRIBUTE_NORMAL, NULL); + HANDLE hfile = + ::CreateFileW(ws_translated_fname.c_str(), GENERIC_WRITE, share_mode, + NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); if (INVALID_HANDLE_VALUE == hfile) { string context = "Failed to create a NewWriteableFile: " + fname; @@ -278,9 +279,9 @@ Status WindowsFileSystem::NewAppendableFile( result->reset(); DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE; - HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_WRITE, - share_mode, NULL, OPEN_ALWAYS, - FILE_ATTRIBUTE_NORMAL, NULL); + HANDLE hfile = + ::CreateFileW(ws_translated_fname.c_str(), GENERIC_WRITE, share_mode, + NULL, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); if (INVALID_HANDLE_VALUE == hfile) { string context = "Failed to create a NewAppendableFile: " + fname; @@ -316,9 +317,9 @@ Status WindowsFileSystem::NewReadOnlyMemoryRegionFromFile( file_flags |= FILE_FLAG_OVERLAPPED; DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE; - HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_READ, - share_mode, NULL, OPEN_EXISTING, file_flags, - NULL); + HANDLE hfile = + ::CreateFileW(ws_translated_fname.c_str(), GENERIC_READ, share_mode, NULL, + OPEN_EXISTING, file_flags, NULL); if (INVALID_HANDLE_VALUE == hfile) { return IOErrorFromWindowsError( @@ -345,28 +346,32 @@ Status WindowsFileSystem::NewReadOnlyMemoryRegionFromFile( NULL); // Mapping name if (!hmap) { - string context = "Failed to create file mapping for " - "NewReadOnlyMemoryRegionFromFile: " + fname; + string context = + "Failed to create file mapping for " + "NewReadOnlyMemoryRegionFromFile: " + + fname; return IOErrorFromWindowsError(context, ::GetLastError()); } UniqueCloseHandlePtr map_guard(hmap, CloseHandleFunc); - const void* mapped_region = ::MapViewOfFileEx( - hmap, FILE_MAP_READ, - 0, // High DWORD of access start - 0, // Low DWORD - file_size, - NULL); // Let the OS choose the mapping + const void* mapped_region = + ::MapViewOfFileEx(hmap, FILE_MAP_READ, + 0, // High DWORD of access start + 0, // Low DWORD + file_size, + NULL); // Let the OS choose the mapping if (!mapped_region) { - string context = "Failed to MapViewOfFile for " - "NewReadOnlyMemoryRegionFromFile: " + fname; + string context = + "Failed to MapViewOfFile for " + "NewReadOnlyMemoryRegionFromFile: " + + fname; return IOErrorFromWindowsError(context, ::GetLastError()); } - result->reset(new WinReadOnlyMemoryRegion(fname, hfile, hmap, - mapped_region, file_size)); + result->reset(new WinReadOnlyMemoryRegion(fname, hfile, hmap, mapped_region, + file_size)); map_guard.release(); file_guard.release(); @@ -404,8 +409,8 @@ Status WindowsFileSystem::GetChildren(const string& dir, } do { - string file_name = WideCharToUtf8(find_data.cFileName); - const StringPiece basename = file_name; + string 
file_name = WideCharToUtf8(find_data.cFileName); + const StringPiece basename = file_name; if (basename != "." && basename != "..") { result->push_back(file_name); } @@ -457,8 +462,7 @@ Status WindowsFileSystem::GetFileSize(const string& fname, uint64* size) { file_size.HighPart = attrs.nFileSizeHigh; file_size.LowPart = attrs.nFileSizeLow; *size = file_size.QuadPart; - } - else { + } else { string context = "Can not get size for: " + fname; result = IOErrorFromWindowsError(context, ::GetLastError()); } @@ -472,7 +476,7 @@ Status WindowsFileSystem::RenameFile(const string& src, const string& target) { std::wstring ws_translated_src = Utf8ToWideChar(TranslateName(src)); std::wstring ws_translated_target = Utf8ToWideChar(TranslateName(target)); if (!::MoveFileExW(ws_translated_src.c_str(), ws_translated_target.c_str(), - MOVEFILE_REPLACE_EXISTING)) { + MOVEFILE_REPLACE_EXISTING)) { string context(strings::StrCat("Failed to rename: ", src, " to: ", target)); result = IOErrorFromWindowsError(context, ::GetLastError()); } diff --git a/tensorflow/core/platform/windows/windows_file_system.h b/tensorflow/core/platform/windows/windows_file_system.h index 8dcc1530370..ba0302f0fd8 100644 --- a/tensorflow/core/platform/windows/windows_file_system.h +++ b/tensorflow/core/platform/windows/windows_file_system.h @@ -63,33 +63,35 @@ class WindowsFileSystem : public FileSystem { Status RenameFile(const string& src, const string& target) override; - string TranslateName(const string& name) const override { - return name; - } + string TranslateName(const string& name) const override { return name; } static std::wstring Utf8ToWideChar(const string& utf8str) { - int size_required = MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(), (int)utf8str.size(), NULL, 0); - std::wstring ws_translated_str(size_required, 0); - MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(), (int)utf8str.size(), &ws_translated_str[0], size_required); - return ws_translated_str; + int size_required = MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(), + (int)utf8str.size(), NULL, 0); + std::wstring ws_translated_str(size_required, 0); + MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(), (int)utf8str.size(), + &ws_translated_str[0], size_required); + return ws_translated_str; } - static string WideCharToUtf8(const std::wstring &wstr) { - if (wstr.empty()) return std::string(); - int size_required = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), (int)wstr.size(), NULL, 0, NULL, NULL); - string utf8_translated_str(size_required, 0); - WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), (int)wstr.size(), &utf8_translated_str[0], size_required, NULL, NULL); - return utf8_translated_str; + static string WideCharToUtf8(const std::wstring& wstr) { + if (wstr.empty()) return std::string(); + int size_required = WideCharToMultiByte( + CP_UTF8, 0, wstr.c_str(), (int)wstr.size(), NULL, 0, NULL, NULL); + string utf8_translated_str(size_required, 0); + WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), (int)wstr.size(), + &utf8_translated_str[0], size_required, NULL, NULL); + return utf8_translated_str; } }; class LocalWinFileSystem : public WindowsFileSystem { -public: - string TranslateName(const string& name) const override { - StringPiece scheme, host, path; - io::ParseURI(name, &scheme, &host, &path); - return path.ToString(); - } + public: + string TranslateName(const string& name) const override { + StringPiece scheme, host, path; + io::ParseURI(name, &scheme, &host, &path); + return path.ToString(); + } }; } // namespace tensorflow diff --git 
a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md index 9e628b10651..57d76eb4cb9 100644 --- a/tensorflow/core/profiler/README.md +++ b/tensorflow/core/profiler/README.md @@ -240,8 +240,9 @@ Open a Chrome browser, enter URL chrome://tracing and load the timeline file. # can also generate memory profile using `-select bytes` tfprof> code -select accelerator_micros -max_depth 100000 -output pprof:outfile= -trim_name_regexes .*apply_op.* -# Use pprof to visualize the generated file. -pprof -png --nodecount=100 --sample_index=1 +# Use google-pprof, from the google-perftools package, to visualize the generated file. +# On Ubuntu you can install it with `apt-get install google-perftools`. +google-pprof --pdf --nodecount=100 ``` ![PprofGraph](g3doc/pprof.jpg) @@ -256,7 +257,7 @@ bug fix. `OpLogProto` is a good plus if it is used. #### Teams -* Xin Pan (xpan@google.com, github: panyx0718) +* Xin Pan * Chris Antaki * Yao Zhang * Jon Shlens diff --git a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc index d05143aff9b..e968b9c97e2 100644 --- a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc +++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc @@ -53,10 +53,13 @@ class TFProfAdvisorTest : public ::testing::Test { NodeExecStats node_stat; node_stat.set_all_start_micros(start_miros); node_stat.set_op_end_rel_micros(end_rel_micros); - node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0", node_stat); - node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0:stream:all", + node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0", node_stat); - node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0:stream:0", + node->AddStepStat(step, + "/job:localhost/replica:0/task:0/device:GPU:0:stream:all", + node_stat); + node->AddStepStat(step, + "/job:localhost/replica:0/task:0/device:GPU:0:stream:0", + node_stat); return node; } diff --git a/tensorflow/core/profiler/internal/tfprof_op.cc b/tensorflow/core/profiler/internal/tfprof_op.cc index 5a8429d4893..3dce1d85db3 100644 --- a/tensorflow/core/profiler/internal/tfprof_op.cc +++ b/tensorflow/core/profiler/internal/tfprof_op.cc @@ -113,8 +113,9 @@ const ShowMultiNode* TFOp::ShowInternal(const Options& opts, root_->formatted_str = FormatNode(root_.get(), root_.get(), opts); } if (timeline) { - fprintf(stderr, "op view doesn't support timeline yet. " - "Consider graph/scope/code view.\n"); + fprintf(stderr, + "op view doesn't support timeline yet. 
" + "Consider graph/scope/code view.\n"); return root_.get(); } if (cnodes_map_.empty()) { @@ -265,9 +266,9 @@ string TFOp::FormatNode(OpNode* node, OpNode* root, const Options& opts) const { double pct = 0.0; if (node->proto().total_parameters() > 0) { accu_pct = 100.0 * node->proto().total_parameters() / - root->proto().total_parameters(); - pct = 100.0 * node->proto().parameters() / - root->proto().total_parameters(); + root->proto().total_parameters(); + pct = + 100.0 * node->proto().parameters() / root->proto().total_parameters(); } attrs.push_back(strings::Printf( "%30s", @@ -282,9 +283,8 @@ string TFOp::FormatNode(OpNode* node, OpNode* root, const Options& opts) const { double pct = 0.0; if (node->proto().total_float_ops() > 0) { accu_pct = 100.0 * node->proto().total_float_ops() / - root->proto().total_float_ops(); - pct = 100.0 * node->proto().float_ops() / - root->proto().total_float_ops(); + root->proto().total_float_ops(); + pct = 100.0 * node->proto().float_ops() / root->proto().total_float_ops(); } attrs.push_back(strings::Printf( diff --git a/tensorflow/core/profiler/internal/tfprof_op.h b/tensorflow/core/profiler/internal/tfprof_op.h index fe1c3b2ae82..aa22182d36c 100644 --- a/tensorflow/core/profiler/internal/tfprof_op.h +++ b/tensorflow/core/profiler/internal/tfprof_op.h @@ -41,8 +41,7 @@ namespace tfprof { // to input ops. class TFOp : public TFMultiShow { public: - explicit TFOp() - : TFMultiShow() {} + explicit TFOp() : TFMultiShow() {} ~TFOp() override {} void AddNode(TFGraphNode* node) override; @@ -51,7 +50,7 @@ class TFOp : public TFMultiShow { private: const ShowMultiNode* ShowInternal(const Options& opts, - Timeline* timeline) override; + Timeline* timeline) override; int64 SearchRoot(const std::vector nodes, const std::vector& regexes); diff --git a/tensorflow/core/profiler/internal/tfprof_show.h b/tensorflow/core/profiler/internal/tfprof_show.h index 4d6de060705..81b021549a4 100644 --- a/tensorflow/core/profiler/internal/tfprof_show.h +++ b/tensorflow/core/profiler/internal/tfprof_show.h @@ -78,40 +78,43 @@ class TFShow { return nodes; } std::vector sorted_nodes = nodes; - std::sort(sorted_nodes.begin(), sorted_nodes.end(), [&opts](const T* n1, - const T* n2) { - if (n1->name() == kTFProfRoot) return true; - if (n2->name() == kTFProfRoot) return false; - bool name_cmp = n1->name() < n2->name(); - if (opts.order_by == kOrderBy[0]) { - return name_cmp; - } else if (opts.order_by == kOrderBy[1]) { - return n1->proto().total_requested_bytes() > - n2->proto().total_requested_bytes(); - } else if (opts.order_by == kOrderBy[2]) { - return n1->proto().total_peak_bytes() > n2->proto().total_peak_bytes(); - } else if (opts.order_by == kOrderBy[3]) { - return n1->proto().total_residual_bytes() > - n2->proto().total_residual_bytes(); - } else if (opts.order_by == kOrderBy[4]) { - return n1->proto().total_output_bytes() > - n2->proto().total_output_bytes(); - } else if (opts.order_by == kOrderBy[5]) { - return n1->proto().total_exec_micros() > - n2->proto().total_exec_micros(); - } else if (opts.order_by == kOrderBy[6]) { - return n1->proto().total_accelerator_exec_micros() > - n2->proto().total_accelerator_exec_micros(); - } else if (opts.order_by == kOrderBy[7]) { - return n1->proto().total_cpu_exec_micros() > - n2->proto().total_cpu_exec_micros(); - } else if (opts.order_by == kOrderBy[8]) { - return n1->proto().total_parameters() > n2->proto().total_parameters(); - } else if (opts.order_by == kOrderBy[9]) { - return n1->proto().total_float_ops() > 
n2->proto().total_float_ops(); - } - return name_cmp; - }); + std::sort(sorted_nodes.begin(), sorted_nodes.end(), + [&opts](const T* n1, const T* n2) { + if (n1->name() == kTFProfRoot) return true; + if (n2->name() == kTFProfRoot) return false; + bool name_cmp = n1->name() < n2->name(); + if (opts.order_by == kOrderBy[0]) { + return name_cmp; + } else if (opts.order_by == kOrderBy[1]) { + return n1->proto().total_requested_bytes() > + n2->proto().total_requested_bytes(); + } else if (opts.order_by == kOrderBy[2]) { + return n1->proto().total_peak_bytes() > + n2->proto().total_peak_bytes(); + } else if (opts.order_by == kOrderBy[3]) { + return n1->proto().total_residual_bytes() > + n2->proto().total_residual_bytes(); + } else if (opts.order_by == kOrderBy[4]) { + return n1->proto().total_output_bytes() > + n2->proto().total_output_bytes(); + } else if (opts.order_by == kOrderBy[5]) { + return n1->proto().total_exec_micros() > + n2->proto().total_exec_micros(); + } else if (opts.order_by == kOrderBy[6]) { + return n1->proto().total_accelerator_exec_micros() > + n2->proto().total_accelerator_exec_micros(); + } else if (opts.order_by == kOrderBy[7]) { + return n1->proto().total_cpu_exec_micros() > + n2->proto().total_cpu_exec_micros(); + } else if (opts.order_by == kOrderBy[8]) { + return n1->proto().total_parameters() > + n2->proto().total_parameters(); + } else if (opts.order_by == kOrderBy[9]) { + return n1->proto().total_float_ops() > + n2->proto().total_float_ops(); + } + return name_cmp; + }); return sorted_nodes; } diff --git a/tensorflow/core/profiler/internal/tfprof_show_multi.h b/tensorflow/core/profiler/internal/tfprof_show_multi.h index 2a2208d8e78..711d35f9753 100644 --- a/tensorflow/core/profiler/internal/tfprof_show_multi.h +++ b/tensorflow/core/profiler/internal/tfprof_show_multi.h @@ -50,7 +50,7 @@ class TFMultiShow { protected: virtual const ShowMultiNode* ShowInternal(const Options& opts, - Timeline* timeline) = 0; + Timeline* timeline) = 0; bool LookUpCheckPoint(const string& name, std::unique_ptr* tensor); diff --git a/tensorflow/core/profiler/internal/tfprof_timeline.h b/tensorflow/core/profiler/internal/tfprof_timeline.h index 651ad3f0c1c..baf3fb2bedb 100644 --- a/tensorflow/core/profiler/internal/tfprof_timeline.h +++ b/tensorflow/core/profiler/internal/tfprof_timeline.h @@ -20,8 +20,8 @@ limitations under the License. 
#include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/step_stats.pb.h" #include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/profiler/internal/tfprof_node_show.h" +#include "tensorflow/core/protobuf/config.pb.h" namespace tensorflow { namespace tfprof { diff --git a/tensorflow/core/profiler/profiler.cc b/tensorflow/core/profiler/profiler.cc index 2cc212d5898..808e3c853be 100644 --- a/tensorflow/core/profiler/profiler.cc +++ b/tensorflow/core/profiler/profiler.cc @@ -206,8 +206,12 @@ int Run(int argc, char** argv) { "graph_path,op_log_path,run_meta_path\n"); std::unique_ptr graph(new GraphDef()); if (!FLAGS_graph_path.empty()) { - TF_CHECK_OK( - ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false)); + s = ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false); + if (!s.ok()) { + fprintf(stderr, "Failed to read graph_path: %s\n", + s.ToString().c_str()); + return 1; + } } std::unique_ptr op_log(new OpLogProto()); diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 67da7bf4526..b02f899b87d 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -24,7 +24,7 @@ limitations under the License. // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1", // "-beta", "-rc", "-rc.1") -#define TF_VERSION_SUFFIX "-rc1" +#define TF_VERSION_SUFFIX "" #define TF_STR_HELPER(x) #x #define TF_STR(x) TF_STR_HELPER(x) diff --git a/tensorflow/core/util/bcast.cc b/tensorflow/core/util/bcast.cc index 1eab7e3d024..3a5f1f83af8 100644 --- a/tensorflow/core/util/bcast.cc +++ b/tensorflow/core/util/bcast.cc @@ -69,9 +69,9 @@ BCast::BCast(const Vec& sx, const Vec& sy, const bool fewer_dims_optimization) { State curr = UNKNOWN; const int64 x_i = x[i]; // i-th dimension of x. const int64 y_i = y[i]; // i-th dimension of y. - int64 o_i; // i-th dimension of the output. - int64 bx_i; // i-th broadcast for x. - int64 by_i; // i-th broadcast for y. + int64 o_i; // i-th dimension of the output. + int64 bx_i; // i-th broadcast for x. + int64 by_i; // i-th broadcast for y. // Invariant: // o_i = x_i * bx_i = y_i * by_i if (x_i == y_i) { diff --git a/tensorflow/core/util/ctc/ctc_loss_calculator.h b/tensorflow/core/util/ctc/ctc_loss_calculator.h index be00895b0d3..dd1163310bf 100644 --- a/tensorflow/core/util/ctc/ctc_loss_calculator.h +++ b/tensorflow/core/util/ctc/ctc_loss_calculator.h @@ -130,13 +130,13 @@ Status CTCLossCalculator::CalculateLoss( for (int t = 1; t < num_time_steps; ++t) { if (inputs[t].rows() != batch_size) { return errors::InvalidArgument("Expected batch size at t: ", t, - " to be: ", batch_size, " but got: ", - inputs[t].rows()); + " to be: ", batch_size, + " but got: ", inputs[t].rows()); } if (inputs[t].cols() != num_classes) { return errors::InvalidArgument("Expected class count at t: ", t, - " to be: ", num_classes, " but got: ", - inputs[t].cols()); + " to be: ", num_classes, + " but got: ", inputs[t].cols()); } } @@ -282,8 +282,8 @@ Status CTCLossCalculator::PopulateLPrimes( LabelSequences* l_primes) const { // labels is a Label array of size batch_size if (labels.size() != batch_size) { - return errors::InvalidArgument("labels.size() != batch_size: ", - labels.size(), " vs. ", batch_size); + return errors::InvalidArgument( + "labels.size() != batch_size: ", labels.size(), " vs. ", batch_size); } *max_u_prime = 0; // keep track of longest l' modified label sequence. 
@@ -325,12 +325,13 @@ Status CTCLossCalculator::PopulateLPrimes( for (int l_i : l) { if (l_i < 0) { return errors::InvalidArgument( - "All labels must be nonnegative integers, batch: ", b, " labels: ", - str_util::Join(l, ",")); + "All labels must be nonnegative integers, batch: ", b, + " labels: ", str_util::Join(l, ",")); } else if (l_i >= num_classes) { return errors::InvalidArgument( - "No label may be greater than num_classes. ", "num_classes: ", - num_classes, ", batch: ", b, " labels: ", str_util::Join(l, ",")); + "No label may be greater than num_classes. ", + "num_classes: ", num_classes, ", batch: ", b, + " labels: ", str_util::Join(l, ",")); } } if (!ignore_longer_outputs_than_inputs) { diff --git a/tensorflow/core/util/cuda_device_functions.h b/tensorflow/core/util/cuda_device_functions.h index f787687f662..525bef16a02 100644 --- a/tensorflow/core/util/cuda_device_functions.h +++ b/tensorflow/core/util/cuda_device_functions.h @@ -29,13 +29,8 @@ limitations under the License. #include #include #include "cuda/include/cuda.h" -#include "cuda/include/device_functions.h" #include "tensorflow/core/platform/types.h" -#if CUDA_VERSION >= 7050 -#include "cuda/include/cuda_fp16.h" -#endif // CUDA_VERSION >= 7050 - namespace tensorflow { namespace detail { diff --git a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc index bd4c356ea01..732ed33ede1 100644 --- a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc +++ b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc @@ -149,27 +149,27 @@ class CudaLaunchConfigTest : public ::testing::Test { TEST_F(CudaLaunchConfigTest, GetCudaLaunchConfig) { CudaLaunchConfig cfg; - // test valid inputs - #define TEST_LAUNCH_PARAMETER(work_element_count) \ - cfg = GetCudaLaunchConfig(bufsize, d); \ - SetOutbufZero<<>> \ - (cfg, outbuf); \ - CUDA_ASSERT_SUCCESS \ - cfg = GetCudaLaunchConfig(work_element_count, d); \ - Count1D<<>> ( \ - cfg, bufsize, outbuf); \ - CUDA_EXPECT_SUCCESS \ - EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0));\ - \ - cfg = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \ - SetOutbufZero<<>> \ - (cfg, outbuf); \ - CUDA_ASSERT_SUCCESS \ - cfg = GetCudaLaunchConfig(work_element_count, d, Count1D, 0, 0); \ - Count1D<<>> ( \ - cfg, bufsize, outbuf); \ - CUDA_EXPECT_SUCCESS \ - EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)) +// test valid inputs +#define TEST_LAUNCH_PARAMETER(work_element_count) \ + cfg = GetCudaLaunchConfig(bufsize, d); \ + SetOutbufZero<<>>( \ + cfg, outbuf); \ + CUDA_ASSERT_SUCCESS \ + cfg = GetCudaLaunchConfig(work_element_count, d); \ + Count1D<<>>( \ + cfg, bufsize, outbuf); \ + CUDA_EXPECT_SUCCESS \ + EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)); \ + \ + cfg = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \ + SetOutbufZero<<>>( \ + cfg, outbuf); \ + CUDA_ASSERT_SUCCESS \ + cfg = GetCudaLaunchConfig(work_element_count, d, Count1D, 0, 0); \ + Count1D<<>>( \ + cfg, bufsize, outbuf); \ + CUDA_EXPECT_SUCCESS \ + EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)) TEST_LAUNCH_PARAMETER(128); TEST_LAUNCH_PARAMETER(129); @@ -181,7 +181,7 @@ TEST_F(CudaLaunchConfigTest, GetCudaLaunchConfig) { TEST_LAUNCH_PARAMETER(8192); TEST_LAUNCH_PARAMETER(123456); TEST_LAUNCH_PARAMETER(1 << 30); - #undef TEST_LAUNCH_PARAMETER +#undef TEST_LAUNCH_PARAMETER } bool operator==(const Cuda2DLaunchConfig& a, const Cuda2DLaunchConfig& b) { @@ -200,27 +200,27 @@ 
TEST_F(CudaLaunchConfigTest, GetCuda2DLaunchConfig) { Cuda2DLaunchConfig cfg; CudaLaunchConfig cfg1d; - // test valid inputs - #define TEST_LAUNCH_PARAMETER(dimx, dimy) \ - cfg1d = GetCudaLaunchConfig(bufsize, d); \ - SetOutbufZero<<>> \ - (cfg1d, outbuf);\ - CUDA_ASSERT_SUCCESS \ - cfg = GetCuda2DLaunchConfig(dimx, dimy, d); \ - Count2D<<>> ( \ - cfg, bufsize, outbuf); \ - CUDA_EXPECT_SUCCESS \ - EXPECT_EQ(dimx * dimy, std::accumulate(outbuf, outbuf + bufsize, 0)); \ - \ - cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \ - SetOutbufZero<<>> \ - (cfg1d, outbuf);\ - CUDA_ASSERT_SUCCESS \ - cfg = GetCuda2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0); \ - Count2D<<>> ( \ - cfg, bufsize, outbuf); \ - CUDA_EXPECT_SUCCESS \ - EXPECT_EQ(dimx * dimy, std::accumulate(outbuf, outbuf + bufsize, 0)) +// test valid inputs +#define TEST_LAUNCH_PARAMETER(dimx, dimy) \ + cfg1d = GetCudaLaunchConfig(bufsize, d); \ + SetOutbufZero<<>>( \ + cfg1d, outbuf); \ + CUDA_ASSERT_SUCCESS \ + cfg = GetCuda2DLaunchConfig(dimx, dimy, d); \ + Count2D<<>>( \ + cfg, bufsize, outbuf); \ + CUDA_EXPECT_SUCCESS \ + EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0)); \ + \ + cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \ + SetOutbufZero<<>>( \ + cfg1d, outbuf); \ + CUDA_ASSERT_SUCCESS \ + cfg = GetCuda2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0); \ + Count2D<<>>( \ + cfg, bufsize, outbuf); \ + CUDA_EXPECT_SUCCESS \ + EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0)) TEST_LAUNCH_PARAMETER(128, 128); TEST_LAUNCH_PARAMETER(129, 64); @@ -233,24 +233,24 @@ TEST_F(CudaLaunchConfigTest, GetCuda2DLaunchConfig) { TEST_LAUNCH_PARAMETER(123456, 12); TEST_LAUNCH_PARAMETER(1, 1 << 30); TEST_LAUNCH_PARAMETER(1 << 30, 1); - #undef TEST_LAUNCH_PARAMETER +#undef TEST_LAUNCH_PARAMETER } TEST_F(CudaLaunchConfigTest, GetCuda3DLaunchConfig) { Cuda3DLaunchConfig cfg; CudaLaunchConfig cfg1d; - // test valid inputs - #define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz) \ - cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \ - SetOutbufZero<<>> \ - (cfg1d, outbuf);\ - CUDA_ASSERT_SUCCESS \ - cfg = GetCuda3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0); \ - Count3D<<>> ( \ - cfg, bufsize, outbuf); \ - CUDA_EXPECT_SUCCESS \ - EXPECT_EQ(dimx * dimy * dimz, std::accumulate(outbuf, outbuf + bufsize, 0)) +// test valid inputs +#define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz) \ + cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \ + SetOutbufZero<<>>( \ + cfg1d, outbuf); \ + CUDA_ASSERT_SUCCESS \ + cfg = GetCuda3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0); \ + Count3D<<>>( \ + cfg, bufsize, outbuf); \ + CUDA_EXPECT_SUCCESS \ + EXPECT_EQ(dimx* dimy* dimz, std::accumulate(outbuf, outbuf + bufsize, 0)) TEST_LAUNCH_PARAMETER(128, 128, 128); TEST_LAUNCH_PARAMETER(129, 64, 1024); @@ -264,7 +264,7 @@ TEST_F(CudaLaunchConfigTest, GetCuda3DLaunchConfig) { TEST_LAUNCH_PARAMETER(1, 1, 1 << 30); TEST_LAUNCH_PARAMETER(1, 1 << 30, 1); TEST_LAUNCH_PARAMETER(1 << 30, 1, 1); - #undef TEST_LAUNCH_PARAMETER +#undef TEST_LAUNCH_PARAMETER } TEST(CudaDeviceFunctionsTest, ShuffleGetSrcLane) { diff --git a/tensorflow/core/util/example_proto_fast_parsing_test.cc b/tensorflow/core/util/example_proto_fast_parsing_test.cc index 9b6a8e12511..13e41c17f7c 100644 --- a/tensorflow/core/util/example_proto_fast_parsing_test.cc +++ b/tensorflow/core/util/example_proto_fast_parsing_test.cc @@ -57,6 +57,7 @@ void TestCorrectness(const string& serialized) { Example example; Example fast_example; 
EXPECT_TRUE(example.ParseFromString(serialized)); + example.DiscardUnknownFields(); EXPECT_TRUE(TestFastParse(serialized, &fast_example)); EXPECT_EQ(example.DebugString(), fast_example.DebugString()); if (example.DebugString() != fast_example.DebugString()) { diff --git a/tensorflow/core/util/example_proto_helper.cc b/tensorflow/core/util/example_proto_helper.cc index 41f56d2daa4..e156a3bc8f0 100644 --- a/tensorflow/core/util/example_proto_helper.cc +++ b/tensorflow/core/util/example_proto_helper.cc @@ -247,8 +247,9 @@ Status SingleExampleProtoToTensors( bool types_match; TF_RETURN_IF_ERROR(CheckTypesMatch(f, dtype, &types_match)); if (!types_match) { - return errors::InvalidArgument("Name: ", example_name, ", Feature: ", - key, ". Data types don't match. ", + return errors::InvalidArgument("Name: ", example_name, + ", Feature: ", key, + ". Data types don't match. ", "Expected type: ", DataTypeString(dtype), " Feature is: ", ProtoDebugString(f)); } @@ -278,8 +279,9 @@ Status SingleExampleProtoToTensors( bool types_match; TF_RETURN_IF_ERROR(CheckTypesMatch(f, dtype, &types_match)); if (!types_match) { - return errors::InvalidArgument("Name: ", example_name, ", Feature: ", - key, ". Data types don't match. ", + return errors::InvalidArgument("Name: ", example_name, + ", Feature: ", key, + ". Data types don't match. ", "Expected type: ", DataTypeString(dtype), " Feature is: ", ProtoDebugString(f)); } diff --git a/tensorflow/core/util/memmapped_file_system_test.cc b/tensorflow/core/util/memmapped_file_system_test.cc index 616eb5dac32..504d2d353f8 100644 --- a/tensorflow/core/util/memmapped_file_system_test.cc +++ b/tensorflow/core/util/memmapped_file_system_test.cc @@ -144,8 +144,8 @@ TEST(MemmappedFileSystemTest, ProxyToDefault) { TF_ASSERT_OK(memmapped_env.NewAppendableFile(filename, &writable_file_temp)); // Making sure to clean up after the test finishes. const auto adh = [&memmapped_env, &filename](WritableFile* f) { - delete f; - TF_CHECK_OK(memmapped_env.DeleteFile(filename)); + delete f; + TF_CHECK_OK(memmapped_env.DeleteFile(filename)); }; std::unique_ptr writable_file( writable_file_temp.release(), adh); diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 2caf5fc56da..4467373c006 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -35,7 +35,7 @@ limitations under the License. #include "tensorflow/core/util/padding.h" #include "tensorflow/core/util/tensor_format.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::engine; @@ -210,31 +210,32 @@ class MklShape { CHECK_EQ(dnnDelete_F32(convert), E_SUCCESS); } -// The following methods are used for serializing and de-serializing the -// contents of the mklshape object. -// The data is serialized in this order -// isMklTensor_ -// dimension_ -// sizes_ -// strides_ -// mklLayout_ -// tfLayout_ -// tf_to_mkl_dim_map_ + // The following methods are used for serializing and de-serializing the + // contents of the mklshape object. 
+ // The data is serialized in this order + // isMklTensor_ + // dimension_ + // sizes_ + // strides_ + // mklLayout_ + // tfLayout_ + // tf_to_mkl_dim_map_ #define SIZE_OF_MKL_DNN_BUF \ (dnnLayoutSerializationBufferSize_F32()) // Size of buffer needed to // serialize dnn_layout pointer -// Size of buffer to hold the serialized object, the size is computed as follows -// sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes_) + sizeof(strides_) -// + sizeof(mklLayout_ buffer) + sizeof(tfLayout_ buffer) -// + sizeof(tf_to_mkl_dim_map_) + // Size of buffer to hold the serialized object, the size is computed as + // follows sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes_) + + // sizeof(strides_) + // + sizeof(mklLayout_ buffer) + sizeof(tfLayout_ buffer) + // + sizeof(tf_to_mkl_dim_map_) #define SIZE_OF_MKL_SERIAL_DATA(dims) \ (2 * sizeof(size_t) + 3 * dims * sizeof(size_t) + 2 * SIZE_OF_MKL_DNN_BUF) -// First we need to define some macro for offsets into the serial buffer where -// different elements of Mklshape is written/read from + // First we need to define some macro for offsets into the serial buffer where + // different elements of Mklshape is written/read from #define IS_MKL_TENSOR_OFFSET 0 // Location from start of buffer where isMklTensor_ is serialized @@ -324,7 +325,7 @@ class MklShape { nullptr; // TF dimension corresponding to this MKL dimension }; -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML // Forward decl TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format); @@ -388,7 +389,7 @@ class MklDnnShape { /// Equality function for MklDnnShape objects /// @return true if both are equal; false otherwise. - inline bool operator == (const MklDnnShape& input_shape) const { + inline bool operator==(const MklDnnShape& input_shape) const { if (this->IsMklTensor() != input_shape.IsMklTensor()) { return false; } @@ -406,7 +407,7 @@ class MklDnnShape { /// Equality operator for MklDnnShape and TFShape. 
/// Returns: true if TF shapes for both are the same, false otherwise - inline bool operator == (const TensorShape& input_shape) const { + inline bool operator==(const TensorShape& input_shape) const { if (!this->IsMklTensor()) { return false; } @@ -425,7 +426,7 @@ class MklDnnShape { inline size_t GetDimension(char dimension) const { int index = GetMklDnnTensorDimIndex(dimension); CHECK(index >= 0 && index < this->GetDimension()) - << "Invalid index from the dimension: " << index << ", " << dimension; + << "Invalid index from the dimension: " << index << ", " << dimension; return this->DimSize(index); } @@ -659,7 +660,7 @@ class MklDnnShape { typedef std::vector MklShapeList; -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML typedef std::vector MklDnnShapeList; #endif @@ -673,7 +674,7 @@ inline bool AreAllMklTensors(const MklShapeList& shapes) { return true; } -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, const MklShape& mkl_shape) { @@ -705,8 +706,8 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, Tensor output_tensor; TensorShape output_shape; - TF_CHECK_OK(Status(error::Code::UNIMPLEMENTED, - "Unimplemented conversion function")); + TF_CHECK_OK( + Status(error::Code::UNIMPLEMENTED, "Unimplemented conversion function")); return output_tensor; } @@ -724,7 +725,7 @@ inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) { sizeof(uint8)); } -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML inline void GetMklShape(OpKernelContext* ctext, int n, MklDnnShape* mklshape) { mklshape->DeSerializeMklDnnShape( ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs())) @@ -749,7 +750,7 @@ inline void GetMklInputList(OpKernelContext* ctext, StringPiece name, } -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name, MklShapeList* mkl_shapes) { @@ -779,7 +780,7 @@ inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name, #endif -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML /// Get shape of input tensor pointed by 'input_idx' in TensorShape format. /// If the input tensor is in MKL layout, then obtains TensorShape from /// MklShape. @@ -814,7 +815,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, second_tensor->flat().size() * sizeof(uint8)); } -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML // Allocate the second output tensor that will contain // the MKL shape serialized inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, @@ -851,7 +852,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, second_tensor->flat().size() * sizeof(uint8)); } -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML // Allocate the output tensor, create a second output tensor that will contain // the MKL shape serialized inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, @@ -875,7 +876,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, // Allocates a temp tensor and returns the data buffer for temporary storage. 
// Currently -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML template inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out, const memory::primitive_desc& pd, void** buf_out) { @@ -973,8 +974,8 @@ inline int64 GetMklTensorDim(const MklShape& mkl_shape, char dimension) { return mkl_shape.dim_size(index); } -inline void CopyMklTensorInToOut(OpKernelContext* context, - int idx_in, int idx_out) { +inline void CopyMklTensorInToOut(OpKernelContext* context, int idx_in, + int idx_out) { int num_inputs = context->num_inputs(); int num_outputs = context->num_outputs(); int idx_data_in = GetTensorDataIndex(idx_in, num_inputs); @@ -994,9 +995,9 @@ inline void CopyMklTensorInToOut(OpKernelContext* context, context->set_output(idx_meta_out, meta_output); } -#ifndef INTEL_MKL_DNN -inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, - int idx_in, int idx_out, +#ifdef INTEL_MKL_ML +inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, int idx_in, + int idx_out, const TensorShape& shape) { int num_inputs = context->num_inputs(); int num_outputs = context->num_outputs(); @@ -1013,8 +1014,8 @@ inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, context->set_output(idx_data_out, output); } #else -inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, - int idx_in, int idx_out, +inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, int idx_in, + int idx_out, const TensorShape& shape) { int num_inputs = context->num_inputs(); int num_outputs = context->num_outputs(); @@ -1032,10 +1033,10 @@ inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, } #endif -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML -inline void ForwardTfTensorInToOut(OpKernelContext* context, - int idx_in, int idx_out) { +inline void ForwardTfTensorInToOut(OpKernelContext* context, int idx_in, + int idx_out) { int num_inputs = context->num_inputs(); int num_outputs = context->num_outputs(); int idx_data_in = GetTensorDataIndex(idx_in, num_inputs); @@ -1053,8 +1054,8 @@ inline void ForwardTfTensorInToOut(OpKernelContext* context, #else -inline void ForwardTfTensorInToOut(OpKernelContext* context, - int idx_in, int idx_out) { +inline void ForwardTfTensorInToOut(OpKernelContext* context, int idx_in, + int idx_out) { int num_inputs = context->num_inputs(); int num_outputs = context->num_outputs(); int idx_data_in = GetTensorDataIndex(idx_in, num_inputs); @@ -1072,8 +1073,8 @@ inline void ForwardTfTensorInToOut(OpKernelContext* context, #endif -inline void ForwardMklTensorInToOut(OpKernelContext* context, - int idx_in, int idx_out) { +inline void ForwardMklTensorInToOut(OpKernelContext* context, int idx_in, + int idx_out) { int num_inputs = context->num_inputs(); int num_outputs = context->num_outputs(); int idx_data_in = GetTensorDataIndex(idx_in, num_inputs); @@ -1090,10 +1091,10 @@ inline void ForwardMklTensorInToOut(OpKernelContext* context, } } -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML inline void ForwardMklTensorInToOutWithMklShape(OpKernelContext* context, - int idx_in, int idx_out, - const MklDnnShape& mkl_shape) { + int idx_in, int idx_out, + const MklDnnShape& mkl_shape) { int num_inputs = context->num_inputs(); int num_outputs = context->num_outputs(); int idx_data_in = GetTensorDataIndex(idx_in, num_inputs); @@ -1132,7 +1133,7 @@ inline void SetDummyMklShapeOutput(OpKernelContext* context, AllocateOutputSetMklShape(context, idx_data_out, mkl_shape_output); } -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML // We don't need these functions in MKLDNN. 
We have defined equality operator // on MklDnnShape class directly. @@ -1216,11 +1217,11 @@ inline void MklNHWCToNCHW(const Tensor& input, Tensor** output) { int64 H = input.dim_size(1); int64 W = input.dim_size(2); int64 C = input.dim_size(3); - int64 stride_n = H*W*C; -# pragma omp parallel for num_threads(16) + int64 stride_n = H * W * C; +#pragma omp parallel for num_threads(16) for (int64 n = 0; n < N; ++n) { - mkl_somatcopy('R', 'T', H*W, C, 1, buf_in + n*stride_n, C, - buf_out + n*stride_n, H*W); + mkl_somatcopy('R', 'T', H * W, C, 1, buf_in + n * stride_n, C, + buf_out + n * stride_n, H * W); } } @@ -1232,17 +1233,17 @@ inline void MklNCHWToNHWC(const Tensor& input, Tensor** output) { int64 H = (*output)->dim_size(1); int64 W = (*output)->dim_size(2); int64 C = (*output)->dim_size(3); - int64 stride_n = H*W*C; -# pragma omp parallel for num_threads(16) + int64 stride_n = H * W * C; +#pragma omp parallel for num_threads(16) for (int64 n = 0; n < N; ++n) { - mkl_somatcopy('R', 'T', C, H*W, 1, buf_in + n*stride_n, H*W, - buf_out + n*stride_n, C); + mkl_somatcopy('R', 'T', C, H * W, 1, buf_in + n * stride_n, H * W, + buf_out + n * stride_n, C); } } // ------------------------------------------------------------------- -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML /// Return MKL-DNN data type (memory::data_type) for input type T /// @@ -1279,10 +1280,11 @@ inline memory::format TFDataFormatToMklDnnDataFormat(TensorFormat format) { /// @return: Tensorflow data format corresponding to memory::format /// Fails with an error if invalid data format. inline TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format) { - if (format == memory::format::nhwc) return FORMAT_NHWC; - else if (format == memory::format::nchw) return FORMAT_NCHW; - TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, - "Unsupported data format")); + if (format == memory::format::nhwc) + return FORMAT_NHWC; + else if (format == memory::format::nchw) + return FORMAT_NCHW; + TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format")); // Return to prevent compiler warnings, otherwise TF_CHECK_OK will ensure // that we don't come here. @@ -1425,7 +1427,6 @@ inline memory::desc CreateBlockedMemDescHelper(const memory::dims& dim, return memory::desc(md); } - /* * Class to represent all the resources corresponding to a tensor in TensorFlow * that are required to execute an operation (such as Convolution). @@ -1494,7 +1495,7 @@ class MklDnnData { /// @return: memory::desc object corresponding to blocked memory format /// for given dimensions and strides. static inline memory::desc CreateBlockedMemDesc(const memory::dims& dim, - const memory::dims& strides) { + const memory::dims& strides) { return CreateBlockedMemDescHelper(dim, strides, MklDnnType()); } @@ -1563,7 +1564,6 @@ class MklDnnData { return user_memory_->get_primitive_desc(); } - /// Get function for descriptor of user memory. inline memory::desc GetUsrMemDesc() { // This is ugly. Why MKL-DNN does not provide desc() method of const type?? @@ -1634,7 +1634,8 @@ class MklDnnData { /// @return: true in case reorder of input is needed; false, otherwise. 
inline bool IsReorderNeeded(const memory::format& target_format) const { CHECK_NOTNULL(user_memory_); - return target_format != user_memory_->get_primitive_desc().desc().data.format; + return target_format != + user_memory_->get_primitive_desc().desc().data.format; } /// Function to create a reorder from memory pointed by from to memory pointed @@ -1753,7 +1754,7 @@ class MklDnnData { } }; -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML } // namespace tensorflow #endif // INTEL_MKL diff --git a/tensorflow/core/util/mkl_util_test.cc b/tensorflow/core/util/mkl_util_test.cc index 8b73eadb400..cd1d0713ad5 100644 --- a/tensorflow/core/util/mkl_util_test.cc +++ b/tensorflow/core/util/mkl_util_test.cc @@ -22,7 +22,7 @@ limitations under the License. namespace tensorflow { namespace { -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML TEST(MklUtilTest, MklDnnTfShape) { auto cpu_engine = engine(engine::cpu, 0); @@ -84,7 +84,7 @@ TEST(MklUtilTest, MklDnnBlockedFormatTest) { EXPECT_EQ(b_md2.data.format, mkldnn_blocked); } -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML } // namespace } // namespace tensorflow diff --git a/tensorflow/core/util/presized_cuckoo_map.h b/tensorflow/core/util/presized_cuckoo_map.h index e7dab830f0e..f88ad2faaff 100644 --- a/tensorflow/core/util/presized_cuckoo_map.h +++ b/tensorflow/core/util/presized_cuckoo_map.h @@ -67,7 +67,7 @@ inline uint64 multiply_high_u64(uint64 x, uint64 y) { return prod_hi + (prod_mid1 >> 32) + (prod_mid2 >> 32) + carry; #endif } -} +} // namespace presized_cuckoo_map template class PresizedCuckooMap { diff --git a/tensorflow/core/util/reporter_test.cc b/tensorflow/core/util/reporter_test.cc index 1cb07718fee..575c27d4ef7 100644 --- a/tensorflow/core/util/reporter_test.cc +++ b/tensorflow/core/util/reporter_test.cc @@ -29,8 +29,8 @@ namespace { // Tests of all the error paths in log_reader.cc follow: static void ExpectHasSubstr(StringPiece s, StringPiece expected) { - EXPECT_TRUE(StringPiece(s).contains(expected)) << s << " does not contain " - << expected; + EXPECT_TRUE(StringPiece(s).contains(expected)) + << s << " does not contain " << expected; } TEST(TestReporter, NoLogging) { diff --git a/tensorflow/core/util/sparse/sparse_tensor.h b/tensorflow/core/util/sparse/sparse_tensor.h index f2401a0af4e..258ee418c14 100644 --- a/tensorflow/core/util/sparse/sparse_tensor.h +++ b/tensorflow/core/util/sparse/sparse_tensor.h @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/types.h" @@ -31,7 +32,6 @@ limitations under the License. 
#include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/sparse/dim_comparator.h" #include "tensorflow/core/util/sparse/group_iterator.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { namespace sparse { @@ -59,8 +59,8 @@ class SparseTensor { shape_(shape.begin(), shape.end()), order_(order.begin(), order.end()), dims_(GetDimsFromIx(ix)) { - CHECK_EQ(ix.dtype(), DT_INT64) << "indices must be type int64 but got: " - << ix.dtype(); + CHECK_EQ(ix.dtype(), DT_INT64) + << "indices must be type int64 but got: " << ix.dtype(); CHECK(TensorShapeUtils::IsVector(vals.shape())) << "vals must be a vec, but got: " << vals.shape().DebugString(); CHECK_EQ(ix.shape().dim_size(0), vals.shape().dim_size(0)) diff --git a/tensorflow/core/util/sparse/sparse_tensor_test.cc b/tensorflow/core/util/sparse/sparse_tensor_test.cc index efdd97fd3d6..85de0320857 100644 --- a/tensorflow/core/util/sparse/sparse_tensor_test.cc +++ b/tensorflow/core/util/sparse/sparse_tensor_test.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/lib/core/status_test_util.h" @@ -25,7 +26,6 @@ limitations under the License. #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { namespace sparse { diff --git a/tensorflow/core/util/stream_executor_util.h b/tensorflow/core/util/stream_executor_util.h index 6a5ddec04c9..f7767ace716 100644 --- a/tensorflow/core/util/stream_executor_util.h +++ b/tensorflow/core/util/stream_executor_util.h @@ -41,9 +41,10 @@ class StreamExecutorUtil { // This assumes that the error codes between the two implementations // match. static Status ConvertStatus(const perftools::gputools::port::Status& s) { - return s.ok() ? Status::OK() : Status(static_cast( - static_cast(s.code())), - s.error_message()); + return s.ok() ? 
Status::OK() + : Status(static_cast( + static_cast(s.code())), + s.error_message()); } }; diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc index 579b70ab514..462b420976e 100644 --- a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc +++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc @@ -913,8 +913,8 @@ Status BundleReader::LookupSlice(StringPiece full_tensor_key, Status BundleReader::GetSliceValue(StringPiece full_tensor_key, const BundleEntryProto& full_tensor_entry, const TensorSlice& slice_spec, Tensor* val) { - using checkpoint::TensorSliceSet; using checkpoint::RegisterTensorSlice; + using checkpoint::TensorSliceSet; DCHECK_GE(full_tensor_entry.slices_size(), 0); const TensorShape full_shape(TensorShape(full_tensor_entry.shape())); diff --git a/tensorflow/core/util/tensor_slice_reader_cache.cc b/tensorflow/core/util/tensor_slice_reader_cache.cc index 0f009d7de57..424f8098a9c 100644 --- a/tensorflow/core/util/tensor_slice_reader_cache.cc +++ b/tensorflow/core/util/tensor_slice_reader_cache.cc @@ -55,7 +55,7 @@ const TensorSliceReader* TensorSliceReaderCache::GetReader( TensorSliceReader::OpenTableFunction open_function, int preferred_shard) { mutex_lock l(mu_); -#if defined(__GXX_RTTI) || defined(_CPPRTTI) +#if defined(__GXX_RTTI) || defined(_CPPRTTI) // Get the function pointer from the open_function value. TensorSliceReaderCache::OpenFuncType* func_ptr = open_function.target(); diff --git a/tensorflow/core/util/tensor_slice_set.cc b/tensorflow/core/util/tensor_slice_set.cc index 4217df90ca1..7c1d325c0a5 100644 --- a/tensorflow/core/util/tensor_slice_set.cc +++ b/tensorflow/core/util/tensor_slice_set.cc @@ -188,9 +188,9 @@ Status RegisterTensorSlice( } if (type != tss->type()) { return errors::Internal("Incompatible tensor types detected for tensor ", - name, ": existing = ", - DataTypeString(tss->type()), ", new = ", - DataTypeString(type)); + name, + ": existing = ", DataTypeString(tss->type()), + ", new = ", DataTypeString(type)); } } // Register the tensor slices without the actual data. diff --git a/tensorflow/core/util/tensor_slice_util.h b/tensorflow/core/util/tensor_slice_util.h index c7edae66b26..8f5a6f1d935 100644 --- a/tensorflow/core/util/tensor_slice_util.h +++ b/tensorflow/core/util/tensor_slice_util.h @@ -139,9 +139,9 @@ static bool CopyDataFromTensorSliceToTensorSlice(const TensorShape& shape, const TensorSlice& slice_d, const SrcT* ptr_s, DstT* ptr_d) { - CHECK_LE(shape.dims(), kTensorSliceMaxRank) << "Only tensors of size up to " - << kTensorSliceMaxRank - << " are supported"; + CHECK_LE(shape.dims(), kTensorSliceMaxRank) + << "Only tensors of size up to " << kTensorSliceMaxRank + << " are supported"; // We need to compute the intersection of the two slices. 
TensorSlice inter; if (!slice_s.Intersect(slice_d, &inter)) { diff --git a/tensorflow/core/util/tensor_slice_writer.h b/tensorflow/core/util/tensor_slice_writer.h index bdb4921e1bb..2888c66d10f 100644 --- a/tensorflow/core/util/tensor_slice_writer.h +++ b/tensorflow/core/util/tensor_slice_writer.h @@ -101,8 +101,8 @@ Status TensorSliceWriter::Add(const string& name, const TensorShape& shape, // The tensor and the slice have to be compatible if (shape.dims() != slice.dims()) { return errors::Internal("Incompatible tensor shape and slice: ", "shape = ", - shape.DebugString(), ", slice = ", - slice.DebugString()); + shape.DebugString(), + ", slice = ", slice.DebugString()); } DataType dt = DataTypeToEnum::value; // We need to add an entry for "name" if there isn't an entry already. @@ -114,9 +114,9 @@ Status TensorSliceWriter::Add(const string& name, const TensorShape& shape, CHECK_EQ(name, ssm.name()) << ProtoShortDebugString(ssm); TensorShape ssm_shape(ssm.shape()); if (!shape.IsSameSize(ssm_shape)) { - return errors::Internal("Mismatching shapes: existing tensor = ", - ssm_shape.DebugString(), ", trying to add name ", - name, ", shape = ", shape.DebugString()); + return errors::Internal( + "Mismatching shapes: existing tensor = ", ssm_shape.DebugString(), + ", trying to add name ", name, ", shape = ", shape.DebugString()); } if (dt != ssm.type()) { return errors::Internal( diff --git a/tensorflow/docs_src/about/bib.md b/tensorflow/docs_src/about/bib.md index c9f0c532c62..5593a3d95c4 100644 --- a/tensorflow/docs_src/about/bib.md +++ b/tensorflow/docs_src/about/bib.md @@ -60,7 +60,7 @@ author={ Lukasz~Kaiser and Manjunath~Kudlur and Josh~Levenberg and - Dan~Man\'{e} and + Dandelion~Man\'{e} and Rajat~Monga and Sherry~Moore and Derek~Murray and diff --git a/tensorflow/docs_src/api_guides/python/TPUEstimator.md b/tensorflow/docs_src/api_guides/python/TPUEstimator.md new file mode 100644 index 00000000000..d74d7f3181c --- /dev/null +++ b/tensorflow/docs_src/api_guides/python/TPUEstimator.md @@ -0,0 +1,396 @@ +# Using TPUs + +This document walks through the principal TensorFlow APIs necessary to make +effective use of a [Cloud TPU](https://cloud.google.com/tpu/), and highlights +the differences between regular TensorFlow usage, and usage on a TPU. + +This doc is aimed at users who: + +* Are familiar with TensorFlow's `Estimator` and `Dataset` APIs +* Have maybe [tried out a Cloud TPU](https://cloud.google.com/tpu/docs/quickstart) + using an existing model. +* Have, perhaps, skimmed the code of an example TPU model + [[1]](https://github.com/tensorflow/models/blob/master/official/mnist/mnist_tpu.py) + [[2]](https://github.com/tensorflow/tpu-demos/tree/master/cloud_tpu/models). +* Are interested in porting an existing `Estimator` model to + run on Cloud TPUs + +## TPUEstimator + +@{tf.estimator.Estimator$Estimators} are TensorFlow's model-level abstraction. +Standard `Estimators` can drive models on CPU and GPUs. You must use +@{tf.contrib.tpu.TPUEstimator} to drive a model on TPUs. + +Refer to TensorFlow's Getting Started section for an introduction to the basics +of using a @{$get_started/premade_estimators$pre-made `Estimator`}, and +@{$get_started/custom_estimators$custom `Estimator`s}. + +The `TPUEstimator` class differs somewhat from the `Estimator` class. + +The simplest way to maintain a model that can be run both on CPU/GPU or on a +Cloud TPU is to define the model's inference phase (from inputs to predictions) +outside of the `model_fn`. 
Then maintain separate implementations of the +`Estimator` setup and `model_fn`, both wrapping this inference step. For an +example of this pattern, compare the `mnist.py` and `mnist_tpu.py` implementations in +[tensorflow/models](https://github.com/tensorflow/models/tree/master/official/mnist). + +### Running a `TPUEstimator` locally + +To create a standard `Estimator` you call the constructor, and pass it a +`model_fn`, for example: + +``` +my_estimator = tf.estimator.Estimator( + model_fn=my_model_fn) +``` + +The changes required to use a @{tf.contrib.tpu.TPUEstimator} on your local +machine are relatively minor. The constructor requires two additional arguments. +You should set the `use_tpu` argument to `False`, and pass a +@{tf.contrib.tpu.RunConfig} as the `config` argument, as shown below: + +``` python +my_tpu_estimator = tf.contrib.tpu.TPUEstimator( + model_fn=my_model_fn, + config=tf.contrib.tpu.RunConfig(), + use_tpu=False) +``` + +Just this simple change will allow you to run a `TPUEstimator` locally. +The majority of example TPU models can be run in this local mode, +by setting the command line flags as follows: + + +``` +$> python mnist_tpu.py --use_tpu=false --master='' +``` + +Note: This `use_tpu=False` argument is useful for trying out the `TPUEstimator` +API. It is not meant to be a complete TPU compatibility test. Successfully +running a model locally in a `TPUEstimator` does not guarantee that it will +work on a TPU. + + +### Building a `tpu.RunConfig` + +While the default `RunConfig` is sufficient for local training, these settings +cannot be ignored in real usage. + +A more typical setup for a `RunConfig`, one that can be switched to use a Cloud +TPU, might be as follows: + +``` python +import tempfile +import subprocess + +class FLAGS(object): + use_tpu=False + tpu_name=None + # Use a local temporary path for the `model_dir` + model_dir = tempfile.mkdtemp() + # Number of training steps to run on the Cloud TPU before returning control. + iterations = 50 + # A single Cloud TPU has 8 shards. + num_shards = 8 + +if FLAGS.use_tpu: + my_project_name = subprocess.check_output([ + 'gcloud','config','get-value','project']) + my_zone = subprocess.check_output([ + 'gcloud','config','get-value','compute/zone']) + cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + tpu_names=[FLAGS.tpu_name], + zone=my_zone, + project=my_project_name) + master = cluster_resolver.get_master() +else: + master = '' + +my_tpu_run_config = tf.contrib.tpu.RunConfig( + master=master, + evaluation_master=master, + model_dir=FLAGS.model_dir, + session_config=tf.ConfigProto( + allow_soft_placement=True, log_device_placement=True), + tpu_config=tf.contrib.tpu.TPUConfig(FLAGS.iterations, + FLAGS.num_shards), +) +``` + +Then you must pass the @{tf.contrib.tpu.RunConfig} to the constructor: + +``` python +my_tpu_estimator = tf.contrib.tpu.TPUEstimator( + model_fn=my_model_fn, + config = my_tpu_run_config, + use_tpu=FLAGS.use_tpu) +``` + +Typically the `FLAGS` would be set by command line arguments (see the sketch after this list). To switch from +training locally to training on a Cloud TPU you would need to: + + 1) Set `FLAGS.use_tpu` to `True` + 1) Set `FLAGS.tpu_name` so the + `tf.contrib.cluster_resolver.TPUClusterResolver` can find it + 1) Set `FLAGS.model_dir` to a Google Cloud Storage bucket URL (`gs://`). 
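For illustration only, here is a minimal sketch of defining the same fields as real command-line flags with `tf.flags`, so the switch above does not require editing code. The flag names mirror the `FLAGS` class used earlier; the defaults and the example invocation below are assumptions, not part of the original example.

``` python
# Editorial sketch: the FLAGS class above, expressed as command-line flags.
# Flag names mirror the earlier example; the defaults here are assumptions.
import tempfile
import tensorflow as tf

tf.flags.DEFINE_bool('use_tpu', False, 'Run on a Cloud TPU instead of CPU/GPU.')
tf.flags.DEFINE_string('tpu_name', None, 'Name of the Cloud TPU to connect to.')
tf.flags.DEFINE_string('model_dir', tempfile.mkdtemp(),
                       'Checkpoint directory (a gs:// bucket when using a TPU).')
tf.flags.DEFINE_integer('iterations', 50,
                        'Training steps to run on the Cloud TPU before returning control.')
tf.flags.DEFINE_integer('num_shards', 8, 'Number of TPU shards (cores).')

FLAGS = tf.flags.FLAGS
```

With flags defined this way, the same script could be launched locally with `--use_tpu=false`, or against a (hypothetical) TPU, for example: `python my_model.py --use_tpu=true --tpu_name=my-tpu --model_dir=gs://my-bucket/model`.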
+ + +## Optimizer + +When training on a cloud TPU you **must** wrap the optimizer in a +@{tf.contrib.tpu.CrossShardOptimizer}, which uses an `allreduce` to aggregate +gradients and broadcast the result to each shard (each TPU core). + +The `CrossShardOptimizer` is not compatible with local training. So, to have +the same code run both locally and on a Cloud TPU, add lines like the following: + +``` python +optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate) +if FLAGS.use_tpu: + optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) +``` + +If you prefer to avoid a global `FLAGS` variable in your model code, one +approach is to set the optimizer as one of the `Estimator`'s params, +as follows: + +``` python +my_tpu_estimator = tf.contrib.tpu.TPUEstimator( + model_fn=my_model_fn, + config = my_tpu_run_config, + use_tpu=FLAGS.use_tpu, + params={'optimizer':optimizer}) +``` + +## Model Function + +This section details the changes you must make to the model function +(`model_fn()`) to make it `TPUEstimator` compatible. + +### Static shapes + +During regular usage TensorFlow attempts to determine the shapes of each +`tf.Tensor` during graph construction. During execution any unknown shape +dimensions are determined dynamically, +see @{$programmers_guide/tensors#shape$Tensor Shapes} for more details. + +To run on Cloud TPUs TensorFlow models are compiled using @{$xla$XLA}. +XLA uses a similar system for determining shapes at compile time. XLA requires +that all tensor dimensions be statically defined at compile time. All shapes +must evaluate to a constant, and not depend on external data, or stateful +operations like variables or a random number generator. + + +### Summaries + +Remove any use of `tf.summary` from your model. + +@{$summaries_and_tensorboard$TensorBoard summaries} are a great way see inside +your model. A minimal set of basic summaries are automatically recorded by the +`TPUEstimator`, to `event` files in the `model_dir`. Custom summaries, however, +are currently unsupported when training on a Cloud TPU. So while the +`TPUEstimator` will still run locally with summaries, it will fail if used on a +TPU. + +### Metrics + +Build your evaluation metrics dictionary in a stand-alone `metric_fn`. + + + +Evaluation metrics are an essential part of training a model. These are fully +supported on Cloud TPUs, but with a slightly different syntax. + +A standard @{tf.metrics} returns two tensors. The first returns the running +average of the metric value, while the second updates the running average and +returns the value for this batch: + +``` +running_average, current_batch = tf.metrics.accuracy(labels, predictions) +``` + +In a standard `Estimator` you create a dictionary of these pairs, and return it +as part of the `EstimatorSpec`. + +```python +my_metrics = {'accuracy': tf.metrics.accuracy(labels, predictions)} + +return tf.estimator.EstimatorSpec( + ... + eval_metric_ops=my_metrics +) +``` + +In a `TPUEstimator` you instead pass a function (which returns a metrics +dictionary) and a list of argument tensors, as shown below: + +```python +def my_metric_fn(labels, predictions): + return {'accuracy': tf.metrics.accuracy(labels, predictions)} + +return tf.contrib.tpu.TPUEstimatorSpec( + ... + eval_metrics=(my_metric_fn, [labels, predictions]) +) +``` + +### Use `TPUEstimatorSpec` + +`TPUEstimatorSpec` do not support hooks, and require function wrappers for +some fields. + +An `Estimator`'s `model_fn` must return an `EstimatorSpec`. 
An `EstimatorSpec` is a simple structure of named fields containing all the `tf.Tensors` of the model that the `Estimator` may need to interact with.

`TPUEstimators` use a @{tf.contrib.tpu.TPUEstimatorSpec}. There are a few differences between it and a standard @{tf.estimator.EstimatorSpec}:

* The `eval_metric_ops` must be wrapped into a `metric_fn`; this field is renamed `eval_metrics` ([see above](#metrics)).
* The @{tf.train.SessionRunHook$hooks} are unsupported, so these fields are omitted.
* The @{tf.train.Scaffold$`scaffold`}, if used, must also be wrapped in a function. This field is renamed to `scaffold_fn`.

`Scaffold` and `Hooks` are for advanced usage, and can typically be omitted.

## Input functions

Input functions work mostly unchanged, since they run on the host computer rather than on the Cloud TPU itself. This section explains the two necessary adjustments.

### Params argument

The `input_fn` for a standard `Estimator` _can_ include a `params` argument; the `input_fn` for a `TPUEstimator` *must* include a `params` argument. This is necessary to allow the estimator to set the batch size for each replica of the input stream. So the minimum signature for an `input_fn` for a `TPUEstimator` is:

```
def my_input_fn(params):
  pass
```

Where `params['batch_size']` will contain the batch size.

### Static shapes and batch size

The input pipeline generated by your `input_fn` is run on CPU, so it is mostly free of the strict static shape requirements imposed by the XLA/TPU environment. The one requirement is that the batches of data fed from your input pipeline to the TPU have a static shape, as determined by the standard TensorFlow shape inference algorithm. Intermediate tensors are free to have dynamic shapes. If shape inference has failed but the shape is known, it is possible to impose the correct shape using `tf.Tensor.set_shape()`.

In the example below the shape inference algorithm fails, but it is corrected using `set_shape`:

```
>>> x = tf.zeros(tf.constant([1,2,3])+1)
>>> x.shape

TensorShape([Dimension(None), Dimension(None), Dimension(None)])

>>> x.set_shape([2,3,4])
```

In many cases the batch size is the only unknown dimension.

A typical input pipeline, using `tf.data`, will usually produce batches of a fixed size. The last batch of a finite `Dataset`, however, is typically smaller, containing just the remaining elements. Since a `Dataset` does not know its own length or finiteness, the standard @{tf.data.Dataset.batch$`batch`} method cannot determine on its own whether every batch will have a fixed size:

```
>>> params = {'batch_size':32}
>>> ds = tf.data.Dataset.from_tensors([0, 1, 2])
>>> ds = ds.repeat().batch(params['batch_size'])
>>> ds

<BatchDataset shapes: (?, 3), types: tf.int32>
```

The most straightforward fix is to @{tf.data.Dataset.apply$apply} @{tf.contrib.data.batch_and_drop_remainder} as follows:

```
>>> params = {'batch_size':32}
>>> ds = tf.data.Dataset.from_tensors([0, 1, 2])
>>> ds = ds.repeat().apply(
...     tf.contrib.data.batch_and_drop_remainder(params['batch_size']))
>>> ds

<_RestructuredDataset shapes: (32, 3), types: tf.int32>
```

The one downside to this approach is that, as the name implies, this batching method throws out any fractional batch at the end of the dataset. This is fine for an infinitely repeating dataset being used for training, but could be a problem if you want to train for an exact number of epochs.
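Putting the two adjustments together, the following is a minimal sketch of what a complete `input_fn` for a `TPUEstimator` might look like. The synthetic feature and label tensors are placeholders, not taken from any published example.

```python
# A minimal input_fn sketch (synthetic data): it accepts `params` and produces
# batches with a fully static shape by dropping the final partial batch.
import tensorflow as tf

def my_input_fn(params):
  batch_size = params['batch_size']  # Filled in by TPUEstimator per replica.
  features = {'pixels': tf.zeros([1000, 64], dtype=tf.float32)}
  labels = tf.zeros([1000], dtype=tf.int32)
  dataset = tf.data.Dataset.from_tensor_slices((features, labels))
  dataset = dataset.shuffle(1000).repeat()
  # `batch()` alone would leave the batch dimension unknown; dropping the
  # remainder guarantees every batch has exactly `batch_size` elements.
  dataset = dataset.apply(
      tf.contrib.data.batch_and_drop_remainder(batch_size))
  return dataset
```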
+ +To do an exact 1-epoch of _evaluation_ you can work around this by manually +padding the length of the batches, and setting the padding entries to have zero +weight when creating your `tf.metrics`. + +## Datasets + +Efficient use of the `tf.data.Dataset` API is critical when using a Cloud +TPU, as it is impossible to use the Cloud TPU's unless you can feed it data +quickly enough. See @{$datasets_performance} for details on dataset performance. + +For all but the simplest experimentation (using +@{tf.data.Dataset.from_tensor_slices} or other in-graph data) you will need to +store all data files read by the `TPUEstimator`'s `Dataset` in Google Cloud +Storage Buckets. + + + +For most use-cases, we recommend converting your data into `TFRecord` +format and using a @{tf.data.TFRecordDataset} to read it. This, however, is not +a hard requirement and you can use other dataset readers +(`FixedLengthRecordDataset` or `TextLineDataset`) if you prefer. + +Small datasets can be loaded entirely into memory using +@{tf.data.Dataset.cache}. + +Regardless of the data format used, it is strongly recommended that you +@{$performance_guide#use_large_files$use large files}, on the order of +100MB. This is especially important in this networked setting as the overhead +of opening a file is significantly higher. + +It is also important, regardless of the type of reader used, to enable buffering +using the `buffer_size` argument to the constructor. This argument is specified +in bytes. A minimum of a few MB (`buffer_size=8*1024*1024`) is recommended so +that data is available when needed. + +The TPU-demos repo includes +[a script](https://github.com/tensorflow/tpu-demos/blob/master/cloud_tpu/datasets/imagenet_to_gcs.py) +for downloading the imagenet dataset and converting it to an appropriate format. +This together with the imagenet +[models](https://github.com/tensorflow/tpu-demos/tree/master/cloud_tpu/models) +included in the repo demonstrate all of these best-practices. + + +## What Next + +For details on how to actually set up and run a Cloud TPU see: + + * [Google Cloud TPU Documentation](https://cloud.google.com/tpu/docs/) + +This document is by no means exhaustive. The best source of more detail on how +to make a Cloud TPU compatible model are the example models published in: + + * The [TPU Demos Repository.](https://github.com/tensorflow/tpu-demos/) + +For more information about tuning TensorFlow code for performance see: + + * The @{$performance$Performance Section.} + diff --git a/tensorflow/docs_src/api_guides/python/regression_examples.md b/tensorflow/docs_src/api_guides/python/regression_examples.md index 45cb9d829cf..7de2be05521 100644 --- a/tensorflow/docs_src/api_guides/python/regression_examples.md +++ b/tensorflow/docs_src/api_guides/python/regression_examples.md @@ -38,7 +38,7 @@ The preceding examples rely on the following data set utility: Utility Description - imports85.py + imports85.py This program provides utility functions that load the imports85 data set into formats that other TensorFlow programs (for example, linear_regression.py and @@ -229,4 +229,4 @@ passed through to the `model_fn` when the `model_fn` is called. The `model_fn` returns an @{tf.estimator.EstimatorSpec$`EstimatorSpec`} which is a simple structure indicating to the `Estimator` which operations should be run to accomplish -varions tasks. +various tasks. 
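To make that last point concrete, here is a schematic `model_fn`, not taken from the examples above, that returns a different `EstimatorSpec` depending on the mode; the layer sizes and loss are placeholders.

```python
# A schematic model_fn (illustrative only) showing one EstimatorSpec returned
# per mode: predictions for PREDICT, a loss for EVAL, and a train_op for TRAIN.
import tensorflow as tf

def my_model_fn(features, labels, mode, params):
  net = tf.layers.dense(features['x'], units=1)
  predictions = {'predicted_value': tf.squeeze(net, axis=-1)}
  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(mode, predictions=predictions)
  loss = tf.losses.mean_squared_error(labels, predictions['predicted_value'])
  if mode == tf.estimator.ModeKeys.EVAL:
    return tf.estimator.EstimatorSpec(mode, loss=loss)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
  train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
  return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
```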
diff --git a/tensorflow/docs_src/community/welcome.md b/tensorflow/docs_src/community/welcome.md index a3abf255075..9f6fe91b149 100644 --- a/tensorflow/docs_src/community/welcome.md +++ b/tensorflow/docs_src/community/welcome.md @@ -12,7 +12,6 @@ The source code for TensorFlow is on Before contributing to TensorFlow source code, please review the [Contribution guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md). - ### Projects developed by the TensorFlow community The TensorFlow community has created many great projects around TensorFlow, including: @@ -65,5 +64,6 @@ please read the following list carefully: [TensorFlow issues tracker](https://github.com/tensorflow/tensorflow/issues) on GitHub. For example, use the issue tracker to request a new operation in TensorFlow. - + * To report vulnerabilities, please follow our + [vulnerability disclosure guidelines](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/SECURITY.md). diff --git a/tensorflow/docs_src/get_started/checkpoints.md b/tensorflow/docs_src/get_started/checkpoints.md index 680e1c0d3f5..dfa2110e691 100644 --- a/tensorflow/docs_src/get_started/checkpoints.md +++ b/tensorflow/docs_src/get_started/checkpoints.md @@ -16,7 +16,7 @@ This document focuses on checkpoints. For details on SavedModel, see the ## Sample code This document relies on the same -[https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py](Iris classification example) detailed in @{$premade_estimators$Getting Started with TensorFlow}. +[Iris classification example](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py) detailed in @{$premade_estimators$Getting Started with TensorFlow}. To download and access the example, invoke the following two commands: ```shell diff --git a/tensorflow/docs_src/get_started/custom_estimators.md b/tensorflow/docs_src/get_started/custom_estimators.md index 6343cc4ee45..42a246678a0 100644 --- a/tensorflow/docs_src/get_started/custom_estimators.md +++ b/tensorflow/docs_src/get_started/custom_estimators.md @@ -15,7 +15,7 @@ git clone https://github.com/tensorflow/models/ cd models/samples/core/get_started ``` -In this document we wil be looking at +In this document we will be looking at [`custom_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/custom_estimator.py). You can run it with the following command: @@ -161,7 +161,7 @@ classifier = tf.estimator.Estimator( To implement a typical model function, you must do the following: -* (Define the model)[#define_the_model]. +* [Define the model](#define_the_model). * Specify additional calculations for each of the [three different modes](#modes): * [Predict](#predict) diff --git a/tensorflow/docs_src/get_started/datasets_quickstart.md b/tensorflow/docs_src/get_started/datasets_quickstart.md index ecfbf160f0d..a8a2ab6e561 100644 --- a/tensorflow/docs_src/get_started/datasets_quickstart.md +++ b/tensorflow/docs_src/get_started/datasets_quickstart.md @@ -169,7 +169,7 @@ the number of examples in the `Dataset` ensures that the data is completely shuffled. The Iris data set only contains 150 examples. The @{tf.data.Dataset.repeat$`repeat`} method has the `Dataset` restart when -it reaches the end. To limit the number of epochss, set the `count` argument. +it reaches the end. To limit the number of epochs, set the `count` argument. 
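As a small illustration (not part of the sample code), ten complete passes over the 150-example Iris data could be requested like this:

```python
# Illustrative only: restart the dataset for exactly 10 epochs rather than
# repeating forever. The shuffle buffer and batch size are arbitrary choices.
import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices(tf.zeros([150, 4]))
dataset = dataset.shuffle(buffer_size=150).repeat(count=10).batch(32)
```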
The @{tf.data.Dataset.repeat$`batch`} method collects a number of examples and stacks them, to create batches. This adds a dimension to their shape. The new @@ -282,7 +282,7 @@ produce the necessary `(features, label)` pairs. We will start by building a function to parse a single line. -The following `iris_data.parse_line` function acomplishes this taks using the +The following `iris_data.parse_line` function accomplishes this task using the @{tf.decode_csv} function, and some simple python code: We must parse each of the lines in the dataset in order to generate the diff --git a/tensorflow/docs_src/get_started/feature_columns.md b/tensorflow/docs_src/get_started/feature_columns.md index e3308ed716d..ad3e1fe3e3a 100644 --- a/tensorflow/docs_src/get_started/feature_columns.md +++ b/tensorflow/docs_src/get_started/feature_columns.md @@ -461,8 +461,8 @@ permitting a richer palette of numbers for every cell, an embedding column contains far fewer cells than an indicator column. Let's look at an example comparing indicator and embedding columns. Suppose our -input examples consists of different words from a limited palette of only 81 -words. Further suppose that the data set provides provides the following input +input examples consist of different words from a limited palette of only 81 +words. Further suppose that the data set provides the following input words in 4 separate examples: * `"dog"` diff --git a/tensorflow/docs_src/get_started/get_started_for_beginners.md b/tensorflow/docs_src/get_started/get_started_for_beginners.md index ea1c2fb3f47..390cc81eef7 100644 --- a/tensorflow/docs_src/get_started/get_started_for_beginners.md +++ b/tensorflow/docs_src/get_started/get_started_for_beginners.md @@ -700,7 +700,7 @@ for pred_dict, expec in zip(predictions, expected): class_id = pred_dict['class_ids'][0] probability = pred_dict['probabilities'][class_id] - print(template.format(SPECIES[class_id], 100 * probability, expec)) + print(template.format(iris_data.SPECIES[class_id], 100 * probability, expec)) ``` Running the program yields the following output: diff --git a/tensorflow/docs_src/get_started/premade_estimators.md b/tensorflow/docs_src/get_started/premade_estimators.md index dbc35065abf..4f01f997c33 100644 --- a/tensorflow/docs_src/get_started/premade_estimators.md +++ b/tensorflow/docs_src/get_started/premade_estimators.md @@ -2,37 +2,39 @@ # Getting Started with TensorFlow This document introduces the TensorFlow programming environment and shows you -how to write the Iris classification problem in TensorFlow. +how to solve the Iris classification problem in TensorFlow. -Prior to reading this document, do the following: +## Prerequisites + +Prior to using the sample code in this document, you'll need to do the +following: * @{$install$Install TensorFlow}. * If you installed TensorFlow with virtualenv or Anaconda, activate your TensorFlow environment. -* To keep the data import simple, our Iris example uses Pandas. You can - install Pandas with: +* Install or upgrade pandas by issuing the following command: - `pip install pandas` + pip install pandas ## Getting the sample code -Take the following steps to get the sample code for this program: +Take the following steps to get the sample code we'll be going through: -1. Clone the TensorFlow Models repository from github by entering the following +1. Clone the TensorFlow Models repository from GitHub by entering the following command: - `git clone https://github.com/tensorflow/models` + git clone https://github.com/tensorflow/models 1. 
Change directory within that branch to the location containing the examples used in this document: - `cd models/samples/core/get_started/` + cd models/samples/core/get_started/ The program described in this document is [`premade_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py). This program uses [`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py) -To fetch its training data. +to fetch its training data. ### Running the program @@ -45,7 +47,7 @@ python premade_estimator.py The program should output training logs followed by some predictions against the test set. For example, the first line in the following output shows that the model thinks there is a 99.6% chance that the first example in the test -set is a Setosa. Since the test set `expected "Setosa"`, this appears to be +set is a Setosa. Since the test set expected Setosa, this appears to be a good prediction. ``` None @@ -61,9 +63,9 @@ If the program generates errors instead of answers, ask yourself the following questions: * Did you install TensorFlow properly? -* Are you using the correct version of tensorflow? +* Are you using the correct version of TensorFlow? * Did you activate the environment you installed TensorFlow in? (This is - only relevant in certain installation environments.) + only relevant in certain installation mechanisms.) ## The programming stack @@ -74,18 +76,15 @@ provides a programming stack consisting of multiple API layers:
-
-The TensorFlow Programming Environment -
We strongly recommend writing TensorFlow programs with the following APIs: -* @{tf.estimator$Estimators}, which represent a complete model. +* @{$programmers_guide/estimators$Estimators}, which represent a complete model. The Estimator API provides methods to train the model, to judge the model's accuracy, and to generate predictions. * @{$get_started/datasets_quickstart$Datasets}, which build a data input pipeline. The Dataset API has methods to load and manipulate data, and feed - it into your model. The Datasets API meshes well with the Estimators API. + it into your model. The Dataset API meshes well with the Estimators API. ## Classifying irises: an overview @@ -120,7 +119,7 @@ individual Iris flowers: * petal length * petal width -Our model will represent these features as float32 numerical data. +Our model will represent these features as `float32` numerical data. The label identifies the Iris species, which must be one of the following: @@ -154,9 +153,6 @@ The following figure illustrates the features, hidden layers, and predictions alt="A diagram of the network architecture: Inputs, 2 hidden layers, and outputs" src="../images/custom_estimators/full_network.png">
-
-The Model. -
### Inference @@ -174,12 +170,12 @@ example is an Iris Versicolor. ## Overview of programming with Estimators -An Estimator is TensorFlow's high level representation of a complete model. It +An Estimator is TensorFlow's high-level representation of a complete model. It handles the details of initialization, logging, saving and restoring, and many other features so you can concentrate on your model. For more details see @{$programmers_guide/estimators}. -An "Estimator" is any class derived from @{tf.estimator.Estimator}. TensorFlow +An Estimator is any class derived from @{tf.estimator.Estimator}. TensorFlow provides a collection of [pre-made Estimators](https://developers.google.com/machine-learning/glossary/#pre-made_Estimator) (for example, `LinearRegressor`) to implement common ML algorithms. Beyond @@ -199,7 +195,7 @@ following tasks: * Call one or more methods on the Estimator object, passing the appropriate input function as the source of the data. -Let's see how those tasks are implemented in Iris. +Let's see how those tasks are implemented for Iris classification. ## Create input functions @@ -209,17 +205,30 @@ evaluating, and prediction. An **input function** is a function that returns a @{tf.data.Dataset} object which outputs the following two-element tuple: -* "features" - A Python dictionary in which: +* [`features`](https://developers.google.com/machine-learning/glossary/#feature) - A Python dictionary in which: * Each key is the name of a feature. * Each value is an array containing all of that feature's values. -* "label" - An array containing the values of the +* `label` - An array containing the values of the [label](https://developers.google.com/machine-learning/glossary/#label) for every example. -Your input function may generate the "features" dictionary and "label" list any -way you like. However, we recommend using TensorFlow's @{tf.data.Dataset} API, -which can deftly parse all sorts of data. At a high-level, -the @{tf.data.Dataset} API consists of the following classes: +Just to demonstrate the format of the input function, here's a simple +implementation: + +```python +def input_evaluation_set(): + features = {'SepalLength': np.array([6.4, 5.0]), + 'SepalWidth': np.array([2.8, 2.3]), + 'PetalLength': np.array([5.6, 3.3]), + 'PetalWidth': np.array([2.2, 1.0])} + labels = np.array([2, 1]) + return features, labels +``` + +Your input function may generate the `features` dictionary and `label` list any +way you like. However, we recommend using TensorFlow's Dataset API, which can +parse all sorts of data. At a high level, the Dataset API consists of the +following classes:
+Where the individual members are: -Where: - -* Dataset: Base class containing methods to create and transform datasets. Also - allows you to initialize a dataset from data in memory, or from a Python - generator. -* TextLineDataset: Reads lines from text files. -* TFRecordDataset: Reads records from TFRecord files. -* FixedLengthRecordDataset: Reads fixed size records from binary files. -* Iterator: Provides a way to access one data set element at a time. +* `Dataset` - Base class containing methods to create and transform + datasets. Also allows you to initialize a dataset from data in memory, or from + a Python generator. +* `TextLineDataset` - Reads lines from text files. +* `TFRecordDataset` - Reads records from TFRecord files. +* `FixedLengthRecordDataset` - Reads fixed size records from binary files. +* `Iterator` - Provides a way to access one data set element at a time. The Dataset API can handle a lot of common cases for you. For example, using the Dataset API, you can easily read in records from a large collection of files in parallel and join them into a single stream. -To keep things simple in this example we are going to load the data with pandas, -and build our input pipeline from this in-memory data. +To keep things simple in this example we are going to load the data with +[pandas](https://pandas.pydata.org/), and build our input pipeline from this +in-memory data. Here is the input function used for training in this program, which is available in [`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py): @@ -258,9 +267,9 @@ def train_input_fn(features, labels, batch_size): return dataset.shuffle(1000).repeat().batch(batch_size) ``` -## Define the Feature Columns +## Define the feature columns -A [**Feature Column**](https://developers.google.com/machine-learning/glossary/#feature_columns) +A [**feature column**](https://developers.google.com/machine-learning/glossary/#feature_columns) is an object describing how the model should use raw input data from the features dictionary. When you build an Estimator model, you pass it a list of feature columns that describes each of the features you want the model to use. @@ -270,7 +279,7 @@ to the model. For Iris, the 4 raw features are numeric values, so we'll build a list of feature columns to tell the Estimator model to represent each of the four features as 32-bit floating-point values. Therefore, the code to create the -Feature Column is simply: +feature column is: ```python # Feature columns describe how to use the input. @@ -279,29 +288,29 @@ for key in train_x.keys(): my_feature_columns.append(tf.feature_column.numeric_column(key=key)) ``` -Feature Columns can be far more sophisticated than those we're showing here. -We detail feature columns @{$get_started/feature_columns$later on} in -getting started. +Feature columns can be far more sophisticated than those we're showing here. We +detail feature columns @{$get_started/feature_columns$later on} in our Getting +Started guide. Now that we have the description of how we want the model to represent the raw features, we can build the estimator. -## Instantiate an Estimator +## Instantiate an estimator The Iris problem is a classic classification problem. Fortunately, TensorFlow provides several pre-made classifier Estimators, including: -* @{tf.estimator.DNNClassifier}—for deep models that perform multi-class +* @{tf.estimator.DNNClassifier} for deep models that perform multi-class classification. 
-* @{tf.estimator.DNNLinearCombinedClassifier}—for wide-n-deep models. -* @{tf.estimator.LinearClassifier}— for classifiers based on linear models. +* @{tf.estimator.DNNLinearCombinedClassifier} for wide & deep models. +* @{tf.estimator.LinearClassifier} for classifiers based on linear models. For the Iris problem, `tf.estimator.DNNClassifier` seems like the best choice. Here's how we instantiated this Estimator: ```python -# Build 2 hidden layer DNN with 10, 10 units respectively. +# Build a DNN with 2 hidden layers and 10 nodes in each hidden layer. classifier = tf.estimator.DNNClassifier( feature_columns=my_feature_columns, # Two hidden layers of 10 nodes each. @@ -363,7 +372,7 @@ Test set accuracy: 0.967 We now have a trained model that produces good evaluation results. We can now use the trained model to predict the species of an Iris flower -based on some unlabeled measurments. As with training and evaluation, we make +based on some unlabeled measurements. As with training and evaluation, we make predictions using a single function call: ```python diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md index ba1a4118aec..14add7c77e6 100644 --- a/tensorflow/docs_src/install/install_c.md +++ b/tensorflow/docs_src/install/install_c.md @@ -38,7 +38,7 @@ enable TensorFlow for C: OS="linux" # Change to "darwin" for macOS TARGET_DIRECTORY="/usr/local" curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.5.0-rc1.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.5.0.tar.gz" | sudo tar -C $TARGET_DIRECTORY -xz The `tar` command extracts the TensorFlow C library into the `lib` diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md index 87cc647317a..d2af9d98434 100644 --- a/tensorflow/docs_src/install/install_go.md +++ b/tensorflow/docs_src/install/install_go.md @@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go: TF_TYPE="cpu" # Change to "gpu" for GPU support TARGET_DIRECTORY='/usr/local' curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.5.0-rc1.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.5.0.tar.gz" | sudo tar -C $TARGET_DIRECTORY -xz The `tar` command extracts the TensorFlow C library into the `lib` diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md index 37e109a6e4b..e5388c4b1e0 100644 --- a/tensorflow/docs_src/install/install_java.md +++ b/tensorflow/docs_src/install/install_java.md @@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs: org.tensorflow tensorflow - 1.5.0-rc1 + 1.5.0 ``` @@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow: org.tensorflow tensorflow - 1.5.0-rc1 + 1.5.0 @@ -123,12 +123,12 @@ instead: org.tensorflow libtensorflow - 1.5.0-rc1 + 1.5.0 org.tensorflow libtensorflow_jni_gpu - 1.5.0-rc1 + 1.5.0 ``` @@ -147,7 +147,7 @@ refer to the simpler instructions above instead. Take the following steps to install TensorFlow for Java on Linux or macOS: 1. 
Download - [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0-rc1.jar), + [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0.jar), which is the TensorFlow Java Archive (JAR). 2. Decide whether you will run TensorFlow for Java on CPU(s) only or with @@ -166,7 +166,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS: OS=$(uname -s | tr '[:upper:]' '[:lower:]') mkdir -p ./jni curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.5.0-rc1.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.5.0.tar.gz" | tar -xz -C ./jni ### Install on Windows @@ -174,10 +174,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS: Take the following steps to install TensorFlow for Java on Windows: 1. Download - [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0-rc1.jar), + [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0.jar), which is the TensorFlow Java Archive (JAR). 2. Download the following Java Native Interface (JNI) file appropriate for - [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.5.0-rc1.zip). + [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.5.0.zip). 3. Extract this .zip file. @@ -225,7 +225,7 @@ must be part of your `classpath`. For example, you can include the downloaded `.jar` in your `classpath` by using the `-cp` compilation flag as follows: -
javac -cp libtensorflow-1.5.0-rc1.jar HelloTF.java
+
javac -cp libtensorflow-1.5.0.jar HelloTF.java
### Running @@ -239,11 +239,11 @@ two files are available to the JVM: For example, the following command line executes the `HelloTF` program on Linux and macOS X: -
java -cp libtensorflow-1.5.0-rc1.jar:. -Djava.library.path=./jni HelloTF
+
java -cp libtensorflow-1.5.0.jar:. -Djava.library.path=./jni HelloTF
And the following command line executes the `HelloTF` program on Windows: -
java -cp libtensorflow-1.5.0-rc1.jar;. -Djava.library.path=jni HelloTF
+
java -cp libtensorflow-1.5.0.jar;. -Djava.library.path=jni HelloTF
If the program prints Hello from version, you've successfully installed TensorFlow for Java and are ready to use the API. If the program diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md index 03f12dff08c..cd8c14599fb 100644 --- a/tensorflow/docs_src/install/install_linux.md +++ b/tensorflow/docs_src/install/install_linux.md @@ -31,13 +31,13 @@ If you are installing TensorFlow with GPU support using one of the mechanisms described in this guide, then the following NVIDIA software must be installed on your system: - * CUDA® Toolkit 8.0. For details, see + * CUDA® Toolkit 9.0. For details, see [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/#axzz4VZnqTJ2A). Ensure that you append the relevant Cuda pathnames to the `LD_LIBRARY_PATH` environment variable as described in the NVIDIA documentation. - * The NVIDIA drivers associated with CUDA Toolkit 8.0. - * cuDNN v6.0. For details, see + * The NVIDIA drivers associated with CUDA Toolkit 9.0. + * cuDNN v7.0. For details, see [NVIDIA's documentation](https://developer.nvidia.com/cudnn). Ensure that you create the `CUDA_HOME` environment variable as described in the NVIDIA documentation. @@ -188,7 +188,7 @@ Take the following steps to install TensorFlow with Virtualenv: Virtualenv environment:
(tensorflow)$ pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl
+ https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl If you encounter installation problems, see [Common Installation Problems](#common_installation_problems). @@ -293,7 +293,7 @@ take the following steps:
      $ sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl
      
If this step fails, see @@ -480,7 +480,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      (tensorflow)$ pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl
+ https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl @@ -648,14 +648,14 @@ This section documents the relevant values for Linux installations. CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp27-none-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp27-none-linux_x86_64.whl
 
Note that GPU support requires the NVIDIA hardware and software described in @@ -667,14 +667,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp34-cp34m-linux_x86_64.whl
 
Note that GPU support requires the NVIDIA hardware and software described in @@ -686,14 +686,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp35-cp35m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp35-cp35m-linux_x86_64.whl
 
@@ -705,14 +705,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp36-cp36m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp36-cp36m-linux_x86_64.whl
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md index 555a6837d8b..f49d3a2f085 100644 --- a/tensorflow/docs_src/install/install_mac.md +++ b/tensorflow/docs_src/install/install_mac.md @@ -115,7 +115,7 @@ Take the following steps to install TensorFlow with Virtualenv: TensorFlow in the active Virtualenv is as follows:
 $ pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py3-none-any.whl
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py3-none-any.whl If you encounter installation problems, see [Common Installation Problems](#common-installation-problems). @@ -238,7 +238,7 @@ take the following steps: issue the following command:
 $ sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py3-none-any.whl 
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py3-none-any.whl If the preceding command fails, see [installation problems](#common-installation-problems). @@ -347,7 +347,7 @@ Take the following steps to install TensorFlow in an Anaconda environment: TensorFlow for Python 2.7:
 (targetDirectory)$ pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-any.whl
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py2-none-any.whl @@ -520,7 +520,7 @@ This section documents the relevant values for Mac OS installations.
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py2-none-any.whl
 
@@ -528,5 +528,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-a
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py3-none-any.whl
 
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md index f494cc7a7c0..ccf62a169f3 100644 --- a/tensorflow/docs_src/install/install_sources.md +++ b/tensorflow/docs_src/install/install_sources.md @@ -133,7 +133,7 @@ The following NVIDIA hardware must be installed on your system: The following NVIDIA software must be installed on your system: - * NVIDIA's Cuda Toolkit (>= 7.0). We recommend version 8.0. + * NVIDIA's Cuda Toolkit (>= 7.0). We recommend version 9.0. For details, see [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/#axzz4VZnqTJ2A). Ensure that you append the relevant Cuda pathnames to the @@ -221,7 +221,7 @@ problem, do either of the following: * Download Xcode 7.2 and select it as your default by issuing the following command: -
 $ sudo xcode-select -s /Application/Xcode-7.2/Xcode.app
+
 $ sudo xcode-select -s /Applications/Xcode-7.2/Xcode.app
**NOTE:** Your system must fulfill the NVIDIA software requirements described in one of the following documents: @@ -272,8 +272,6 @@ Found possible Python library paths: Please input the desired Python library path to use. Default is [/usr/lib/python2.7/dist-packages] Using python library path: /usr/local/lib/python2.7/dist-packages -Do you wish to build TensorFlow with MKL support? [y/N] -No MKL support will be enabled for TensorFlow Please specify optimization flags to use during compilation when bazel option "--config=opt" is specified [Default is -march=native]: Do you wish to use jemalloc as the malloc implementation? [Y/n] jemalloc enabled @@ -291,11 +289,11 @@ Do you wish to build TensorFlow with CUDA support? [y/N] Y CUDA support will be enabled for TensorFlow Do you want to use clang as CUDA compiler? [y/N] nvcc will be used as CUDA compiler -Please specify the Cuda SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 8.0]: 8.0 -Please specify the location where CUDA 8.0 toolkit is installed. Refer to README.md for more details. [Default is /usr/local/cuda]: +Please specify the Cuda SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 9.0]: 9.0 +Please specify the location where CUDA 9.0 toolkit is installed. Refer to README.md for more details. [Default is /usr/local/cuda]: Please specify which gcc should be used by nvcc as the host compiler. [Default is /usr/bin/gcc]: -Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 6.0]: 6 -Please specify the location where cuDNN 6 library is installed. Refer to README.md for more details. [Default is /usr/local/cuda]: +Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 7.0]: 7 +Please specify the location where cuDNN 7 library is installed. Refer to README.md for more details. [Default is /usr/local/cuda]: Please specify a list of comma-separated Cuda compute capabilities you want to build with. You can find the compute capability of your device at: https://developer.nvidia.com/cuda-gpus. Please note that each additional compute capability significantly increases your build time and binary size. @@ -361,10 +359,10 @@ Invoke `pip install` to install that pip package. The filename of the `.whl` file depends on your platform. For example, the following command will install the pip package -for TensorFlow 1.5.0rc1 on Linux: +for TensorFlow 1.5.0 on Linux:
-$ sudo pip install /tmp/tensorflow_pkg/tensorflow-1.5.0rc1-py2-none-any.whl
+$ sudo pip install /tmp/tensorflow_pkg/tensorflow-1.5.0-py2-none-any.whl
 
## Validate your installation @@ -463,8 +461,8 @@ Stack Overflow and specify the `tensorflow` tag. - - + + @@ -480,7 +478,7 @@ Stack Overflow and specify the `tensorflow` tag. **Mac**
Version:CPU/GPU:Python Version:Compiler:Build Tools:cuDNN:CUDA:
tensorflow-1.5.0-rc1CPU2.7, 3.3-3.6GCC 4.8Bazel 0.8.0N/AN/A
tensorflow_gpu-1.5.0-rc1GPU2.7, 3.3-3.6GCC 4.8Bazel 0.8.079
tensorflow-1.5.0CPU2.7, 3.3-3.6GCC 4.8Bazel 0.8.0N/AN/A
tensorflow_gpu-1.5.0GPU2.7, 3.3-3.6GCC 4.8Bazel 0.8.079
tensorflow-1.4.0CPU2.7, 3.3-3.6GCC 4.8Bazel 0.5.4N/AN/A
tensorflow_gpu-1.4.0GPU2.7, 3.3-3.6GCC 4.8Bazel 0.5.468
tensorflow-1.3.0CPU2.7, 3.3-3.6GCC 4.8Bazel 0.4.5N/AN/A
- + @@ -493,8 +491,8 @@ Stack Overflow and specify the `tensorflow` tag. **Windows**
Version:CPU/GPU:Python Version:Compiler:Build Tools:cuDNN:CUDA:
tensorflow-1.5.0-rc1CPU2.7, 3.3-3.6Clang from xcodeBazel 0.8.1N/AN/A
tensorflow-1.5.0CPU2.7, 3.3-3.6Clang from xcodeBazel 0.8.1N/AN/A
tensorflow-1.4.0CPU2.7, 3.3-3.6Clang from xcodeBazel 0.5.4N/AN/A
tensorflow-1.3.0CPU2.7, 3.3-3.6Clang from xcodeBazel 0.4.5N/AN/A
tensorflow-1.2.0CPU2.7, 3.3-3.6Clang from xcodeBazel 0.4.5N/AN/A
- - + + diff --git a/tensorflow/docs_src/install/install_windows.md b/tensorflow/docs_src/install/install_windows.md index 8d0eb7966fd..86a111c2ec1 100644 --- a/tensorflow/docs_src/install/install_windows.md +++ b/tensorflow/docs_src/install/install_windows.md @@ -30,13 +30,13 @@ If you are installing TensorFlow with GPU support using one of the mechanisms described in this guide, then the following NVIDIA software must be installed on your system: - * CUDA® Toolkit 8.0. For details, see + * CUDA® Toolkit 9.0. For details, see [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/) Ensure that you append the relevant Cuda pathnames to the `%PATH%` environment variable as described in the NVIDIA documentation. - * The NVIDIA drivers associated with CUDA Toolkit 8.0. - * cuDNN v6.0. For details, see + * The NVIDIA drivers associated with CUDA Toolkit 9.0. + * cuDNN v7.0. For details, see [NVIDIA's documentation](https://developer.nvidia.com/cudnn). Note that cuDNN is typically installed in a different location from the other CUDA DLLs. Ensure that you add the directory where you installed diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md index 10e7ad7ada5..cd47fc2803b 100644 --- a/tensorflow/docs_src/performance/performance_guide.md +++ b/tensorflow/docs_src/performance/performance_guide.md @@ -498,7 +498,7 @@ For TensorFlow source versions after 1.3.0: ```bash ./configure # Pick the desired options -bazel build --config=mkl -c opt //tensorflow/tools/pip_package:build_pip_package +bazel build --config=mkl --config=opt //tensorflow/tools/pip_package:build_pip_package ``` diff --git a/tensorflow/docs_src/programmers_guide/debugger.md b/tensorflow/docs_src/programmers_guide/debugger.md index 9eaee270282..c1a90dee0a6 100644 --- a/tensorflow/docs_src/programmers_guide/debugger.md +++ b/tensorflow/docs_src/programmers_guide/debugger.md @@ -214,7 +214,7 @@ navigate between these screens by clicking the `<--` and ### Other Features of the tfdbg CLI In addition to the commands listed above, the tfdbg CLI provides the following -addditional features: +additional features: * To navigate through previous tfdbg commands, type in a few characters followed by the Up or Down arrow keys. tfdbg will show you the history of diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md index 2b4896c3810..9049a5a9f3d 100644 --- a/tensorflow/docs_src/programmers_guide/graphs.md +++ b/tensorflow/docs_src/programmers_guide/graphs.md @@ -125,14 +125,14 @@ an operation: @{tf.Tensor} accepts an optional `name` argument. For example, `tf.constant(42.0, name="answer")` creates a new @{tf.Operation} named `"answer"` and returns a @{tf.Tensor} named `"answer:0"`. If the default graph - already contained an operation named `"answer"`, the TensorFlow would append + already contains an operation named `"answer"`, then TensorFlow would append `"_1"`, `"_2"`, and so on to the name, in order to make it unique. * The @{tf.name_scope} function makes it possible to add a **name scope** prefix to all operations created in a particular context. The current name scope prefix is a `"/"`-delimited list of the names of all active @{tf.name_scope} context managers. If a name scope has already been used in the current - context, TensorFlow appens `"_1"`, `"_2"`, and so on. For example: + context, TensorFlow appends `"_1"`, `"_2"`, and so on. 
For example: ```python c_0 = tf.constant(0, name="c") # => operation named "c" diff --git a/tensorflow/docs_src/programmers_guide/index.md b/tensorflow/docs_src/programmers_guide/index.md index d45e666ce7b..7a5e90081d9 100644 --- a/tensorflow/docs_src/programmers_guide/index.md +++ b/tensorflow/docs_src/programmers_guide/index.md @@ -13,7 +13,7 @@ works. The units are as follows: ## Low Level APIs * @{$programmers_guide/low_level_intro}, which introduces the - basics of how you can to use TensorFlow outside of the high Level APIs. + basics of how you can use TensorFlow outside of the high Level APIs. * @{$programmers_guide/tensors}, which explains how to create, manipulate, and access Tensors--the fundamental object in TensorFlow. * @{$programmers_guide/variables}, which details how diff --git a/tensorflow/docs_src/programmers_guide/saved_model.md b/tensorflow/docs_src/programmers_guide/saved_model.md index 9f50be5b31c..f27a658342b 100644 --- a/tensorflow/docs_src/programmers_guide/saved_model.md +++ b/tensorflow/docs_src/programmers_guide/saved_model.md @@ -285,7 +285,7 @@ with tf.Session(graph=tf.Graph()) as sess: ``` -### Loading a Savedmodel in C++ +### Loading a SavedModel in C++ The C++ version of the SavedModel [loader](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/cc/saved_model/loader.h) @@ -303,6 +303,30 @@ LoadSavedModel(session_options, run_options, export_dir, {kSavedModelTagTrain}, &bundle); ``` +### Loading and Serving a SavedModel in TensorFlow Serving + +You can easily load and serve a SavedModel with the TensorFlow Serving Model +Server binary. See [instructions](https://www.tensorflow.org/serving/setup#installing_using_apt-get) +on how to install the server, or build it if you wish. + +Once you have the Model Server, run it with: +``` +tensorflow_model_server --port=port-numbers --model_name=your-model-name --model_base_path=your_model_base_path +``` +Set the port and model_name flags to values of your choosing. The +model_base_path flag expects to be to a base directory, with each version of +your model residing in a numerically named subdirectory. If you only have a +single version of your model, simply place it in a subdirectory like so: +* Place the model in /tmp/model/0001 +* Set model_base_path to /tmp/model + +Store different versions of your model in numerically named subdirectories of a +common base directory. For example, suppose the base directory is `/tmp/model`. +If you have only one version of your model, store it in `/tmp/model/0001`. If +you have two versions of your model, store the second version in +`/tmp/model/0002`, and so on. Set the `--model-base_path` flag to the base +directory (`/tmp/model`, in this example). TensorFlow Model Server will serve +the model in the highest numbered subdirectory of that base directory. ### Standard constants diff --git a/tensorflow/examples/android/build.gradle b/tensorflow/examples/android/build.gradle index f7bdf8b816a..0767726aa9a 100644 --- a/tensorflow/examples/android/build.gradle +++ b/tensorflow/examples/android/build.gradle @@ -56,10 +56,12 @@ def nativeOutDir = 'libs/' + cpuType def nativeBuildRule = 'buildNativeBazel' def demoLibPath = '../../../bazel-bin/tensorflow/examples/android/libtensorflow_demo.so' def inferenceLibPath = '../../../bazel-bin/tensorflow/contrib/android/libtensorflow_inference.so' + +// Override for Makefile builds. 
if (nativeBuildSystem == 'makefile') { nativeBuildRule = 'buildNativeMake' - demoLibPath = '../../../tensorflow/contrib/makefile/gen/lib/libtensorflow_demo.so' - inferenceLibPath = '../../../tensorflow/contrib/makefile/gen/lib/libtensorflow_inference.so' + demoLibPath = '../../../tensorflow/contrib/makefile/gen/lib/android_' + cpuType + '/libtensorflow_demo.so' + inferenceLibPath = '../../../tensorflow/contrib/makefile/gen/lib/android_' + cpuType + '/libtensorflow_inference.so' } // If building with Bazel, this is the location of the bazel binary. @@ -154,7 +156,8 @@ task buildNativeMake(type: Exec) { '-s', \ 'tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in', \ '-t', \ - 'libtensorflow_inference.so libtensorflow_demo.so' \ + 'libtensorflow_inference.so libtensorflow_demo.so all' \ + , '-a', cpuType \ //, '-T' // Uncomment to skip protobuf and speed up subsequent builds. } diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java index 2fe2ba539ed..af6af2bc8f5 100644 --- a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java +++ b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java @@ -199,7 +199,7 @@ public class MultiBoxTracker { final int w, final int h, final int rowStride, - final int sensorOrienation, + final int sensorOrientation, final byte[] frame, final long timestamp) { if (objectTracker == null && !initialized) { @@ -209,7 +209,7 @@ public class MultiBoxTracker { objectTracker = ObjectTracker.getInstance(w, h, rowStride, true); frameWidth = w; frameHeight = h; - this.sensorOrientation = sensorOrienation; + this.sensorOrientation = sensorOrientation; initialized = true; if (objectTracker == null) { diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py index ec22684eaf6..58c5f87884e 100644 --- a/tensorflow/examples/image_retraining/retrain.py +++ b/tensorflow/examples/image_retraining/retrain.py @@ -344,8 +344,8 @@ def maybe_download_and_extract(data_url): filepath, _ = urllib.request.urlretrieve(data_url, filepath, _progress) print() statinfo = os.stat(filepath) - tf.logging.info('Successfully downloaded', filename, statinfo.st_size, - 'bytes.') + tf.logging.info('Successfully downloaded %s %d bytes.', + filename, statinfo.st_size) print('Extracting file from ', filepath) tarfile.open(filepath, 'r:gz').extractall(dest_directory) else: diff --git a/tensorflow/examples/learn/text_classification.py b/tensorflow/examples/learn/text_classification.py index eb117c39a12..e4e61862b02 100644 --- a/tensorflow/examples/learn/text_classification.py +++ b/tensorflow/examples/learn/text_classification.py @@ -34,8 +34,7 @@ MAX_LABEL = 15 WORDS_FEATURE = 'words' # Name of the input words feature. 
-def estimator_spec_for_softmax_classification( - logits, labels, mode): +def estimator_spec_for_softmax_classification(logits, labels, mode): """Returns EstimatorSpec instance for softmax classification.""" predicted_classes = tf.argmax(logits, 1) if mode == tf.estimator.ModeKeys.PREDICT: @@ -53,8 +52,8 @@ def estimator_spec_for_softmax_classification( return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op) eval_metric_ops = { - 'accuracy': tf.metrics.accuracy( - labels=labels, predictions=predicted_classes) + 'accuracy': + tf.metrics.accuracy(labels=labels, predictions=predicted_classes) } return tf.estimator.EstimatorSpec( mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) @@ -67,8 +66,7 @@ def bag_of_words_model(features, labels, mode): bow_embedding_column = tf.feature_column.embedding_column( bow_column, dimension=EMBEDDING_SIZE) bow = tf.feature_column.input_layer( - features, - feature_columns=[bow_embedding_column]) + features, feature_columns=[bow_embedding_column]) logits = tf.layers.dense(bow, MAX_LABEL, activation=None) return estimator_spec_for_softmax_classification( @@ -110,9 +108,9 @@ def main(unused_argv): # Prepare training and testing data dbpedia = tf.contrib.learn.datasets.load_dataset( 'dbpedia', test_with_fake_data=FLAGS.test_with_fake_data) - x_train = pandas.Series(dbpedia.train.data[:,1]) + x_train = pandas.Series(dbpedia.train.data[:, 1]) y_train = pandas.Series(dbpedia.train.target) - x_test = pandas.Series(dbpedia.test.data[:,1]) + x_test = pandas.Series(dbpedia.test.data[:, 1]) y_test = pandas.Series(dbpedia.test.target) # Process vocabulary @@ -152,10 +150,7 @@ def main(unused_argv): # Predict. test_input_fn = tf.estimator.inputs.numpy_input_fn( - x={WORDS_FEATURE: x_test}, - y=y_test, - num_epochs=1, - shuffle=False) + x={WORDS_FEATURE: x_test}, y=y_test, num_epochs=1, shuffle=False) predictions = classifier.predict(input_fn=test_input_fn) y_predicted = np.array(list(p['class'] for p in predictions)) y_predicted = y_predicted.reshape(np.array(y_test).shape) diff --git a/tensorflow/examples/tutorials/mnist/mnist_softmax.py b/tensorflow/examples/tutorials/mnist/mnist_softmax.py index fb3ac942039..47dd6a19478 100644 --- a/tensorflow/examples/tutorials/mnist/mnist_softmax.py +++ b/tensorflow/examples/tutorials/mnist/mnist_softmax.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """A very simple MNIST classifier. 
See extensive documentation at @@ -67,12 +66,19 @@ def main(_): # Test trained model correct_prediction = tf.equal(tf.argmax(y, 1), y_) accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) - print(sess.run(accuracy, feed_dict={x: mnist.test.images, - y_: mnist.test.labels})) + print(sess.run( + accuracy, feed_dict={ + x: mnist.test.images, + y_: mnist.test.labels + })) + if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--data_dir', type=str, default='/tmp/tensorflow/mnist/input_data', - help='Directory for storing input data') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/tensorflow/mnist/input_data', + help='Directory for storing input data') FLAGS, unparsed = parser.parse_known_args() tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py index d055d157454..f6906b0f79b 100644 --- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py +++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py @@ -270,12 +270,6 @@ with tf.Session(graph=graph) as session: run_metadata=run_metadata) average_loss += loss_val - # Add returned summaries to writer in each step. - writer.add_summary(summary, step) - # Add metadata to visualize the graph for the last run. - if step == (num_steps - 1): - writer.add_run_metadata(run_metadata, 'step%d' % step) - # Add returned summaries to writer in each step. writer.add_summary(summary, step) # Add metadata to visualize the graph for the last run. diff --git a/tensorflow/go/graph.go b/tensorflow/go/graph.go index fc087d9d995..08943a527cb 100644 --- a/tensorflow/go/graph.go +++ b/tensorflow/go/graph.go @@ -173,7 +173,11 @@ type OpSpec struct { // operation. Attrs map[string]interface{} - // Other possible fields: Device, ColocateWith, ControlInputs. + // Operations that must be executed before executing the operation + // being added. + ControlDependencies []*Operation + + // Other possible fields: Device, ColocateWith. } // AddOperation adds an operation to g. @@ -204,6 +208,9 @@ func (g *Graph) AddOperation(args OpSpec) (*Operation, error) { } } } + for _, in := range args.ControlDependencies { + C.TF_AddControlInput(cdesc, in.c) + } status := newStatus() for name, value := range args.Attrs { if err := setAttr(cdesc, status, name, value); err != nil { diff --git a/tensorflow/go/op/scope.go b/tensorflow/go/op/scope.go index a9ec79463a0..13de4294dc2 100644 --- a/tensorflow/go/op/scope.go +++ b/tensorflow/go/op/scope.go @@ -33,10 +33,11 @@ import ( // A Scope object and all its derivates (e.g., obtained from Scope.SubScope) // are not safe for concurrent use by multiple goroutines. type Scope struct { - graph *tf.Graph - namemap map[string]int - namespace string - err *scopeErr + graph *tf.Graph + namemap map[string]int + namespace string + controlDependencies []*tf.Operation + err *scopeErr } // scopeErr is used to share errors between all derivatives of a root scope. @@ -80,6 +81,7 @@ func (s *Scope) AddOperation(args tf.OpSpec) *tf.Operation { if s.namespace != "" { args.Name = s.namespace + "/" + args.Name } + args.ControlDependencies = append(args.ControlDependencies, s.controlDependencies...) 
op, err := s.graph.AddOperation(args) if err != nil { s.UpdateErr(args.Type, err) @@ -103,6 +105,28 @@ func (s *Scope) SubScope(namespace string) *Scope { } } +// WithControlDependencies returns a new Scope which will cause all operations +// added to the graph to execute only after all the provided operations have +// executed first (in addition to any other control dependencies in s). +func (s *Scope) WithControlDependencies(ops ...*tf.Operation) *Scope { + // Force a copy of the control dependencies into a new underlying array on + // every call. We cannot alias the same underlying array as `ops`, otherwise + // the user could modify that array after calling s.WithControlDependencies, + // which would be confusing. We cannot alias the same underlying array as the + // original `s.controlDependencies`, since Scopes form a logical tree, and + // other calls to s.WithControlDependencies could stomp on each other. + deps := make([]*tf.Operation, 0, len(s.controlDependencies)+len(ops)) + deps = append(deps, s.controlDependencies...) + deps = append(deps, ops...) + return &Scope{ + graph: s.graph, + namemap: s.namemap, + namespace: s.namespace, + controlDependencies: deps, + err: s.err, + } +} + // Err returns the error, if any, encountered during the construction // of the Graph managed by s. // diff --git a/tensorflow/go/op/scope_test.go b/tensorflow/go/op/scope_test.go index 6fb5d32e503..b58a61de98b 100644 --- a/tensorflow/go/op/scope_test.go +++ b/tensorflow/go/op/scope_test.go @@ -69,6 +69,49 @@ func TestScopeSubScopeErrors(t *testing.T) { } } +func TestControlDependencies(t *testing.T) { + var ( + s = NewScope() + zero = Const(s.SubScope("zero"), int32(0)) + one = Const(s.SubScope("one"), int32(1)) + variable = VarHandleOp(s, tf.Int32, tf.ScalarShape()) + init = AssignVariableOp(s, variable, zero) + update = AssignAddVariableOp(s, variable, one) + readDeps = []*tf.Operation{update} + ) + // We intend for `read` to have a control dependency on `update`. + s = s.WithControlDependencies(readDeps...) + // Ensure that Scope.WithControlDependencies makes a copy of the underlying + // array, rather than just holding a slice reference to the same user-supplied + // underlying array. If the copy is correctly performed, overwriting + // readDeps[0] should have no effect on control dependencies for `read`. + readDeps[0] = init + read := ReadVariableOp(s, variable, tf.Int32) + + graph, err := s.Finalize() + if err != nil { + t.Fatal(err) + } + sess, err := tf.NewSession(graph, nil) + if err != nil { + t.Fatal(err) + } + if _, err = sess.Run(nil, nil, []*tf.Operation{init}); err != nil { + t.Fatal(err) + } + // Without the control dependency, the read operation may not see the + // update. + for i := int32(0); i < 10; i++ { + out, err := sess.Run(nil, []tf.Output{read}, nil) + if err != nil { + t.Fatal(err) + } + if got, want := out[0].Value().(int32), i+1; got != want { + t.Errorf("Got %d, want %d", got, want) + } + } +} + func TestScopeFinalize(t *testing.T) { var ( root = NewScope() diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 5b19c90238e..cb47651d7b3 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -8729,31 +8729,6 @@ func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Out return op.Output(0) } -// Compute the pairwise cross product. -// -// `a` and `b` must be the same shape; they can either be simple 3-element vectors, -// or any shape where the innermost dimension is 3. 
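The Go `Scope.WithControlDependencies` added above mirrors the ordering guarantee that graph-mode Python already exposes through `tf.control_dependencies`. A rough Python sketch of the same read-after-update pattern the Go test checks (the variable and loop bounds are illustrative only):

```python
import tensorflow as tf

v = tf.Variable(0, dtype=tf.int32)
update = tf.assign_add(v, 1)

with tf.control_dependencies([update]):
  # Created inside the block, so it runs only after `update` has executed.
  read = v.read_value()

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for expected in range(1, 4):
    assert sess.run(read) == expected  # observes 1, 2, 3 as in the Go test
```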
In the latter case, each pair -// of corresponding 3-element vectors is cross-multiplied independently. -// -// Arguments: -// a: A tensor containing 3-element vectors. -// b: Another tensor, of same type and shape as `a`. -// -// Returns Pairwise cross product of the vectors in `a` and `b`. -func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Cross", - Input: []tf.Input{ - a, b, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // Transforms a vector of brain.Example protos (as strings) into typed tensors. // // Arguments: @@ -21290,6 +21265,31 @@ func StatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output return op.Output(0) } +// Compute the pairwise cross product. +// +// `a` and `b` must be the same shape; they can either be simple 3-element vectors, +// or any shape where the innermost dimension is 3. In the latter case, each pair +// of corresponding 3-element vectors is cross-multiplied independently. +// +// Arguments: +// a: A tensor containing 3-element vectors. +// b: Another tensor, of same type and shape as `a`. +// +// Returns Pairwise cross product of the vectors in `a` and `b`. +func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Cross", + Input: []tf.Input{ + a, b, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // Performs a padding as a preprocess during a convolution. // // Similar to FusedResizeAndPadConv2d, this op allows for an optimized diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml index 6285ee0483d..a9ce5372aeb 100644 --- a/tensorflow/java/maven/libtensorflow/pom.xml +++ b/tensorflow/java/maven/libtensorflow/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.5.0-rc1 + 1.5.0 ../ libtensorflow diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml index b0e5c44fecc..fe34ca83ff3 100644 --- a/tensorflow/java/maven/libtensorflow_jni/pom.xml +++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.5.0-rc1 + 1.5.0 ../ libtensorflow_jni diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml index 02c5dca13f4..390152808eb 100644 --- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml +++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.5.0-rc1 + 1.5.0 ../ libtensorflow_jni_gpu diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml index 949597ca7f1..524ec45f48b 100644 --- a/tensorflow/java/maven/pom.xml +++ b/tensorflow/java/maven/pom.xml @@ -6,7 +6,7 @@ 4.0.0org.tensorflowparentpom - 1.5.0-rc1 + 1.5.0pomhttps://www.tensorflow.org diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml index 9f0ebcf84c9..9cf3217f51f 100644 --- a/tensorflow/java/maven/proto/pom.xml +++ b/tensorflow/java/maven/proto/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.5.0-rc1 + 1.5.0 ../ proto diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml index 88d897362ad..d619f986a9a 100644 --- a/tensorflow/java/maven/tensorflow/pom.xml +++ b/tensorflow/java/maven/tensorflow/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.5.0-rc1 + 1.5.0 ../ tensorflow diff --git 
a/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java b/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java index 499757e8cf4..cf773e1686d 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java +++ b/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java @@ -88,7 +88,7 @@ final class NativeLibrary { // Deletions are in the reverse order of requests, so we need to request that the directory be // deleted first, so that it is empty when the request is fulfilled. tempPath.deleteOnExit(); - final String tempDirectory = tempPath.toString(); + final String tempDirectory = tempPath.getCanonicalPath(); if (frameworkResource != null) { extractResource(frameworkResource, frameworkLibName, tempDirectory); } else { diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 01b3e92d2d9..059bf729b07 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -76,6 +76,7 @@ py_library( ":layers", ":lib", ":list_ops", + ":manip_ops", ":math_ops", ":metrics", ":nn", @@ -298,6 +299,7 @@ cc_library( ":safe_ptr", "//tensorflow/c:tf_status_helper", "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:c_api_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", @@ -1394,6 +1396,14 @@ tf_gen_op_wrapper_private_py( ], ) +tf_gen_op_wrapper_private_py( + name = "manip_ops_gen", + visibility = [ + "//learning/brain/python/ops:__pkg__", + "//tensorflow/python/kernel_tests:__pkg__", + ], +) + tf_gen_op_wrapper_private_py( name = "math_ops_gen", visibility = [ @@ -1726,6 +1736,8 @@ py_library( ":linalg_grad", ":linalg_ops", ":logging_ops", + ":manip_grad", + ":manip_ops", ":math_grad", ":math_ops", ":platform", @@ -1848,6 +1860,29 @@ py_library( ], ) +py_library( + name = "manip_grad", + srcs = ["ops/manip_grad.py"], + srcs_version = "PY2AND3", + deps = [ + ":control_flow_ops", + ":framework_for_generated_wrappers", + ":manip_ops", + ], +) + +py_library( + name = "manip_ops", + srcs = ["ops/manip_ops.py"], + srcs_version = "PY2AND3", + deps = [ + ":dtypes", + ":framework_ops", + ":manip_ops_gen", + "//third_party/py/numpy", + ], +) + py_library( name = "logging_ops", srcs = ["ops/logging_ops.py"], @@ -2310,6 +2345,8 @@ py_library( ":linalg_ops", ":logging_ops", ":lookup_ops", + ":manip_grad", + ":manip_ops", ":math_grad", ":math_ops", ":numerics", @@ -2668,6 +2705,7 @@ cuda_py_test( ":nn_ops_gen", "//third_party/py/numpy", ], + shard_count = 4, tags = ["no_windows"], ) @@ -4228,12 +4266,6 @@ filegroup( visibility = ["//tensorflow:__subpackages__"], ) -filegroup( - name = "hidden_ops", - srcs = ["ops/hidden_ops.txt"], - visibility = ["//tensorflow:__subpackages__"], -) - cuda_py_test( name = "accumulate_n_benchmark", size = "large", diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py index bc9ddec2a54..ea7604d30f1 100644 --- a/tensorflow/python/__init__.py +++ b/tensorflow/python/__init__.py @@ -84,6 +84,7 @@ from tensorflow.python.feature_column import feature_column_lib as feature_colum from tensorflow.python.layers import layers from tensorflow.python.ops import bitwise_ops as bitwise from tensorflow.python.ops import image_ops as image +from tensorflow.python.ops import manip_ops as manip from tensorflow.python.ops import metrics from tensorflow.python.ops import nn from tensorflow.python.ops import sets @@ -241,6 +242,7 @@ _allowed_symbols.extend([ 'linalg', 'logging', 'losses', + 'manip', 'metrics', 'newaxis', 'nn', diff --git a/tensorflow/python/build_defs.bzl 
b/tensorflow/python/build_defs.bzl index 7f29adc06fc..b9056f86e6d 100644 --- a/tensorflow/python/build_defs.bzl +++ b/tensorflow/python/build_defs.bzl @@ -22,7 +22,6 @@ def tf_gen_op_wrapper_private_py(name, out=None, deps=[], bare_op_name = name[:-4] # Strip off the _gen tf_gen_op_wrapper_py(name=bare_op_name, out=out, - hidden_file="ops/hidden_ops.txt", visibility=visibility, deps=deps, require_shape_functions=require_shape_functions, diff --git a/tensorflow/python/client/device_lib_test.py b/tensorflow/python/client/device_lib_test.py index 7bba10efacf..aaf41626ab0 100644 --- a/tensorflow/python/client/device_lib_test.py +++ b/tensorflow/python/client/device_lib_test.py @@ -34,7 +34,8 @@ class DeviceLibTest(test_util.TensorFlowTestCase): # GPU test if test.is_gpu_available(): self.assertGreater(len(devices), 1) - self.assertTrue("GPU" in [d.device_type for d in devices] or "SYCL" in [d.device_type for d in devices]) + self.assertTrue("GPU" in [d.device_type for d in devices] or + "SYCL" in [d.device_type for d in devices]) if __name__ == "__main__": diff --git a/tensorflow/python/client/notebook.py b/tensorflow/python/client/notebook.py index 8babe35b323..4b6a0f71ae6 100644 --- a/tensorflow/python/client/notebook.py +++ b/tensorflow/python/client/notebook.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Notebook front-end to TensorFlow. When you run this binary, you'll see something like below, which indicates @@ -43,10 +42,8 @@ from tensorflow.python.platform import app os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "cpp" os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION_VERSION"] = "2" - FLAGS = None - ORIG_ARGV = sys.argv # Main notebook process calls itself with argv[1]="kernel" to start kernel # subprocesses. @@ -73,8 +70,8 @@ def main(unused_argv): notebookapp.ip = "0.0.0.0" notebookapp.password = passwd(FLAGS.password) else: - print ("\nNo password specified; Notebook server will only be available" - " on the local machine.\n") + print("\nNo password specified; Notebook server will only be available" + " on the local machine.\n") notebookapp.initialize(argv=["--notebook-dir", FLAGS.notebook_dir]) if notebookapp.ip == "0.0.0.0": @@ -125,8 +122,8 @@ if __name__ == "__main__": # kernel app. if IS_KERNEL: # Drop everything except --flagfile. - sys.argv = ([sys.argv[0]] + - [x for x in sys.argv[1:] if x.startswith("--flagfile")]) + sys.argv = ( + [sys.argv[0]] + [x for x in sys.argv[1:] if x.startswith("--flagfile")]) FLAGS, unparsed = parser.parse_known_args() app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py index e6f94396b85..6befeb846d8 100644 --- a/tensorflow/python/client/session.py +++ b/tensorflow/python/client/session.py @@ -35,6 +35,7 @@ from tensorflow.python.ops import session_ops from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import compat from tensorflow.python.util import nest +from tensorflow.python.util.tf_export import tf_export class SessionInterface(object): @@ -1441,6 +1442,7 @@ class BaseSession(SessionInterface): return handles +@tf_export('Session') class Session(BaseSession): """A class for running TensorFlow operations. 
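The `manip_ops` wiring above adds a `tf.manip` namespace to the top-level API. A hedged usage sketch, assuming `roll` is among the ops the new `ops/manip_ops.py` module exposes (the wrapper itself is outside these hunks):

```python
import tensorflow as tf

x = tf.constant([0, 1, 2, 3, 4])
# Roll elements forward by two positions along axis 0.
rolled = tf.manip.roll(x, shift=2, axis=0)

with tf.Session() as sess:
  print(sess.run(rolled))  # [3 4 0 1 2]
```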
@@ -1581,6 +1583,7 @@ class Session(BaseSession): tf_session.TF_Reset(target, containers, config) +@tf_export('InteractiveSession') class InteractiveSession(BaseSession): """A TensorFlow `Session` for use in interactive contexts, such as a shell. diff --git a/tensorflow/python/client/session_benchmark.py b/tensorflow/python/client/session_benchmark.py index 721bca91b71..06e9a099267 100644 --- a/tensorflow/python/client/session_benchmark.py +++ b/tensorflow/python/client/session_benchmark.py @@ -22,6 +22,7 @@ import time import numpy as np +from six.moves import xrange from tensorflow.python.client import session from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py index 768a5db88aa..f12c0055115 100644 --- a/tensorflow/python/client/session_test.py +++ b/tensorflow/python/client/session_test.py @@ -46,6 +46,7 @@ from tensorflow.python.framework import versions from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import data_flow_ops +from tensorflow.python.ops import gen_control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops # Import resource_variable_ops for the variables-to-tensor implicit conversion. @@ -1745,8 +1746,10 @@ class SessionTest(test_util.TensorFlowTestCase): def runTestBuildGraphError(self, sess): # Ensure that errors from building the graph get propagated. data = array_ops.placeholder(dtypes.float32, shape=[]) - enter_1 = control_flow_ops.enter(data, 'foo_1', False) - enter_2 = control_flow_ops.enter(data, 'foo_2', False) + # pylint: disable=protected-access + enter_1 = gen_control_flow_ops._enter(data, 'foo_1', False) + enter_2 = gen_control_flow_ops._enter(data, 'foo_2', False) + # pylint: enable=protected-access res = math_ops.add(enter_1, enter_2) with self.assertRaisesOpError('has inputs from different frames'): sess.run(res, feed_dict={data: 1.0}) diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD index 43cbde69d9d..8b8adefa65a 100644 --- a/tensorflow/python/data/kernel_tests/BUILD +++ b/tensorflow/python/data/kernel_tests/BUILD @@ -357,6 +357,9 @@ tf_py_test( "//tensorflow/python:session", "//tensorflow/python/data/ops:dataset_ops", "//tensorflow/python/data/ops:iterator_ops", + "//tensorflow/python:constant_op", + "//tensorflow/python:string_ops", + "//tensorflow/python:lookup_ops", ], grpc_enabled = True, tags = [ diff --git a/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py b/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py index 45dfa13720b..2c65c49ebdc 100644 --- a/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py +++ b/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py @@ -21,6 +21,7 @@ from tensorflow.core.protobuf import config_pb2 from tensorflow.python.client import session from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import iterator_ops +from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import function @@ -28,6 +29,8 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import functional_ops +from tensorflow.python.ops 
import lookup_ops +from tensorflow.python.ops import string_ops from tensorflow.python.platform import test @@ -103,6 +106,40 @@ class IteratorClusterTest(test.TestCase): "/job:worker/replica:0/task:1/cpu:0", workers[0].target) + def testCaptureHashTableInSharedIterator(self): + worker, _ = test_util.create_local_cluster(1, 1) + + # NOTE(mrry): We must use the V2 variants of `HashTable` + # etc. because these produce a `tf.resource`-typed output that is + # compatible with the in-graph function implementation. + default_val = -1 + keys = constant_op.constant(["brain", "salad", "surgery"]) + values = constant_op.constant([0, 1, 2], dtypes.int64) + table = lookup_ops.HashTable( + lookup_ops.KeyValueTensorInitializer(keys, values), + default_val, + shared_name="shared_table") + + input_sentences = dataset_ops.Dataset.from_tensor_slices( + ["brain brain tank salad surgery", "surgery brain"]) + + iterator = ( + input_sentences.map(lambda x: string_ops.string_split([x]).values).map( + table.lookup) + .make_initializable_iterator(shared_name="shared_iterator")) + init_op = iterator.initializer + get_next = iterator.get_next() + + with session.Session(worker[0].target) as sess: + sess.run(table.init) + sess.run(init_op) + self.assertAllEqual([0, 0, -1, 1, 2], sess.run(get_next)) + + with session.Session(worker[0].target) as sess: + self.assertAllEqual([2, 0], sess.run(get_next)) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py index c1ba67e4744..c4b7e4919bb 100644 --- a/tensorflow/python/data/ops/dataset_ops.py +++ b/tensorflow/python/data/ops/dataset_ops.py @@ -41,8 +41,10 @@ from tensorflow.python.ops import gen_io_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import script_ops from tensorflow.python.util import deprecation +from tensorflow.python.util.tf_export import tf_export +@tf_export("data.Dataset") class Dataset(object): """Represents a potentially large set of elements. @@ -556,6 +558,8 @@ class Dataset(object): - /path/to/dir/b.py - /path/to/dir/c.py + NOTE: The order of the file names returned can be non-deterministic. + Args: file_pattern: A string or scalar string `tf.Tensor`, representing the filename pattern that will be matched. @@ -899,10 +903,11 @@ class Dataset(object): Args: transformation_func: A function that takes one `Dataset` argument and - returns a `Dataset`. + returns a `Dataset`. Returns: - Dataset: The `Dataset` returned by applying `transformation_func` to this dataset. + Dataset: The `Dataset` returned by applying `transformation_func` to this + dataset. 
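The clarified `apply` docstring above boils down to: the transformation function receives one `Dataset` and must return a `Dataset`. A small sketch of that contract (the transformation itself is made up for illustration):

```python
import tensorflow as tf

def double_and_batch(dataset):
  # Takes a Dataset, returns a Dataset -- the contract `apply` enforces.
  return dataset.map(lambda x: x * 2).batch(2)

dataset = tf.data.Dataset.range(6).apply(double_and_batch)
next_element = dataset.make_one_shot_iterator().get_next()

with tf.Session() as sess:
  print(sess.run(next_element))  # [0 2]
  print(sess.run(next_element))  # [4 6]
```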
""" dataset = transformation_func(self) if not isinstance(dataset, Dataset): @@ -1454,6 +1459,19 @@ def _padding_value_to_tensor(value, output_type): return value +def _default_padding(input_dataset): + + def make_zero(t): + if t.base_dtype == dtypes.string: + return "" + elif t.base_dtype == dtypes.variant: + raise TypeError("Unable to create padding for field of type 'variant'") + else: + return np.zeros_like(t.as_numpy_dtype()) + + return nest.map_structure(make_zero, input_dataset.output_types) + + class PaddedBatchDataset(Dataset): """A `Dataset` that batches and pads contiguous elements from its input.""" @@ -1469,23 +1487,13 @@ class PaddedBatchDataset(Dataset): batch_size, dtype=dtypes.int64, name="batch_size") padding_values = ( padding_values - if padding_values is not None else self._default_padding(input_dataset)) + if padding_values is not None else _default_padding(input_dataset)) self._padded_shapes = nest.map_structure_up_to( input_dataset.output_shapes, _partial_shape_to_tensor, padded_shapes) self._padding_values = nest.map_structure_up_to( input_dataset.output_shapes, _padding_value_to_tensor, padding_values, input_dataset.output_types) - def _default_padding(self, input_dataset): - - def make_zero(t): - if t.base_dtype == dtypes.string: - return "" - else: - return np.zeros_like(t.as_numpy_dtype()) - - return nest.map_structure(make_zero, input_dataset.output_types) - def _as_variant_tensor(self): return gen_dataset_ops.padded_batch_dataset( self._input_dataset._as_variant_tensor(), # pylint: disable=protected-access diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py index 0cbdb3ab19d..e573fe01928 100644 --- a/tensorflow/python/data/ops/iterator_ops.py +++ b/tensorflow/python/data/ops/iterator_ops.py @@ -25,6 +25,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import gen_dataset_ops +from tensorflow.python.util.tf_export import tf_export # NOTE(mrry): It is legitimate to call `Iterator.get_next()` multiple @@ -47,6 +48,7 @@ GET_NEXT_CALL_WARNING_MESSAGE = ( "`next_element` inside the loop.") +@tf_export("data.Iterator") class Iterator(object): """Represents the state of iterating through a `Dataset`.""" @@ -165,8 +167,10 @@ class Iterator(object): iterator_resource = gen_dataset_ops.iterator( container="", shared_name=shared_name, - output_types=nest.flatten(output_types), - output_shapes=nest.flatten(output_shapes)) + output_types=nest.flatten( + sparse.as_dense_types(output_types, output_classes)), + output_shapes=nest.flatten( + sparse.as_dense_shapes(output_shapes, output_classes))) return Iterator(iterator_resource, None, output_types, output_shapes, output_classes) @@ -232,8 +236,10 @@ class Iterator(object): string_handle = ops.convert_to_tensor(string_handle, dtype=dtypes.string) iterator_resource = gen_dataset_ops.iterator_from_string_handle( string_handle, - output_types=nest.flatten(output_types), - output_shapes=nest.flatten(output_shapes)) + output_types=nest.flatten( + sparse.as_dense_types(output_types, output_classes)), + output_shapes=nest.flatten( + sparse.as_dense_shapes(output_shapes, output_classes))) return Iterator(iterator_resource, None, output_types, output_shapes, output_classes) diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py index 830dc5cec4a..fa7601741b1 100644 --- a/tensorflow/python/data/ops/readers.py +++ 
b/tensorflow/python/data/ops/readers.py @@ -23,12 +23,14 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import gen_dataset_ops +from tensorflow.python.util.tf_export import tf_export # TODO(b/64974358): Increase default buffer size to 256 MB. _DEFAULT_READER_BUFFER_SIZE_BYTES = 256 * 1024 # 256 KB +@tf_export("data.TextLineDataset") class TextLineDataset(Dataset): """A `Dataset` comprising lines from one or more text files.""" @@ -71,6 +73,7 @@ class TextLineDataset(Dataset): return dtypes.string +@tf_export("data.TFRecordDataset") class TFRecordDataset(Dataset): """A `Dataset` comprising records from one or more TFRecord files.""" @@ -115,6 +118,7 @@ class TFRecordDataset(Dataset): return dtypes.string +@tf_export("data.FixedLengthRecordDataset") class FixedLengthRecordDataset(Dataset): """A `Dataset` of fixed-length records from one or more binary files.""" diff --git a/tensorflow/python/data/util/nest.py b/tensorflow/python/data/util/nest.py index 2455395635c..e90ce3fb40a 100644 --- a/tensorflow/python/data/util/nest.py +++ b/tensorflow/python/data/util/nest.py @@ -266,7 +266,7 @@ def map_structure(func, *structure, **check_types_dict): and the return value will contain the results in the same structure. Args: - func: A callable that acceps as many arguments are there are structures. + func: A callable that accepts as many arguments are there are structures. *structure: scalar, or tuple or list of constructed scalars and/or other tuples/lists, or scalars. Note: numpy arrays are considered scalars. **check_types_dict: only valid keyword argument is `check_types`. If set to @@ -383,8 +383,8 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True): "structure has keys %s, while shallow structure has keys %s." % (list(_six.iterkeys(input_tree)), list(_six.iterkeys(shallow_tree)))) - input_tree = list(_six.iteritems(input_tree)) - shallow_tree = list(_six.iteritems(shallow_tree)) + input_tree = list(sorted(_six.iteritems(input_tree))) + shallow_tree = list(sorted(_six.iteritems(shallow_tree))) for shallow_branch, input_branch in zip(shallow_tree, input_tree): assert_shallow_structure(shallow_branch, input_branch, @@ -479,8 +479,8 @@ def map_structure_up_to(shallow_tree, func, *inputs): The `inputs`, can be thought of as having the same structure as `shallow_tree`, but with leaf nodes that are themselves tree structures. - This function therefore will return something with the same base structure as - `shallow_tree`. + This function, therefore, will return something with the same base structure + as `shallow_tree`. 
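The `_default_padding` helper moved to module level above is what `padded_batch` falls back on when `padding_values` is omitted: zeros for numeric components and the empty string for string components. A minimal sketch of that default behavior (the toy pipeline is illustrative only):

```python
import tensorflow as tf

# Variable-length elements: [1], [1, 2], [1, 2, 3].
dataset = tf.data.Dataset.range(1, 4).map(lambda n: tf.range(1, n + 1))
# No padding_values given, so the int64 components are padded with zeros.
padded = dataset.padded_batch(3, padded_shapes=tf.TensorShape([None]))

with tf.Session() as sess:
  print(sess.run(padded.make_one_shot_iterator().get_next()))
  # [[1 0 0]
  #  [1 2 0]
  #  [1 2 3]]
```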
Examples: diff --git a/tensorflow/python/data/util/nest_test.py b/tensorflow/python/data/util/nest_test.py index 90dd7dfe777..ff380815a4a 100644 --- a/tensorflow/python/data/util/nest_test.py +++ b/tensorflow/python/data/util/nest_test.py @@ -277,6 +277,10 @@ class NestTest(test.TestCase): with self.assertRaisesRegexp(ValueError, expected_message): nest.assert_shallow_structure(inp_ab2, inp_ab1) + inp_ab = collections.OrderedDict([("a", 1), ("b", (2, 3))]) + inp_ba = collections.OrderedDict([("b", (2, 3)), ("a", 1)]) + nest.assert_shallow_structure(inp_ab, inp_ba) + def testFlattenUpTo(self): input_tree = (((2, 2), (3, 3)), ((4, 9), (5, 5))) shallow_tree = ((True, True), (False, True)) diff --git a/tensorflow/python/data/util/sparse.py b/tensorflow/python/data/util/sparse.py index 5ebcb4ea81b..5e6d2247097 100644 --- a/tensorflow/python/data/util/sparse.py +++ b/tensorflow/python/data/util/sparse.py @@ -141,7 +141,7 @@ def serialize_sparse_tensors(tensors): tensors: a tensor structure to serialize. Returns: - `tensors` with any sparse tensors replaced by the their serialized version. + `tensors` with any sparse tensors replaced by their serialized version. """ ret = nest.pack_sequence_as(tensors, [ diff --git a/tensorflow/python/debug/cli/tensor_format.py b/tensorflow/python/debug/cli/tensor_format.py index d4aea76d652..e0759a8bc1a 100644 --- a/tensorflow/python/debug/cli/tensor_format.py +++ b/tensorflow/python/debug/cli/tensor_format.py @@ -535,7 +535,7 @@ def numeric_summary(tensor): if not isinstance(tensor, np.ndarray) or not np.size(tensor): return debugger_cli_common.RichTextLines([ "No numeric summary available due to empty tensor."]) - elif (np.issubdtype(tensor.dtype, np.float) or + elif (np.issubdtype(tensor.dtype, np.floating) or np.issubdtype(tensor.dtype, np.complex) or np.issubdtype(tensor.dtype, np.integer)): counts = [ diff --git a/tensorflow/python/debug/examples/debug_fibonacci.py b/tensorflow/python/debug/examples/debug_fibonacci.py index 704dbda357d..3821b393ec6 100644 --- a/tensorflow/python/debug/examples/debug_fibonacci.py +++ b/tensorflow/python/debug/examples/debug_fibonacci.py @@ -44,6 +44,10 @@ def main(_): sess.run(tf.global_variables_initializer()) # Wrap the TensorFlow Session object for debugging. + if FLAGS.debug and FLAGS.tensorboard_debug_address: + raise ValueError( + "The --debug and --tensorboard_debug_address flags are mutually " + "exclusive.") if FLAGS.debug: sess = tf_debug.LocalCLIDebugWrapperSession(sess) @@ -52,6 +56,9 @@ def main(_): sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan) sess.add_tensor_filter("has_negative", has_negative) + elif FLAGS.tensorboard_debug_address: + sess = tf_debug.TensorBoardDebugWrapperSession( + sess, FLAGS.tensorboard_debug_address) print("Fibonacci number at position %d:\n%s" % (FLAGS.length, sess.run(n1))) @@ -82,7 +89,15 @@ if __name__ == "__main__": "--debug", dest="debug", action="store_true", - help="Use TensorFlow Debugger (tfdbg).") + help="Use TensorFlow Debugger (tfdbg). Mutually exclusive with the " + "--tensorboard_debug_address flag.") + parser.add_argument( + "--tensorboard_debug_address", + type=str, + default=None, + help="Connect to the TensorBoard Debugger Plugin backend specified by " + "the gRPC address (e.g., localhost:1234). 
Mutually exclusive with the " + "--debug flag.") FLAGS, unparsed = parser.parse_known_args() tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/python/debug/examples/debug_mnist.py b/tensorflow/python/debug/examples/debug_mnist.py index 0a6dbf311d8..ab1c90371cd 100644 --- a/tensorflow/python/debug/examples/debug_mnist.py +++ b/tensorflow/python/debug/examples/debug_mnist.py @@ -120,8 +120,15 @@ def main(_): sess.run(tf.global_variables_initializer()) + if FLAGS.debug and FLAGS.tensorboard_debug_address: + raise ValueError( + "The --debug and --tensorboard_debug_address flags are mutually " + "exclusive.") if FLAGS.debug: sess = tf_debug.LocalCLIDebugWrapperSession(sess, ui_type=FLAGS.ui_type) + elif FLAGS.tensorboard_debug_address: + sess = tf_debug.TensorBoardDebugWrapperSession( + sess, FLAGS.tensorboard_debug_address) # Add this point, sess is a debug wrapper around the actual Session if # FLAGS.debug is true. In that case, calling run() will launch the CLI. @@ -173,6 +180,14 @@ if __name__ == "__main__": nargs="?", const=True, default=False, - help="Use debugger to track down bad values during training") + help="Use debugger to track down bad values during training. " + "Mutually exclusive with the --tensorboard_debug_address flag.") + parser.add_argument( + "--tensorboard_debug_address", + type=str, + default=None, + help="Connect to the TensorBoard Debugger Plugin backend specified by " + "the gRPC address (e.g., localhost:1234). Mutually exclusive with the " + "--debug flag.") FLAGS, unparsed = parser.parse_known_args() tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/python/debug/examples/debug_tflearn_iris.py b/tensorflow/python/debug/examples/debug_tflearn_iris.py index 92314d8dd9f..4f4666ee4fa 100644 --- a/tensorflow/python/debug/examples/debug_tflearn_iris.py +++ b/tensorflow/python/debug/examples/debug_tflearn_iris.py @@ -110,10 +110,16 @@ def main(_): model_dir=model_dir) hooks = None + if FLAGS.debug and FLAGS.tensorboard_debug_address: + raise ValueError( + "The --debug and --tensorboard_debug_address flags are mutually " + "exclusive.") if FLAGS.debug: debug_hook = tf_debug.LocalCLIDebugHook(ui_type=FLAGS.ui_type, dump_root=FLAGS.dump_root) - hooks = [debug_hook] + elif FLAGS.tensorboard_debug_address: + debug_hook = tf_debug.TensorBoardDebugHook(FLAGS.tensorboard_debug_address) + hooks = [debug_hook] if not FLAGS.use_experiment: # Fit model. @@ -185,11 +191,19 @@ if __name__ == "__main__": nargs="?", const=True, default=False, - help="Use debugger to track down bad values during training") + help="Use debugger to track down bad values during training. " + "Mutually exclusive with the --tensorboard_debug_address flag.") parser.add_argument( "--dump_root", type=str, default="", help="Optional custom root directory for temporary debug dump data") + parser.add_argument( + "--tensorboard_debug_address", + type=str, + default=None, + help="Connect to the TensorBoard Debugger Plugin backend specified by " + "the gRPC address (e.g., localhost:1234). 
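The flag pair added to these debugger examples selects between two mutually exclusive session wrappers. A condensed sketch of the selection logic outside the argparse scaffolding (the address value is a hypothetical placeholder):

```python
import tensorflow as tf
from tensorflow.python import debug as tf_debug

sess = tf.Session()

use_cli_debugger = False                       # stands in for --debug
tensorboard_debug_address = 'localhost:6064'   # stands in for the new flag

if use_cli_debugger and tensorboard_debug_address:
  raise ValueError('The two debugger modes are mutually exclusive.')
if use_cli_debugger:
  sess = tf_debug.LocalCLIDebugWrapperSession(sess)
elif tensorboard_debug_address:
  # Streams debug data to the TensorBoard Debugger Plugin backend over gRPC.
  sess = tf_debug.TensorBoardDebugWrapperSession(sess,
                                                 tensorboard_debug_address)
```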
Mutually exclusive with the " + "--debug flag.") FLAGS, unparsed = parser.parse_known_args() tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/python/debug/lib/debug_data.py b/tensorflow/python/debug/lib/debug_data.py index c4b13a1045d..8d355aa27f6 100644 --- a/tensorflow/python/debug/lib/debug_data.py +++ b/tensorflow/python/debug/lib/debug_data.py @@ -222,7 +222,7 @@ def has_inf_or_nan(datum, tensor): # Also return False for data types that cannot be represented as numpy # arrays. return False - elif (np.issubdtype(tensor.dtype, np.float) or + elif (np.issubdtype(tensor.dtype, np.floating) or np.issubdtype(tensor.dtype, np.complex) or np.issubdtype(tensor.dtype, np.integer)): return np.any(np.isnan(tensor)) or np.any(np.isinf(tensor)) diff --git a/tensorflow/python/debug/lib/debug_gradients_test.py b/tensorflow/python/debug/lib/debug_gradients_test.py index b6c7280a415..c1e9869d978 100644 --- a/tensorflow/python/debug/lib/debug_gradients_test.py +++ b/tensorflow/python/debug/lib/debug_gradients_test.py @@ -22,6 +22,7 @@ import shutil import tempfile from tensorflow.core.protobuf import config_pb2 +from tensorflow.core.protobuf import rewriter_config_pb2 from tensorflow.python.client import session from tensorflow.python.debug.lib import debug_data from tensorflow.python.debug.lib import debug_gradients @@ -38,7 +39,11 @@ from tensorflow.python.training import gradient_descent class IdentifyGradientTest(test_util.TensorFlowTestCase): def setUp(self): - self.sess = session.Session() + rewriter_config = rewriter_config_pb2.RewriterConfig( + dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF) + graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config) + config = config_pb2.ConfigProto(graph_options=graph_options) + self.sess = session.Session(config=config) with self.sess.as_default(): self.u = variables.Variable(2.0, name="u") self.v = variables.Variable(3.0, name="v") diff --git a/tensorflow/python/debug/lib/session_debug_grpc_test.py b/tensorflow/python/debug/lib/session_debug_grpc_test.py index 367b3535450..b623ee31c5d 100644 --- a/tensorflow/python/debug/lib/session_debug_grpc_test.py +++ b/tensorflow/python/debug/lib/session_debug_grpc_test.py @@ -54,7 +54,8 @@ from tensorflow.python.training import monitored_session def no_rewrite_session_config(): rewriter_config = rewriter_config_pb2.RewriterConfig( disable_model_pruning=True, - arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF) + arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF, + dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF) graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config) return config_pb2.ConfigProto(graph_options=graph_options) diff --git a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py index acea9433e22..254201c3937 100644 --- a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py +++ b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py @@ -389,6 +389,11 @@ class DumpingDebugWrapperSessionTest(test_util.TensorFlowTestCase): r"mode\."): sess.invoke_node_stepper(node_stepper) + def testDumpingWrapperWithEmptyFetchWorks(self): + sess = dumping_wrapper.DumpingDebugWrapperSession( + self.sess, session_root=self.session_root, log_usage=False) + sess.run([]) + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/debug/wrappers/framework.py b/tensorflow/python/debug/wrappers/framework.py index 
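The `np.float` to `np.floating` change repeated in these dtype checks matters because `np.float` is merely an alias for Python's built-in `float`, while `np.floating` is the abstract parent of all NumPy floating dtypes. A short illustration:

```python
import numpy as np

print(np.issubdtype(np.float32, np.floating))  # True
print(np.issubdtype(np.float64, np.floating))  # True
print(np.issubdtype(np.int32, np.floating))    # False
```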
909150eb6aa..c530204bbf6 100644 --- a/tensorflow/python/debug/wrappers/framework.py +++ b/tensorflow/python/debug/wrappers/framework.py @@ -121,7 +121,9 @@ from tensorflow.python.debug.lib import debug_utils from tensorflow.python.debug.lib import stepper from tensorflow.python.framework import errors from tensorflow.python.framework import ops +from tensorflow.python.platform import tf_logging from tensorflow.python.training import monitored_session +from tensorflow.python.util import nest # Helper function. @@ -439,7 +441,12 @@ class BaseDebugWrapperSession(session.SessionInterface): "callable_runner and fetches/feed_dict are mutually exclusive, but " "are used simultaneously.") - if self._is_disabled_thread(): + empty_fetches = not nest.flatten(fetches) + if empty_fetches: + tf_logging.info( + "Due to empty fetches, tfdbg Session wrapper is letting a " + "Session.run pass through without any debugging actions.") + if self._is_disabled_thread() or empty_fetches: if callable_runner: return callable_runner(*callable_runner_args) else: diff --git a/tensorflow/python/debug/wrappers/hooks.py b/tensorflow/python/debug/wrappers/hooks.py index 989ad801e53..0204254ccab 100644 --- a/tensorflow/python/debug/wrappers/hooks.py +++ b/tensorflow/python/debug/wrappers/hooks.py @@ -35,10 +35,7 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook): `tf.contrib.learn`'s `Estimator`s and `Experiment`s. """ - def __init__(self, - ui_type="curses", - dump_root=None, - thread_name_filter=None): + def __init__(self, ui_type="curses", dump_root=None, thread_name_filter=None): """Create a local debugger command-line interface (CLI) hook. Args: @@ -62,7 +59,8 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook): """Add a tensor filter. See doc of `LocalCLIDebugWrapperSession.add_tensor_filter()` for details. - Override default behavior to accommodate the possibility of this method being + Override default behavior to accommodate the possibility of this method + being called prior to the initialization of the underlying `LocalCLIDebugWrapperSession` object. @@ -137,9 +135,7 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook): # pylint: enable=protected-access with stepper.NodeStepper( - run_context.session, - run_context.original_args. - fetches, + run_context.session, run_context.original_args.fetches, run_context.original_args.feed_dict) as node_stepper: self._session_wrapper.invoke_node_stepper( node_stepper, restore_variable_values_on_exit=True) @@ -149,8 +145,8 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook): def after_run(self, run_context, run_values): # Adapt run_context and run_values to OnRunEndRequest and invoke superclass # on_run_end() - on_run_end_request = framework.OnRunEndRequest( - self._performed_action, run_values.run_metadata) + on_run_end_request = framework.OnRunEndRequest(self._performed_action, + run_values.run_metadata) self._session_wrapper.on_run_end(on_run_end_request) @@ -260,8 +256,8 @@ class GrpcDebugHook(session_run_hook.SessionRunHook): self._thread_name_filter = thread_name_filter self._grpc_debug_server_addresses = ( grpc_debug_server_addresses - if isinstance(grpc_debug_server_addresses, list) - else [grpc_debug_server_addresses]) + if isinstance(grpc_debug_server_addresses, list) else + [grpc_debug_server_addresses]) self._watch_fn = watch_fn self._log_usage = log_usage @@ -334,6 +330,7 @@ class TensorBoardDebugHook(GrpcDebugHook): log_usage: Whether the usage of this class is to be logged (if applicable). 
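The empty-fetch pass-through added to `BaseDebugWrapperSession` above hinges on `nest.flatten` returning an empty list for structures with no leaves. A quick sketch of that check using the same `nest` module the wrapper imports:

```python
from tensorflow.python.util import nest

# A structure with no leaf tensors flattens to [], so `not nest.flatten(f)`
# is a convenient "nothing to fetch" test.
print(nest.flatten([]))                                # []
print(nest.flatten({'foo': {'baz': []}, 'bar': ()}))   # []
print(nest.flatten({'foo': [1], 'bar': (2,)}))         # [2, 1]
```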
""" + def _gated_grpc_watch_fn(fetches, feeds): del fetches, feeds # Unused. return framework.WatchOptions( diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py index 770a496aa9d..490812c96d8 100644 --- a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py +++ b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py @@ -664,6 +664,20 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase): [["run"], ["run"]], monitored_sess) self.assertFalse(wrapped_monitored_sess.should_stop()) + def testRunsWithEmptyFetchWorks(self): + wrapped_sess = LocalCLIDebuggerWrapperSessionForTest( + [["run"]], self.sess, dump_root="") + + run_output = wrapped_sess.run([]) + self.assertEqual([], run_output) + + def testRunsWithEmptyNestedFetchWorks(self): + wrapped_sess = LocalCLIDebuggerWrapperSessionForTest( + [["run"]], self.sess, dump_root="") + + run_output = wrapped_sess.run({"foo": {"baz": []}, "bar": ()}) + self.assertEqual({"foo": {"baz": []}, "bar": ()}, run_output) + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD index 9e3382d4f30..ab81d401484 100644 --- a/tensorflow/python/eager/BUILD +++ b/tensorflow/python/eager/BUILD @@ -206,29 +206,6 @@ cc_library( ], ) -cc_library( - name = "python_eager_op_gen_main", - srcs = [ - "python_eager_op_gen_main.cc", - ], - visibility = ["//visibility:public"], - deps = [ - ":python_eager_op_gen", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:op_gen_lib", - "//tensorflow/core:protos_all_cc", - ], -) - -tf_cc_binary( - name = "python_eager_op_gen_demo", - deps = [ - ":python_eager_op_gen_main", - "//tensorflow/core:ops", - ], -) - py_library( name = "custom_gradient", srcs = ["custom_gradient.py"], diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py index a2a3e230bbb..d79d1fc0a64 100644 --- a/tensorflow/python/eager/backprop.py +++ b/tensorflow/python/eager/backprop.py @@ -734,7 +734,7 @@ def _num_elements(grad): raise ValueError("`grad` not a Tensor or IndexedSlices.") -_last_shape_dtype = [None, None] +_last_zero_shape_dtype = [None, None] _last_zero = [None] @@ -748,13 +748,15 @@ def _zeros(shape, dtype): # TODO(apassos): need to save enough information about variant tensors to do # a zeros return None - if [shape, dtype] != _last_shape_dtype: - _last_shape_dtype[:] = [shape, dtype] + if [shape, dtype] != _last_zero_shape_dtype: + _last_zero_shape_dtype[:] = [shape, dtype] _last_zero[0] = _fast_fill(0, shape, dtype) return _last_zero[0] def _ones(shape, dtype): + if shape == (): # pylint: disable=g-explicit-bool-comparison + return constant_op.constant(1, dtype=dtype) return _fast_fill(1, shape, dtype) diff --git a/tensorflow/python/eager/execution_callbacks.py b/tensorflow/python/eager/execution_callbacks.py index 2f1654dda49..988442c971f 100644 --- a/tensorflow/python/eager/execution_callbacks.py +++ b/tensorflow/python/eager/execution_callbacks.py @@ -153,7 +153,7 @@ def inf_nan_callback(op_type, continue numpy_dtype = output.dtype.as_numpy_dtype - if (np.issubdtype(numpy_dtype, np.float) or + if (np.issubdtype(numpy_dtype, np.floating) or np.issubdtype(numpy_dtype, np.complex) or np.issubdtype(numpy_dtype, np.integer)): try: diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index 81b1f6f12a1..246df9afefb 100644 --- a/tensorflow/python/eager/function.py +++ 
b/tensorflow/python/eager/function.py @@ -292,6 +292,22 @@ def _map_sequence_obj_to_idx(sequence): return {id(x): i for i, x in enumerate(sequence)} +def _flatten(sequence): + """A wrapper around `nest.flatten` that also unpacks `IndexedSlices`.""" + # TODO(akshayka): Support `SparseTensor` in a similar fashion. + flat_sequence = nest.flatten(sequence) + outputs = [] + for item in flat_sequence: + if isinstance(item, ops.IndexedSlices): + if item.dense_shape is not None: + outputs.extend([item.values, item.indices, item.dense_shape]) + else: + outputs.extend([item.values, item.indices]) + else: + outputs.append(item) + return outputs + + class GraphModeFunction(object): """Callable object representing a graph-mode function. @@ -333,14 +349,14 @@ class GraphModeFunction(object): self._input_placeholders = input_placeholders self._extra_inputs = list(extra_inputs) self._graph = graph - self._has_backprop = False + self._backward_function = None self._func_name = name self._function_def = defined_function self._num_outputs = len(defined_function.signature.output_arg) self._ops = operations self._func_outputs = func_outputs self._returns = [func_outputs] if isinstance( - func_outputs, (ops.Tensor, type(None))) else list(func_outputs) + func_outputs, (ops.Tensor, type(None))) else _flatten(func_outputs) self._output_shapes = output_shapes self._variables = variables if variables is not None else [] @@ -348,9 +364,8 @@ class GraphModeFunction(object): def variables(self): return self._variables - def _compute_backprop(self): - """Computes the backprop function object for this function.""" - self._has_backprop = True + def _construct_backprop_function(self): + """Constructs the backprop function object for this function.""" with self._graph.as_default(), context.graph_mode(): c = _CapturingContext() with c: @@ -361,13 +376,16 @@ class GraphModeFunction(object): filtered_outputs, self._input_placeholders, grad_ys=self._out_grad_placeholders) - shapes = tuple(x.shape for x in in_gradients if x is not None) + + backward_outputs = tuple( + grad for grad in _flatten(in_gradients) if grad is not None) + output_shapes = tuple(grad.shape for grad in backward_outputs) + captures = list(sorted(c.captured_tensors, key=lambda x: x.name)) forward_name = _forward_name(self._func_name) self._forward_fdef = _EagerDefinedFunction( forward_name, self._graph, self._ops, self._input_placeholders, filtered_outputs + captures) - backward_outputs = tuple(x for x in in_gradients if x is not None) all_inputs = self._out_grad_placeholders + captures # Excluding input ops from the body as we do not intend to execute these # operations when the function is executed. @@ -381,7 +399,7 @@ class GraphModeFunction(object): bname = _backward_name(self._func_name) self._backward_function = GraphModeFunction( bname, all_inputs, [], self._graph, function_def_ops, - backward_outputs, in_gradients, shapes) + backward_outputs, in_gradients, output_shapes) def _backprop_call(self, args): """Calls the wrapped function and records the result on a tape.""" @@ -426,9 +444,24 @@ class GraphModeFunction(object): @property def output_shapes(self): + """The function's output shapes.""" # TODO(ebrevdo): Should we only keep the output shapes associated # with len(self._returns) outputs? 
- return nest.pack_sequence_as(self._func_outputs, self._output_shapes) + outputs_list = nest.flatten(self._func_outputs) + j = 0 + for i, o in enumerate(outputs_list): + if o is not None: + if isinstance(o, ops.IndexedSlices): + # Extract the shape of the `IndexedSlices` object's `values` field. + outputs_list[i] = self._output_shapes[j] # the `values` shape + if o.dense_shape is not None: + j += 3 # skip over shapes for `values`, `indices`, `dense_shape` + else: + j += 2 # skip over shapes for `values`, `indices` + else: + outputs_list[i] = self._output_shapes[j] + j += 1 + return nest.pack_sequence_as(self._func_outputs, outputs_list) @property def output_dtypes(self): @@ -457,12 +490,11 @@ class GraphModeFunction(object): if v._trainable: # pylint: disable=protected-access tape.watch_variable(v) - tensor_inputs = [x for x in nest.flatten(args) - if isinstance(x, ops.Tensor)] + tensor_inputs = [x for x in nest.flatten(args) if isinstance(x, ops.Tensor)] if tape.should_record(tensor_inputs) or tape.should_record( self._extra_inputs): - if not self._has_backprop: - self._compute_backprop() + if self._backward_function is None: + self._construct_backprop_function() return self._backprop_call(tensor_inputs) ctx = context.context() @@ -503,13 +535,30 @@ class GraphModeFunction(object): """ if self._func_outputs is None: return None + # Use `nest.flatten` instead of `_flatten` in order to preserve any + # IndexedSlices in `self._func_outputs`. outputs_list = nest.flatten(self._func_outputs) j = 0 for i, o in enumerate(outputs_list): if o is not None: - outputs_list[i] = result[j] - j += 1 - return nest.pack_sequence_as(self._func_outputs, outputs_list) + if isinstance(o, ops.IndexedSlices): + # Repack Tensors for IndexedSlices. + if o.dense_shape is not None: + outputs_list[i] = ops.IndexedSlices( + values=result[j], + indices=result[j + 1], + dense_shape=result[j + 2]) + j += 3 + else: + outputs_list[i] = ops.IndexedSlices( + values=result[j], + indices=result[j + 1]) + j += 2 + else: + outputs_list[i] = result[j] + j += 1 + ret = nest.pack_sequence_as(self._func_outputs, outputs_list) + return ret def _get_defun_inputs(args): @@ -526,15 +575,13 @@ def _get_defun_inputs(args): def _defun_internal(name, func, args, kwds): """Defines and returns graph-mode version of func.""" - container_prefix = ops.get_default_graph()._container_prefix # pylint: disable=protected-access + graph_key = ops.get_default_graph()._graph_key # pylint: disable=protected-access with context.graph_mode(): captures = {} tmp_graph = CapturingGraph(captures) - # Inherit the container prefix, since this is used for error checking when - # isolating eager execution (the container prefix at creation must match the - # container prefix when used, and variables accessed in the defun will be - # used in the outside context). - tmp_graph._container_prefix = container_prefix # pylint: disable=protected-access + # Inherit the graph key, since this is used for matching variables in + # optimizers. + tmp_graph._graph_key = graph_key # pylint: disable=protected-access # Copy the graph collections to ensure summaries and other things work. This # lets the function access (but not mutate) collections of the containing # graph, such as the global step and the summary writer collections. @@ -555,7 +602,7 @@ def _defun_internal(name, func, args, kwds): # Returning a closed-over tensor as an output does not trigger a # call to convert_to_tensor, so we manually capture all such tensors. 
- outputs_list = nest.flatten(func_outputs) + outputs_list = _flatten(func_outputs) func_def_outputs = [ _convert_to_graph_tensor(x) for x in outputs_list if x is not None ] @@ -600,6 +647,18 @@ def _cache_key(x): """Cache key for tfe functions.""" if isinstance(x, ops.Tensor): return _TensorDtype(x.dtype, x._shape_tuple()) # pylint: disable=protected-access + if isinstance(x, ops.IndexedSlices): + if x.dense_shape is not None: + return tuple([ + _TensorDtype(x.values.dtype, x.values._shape_tuple()), # pylint: disable=protected-access + _TensorDtype(x.indices.dtype, x.indices._shape_tuple()), # pylint: disable=protected-access + _TensorDtype(x.dense_shape.dtype, x.dense_shape._shape_tuple()) # pylint: disable=protected-access + ]) + else: + return tuple([ + _TensorDtype(x.values.dtype, x.values._shape_tuple()), # pylint: disable=protected-access + _TensorDtype(x.indices.dtype, x.indices._shape_tuple()) # pylint: disable=protected-access + ]) if isinstance(x, np.ndarray): return ("array", x.shape, tuple(x.reshape(-1))) if isinstance(x, (list, tuple)): diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py index 0babc29f17b..3e8e67ac7e2 100644 --- a/tensorflow/python/eager/function_test.py +++ b/tensorflow/python/eager/function_test.py @@ -374,6 +374,78 @@ class FunctionTest(test.TestCase): self.assertAllEqual(f(constant_op.constant(1.0)), 2.0) + def testGradientOfGatherWithDefun(self): + + v = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0]) + + def sum_gather(): + return math_ops.reduce_sum(array_ops.gather(v, [1, 2])) + + grad_fn = backprop.implicit_grad(sum_gather) + gradient = grad_fn() + defun_grad_fn = backprop.implicit_grad(function.defun(sum_gather)) + defun_gradient = defun_grad_fn() + self.assertEqual(len(gradient), len(defun_gradient)) + + gradient = gradient[0][0] + defun_gradient = defun_gradient[0][0] + self.assertAllEqual(gradient.values, defun_gradient.values) + self.assertAllEqual(gradient.indices, defun_gradient.indices) + self.assertAllEqual(gradient.dense_shape, defun_gradient.dense_shape) + + def testReturningIndexedSlicesWithDefun(self): + + def validate(indexed_slice): + def f(): + return indexed_slice + + output = function.defun(f)() + self.assertTrue(isinstance(output, ops.IndexedSlices)) + self.assertAllEqual(indexed_slice.values, output.values) + self.assertAllEqual(indexed_slice.indices, output.indices) + self.assertAllEqual(indexed_slice.dense_shape, output.dense_shape) + + self.assertEqual( + function.make_defun_op(f).output_shapes, indexed_slice.values.shape) + + arg = ops.IndexedSlices( + values=constant_op.constant([1, 2]), + indices=constant_op.constant([0, 1]), + dense_shape=constant_op.constant([2])) + validate(arg) + + arg = ops.IndexedSlices( + values=constant_op.constant([1, 2]), + indices=constant_op.constant([0, 1]), + dense_shape=None) + validate(arg) + + def testIndexedSliceAsArgumentWithDefun(self): + + @function.defun + def f(indexed_slice): + return indexed_slice + + def validate(arg): + output = f(arg) + self.assertTrue(isinstance(output, ops.IndexedSlices)) + self.assertAllEqual(arg.values, output.values) + self.assertAllEqual(arg.indices, output.indices) + self.assertAllEqual(arg.dense_shape, output.dense_shape) + + indexed_slice = ops.IndexedSlices( + values=constant_op.constant([1]), + indices=constant_op.constant([0]), + dense_shape=constant_op.constant([1])) + validate(indexed_slice) + + # Test that `f` works even when `dense_shape` is None. 
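The `_flatten` helper and the repacking logic above follow a simple convention: an `IndexedSlices` crosses the function boundary as its component tensors and is rebuilt on the way out. A sketch of that round trip with a hand-made value (the numbers are illustrative):

```python
import tensorflow as tf

slices = tf.IndexedSlices(
    values=tf.constant([[1., 2.], [3., 4.]]),
    indices=tf.constant([0, 2]),
    dense_shape=tf.constant([4, 2]))

# Flatten: three component tensors when dense_shape is set, two otherwise.
components = [slices.values, slices.indices, slices.dense_shape]

# Repack: positional order is (values, indices, dense_shape).
rebuilt = tf.IndexedSlices(*components)
```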
+ indexed_slice = ops.IndexedSlices( + values=constant_op.constant([1]), + indices=constant_op.constant([0]), + dense_shape=None) + validate(indexed_slice) + def testFunctionOnDevice(self): if not context.context().num_gpus(): self.skipTest('No GPUs found') @@ -504,6 +576,24 @@ class FunctionTest(test.TestCase): self.assertAllEqual(ret[0][2], 10) self.assertAllEqual(ret[1], 15) + def testVariableNamesRespectNameScopesWithDefun(self): + @function.defun + def create_variable(): + with ops.name_scope('foo'): + v = resource_variable_ops.ResourceVariable(0.0, name='bar') + self.assertEqual(v.name, 'foo/bar:0') + create_variable() + + def testVariableNamesRespectNameScopesWithDefunInGraph(self): + with context.graph_mode(): + @function.defun + def create_variable(): + with ops.name_scope('foo'): + v = resource_variable_ops.ResourceVariable([1.0, 2.0], name='bar') + self.assertEqual(v.name, 'foo/bar:0') + with ops.get_default_graph().as_default(): + create_variable() + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/eager/gen_op.bzl b/tensorflow/python/eager/gen_op.bzl deleted file mode 100644 index 8bc1d6c10a6..00000000000 --- a/tensorflow/python/eager/gen_op.bzl +++ /dev/null @@ -1,65 +0,0 @@ -"""For eager-mode Python.""" - -load("//tensorflow:tensorflow.bzl", - "clean_dep", - "tf_binary_additional_srcs", - "tf_copts", - "tf_cc_binary") - -def tfe_gen_op_wrapper_py(name, - out=None, - visibility=None, - deps=[], - generated_target_name=None, - # ApiDefs will be loaded in the order specified in this list. - api_def_srcs=[]): - """Generate an eager-mode Python op wrapper for an op library.""" - # Construct a cc_binary containing the specified ops. - tool_name = "gen_" + name + "_py_wrappers_cc" - if not deps: - deps = [str(Label("//tensorflow/core:" + name + "_op_lib"))] - tf_cc_binary( - name=tool_name, - linkopts=["-lm"], - copts=tf_copts(), - linkstatic=1, - deps=([ - clean_dep("//tensorflow/python/eager:python_eager_op_gen_main") - ] + deps), - visibility=[clean_dep("//visibility:public")],) - - # Invoke the previous cc_binary to generate a python file. - if not out: - out = "gen_" + name + ".py" - - if not api_def_srcs: - api_def_args_str = "," - else: - api_def_args = [] - for api_def_src in api_def_srcs: - # Add directory of the first ApiDef source to args. - # We are assuming all ApiDefs in a single api_def_src are in the - # same directory. - api_def_args.append( - "$$(dirname $$(echo $(locations " + api_def_src + - ") | cut -d\" \" -f1))") - api_def_args_str = ",".join(api_def_args) - - native.genrule( - name=name + "_pygenrule", - outs=[out], - srcs=api_def_srcs, - tools=[tool_name] + tf_binary_additional_srcs(), - cmd=("$(location " + tool_name + ") " + api_def_args_str + " > $@")) - - # Make a py_library out of the generated python file. - if not generated_target_name: - generated_target_name = name - native.py_library( - name=generated_target_name, - srcs=[out], - srcs_version="PY2AND3", - visibility=visibility, - deps=[ - clean_dep("//tensorflow/python/eager:framework_for_generated_wrappers"), - ],) diff --git a/tensorflow/python/eager/graph_callable.py b/tensorflow/python/eager/graph_callable.py index 5c13ea89081..62106bf0e28 100644 --- a/tensorflow/python/eager/graph_callable.py +++ b/tensorflow/python/eager/graph_callable.py @@ -252,21 +252,17 @@ def _graph_callable_internal(func, shape_and_dtypes): Callable graph object. 
""" container = tf_ops.get_default_graph()._container # pylint: disable=protected-access - container_prefix = tf_ops.get_default_graph()._container_prefix # pylint: disable=protected-access + graph_key = tf_ops.get_default_graph()._graph_key # pylint: disable=protected-access with context.graph_mode(): # This graph will store both the initialization and the call version of the # wrapped function. It will later be used by the backprop code to build the # backprop graph, if necessary. captures = {} tmp_graph = function.CapturingGraph(captures) - # Inherit the container from the original graph to create resources at user - # expected containers. Also inherits the container prefix, since this is - # used for error checking when isolating Eager execution (the container - # prefix at creation must match the container prefix when used, and - # variables returned from the graph callable will be used in the outside - # context). + # Inherit the graph key from the original graph to ensure optimizers don't + # misbehave. tmp_graph._container = container # pylint: disable=protected-access - tmp_graph._container_prefix = container_prefix # pylint: disable=protected-access + tmp_graph._graph_key = graph_key # pylint: disable=protected-access with tmp_graph.as_default(): # Placeholders for the non-variable inputs. func_inputs = _get_graph_callable_inputs(shape_and_dtypes) diff --git a/tensorflow/python/eager/python_eager_op_gen.cc b/tensorflow/python/eager/python_eager_op_gen.cc index 90a8779ff84..0f18f28c955 100644 --- a/tensorflow/python/eager/python_eager_op_gen.cc +++ b/tensorflow/python/eager/python_eager_op_gen.cc @@ -756,11 +756,21 @@ from tensorflow.python.util.tf_export import tf_export auto out = cleaned_ops.mutable_op(); out->Reserve(ops.op_size()); for (const auto& op_def : ops.op()) { - bool is_hidden = false; - for (const string& hidden : hidden_ops) { - if (op_def.name() == hidden) { - is_hidden = true; - break; + const auto* api_def = api_defs.GetApiDef(op_def.name()); + + if (api_def->visibility() == ApiDef::SKIP) { + continue; + } + + // An op is hidden if either its ApiDef visibility is HIDDEN + // or it is in the hidden_ops list. + bool is_hidden = api_def->visibility() == ApiDef::HIDDEN; + if (!is_hidden) { + for (const string& hidden : hidden_ops) { + if (op_def.name() == hidden) { + is_hidden = true; + break; + } } } @@ -777,7 +787,6 @@ from tensorflow.python.util.tf_export import tf_export continue; } - const auto* api_def = api_defs.GetApiDef(op_def.name()); strings::StrAppend(&result, GetEagerPythonOp(op_def, *api_def, function_name)); diff --git a/tensorflow/python/eager/python_eager_op_gen_main.cc b/tensorflow/python/eager/python_eager_op_gen_main.cc deleted file mode 100644 index 05351bd8b11..00000000000 --- a/tensorflow/python/eager/python_eager_op_gen_main.cc +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ -#include "tensorflow/python/eager/python_eager_op_gen.h" - -#include -#include -#include - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_def.pb.h" -#include "tensorflow/core/framework/op_gen_lib.h" -#include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/init_main.h" - -namespace tensorflow { -namespace { - -void PrintAllPythonOps(const std::vector& hidden_ops, - const std::vector& api_def_dirs) { - OpList ops; - OpRegistry::Global()->Export(false, &ops); - - ApiDefMap api_def_map(ops); - if (!api_def_dirs.empty()) { - Env* env = Env::Default(); - - for (const auto& api_def_dir : api_def_dirs) { - std::vector api_files; - TF_CHECK_OK(env->GetMatchingPaths(io::JoinPath(api_def_dir, "*.pbtxt"), - &api_files)); - TF_CHECK_OK(api_def_map.LoadFileList(env, api_files)); - } - api_def_map.UpdateDocs(); - } - - PrintEagerPythonOps(ops, api_def_map, hidden_ops, true /* require_shapes */); -} - -} // namespace -} // namespace tensorflow - -int main(int argc, char* argv[]) { - tensorflow::port::InitMain(argv[0], &argc, &argv); - - // Usage: - // python_eager_op_gen_main api_def_dir1,api_def_dir2,... - if (argc == 1) { - tensorflow::PrintAllPythonOps({}, {}); - } else if (argc == 2) { - const std::vector api_def_dirs = - tensorflow::str_util::Split(argv[1], ",", - tensorflow::str_util::SkipEmpty()); - tensorflow::PrintAllPythonOps({}, api_def_dirs); - } else { - return -1; - } - return 0; -} diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc index 647f03351d9..d927f3abedb 100644 --- a/tensorflow/python/eager/pywrap_tfe_src.cc +++ b/tensorflow/python/eager/pywrap_tfe_src.cc @@ -86,30 +86,6 @@ bool ParseBoolValue(const string& key, PyObject* py_value, TF_Status* status, return true; } -const char* ParseProtoValue(const string& key, const char* proto_name, - PyObject* py_value, size_t* size, - TF_Status* status) { - char* output = nullptr; - Py_ssize_t py_size; - if (PyBytes_Check(py_value) && - PyBytes_AsStringAndSize(py_value, &output, &py_size) >= 0) { - *size = static_cast(py_size); - return output; - } -#if PY_MAJOR_VERSION >= 3 - if (PyUnicode_Check(py_value) && - (output = PyUnicode_AsUTF8AndSize(py_value, &py_size)) != nullptr) { - *size = static_cast(py_size); - return output; - } -#endif - TF_SetStatus(status, TF_INVALID_ARGUMENT, - tensorflow::strings::StrCat("Expecting a string (serialized ", - proto_name, ") value for attr ", key) - .c_str()); - return nullptr; -} - bool SetOpAttrList(TFE_Op* op, const char* key, PyObject* py_list, TF_AttrType type, TF_Status* status) { if (!PySequence_Check(py_list)) { @@ -329,8 +305,9 @@ void SetOpAttrs(TFE_Context* ctx, TFE_Op* op, PyObject* attrs, int start_index, tensorflow::mutex exception_class_mutex(tensorflow::LINKER_INITIALIZED); PyObject* exception_class GUARDED_BY(exception_class_mutex) = nullptr; -static tensorflow::mutex _uid_mutex(tensorflow::LINKER_INITIALIZED); -static tensorflow::int64 _uid GUARDED_BY(_uid_mutex) = 0; +tensorflow::mutex _uid_mutex(tensorflow::LINKER_INITIALIZED); +tensorflow::int64 _uid GUARDED_BY(_uid_mutex) = 0; + } // namespace void TFE_Py_Execute(TFE_Context* ctx, const char* device_name, @@ -551,6 +528,34 @@ tensorflow::gtl::CompactPointerSet* GetTapeSet() { return tape_set; } +// A safe copy of the current tapeset. 
Does not get affected by other python +// threads changing the set of active tapes. +class SafeTapeSet { + public: + SafeTapeSet() : tape_set_(*GetTapeSet()) { + for (auto* tape : tape_set_) { + Py_INCREF(tape); + } + } + + ~SafeTapeSet() { + for (auto* tape : tape_set_) { + Py_DECREF(tape); + } + } + + tensorflow::gtl::CompactPointerSet::const_iterator begin() { + return tape_set_.begin(); + } + + tensorflow::gtl::CompactPointerSet::const_iterator end() { + return tape_set_.end(); + } + + private: + tensorflow::gtl::CompactPointerSet tape_set_; +}; + // xcode 7 doesn't define thread_local, so for compatibility we implement our // own. TODO(apassos) remove once we can deprecate xcode 7. #ifndef __APPLE__ @@ -741,10 +746,7 @@ void TFE_Py_TapeSetWatchVariable(PyObject* variable) { if (*ThreadTapeIsStopped()) { return; } - // Note: making a copy because watching a variable can trigger a change to the - // set of tapes by allowing python's garbage collector to run. - auto tape_set = *GetTapeSet(); - for (TFE_Py_Tape* tape : tape_set) { + for (TFE_Py_Tape* tape : SafeTapeSet()) { tape->tape->WatchVariable(variable); } } @@ -800,8 +802,7 @@ void TFE_Py_TapeSetRecordOperation(PyObject* op_type, PyObject* output_tensors, return; } - auto set = *GetTapeSet(); - for (TFE_Py_Tape* tape : set) { + for (TFE_Py_Tape* tape : SafeTapeSet()) { Py_INCREF(backward_function); tape->tape->RecordOperation( op_type_str, output_info, input_ids, backward_function, @@ -810,10 +811,7 @@ void TFE_Py_TapeSetRecordOperation(PyObject* op_type, PyObject* output_tensors, } void TFE_Py_TapeSetDeleteTrace(tensorflow::int64 tensor_id) { - // Note: making a copy because deleting the trace can trigger a change to the - // set of tapes by allowing python's garbage collector to run. - auto tape_set = *GetTapeSet(); - for (TFE_Py_Tape* tape : tape_set) { + for (TFE_Py_Tape* tape : SafeTapeSet()) { tape->tape->DeleteTrace(tensor_id); } } diff --git a/tensorflow/python/estimator/canned/baseline.py b/tensorflow/python/estimator/canned/baseline.py index 96e4ecd29fb..138152ac1c6 100644 --- a/tensorflow/python/estimator/canned/baseline.py +++ b/tensorflow/python/estimator/canned/baseline.py @@ -57,6 +57,7 @@ from tensorflow.python.ops import check_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import variable_scope +from tensorflow.python.ops.losses import losses from tensorflow.python.training import training_util # The default learning rate of 0.3 is a historical artifact of the initial @@ -220,7 +221,8 @@ class BaselineClassifier(estimator.Estimator): weight_column=None, label_vocabulary=None, optimizer='Ftrl', - config=None): + config=None, + loss_reduction=losses.Reduction.SUM): """Initializes a BaselineClassifier instance. Args: @@ -240,6 +242,8 @@ class BaselineClassifier(estimator.Estimator): optimizer to use for training. If not specified, will use `FtrlOptimizer` with a default learning rate of 0.3. config: `RunConfig` object to configure the runtime settings. + loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how + to reduce training loss over batch. Defaults to `SUM`. Returns: A `BaselineClassifier` estimator. 
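A short sketch of the new `loss_reduction` argument, written against the internal module path this file lives under; `SUM` remains the default, and any `tf.losses.Reduction` value other than `NONE` should be accepted (the specific value below is an illustrative choice, not mandated by the patch):

```python
from tensorflow.python.estimator.canned.baseline import BaselineClassifier
from tensorflow.python.ops.losses import losses

# Average the training loss over the batch instead of summing it.
classifier = BaselineClassifier(
    n_classes=3,
    loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
```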
@@ -249,11 +253,13 @@ class BaselineClassifier(estimator.Estimator): if n_classes == 2: head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss( # pylint: disable=protected-access weight_column=weight_column, - label_vocabulary=label_vocabulary) + label_vocabulary=label_vocabulary, + loss_reduction=loss_reduction) else: head = head_lib._multi_class_head_with_softmax_cross_entropy_loss( # pylint: disable=protected-access n_classes, weight_column=weight_column, - label_vocabulary=label_vocabulary) + label_vocabulary=label_vocabulary, + loss_reduction=loss_reduction) def _model_fn(features, labels, mode, config): return _baseline_model_fn( features=features, @@ -311,7 +317,8 @@ class BaselineRegressor(estimator.Estimator): label_dimension=1, weight_column=None, optimizer='Ftrl', - config=None): + config=None, + loss_reduction=losses.Reduction.SUM): """Initializes a BaselineRegressor instance. Args: @@ -328,13 +335,16 @@ class BaselineRegressor(estimator.Estimator): optimizer to use for training. If not specified, will use `FtrlOptimizer` with a default learning rate of 0.3. config: `RunConfig` object to configure the runtime settings. + loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how + to reduce training loss over batch. Defaults to `SUM`. Returns: A `BaselineRegressor` estimator. """ head = head_lib._regression_head_with_mean_squared_error_loss( # pylint: disable=protected-access label_dimension=label_dimension, - weight_column=weight_column) + weight_column=weight_column, + loss_reduction=loss_reduction) def _model_fn(features, labels, mode, config): return _baseline_model_fn( features=features, diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py index 96555b5e03c..78d74b63d3e 100644 --- a/tensorflow/python/estimator/estimator.py +++ b/tensorflow/python/estimator/estimator.py @@ -478,13 +478,16 @@ class Estimator(object): estimator_spec = self._call_model_fn( features, None, model_fn_lib.ModeKeys.PREDICT, self.config) predictions = self._extract_keys(estimator_spec.predictions, predict_keys) + all_hooks = list(input_hooks) + all_hooks.extend(hooks) + all_hooks.extend(list(estimator_spec.prediction_hooks or [])) with training.MonitoredSession( session_creator=training.ChiefSessionCreator( checkpoint_filename_with_path=checkpoint_path, master=self._config.master, scaffold=estimator_spec.scaffold, config=self._session_config), - hooks=input_hooks + hooks) as mon_sess: + hooks=all_hooks) as mon_sess: while not mon_sess.should_stop(): preds_evaluated = mon_sess.run(predictions) if not isinstance(predictions, dict): diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py index 833f3dcac3b..39a5b998ebd 100644 --- a/tensorflow/python/estimator/estimator_test.py +++ b/tensorflow/python/estimator/estimator_test.py @@ -1355,6 +1355,25 @@ class EstimatorPredictTest(test.TestCase): est.train(dummy_input_fn, steps=1) self.assertEqual(10., next(est.predict(dummy_input_fn))) + def test_predictionhooks_are_used(self): + hook = test.mock.MagicMock( + wraps=training.SessionRunHook(), spec=training.SessionRunHook) + + def _model_fn_hooks(features, labels, mode): + _, _ = features, labels + return model_fn_lib.EstimatorSpec( + mode=mode, + loss=constant_op.constant(0.), + train_op=state_ops.assign_add(training.get_global_step(), 1), + predictions=constant_op.constant([[10.]]), + prediction_hooks=[hook]) + + est = estimator.Estimator(model_fn=_model_fn_hooks) + 
est.train(dummy_input_fn, steps=1) + self.assertFalse(hook.begin.called) + next(est.predict(dummy_input_fn)) + self.assertTrue(hook.begin.called) + def test_warn_if_no_queue_runner(self): def _model_fn(features, labels, mode): diff --git a/tensorflow/python/estimator/export/export.py b/tensorflow/python/estimator/export/export.py index 51075731ddc..83251c79fc5 100644 --- a/tensorflow/python/estimator/export/export.py +++ b/tensorflow/python/estimator/export/export.py @@ -36,12 +36,14 @@ from tensorflow.python.platform import tf_logging as logging from tensorflow.python.saved_model import signature_constants from tensorflow.python.saved_model import signature_def_utils from tensorflow.python.util import compat +from tensorflow.python.util.tf_export import tf_export _SINGLE_FEATURE_DEFAULT_NAME = 'feature' _SINGLE_RECEIVER_DEFAULT_NAME = 'input' +@tf_export('estimator.export.ServingInputReceiver') class ServingInputReceiver(collections.namedtuple( 'ServingInputReceiver', ['features', 'receiver_tensors', 'receiver_tensors_alternatives'])): @@ -118,6 +120,7 @@ class ServingInputReceiver(collections.namedtuple( receiver_tensors_alternatives=receiver_tensors_alternatives) +@tf_export('estimator.export.build_parsing_serving_input_receiver_fn') def build_parsing_serving_input_receiver_fn(feature_spec, default_batch_size=None): """Build a serving_input_receiver_fn expecting fed tf.Examples. @@ -146,6 +149,7 @@ def build_parsing_serving_input_receiver_fn(feature_spec, return serving_input_receiver_fn +@tf_export('estimator.export.build_raw_serving_input_receiver_fn') def build_raw_serving_input_receiver_fn(features, default_batch_size=None): """Build a serving_input_receiver_fn expecting feature Tensors. diff --git a/tensorflow/python/estimator/export/export_output.py b/tensorflow/python/estimator/export/export_output.py index 863af6d41d9..87b964be371 100644 --- a/tensorflow/python/estimator/export/export_output.py +++ b/tensorflow/python/estimator/export/export_output.py @@ -26,8 +26,10 @@ import six from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.saved_model import signature_def_utils +from tensorflow.python.util.tf_export import tf_export +@tf_export('estimator.export.ExportOutput') class ExportOutput(object): """Represents an output of a model that can be served. @@ -50,6 +52,7 @@ class ExportOutput(object): pass +@tf_export('estimator.export.ClassificationOutput') class ClassificationOutput(ExportOutput): """Represents the output of a classification head. @@ -118,6 +121,7 @@ class ClassificationOutput(ExportOutput): examples, self.classes, self.scores) +@tf_export('estimator.export.RegressionOutput') class RegressionOutput(ExportOutput): """Represents the output of a regression head.""" @@ -153,6 +157,7 @@ class RegressionOutput(ExportOutput): _SINGLE_OUTPUT_DEFAULT_NAME = 'output' +@tf_export('estimator.export.PredictOutput') class PredictOutput(ExportOutput): """Represents the output of a generic prediction head. diff --git a/tensorflow/python/estimator/inputs/numpy_io.py b/tensorflow/python/estimator/inputs/numpy_io.py index c4c2e30e877..a6f47129100 100644 --- a/tensorflow/python/estimator/inputs/numpy_io.py +++ b/tensorflow/python/estimator/inputs/numpy_io.py @@ -24,6 +24,7 @@ import numpy as np from six import string_types from tensorflow.python.estimator.inputs.queues import feeding_functions +from tensorflow.python.util.tf_export import tf_export # Key name to pack the target into dict of `features`. 
See # `_get_unique_target_key` for details. @@ -86,6 +87,7 @@ def _validate_and_convert_features(x): return ordered_dict_data +@tf_export('estimator.inputs.numpy_input_fn') def numpy_input_fn(x, y=None, batch_size=128, diff --git a/tensorflow/python/estimator/inputs/pandas_io.py b/tensorflow/python/estimator/inputs/pandas_io.py index 90d6145377d..bd06843021f 100644 --- a/tensorflow/python/estimator/inputs/pandas_io.py +++ b/tensorflow/python/estimator/inputs/pandas_io.py @@ -21,6 +21,7 @@ from __future__ import print_function import numpy as np from tensorflow.python.estimator.inputs.queues import feeding_functions +from tensorflow.python.util.tf_export import tf_export try: # pylint: disable=g-import-not-at-top @@ -34,6 +35,7 @@ except ImportError: HAS_PANDAS = False +@tf_export('estimator.inputs.pandas_input_fn') def pandas_input_fn(x, y=None, batch_size=128, diff --git a/tensorflow/python/estimator/model_fn.py b/tensorflow/python/estimator/model_fn.py index da202408c36..b08f83fc569 100644 --- a/tensorflow/python/estimator/model_fn.py +++ b/tensorflow/python/estimator/model_fn.py @@ -56,7 +56,7 @@ class EstimatorSpec( collections.namedtuple('EstimatorSpec', [ 'mode', 'predictions', 'loss', 'train_op', 'eval_metric_ops', 'export_outputs', 'training_chief_hooks', 'training_hooks', 'scaffold', - 'evaluation_hooks' + 'evaluation_hooks', 'prediction_hooks' ])): """Ops and objects returned from a `model_fn` and passed to an `Estimator`. @@ -73,7 +73,8 @@ class EstimatorSpec( training_chief_hooks=None, training_hooks=None, scaffold=None, - evaluation_hooks=None): + evaluation_hooks=None, + prediction_hooks=None): """Creates a validated `EstimatorSpec` instance. Depending on the value of `mode`, different arguments are required. Namely @@ -154,6 +155,8 @@ class EstimatorSpec( initialization, saver, and more to be used in training. evaluation_hooks: Iterable of `tf.train.SessionRunHook` objects to run during evaluation. + prediction_hooks: Iterable of `tf.train.SessionRunHook` objects to + run during predictions. Returns: A validated `EstimatorSpec` object. 
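A minimal sketch of a `model_fn` using the new `prediction_hooks` field; the hook and the constant prediction are toy placeholders, and a real `model_fn` would also populate `loss`/`train_op` for the TRAIN and EVAL modes:

```python
from tensorflow.python.estimator import model_fn as model_fn_lib
from tensorflow.python.framework import constant_op
from tensorflow.python.training import session_run_hook

class AnnouncePredictionsHook(session_run_hook.SessionRunHook):
    """Toy hook: `begin` fires when `Estimator.predict` builds its session."""
    def begin(self):
        print('starting a prediction session')

def my_model_fn(features, labels, mode):
    del features, labels  # unused in this sketch
    return model_fn_lib.EstimatorSpec(
        mode=mode,
        predictions=constant_op.constant([[10.0]]),
        prediction_hooks=[AnnouncePredictionsHook()])
```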
@@ -282,7 +285,10 @@ class EstimatorSpec( training_chief_hooks = tuple(training_chief_hooks or []) training_hooks = tuple(training_hooks or []) evaluation_hooks = tuple(evaluation_hooks or []) - for hook in training_hooks + training_chief_hooks + evaluation_hooks: + prediction_hooks = tuple(prediction_hooks or []) + + for hook in (training_hooks + training_chief_hooks + evaluation_hooks + + prediction_hooks): if not isinstance(hook, session_run_hook.SessionRunHook): raise TypeError( 'All hooks must be SessionRunHook instances, given: {}'.format( @@ -305,7 +311,8 @@ class EstimatorSpec( training_chief_hooks=training_chief_hooks, training_hooks=training_hooks, scaffold=scaffold, - evaluation_hooks=evaluation_hooks) + evaluation_hooks=evaluation_hooks, + prediction_hooks=prediction_hooks) def _replace(self, **kwds): """Return a new EstimatorSpec replacing specified fields with new values.""" diff --git a/tensorflow/python/estimator/model_fn_test.py b/tensorflow/python/estimator/model_fn_test.py index d67c4b71616..b7eeeb437cb 100644 --- a/tensorflow/python/estimator/model_fn_test.py +++ b/tensorflow/python/estimator/model_fn_test.py @@ -72,7 +72,8 @@ class EstimatorSpecTrainTest(test.TestCase): training_chief_hooks=[_FakeHook()], training_hooks=[_FakeHook()], scaffold=monitored_session.Scaffold(), - evaluation_hooks=[_FakeHook()]) + evaluation_hooks=[_FakeHook()], + prediction_hooks=[_FakeHook()]) def testLossNumber(self): """Tests that error is raised when loss is a number (not Tensor).""" @@ -465,7 +466,17 @@ class EstimatorSpecInferTest(test.TestCase): training_chief_hooks=[_FakeHook()], training_hooks=[_FakeHook()], scaffold=monitored_session.Scaffold(), - evaluation_hooks=[_FakeHook()]) + evaluation_hooks=[_FakeHook()], + prediction_hooks=[_FakeHook()]) + + def testPredictionHookInvalid(self): + with ops.Graph().as_default(), self.test_session(): + with self.assertRaisesRegexp( + TypeError, 'All hooks must be SessionRunHook instances'): + model_fn.EstimatorSpec( + mode=model_fn.ModeKeys.PREDICT, + predictions=constant_op.constant(1.), + prediction_hooks=[_InvalidHook()]) def testPredictionsMissing(self): with ops.Graph().as_default(), self.test_session(): diff --git a/tensorflow/python/estimator/training.py b/tensorflow/python/estimator/training.py index 52fb1d39ae2..2e84c5014f6 100644 --- a/tensorflow/python/estimator/training.py +++ b/tensorflow/python/estimator/training.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Classes and functions related to train_and_evaluate.""" from __future__ import absolute_import @@ -37,7 +36,6 @@ from tensorflow.python.training import server_lib from tensorflow.python.training import session_run_hook from tensorflow.python.util import compat - _MAX_DELAY_SECS = 60 _DELAY_SECS_PER_WORKER = 5 _TF_CONFIG_ENV = 'TF_CONFIG' @@ -50,8 +48,7 @@ _TRAINER_JOBS = (run_config_lib.TaskType.CHIEF, run_config_lib.TaskType.MASTER, def _validate_input_fn(input_fn): """Validates the `input_fn`.""" if not callable(input_fn): - raise TypeError( - '`input_fn` must be callable, given: {}'.format(input_fn)) + raise TypeError('`input_fn` must be callable, given: {}'.format(input_fn)) def _validate_hooks(hooks): @@ -125,10 +122,7 @@ class TrainSpec( duration. Optional hooks run at various stages of training. 
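For context on the `train_and_evaluate` machinery being reformatted here, a hedged sketch of the spec objects involved; the lambda `input_fn`s are placeholders and the `estimator` in the commented-out call is assumed to exist:

```python
from tensorflow.python.estimator import training

# Placeholder input_fns; real ones would build an input pipeline.
train_spec = training.TrainSpec(
    input_fn=lambda: ({'x': [[1.0]]}, [[0.0]]), max_steps=1000)
eval_spec = training.EvalSpec(
    input_fn=lambda: ({'x': [[1.0]]}, [[0.0]]), throttle_secs=60)
# training.train_and_evaluate(estimator, train_spec, eval_spec)  # `estimator` assumed
```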
""" - def __new__(cls, - input_fn, - max_steps=None, - hooks=None): + def __new__(cls, input_fn, max_steps=None, hooks=None): """Creates a validated `TrainSpec` instance. Args: @@ -161,16 +155,13 @@ class TrainSpec( hooks = _validate_hooks(hooks) return super(TrainSpec, cls).__new__( - cls, - input_fn=input_fn, - max_steps=max_steps, - hooks=hooks) + cls, input_fn=input_fn, max_steps=max_steps, hooks=hooks) class EvalSpec( collections.namedtuple('EvalSpec', [ - 'input_fn', 'steps', 'name', 'hooks', 'exporters', - 'start_delay_secs', 'throttle_secs' + 'input_fn', 'steps', 'name', 'hooks', 'exporters', 'start_delay_secs', + 'throttle_secs' ])): """Configuration for the "eval" part for the `train_and_evaluate` call. @@ -417,8 +408,8 @@ def train_and_evaluate(estimator, train_spec, eval_spec): Raises: ValueError: if environment variable `TF_CONFIG` is incorrectly set. """ - executor = _TrainingExecutor(estimator=estimator, train_spec=train_spec, - eval_spec=eval_spec) + executor = _TrainingExecutor( + estimator=estimator, train_spec=train_spec, eval_spec=eval_spec) config = estimator.config if (config.task_type == run_config_lib.TaskType.EVALUATOR and @@ -561,9 +552,8 @@ class _TrainingExecutor(object): self._timer.update_last_triggered_step(global_step_value) self._evaluator.evaluate_and_export() else: - logging.info( - 'Skip the current checkpoint eval due to throttle secs ' - '({} secs).'.format(self._eval_throttle_secs)) + logging.info('Skip the current checkpoint eval due to throttle secs ' + '({} secs).'.format(self._eval_throttle_secs)) # Final export signal: For any eval result with global_step >= train # max_steps, the evaluator will send the final export signal. There is a @@ -576,8 +566,8 @@ class _TrainingExecutor(object): # # But here, throttle_secs will skip the next intermediate checkpoint and, # so, the double final export chance is very small. - evaluator = _TrainingExecutor._Evaluator( - self._estimator, self._eval_spec, self._train_spec.max_steps) + evaluator = _TrainingExecutor._Evaluator(self._estimator, self._eval_spec, + self._train_spec.max_steps) # When the underlying `Estimator` object saves a new checkpoint, we would # like this callback to be called so that evaluation and export can trigger. @@ -617,8 +607,7 @@ class _TrainingExecutor(object): raise ValueError('eval_spec.throttle_secs should be positive, given: {}.' 'It is used do determine how long each training ' 'iteration should go when train and evaluate ' - 'locally.'.format( - self._eval_spec.throttle_secs)) + 'locally.'.format(self._eval_spec.throttle_secs)) stop_hook = _StopAtSecsHook(self._eval_spec.throttle_secs) train_hooks = ( @@ -663,8 +652,9 @@ class _TrainingExecutor(object): if not config.master: jobs = config.cluster_spec.jobs - if (len(jobs) == 1 and len(config.cluster_spec.job_tasks(jobs[0])) == 1 - and config.task_type in _TRAINER_JOBS): + if (len(jobs) == 1 and + len(config.cluster_spec.job_tasks(jobs[0])) == 1 and + config.task_type in _TRAINER_JOBS): # For distributed training, config.master is empty if and only if it has # a single node in the cluster spec. In this case, we should not start # the server. 
@@ -679,9 +669,9 @@ class _TrainingExecutor(object): logging.info('Start Tensorflow server.') if config.session_config is None: - session_config=config_pb2.ConfigProto(log_device_placement=False) + session_config = config_pb2.ConfigProto(log_device_placement=False) else: - session_config=config_pb2.ConfigProto( + session_config = config_pb2.ConfigProto( log_device_placement=False, gpu_options=config.session_config.gpu_options) @@ -744,8 +734,7 @@ class _TrainingExecutor(object): global_step >= self._train_spec.max_steps): logging.info( 'Exiting evaluation, global_step=%s >= train max_steps=%s', - global_step, - self._train_spec.max_steps) + global_step, self._train_spec.max_steps) return latest_eval_result, should_early_stop = self._execute_evaluator_once( @@ -781,10 +770,9 @@ class _TrainingExecutor(object): # Throttle if necessary. elapsed_time = time.time() - start - difference = throttle_secs - elapsed_time + difference = throttle_secs - elapsed_time if difference > 0: - logging.info('Waiting %f secs before starting next eval run.', - difference) + logging.info('Waiting %f secs before starting next eval run.', difference) time.sleep(difference) return (eval_result, should_early_stop) @@ -929,8 +917,8 @@ class _EvalResult( if checkpoint_path: raise ValueError( 'checkpoint must be `None` if status is not {}; got status {}, ' - 'checkpoint_path {}'.format( - _EvalStatus.EVALUATED, status, checkpoint_path)) + 'checkpoint_path {}'.format(_EvalStatus.EVALUATED, status, + checkpoint_path)) return super(_EvalResult, cls).__new__(cls, status, metrics, checkpoint_path) diff --git a/tensorflow/python/estimator/warm_starting_util.py b/tensorflow/python/estimator/warm_starting_util.py index ad95c71234f..48110ef57fc 100644 --- a/tensorflow/python/estimator/warm_starting_util.py +++ b/tensorflow/python/estimator/warm_starting_util.py @@ -415,8 +415,8 @@ def _warm_start(warm_start_settings): a stronger check for variable configuration than relying on users to examine the logs. """ - logging.info("Warm-starting from: ", - warm_start_settings.ckpt_to_initialize_from) + logging.info("Warm-starting from: %s", + (warm_start_settings.ckpt_to_initialize_from,)) # We have to deal with partitioned variables, since get_collection flattens # out the list. grouped_variables = {} diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py index 7feb209cc49..5947d8f6e23 100644 --- a/tensorflow/python/feature_column/feature_column.py +++ b/tensorflow/python/feature_column/feature_column.py @@ -157,6 +157,7 @@ from tensorflow.python.platform import gfile from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import checkpoint_utils from tensorflow.python.util import nest +from tensorflow.python.util.tf_export import tf_export def _internal_input_layer(features, @@ -209,6 +210,7 @@ def _internal_input_layer(features, return array_ops.concat(output_tensors, 1) +@tf_export('feature_column.input_layer') def input_layer(features, feature_columns, weight_collections=None, @@ -329,6 +331,7 @@ class InputLayer(object): return self._input_layer_template.weights +@tf_export('feature_column.linear_model') def linear_model(features, feature_columns, units=1, @@ -498,6 +501,7 @@ def _transform_features(features, feature_columns): return outputs +@tf_export('feature_column.make_parse_example_spec') def make_parse_example_spec(feature_columns): """Creates parsing spec dictionary from input feature_columns. 
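The `tf_export` decorators added in this hunk and the next expose the feature-column constructors as a public API. A small sketch of how a few of them compose, written against the internal module path:

```python
from tensorflow.python.feature_column import feature_column as fc

price = fc.numeric_column('price')
color = fc.categorical_column_with_vocabulary_list(
    'color', vocabulary_list=['red', 'green', 'blue'])

# Parsing spec for tf.parse_example: 'price' becomes a FixedLenFeature and
# 'color' a VarLenFeature.
spec = fc.make_parse_example_spec([price, fc.indicator_column(color)])
```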
@@ -557,6 +561,7 @@ def make_parse_example_spec(feature_columns): return result +@tf_export('feature_column.embedding_column') def embedding_column( categorical_column, dimension, combiner='mean', initializer=None, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, @@ -807,6 +812,7 @@ def shared_embedding_columns( return result +@tf_export('feature_column.numeric_column') def numeric_column(key, shape=(1,), default_value=None, @@ -881,6 +887,7 @@ def numeric_column(key, normalizer_fn=normalizer_fn) +@tf_export('feature_column.bucketized_column') def bucketized_column(source_column, boundaries): """Represents discretized dense input. @@ -970,6 +977,7 @@ def _assert_string_or_int(dtype, prefix): '{} dtype must be string or integer. dtype: {}.'.format(prefix, dtype)) +@tf_export('feature_column.categorical_column_with_hash_bucket') def categorical_column_with_hash_bucket(key, hash_bucket_size, dtype=dtypes.string): @@ -1026,6 +1034,7 @@ def categorical_column_with_hash_bucket(key, return _HashedCategoricalColumn(key, hash_bucket_size, dtype) +@tf_export('feature_column.categorical_column_with_vocabulary_file') def categorical_column_with_vocabulary_file(key, vocabulary_file, vocabulary_size=None, @@ -1145,6 +1154,7 @@ def categorical_column_with_vocabulary_file(key, dtype=dtype) +@tf_export('feature_column.categorical_column_with_vocabulary_list') def categorical_column_with_vocabulary_list( key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0): """A `_CategoricalColumn` with in-memory vocabulary. @@ -1255,6 +1265,7 @@ def categorical_column_with_vocabulary_list( default_value=default_value, num_oov_buckets=num_oov_buckets) +@tf_export('feature_column.categorical_column_with_identity') def categorical_column_with_identity(key, num_buckets, default_value=None): """A `_CategoricalColumn` that returns identity values. @@ -1322,6 +1333,7 @@ def categorical_column_with_identity(key, num_buckets, default_value=None): key=key, num_buckets=num_buckets, default_value=default_value) +@tf_export('feature_column.indicator_column') def indicator_column(categorical_column): """Represents multi-hot representation of given categorical column. @@ -1350,6 +1362,7 @@ def indicator_column(categorical_column): return _IndicatorColumn(categorical_column) +@tf_export('feature_column.weighted_categorical_column') def weighted_categorical_column( categorical_column, weight_feature_key, dtype=dtypes.float32): """Applies weight values to a `_CategoricalColumn`. @@ -1424,6 +1437,7 @@ def weighted_categorical_column( dtype=dtype) +@tf_export('feature_column.crossed_column') def crossed_column(keys, hash_bucket_size, hash_key=None): """Returns a column for performing crosses of categorical features. diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py index 67ccf990d6a..c8251144836 100644 --- a/tensorflow/python/framework/dtypes.py +++ b/tensorflow/python/framework/dtypes.py @@ -12,20 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== - """Library of dtypes (Tensor element types).""" from __future__ import absolute_import from __future__ import division from __future__ import print_function - import numpy as np from tensorflow.core.framework import types_pb2 from tensorflow.python import pywrap_tensorflow from tensorflow.python.util.tf_export import tf_export - _np_bfloat16 = pywrap_tensorflow.TF_bfloat16_type() @@ -83,8 +80,8 @@ class DType(object): # TODO(mrry): Make the necessary changes (using __new__) to ensure # that calling this returns one of the interned values. type_enum = int(type_enum) - if (type_enum not in types_pb2.DataType.values() - or type_enum == types_pb2.DT_INVALID): + if (type_enum not in types_pb2.DataType.values() or + type_enum == types_pb2.DT_INVALID): raise TypeError( "type_enum is not a valid types_pb2.DataType: %s" % type_enum) self._type_enum = type_enum @@ -123,10 +120,10 @@ class DType(object): @property def is_numpy_compatible(self): - numpy_incompatible = [types_pb2.DT_VARIANT, - types_pb2.DT_VARIANT_REF, - types_pb2.DT_RESOURCE, - types_pb2.DT_RESOURCE_REF] + numpy_incompatible = [ + types_pb2.DT_VARIANT, types_pb2.DT_VARIANT_REF, types_pb2.DT_RESOURCE, + types_pb2.DT_RESOURCE_REF + ] return self._type_enum not in numpy_incompatible @property @@ -153,9 +150,9 @@ class DType(object): @property def is_floating(self): """Returns whether this is a (non-quantized, real) floating point type.""" - return ((self.is_numpy_compatible and np.issubdtype(self.as_numpy_dtype, - np.floating)) - or self.base_dtype == bfloat16) + return ((self.is_numpy_compatible and + np.issubdtype(self.as_numpy_dtype, np.floating)) or + self.base_dtype == bfloat16) @property def is_complex(self): @@ -190,8 +187,8 @@ class DType(object): TypeError: if this is a non-numeric, unordered, or quantized type. """ - if (self.is_quantized or self.base_dtype in - (bool, string, complex64, complex128)): + if (self.is_quantized or + self.base_dtype in (bool, string, complex64, complex128)): raise TypeError("Cannot find minimum value of %s." % self) # there is no simple way to get the min value of a dtype, we have to check @@ -214,8 +211,8 @@ class DType(object): TypeError: if this is a non-numeric, unordered, or quantized type. """ - if (self.is_quantized or self.base_dtype in - (bool, string, complex64, complex128)): + if (self.is_quantized or + self.base_dtype in (bool, string, complex64, complex128)): raise TypeError("Cannot find maximum value of %s." % self) # there is no simple way to get the max value of a dtype, we have to check @@ -266,8 +263,8 @@ class DType(object): this `DType`. 
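The `dtypes.py` edits here are formatting-only; the behaviour of the `DType` helpers is unchanged. A few illustrative checks (the numeric limits follow from the numpy dtypes these properties delegate to):

```python
from tensorflow.python.framework import dtypes

assert dtypes.float32.is_floating
assert dtypes.bfloat16.is_floating                # special-cased in is_floating
assert dtypes.int32.min == -2**31
assert dtypes.int32.max == 2**31 - 1
assert dtypes.float64.is_compatible_with(dtypes.float64_ref)
```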
""" other = as_dtype(other) - return self._type_enum in ( - other.as_datatype_enum, other.base_dtype.as_datatype_enum) + return self._type_enum in (other.as_datatype_enum, + other.base_dtype.as_datatype_enum) def __eq__(self, other): """Returns True iff this DType refers to the same type as `other`.""" @@ -307,19 +304,22 @@ class DType(object): return 1 return np.dtype(self.as_numpy_dtype).itemsize + # Define data type range of numpy dtype -dtype_range = {np.bool_: (False, True), - np.bool8: (False, True), - np.uint8: (0, 255), - np.uint16: (0, 65535), - np.int8: (-128, 127), - np.int16: (-32768, 32767), - np.int64: (-2**63, 2**63 - 1), - np.uint64: (0, 2**64 - 1), - np.int32: (-2**31, 2**31 - 1), - np.uint32: (0, 2**32 - 1), - np.float32: (-1, 1), - np.float64: (-1, 1)} +dtype_range = { + np.bool_: (False, True), + np.bool8: (False, True), + np.uint8: (0, 255), + np.uint16: (0, 65535), + np.int8: (-128, 127), + np.int16: (-32768, 32767), + np.int64: (-2**63, 2**63 - 1), + np.uint64: (0, 2**64 - 1), + np.int32: (-2**31, 2**31 - 1), + np.uint32: (0, 2**32 - 1), + np.float32: (-1, 1), + np.float64: (-1, 1) +} # Define standard wrappers for the types_pb2.DataType enum. resource = DType(types_pb2.DT_RESOURCE) @@ -396,7 +396,6 @@ quint16_ref = DType(types_pb2.DT_QUINT16_REF) qint32_ref = DType(types_pb2.DT_QINT32_REF) bfloat16_ref = DType(types_pb2.DT_BFLOAT16_REF) - # Maintain an intern table so that we don't have to create a large # number of small objects. _INTERN_TABLE = { @@ -448,7 +447,6 @@ _INTERN_TABLE = { types_pb2.DT_VARIANT_REF: variant_ref, } - # Standard mappings between types_pb2.DataType values and string names. _TYPE_TO_STRING = { types_pb2.DT_HALF: "float16", @@ -498,8 +496,10 @@ _TYPE_TO_STRING = { types_pb2.DT_RESOURCE_REF: "resource_ref", types_pb2.DT_VARIANT_REF: "variant_ref", } -_STRING_TO_TF = {value: _INTERN_TABLE[key] - for key, value in _TYPE_TO_STRING.items()} +_STRING_TO_TF = { + value: _INTERN_TABLE[key] + for key, value in _TYPE_TO_STRING.items() +} # Add non-canonical aliases. _STRING_TO_TF["half"] = float16 _STRING_TO_TF["half_ref"] = float16_ref @@ -508,7 +508,6 @@ _STRING_TO_TF["float_ref"] = float32_ref _STRING_TO_TF["double"] = float64 _STRING_TO_TF["double_ref"] = float64_ref - # Numpy representation for quantized dtypes. # # These are magic strings that are used in the swig wrapper to identify @@ -551,58 +550,100 @@ _NP_TO_TF = frozenset([ (_np_bfloat16, bfloat16), ]) _TF_TO_NP = { - types_pb2.DT_HALF: np.float16, - types_pb2.DT_FLOAT: np.float32, - types_pb2.DT_DOUBLE: np.float64, - types_pb2.DT_INT32: np.int32, - types_pb2.DT_UINT8: np.uint8, - types_pb2.DT_UINT16: np.uint16, - types_pb2.DT_UINT32: np.uint32, - types_pb2.DT_UINT64: np.uint64, - types_pb2.DT_INT16: np.int16, - types_pb2.DT_INT8: np.int8, + types_pb2.DT_HALF: + np.float16, + types_pb2.DT_FLOAT: + np.float32, + types_pb2.DT_DOUBLE: + np.float64, + types_pb2.DT_INT32: + np.int32, + types_pb2.DT_UINT8: + np.uint8, + types_pb2.DT_UINT16: + np.uint16, + types_pb2.DT_UINT32: + np.uint32, + types_pb2.DT_UINT64: + np.uint64, + types_pb2.DT_INT16: + np.int16, + types_pb2.DT_INT8: + np.int8, # NOTE(touts): For strings we use np.object as it supports variable length # strings. 
- types_pb2.DT_STRING: np.object, - types_pb2.DT_COMPLEX64: np.complex64, - types_pb2.DT_COMPLEX128: np.complex128, - types_pb2.DT_INT64: np.int64, - types_pb2.DT_BOOL: np.bool, - types_pb2.DT_QINT8: _np_qint8, - types_pb2.DT_QUINT8: _np_quint8, - types_pb2.DT_QINT16: _np_qint16, - types_pb2.DT_QUINT16: _np_quint16, - types_pb2.DT_QINT32: _np_qint32, - types_pb2.DT_BFLOAT16: _np_bfloat16, + types_pb2.DT_STRING: + np.object, + types_pb2.DT_COMPLEX64: + np.complex64, + types_pb2.DT_COMPLEX128: + np.complex128, + types_pb2.DT_INT64: + np.int64, + types_pb2.DT_BOOL: + np.bool, + types_pb2.DT_QINT8: + _np_qint8, + types_pb2.DT_QUINT8: + _np_quint8, + types_pb2.DT_QINT16: + _np_qint16, + types_pb2.DT_QUINT16: + _np_quint16, + types_pb2.DT_QINT32: + _np_qint32, + types_pb2.DT_BFLOAT16: + _np_bfloat16, # Ref types - types_pb2.DT_HALF_REF: np.float16, - types_pb2.DT_FLOAT_REF: np.float32, - types_pb2.DT_DOUBLE_REF: np.float64, - types_pb2.DT_INT32_REF: np.int32, - types_pb2.DT_UINT32_REF: np.uint32, - types_pb2.DT_UINT8_REF: np.uint8, - types_pb2.DT_UINT16_REF: np.uint16, - types_pb2.DT_INT16_REF: np.int16, - types_pb2.DT_INT8_REF: np.int8, - types_pb2.DT_STRING_REF: np.object, - types_pb2.DT_COMPLEX64_REF: np.complex64, - types_pb2.DT_COMPLEX128_REF: np.complex128, - types_pb2.DT_INT64_REF: np.int64, - types_pb2.DT_UINT64_REF: np.uint64, - types_pb2.DT_BOOL_REF: np.bool, - types_pb2.DT_QINT8_REF: _np_qint8, - types_pb2.DT_QUINT8_REF: _np_quint8, - types_pb2.DT_QINT16_REF: _np_qint16, - types_pb2.DT_QUINT16_REF: _np_quint16, - types_pb2.DT_QINT32_REF: _np_qint32, - types_pb2.DT_BFLOAT16_REF: _np_bfloat16, + types_pb2.DT_HALF_REF: + np.float16, + types_pb2.DT_FLOAT_REF: + np.float32, + types_pb2.DT_DOUBLE_REF: + np.float64, + types_pb2.DT_INT32_REF: + np.int32, + types_pb2.DT_UINT32_REF: + np.uint32, + types_pb2.DT_UINT8_REF: + np.uint8, + types_pb2.DT_UINT16_REF: + np.uint16, + types_pb2.DT_INT16_REF: + np.int16, + types_pb2.DT_INT8_REF: + np.int8, + types_pb2.DT_STRING_REF: + np.object, + types_pb2.DT_COMPLEX64_REF: + np.complex64, + types_pb2.DT_COMPLEX128_REF: + np.complex128, + types_pb2.DT_INT64_REF: + np.int64, + types_pb2.DT_UINT64_REF: + np.uint64, + types_pb2.DT_BOOL_REF: + np.bool, + types_pb2.DT_QINT8_REF: + _np_qint8, + types_pb2.DT_QUINT8_REF: + _np_quint8, + types_pb2.DT_QINT16_REF: + _np_qint16, + types_pb2.DT_QUINT16_REF: + _np_quint16, + types_pb2.DT_QINT32_REF: + _np_qint32, + types_pb2.DT_BFLOAT16_REF: + _np_bfloat16, } - -QUANTIZED_DTYPES = frozenset( - [qint8, quint8, qint16, quint16, qint32, qint8_ref, quint8_ref, qint16_ref, - quint16_ref, qint32_ref]) +QUANTIZED_DTYPES = frozenset([ + qint8, quint8, qint16, quint16, qint32, qint8_ref, quint8_ref, qint16_ref, + quint16_ref, qint32_ref +]) tf_export("QUANTIZED_DTYPES").export_constant(__name__, "QUANTIZED_DTYPES") @@ -613,7 +654,8 @@ def as_dtype(type_value): Args: type_value: A value that can be converted to a `tf.DType` object. This may currently be a `tf.DType` object, a - [`DataType` enum](https://www.tensorflow.org/code/tensorflow/core/framework/types.proto), + [`DataType` + enum](https://www.tensorflow.org/code/tensorflow/core/framework/types.proto), a string type name, or a `numpy.dtype`. Returns: @@ -650,5 +692,4 @@ def as_dtype(type_value): except TypeError as e: raise TypeError("Cannot convert {} to a dtype. {}".format(type_value, e)) - raise TypeError( - "Cannot convert value %r to a TensorFlow DType." % type_value) + raise TypeError("Cannot convert value %r to a TensorFlow DType." 
% type_value) diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py index a4ca3f9a89b..b35cee01112 100644 --- a/tensorflow/python/framework/function_test.py +++ b/tensorflow/python/framework/function_test.py @@ -19,8 +19,8 @@ from __future__ import division from __future__ import print_function import re -import time import sys +import time import numpy as np @@ -86,6 +86,21 @@ class FunctionTest(test.TestCase): with session.Session() as sess: self.assertAllEqual([18.0], sess.run(call)) + def testIdentityImplicitDeref(self): + + @function.Defun(dtypes.float32, func_name="MyIdentity") + def MyIdentityFunc(a): + return a + + with ops.Graph().as_default(): + var = variables.Variable([18.0]) + call = MyIdentityFunc(var._ref()) # pylint: disable=protected-access + self.assertEqual("MyIdentity", call.op.name) + for cfg in _OptimizerOptions(): + with session.Session(config=cfg) as sess: + sess.run(var.initializer) + self.assertAllEqual([18.0], sess.run(call)) + def testIdentityOutputName(self): @function.Defun( @@ -771,7 +786,7 @@ class FunctionTest(test.TestCase): # We added more randomness to function names in C API. # TODO(iga): Remove this if statement when we switch to C API. if ops._USE_C_API: # pylint: disable=protected-access - if sys.byteorder == 'big': + if sys.byteorder == "big": self.assertEqual("Foo_kEdkAG8SJvg", Foo.instantiate([dtypes.float32] * 3).name) else: diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py index 00fff8d040d..c26644362c8 100644 --- a/tensorflow/python/framework/importer.py +++ b/tensorflow/python/framework/importer.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """A utility function for importing TensorFlow graphs.""" from __future__ import absolute_import from __future__ import division @@ -43,8 +42,8 @@ from tensorflow.python.util.tf_export import tf_export # the logic here. def _GetNodeAttr(node_def, attr_name): if attr_name not in node_def.attr: - raise ValueError('Expected one attr with name %r in %s.' - % (attr_name, str(node_def))) + raise ValueError('Expected one attr with name %r in %s.' 
% (attr_name, + str(node_def))) return node_def.attr[attr_name] @@ -170,9 +169,8 @@ def _ProcessInputMapParam(input_map): if input_map is None: input_map = {} else: - if not (isinstance(input_map, dict) - and all(isinstance(k, compat.bytes_or_text_types) - for k in input_map.keys())): + if not (isinstance(input_map, dict) and all( + isinstance(k, compat.bytes_or_text_types) for k in input_map.keys())): raise TypeError('input_map must be a dictionary mapping strings to ' 'Tensor objects.') return input_map @@ -180,9 +178,10 @@ def _ProcessInputMapParam(input_map): def _ProcessReturnElementsParam(return_elements): """Type-checks and possibly canonicalizes `return_elements`.""" - if return_elements is None: return None - if not all(isinstance(x, compat.bytes_or_text_types) - for x in return_elements): + if return_elements is None: + return None + if not all( + isinstance(x, compat.bytes_or_text_types) for x in return_elements): raise TypeError('return_elements must be a list of strings.') return tuple(compat.as_str(x) for x in return_elements) @@ -262,14 +261,14 @@ def _PopulateTFImportGraphDefOptions(options, prefix, input_map, if input_src.startswith('^'): src_name = compat.as_bytes(input_src[1:]) dst_op = input_dst._as_tf_output().oper # pylint: disable=protected-access - c_api.TF_ImportGraphDefOptionsRemapControlDependency(options, src_name, - dst_op) + c_api.TF_ImportGraphDefOptionsRemapControlDependency( + options, src_name, dst_op) else: src_name, src_idx = _ParseTensorName(input_src) src_name = compat.as_str(src_name) dst_output = input_dst._as_tf_output() # pylint: disable=protected-access - c_api.TF_ImportGraphDefOptionsAddInputMapping(options, src_name, - src_idx, dst_output) + c_api.TF_ImportGraphDefOptionsAddInputMapping(options, src_name, src_idx, + dst_output) for name in return_elements or []: if ':' in name: op_name, index = _ParseTensorName(name) @@ -315,8 +314,8 @@ def _ProcessNewOps(graph): coloc_op = graph._get_operation_by_name_unsafe(coloc_op_name) # pylint: disable=protected-access except KeyError: raise ValueError('Specified colocation to an op that ' - 'does not exist during import: %s in %s' % ( - coloc_op_name, op.name)) + 'does not exist during import: %s in %s' % + (coloc_op_name, op.name)) if coloc_op.device: coloc_device = pydev.DeviceSpec.from_string(coloc_op.device) break @@ -373,10 +372,13 @@ def _GatherReturnElements(requested_return_elements, graph, results): @tf_export('import_graph_def') @deprecated_args(None, 'Please file an issue at ' 'https://github.com/tensorflow/tensorflow/issues if you depend' - ' on this feature.', - 'op_dict') -def import_graph_def(graph_def, input_map=None, return_elements=None, - name=None, op_dict=None, producer_op_list=None): + ' on this feature.', 'op_dict') +def import_graph_def(graph_def, + input_map=None, + return_elements=None, + name=None, + op_dict=None, + producer_op_list=None): """Imports the graph from `graph_def` into the current default `Graph`. 
This function provides a way to import a serialized TensorFlow @@ -480,11 +482,12 @@ def import_graph_def(graph_def, input_map=None, return_elements=None, c_api.TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper( results)) if missing_unused_input_keys: - missing_unused_input_keys = [compat.as_str(s) - for s in missing_unused_input_keys] + missing_unused_input_keys = [ + compat.as_str(s) for s in missing_unused_input_keys + ] raise ValueError( - 'Attempted to map inputs that were not found in graph_def: [%s]' - % ', '.join(missing_unused_input_keys)) + 'Attempted to map inputs that were not found in graph_def: [%s]' % + ', '.join(missing_unused_input_keys)) if return_elements is None: return None diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index b107670275c..ea589cc4d40 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -2103,6 +2103,10 @@ class Operation(object): logging.warning("Operation._control_inputs is private, use " "Operation.control_inputs instead. " "Operation._control_inputs will eventually be removed.") + # Copy value because it may be self._control_inputs_val (in particular if + # this is called from self._control_inputs += ...), and we don't want to + # clear value below. + value = copy.copy(value) self._remove_all_control_inputs() self._add_control_inputs(value) @@ -2756,15 +2760,12 @@ class Graph(object): self._handle_movers = {} # A map from tensor handle to its delete op. self._handle_deleters = {} - # Resource container. - if context.in_graph_mode(): - self._container_prefix = "" - else: - # In Eager mode, isolate resources (particularly ResourceVariables) in - # Graphs by default. This prevents unintended variable sharing. Graph mode - # gets this kind of isolation from Sessions. - self._container_prefix = "eager-execution-%d/" % (uid(),) - self._container = self._container_prefix + # Allow optimizers and other objects to pseudo-uniquely key graphs (this key + # will be shared when defining function graphs, for example, so optimizers + # being called inside function definitions behave as if they were seeing the + # actual outside graph). + self._graph_key = "grap-key-%d/" % (uid(),) + self._container = "" self._registered_ops = op_def_registry.get_registered_ops() # TODO(skyewm): fold as much of the above as possible into the C @@ -4225,7 +4226,7 @@ class Graph(object): """ original_container = self._container try: - self._container = self._container_prefix + container_name + self._container = container_name yield self._container finally: self._container = original_container @@ -5004,9 +5005,22 @@ def init_scope(): """ # pylint: enable=g-doc-return-or-yield,line-too-long + in_graph_mode = context.in_graph_mode() + # Retrieve the active name scope: entering an `init_scope` preserves + # the name scope of the current context. + if in_graph_mode: + default_graph = get_default_graph() + scope = default_graph.get_name_scope() + else: + scope = context.context().scope_name + if scope and scope[-1] != '/': + # Names that end with trailing slashes are treated by `name_scope` as + # absolute. 
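The new scope handling above is what the added `ops_test.py` cases below verify: entering `init_scope` lifts construction out of a function-building graph without discarding the active name scope. A condensed version of that check:

```python
from tensorflow.python.framework import ops

with ops.Graph().as_default():
    function_graph = ops.Graph()
    with function_graph.as_default():
        with ops.name_scope('inner'), ops.init_scope():
            # Previously the name scope was reset here; now it is preserved.
            assert ops.get_name_scope() == 'inner'
```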
+ scope = scope + '/' + outer_context = None - if context.in_graph_mode() and not _default_graph_stack.stack: - outer_context = get_default_graph().as_default + if in_graph_mode and not _default_graph_stack.stack: + outer_context = default_graph.as_default else: for stack_entry in reversed(context.context_stack.stack): if not stack_entry.is_building_function: @@ -5018,7 +5032,8 @@ def init_scope(): "eager context was previously active.") try: - with outer_context(), control_dependencies(None), tape.stop_recording(): + with outer_context(), name_scope(scope), control_dependencies( + None), tape.stop_recording(): yield finally: pass diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py index 78519f108ba..c5e177d5210 100644 --- a/tensorflow/python/framework/ops_test.py +++ b/tensorflow/python/framework/ops_test.py @@ -2072,10 +2072,34 @@ class InitScopeTest(test_util.TensorFlowTestCase): # pylint: disable=protected-access self.assertEqual(len(ops._default_graph_stack.stack), 0) with ops.init_scope(): - self.assertEqual(len(ops._default_graph_stack.stack), 1) + self.assertGreater(len(ops._default_graph_stack.stack), 0) self.assertEqual(len(ops._default_graph_stack.stack), 0) # pylint: enable=protected-access + def testPreservesNameScopeInGraphConstruction(self): + with ops.Graph().as_default(): + function_graph = ops.Graph() + with function_graph.as_default(): + with ops.name_scope("inner"), ops.init_scope(): + self.assertEqual(ops.get_name_scope(), "inner") + self.assertEqual(ops.get_name_scope(), "") + + def testPreservesNameScopeInEagerExecution(self): + with context.eager_mode(): + def foo(): + with ops.name_scope("inner"), ops.init_scope(): + if context.in_graph_mode(): + self.assertEqual(ops.get_name_scope(), "inner") + else: + # A trailing slash is always appended when eager execution is + # enabled. + self.assertEqual(context.context().scope_name, "inner/") + foo() + self.assertEqual(ops.get_name_scope(), "") + foo_compiled = eager_function.defun(foo) + foo_compiled() + self.assertEqual(ops.get_name_scope(), "") + @test_util.with_c_api class GraphTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc index 65810fa7094..85cba59be4d 100644 --- a/tensorflow/python/framework/python_op_gen.cc +++ b/tensorflow/python/framework/python_op_gen.cc @@ -476,9 +476,6 @@ GenPythonOp::GenPythonOp(const OpDef& op_def, const ApiDef& api_def, GenPythonOp::~GenPythonOp() {} string GenPythonOp::Code() { - if (api_def_.visibility() == ApiDef::SKIP) { - return ""; - } // This has all the input args followed by those attrs that don't have // defaults. std::vector params_no_default; @@ -805,11 +802,21 @@ from tensorflow.python.util.tf_export import tf_export auto out = cleaned_ops.mutable_op(); out->Reserve(ops.op_size()); for (const auto& op_def : ops.op()) { - bool is_hidden = false; - for (const string& hidden : hidden_ops) { - if (op_def.name() == hidden) { - is_hidden = true; - break; + const auto* api_def = api_defs.GetApiDef(op_def.name()); + + if (api_def->visibility() == ApiDef::SKIP) { + continue; + } + + // An op is hidden if either its ApiDef visibility is HIDDEN + // or it is in the hidden_ops list. 
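The same visibility rule is applied in both op generators (the eager one earlier in this patch and the graph one here). A Python paraphrase of the C++ decision, purely for illustration:

```python
def wrapper_disposition(op_name, api_def_visibility, hidden_ops):
    """Returns (generate, hidden) for an op.

    SKIP ops get no Python wrapper at all; HIDDEN ops, and ops listed in
    hidden_ops, get a leading-underscore wrapper.
    """
    if api_def_visibility == 'SKIP':
        return False, False
    hidden = api_def_visibility == 'HIDDEN' or op_name in hidden_ops
    return True, hidden
```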
+ bool is_hidden = api_def->visibility() == ApiDef::HIDDEN; + if (!is_hidden) { + for (const string& hidden : hidden_ops) { + if (op_def.name() == hidden) { + is_hidden = true; + break; + } } } @@ -826,7 +833,6 @@ from tensorflow.python.util.tf_export import tf_export continue; } - const auto* api_def = api_defs.GetApiDef(op_def.name()); strings::StrAppend(&result, GetPythonOp(op_def, *api_def, function_name)); if (!require_shapes) { diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py index d2b8e803057..0e5f696111a 100644 --- a/tensorflow/python/framework/tensor_util.py +++ b/tensorflow/python/framework/tensor_util.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Utilities to create TensorProtos.""" from __future__ import absolute_import from __future__ import division @@ -39,6 +38,7 @@ except ImportError: from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.util.tf_export import tf_export + # pylint: enable=g-import-not-at-top @@ -47,8 +47,8 @@ def ExtractBitsFromFloat16(x): def SlowAppendFloat16ArrayToTensorProto(tensor_proto, proto_values): - tensor_proto.half_val.extend([ - ExtractBitsFromFloat16(x) for x in proto_values]) + tensor_proto.half_val.extend( + [ExtractBitsFromFloat16(x) for x in proto_values]) def ExtractBitsFromBFloat16(x): @@ -57,31 +57,47 @@ def ExtractBitsFromBFloat16(x): def SlowAppendBFloat16ArrayToTensorProto(tensor_proto, proto_values): - tensor_proto.half_val.extend([ - ExtractBitsFromBFloat16(x) for x in proto_values]) + tensor_proto.half_val.extend( + [ExtractBitsFromBFloat16(x) for x in proto_values]) if _FAST_TENSOR_UTIL_AVAILABLE: _NP_TO_APPEND_FN = { - dtypes.bfloat16.as_numpy_dtype: SlowAppendBFloat16ArrayToTensorProto, + dtypes.bfloat16.as_numpy_dtype: + SlowAppendBFloat16ArrayToTensorProto, # TODO(sesse): We should have a # fast_tensor_util.AppendFloat16ArrayToTensorProto, # but it seems np.float16_t doesn't exist? 
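A numpy illustration (not the library code itself) of why half-precision values go through a bit-extraction step: `TensorProto.half_val` carries the raw uint16 bit pattern of each float16 value.

```python
import numpy as np

values = np.array([1.5, -2.0], dtype=np.float16)
bits = values.view(np.uint16)   # e.g. array([15872, 49152], dtype=uint16)
assert np.array_equal(bits.view(np.float16), values)
```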
- np.float16: SlowAppendFloat16ArrayToTensorProto, - np.float32: fast_tensor_util.AppendFloat32ArrayToTensorProto, - np.float64: fast_tensor_util.AppendFloat64ArrayToTensorProto, - np.int32: fast_tensor_util.AppendInt32ArrayToTensorProto, - np.int64: fast_tensor_util.AppendInt64ArrayToTensorProto, - np.uint8: fast_tensor_util.AppendUInt8ArrayToTensorProto, - np.uint16: fast_tensor_util.AppendUInt16ArrayToTensorProto, - np.uint32: fast_tensor_util.AppendUInt32ArrayToTensorProto, - np.uint64: fast_tensor_util.AppendUInt64ArrayToTensorProto, - np.int8: fast_tensor_util.AppendInt8ArrayToTensorProto, - np.int16: fast_tensor_util.AppendInt16ArrayToTensorProto, - np.complex64: fast_tensor_util.AppendComplex64ArrayToTensorProto, - np.complex128: fast_tensor_util.AppendComplex128ArrayToTensorProto, - np.object: fast_tensor_util.AppendObjectArrayToTensorProto, - np.bool: fast_tensor_util.AppendBoolArrayToTensorProto, + np.float16: + SlowAppendFloat16ArrayToTensorProto, + np.float32: + fast_tensor_util.AppendFloat32ArrayToTensorProto, + np.float64: + fast_tensor_util.AppendFloat64ArrayToTensorProto, + np.int32: + fast_tensor_util.AppendInt32ArrayToTensorProto, + np.int64: + fast_tensor_util.AppendInt64ArrayToTensorProto, + np.uint8: + fast_tensor_util.AppendUInt8ArrayToTensorProto, + np.uint16: + fast_tensor_util.AppendUInt16ArrayToTensorProto, + np.uint32: + fast_tensor_util.AppendUInt32ArrayToTensorProto, + np.uint64: + fast_tensor_util.AppendUInt64ArrayToTensorProto, + np.int8: + fast_tensor_util.AppendInt8ArrayToTensorProto, + np.int16: + fast_tensor_util.AppendInt16ArrayToTensorProto, + np.complex64: + fast_tensor_util.AppendComplex64ArrayToTensorProto, + np.complex128: + fast_tensor_util.AppendComplex128ArrayToTensorProto, + np.object: + fast_tensor_util.AppendObjectArrayToTensorProto, + np.bool: + fast_tensor_util.AppendBoolArrayToTensorProto, dtypes.qint8.as_numpy_dtype: fast_tensor_util.AppendInt8ArrayToTensorProto, dtypes.quint8.as_numpy_dtype: @@ -118,14 +134,12 @@ else: tensor_proto.uint64_val.extend([np.asscalar(x) for x in proto_values]) def SlowAppendComplex64ArrayToTensorProto(tensor_proto, proto_values): - tensor_proto.scomplex_val.extend([np.asscalar(v) - for x in proto_values - for v in [x.real, x.imag]]) + tensor_proto.scomplex_val.extend( + [np.asscalar(v) for x in proto_values for v in [x.real, x.imag]]) def SlowAppendComplex128ArrayToTensorProto(tensor_proto, proto_values): - tensor_proto.dcomplex_val.extend([np.asscalar(v) - for x in proto_values - for v in [x.real, x.imag]]) + tensor_proto.dcomplex_val.extend( + [np.asscalar(v) for x in proto_values for v in [x.real, x.imag]]) def SlowAppendObjectArrayToTensorProto(tensor_proto, proto_values): tensor_proto.string_val.extend([compat.as_bytes(x) for x in proto_values]) @@ -252,15 +266,16 @@ def _FilterTuple(v): return None if isinstance(v, list): if not any(isinstance(x, (list, tuple)) for x in v): - return _FirstNotNone([None if isinstance(x, (list, tuple)) else x for x in v]) + return _FirstNotNone( + [None if isinstance(x, (list, tuple)) else x for x in v]) return _FirstNotNone([_FilterTuple(x) for x in v]) def _FilterInt(v): if isinstance(v, (list, tuple)): return _FirstNotNone([_FilterInt(x) for x in v]) - return None if isinstance(v, (compat.integral_types, - tensor_shape.Dimension)) else _NotNone(v) + return None if isinstance( + v, (compat.integral_types, tensor_shape.Dimension)) else _NotNone(v) def _FilterFloat(v): @@ -380,8 +395,11 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False): if 
dtype: dtype = dtypes.as_dtype(dtype) - is_quantized = (dtype in [dtypes.qint8, dtypes.quint8, dtypes.qint16, - dtypes.quint16, dtypes.qint32]) + is_quantized = ( + dtype in [ + dtypes.qint8, dtypes.quint8, dtypes.qint16, dtypes.quint16, + dtypes.qint32 + ]) # We first convert value to a numpy array or scalar. if isinstance(values, (np.ndarray, np.generic)): @@ -419,9 +437,9 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False): if (list(nparray.shape) != _GetDenseDimensions(values) and not is_quantized): raise ValueError("""Argument must be a dense tensor: %s""" - """ - got shape %s, but wanted %s.""" % ( - values, list(nparray.shape), - _GetDenseDimensions(values))) + """ - got shape %s, but wanted %s.""" % + (values, list(nparray.shape), + _GetDenseDimensions(values))) # python/numpy default float type is float64. We prefer float32 instead. if (nparray.dtype == np.float64) and dtype is None: @@ -446,8 +464,8 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False): if dtype is not None and (not hasattr(dtype, "base_dtype") or dtype.base_dtype != numpy_dtype.base_dtype): - raise TypeError("Incompatible types: %s vs. %s. Value is %s" - % (dtype, nparray.dtype, values)) + raise TypeError("Incompatible types: %s vs. %s. Value is %s" % + (dtype, nparray.dtype, values)) # If shape is not given, get the shape from the numpy array. if shape is None: @@ -510,8 +528,8 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False): append_fn = GetNumpyAppendFn(proto_values.dtype) if append_fn is None: - raise TypeError("Element type not supported in TensorProto: %s" % - numpy_dtype.name) + raise TypeError( + "Element type not supported in TensorProto: %s" % numpy_dtype.name) append_fn(tensor_proto, proto_values) return tensor_proto @@ -553,19 +571,23 @@ def MakeNdarray(tensor): return tmp.reshape(shape) elif tensor_dtype == dtypes.float32: if len(tensor.float_val) == 1: - return np.repeat(np.array(tensor.float_val[0], dtype=dtype), - num_elements).reshape(shape) + return np.repeat( + np.array(tensor.float_val[0], dtype=dtype), + num_elements).reshape(shape) else: return np.fromiter(tensor.float_val, dtype=dtype).reshape(shape) elif tensor_dtype == dtypes.float64: if len(tensor.double_val) == 1: - return np.repeat(np.array(tensor.double_val[0], dtype=dtype), - num_elements).reshape(shape) + return np.repeat( + np.array(tensor.double_val[0], dtype=dtype), + num_elements).reshape(shape) else: return np.fromiter(tensor.double_val, dtype=dtype).reshape(shape) - elif tensor_dtype in [dtypes.int32, dtypes.uint8, dtypes.uint16, dtypes.int16, - dtypes.int8, dtypes.qint32, dtypes.quint8, dtypes.qint8, - dtypes.qint16, dtypes.quint16, dtypes.bfloat16]: + elif tensor_dtype in [ + dtypes.int32, dtypes.uint8, dtypes.uint16, dtypes.int16, dtypes.int8, + dtypes.qint32, dtypes.quint8, dtypes.qint8, dtypes.qint16, dtypes.quint16, + dtypes.bfloat16 + ]: if len(tensor.int_val) == 1: return np.repeat(np.array(tensor.int_val[0], dtype=dtype), num_elements).reshape(shape) @@ -573,35 +595,41 @@ def MakeNdarray(tensor): return np.fromiter(tensor.int_val, dtype=dtype).reshape(shape) elif tensor_dtype == dtypes.int64: if len(tensor.int64_val) == 1: - return np.repeat(np.array(tensor.int64_val[0], dtype=dtype), - num_elements).reshape(shape) + return np.repeat( + np.array(tensor.int64_val[0], dtype=dtype), + num_elements).reshape(shape) else: return np.fromiter(tensor.int64_val, dtype=dtype).reshape(shape) elif tensor_dtype == dtypes.string: if len(tensor.string_val) 
== 1: - return np.repeat(np.array(tensor.string_val[0], dtype=dtype), - num_elements).reshape(shape) + return np.repeat( + np.array(tensor.string_val[0], dtype=dtype), + num_elements).reshape(shape) else: - return np.array([x for x in tensor.string_val], - dtype=dtype).reshape(shape) + return np.array( + [x for x in tensor.string_val], dtype=dtype).reshape(shape) elif tensor_dtype == dtypes.complex64: it = iter(tensor.scomplex_val) if len(tensor.scomplex_val) == 2: - return np.repeat(np.array(complex(tensor.scomplex_val[0], - tensor.scomplex_val[1]), dtype=dtype), - num_elements).reshape(shape) + return np.repeat( + np.array( + complex(tensor.scomplex_val[0], tensor.scomplex_val[1]), + dtype=dtype), num_elements).reshape(shape) else: - return np.array([complex(x[0], x[1]) for x in zip(it, it)], - dtype=dtype).reshape(shape) + return np.array( + [complex(x[0], x[1]) for x in zip(it, it)], + dtype=dtype).reshape(shape) elif tensor_dtype == dtypes.complex128: it = iter(tensor.dcomplex_val) if len(tensor.dcomplex_val) == 2: - return np.repeat(np.array(complex(tensor.dcomplex_val[0], - tensor.dcomplex_val[1]), dtype=dtype), - num_elements).reshape(shape) + return np.repeat( + np.array( + complex(tensor.dcomplex_val[0], tensor.dcomplex_val[1]), + dtype=dtype), num_elements).reshape(shape) else: - return np.array([complex(x[0], x[1]) for x in zip(it, it)], - dtype=dtype).reshape(shape) + return np.array( + [complex(x[0], x[1]) for x in zip(it, it)], + dtype=dtype).reshape(shape) elif tensor_dtype == dtypes.bool: if len(tensor.bool_val) == 1: return np.repeat(np.array(tensor.bool_val[0], dtype=dtype), @@ -645,8 +673,9 @@ def _ConstantValue(tensor, partial): elif tensor.op.type == "Shape": input_shape = tensor.op.inputs[0].get_shape() if input_shape.is_fully_defined(): - return np.array([dim.value for dim in input_shape.dims], - dtype=tensor.dtype.as_numpy_dtype) + return np.array( + [dim.value for dim in input_shape.dims], + dtype=tensor.dtype.as_numpy_dtype) else: return None elif tensor.op.type == "Size": @@ -658,8 +687,10 @@ def _ConstantValue(tensor, partial): elif tensor.op.type == "Rank": input_shape = tensor.op.inputs[0].get_shape() if input_shape.ndims is not None: - return np.ndarray(shape=(), buffer=np.array([input_shape.ndims], dtype=np.int32), - dtype=np.int32) + return np.ndarray( + shape=(), + buffer=np.array([input_shape.ndims], dtype=np.int32), + dtype=np.int32) else: return None elif tensor.op.type == "Range": @@ -861,8 +892,8 @@ def constant_value_as_shape(tensor): # pylint: disable=invalid-name new_axis_mask = tensor.op.get_attr("new_axis_mask") shrink_axis_mask = tensor.op.get_attr("shrink_axis_mask") valid_attributes = (not ellipsis_mask and not new_axis_mask and - not shrink_axis_mask and - (not begin_mask or (begin_mask == 1)) and + not shrink_axis_mask and (not begin_mask or + (begin_mask == 1)) and (not end_mask or (end_mask == 1))) if valid_attributes: # additional inputs not supported prev = constant_value_as_shape(tensor.op.inputs[0]) @@ -878,8 +909,8 @@ def constant_value_as_shape(tensor): # pylint: disable=invalid-name ret = tensor_shape.unknown_shape(shape[0].value) value = constant_value(tensor) if value is not None: - ret = ret.merge_with(tensor_shape.TensorShape( - [d if d >= 0 else None for d in value])) + ret = ret.merge_with( + tensor_shape.TensorShape([d if d >= 0 else None for d in value])) return ret diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index 6a7e1d0c89c..15e8f5a38d6 100644 --- 
a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -123,11 +123,11 @@ def assert_equal_graph_def(actual, expected, checkpoint_v2=False): TypeError: If either argument is not a `GraphDef`. """ if not isinstance(actual, graph_pb2.GraphDef): - raise TypeError("Expected tf.GraphDef for actual, got %s" % - type(actual).__name__) + raise TypeError( + "Expected tf.GraphDef for actual, got %s" % type(actual).__name__) if not isinstance(expected, graph_pb2.GraphDef): - raise TypeError("Expected tf.GraphDef for expected, got %s" % - type(expected).__name__) + raise TypeError( + "Expected tf.GraphDef for expected, got %s" % type(expected).__name__) if checkpoint_v2: _strip_checkpoint_v2_randomized(actual) @@ -152,11 +152,10 @@ def assert_meta_graph_protos_equal(tester, a, b): a_proto = proto_type() b_proto = proto_type() # Number of entries in the collections is the same - tester.assertEqual(len(a_value.bytes_list.value), - len(b_value.bytes_list.value)) - for (a_value_item, b_value_item) in zip( - a_value.bytes_list.value, - b_value.bytes_list.value): + tester.assertEqual( + len(a_value.bytes_list.value), len(b_value.bytes_list.value)) + for (a_value_item, b_value_item) in zip(a_value.bytes_list.value, + b_value.bytes_list.value): a_proto.ParseFromString(a_value_item) b_proto.ParseFromString(b_value_item) tester.assertProtoEquals(a_proto, b_proto) @@ -220,10 +219,7 @@ def NHWCToNCHW(input_tensor): converted tensor or shape array """ # tensor dim -> new axis order - new_axes = { - 4: [0, 3, 1, 2], - 5: [0, 4, 1, 2, 3] - } + new_axes = {4: [0, 3, 1, 2], 5: [0, 4, 1, 2, 3]} if isinstance(input_tensor, ops.Tensor): ndims = input_tensor.shape.ndims return array_ops.transpose(input_tensor, new_axes[ndims]) @@ -250,8 +246,9 @@ def NHWCToNCHW_VECT_C(input_shape_or_tensor): """ permutations = {5: [0, 3, 1, 2, 4], 6: [0, 4, 1, 2, 3, 5]} is_tensor = isinstance(input_shape_or_tensor, ops.Tensor) - temp_shape = (input_shape_or_tensor.shape.as_list() - if is_tensor else input_shape_or_tensor) + temp_shape = ( + input_shape_or_tensor.shape.as_list() + if is_tensor else input_shape_or_tensor) if temp_shape[-1] % 4 != 0: raise ValueError( "Last dimension of input must be evenly divisible by 4 to convert to " @@ -283,8 +280,9 @@ def NCHW_VECT_CToNHWC(input_shape_or_tensor): """ permutations = {5: [0, 2, 3, 1, 4], 6: [0, 2, 3, 4, 1, 5]} is_tensor = isinstance(input_shape_or_tensor, ops.Tensor) - input_shape = (input_shape_or_tensor.shape.as_list() - if is_tensor else input_shape_or_tensor) + input_shape = ( + input_shape_or_tensor.shape.as_list() + if is_tensor else input_shape_or_tensor) if input_shape[-1] != 4: raise ValueError("Last dimension of NCHW_VECT_C must be 4.") permutation = permutations[len(input_shape)] @@ -307,10 +305,7 @@ def NCHWToNHWC(input_tensor): converted tensor or shape array """ # tensor dim -> new axis order - new_axes = { - 4: [0, 2, 3, 1], - 5: [0, 2, 3, 4, 1] - } + new_axes = {4: [0, 2, 3, 1], 5: [0, 2, 3, 4, 1]} if isinstance(input_tensor, ops.Tensor): ndims = input_tensor.shape.ndims return array_ops.transpose(input_tensor, new_axes[ndims]) @@ -325,10 +320,17 @@ def _use_c_api_wrapper(fn, use_c_api, *args, **kwargs): prev_value = ops._USE_C_API ops._USE_C_API = use_c_api try: - with ops.Graph().as_default(): - fn(*args, **kwargs) + # Reset the default graph so it has the C API enabled. 
We call + # reset_default_graph() instead of creating a new default Graph context to + # make this robust to tests that call reset_default_graph(), which requires + # that the current default graph isn't nested. + ops.reset_default_graph() + fn(*args, **kwargs) finally: ops._USE_C_API = prev_value + # Make sure default graph reflects prev_value in case next test doesn't call + # reset_default_graph(). + ops.reset_default_graph() # pylint: disable=protected-access @@ -345,7 +347,9 @@ def skip_if(condition): Returns: The wrapped function """ + def real_skip_if(fn): + def wrapper(*args, **kwargs): if callable(condition): skip = condition() @@ -353,7 +357,9 @@ def skip_if(condition): skip = condition if not skip: fn(*args, **kwargs) + return wrapper + return real_skip_if @@ -370,8 +376,10 @@ def disable_c_api(fn): Returns: The wrapped function """ + def wrapper(*args, **kwargs): _use_c_api_wrapper(fn, False, *args, **kwargs) + return wrapper @@ -388,8 +396,10 @@ def enable_c_api(fn): Returns: The wrapped function """ + def wrapper(*args, **kwargs): _use_c_api_wrapper(fn, True, *args, **kwargs) + return wrapper @@ -415,66 +425,6 @@ def with_c_api(cls): return cls -class IsolateTest(object): - """A context manager which isolates resources in its block. - - Provides an Eager-agnostic abstraction for preventing the sharing of - variables and other resources. - - In graph mode, resource handle ops are only executed in a particular Session, - isolating them from resources with the same name in other Graphs. In Eager, - separate Sessions do not exist, so resources (particularly ResourceVariables) - would be shared implicitly if a resource of the same name were created - anywhere in a Python process. Multiple handles to the same resource would - cause several issues, and so this type of sharing will raise an exception. - - Using resources with the same name in a single Python process may be useful - (especially for unit tests), so this context manager provides an abstraction - for isolating resources. Using a resource created in one Isolation environment - in another is an error. - - Example usage in Eager mode: - - ```python - import tensorflow as tf - # Import subject to change - from tensorflow.contrib.eager.python import tfe - - tfe.enable_eager_execution() - - for hyperparameter in [1, 2, 3]: - with tfe.IsolateTest(): - v = tfe.Variable(name="v", initial_value=hyperparameter) - # train model, test results ... - ``` - - IsolateTest is currently exposed through contrib.eager, but it creates a new - default Graph and provides equivalent safety in graph mode. - """ - - def __init__(self): - if context.in_eager_mode() and tape.could_possibly_record(): - raise ValueError("Cannot isolate Eager execution with an active tape.") - # In Eager, Graphs set a container which isolates resources, and maintain a - # VariableStore which caches ResourceVariable objects created through - # get_variable. So setting the default Graph has the side effect of - # isolating Eager resources. - with context.eager_mode(): - # Create the graph in Eager mode, as this provides stricter semantics - # (i.e. has a unique container prefix). This prevents implicit sharing - # when a Graph-mode graph is created and then Eager mode is enabled (an - # error through enable_eager_execution, but common with context managers - # in unit tests). 
- self._graph_as_default_context_manager = ops.Graph().as_default() - - def __enter__(self): - self._graph_as_default_context_manager.__enter__() - - def __exit__(self, type_arg, value_arg, traceback_arg): - return self._graph_as_default_context_manager.__exit__( - type_arg, value_arg, traceback_arg) - - def assert_no_new_tensors(f): """Decorator for asserting that no new Tensors persist after a test. @@ -505,12 +455,11 @@ def assert_no_new_tensors(f): return False tensors_before = set(id(obj) for obj in gc.get_objects() if _is_tensor(obj)) - outside_container_prefix = ops.get_default_graph()._container_prefix - with IsolateTest(): + outside_graph_key = ops.get_default_graph()._graph_key + with ops.Graph().as_default(): # Run the test in a new graph so that collections get cleared when it's - # done, but inherit the container prefix so that we can print the values - # of variables which get leaked when executing eagerly. - ops.get_default_graph()._container_prefix = outside_container_prefix + # done, but inherit the graph key so optimizers behave. + ops.get_default_graph()._graph_key = outside_graph_key f(self, **kwargs) # Make an effort to clear caches, which would otherwise look like leaked # Tensors. @@ -561,13 +510,17 @@ def assert_no_garbage_created(f): # not hold on to every object in other tests. gc.set_debug(previous_debug_flags) gc.enable() + return decorator -def run_in_graph_and_eager_modes( - __unused__=None, graph=None, config=None, - use_gpu=False, force_gpu=False, - reset_test=True, assert_no_eager_garbage=False): +def run_in_graph_and_eager_modes(__unused__=None, + graph=None, + config=None, + use_gpu=False, + force_gpu=False, + reset_test=True, + assert_no_eager_garbage=False): """Runs the test in both graph and eager modes. Args: @@ -596,6 +549,7 @@ def run_in_graph_and_eager_modes( def decorator(f): """Test method decorator.""" + def decorated(self, **kwargs): """Decorated the test method.""" with context.graph_mode(): @@ -627,10 +581,11 @@ def run_in_graph_and_eager_modes( assert_no_garbage_created(run_eager_mode)) with context.eager_mode(): - with IsolateTest(): + with ops.Graph().as_default(): run_eager_mode(self, **kwargs) return decorated + return decorator @@ -767,8 +722,10 @@ class TensorFlowTestCase(googletest.TestCase): self._AssertProtoEquals(expected_message, message) elif isinstance(expected_message_maybe_ascii, str): expected_message = type(message)() - text_format.Merge(expected_message_maybe_ascii, expected_message, - descriptor_pool=descriptor_pool.Default()) + text_format.Merge( + expected_message_maybe_ascii, + expected_message, + descriptor_pool=descriptor_pool.Default()) self._AssertProtoEquals(expected_message, message) else: assert False, ("Can't compare protos of type %s and %s" % @@ -852,7 +809,8 @@ class TensorFlowTestCase(googletest.TestCase): trigger the creation of a new session. Use the `use_gpu` and `force_gpu` options to control where ops are run. If - `force_gpu` is True, all ops are pinned to `/device:GPU:0`. Otherwise, if `use_gpu` + `force_gpu` is True, all ops are pinned to `/device:GPU:0`. Otherwise, if + `use_gpu` is True, TensorFlow tries to run as many ops on the GPU as possible. If both `force_gpu and `use_gpu` are False, all ops are pinned to the CPU. 
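# The hunks above rework `run_in_graph_and_eager_modes` and remove the
# `IsolateTest` context manager in favor of a plain `ops.Graph().as_default()`
# for the eager pass. A minimal usage sketch of the decorator follows; it is
# an editorial illustration and not part of this patch. The decorator,
# `TensorFlowTestCase`, `self.evaluate`, and `constant_op` all appear in this
# diff; the test class name and tensor values below are made up.
#
#   from tensorflow.python.framework import constant_op
#   from tensorflow.python.framework import test_util
#
#   class AddTest(test_util.TensorFlowTestCase):
#
#     @test_util.run_in_graph_and_eager_modes()
#     def test_add(self):
#       # The decorated body runs once under graph mode and once under eager
#       # mode; self.evaluate() returns numpy values in both cases.
#       x = constant_op.constant([1., 2.])
#       y = constant_op.constant([3., 4.])
#       self.assertAllClose([4., 6.], self.evaluate(x + y))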
@@ -1051,6 +1009,7 @@ class TensorFlowTestCase(googletest.TestCase): self._threads.append(ret) return ret + # pylint: enable=invalid-name def assertNear(self, f1, f2, err, msg=None): @@ -1118,7 +1077,8 @@ class TensorFlowTestCase(googletest.TestCase): # the absolute difference between a and b. Here, we want to # print out which elements violate such conditions. cond = np.logical_or( - np.abs(a - b) > atol + rtol * np.abs(b), np.isnan(a) != np.isnan(b)) + np.abs(a - b) > atol + rtol * np.abs(b), + np.isnan(a) != np.isnan(b)) if a.ndim: x = a[np.where(cond)] y = b[np.where(cond)] @@ -1380,8 +1340,11 @@ class TensorFlowTestCase(googletest.TestCase): @tf_export("test.create_local_cluster") -def create_local_cluster(num_workers, num_ps, protocol="grpc", - worker_config=None, ps_config=None): +def create_local_cluster(num_workers, + num_ps, + protocol="grpc", + worker_config=None, + ps_config=None): """Create and start local servers and return the associated `Server` objects. Example: @@ -1431,15 +1394,21 @@ def create_local_cluster(num_workers, num_ps, protocol="grpc", workers = [ server_lib.Server( - cs, job_name="worker", protocol=protocol, task_index=ix, - config=worker_config, start=True) - for ix in range(num_workers) + cs, + job_name="worker", + protocol=protocol, + task_index=ix, + config=worker_config, + start=True) for ix in range(num_workers) ] ps_servers = [ server_lib.Server( - cs, job_name="ps", protocol=protocol, task_index=ix, - config=ps_config, start=True) - for ix in range(num_ps) + cs, + job_name="ps", + protocol=protocol, + task_index=ix, + config=ps_config, + start=True) for ix in range(num_ps) ] return workers, ps_servers diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py index 3594d125bf6..a717eb39513 100644 --- a/tensorflow/python/framework/test_util_test.py +++ b/tensorflow/python/framework/test_util_test.py @@ -29,7 +29,6 @@ from google.protobuf import text_format from tensorflow.core.framework import graph_pb2 from tensorflow.core.protobuf import meta_graph_pb2 -from tensorflow.python.client import session from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import errors @@ -39,7 +38,6 @@ from tensorflow.python.framework import test_util from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import resource_variable_ops -from tensorflow.python.ops import variables from tensorflow.python.platform import googletest @@ -443,71 +441,5 @@ class GarbageCollectionTest(test_util.TensorFlowTestCase): LeakedTensorTest().test_has_no_leak() -@test_util.with_c_api -class IsolationTest(test_util.TensorFlowTestCase): - - @test_util.run_in_graph_and_eager_modes() - def test_variable_reuse_exception(self): - with test_util.IsolateTest(), session.Session(): - first_container_variable = resource_variable_ops.ResourceVariable( - name="first_container_variable", - initial_value=1) - if context.in_graph_mode(): - self.evaluate([variables.global_variables_initializer()]) - with test_util.IsolateTest(): - if context.in_graph_mode(): - with self.assertRaises(RuntimeError): - self.evaluate(first_container_variable.read_value()) - else: - with self.assertRaises(ValueError): - first_container_variable.read_value() - - @test_util.run_in_graph_and_eager_modes() - def test_variable_reuse_exception_nested(self): - with test_util.IsolateTest(), session.Session(): - first_container_variable = 
resource_variable_ops.ResourceVariable( - name="first_container_variable", - initial_value=1) - if context.in_graph_mode(): - self.evaluate([variables.global_variables_initializer()]) - with test_util.IsolateTest(), session.Session(): - if context.in_graph_mode(): - with self.assertRaises(RuntimeError): - self.evaluate(first_container_variable.read_value()) - else: - with self.assertRaises(ValueError): - first_container_variable.read_value() - - @test_util.run_in_graph_and_eager_modes() - def test_no_sharing(self): - with test_util.IsolateTest(), session.Session(): - first_container_variable = resource_variable_ops.ResourceVariable( - name="same_name", - initial_value=1) - if context.in_graph_mode(): - self.evaluate([variables.global_variables_initializer()]) - with test_util.IsolateTest(), session.Session(): - second_container_variable = resource_variable_ops.ResourceVariable( - name="same_name", - initial_value=2) - if context.in_graph_mode(): - self.evaluate([variables.global_variables_initializer()]) - self.assertEqual( - 2, self.evaluate(second_container_variable.read_value())) - self.assertEqual(1, self.evaluate(first_container_variable.read_value())) - - def test_graph_mode_isolation(self): - with context.graph_mode(): - # Even if we've (accidentally) called IsolateTest in Graph mode, it should - # provide Eager isolation. - with test_util.IsolateTest(): - with context.eager_mode(): - first_container_variable = resource_variable_ops.ResourceVariable( - name="first_container_variable", - initial_value=1) - with context.eager_mode(): - with self.assertRaises(ValueError): - first_container_variable.read_value() - if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py index 578f86ca5a0..5bc9e4b8030 100644 --- a/tensorflow/python/grappler/layout_optimizer_test.py +++ b/tensorflow/python/grappler/layout_optimizer_test.py @@ -157,6 +157,7 @@ def _get_config(layout_optimizer=True): graph_options = config_pb2.GraphOptions( rewrite_options=rewrite_options, build_cost_model=1) config = config_pb2.ConfigProto(graph_options=graph_options) + config.graph_options.optimizer_options.opt_level = -1 return config @@ -179,6 +180,8 @@ def _get_cluster(): named_device = device_properties_pb2.NamedDevice() named_device.name = '/GPU:0' named_device.properties.type = 'GPU' + named_device.properties.num_cores = 24 + named_device.properties.frequency = 1000 named_device.properties.environment['architecture'] = '4' cluster = gcluster.Cluster(devices=[named_device]) return cluster @@ -1169,7 +1172,7 @@ class LayoutOptimizerTest(test.TestCase): num_transposes += 1 nodes.append(node.name) - expected_num_transposes = 2 + expected_num_transposes = 3 self.assertEqual(expected_num_transposes, num_transposes) self._assert_trans_nhwc_to_nchw('map/while/Conv2D-0', nodes) self._assert_trans_nchw_to_nhwc('map/while/Add-0-2', nodes) diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD index 61257557751..fdac22bb53c 100755 --- a/tensorflow/python/keras/BUILD +++ b/tensorflow/python/keras/BUILD @@ -39,6 +39,7 @@ py_library( "_impl/keras/engine/__init__.py", "_impl/keras/engine/topology.py", "_impl/keras/engine/training.py", + "_impl/keras/engine/training_eager.py", "_impl/keras/estimator.py", "_impl/keras/initializers.py", "_impl/keras/layers/__init__.py", @@ -481,6 +482,7 @@ py_test( size = "small", srcs = ["_impl/keras/layers/normalization_test.py"], srcs_version = "PY2AND3", + tags = 
["notsan"], deps = [ ":keras", "//tensorflow/python:client_testlib", @@ -719,6 +721,19 @@ py_test( ], ) +py_test( + name = "training_eager_test", + size = "medium", + srcs = ["_impl/keras/engine/training_eager_test.py"], + srcs_version = "PY2AND3", + tags = ["notsan"], + deps = [ + ":keras", + "//tensorflow/python:client_testlib", + "//third_party/py/numpy", + ], +) + py_test( name = "topology_test", size = "small", diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py index 460c0dc5f39..098ea063f95 100644 --- a/tensorflow/python/keras/_impl/keras/backend.py +++ b/tensorflow/python/keras/_impl/keras/backend.py @@ -29,6 +29,7 @@ import numpy as np from tensorflow.core.protobuf import config_pb2 from tensorflow.python.client import session as session_module +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes as dtypes_module from tensorflow.python.framework import ops @@ -326,7 +327,15 @@ def learning_phase(): Returns: Learning phase (scalar integer tensor or Python integer). + + Raises: + ValueError: If called when Eager execution is enabled. """ + if context.in_eager_mode(): + if 'eager' not in _GRAPH_LEARNING_PHASES: + raise ValueError('No learning phase set in Eager mode.') + return _GRAPH_LEARNING_PHASES['eager'] + graph = ops.get_default_graph() if graph not in _GRAPH_LEARNING_PHASES: phase = array_ops.placeholder_with_default( @@ -347,7 +356,10 @@ def set_learning_phase(value): global _GRAPH_LEARNING_PHASES # pylint: disable=global-variable-not-assigned if value not in {0, 1}: raise ValueError('Expected learning phase to be ' '0 or 1.') - _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = value + if context.in_eager_mode(): + _GRAPH_LEARNING_PHASES['eager'] = value + else: + _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = value def get_session(): diff --git a/tensorflow/python/keras/_impl/keras/engine/topology.py b/tensorflow/python/keras/_impl/keras/engine/topology.py index 64aa868f382..8354a2b8fd7 100644 --- a/tensorflow/python/keras/_impl/keras/engine/topology.py +++ b/tensorflow/python/keras/_impl/keras/engine/topology.py @@ -708,8 +708,10 @@ class Network(tf_network.GraphNetwork, Layer): self.input_names.append(layer.name) if layer.is_placeholder: self._feed_input_names.append(layer.name) - self._feed_inputs.append(layer.input) self._feed_input_shapes.append(K.int_shape(self.inputs[i])) + # layer.input gives an error in eager mode + if context.in_graph_mode(): + self._feed_inputs.append(layer.input) for layer in self._output_layers: self.output_names.append(layer.name) diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py index 699ae2edf0d..43d95b1f194 100644 --- a/tensorflow/python/keras/_impl/keras/engine/training.py +++ b/tensorflow/python/keras/_impl/keras/engine/training.py @@ -22,17 +22,21 @@ import copy import numpy as np +from tensorflow.python.eager import context +from tensorflow.python.framework import ops from tensorflow.python.keras._impl.keras import backend as K from tensorflow.python.keras._impl.keras import callbacks as cbks from tensorflow.python.keras._impl.keras import losses from tensorflow.python.keras._impl.keras import metrics as metrics_module from tensorflow.python.keras._impl.keras import optimizers +from tensorflow.python.keras._impl.keras.engine import training_eager from tensorflow.python.keras._impl.keras.engine.topology import 
Network from tensorflow.python.keras._impl.keras.utils.data_utils import GeneratorEnqueuer from tensorflow.python.keras._impl.keras.utils.data_utils import OrderedEnqueuer from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.training import optimizer as tf_optimizer_module try: from scipy.sparse import issparse # pylint: disable=g-import-not-at-top @@ -82,21 +86,24 @@ def _standardize_input_data(data, if data[x].__class__.__name__ == 'DataFrame' else data[x] for x in names ] - data = [np.expand_dims(x, 1) if x.ndim == 1 else x for x in data] except KeyError as e: raise ValueError('No data provided for "' + e.args[0] + '". Need data ' 'for each key in: ' + str(names)) elif isinstance(data, list): - data = [ - x.values if x.__class__.__name__ == 'DataFrame' else x for x in data - ] - data = [ - np.expand_dims(x, 1) if x is not None and x.ndim == 1 else x - for x in data - ] + if isinstance(data[0], list): + data = [np.asarray(d) for d in data] + elif len(names) == 1 and isinstance(data[0], (float, int)): + data = [np.asarray(data)] + else: + data = [ + x.values if x.__class__.__name__ == 'DataFrame' else x for x in data + ] else: data = data.values if data.__class__.__name__ == 'DataFrame' else data - data = [np.expand_dims(data, 1)] if data.ndim == 1 else [data] + data = [data] + data = [ + np.expand_dims(x, 1) if x is not None and x.ndim == 1 else x for x in data + ] if len(data) != len(names): if data and hasattr(data[0], 'shape'): @@ -618,9 +625,15 @@ class Model(Network): `optimizer`, `loss`, `metrics` or `sample_weight_mode`. """ loss = loss or {} + if context.in_eager_mode() and not isinstance( + optimizer, tf_optimizer_module.Optimizer): + raise ValueError('Only TF native optimizers are supported in Eager mode.') + self.optimizer = optimizers.get(optimizer) self.loss = loss self.loss_weights = loss_weights + if context.in_eager_mode() and sample_weight_mode is not None: + raise ValueError('sample_weight_mode is not supported in Eager mode.') self.sample_weight_mode = sample_weight_mode # Prepare loss functions. @@ -651,6 +664,7 @@ class Model(Network): loss_function = losses.get(loss) loss_functions = [loss_function for _ in range(len(self.outputs))] self.loss_functions = loss_functions + weighted_losses = [_weighted_masked_objective(fn) for fn in loss_functions] skip_target_indices = [] skip_target_weighing_indices = [] @@ -664,11 +678,12 @@ class Model(Network): skip_target_weighing_indices.append(i) # Prepare output masks. - masks = self.compute_mask(self.inputs, mask=None) - if masks is None: - masks = [None for _ in self.outputs] - if not isinstance(masks, list): - masks = [masks] + if context.in_graph_mode(): + masks = self.compute_mask(self.inputs, mask=None) + if masks is None: + masks = [None for _ in self.outputs] + if not isinstance(masks, list): + masks = [masks] # Prepare loss weights. 
if loss_weights is None: @@ -694,6 +709,32 @@ class Model(Network): else: raise TypeError('Could not interpret loss_weights argument: ' + str(loss_weights) + ' - expected a list of dicts.') + self.loss_weights_list = loss_weights_list + + # initialization for Eager mode execution + if context.in_eager_mode(): + if target_tensors is not None: + raise ValueError('target_tensors are not currently supported in Eager' + 'mode.') + self.total_loss = None + self.metrics = metrics + self.weighted_metrics = weighted_metrics + self.metrics_tensors = [] + self.metrics_names = ['loss'] + for i in range(len(self.outputs)): + if len(self.outputs) > 1: + self.metrics_names.append(self.output_names[i] + '_loss') + self.nested_metrics = _collect_metrics(metrics, self.output_names) + self._feed_sample_weight_modes = [] + for i in range(len(self.outputs)): + self._feed_sample_weight_modes.append(None) + self.sample_weights = [] + self.targets = [] + self._collected_trainable_weights = self.trainable_weights + for i in range(len(self.outputs)): + self._feed_output_names.append(self.output_names[i]) + + return # Prepare targets of model. self.targets = [] @@ -720,6 +761,7 @@ class Model(Network): else: raise TypeError('Expected `target_tensors` to be ' 'a list or dict, but got:', target_tensors) + for i in range(len(self.outputs)): if i in skip_target_indices: self.targets.append(None) @@ -769,7 +811,7 @@ class Model(Network): weight = K.placeholder(ndim=2, name=name + '_sample_weights') sample_weight_modes.append('temporal') else: - weight = K.placeholder(ndim=1, name=name + '_sample_weights') + weight = K.placeholder(ndim=1, name=name + 'sample_weights') sample_weight_modes.append(None) sample_weights.append(weight) elif isinstance(sample_weight_mode, list): @@ -929,7 +971,7 @@ class Model(Network): self._feed_sample_weights = [] for i in range(len(self.sample_weights)): if i not in skip_target_weighing_indices: - self._feed_sample_weights.append(sample_weights[i]) + self._feed_sample_weights.append(self.sample_weights[i]) # Functions for train, test and predict will # be compiled lazily when required. @@ -978,6 +1020,7 @@ class Model(Network): with K.name_scope(self.optimizer.__class__.__name__): training_updates = self.optimizer.get_updates( params=self._collected_trainable_weights, loss=self.total_loss) + updates = self.updates + training_updates # Gets loss and metrics. Updates weights at each call. self.train_function = K.function( @@ -1156,6 +1199,7 @@ class Model(Network): callback_model = self callbacks.set_model(callback_model) + callbacks.set_params({ 'batch_size': batch_size, 'epochs': epochs, @@ -1216,6 +1260,7 @@ class Model(Network): np.random.shuffle(index_array) batches = _make_batches(num_train_samples, batch_size) + for batch_index, (batch_start, batch_end) in enumerate(batches): batch_ids = index_array[batch_start:batch_end] try: @@ -1410,6 +1455,7 @@ class Model(Network): ins_batch[i] = ins_batch[i].toarray() batch_outs = f(ins_batch) + if isinstance(batch_outs, list): if batch_index == 0: for batch_out in enumerate(batch_outs): @@ -1420,7 +1466,6 @@ class Model(Network): if batch_index == 0: outs.append(0.) outs[0] += batch_outs * len(batch_ids) - if verbose == 1: progbar.update(batch_end) for i in range(len(outs)): @@ -1636,6 +1681,7 @@ class Model(Network): batch_size=batch_size) # Prepare validation data. 
do_validation = False + val_ins = [] if validation_data: do_validation = True if len(validation_data) == 2: @@ -1686,39 +1732,65 @@ class Model(Network): ins = x + y + sample_weights + [1.] else: ins = x + y + sample_weights - self._make_train_function() - f = self.train_function # Prepare display labels. out_labels = self._get_deduped_metrics_names() - if do_validation: - self._make_test_function() - val_f = self.test_function - callback_metrics = copy.copy(out_labels) + [ - 'val_' + n for n in out_labels - ] - else: - callback_metrics = copy.copy(out_labels) - val_f = None - val_ins = [] + if context.in_eager_mode(): + if do_validation: + callback_metrics = copy.copy(out_labels) + [ + 'val_' + n for n in out_labels + ] + else: + callback_metrics = copy.copy(out_labels) - # Delegate logic to `_fit_loop`. - return self._fit_loop( - f, - ins, - out_labels=out_labels, - batch_size=batch_size, - epochs=epochs, - verbose=verbose, - callbacks=callbacks, - val_f=val_f, - val_ins=val_ins, - shuffle=shuffle, - callback_metrics=callback_metrics, - initial_epoch=initial_epoch, - steps_per_epoch=steps_per_epoch, - validation_steps=validation_steps) + return training_eager.fit_loop( + self, + ins, + out_labels=out_labels, + batch_size=batch_size, + epochs=epochs, + verbose=verbose, + callbacks=callbacks, + val_ins=val_ins, + shuffle=shuffle, + callback_metrics=callback_metrics, + initial_epoch=initial_epoch, + steps_per_epoch=steps_per_epoch, + validation_steps=validation_steps) + else: + self._make_train_function() + f = self.train_function + + if do_validation: + if context.in_graph_mode(): + self._make_test_function() + val_f = self.test_function + else: + val_f = None + callback_metrics = copy.copy(out_labels) + [ + 'val_' + n for n in out_labels + ] + else: + val_f = None + callback_metrics = copy.copy(out_labels) + + # Delegate logic to `_fit_loop`. + return self._fit_loop( + f, + ins, + out_labels=out_labels, + batch_size=batch_size, + epochs=epochs, + verbose=verbose, + callbacks=callbacks, + val_f=val_f, + val_ins=val_ins, + shuffle=shuffle, + callback_metrics=callback_metrics, + initial_epoch=initial_epoch, + steps_per_epoch=steps_per_epoch, + validation_steps=validation_steps) def evaluate(self, x=None, @@ -1794,10 +1866,15 @@ class Model(Network): ins = x + y + sample_weights + [0.] else: ins = x + y + sample_weights - self._make_test_function() - f = self.test_function - return self._test_loop( - f, ins, batch_size=batch_size, verbose=verbose, steps=steps) + + if context.in_eager_mode(): + return training_eager.test_loop( + self, ins, batch_size=batch_size, verbose=verbose, steps=steps) + else: + self._make_test_function() + f = self.test_function + return self._test_loop( + f, ins, batch_size=batch_size, verbose=verbose, steps=steps) def predict(self, x, batch_size=None, verbose=0, steps=None): """Generates output predictions for the input samples. @@ -1849,10 +1926,16 @@ class Model(Network): ins = x + [0.] 
else: ins = x - self._make_predict_function() - f = self.predict_function - return self._predict_loop( - f, ins, batch_size=batch_size, verbose=verbose, steps=steps) + + if context.in_eager_mode(): + return training_eager.predict_loop( + self, ins, batch_size=batch_size, verbose=verbose, steps=steps) + else: + self._make_predict_function() + f = self.predict_function + + return self._predict_loop( + f, ins, batch_size=batch_size, verbose=verbose, steps=steps) def train_on_batch(self, x, y, sample_weight=None, class_weight=None): """Runs a single gradient update on a single batch of data. @@ -1888,6 +1971,7 @@ class Model(Network): or list of scalars (if the model has multiple outputs and/or metrics). The attribute `model.metrics_names` will give you the display labels for the scalar outputs. + """ x, y, sample_weights = self._standardize_user_data( x, @@ -1899,11 +1983,16 @@ class Model(Network): ins = x + y + sample_weights + [1.] else: ins = x + y + sample_weights - self._make_train_function() - outputs = self.train_function(ins) - if len(outputs) == 1: - return outputs[0] - return outputs + + if context.in_eager_mode(): + return training_eager.train_on_batch(self, ins) + + if context.in_graph_mode(): + self._make_train_function() + outputs = self.train_function(ins) + if len(outputs) == 1: + return outputs[0] + return outputs def test_on_batch(self, x, y, sample_weight=None): """Test the model on a single batch of samples. @@ -1942,11 +2031,16 @@ class Model(Network): ins = x + y + sample_weights + [0.] else: ins = x + y + sample_weights - self._make_test_function() - outputs = self.test_function(ins) - if len(outputs) == 1: - return outputs[0] - return outputs + + if context.in_eager_mode(): + return training_eager.test_on_batch(self, ins) + + if context.in_graph_mode(): + self._make_test_function() + outputs = self.test_function(ins) + if len(outputs) == 1: + return outputs[0] + return outputs def predict_on_batch(self, x): """Returns predictions for a single batch of samples. @@ -1956,6 +2050,7 @@ class Model(Network): Returns: Numpy array(s) of predictions. + """ x = _standardize_input_data(x, self._feed_input_names, self._feed_input_shapes) @@ -1963,11 +2058,25 @@ class Model(Network): ins = x + [0.] else: ins = x - self._make_predict_function() - outputs = self.predict_function(ins) - if len(outputs) == 1: - return outputs[0] - return outputs + + if context.in_eager_mode(): + ins_batch_converted = [] + for ib in ins: + ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx())) + + eager_model_inputs = [] + for i in range(len(self.inputs)): + eager_model_inputs.append(ins_batch_converted[i]) + + outs = self(eager_model_inputs) # pylint: disable=not-callable + return outs + + if context.in_graph_mode(): + self._make_predict_function() + outputs = self.predict_function(ins) + if len(outputs) == 1: + return outputs[0] + return outputs def fit_generator(self, generator, @@ -2072,7 +2181,6 @@ class Model(Network): model.fit_generator(generate_arrays_from_file('/my_file.txt'), steps_per_epoch=10000, epochs=10) ``` - Raises: ValueError: In case the generator yields data in an invalid format. diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager.py b/tensorflow/python/keras/_impl/keras/engine/training_eager.py new file mode 100644 index 00000000000..0a115969ca6 --- /dev/null +++ b/tensorflow/python/keras/_impl/keras/engine/training_eager.py @@ -0,0 +1,666 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Keras training and evaluation routines. +""" +# pylint: disable=protected-access +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import numpy as np +from tensorflow.python.eager.backprop import GradientTape +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_util +from tensorflow.python.keras._impl.keras import backend as K +from tensorflow.python.keras._impl.keras import callbacks as cbks +from tensorflow.python.keras._impl.keras import losses +from tensorflow.python.keras._impl.keras import metrics as metrics_module +from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar + + +def _make_batches(size, batch_size): + """Returns a list of batch indices (tuples of indices). + + Arguments: + size: Integer, total size of the data to slice into batches. + batch_size: Integer, batch size. + + Returns: + A list of tuples of array indices. + """ + num_batches = int(np.ceil(size / float(batch_size))) + return [(i * batch_size, min(size, (i + 1) * batch_size)) + for i in range(0, num_batches)] + + +def _slice_arrays(arrays, start=None, stop=None): + """Slice an array or list of arrays. + + This takes an array-like, or a list of + array-likes, and outputs: + - arrays[start:stop] if `arrays` is an array-like + - [x[start:stop] for x in arrays] if `arrays` is a list + + Can also work on list/array of indices: `_slice_arrays(x, indices)` + + Arguments: + arrays: Single array or list of arrays. + start: can be an integer index (start index) + or a list/array of indices + stop: integer (stop index); should be None if + `start` was a list. + + Returns: + A slice of the array(s). + + Raises: + ValueError: If the value of start is a list and stop is not None. 
+ """ + if arrays is None: + return [None] + if isinstance(start, list) and stop is not None: + raise ValueError('The stop argument has to be None if the value of start is' + 'a list.') + elif isinstance(arrays, list): + if hasattr(start, '__len__'): + # hdf5 datasets only support list objects as indices + if hasattr(start, 'shape'): + start = start.tolist() + return [None if x is None else x[start] for x in arrays] + else: + return [None if x is None else x[start:stop] for x in arrays] + else: + if hasattr(start, '__len__'): + if hasattr(start, 'shape'): + start = start.tolist() + return arrays[start] + elif hasattr(start, '__getitem__'): + return arrays[start:stop] + else: + return [None] + + +def _get_metrics_info(metric, internal_output_shapes=None, loss_func=None): + if metric == 'accuracy' or metric == 'acc': + # custom handling of accuracy + # (because of class mode duality) + output_shape = internal_output_shapes + if output_shape[-1] == 1 or loss_func == losses.binary_crossentropy: + # case: binary accuracy + acc_fn = metrics_module.binary_accuracy + elif loss_func == losses.sparse_categorical_crossentropy: + # case: categorical accuracy with sparse targets + acc_fn = metrics_module.sparse_categorical_accuracy + else: + acc_fn = metrics_module.categorical_accuracy + + metric_name = 'acc' + return metric_name, acc_fn + else: + metric_fn = metrics_module.get(metric) + metric_name = metric_fn.__name__ + return metric_name, metric_fn + + +def _eager_loss_fn(outputs, targets, loss_fn, output_name): + with K.name_scope(output_name + '_loss'): + loss = loss_fn(targets, outputs) + return loss + + +def _eager_metrics_fn(model, outputs, targets): + """Calculates the metrics for each output of the given model. + + Arguments: + model: The model on which metrics are being calculated. + outputs: The outputs of the given model. + targets: The predictions or targets of the given model. + + Returns: + Returns the metric names and metric results for each output of the model. + """ + metric_names = [] + metric_results = [] + if not isinstance(outputs, list): + outputs = [outputs] + + if not isinstance(targets, list): + targets = [targets] + + for i in range(len(model.outputs)): + output_metrics = model.nested_metrics[i] + for nested_output_metric in output_metrics: + metric_name, metric_fn = _get_metrics_info( + nested_output_metric, model._internal_output_shapes[i], + model.loss_functions[i]) + + if len(model.output_names) > 1: + metric_name = model.output_names[i] + '_' + metric_name + if metric_name not in model.metrics_names: + model.metrics_names.append(metric_name) + + with K.name_scope(metric_name): + metric_result = metric_fn(outputs[i], targets[i]) + metric_names.append(metric_name) + metric_results.append(K.mean(metric_result)) + + return metric_names, metric_results + + +def _model_loss(model, inputs, targets): + """Calculates the loss for a given model. + + Arguments: + model: The model on which metrics are being calculated. + inputs: The inputs of the given model. This is typically the mini batch of + data that is fed to the model. + targets: The predictions or targets of the given model. + + Returns: + Returns the model output, total loss and loss value calculated using the + specified loss function. The total loss includes regularization losses and + applies masking and sample weighting to the loss value. 
+ """ + total_loss = 0 + outs = model(inputs) + if not isinstance(outs, list): + outs = [outs] + + if not isinstance(targets, list): + targets = [targets] + + loss_metrics = [] + with K.name_scope('loss'): + for i, loss_fn in enumerate(model.loss_functions): + # compute the loss + output_loss = _eager_loss_fn(outs[i], targets[i], loss_fn, + model.output_names[i]) + loss_metrics.append(K.mean(output_loss)) + + mask = outs[i]._keras_mask + # adapted from weighted_loss_fn + if mask is not None: + # mask should have the same shape as output_loss + output_loss *= mask + # the loss per batch should be proportional + # to the number of unmasked samples. + output_loss /= K.mean(mask) + + # adapted from weighted_loss_fn + # apply sample weighting + if model.sample_weights: + # reduce score_array to same ndim as weight array + ndim = K.ndim(output_loss) + weight_ndim = K.ndim(model.sample_weights) + output_loss = K.mean(output_loss, axis=list(range(weight_ndim, ndim))) + output_loss *= model.sample_weights + output_loss /= K.mean(K.cast(K.not_equal(model.sample_weights, 0), + K.floatx())) + output_loss = K.mean(output_loss) + + loss_weight = model.loss_weights_list[i] + if total_loss is None: + total_loss = loss_weight * output_loss + else: + total_loss += loss_weight * output_loss + + total_loss = K.mean(total_loss) + # Add regularization losses + custom_losses = [] + for layer in model.layers: + if layer.losses: + custom_losses += layer.losses + + if custom_losses: + total_loss += sum(custom_losses) + + return outs, total_loss, loss_metrics + + +def _process_single_batch(eager_model_inputs, eager_model_outputs, model, + training=True): + """Calculate the loss and gradient for one input batch. + + The model weights are updated if training is set to True. + + Arguments: + eager_model_inputs: Input batch data. + eager_model_outputs: Output batch data. + model: Model whose loss has to be calculated. + training: The boolean represents if the weights of the model are updated. + 'fit' methods will set this to True while 'evaluate' methods will + set this to False. + + Returns: + output of the model, total loss and the loss associated with each output. + + Raises: + ValueError: If the model loss is 0 or if the trainable weights list is + empty when the trainable parameter is set to True. + """ + K.set_learning_phase(training) + with GradientTape() as tape: + outs, loss, loss_metrics = _model_loss(model, eager_model_inputs, + eager_model_outputs) + if loss is None: + raise ValueError('The model cannot be run ' + 'because it has no loss to optimize.') + if training: + if not model._collected_trainable_weights: + raise ValueError('The list of trainable weights is empty. Make sure that ' + 'you are not setting model.trainable to False before ' + 'compiling the model.') + grads = tape.gradient(loss, model._collected_trainable_weights) + model.optimizer.apply_gradients(zip(grads, + model._collected_trainable_weights)) + return outs, loss, loss_metrics + + +def train_on_batch(model, ins): + """Calculates the loss and gradient updates for one input batch. + + Arguments: + model: Given model on which loss and gradients are calculated. + ins: Input and output batch numpy arrays. + + Returns: + total loss and the loss associated with each output. 
+ """ + ins_batch_converted = [] + for ib in ins: + ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx())) + eager_model_inputs = [] + eager_model_outputs = [] + for i in range(len(model.inputs)): + eager_model_inputs.append(ins_batch_converted[i]) + for i in range(len(model.inputs), len(ins_batch_converted)): + eager_model_outputs.append(ins_batch_converted[i]) + outs, loss, _ = _process_single_batch( + eager_model_inputs, eager_model_outputs, model) + if not isinstance(outs, list): + outs = [outs] + _, metrics_results = _eager_metrics_fn( + model, outs, eager_model_outputs) + if not isinstance(loss, list): + loss = [loss] + return loss + metrics_results + + +def test_on_batch(model, ins): + """Calculates the loss for one input batch. + + Arguments: + model: Given model on which loss is calculated. + ins: Input and output batch numpy arrays. + + Returns: + total loss, loss and metrics associated with each output. + """ + ins_batch_converted = [] + for ib in ins: + ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx())) + eager_model_inputs = [] + eager_model_outputs = [] + for i in range(len(model.inputs)): + eager_model_inputs.append(ins_batch_converted[i]) + for i in range(len(model.inputs), len(ins_batch_converted)): + eager_model_outputs.append(ins_batch_converted[i]) + outs, loss, loss_metrics = _process_single_batch( + eager_model_inputs, eager_model_outputs, model, training=False) + if not isinstance(outs, list): + outs = [outs] + metric_names, metrics_results = _eager_metrics_fn( + model, outs, eager_model_outputs) + model.metrics_names.append(metric_names) + if not isinstance(loss, list): + loss = [loss] + return loss + loss_metrics + metrics_results + + +def fit_loop( + model, + ins, + out_labels=None, + batch_size=None, + epochs=100, + verbose=1, + callbacks=None, + val_ins=None, + shuffle=True, + callback_metrics=None, + initial_epoch=0, + steps_per_epoch=None, + validation_steps=None): + """Abstract fit function for `f(ins)`. + + Assume that f returns a list, labeled by out_labels. + + Arguments: + model: Instance of the model that is being executed in Eager mode. + ins: List of tensors to be fed to `f` + out_labels: List of strings, display names of + the outputs of `f` + batch_size: Integer batch size or None if unknown. + epochs: Number of times to iterate over the data + verbose: Verbosity mode, 0, 1 or 2 + callbacks: List of callbacks to be called during training + val_ins: List of tensors to be fed to `val_f` + shuffle: Whether to shuffle the data at the beginning of each epoch + callback_metrics: List of strings, the display names of the metrics + passed to the callbacks. They should be the + concatenation of list the display names of the outputs of + `f` and the list of display names of the outputs of `f_val`. + initial_epoch: Epoch at which to start training + (useful for resuming a previous training run) + steps_per_epoch: Total number of steps (batches of samples) + before declaring one epoch finished and starting the + next epoch. Ignored with the default value of `None`. + validation_steps: Number of steps to run validation for (only if doing + validation from data tensors). Ignored with default value of `None`. + + Returns: + `History` object. + + Raises: + ValueError: In case of invalid argument values. 
+ """ + # Required for Eager mode + K.set_learning_phase(True) + + do_validation = False + if val_ins: + do_validation = True + if (verbose and ins and hasattr(ins[0], 'shape') and + hasattr(val_ins[0], 'shape')): + print('Train on %d samples, validate on %d samples' % + (ins[0].shape[0], val_ins[0].shape[0])) + if validation_steps: + if steps_per_epoch is None: + raise ValueError('Can only use `validation_steps` when doing step-wise ' + 'training, i.e. `steps_per_epoch` must be set.') + do_validation = True + + num_train_samples = model._check_num_samples( + ins, batch_size, steps_per_epoch, 'steps_per_epoch') + + if num_train_samples is not None: + index_array = np.arange(num_train_samples) + + model.history = cbks.History() + callbacks = [cbks.BaseLogger()] + (callbacks or []) + [model.history] + if verbose: + if steps_per_epoch is not None: + count_mode = 'steps' + else: + count_mode = 'samples' + callbacks += [cbks.ProgbarLogger(count_mode)] + callbacks = cbks.CallbackList(callbacks) + out_labels = out_labels or [] + + # it's possible to callback a different model than self + # (used by Sequential models) + if hasattr(model, 'callback_model') and model.callback_model: + callback_model = model.callback_model + else: + callback_model = model + + callbacks.set_model(callback_model) + + callbacks.set_params({ + 'batch_size': batch_size, + 'epochs': epochs, + 'steps': steps_per_epoch, + 'samples': num_train_samples, + 'verbose': verbose, + 'do_validation': do_validation, + 'metrics': callback_metrics or [], + }) + callbacks.on_train_begin() + callback_model.stop_training = False + for cbk in callbacks: + cbk.validation_data = val_ins + + for epoch in range(initial_epoch, epochs): + callbacks.on_epoch_begin(epoch) + epoch_logs = {} + if shuffle == 'batch': + index_array = model._batch_shuffle(index_array, batch_size) + elif shuffle: + np.random.shuffle(index_array) + + batches = _make_batches(num_train_samples, batch_size) + + for batch_index, (batch_start, batch_end) in enumerate(batches): + batch_ids = index_array[batch_start:batch_end] + try: + if isinstance(ins[-1], float): + # Do not slice the training phase flag. + ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]] + else: + ins_batch = _slice_arrays(ins, batch_ids) + except TypeError: + raise TypeError('TypeError while preparing batch. ' + 'If using HDF5 input data, ' + 'pass shuffle="batch".') + batch_logs = {} + batch_logs['batch'] = batch_index + batch_logs['size'] = len(batch_ids) + + callbacks.on_batch_begin(batch_index, batch_logs) + + ins_batch_converted = [] + for ib in ins_batch: + ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx())) + eager_model_inputs = [] + eager_model_outputs = [] + for i in range(len(model.inputs)): + eager_model_inputs.append(ins_batch_converted[i]) + + for i in range(len(model.inputs), len(ins_batch_converted)): + eager_model_outputs.append(ins_batch_converted[i]) + + outs, loss, loss_metrics = _process_single_batch(eager_model_inputs, + eager_model_outputs, + model) + + if not isinstance(outs, list): + outs = [outs] + + for l, o in zip(out_labels, outs): + batch_logs[l] = o + # Required for Eager mode + metrics_names, metrics_results = _eager_metrics_fn(model, outs, + eager_model_outputs) + batch_logs['loss'] = tensor_util.constant_value(K.mean(loss)) + + # TODO(anjalisridhar): Move this to compile to avoid duplicate code. + # In graph mode we set the metric names in compile. However in + # Eager mode we calculate the metrics for each batch in fit_loop. 
+ # We could calculate the metric names and functions in compile. + # This would avoid setting the callback parameters separately. + # We need to do this for the first iteration alone + for m in metrics_names: + if m not in callback_metrics: + callback_metrics.append(m) + + callbacks.set_params({ + 'batch_size': batch_size, + 'epochs': epochs, + 'steps': steps_per_epoch, + 'samples': num_train_samples, + 'verbose': verbose, + 'do_validation': do_validation, + 'metrics': callback_metrics or [], + }) + + for k, v in zip(model.metrics_names, + [K.mean(loss)] + loss_metrics + metrics_results): + batch_logs[k] = tensor_util.constant_value(v) + + callbacks.on_batch_end(batch_index, batch_logs) + if callback_model.stop_training: + break + + if batch_index == len(batches) - 1: # Last batch. + if do_validation: + val_outs = test_loop( + model, val_ins, batch_size=batch_size, verbose=0) + if not isinstance(val_outs, list): + val_outs = [val_outs] + # Same labels assumed. + for l, o in zip(out_labels, val_outs): + epoch_logs['val_' + l] = o + callbacks.on_epoch_end(epoch, epoch_logs) + if callback_model.stop_training: + break + callbacks.on_train_end() + return model.history + + +def test_loop(model, ins, batch_size=None, verbose=0, steps=None): + """Abstract method to loop over some data in batches. + + Arguments: + model: Model instance that is being evaluated in Eager mode. + ins: list of tensors to be fed to `f`. + batch_size: integer batch size or `None`. + verbose: verbosity mode. + steps: Total number of steps (batches of samples) + before declaring predictions finished. + Ignored with the default value of `None`. + + Returns: + Scalar loss (if the model has a single output and no metrics) + or list of scalars (if the model has multiple outputs + and/or metrics). The attribute `model.metrics_names` will give you + the display labels for the scalar outputs. + """ + K.set_learning_phase(False) + num_samples = model._check_num_samples(ins, batch_size, steps, 'steps') + outs = [] + if verbose == 1: + progbar = Progbar(target=num_samples) + batches = _make_batches(num_samples, batch_size) + index_array = np.arange(num_samples) + for batch_index, (batch_start, batch_end) in enumerate(batches): + batch_ids = index_array[batch_start:batch_end] + if isinstance(ins[-1], float): + # Do not slice the training phase flag. + ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]] + else: + ins_batch = _slice_arrays(ins, batch_ids) + + ins_batch_converted = [] + for ib in ins_batch: + ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx())) + + eager_model_inputs = [] + eager_model_outputs = [] + for i in range(len(model.inputs)): + eager_model_inputs.append(ins_batch_converted[i]) + + for i in range(len(model.inputs), len(ins_batch_converted)): + eager_model_outputs.append(ins_batch_converted[i]) + + loss_outs, loss, loss_metrics = _model_loss(model, eager_model_inputs, + eager_model_outputs) + _, metrics_results = _eager_metrics_fn(model, loss_outs, + eager_model_outputs) + batch_outs = [] + for _, v in zip(model.metrics_names, + [K.mean(loss)] + loss_metrics + metrics_results): + batch_outs.append(tensor_util.constant_value(v)) + + if isinstance(batch_outs, list): + if batch_index == 0: + for batch_out in enumerate(batch_outs): + outs.append(0.) + for i, batch_out in enumerate(batch_outs): + outs[i] += batch_out * len(batch_ids) + else: + if batch_index == 0: + outs.append(0.) 
+      outs[0] += batch_outs * len(batch_ids)
+
+    if verbose == 1:
+      progbar.update(batch_end)
+  for i in range(len(outs)):
+    outs[i] /= num_samples
+  if len(outs) == 1:
+    return outs[0]
+  return outs
+
+
+def predict_loop(model, ins, batch_size=32, verbose=0, steps=None):
+  """Abstract method to loop over some data in batches.
+
+  Arguments:
+    model: Keras model instance.
+    ins: list of tensors to be fed to the model.
+    batch_size: integer batch size.
+    verbose: verbosity mode.
+    steps: Total number of steps (batches of samples)
+        before declaring `predict_loop` finished.
+        Ignored with the default value of `None`.
+
+  Returns:
+    Array of predictions (if the model has a single output)
+    or list of arrays of predictions
+    (if the model has multiple outputs).
+  """
+  K.set_learning_phase(False)
+  num_samples = model._check_num_samples(ins, batch_size, steps, 'steps')
+  if verbose == 1:
+    if steps is not None:
+      progbar = Progbar(target=steps)
+    else:
+      progbar = Progbar(target=num_samples)
+
+  outs = []
+  batches = _make_batches(num_samples, batch_size)
+  index_array = np.arange(num_samples)
+  for batch_index, (batch_start, batch_end) in enumerate(batches):
+    batch_ids = index_array[batch_start:batch_end]
+    if ins and isinstance(ins[-1], float):
+      # Do not slice the training phase flag.
+      ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
+    else:
+      ins_batch = _slice_arrays(ins, batch_ids)
+
+    ins_batch_converted = []
+    for ib in ins_batch:
+      ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
+
+    eager_model_inputs = []
+    for i in range(len(model.inputs)):
+      eager_model_inputs.append(ins_batch_converted[i])
+
+    batch_outs = model(eager_model_inputs)
+
+    if not isinstance(batch_outs, list):
+      batch_outs = [batch_outs]
+    if batch_index == 0:
+      # Pre-allocate the results arrays.
+      for batch_out in batch_outs:
+        dims = batch_out.shape[1:].dims
+        dims_list = [d.value for d in dims]
+        shape = (num_samples,) + tuple(dims_list)
+        outs.append(np.zeros(shape, dtype=batch_out.dtype.as_numpy_dtype))
+    for i, batch_out in enumerate(batch_outs):
+      outs[i][batch_start:batch_end] = batch_out
+    if verbose == 1:
+      progbar.update(batch_end)
+  if len(outs) == 1:
+    return outs[0]
+  return outs
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
new file mode 100644
index 00000000000..81e2f7a5145
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
@@ -0,0 +1,755 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================== +"""Tests for training routines.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import numpy as np + +from tensorflow.python.framework import ops +from tensorflow.python.keras._impl import keras +from tensorflow.python.keras._impl.keras import testing_utils +from tensorflow.python.platform import test +from tensorflow.python.training.rmsprop import RMSPropOptimizer + + +class TrainingTest(test.TestCase): + + def test_fit_on_arrays(self): + a = keras.layers.Input(shape=(3,), name='input_a') + b = keras.layers.Input(shape=(3,), name='input_b') + + dense = keras.layers.Dense(4, name='dense') + c = dense(a) + d = dense(b) + e = keras.layers.Dropout(0.5, name='dropout')(c) + + model = keras.models.Model([a, b], [d, e]) + + optimizer = RMSPropOptimizer(learning_rate=0.001) + loss = 'mse' + loss_weights = [1., 0.5] + metrics = ['mae'] + model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights) + + input_a_np = np.random.random((10, 3)) + input_b_np = np.random.random((10, 3)) + + output_d_np = np.random.random((10, 4)) + output_e_np = np.random.random((10, 4)) + + # Test fit at different verbosity + model.fit( + [input_a_np, input_b_np], [output_d_np, output_e_np], + epochs=1, + batch_size=5, + verbose=0) + model.fit( + [input_a_np, input_b_np], [output_d_np, output_e_np], + epochs=1, + batch_size=5, + verbose=1) + model.fit( + [input_a_np, input_b_np], [output_d_np, output_e_np], + epochs=2, + batch_size=5, + verbose=2) + + # Test with validation data + model.fit( + [input_a_np, input_b_np], [output_d_np, output_e_np], + validation_data=([input_a_np, input_b_np], [output_d_np, + output_e_np]), + epochs=1, + batch_size=5, + verbose=0) + model.fit( + [input_a_np, input_b_np], [output_d_np, output_e_np], + validation_data=([input_a_np, input_b_np], [output_d_np, + output_e_np]), + epochs=2, + batch_size=5, + verbose=1) + model.fit( + [input_a_np, input_b_np], [output_d_np, output_e_np], + validation_data=([input_a_np, input_b_np], [output_d_np, + output_e_np]), + epochs=2, + batch_size=5, + verbose=2) + model.train_on_batch([input_a_np, input_b_np], [output_d_np, output_e_np]) + + # Test with validation split + model.fit( + [input_a_np, input_b_np], [output_d_np, output_e_np], + epochs=2, + batch_size=5, + verbose=0, + validation_split=0.2) + + # Test with dictionary inputs + model.fit( + { + 'input_a': input_a_np, + 'input_b': input_b_np + }, {'dense': output_d_np, + 'dropout': output_e_np}, + epochs=1, + batch_size=5, + verbose=0) + model.fit( + { + 'input_a': input_a_np, + 'input_b': input_b_np + }, {'dense': output_d_np, + 'dropout': output_e_np}, + epochs=1, + batch_size=5, + verbose=1) + model.fit( + { + 'input_a': input_a_np, + 'input_b': input_b_np + }, {'dense': output_d_np, + 'dropout': output_e_np}, + validation_data=({'input_a': input_a_np, + 'input_b': input_b_np + }, + { + 'dense': output_d_np, + 'dropout': output_e_np + }), + epochs=1, + batch_size=5, + verbose=0) + model.train_on_batch({ + 'input_a': input_a_np, + 'input_b': input_b_np + }, {'dense': output_d_np, + 'dropout': output_e_np}) + # Test with lists for loss, metrics + loss = ['mae', 'mse'] + metrics = ['acc', 'mae'] + model.compile(optimizer, loss, metrics=metrics) + model.fit( + [input_a_np, input_b_np], [output_d_np, output_e_np], + epochs=1, + batch_size=5, + verbose=0) + + # Test with dictionaries for loss, metrics, loss weights + loss = 
{'dense': 'mse', 'dropout': 'mae'} + loss_weights = {'dense': 1., 'dropout': 0.5} + metrics = {'dense': 'mse', 'dropout': 'mae'} + model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights) + model.fit( + [input_a_np, input_b_np], [output_d_np, output_e_np], + epochs=1, + batch_size=5, + verbose=0) + + # Invalid use cases + with self.assertRaises(AttributeError): + model.fit( + [input_a_np, input_b_np], [output_d_np, output_e_np], + epochs=1, + validation_data=([input_a_np, input_b_np], 0, 0), + verbose=0) + with self.assertRaises(ValueError): + model.train_on_batch({'input_a': input_a_np}, + [output_d_np, output_e_np]) + with self.assertRaises(ValueError): + model.train_on_batch([input_a_np], [output_d_np, output_e_np]) + with self.assertRaises(AttributeError): + model.train_on_batch(1, [output_d_np, output_e_np]) + with self.assertRaises(ValueError): + model.train_on_batch(input_a_np, [output_d_np, output_e_np]) + with self.assertRaises(ValueError): + bad_input = np.random.random((11, 3)) + model.train_on_batch([bad_input, input_b_np], + [output_d_np, output_e_np]) + with self.assertRaises(ValueError): + bad_target = np.random.random((11, 4)) + model.train_on_batch([input_a_np, input_b_np], + [bad_target, output_e_np]) + + # Build single-input model + x = keras.layers.Input(shape=(3,), name='input_a') + y = keras.layers.Dense(4)(x) + model = keras.models.Model(x, y) + model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse') + # This will work + model.fit([input_a_np], output_d_np, epochs=1) + with self.assertRaises(ValueError): + model.fit([input_a_np, input_a_np], output_d_np, epochs=1) + + def test_evaluate_predict_on_arrays(self): + a = keras.layers.Input(shape=(3,), name='input_a') + b = keras.layers.Input(shape=(3,), name='input_b') + + dense = keras.layers.Dense(4, name='dense') + c = dense(a) + d = dense(b) + e = keras.layers.Dropout(0.5, name='dropout')(c) + + model = keras.models.Model([a, b], [d, e]) + + optimizer = RMSPropOptimizer(learning_rate=0.001) + loss = 'mse' + loss_weights = [1., 0.5] + metrics = ['mae'] + model.compile( + optimizer, + loss, + metrics=metrics, + loss_weights=loss_weights, + sample_weight_mode=None) + + input_a_np = np.random.random((10, 3)) + input_b_np = np.random.random((10, 3)) + + output_d_np = np.random.random((10, 4)) + output_e_np = np.random.random((10, 4)) + + # Test evaluate at different verbosity + out = model.evaluate( + [input_a_np, input_b_np], [output_d_np, output_e_np], + batch_size=5, + verbose=0) + self.assertEqual(len(out), 5) + out = model.evaluate( + [input_a_np, input_b_np], [output_d_np, output_e_np], + batch_size=5, + verbose=1) + self.assertEqual(len(out), 5) + out = model.evaluate( + [input_a_np, input_b_np], [output_d_np, output_e_np], + batch_size=5, + verbose=2) + self.assertEqual(len(out), 5) + out = model.test_on_batch([input_a_np, input_b_np], + [output_d_np, output_e_np]) + self.assertEqual(len(out), 5) + + # Test evaluate with dictionary inputs + model.evaluate( + { + 'input_a': input_a_np, + 'input_b': input_b_np + }, {'dense': output_d_np, + 'dropout': output_e_np}, + batch_size=5, + verbose=0) + model.evaluate( + { + 'input_a': input_a_np, + 'input_b': input_b_np + }, {'dense': output_d_np, + 'dropout': output_e_np}, + batch_size=5, + verbose=1) + + # Test predict + out = model.predict([input_a_np, input_b_np], batch_size=5) + self.assertEqual(len(out), 2) + out = model.predict({'input_a': input_a_np, 'input_b': input_b_np}) + self.assertEqual(len(out), 2) + out = 
model.predict_on_batch({ + 'input_a': input_a_np, + 'input_b': input_b_np + }) + self.assertEqual(len(out), 2) + + def test_invalid_loss_or_metrics(self): + num_classes = 5 + train_samples = 1000 + test_samples = 1000 + input_dim = 5 + + model = keras.models.Sequential() + model.add(keras.layers.Dense(10, input_shape=(input_dim,))) + model.add(keras.layers.Activation('relu')) + model.add(keras.layers.Dense(num_classes)) + model.add(keras.layers.Activation('softmax')) + model.compile(loss='categorical_crossentropy', + optimizer=RMSPropOptimizer(learning_rate=0.001)) + np.random.seed(1337) + + (x_train, y_train), (_, _) = testing_utils.get_test_data( + train_samples=train_samples, + test_samples=test_samples, + input_shape=(input_dim,), + num_classes=num_classes) + + with self.assertRaises(ValueError): + model.fit(x_train, np.concatenate([y_train, y_train], axis=-1)) + + with self.assertRaises(TypeError): + model.compile(loss='categorical_crossentropy', + optimizer=RMSPropOptimizer(learning_rate=0.001), + metrics=set(0)) + + with self.assertRaises(ValueError): + model.compile(loss=None, + optimizer='rms') + + +class LossWeightingTest(test.TestCase): + + def test_class_weights(self): + num_classes = 5 + batch_size = 5 + epochs = 5 + weighted_class = 3 + train_samples = 3000 + test_samples = 3000 + input_dim = 5 + + model = keras.models.Sequential() + model.add(keras.layers.Dense(10, input_shape=(input_dim,))) + model.add(keras.layers.Activation('relu')) + model.add(keras.layers.Dense(num_classes)) + model.add(keras.layers.Activation('softmax')) + model.compile(loss='categorical_crossentropy', + optimizer=RMSPropOptimizer(learning_rate=0.001)) + + np.random.seed(1337) + (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data( + train_samples=train_samples, + test_samples=test_samples, + input_shape=(input_dim,), + num_classes=num_classes) + int_y_test = y_test.copy() + int_y_train = y_train.copy() + # convert class vectors to binary class matrices + y_train = keras.utils.to_categorical(y_train, num_classes) + y_test = keras.utils.to_categorical(y_test, num_classes) + test_ids = np.where(int_y_test == np.array(weighted_class))[0] + + class_weight = dict([(i, 1.) for i in range(num_classes)]) + class_weight[weighted_class] = 2. + + sample_weight = np.ones((y_train.shape[0])) + sample_weight[int_y_train == weighted_class] = 2. 
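+    # class_weight above doubles the loss contribution of `weighted_class`;
+    # the sample_weight array encodes the equivalent per-example weighting
+    # (1. everywhere, 2. on examples of `weighted_class`). The fits below
+    # train with class_weight, and the final evaluate() calls check that the
+    # loss on the up-weighted class ends up below the overall test loss.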
+ + model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=epochs // 3, + verbose=0, + class_weight=class_weight, + validation_data=(x_train, y_train, sample_weight)) + model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=epochs // 2, + verbose=0, + class_weight=class_weight) + model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=epochs // 2, + verbose=0, + class_weight=class_weight, + validation_split=0.1) + + model.train_on_batch( + x_train[:batch_size], y_train[:batch_size], class_weight=class_weight) + ref_score = model.evaluate(x_test, y_test, verbose=0) + score = model.evaluate( + x_test[test_ids, :], y_test[test_ids, :], verbose=0) + self.assertLess(score, ref_score) + + def test_sample_weights(self): + num_classes = 5 + batch_size = 5 + epochs = 5 + weighted_class = 3 + train_samples = 3000 + test_samples = 3000 + input_dim = 5 + + model = keras.models.Sequential() + model.add(keras.layers.Dense(10, input_shape=(input_dim,))) + model.add(keras.layers.Activation('relu')) + model.add(keras.layers.Dense(num_classes)) + model.add(keras.layers.Activation('softmax')) + model.compile(loss='categorical_crossentropy', + optimizer=RMSPropOptimizer(learning_rate=0.001)) + + np.random.seed(43) + (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data( + train_samples=train_samples, + test_samples=test_samples, + input_shape=(input_dim,), + num_classes=num_classes) + int_y_test = y_test.copy() + int_y_train = y_train.copy() + # convert class vectors to binary class matrices + y_train = keras.utils.to_categorical(y_train, num_classes) + y_test = keras.utils.to_categorical(y_test, num_classes) + test_ids = np.where(int_y_test == np.array(weighted_class))[0] + + class_weight = dict([(i, 1.) for i in range(num_classes)]) + class_weight[weighted_class] = 2. + + sample_weight = np.ones((y_train.shape[0])) + sample_weight[int_y_train == weighted_class] = 2. + + model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=epochs // 3, + verbose=0, + sample_weight=sample_weight) + model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=epochs // 3, + verbose=0, + sample_weight=sample_weight, + validation_split=0.1) + model.train_on_batch( + x_train[:batch_size], + y_train[:batch_size], + sample_weight=sample_weight[:batch_size]) + model.test_on_batch( + x_train[:batch_size], + y_train[:batch_size], + sample_weight=sample_weight[:batch_size]) + + def test_temporal_sample_weights(self): + num_classes = 5 + weighted_class = 3 + train_samples = 1000 + test_samples = 1000 + input_dim = 5 + timesteps = 3 + + model = keras.models.Sequential() + model.add( + keras.layers.TimeDistributed( + keras.layers.Dense(num_classes), + input_shape=(timesteps, input_dim))) + model.add(keras.layers.Activation('softmax')) + + np.random.seed(1337) + (_, y_train), _ = testing_utils.get_test_data( + train_samples=train_samples, + test_samples=test_samples, + input_shape=(input_dim,), + num_classes=num_classes) + int_y_train = y_train.copy() + # convert class vectors to binary class matrices + y_train = keras.utils.to_categorical(y_train, num_classes) + + class_weight = dict([(i, 1.) for i in range(num_classes)]) + class_weight[weighted_class] = 2. + + sample_weight = np.ones((y_train.shape[0])) + sample_weight[int_y_train == weighted_class] = 2. 
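+    # The class/sample weights built above are not used below: this test only
+    # verifies that compiling with sample_weight_mode='temporal' raises a
+    # ValueError when running under eager execution.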
+ with self.assertRaises(ValueError): + model.compile( + loss='binary_crossentropy', + optimizer=RMSPropOptimizer(learning_rate=0.001), + sample_weight_mode='temporal') + + def test_class_weight_invalid_use_case(self): + num_classes = 5 + train_samples = 1000 + test_samples = 1000 + input_dim = 5 + timesteps = 3 + + model = keras.models.Sequential() + model.add( + keras.layers.TimeDistributed( + keras.layers.Dense(num_classes), + input_shape=(timesteps, input_dim))) + model.add(keras.layers.Activation('softmax')) + model.compile( + loss='binary_crossentropy', + optimizer=RMSPropOptimizer(learning_rate=0.001)) + + (x_train, y_train), _ = testing_utils.get_test_data( + train_samples=train_samples, + test_samples=test_samples, + input_shape=(input_dim,), + num_classes=num_classes) + # convert class vectors to binary class matrices + y_train = keras.utils.to_categorical(y_train, num_classes) + class_weight = dict([(i, 1.) for i in range(num_classes)]) + + del class_weight[1] + with self.assertRaises(ValueError): + model.fit(x_train, y_train, + epochs=0, verbose=0, class_weight=class_weight) + + with self.assertRaises(ValueError): + model.compile( + loss='binary_crossentropy', + optimizer=RMSPropOptimizer(learning_rate=0.001), + sample_weight_mode=[]) + + # Build multi-output model + x = keras.Input((3,)) + y1 = keras.layers.Dense(4, name='1')(x) + y2 = keras.layers.Dense(4, name='2')(x) + model = keras.models.Model(x, [y1, y2]) + model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse') + x_np = np.random.random((10, 3)) + y_np = np.random.random((10, 4)) + w_np = np.random.random((10,)) + # This will work + model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': w_np}) + # These will not + with self.assertRaises(ValueError): + model.fit(x_np, [y_np, y_np], epochs=1, sample_weight=[w_np]) + with self.assertRaises(TypeError): + model.fit(x_np, [y_np, y_np], epochs=1, sample_weight=w_np) + with self.assertRaises(ValueError): + bad_w_np = np.random.random((11,)) + model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np}) + with self.assertRaises(ValueError): + bad_w_np = np.random.random((10, 2)) + model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np}) + with self.assertRaises(ValueError): + bad_w_np = np.random.random((10, 2, 2)) + model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np}) + + +class TestDynamicTrainability(test.TestCase): + + def test_trainable_warning(self): + x = np.random.random((5, 3)) + y = np.random.random((5, 2)) + model = keras.models.Sequential() + model.add(keras.layers.Dense(2, input_dim=3)) + model.trainable = False + model.compile(RMSPropOptimizer(learning_rate=0.001), 'mse') + model.trainable = True + with self.assertRaises(ValueError): + model.train_on_batch(x, y) + + def test_trainable_argument(self): + x = np.random.random((5, 3)) + y = np.random.random((5, 2)) + + model = keras.models.Sequential() + model.add(keras.layers.Dense(2, input_dim=3, trainable=False)) + model.compile(RMSPropOptimizer(learning_rate=0.001), 'mse') + out = model.predict(x) + with self.assertRaises(ValueError): + model.train_on_batch(x, y) + out_2 = model.predict(x) + self.assertAllClose(out, out_2) + + # test with nesting + inputs = keras.layers.Input(shape=(3,)) + output = model(inputs) + model = keras.models.Model(inputs, output) + model.compile(RMSPropOptimizer(learning_rate=0.001), 'mse') + out = model.predict(x) + with self.assertRaises(ValueError): + model.train_on_batch(x, y) + out_2 = model.predict(x) + 
self.assertAllClose(out, out_2) + + def test_layer_trainability_switch(self): + # with constructor argument, in Sequential + model = keras.models.Sequential() + model.add(keras.layers.Dense(2, trainable=False, input_dim=1)) + self.assertListEqual(model.trainable_weights, []) + + # by setting the `trainable` argument, in Sequential + model = keras.models.Sequential() + layer = keras.layers.Dense(2, input_dim=1) + model.add(layer) + self.assertListEqual(model.trainable_weights, layer.trainable_weights) + layer.trainable = False + self.assertListEqual(model.trainable_weights, []) + + # with constructor argument, in Model + x = keras.layers.Input(shape=(1,)) + y = keras.layers.Dense(2, trainable=False)(x) + model = keras.models.Model(x, y) + self.assertListEqual(model.trainable_weights, []) + + # by setting the `trainable` argument, in Model + x = keras.layers.Input(shape=(1,)) + layer = keras.layers.Dense(2) + y = layer(x) + model = keras.models.Model(x, y) + self.assertListEqual(model.trainable_weights, layer.trainable_weights) + layer.trainable = False + self.assertListEqual(model.trainable_weights, []) + + def test_model_trainability_switch(self): + # a non-trainable model has no trainable weights + x = keras.layers.Input(shape=(1,)) + y = keras.layers.Dense(2)(x) + model = keras.models.Model(x, y) + model.trainable = False + self.assertListEqual(model.trainable_weights, []) + + # same for Sequential + model = keras.models.Sequential() + model.add(keras.layers.Dense(2, input_dim=1)) + model.trainable = False + self.assertListEqual(model.trainable_weights, []) + + def test_nested_model_trainability(self): + + # a Sequential inside a Model + inner_model = keras.models.Sequential() + inner_model.add(keras.layers.Dense(2, input_dim=1)) + + x = keras.layers.Input(shape=(1,)) + y = inner_model(x) + outer_model = keras.models.Model(x, y) + self.assertListEqual(outer_model.trainable_weights, + inner_model.trainable_weights) + inner_model.trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + inner_model.trainable = True + inner_model.layers[-1].trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + + # a Sequential inside a Sequential + inner_model = keras.models.Sequential() + inner_model.add(keras.layers.Dense(2, input_dim=1)) + outer_model = keras.models.Sequential() + outer_model.add(inner_model) + self.assertListEqual(outer_model.trainable_weights, + inner_model.trainable_weights) + inner_model.trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + inner_model.trainable = True + inner_model.layers[-1].trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + + # a Model inside a Model + x = keras.layers.Input(shape=(1,)) + y = keras.layers.Dense(2)(x) + inner_model = keras.models.Model(x, y) + x = keras.layers.Input(shape=(1,)) + y = inner_model(x) + outer_model = keras.models.Model(x, y) + self.assertListEqual(outer_model.trainable_weights, + inner_model.trainable_weights) + inner_model.trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + inner_model.trainable = True + inner_model.layers[-1].trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + + # a Model inside a Sequential + x = keras.layers.Input(shape=(1,)) + y = keras.layers.Dense(2)(x) + inner_model = keras.models.Model(x, y) + outer_model = keras.models.Sequential() + outer_model.add(inner_model) + self.assertListEqual(outer_model.trainable_weights, + inner_model.trainable_weights) + 
inner_model.trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + inner_model.trainable = True + inner_model.layers[-1].trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + + +class TestTrainingUtils(test.TestCase): + + def test_check_array_lengths(self): + keras.engine.training._check_array_lengths(None, None, None) + a_np = np.random.random((4, 3, 3)) + keras.engine.training._check_array_lengths(a_np, a_np, a_np) + keras.engine.training._check_array_lengths( + [a_np, a_np], [a_np, a_np], [a_np, a_np]) + keras.engine.training._check_array_lengths([None], [None], [None]) + + b_np = np.random.random((3, 4)) + with self.assertRaises(ValueError): + keras.engine.training._check_array_lengths(a_np, None, None) + with self.assertRaises(ValueError): + keras.engine.training._check_array_lengths(a_np, a_np, None) + with self.assertRaises(ValueError): + keras.engine.training._check_array_lengths([a_np], [None], None) + with self.assertRaises(ValueError): + keras.engine.training._check_array_lengths([a_np], [b_np], None) + with self.assertRaises(ValueError): + keras.engine.training._check_array_lengths([a_np], None, [b_np]) + + def test_slice_arrays(self): + input_a = np.random.random((10, 3)) + keras.engine.training._slice_arrays(None) + keras.engine.training._slice_arrays(input_a, 0) + keras.engine.training._slice_arrays(input_a, 0, 1) + keras.engine.training._slice_arrays(input_a, stop=2) + input_a = [None, [1, 1], None, [1, 1]] + keras.engine.training._slice_arrays(input_a, 0) + keras.engine.training._slice_arrays(input_a, 0, 1) + keras.engine.training._slice_arrays(input_a, stop=2) + input_a = [None] + keras.engine.training._slice_arrays(input_a, 0) + keras.engine.training._slice_arrays(input_a, 0, 1) + keras.engine.training._slice_arrays(input_a, stop=2) + input_a = None + keras.engine.training._slice_arrays(input_a, 0) + keras.engine.training._slice_arrays(input_a, 0, 1) + keras.engine.training._slice_arrays(input_a, stop=2) + + def test_fit_with_BatchNorm(self): + model = keras.models.Sequential() + model.add(keras.layers.Dense(10, input_dim=4)) + model.add(keras.layers.BatchNormalization()) + model.add(keras.layers.Activation('tanh')) + model.add(keras.layers.Dropout(0.2)) + + input_a_np = np.random.random((10, 4)) + output_b_np = np.random.random((10, 10)) + + model.compile(loss='binary_crossentropy', optimizer=RMSPropOptimizer(0.001)) + model.fit(input_a_np, output_b_np, epochs=1, batch_size=5, verbose=0) + + def test_fit_with_regularization(self): + model = keras.models.Sequential() + with self.assertRaises(ValueError): + model.add( + keras.layers.Dense(4, input_dim=3, + kernel_regularizer=keras.regularizers.l2(0.01), + activity_regularizer=keras.regularizers.l1(0.01))) + + +if __name__ == '__main__': + # Bazel sets these environment variables to very long paths. + # Tempfile uses them to create long paths, and in turn multiprocessing + # library tries to create sockets named after paths. Delete whatever bazel + # writes to these to avoid tests failing due to socket addresses being too + # long. 
+ for var in ('TMPDIR', 'TMP', 'TEMP'): + if var in os.environ: + del os.environ[var] + + ops.enable_eager_execution() + test.main() diff --git a/tensorflow/python/keras/_impl/keras/engine/training_test.py b/tensorflow/python/keras/_impl/keras/engine/training_test.py index 5a033a04ade..b380238e4e2 100644 --- a/tensorflow/python/keras/_impl/keras/engine/training_test.py +++ b/tensorflow/python/keras/_impl/keras/engine/training_test.py @@ -78,6 +78,14 @@ class TrainingTest(test.TestCase): verbose=2) model.train_on_batch([input_a_np, input_b_np], [output_d_np, output_e_np]) + # Test model with input data as a list of lists + model.fit( + [np.ndarray.tolist(input_a_np), np.ndarray.tolist(input_b_np)], + [output_d_np, output_e_np], + epochs=2, + batch_size=5, + verbose=2) + # Test with validation data model.fit( [input_a_np, input_b_np], [output_d_np, output_e_np], @@ -205,6 +213,16 @@ class TrainingTest(test.TestCase): with self.assertRaises(ValueError): model.fit([input_a_np, input_a_np], output_d_np, epochs=1) + # Test model on a list of floats + input_a_np = np.random.random((10, 3)) + input_b_np = np.random.random((10, 4)) + + model.fit([np.ndarray.tolist(input_a_np)], + [np.ndarray.tolist(input_b_np)], + epochs=2, + batch_size=5, + verbose=2) + def test_evaluate_predict_on_arrays(self): with self.test_session(): a = keras.layers.Input(shape=(3,), name='input_a') diff --git a/tensorflow/python/keras/_impl/keras/layers/core.py b/tensorflow/python/keras/_impl/keras/layers/core.py index 6ee3fb48b2f..ea2d3f2f04a 100644 --- a/tensorflow/python/keras/_impl/keras/layers/core.py +++ b/tensorflow/python/keras/_impl/keras/layers/core.py @@ -23,6 +23,7 @@ import types as python_types import numpy as np +from tensorflow.python.eager import context from tensorflow.python.framework import tensor_shape from tensorflow.python.keras._impl.keras import activations from tensorflow.python.keras._impl.keras import backend as K @@ -119,7 +120,8 @@ class Dropout(tf_core_layers.Dropout, Layer): if training is None: training = K.learning_phase() output = super(Dropout, self).call(inputs, training=training) - if training is K.learning_phase(): + # EagerTensor object has no attribute _uses_learning_phase + if not context.in_eager_mode() and training is K.learning_phase(): output._uses_learning_phase = True # pylint: disable=protected-access return output diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization.py b/tensorflow/python/keras/_impl/keras/layers/normalization.py index 965ef70e6e6..eecb14ceaa3 100644 --- a/tensorflow/python/keras/_impl/keras/layers/normalization.py +++ b/tensorflow/python/keras/_impl/keras/layers/normalization.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.eager import context from tensorflow.python.keras._impl.keras import backend as K from tensorflow.python.keras._impl.keras import constraints from tensorflow.python.keras._impl.keras import initializers @@ -108,7 +109,7 @@ class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer): if training is None: training = K.learning_phase() output = super(BatchNormalization, self).call(inputs, training=training) - if training is K.learning_phase(): + if context.in_graph_mode() and training is K.learning_phase(): output._uses_learning_phase = True # pylint: disable=protected-access return output diff --git a/tensorflow/python/keras/_impl/keras/optimizers.py b/tensorflow/python/keras/_impl/keras/optimizers.py 
index e47987aadc4..a55a5e39a69 100644 --- a/tensorflow/python/keras/_impl/keras/optimizers.py +++ b/tensorflow/python/keras/_impl/keras/optimizers.py @@ -24,6 +24,7 @@ import copy import six from six.moves import zip # pylint: disable=redefined-builtin +from tensorflow.python.eager import context from tensorflow.python.framework import dtypes as dtypes_module from tensorflow.python.framework import ops from tensorflow.python.keras._impl.keras import backend as K @@ -680,7 +681,14 @@ class TFOptimizer(Optimizer): def __init__(self, optimizer): # pylint: disable=super-init-not-called self.optimizer = optimizer with K.name_scope(self.__class__.__name__): - self.iterations = K.variable(0, dtype='int64', name='iterations') + if context.in_graph_mode(): + self.iterations = K.variable(0, dtype='int64', name='iterations') + + def apply_gradients(self, grads): + self.optimizer.apply_gradients(grads) + + def get_grads(self, loss, params): + return self.optimizer.compute_gradients(loss, params) def get_updates(self, loss, params): grads = self.optimizer.compute_gradients(loss, params) diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index 8c1d16c2a8f..d4ceb2e489c 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -1043,6 +1043,7 @@ tf_py_test( "//tensorflow/python:training", "//tensorflow/python:util", "//tensorflow/python:variables", + "//tensorflow/python/eager:function", ], ) @@ -1293,7 +1294,7 @@ cuda_py_test( cuda_py_test( name = "control_flow_ops_py_test", - # TOOD(b/70473603): change this back to "small" once the C API is + # TODO(b/70473603): change this back to "small" once the C API is # permanently enabled size = "medium", srcs = ["control_flow_ops_py_test.py"], @@ -1600,6 +1601,19 @@ cuda_py_test( ], ) +cuda_py_test( + name = "manip_ops_test", + size = "small", + srcs = ["manip_ops_test.py"], + additional_deps = [ + "//third_party/py/numpy", + "//tensorflow/python:manip_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_for_generated_wrappers", + ], + tags = ["no_windows_gpu"], +) + cuda_py_test( name = "matmul_op_test", size = "small", @@ -2821,7 +2835,7 @@ tf_py_test( "//tensorflow/python:random_ops", "//tensorflow/python:variables", ], - shard_count = 3, + shard_count = 10, tags = ["no_windows_gpu"], ) diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py index a96b88d96ff..7cff3e227c7 100644 --- a/tensorflow/python/kernel_tests/array_ops_test.py +++ b/tensorflow/python/kernel_tests/array_ops_test.py @@ -952,6 +952,32 @@ class SliceAssignTest(test_util.TensorFlowTestCase): v = variables.Variable([1, 2]) sess.run(v[:].assign([1, 2])) + def testTypeError(self): + init_val = constant_op.constant([1, 2], dtype=dtypes.int32) + too_small_val = constant_op.constant([3, 4], dtype=dtypes.int8) + too_large_val = constant_op.constant([3, 4], dtype=dtypes.int64) + v = variables.Variable(init_val) + with self.assertRaises(TypeError): + v[:].assign(too_small_val) + with self.assertRaises(TypeError): + v[:].assign(too_large_val) + + def testTypeErrorResource(self): + init_val = constant_op.constant([1, 2], dtype=dtypes.int32) + too_small_val = constant_op.constant([3, 4], dtype=dtypes.int8) + too_large_val = constant_op.constant([3, 4], dtype=dtypes.int64) + v = resource_variable_ops.ResourceVariable(init_val) + with self.test_session() as sess: + sess.run(v.initializer) + with self.assertRaisesRegexp( + 
errors.InvalidArgumentError, + "l-value dtype int32 does not match r-value dtype int64"): + sess.run(v[:].assign(too_large_val)) + with self.assertRaisesRegexp( + errors.InvalidArgumentError, + "l-value dtype int32 does not match r-value dtype int8"): + sess.run(v[:].assign(too_small_val)) + class ShapeSizeRankTest(test_util.TensorFlowTestCase): @@ -1114,6 +1140,26 @@ class InvertPermutationTest(test_util.TensorFlowTestCase): self.assertAllEqual(y.get_shape(), [5]) self.assertAllEqual(y.eval(), [2, 4, 3, 0, 1]) +class UnravelIndexTest(test_util.TensorFlowTestCase): + + def testUnravelIndex(self): + with self.test_session(): + for dtype in [dtypes.int32, dtypes.int64]: + indices_1 = constant_op.constant(1621, dtype=dtype) + dims_1 = constant_op.constant([6, 7, 8, 9], dtype=dtype) + out_1 = array_ops.unravel_index(indices_1, dims_1) + self.assertAllEqual(out_1.eval(), [3, 1, 4, 1]) + + indices_2 = constant_op.constant([1621], dtype=dtype) + dims_2 = constant_op.constant([6, 7, 8, 9], dtype=dtype) + out_2 = array_ops.unravel_index(indices_2, dims_2) + self.assertAllEqual(out_2.eval(), [[3], [1], [4], [1]]) + + indices_3 = constant_op.constant([22, 41, 37], dtype=dtype) + dims_3 = constant_op.constant([7, 6], dtype=dtype) + out_3 = array_ops.unravel_index(indices_3, dims_3) + self.assertAllEqual(out_3.eval(), [[3, 6, 6], [4, 5, 1]]) + class GuaranteeConstOpTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/python/kernel_tests/atrous_convolution_test.py b/tensorflow/python/kernel_tests/atrous_convolution_test.py index 04248fb2bab..2d1b3d9b7e8 100644 --- a/tensorflow/python/kernel_tests/atrous_convolution_test.py +++ b/tensorflow/python/kernel_tests/atrous_convolution_test.py @@ -81,6 +81,7 @@ class AtrousConvolutionTest(test.TestCase): otherwise, it's delayed after the context. 
""" checks = [] + def add_check(check, *args, **kwargs): if context.in_eager_mode(): args_val, kwargs_val = self.evaluate([args, kwargs]) @@ -96,12 +97,12 @@ class AtrousConvolutionTest(test.TestCase): def _test_atrous_convolution(self, add_check, input_shape, filter_shape, dilation_rate, **kwargs): - filters = np.arange(np.prod(filter_shape), - dtype=np.float32).reshape(filter_shape) + filters = np.arange( + np.prod(filter_shape), dtype=np.float32).reshape(filter_shape) filters_upsampled = upsample_filters(filters, dilation_rate) x = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) - y1 = nn_ops.convolution(input=x, filter=filters, - dilation_rate=dilation_rate, **kwargs) + y1 = nn_ops.convolution( + input=x, filter=filters, dilation_rate=dilation_rate, **kwargs) y2 = nn_ops.convolution(input=x, filter=filters_upsampled, **kwargs) def check(y1_eval, y2_eval): @@ -112,13 +113,15 @@ class AtrousConvolutionTest(test.TestCase): def test_unknown_spatial_dims_for_channel_last_format(self): x = array_ops.placeholder(dtypes.float32, [1, None, None, 10]) w = array_ops.zeros([3, 3, 10, 20]) - y = nn_ops.convolution(x, w, "VALID", dilation_rate=[2, 2], data_format="NHWC") + y = nn_ops.convolution( + x, w, "VALID", dilation_rate=[2, 2], data_format="NHWC") self.assertEqual(y.shape.as_list(), [1, None, None, 20]) def test_unknown_spatial_dims_for_channel_first_format(self): x = array_ops.placeholder(dtypes.float32, [1, 10, None, None]) w = array_ops.zeros([3, 3, 10, 20]) - y = nn_ops.convolution(x, w, "VALID", dilation_rate=[2, 2], data_format="NCHW") + y = nn_ops.convolution( + x, w, "VALID", dilation_rate=[2, 2], data_format="NCHW") self.assertEqual(y.shape.as_list(), [1, 20, None, None]) @test_util.run_in_graph_and_eager_modes() @@ -215,28 +218,35 @@ class AtrousConvolutionTest(test.TestCase): def combined_op(converted_input, num_spatial_dims, padding_arg): # pylint: disable=unused-argument # pylint: disable=cell-var-from-loop - result = nn_ops.convolution(input=converted_input, filter=f1, - padding=padding) - result = nn_ops.convolution(input=result, filter=f2, - padding=padding) + result = nn_ops.convolution( + input=converted_input, filter=f1, padding=padding) + result = nn_ops.convolution( + input=result, filter=f2, padding=padding) # pylint: enable=cell-var-from-loop return result for rate_height in range(2, 4): for rate_width in range(2, 4): dilation_rate = [rate_height, rate_width] - y1 = nn_ops.convolution(input=x, filter=f1, padding=padding, - dilation_rate=dilation_rate) - y1 = nn_ops.convolution(input=y1, filter=f2, - padding=padding, - dilation_rate=dilation_rate) + y1 = nn_ops.convolution( + input=x, + filter=f1, + padding=padding, + dilation_rate=dilation_rate) + y1 = nn_ops.convolution( + input=y1, + filter=f2, + padding=padding, + dilation_rate=dilation_rate) y2 = nn_ops.with_space_to_batch( - input=x, dilation_rate=dilation_rate, op=combined_op, + input=x, + dilation_rate=dilation_rate, + op=combined_op, padding="VALID") def check(y1_eval, y2_eval): - self.assertAllClose(y1_eval, y2_eval, rtol=1e-2, - atol=1e-2) + self.assertAllClose(y1_eval, y2_eval, rtol=1e-2, atol=1e-2) + add_check(check, y1, y2) def _test_gradient(self, x_shape, f_shape, dilation_rate, padding): diff --git a/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py b/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py index 88b3f20469a..28b3dc45e9c 100644 --- a/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py +++ 
b/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py @@ -80,7 +80,7 @@ class RangeSamplerOpsTest(test.TestCase): with self.test_session(): true_classes = constant_op.constant( [[1, 2], [0, 4], [3, 3]], dtype=dtypes.int64) - _, _, sampled_expected_count = candidate_sampling_ops.all_candidate_sampler( + _, _, sampled_expected_count = candidate_sampling_ops.all_candidate_sampler( # pylint: disable=line-too-long true_classes, self.NUM_TRUE, self.NUM_SAMPLED, True) sampled_log_expected_count = math_ops.log(sampled_expected_count) result = sampled_log_expected_count.eval() diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py index 6e18ed132cd..4fafc36014e 100644 --- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py +++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py @@ -44,6 +44,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import data_flow_ops from tensorflow.python.ops import functional_ops from tensorflow.python.ops import gen_array_ops +from tensorflow.python.ops import gen_control_flow_ops from tensorflow.python.ops import gen_data_flow_ops from tensorflow.python.ops import gen_logging_ops from tensorflow.python.ops import gen_state_ops @@ -143,7 +144,7 @@ class ControlFlowTest(test.TestCase): enter_v = control_flow_ops._Enter(v, "foo_1", is_constant=True) nine = constant_op.constant(9) - enter_nine = control_flow_ops.enter(nine, "foo_1") + enter_nine = gen_control_flow_ops._enter(nine, "foo_1") op = state_ops.assign(enter_v, enter_nine) v2 = control_flow_ops.with_dependencies([op], enter_v) v3 = control_flow_ops.exit(v2) @@ -163,9 +164,9 @@ class ControlFlowTest(test.TestCase): def testEnterMulExit(self): with self.test_session(): data = constant_op.constant([1, 2, 3, 4, 5, 6], name="data") - enter_data = control_flow_ops.enter(data, "foo_1", False) + enter_data = gen_control_flow_ops._enter(data, "foo_1", False) five = constant_op.constant(5) - enter_five = control_flow_ops.enter(five, "foo_1", False) + enter_five = gen_control_flow_ops._enter(five, "foo_1", False) mul_op = math_ops.multiply(enter_data, enter_five) exit_op = control_flow_ops.exit(mul_op) @@ -177,12 +178,13 @@ class ControlFlowTest(test.TestCase): v = variables.Variable([0.0, 0.0], dtype=dtypes.float32) # If is_constant=True, the shape information should be propagated. - enter_v_constant = control_flow_ops.enter(v, "frame1", is_constant=True) + enter_v_constant = gen_control_flow_ops._enter( + v, "frame1", is_constant=True) self.assertEqual(enter_v_constant.shape, [2]) # Otherwise, the shape should be unknown. 
- enter_v_non_constant = control_flow_ops.enter(v, "frame2", - is_constant=False) + enter_v_non_constant = gen_control_flow_ops._enter( + v, "frame2", is_constant=False) self.assertEqual(enter_v_non_constant.shape, None) def testSwitchMergeIndexedSlices(self): @@ -255,8 +257,8 @@ class ControlFlowTest(test.TestCase): false = ops.convert_to_tensor(False) n = constant_op.constant(10) - enter_false = control_flow_ops.enter(false, "foo_1", False) - enter_n = control_flow_ops.enter(n, "foo_1", False) + enter_false = gen_control_flow_ops._enter(false, "foo_1", False) + enter_n = gen_control_flow_ops._enter(n, "foo_1", False) merge_n = control_flow_ops.merge([enter_n, enter_n], name="merge_n")[0] switch_n = control_flow_ops.switch(merge_n, enter_false) @@ -273,9 +275,9 @@ class ControlFlowTest(test.TestCase): one = constant_op.constant(1) n = constant_op.constant(10) - enter_i = control_flow_ops.enter(zero, "foo", False) - enter_one = control_flow_ops.enter(one, "foo", True) - enter_n = control_flow_ops.enter(n, "foo", True) + enter_i = gen_control_flow_ops._enter(zero, "foo", False) + enter_one = gen_control_flow_ops._enter(one, "foo", True) + enter_n = gen_control_flow_ops._enter(n, "foo", True) with ops.device(test.gpu_device_name()): merge_i = control_flow_ops.merge([enter_i, enter_i])[0] @@ -299,9 +301,9 @@ class ControlFlowTest(test.TestCase): one = constant_op.constant(1) n = constant_op.constant(10) - enter_i = control_flow_ops.enter(zero, "foo", False) - enter_one = control_flow_ops.enter(one, "foo", True) - enter_n = control_flow_ops.enter(n, "foo", True) + enter_i = gen_control_flow_ops._enter(zero, "foo", False) + enter_one = gen_control_flow_ops._enter(one, "foo", True) + enter_n = gen_control_flow_ops._enter(n, "foo", True) merge_i = control_flow_ops.merge([enter_i, enter_i])[0] @@ -322,8 +324,8 @@ class ControlFlowTest(test.TestCase): def testDifferentFrame(self): with self.test_session(): data = array_ops.placeholder(dtypes.float32, shape=[]) - enter_1 = control_flow_ops.enter(data, "foo_1", False) - enter_2 = control_flow_ops.enter(data, "foo_2", False) + enter_1 = gen_control_flow_ops._enter(data, "foo_1", False) + enter_2 = gen_control_flow_ops._enter(data, "foo_2", False) res = math_ops.add(enter_1, enter_2) with self.assertRaisesOpError("has inputs from different frames"): res.eval(feed_dict={data: 1.0}) @@ -736,24 +738,21 @@ class ControlFlowTest(test.TestCase): with self.test_session(): s = constant_op.constant([1, 2, 3, 4, 5]) r = isum(s, maximum_iterations=3) - self.assertAllEqual([1+3, 2+3, 3+3, 4+3, 5+3], r.eval()) + self.assertAllEqual([1 + 3, 2 + 3, 3 + 3, 4 + 3, 5 + 3], r.eval()) def testWhileWithMaximumIterationsAndSingleArgument(self): with self.test_session(): r = control_flow_ops.while_loop( - lambda i: i < 3, - lambda i: i + 1, - [0], - maximum_iterations=1) + lambda i: i < 3, lambda i: i + 1, [0], maximum_iterations=1) self.assertEqual(1, r.eval()) def testSingleNestedMaximumIterationsWhileLoopGradientInXLAContext(self): v = constant_op.constant(1.0) + def training_loop_with_gradient(i): out = control_flow_ops.while_loop( lambda i_, _: i_ < 3, - lambda i_, j: [i_ + 1, j * v], - [0, 1.0], + lambda i_, j: [i_ + 1, j * v], [0, 1.0], maximum_iterations=i) g = gradients_impl.gradients(out, v) with ops.control_dependencies(g): @@ -763,8 +762,8 @@ class ControlFlowTest(test.TestCase): xla_context.Enter() # Create training loop, ensure we can call gradient() of # while_loop inside the training loop. 
- loop = control_flow_ops.while_loop( - lambda i: i < 3, training_loop_with_gradient, [0]) + loop = control_flow_ops.while_loop(lambda i: i < 3, + training_loop_with_gradient, [0]) xla_context.Exit() loop_execute = array_ops.identity(loop) # Because loop is not fetchable. @@ -774,17 +773,18 @@ class ControlFlowTest(test.TestCase): def testInvalidMaximumIterationsWhileLoopGradientInXLAContext(self): v = constant_op.constant(1.0) + def inner_body(i, x): out = control_flow_ops.while_loop( lambda i, _: i < 3, - lambda i, j: [i + 1, j * v], - [0, x], + lambda i, j: [i + 1, j * v], [0, x], maximum_iterations=i) return out def create_while_loop(maximum_iterations=None): return control_flow_ops.while_loop( - lambda i, _: i < 3, inner_body, [0, 1.0], + lambda i, _: i < 3, + inner_body, [0, 1.0], maximum_iterations=maximum_iterations) loop_no_xla = create_while_loop(maximum_iterations=5) @@ -819,14 +819,17 @@ class ControlFlowTest(test.TestCase): def create_while_loop(): max_iter_holder = [] + def create_mi(): max_iter_holder.append(array_ops.placeholder(dtypes.int32, shape=())) return 1.0 - _ = control_flow_ops.cond(constant_op.constant(True), - create_mi, create_mi) + + _ = control_flow_ops.cond( + constant_op.constant(True), create_mi, create_mi) return control_flow_ops.while_loop( - lambda i, _: i < 3, lambda i, x: (i + 1, v * x), (0, 1.0), + lambda i, _: i < 3, + lambda i, x: (i + 1, v * x), (0, 1.0), maximum_iterations=max_iter_holder[0]) xla_context = control_flow_ops.XLAControlFlowContext() @@ -849,28 +852,32 @@ class ControlFlowTest(test.TestCase): p = array_ops.placeholder(dtype=dtypes.int32) def mid_body_builder(iterations): + def mid_body(i, x): r = control_flow_ops.while_loop( lambda *_: True, - lambda i, x: (i + 1, v * x), - (0, x), - maximum_iterations=iterations, name="inner") + lambda i, x: (i + 1, v * x), (0, x), + maximum_iterations=iterations, + name="inner") return (i + 1, gradients_impl.gradients(x + r[1], v)[0]) + return mid_body def outer_body(i, x): iterations = array_ops.size(p, name="iterations") - return ( - i + 1, - x + control_flow_ops.while_loop( - lambda *_: True, mid_body_builder(iterations), (0, x), - maximum_iterations=iterations, name="mid")[1]) + return (i + 1, x + control_flow_ops.while_loop( + lambda *_: True, + mid_body_builder(iterations), (0, x), + maximum_iterations=iterations, + name="mid")[1]) def create_while_loop(): with ops.device("/cpu:0"): r = control_flow_ops.while_loop( - lambda *_: True, outer_body, (0, 1.0), - maximum_iterations=5, name="outer") + lambda *_: True, + outer_body, (0, 1.0), + maximum_iterations=5, + name="outer") return array_ops.identity(r[1]) xla_context = control_flow_ops.XLAControlFlowContext() @@ -881,18 +888,19 @@ class ControlFlowTest(test.TestCase): final_without_xla_context = create_while_loop() with self.test_session(use_gpu=False) as sess: - opts = config_pb2.RunOptions( - trace_level=config_pb2.RunOptions.FULL_TRACE) + opts = config_pb2.RunOptions(trace_level=config_pb2.RunOptions.FULL_TRACE) run_metadata = config_pb2.RunMetadata() final_value_without_xla_context = sess.run( - final_without_xla_context, - feed_dict={p: [0, 0, 0]}) + final_without_xla_context, feed_dict={ + p: [0, 0, 0] + }) final_value_with_xla_context = sess.run( final_with_xla_context, feed_dict={p: [0, 0, 0]}, - options=opts, run_metadata=run_metadata) + options=opts, + run_metadata=run_metadata) node_stats = run_metadata.step_stats.dev_stats[0].node_stats stack_push_count = len( @@ -901,8 +909,8 @@ class ControlFlowTest(test.TestCase): # the last two 
"3"s comes from size(p), when p == [0, 0, 0]. self.assertEqual(stack_push_count, 5 * 3 * 3) - self.assertAllClose( - final_value_with_xla_context, final_value_without_xla_context) + self.assertAllClose(final_value_with_xla_context, + final_value_without_xla_context) # Have more than 10 parallel iterations and hence exercise k-bound # most of the time. @@ -951,8 +959,7 @@ class ControlFlowTest(test.TestCase): with self.test_session(): def compute(i, c, o): - c = array_ops.strided_slice(x, - array_ops.expand_dims(i, 0), + c = array_ops.strided_slice(x, array_ops.expand_dims(i, 0), [1] + array_ops.expand_dims(i, 0)) o = array_ops.concat([o, c], 0) i = math_ops.add(i, 1) @@ -963,11 +970,12 @@ class ControlFlowTest(test.TestCase): o = ops.convert_to_tensor([0]) x = ops.convert_to_tensor([1, 2, 3, 4, 5, 6]) s = array_ops.size(x) - r = control_flow_ops.while_loop( - lambda i, c, o: math_ops.less(i, s), compute, [i, c, o], [ - i.get_shape(), tensor_shape.unknown_shape(), - tensor_shape.unknown_shape() - ]) + r = control_flow_ops.while_loop(lambda i, c, o: math_ops.less(i, s), + compute, [i, c, o], [ + i.get_shape(), + tensor_shape.unknown_shape(), + tensor_shape.unknown_shape() + ]) result = r[2].eval() self.assertAllEqual(np.array([0, 1, 2, 3, 4, 5, 6]), result) @@ -1033,7 +1041,8 @@ class ControlFlowTest(test.TestCase): return [new_i, new_j] r = control_flow_ops.while_loop( - c, _b, [i, m], [i.get_shape(), tensor_shape.unknown_shape()]) + c, _b, [i, m], + [i.get_shape(), tensor_shape.unknown_shape()]) r = r[1] * array_ops.ones([8, 8]) self.assertAllEqual(np.ones((8, 8)), r.eval()) @@ -1065,7 +1074,8 @@ class ControlFlowTest(test.TestCase): return [new_i, new_j] r = control_flow_ops.while_loop( - c, b, [i, m], [i.get_shape(), tensor_shape.TensorShape([None, 2])]) + c, b, [i, m], + [i.get_shape(), tensor_shape.TensorShape([None, 2])]) self.assertTrue(r[1].get_shape()[0].value is None) self.assertEqual(r[1].get_shape()[1], tensor_shape.Dimension(2)) @@ -1092,20 +1102,22 @@ class ControlFlowTest(test.TestCase): def b(i, x): return [ - i + 1, sparse_tensor.SparseTensor(x.indices, x.values * 2.0, - x.dense_shape) + i + 1, + sparse_tensor.SparseTensor(x.indices, x.values * 2.0, x.dense_shape) ] _, r = control_flow_ops.while_loop(c, b, [i, x]) self.assertEqual(r.dense_shape.get_shape()[0].value, 1) _, r = control_flow_ops.while_loop( - c, b, [i, x], [i.get_shape(), tensor_shape.TensorShape([None])]) + c, b, [i, x], + [i.get_shape(), tensor_shape.TensorShape([None])]) self.assertTrue(r.dense_shape.get_shape()[0].value is None) with self.assertRaisesRegexp(ValueError, "is not compatible with"): _, r = control_flow_ops.while_loop( - c, b, [i, x], [i.get_shape(), tensor_shape.TensorShape([5])]) + c, b, [i, x], + [i.get_shape(), tensor_shape.TensorShape([5])]) def testWhileShapeInferenceIndexedSlices(self): with self.test_session(): @@ -1120,7 +1132,8 @@ class ControlFlowTest(test.TestCase): def b(i, x): return [ - i + 1, ops.IndexedSlices(x.values * 2.0, x.indices, x.dense_shape) + i + 1, + ops.IndexedSlices(x.values * 2.0, x.indices, x.dense_shape) ] _, r = control_flow_ops.while_loop(c, b, [i, x]) @@ -1128,14 +1141,16 @@ class ControlFlowTest(test.TestCase): self.assertEqual(r.values.get_shape(), tensor_shape.TensorShape([2, 2])) _, r = control_flow_ops.while_loop( - c, b, [i, x], [i.get_shape(), tensor_shape.TensorShape([None, 2])]) + c, b, [i, x], + [i.get_shape(), tensor_shape.TensorShape([None, 2])]) self.assertEqual(r.dense_shape.get_shape()[0].value, 2) self.assertTrue(r.values.get_shape()[0].value 
is None) self.assertEqual(r.values.get_shape()[1].value, 2) with self.assertRaisesRegexp(ValueError, "is not compatible with"): _, r = control_flow_ops.while_loop( - c, b, [i, x], [i.get_shape(), tensor_shape.TensorShape([None, 5])]) + c, b, [i, x], + [i.get_shape(), tensor_shape.TensorShape([None, 5])]) def _testNestedWhile_1(self, use_gpu): with self.test_session(use_gpu=use_gpu): @@ -1276,16 +1291,17 @@ class ControlFlowTest(test.TestCase): "v", [], initializer=init_ops.constant_initializer(2)) i0 = constant_op.constant(0) with ops.control_dependencies([i0]): + def loop_condition(i): return i < 4 def loop_body(i): some_cond = control_flow_ops.cond( constant_op.constant(True), - lambda: state_ops.assign(v, math_ops.square(v)), - lambda: v) + lambda: state_ops.assign(v, math_ops.square(v)), lambda: v) with ops.control_dependencies([some_cond]): return i + 1 + r = control_flow_ops.while_loop(loop_condition, loop_body, (i0,)) variables.global_variables_initializer().run() self.assertEqual(4, r.eval()) @@ -1600,7 +1616,8 @@ class ControlFlowTest(test.TestCase): _, rx = control_flow_ops.while_loop( c1, - b1, [r, x], [r.get_shape(), tensor_shape.unknown_shape()], + b1, [r, x], + [r.get_shape(), tensor_shape.unknown_shape()], parallel_iterations=1) self.assertEqual(45, rx.eval()) @@ -1663,7 +1680,8 @@ class ControlFlowTest(test.TestCase): b = lambda i, v: [i + 1, math_ops.multiply(x, v)] r = control_flow_ops.while_loop( c, - b, [n, v], [n.get_shape(), tensor_shape.unknown_shape()], + b, [n, v], + [n.get_shape(), tensor_shape.unknown_shape()], parallel_iterations=1) r = gradients_impl.gradients(r[1], x)[0] @@ -1797,8 +1815,8 @@ class ControlFlowTest(test.TestCase): named = collections.namedtuple("named", ("a", "b")) loop_vars = [ named(a=constant_op.constant(0.0), b=constant_op.constant(1.0)), - (constant_op.constant(2.0), - constant_op.constant(3.0)), constant_op.constant(4.0) + (constant_op.constant(2.0), constant_op.constant(3.0)), + constant_op.constant(4.0) ] c = lambda lv0, _1, _2: lv0.a < 100.0 @@ -1824,8 +1842,8 @@ class ControlFlowTest(test.TestCase): named = collections.namedtuple("named", ("a", "b")) loop_vars = [ named(a=constant_op.constant(0.0), b=constant_op.constant(1.0)), - (constant_op.constant(2.0), - constant_op.constant(3.0)), constant_op.constant(4.0) + (constant_op.constant(2.0), constant_op.constant(3.0)), + constant_op.constant(4.0) ] c = lambda lv0, _1, _2: lv0.a < 100.0 @@ -2176,7 +2194,8 @@ class ControlFlowTest(test.TestCase): def b(i, x): return [ - i + 1, ops.IndexedSlices(x.values * 2.0, x.indices, x.dense_shape) + i + 1, + ops.IndexedSlices(x.values * 2.0, x.indices, x.dense_shape) ] _, r = control_flow_ops.while_loop(c, b, [i, x]) @@ -2197,8 +2216,8 @@ class ControlFlowTest(test.TestCase): def b(i, x): return [ - i + 1, sparse_tensor.SparseTensor(x.indices, x.values * 2.0, - x.dense_shape) + i + 1, + sparse_tensor.SparseTensor(x.indices, x.values * 2.0, x.dense_shape) ] _, r = control_flow_ops.while_loop(c, b, [i, x]) @@ -2220,8 +2239,8 @@ class ControlFlowTest(test.TestCase): x1 = x + gradients_impl.gradients(data, params)[0] return i + 1, x1 - output_grad = control_flow_ops.while_loop(c, b, - [i0, constant_op.constant(0.0)]) + output_grad = control_flow_ops.while_loop( + c, b, [i0, constant_op.constant(0.0)]) self.assertAllClose(600.0, sess.run(output_grad)[1]) def testWhileAndTensorArray(self): @@ -2359,9 +2378,12 @@ class ControlFlowTest(test.TestCase): def testStopGradMultiFlows(self): with self.test_session(): + def body(i, y, r): x = 
variable_scope.get_variable( - "x", shape=(), dtype=dtypes.float32, + "x", + shape=(), + dtype=dtypes.float32, initializer=init_ops.ones_initializer()) y *= x return [i + 1, y, r + math_ops.reduce_sum(y)] @@ -2773,7 +2795,8 @@ class ControlFlowTest(test.TestCase): r = control_flow_ops.while_loop( lambda i, v: i < 2, lambda i, v: [i + 1, func(v)], [constant_op.constant(0), x], - [tensor_shape.unknown_shape(), tensor_shape.unknown_shape()]) + [tensor_shape.unknown_shape(), + tensor_shape.unknown_shape()]) self.assertEqual(r[1].eval(), 65536.0) r = gradients_impl.gradients(r, x)[0] @@ -2800,12 +2823,14 @@ class ControlFlowContextCheckTest(test.TestCase): def _getCondTensor(self): cond_tensor = [] + def true_fn(): if not cond_tensor: cond_tensor.append(constant_op.constant(1)) return cond_tensor[0] - control_flow_ops.cond(math_ops.less(1, 2), true_fn, - lambda: constant_op.constant(0)) + + control_flow_ops.cond( + math_ops.less(1, 2), true_fn, lambda: constant_op.constant(0)) return cond_tensor[0] def testInvalidContext(self): @@ -2821,14 +2846,13 @@ class ControlFlowContextCheckTest(test.TestCase): # Accessing a while loop tensor in cond is illegal. while_tensor = self._getWhileTensor() with self.assertRaisesRegexp( - ValueError, - "Cannot use 'while/Const_1' as input to 'cond/Add' because " + ValueError, "Cannot use 'while/Const_1' as input to 'cond/Add' because " "'while/Const_1' is in a while loop. See info log for more details."): # TODO(skyewm): this passes if we return while_tensor directly instead # of using it as input to another op. - control_flow_ops.cond(math_ops.less(1, 2), - lambda: math_ops.add(1, while_tensor), - lambda: constant_op.constant(0)) + control_flow_ops.cond( + math_ops.less(1, 2), lambda: math_ops.add(1, while_tensor), + lambda: constant_op.constant(0)) def testInvalidContextInWhile(self): # Accessing a while loop tensor in a different while loop is illegal. @@ -2856,6 +2880,7 @@ class ControlFlowContextCheckTest(test.TestCase): # Accessing a tensor from a cond context from the other branch's cond # context is OK (although dangerous). cond_tensor = [] + def branch_fn(): if not cond_tensor: cond_tensor.append(constant_op.constant(1)) @@ -2892,12 +2917,13 @@ class ControlFlowContextCheckTest(test.TestCase): while_tensor = self._getWhileTensor() return control_flow_ops.while_loop(lambda i: i < 3, lambda i: i + while_tensor, [0]) + with self.assertRaisesRegexp( ValueError, "Cannot use 'cond/while_1/add' as input to 'cond/while/Const_1' because" " they are in different while loops. See info log for more details."): - control_flow_ops.cond(math_ops.less(1, 2), true_fn, - lambda: constant_op.constant(0)) + control_flow_ops.cond( + math_ops.less(1, 2), true_fn, lambda: constant_op.constant(0)) @test_util.with_c_api @@ -3005,11 +3031,13 @@ class AssertTest(test.TestCase): sess.run(unguarded_assert, options=opts, run_metadata=unguarded_metadata) guarded_nodestat_names = [ n.node_name - for d in guarded_metadata.step_stats.dev_stats for n in d.node_stats + for d in guarded_metadata.step_stats.dev_stats + for n in d.node_stats ] unguarded_nodestat_names = [ n.node_name - for d in unguarded_metadata.step_stats.dev_stats for n in d.node_stats + for d in unguarded_metadata.step_stats.dev_stats + for n in d.node_stats ] guarded_memcpy_nodestat_names = [ n for n in guarded_nodestat_names if "MEMCPYDtoH" in n @@ -3066,6 +3094,7 @@ class WhileOpBenchmark(test.Benchmark): Returns: The duration of the run in seconds. 
""" + def loop_body(i, x): with ops.device("/gpu:0"): # Always put loop body on GPU. @@ -3107,7 +3136,7 @@ class WhileOpBenchmark(test.Benchmark): start_time = time.time() for _ in xrange(num_iters): sess.run(r) - return (time.time() - start_time)/num_iters + return (time.time() - start_time) / num_iters def benchmarkWhileOpCrossDevicePlacement(self): iters = 10 @@ -3154,23 +3183,20 @@ class EagerTest(test.TestCase): def testWhileLoop(self): with context.eager_mode(): tensor = constant_op.constant([1, 2, 3, 4, 5]) - self.assertAllEqual(isum(tensor).numpy(), - [46, 47, 48, 49, 50]) + self.assertAllEqual(isum(tensor).numpy(), [46, 47, 48, 49, 50]) def testWhileLoopWithMaxIterations(self): with context.eager_mode(): tensor = constant_op.constant([1, 2, 3, 4, 5]) - self.assertAllEqual(isum(tensor, maximum_iterations=3).numpy(), - [1+3, 2+3, 3+3, 4+3, 5+3]) + self.assertAllEqual( + isum(tensor, maximum_iterations=3).numpy(), + [1 + 3, 2 + 3, 3 + 3, 4 + 3, 5 + 3]) def testWhileWithMaximumIterationsAndSingleArgument(self): with context.eager_mode(): tensor = constant_op.constant(0) r = control_flow_ops.while_loop( - lambda i: i < 3, - lambda i: i + 1, - [tensor], - maximum_iterations=1) + lambda i: i < 3, lambda i: i + 1, [tensor], maximum_iterations=1) self.assertEqual(1, r.numpy()) def testWithDependencies(self): @@ -3197,8 +3223,8 @@ class EagerTest(test.TestCase): f2 = lambda: constant_op.constant(23) f3 = lambda: constant_op.constant(-1) - r1 = control_flow_ops.case([(x < y, f1), (x > z, f2)], - default=f3, exclusive=True) + r1 = control_flow_ops.case( + [(x < y, f1), (x > z, f2)], default=f3, exclusive=True) self.assertAllEqual(r1.numpy(), 17) diff --git a/tensorflow/python/kernel_tests/control_flow_util_test.py b/tensorflow/python/kernel_tests/control_flow_util_test.py index 39e96f74b04..23185eaeece 100644 --- a/tensorflow/python/kernel_tests/control_flow_util_test.py +++ b/tensorflow/python/kernel_tests/control_flow_util_test.py @@ -41,17 +41,17 @@ class ControlFlowUtilTest(test.TestCase): self.assertFalse(control_flow_util.IsSwitch(test_ops.int_output().op)) def testIsLoopEnter(self): - enter = gen_control_flow_ops.enter(1, frame_name="name").op + enter = gen_control_flow_ops._enter(1, frame_name="name").op self.assertTrue(control_flow_util.IsLoopEnter(enter)) self.assertFalse(control_flow_util.IsLoopConstantEnter(enter)) - ref_enter = gen_control_flow_ops.ref_enter(test_ops.ref_output(), - frame_name="name").op + ref_enter = gen_control_flow_ops._ref_enter(test_ops.ref_output(), + frame_name="name").op self.assertTrue(control_flow_util.IsLoopEnter(ref_enter)) self.assertFalse(control_flow_util.IsLoopConstantEnter(ref_enter)) - const_enter = gen_control_flow_ops.enter(1, frame_name="name", - is_constant=True).op + const_enter = gen_control_flow_ops._enter(1, frame_name="name", + is_constant=True).op self.assertTrue(control_flow_util.IsLoopEnter(const_enter)) self.assertTrue(control_flow_util.IsLoopConstantEnter(const_enter)) diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py index 3e9bd3dade6..c5446326ba1 100644 --- a/tensorflow/python/kernel_tests/conv_ops_test.py +++ b/tensorflow/python/kernel_tests/conv_ops_test.py @@ -24,6 +24,7 @@ import time import numpy as np +from six.moves import xrange from tensorflow.contrib import layers from tensorflow.python.client import session as session_lib from tensorflow.python.framework import constant_op diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py 
b/tensorflow/python/kernel_tests/cwise_ops_test.py index a91917b27fa..0d9b46c30db 100644 --- a/tensorflow/python/kernel_tests/cwise_ops_test.py +++ b/tensorflow/python/kernel_tests/cwise_ops_test.py @@ -71,6 +71,7 @@ def _sparsify(x, thresh=0.5, index_dtype=np.int64): return sparse_tensor.SparseTensor( indices=x_indices, values=x_values, dense_shape=x_shape), x_values + def _default_tolerance(dtype): """Returns a sensible default tolerance for comparing results of a given type""" @@ -81,7 +82,7 @@ def _default_tolerance(dtype): elif dtype in (np.float64, np.complex128): return 1e-5 else: - return None # Fail fast for unexpected types + return None # Fail fast for unexpected types class UnaryOpTest(test.TestCase): @@ -233,10 +234,10 @@ class UnaryOpTest(test.TestCase): self._compareBoth(k, np.arccos, math_ops.acos) self._compareBoth(x, np.arctan, math_ops.atan) self._compareBoth(x, np.tan, math_ops.tan) - self._compareBoth( - y, - np.vectorize(self._replace_domain_error_with_inf(math.lgamma)), - math_ops.lgamma) + self._compareBoth(y, + np.vectorize( + self._replace_domain_error_with_inf(math.lgamma)), + math_ops.lgamma) self._compareBoth(x, np.vectorize(math.erf), math_ops.erf) self._compareBoth(x, np.vectorize(math.erfc), math_ops.erfc) @@ -298,8 +299,8 @@ class UnaryOpTest(test.TestCase): w = x - x.min() + 1.02 # all greater than 1 y = (x + .5).astype(np.float64) # no zero z = (x + 15.5).astype(np.float64) # all positive - k = np.arange(-0.90, 0.90, 0.35).reshape(1, 3, 2).astype( - np.float64) # between -1 and 1 + k = np.arange(-0.90, 0.90, + 0.35).reshape(1, 3, 2).astype(np.float64) # between -1 and 1 self._compareBoth(x, np.abs, math_ops.abs) self._compareBoth(x, np.abs, _ABS) self._compareBoth(x, np.negative, math_ops.negative) @@ -322,10 +323,10 @@ class UnaryOpTest(test.TestCase): self._compareBoth(y, np.sign, math_ops.sign) self._compareBoth(x, np.sin, math_ops.sin) self._compareBoth(x, np.cos, math_ops.cos) - self._compareBoth( - y, - np.vectorize(self._replace_domain_error_with_inf(math.lgamma)), - math_ops.lgamma) + self._compareBoth(y, + np.vectorize( + self._replace_domain_error_with_inf(math.lgamma)), + math_ops.lgamma) self._compareBoth(x, np.vectorize(math.erf), math_ops.erf) self._compareBoth(x, np.vectorize(math.erfc), math_ops.erfc) self._compareBoth(x, np.arctan, math_ops.atan) @@ -362,10 +363,10 @@ class UnaryOpTest(test.TestCase): self._compareBoth(y, np.sign, math_ops.sign) self._compareBoth(x, np.sin, math_ops.sin) self._compareBoth(x, np.cos, math_ops.cos) - self._compareBoth( - y, - np.vectorize(self._replace_domain_error_with_inf(math.lgamma)), - math_ops.lgamma) + self._compareBoth(y, + np.vectorize( + self._replace_domain_error_with_inf(math.lgamma)), + math_ops.lgamma) self._compareBoth(x, np.vectorize(math.erf), math_ops.erf) self._compareBoth(x, np.vectorize(math.erfc), math_ops.erfc) @@ -406,8 +407,8 @@ class UnaryOpTest(test.TestCase): self._compareBothSparse(x, np.sign, math_ops.sign) def testComplex64Basic(self): - x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3, - 2).astype(np.complex64) + x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3, 2).astype( + np.complex64) y = x + np.complex(0.5, 0.5) # no zeros self._compareBoth(x, np.abs, math_ops.abs) self._compareBoth(x, np.abs, _ABS) @@ -450,8 +451,8 @@ class UnaryOpTest(test.TestCase): self._compareBothSparse(y, complex_sign, math_ops.sign) def testComplex128Basic(self): - x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3, - 2).astype(np.complex128) + x = np.complex(1, 1) * np.arange(-3, 
3).reshape(1, 3, 2).astype( + np.complex128) y = x + np.complex(0.5, 0.5) # no zeros self._compareBoth(x, np.abs, math_ops.abs) self._compareBoth(x, np.abs, _ABS) @@ -805,10 +806,10 @@ class BinaryOpTest(test.TestCase): self._compareBoth(x, y, np.mod, _MOD) def testComplex64Basic(self): - x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape( - 1, 3, 2).astype(np.complex64) - y = np.complex(1, 1) * np.linspace(20, -20, 6).reshape( - 1, 3, 2).astype(np.complex64) + x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape(1, 3, 2).astype( + np.complex64) + y = np.complex(1, 1) * np.linspace(20, -20, 6).reshape(1, 3, 2).astype( + np.complex64) self._compareBoth(x, y, np.add, math_ops.add) self._compareBoth(x, y, np.subtract, math_ops.subtract) self._compareBoth(x, y, np.multiply, math_ops.multiply) @@ -819,10 +820,10 @@ class BinaryOpTest(test.TestCase): self._compareBoth(x, y + 0.1, np.true_divide, _TRUEDIV) def testComplex128Basic(self): - x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape( - 1, 3, 2).astype(np.complex128) - y = np.complex(1, 1) * np.linspace(20, -20, 6).reshape( - 1, 3, 2).astype(np.complex128) + x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape(1, 3, 2).astype( + np.complex128) + y = np.complex(1, 1) * np.linspace(20, -20, 6).reshape(1, 3, 2).astype( + np.complex128) self._compareBoth(x, y, np.add, math_ops.add) self._compareBoth(x, y, np.subtract, math_ops.subtract) self._compareBoth(x, y, np.multiply, math_ops.multiply) @@ -1127,8 +1128,8 @@ class BinaryOpTest(test.TestCase): def testMismatchedDimensions(self): for func in [ - math_ops.add, math_ops.subtract, math_ops.multiply, math_ops.div, - _ADD, _SUB, _MUL, _TRUEDIV, _FLOORDIV + math_ops.add, math_ops.subtract, math_ops.multiply, math_ops.div, _ADD, + _SUB, _MUL, _TRUEDIV, _FLOORDIV ]: with self.assertRaisesWithPredicateMatch( ValueError, lambda e: "Dimensions must" in str(e)): @@ -1161,8 +1162,8 @@ class BinaryOpTest(test.TestCase): (1.2345, float("inf")), (1.2345, -float("inf")), (-4.321, float("inf")), (-4.125, -float("inf")), (float("inf"), float("inf")), (float("inf"), -float("inf")), - (-float("inf"), float("inf")), (-float("inf"), - -float("inf"))) + (-float("inf"), float("inf")), + (-float("inf"), -float("inf"))) for dtype in np.float32, np.float64: x1 = np.array(x1l).astype(dtype) x2 = np.array(x2l).astype(dtype) @@ -1213,22 +1214,22 @@ class ComparisonOpTest(test.TestCase): for x in data: for y in data: self.assertEqual(self._compareScalar(math_ops.less, x, y, t), x < y) - self.assertEqual(self._compareScalar(math_ops.less_equal, x, y, t), - x <= y) - self.assertEqual(self._compareScalar(math_ops.greater, x, y, t), - x > y) + self.assertEqual( + self._compareScalar(math_ops.less_equal, x, y, t), x <= y) + self.assertEqual( + self._compareScalar(math_ops.greater, x, y, t), x > y) self.assertEqual( self._compareScalar(math_ops.greater_equal, x, y, t), x >= y) self.assertEqual(self._compareScalar(math_ops.equal, x, y, t), x == y) - self.assertEqual(self._compareScalar(math_ops.not_equal, x, y, t), - x != y) + self.assertEqual( + self._compareScalar(math_ops.not_equal, x, y, t), x != y) data = [-1, 0, 1, -1j, 1j, 1 + 1j, 1 - 1j] for t in [np.complex64, np.complex128]: for x in data: for y in data: self.assertEqual(self._compareScalar(math_ops.equal, x, y, t), x == y) - self.assertEqual(self._compareScalar(math_ops.not_equal, x, y, t), - x != y) + self.assertEqual( + self._compareScalar(math_ops.not_equal, x, y, t), x != y) def _compare(self, x, y, np_func, tf_func): np_ans = np_func(x, y) @@ -1311,8 
+1312,8 @@ class ComparisonOpTest(test.TestCase): self._testBCastByFunc(np.equal, math_ops.equal, include_complex=True) def testBCastNotEqual(self): - self._testBCastByFunc(np.not_equal, math_ops.not_equal, - include_complex=True) + self._testBCastByFunc( + np.not_equal, math_ops.not_equal, include_complex=True) def testShapeMismatch(self): dtypes = [np.float16, np.float32, np.float64, np.int32, np.int64] @@ -1771,9 +1772,8 @@ class MathOpsOverloadTest(test.TestCase): def _compareUnary(self, x, dtype, np_func, tf_func): np_ans = np_func(x).astype(dtype.as_numpy_dtype) with self.test_session(use_gpu=False): - self.assertAllClose( - np_ans, tf_func(ops.convert_to_tensor( - x, dtype=dtype)).eval()) + self.assertAllClose(np_ans, + tf_func(ops.convert_to_tensor(x, dtype=dtype)).eval()) def testOverload(self): dtypes = [ @@ -1795,8 +1795,8 @@ class MathOpsOverloadTest(test.TestCase): ] for dtype in dtypes: for np_func, tf_func in funcs: - if dtype in (dtypes_lib.complex64, dtypes_lib.complex128 - ) and tf_func == _FLOORDIV: + if dtype in (dtypes_lib.complex64, + dtypes_lib.complex128) and tf_func == _FLOORDIV: continue # floordiv makes no sense for complex self._compareBinary(10, 5, dtype, np_func, tf_func) # Mod only works for int32 and int64. @@ -2008,7 +2008,8 @@ class ComplexMakeRealImagTest(test.TestCase): # self._compareAngle(cplx, use_gpu=True) def testRealReal(self): - for dtype in dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float32, dtypes_lib.float64: + for dtype in (dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float32, + dtypes_lib.float64): x = array_ops.placeholder(dtype) y = math_ops.real(x) self.assertEqual(x, y) @@ -2037,15 +2038,16 @@ class ComplexMakeRealImagTest(test.TestCase): self._compareConj(cplx, use_gpu=True) def testConjReal(self): - for dtype in dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float16, dtypes_lib.float32, dtypes_lib.float64: + for dtype in (dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float16, + dtypes_lib.float32, dtypes_lib.float64): x = array_ops.placeholder(dtype) y = math_ops.conj(x) self.assertEqual(x, y) def testConjString(self): x = array_ops.placeholder(dtypes_lib.string) - with self.assertRaisesRegexp( - TypeError, r"Expected numeric or variant tensor"): + with self.assertRaisesRegexp(TypeError, + r"Expected numeric or variant tensor"): math_ops.conj(x) def _compareGradient(self, x): @@ -2060,8 +2062,9 @@ class ComplexMakeRealImagTest(test.TestCase): real, imag = array_ops.reshape(real, [-1]), array_ops.reshape(imag, [-1]) cplx = math_ops.complex(real, imag) cplx = math_ops.conj(cplx) - loss = math_ops.reduce_sum(math_ops.square(math_ops.real( - cplx))) + math_ops.reduce_sum(math_ops.square(math_ops.imag(cplx))) + loss = math_ops.reduce_sum(math_ops.square( + math_ops.real(cplx))) + math_ops.reduce_sum( + math_ops.square(math_ops.imag(cplx))) epsilon = 1e-3 jacob_t, jacob_n = gradient_checker.compute_gradient( inx, list(x.shape), loss, [1], x_init_value=x, delta=epsilon) @@ -2125,8 +2128,8 @@ class AccumulateTest(test.TestCase): np.random.rand(16, 16, 16, 16).astype(np.float32) for _ in range(20) ] random_tensors = [ - ops.convert_to_tensor( - x, dtype=dtypes_lib.float32) for x in random_arrays + ops.convert_to_tensor(x, dtype=dtypes_lib.float32) + for x in random_arrays ] tf_val = math_ops.accumulate_n(random_tensors) np_val = random_arrays[0] diff --git a/tensorflow/python/kernel_tests/decode_jpeg_op_test.py b/tensorflow/python/kernel_tests/decode_jpeg_op_test.py index ead55cd03b6..89fd26c544b 100644 --- 
a/tensorflow/python/kernel_tests/decode_jpeg_op_test.py +++ b/tensorflow/python/kernel_tests/decode_jpeg_op_test.py @@ -21,6 +21,7 @@ from __future__ import print_function import os import time +from six.moves import xrange from tensorflow.python.client import session from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops diff --git a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py index cf723f5eec3..a4b30e43195 100644 --- a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py +++ b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py @@ -48,8 +48,10 @@ class DynamicStitchTestBase(object): def testShapeInferenceForScalarWithNonConstantIndices(self): with self.test_session(use_gpu=True): - indices = [array_ops.placeholder(dtype=dtypes.int32), - constant_op.constant(1)] + indices = [ + array_ops.placeholder(dtype=dtypes.int32), + constant_op.constant(1) + ] data = [constant_op.constant(40), constant_op.constant(60)] for step in -1, 1: stitched_t = self.stitch_op(indices[::step], data) @@ -61,7 +63,8 @@ class DynamicStitchTestBase(object): def testSimpleOneDimensional(self): with self.test_session(use_gpu=True): indices = [ - constant_op.constant([0, 4, 7]), constant_op.constant([1, 6, 2, 3, 5]) + constant_op.constant([0, 4, 7]), + constant_op.constant([1, 6, 2, 3, 5]) ] data = [ constant_op.constant([0, 40, 70]), @@ -86,7 +89,8 @@ class DynamicStitchTestBase(object): def testSimpleTwoDimensional(self): with self.test_session(use_gpu=True): indices = [ - constant_op.constant([0, 4, 7]), constant_op.constant([1, 6]), + constant_op.constant([0, 4, 7]), + constant_op.constant([1, 6]), constant_op.constant([2, 3, 5]) ] data = [ @@ -104,7 +108,8 @@ class DynamicStitchTestBase(object): def testHigherRank(self): with self.test_session(use_gpu=True) as sess: indices = [ - constant_op.constant(6), constant_op.constant([4, 1]), + constant_op.constant(6), + constant_op.constant([4, 1]), constant_op.constant([[5, 2], [0, 3]]) ] data = [ @@ -127,7 +132,8 @@ class DynamicStitchTestBase(object): def testErrorIndicesMultiDimensional(self): indices = [ - constant_op.constant([0, 4, 7]), constant_op.constant([[1, 6, 2, 3, 5]]) + constant_op.constant([0, 4, 7]), + constant_op.constant([[1, 6, 2, 3, 5]]) ] data = [ constant_op.constant([[0, 40, 70]]), @@ -138,7 +144,8 @@ class DynamicStitchTestBase(object): def testErrorDataNumDimsMismatch(self): indices = [ - constant_op.constant([0, 4, 7]), constant_op.constant([1, 6, 2, 3, 5]) + constant_op.constant([0, 4, 7]), + constant_op.constant([1, 6, 2, 3, 5]) ] data = [ constant_op.constant([0, 40, 70]), @@ -149,7 +156,8 @@ class DynamicStitchTestBase(object): def testErrorDataDimSizeMismatch(self): indices = [ - constant_op.constant([0, 4, 5]), constant_op.constant([1, 6, 2, 3]) + constant_op.constant([0, 4, 5]), + constant_op.constant([1, 6, 2, 3]) ] data = [ constant_op.constant([[0], [40], [70]]), @@ -160,7 +168,8 @@ class DynamicStitchTestBase(object): def testErrorDataAndIndicesSizeMismatch(self): indices = [ - constant_op.constant([0, 4, 7]), constant_op.constant([1, 6, 2, 3, 5]) + constant_op.constant([0, 4, 7]), + constant_op.constant([1, 6, 2, 3, 5]) ] data = [ constant_op.constant([0, 40, 70]), @@ -235,13 +244,15 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase): def testHigherRankGPU(self): with self.test_session() as sess: indices = [ - constant_op.constant(6), constant_op.constant([4, 1]), + constant_op.constant(6), + 
constant_op.constant([4, 1]), constant_op.constant([[5, 2], [0, 3]]) ] data = [ constant_op.constant([61, 62], dtype=dtypes.float32), constant_op.constant([[41, 42], [11, 12]], dtype=dtypes.float32), - constant_op.constant([[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32) + constant_op.constant( + [[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32) ] stitched_t = data_flow_ops.dynamic_stitch(indices, data) stitched_val = stitched_t.eval() diff --git a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py index 5c7624f1f6b..6ea9f1badc3 100644 --- a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py +++ b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py @@ -84,7 +84,7 @@ class ExtractImagePatches(test.TestCase): patches=patches) def testKsize2x2Stride1x1Rate1x1Valid(self): - """Test for 1x1 kernel .""" + """Test for 2x2 kernel with VALID padding.""" # [1, 2, 2, 1] image = [[[[1], [2]], [[3], [4]]]] # [1, 1, 1, 4] @@ -98,7 +98,7 @@ class ExtractImagePatches(test.TestCase): patches=patches) def testKsize2x2Stride1x1Rate1x1Same(self): - """Test for 1x1 kernel .""" + """Test for 2x2 kernel with SAME padding.""" # [1, 2, 2, 1] image = [[[[1], [2]], [[3], [4]]]] # [1, 2, 2, 4] @@ -111,6 +111,20 @@ class ExtractImagePatches(test.TestCase): padding="SAME", patches=patches) + def testKsize2x2Stride1x1Rate2x2Valid(self): + """Test for 2x2 kernel with 2x2 dilation.""" + # [1, 2, 2, 1] + image = np.arange(16).reshape(1, 4, 4, 1).astype(np.float32) + # [1, 2, 2, 4] + patches = [[[[0, 2, 8, 10], [1, 3, 9, 11]], + [[4, 6, 12, 14], [5, 7, 13, 15]]]] + self._VerifyValues( + image, + ksizes=[2, 2], + strides=[1, 1], + rates=[2, 2], + padding="VALID", + patches=patches) if __name__ == "__main__": test.main() diff --git a/tensorflow/python/kernel_tests/io_ops_test.py b/tensorflow/python/kernel_tests/io_ops_test.py index f91875c6f0c..61944f7e319 100644 --- a/tensorflow/python/kernel_tests/io_ops_test.py +++ b/tensorflow/python/kernel_tests/io_ops_test.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# -*- coding: utf-8 -*- # Copyright 2015 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensorflow/python/kernel_tests/losses_test.py b/tensorflow/python/kernel_tests/losses_test.py index 81af3a0887d..f1fbe1a745b 100644 --- a/tensorflow/python/kernel_tests/losses_test.py +++ b/tensorflow/python/kernel_tests/losses_test.py @@ -953,14 +953,14 @@ class MeanPairwiseSquaredErrorTest(test.TestCase): # Compute the expected loss 'manually'. 
total = np.zeros((batch_size,)) for b in range(batch_size): - for i in range(dims): - for j in range(dims): + for i in range(dims-1): + for j in range(i+1, dims): x = self._predictions[b, i].item() - self._predictions[b, j].item() y = self._labels[b, i].item() - self._labels[b, j].item() diff = (x - y) total[b] += (diff * diff) - self._expected_losses = np.divide(total, 9.0) + self._expected_losses = np.divide(total, 3.0) def testValueErrorThrownWhenWeightIsNone(self): with self.test_session(): @@ -1060,7 +1060,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase): [[8, 1, 3], [7, 8, 9], [10, 11, 12]], ]) self._test_valid_weights( - labels, predictions, expected_loss=122.22222) + labels, predictions, expected_loss=137.5) def test3dWeightedScalar(self): labels = np.array([ @@ -1073,7 +1073,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase): ]) weight = 3.0 self._test_valid_weights( - labels, predictions, expected_loss=weight * 122.22222, + labels, predictions, expected_loss=weight * 137.5, weights=weight) def _test_invalid_weights( @@ -1124,7 +1124,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase): ]) self._test_valid_weights( # TODO(ptucker): This doesn't look right. - labels, predictions, expected_loss=9 * 122.22222, + labels, predictions, expected_loss=9 * 137.5, weights=np.ones((2, 3, 3))) def testLossWithAllZeroBatchSpecificWeights(self): @@ -1345,6 +1345,34 @@ class ComputeWeightedLossTest(test.TestCase): self.assertAllClose( np.mean(self._raw_losses), unweighted_loss.eval()) + def testUnweightedFromPlaceholder(self): + for reduction in losses.Reduction.all(): + with ops.Graph().as_default() as g: + self.assertEqual(0, len(util.get_losses())) + raw_losses = array_ops.placeholder(dtype=dtypes.float32) + feed_dict = {raw_losses: self._raw_losses} + unweighted_losses = ( + losses.compute_weighted_loss(raw_losses, reduction=reduction), + losses.compute_weighted_loss( + raw_losses, weights=np.ones((1, 1, 1)), reduction=reduction), + losses.compute_weighted_loss( + raw_losses, weights=np.ones((1, 1, 4)), reduction=reduction), + ) + self.assertEqual(3, len(util.get_losses())) + with self.test_session(g): + for unweighted_loss in unweighted_losses: + if reduction == losses.Reduction.NONE: + self.assertAllClose( + self._raw_losses, unweighted_loss.eval(feed_dict)) + elif reduction == losses.Reduction.SUM: + self.assertAllClose( + np.sum(self._raw_losses), unweighted_loss.eval(feed_dict)) + else: + # reduction one of MEAN, SUM_OVER_NONZERO_WEIGHTS, + # SUM_BY_NONZERO_WEIGHTS or SUM_OVER_BATCH_SIZE. + self.assertAllClose( + np.mean(self._raw_losses), unweighted_loss.eval(feed_dict)) + def testScalarWeight(self): with ops.Graph().as_default(): self.assertEqual(0, len(util.get_losses())) diff --git a/tensorflow/python/kernel_tests/manip_ops_test.py b/tensorflow/python/kernel_tests/manip_ops_test.py new file mode 100644 index 00000000000..3044b21aa42 --- /dev/null +++ b/tensorflow/python/kernel_tests/manip_ops_test.py @@ -0,0 +1,137 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for manip_ops.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import errors_impl +from tensorflow.python.framework import test_util +from tensorflow.python.ops import manip_ops +from tensorflow.python.ops import gradient_checker +from tensorflow.python.platform import test as test_lib + +import numpy as np + +# pylint: disable=g-import-not-at-top +try: + from distutils.version import StrictVersion as Version + # numpy.roll for multiple shifts was introduced in numpy version 1.12.0 + NP_ROLL_CAN_MULTISHIFT = Version(np.version.version) >= Version('1.12.0') +except ImportError: + NP_ROLL_CAN_MULTISHIFT = False +# pylint: enable=g-import-not-at-top + +class RollTest(test_util.TensorFlowTestCase): + def _testRoll(self, np_input, shift, axis): + expected_roll = np.roll(np_input, shift, axis) + with self.test_session(): + roll = manip_ops.roll(np_input, shift, axis) + self.assertAllEqual(roll.eval(), expected_roll) + + def _testGradient(self, np_input, shift, axis): + with self.test_session(): + inx = constant_op.constant(np_input.tolist()) + xs = list(np_input.shape) + y = manip_ops.roll(inx, shift, axis) + # Expected y's shape to be the same + ys = xs + jacob_t, jacob_n = gradient_checker.compute_gradient( + inx, xs, y, ys, x_init_value=np_input) + self.assertAllClose(jacob_t, jacob_n, rtol=1e-5, atol=1e-5) + + def _testAll(self, np_input, shift, axis): + self._testRoll(np_input, shift, axis) + if np_input.dtype == np.float32: + self._testGradient(np_input, shift, axis) + + def testIntTypes(self): + for t in [np.int32, np.int64]: + self._testAll(np.random.randint(-100, 100, (5)).astype(t), 3, 0) + if NP_ROLL_CAN_MULTISHIFT: + self._testAll(np.random.randint(-100, 100, (4, 4, 3)).astype(t), + [1, -2, 3], [0, 1, 2]) + self._testAll(np.random.randint(-100, 100, (4, 2, 1, 3)).astype(t), + [0, 1, -2], [1, 2, 3]) + + def testFloatTypes(self): + for t in [np.float32, np.float64]: + self._testAll(np.random.rand(5).astype(t), 2, 0) + if NP_ROLL_CAN_MULTISHIFT: + self._testAll(np.random.rand(3, 4).astype(t), [1, 2], [1, 0]) + self._testAll(np.random.rand(1, 3, 4).astype(t), [1, 0, -3], [0, 1, 2]) + + def testComplexTypes(self): + for t in [np.complex64, np.complex128]: + x = np.random.rand(4, 4).astype(t) + self._testAll(x + 1j * x, 2, 0) + if NP_ROLL_CAN_MULTISHIFT: + x = np.random.rand(2, 5).astype(t) + self._testAll(x + 1j * x, [1, 2], [1, 0]) + x = np.random.rand(3, 2, 1, 1).astype(t) + self._testAll(x + 1j * x, [2, 1, 1, 0], [0, 3, 1, 2]) + + + def testRollInputMustVectorHigherRaises(self): + tensor = 7 + shift = 1 + axis = 0 + with self.test_session(): + with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, + "input must be 1-D or higher"): + manip_ops.roll(tensor, shift, axis).eval() + + def testRollAxisMustBeScalarOrVectorRaises(self): + tensor = [[1, 2], + [3, 4]] + shift = 1 + axis = [[0, 1]] + with self.test_session(): + with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, + "axis must be a scalar or a 1-D vector"): + manip_ops.roll(tensor, shift, axis).eval() + + def testRollShiftMustBeScalarOrVectorRaises(self): + tensor = [[1, 2], + [3, 4]] + shift = [[0, 1]] + axis = 1 + with self.test_session(): + with 
self.assertRaisesRegexp(errors_impl.InvalidArgumentError, + "shift must be a scalar or a 1-D vector"): + manip_ops.roll(tensor, shift, axis).eval() + + def testRollShiftAndAxisMustBeSameSizeRaises(self): + tensor = [[1, 2], + [3, 4]] + shift = [1] + axis = [0, 1] + with self.test_session(): + with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, + "shift and axis must have the same size"): + manip_ops.roll(tensor, shift, axis).eval() + + def testRollAxisOutOfRangeRaises(self): + tensor = [1, 2] + shift = 1 + axis = 1 + with self.test_session(): + with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, + "is out of range"): + manip_ops.roll(tensor, shift, axis).eval() + +if __name__ == "__main__": + test_lib.main() diff --git a/tensorflow/python/kernel_tests/matrix_band_part_op_test.py b/tensorflow/python/kernel_tests/matrix_band_part_op_test.py index 317b8dc05be..68d626de2c5 100644 --- a/tensorflow/python/kernel_tests/matrix_band_part_op_test.py +++ b/tensorflow/python/kernel_tests/matrix_band_part_op_test.py @@ -21,6 +21,7 @@ import numpy as np from tensorflow.python.client import session from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes as dtypes_lib from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops @@ -54,9 +55,13 @@ def _GetMatrixBandPartTest(dtype_, batch_shape_, shape_): band_np = np.tril(band_np, upper) if batch_shape_ is not (): band_np = np.tile(band_np, batch_shape_ + (1, 1)) - with self.test_session(use_gpu=False): - band = array_ops.matrix_band_part(batch_mat, lower, upper) - self.assertAllEqual(band_np, band.eval()) + for index_dtype in [dtypes_lib.int32, dtypes_lib.int64]: + with self.test_session(use_gpu=False): + band = array_ops.matrix_band_part( + batch_mat, + constant_op.constant(lower, index_dtype), + constant_op.constant(upper, index_dtype)) + self.assertAllEqual(band_np, band.eval()) return Test diff --git a/tensorflow/python/kernel_tests/partitioned_variables_test.py b/tensorflow/python/kernel_tests/partitioned_variables_test.py index 56a07cb012f..f5c6255c346 100644 --- a/tensorflow/python/kernel_tests/partitioned_variables_test.py +++ b/tensorflow/python/kernel_tests/partitioned_variables_test.py @@ -50,8 +50,7 @@ class PartitionerCreatorsTest(test.TestCase): with self.test_session(): partitioner = partitioned_variables.fixed_size_partitioner(4, axis=0) with variable_scope.variable_scope("root", partitioner=partitioner): - v0 = variable_scope.get_variable( - "v0", dtype=dtypes.int64, shape=[20]) + v0 = variable_scope.get_variable("v0", dtype=dtypes.int64, shape=[20]) v0_list = v0._get_variable_list() self.assertEqual(len(v0_list), 4) @@ -169,8 +168,10 @@ class PartitionerCreatorsTest(test.TestCase): max_shards=2) # Use the partitioner with strings - partitioner_axis3_str = partitioned_variables.variable_axis_size_partitioner( - axis=3, max_shard_bytes=32768, bytes_per_string_element=8) + partitioner_axis3_str = partitioned_variables.variable_axis_size_partitioner( # pylint: disable=line-too-long + axis=3, + max_shard_bytes=32768, + bytes_per_string_element=8) with variable_scope.variable_scope( "root", partitioner=partitioner_axis3_str): @@ -423,8 +424,7 @@ class PartitionedVariablesTestCase(test.TestCase): def testRandomInitUnevenPartitions(self): with self.test_session(): rnd = variables.Variable( - random_ops.random_uniform( - [20, 43], dtype=dtypes.float64)) + random_ops.random_uniform([20, 43], 
dtype=dtypes.float64)) var_lists = [ partitioned_variables.create_partitioned_variables( rnd.get_shape(), [1, i], rnd.initialized_value()) diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py index 3263ed1a601..4466beeec96 100644 --- a/tensorflow/python/kernel_tests/pooling_ops_test.py +++ b/tensorflow/python/kernel_tests/pooling_ops_test.py @@ -1811,16 +1811,17 @@ class PoolingTest(test.TestCase): if test.is_gpu_available(): pool_funcs.append(nn_ops.max_pool_with_argmax) for pool_func in pool_funcs: - # Illegal strides. - with self.assertRaisesRegexp( - errors_impl.UnimplementedError, - "Pooling is not yet supported on the batch"): - sess.run( - pool_func( - array_ops.placeholder(dtypes.float32), - ksize=[1, 1, 1, 1], - strides=[2, 1, 1, 1], - padding="SAME")) + if pool_func != nn_ops.max_pool: + # Illegal strides. + with self.assertRaisesRegexp( + errors_impl.UnimplementedError, + "Pooling is not yet supported on the batch"): + sess.run( + pool_func( + array_ops.placeholder(dtypes.float32), + ksize=[1, 1, 1, 1], + strides=[2, 1, 1, 1], + padding="SAME")) # Filter too large. with self.assertRaisesRegexp(ValueError, "Negative dimension size"): diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py index 92fb68820e0..c7181497d89 100644 --- a/tensorflow/python/kernel_tests/py_func_test.py +++ b/tensorflow/python/kernel_tests/py_func_test.py @@ -396,66 +396,66 @@ class PyFuncTest(test.TestCase): @test_util.run_in_graph_and_eager_modes() def testEagerSingleOutputFloat32(self): - a = array_ops.ones((3, 3), dtype=dtypes.float32) - x = array_ops.ones((3, 1), dtype=dtypes.float32) - output = script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.float32) - with self.test_session(): + with test_util.device(use_gpu=True): + a = array_ops.ones((3, 3), dtype=dtypes.float32) + x = array_ops.ones((3, 1), dtype=dtypes.float32) + output = script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.float32) ret = self.evaluate(output) self.assertAllClose(ret, [[3.0], [3.0], [3.0]]) @test_util.run_in_graph_and_eager_modes() def testEagerArrayOutput(self): - a = array_ops.ones((3, 3), dtype=dtypes.int32) - x = array_ops.ones((3, 1), dtype=dtypes.int32) - output = script_ops.eager_py_func( - lambda a, x: [matmul(a, x)], inp=[a, x], Tout=[dtypes.int32]) - - with self.test_session(): + with test_util.device(use_gpu=True): + a = array_ops.ones((3, 3), dtype=dtypes.float32) + x = array_ops.ones((3, 1), dtype=dtypes.float32) + output = script_ops.eager_py_func( + lambda a, x: [matmul(a, x)], inp=[a, x], Tout=[dtypes.float32]) ret = self.evaluate(output) - self.assertAllEqual(ret, [[[3], [3], [3]]]) + self.assertAllEqual(ret, [[[3.0], [3.0], [3.0]]]) @test_util.run_in_graph_and_eager_modes() def testEagerReturnNone(self): + with test_util.device(use_gpu=True): + def no_return_value(): + return - def no_return_value(): - return - - output = script_ops.eager_py_func(no_return_value, inp=[], Tout=[]) - ret = self.evaluate(output) - if context.in_eager_mode(): - self.assertEquals(len(ret), 0) - else: - self.assertIsNone(ret) + output = script_ops.eager_py_func(no_return_value, inp=[], Tout=[]) + ret = self.evaluate(output) + if context.in_eager_mode(): + self.assertEquals(len(ret), 0) + else: + self.assertIsNone(ret) @test_util.run_in_graph_and_eager_modes() def testEagerPyFuncInDefun(self): + with test_util.device(use_gpu=True): + def wrapper(): + a = array_ops.ones((3, 3), dtype=dtypes.float32) + x = 
array_ops.ones((3, 1), dtype=dtypes.float32) + return script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.float32) - def wrapper(): - a = array_ops.ones((3, 3), dtype=dtypes.int32) - x = array_ops.ones((3, 1), dtype=dtypes.int32) - return script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.int32) - - wrapped = function.defun(wrapper) - ret = self.evaluate(wrapped()) - self.assertAllEqual(ret, [[3], [3], [3]]) + wrapped = function.defun(wrapper) + ret = self.evaluate(wrapped()) + self.assertAllEqual(ret, [[3.0], [3.0], [3.0]]) @test_util.run_in_graph_and_eager_modes() def testEagerExceptionHandling(self): - self._testExceptionHandling( - ValueError, errors.InvalidArgumentError, eager=True) - self._testExceptionHandling( - TypeError, errors.InvalidArgumentError, eager=True) - self._testExceptionHandling( - StopIteration, errors.OutOfRangeError, eager=True) - self._testExceptionHandling( - MemoryError, errors.ResourceExhaustedError, eager=True) - self._testExceptionHandling( - NotImplementedError, errors.UnimplementedError, eager=True) + with test_util.device(use_gpu=True): + self._testExceptionHandling( + ValueError, errors.InvalidArgumentError, eager=True) + self._testExceptionHandling( + TypeError, errors.InvalidArgumentError, eager=True) + self._testExceptionHandling( + StopIteration, errors.OutOfRangeError, eager=True) + self._testExceptionHandling( + MemoryError, errors.ResourceExhaustedError, eager=True) + self._testExceptionHandling( + NotImplementedError, errors.UnimplementedError, eager=True) - class WeirdError(Exception): - pass + class WeirdError(Exception): + pass - self._testExceptionHandling(WeirdError, errors.UnknownError, eager=True) + self._testExceptionHandling(WeirdError, errors.UnknownError, eager=True) if __name__ == "__main__": diff --git a/tensorflow/python/kernel_tests/random/random_ops_test.py b/tensorflow/python/kernel_tests/random/random_ops_test.py index 5a2903a4234..df37dd98ece 100644 --- a/tensorflow/python/kernel_tests/random/random_ops_test.py +++ b/tensorflow/python/kernel_tests/random/random_ops_test.py @@ -203,7 +203,8 @@ class RandomUniformTest(test.TestCase): return func def testRange(self): - for dt in dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64: + for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, + dtypes.int64): sampler = self._Sampler(1000, minv=-2, maxv=8, dtype=dt, use_gpu=True) x = sampler() self.assertTrue(-2 <= np.min(x)) @@ -213,7 +214,8 @@ class RandomUniformTest(test.TestCase): # to see the same sequence of values. Will catch buggy # implementations which uses the same random number seed. 
def testDistinct(self): - for dt in dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64: + for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, + dtypes.int64): maxv = 1.0 if dt.is_floating else 1 << 30 sampler = self._Sampler(1000, minv=0, maxv=maxv, dtype=dt, use_gpu=True) x = sampler() @@ -251,7 +253,8 @@ class RandomUniformTest(test.TestCase): # Checks that the CPU and GPU implementation returns the same results, # given the same random seed def testCPUGPUMatch(self): - for dt in dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64: + for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, + dtypes.int64): maxv = 1.0 if dt.is_floating else 17 results = {} for use_gpu in False, True: @@ -261,7 +264,8 @@ class RandomUniformTest(test.TestCase): self.assertAllEqual(results[False], results[True]) def testSeed(self): - for dt in dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64: + for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, + dtypes.int64): for seed in [345, 2**100, -2**100]: sx = self._Sampler(1000, 0, 17, dtype=dt, use_gpu=True, seed=seed) sy = self._Sampler(1000, 0, 17, dtype=dt, use_gpu=True, seed=seed) @@ -285,8 +289,7 @@ class RandomShapeTest(test.TestCase): self.assertEqual([1, 2, 3], rnd1.get_shape()) # Partially known shape. rnd2 = random_ops.truncated_normal( - array_ops.placeholder( - dtypes.int32, shape=(3,))) + array_ops.placeholder(dtypes.int32, shape=(3,))) self.assertEqual([None, None, None], rnd2.get_shape().as_list()) # Unknown shape. rnd3 = random_ops.truncated_normal(array_ops.placeholder(dtypes.int32)) @@ -298,8 +301,7 @@ class RandomShapeTest(test.TestCase): self.assertEqual([1, 2, 3], rnd1.get_shape()) # Partially known shape. rnd2 = random_ops.random_normal( - array_ops.placeholder( - dtypes.int32, shape=(3,))) + array_ops.placeholder(dtypes.int32, shape=(3,))) self.assertEqual([None, None, None], rnd2.get_shape().as_list()) # Unknown shape. rnd3 = random_ops.random_normal(array_ops.placeholder(dtypes.int32)) @@ -311,8 +313,7 @@ class RandomShapeTest(test.TestCase): self.assertEqual([1, 2, 3], rnd1.get_shape()) # Partially known shape. rnd2 = random_ops.random_uniform( - array_ops.placeholder( - dtypes.int32, shape=(3,))) + array_ops.placeholder(dtypes.int32, shape=(3,))) self.assertEqual([None, None, None], rnd2.get_shape().as_list()) # Unknown shape. 
rnd3 = random_ops.random_uniform(array_ops.placeholder(dtypes.int32)) diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py index b4b555591d0..cd945796881 100644 --- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py +++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py @@ -36,6 +36,7 @@ from tensorflow.python.ops import state_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.platform import test +from tensorflow.python.util import compat @test_util.with_c_api @@ -170,6 +171,17 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase): read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) self.assertEqual(self.evaluate(read), [[3]]) + def testScatterUpdateString(self): + handle = resource_variable_ops.var_handle_op( + dtype=dtypes.string, shape=[1, 1]) + self.evaluate(resource_variable_ops.assign_variable_op( + handle, constant_op.constant([["a"]], dtype=dtypes.string))) + self.evaluate(resource_variable_ops.resource_scatter_update( + handle, [0], constant_op.constant([["b"]], dtype=dtypes.string))) + read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.string) + self.assertEqual(compat.as_bytes(self.evaluate(read)[0][0]), + compat.as_bytes("b")) + # TODO(alive): get this to work in Eager mode. def testGPU(self): with self.test_session(use_gpu=True): diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py index 0c77d1db921..a86b65affec 100644 --- a/tensorflow/python/kernel_tests/rnn_test.py +++ b/tensorflow/python/kernel_tests/rnn_test.py @@ -23,6 +23,7 @@ import timeit import numpy as np +from six.moves import xrange from tensorflow.contrib import rnn as contrib_rnn from tensorflow.core.protobuf import config_pb2 from tensorflow.python.client import session diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py index be72c194072..bb3f6970e4f 100644 --- a/tensorflow/python/kernel_tests/softmax_op_test.py +++ b/tensorflow/python/kernel_tests/softmax_op_test.py @@ -25,11 +25,13 @@ import numpy as np from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import nn_ops from tensorflow.python.platform import test +@test_util.with_c_api class SoftmaxTest(test.TestCase): def _npSoftmax(self, features, dim=-1, log=False): @@ -174,8 +176,11 @@ class SoftmaxTest(test.TestCase): def testDimTooLarge(self): with self.test_session(): + # Use placeholder to make sure we get runtime error instead of shape + # inference error. + dim = array_ops.placeholder_with_default(100, shape=[]) with self.assertRaises(errors_impl.InvalidArgumentError): - nn_ops.softmax([1., 2., 3., 4.], dim=100).eval() + nn_ops.softmax([1., 2., 3., 4.], dim=dim).eval() def testLargeDims(self): # Make sure that we properly handle large inputs. 
See diff --git a/tensorflow/python/kernel_tests/tensordot_op_test.py b/tensorflow/python/kernel_tests/tensordot_op_test.py index 38205518b52..8ad29afd0a0 100644 --- a/tensorflow/python/kernel_tests/tensordot_op_test.py +++ b/tensorflow/python/kernel_tests/tensordot_op_test.py @@ -56,9 +56,11 @@ class TensordotTest(test_lib.TestCase): axes_ph = array_ops.placeholder(dtypes.int32) output = math_ops.tensordot(a_ph, b_ph, axes_ph) _ = sess.run( - [output], feed_dict={a_ph: a, - b_ph: b, - axes_ph: (a_axes, b_axes)}) + [output], feed_dict={ + a_ph: a, + b_ph: b, + axes_ph: (a_axes, b_axes) + }) def test_invalid_axes(self): a = [[1, 2], [3, 4]] @@ -81,28 +83,29 @@ class TensordotTest(test_lib.TestCase): with self.test_session() as sess: with self.assertRaises(errors_impl.InvalidArgumentError): _ = sess.run( - [output], feed_dict={a_ph: a, - b_ph: b, - axes_ph: axes_value}) + [output], feed_dict={ + a_ph: a, + b_ph: b, + axes_ph: axes_value + }) # Test case for 11950 def test_valid_axis(self): for axes_value in [1, 2], [[1], [2]], [[], []], 0: with self.test_session() as sess: - np_a = np.ones((3,3)) + np_a = np.ones((3, 3)) np_b = np.array([2, 3, 1])[None, None] np_ans = np.tensordot(np_a, np_b, axes_value) - tf_a = array_ops.ones((3,3), dtype=dtypes.float32) + tf_a = array_ops.ones((3, 3), dtype=dtypes.float32) tf_b = constant_op.constant([2, 3, 1], dtype=dtypes.float32)[None, None] tf_ans = math_ops.tensordot(tf_a, tf_b, axes_value).eval() self.assertAllEqual(tf_ans.shape, np_ans.shape) self.assertAllEqual(tf_ans, np_ans) - def test_partial_shape_inference(self): - for axes in ([1],[0]), 1: + for axes in ([1], [0]), 1: a = array_ops.placeholder(dtypes.float32) b = array_ops.placeholder(dtypes.float32) output = math_ops.tensordot(a, b, axes) @@ -169,9 +172,11 @@ def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_): axes = array_ops.placeholder(dtypes.int32) c = math_ops.tensordot(a, b, axes) tf_ans = sess.run( - c, feed_dict={a: a_np, - b: b_np, - axes: (a_dims_np, b_dims_np)}) + c, feed_dict={ + a: a_np, + b: b_np, + axes: (a_dims_np, b_dims_np) + }) else: tf_ans = math_ops.tensordot(a_np, b_np, (a_dims_np, b_dims_np)).eval() self.assertAllClose(tf_ans, np_ans, rtol=tol, atol=tol) diff --git a/tensorflow/python/kernel_tests/topk_op_test.py b/tensorflow/python/kernel_tests/topk_op_test.py index efb5b9f3641..6ab931fdb97 100644 --- a/tensorflow/python/kernel_tests/topk_op_test.py +++ b/tensorflow/python/kernel_tests/topk_op_test.py @@ -58,7 +58,7 @@ class TopKTest(test.TestCase): # Do some special casing of equality of indices: if indices # are not the same, but values are floating type, ensure that # the values are within epsilon of each other. 
- if not np.issubdtype(np_expected_values.dtype, np.float): + if not np.issubdtype(np_expected_values.dtype, np.floating): # Values are not floating point type; check indices exactly self.assertAllEqual(np_expected_indices, indices) else: diff --git a/tensorflow/python/kernel_tests/variables_test.py b/tensorflow/python/kernel_tests/variables_test.py index f60ebf58f6f..b16c8c002c9 100644 --- a/tensorflow/python/kernel_tests/variables_test.py +++ b/tensorflow/python/kernel_tests/variables_test.py @@ -22,6 +22,7 @@ import operator import numpy as np +from tensorflow.python.eager import function from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl @@ -509,6 +510,15 @@ class VariablesTestCase(test.TestCase): "", repr(var)) + def testVariableNamesPreserveNameScopesWithDefun(self): + @function.defun + def create_variable(): + with ops.name_scope("foo"): + v = variables.Variable(0.0, name="bar") + self.assertEqual(v.name, "foo/bar:0") + with ops.get_default_graph().as_default(): + create_variable() + class IsInitializedTest(test.TestCase): diff --git a/tensorflow/python/kernel_tests/xent_op_test.py b/tensorflow/python/kernel_tests/xent_op_test.py index c6c7c4e26cb..e152f02d8e9 100644 --- a/tensorflow/python/kernel_tests/xent_op_test.py +++ b/tensorflow/python/kernel_tests/xent_op_test.py @@ -38,9 +38,8 @@ class XentTest(test.TestCase): dim = len(features.shape) - 1 one_only_on_dim = list(features.shape) one_only_on_dim[dim] = 1 - e = np.exp(features - np.reshape( - np.amax( - features, axis=dim), one_only_on_dim)) + e = np.exp( + features - np.reshape(np.amax(features, axis=dim), one_only_on_dim)) probs = e / np.reshape(np.sum(e, axis=dim), one_only_on_dim) bp = (probs - labels) l = -np.sum(labels * np.log(probs + 1.0e-20), axis=dim) @@ -85,10 +84,10 @@ class XentTest(test.TestCase): def testRankTooLarge(self): for dtype in np.float16, np.float32: - np_features = np.array( - [[[1., 1., 1., 1.]], [[1., 2., 3., 4.]]]).astype(dtype) - np_labels = np.array( - [[[0., 0., 0., 1.]], [[0., .5, .5, 0.]]]).astype(dtype) + np_features = np.array([[[1., 1., 1., 1.]], [[1., 2., 3., + 4.]]]).astype(dtype) + np_labels = np.array([[[0., 0., 0., 1.]], [[0., .5, .5, + 0.]]]).astype(dtype) self.assertRaisesRegexp(ValueError, "must be rank 2", gen_nn_ops._softmax_cross_entropy_with_logits, np_features, np_labels) @@ -121,8 +120,8 @@ class XentTest(test.TestCase): # = [1.3862, 1.9401] np_loss, np_backprop = self._npXent(np.array(features), np.array(labels)) self.assertAllClose( - np.array([[0.25, 0.25, 0.25, -0.75], - [0.0321, -0.4129, -0.2632, 0.6439]]), + np.array([[0.25, 0.25, 0.25, -0.75], [0.0321, -0.4129, -0.2632, + 0.6439]]), np_backprop, rtol=1.e-3, atol=1.e-3) @@ -168,15 +167,17 @@ class XentTest(test.TestCase): shape=[3, 4], dtype=dtypes.float64, name="f") - x = nn_ops.softmax_cross_entropy_with_logits(labels=l, logits=f, - name="xent") + x = nn_ops.softmax_cross_entropy_with_logits( + labels=l, logits=f, name="xent") err = gradient_checker.compute_gradient_error(f, [3, 4], x, [3]) # Check that no extra computation performed. When only first derivative is requested, # second derivative must not be computed. So when there is no second derivative, # there is no `BatchMatMul` op in the graph. 
- op_names = [op.op_def.name for op in sess.graph.get_operations() if op.op_def] - self.assertNotIn('BatchMatMul', op_names) + op_names = [ + op.op_def.name for op in sess.graph.get_operations() if op.op_def + ] + self.assertNotIn("BatchMatMul", op_names) print("cross entropy gradient err = ", err) self.assertLess(err, 5e-8) @@ -193,24 +194,29 @@ class XentTest(test.TestCase): shape=[3, 4], dtype=dtypes.float64, name="f") - x = nn_ops.softmax_cross_entropy_with_logits_v2(labels=l, logits=f, - name="xent") + x = nn_ops.softmax_cross_entropy_with_logits_v2( + labels=l, logits=f, name="xent") err = gradient_checker.compute_gradient_error(l, [3, 4], x, [3]) self.assertLess(err, 5e-8) def testSecondGradient(self): with self.test_session() as sess: - l = constant_op.constant([0.0, 0.0, 1.0/3, 0.0, - 1.0/3, 0.0, 0.0, 0.0, - 0.0, 0.5/3, 0.0, 0.5/3], shape=[12], - dtype=dtypes.float64, name="l") - f = constant_op.constant([0.1, 0.2, 0.3, 0.4, - 0.1, 0.4, 0.9, 1.6, - 0.1, 0.8, 2.7, 6.4], shape=[12], - dtype=dtypes.float64, name="f") - x = nn_ops.softmax_cross_entropy_with_logits(labels=l, logits=f, - name="xent") + l = constant_op.constant( + [ + 0.0, 0.0, 1.0 / 3, 0.0, 1.0 / 3, 0.0, 0.0, 0.0, 0.0, 0.5 / 3, 0.0, + 0.5 / 3 + ], + shape=[12], + dtype=dtypes.float64, + name="l") + f = constant_op.constant( + [0.1, 0.2, 0.3, 0.4, 0.1, 0.4, 0.9, 1.6, 0.1, 0.8, 2.7, 6.4], + shape=[12], + dtype=dtypes.float64, + name="f") + x = nn_ops.softmax_cross_entropy_with_logits( + labels=l, logits=f, name="xent") loss = math_ops.reduce_sum(x) gradients = gradients_impl.gradients(loss, [f])[0] @@ -219,20 +225,23 @@ class XentTest(test.TestCase): # Check that second derivative is calculated. # (it is equivalent to being `BatchMatMul` op in the graph because of implementation of xentropy grad) - op_names = [op.op_def.name for op in sess.graph.get_operations() if op.op_def] - self.assertIn('BatchMatMul', op_names) + op_names = [ + op.op_def.name for op in sess.graph.get_operations() if op.op_def + ] + self.assertIn("BatchMatMul", op_names) print("cross entropy hessian err = ", err) self.assertLess(err, 5e-8) def testWrapper(self): - features = np.array( - [[[1., 1., 1., 1.], [1., 2., 3., 4.]], - [[2., 3., 4., 5.], [6., 7., 8., 9.]], - [[5., 4., 3., 2.], [1., 2., 3., 4.]]]).astype(np.float32) + features = np.array([[[1., 1., 1., 1.], [1., 2., 3., 4.]], + [[2., 3., 4., 5.], [6., 7., 8., 9.]], + [[5., 4., 3., 2.], [1., 2., 3., 4.]]]).astype( + np.float32) labels = np.array([[[0., 0., 0., 1.], [0., 1., 0., 0.]], [[0., 0.5, 0.5, 0.], [0.5, 0.5, 0., 0.]], - [[0., 1., 0., 0.], [0., 0., 1., 0.]]]).astype(np.float32) + [[0., 1., 0., 0.], [0., 0., 1., 0.]]]).astype( + np.float32) self._testXentWrapper(features, labels, dim=0, use_gpu=False) self._testXentWrapper(features, labels, dim=0, use_gpu=True) self._testXentWrapper(features, labels, dim=1, use_gpu=False) diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py index 5d9feb07b44..5dea732cbaa 100644 --- a/tensorflow/python/layers/base.py +++ b/tensorflow/python/layers/base.py @@ -31,6 +31,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.layers import utils as layers_util +from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import variable_scope as vs from tensorflow.python.ops import variables as tf_variables @@ -139,9 +140,6 @@ class Layer(object): 
self._init_set_name(name) - # Holds functions for creating regularizer ops. - self._regularizer_factories = [] - # Determine variable scope. scope = kwargs.get('_scope') if scope: @@ -306,22 +304,6 @@ class Layer(object): inputs_hash = None return self._per_input_updates.get(inputs_hash, []) - def _get_regularizer_factories(self): - try: - # Some subclasses of Layer do not use its constructor. - return self._regularizer_factories - except AttributeError: - self._regularizer_factories = [] - return self._regularizer_factories - - def _maybe_create_variable_regularizers(self): - """Creates added but uninstantiated regularizers.""" - factories = self._get_regularizer_factories() - if factories: - for factory in factories: - factory() - factories[:] = [] - @property def losses(self): """Losses which are associated with this `Layer`. @@ -333,7 +315,6 @@ class Layer(object): Returns: A list of tensors. """ - self._maybe_create_variable_regularizers() if context.in_eager_mode(): # _losses may only contain variable regularization losses when executing # eagerly, and they have been saved as lambdas to be executed when @@ -417,7 +398,6 @@ class Layer(object): inputs_hash = layers_util.object_list_uid(inputs) else: inputs_hash = None - self._maybe_create_variable_regularizers() return self._per_input_losses.get(inputs_hash, []) def build(self, _): @@ -670,6 +650,7 @@ class Layer(object): else: scope_context_manager = vs.variable_scope( self._scope, reuse=self._reuse, auxiliary_name_scope=False) + input_shapes = None with scope_context_manager as scope: with ops.name_scope(self._name_scope_name(scope)): if not self.built: @@ -719,6 +700,9 @@ class Layer(object): else: # Deferred mode behavior: use `compute_output_shape` to # infer the number of outputs of the layer and their shapes. + if input_shapes is None: + input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs) + output_shapes = self.compute_output_shape(input_shapes) output_shapes = nest.flatten(output_shapes) outputs = [ @@ -1414,7 +1398,10 @@ class _DeferredTensor(object): def __init__(self, shape, dtype, name=None): self.shape = tensor_shape.TensorShape(shape) - self.dtype = dtypes.as_dtype(dtype) + if dtype is None: + self.dtype = dtypes.as_dtype(np.float32) + else: + self.dtype = dtypes.as_dtype(dtype) self.name = name def get_shape(self): diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py index e5b93a54f79..7bf62d45b8e 100644 --- a/tensorflow/python/layers/core.py +++ b/tensorflow/python/layers/core.py @@ -49,9 +49,6 @@ class Dense(base.Layer): and `bias` is a bias vector created by the layer (only if `use_bias` is `True`). - Note: if the input to the layer has a rank greater than 2, then it is - flattened prior to the initial matrix multiply by `kernel`. - Arguments: units: Integer or Long, dimensionality of the output space. activation: Activation function (callable). Set it to None to maintain a @@ -199,9 +196,6 @@ def dense( and `bias` is a bias vector created by the layer (only if `use_bias` is `True`). - Note: if the `inputs` tensor has a rank greater than 2, then it is - flattened prior to the initial matrix multiply by `kernel`. - Arguments: inputs: Tensor input. units: Integer or Long, dimensionality of the output space. @@ -230,7 +224,8 @@ def dense( by the same name. Returns: - Output tensor. + Output tensor the same shape as `inputs` except the last dimension is of + size `units`. Raises: ValueError: if eager execution is enabled. 
diff --git a/tensorflow/python/layers/network.py b/tensorflow/python/layers/network.py index 0a5dd57621b..745843975c4 100644 --- a/tensorflow/python/layers/network.py +++ b/tensorflow/python/layers/network.py @@ -621,6 +621,11 @@ class GraphNetwork(base.Layer): A list of loss tensors. """ losses = [] + if context.in_eager_mode(): + for layer in self.layers: + losses += layer.losses + return losses + # Retrieve losses for all internal layers. for layer in self.layers: if hasattr(layer, 'losses'): @@ -853,7 +858,6 @@ class GraphNetwork(base.Layer): for node in nodes: # This is always a single layer, never a list. layer = node.outbound_layer - reference_input_tensors = node.input_tensors reference_output_tensors = node.output_tensors @@ -901,12 +905,13 @@ class GraphNetwork(base.Layer): else: output_masks = [None for _ in range(len(output_tensors))] - # Apply activity regularizer if any: - if layer.activity_regularizer is not None: - regularization_losses = [ - layer.activity_regularizer(x) for x in computed_tensors - ] - layer.add_loss(regularization_losses, computed_tensors) + if context.in_graph_mode(): + if layer.activity_regularizer is not None: + regularization_losses = [ + layer.activity_regularizer(x) for x in computed_tensors + ] + # Apply activity regularizer if any: + layer.add_loss(regularization_losses, computed_tensors) if context.in_graph_mode(): # Update model updates and losses: diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc index d3bfa0ee337..e0422ef80ad 100644 --- a/tensorflow/python/lib/core/py_func.cc +++ b/tensorflow/python/lib/core/py_func.cc @@ -19,6 +19,7 @@ limitations under the License. #include "numpy/arrayobject.h" #include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_internal.h" #include "tensorflow/c/tf_status_helper.h" #include "tensorflow/core/framework/allocation_description.pb.h" #include "tensorflow/core/framework/op_kernel.h" @@ -53,6 +54,12 @@ struct PyCall { // with this "token". string token; + // The device on which Tensors are stored; only used for EagerPyFunc. + Device* device; + + // True if and only if the op has been placed on a GPU. + bool gpu; + // True if the call is associated with an EagerPyFunc. bool eager; @@ -71,7 +78,12 @@ Status MakeArgTuple(const PyCall* call, PyObject** tuple) { PyObject* arg = nullptr; const Tensor& t = call->ins[i]; if (call->eager) { - arg = EagerTensorFromHandle(TFE_NewTensorHandle(t)); + if (call->gpu) { + arg = EagerTensorFromHandle(new TFE_TensorHandle(t, call->device)); + } else { + // TFE_TensorHandle assumes that CPU is identified by `nullptr`. + arg = EagerTensorFromHandle(new TFE_TensorHandle(t, nullptr)); + } if (arg == nullptr) { return errors::Internal("Unable to procure EagerTensor from Tensor."); } @@ -84,7 +96,8 @@ Status MakeArgTuple(const PyCall* call, PyObject** tuple) { } PyList_SetItem(lst, i, arg); } - *tuple = Py_BuildValue("(sN)", call->token.c_str(), lst); + *tuple = Py_BuildValue("(sON)", call->token.c_str(), + call->gpu ? Py_True : Py_False, lst); CHECK(*tuple); return Status::OK(); } @@ -150,15 +163,9 @@ bool IsSingleNone(PyObject* obj) { } // Retrieves a Tensor from `eager_tensor` and stores it in `output_tensor`. 
-Status ExtractTensorFromEagerTensor(const PyObject* eager_tensor, - Tensor* output_tensor, - TF_Status* tf_status) { - // TODO(akshayka): Lift the restriction requiring output tensors to - // lie in host memory; EagerPyFunc should be able to dispatch ops on GPU - // tensors, so we should eventually implement a GPU kernel for EagerPyFunc. - *output_tensor = *TFE_TensorHandleUnderlyingTensorInHostMemory( - EagerTensor_Handle(eager_tensor), tf_status); - return StatusFromTF_Status(tf_status); +void ExtractTensorFromEagerTensor(const PyObject* eager_tensor, + Tensor* output_tensor) { + *output_tensor = EagerTensor_Handle(eager_tensor)->t; } // Calls the registered py function through the trampoline. @@ -201,15 +208,23 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) { } // Process the return values and convert them to TF Tensors. - Status s; + Status s = Status::OK(); if (PyList_Check(result)) { + // `result` is a Python list; if this operation is an `EagerPyFunc`, then + // every item in the list must be an `EagerTensor`; otherwise, every element + // must be a NumPy array. call->out.clear(); for (int i = 0; i < PyList_Size(result); ++i) { Tensor t; if (call->eager) { - auto tf_status = tensorflow::make_safe(TF_NewStatus()); - s = ExtractTensorFromEagerTensor(PyList_GetItem(result, i), &t, - tf_status.get()); + const PyObject* item = PyList_GetItem(result, i); + if (EagerTensor_CheckExact(item)) { + ExtractTensorFromEagerTensor(item, &t); + } else { + s = errors::FailedPrecondition( + "Expected EagerTensor, found PyObject of type: ", + Py_TYPE(item)->tp_name); + } } else { s = ConvertNdarrayToTensor(PyList_GetItem(result, i), &t); } @@ -220,16 +235,15 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) { call->out.push_back(t); } } else if (EagerTensor_CheckExact(result) || result == Py_None) { + // result is an `EagerTensor` or `None`. DCHECK(call->eager); Tensor t; if (result != Py_None) { - auto tf_status = tensorflow::make_safe(TF_NewStatus()); - s = ExtractTensorFromEagerTensor(result, &t, tf_status.get()); - if (s.ok()) { - call->out.push_back(t); - } + ExtractTensorFromEagerTensor(result, &t); + call->out.push_back(t); } } else if (PyArray_Check(result)) { + // `result` is a NumPy array. DCHECK(!call->eager); if (!IsSingleNone(result)) { Tensor t; @@ -239,7 +253,7 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) { } } } else { - s = errors::Internal("Unexpected pyobject is returned: ", + s = errors::Internal("Unexpected PyObject was returned: ", Py_TYPE(result)->tp_name); } Py_DECREF(result); @@ -429,12 +443,24 @@ class PyFuncOp : public OpKernel { explicit PyFuncOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("token", &token_)); eager_ = type_string() == "EagerPyFunc"; + gpu_ = ctx->device_type().type_string() == DEVICE_GPU; } void Compute(OpKernelContext* ctx) override { PyCall call; call.token = token_; + call.gpu = gpu_; call.eager = eager_; + if (call.eager) { + // Eager's C API uses `Device`, whereas `OpKernelContext` stores a + // `DeviceBase`; attempt to downcast. + call.device = dynamic_cast<Device*>(ctx->device()); + if (call.device == nullptr) { + ctx->CtxFailureWithWarning( + errors::Internal("Unrecognized device class")); + } + } + for (int i = 0; i < ctx->num_inputs(); ++i) { call.ins.push_back(ctx->input(i)); } @@ -476,6 +502,9 @@ class PyFuncOp : public OpKernel { private: string token_; + // True if and only if this op has been placed on a GPU.
+ bool gpu_; + // True if and only if this op should execute the python function eagerly, // i.e., if and only if the eager attribute is set. bool eager_; @@ -486,5 +515,6 @@ class PyFuncOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("PyFunc").Device(DEVICE_CPU), PyFuncOp); REGISTER_KERNEL_BUILDER(Name("PyFuncStateless").Device(DEVICE_CPU), PyFuncOp); REGISTER_KERNEL_BUILDER(Name("EagerPyFunc").Device(DEVICE_CPU), PyFuncOp); +REGISTER_KERNEL_BUILDER(Name("EagerPyFunc").Device(DEVICE_GPU), PyFuncOp); } // end namespace tensorflow diff --git a/tensorflow/python/lib/io/file_io.py b/tensorflow/python/lib/io/file_io.py index 4e3071d8513..59f5075f177 100644 --- a/tensorflow/python/lib/io/file_io.py +++ b/tensorflow/python/lib/io/file_io.py @@ -31,6 +31,7 @@ from tensorflow.python.framework import c_api_util from tensorflow.python.framework import errors from tensorflow.python.util import compat from tensorflow.python.util import deprecation +from tensorflow.python.util.tf_export import tf_export class FileIO(object): @@ -235,6 +236,7 @@ class FileIO(object): self._writable_file = None +@tf_export("gfile.Exists") def file_exists(filename): """Determines whether a path exists or not. @@ -256,6 +258,7 @@ def file_exists(filename): return True +@tf_export("gfile.Remove") def delete_file(filename): """Deletes the file located at 'filename'. @@ -306,6 +309,7 @@ def write_string_to_file(filename, file_content): f.write(file_content) +@tf_export("gfile.Glob") def get_matching_files(filename): """Returns a list of files that match the given pattern(s). @@ -336,6 +340,7 @@ def get_matching_files(filename): ] +@tf_export("gfile.MkDir") def create_dir(dirname): """Creates a directory with the name 'dirname'. @@ -353,6 +358,7 @@ def create_dir(dirname): pywrap_tensorflow.CreateDir(compat.as_bytes(dirname), status) +@tf_export("gfile.MakeDirs") def recursive_create_dir(dirname): """Creates a directory and all parent/intermediate directories. @@ -368,6 +374,7 @@ def recursive_create_dir(dirname): pywrap_tensorflow.RecursivelyCreateDir(compat.as_bytes(dirname), status) +@tf_export("gfile.Copy") def copy(oldpath, newpath, overwrite=False): """Copies data from oldpath to newpath. @@ -385,6 +392,7 @@ def copy(oldpath, newpath, overwrite=False): compat.as_bytes(oldpath), compat.as_bytes(newpath), overwrite, status) +@tf_export("gfile.Rename") def rename(oldname, newname, overwrite=False): """Rename or move a file / directory. @@ -426,6 +434,7 @@ def atomic_write_string_to_file(filename, contents, overwrite=True): raise +@tf_export("gfile.DeleteRecursively") def delete_recursively(dirname): """Deletes everything under dirname recursively. @@ -439,6 +448,7 @@ def delete_recursively(dirname): pywrap_tensorflow.DeleteRecursively(compat.as_bytes(dirname), status) +@tf_export("gfile.IsDirectory") def is_directory(dirname): """Returns whether the path is a directory or not. @@ -452,6 +462,7 @@ def is_directory(dirname): return pywrap_tensorflow.IsDirectory(compat.as_bytes(dirname), status) +@tf_export("gfile.ListDirectory") def list_directory(dirname): """Returns a list of entries contained within a directory. @@ -479,6 +490,7 @@ def list_directory(dirname): ] +@tf_export("gfile.Walk") def walk(top, in_order=True): """Recursive directory tree generator for directories. @@ -522,6 +534,7 @@ def walk(top, in_order=True): yield here +@tf_export("gfile.Stat") def stat(filename): """Returns file statistics for a given path. 
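The `@tf_export("gfile.*")` decorators added to `file_io.py` above publish these helpers under the public `tf.gfile` namespace. A short usage sketch, assuming the exported names resolve to `tf.gfile.Exists`, `tf.gfile.MakeDirs`, and so on as the decorators indicate (the path is hypothetical, not taken from the patch):

```python
import tensorflow as tf

path = "/tmp/example_dir"  # hypothetical path
if not tf.gfile.Exists(path):        # file_exists
    tf.gfile.MakeDirs(path)          # recursive_create_dir
for name in tf.gfile.ListDirectory(path):   # list_directory
    entry = path + "/" + name
    print(entry, tf.gfile.IsDirectory(entry))  # is_directory
```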
diff --git a/tensorflow/python/lib/io/tf_record.py b/tensorflow/python/lib/io/tf_record.py index df190100689..48ea107a146 100644 --- a/tensorflow/python/lib/io/tf_record.py +++ b/tensorflow/python/lib/io/tf_record.py @@ -22,8 +22,10 @@ from __future__ import print_function from tensorflow.python import pywrap_tensorflow from tensorflow.python.framework import errors from tensorflow.python.util import compat +from tensorflow.python.util.tf_export import tf_export +@tf_export("python_io.TFRecordCompressionType") class TFRecordCompressionType(object): """The type of compression for the record.""" NONE = 0 @@ -33,6 +35,7 @@ class TFRecordCompressionType(object): # NOTE(vrv): This will eventually be converted into a proto. to match # the interface used by the C++ RecordWriter. +@tf_export("python_io.TFRecordOptions") class TFRecordOptions(object): """Options used for manipulating TFRecord files.""" compression_type_map = { @@ -51,6 +54,7 @@ class TFRecordOptions(object): return cls.compression_type_map[options.compression_type] +@tf_export("python_io.tf_record_iterator") def tf_record_iterator(path, options=None): """An iterator that read the records from a TFRecords file. @@ -81,6 +85,7 @@ def tf_record_iterator(path, options=None): reader.Close() +@tf_export("python_io.TFRecordWriter") class TFRecordWriter(object): """A class to write records to a TFRecords file. diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 24a0c186198..9541b097a94 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -34,6 +34,7 @@ See the @{$python/array_ops} guide. @@reshape @@squeeze @@expand_dims +@@unravel_index @@meshgrid @@slice @@strided_slice diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py index 49191c647d5..33a92631d0b 100644 --- a/tensorflow/python/ops/control_flow_ops.py +++ b/tensorflow/python/ops/control_flow_ops.py @@ -261,10 +261,10 @@ def _Enter(data, data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True) if isinstance(data, ops.Tensor): if data.dtype._is_ref_dtype and use_ref: # pylint: disable=protected-access - result = ref_enter( + result = gen_control_flow_ops._ref_enter( data, frame_name, is_constant, parallel_iterations, name=name) else: - result = enter( + result = gen_control_flow_ops._enter( data, frame_name, is_constant, parallel_iterations, name=name) if use_input_shape: result.set_shape(data.get_shape()) @@ -279,7 +279,7 @@ def _Enter(data, parallel_iterations=parallel_iterations, use_input_shape=use_input_shape, name=name) - indices = enter( + indices = gen_control_flow_ops._enter( data.indices, frame_name, is_constant, @@ -290,7 +290,7 @@ def _Enter(data, if isinstance(data, ops.IndexedSlices): dense_shape = data.dense_shape if dense_shape is not None: - dense_shape = enter( + dense_shape = gen_control_flow_ops._enter( dense_shape, frame_name, is_constant, @@ -300,7 +300,7 @@ def _Enter(data, dense_shape.set_shape(data.dense_shape.get_shape()) return ops.IndexedSlices(values, indices, dense_shape) else: - dense_shape = enter( + dense_shape = gen_control_flow_ops._enter( data.dense_shape, frame_name, is_constant, diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py index 314726ede6c..230b6c5946a 100644 --- a/tensorflow/python/ops/gradients_impl.py +++ b/tensorflow/python/ops/gradients_impl.py @@ -44,6 +44,7 @@ from tensorflow.python.ops import image_grad # pylint: disable=unused-import from 
tensorflow.python.ops import linalg_grad # pylint: disable=unused-import from tensorflow.python.ops import linalg_ops # pylint: disable=unused-import from tensorflow.python.ops import logging_ops # pylint: disable=unused-import +from tensorflow.python.ops import manip_grad # pylint: disable=unused-import from tensorflow.python.ops import math_grad # pylint: disable=unused-import from tensorflow.python.ops import math_ops from tensorflow.python.ops import resource_variable_ops diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index cab1025df11..22636fdbb3f 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -1691,7 +1691,8 @@ def rgb_to_yiq(images): images: tensor with the same shape as `images`. """ images = ops.convert_to_tensor(images, name='images') - kernel = ops.convert_to_tensor(_rgb_to_yiq_kernel, dtype=images.dtype, name='kernel') + kernel = ops.convert_to_tensor( + _rgb_to_yiq_kernel, dtype=images.dtype, name='kernel') ndims = images.get_shape().ndims return math_ops.tensordot(images, kernel, axes=[[ndims-1], [0]]) @@ -1717,7 +1718,8 @@ def yiq_to_rgb(images): images: tensor with the same shape as `images`. """ images = ops.convert_to_tensor(images, name='images') - kernel = ops.convert_to_tensor(_yiq_to_rgb_kernel, dtype=images.dtype, name='kernel') + kernel = ops.convert_to_tensor( + _yiq_to_rgb_kernel, dtype=images.dtype, name='kernel') ndims = images.get_shape().ndims return math_ops.tensordot(images, kernel, axes=[[ndims-1], [0]]) @@ -1742,7 +1744,8 @@ def rgb_to_yuv(images): images: tensor with the same shape as `images`. """ images = ops.convert_to_tensor(images, name='images') - kernel = ops.convert_to_tensor(_rgb_to_yuv_kernel, dtype=images.dtype, name='kernel') + kernel = ops.convert_to_tensor( + _rgb_to_yuv_kernel, dtype=images.dtype, name='kernel') ndims = images.get_shape().ndims return math_ops.tensordot(images, kernel, axes=[[ndims-1], [0]]) @@ -1768,7 +1771,8 @@ def yuv_to_rgb(images): images: tensor with the same shape as `images`. """ images = ops.convert_to_tensor(images, name='images') - kernel = ops.convert_to_tensor(_yuv_to_rgb_kernel, dtype=images.dtype, name='kernel') + kernel = ops.convert_to_tensor( + _yuv_to_rgb_kernel, dtype=images.dtype, name='kernel') ndims = images.get_shape().ndims return math_ops.tensordot(images, kernel, axes=[[ndims-1], [0]]) diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py index 98343846342..82b77ee8e37 100644 --- a/tensorflow/python/ops/image_ops_test.py +++ b/tensorflow/python/ops/image_ops_test.py @@ -252,11 +252,11 @@ class AdjustGamma(test_util.TensorFlowTestCase): with self.test_session(): x_data = np.random.uniform(0, 255, (8, 8)) x_np = np.array(x_data, dtype=np.float32) - + x = constant_op.constant(x_np, shape=x_np.shape) - err_msg = 'Gamma should be a non-negative real number.' - + err_msg = "Gamma should be a non-negative real number." + try: image_ops.adjust_gamma(x, gamma=-1) except Exception as e: @@ -270,13 +270,13 @@ class AdjustGamma(test_util.TensorFlowTestCase): with self.test_session(): x_data = np.random.uniform(0, 255, (8, 8)) x_np = np.array(x_data, dtype=np.float32) - + x = constant_op.constant(x_np, shape=x_np.shape) y = constant_op.constant(-1.0, dtype=dtypes.float32) - + image = image_ops.adjust_gamma(x, gamma=y) - - err_msg = 'Gamma should be a non-negative real number.' + + err_msg = "Gamma should be a non-negative real number." 
try: image.eval() except Exception as e: @@ -284,7 +284,7 @@ class AdjustGamma(test_util.TensorFlowTestCase): raise else: raise AssertionError("Exception not raised: %s" % err_msg) - + def test_adjust_gamma_zero(self): """White image should be returned for gamma equal to zero""" with self.test_session(): @@ -311,13 +311,13 @@ class AdjustGamma(test_util.TensorFlowTestCase): y_tf = np.trunc(y.eval()) y_np = np.array( - [[0, 31, 45, 55, 63, 71, 78, 84], - [90, 95, 100, 105, 110, 115, 119, 123], - [127, 131, 135, 139, 142, 146, 149, 153], - [156, 159, 162, 165, 168, 171, 174, 177], - [180, 183, 186, 188, 191, 194, 196, 199], - [201, 204, 206, 209, 211, 214, 216, 218], - [221, 223, 225, 228, 230, 232, 234, 236], + [[0, 31, 45, 55, 63, 71, 78, 84], [ + 90, 95, 100, 105, 110, 115, 119, 123 + ], [127, 131, 135, 139, 142, 146, 149, 153], [ + 156, 159, 162, 165, 168, 171, 174, 177 + ], [180, 183, 186, 188, 191, 194, 196, 199], [ + 201, 204, 206, 209, 211, 214, 216, 218 + ], [221, 223, 225, 228, 230, 232, 234, 236], [238, 241, 243, 245, 247, 249, 251, 253]], dtype=np.float32) @@ -332,14 +332,12 @@ class AdjustGamma(test_util.TensorFlowTestCase): y_tf = np.trunc(y.eval()) y_np = np.array( - [[0, 0, 0, 0, 1, 1, 2, 3], - [4, 5, 6, 7, 9, 10, 12, 14], - [16, 18, 20, 22, 25, 27, 30, 33], - [36, 39, 42, 45, 49, 52, 56, 60], - [64, 68, 72, 76, 81, 85, 90, 95], - [100, 105, 110, 116, 121, 127, 132, 138], - [144, 150, 156, 163, 169, 176, 182, 189], - [196, 203, 211, 218, 225, 233, 241, 249]], + [[0, 0, 0, 0, 1, 1, 2, 3], [4, 5, 6, 7, 9, 10, 12, 14], [ + 16, 18, 20, 22, 25, 27, 30, 33 + ], [36, 39, 42, 45, 49, 52, 56, 60], [64, 68, 72, 76, 81, 85, 90, 95], + [100, 105, 110, 116, 121, 127, 132, 138], [ + 144, 150, 156, 163, 169, 176, 182, 189 + ], [196, 203, 211, 218, 225, 233, 241, 249]], dtype=np.float32) self.assertAllClose(y_tf, y_np, 1e-6) @@ -483,8 +481,7 @@ class FlipImageBenchmark(test.Benchmark): with session.Session("", graph=ops.Graph(), config=config) as sess: with ops.device(device): inputs = variables.Variable( - random_ops.random_uniform( - image_shape, dtype=dtypes.float32) * 255, + random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255, trainable=False, dtype=dtypes.float32) run_op = image_ops.flip_left_right(inputs) @@ -514,8 +511,7 @@ class FlipImageBenchmark(test.Benchmark): with session.Session("", graph=ops.Graph(), config=config) as sess: with ops.device(device): inputs = variables.Variable( - random_ops.random_uniform( - image_shape, dtype=dtypes.float32) * 255, + random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255, trainable=False, dtype=dtypes.float32) run_op = image_ops.random_flip_left_right(inputs) @@ -566,8 +562,7 @@ class AdjustHueBenchmark(test.Benchmark): with session.Session("", graph=ops.Graph(), config=config) as sess: with ops.device(device): inputs = variables.Variable( - random_ops.random_uniform( - image_shape, dtype=dtypes.float32) * 255, + random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255, trainable=False, dtype=dtypes.float32) delta = constant_op.constant(0.1, dtype=dtypes.float32) @@ -611,8 +606,7 @@ class AdjustSaturationBenchmark(test.Benchmark): with session.Session("", graph=ops.Graph(), config=config) as sess: with ops.device(device): inputs = variables.Variable( - random_ops.random_uniform( - image_shape, dtype=dtypes.float32) * 255, + random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255, trainable=False, dtype=dtypes.float32) delta = constant_op.constant(0.1, dtype=dtypes.float32) @@ -667,10 +661,11 @@ 
class ResizeBilinearBenchmark(test.Benchmark): results = self.run_op_benchmark( sess, benchmark_op, - name=("resize_bilinear_%s_%s_%s" % - (image_size[0], image_size[1], num_channels))) - print("%s : %.2f ms/img" % (results["name"], 1000 * results["wall_time"] - / (batch_size * num_ops))) + name=("resize_bilinear_%s_%s_%s" % (image_size[0], image_size[1], + num_channels))) + print("%s : %.2f ms/img" % + (results["name"], + 1000 * results["wall_time"] / (batch_size * num_ops))) def benchmarkSimilar3Channel(self): self._benchmarkResize((183, 229), 3) @@ -717,8 +712,9 @@ class ResizeBicubicBenchmark(test.Benchmark): min_iters=20, name=("resize_bicubic_%s_%s_%s" % (image_size[0], image_size[1], num_channels))) - print("%s : %.2f ms/img" % (results["name"], 1000 * results["wall_time"] - / (batch_size * num_ops))) + print("%s : %.2f ms/img" % + (results["name"], + 1000 * results["wall_time"] / (batch_size * num_ops))) def benchmarkSimilar3Channel(self): self._benchmarkResize((183, 229), 3) @@ -754,8 +750,8 @@ class ResizeAreaBenchmark(test.Benchmark): batch_size = 1 num_ops = 1000 img = variables.Variable( - random_ops.random_normal([batch_size, image_size[0], - image_size[1], num_channels]), + random_ops.random_normal( + [batch_size, image_size[0], image_size[1], num_channels]), name="img") deps = [] @@ -768,12 +764,13 @@ class ResizeAreaBenchmark(test.Benchmark): with session.Session() as sess: sess.run(variables.global_variables_initializer()) results = self.run_op_benchmark( - sess, benchmark_op, - name=("resize_area_%s_%s_%s" % - (image_size[0], image_size[1], num_channels))) - print("%s : %.2f ms/img" % ( - results["name"], - 1000*results["wall_time"] / (batch_size * num_ops))) + sess, + benchmark_op, + name=("resize_area_%s_%s_%s" % (image_size[0], image_size[1], + num_channels))) + print("%s : %.2f ms/img" % + (results["name"], + 1000 * results["wall_time"] / (batch_size * num_ops))) def benchmarkSimilar3Channel(self): self._benchmarkResize((183, 229), 3) @@ -847,8 +844,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase): flt_image = image_ops.convert_image_dtype(image, dtypes.float32) saturation_adjusted_image = gen_image_ops.adjust_saturation( flt_image, saturation_factor) - return image_ops.convert_image_dtype(saturation_adjusted_image, - orig_dtype) + return image_ops.convert_image_dtype(saturation_adjusted_image, orig_dtype) def testHalfSaturationFused(self): x_shape = [2, 2, 3] @@ -953,7 +949,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) y = image_ops.flip_left_right(x_tf) - self.assertTrue(y.op.name.startswith('flip_left_right')) + self.assertTrue(y.op.name.startswith("flip_left_right")) y_tf = y.eval() self.assertAllEqual(y_tf, y_np) @@ -964,7 +960,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) y = image_ops.random_flip_left_right(x_tf) - self.assertTrue(y.op.name.startswith('random_flip_left_right')) + self.assertTrue(y.op.name.startswith("random_flip_left_right")) count_flipped = 0 count_unflipped = 0 @@ -995,7 +991,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) y = image_ops.flip_up_down(x_tf) - self.assertTrue(y.op.name.startswith('flip_up_down')) + self.assertTrue(y.op.name.startswith("flip_up_down")) y_tf = y.eval() self.assertAllEqual(y_tf, 
y_np) @@ -1006,7 +1002,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) y = image_ops.random_flip_up_down(x_tf) - self.assertTrue(y.op.name.startswith('random_flip_up_down')) + self.assertTrue(y.op.name.startswith("random_flip_up_down")) count_flipped = 0 count_unflipped = 0 for _ in range(50): @@ -1036,7 +1032,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) y = image_ops.transpose_image(x_tf) - self.assertTrue(y.op.name.startswith('transpose_image')) + self.assertTrue(y.op.name.startswith("transpose_image")) y_tf = y.eval() self.assertAllEqual(y_tf, y_np) @@ -1261,7 +1257,7 @@ class PerImageWhiteningTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x = constant_op.constant(x_np, shape=x_shape) y = image_ops.per_image_standardization(x) - self.assertTrue(y.op.name.startswith('per_image_standardization')) + self.assertTrue(y.op.name.startswith("per_image_standardization")) y_tf = y.eval() self.assertAllClose(y_tf, y_np, atol=1e-4) @@ -1433,9 +1429,10 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase): # Each line is a test configuration: # (offset_height, offset_width, target_height, target_width), err_msg - test_config = (([-1, 0, 3, 3], "offset_height must be >= 0"), - ([0, -1, 3, 3], "offset_width must be >= 0"), - ([0, 0, 0, 3], "target_height must be > 0"), + test_config = (([-1, 0, 3, 3], "offset_height must be >= 0"), ([ + 0, -1, 3, 3 + ], "offset_width must be >= 0"), ([0, 0, 0, 3], + "target_height must be > 0"), ([0, 0, 3, 0], "target_width must be > 0"), ([2, 0, 3, 3], "height must be >= target + offset"), ([0, 2, 3, 3], "width must be >= target + offset")) @@ -1446,7 +1443,7 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase): def testNameScope(self): image = array_ops.placeholder(dtypes.float32, shape=[55, 66, 3]) y = image_ops.crop_to_bounding_box(image, 0, 0, 55, 66) - self.assertTrue(y.name.startswith('crop_to_bounding_box')) + self.assertTrue(y.name.startswith("crop_to_bounding_box")) class CentralCropTest(test_util.TensorFlowTestCase): @@ -1471,9 +1468,10 @@ class CentralCropTest(test_util.TensorFlowTestCase): def testCropping(self): x_shape = [4, 8, 1] - x_np = np.array([[1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8], - [1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8]], - dtype=np.int32).reshape(x_shape) + x_np = np.array( + [[1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8], + [1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8]], + dtype=np.int32).reshape(x_shape) y_np = np.array([[3, 4, 5, 6], [3, 4, 5, 6]]).reshape([2, 4, 1]) with self.test_session(use_gpu=True): x = constant_op.constant(x_np, shape=x_shape) @@ -1490,7 +1488,7 @@ class CentralCropTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x = array_ops.placeholder(shape=x_shape, dtype=dtypes.int32) y = image_ops.central_crop(x, 0.33) - y_tf = y.eval(feed_dict={x:x_np}) + y_tf = y.eval(feed_dict={x: x_np}) self.assertAllEqual(y_tf, y_np) self.assertAllEqual(y_tf.shape, y_np.shape) @@ -1529,7 +1527,7 @@ class CentralCropTest(test_util.TensorFlowTestCase): x_np = np.ones(x_shape, dtype=np.float32) with self.test_session(use_gpu=True): y = image_ops.central_crop(x_np, 1.0) - self.assertTrue(y.op.name.startswith('central_crop')) + self.assertTrue(y.op.name.startswith("central_crop")) class 
PadToBoundingBoxTest(test_util.TensorFlowTestCase): @@ -1602,15 +1600,10 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase): self.assertEqual(y.get_shape().as_list(), post_shape) def testInt64(self): - x = [1, 2, 3, - 4, 5, 6, - 7, 8, 9] + x = [1, 2, 3, 4, 5, 6, 7, 8, 9] x_shape = [3, 3, 1] - y = [0, 0, 0, - 1, 2, 3, - 4, 5, 6, - 7, 8, 9] + y = [0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9] y_shape = [4, 3, 1] x = np.array(x).reshape(x_shape) y = np.array(y).reshape(y_shape) @@ -1627,38 +1620,26 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase): self._assertReturns(x, x_shape, offset_height, offset_width, x, x_shape) def testPadding(self): - x = [1, 2, 3, - 4, 5, 6, - 7, 8, 9] + x = [1, 2, 3, 4, 5, 6, 7, 8, 9] x_shape = [3, 3, 1] offset_height, offset_width = [1, 0] - y = [0, 0, 0, - 1, 2, 3, - 4, 5, 6, - 7, 8, 9] + y = [0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9] y_shape = [4, 3, 1] self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape) offset_height, offset_width = [0, 1] - y = [0, 1, 2, 3, - 0, 4, 5, 6, - 0, 7, 8, 9] + y = [0, 1, 2, 3, 0, 4, 5, 6, 0, 7, 8, 9] y_shape = [3, 4, 1] self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape) offset_height, offset_width = [0, 0] - y = [1, 2, 3, - 4, 5, 6, - 7, 8, 9, - 0, 0, 0] + y = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0] y_shape = [4, 3, 1] self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape) offset_height, offset_width = [0, 0] - y = [1, 2, 3, 0, - 4, 5, 6, 0, - 7, 8, 9, 0] + y = [1, 2, 3, 0, 4, 5, 6, 0, 7, 8, 9, 0] y_shape = [3, 4, 1] self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape) @@ -1690,9 +1671,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase): # Input image has 0-length dimension(s). # Each line is a test configuration: # x_shape, target_height, target_width - test_config = (([0, 2, 2], 2, 2), - ([2, 0, 2], 2, 2), - ([2, 2, 0], 2, 2)) + test_config = (([0, 2, 2], 2, 2), ([2, 0, 2], 2, 2), ([2, 2, 0], 2, 2)) offset_height, offset_width = [0, 0] x = [] @@ -1737,7 +1716,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase): def testNameScope(self): image = array_ops.placeholder(dtypes.float32, shape=[55, 66, 3]) y = image_ops.pad_to_bounding_box(image, 0, 0, 55, 66) - self.assertTrue(y.op.name.startswith('pad_to_bounding_box')) + self.assertTrue(y.op.name.startswith("pad_to_bounding_box")) class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase): @@ -1750,8 +1729,8 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase): (bounding_box[2] - bounding_box[0])) image_size_np = np.array(image.shape, dtype=np.int32) - bounding_box_np = (np.array( - bounding_box, dtype=np.float32).reshape([1, 1, 4])) + bounding_box_np = ( + np.array(bounding_box, dtype=np.float32).reshape([1, 1, 4])) aspect_ratios = [] area_ratios = [] @@ -1796,7 +1775,9 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase): y = array_ops.strided_slice(image_tf, begin, begin + size) for _ in xrange(num_iter): - y_tf = y.eval(feed_dict={min_object_covered_placeholder: min_object_covered}) + y_tf = y.eval(feed_dict={ + min_object_covered_placeholder: min_object_covered + }) crop_height = y_tf.shape[0] crop_width = y_tf.shape[1] aspect_ratio = float(crop_width) / float(crop_height) @@ -1890,7 +1871,8 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase): bounding_box = constant_op.constant( [0.0, 0.0, 1.0, 1.0], shape=[4], - dtype=dtypes.float32,) + dtype=dtypes.float32, + ) begin, end, bbox_for_drawing = image_ops.sample_distorted_bounding_box( 
image_size=image_size, bounding_boxes=bounding_box, @@ -1937,13 +1919,15 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase): class ResizeImagesTest(test_util.TensorFlowTestCase): - OPTIONS = [image_ops.ResizeMethod.BILINEAR, - image_ops.ResizeMethod.NEAREST_NEIGHBOR, - image_ops.ResizeMethod.BICUBIC, - image_ops.ResizeMethod.AREA] + OPTIONS = [ + image_ops.ResizeMethod.BILINEAR, image_ops.ResizeMethod.NEAREST_NEIGHBOR, + image_ops.ResizeMethod.BICUBIC, image_ops.ResizeMethod.AREA + ] - TYPES = [np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64, - np.float16, np.float32, np.float64] + TYPES = [ + np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64, np.float16, + np.float32, np.float64 + ] def _assertShapeInference(self, pre_shape, size, post_shape): # Try single image resize @@ -1971,12 +1955,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): single_shape = [6, 4, 1] # This test is also conducted with int8, so 127 is the maximum # value that can be used. - data = [127, 127, 64, 64, - 127, 127, 64, 64, - 64, 64, 127, 127, - 64, 64, 127, 127, - 50, 50, 100, 100, - 50, 50, 100, 100] + data = [ + 127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127, + 50, 50, 100, 100, 50, 50, 100, 100 + ] target_height = 6 target_width = 4 @@ -2007,12 +1989,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): single_shape = [6, 4, 1] # This test is also conducted with int8, so 127 is the maximum # value that can be used. - data = [127, 127, 64, 64, - 127, 127, 64, 64, - 64, 64, 127, 127, - 64, 64, 127, 127, - 50, 50, 100, 100, - 50, 50, 100, 100] + data = [ + 127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127, + 50, 50, 100, 100, 50, 50, 100, 100 + ] new_size = array_ops.placeholder(dtypes.int32, shape=(2)) img_np = np.array(data, dtype=np.uint8).reshape(img_shape) @@ -2066,8 +2046,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): image_ops.ResizeMethod.BILINEAR) def testReturnDtype(self): - target_shapes = [[6, 4], [3, 2], [array_ops.placeholder(dtypes.int32), - array_ops.placeholder(dtypes.int32)]] + target_shapes = [[6, 4], [3, 2], [ + array_ops.placeholder(dtypes.int32), + array_ops.placeholder(dtypes.int32) + ]] for nptype in self.TYPES: image = array_ops.placeholder(nptype, shape=[1, 6, 4, 1]) for opt in self.OPTIONS: @@ -2084,12 +2066,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): img_shape = [1, 6, 4, 1] # This test is also conducted with int8, so 127 is the maximum # value that can be used. - data = [127, 127, 64, 64, - 127, 127, 64, 64, - 64, 64, 127, 127, - 64, 64, 127, 127, - 50, 50, 100, 100, - 50, 50, 100, 100] + data = [ + 127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127, + 50, 50, 100, 100, 50, 50, 100, 100 + ] # Test size where width is specified as a tensor which is a sum # of two tensors. width_1 = constant_op.constant(1) @@ -2111,15 +2091,11 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): def testResizeDown(self): # This test is also conducted with int8, so 127 is the maximum # value that can be used. 
- data = [127, 127, 64, 64, - 127, 127, 64, 64, - 64, 64, 127, 127, - 64, 64, 127, 127, - 50, 50, 100, 100, - 50, 50, 100, 100] - expected_data = [127, 64, - 64, 127, - 50, 100] + data = [ + 127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127, + 50, 50, 100, 100, 50, 50, 100, 100 + ] + expected_data = [127, 64, 64, 127, 50, 100] target_height = 3 target_width = 2 @@ -2145,39 +2121,31 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): def testResizeUpAlignCornersFalse(self): img_shape = [1, 3, 2, 1] - data = [64, 32, - 32, 64, - 50, 100] + data = [64, 32, 32, 64, 50, 100] target_height = 6 target_width = 4 expected_data = {} expected_data[image_ops.ResizeMethod.BILINEAR] = [ - 64.0, 48.0, 32.0, 32.0, - 48.0, 48.0, 48.0, 48.0, - 32.0, 48.0, 64.0, 64.0, - 41.0, 61.5, 82.0, 82.0, - 50.0, 75.0, 100.0, 100.0, - 50.0, 75.0, 100.0, 100.0] + 64.0, 48.0, 32.0, 32.0, 48.0, 48.0, 48.0, 48.0, 32.0, 48.0, 64.0, 64.0, + 41.0, 61.5, 82.0, 82.0, 50.0, 75.0, 100.0, 100.0, 50.0, 75.0, 100.0, + 100.0 + ] expected_data[image_ops.ResizeMethod.NEAREST_NEIGHBOR] = [ - 64.0, 64.0, 32.0, 32.0, - 64.0, 64.0, 32.0, 32.0, - 32.0, 32.0, 64.0, 64.0, - 32.0, 32.0, 64.0, 64.0, - 50.0, 50.0, 100.0, 100.0, - 50.0, 50.0, 100.0, 100.0] + 64.0, 64.0, 32.0, 32.0, 64.0, 64.0, 32.0, 32.0, 32.0, 32.0, 64.0, 64.0, + 32.0, 32.0, 64.0, 64.0, 50.0, 50.0, 100.0, 100.0, 50.0, 50.0, 100.0, + 100.0 + ] expected_data[image_ops.ResizeMethod.AREA] = [ - 64.0, 64.0, 32.0, 32.0, - 64.0, 64.0, 32.0, 32.0, - 32.0, 32.0, 64.0, 64.0, - 32.0, 32.0, 64.0, 64.0, - 50.0, 50.0, 100.0, 100.0, - 50.0, 50.0, 100.0, 100.0] + 64.0, 64.0, 32.0, 32.0, 64.0, 64.0, 32.0, 32.0, 32.0, 32.0, 64.0, 64.0, + 32.0, 32.0, 64.0, 64.0, 50.0, 50.0, 100.0, 100.0, 50.0, 50.0, 100.0, + 100.0 + ] for nptype in self.TYPES: for opt in [ image_ops.ResizeMethod.BILINEAR, - image_ops.ResizeMethod.NEAREST_NEIGHBOR, - image_ops.ResizeMethod.AREA]: + image_ops.ResizeMethod.NEAREST_NEIGHBOR, image_ops.ResizeMethod.AREA + ]: with self.test_session(use_gpu=True): img_np = np.array(data, dtype=nptype).reshape(img_shape) image = constant_op.constant(img_np, shape=img_shape) @@ -2190,41 +2158,29 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): def testResizeUpAlignCornersTrue(self): img_shape = [1, 3, 2, 1] - data = [6, 3, - 3, 6, - 6, 9] + data = [6, 3, 3, 6, 6, 9] target_height = 5 target_width = 4 expected_data = {} expected_data[image_ops.ResizeMethod.BILINEAR] = [ - 6.0, 5.0, 4.0, 3.0, - 4.5, 4.5, 4.5, 4.5, - 3.0, 4.0, 5.0, 6.0, - 4.5, 5.5, 6.5, 7.5, - 6.0, 7.0, 8.0, 9.0 + 6.0, 5.0, 4.0, 3.0, 4.5, 4.5, 4.5, 4.5, 3.0, 4.0, 5.0, 6.0, 4.5, 5.5, + 6.5, 7.5, 6.0, 7.0, 8.0, 9.0 ] expected_data[image_ops.ResizeMethod.NEAREST_NEIGHBOR] = [ - 6.0, 6.0, 3.0, 3.0, - 3.0, 3.0, 6.0, 6.0, - 3.0, 3.0, 6.0, 6.0, - 6.0, 6.0, 9.0, 9.0, - 6.0, 6.0, 9.0, 9.0 + 6.0, 6.0, 3.0, 3.0, 3.0, 3.0, 6.0, 6.0, 3.0, 3.0, 6.0, 6.0, 6.0, 6.0, + 9.0, 9.0, 6.0, 6.0, 9.0, 9.0 ] # TODO(b/37749740): Improve alignment of ResizeMethod.AREA when # align_corners=True. 
expected_data[image_ops.ResizeMethod.AREA] = [ - 6.0, 6.0, 6.0, 3.0, - 6.0, 6.0, 6.0, 3.0, - 3.0, 3.0, 3.0, 6.0, - 3.0, 3.0, 3.0, 6.0, - 6.0, 6.0, 6.0, 9.0 + 6.0, 6.0, 6.0, 3.0, 6.0, 6.0, 6.0, 3.0, 3.0, 3.0, 3.0, 6.0, 3.0, 3.0, + 3.0, 6.0, 6.0, 6.0, 6.0, 9.0 ] for nptype in self.TYPES: for opt in [ image_ops.ResizeMethod.BILINEAR, - image_ops.ResizeMethod.NEAREST_NEIGHBOR, - image_ops.ResizeMethod.AREA + image_ops.ResizeMethod.NEAREST_NEIGHBOR, image_ops.ResizeMethod.AREA ]: with self.test_session(use_gpu=True): img_np = np.array(data, dtype=nptype).reshape(img_shape) @@ -2238,23 +2194,21 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): def testResizeUpBicubic(self): img_shape = [1, 6, 6, 1] - data = [128, 128, 64, 64, 128, 128, 64, 64, - 64, 64, 128, 128, 64, 64, 128, 128, - 50, 50, 100, 100, 50, 50, 100, 100, - 50, 50, 100, 100, 50, 50, 100, 100, - 50, 50, 100, 100] + data = [ + 128, 128, 64, 64, 128, 128, 64, 64, 64, 64, 128, 128, 64, 64, 128, 128, + 50, 50, 100, 100, 50, 50, 100, 100, 50, 50, 100, 100, 50, 50, 100, 100, + 50, 50, 100, 100 + ] img_np = np.array(data, dtype=np.uint8).reshape(img_shape) target_height = 8 target_width = 8 - expected_data = [128, 135, 96, 55, 64, 114, 134, 128, - 78, 81, 68, 52, 57, 118, 144, 136, - 55, 49, 79, 109, 103, 89, 83, 84, - 74, 70, 95, 122, 115, 69, 49, 55, - 100, 105, 75, 43, 50, 89, 105, 100, - 57, 54, 74, 96, 91, 65, 55, 58, - 70, 69, 75, 81, 80, 72, 69, 70, - 105, 112, 75, 36, 45, 92, 111, 105] + expected_data = [ + 128, 135, 96, 55, 64, 114, 134, 128, 78, 81, 68, 52, 57, 118, 144, 136, + 55, 49, 79, 109, 103, 89, 83, 84, 74, 70, 95, 122, 115, 69, 49, 55, 100, + 105, 75, 43, 50, 89, 105, 100, 57, 54, 74, 96, 91, 65, 55, 58, 70, 69, + 75, 81, 80, 72, 69, 70, 105, 112, 75, 36, 45, 92, 111, 105 + ] with self.test_session(use_gpu=True): image = constant_op.constant(img_np, shape=img_shape) @@ -2267,20 +2221,17 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): def testResizeDownArea(self): img_shape = [1, 6, 6, 1] - data = [128, 64, 32, 16, 8, 4, - 4, 8, 16, 32, 64, 128, - 128, 64, 32, 16, 8, 4, - 5, 10, 15, 20, 25, 30, - 30, 25, 20, 15, 10, 5, - 5, 10, 15, 20, 25, 30] + data = [ + 128, 64, 32, 16, 8, 4, 4, 8, 16, 32, 64, 128, 128, 64, 32, 16, 8, 4, 5, + 10, 15, 20, 25, 30, 30, 25, 20, 15, 10, 5, 5, 10, 15, 20, 25, 30 + ] img_np = np.array(data, dtype=np.uint8).reshape(img_shape) target_height = 4 target_width = 4 - expected_data = [73, 33, 23, 39, - 73, 33, 23, 39, - 14, 16, 19, 21, - 14, 16, 19, 21] + expected_data = [ + 73, 33, 23, 39, 73, 33, 23, 39, 14, 16, 19, 21, 14, 16, 19, 21 + ] with self.test_session(use_gpu=True): image = constant_op.constant(img_np, shape=img_shape) @@ -2367,7 +2318,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): single_image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3]) y = image_ops.resize_images(single_image, [55, 66]) - self.assertTrue(y.op.name.startswith('resize_images')) + self.assertTrue(y.op.name.startswith("resize_images")) class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase): @@ -2440,133 +2391,93 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase): def testPad(self): # Pad even along col. - x = [1, 2, 3, 4, - 5, 6, 7, 8] + x = [1, 2, 3, 4, 5, 6, 7, 8] x_shape = [2, 4, 1] - y = [0, 1, 2, 3, 4, 0, - 0, 5, 6, 7, 8, 0] + y = [0, 1, 2, 3, 4, 0, 0, 5, 6, 7, 8, 0] y_shape = [2, 6, 1] self._assertReturns(x, x_shape, y, y_shape) # Pad odd along col. 
- x = [1, 2, 3, 4, - 5, 6, 7, 8] + x = [1, 2, 3, 4, 5, 6, 7, 8] x_shape = [2, 4, 1] - y = [0, 1, 2, 3, 4, 0, 0, - 0, 5, 6, 7, 8, 0, 0] + y = [0, 1, 2, 3, 4, 0, 0, 0, 5, 6, 7, 8, 0, 0] y_shape = [2, 7, 1] self._assertReturns(x, x_shape, y, y_shape) # Pad even along row. - x = [1, 2, 3, 4, - 5, 6, 7, 8] + x = [1, 2, 3, 4, 5, 6, 7, 8] x_shape = [2, 4, 1] - y = [0, 0, 0, 0, - 1, 2, 3, 4, - 5, 6, 7, 8, - 0, 0, 0, 0] + y = [0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0] y_shape = [4, 4, 1] self._assertReturns(x, x_shape, y, y_shape) # Pad odd along row. - x = [1, 2, 3, 4, - 5, 6, 7, 8] + x = [1, 2, 3, 4, 5, 6, 7, 8] x_shape = [2, 4, 1] - y = [0, 0, 0, 0, - 1, 2, 3, 4, - 5, 6, 7, 8, - 0, 0, 0, 0, - 0, 0, 0, 0] + y = [0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0] y_shape = [5, 4, 1] self._assertReturns(x, x_shape, y, y_shape) def testCrop(self): # Crop even along col. - x = [1, 2, 3, 4, - 5, 6, 7, 8] + x = [1, 2, 3, 4, 5, 6, 7, 8] x_shape = [2, 4, 1] - y = [2, 3, - 6, 7] + y = [2, 3, 6, 7] y_shape = [2, 2, 1] self._assertReturns(x, x_shape, y, y_shape) # Crop odd along col. - x = [1, 2, 3, 4, 5, 6, - 7, 8, 9, 10, 11, 12] + x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] x_shape = [2, 6, 1] - y = [2, 3, 4, - 8, 9, 10] + y = [2, 3, 4, 8, 9, 10] y_shape = [2, 3, 1] self._assertReturns(x, x_shape, y, y_shape) # Crop even along row. - x = [1, 2, - 3, 4, - 5, 6, - 7, 8] + x = [1, 2, 3, 4, 5, 6, 7, 8] x_shape = [4, 2, 1] - y = [3, 4, - 5, 6] + y = [3, 4, 5, 6] y_shape = [2, 2, 1] self._assertReturns(x, x_shape, y, y_shape) # Crop odd along row. - x = [1, 2, - 3, 4, - 5, 6, - 7, 8, - 9, 10, - 11, 12, - 13, 14, - 15, 16] + x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] x_shape = [8, 2, 1] - y = [3, 4, - 5, 6, - 7, 8, - 9, 10, - 11, 12] + y = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12] y_shape = [5, 2, 1] self._assertReturns(x, x_shape, y, y_shape) def testCropAndPad(self): # Pad along row but crop along col. - x = [1, 2, 3, 4, - 5, 6, 7, 8] + x = [1, 2, 3, 4, 5, 6, 7, 8] x_shape = [2, 4, 1] - y = [0, 0, - 2, 3, - 6, 7, - 0, 0] + y = [0, 0, 2, 3, 6, 7, 0, 0] y_shape = [4, 2, 1] self._assertReturns(x, x_shape, y, y_shape) # Crop along row but pad along col. 
- x = [1, 2, - 3, 4, - 5, 6, - 7, 8] + x = [1, 2, 3, 4, 5, 6, 7, 8] x_shape = [4, 2, 1] - y = [0, 3, 4, 0, - 0, 5, 6, 0] + y = [0, 3, 4, 0, 0, 5, 6, 0] y_shape = [2, 4, 1] self._assertReturns(x, x_shape, y, y_shape) @@ -2647,7 +2558,7 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase): def testNameScope(self): image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3]) y = image_ops.resize_image_with_crop_or_pad(image, 55, 66) - self.assertTrue(y.op.name.startswith('resize_image_with_crop_or_pad')) + self.assertTrue(y.op.name.startswith("resize_image_with_crop_or_pad")) def _SimpleColorRamp(): @@ -2916,8 +2827,8 @@ class GifTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True) as sess: gif = io_ops.read_file(filename) image = image_ops.decode_gif(gif) - with self.assertRaisesRegexp( - errors.InvalidArgumentError, "can't process optimized gif"): + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "can't process optimized gif"): gif, image = sess.run([gif, image]) def testValid(self): @@ -2979,8 +2890,9 @@ class ConvertImageTest(test_util.TensorFlowTestCase): y = image_ops.convert_image_dtype(image, output_dtype) self.assertTrue(y.dtype == output_dtype) self.assertAllClose(y.eval(), y_np, atol=1e-5) - if output_dtype in [dtypes.float32, dtypes.float64, - dtypes.int32, dtypes.int64]: + if output_dtype in [ + dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64 + ]: y_saturate = image_ops.convert_image_dtype( image, output_dtype, saturate=True) self.assertTrue(y_saturate.dtype == output_dtype) @@ -3000,8 +2912,8 @@ class ConvertImageTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): self._convert([0, 255], dtypes.uint8, dtypes.int16, [0, 255 * 128]) self._convert([0, 32767], dtypes.int16, dtypes.uint8, [0, 255]) - self._convert([0, 2 ** 32], dtypes.int64, dtypes.int32, [0, 1]) - self._convert([0, 1], dtypes.int32, dtypes.int64, [0, 2 ** 32]) + self._convert([0, 2**32], dtypes.int64, dtypes.int32, [0, 1]) + self._convert([0, 1], dtypes.int32, dtypes.int64, [0, 2**32]) def testConvertBetweenFloat(self): # Make sure converting to between float types does nothing interesting @@ -3022,20 +2934,14 @@ class ConvertImageTest(test_util.TensorFlowTestCase): def testConvertBetweenInt16AndInt8(self): with self.test_session(use_gpu=True): # uint8, uint16 - self._convert([0, 255 * 256], dtypes.uint16, dtypes.uint8, - [0, 255]) - self._convert([0, 255], dtypes.uint8, dtypes.uint16, - [0, 255 * 256]) + self._convert([0, 255 * 256], dtypes.uint16, dtypes.uint8, [0, 255]) + self._convert([0, 255], dtypes.uint8, dtypes.uint16, [0, 255 * 256]) # int8, uint16 - self._convert([0, 127 * 2 * 256], dtypes.uint16, dtypes.int8, - [0, 127]) - self._convert([0, 127], dtypes.int8, dtypes.uint16, - [0, 127 * 2 * 256]) + self._convert([0, 127 * 2 * 256], dtypes.uint16, dtypes.int8, [0, 127]) + self._convert([0, 127], dtypes.int8, dtypes.uint16, [0, 127 * 2 * 256]) # int16, uint16 - self._convert([0, 255 * 256], dtypes.uint16, dtypes.int16, - [0, 255 * 128]) - self._convert([0, 255 * 128], dtypes.int16, dtypes.uint16, - [0, 255 * 256]) + self._convert([0, 255 * 256], dtypes.uint16, dtypes.int16, [0, 255 * 128]) + self._convert([0, 255 * 128], dtypes.int16, dtypes.uint16, [0, 255 * 256]) class TotalVariationTest(test_util.TensorFlowTestCase): @@ -3168,20 +3074,17 @@ class TotalVariationTest(test_util.TensorFlowTestCase): # The following are the sum of absolute differences between the pixels. 
# sum row dif = (4-1) + (7-2) = 3 + 5 = 8 # sum col dif = (2-1) + (7-4) = 1 + 3 = 4 - r = [[1, 2], - [4, 7]] + r = [[1, 2], [4, 7]] # Blue color channel. # sum row dif = 18 + 29 = 47 # sum col dif = 7 + 18 = 25 - g = [[11, 18], - [29, 47]] + g = [[11, 18], [29, 47]] # Green color channel. # sum row dif = 120 + 193 = 313 # sum col dif = 47 + 120 = 167 - b = [[73, 120], - [193, 313]] + b = [[73, 120], [193, 313]] # Combine the 3 color channels into a single 3-dim array. # The shape is (2, 2, 3) corresponding to (height, width and color). @@ -3210,9 +3113,7 @@ class TotalVariationTest(test_util.TensorFlowTestCase): # Combine these 3 images into a single array of shape (3, 2, 2, 3) # where the first dimension is for the image-number. - multi = np.vstack((a[np.newaxis, :], - b[np.newaxis, :], - c[np.newaxis, :])) + multi = np.vstack((a[np.newaxis, :], b[np.newaxis, :], c[np.newaxis, :])) # Check that TensorFlow correctly calculates the total variation # for each image individually and returns the correct array. @@ -3268,6 +3169,46 @@ class NonMaxSuppressionTest(test_util.TensorFlowTestCase): boxes, scores, max_output_size, iou_threshold).eval() self.assertAllClose(selected_indices, [3, 0, 5]) + def testInvalidShape(self): + # The boxes should be 2D of shape [num_boxes, 4]. + with self.assertRaisesRegexp( + ValueError, 'Shape must be rank 2 but is rank 1'): + boxes = constant_op.constant([0.0, 0.0, 1.0, 1.0]) + scores = constant_op.constant([0.9]) + selected_indices = image_ops.non_max_suppression( + boxes, scores, 3, 0.5) + + with self.assertRaisesRegexp( + ValueError, 'Dimension must be 4 but is 3'): + boxes = constant_op.constant([[0.0, 0.0, 1.0]]) + scores = constant_op.constant([0.9]) + selected_indices = image_ops.non_max_suppression( + boxes, scores, 3, 0.5) + + # The scores should be 1D of shape [num_boxes]. + with self.assertRaisesRegexp( + ValueError, 'Shape must be rank 1 but is rank 2'): + boxes = constant_op.constant([[0.0, 0.0, 1.0, 1.0]]) + scores = constant_op.constant([[0.9]]) + selected_indices = image_ops.non_max_suppression( + boxes, scores, 3, 0.5) + + # The max_output_size should be a scaler (0-D). + with self.assertRaisesRegexp( + ValueError, 'Shape must be rank 0 but is rank 1'): + boxes = constant_op.constant([[0.0, 0.0, 1.0, 1.0]]) + scores = constant_op.constant([0.9]) + selected_indices = image_ops.non_max_suppression( + boxes, scores, [3], 0.5) + + # The iou_threshold should be a scaler (0-D). + with self.assertRaisesRegexp( + ValueError, 'Shape must be rank 0 but is rank 2'): + boxes = constant_op.constant([[0.0, 0.0, 1.0, 1.0]]) + scores = constant_op.constant([0.9]) + selected_indices = image_ops.non_max_suppression( + boxes, scores, 3, [[0.5]]) + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/ops/linalg/linalg_impl.py b/tensorflow/python/ops/linalg/linalg_impl.py index db33a08137e..a5096ffdd9c 100644 --- a/tensorflow/python/ops/linalg/linalg_impl.py +++ b/tensorflow/python/ops/linalg/linalg_impl.py @@ -65,8 +65,8 @@ def logdet(matrix, name=None): ``` Args: - matrix: A `Tensor`. Must be `float32`, `float64`, `complex64`, or - `complex128` with shape `[..., M, M]`. + matrix: A `Tensor`. Must be `float16`, `float32`, `float64`, `complex64`, + or `complex128` with shape `[..., M, M]`. name: A name to give this `Op`. Defaults to `logdet`. Returns: @@ -99,8 +99,8 @@ def adjoint(matrix, name=None): # [3 - 3j, 6 - 6j]] Args: - matrix: A `Tensor`. Must be `float32`, `float64`, `complex64`, or - `complex128` with shape `[..., M, M]`. 
+ matrix: A `Tensor`. Must be `float16`, `float32`, `float64`, `complex64`, + or `complex128` with shape `[..., M, M]`. name: A name to give this `Op` (optional). Returns: diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py index 27e0f17020a..8339c940af8 100644 --- a/tensorflow/python/ops/linalg/linear_operator.py +++ b/tensorflow/python/ops/linalg/linear_operator.py @@ -478,7 +478,6 @@ class LinearOperator(object): cond, self._max_condition_number_to_be_non_singular(), message="Singular matrix up to precision epsilon.") - raise NotImplementedError("assert_non_singular is not implemented.") def _max_condition_number_to_be_non_singular(self): """Return the maximum condition number that we consider nonsingular.""" diff --git a/tensorflow/python/ops/linalg/linear_operator_diag.py b/tensorflow/python/ops/linalg/linear_operator_diag.py index a4724d030f3..2217bfd5459 100644 --- a/tensorflow/python/ops/linalg/linear_operator_diag.py +++ b/tensorflow/python/ops/linalg/linear_operator_diag.py @@ -121,8 +121,8 @@ class LinearOperatorDiag(linear_operator.LinearOperator): Args: diag: Shape `[B1,...,Bb, N]` `Tensor` with `b >= 0` `N >= 0`. - The diagonal of the operator. Allowed dtypes: `float32`, `float64`, - `complex64`, `complex128`. + The diagonal of the operator. Allowed dtypes: `float16`, `float32`, + `float64`, `complex64`, `complex128`. is_non_singular: Expect that this operator is non-singular. is_self_adjoint: Expect that this operator is equal to its hermitian transpose. If `diag.dtype` is real, this is auto-set to `True`. @@ -167,7 +167,12 @@ class LinearOperatorDiag(linear_operator.LinearOperator): def _check_diag(self, diag): """Static check of diag.""" allowed_dtypes = [ - dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128] + dtypes.float16, + dtypes.float32, + dtypes.float64, + dtypes.complex64, + dtypes.complex128, + ] dtype = diag.dtype if dtype not in allowed_dtypes: diff --git a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py index dd4c7cb0413..8fb59ca1a7e 100644 --- a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py +++ b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py @@ -114,7 +114,8 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator): Args: matrix: Shape `[B1,...,Bb, M, N]` with `b >= 0`, `M, N >= 0`. - Allowed dtypes: `float32`, `float64`, `complex64`, `complex128`. + Allowed dtypes: `float16`, `float32`, `float64`, `complex64`, + `complex128`. is_non_singular: Expect that this operator is non-singular. is_self_adjoint: Expect that this operator is equal to its hermitian transpose. 
@@ -147,7 +148,12 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator): def _check_matrix(self, matrix): """Static check of the `matrix` argument.""" allowed_dtypes = [ - dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128] + dtypes.float16, + dtypes.float32, + dtypes.float64, + dtypes.complex64, + dtypes.complex128, + ] matrix = ops.convert_to_tensor(matrix, name="matrix") diff --git a/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py b/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py index ad3bb2efa94..36eed89db60 100644 --- a/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py +++ b/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py @@ -150,8 +150,8 @@ class LinearOperatorLowRankUpdate(linear_operator.LinearOperator): `is_X` matrix property hints, which will trigger the appropriate code path. Args: - base_operator: Shape `[B1,...,Bb, M, N]` real `float32` or `float64` - `LinearOperator`. This is `L` above. + base_operator: Shape `[B1,...,Bb, M, N]` real `float16`, `float32` or + `float64` `LinearOperator`. This is `L` above. u: Shape `[B1,...,Bb, M, K]` `Tensor` of same `dtype` as `base_operator`. This is `U` above. diag_update: Optional shape `[B1,...,Bb, K]` `Tensor` with same `dtype` @@ -188,7 +188,11 @@ class LinearOperatorLowRankUpdate(linear_operator.LinearOperator): # because if diag has non-zero imaginary part, it will not be # self-adjoint positive definite. dtype = base_operator.dtype - allowed_dtypes = [dtypes.float32, dtypes.float64] + allowed_dtypes = [ + dtypes.float16, + dtypes.float32, + dtypes.float64, + ] if dtype not in allowed_dtypes: raise TypeError( "Argument matrix must have dtype in %s. Found: %s" diff --git a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py index 6ea55f0367b..6419030755f 100644 --- a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py +++ b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py @@ -118,7 +118,8 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator): Args: tril: Shape `[B1,...,Bb, N, N]` with `b >= 0`, `N >= 0`. The lower triangular part of `tril` defines this operator. The strictly - upper triangle is ignored. Allowed dtypes: `float32`, `float64`. + upper triangle is ignored. Allowed dtypes: `float16`, `float32`, + `float64`. is_non_singular: Expect that this operator is non-singular. This operator is non-singular if and only if its diagonal elements are all non-zero. @@ -164,7 +165,11 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator): """Static check of the `tril` argument.""" # TODO(langmore) Add complex types once matrix_triangular_solve works for # them. 
- allowed_dtypes = [dtypes.float32, dtypes.float64] + allowed_dtypes = [ + dtypes.float16, + dtypes.float32, + dtypes.float64, + ] dtype = tril.dtype if dtype not in allowed_dtypes: raise TypeError( diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py index 72508eb4350..8b3c61b9339 100644 --- a/tensorflow/python/ops/losses/losses_impl.py +++ b/tensorflow/python/ops/losses/losses_impl.py @@ -28,8 +28,10 @@ from tensorflow.python.ops import nn_ops from tensorflow.python.ops import weights_broadcast_ops from tensorflow.python.ops.losses import util from tensorflow.python.util.deprecation import deprecated_args +from tensorflow.python.util.tf_export import tf_export +@tf_export("losses.Reduction") class Reduction(object): """Types of loss reduction. @@ -149,9 +151,10 @@ def _num_present(losses, weights, per_batch=False): def _num_elements(losses): """Computes the number of elements in `losses` tensor.""" with ops.name_scope(None, "num_elements", values=[losses]) as scope: - return array_ops.size(losses, name=scope, out_type=losses.dtype) + return math_ops.cast(array_ops.size(losses, name=scope), dtype=losses.dtype) +@tf_export("losses.compute_weighted_loss") def compute_weighted_loss( losses, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, reduction=Reduction.SUM_BY_NONZERO_WEIGHTS): @@ -211,6 +214,7 @@ def compute_weighted_loss( return loss +@tf_export("losses.absolute_difference") def absolute_difference( labels, predictions, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, @@ -258,6 +262,7 @@ def absolute_difference( losses, weights, scope, loss_collection, reduction=reduction) +@tf_export("losses.cosine_distance") @deprecated_args(None, "dim is deprecated, use axis instead", "dim") def cosine_distance( labels, predictions, axis=None, weights=1.0, scope=None, @@ -311,6 +316,7 @@ def cosine_distance( losses, weights, scope, loss_collection, reduction=reduction) +@tf_export("losses.hinge_loss") def hinge_loss(labels, logits, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, reduction=Reduction.SUM_BY_NONZERO_WEIGHTS): @@ -352,6 +358,7 @@ def hinge_loss(labels, logits, weights=1.0, scope=None, losses, weights, scope, loss_collection, reduction=reduction) +@tf_export("losses.huber_loss") def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, reduction=Reduction.SUM_BY_NONZERO_WEIGHTS): @@ -420,6 +427,7 @@ def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None, losses, weights, scope, loss_collection, reduction=reduction) +@tf_export("losses.log_loss") def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None, loss_collection=ops.GraphKeys.LOSSES, reduction=Reduction.SUM_BY_NONZERO_WEIGHTS): @@ -471,6 +479,7 @@ def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None, # TODO(b/37208492): Add reduction arg. 
+@tf_export("losses.mean_pairwise_squared_error") def mean_pairwise_squared_error( labels, predictions, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES): @@ -538,12 +547,13 @@ def mean_pairwise_squared_error( num_present_per_batch = _num_present(diffs, weights, per_batch=True) term1 = 2.0 * _safe_div(sum_squares_diff_per_batch, - num_present_per_batch) + num_present_per_batch-1) sum_diff = math_ops.reduce_sum( diffs, reduction_indices=reduction_indices, keep_dims=True) - term2 = 2.0 * _safe_div(math_ops.square(sum_diff), - math_ops.square(num_present_per_batch)) + term2 = 2.0 * _safe_div( + math_ops.square(sum_diff), + math_ops.multiply(num_present_per_batch, num_present_per_batch-1)) weighted_losses = math_ops.multiply(term1 - term2, weights) loss = math_ops.reduce_sum(weighted_losses) @@ -557,6 +567,7 @@ def mean_pairwise_squared_error( return mean_loss +@tf_export("losses.mean_squared_error") def mean_squared_error( labels, predictions, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, @@ -604,6 +615,7 @@ def mean_squared_error( losses, weights, scope, loss_collection, reduction=reduction) +@tf_export("losses.sigmoid_cross_entropy") def sigmoid_cross_entropy( multi_class_labels, logits, weights=1.0, label_smoothing=0, scope=None, loss_collection=ops.GraphKeys.LOSSES, @@ -662,6 +674,7 @@ def sigmoid_cross_entropy( losses, weights, scope, loss_collection, reduction=reduction) +@tf_export("losses.softmax_cross_entropy") def softmax_cross_entropy( onehot_labels, logits, weights=1.0, label_smoothing=0, scope=None, loss_collection=ops.GraphKeys.LOSSES, @@ -771,6 +784,7 @@ def _remove_squeezable_dimensions( return labels, predictions, weights +@tf_export("losses.sparse_softmax_cross_entropy") def sparse_softmax_cross_entropy( labels, logits, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, diff --git a/tensorflow/python/ops/losses/util.py b/tensorflow/python/ops/losses/util.py index 3718c481c26..b835d963869 100644 --- a/tensorflow/python/ops/losses/util.py +++ b/tensorflow/python/ops/losses/util.py @@ -30,8 +30,10 @@ from __future__ import print_function from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops from tensorflow.python.ops import math_ops +from tensorflow.python.util.tf_export import tf_export +@tf_export("losses.add_loss") def add_loss(loss, loss_collection=ops.GraphKeys.LOSSES): """Adds a externally defined loss to the collection of losses. @@ -43,6 +45,7 @@ def add_loss(loss, loss_collection=ops.GraphKeys.LOSSES): ops.add_to_collection(loss_collection, loss) +@tf_export("losses.get_losses") def get_losses(scope=None, loss_collection=ops.GraphKeys.LOSSES): """Gets the list of losses from the loss_collection. @@ -56,6 +59,7 @@ def get_losses(scope=None, loss_collection=ops.GraphKeys.LOSSES): return ops.get_collection(loss_collection, scope) +@tf_export("losses.get_regularization_losses") def get_regularization_losses(scope=None): """Gets the list of regularization losses. @@ -68,6 +72,7 @@ def get_regularization_losses(scope=None): return ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES, scope) +@tf_export("losses.get_regularization_loss") def get_regularization_loss(scope=None, name="total_regularization_loss"): """Gets the total regularization loss. 
@@ -85,6 +90,7 @@ def get_regularization_loss(scope=None, name="total_regularization_loss"): return constant_op.constant(0.0) +@tf_export("losses.get_total_loss") def get_total_loss(add_regularization_losses=True, name="total_loss"): """Returns a tensor whose value represents the total loss. diff --git a/tensorflow/python/ops/manip_grad.py b/tensorflow/python/ops/manip_grad.py new file mode 100644 index 00000000000..573e8c0a0d4 --- /dev/null +++ b/tensorflow/python/ops/manip_grad.py @@ -0,0 +1,32 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Gradients for operators defined in manip_ops.py.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import ops +from tensorflow.python.ops import manip_ops + + +@ops.RegisterGradient("Roll") +def _RollGrad(op, grad): + # The gradient is just the roll reversed + shift = op.inputs[1] + axis = op.inputs[2] + roll_grad = manip_ops.roll(grad, -shift, axis) + return roll_grad, None, None diff --git a/tensorflow/python/ops/manip_ops.py b/tensorflow/python/ops/manip_ops.py new file mode 100644 index 00000000000..c5f39784f43 --- /dev/null +++ b/tensorflow/python/ops/manip_ops.py @@ -0,0 +1,36 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Operators for manipulating tensors. 
+ +@@roll +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.ops import gen_manip_ops as _gen_manip_ops +from tensorflow.python.util.all_util import remove_undocumented + +# pylint: disable=protected-access +def roll(input, shift, axis): + return _gen_manip_ops.roll(input, shift, axis) + +roll.__doc__ = _gen_manip_ops.roll.__doc__ +# pylint: enable=protected-access + +_allowed_symbols = ['roll'] + +remove_undocumented(__name__, allowed_exception_list=_allowed_symbols) diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py index bca4c665d27..53308484c42 100644 --- a/tensorflow/python/ops/math_grad.py +++ b/tensorflow/python/ops/math_grad.py @@ -40,15 +40,16 @@ def _SumGrad(op, grad): """Gradient for Sum.""" # Fast path for when reducing to a scalar and ndims is known: adds only # Reshape and Tile ops (and possibly a Shape). - if op.inputs[0].get_shape().ndims is not None: + input_0_shape = op.inputs[0]._shape_tuple() # pylint: disable=protected-access + if input_0_shape is not None: axes = tensor_util.constant_value(op.inputs[1]) if axes is not None: - rank = op.inputs[0].get_shape().ndims + rank = len(input_0_shape) if np.array_equal(axes, np.arange(rank)): # Reduce all dims. grad = array_ops.reshape(grad, [1] * rank) # If shape is not fully defined (but rank is), we use Shape. - if op.inputs[0].get_shape().is_fully_defined(): - input_shape = op.inputs[0].get_shape().as_list() + if None not in input_0_shape: + input_shape = input_0_shape else: input_shape = array_ops.shape(op.inputs[0]) return [array_ops.tile(grad, input_shape), None] @@ -96,9 +97,12 @@ def _MinGrad(op, grad): def _MeanGrad(op, grad): """Gradient for Mean.""" sum_grad = _SumGrad(op, grad)[0] - input_size = op.inputs[0].get_shape().num_elements() - output_size = op.outputs[0].get_shape().num_elements() - if input_size is not None and output_size is not None: + input_shape = op.inputs[0]._shape_tuple() # pylint: disable=protected-access + output_shape = op.outputs[0]._shape_tuple() # pylint: disable=protected-access + if (input_shape is not None and output_shape is not None and + None not in input_shape and None not in output_shape): + input_size = np.prod(input_shape) + output_size = np.prod(output_shape) factor = input_size // max(output_size, 1) factor = constant_op.constant(factor, dtype=sum_grad.dtype) else: @@ -106,7 +110,7 @@ def _MeanGrad(op, grad): output_shape = array_ops.shape(op.outputs[0]) factor = _safe_shape_div( math_ops.reduce_prod(input_shape), math_ops.reduce_prod(output_shape)) - return sum_grad / math_ops.cast(factor, sum_grad.dtype), None + return math_ops.truediv(sum_grad, math_ops.cast(factor, sum_grad.dtype)), None @ops.RegisterGradient("Prod") @@ -169,8 +173,7 @@ def _SegmentMeanGrad(op, grad): array_ops.shape(op.inputs[1]), array_ops.fill(array_ops.expand_dims(input_rank - 1, 0), 1) ], 0) - ones = array_ops.fill(ones_shape, - constant_op.constant(1, dtype=grad.dtype)) + ones = array_ops.fill(ones_shape, constant_op.constant(1, dtype=grad.dtype)) scaled_grad = math_ops.div(grad, math_ops.segment_sum(ones, op.inputs[1])) return array_ops.gather(scaled_grad, op.inputs[1]), None @@ -226,16 +229,19 @@ def _SparseSegmentSqrtNWithNumSegmentsGrad(op, grad): def _SegmentMinOrMaxGrad(op, grad, is_sorted): - """Gradient for SegmentMin and (unsorted) SegmentMax. 
They share similar code.""" - zeros = array_ops.zeros(array_ops.shape(op.inputs[0]), - dtype=op.inputs[0].dtype) + """Gradient for SegmentMin and (unsorted) SegmentMax. + + They share similar code. + """ + zeros = array_ops.zeros( + array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype) # Get the number of selected (minimum or maximum) elements in each segment. gathered_outputs = array_ops.gather(op.outputs[0], op.inputs[1]) is_selected = math_ops.equal(op.inputs[0], gathered_outputs) if is_sorted: - num_selected = math_ops.segment_sum(math_ops.cast(is_selected, grad.dtype), - op.inputs[1]) + num_selected = math_ops.segment_sum( + math_ops.cast(is_selected, grad.dtype), op.inputs[1]) else: num_selected = math_ops.unsorted_segment_sum( math_ops.cast(is_selected, grad.dtype), op.inputs[1], op.inputs[2]) @@ -330,7 +336,7 @@ def _SquareGrad(op, grad): # Added control dependencies to prevent 2*x from being computed too early. with ops.control_dependencies([grad]): x = math_ops.conj(x) - return grad * (2.0 * x) + return math_ops.multiply(grad, math_ops.multiply(x, 2.0)) @ops.RegisterGradient("Sqrt") @@ -532,8 +538,8 @@ def _IgammaGrad(op, grad): # and Gamma'(a) can grow large. partial_x = math_ops.exp(-x + (a - 1) * math_ops.log(x) - math_ops.lgamma(a)) # TODO(b/36815900): Mark None return values as NotImplemented - return (None, - array_ops.reshape(math_ops.reduce_sum(partial_x * grad, rx), sx)) + return (None, array_ops.reshape( + math_ops.reduce_sum(partial_x * grad, rx), sx)) @ops.RegisterGradient("Igammac") @@ -559,15 +565,17 @@ def _BetaincGrad(op, grad): # Perform operations in log space before summing, because terms # can grow large. - log_beta = (gen_math_ops.lgamma(a) + gen_math_ops.lgamma(b) - - gen_math_ops.lgamma(a + b)) - partial_x = math_ops.exp( - (b - 1) * math_ops.log(1 - x) + (a - 1) * math_ops.log(x) - log_beta) + log_beta = ( + gen_math_ops.lgamma(a) + gen_math_ops.lgamma(b) - + gen_math_ops.lgamma(a + b)) + partial_x = math_ops.exp((b - 1) * math_ops.log(1 - x) + + (a - 1) * math_ops.log(x) - log_beta) # TODO(b/36815900): Mark None return values as NotImplemented - return (None, # da - None, # db - array_ops.reshape(math_ops.reduce_sum(partial_x * grad, rx), sx)) + return ( + None, # da + None, # db + array_ops.reshape(math_ops.reduce_sum(partial_x * grad, rx), sx)) @ops.RegisterGradient("Zeta") @@ -731,10 +739,8 @@ def _ShapesFullySpecifiedAndEqual(x, y, grad): y_shape = y._shape_tuple() grad_shape = grad._shape_tuple() # pylint: enable=protected-access - return (x_shape == y_shape and - x_shape == grad_shape and - x_shape is not None and - None not in x_shape) + return (x_shape == y_shape and x_shape == grad_shape and + x_shape is not None and None not in x_shape) @ops.RegisterGradient("Add") @@ -756,8 +762,12 @@ def _AddGrad(op, grad): @ops.RegisterGradient("Sub") def _SubGrad(op, grad): + """Gradient for Sub.""" x = op.inputs[0] y = op.inputs[1] + if (isinstance(grad, ops.Tensor) and + _ShapesFullySpecifiedAndEqual(x, y, grad)): + return grad, -grad sx = array_ops.shape(x) sy = array_ops.shape(y) # pylint: disable=protected-access @@ -848,10 +858,10 @@ def _RealDivGrad(op, grad): x = math_ops.conj(x) y = math_ops.conj(y) return (array_ops.reshape( - math_ops.reduce_sum(math_ops.realdiv(grad, y), rx), - sx), array_ops.reshape( - math_ops.reduce_sum(grad * math_ops.realdiv(math_ops.realdiv(-x, y), y), - ry), sy)) + math_ops.reduce_sum(math_ops.realdiv(grad, y), rx), sx), + array_ops.reshape( + math_ops.reduce_sum( + grad * math_ops.realdiv(math_ops.realdiv(-x, y), y), 
ry), sy)) @ops.RegisterGradient("Pow") @@ -946,8 +956,8 @@ def _SelectGrad(op, grad): c = op.inputs[0] x = op.inputs[1] zeros = array_ops.zeros_like(x) - return (None, array_ops.where(c, grad, zeros), - array_ops.where(c, zeros, grad)) + return (None, array_ops.where(c, grad, zeros), array_ops.where( + c, zeros, grad)) @ops.RegisterGradient("MatMul") @@ -1009,21 +1019,20 @@ def _SparseMatMulGrad(op, grad): dtype_a = op.inputs[0].dtype dtype_b = op.inputs[1].dtype if not t_a and not t_b: - return (_SparseMatMul( - grad, op.inputs[1], dtype_a, transpose_b=True), _SparseMatMul( - op.inputs[0], grad, dtype_b, transpose_a=True)) + return (_SparseMatMul(grad, op.inputs[1], dtype_a, transpose_b=True), + _SparseMatMul(op.inputs[0], grad, dtype_b, transpose_a=True)) elif not t_a and t_b: - return (_SparseMatMul(grad, op.inputs[1], dtype_a), _SparseMatMul( - grad, op.inputs[0], dtype_b, transpose_a=True)) + return (_SparseMatMul(grad, op.inputs[1], dtype_a), + _SparseMatMul(grad, op.inputs[0], dtype_b, transpose_a=True)) elif t_a and not t_b: - return (_SparseMatMul( - op.inputs[1], grad, dtype_a, transpose_b=True), + return (_SparseMatMul(op.inputs[1], grad, dtype_a, transpose_b=True), _SparseMatMul(op.inputs[0], grad, dtype_b)) elif t_a and t_b: return (_SparseMatMul( - op.inputs[1], grad, dtype_a, transpose_a=True, - transpose_b=True), _SparseMatMul( - grad, op.inputs[0], dtype_b, transpose_a=True, transpose_b=True)) + op.inputs[1], grad, dtype_a, transpose_a=True, transpose_b=True), + _SparseMatMul( + grad, op.inputs[0], dtype_b, transpose_a=True, + transpose_b=True)) @ops.RegisterGradient("Floor") @@ -1127,8 +1136,8 @@ def _ComplexAbsGrad(op, grad): """Returns the gradient of ComplexAbs.""" # TODO(b/27786104): The cast to complex could be removed once arithmetic # supports mixtures of complex64 and real values. - return (math_ops.complex(grad, array_ops.zeros_like(grad)) * - math_ops.sign(op.inputs[0])) + return (math_ops.complex(grad, array_ops.zeros_like(grad)) * math_ops.sign( + op.inputs[0])) @ops.RegisterGradient("Cast") @@ -1158,8 +1167,8 @@ def _CumsumGrad(op, grad): exclusive = op.get_attr("exclusive") reverse = op.get_attr("reverse") return [ - math_ops.cumsum( - grad, axis, exclusive=exclusive, reverse=not reverse), None + math_ops.cumsum(grad, axis, exclusive=exclusive, reverse=not reverse), + None ] diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index b8e8207bb24..9a8ac93de9d 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -1841,12 +1841,11 @@ def reduce_logsumexp(input_tensor, reduce_sum( gen_math_ops.exp(input_tensor - my_max), axis, - keepdims=True, - reduction_indices=reduction_indices)) + my_max + keepdims=keepdims, + reduction_indices=reduction_indices)) if not keepdims: - if isinstance(axis, int): - axis = [axis] - result = array_ops.squeeze(result, axis) + my_max = array_ops.reshape(my_max, array_ops.shape(result)) + result += my_max return _may_reduce_to_scalar(keepdims, axis, reduction_indices, result) diff --git a/tensorflow/python/ops/matmul_benchmark.py b/tensorflow/python/ops/matmul_benchmark.py index f95cf08de1a..6e5fe74290a 100644 --- a/tensorflow/python/ops/matmul_benchmark.py +++ b/tensorflow/python/ops/matmul_benchmark.py @@ -95,8 +95,8 @@ class MatmulBenchmark(test.Benchmark): num_items = n * m * k * 2 throughput = num_items * num_iters / duration / 1e9 print('%s %s input_info:%s %d %.4fsec, %.4fGitems/s.' 
% - (device, str(dtype), str(n) + 'x' + str(m) + 'x' + str(k) + ',ta:' - + str(transpose_a) + '.tb:' + str(transpose_b), num_iters, + (device, str(dtype), str(n) + 'x' + str(m) + 'x' + str(k) + + ',ta:' + str(transpose_a) + '.tb:' + str(transpose_b), num_iters, duration, throughput)) name_template = ('matmul_{device}_{dtype}_input_info_{inputinfo}') @@ -112,7 +112,8 @@ class MatmulBenchmark(test.Benchmark): return duration def run_test_gpu(self, n, m, k, transpose_a, transpose_b, dtype, num_iters): - self.run_graph(test.gpu_device_name(), n, m, k, transpose_a, transpose_b, num_iters, dtype) + self.run_graph(test.gpu_device_name(), n, m, k, transpose_a, transpose_b, + num_iters, dtype) def test_round(self, num_iters): dtypes = [np.float32, np.float64] @@ -124,8 +125,8 @@ class MatmulBenchmark(test.Benchmark): self.run_test_gpu(n, m, k, transpose_a, transpose_b, dtype, num_iters) for n, m, k, (transpose_a, transpose_b) in itertools.product( - [200], [1, 8, 20], [10000], [(False, False), (True, False), (False, - True)]): + [200], [1, 8, 20], [10000], [(False, False), (True, False), + (False, True)]): self.run_test_gpu(n, m, k, transpose_a, transpose_b, dtype, num_iters) for (n, m, k), (transpose_a, transpose_b) in itertools.product( diff --git a/tensorflow/python/ops/matmul_benchmark_test.py b/tensorflow/python/ops/matmul_benchmark_test.py index 5a9c0a7a495..3df0c66ef9c 100644 --- a/tensorflow/python/ops/matmul_benchmark_test.py +++ b/tensorflow/python/ops/matmul_benchmark_test.py @@ -33,11 +33,11 @@ def BuildGraphTest(n, m, k, transpose_a, transpose_b, dtype): def Test(self): if not googletest.is_gpu_available(): - tf_logging.info("Skipping BuildGraphTest %s", (n, m, k, transpose_a, - transpose_b)) + tf_logging.info("Skipping BuildGraphTest %s", + (n, m, k, transpose_a, transpose_b)) return - tf_logging.info("Testing BuildGraphTest %s", (n, m, k, transpose_a, - transpose_b)) + tf_logging.info("Testing BuildGraphTest %s", + (n, m, k, transpose_a, transpose_b)) self._VerifyBuildGraph(n, m, k, transpose_a, transpose_b, dtype) return Test @@ -47,11 +47,11 @@ def RunGraphTest(n, m, k, transpose_a, transpose_b, dtype): def Test(self): if not googletest.is_gpu_available(): - tf_logging.info("Skipping RunGraphTest %s", (n, m, k, transpose_a, - transpose_b)) + tf_logging.info("Skipping RunGraphTest %s", + (n, m, k, transpose_a, transpose_b)) return - tf_logging.info("Testing RunGraphTest %s", (n, m, k, transpose_a, - transpose_b)) + tf_logging.info("Testing RunGraphTest %s", + (n, m, k, transpose_a, transpose_b)) self._VerifyRunGraph(n, m, k, transpose_a, transpose_b, dtype) return Test @@ -71,40 +71,41 @@ class MatmulBenchmarkTest(googletest.TestCase): def _VerifyBuildGraph(self, n, m, k, transpose_a, transpose_b, dtype): graph = ops.Graph() with graph.as_default(): - matmul_benchmark.build_graph(googletest.gpu_device_name(), n, m, k, transpose_a, transpose_b, - dtype) + matmul_benchmark.build_graph(googletest.gpu_device_name(), n, m, k, + transpose_a, transpose_b, dtype) gd = graph.as_graph_def() - dev=googletest.gpu_device_name() + dev = googletest.gpu_device_name() proto_expected = """ - node { name: "random_uniform/shape" op: "Const" device: \""""+ dev +"""\" } - node { name: "random_uniform/min" op: "Const" device: \""""+ dev +"""\" } - node { name: "random_uniform/max" op: "Const" device: \""""+ dev +"""\" } - node { name: "random_uniform/RandomUniform" op: "RandomUniform" input: "random_uniform/shape" device: \""""+ dev +"""\" } - node { name: "random_uniform/sub" op: "Sub" input: 
"random_uniform/max" input: "random_uniform/min" device: \""""+ dev +"""\" } - node { name: "random_uniform/mul" op: "Mul" input: "random_uniform/RandomUniform" input: "random_uniform/sub" device: \""""+ dev +"""\" } - node { name: "random_uniform" op: "Add" input: "random_uniform/mul" input: "random_uniform/min" device: \""""+ dev +"""\" } - node { name: "Variable" op: "VariableV2" device: \""""+ dev +"""\" } - node { name: "Variable/Assign" op: "Assign" input: "Variable" input: "random_uniform" device: \""""+ dev +"""\" } - node { name: "Variable/read" op: "Identity" input: "Variable" device: \""""+ dev +"""\" } - node { name: "random_uniform_1/shape" op: "Const" device: \""""+ dev +"""\" } - node { name: "random_uniform_1/min" op: "Const" device: \""""+ dev +"""\" } - node { name: "random_uniform_1/max" op: "Const" device: \""""+ dev +"""\" } - node { name: "random_uniform_1/RandomUniform" op: "RandomUniform" input: "random_uniform_1/shape" device: \""""+ dev +"""\" } - node { name: "random_uniform_1/sub" op: "Sub" input: "random_uniform_1/max" input: "random_uniform_1/min" device: \""""+ dev +"""\" } - node { name: "random_uniform_1/mul" op: "Mul" input: "random_uniform_1/RandomUniform" input: "random_uniform_1/sub" device: \""""+ dev +"""\" } - node { name: "random_uniform_1" op: "Add" input: "random_uniform_1/mul" input: "random_uniform_1/min" device: \""""+ dev +"""\" } - node { name: "Variable_1" op: "VariableV2" device: \""""+ dev +"""\" } - node { name: "Variable_1/Assign" op: "Assign" input: "Variable_1" input: "random_uniform_1" device: \""""+ dev +"""\" } - node { name: "Variable_1/read" op: "Identity" input: "Variable_1" device: \""""+ dev +"""\" } - node { name: "MatMul" op: "MatMul" input: "Variable/read" input: "Variable_1/read" device: \""""+ dev +"""\" } - node { name: "group_deps" op: "NoOp" input: "^MatMul" device: \""""+ dev +"""\" } + node { name: "random_uniform/shape" op: "Const" device: \"""" + dev + """\" } + node { name: "random_uniform/min" op: "Const" device: \"""" + dev + """\" } + node { name: "random_uniform/max" op: "Const" device: \"""" + dev + """\" } + node { name: "random_uniform/RandomUniform" op: "RandomUniform" input: "random_uniform/shape" device: \"""" + dev + """\" } + node { name: "random_uniform/sub" op: "Sub" input: "random_uniform/max" input: "random_uniform/min" device: \"""" + dev + """\" } + node { name: "random_uniform/mul" op: "Mul" input: "random_uniform/RandomUniform" input: "random_uniform/sub" device: \"""" + dev + """\" } + node { name: "random_uniform" op: "Add" input: "random_uniform/mul" input: "random_uniform/min" device: \"""" + dev + """\" } + node { name: "Variable" op: "VariableV2" device: \"""" + dev + """\" } + node { name: "Variable/Assign" op: "Assign" input: "Variable" input: "random_uniform" device: \"""" + dev + """\" } + node { name: "Variable/read" op: "Identity" input: "Variable" device: \"""" + dev + """\" } + node { name: "random_uniform_1/shape" op: "Const" device: \"""" + dev + """\" } + node { name: "random_uniform_1/min" op: "Const" device: \"""" + dev + """\" } + node { name: "random_uniform_1/max" op: "Const" device: \"""" + dev + """\" } + node { name: "random_uniform_1/RandomUniform" op: "RandomUniform" input: "random_uniform_1/shape" device: \"""" + dev + """\" } + node { name: "random_uniform_1/sub" op: "Sub" input: "random_uniform_1/max" input: "random_uniform_1/min" device: \"""" + dev + """\" } + node { name: "random_uniform_1/mul" op: "Mul" input: "random_uniform_1/RandomUniform" input: 
"random_uniform_1/sub" device: \"""" + dev + """\" } + node { name: "random_uniform_1" op: "Add" input: "random_uniform_1/mul" input: "random_uniform_1/min" device: \"""" + dev + """\" } + node { name: "Variable_1" op: "VariableV2" device: \"""" + dev + """\" } + node { name: "Variable_1/Assign" op: "Assign" input: "Variable_1" input: "random_uniform_1" device: \"""" + dev + """\" } + node { name: "Variable_1/read" op: "Identity" input: "Variable_1" device: \"""" + dev + """\" } + node { name: "MatMul" op: "MatMul" input: "Variable/read" input: "Variable_1/read" device: \"""" + dev + """\" } + node { name: "group_deps" op: "NoOp" input: "^MatMul" device: \"""" + dev + """\" } """ self.assertProtoEquals(str(proto_expected), self._StripGraph(gd)) def _VerifyRunGraph(self, n, m, k, transpose_a, transpose_b, dtype): benchmark_instance = matmul_benchmark.MatmulBenchmark() - duration = benchmark_instance.run_graph(googletest.gpu_device_name(), n, m, k, transpose_a, - transpose_b, 1, dtype) + duration = benchmark_instance.run_graph(googletest.gpu_device_name(), n, m, + k, transpose_a, transpose_b, 1, + dtype) self.assertTrue(duration > 1e-6) @@ -113,8 +114,8 @@ if __name__ == "__main__": index = 0 for _dtype in dtypes: for _n, _m, (_transpose_a, _transpose_b) in itertools.product( - [512, 1024], [1, 8, 16, 128], [(False, False), (True, False), (False, - True)]): + [512, 1024], [1, 8, 16, 128], [(False, False), (True, False), + (False, True)]): _k = _n setattr(MatmulBenchmarkTest, "testBuildGraph_" + str(index), BuildGraphTest(_n, _m, _k, _transpose_a, _transpose_b, _dtype)) diff --git a/tensorflow/python/ops/nn_fused_batchnorm_test.py b/tensorflow/python/ops/nn_fused_batchnorm_test.py index 0593ed2cfa6..a08b836025d 100644 --- a/tensorflow/python/ops/nn_fused_batchnorm_test.py +++ b/tensorflow/python/ops/nn_fused_batchnorm_test.py @@ -278,7 +278,8 @@ class BatchNormalizationTest(test.TestCase): epsilon = y.op.get_attr('epsilon') data_format = y.op.get_attr('data_format') grad_vals = sess.run([grad_x, grad_scale, grad_offset]) - grad_internal = nn_grad._BatchNormGrad(grad_y, x, scale, pop_mean, pop_var, epsilon, data_format) + grad_internal = nn_grad._BatchNormGrad(grad_y, x, scale, pop_mean, + pop_var, epsilon, data_format) grad_internal_vals = sess.run(list(grad_internal)) for grad_val, grad_internal_val in zip(grad_vals, grad_internal_vals): self.assertAllClose(grad_val, grad_internal_val, atol=err_tolerance) diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 32b14f86b56..a691e281ee7 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -41,15 +41,19 @@ from tensorflow.python.ops.gen_nn_ops import * from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import tf_export - # Aliases for some automatically-generated names. local_response_normalization = gen_nn_ops.lrn # pylint: disable=protected-access -def _non_atrous_convolution(input, filter, padding, data_format=None, # pylint: disable=redefined-builtin - strides=None, name=None): +def _non_atrous_convolution( + input, + filter, + padding, + data_format=None, # pylint: disable=redefined-builtin + strides=None, + name=None): """Computes sums of N-D convolutions (actually cross correlation). It is required that 1 <= N <= 3. 
@@ -94,12 +98,13 @@ def _non_atrous_convolution(input, filter, padding, data_format=None, # pylint: input_shape = input.get_shape() filter = ops.convert_to_tensor(filter, name="filter") filter_shape = filter.get_shape() - op = _NonAtrousConvolution(input_shape, - filter_shape=filter_shape, - padding=padding, - data_format=data_format, - strides=strides, - name=scope) + op = _NonAtrousConvolution( + input_shape, + filter_shape=filter_shape, + padding=padding, + data_format=data_format, + strides=strides, + name=scope) return op(input, filter) @@ -119,11 +124,14 @@ class _NonAtrousConvolution(object): name: see _non_atrous_convolution. """ - def __init__(self, - input_shape, - filter_shape, # pylint: disable=redefined-builtin - padding, data_format=None, - strides=None, name=None): + def __init__( + self, + input_shape, + filter_shape, # pylint: disable=redefined-builtin + padding, + data_format=None, + strides=None, + name=None): filter_shape = filter_shape.with_rank(input_shape.ndims) self.padding = padding self.name = name @@ -137,8 +145,8 @@ class _NonAtrousConvolution(object): if strides is None: strides = [1] * conv_dims elif len(strides) != conv_dims: - raise ValueError("len(strides)=%d, but should be %d" % - (len(strides), conv_dims)) + raise ValueError("len(strides)=%d, but should be %d" % (len(strides), + conv_dims)) if conv_dims == 1: # conv1d uses the 2-d data format names if data_format is None or data_format == "NWC": @@ -177,8 +185,14 @@ class _NonAtrousConvolution(object): # those for gen_nn_ops.conv2d and gen_nn_ops.conv3d. # pylint: disable=redefined-builtin def _conv1d(self, input, filter, strides, padding, data_format, name): - return conv1d(value=input, filters=filter, stride=strides, padding=padding, - data_format=data_format, name=name) + return conv1d( + value=input, + filters=filter, + stride=strides, + padding=padding, + data_format=data_format, + name=name) + # pylint: enable=redefined-builtin def __call__(self, inp, filter): # pylint: disable=redefined-builtin @@ -340,13 +354,14 @@ def with_space_to_batch( def build_op(num_spatial_dims, padding): return lambda inp, _: op(inp, num_spatial_dims, padding) - new_op = _WithSpaceToBatch(input_shape, - dilation_rate, - padding, - build_op, - filter_shape=filter_shape, - spatial_dims=spatial_dims, - data_format=data_format) + new_op = _WithSpaceToBatch( + input_shape, + dilation_rate, + padding, + build_op, + filter_shape=filter_shape, + spatial_dims=spatial_dims, + data_format=data_format) return new_op(input, None) @@ -377,9 +392,8 @@ class _WithSpaceToBatch(object): spatial_dims=None, data_format=None): """Helper class for _with_space_to_batch.""" - dilation_rate = ops.convert_to_tensor(dilation_rate, - dtypes.int32, - name="dilation_rate") + dilation_rate = ops.convert_to_tensor( + dilation_rate, dtypes.int32, name="dilation_rate") try: rate_shape = dilation_rate.get_shape().with_rank(1) except ValueError: @@ -439,9 +453,7 @@ class _WithSpaceToBatch(object): if const_filter_shape is not None: filter_shape = const_filter_shape self.base_paddings = _with_space_to_batch_base_paddings( - const_filter_shape, - num_spatial_dims, - rate_or_const_rate) + const_filter_shape, num_spatial_dims, rate_or_const_rate) else: self.num_spatial_dims = num_spatial_dims self.rate_or_const_rate = rate_or_const_rate @@ -478,9 +490,7 @@ class _WithSpaceToBatch(object): # shape was not fully defined. 
filter_shape = array_ops.shape(filter) base_paddings = _with_space_to_batch_base_paddings( - filter_shape, - self.num_spatial_dims, - self.rate_or_const_rate) + filter_shape, self.num_spatial_dims, self.rate_or_const_rate) paddings, crops = array_ops.required_space_to_batch_paddings( input_shape=input_spatial_shape, base_paddings=base_paddings, @@ -491,9 +501,7 @@ class _WithSpaceToBatch(object): paddings = _with_space_to_batch_adjust(paddings, 0, spatial_dims) crops = _with_space_to_batch_adjust(crops, 0, spatial_dims) input_converted = array_ops.space_to_batch_nd( - input=inp, - block_shape=dilation_rate, - paddings=paddings) + input=inp, block_shape=dilation_rate, paddings=paddings) result = self.op(input_converted, filter) @@ -519,17 +527,17 @@ def _with_space_to_batch_base_paddings(filter_shape, num_spatial_dims, # Spatial dimensions of the filters and the upsampled filters in which we # introduce (rate - 1) zeros between consecutive filter values. filter_spatial_shape = filter_shape[:num_spatial_dims] - dilated_filter_spatial_shape = (filter_spatial_shape + - (filter_spatial_shape - 1) * - (rate_or_const_rate - 1)) + dilated_filter_spatial_shape = ( + filter_spatial_shape + (filter_spatial_shape - 1) * + (rate_or_const_rate - 1)) pad_extra_shape = dilated_filter_spatial_shape - 1 # When full_padding_shape is odd, we pad more at end, following the same # convention as conv2d. pad_extra_start = pad_extra_shape // 2 pad_extra_end = pad_extra_shape - pad_extra_start - base_paddings = array_ops.stack([[pad_extra_start[i], pad_extra_end[i]] - for i in range(num_spatial_dims)]) + base_paddings = array_ops.stack( + [[pad_extra_start[i], pad_extra_end[i]] for i in range(num_spatial_dims)]) return base_paddings @@ -623,8 +631,8 @@ def _get_strides_and_dilation_rate(num_spatial_dims, strides, dilation_rate): if strides is None: strides = [1] * num_spatial_dims elif len(strides) != num_spatial_dims: - raise ValueError("len(strides)=%d but should be %d" % - (len(strides), num_spatial_dims)) + raise ValueError("len(strides)=%d but should be %d" % (len(strides), + num_spatial_dims)) strides = np.array(strides, dtype=np.int32) if np.any(strides < 1): raise ValueError("all values of strides must be positive") @@ -636,9 +644,14 @@ def _get_strides_and_dilation_rate(num_spatial_dims, strides, dilation_rate): @tf_export("nn.convolution") -def convolution(input, filter, # pylint: disable=redefined-builtin - padding, strides=None, dilation_rate=None, - name=None, data_format=None): +def convolution( + input, + filter, # pylint: disable=redefined-builtin + padding, + strides=None, + dilation_rate=None, + name=None, + data_format=None): # pylint: disable=line-too-long """Computes sums of N-D convolutions (actually cross-correlation). 
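# Illustrative NumPy sketch (hypothetical shapes, mirroring the base-paddings
# arithmetic a few hunks above): a 3x3 filter at dilation rate 2 behaves like a
# 5x5 filter, so the extra padding is split between the start and end of each
# spatial dimension, padding more at the end when the amount is odd (the same
# convention as conv2d).
import numpy as np

filter_spatial_shape = np.array([3, 3])
rate = np.array([2, 2])

dilated = filter_spatial_shape + (filter_spatial_shape - 1) * (rate - 1)
pad_extra = dilated - 1
pad_start = pad_extra // 2
pad_end = pad_extra - pad_start

print(dilated)                                 # [5 5]
print(np.stack([pad_start, pad_end], axis=1))  # [[2 2] [2 2]]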
@@ -757,12 +770,14 @@ def convolution(input, filter, # pylint: disable=redefined-builtin input_shape = input.get_shape() filter = ops.convert_to_tensor(filter, name="filter") filter_shape = filter.get_shape() - op = Convolution(input_shape, - filter_shape, - padding, - strides=strides, - dilation_rate=dilation_rate, - name=name, data_format=data_format) + op = Convolution( + input_shape, + filter_shape, + padding, + strides=strides, + dilation_rate=dilation_rate, + name=name, + data_format=data_format) return op(input, filter) @@ -786,8 +801,11 @@ class Convolution(object): def __init__(self, input_shape, filter_shape, - padding, strides=None, dilation_rate=None, - name=None, data_format=None): + padding, + strides=None, + dilation_rate=None, + name=None, + data_format=None): """Helper function for convolution.""" num_total_dims = filter_shape.ndims if num_total_dims is None: @@ -809,17 +827,17 @@ class Convolution(object): if data_format is None or not data_format.startswith("NC"): input_channels_dim = input_shape[num_spatial_dims + 1] - spatial_dims = range(1, num_spatial_dims+1) + spatial_dims = range(1, num_spatial_dims + 1) else: input_channels_dim = input_shape[1] - spatial_dims = range(2, num_spatial_dims+2) + spatial_dims = range(2, num_spatial_dims + 2) - if not input_channels_dim.is_compatible_with(filter_shape[ - num_spatial_dims]): + if not input_channels_dim.is_compatible_with( + filter_shape[num_spatial_dims]): raise ValueError( "number of input channels does not match corresponding dimension of " - "filter, {} != {}".format(input_channels_dim, filter_shape[ - num_spatial_dims])) + "filter, {} != {}".format(input_channels_dim, + filter_shape[num_spatial_dims])) strides, dilation_rate = _get_strides_and_dilation_rate( num_spatial_dims, strides, dilation_rate) @@ -852,14 +870,15 @@ class Convolution(object): @tf_export("nn.pool") -def pool(input, # pylint: disable=redefined-builtin - window_shape, - pooling_type, - padding, - dilation_rate=None, - strides=None, - name=None, - data_format=None): +def pool( + input, # pylint: disable=redefined-builtin + window_shape, + pooling_type, + padding, + dilation_rate=None, + strides=None, + name=None, + data_format=None): # pylint: disable=line-too-long """Performs an N-D pooling operation. @@ -941,8 +960,8 @@ def pool(input, # pylint: disable=redefined-builtin """ # pylint: enable=line-too-long - with ops.name_scope(name, "%s_pool" % - (pooling_type.lower()), [input]) as scope: + with ops.name_scope(name, "%s_pool" % (pooling_type.lower()), + [input]) as scope: input = ops.convert_to_tensor(input, name="input") num_spatial_dims = len(window_shape) @@ -963,17 +982,18 @@ def pool(input, # pylint: disable=redefined-builtin "strides > window_shape not supported due to inconsistency between " "CPU and GPU implementations") - pooling_ops = {("MAX", 1): max_pool, - ("MAX", 2): max_pool, - ("MAX", 3): max_pool3d, # pylint: disable=undefined-variable - ("AVG", 1): avg_pool, - ("AVG", 2): avg_pool, - ("AVG", 3): avg_pool3d, # pylint: disable=undefined-variable - } + pooling_ops = { + ("MAX", 1): max_pool, + ("MAX", 2): max_pool, + ("MAX", 3): max_pool3d, # pylint: disable=undefined-variable + ("AVG", 1): avg_pool, + ("AVG", 2): avg_pool, + ("AVG", 3): avg_pool3d, # pylint: disable=undefined-variable + } op_key = (pooling_type, num_spatial_dims) if op_key not in pooling_ops: - raise ValueError("%d-D %s pooling is not supported." % - (op_key[1], op_key[0])) + raise ValueError("%d-D %s pooling is not supported." 
% (op_key[1], + op_key[0])) if data_format is None or not data_format.startswith("NC"): adjusted_window_shape = [1] + list(window_shape) + [1] @@ -1000,12 +1020,13 @@ def pool(input, # pylint: disable=redefined-builtin if num_spatial_dims == 1: converted_input = array_ops.expand_dims(converted_input, spatial_dims[0]) - result = pooling_ops[op_key](converted_input, - adjusted_window_shape, - adjusted_strides, - converted_padding, - name=scope, - **data_format_kwargs) + result = pooling_ops[op_key]( + converted_input, + adjusted_window_shape, + adjusted_strides, + converted_padding, + name=scope, + **data_format_kwargs) if num_spatial_dims == 1: result = array_ops.squeeze(result, [spatial_dims[0]]) return result @@ -1021,7 +1042,9 @@ def pool(input, # pylint: disable=redefined-builtin @tf_export("nn.atrous_conv2d") def atrous_conv2d(value, filters, rate, padding, name=None): - """Atrous convolution (a.k.a. convolution with holes or dilated convolution). + """Atrous convolution (a.k.a. + + convolution with holes or dilated convolution). This function is a simpler wrapper around the more general @{tf.nn.convolution}, and exists only for backwards compatibility. You can @@ -1065,7 +1088,8 @@ def atrous_conv2d(value, filters, rate, padding, name=None): that effectively use atrous convolution in different ways are, among others, [OverFeat: Integrated Recognition, Localization and Detection using Convolutional Networks](http://arxiv.org/abs/1312.6229) and [Fast Image - Scanning with Deep Max-Pooling Convolutional Neural Networks](http://arxiv.org/abs/1302.1700). + Scanning with Deep Max-Pooling Convolutional Neural + Networks](http://arxiv.org/abs/1302.1700). Atrous convolution is also closely related to the so-called noble identities in multi-rate signal processing. @@ -1156,13 +1180,14 @@ def atrous_conv2d(value, filters, rate, padding, name=None): @tf_export("nn.conv2d_transpose") -def conv2d_transpose(value, - filter, # pylint: disable=redefined-builtin - output_shape, - strides, - padding="SAME", - data_format="NHWC", - name=None): +def conv2d_transpose( + value, + filter, # pylint: disable=redefined-builtin + output_shape, + strides, + padding="SAME", + data_format="NHWC", + name=None): """The transpose of `conv2d`. This operation is sometimes called "deconvolution" after [Deconvolutional @@ -1207,15 +1232,16 @@ def conv2d_transpose(value, output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape") if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(4)): - raise ValueError("output_shape must have shape (4,), got {}" - .format(output_shape_.get_shape())) + raise ValueError("output_shape must have shape (4,), got {}".format( + output_shape_.get_shape())) if isinstance(output_shape, (list, np.ndarray)): # output_shape's shape should be == [4] if reached this point. 
if not filter.get_shape()[2].is_compatible_with(output_shape[axis]): raise ValueError( "output_shape does not match filter's output channels, " - "{} != {}".format(output_shape[axis], filter.get_shape()[2])) + "{} != {}".format(output_shape[axis], + filter.get_shape()[2])) if padding != "VALID" and padding != "SAME": raise ValueError("padding must be either VALID or SAME:" @@ -1281,29 +1307,32 @@ def atrous_conv2d_transpose(value, if not value.get_shape()[3].is_compatible_with(filters.get_shape()[3]): raise ValueError( "value's input channels does not match filters' input channels, " - "{} != {}".format(value.get_shape()[3], filters.get_shape()[3])) + "{} != {}".format(value.get_shape()[3], + filters.get_shape()[3])) if rate < 1: raise ValueError("rate {} cannot be less than one".format(rate)) if rate == 1: - return conv2d_transpose(value, - filters, - output_shape, - strides=[1, 1, 1, 1], - padding=padding, - data_format="NHWC") + return conv2d_transpose( + value, + filters, + output_shape, + strides=[1, 1, 1, 1], + padding=padding, + data_format="NHWC") output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape") if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(4)): - raise ValueError("output_shape must have shape (4,), got {}" - .format(output_shape_.get_shape())) + raise ValueError("output_shape must have shape (4,), got {}".format( + output_shape_.get_shape())) if isinstance(output_shape, (list, np.ndarray)): # output_shape's shape should be == [4] if reached this point. if not filters.get_shape()[2].is_compatible_with(output_shape[3]): raise ValueError( "output_shape does not match filter's output channels, " - "{} != {}".format(output_shape[3], filters.get_shape()[2])) + "{} != {}".format(output_shape[3], + filters.get_shape()[2])) # We have two padding contributions. The first is used for converting "SAME" # to "VALID". The second is required so that the height and width of the @@ -1352,14 +1381,13 @@ def atrous_conv2d_transpose(value, # component. space_to_batch_pad = [[0, pad_bottom_extra], [0, pad_right_extra]] - value = array_ops.space_to_batch(input=value, - paddings=space_to_batch_pad, - block_size=rate) + value = array_ops.space_to_batch( + input=value, paddings=space_to_batch_pad, block_size=rate) - input_sizes = [rate * rate * output_shape[0], - (in_height + pad_bottom_extra) // rate, - (in_width + pad_right_extra) // rate, - output_shape[3]] + input_sizes = [ + rate * rate * output_shape[0], (in_height + pad_bottom_extra) // rate, + (in_width + pad_right_extra) // rate, output_shape[3] + ] value = gen_nn_ops.conv2d_backprop_input( input_sizes=input_sizes, @@ -1373,19 +1401,19 @@ def atrous_conv2d_transpose(value, batch_to_space_crop = [[pad_top, pad_bottom + pad_bottom_extra], [pad_left, pad_right + pad_right_extra]] - return array_ops.batch_to_space(input=value, - crops=batch_to_space_crop, - block_size=rate) + return array_ops.batch_to_space( + input=value, crops=batch_to_space_crop, block_size=rate) @tf_export("nn.conv3d_transpose") -def conv3d_transpose(value, - filter, # pylint: disable=redefined-builtin - output_shape, - strides, - padding="SAME", - data_format="NDHWC", - name=None): +def conv3d_transpose( + value, + filter, # pylint: disable=redefined-builtin + output_shape, + strides, + padding="SAME", + data_format="NDHWC", + name=None): """The transpose of `conv3d`. 
This operation is sometimes called "deconvolution" after [Deconvolutional @@ -1428,27 +1456,29 @@ def conv3d_transpose(value, output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape") if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(5)): - raise ValueError("output_shape must have shape (5,), got {}" - .format(output_shape_.get_shape())) + raise ValueError("output_shape must have shape (5,), got {}".format( + output_shape_.get_shape())) if isinstance(output_shape, (list, np.ndarray)): # output_shape's shape should be == [5] if reached this point. if not filter.get_shape()[3].is_compatible_with(output_shape[4]): raise ValueError( "output_shape does not match filter's output channels, " - "{} != {}".format(output_shape[4], filter.get_shape()[3])) + "{} != {}".format(output_shape[4], + filter.get_shape()[3])) if padding != "VALID" and padding != "SAME": raise ValueError("padding must be either VALID or SAME:" " {}".format(padding)) - return gen_nn_ops.conv3d_backprop_input_v2(input_sizes=output_shape_, - filter=filter, - out_backprop=value, - strides=strides, - padding=padding, - data_format=data_format, - name=name) + return gen_nn_ops.conv3d_backprop_input_v2( + input_sizes=output_shape_, + filter=filter, + out_backprop=value, + strides=strides, + padding=padding, + data_format=data_format, + name=name) # pylint: disable=protected-access @@ -1514,7 +1544,9 @@ def crelu(features, name=None, axis=-1): Concatenates a ReLU which selects only the positive part of the activation with a ReLU which selects only the *negative* part of the activation. Note that as a result this non-linearity doubles the depth of the activations. - Source: [Understanding and Improving Convolutional Neural Networks via Concatenated Rectified Linear Units. W. Shang, et al.](https://arxiv.org/abs/1603.05201) + Source: [Understanding and Improving Convolutional Neural Networks via + Concatenated Rectified Linear Units. W. Shang, et + al.](https://arxiv.org/abs/1603.05201) Args: features: A `Tensor` with type `float`, `double`, `int32`, `int64`, `uint8`, @@ -1534,7 +1566,9 @@ def crelu(features, name=None, axis=-1): @tf_export("nn.relu6") def relu6(features, name=None): """Computes Rectified Linear 6: `min(max(features, 0), 6)`. - Source: [Convolutional Deep Belief Networks on CIFAR-10. A. Krizhevsky](http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf) + + Source: [Convolutional Deep Belief Networks on CIFAR-10. A. + Krizhevsky](http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf) Args: features: A `Tensor` with type `float`, `double`, `int32`, `int64`, `uint8`, @@ -1622,14 +1656,16 @@ def _softmax(logits, compute_op, dim=-1, name=None): InvalidArgumentError: if `logits` is empty or `dim` is beyond the last dimension of `logits`. 
""" + def _swap_axis(logits, dim_index, last_index, name=None): """Swaps logits's dim_index and last_index.""" - return array_ops.transpose(logits, - array_ops.concat([ - math_ops.range(dim_index), [last_index], - math_ops.range(dim_index + 1, last_index), - [dim_index] - ], 0), name=name) + return array_ops.transpose( + logits, + array_ops.concat([ + math_ops.range(dim_index), [last_index], + math_ops.range(dim_index + 1, last_index), [dim_index] + ], 0), + name=name) logits = ops.convert_to_tensor(logits) @@ -1746,9 +1782,12 @@ def _ensure_xent_args(name, sentinel, labels, logits): @tf_export("nn.softmax_cross_entropy_with_logits_v2") -def softmax_cross_entropy_with_logits_v2(_sentinel=None, # pylint: disable=invalid-name - labels=None, logits=None, - dim=-1, name=None): +def softmax_cross_entropy_with_logits_v2( + _sentinel=None, # pylint: disable=invalid-name + labels=None, + logits=None, + dim=-1, + name=None): """Computes softmax cross entropy between `logits` and `labels`. Measures the probability error in discrete classification tasks in which the @@ -1790,19 +1829,19 @@ def softmax_cross_entropy_with_logits_v2(_sentinel=None, # pylint: disable=inva A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the softmax cross entropy loss. """ - _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, - labels, logits) + _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels, + logits) # TODO(pcmurray) Raise an error when the labels do not sum to 1. Note: This # could break users who call this with bad labels, but disregard the bad # results. - with ops.name_scope( - name, "softmax_cross_entropy_with_logits", [logits, labels]) as name: + with ops.name_scope(name, "softmax_cross_entropy_with_logits", + [logits, labels]) as name: logits = ops.convert_to_tensor(logits, name="logits") labels = ops.convert_to_tensor(labels, name="labels") - precise_logits = math_ops.cast(logits, dtypes.float32) if ( - logits.dtype == dtypes.float16) else logits + precise_logits = math_ops.cast( + logits, dtypes.float32) if (logits.dtype == dtypes.float16) else logits # labels and logits must be of the same type labels = math_ops.cast(labels, precise_logits.dtype) input_rank = array_ops.rank(precise_logits) @@ -1811,13 +1850,14 @@ def softmax_cross_entropy_with_logits_v2(_sentinel=None, # pylint: disable=inva # Move the dim to the end if dim is not the last dimension. if dim is not -1: + def _move_dim_to_end(tensor, dim_index, rank): - return array_ops.transpose(tensor, - array_ops.concat([ - math_ops.range(dim_index), - math_ops.range(dim_index + 1, rank), - [dim_index] - ], 0)) + return array_ops.transpose( + tensor, + array_ops.concat([ + math_ops.range(dim_index), + math_ops.range(dim_index + 1, rank), [dim_index] + ], 0)) precise_logits = _move_dim_to_end(precise_logits, dim, input_rank) labels = _move_dim_to_end(labels, dim, input_rank) @@ -1862,9 +1902,12 @@ See tf.nn.softmax_cross_entropy_with_logits_v2. @tf_export("nn.softmax_cross_entropy_with_logits") @deprecation.deprecated(date=None, instructions=_XENT_DEPRECATION) -def softmax_cross_entropy_with_logits(_sentinel=None, # pylint: disable=invalid-name - labels=None, logits=None, - dim=-1, name=None): +def softmax_cross_entropy_with_logits( + _sentinel=None, # pylint: disable=invalid-name + labels=None, + logits=None, + dim=-1, + name=None): """Computes softmax cross entropy between `logits` and `labels`. 
Measures the probability error in discrete classification tasks in which the @@ -1906,11 +1949,11 @@ def softmax_cross_entropy_with_logits(_sentinel=None, # pylint: disable=invalid A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the softmax cross entropy loss. """ - _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, - labels, logits) + _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels, + logits) - with ops.name_scope( - name, "softmax_cross_entropy_with_logits_sg", [logits, labels]) as name: + with ops.name_scope(name, "softmax_cross_entropy_with_logits_sg", + [logits, labels]) as name: labels = array_ops.stop_gradient(labels, name="labels_stop_gradient") return softmax_cross_entropy_with_logits_v2( @@ -1918,9 +1961,11 @@ def softmax_cross_entropy_with_logits(_sentinel=None, # pylint: disable=invalid @tf_export("nn.sparse_softmax_cross_entropy_with_logits") -def sparse_softmax_cross_entropy_with_logits(_sentinel=None, # pylint: disable=invalid-name - labels=None, logits=None, - name=None): +def sparse_softmax_cross_entropy_with_logits( + _sentinel=None, # pylint: disable=invalid-name + labels=None, + logits=None, + name=None): """Computes sparse softmax cross entropy between `logits` and `labels`. Measures the probability error in discrete classification tasks in which the @@ -1976,15 +2021,15 @@ def sparse_softmax_cross_entropy_with_logits(_sentinel=None, # pylint: disable= [labels, logits]): labels = ops.convert_to_tensor(labels) logits = ops.convert_to_tensor(logits) - precise_logits = math_ops.cast(logits, dtypes.float32) if ( - dtypes.as_dtype(logits.dtype) == dtypes.float16) else logits + precise_logits = math_ops.cast(logits, dtypes.float32) if (dtypes.as_dtype( + logits.dtype) == dtypes.float16) else logits # Store label shape for result later. labels_static_shape = labels.get_shape() labels_shape = array_ops.shape(labels) if logits.get_shape().ndims is not None and logits.get_shape().ndims == 0: - raise ValueError("Logits cannot be scalars - received shape %s." % - logits.get_shape()) + raise ValueError( + "Logits cannot be scalars - received shape %s." 
% logits.get_shape()) if logits.get_shape().ndims is not None and ( labels_static_shape.ndims is not None and labels_static_shape.ndims != logits.get_shape().ndims - 1): @@ -2041,12 +2086,13 @@ def avg_pool(value, ksize, strides, padding, data_format="NHWC", name=None): """ with ops.name_scope(name, "AvgPool", [value]) as name: value = ops.convert_to_tensor(value, name="input") - return gen_nn_ops._avg_pool(value, - ksize=ksize, - strides=strides, - padding=padding, - data_format=data_format, - name=name) + return gen_nn_ops._avg_pool( + value, + ksize=ksize, + strides=strides, + padding=padding, + data_format=data_format, + name=name) @tf_export("nn.max_pool") @@ -2083,8 +2129,8 @@ def _calc_conv_flops(graph, node): """Calculates the compute resources needed for Conv2D.""" input_shape = graph_util.tensor_shape_from_node_def_name(graph, node.input[0]) input_shape.assert_is_fully_defined() - filter_shape = graph_util.tensor_shape_from_node_def_name(graph, - node.input[1]) + filter_shape = graph_util.tensor_shape_from_node_def_name( + graph, node.input[1]) filter_shape.assert_is_fully_defined() output_shape = graph_util.tensor_shape_from_node_def_name(graph, node.name) output_shape.assert_is_fully_defined() @@ -2092,8 +2138,9 @@ def _calc_conv_flops(graph, node): filter_width = int(filter_shape[1]) filter_in_depth = int(filter_shape[2]) output_count = np.prod(output_shape.as_list()) - return ops.OpStats("flops", (output_count * filter_in_depth * filter_height * - filter_width * 2)) + return ops.OpStats( + "flops", + (output_count * filter_in_depth * filter_height * filter_width * 2)) @ops.RegisterStatistics("DepthwiseConv2dNative", "flops") @@ -2101,8 +2148,8 @@ def _calc_depthwise_conv_flops(graph, node): """Calculates the compute resources needed for DepthwiseConv2dNative.""" input_shape = graph_util.tensor_shape_from_node_def_name(graph, node.input[0]) input_shape.assert_is_fully_defined() - filter_shape = graph_util.tensor_shape_from_node_def_name(graph, - node.input[1]) + filter_shape = graph_util.tensor_shape_from_node_def_name( + graph, node.input[1]) filter_shape.assert_is_fully_defined() output_shape = graph_util.tensor_shape_from_node_def_name(graph, node.name) output_shape.assert_is_fully_defined() @@ -2210,9 +2257,8 @@ def dropout(x, keep_prob, noise_shape=None, seed=None, name=None): # pylint: di if isinstance(keep_prob, numbers.Real) and not 0 < keep_prob <= 1: raise ValueError("keep_prob must be a scalar tensor or a float in the " "range (0, 1], got %g" % keep_prob) - keep_prob = ops.convert_to_tensor(keep_prob, - dtype=x.dtype, - name="keep_prob") + keep_prob = ops.convert_to_tensor( + keep_prob, dtype=x.dtype, name="keep_prob") keep_prob.get_shape().assert_is_compatible_with(tensor_shape.scalar()) # Do nothing if we know keep_prob == 1 @@ -2222,9 +2268,8 @@ def dropout(x, keep_prob, noise_shape=None, seed=None, name=None): # pylint: di noise_shape = noise_shape if noise_shape is not None else array_ops.shape(x) # uniform [keep_prob, 1.0 + keep_prob) random_tensor = keep_prob - random_tensor += random_ops.random_uniform(noise_shape, - seed=seed, - dtype=x.dtype) + random_tensor += random_ops.random_uniform( + noise_shape, seed=seed, dtype=x.dtype) # 0. if [keep_prob, 1.0) and 1. 
if [1.0, 1.0 + keep_prob) binary_tensor = math_ops.floor(random_tensor) ret = math_ops.div(x, keep_prob) * binary_tensor @@ -2293,13 +2338,21 @@ def nth_element(input, n, reverse=False, name=None): @tf_export("nn.conv1d") @deprecation.deprecated_arg_values( - None, "`NCHW` for data_format is deprecated, use `NCW` instead", - warn_once=True, data_format="NCHW") + None, + "`NCHW` for data_format is deprecated, use `NCW` instead", + warn_once=True, + data_format="NCHW") @deprecation.deprecated_arg_values( - None, "`NHWC` for data_format is deprecated, use `NWC` instead", - warn_once=True, data_format="NHWC") -def conv1d(value, filters, stride, padding, - use_cudnn_on_gpu=None, data_format=None, + None, + "`NHWC` for data_format is deprecated, use `NWC` instead", + warn_once=True, + data_format="NHWC") +def conv1d(value, + filters, + stride, + padding, + use_cudnn_on_gpu=None, + data_format=None, name=None): r"""Computes a 1-D convolution given 3-D input and filter tensors. @@ -2358,9 +2411,13 @@ def conv1d(value, filters, stride, padding, raise ValueError("data_format must be \"NWC\" or \"NCW\".") value = array_ops.expand_dims(value, spatial_start_dim) filters = array_ops.expand_dims(filters, 0) - result = gen_nn_ops.conv2d(value, filters, strides, padding, - use_cudnn_on_gpu=use_cudnn_on_gpu, - data_format=data_format) + result = gen_nn_ops.conv2d( + value, + filters, + strides, + padding, + use_cudnn_on_gpu=use_cudnn_on_gpu, + data_format=data_format) return array_ops.squeeze(result, [spatial_start_dim]) @@ -2466,8 +2523,8 @@ def _calc_dilation2d_flops(graph, node): """Calculates the compute resources needed for Dilation2D.""" input_shape = graph_util.tensor_shape_from_node_def_name(graph, node.input[0]) input_shape.assert_is_fully_defined() - filter_shape = graph_util.tensor_shape_from_node_def_name(graph, - node.input[1]) + filter_shape = graph_util.tensor_shape_from_node_def_name( + graph, node.input[1]) filter_shape.assert_is_fully_defined() output_shape = graph_util.tensor_shape_from_node_def_name(graph, node.name) output_shape.assert_is_fully_defined() @@ -2527,12 +2584,13 @@ def erosion2d(value, kernel, strides, rates, padding, name=None): with ops.name_scope(name, "erosion2d", [value, kernel]) as name: # Reduce erosion to dilation by duality. return math_ops.negative( - gen_nn_ops.dilation2d(input=math_ops.negative(value), - filter=array_ops.reverse_v2(kernel, [0, 1]), - strides=strides, - rates=rates, - padding=padding, - name=name)) + gen_nn_ops.dilation2d( + input=math_ops.negative(value), + filter=array_ops.reverse_v2(kernel, [0, 1]), + strides=strides, + rates=rates, + padding=padding, + name=name)) @tf_export("nn.in_top_k") @@ -2565,5 +2623,5 @@ def in_top_k(predictions, targets, k, name=None): Returns: A `Tensor` of type `bool`. Computed Precision at `k` as a `bool Tensor`. 
""" - with ops.name_scope(name, 'in_top_k'): + with ops.name_scope(name, "in_top_k"): return gen_nn_ops._in_top_kv2(predictions, targets, k, name=name) diff --git a/tensorflow/python/ops/quantized_conv_ops_test.py b/tensorflow/python/ops/quantized_conv_ops_test.py index 5e9e7100270..4ac2a8f634b 100644 --- a/tensorflow/python/ops/quantized_conv_ops_test.py +++ b/tensorflow/python/ops/quantized_conv_ops_test.py @@ -93,7 +93,8 @@ class Conv2DTest(test.TestCase): quantized_range = ((quantized_max - quantized_min) * range_adjust) range_scale = (quantized_range / number_of_steps) lowest_quantized = -(1 << (number_of_bits - 1)) - result = np.array([(quantized_min + ((float(x) - lowest_quantized) * range_scale)) + result = np.array([(quantized_min + + ((float(x) - lowest_quantized) * range_scale)) for x in quantized.flatten()]) return result diff --git a/tensorflow/python/ops/quantized_ops_test.py b/tensorflow/python/ops/quantized_ops_test.py index 4bf3b35e138..d590bc4be6d 100644 --- a/tensorflow/python/ops/quantized_ops_test.py +++ b/tensorflow/python/ops/quantized_ops_test.py @@ -34,7 +34,10 @@ class QuantizedOpsTest(test.TestCase): def testQuantizeOp(self): expected_output = [1, 1, 2, 127, 255, 255] with self.test_session(use_gpu=False) as sess: - x = constant_op.constant([1.0, 1.25, 1.75, 127.0, 255.0, 500.0], shape=[6], dtype=dtypes.float32) + x = constant_op.constant( + [1.0, 1.25, 1.75, 127.0, 255.0, 500.0], + shape=[6], + dtype=dtypes.float32) x_min = 0.0 x_max = 255.0 op = array_ops.quantize(x, x_min, x_max, dtypes.quint8, mode="MIN_FIRST") diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py index bdf41cd75d6..cc9f7981e41 100644 --- a/tensorflow/python/ops/resource_variable_ops.py +++ b/tensorflow/python/ops/resource_variable_ops.py @@ -348,9 +348,9 @@ class ResourceVariable(variables.Variable): if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections: collections = list(collections) + [ops.GraphKeys.TRAINABLE_VARIABLES] self._save_slice_info = None - # Save the graph's container prefix for error checking. Reading the value of - # the ResourceVariable from another Graph in Eager mode is an error. - self._container_prefix = ops.get_default_graph()._container_prefix # pylint: disable=protected-access + # Store the graph key so optimizers know how to only retrieve variables from + # this graph. + self._graph_key = ops.get_default_graph()._graph_key # pylint: disable=protected-access with ops.init_scope(): self._in_graph_mode = context.in_graph_mode() with ops.name_scope(name, "Variable", [] @@ -662,15 +662,7 @@ class ResourceVariable(variables.Variable): Returns: the read operation. - Raises: - ValueError: if the ResourceVariable was created in another isolation - environment or graph. """ - if (not self._in_graph_mode and - self._container_prefix != ops.get_default_graph()._container_prefix): # pylint: disable=protected-access - raise ValueError( - "Attempted to read a variable from another isolation environment" - " or Graph") with ops.name_scope("Read"): # Ensure we read the variable in the same device as the handle. 
with ops.device(self._handle_device): diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py index a10e1963d1f..da80e72071c 100644 --- a/tensorflow/python/ops/rnn.py +++ b/tensorflow/python/ops/rnn.py @@ -171,11 +171,11 @@ def _rnn_step( return (final_output, final_state) Args: - time: Python int, the current time step - sequence_length: int32 `Tensor` vector of size [batch_size] - min_sequence_length: int32 `Tensor` scalar, min of sequence_length - max_sequence_length: int32 `Tensor` scalar, max of sequence_length - zero_output: `Tensor` vector of shape [output_size] + time: int32 `Tensor` scalar. + sequence_length: int32 `Tensor` vector of size [batch_size]. + min_sequence_length: int32 `Tensor` scalar, min of sequence_length. + max_sequence_length: int32 `Tensor` scalar, max of sequence_length. + zero_output: `Tensor` vector of shape [output_size]. state: Either a single `Tensor` matrix of shape `[batch_size, state_size]`, or a list/tuple of such tensors. call_cell: lambda returning tuple of (new_output, new_state) where @@ -202,6 +202,9 @@ def _rnn_step( flat_state = nest.flatten(state) flat_zero_output = nest.flatten(zero_output) + # Vector describing which batch entries are finished. + copy_cond = time >= sequence_length + def _copy_one_through(output, new_output): # TensorArray and scalar get passed through. if isinstance(output, tensor_array_ops.TensorArray): @@ -209,7 +212,6 @@ def _rnn_step( if output.shape.ndims == 0: return new_output # Otherwise propagate the old or the new value. - copy_cond = (time >= sequence_length) with ops.colocate_with(new_output): return array_ops.where(copy_cond, output, new_output) @@ -1125,6 +1127,12 @@ def raw_rnn(cell, loop_fn, def _copy_some_through(current, candidate): """Copy some tensors through via array_ops.where.""" def copy_fn(cur_i, cand_i): + # TensorArray and scalar get passed through. + if isinstance(cur_i, tensor_array_ops.TensorArray): + return cand_i + if cur_i.shape.ndims == 0: + return cand_i + # Otherwise propagate the old or the new value. with ops.colocate_with(cand_i): return array_ops.where(elements_finished, cur_i, cand_i) return nest.map_structure(copy_fn, current, candidate) diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py index 4b5072fd679..1b9071ee93c 100644 --- a/tensorflow/python/ops/script_ops.py +++ b/tensorflow/python/ops/script_ops.py @@ -50,19 +50,21 @@ class EagerFunc(object): self._func = func self._out_dtypes = Tout - def __call__(self, *args, **kwargs): - """Passes args, kwargs to `self._func`, which is executed eagerly.""" + def __call__(self, on_gpu, args): + """Passes `args` to `self._func`, which is executed eagerly.""" with context.eager_mode(): - ret = self._func(*args, **kwargs) + ret = self._func(*args) + maybe_copy_to_gpu = lambda x: x if not on_gpu else x.gpu() if isinstance(ret, (tuple, list)): return [ - ops.convert_to_tensor(x, dtype=dtype) + maybe_copy_to_gpu(ops.convert_to_tensor(x, dtype=dtype)) for (x, dtype) in zip(ret, self._out_dtypes) ] elif ret is None: return ret else: - return ops.convert_to_tensor(ret, dtype=self._out_dtypes[0]) + return maybe_copy_to_gpu( + ops.convert_to_tensor(ret, dtype=self._out_dtypes[0])) class FuncRegistry(object): @@ -116,16 +118,29 @@ class FuncRegistry(object): else: return result - def __call__(self, token, args): - """Calls the registered function for `token` with args.""" + def __call__(self, token, on_gpu, args): + """Calls the registered function for `token` with args. 
+ + Args: + token: A key into this `FuncRegistry` identifying which function to call. + on_gpu: A boolean indicating whether or not `token`'s corresponding + operation was placed on GPU; only used if the function registered for + `token` is an `EagerPyFunc`. + args: The arguments to pass to the function registered for `token`. + + Returns: + The output of the function registered for `token`. + + Raises: + ValueError: if no function is registered for `token`. + """ func = self._funcs[token] if func is None: raise ValueError("callback %s is not found" % token) - ret = func(*args) - if isinstance(func, EagerFunc): - return ret + return func(on_gpu, args) else: + ret = func(*args) # Strings seem to lead to a memory leak here if they're not wrapped in a # list. if isinstance(ret, six.binary_type): @@ -302,8 +317,5 @@ def py_func(func, inp, Tout, stateful=True, name=None): func=func, inp=inp, Tout=Tout, stateful=stateful, eager=False, name=name) -# TODO(akshayka): PyFuncs where the 'eager' attribute is set to True should be -# differentiable, i.e., the gradient of PyFunc should propagate Nones if the -# eager attribute is not set, and otherwise, it should return the gradient. ops.NotDifferentiable("PyFunc") ops.NotDifferentiable("PyFuncStateless") diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py index 3224856d7be..0fbbf5a805f 100644 --- a/tensorflow/python/ops/sparse_ops.py +++ b/tensorflow/python/ops/sparse_ops.py @@ -227,13 +227,14 @@ def sparse_concat(axis, [array_ops.reshape(shape, [1, -1]) for shape in shapes], 0), 0) shapes = [ array_ops.concat([ - max_shape[:axis], shape[-1:] if axis == -1 else - shape[axis:axis + 1], [] if axis == -1 else max_shape[axis + 1:] + max_shape[:axis], shape[-1:] + if axis == -1 else shape[axis:axis + 1], [] + if axis == -1 else max_shape[axis + 1:] ], 0) for shape in shapes ] - output_ind, output_val, output_shape = (gen_sparse_ops._sparse_concat( - inds, vals, shapes, axis, name=name)) + output_ind, output_val, output_shape = ( + gen_sparse_ops._sparse_concat(inds, vals, shapes, axis, name=name)) return sparse_tensor.SparseTensor(output_ind, output_val, output_shape) @@ -300,15 +301,14 @@ def sparse_add(a, b, thresh=0): b = _convert_to_sparse_tensor(b) thresh = ops.convert_to_tensor( thresh, dtype=a.values.dtype.real_dtype.base_dtype, name="thresh") - output_ind, output_val, output_shape = (gen_sparse_ops._sparse_add( - a.indices, a.values, a.dense_shape, - b.indices, b.values, b.dense_shape, - thresh)) + output_ind, output_val, output_shape = ( + gen_sparse_ops._sparse_add(a.indices, a.values, a.dense_shape, + b.indices, b.values, b.dense_shape, thresh)) # Attempt to get output_shape statically. a.get_shape().assert_is_compatible_with(b.get_shape()) - static_shape = array_ops.broadcast_static_shape( - a.get_shape(), b.get_shape()) + static_shape = array_ops.broadcast_static_shape(a.get_shape(), + b.get_shape()) if static_shape.is_fully_defined(): output_shape = static_shape.as_list() @@ -317,8 +317,8 @@ def sparse_add(a, b, thresh=0): # swap to make `a` the SparseTensor. 
if isinstance(b, sparse_classes): a, b = b, a - return gen_sparse_ops._sparse_tensor_dense_add( - a.indices, a.values, a.dense_shape, b) + return gen_sparse_ops._sparse_tensor_dense_add(a.indices, a.values, + a.dense_shape, b) def _sparse_cross(inputs, name=None): @@ -397,19 +397,25 @@ def _sparse_cross_hashed(inputs, num_buckets=0, hash_key=None, name=None): _DEFAULT_HASH_KEY = 0xDECAFCAFFE -def _sparse_cross_internal( - inputs, hashed_output=False, num_buckets=0, hash_key=None, name=None): +def _sparse_cross_internal(inputs, + hashed_output=False, + num_buckets=0, + hash_key=None, + name=None): """See gen_sparse_ops._sparse_cross.""" if not isinstance(inputs, list): raise TypeError("Inputs must be a list") - if not all(isinstance(i, sparse_tensor.SparseTensor) or - isinstance(i, ops.Tensor) for i in inputs): + if not all( + isinstance(i, sparse_tensor.SparseTensor) or isinstance(i, ops.Tensor) + for i in inputs): raise TypeError("All inputs must be SparseTensors") - sparse_inputs = [i for i in inputs - if isinstance(i, sparse_tensor.SparseTensor)] - dense_inputs = [i for i in inputs - if not isinstance(i, sparse_tensor.SparseTensor)] + sparse_inputs = [ + i for i in inputs if isinstance(i, sparse_tensor.SparseTensor) + ] + dense_inputs = [ + i for i in inputs if not isinstance(i, sparse_tensor.SparseTensor) + ] indices = [sp_input.indices for sp_input in sparse_inputs] values = [sp_input.values for sp_input in sparse_inputs] @@ -504,8 +510,9 @@ def sparse_reorder(sp_input, name=None): """ sp_input = _convert_to_sparse_tensor(sp_input) - reordered_ind, reordered_val = (gen_sparse_ops._sparse_reorder( - sp_input.indices, sp_input.values, sp_input.dense_shape, name=name)) + reordered_ind, reordered_val = ( + gen_sparse_ops._sparse_reorder( + sp_input.indices, sp_input.values, sp_input.dense_shape, name=name)) if sp_input.get_shape().is_fully_defined(): dense_shape = sp_input.get_shape().as_list() @@ -572,8 +579,8 @@ def sparse_reshape(sp_input, shape, name=None): sp_input.indices, sp_input.dense_shape, shape, name=name) reshaped_shape_const = tensor_util.constant_value(shape) - if (reshaped_shape_const is not None - and sp_input.get_shape().is_fully_defined()): + if (reshaped_shape_const is not None and + sp_input.get_shape().is_fully_defined()): num_implied = sum((dim == -1) for dim in reshaped_shape_const) if num_implied > 1: raise ValueError("At most one dimension can be inferred (-1). Found: %s" @@ -589,15 +596,15 @@ def sparse_reshape(sp_input, shape, name=None): in_shape_size // np.prod(non_implied_idx)) reshaped_size = np.prod(reshaped_shape_const) if reshaped_size != in_shape_size: - raise ValueError( - "Cannot reshape a tensor with %d elements to shape %s " - "(%d elements)." - % (in_shape_size, original_reshaped_shape, reshaped_size)) + raise ValueError("Cannot reshape a tensor with %d elements to shape %s " + "(%d elements)." 
% + (in_shape_size, original_reshaped_shape, + reshaped_size)) reshaped_shape = reshaped_shape_const - return sparse_tensor.SparseTensor( - reshaped_ind, array_ops.identity(sp_input.values), - reshaped_shape) + return sparse_tensor.SparseTensor(reshaped_ind, + array_ops.identity(sp_input.values), + reshaped_shape) # TODO(aselle): Remove keyword required once for 1.0 final @@ -610,8 +617,11 @@ class KeywordRequired(object): @tf_export("sparse_split") def sparse_split(keyword_required=KeywordRequired(), - sp_input=None, num_split=None, axis=None, - name=None, split_dim=None): + sp_input=None, + num_split=None, + axis=None, + name=None, + split_dim=None): """Split a `SparseTensor` into `num_split` tensors along `axis`. If the `sp_input.dense_shape[axis]` is not an integer multiple of `num_split` @@ -660,18 +670,19 @@ def sparse_split(keyword_required=KeywordRequired(), split_dim) sp_input = _convert_to_sparse_tensor(sp_input) - output_inds, output_vals, output_shapes = (gen_sparse_ops._sparse_split( - axis, - sp_input.indices, - sp_input.values, - sp_input.dense_shape, - num_split, - name=name)) + output_inds, output_vals, output_shapes = ( + gen_sparse_ops._sparse_split( + axis, + sp_input.indices, + sp_input.values, + sp_input.dense_shape, + num_split, + name=name)) sparse_tensors = [] for i in range(0, num_split): sparse_tensors.append( - sparse_tensor.SparseTensor( - output_inds[i], output_vals[i], output_shapes[i])) + sparse_tensor.SparseTensor(output_inds[i], output_vals[i], + output_shapes[i])) return sparse_tensors @@ -713,12 +724,15 @@ def sparse_slice(sp_input, start, size, name=None): with ops.name_scope(name, "SparseSlice", [sp_input]) as name: output_indices, output_values, output_shape = gen_sparse_ops.sparse_slice( - sp_input.indices, sp_input.values, sp_input.dense_shape, start, size, name=name) + sp_input.indices, + sp_input.values, + sp_input.dense_shape, + start, + size, + name=name) - return sparse_tensor.SparseTensor( - output_indices, - output_values, - output_shape) + return sparse_tensor.SparseTensor(output_indices, output_values, + output_shape) @tf_export("sparse_to_dense") @@ -819,14 +833,14 @@ def sparse_reduce_max(sp_input, axis=None, keep_dims=False, The reduced Tensor. """ return gen_sparse_ops.sparse_reduce_max( - sp_input.indices, sp_input.values, - sp_input.dense_shape, - math_ops._ReductionDims(sp_input, axis, reduction_axes), - keep_dims) + sp_input.indices, sp_input.values, sp_input.dense_shape, + math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims) @tf_export("sparse_reduce_max_sparse") -def sparse_reduce_max_sparse(sp_input, axis=None, keep_dims=False, +def sparse_reduce_max_sparse(sp_input, + axis=None, + keep_dims=False, reduction_axes=None): """Computes the max of elements across dimensions of a SparseTensor. @@ -855,10 +869,8 @@ def sparse_reduce_max_sparse(sp_input, axis=None, keep_dims=False, """ output_ind, output_val, output_shape = ( gen_sparse_ops.sparse_reduce_max_sparse( - sp_input.indices, sp_input.values, - sp_input.dense_shape, math_ops._ReductionDims(sp_input, axis, - reduction_axes), - keep_dims)) + sp_input.indices, sp_input.values, sp_input.dense_shape, + math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims)) return sparse_tensor.SparseTensor(output_ind, output_val, output_shape) @@ -905,14 +917,14 @@ def sparse_reduce_sum(sp_input, axis=None, keep_dims=False, The reduced Tensor. 
""" return gen_sparse_ops.sparse_reduce_sum( - sp_input.indices, sp_input.values, - sp_input.dense_shape, - math_ops._ReductionDims(sp_input, axis, reduction_axes), - keep_dims) + sp_input.indices, sp_input.values, sp_input.dense_shape, + math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims) @tf_export("sparse_reduce_sum_sparse") -def sparse_reduce_sum_sparse(sp_input, axis=None, keep_dims=False, +def sparse_reduce_sum_sparse(sp_input, + axis=None, + keep_dims=False, reduction_axes=None): """Computes the sum of elements across dimensions of a SparseTensor. @@ -941,10 +953,8 @@ def sparse_reduce_sum_sparse(sp_input, axis=None, keep_dims=False, """ output_ind, output_val, output_shape = ( gen_sparse_ops.sparse_reduce_sum_sparse( - sp_input.indices, sp_input.values, - sp_input.dense_shape, math_ops._ReductionDims(sp_input, axis, - reduction_axes), - keep_dims)) + sp_input.indices, sp_input.values, sp_input.dense_shape, + math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims)) return sparse_tensor.SparseTensor(output_ind, output_val, output_shape) @@ -1053,8 +1063,8 @@ def sparse_to_indicator(sp_input, vocab_size, name=None): with ops.name_scope(name, "SparseToIndicator", [sp_input]) as name: num_entries = array_ops.shape(sp_input.indices)[0] new_values = array_ops.fill(array_ops.expand_dims(num_entries, 0), True) - sp_values = sparse_tensor.SparseTensor( - sp_input.indices, new_values, sp_input.dense_shape) + sp_values = sparse_tensor.SparseTensor(sp_input.indices, new_values, + sp_input.dense_shape) sp_new = sparse_merge(sp_input, sp_values, vocab_size, name) @@ -1174,8 +1184,7 @@ def sparse_merge(sp_ids, sp_values, vocab_size, name=None, raise TypeError("vocab_size has to be a list of Tensors or Python ints. " "Found %s" % type(vocab_size)) for dim in vocab_size: - if not (isinstance(dim, ops.Tensor) or - isinstance(dim, numbers.Integral)): + if not (isinstance(dim, ops.Tensor) or isinstance(dim, numbers.Integral)): raise TypeError( "vocab_size has to be a list of Tensors or Python ints. Found %s" % type(dim)) @@ -1326,24 +1335,23 @@ def sparse_reset_shape(sp_input, new_shape=None): # error before the sparse_tensor.SparseTensor catches it. output_shape_tensor.get_shape()[0].merge_with(in_shape.get_shape()[0]) - output_shape_tensor_const = tensor_util.constant_value( - output_shape_tensor) + output_shape_tensor_const = tensor_util.constant_value(output_shape_tensor) # For cases where all shapes are known during graph construction - if (output_shape_tensor_const is not None - and sp_input.get_shape().is_fully_defined()): + if (output_shape_tensor_const is not None and + sp_input.get_shape().is_fully_defined()): in_shape_const = np.array(sp_input.get_shape().as_list()) if not np.all(in_shape_const <= output_shape_tensor_const): raise ValueError( "Requested new_shape should have dimension sizes >= sp_input.shape." - " Found new_shape (%s), sp_input.shape (%s)." - % (in_shape_const, output_shape_tensor_const)) + " Found new_shape (%s), sp_input.shape (%s)." % + (in_shape_const, output_shape_tensor_const)) output_shape_tensor = output_shape_tensor_const else: # For cases where shape is not known during graph construction. 
- output_shape_tensor = control_flow_ops.with_dependencies( - [check_ops.assert_equal( - array_ops.shape(in_shape), array_ops.shape(output_shape_tensor))], - output_shape_tensor) + output_shape_tensor = control_flow_ops.with_dependencies([ + check_ops.assert_equal( + array_ops.shape(in_shape), array_ops.shape(output_shape_tensor)) + ], output_shape_tensor) output_shape_tensor = control_flow_ops.with_dependencies( [check_ops.assert_less_equal(in_shape, output_shape_tensor)], output_shape_tensor) @@ -1409,10 +1417,10 @@ def sparse_fill_empty_rows(sp_input, default_value, name=None): values=sp_input.values, dense_shape=sp_input.dense_shape, default_value=default_value) - return (sparse_tensor.SparseTensor(indices=output_indices, - values=output_values, - dense_shape=sp_input.dense_shape), - empty_row_indicator) + return (sparse_tensor.SparseTensor( + indices=output_indices, + values=output_values, + dense_shape=sp_input.dense_shape), empty_row_indicator) @tf_export("serialize_sparse") @@ -1880,8 +1888,8 @@ def sparse_softmax(sp_input, name=None): [sp_input.indices, sp_input.values]) as name: out_vals = gen_sparse_ops.sparse_softmax(sp_input.indices, sp_input.values, sp_input.dense_shape) - return sparse_tensor.SparseTensor( - sp_input.indices, out_vals, sp_input.dense_shape) + return sparse_tensor.SparseTensor(sp_input.indices, out_vals, + sp_input.dense_shape) @tf_export("sparse_maximum") @@ -1907,9 +1915,9 @@ def sparse_maximum(sp_a, sp_b, name=None): Returns: output: the output SparseTensor. """ - with ops.name_scope(name, "SparseSparseMaximum", [sp_a.indices, sp_a.values, - sp_b.indices, - sp_b.values]) as name: + with ops.name_scope( + name, "SparseSparseMaximum", + [sp_a.indices, sp_a.values, sp_b.indices, sp_b.values]) as name: out_indices, out_values = gen_sparse_ops.sparse_sparse_maximum( sp_a.indices, sp_a.values, @@ -1944,9 +1952,9 @@ def sparse_minimum(sp_a, sp_b, name=None): Returns: output: the output SparseTensor. """ - with ops.name_scope(name, "SparseSparseMinimum", [sp_a.indices, sp_a.values, - sp_b.indices, - sp_b.values]) as name: + with ops.name_scope( + name, "SparseSparseMinimum", + [sp_a.indices, sp_a.values, sp_b.indices, sp_b.values]) as name: out_indices, out_values = gen_sparse_ops.sparse_sparse_minimum( sp_a.indices, sp_a.values, @@ -2010,14 +2018,15 @@ def sparse_transpose(sp_input, perm=None, name=None): dense_shape = sp_input.dense_shape transposed_dense_shape = array_ops.gather(dense_shape, perm) transposed_st = sparse_tensor.SparseTensor( - transposed_indices, sp_input.values, - transposed_dense_shape) + transposed_indices, sp_input.values, transposed_dense_shape) transposed_st = sparse_reorder(transposed_st) return transposed_st -def _add_sparse_to_tensors_map(sp_input, container=None, - shared_name=None, name=None): +def _add_sparse_to_tensors_map(sp_input, + container=None, + shared_name=None, + name=None): """Add a `SparseTensor` to a `SparseTensorsMap` and return its handle. 
Args: @@ -2038,12 +2047,18 @@ def _add_sparse_to_tensors_map(sp_input, container=None, sp_input = _convert_to_sparse_tensor(sp_input) return gen_sparse_ops._add_sparse_to_tensors_map( - sp_input.indices, sp_input.values, sp_input.dense_shape, - container=container, shared_name=shared_name, name=name) + sp_input.indices, + sp_input.values, + sp_input.dense_shape, + container=container, + shared_name=shared_name, + name=name) -def _add_many_sparse_to_tensors_map(sp_input, container=None, - shared_name=None, name=None): +def _add_many_sparse_to_tensors_map(sp_input, + container=None, + shared_name=None, + name=None): """Add a minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles. The `SparseTensor` must have rank `R` greater than 1, and the first dimension @@ -2072,12 +2087,18 @@ def _add_many_sparse_to_tensors_map(sp_input, container=None, sp_input = _convert_to_sparse_tensor(sp_input) return gen_sparse_ops._add_many_sparse_to_tensors_map( - sp_input.indices, sp_input.values, sp_input.dense_shape, - container=container, shared_name=shared_name, name=name) + sp_input.indices, + sp_input.values, + sp_input.dense_shape, + container=container, + shared_name=shared_name, + name=name) -def _take_many_sparse_from_tensors_map( - sparse_map_op, sparse_handles, rank=None, name=None): +def _take_many_sparse_from_tensors_map(sparse_map_op, + sparse_handles, + rank=None, + name=None): """Read `SparseTensors` from a `SparseTensorsMap` and concatenate them. The input `sparse_handles` must be a string matrix of shape `[N, 1]` where @@ -2140,16 +2161,18 @@ def _take_many_sparse_from_tensors_map( raise TypeError("sparse_map_op be an Operation") if sparse_map_op.type not in ("AddSparseToTensorsMap", "AddManySparseToTensorsMap"): - raise TypeError("sparse_map_op must be one of AddSparseToTensorsMap or " - "AddSparseToTensorsMap. Instead, found `%s`." % - sparse_map_op.type) + raise TypeError( + "sparse_map_op must be one of AddSparseToTensorsMap or " + "AddSparseToTensorsMap. Instead, found `%s`." 
% sparse_map_op.type) with ops.colocate_with(sparse_map_op): shared_name = sparse_map_op.get_attr("shared_name") or sparse_map_op.name output_indices, output_values, output_shape = ( gen_sparse_ops._take_many_sparse_from_tensors_map( - sparse_handles, dtype=sparse_map_op.get_attr("T"), + sparse_handles, + dtype=sparse_map_op.get_attr("T"), container=sparse_map_op.get_attr("container"), - shared_name=shared_name, name=name)) + shared_name=shared_name, + name=name)) # Feed rank data back in, if available output_indices.set_shape([None, rank]) diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py index 15127862a4e..6d7eaababcd 100644 --- a/tensorflow/python/ops/special_math_ops.py +++ b/tensorflow/python/ops/special_math_ops.py @@ -192,8 +192,8 @@ def einsum(equation, *inputs, **kwargs): input_count = sum(1 for s in input_axis_labels if a in s) if input_count > 2 and a not in output_axis_labels: logging.warn( - 'Falling back to exponential-space implementation of einsum() because' - ' index "%s" is summed over more than two inputs.', a) + 'Falling back to exponential-space implementation of einsum()' + ' because index "%s" is summed over more than two inputs.', a) return _exponential_space_einsum(equation, *inputs) temp = inputs[0] diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py index 30bf4e4ef1b..737b923415f 100644 --- a/tensorflow/python/ops/standard_ops.py +++ b/tensorflow/python/ops/standard_ops.py @@ -26,6 +26,7 @@ import sys as _sys from tensorflow.python.ops import array_grad from tensorflow.python.ops import data_flow_grad from tensorflow.python.ops import math_grad +from tensorflow.python.ops import manip_grad from tensorflow.python.ops import sparse_grad from tensorflow.python.ops import spectral_grad from tensorflow.python.ops import state_grad @@ -59,6 +60,7 @@ from tensorflow.python.ops.logging_ops import Print from tensorflow.python.ops.logging_ops import get_summary_op from tensorflow.python.ops.lookup_ops import initialize_all_tables from tensorflow.python.ops.lookup_ops import tables_initializer +from tensorflow.python.ops.manip_ops import * from tensorflow.python.ops.math_ops import * from tensorflow.python.ops.numerics import * from tensorflow.python.ops.parsing_ops import * @@ -105,6 +107,7 @@ from tensorflow.python.ops import init_ops as _init_ops from tensorflow.python.ops import io_ops as _io_ops from tensorflow.python.ops import linalg_ops as _linalg_ops from tensorflow.python.ops import logging_ops as _logging_ops +from tensorflow.python.ops import manip_ops as _manip_ops from tensorflow.python.ops import math_ops as _math_ops from tensorflow.python.ops import numerics as _numerics from tensorflow.python.ops import parsing_ops as _parsing_ops @@ -280,6 +283,7 @@ remove_undocumented(__name__, _allowed_symbols, _io_ops, _linalg_ops, _logging_ops, + _manip_ops, _math_ops, _numerics, _parsing_ops, diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py index 3cc76fdbf34..f00213eb88d 100644 --- a/tensorflow/python/ops/state_ops.py +++ b/tensorflow/python/ops/state_ops.py @@ -278,7 +278,7 @@ def assign(ref, value, validate_shape=None, use_locking=None, name=None): return gen_state_ops.assign( ref, value, use_locking=use_locking, name=name, validate_shape=validate_shape) - return ref.assign(value) + return ref.assign(value, name=name) @tf_export("count_up_to") diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py index 
84449e00beb..806fdd3da7a 100644 --- a/tensorflow/python/ops/template.py +++ b/tensorflow/python/ops/template.py @@ -140,7 +140,7 @@ def make_template(name_, func_, create_scope_now_=False, unique_name_=None, re-enter the scope and reuse those variables. Raises: - ValueError: if the name is None. + ValueError: if `name_` is None. """ return make_template_internal( name_, @@ -176,16 +176,14 @@ def make_template_internal(name_, custom_getter_: Optional custom getter for variables used in `func_`. See the @{tf.get_variable} `custom_getter` documentation for more information. - create_graph_function_: When True, the first invocation of the template will - execute `func_` as is, to allow for variable creation; however, the second - invocation and every invocation thereafter will execute func as a graph - function. In particular, this implies that `func_` must satisfy the - properties that `function.defun` requires of functions: See the - documentation of `function.defun` for details. When executing eagerly, - setting this flag to True can improve performance. Regardless of whether - eager execution is enabled, enabling this flag gives the caller access to - graph-function semantics, i.e., accesses to variables are totally ordered - and side-effecting ops are not pruned. + create_graph_function_: When True, `func_` will be executed as a graph + function. This implies that `func_` must satisfy the properties that + `function.defun` requires of functions: See the documentation of + `function.defun` for details. When executing eagerly, setting this flag to + True can improve performance. Regardless of whether eager execution is + enabled, enabling this flag gives the caller access to graph-function + semantics, i.e., accesses to variables are totally ordered and + side-effecting ops are not pruned. **kwargs: Keyword arguments to apply to `func_`. Returns: @@ -198,8 +196,8 @@ def make_template_internal(name_, re-enter the scope and reuse those variables. Raises: - ValueError: if the name is None. - ValueError: if unique_name_ is not None and eager execution is enabled. + ValueError: if `name_` is None. + ValueError: if `unique_name_` is not None and eager execution is enabled. """ if kwargs: @@ -266,18 +264,18 @@ class Template(object): template of the same scope/unique_name already exists and reuse is false, an error is raised. Defaults to None. custom_getter: optional custom getter to pass to `variable_scope()` - create_graph_function: When True, the first invocation of the template - will execute `func` as is, to allow for variable creation; however, the - second invocation and every invocation thereafter will execute `func` as - a graph function. Enabling this flag gives the caller access to - graph-function semantics, i.e., accesses to variables are totally - ordered and side-effecting ops are not pruned. - + create_graph_function: When True, `func` will be executed as a graph + function. Enabling this flag gives the caller access to graph-function + semantics, i.e., accesses to variables are totally ordered and + side-effecting ops are not pruned. Raises: - ValueError: if the name is None. + ValueError: if `name` is None. 
""" - self._func = func + if create_graph_function: + self._func = function.defun(func) + else: + self._func = func self._stacktrace = traceback.format_stack()[:-2] self._name = name self._unique_name = unique_name @@ -295,19 +293,13 @@ class Template(object): # This variable keeps track of whether the template has been called yet, # which is not the same as whether the scope has been created. self._variables_created = False - self._create_graph_function = create_graph_function def _call_func(self, args, kwargs): try: vars_at_start = len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) trainable_at_start = len( ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)) - result = self._func(*args, **kwargs) - if self._create_graph_function and not self._variables_created: - # Only execute self._func as a graph function once variables are - # created. - self._func = function.defun(self._func) if self._variables_created: # Variables were previously created, implying this is not the first @@ -542,14 +534,11 @@ class EagerTemplate(Template): names of all created Tensors. If set to False, the scope will be created at the first call location. custom_getter: optional custom getter to pass to `variable_scope()` - create_graph_function: When True, the first invocation of the template - will execute `func` as is, to allow for variable creation; however, the - second invocation and every invocation thereafter will execute `func` as - a graph function. Enabling this flag allows the caller to reap the - performance benefits associated with executing graphs, at the cost of - sacrificing debuggability; however, not all functions can be compiled - into graph functions. See the documentation for `function.defun` for - details. + create_graph_function: When True, `func` will be executed as a graph + function. Enabling this flag allows the caller to reap the performance + benefits associated with executing graphs, at the cost of sacrificing + debuggability; however, not all Python functions can be compiled into + graph functions. See the documentation for `function.defun` for details. Raises: RuntimeError: if eager execution is not enabled. @@ -573,12 +562,7 @@ class EagerTemplate(Template): try: vars_at_start = self._template_store.variables() trainable_at_start = self._template_store.trainable_variables() - result = self._func(*args, **kwargs) - if self._create_graph_function and not self._variables_created: - # Only execute self._func as a graph function once variables are - # created. 
- self._func = function.defun(self._func) if self._variables_created: # Variables were previously created, implying this is not the first diff --git a/tensorflow/python/platform/app.py b/tensorflow/python/platform/app.py index 9b92d9a1800..cce64c0ccaf 100644 --- a/tensorflow/python/platform/app.py +++ b/tensorflow/python/platform/app.py @@ -23,6 +23,7 @@ import sys as _sys from tensorflow.python.platform import flags from tensorflow.python.util.all_util import remove_undocumented +from tensorflow.python.util.tf_export import tf_export def _usage(shorthelp): @@ -108,6 +109,7 @@ def _define_help_flags(): _define_help_flags_called = True +@tf_export('app.run') def run(main=None, argv=None): """Runs the program with an optional 'main' function and 'argv' list.""" diff --git a/tensorflow/python/platform/resource_loader.py b/tensorflow/python/platform/resource_loader.py index 2455acb4c0c..8f7b12e2b2b 100644 --- a/tensorflow/python/platform/resource_loader.py +++ b/tensorflow/python/platform/resource_loader.py @@ -29,8 +29,10 @@ import sys as _sys from tensorflow.python.util import tf_inspect as _inspect from tensorflow.python.util.all_util import remove_undocumented +from tensorflow.python.util.tf_export import tf_export +@tf_export('resource_loader.load_resource') def load_resource(path): """Load the resource at given path, where path is relative to tensorflow/. @@ -52,6 +54,7 @@ def load_resource(path): # pylint: disable=protected-access +@tf_export('resource_loader.get_data_files_path') def get_data_files_path(): """Get a direct path to the data files colocated with the script. @@ -62,6 +65,7 @@ def get_data_files_path(): return _os.path.dirname(_inspect.getfile(_sys._getframe(1))) +@tf_export('resource_loader.get_root_dir_with_all_resources') def get_root_dir_with_all_resources(): """Get a root directory containing all the data attributes in the build rule. @@ -101,6 +105,7 @@ def get_root_dir_with_all_resources(): return data_files_dir or script_dir +@tf_export('resource_loader.get_path_to_datafile') def get_path_to_datafile(path): """Get the path to the specified file in the data dependencies. @@ -120,6 +125,7 @@ def get_path_to_datafile(path): return _os.path.join(data_files_path, path) +@tf_export('resource_loader.readahead_file_path') def readahead_file_path(path, readahead='128M'): # pylint: disable=unused-argument """Readahead files not implemented; simply returns given path.""" return path diff --git a/tensorflow/python/platform/stacktrace_handler_test.py b/tensorflow/python/platform/stacktrace_handler_test.py index 3f0e534f4cb..f2071f9d54c 100644 --- a/tensorflow/python/platform/stacktrace_handler_test.py +++ b/tensorflow/python/platform/stacktrace_handler_test.py @@ -57,7 +57,8 @@ class StacktraceHandlerTest(test.TestCase): # Capture its output. capture both stdout and stderr and append them. # We are not worried about timing or order of messages in this test. - child_output = child_process.stdout.read() + child_process.stderr.read() + child_stdout, child_stderr = child_process.communicate() + child_output = child_stdout + child_stderr # Make sure the child process is dead before we proceed. 
child_process.wait() diff --git a/tensorflow/python/platform/tf_logging.py b/tensorflow/python/platform/tf_logging.py index 85ed4f071c7..22aabfd7121 100644 --- a/tensorflow/python/platform/tf_logging.py +++ b/tensorflow/python/platform/tf_logging.py @@ -35,6 +35,7 @@ import threading import six from tensorflow.python.util.all_util import remove_undocumented +from tensorflow.python.util.tf_export import tf_export # Don't use this directly. Use _get_logger() instead. @@ -90,30 +91,37 @@ def _get_logger(): _logger_lock.release() +@tf_export('logging.log') def log(level, msg, *args, **kwargs): _get_logger().log(level, msg, *args, **kwargs) +@tf_export('logging.debug') def debug(msg, *args, **kwargs): _get_logger().debug(msg, *args, **kwargs) +@tf_export('logging.error') def error(msg, *args, **kwargs): _get_logger().error(msg, *args, **kwargs) +@tf_export('logging.fatal') def fatal(msg, *args, **kwargs): _get_logger().fatal(msg, *args, **kwargs) +@tf_export('logging.info') def info(msg, *args, **kwargs): _get_logger().info(msg, *args, **kwargs) +@tf_export('logging.warn') def warn(msg, *args, **kwargs): _get_logger().warn(msg, *args, **kwargs) +@tf_export('logging.warning') def warning(msg, *args, **kwargs): _get_logger().warning(msg, *args, **kwargs) @@ -136,15 +144,18 @@ _log_prefix = None # later set to google2_log_prefix _log_counter_per_token = {} +@tf_export('logging.TaskLevelStatusMessage') def TaskLevelStatusMessage(msg): error(msg) +@tf_export('logging.flush') def flush(): raise NotImplementedError() # Code below is taken from pyglib/logging +@tf_export('logging.vlog') def vlog(level, msg, *args, **kwargs): _get_logger().log(level, msg, *args, **kwargs) @@ -164,6 +175,7 @@ def _GetNextLogCountPerToken(token): return _log_counter_per_token[token] +@tf_export('logging.log_every_n') def log_every_n(level, msg, n, *args): """Log 'msg % args' at level 'level' once per 'n' times. @@ -180,6 +192,7 @@ def log_every_n(level, msg, n, *args): log_if(level, msg, not (count % n), *args) +@tf_export('logging.log_first_n') def log_first_n(level, msg, n, *args): # pylint: disable=g-bad-name """Log 'msg % args' at level 'level' only first 'n' times. 
@@ -195,6 +208,7 @@ def log_first_n(level, msg, n, *args): # pylint: disable=g-bad-name log_if(level, msg, count < n, *args) +@tf_export('logging.log_if') def log_if(level, msg, condition, *args): """Log 'msg % args' at level 'level' only if condition is fulfilled.""" if condition: @@ -251,11 +265,13 @@ def google2_log_prefix(level, timestamp=None, file_and_line=None): return s +@tf_export('logging.get_verbosity') def get_verbosity(): """Return how much logging output will be produced.""" return _get_logger().getEffectiveLevel() +@tf_export('logging.set_verbosity') def set_verbosity(v): """Sets the threshold for what messages will be logged.""" _get_logger().setLevel(v) @@ -296,4 +312,10 @@ _allowed_symbols = [ 'warning', ] +tf_export('logging.DEBUG').export_constant(__name__, 'DEBUG') +tf_export('logging.ERROR').export_constant(__name__, 'ERROR') +tf_export('logging.FATAL').export_constant(__name__, 'FATAL') +tf_export('logging.INFO').export_constant(__name__, 'INFO') +tf_export('logging.WARN').export_constant(__name__, 'WARN') + remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/python/profiler/model_analyzer.py b/tensorflow/python/profiler/model_analyzer.py index 8f780545607..0e20ca35bba 100644 --- a/tensorflow/python/profiler/model_analyzer.py +++ b/tensorflow/python/profiler/model_analyzer.py @@ -33,6 +33,7 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.profiler import option_builder from tensorflow.python.profiler import tfprof_logger +from tensorflow.python.util.tf_export import tf_export _DEFAULT_PROFILE_OPTIONS = 0 _DEFAULT_ADVISE_OPTIONS = 0 @@ -121,6 +122,7 @@ def _build_advisor_options(options): return opts +@tf_export('profiler.Profiler') class Profiler(object): """TensorFlow multi-step profiler. @@ -304,6 +306,7 @@ class Profiler(object): print_mdl.WriteProfile(filename) +@tf_export('profiler.profile') def profile(graph=None, run_meta=None, op_log=None, @@ -378,6 +381,7 @@ def profile(graph=None, return tfprof_node +@tf_export('profiler.advise') def advise(graph=None, run_meta=None, options=_DEFAULT_ADVISE_OPTIONS): """Auto profile and advise. diff --git a/tensorflow/python/profiler/option_builder.py b/tensorflow/python/profiler/option_builder.py index 13942ad6a2a..957ebe6dddc 100644 --- a/tensorflow/python/profiler/option_builder.py +++ b/tensorflow/python/profiler/option_builder.py @@ -20,8 +20,10 @@ from __future__ import print_function import copy from tensorflow.python.profiler import tfprof_logger +from tensorflow.python.util.tf_export import tf_export +@tf_export('profiler.ProfileOptionBuilder') class ProfileOptionBuilder(object): # pylint: disable=line-too-long """Option Builder for Profiling API. 
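The tf_logging.py hunks above attach `@tf_export('logging.*')` decorators to the existing logging helpers (`log_every_n`, `log_first_n`, `log_if`, `set_verbosity`, and the DEBUG/INFO/WARN/ERROR/FATAL constants). Below is a minimal usage sketch of those helpers; it assumes they surface under the `tf.logging` namespace that the decorators name, which is where these symbols already live in the 1.x API. The step values and messages are illustrative only.

```python
import tensorflow as tf

# Only messages at INFO level or above will be emitted.
tf.logging.set_verbosity(tf.logging.INFO)

for step in range(1, 501):
    # Logs 'msg % args' at INFO level once every 100 calls.
    tf.logging.log_every_n(tf.logging.INFO, 'processed step %d', 100, step)
    # Logs only for the first 3 calls.
    tf.logging.log_first_n(tf.logging.WARN, 'warm-up step %d', 3, step)
    # Logs only when the condition is fulfilled.
    tf.logging.log_if(tf.logging.ERROR, 'loss diverged at step %d', step == 404, step)
```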
diff --git a/tensorflow/python/profiler/tfprof_logger.py b/tensorflow/python/profiler/tfprof_logger.py index ffda7ddad75..8d121064967 100644 --- a/tensorflow/python/profiler/tfprof_logger.py +++ b/tensorflow/python/profiler/tfprof_logger.py @@ -30,6 +30,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.platform import gfile from tensorflow.python.profiler.internal import flops_registry # pylint: disable=unused-import +from tensorflow.python.util.tf_export import tf_export TRAINABLE_VARIABLES = '_trainable_variables' REGISTERED_FLOP_STATS = 'flops' @@ -187,6 +188,7 @@ def merge_default_with_oplog(graph, op_log=None, run_meta=None, return tmp_op_log +@tf_export('profiler.write_op_log') def write_op_log(graph, log_dir, op_log=None, run_meta=None, add_trace=True): """Log provided 'op_log', and add additional model information below. diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py index 5ff954fd9f8..6e85df0cbf5 100644 --- a/tensorflow/python/saved_model/loader_impl.py +++ b/tensorflow/python/saved_model/loader_impl.py @@ -232,13 +232,9 @@ def load(sess, tags, export_dir, **saver_kwargs): asset_tensors_dictionary = _get_asset_tensors(export_dir, meta_graph_def_to_load) - main_op_tensor = _get_main_op_tensor(meta_graph_def_to_load) + main_op_tensor = (_get_main_op_tensor(meta_graph_def_to_load) or + (_get_legacy_init_op_tensor(meta_graph_def_to_load))) if main_op_tensor is not None: sess.run(fetches=[main_op_tensor], feed_dict=asset_tensors_dictionary) - else: - legacy_init_op_tensor = _get_legacy_init_op_tensor(meta_graph_def_to_load) - if legacy_init_op_tensor is not None: - sess.run( - fetches=[legacy_init_op_tensor], feed_dict=asset_tensors_dictionary) return meta_graph_def_to_load diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py index 1ea619ff55d..f92247d52e4 100644 --- a/tensorflow/python/saved_model/saved_model_test.py +++ b/tensorflow/python/saved_model/saved_model_test.py @@ -54,8 +54,14 @@ def tearDownModule(): file_io.delete_recursively(test.get_temp_dir()) +@test_util.with_c_api class SavedModelTest(test.TestCase): + def _get_export_dir(self, label): + if ops._USE_C_API: + label += "_c_api" + return os.path.join(test.get_temp_dir(), label) + def _init_and_validate_variable(self, sess, variable_name, variable_value): v = variables.Variable(variable_value, name=variable_name) sess.run(variables.global_variables_initializer()) @@ -123,8 +129,7 @@ class SavedModelTest(test.TestCase): self.assertFalse(loader.maybe_saved_model_directory(base_path)) def testBadSavedModelFileFormat(self): - export_dir = os.path.join(test.get_temp_dir(), - "test_bad_saved_model_file_format") + export_dir = self._get_export_dir("test_bad_saved_model_file_format") # Attempt to load a SavedModel from an export directory that does not exist. 
with self.test_session(graph=ops.Graph()) as sess: with self.assertRaisesRegexp(IOError, @@ -157,8 +162,7 @@ class SavedModelTest(test.TestCase): loader.load(sess, ["foo"], export_dir) def testVerifySessionGraphUsage(self): - export_dir = os.path.join(test.get_temp_dir(), - "test_verify_session_graph_usage") + export_dir = self._get_export_dir("test_verify_session_graph_usage") builder = saved_model_builder.SavedModelBuilder(export_dir) with self.test_session(graph=ops.Graph()) as sess: @@ -178,7 +182,7 @@ class SavedModelTest(test.TestCase): 42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval()) def testSequence(self): - export_dir = os.path.join(test.get_temp_dir(), "test_sequence") + export_dir = self._get_export_dir("test_sequence") builder = saved_model_builder.SavedModelBuilder(export_dir) # Expect an assertion error since add_meta_graph_and_variables() should be @@ -195,7 +199,7 @@ class SavedModelTest(test.TestCase): sess, ["baz"]) def testTags(self): - export_dir = os.path.join(test.get_temp_dir(), "test_tags") + export_dir = self._get_export_dir("test_tags") builder = saved_model_builder.SavedModelBuilder(export_dir) # Graph with a single variable. SavedModel invoked to: @@ -284,7 +288,7 @@ class SavedModelTest(test.TestCase): export_dir) def testVariables(self): - export_dir = os.path.join(test.get_temp_dir(), "test_variables") + export_dir = self._get_export_dir("test_variables") builder = saved_model_builder.SavedModelBuilder(export_dir) # Graph with two variables. SavedModel invoked to: @@ -336,7 +340,7 @@ class SavedModelTest(test.TestCase): export_dir) def testGraphWithoutVariables(self): - export_dir = os.path.join(test.get_temp_dir(), "test_graph_has_variables") + export_dir = self._get_export_dir("test_graph_has_variables") builder = saved_model_builder.SavedModelBuilder(export_dir) # Graph with no variables. @@ -371,7 +375,7 @@ class SavedModelTest(test.TestCase): self.assertEqual(30.0, sess.run(c)) def testNoOverwrite(self): - export_dir = os.path.join(test.get_temp_dir(), "test_no_overwrite") + export_dir = self._get_export_dir("test_no_overwrite") builder = saved_model_builder.SavedModelBuilder(export_dir) # Graph with a single variable. SavedModel invoked to: @@ -395,7 +399,7 @@ class SavedModelTest(test.TestCase): export_dir) def testSaveAsText(self): - export_dir = os.path.join(test.get_temp_dir(), "test_astext") + export_dir = self._get_export_dir("test_astext") builder = saved_model_builder.SavedModelBuilder(export_dir) # Graph with a single variable. SavedModel invoked to: @@ -426,7 +430,7 @@ class SavedModelTest(test.TestCase): 42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval()) def testCollections(self): - export_dir = os.path.join(test.get_temp_dir(), "test_collections") + export_dir = self._get_export_dir("test_collections") builder = saved_model_builder.SavedModelBuilder(export_dir) # Graph with a single variable added to a collection. SavedModel invoked to: @@ -476,7 +480,7 @@ class SavedModelTest(test.TestCase): self.assertEqual(len(ops.get_collection("foo_vars")), 0) def testSignatureDefs(self): - export_dir = os.path.join(test.get_temp_dir(), "test_signature_defs") + export_dir = self._get_export_dir("test_signature_defs") builder = saved_model_builder.SavedModelBuilder(export_dir) # Graph with a single variable and a single entry in the signature def map. 
@@ -536,8 +540,7 @@ class SavedModelTest(test.TestCase): self.assertEqual("foo_new", bar_signature["foo_key"].method_name) def testSignatureDefValidation(self): - export_dir = os.path.join(test.get_temp_dir(), - "test_signature_def_validation") + export_dir = self._get_export_dir("test_signature_def_validation") builder = saved_model_builder.SavedModelBuilder(export_dir) tensor_without_name = meta_graph_pb2.TensorInfo() @@ -555,7 +558,7 @@ class SavedModelTest(test.TestCase): self._validate_outputs_tensor_info(builder, tensor_empty) def testAssets(self): - export_dir = os.path.join(test.get_temp_dir(), "test_assets") + export_dir = self._get_export_dir("test_assets") builder = saved_model_builder.SavedModelBuilder(export_dir) with self.test_session(graph=ops.Graph()) as sess: @@ -588,7 +591,7 @@ class SavedModelTest(test.TestCase): self.assertFalse(file_io.file_exists(ignored_asset_path)) def testCustomMainOp(self): - export_dir = os.path.join(test.get_temp_dir(), "test_main_op") + export_dir = self._get_export_dir("test_main_op") builder = saved_model_builder.SavedModelBuilder(export_dir) with self.test_session(graph=ops.Graph()) as sess: @@ -623,7 +626,7 @@ class SavedModelTest(test.TestCase): self.assertEqual(3, ops.get_collection("v")[2].eval()) def testLegacyInitOp(self): - export_dir = os.path.join(test.get_temp_dir(), "test_legacy_init_op") + export_dir = self._get_export_dir("test_legacy_init_op") builder = saved_model_builder.SavedModelBuilder(export_dir) with self.test_session(graph=ops.Graph()) as sess: @@ -657,8 +660,8 @@ class SavedModelTest(test.TestCase): self.assertEqual(3, ops.get_collection("v")[2].eval()) def testLegacyInitOpWithNonEmptyCollection(self): - export_dir = os.path.join(test.get_temp_dir(), - "test_legacy_init_op_with_non_empty_collection") + export_dir = self._get_export_dir( + "test_legacy_init_op_with_non_empty_collection") builder = saved_model_builder.SavedModelBuilder(export_dir) with self.test_session(graph=ops.Graph()) as sess: @@ -685,7 +688,7 @@ class SavedModelTest(test.TestCase): sess, ["foo"], legacy_init_op=legacy_init_op) def testMultipleAssets(self): - export_dir = os.path.join(test.get_temp_dir(), "test_multiple_assets") + export_dir = self._get_export_dir("test_multiple_assets") builder = saved_model_builder.SavedModelBuilder(export_dir) with self.test_session(graph=ops.Graph()) as sess: @@ -727,7 +730,7 @@ class SavedModelTest(test.TestCase): "asset_file_tensor:0") def testDuplicateAssets(self): - export_dir = os.path.join(test.get_temp_dir(), "test_duplicate_assets") + export_dir = self._get_export_dir("test_duplicate_assets") builder = saved_model_builder.SavedModelBuilder(export_dir) with self.test_session(graph=ops.Graph()) as sess: @@ -775,7 +778,7 @@ class SavedModelTest(test.TestCase): "asset_file_tensor:0") def testOp(self): - export_dir = os.path.join(test.get_temp_dir(), "test_op") + export_dir = self._get_export_dir("test_op") builder = saved_model_builder.SavedModelBuilder(export_dir) with session.Session( @@ -818,7 +821,7 @@ class SavedModelTest(test.TestCase): self.assertEqual(3, ops.get_collection("v")[2].eval()) def testCustomSaveable(self): - export_dir = os.path.join(test.get_temp_dir(), "custom_saveable") + export_dir = self._get_export_dir("custom_saveable") builder = saved_model_builder.SavedModelBuilder(export_dir) with session.Session( @@ -847,7 +850,7 @@ class SavedModelTest(test.TestCase): self.assertEqual(3.0, v1.values().eval()) def testClearDevices(self): - export_dir = os.path.join(test.get_temp_dir(), 
"test_clear_devices") + export_dir = self._get_export_dir("test_clear_devices") builder = saved_model_builder.SavedModelBuilder(export_dir) # Specify a device and save a variable. @@ -871,7 +874,9 @@ class SavedModelTest(test.TestCase): 42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval()) def testStripDefaultAttrs(self): - export_dir = os.path.join(test.get_temp_dir(), "test_strip_default_attrs") + if ops._USE_C_API: return # TODO(skyewm): get this working + + export_dir = self._get_export_dir("test_strip_default_attrs") builder = saved_model_builder.SavedModelBuilder(export_dir) # Add a graph with two float32 variables and a Complex Op composing them @@ -941,8 +946,10 @@ class SavedModelTest(test.TestCase): self.assertIn("Tout", node_def.attr) def testStripDefaultAttrsInconsistentConsumerDefaults(self): - export_dir = os.path.join(test.get_temp_dir(), - "test_strip_default_attrs_no_consumer_defaults") + if ops._USE_C_API: return # TODO(skyewm): get this working + + export_dir = self._get_export_dir( + "test_strip_default_attrs_no_consumer_defaults") builder = saved_model_builder.SavedModelBuilder(export_dir) # Add a graph with two float32 variables and a Complex Op composing them diff --git a/tensorflow/python/saved_model/simple_save.py b/tensorflow/python/saved_model/simple_save.py index 9a81e5cd807..1e4cc733706 100644 --- a/tensorflow/python/saved_model/simple_save.py +++ b/tensorflow/python/saved_model/simple_save.py @@ -40,17 +40,20 @@ def simple_save(session, export_dir, inputs, outputs, legacy_init_op=None): - It will be treated as a graph for inference / serving (i.e. uses the tag `tag_constants.SERVING`) - The SavedModel will load in TensorFlow Serving and supports the - [Predict API](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/predict.proto). + [Predict + API](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/predict.proto). To use the Classify, Regress, or MultiInference APIs, please use either [tf.Estimator](https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator) or the lower level - [SavedModel APIs](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md). + [SavedModel + APIs](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md). - Some TensorFlow ops depend on information on disk or other information called "assets". These are generally handled automatically by adding the assets to the `GraphKeys.ASSET_FILEPATHS` collection. Only assets in that collection are exported; if you need more custom behavior, you'll need to - use the [SavedModelBuilder](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/builder.py). + use the + [SavedModelBuilder](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/builder.py). More information about SavedModel and signatures can be found here: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md. 
diff --git a/tensorflow/python/summary/writer/writer.py b/tensorflow/python/summary/writer/writer.py index 12f120116f4..1f3f2287043 100644 --- a/tensorflow/python/summary/writer/writer.py +++ b/tensorflow/python/summary/writer/writer.py @@ -32,6 +32,7 @@ from tensorflow.python.platform import gfile from tensorflow.python.platform import tf_logging as logging from tensorflow.python.summary import plugin_asset from tensorflow.python.summary.writer.event_file_writer import EventFileWriter +from tensorflow.python.util.tf_export import tf_export _PLUGINS_DIR = "plugins" @@ -276,6 +277,7 @@ class SummaryToEventTransformer(object): self.event_writer.add_event(event) +@tf_export("summary.FileWriter") class FileWriter(SummaryToEventTransformer): """Writes `Summary` protocol buffers to event files. diff --git a/tensorflow/python/summary/writer/writer_cache.py b/tensorflow/python/summary/writer/writer_cache.py index bad289303c0..645fa28a37f 100644 --- a/tensorflow/python/summary/writer/writer_cache.py +++ b/tensorflow/python/summary/writer/writer_cache.py @@ -22,8 +22,10 @@ import threading from tensorflow.python.framework import ops from tensorflow.python.summary.writer.writer import FileWriter +from tensorflow.python.util.tf_export import tf_export +@tf_export('summary.FileWriterCache') class FileWriterCache(object): """Cache for file writers. diff --git a/tensorflow/python/tools/freeze_graph.py b/tensorflow/python/tools/freeze_graph.py index a2e86a1c43a..fd78f44c999 100644 --- a/tensorflow/python/tools/freeze_graph.py +++ b/tensorflow/python/tools/freeze_graph.py @@ -251,7 +251,7 @@ def main(unused_args): FLAGS.output_graph, FLAGS.clear_devices, FLAGS.initializer_nodes, FLAGS.variable_names_whitelist, FLAGS.variable_names_blacklist, FLAGS.input_meta_graph, FLAGS.input_saved_model_dir, - FLAGS.saved_model_tags, checkpoint_version=checkpoint_version) + FLAGS.saved_model_tags, FLAGS.checkpoint_version) if __name__ == "__main__": diff --git a/tensorflow/python/tools/optimize_for_inference_lib.py b/tensorflow/python/tools/optimize_for_inference_lib.py index c2687bf557b..9c192712225 100644 --- a/tensorflow/python/tools/optimize_for_inference_lib.py +++ b/tensorflow/python/tools/optimize_for_inference_lib.py @@ -349,6 +349,7 @@ def fold_batch_norms(input_graph_def): bias_add_op.op = "BiasAdd" bias_add_op.name = node.name bias_add_op.attr["T"].CopyFrom(conv_op.attr["T"]) + bias_add_op.attr["data_format"].CopyFrom(conv_op.attr["data_format"]) bias_add_op.input.extend([new_conv_op.name, offset_op.name]) new_ops.extend([scaled_weights_op, new_conv_op, offset_op, bias_add_op]) diff --git a/tensorflow/python/tools/optimize_for_inference_test.py b/tensorflow/python/tools/optimize_for_inference_test.py index 6dd24c0dca1..2ef612473b4 100644 --- a/tensorflow/python/tools/optimize_for_inference_test.py +++ b/tensorflow/python/tools/optimize_for_inference_test.py @@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import importer from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import image_ops @@ -38,6 +39,7 @@ from tensorflow.python.platform import test from tensorflow.python.tools import optimize_for_inference_lib +@test_util.with_c_api class OptimizeForInferenceTest(test.TestCase): def create_node_def(self, op, name, inputs): @@ -145,7 +147,7 @@ class 
OptimizeForInferenceTest(test.TestCase): np.array([0.1, 0.6]), shape=[2], dtype=dtypes.float32) gamma_op = constant_op.constant( np.array([1.0, 2.0]), shape=[2], dtype=dtypes.float32) - ops.get_default_graph().graph_def_versions.producer = 8 + test_util.set_producer_version(ops.get_default_graph(), 8) gen_nn_ops._batch_norm_with_global_normalization( conv_op, mean_op, @@ -171,48 +173,53 @@ class OptimizeForInferenceTest(test.TestCase): self.assertNotEqual("BatchNormWithGlobalNormalization", node.op) def testFoldFusedBatchNorms(self): - with self.test_session() as sess: - inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6] - input_op = constant_op.constant( - np.array(inputs), shape=[1, 1, 6, 2], dtype=dtypes.float32) - weights = [1, 2, 3, 4, 0.1, 0.2, 0.3, 0.4] - weights_op = constant_op.constant( - np.array(weights), shape=[1, 2, 2, 2], dtype=dtypes.float32) - conv_op = nn_ops.conv2d( - input_op, weights_op, [1, 1, 1, 1], padding="SAME", name="conv_op") - mean_op = constant_op.constant( - np.array([10, 20]), shape=[2], dtype=dtypes.float32) - variance_op = constant_op.constant( - np.array([0.25, 0.5]), shape=[2], dtype=dtypes.float32) - beta_op = constant_op.constant( - np.array([0.1, 0.6]), shape=[2], dtype=dtypes.float32) - gamma_op = constant_op.constant( - np.array([1.0, 2.0]), shape=[2], dtype=dtypes.float32) - ops.get_default_graph().graph_def_versions.producer = 9 - gen_nn_ops._fused_batch_norm( - conv_op, - gamma_op, - beta_op, - mean_op, - variance_op, - 0.00001, - is_training=False, - name="output") - original_graph_def = sess.graph_def - original_result = sess.run(["output:0"]) - optimized_graph_def = optimize_for_inference_lib.fold_batch_norms( - original_graph_def) + for data_format, use_gpu in [("NHWC", False), ("NCHW", True)]: + with self.test_session(use_gpu=use_gpu) as sess: + inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6] + input_op = constant_op.constant( + np.array(inputs), + shape=[1, 1, 6, 2] if data_format == "NHWC" else [1, 2, 1, 6], + dtype=dtypes.float32) + weights = [1, 2, 3, 4, 0.1, 0.2, 0.3, 0.4] + weights_op = constant_op.constant( + np.array(weights), shape=[1, 2, 2, 2], dtype=dtypes.float32) + conv_op = nn_ops.conv2d( + input_op, weights_op, [1, 1, 1, 1], padding="SAME", + data_format=data_format, name="conv_op") + mean_op = constant_op.constant( + np.array([10, 20]), shape=[2], dtype=dtypes.float32) + variance_op = constant_op.constant( + np.array([0.25, 0.5]), shape=[2], dtype=dtypes.float32) + beta_op = constant_op.constant( + np.array([0.1, 0.6]), shape=[2], dtype=dtypes.float32) + gamma_op = constant_op.constant( + np.array([1.0, 2.0]), shape=[2], dtype=dtypes.float32) + ops.get_default_graph().graph_def_versions.producer = 9 + gen_nn_ops._fused_batch_norm( + conv_op, + gamma_op, + beta_op, + mean_op, + variance_op, + 0.00001, + is_training=False, + data_format=data_format, + name="output") + original_graph_def = sess.graph_def + original_result = sess.run(["output:0"]) + optimized_graph_def = optimize_for_inference_lib.fold_batch_norms( + original_graph_def) - with self.test_session() as sess: - _ = importer.import_graph_def( - optimized_graph_def, input_map={}, name="optimized") - optimized_result = sess.run(["optimized/output:0"]) + with self.test_session(use_gpu=use_gpu) as sess: + _ = importer.import_graph_def( + optimized_graph_def, input_map={}, name="optimized") + optimized_result = sess.run(["optimized/output:0"]) - self.assertAllClose( - original_result, optimized_result, rtol=1e-04, atol=1e-06) + self.assertAllClose( + original_result, 
optimized_result, rtol=1e-04, atol=1e-06) - for node in optimized_graph_def.node: - self.assertNotEqual("FusedBatchNorm", node.op) + for node in optimized_graph_def.node: + self.assertNotEqual("FusedBatchNorm", node.op) def testFuseResizePadAndConv(self): with self.test_session() as sess: diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py index 21e8e803fcb..5b0a584c10e 100644 --- a/tensorflow/python/tools/saved_model_cli.py +++ b/tensorflow/python/tools/saved_model_cli.py @@ -31,6 +31,7 @@ import warnings import numpy as np +from six import integer_types from tensorflow.contrib.saved_model.python.saved_model import reader from tensorflow.contrib.saved_model.python.saved_model import signature_def_utils from tensorflow.core.example import example_pb2 @@ -440,7 +441,7 @@ def _create_example_string(example_dict): elif isinstance(feature_list[0], str): example.features.feature[feature_name].bytes_list.value.extend( feature_list) - elif isinstance(feature_list[0], (int, long)): + elif isinstance(feature_list[0], integer_types): example.features.feature[feature_name].int64_list.value.extend( feature_list) else: diff --git a/tensorflow/python/training/adadelta.py b/tensorflow/python/training/adadelta.py index 13c07cfd7bf..c08e3cca007 100644 --- a/tensorflow/python/training/adadelta.py +++ b/tensorflow/python/training/adadelta.py @@ -22,8 +22,10 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import math_ops from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.AdadeltaOptimizer") class AdadeltaOptimizer(optimizer.Optimizer): """Optimizer that implements the Adadelta algorithm. diff --git a/tensorflow/python/training/adagrad.py b/tensorflow/python/training/adagrad.py index afa192f7cc6..deb4e6f5463 100644 --- a/tensorflow/python/training/adagrad.py +++ b/tensorflow/python/training/adagrad.py @@ -25,8 +25,10 @@ from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.AdagradOptimizer") class AdagradOptimizer(optimizer.Optimizer): """Optimizer that implements the Adagrad algorithm. diff --git a/tensorflow/python/training/adagrad_da.py b/tensorflow/python/training/adagrad_da.py index b3f9ea323c2..5ba403554f5 100644 --- a/tensorflow/python/training/adagrad_da.py +++ b/tensorflow/python/training/adagrad_da.py @@ -23,8 +23,10 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.AdagradDAOptimizer") class AdagradDAOptimizer(optimizer.Optimizer): """Adagrad Dual Averaging algorithm for sparse linear models. 
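Note on the `fold_batch_norms` test above: with a frozen batch norm, the normalization collapses into a per-channel scale and offset that can be folded into the preceding convolution. A minimal NumPy sketch of that algebra (the function name and the HWIO weight layout are illustrative, not the optimizer's actual code):

```python
import numpy as np

def fold_batch_norm(weights, mean, variance, beta, gamma, epsilon=1e-5):
    """Fold a frozen batch norm into conv weights laid out as [H, W, in, out]."""
    scale = gamma / np.sqrt(variance + epsilon)   # one factor per output channel
    folded_weights = weights * scale              # broadcasts over the last axis
    folded_bias = beta - mean * scale
    return folded_weights, folded_bias

# conv(x, W) followed by the frozen batch norm equals conv(x, folded_W) + folded_bias.
```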
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py index 0c69f8bf399..c92f6fc3015 100644 --- a/tensorflow/python/training/adam.py +++ b/tensorflow/python/training/adam.py @@ -26,8 +26,10 @@ from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import state_ops from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.AdamOptimizer") class AdamOptimizer(optimizer.Optimizer): """Optimizer that implements the Adam algorithm. diff --git a/tensorflow/python/training/basic_loops.py b/tensorflow/python/training/basic_loops.py index 52b0f421061..7af821c8192 100644 --- a/tensorflow/python/training/basic_loops.py +++ b/tensorflow/python/training/basic_loops.py @@ -18,8 +18,10 @@ from __future__ import division from __future__ import print_function from tensorflow.python.framework import errors +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.basic_train_loop") def basic_train_loop(supervisor, train_step_fn, args=None, kwargs=None, master=""): """Basic loop to train a model. diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py index 752d585cd17..aae757b99aa 100644 --- a/tensorflow/python/training/basic_session_run_hooks.py +++ b/tensorflow/python/training/basic_session_run_hooks.py @@ -47,6 +47,7 @@ from tensorflow.python.training import session_run_hook from tensorflow.python.training import training_util from tensorflow.python.training.session_run_hook import SessionRunArgs from tensorflow.python.training.summary_io import SummaryWriterCache +from tensorflow.python.util.tf_export import tf_export class _HookTimer(object): @@ -85,6 +86,7 @@ class _HookTimer(object): raise NotImplementedError +@tf_export("train.SecondOrStepTimer") class SecondOrStepTimer(_HookTimer): """Timer that triggers at most once every N seconds or once every N steps. """ @@ -164,6 +166,7 @@ class NeverTriggerTimer(_HookTimer): return None +@tf_export("train.LoggingTensorHook") class LoggingTensorHook(session_run_hook.SessionRunHook): """Prints the given tensors every N local steps, every N seconds, or at end. @@ -262,6 +265,7 @@ class LoggingTensorHook(session_run_hook.SessionRunHook): self._log_tensors(values) +@tf_export("train.StopAtStepHook") class StopAtStepHook(session_run_hook.SessionRunHook): """Hook that requests stop at a specified step.""" @@ -317,6 +321,7 @@ class StopAtStepHook(session_run_hook.SessionRunHook): run_context.request_stop() +@tf_export("train.CheckpointSaverListener") class CheckpointSaverListener(object): """Interface for listeners that take action before or after checkpoint save. @@ -331,7 +336,7 @@ class CheckpointSaverListener(object): `CheckpointSaverHook`, as in this example: ```python - class ExampleCheckpointSaverListerner(CheckpointSaverListener): + class ExampleCheckpointSaverListener(CheckpointSaverListener): def begin(self): # You can add ops to the graph here. print('Starting the session.') @@ -347,7 +352,7 @@ class CheckpointSaverListener(object): print('Done with the session.') ... 
- listener = ExampleCheckpointSaverListerner() + listener = ExampleCheckpointSaverListener() saver_hook = tf.train.CheckpointSaverHook( checkpoint_dir, listeners=[listener]) with tf.train.MonitoredTrainingSession(chief_only_hooks=[saver_hook]): @@ -375,6 +380,7 @@ class CheckpointSaverListener(object): pass +@tf_export("train.CheckpointSaverHook") class CheckpointSaverHook(session_run_hook.SessionRunHook): """Saves checkpoints every N steps or seconds.""" @@ -497,6 +503,7 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook): return savers[0] +@tf_export("train.StepCounterHook") class StepCounterHook(session_run_hook.SessionRunHook): """Hook that counts steps per second.""" @@ -575,12 +582,14 @@ class StepCounterHook(session_run_hook.SessionRunHook): self._last_global_step = stale_global_step +@tf_export("train.NanLossDuringTrainingError") class NanLossDuringTrainingError(RuntimeError): def __str__(self): return "NaN loss during training." +@tf_export("train.NanTensorHook") class NanTensorHook(session_run_hook.SessionRunHook): """Monitors the loss tensor and stops training if loss is NaN. @@ -612,6 +621,7 @@ class NanTensorHook(session_run_hook.SessionRunHook): run_context.request_stop() +@tf_export("train.SummarySaverHook") class SummarySaverHook(session_run_hook.SessionRunHook): """Saves summaries every N steps.""" @@ -720,6 +730,7 @@ class SummarySaverHook(session_run_hook.SessionRunHook): return summary_op +@tf_export("train.GlobalStepWaiterHook") class GlobalStepWaiterHook(session_run_hook.SessionRunHook): """Delays execution until global step reaches `wait_until_step`. @@ -767,6 +778,7 @@ class GlobalStepWaiterHook(session_run_hook.SessionRunHook): time.sleep(0.5) +@tf_export("train.FinalOpsHook") class FinalOpsHook(session_run_hook.SessionRunHook): """A hook which evaluates `Tensors` at the end of a session.""" @@ -793,6 +805,7 @@ class FinalOpsHook(session_run_hook.SessionRunHook): feed_dict=self._final_ops_feed_dict) +@tf_export("train.FeedFnHook") class FeedFnHook(session_run_hook.SessionRunHook): """Runs `feed_fn` and sets the `feed_dict` accordingly.""" @@ -810,6 +823,7 @@ class FeedFnHook(session_run_hook.SessionRunHook): fetches=None, feed_dict=self.feed_fn()) +@tf_export("train.ProfilerHook") class ProfilerHook(session_run_hook.SessionRunHook): """Captures CPU/GPU profiling information every N steps or seconds. diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py index b5d3e787971..fa3de6fad27 100644 --- a/tensorflow/python/training/checkpoint_utils.py +++ b/tensorflow/python/training/checkpoint_utils.py @@ -29,6 +29,7 @@ from tensorflow.python.ops import variables from tensorflow.python.platform import gfile from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import saver +from tensorflow.python.util.tf_export import tf_export __all__ = [ @@ -36,6 +37,7 @@ __all__ = [ ] +@tf_export("train.load_checkpoint") def load_checkpoint(ckpt_dir_or_file): """Returns `CheckpointReader` for checkpoint found in `ckpt_dir_or_file`. @@ -60,6 +62,7 @@ def load_checkpoint(ckpt_dir_or_file): return pywrap_tensorflow.NewCheckpointReader(filename) +@tf_export("train.load_variable") def load_variable(ckpt_dir_or_file, name): """Returns the tensor value of the given variable in the checkpoint. 
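Most hunks in this part of the change do one thing: annotate a symbol with `@tf_export(...)` so it is registered under a public `tf.train.*` (or `tf.compat.*`) name. A toy registry illustrating the decorator pattern, not TensorFlow's actual `tf_export` implementation:

```python
# Hypothetical registry sketch; TensorFlow's real tf_export does more bookkeeping.
_API_SYMBOLS = {}

def api_export(*names):
    """Record the decorated symbol under one or more public API names."""
    def decorator(symbol):
        for name in names:
            _API_SYMBOLS[name] = symbol
        return symbol  # the symbol itself is left untouched
    return decorator

@api_export("train.ExampleOptimizer")
class ExampleOptimizer(object):
    pass

assert _API_SYMBOLS["train.ExampleOptimizer"] is ExampleOptimizer
```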
@@ -77,6 +80,7 @@ def load_variable(ckpt_dir_or_file, name): return reader.get_tensor(name) +@tf_export("train.list_variables") def list_variables(ckpt_dir_or_file): """Returns list of all variables in the checkpoint. @@ -95,6 +99,7 @@ def list_variables(ckpt_dir_or_file): return result +@tf_export("train.init_from_checkpoint") def init_from_checkpoint(ckpt_dir_or_file, assignment_map): """Initializes current variables with tensors loaded from given checkpoint. @@ -242,6 +247,9 @@ def init_from_checkpoint(ckpt_dir_or_file, assignment_map): full_tensor_name = full_tensor_name[1:] if tensor_name_in_ckpt != "/": full_tensor_name = tensor_name_in_ckpt + full_tensor_name + # Remove trailing '/', if any, in the full_tensor_name + if full_tensor_name.endswith("/"): + full_tensor_name = full_tensor_name[:-1] if full_tensor_name not in variable_map: raise ValueError( "Tensor %s (%s in %s) is not found in %s checkpoint" % ( diff --git a/tensorflow/python/training/coordinator.py b/tensorflow/python/training/coordinator.py index 0e31255b74f..0ff97d85e37 100644 --- a/tensorflow/python/training/coordinator.py +++ b/tensorflow/python/training/coordinator.py @@ -27,8 +27,10 @@ import six from tensorflow.python.framework import errors from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import compat +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.Coordinator") class Coordinator(object): """A coordinator for threads. @@ -406,6 +408,7 @@ class Coordinator(object): # Threads for the standard services. +@tf_export("train.LooperThread") class LooperThread(threading.Thread): """A thread that runs code repeatedly, optionally on a timer. diff --git a/tensorflow/python/training/device_setter.py b/tensorflow/python/training/device_setter.py index 37ab625779f..689088bb41e 100644 --- a/tensorflow/python/training/device_setter.py +++ b/tensorflow/python/training/device_setter.py @@ -23,6 +23,7 @@ from tensorflow.core.framework import node_def_pb2 from tensorflow.python.framework import device as pydev from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import server_lib +from tensorflow.python.util.tf_export import tf_export class _RoundRobinStrategy(object): @@ -121,6 +122,7 @@ class _ReplicaDeviceChooser(object): return worker_device.to_string() +@tf_export("train.replica_device_setter") def replica_device_setter(ps_tasks=0, ps_device="/job:ps", worker_device="/job:worker", merge_devices=True, cluster=None, ps_ops=None, ps_strategy=None): diff --git a/tensorflow/python/training/ftrl.py b/tensorflow/python/training/ftrl.py index c64a1b3f799..9d02e694db1 100644 --- a/tensorflow/python/training/ftrl.py +++ b/tensorflow/python/training/ftrl.py @@ -22,8 +22,10 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import math_ops from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.FtrlOptimizer") class FtrlOptimizer(optimizer.Optimizer): """Optimizer that implements the FTRL algorithm. 
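The `checkpoint_utils` functions exported above (`load_checkpoint`, `load_variable`, `list_variables`) are the usual way to inspect an existing checkpoint. A small usage sketch, assuming a checkpoint already exists at the illustrative path `/tmp/demo_ckpt`:

```python
import tensorflow as tf

ckpt_dir = "/tmp/demo_ckpt"  # illustrative; any directory containing a checkpoint

# List every variable stored in the checkpoint, then read one back as a numpy array.
for name, shape in tf.train.list_variables(ckpt_dir):
    print(name, shape)

reader = tf.train.load_checkpoint(ckpt_dir)              # CheckpointReader
value = tf.train.load_variable(ckpt_dir, "global_step")  # variable name is illustrative
```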
@@ -265,4 +267,3 @@ class FtrlOptimizer(optimizer.Optimizer): grad.dtype), math_ops.cast(self._learning_rate_power_tensor, grad.dtype), use_locking=self._use_locking) - diff --git a/tensorflow/python/training/gradient_descent.py b/tensorflow/python/training/gradient_descent.py index 5a536e27297..380e14e0249 100644 --- a/tensorflow/python/training/gradient_descent.py +++ b/tensorflow/python/training/gradient_descent.py @@ -23,8 +23,10 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.GradientDescentOptimizer") class GradientDescentOptimizer(optimizer.Optimizer): """Optimizer that implements the gradient descent algorithm. """ diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py index 331a51e8bc8..bd9985a7c5c 100644 --- a/tensorflow/python/training/input.py +++ b/tensorflow/python/training/input.py @@ -44,6 +44,7 @@ from tensorflow.python.ops import sparse_ops from tensorflow.python.ops import variable_scope as vs from tensorflow.python.summary import summary from tensorflow.python.training import queue_runner +from tensorflow.python.util.tf_export import tf_export # pylint: disable=protected-access @@ -53,9 +54,12 @@ _restore_sparse = sparse_ops._take_many_sparse_from_tensors_map # pylint: enable=protected-access +@tf_export("train.match_filenames_once") def match_filenames_once(pattern, name=None): """Save the list of files matching pattern, so it is only computed once. + NOTE: The order of the files returned can be non-deterministic. + Args: pattern: A file pattern (glob), or 1D tensor of file patterns. name: A name for the operations (optional). @@ -70,6 +74,7 @@ def match_filenames_once(pattern, name=None): collections=[ops.GraphKeys.LOCAL_VARIABLES]) +@tf_export("train.limit_epochs") def limit_epochs(tensor, num_epochs=None, name=None): """Returns tensor `num_epochs` times and then raises an `OutOfRange` error. @@ -102,6 +107,7 @@ def limit_epochs(tensor, num_epochs=None, name=None): return array_ops.identity(tensor, name=name) +@tf_export("train.input_producer") def input_producer(input_tensor, element_shape=None, num_epochs=None, @@ -184,6 +190,7 @@ def input_producer(input_tensor, return q +@tf_export("train.string_input_producer") def string_input_producer(string_tensor, num_epochs=None, shuffle=True, @@ -253,6 +260,7 @@ def string_input_producer(string_tensor, cancel_op=cancel_op) +@tf_export("train.range_input_producer") def range_input_producer(limit, num_epochs=None, shuffle=True, seed=None, capacity=32, shared_name=None, name=None): """Produces the integers from 0 to limit-1 in a queue. @@ -290,6 +298,7 @@ def range_input_producer(limit, num_epochs=None, shuffle=True, seed=None, shared_name, "fraction_of_%d_full" % capacity, name) +@tf_export("train.slice_input_producer") def slice_input_producer(tensor_list, num_epochs=None, shuffle=True, seed=None, capacity=32, shared_name=None, name=None): """Produces a slice of each `Tensor` in `tensor_list`. 
@@ -885,6 +894,7 @@ def _shuffle_batch_join(tensors_list, batch_size, capacity, # Batching functions ---------------------------------------------------------- +@tf_export("train.batch") def batch(tensors, batch_size, num_threads=1, capacity=32, enqueue_many=False, shapes=None, dynamic_pad=False, allow_smaller_final_batch=False, shared_name=None, name=None): @@ -979,6 +989,7 @@ def batch(tensors, batch_size, num_threads=1, capacity=32, name=name) +@tf_export("train.maybe_batch") def maybe_batch(tensors, keep_input, batch_size, num_threads=1, capacity=32, enqueue_many=False, shapes=None, dynamic_pad=False, allow_smaller_final_batch=False, shared_name=None, name=None): @@ -1031,6 +1042,7 @@ def maybe_batch(tensors, keep_input, batch_size, num_threads=1, capacity=32, name=name) +@tf_export("train.batch_join") def batch_join(tensors_list, batch_size, capacity=32, enqueue_many=False, shapes=None, dynamic_pad=False, allow_smaller_final_batch=False, shared_name=None, name=None): @@ -1136,6 +1148,7 @@ def batch_join(tensors_list, batch_size, capacity=32, enqueue_many=False, name=name) +@tf_export("train.maybe_batch_join") def maybe_batch_join(tensors_list, keep_input, batch_size, capacity=32, enqueue_many=False, shapes=None, dynamic_pad=False, allow_smaller_final_batch=False, shared_name=None, @@ -1188,6 +1201,7 @@ def maybe_batch_join(tensors_list, keep_input, batch_size, capacity=32, name=name) +@tf_export("train.shuffle_batch") def shuffle_batch(tensors, batch_size, capacity, min_after_dequeue, num_threads=1, seed=None, enqueue_many=False, shapes=None, allow_smaller_final_batch=False, shared_name=None, name=None): @@ -1287,6 +1301,7 @@ def shuffle_batch(tensors, batch_size, capacity, min_after_dequeue, name=name) +@tf_export("train.maybe_shuffle_batch") def maybe_shuffle_batch(tensors, batch_size, capacity, min_after_dequeue, keep_input, num_threads=1, seed=None, enqueue_many=False, shapes=None, @@ -1346,6 +1361,7 @@ def maybe_shuffle_batch(tensors, batch_size, capacity, min_after_dequeue, name=name) +@tf_export("train.shuffle_batch_join") def shuffle_batch_join(tensors_list, batch_size, capacity, min_after_dequeue, seed=None, enqueue_many=False, shapes=None, allow_smaller_final_batch=False, @@ -1439,6 +1455,7 @@ def shuffle_batch_join(tensors_list, batch_size, capacity, name=name) +@tf_export("train.maybe_shuffle_batch_join") def maybe_shuffle_batch_join(tensors_list, batch_size, capacity, min_after_dequeue, keep_input, seed=None, enqueue_many=False, shapes=None, diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py index 3ee49650e01..10ab4c1137f 100644 --- a/tensorflow/python/training/learning_rate_decay.py +++ b/tensorflow/python/training/learning_rate_decay.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
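The `input.py` hunks above export the classic queue-based input pipeline (`string_input_producer`, `batch`, `shuffle_batch`, ...). A minimal graph-construction sketch of how these pieces are usually combined (file names are illustrative; this is the TF 1.x queue-runner style, not `tf.data`):

```python
import tensorflow as tf

filenames = ["/tmp/data-00000.tfrecords", "/tmp/data-00001.tfrecords"]  # illustrative
filename_queue = tf.train.string_input_producer(filenames, shuffle=True)

reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)

# Shuffle and group records into batches of 32.
example_batch = tf.train.shuffle_batch(
    [serialized_example], batch_size=32, capacity=1000, min_after_dequeue=100)

# At run time one would create a tf.train.Coordinator(), call
# tf.train.start_queue_runners(sess, coord), and sess.run(example_batch) in a loop.
```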
# ============================================================================== - """Various learning rate decay functions.""" from __future__ import absolute_import from __future__ import division @@ -26,10 +25,16 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops +from tensorflow.python.util.tf_export import tf_export -def exponential_decay(learning_rate, global_step, decay_steps, decay_rate, - staircase=False, name=None): +@tf_export("train.exponential_decay") +def exponential_decay(learning_rate, + global_step, + decay_steps, + decay_rate, + staircase=False, + name=None): """Applies exponential decay to the learning rate. When training a model, it is often recommended to lower the learning rate as @@ -85,9 +90,9 @@ def exponential_decay(learning_rate, global_step, decay_steps, decay_rate, """ if global_step is None: raise ValueError("global_step is required for exponential_decay.") - with ops.name_scope(name, "ExponentialDecay", - [learning_rate, global_step, - decay_steps, decay_rate]) as name: + with ops.name_scope( + name, "ExponentialDecay", + [learning_rate, global_step, decay_steps, decay_rate]) as name: learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate") dtype = learning_rate.dtype global_step = math_ops.cast(global_step, dtype) @@ -96,10 +101,11 @@ def exponential_decay(learning_rate, global_step, decay_steps, decay_rate, p = global_step / decay_steps if staircase: p = math_ops.floor(p) - return math_ops.multiply(learning_rate, math_ops.pow(decay_rate, p), - name=name) + return math_ops.multiply( + learning_rate, math_ops.pow(decay_rate, p), name=name) +@tf_export("train.piecewise_constant") def piecewise_constant(x, boundaries, values, name=None): """Piecewise constant from boundaries and interval values. @@ -156,15 +162,15 @@ def piecewise_constant(x, boundaries, values, name=None): boundaries[i] = b else: raise ValueError( - "Boundaries (%s) must have the same dtype as x (%s)." % ( - b.dtype.base_dtype, x.dtype.base_dtype)) + "Boundaries (%s) must have the same dtype as x (%s)." % + (b.dtype.base_dtype, x.dtype.base_dtype)) # TODO(rdipietro): Ensure that boundaries' elements are strictly increasing. values = ops.convert_n_to_tensor(values) for v in values[1:]: if v.dtype.base_dtype != values[0].dtype.base_dtype: raise ValueError( - "Values must have elements all with the same dtype (%s vs %s)." % ( - values[0].dtype.base_dtype, v.dtype.base_dtype)) + "Values must have elements all with the same dtype (%s vs %s)." % + (values[0].dtype.base_dtype, v.dtype.base_dtype)) pred_fn_pairs = [] pred_fn_pairs.append((x <= boundaries[0], lambda: values[0])) pred_fn_pairs.append((x > boundaries[-1], lambda: values[-1])) @@ -179,9 +185,14 @@ def piecewise_constant(x, boundaries, values, name=None): return control_flow_ops.case(pred_fn_pairs, default, exclusive=True) -def polynomial_decay(learning_rate, global_step, decay_steps, - end_learning_rate=0.0001, power=1.0, - cycle=False, name=None): +@tf_export("train.polynomial_decay") +def polynomial_decay(learning_rate, + global_step, + decay_steps, + end_learning_rate=0.0001, + power=1.0, + cycle=False, + name=None): """Applies a polynomial decay to the learning rate. 
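For reference, the `exponential_decay` hunk above reduces to a one-liner; a plain-Python sketch of the same arithmetic (`staircase` simply floors the exponent):

```python
import math

def exponential_decay(lr, global_step, decay_steps, decay_rate, staircase=False):
    p = global_step / float(decay_steps)
    if staircase:
        p = math.floor(p)
    return lr * decay_rate ** p

# After exactly one decay period the rate is multiplied by decay_rate once.
assert abs(exponential_decay(0.1, 100, 100, 0.96) - 0.096) < 1e-12
```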
It is commonly observed that a monotonically decreasing learning rate, whose @@ -255,9 +266,10 @@ def polynomial_decay(learning_rate, global_step, decay_steps, """ if global_step is None: raise ValueError("global_step is required for polynomial_decay.") - with ops.name_scope(name, "PolynomialDecay", - [learning_rate, global_step, - decay_steps, end_learning_rate, power]) as name: + with ops.name_scope( + name, "PolynomialDecay", + [learning_rate, global_step, decay_steps, end_learning_rate, power + ]) as name: learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate") dtype = learning_rate.dtype global_step = math_ops.cast(global_step, dtype) @@ -267,23 +279,29 @@ def polynomial_decay(learning_rate, global_step, decay_steps, if cycle: # Find the first multiple of decay_steps that is bigger than global_step. # If global_step is zero set the multiplier to 1 - multiplier = control_flow_ops.cond(math_ops.equal(global_step, 0), - lambda: 1.0, - lambda: math_ops.ceil( - global_step / decay_steps)) + multiplier = control_flow_ops.cond( + math_ops.equal(global_step, 0), lambda: 1.0, + lambda: math_ops.ceil(global_step / decay_steps)) decay_steps = math_ops.multiply(decay_steps, multiplier) else: # Make sure that the global_step used is not bigger than decay_steps. global_step = math_ops.minimum(global_step, decay_steps) p = math_ops.div(global_step, decay_steps) - return math_ops.add(math_ops.multiply(learning_rate - end_learning_rate, - math_ops.pow(1 - p, power)), - end_learning_rate, name=name) + return math_ops.add( + math_ops.multiply(learning_rate - end_learning_rate, + math_ops.pow(1 - p, power)), + end_learning_rate, + name=name) -def natural_exp_decay(learning_rate, global_step, decay_steps, decay_rate, - staircase=False, name=None): +@tf_export("train.natural_exp_decay") +def natural_exp_decay(learning_rate, + global_step, + decay_steps, + decay_rate, + staircase=False, + name=None): """Applies natural exponential decay to the initial learning rate. When training a model, it is often recommended to lower the learning rate as @@ -349,8 +367,13 @@ def natural_exp_decay(learning_rate, global_step, decay_steps, decay_rate, return math_ops.multiply(learning_rate, exponent, name=name) -def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate, - staircase=False, name=None): +@tf_export("train.inverse_time_decay") +def inverse_time_decay(learning_rate, + global_step, + decay_steps, + decay_rate, + staircase=False, + name=None): """Applies inverse time decay to the initial learning rate. When training a model, it is often recommended to lower the learning rate as @@ -362,13 +385,15 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate, The function returns the decayed learning rate. 
It is computed as: ```python - decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step) + decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / + decay_step) ``` or, if `staircase` is `True`, as: ```python - decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step)) + decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / + decay_step)) ``` Example: decay 1/t with a rate of 0.5: @@ -379,7 +404,8 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate, learning_rate = 0.1 decay_steps = 1.0 decay_rate = 0.5 - learning_rate = tf.train.inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate) + learning_rate = tf.train.inverse_time_decay(learning_rate, global_step, + decay_steps, decay_rate) # Passing global_step to minimize() will increment it at each step. learning_step = ( @@ -424,8 +450,8 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate, return math_ops.div(learning_rate, denom, name=name) -def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, - name=None): +@tf_export("train.cosine_decay") +def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None): """Applies cosine decay to the learning rate. See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent @@ -484,8 +510,14 @@ def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, return math_ops.multiply(learning_rate, decayed) -def cosine_decay_restarts(learning_rate, global_step, first_decay_steps, - t_mul=2.0, m_mul=1.0, alpha=0.0, name=None): +@tf_export("train.cosine_decay_restarts") +def cosine_decay_restarts(learning_rate, + global_step, + first_decay_steps, + t_mul=2.0, + m_mul=1.0, + alpha=0.0, + name=None): """Applies cosine decay with restarts to the learning rate. 
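The reflowed `inverse_time_decay` docstring above spells out the formula; in plain Python it is simply:

```python
import math

def inverse_time_decay(lr, global_step, decay_steps, decay_rate, staircase=False):
    p = global_step / float(decay_steps)
    if staircase:
        p = math.floor(p)
    return lr / (1.0 + decay_rate * p)

# Matches the docstring example: lr=0.1, decay_steps=1.0, decay_rate=0.5.
assert inverse_time_decay(0.1, 4, 1.0, 0.5) == 0.1 / 3.0
```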
See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent @@ -532,10 +564,9 @@ def cosine_decay_restarts(learning_rate, global_step, first_decay_steps, """ if global_step is None: raise ValueError("cosine decay restarts requires global_step") - with ops.name_scope(name, "SGDRDecay", - [learning_rate, global_step]) as name: - learning_rate = ops.convert_to_tensor(learning_rate, - name="initial_learning_rate") + with ops.name_scope(name, "SGDRDecay", [learning_rate, global_step]) as name: + learning_rate = ops.convert_to_tensor( + learning_rate, name="initial_learning_rate") dtype = learning_rate.dtype global_step = math_ops.cast(global_step, dtype) first_decay_steps = math_ops.cast(first_decay_steps, dtype) @@ -547,11 +578,12 @@ def cosine_decay_restarts(learning_rate, global_step, first_decay_steps, def compute_step(completed_fraction, geometric=False): if geometric: - i_restart = math_ops.floor(math_ops.log(1.0 - completed_fraction * ( - 1.0 - t_mul)) / math_ops.log(t_mul)) + i_restart = math_ops.floor( + math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) / + math_ops.log(t_mul)) - sum_r = (1.0 - t_mul ** i_restart) / (1.0 - t_mul) - completed_fraction = (completed_fraction - sum_r) / t_mul ** i_restart + sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul) + completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart else: i_restart = math_ops.floor(completed_fraction) @@ -564,16 +596,21 @@ def cosine_decay_restarts(learning_rate, global_step, first_decay_steps, lambda: compute_step(completed_fraction, geometric=False), lambda: compute_step(completed_fraction, geometric=True)) - m_fac = m_mul ** i_restart - cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos( - constant_op.constant(math.pi) * completed_fraction)) + m_fac = m_mul**i_restart + cosine_decayed = 0.5 * m_fac * ( + 1.0 + math_ops.cos(constant_op.constant(math.pi) * completed_fraction)) decayed = (1 - alpha) * cosine_decayed + alpha return math_ops.multiply(learning_rate, decayed, name=name) -def linear_cosine_decay(learning_rate, global_step, decay_steps, - num_periods=0.5, alpha=0.0, beta=0.001, +@tf_export("train.linear_cosine_decay") +def linear_cosine_decay(learning_rate, + global_step, + decay_steps, + num_periods=0.5, + alpha=0.0, + beta=0.001, name=None): """Applies linear cosine decay to the learning rate. @@ -651,9 +688,15 @@ def linear_cosine_decay(learning_rate, global_step, decay_steps, return math_ops.multiply(learning_rate, linear_cosine_decayed, name=name) -def noisy_linear_cosine_decay(learning_rate, global_step, decay_steps, - initial_variance=1.0, variance_decay=0.55, - num_periods=0.5, alpha=0.0, beta=0.001, +@tf_export("train.noisy_linear_cosine_decay") +def noisy_linear_cosine_decay(learning_rate, + global_step, + decay_steps, + initial_variance=1.0, + variance_decay=0.55, + num_periods=0.5, + alpha=0.0, + beta=0.001, name=None): """Applies noisy linear cosine decay to the learning rate. 
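The `compute_step` helper in the `cosine_decay_restarts` hunk above decides which restart period a step falls into. When `t_mul != 1` the period lengths form a geometric series, which is why the restart index comes from a logarithm. A small plain-Python sketch of that bookkeeping with a worked check:

```python
import math

def restart_position(completed_fraction, t_mul):
    """Restart index and fraction completed within that restart (t_mul != 1)."""
    i_restart = math.floor(
        math.log(1.0 - completed_fraction * (1.0 - t_mul)) / math.log(t_mul))
    sum_r = (1.0 - t_mul ** i_restart) / (1.0 - t_mul)          # periods already finished
    within = (completed_fraction - sum_r) / t_mul ** i_restart  # progress in current period
    return i_restart, within

# With t_mul=2 the periods span fractions [0,1), [1,3), [3,7), ...; a completed
# fraction of 1.5 is therefore a quarter of the way through the second period.
assert restart_position(1.5, 2.0) == (1, 0.25)
```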
@@ -734,8 +777,8 @@ def noisy_linear_cosine_decay(learning_rate, global_step, decay_steps, math_ops.pow(1.0 + global_step, variance_decay)) std = math_ops.sqrt(variance) noisy_linear_decayed = ( - linear_decayed + random_ops.random_normal( - linear_decayed.shape, stddev=std)) + linear_decayed + + random_ops.random_normal(linear_decayed.shape, stddev=std)) completed_fraction = global_step / decay_steps fraction = 2.0 * num_periods * completed_fraction diff --git a/tensorflow/python/training/momentum.py b/tensorflow/python/training/momentum.py index cf9530d87c4..bd9fa79d8fe 100644 --- a/tensorflow/python/training/momentum.py +++ b/tensorflow/python/training/momentum.py @@ -22,8 +22,10 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import math_ops from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.MomentumOptimizer") class MomentumOptimizer(optimizer.Optimizer): """Optimizer that implements the Momentum algorithm. diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py index fa3517db27b..6c5c9e01a76 100644 --- a/tensorflow/python/training/monitored_session.py +++ b/tensorflow/python/training/monitored_session.py @@ -41,6 +41,7 @@ from tensorflow.python.training import queue_runner from tensorflow.python.training import saver as training_saver from tensorflow.python.training import session_manager as sm from tensorflow.python.training import session_run_hook +from tensorflow.python.util.tf_export import tf_export # The list of exceptions that we should recover from. Exceptions not in this @@ -52,6 +53,7 @@ _PREEMPTION_ERRORS = (errors.AbortedError, errors.UnavailableError) USE_DEFAULT = object() +@tf_export('train.Scaffold') class Scaffold(object): """Structure to create or gather pieces commonly needed to train a model. @@ -272,6 +274,7 @@ class Scaffold(object): resources.initialize_resources(resources.local_resources())) +@tf_export('train.MonitoredTrainingSession') def MonitoredTrainingSession(master='', # pylint: disable=invalid-name is_chief=True, checkpoint_dir=None, @@ -381,6 +384,7 @@ def MonitoredTrainingSession(master='', # pylint: disable=invalid-name stop_grace_period_secs=stop_grace_period_secs) +@tf_export('train.SessionCreator') class SessionCreator(object): """A factory for tf.Session.""" @@ -390,6 +394,7 @@ class SessionCreator(object): 'create_session is not implemented for {}.'.format(self)) +@tf_export('train.ChiefSessionCreator') class ChiefSessionCreator(SessionCreator): """Creates a tf.Session for a chief.""" @@ -441,6 +446,7 @@ class ChiefSessionCreator(SessionCreator): init_fn=self._scaffold.init_fn) +@tf_export('train.WorkerSessionCreator') class WorkerSessionCreator(SessionCreator): """Creates a tf.Session for a worker.""" @@ -706,6 +712,7 @@ class _MonitoredSession(object): return self._coordinated_creator.tf_sess +@tf_export('train.MonitoredSession') class MonitoredSession(_MonitoredSession): """Session-like object that handles initialization, recovery and hooks. @@ -788,6 +795,7 @@ class MonitoredSession(_MonitoredSession): stop_grace_period_secs=stop_grace_period_secs) +@tf_export('train.SingularMonitoredSession') class SingularMonitoredSession(_MonitoredSession): """Session-like object that handles initialization, restoring, and hooks. 
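`MonitoredTrainingSession` and the hooks exported earlier in this change are typically combined as below; a minimal TF 1.x-style sketch with a toy loss and no checkpoint directory (all names are illustrative):

```python
import tensorflow as tf

global_step = tf.train.get_or_create_global_step()
weights = tf.Variable([1.0, 2.0])
loss = tf.reduce_sum(tf.square(weights - 3.0))
train_op = tf.train.GradientDescentOptimizer(0.1).minimize(
    loss, global_step=global_step)

hooks = [tf.train.StopAtStepHook(last_step=100),   # stop after 100 global steps
         tf.train.NanTensorHook(loss)]             # abort if the loss becomes NaN

with tf.train.MonitoredTrainingSession(hooks=hooks) as sess:
    while not sess.should_stop():
        sess.run(train_op)
```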
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py index 43ed1ac170d..2d89082ad75 100644 --- a/tensorflow/python/training/moving_averages.py +++ b/tensorflow/python/training/moving_averages.py @@ -26,6 +26,7 @@ from tensorflow.python.ops import state_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.training import slot_creator +from tensorflow.python.util.tf_export import tf_export # TODO(touts): switch to variables.Variable. @@ -230,6 +231,7 @@ def _zero_debias(unbiased_var, value, decay): return unbiased_ema_delta +@tf_export("train.ExponentialMovingAverage") class ExponentialMovingAverage(object): """Maintains moving averages of variables by employing an exponential decay. diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py index 719b83e5ca5..9ec588bac96 100644 --- a/tensorflow/python/training/optimizer.py +++ b/tensorflow/python/training/optimizer.py @@ -36,6 +36,7 @@ from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.training import slot_creator from tensorflow.python.util import nest +from tensorflow.python.util.tf_export import tf_export def _get_variable_for(v): @@ -187,6 +188,7 @@ def _get_processor(v): raise NotImplementedError("Trying to optimize unsupported type ", v) +@tf_export("train.Optimizer") class Optimizer(object): """Base class for optimizers. @@ -533,7 +535,15 @@ class Optimizer(object): else: with ops.control_dependencies([self._finish(update_ops, "update")]): with ops.colocate_with(global_step): - apply_updates = state_ops.assign_add(global_step, 1, name=name) + if isinstance(global_step, resource_variable_ops.ResourceVariable): + # TODO(apassos): the implicit read in assign_add is slow; consider + # making it less so. + apply_updates = resource_variable_ops.assign_add_variable_op( + global_step.handle, + ops.convert_to_tensor(1, dtype=global_step.dtype), + name=name) + else: + apply_updates = state_ops.assign_add(global_step, 1, name=name) if context.in_graph_mode(): if isinstance(apply_updates, ops.Tensor): @@ -592,7 +602,7 @@ class Optimizer(object): if executing_eagerly: # No variable.op in eager mode. We don't expect lots of eager graphs, # but behavior should be consistent with graph mode. - return variable._container_prefix == current_graph._container_prefix # pylint: disable=protected-access + return variable._graph_key == current_graph._graph_key # pylint: disable=protected-access else: return variable.op.graph is current_graph diff --git a/tensorflow/python/training/proximal_adagrad.py b/tensorflow/python/training/proximal_adagrad.py index da31ab325d5..9bd677b8efc 100644 --- a/tensorflow/python/training/proximal_adagrad.py +++ b/tensorflow/python/training/proximal_adagrad.py @@ -23,8 +23,10 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import math_ops from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.ProximalAdagradOptimizer") class ProximalAdagradOptimizer(optimizer.Optimizer): # pylint: disable=line-too-long """Optimizer that implements the Proximal Adagrad algorithm. 
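`ExponentialMovingAverage` (exported above) keeps one shadow value per variable with the standard update rule; a plain-Python sketch of the rule and its warm-up behaviour toward a constant target:

```python
def ema_update(shadow, value, decay=0.999):
    # shadow <- decay * shadow + (1 - decay) * value
    return decay * shadow + (1.0 - decay) * value

shadow = 0.0
for _ in range(3):
    shadow = ema_update(shadow, 1.0, decay=0.9)
# After three updates toward a constant 1.0 the shadow is 1 - 0.9**3 = 0.271.
assert abs(shadow - 0.271) < 1e-12
```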
diff --git a/tensorflow/python/training/proximal_gradient_descent.py b/tensorflow/python/training/proximal_gradient_descent.py index 53e9dc2ef2c..369b6cbb50e 100644 --- a/tensorflow/python/training/proximal_gradient_descent.py +++ b/tensorflow/python/training/proximal_gradient_descent.py @@ -24,8 +24,10 @@ from tensorflow.python.ops import math_ops # pylint: enable=unused-import from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.ProximalGradientDescentOptimizer") class ProximalGradientDescentOptimizer(optimizer.Optimizer): # pylint: disable=line-too-long """Optimizer that implements the proximal gradient descent algorithm. diff --git a/tensorflow/python/training/queue_runner_impl.py b/tensorflow/python/training/queue_runner_impl.py index 4e7c81d7b29..07afba79abf 100644 --- a/tensorflow/python/training/queue_runner_impl.py +++ b/tensorflow/python/training/queue_runner_impl.py @@ -27,8 +27,10 @@ from tensorflow.python.eager import context from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.queue_runner.QueueRunner", "train.QueueRunner") class QueueRunner(object): """Holds a list of enqueue operations for a queue, each to be run in a thread. @@ -384,6 +386,7 @@ class QueueRunner(object): import_scope=import_scope) +@tf_export("train.queue_runner.add_queue_runner", "train.add_queue_runner") def add_queue_runner(qr, collection=ops.GraphKeys.QUEUE_RUNNERS): """Adds a `QueueRunner` to a collection in the graph. @@ -402,6 +405,8 @@ def add_queue_runner(qr, collection=ops.GraphKeys.QUEUE_RUNNERS): ops.add_to_collection(collection, qr) +@tf_export("train.queue_runner.start_queue_runners", + "train.start_queue_runners") def start_queue_runners(sess=None, coord=None, daemon=True, start=True, collection=ops.GraphKeys.QUEUE_RUNNERS): """Starts all queue runners collected in the graph. diff --git a/tensorflow/python/training/rmsprop.py b/tensorflow/python/training/rmsprop.py index ebec725b7b9..89d1099a49f 100644 --- a/tensorflow/python/training/rmsprop.py +++ b/tensorflow/python/training/rmsprop.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """One-line documentation for rmsprop module. rmsprop algorithm [tieleman2012rmsprop] @@ -47,12 +46,15 @@ from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.RMSPropOptimizer") class RMSPropOptimizer(optimizer.Optimizer): """Optimizer that implements the RMSProp algorithm. - See the [paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf). + See the + [paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf). 
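For context on the `RMSPropOptimizer` hunk above, a plain-Python sketch of the conventional RMSProp update for a single scalar weight; the epsilon placement here follows the common formulation and is illustrative rather than a transcription of the kernels:

```python
import math

def rmsprop_step(w, g, ms, mom, lr=0.001, decay=0.9, momentum=0.0, epsilon=1e-10):
    """One RMSProp update for scalar weight w with gradient g."""
    ms = decay * ms + (1.0 - decay) * g * g           # running mean of squared grads
    mom = momentum * mom + lr * g / math.sqrt(ms + epsilon)
    return w - mom, ms, mom
```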
""" def __init__(self, @@ -113,13 +115,12 @@ class RMSPropOptimizer(optimizer.Optimizer): self._zeros_slot(v, "momentum", self._name) def _prepare(self): - self._learning_rate_tensor = ops.convert_to_tensor(self._learning_rate, - name="learning_rate") + self._learning_rate_tensor = ops.convert_to_tensor( + self._learning_rate, name="learning_rate") self._decay_tensor = ops.convert_to_tensor(self._decay, name="decay") - self._momentum_tensor = ops.convert_to_tensor(self._momentum, - name="momentum") - self._epsilon_tensor = ops.convert_to_tensor(self._epsilon, - name="epsilon") + self._momentum_tensor = ops.convert_to_tensor( + self._momentum, name="momentum") + self._epsilon_tensor = ops.convert_to_tensor(self._epsilon, name="epsilon") def _apply_dense(self, grad, var): rms = self.get_slot(var, "rms") diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py index 4f3773c0fc7..764f8400122 100644 --- a/tensorflow/python/training/saver.py +++ b/tensorflow/python/training/saver.py @@ -53,6 +53,7 @@ from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import training_util from tensorflow.python.training.checkpoint_state_pb2 import CheckpointState from tensorflow.python.util import compat +from tensorflow.python.util.tf_export import tf_export # Op names which identify variable reads which should be saved. @@ -889,6 +890,7 @@ def _GetCheckpointFilename(save_dir, latest_filename): return os.path.join(save_dir, latest_filename) +@tf_export("train.generate_checkpoint_state_proto") def generate_checkpoint_state_proto(save_dir, model_checkpoint_path, all_model_checkpoint_paths=None): @@ -933,6 +935,7 @@ def generate_checkpoint_state_proto(save_dir, return coord_checkpoint_proto +@tf_export("train.update_checkpoint_state") def update_checkpoint_state(save_dir, model_checkpoint_path, all_model_checkpoint_paths=None, @@ -1025,6 +1028,7 @@ def _update_checkpoint_state(save_dir, text_format.MessageToString(ckpt)) +@tf_export("train.get_checkpoint_state") def get_checkpoint_state(checkpoint_dir, latest_filename=None): """Returns CheckpointState proto from the "checkpoint" file. @@ -1082,6 +1086,7 @@ def get_checkpoint_state(checkpoint_dir, latest_filename=None): return ckpt +@tf_export("train.Saver") class Saver(object): """Saves and restores variables. @@ -1229,7 +1234,7 @@ class Saver(object): The `saver_def` proto should be the one returned by the `as_saver_def()` call of the `Saver` that was created for that `Graph`. builder: Optional `SaverBuilder` to use if a `saver_def` was not provided. - Defaults to `BaseSaverBuilder()`. + Defaults to `BulkSaverBuilder()`. defer_build: If `True`, defer adding the save and restore ops to the `build()` call. In that case `build()` should be called before finalizing the graph or using the saver. @@ -1309,7 +1314,7 @@ class Saver(object): if not self.saver_def or context.in_eager_mode(): if self._builder is None: - self._builder = BaseSaverBuilder(self._write_version) + self._builder = BulkSaverBuilder(self._write_version) if self._var_list is None: # pylint: disable=protected-access @@ -1788,6 +1793,7 @@ def _prefix_to_checkpoint_path(prefix, format_version): return prefix # Just the data file. +@tf_export("train.latest_checkpoint") def latest_checkpoint(checkpoint_dir, latest_filename=None): """Finds the filename of latest saved checkpoint file. 
@@ -1817,6 +1823,7 @@ def latest_checkpoint(checkpoint_dir, latest_filename=None): return None +@tf_export("train.import_meta_graph") def import_meta_graph(meta_graph_or_file, clear_devices=False, import_scope=None, **kwargs): """Recreates a Graph saved in a `MetaGraphDef` proto. @@ -1918,6 +1925,7 @@ def import_meta_graph(meta_graph_or_file, clear_devices=False, return None +@tf_export("train.export_meta_graph") def export_meta_graph(filename=None, meta_info_def=None, graph_def=None, @@ -1994,6 +2002,7 @@ def export_meta_graph(filename=None, return meta_graph_def +@tf_export("train.checkpoint_exists") def checkpoint_exists(checkpoint_prefix): """Checks whether a V1 or V2 checkpoint exists with the specified prefix. @@ -2018,6 +2027,7 @@ def checkpoint_exists(checkpoint_prefix): return False +@tf_export("train.get_checkpoint_mtimes") def get_checkpoint_mtimes(checkpoint_prefixes): """Returns the mtimes (modification timestamps) of the checkpoints. diff --git a/tensorflow/python/training/server_lib.py b/tensorflow/python/training/server_lib.py index 29da67a30a5..2f421d1cc0a 100644 --- a/tensorflow/python/training/server_lib.py +++ b/tensorflow/python/training/server_lib.py @@ -23,6 +23,7 @@ from tensorflow.core.protobuf import tensorflow_server_pb2 from tensorflow.python import pywrap_tensorflow from tensorflow.python.framework import errors from tensorflow.python.util import compat +from tensorflow.python.util.tf_export import tf_export def _make_server_def(server_or_cluster_def, job_name, task_index, protocol, @@ -92,6 +93,7 @@ def _make_server_def(server_or_cluster_def, job_name, task_index, protocol, return server_def +@tf_export("train.Server") class Server(object): """An in-process TensorFlow server, for use in distributed training. @@ -221,6 +223,7 @@ class Server(object): start=start) +@tf_export("train.ClusterSpec") class ClusterSpec(object): """Represents a cluster as a set of "tasks", organized into "jobs". diff --git a/tensorflow/python/training/session_manager.py b/tensorflow/python/training/session_manager.py index b396a1e7d0a..360e02fb44c 100644 --- a/tensorflow/python/training/session_manager.py +++ b/tensorflow/python/training/session_manager.py @@ -25,6 +25,7 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import saver as saver_mod +from tensorflow.python.util.tf_export import tf_export def _maybe_name(obj): @@ -44,6 +45,7 @@ def _maybe_name(obj): return "" % type(obj) +@tf_export("train.SessionManager") class SessionManager(object): """Training helper that restores from checkpoint and creates session. 
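`tf.train.ClusterSpec` and `tf.train.Server` (exported above) describe and join a cluster; a two-job sketch with illustrative host names, created with `start=False` so nothing binds a port here:

```python
import tensorflow as tf

cluster = tf.train.ClusterSpec({
    "ps": ["ps0.example.com:2222"],
    "worker": ["worker0.example.com:2222", "worker1.example.com:2222"],
})

# This process would act as the first worker; call server.start() / join() for real use.
server = tf.train.Server(cluster, job_name="worker", task_index=0, start=False)
```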
diff --git a/tensorflow/python/training/session_run_hook.py b/tensorflow/python/training/session_run_hook.py index 5b023d8a267..89f40300650 100644 --- a/tensorflow/python/training/session_run_hook.py +++ b/tensorflow/python/training/session_run_hook.py @@ -96,8 +96,10 @@ from __future__ import division from __future__ import print_function import collections +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.SessionRunHook") class SessionRunHook(object): """Hook to extend calls to MonitoredSession.run().""" @@ -189,6 +191,7 @@ class SessionRunHook(object): pass +@tf_export("train.SessionRunArgs") class SessionRunArgs( collections.namedtuple("SessionRunArgs", ["fetches", "feed_dict", "options"])): @@ -213,6 +216,7 @@ class SessionRunArgs( return super(SessionRunArgs, cls).__new__(cls, fetches, feed_dict, options) +@tf_export("train.SessionRunContext") class SessionRunContext(object): """Provides information about the `session.run()` call being made. @@ -264,6 +268,7 @@ class SessionRunContext(object): self._stop_requested = True +@tf_export("train.SessionRunValues") class SessionRunValues( collections.namedtuple("SessionRunValues", ["results", "options", "run_metadata"])): diff --git a/tensorflow/python/training/supervisor.py b/tensorflow/python/training/supervisor.py index e4514aaea22..d2ad34773e0 100644 --- a/tensorflow/python/training/supervisor.py +++ b/tensorflow/python/training/supervisor.py @@ -37,8 +37,10 @@ from tensorflow.python.training import saver as saver_mod from tensorflow.python.training import session_manager as session_manager_mod from tensorflow.python.training import training_util from tensorflow.python.util import deprecation +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.Supervisor") class Supervisor(object): """A training helper that checkpoints models and computes summaries. diff --git a/tensorflow/python/training/sync_replicas_optimizer.py b/tensorflow/python/training/sync_replicas_optimizer.py index 47702fdad05..0c6cf910d1a 100644 --- a/tensorflow/python/training/sync_replicas_optimizer.py +++ b/tensorflow/python/training/sync_replicas_optimizer.py @@ -31,6 +31,7 @@ from tensorflow.python.training import optimizer from tensorflow.python.training import queue_runner from tensorflow.python.training import session_manager from tensorflow.python.training import session_run_hook +from tensorflow.python.util.tf_export import tf_export # Please note that the gradients from replicas are averaged instead of summed @@ -38,6 +39,7 @@ from tensorflow.python.training import session_run_hook # rate according to the number of replicas. This change is introduced to be # consistent with how gradients are aggregated (averaged) within a batch in a # replica. +@tf_export("train.SyncReplicasOptimizer") class SyncReplicasOptimizer(optimizer.Optimizer): """Class to synchronize, aggregate gradients and pass them to the optimizer. 
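The `SessionRunHook` protocol exported above is the extension point behind every hook in this change; a minimal custom hook sketch (the class name and behaviour are illustrative):

```python
import tensorflow as tf

class StepLoggerHook(tf.train.SessionRunHook):
    """Fetches the global step on every run and prints it afterwards."""

    def begin(self):
        self._global_step = tf.train.get_or_create_global_step()

    def before_run(self, run_context):
        # Ask the wrapped session.run() to also fetch the global step.
        return tf.train.SessionRunArgs(self._global_step)

    def after_run(self, run_context, run_values):
        print("global step:", run_values.results)
```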
diff --git a/tensorflow/python/training/training_util.py b/tensorflow/python/training/training_util.py index 89a9e129328..499f1feb2db 100644 --- a/tensorflow/python/training/training_util.py +++ b/tensorflow/python/training/training_util.py @@ -29,6 +29,7 @@ from tensorflow.python.ops import state_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.util.tf_export import tf_export # Picked a long key value to minimize the chance of collision with user defined @@ -40,6 +41,7 @@ GLOBAL_STEP_READ_KEY = 'global_step_read_op_cache' write_graph = graph_io.write_graph +@tf_export('train.global_step') def global_step(sess, global_step_tensor): """Small helper to get the global step. @@ -67,6 +69,7 @@ def global_step(sess, global_step_tensor): return int(sess.run(global_step_tensor)) +@tf_export('train.get_global_step') def get_global_step(graph=None): """Get the global step tensor. @@ -101,6 +104,7 @@ def get_global_step(graph=None): return global_step_tensor +@tf_export('train.create_global_step') def create_global_step(graph=None): """Create global step tensor in graph. @@ -139,6 +143,7 @@ def create_global_step(graph=None): ops.GraphKeys.GLOBAL_STEP]) +@tf_export('train.get_or_create_global_step') def get_or_create_global_step(graph=None): """Returns and create (if necessary) the global step tensor. @@ -156,6 +161,7 @@ def get_or_create_global_step(graph=None): return global_step_tensor +@tf_export('train.assert_global_step') def assert_global_step(global_step_tensor): """Asserts `global_step_tensor` is a scalar int `Variable` or `Tensor`. diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py index 270d96a3c7c..7e5f192b8f1 100644 --- a/tensorflow/python/util/compat.py +++ b/tensorflow/python/util/compat.py @@ -41,8 +41,10 @@ import numpy as _np import six as _six from tensorflow.python.util.all_util import remove_undocumented +from tensorflow.python.util.tf_export import tf_export +@tf_export('compat.as_bytes', 'compat.as_str') def as_bytes(bytes_or_text, encoding='utf-8'): """Converts either bytes or unicode to `bytes`, using utf-8 encoding for text. @@ -65,6 +67,7 @@ def as_bytes(bytes_or_text, encoding='utf-8'): (bytes_or_text,)) +@tf_export('compat.as_text') def as_text(bytes_or_text, encoding='utf-8'): """Returns the given argument as a unicode string. @@ -93,6 +96,7 @@ else: as_str = as_text +@tf_export('compat.as_str_any') def as_str_any(value): """Converts to `str` as `str(value)`, but use `as_str` for `bytes`. @@ -125,11 +129,16 @@ def path_to_str(path): # Numpy 1.8 scalars don't inherit from numbers.Integral in Python 3, so we # need to check them specifically. The same goes from Real and Complex. integral_types = (_numbers.Integral, _np.integer) +tf_export('compat.integral_types').export_constant(__name__, 'integral_types') real_types = (_numbers.Real, _np.integer, _np.floating) +tf_export('compat.real_types').export_constant(__name__, 'real_types') complex_types = (_numbers.Complex, _np.number) +tf_export('compat.complex_types').export_constant(__name__, 'complex_types') # Either bytes or text. 
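`compat.as_bytes`, `as_text`, and `as_str_any` (now exported under `tf.compat`) are the small bytes/text shims used throughout the codebase; a quick sketch of what they return:

```python
from tensorflow.python.util import compat

assert compat.as_bytes(u"tensor") == b"tensor"
assert compat.as_text(b"tensor") == u"tensor"
assert compat.as_str_any(42) == "42"
```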
bytes_or_text_types = (bytes, _six.text_type) +tf_export('compat.bytes_or_text_types').export_constant(__name__, + 'bytes_or_text_types') _allowed_symbols = [ 'as_str', diff --git a/tensorflow/python/util/deprecation.py b/tensorflow/python/util/deprecation.py index 8a66f0435a8..fbec8fd2d8e 100644 --- a/tensorflow/python/util/deprecation.py +++ b/tensorflow/python/util/deprecation.py @@ -22,6 +22,7 @@ import collections import functools import re +from tensorflow.python.eager import context from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import decorator_utils from tensorflow.python.util import tf_contextlib @@ -38,13 +39,14 @@ _PRINTED_WARNING = {} def _add_deprecated_function_notice_to_docstring(doc, date, instructions): """Adds a deprecation notice to a docstring for deprecated functions.""" + main_text = ['THIS FUNCTION IS DEPRECATED. It will be removed %s.' % + ('in a future version' if date is None else ('after %s' % date))] + if instructions: + main_text.append('Instructions for updating:') return decorator_utils.add_notice_to_docstring( doc, instructions, 'DEPRECATED FUNCTION', - '(deprecated)', [ - 'THIS FUNCTION IS DEPRECATED. It will be removed %s.' % ( - 'in a future version' if date is None else ('after %s' % date)), - 'Instructions for updating:']) + '(deprecated)', main_text) def _add_deprecated_arg_notice_to_docstring(doc, date, instructions): @@ -66,23 +68,135 @@ def _validate_deprecation_args(date, instructions): raise ValueError('Don\'t deprecate things without conversion instructions!') -def _call_location(): +def _call_location(outer=False): """Returns call location given level up from current call.""" frame = tf_inspect.currentframe() if frame: # CPython internals are available, use them for performance. # walk back two frames to get to deprecated function caller. - first_frame = frame.f_back - second_frame = first_frame.f_back - frame = second_frame if second_frame else first_frame + frame = frame.f_back + if frame.f_back: + frame = frame.f_back + if outer and frame.f_back: + frame = frame.f_back return '%s:%d' % (frame.f_code.co_filename, frame.f_lineno) else: # Slow fallback path stack = tf_inspect.stack(0) # 0 avoids generating unused context - entry = stack[2] + entry = stack[3 if outer else 2] return '%s:%d' % (entry[1], entry[2]) +def deprecated_alias(deprecated_name, name, func_or_class, warn_once=True): + """Deprecate a symbol in favor of a new name with identical semantics. + + This function is meant to be used when defining a backwards-compatibility + alias for a symbol which has been moved. For example: + + module1.py: + ```python + class NewNameForClass: pass + ``` + + module2.py: + ```python + import module1 + + DeprecatedNameForClass = deprecated_alias( + deprecated_name='module2.DeprecatedNameForClass', + name='module1.NewNameForClass', + module1.NewNameForClass) + ``` + + This function works for classes and functions. + + For classes, it creates a new class which is functionally identical (it + inherits from the original, and overrides its constructor), but which prints + a deprecation warning when an instance is created. It also adds a deprecation + notice to the class' docstring. + + For functions, it returns a function wrapped by `tf_decorator.make_decorator`. + That function prints a warning when used, and has a deprecation notice in its + docstring. 
This is more or less equivalent (the deprecation warning has + slightly different text) to writing: + + ```python + @deprecated + def deprecated_alias(original_args): + real_function(original_args) + ``` + + Args: + deprecated_name: The name of the symbol that is being deprecated, to be used + in the warning message. This should be its fully qualified name to avoid + confusion. + name: The name of the symbol that is to be used instead of the deprecated + name. This should be a fully qualified name to avoid confusion. + func_or_class: The (non-deprecated) class or function for which a deprecated + alias should be created. + warn_once: If True (the default), only print a deprecation warning the first + time this function is used, or the class is instantiated. + + Returns: + A wrapped version of `func_or_class` which prints a deprecation warning on + use and has a modified docstring. + """ + if tf_inspect.isclass(func_or_class): + + # Make a new class with __init__ wrapped in a warning. + class NewClass(func_or_class): # pylint: disable=missing-docstring + __doc__ = decorator_utils.add_notice_to_docstring( + func_or_class.__doc__, 'Please use %s instead.' % name, + 'DEPRECATED CLASS', + '(deprecated)', ['THIS CLASS IS DEPRECATED. ' + 'It will be removed in a future version. ']) + __name__ = func_or_class.__name__ + __module__ = _call_location(outer=True) + + def __init__(self, *args, **kwargs): + if hasattr(NewClass.__init__, '__func__'): + # Python 2 + NewClass.__init__.__func__.__doc__ = func_or_class.__init__.__doc__ + else: + # Python 3 + NewClass.__init__.__doc__ = func_or_class.__init__.__doc__ + + if _PRINT_DEPRECATION_WARNINGS: + # We're making the alias as we speak. The original may have other + # aliases, so we cannot use it to check for whether it's already been + # warned about. + if NewClass.__init__ not in _PRINTED_WARNING: + if warn_once: + _PRINTED_WARNING[NewClass.__init__] = True + logging.warning( + 'From %s: The name %s is deprecated. Please use %s instead.\n', + _call_location(), deprecated_name, name) + super(NewClass, self).__init__(*args, **kwargs) + + return NewClass + else: + decorator_utils.validate_callable(func_or_class, 'deprecated') + + # Make a wrapper for the original + @functools.wraps(func_or_class) + def new_func(*args, **kwargs): # pylint: disable=missing-docstring + if _PRINT_DEPRECATION_WARNINGS: + # We're making the alias as we speak. The original may have other + # aliases, so we cannot use it to check for whether it's already been + # warned about. + if new_func not in _PRINTED_WARNING: + if warn_once: + _PRINTED_WARNING[new_func] = True + logging.warning( + 'From %s: The name %s is deprecated. Please use %s instead.\n', + _call_location(), deprecated_name, name) + return func_or_class(*args, **kwargs) + return tf_decorator.make_decorator( + func_or_class, new_func, 'deprecated', + _add_deprecated_function_notice_to_docstring( + func_or_class.__doc__, None, 'Please use %s instead.' % name)) + + def deprecated(date, instructions, warn_once=True): """Decorator for marking functions or methods deprecated. @@ -284,7 +398,9 @@ def deprecated_args(date, instructions, *deprecated_arg_names_or_tuples, @functools.wraps(func) def new_func(*args, **kwargs): """Deprecation wrapper.""" - if _PRINT_DEPRECATION_WARNINGS: + # TODO(apassos) figure out a way to have reasonable performance with + # deprecation warnings and eager mode. 
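Usage sketch for the new `deprecated_alias` helper above, mirroring its test; the alias names are illustrative, and `func_or_class` is the third positional argument:

```python
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util import deprecation

# Re-export an existing function under its old name; the first call through the
# alias logs a "name is deprecated" warning, then forwards to the real function.
old_error = deprecation.deprecated_alias("deprecated.old_error",
                                         "tf.logging.error",
                                         logging.error)
old_error("this message is logged after a one-time deprecation warning")
```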
+ if context.in_graph_mode() and _PRINT_DEPRECATION_WARNINGS: invalid_args = [] named_args = tf_inspect.getcallargs(func, *args, **kwargs) for arg_name, spec in iter(deprecated_positions.items()): diff --git a/tensorflow/python/util/deprecation_test.py b/tensorflow/python/util/deprecation_test.py index e61edb5cfa3..bdd0bc48d29 100644 --- a/tensorflow/python/util/deprecation_test.py +++ b/tensorflow/python/util/deprecation_test.py @@ -24,6 +24,56 @@ from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import deprecation +class DeprecatedAliasTest(test.TestCase): + + @test.mock.patch.object(logging, "warning", autospec=True) + def test_function_alias(self, mock_warning): + deprecated_func = deprecation.deprecated_alias("deprecated.func", + "real.func", + logging.error) + + logging.error("fake error logged") + self.assertEqual(0, mock_warning.call_count) + deprecated_func("FAKE ERROR!") + self.assertEqual(1, mock_warning.call_count) + # Make sure the error points to the right file. + self.assertRegexpMatches(mock_warning.call_args[0][1], + r"deprecation_test\.py:") + deprecated_func("ANOTHER FAKE ERROR!") + self.assertEqual(1, mock_warning.call_count) + + @test.mock.patch.object(logging, "warning", autospec=True) + def test_class_alias(self, mock_warning): + class MyClass(object): + """My docstring.""" + + init_args = [] + + def __init__(self, arg): + MyClass.init_args.append(arg) + + deprecated_cls = deprecation.deprecated_alias("deprecated.cls", + "real.cls", + MyClass) + + print(deprecated_cls.__name__) + print(deprecated_cls.__module__) + print(deprecated_cls.__doc__) + + MyClass("test") + self.assertEqual(0, mock_warning.call_count) + deprecated_cls("deprecated") + self.assertEqual(1, mock_warning.call_count) + # Make sure the error points to the right file. + self.assertRegexpMatches(mock_warning.call_args[0][1], + r"deprecation_test\.py:") + deprecated_cls("deprecated again") + self.assertEqual(1, mock_warning.call_count) + + self.assertEqual(["test", "deprecated", "deprecated again"], + MyClass.init_args) + + class DeprecationTest(test.TestCase): @test.mock.patch.object(logging, "warning", autospec=True) diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py index 874df3d1087..c8525ed4203 100644 --- a/tensorflow/python/util/nest.py +++ b/tensorflow/python/util/nest.py @@ -532,8 +532,8 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True): (list(_six.iterkeys(input_tree)), list(_six.iterkeys(shallow_tree)))) - input_tree = list(_six.iteritems(input_tree)) - shallow_tree = list(_six.iteritems(shallow_tree)) + input_tree = list(sorted(_six.iteritems(input_tree))) + shallow_tree = list(sorted(_six.iteritems(shallow_tree))) for shallow_branch, input_branch in zip(shallow_tree, input_tree): assert_shallow_structure(shallow_branch, input_branch, diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py index 6bec397db57..8aaf799fd05 100644 --- a/tensorflow/python/util/nest_test.py +++ b/tensorflow/python/util/nest_test.py @@ -425,6 +425,10 @@ class NestTest(test.TestCase): with self.assertRaisesRegexp(ValueError, expected_message): nest.assert_shallow_structure(inp_ab2, inp_ab1) + inp_ab = collections.OrderedDict([("a", 1), ("b", (2, 3))]) + inp_ba = collections.OrderedDict([("b", (2, 3)), ("a", 1)]) + nest.assert_shallow_structure(inp_ab, inp_ba) + def testFlattenUpTo(self): # Shallow tree ends at scalar. 
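The `nest.py` change above sorts the `iteritems()` of both dictionaries before zipping them, so two mappings with the same keys but different insertion order are walked key-by-key in the same order, which is exactly what the new `OrderedDict` test case (`inp_ab` vs. `inp_ba`) exercises. A minimal illustration of why the sort matters, outside of `nest` itself and with made-up values:

```python
# Standalone illustration (not the nest implementation): pairing dict items by
# sorted key instead of insertion order makes the structure walk independent of
# how the OrderedDicts were built.
from collections import OrderedDict

inp_ab = OrderedDict([("a", 1), ("b", (2, 3))])
inp_ba = OrderedDict([("b", (2, 3)), ("a", 1)])

# Zipping in insertion order pairs "a" with "b", whose values have different
# shapes, so a structural comparison would fail.
unsorted_pairs = list(zip(inp_ab.items(), inp_ba.items()))
print(unsorted_pairs[0])  # (('a', 1), ('b', (2, 3)))

# Sorting by key first pairs like with like, which is what the patched
# assert_shallow_structure now does before recursing.
sorted_pairs = list(zip(sorted(inp_ab.items()), sorted(inp_ba.items())))
print(sorted_pairs[0])  # (('a', 1), ('a', 1))
```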
input_tree = [[[2, 2], [3, 3]], [[4, 9], [5, 5]]] diff --git a/tensorflow/stream_executor/executor_cache.cc b/tensorflow/stream_executor/executor_cache.cc index a23d6a70ba2..d1a8aae1674 100644 --- a/tensorflow/stream_executor/executor_cache.cc +++ b/tensorflow/stream_executor/executor_cache.cc @@ -23,6 +23,14 @@ namespace gputools { port::StatusOr ExecutorCache::GetOrCreate( const StreamExecutorConfig& config, const std::function& factory) { + // In the fast path case, the cache already has an entry and we can just + // return after Get() which only takes a shared lock and not a unique lock. + // If we need to create, we take a unique lock on cache_. + auto fast_result = Get(config); + if (fast_result.ok()) { + return fast_result; + } + Entry* entry = nullptr; { mutex_lock lock{mutex_}; @@ -59,12 +67,17 @@ port::StatusOr ExecutorCache::Get( const StreamExecutorConfig& config) { Entry* entry = nullptr; { - mutex_lock lock{mutex_}; - entry = &cache_[config.ordinal]; - // Release the map lock; the address of 'entry' is stable because - // std::map guarantees reference stability. + tf_shared_lock lock{mutex_}; + auto it = cache_.find(config.ordinal); + if (it != cache_.end()) { + entry = &it->second; + } else { + return port::Status(port::error::NOT_FOUND, + port::Printf("No executors registered for ordinal %d", + config.ordinal)); + } } - mutex_lock lock{entry->configurations_mutex}; + tf_shared_lock lock{entry->configurations_mutex}; if (entry->configurations.empty()) { return port::Status( port::error::NOT_FOUND, diff --git a/tensorflow/stream_executor/multi_platform_manager.cc b/tensorflow/stream_executor/multi_platform_manager.cc index cc32a6beaa5..f23224ae772 100644 --- a/tensorflow/stream_executor/multi_platform_manager.cc +++ b/tensorflow/stream_executor/multi_platform_manager.cc @@ -45,7 +45,7 @@ namespace gputools { /* static */ port::StatusOr MultiPlatformManager::PlatformWithName( const string& target) { - mutex_lock lock(GetPlatformsMutex()); + tf_shared_lock lock(GetPlatformsMutex()); auto it = GetPlatformMap()->find(port::Lowercase(target)); if (it == GetPlatformMap()->end()) { @@ -59,7 +59,7 @@ namespace gputools { /* static */ port::StatusOr MultiPlatformManager::PlatformWithId( const Platform::Id& id) { - mutex_lock lock(GetPlatformsMutex()); + tf_shared_lock lock(GetPlatformsMutex()); auto it = GetPlatformByIdMap()->find(id); if (it == GetPlatformByIdMap()->end()) { return port::Status( diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD index d1103163959..66bbd572a67 100644 --- a/tensorflow/tools/api/generator/BUILD +++ b/tensorflow/tools/api/generator/BUILD @@ -77,6 +77,16 @@ genrule( "api/nn/rnn_cell/__init__.py", "api/sets/__init__.py", "api/summary/__init__.py", + "api/train/queue_runner/__init__.py", + "api/compat/__init__.py", + "api/data/__init__.py", + "api/estimator/__init__.py", + "api/estimator/export/__init__.py", + "api/estimator/inputs/__init__.py", + "api/feature_column/__init__.py", + "api/losses/__init__.py", + "api/profiler/__init__.py", + "api/python_io/__init__.py", ], cmd = "$(location create_python_api) $(OUTS)", tools = ["create_python_api"], diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt index ab697b1b95b..874a73f661d 100644 --- a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt @@ -21,7 
+21,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\'], " + argspec: "args=[\'self\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'weighted_sum\'], " } member_method { name: "evaluate" diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt index b73f6433e22..8da2a2b6867 100644 --- a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt @@ -21,7 +21,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\'], " + argspec: "args=[\'self\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'weighted_sum\'], " } member_method { name: "evaluate" diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt index dbcc187f945..aa6ac46613f 100644 --- a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt @@ -23,6 +23,10 @@ tf_class { name: "mode" mtype: "" } + member { + name: "prediction_hooks" + mtype: "" + } member { name: "predictions" mtype: "" diff --git a/tensorflow/tools/api/golden/tensorflow.manip.pbtxt b/tensorflow/tools/api/golden/tensorflow.manip.pbtxt new file mode 100644 index 00000000000..0b841652851 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.manip.pbtxt @@ -0,0 +1,7 @@ +path: "tensorflow.manip" +tf_module { + member_method { + name: "roll" + argspec: "args=[\'input\', \'shift\', \'axis\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt index db1ed421851..e8890e9cc0a 100644 --- a/tensorflow/tools/api/golden/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.pbtxt @@ -396,6 +396,10 @@ tf_module { name: "losses" mtype: "" } + member { + name: "manip" + mtype: "" + } member { name: "metrics" mtype: "" @@ -2044,6 +2048,10 @@ tf_module { name: "unique_with_counts" argspec: "args=[\'x\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " } + member_method { + name: "unravel_index" + argspec: "args=[\'indices\', \'dims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "unsorted_segment_max" argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/tensorflow/tools/api/tests/BUILD b/tensorflow/tools/api/tests/BUILD index 8fb6b1cdfd8..608a34ab7b3 100644 --- a/tensorflow/tools/api/tests/BUILD +++ b/tensorflow/tools/api/tests/BUILD @@ -17,10 +17,6 @@ py_test( name = "api_compatibility_test", srcs = 
["api_compatibility_test.py"], data = [ - ":convert_from_multiline", - "//tensorflow/core/api_def:base_api_def", - "//tensorflow/core/api_def:python_api_def", - "//tensorflow/python:hidden_ops", "//tensorflow/tools/api/golden:api_golden", "//tensorflow/tools/api/tests:API_UPDATE_WARNING.txt", "//tensorflow/tools/api/tests:README.txt", @@ -29,7 +25,6 @@ py_test( deps = [ "//tensorflow:tensorflow_py", "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_test_lib", "//tensorflow/python:lib", "//tensorflow/python:platform", "//tensorflow/tools/api/lib:python_object_to_proto_visitor", diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py index afcbf50944c..c1e09cc531e 100644 --- a/tensorflow/tools/api/tests/api_compatibility_test.py +++ b/tensorflow/tools/api/tests/api_compatibility_test.py @@ -28,10 +28,8 @@ from __future__ import division from __future__ import print_function import argparse -from collections import defaultdict import os import re -import subprocess import sys import unittest @@ -39,7 +37,6 @@ import tensorflow as tf from google.protobuf import text_format -from tensorflow.core.framework import api_def_pb2 from tensorflow.python.lib.io import file_io from tensorflow.python.platform import resource_loader from tensorflow.python.platform import test @@ -67,11 +64,6 @@ _API_GOLDEN_FOLDER = 'tensorflow/tools/api/golden' _TEST_README_FILE = 'tensorflow/tools/api/tests/README.txt' _UPDATE_WARNING_FILE = 'tensorflow/tools/api/tests/API_UPDATE_WARNING.txt' -_CONVERT_FROM_MULTILINE_SCRIPT = 'tensorflow/tools/api/tests/convert_from_multiline' -_BASE_API_DIR = 'tensorflow/core/api_def/base_api' -_PYTHON_API_DIR = 'tensorflow/core/api_def/python_api' -_HIDDEN_OPS_FILE = 'tensorflow/python/ops/hidden_ops.txt' - def _KeyToFilePath(key): """From a given key, construct a filepath.""" @@ -96,55 +88,6 @@ def _FileNameToKey(filename): return api_object_key -def _GetSymbol(symbol_id): - """Get TensorFlow symbol based on the given identifier. - - Args: - symbol_id: Symbol identifier in the form module1.module2. ... .sym. - - Returns: - Symbol corresponding to the given id. - """ - # Ignore first module which should be tensorflow - symbol_id_split = symbol_id.split('.')[1:] - symbol = tf - for sym in symbol_id_split: - symbol = getattr(symbol, sym) - return symbol - - -def _IsGenModule(module_name): - if not module_name: - return False - module_name_split = module_name.split('.') - return module_name_split[-1].startswith('gen_') - - -def _GetHiddenOps(): - hidden_ops_file = file_io.FileIO(_HIDDEN_OPS_FILE, 'r') - hidden_ops = set() - for line in hidden_ops_file: - line = line.strip() - if not line: - continue - if line[0] == '#': # comment line - continue - # If line is of the form "op_name # comment", only keep the op_name. 
- line_split = line.split('#') - hidden_ops.add(line_split[0].strip()) - return hidden_ops - - -def _GetGoldenApiDefs(): - old_api_def_files = file_io.get_matching_files(_GetApiDefFilePath('*')) - return {file_path: file_io.read_file_to_string(file_path) - for file_path in old_api_def_files} - - -def _GetApiDefFilePath(graph_op_name): - return os.path.join(_PYTHON_API_DIR, 'api_def_%s.pbtxt' % graph_op_name) - - class ApiCompatibilityTest(test.TestCase): def __init__(self, *args, **kwargs): @@ -287,188 +230,6 @@ class ApiCompatibilityTest(test.TestCase): update_goldens=FLAGS.update_goldens) -class ApiDefTest(test.TestCase): - - def __init__(self, *args, **kwargs): - super(ApiDefTest, self).__init__(*args, **kwargs) - self._first_cap_pattern = re.compile('(.)([A-Z][a-z]+)') - self._all_cap_pattern = re.compile('([a-z0-9])([A-Z])') - - def _GenerateLowerCaseOpName(self, op_name): - lower_case_name = self._first_cap_pattern.sub(r'\1_\2', op_name) - return self._all_cap_pattern.sub(r'\1_\2', lower_case_name).lower() - - def _CreatePythonApiDef(self, base_api_def, endpoint_names): - """Creates Python ApiDef that overrides base_api_def if needed. - - Args: - base_api_def: (api_def_pb2.ApiDef) base ApiDef instance. - endpoint_names: List of Python endpoint names. - - Returns: - api_def_pb2.ApiDef instance with overrides for base_api_def - if module.name endpoint is different from any existing - endpoints in base_api_def. Otherwise, returns None. - """ - endpoint_names_set = set(endpoint_names) - - # If the only endpoint is equal to graph_op_name then - # it is equivalent to having no endpoints. - if (not base_api_def.endpoint and len(endpoint_names) == 1 - and endpoint_names[0] == - self._GenerateLowerCaseOpName(base_api_def.graph_op_name)): - return None - - base_endpoint_names_set = { - self._GenerateLowerCaseOpName(endpoint.name) - for endpoint in base_api_def.endpoint} - - if endpoint_names_set == base_endpoint_names_set: - return None # All endpoints are the same - - api_def = api_def_pb2.ApiDef() - api_def.graph_op_name = base_api_def.graph_op_name - - for endpoint_name in sorted(endpoint_names): - new_endpoint = api_def.endpoint.add() - new_endpoint.name = endpoint_name - - return api_def - - def _GetBaseApiMap(self): - """Get a map from graph op name to its base ApiDef. - - Returns: - Dictionary mapping graph op name to corresponding ApiDef. - """ - # Convert base ApiDef in Multiline format to Proto format. - converted_base_api_dir = os.path.join( - test.get_temp_dir(), 'temp_base_api_defs') - subprocess.check_call( - [os.path.join(resource_loader.get_root_dir_with_all_resources(), - _CONVERT_FROM_MULTILINE_SCRIPT), - _BASE_API_DIR, converted_base_api_dir]) - - name_to_base_api_def = {} - base_api_files = file_io.get_matching_files( - os.path.join(converted_base_api_dir, 'api_def_*.pbtxt')) - for base_api_file in base_api_files: - if file_io.file_exists(base_api_file): - api_defs = api_def_pb2.ApiDefs() - text_format.Merge( - file_io.read_file_to_string(base_api_file), api_defs) - for api_def in api_defs.op: - name_to_base_api_def[api_def.graph_op_name] = api_def - return name_to_base_api_def - - def _AddHiddenOpOverrides(self, name_to_base_api_def, api_def_map): - """Adds ApiDef overrides to api_def_map for hidden Python ops. - - Args: - name_to_base_api_def: Map from op name to base api_def_pb2.ApiDef. - api_def_map: Map from file path to api_def_pb2.ApiDefs for Python API - overrides. 
- """ - hidden_ops = _GetHiddenOps() - for hidden_op in hidden_ops: - if hidden_op not in name_to_base_api_def: - logging.warning('Unexpected hidden op name: %s' % hidden_op) - continue - - base_api_def = name_to_base_api_def[hidden_op] - if base_api_def.visibility != api_def_pb2.ApiDef.HIDDEN: - api_def = api_def_pb2.ApiDef() - api_def.graph_op_name = base_api_def.graph_op_name - api_def.visibility = api_def_pb2.ApiDef.HIDDEN - - file_path = _GetApiDefFilePath(base_api_def.graph_op_name) - api_def_map[file_path].op.extend([api_def]) - - @unittest.skipUnless( - sys.version_info.major == 2 and os.uname()[0] == 'Linux', - 'API compabitility test goldens are generated using python2 on Linux.') - def testAPIDefCompatibility(self): - # Get base ApiDef - name_to_base_api_def = self._GetBaseApiMap() - snake_to_camel_graph_op_names = { - self._GenerateLowerCaseOpName(name): name - for name in name_to_base_api_def.keys()} - # Extract Python API - visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor() - public_api_visitor = public_api.PublicAPIVisitor(visitor) - public_api_visitor.do_not_descend_map['tf'].append('contrib') - traverse.traverse(tf, public_api_visitor) - proto_dict = visitor.GetProtos() - - # Map from file path to Python ApiDefs. - new_api_defs_map = defaultdict(api_def_pb2.ApiDefs) - # We need to override all endpoints even if 1 endpoint differs from base - # ApiDef. So, we first create a map from an op to all its endpoints. - op_to_endpoint_name = defaultdict(list) - - # Generate map from generated python op to endpoint names. - for public_module, value in proto_dict.items(): - module_obj = _GetSymbol(public_module) - for sym in value.tf_module.member_method: - obj = getattr(module_obj, sym.name) - - # Check if object is defined in gen_* module. That is, - # the object has been generated from OpDef. - if hasattr(obj, '__module__') and _IsGenModule(obj.__module__): - if obj.__name__ not in snake_to_camel_graph_op_names: - # Symbol might be defined only in Python and not generated from - # C++ api. - continue - relative_public_module = public_module[len('tensorflow.'):] - full_name = (relative_public_module + '.' + sym.name - if relative_public_module else sym.name) - op_to_endpoint_name[obj].append(full_name) - - # Generate Python ApiDef overrides. - for op, endpoint_names in op_to_endpoint_name.items(): - graph_op_name = snake_to_camel_graph_op_names[op.__name__] - api_def = self._CreatePythonApiDef( - name_to_base_api_def[graph_op_name], endpoint_names) - - if api_def: - file_path = _GetApiDefFilePath(graph_op_name) - api_defs = new_api_defs_map[file_path] - api_defs.op.extend([api_def]) - - self._AddHiddenOpOverrides(name_to_base_api_def, new_api_defs_map) - - old_api_defs_map = _GetGoldenApiDefs() - for file_path, new_api_defs in new_api_defs_map.items(): - # Get new ApiDef string. - new_api_defs_str = str(new_api_defs) - - # Get current ApiDef for the given file. - old_api_defs_str = ( - old_api_defs_map[file_path] if file_path in old_api_defs_map else '') - - if old_api_defs_str == new_api_defs_str: - continue - - if FLAGS.update_goldens: - logging.info('Updating %s...' % file_path) - file_io.write_string_to_file(file_path, new_api_defs_str) - else: - self.assertMultiLineEqual( - old_api_defs_str, new_api_defs_str, - 'To update golden API files, run api_compatibility_test locally ' - 'with --update_goldens=True flag.') - - for file_path in set(old_api_defs_map) - set(new_api_defs_map): - if FLAGS.update_goldens: - logging.info('Deleting %s...' 
% file_path) - file_io.delete_file(file_path) - else: - self.fail( - '%s file is no longer needed and should be removed.' - 'To update golden API files, run api_compatibility_test locally ' - 'with --update_goldens=True flag.' % file_path) - - if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument( diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh index 27fa1b89ceb..b3a8ff2ac72 100755 --- a/tensorflow/tools/ci_build/ci_sanity.sh +++ b/tensorflow/tools/ci_build/ci_sanity.sh @@ -183,7 +183,9 @@ do_pylint() { # W0311 bad-indentation # W0312 mixed-indentation # C0330 bad-continuation - grep -E '(\[E|\[W0311|\[W0312|\[C0330)' ${OUTPUT_FILE} > ${ERRORS_FILE} + # C0301 line-too-long + # C0326 bad-whitespace + grep -E '(\[E|\[W0311|\[W0312|\[C0330|\[C0301|\[C0326)' ${OUTPUT_FILE} > ${ERRORS_FILE} N_ERRORS=0 while read -r LINE; do @@ -319,7 +321,7 @@ do_external_licenses_check(){ EXTRA_LICENSES_FILE="$(mktemp)_extra_licenses.log" echo "Getting external dependencies for ${BUILD_TARGET}" - bazel query "attr('licenses', 'notice', deps(${BUILD_TARGET}))" --no_implicit_deps --no_host_deps --keep_going \ + bazel query "attr('licenses', 'notice', deps(${BUILD_TARGET}))" --keep_going \ | grep -E -v "^//tensorflow" \ | sed -e 's|:.*||' \ | sort \ @@ -328,7 +330,7 @@ do_external_licenses_check(){ echo echo "Getting list of external licenses mentioned in ${LICENSES_TARGET}." - bazel query "deps(${LICENSES_TARGET})" --no_implicit_deps --no_host_deps --keep_going \ + bazel query "deps(${LICENSES_TARGET})" --keep_going \ | grep -E -v "^//tensorflow" \ | sed -e 's|:.*||' \ | sort \ @@ -342,6 +344,18 @@ do_external_licenses_check(){ EXTERNAL_LICENSES_CHECK_END_TIME=$(date +'%s') + # Blacklist + echo ${MISSING_LICENSES_FILE} + grep -e "@bazel_tools//third_party/" -e "@com_google_absl//absl" -e "@org_tensorflow//" -v ${MISSING_LICENSES_FILE} > temp.txt + mv temp.txt ${MISSING_LICENSES_FILE} + + # Whitelist + echo ${EXTRA_LICENSE_FILE} + grep -e "@bazel_tools//src/" -e "@bazel_tools//tools/" -e "@com_google_absl//" -e "//external" -e "@local" -v ${EXTRA_LICENSES_FILE} > temp.txt + mv temp.txt ${EXTRA_LICENSES_FILE} + + + echo echo "do_external_licenses_check took $((EXTERNAL_LICENSES_CHECK_END_TIME - EXTERNAL_LICENSES_CHECK_START_TIME)) s" echo @@ -515,9 +529,14 @@ do_check_futures_test() { python check_futures_test.py } +do_check_file_name_test() { + cd "$ROOT_DIR/tensorflow/tools/test" + python file_name_test.py +} + # Supply all sanity step commands and descriptions -SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_cmake_python_sanity") -SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Test entries in /tensorflow/contrib/cmake/python_{modules|protos|protos_cc}.txt for validity and consistency") +SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" 
"do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_cmake_python_sanity" "do_check_file_name_test") +SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Test entries in /tensorflow/contrib/cmake/python_{modules|protos|protos_cc}.txt for validity and consistency" "Check file names for cases") INCREMENTAL_FLAG="" DEFAULT_BAZEL_CONFIGS="--config=hdfs --config=gcp" diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh index 71744c04f2f..d406b83a624 100755 --- a/tensorflow/tools/ci_build/install/install_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh @@ -43,8 +43,8 @@ pip2 install --upgrade werkzeug==0.11.10 pip3 install --upgrade werkzeug==0.11.10 # Install bleach. html5lib will be picked up as a dependency. -pip2 install --upgrade bleach==1.5.0 -pip3 install --upgrade bleach==1.5.0 +pip2 install --upgrade bleach==2.0.0 +pip3 install --upgrade bleach==2.0.0 # Install markdown. pip2 install --upgrade markdown==2.6.8 diff --git a/tensorflow/tools/ci_build/pylintrc b/tensorflow/tools/ci_build/pylintrc index e71017e621c..68fdb617166 100644 --- a/tensorflow/tools/ci_build/pylintrc +++ b/tensorflow/tools/ci_build/pylintrc @@ -180,7 +180,17 @@ docstring-min-length=10 max-line-length=80 # Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=^\s*(# )??$ +ignore-long-lines=(?x) + (^\s*(import|from)\s + |\$Id:\s\/\/depot\/.+#\d+\s\$ + |^[a-zA-Z_][a-zA-Z0-9_]*\s*=\s*("[^"]\S+"|'[^']\S+') + |^\s*\#\ LINT\.ThenChange + |^[^#]*\#\ type:\ [a-zA-Z_][a-zA-Z0-9_.,[\] ]*$ + |pylint + |""" + |\# + |lambda + |(https?|ftp):) # Allow the body of an if to be on the same line as the test if there is no # else. diff --git a/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat index 957729bb37d..c1bc7185075 100644 --- a/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat +++ b/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat @@ -36,7 +36,7 @@ SET CMAKE_DIR=%REPO_ROOT%\tensorflow\contrib\cmake SET MSBUILD_EXE="C:\Program Files (x86)\MSBuild\14.0\Bin\msbuild.exe" :: Run cmake to create Visual Studio Project files. 
-%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY% -Dtensorflow_DISABLE_EIGEN_FORCEINLINE=%DISABLE_FORCEINLINE% +%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY% -Dtensorflow_DISABLE_EIGEN_FORCEINLINE=%DISABLE_FORCEINLINE% -Dtensorflow_WIN_CPU_SIMD_OPTIONS=/arch:AVX :: Run msbuild in the resulting VS project files to build a pip package. %MSBUILD_EXE% /p:Configuration=Release /maxcpucount:32 tf_python_build_pip_package.vcxproj diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat index 5a362de3992..b87e4a9bec4 100644 --- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat +++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat @@ -37,7 +37,7 @@ SET CMAKE_DIR=%REPO_ROOT%\tensorflow\contrib\cmake SET MSBUILD_EXE="C:\Program Files (x86)\MSBuild\14.0\Bin\msbuild.exe" :: Run cmake to create Visual Studio Project files. -%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_ENABLE_GPU=ON -DCUDNN_HOME=%CUDNN_HOME% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY% -Dtensorflow_DISABLE_EIGEN_FORCEINLINE=%DISABLE_FORCEINLINE% +%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_ENABLE_GPU=ON -DCUDNN_HOME=%CUDNN_HOME% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY% -Dtensorflow_DISABLE_EIGEN_FORCEINLINE=%DISABLE_FORCEINLINE% -Dtensorflow_WIN_CPU_SIMD_OPTIONS=/arch:AVX :: Run msbuild in the resulting VS project files to build a pip package. %MSBUILD_EXE% /p:Configuration=Release /maxcpucount:32 tf_python_build_pip_package.vcxproj diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh index fa28e3d79ca..583d1d5f095 100755 --- a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh +++ b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh @@ -41,7 +41,7 @@ run_configure_for_cpu_build # build_libtensorflow_tarball in ../builds/libtensorflow.sh # cannot be used on Windows since it relies on pkg_tar rules. # So we do something special here -bazel build -c opt \ +bazel build -c opt --copt=/arch:AVX \ tensorflow:libtensorflow.so \ tensorflow/tools/lib_package:clicenses_generate \ tensorflow/java:libtensorflow_jni.so \ diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh index 573c926203f..94276c6c5c9 100644 --- a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh +++ b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh @@ -41,7 +41,7 @@ run_configure_for_gpu_build # build_libtensorflow_tarball in ../builds/libtensorflow.sh # cannot be used on Windows since it relies on pkg_tar rules. 
# So we do something special here -bazel build -c opt \ +bazel build -c opt --copt=/arch:AVX \ tensorflow:libtensorflow.so \ tensorflow/tools/lib_package:clicenses_generate \ tensorflow/java:libtensorflow_jni.so \ diff --git a/tensorflow/tools/compatibility/tf_upgrade.py b/tensorflow/tools/compatibility/tf_upgrade.py index f678681dac2..6e90b286c99 100644 --- a/tensorflow/tools/compatibility/tf_upgrade.py +++ b/tensorflow/tools/compatibility/tf_upgrade.py @@ -46,8 +46,9 @@ class APIChangeSpec(object): """ -class _FileEditTuple(collections.namedtuple( - "_FileEditTuple", ["comment", "line", "start", "old", "new"])): +class _FileEditTuple( + collections.namedtuple("_FileEditTuple", + ["comment", "line", "start", "old", "new"])): """Each edit that is recorded by a _FileEditRecorder. Fields: @@ -179,8 +180,7 @@ class _ASTCallVisitor(ast.NodeVisitor): function_renames = self._api_change_spec.function_renames try: new_name = function_renames[full_name] - self._file_edit.add("Renamed function %r to %r" % (full_name, - new_name), + self._file_edit.add("Renamed function %r to %r" % (full_name, new_name), node.lineno, node.col_offset, full_name, new_name) except KeyError: pass @@ -227,7 +227,7 @@ class _ASTCallVisitor(ast.NodeVisitor): # loop over lines while 1: # Reverse the text to and regular expression search for whitespace - text = self._lines[line-1] + text = self._lines[line - 1] reversed_preceding_text = text[:col][::-1] # First find if a [ can be found with only whitespace between it and # col. @@ -248,8 +248,8 @@ class _ASTCallVisitor(ast.NodeVisitor): # node ranges to filter out spurious #'s that appear in string # literals. comment_start = prev_line.find("#") - if comment_start == -1: - col = len(prev_line) -1 + if comment_start == -1: + col = len(prev_line) - 1 elif find_string_chars.search(prev_line[comment_start:]) is None: col = comment_start else: @@ -260,7 +260,6 @@ class _ASTCallVisitor(ast.NodeVisitor): # it is not possible to use that in an argument. return node.lineno, node.col_offset - def visit_Call(self, node): # pylint: disable=invalid-name """Handle visiting a call node in the AST. @@ -268,7 +267,6 @@ class _ASTCallVisitor(ast.NodeVisitor): node: Current Node """ - # Find a simple attribute name path e.g. "tf.foo.bar" full_name = self._get_attribute_full_path(node.func) @@ -293,18 +291,21 @@ class _ASTCallVisitor(ast.NodeVisitor): lineno, col_offset = self._find_true_position(arg) if lineno is None or col_offset is None: self._file_edit.add( - "Failed to add keyword %r to reordered function %r" - % (reordered[idx], full_name), arg.lineno, arg.col_offset, - "", "", + "Failed to add keyword %r to reordered function %r" % + (reordered[idx], full_name), + arg.lineno, + arg.col_offset, + "", + "", error="A necessary keyword argument failed to be inserted.") else: keyword_arg = reordered[idx] if (full_name in function_keyword_renames and keyword_arg in function_keyword_renames[full_name]): keyword_arg = function_keyword_renames[full_name][keyword_arg] - self._file_edit.add("Added keyword %r to reordered function %r" - % (reordered[idx], full_name), lineno, - col_offset, "", keyword_arg + "=") + self._file_edit.add("Added keyword %r to reordered function %r" % + (reordered[idx], full_name), lineno, col_offset, + "", keyword_arg + "=") # Examine each keyword argument and convert it to the final renamed form renamed_keywords = ({} if full_name not in function_keyword_renames else @@ -322,11 +323,11 @@ class _ASTCallVisitor(ast.NodeVisitor): # value. 
key_start = argval_col_offset - len(argkey) - 1 key_end = key_start + len(argkey) + 1 - if (self._lines[argval_lineno - 1][key_start:key_end] == - argkey + "="): + if (self._lines[argval_lineno - 1][key_start:key_end] == argkey + + "="): self._file_edit.add("Renamed keyword argument from %r to %r" % - (argkey, renamed_keywords[argkey]), - argval_lineno, + (argkey, + renamed_keywords[argkey]), argval_lineno, argval_col_offset - len(argkey) - 1, argkey + "=", renamed_keywords[argkey] + "=") continue @@ -335,7 +336,8 @@ class _ASTCallVisitor(ast.NodeVisitor): (argkey, renamed_keywords[argkey]), argval.lineno, argval.col_offset - len(argkey) - 1, - "", "", + "", + "", error="Failed to find keyword lexographically. Fix manually.") ast.NodeVisitor.generic_visit(self, node) @@ -352,7 +354,7 @@ class _ASTCallVisitor(ast.NodeVisitor): if full_name in self._api_change_spec.change_to_function: if not hasattr(node, "is_function_for_call"): new_text = full_name + "()" - self._file_edit.add("Changed %r to %r"%(full_name, new_text), + self._file_edit.add("Changed %r to %r" % (full_name, new_text), node.lineno, node.col_offset, full_name, new_text) ast.NodeVisitor.generic_visit(self, node) @@ -380,8 +382,8 @@ class ASTCodeUpgrader(object): # Write to a temporary file, just in case we are doing an implace modify. with open(in_filename, "r") as in_file, \ tempfile.NamedTemporaryFile("w", delete=False) as temp_file: - ret = self.process_opened_file( - in_filename, in_file, out_filename, temp_file) + ret = self.process_opened_file(in_filename, in_file, out_filename, + temp_file) shutil.move(temp_file.name, out_filename) return ret @@ -424,6 +426,7 @@ class ASTCodeUpgrader(object): out_file.write(out_text) text += "\n" return 1, text, process_errors + # pylint: enable=broad-except def process_tree(self, root_directory, output_root_directory, @@ -444,16 +447,16 @@ class ASTCodeUpgrader(object): # make sure output directory doesn't exist if output_root_directory and os.path.exists(output_root_directory): - print("Output directory %r must not already exist." % ( - output_root_directory)) + print("Output directory %r must not already exist." 
% + (output_root_directory)) sys.exit(1) # make sure output directory does not overlap with root_directory norm_root = os.path.split(os.path.normpath(root_directory)) norm_output = os.path.split(os.path.normpath(output_root_directory)) if norm_root == norm_output: - print("Output directory %r same as input directory %r" % ( - root_directory, output_root_directory)) + print("Output directory %r same as input directory %r" % + (root_directory, output_root_directory)) sys.exit(1) # Collect list of files to process (we do this to correctly handle if the @@ -465,14 +468,16 @@ class ASTCodeUpgrader(object): copy_files = [f for f in file_list if not f.endswith(".py")] for filename in py_files: fullpath = os.path.join(dir_name, filename) - fullpath_output = os.path.join( - output_root_directory, os.path.relpath(fullpath, root_directory)) + fullpath_output = os.path.join(output_root_directory, + os.path.relpath(fullpath, + root_directory)) files_to_process.append((fullpath, fullpath_output)) if copy_other_files: for filename in copy_files: fullpath = os.path.join(dir_name, filename) - fullpath_output = os.path.join( - output_root_directory, os.path.relpath(fullpath, root_directory)) + fullpath_output = os.path.join(output_root_directory, + os.path.relpath( + fullpath, root_directory)) files_to_copy.append((fullpath, fullpath_output)) file_count = 0 @@ -641,18 +646,17 @@ class TFAPIChangeSpec(APIChangeSpec): "tf.concat": ["concat_dim", "values", "name"], "tf.svd": ["tensor", "compute_uv", "full_matrices", "name"], "tf.nn.softmax_cross_entropy_with_logits": [ - "logits", "labels", "dim", "name"], + "logits", "labels", "dim", "name" + ], "tf.nn.sparse_softmax_cross_entropy_with_logits": [ - "logits", "labels", "name"], - "tf.nn.sigmoid_cross_entropy_with_logits": [ - "logits", "labels", "name"], + "logits", "labels", "name" + ], + "tf.nn.sigmoid_cross_entropy_with_logits": ["logits", "labels", "name"], "tf.op_scope": ["values", "name", "default_name"], } # Specially handled functions. - self.function_handle = { - "tf.reverse": self._reverse_handler - } + self.function_handle = {"tf.reverse": self._reverse_handler} @staticmethod def _reverse_handler(file_edit_recorder, node): @@ -661,12 +665,13 @@ class TFAPIChangeSpec(APIChangeSpec): comment = ("ERROR: tf.reverse has had its argument semantics changed\n" "significantly the converter cannot detect this reliably, so you" "need to inspect this usage manually.\n") - file_edit_recorder.add(comment, - node.lineno, - node.col_offset, - "tf.reverse", - "tf.reverse", - error="tf.reverse requires manual check.") + file_edit_recorder.add( + comment, + node.lineno, + node.col_offset, + "tf.reverse", + "tf.reverse", + error="tf.reverse requires manual check.") if __name__ == "__main__": diff --git a/tensorflow/tools/dist_test/build_server.sh b/tensorflow/tools/dist_test/build_server.sh index 878fabd248f..225c0347416 100755 --- a/tensorflow/tools/dist_test/build_server.sh +++ b/tensorflow/tools/dist_test/build_server.sh @@ -16,14 +16,15 @@ # # Builds the test server for distributed (GRPC) TensorFlow # -# Usage: build_server.sh [--test] +# Usage: build_server.sh [--test] # # Arguments: # docker_image_name: Name of the docker image to build. # E.g.: tensorflow/tf_grpc_test_server:0.11.0rc1 # -# whl_url: URL from which the TensorFlow whl file will be downloaded. +# whl_file_location: URL from which the TensorFlow whl file will be downloaded. 
# E.g.: https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl +# E.g.: /path/to/folder/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl # # The optional flag --test lets the script to use the Dockerfile for the # testing GRPC server. Without the flag, the script will build the non-test @@ -41,11 +42,11 @@ die() { # Check arguments if [[ $# -lt 2 ]]; then - die "Usage: $0 [--test]" + die "Usage: $0 [--test]" fi DOCKER_IMG_NAME=$1 -WHL_URL=$2 +WHL_FILE_LOCATION=$2 shift 2 # Current script directory @@ -53,7 +54,7 @@ DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" BUILD_DIR=$(mktemp -d) echo "" -echo "Using whl file URL: ${WHL_URL}" +echo "Using whl file URL: ${WHL_FILE_LOCATION}" echo "Building in temporary directory: ${BUILD_DIR}" cp -r ${DIR}/* "${BUILD_DIR}"/ || \ @@ -65,9 +66,15 @@ if [[ $1 == "--test" ]]; then fi echo "Using Docker file: ${DOCKER_FILE}" +if [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then + # Download whl file into the build context directory. + wget -P "${BUILD_DIR}" "${WHL_FILE_LOCATION}" || \ + die "Failed to download tensorflow whl file from URL: ${WHL_FILE_LOCATION}" +else + cp "${WHL_FILE_LOCATION}" "${BUILD_DIR}" +fi + # Download whl file into the build context directory. -wget -P "${BUILD_DIR}" ${WHL_URL} || \ - die "Failed to download tensorflow whl file from URL: ${WHL_URL}" if [[ ! -f "${DOCKER_FILE}" ]]; then die "ERROR: Unable to find dockerfile: ${DOCKER_FILE}" diff --git a/tensorflow/tools/dist_test/python/mnist_replica.py b/tensorflow/tools/dist_test/python/mnist_replica.py index e40ecb43f9a..a2d12442c44 100644 --- a/tensorflow/tools/dist_test/python/mnist_replica.py +++ b/tensorflow/tools/dist_test/python/mnist_replica.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Distributed MNIST training and validation, with model replicas. A simple softmax model with one hidden layer is defined. The parameters @@ -32,7 +31,6 @@ perform forward computation and gradient calculation in parallel, which should lead to increased training speed for the simple model. """ - from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -45,7 +43,6 @@ import time import tensorflow as tf from tensorflow.examples.tutorials.mnist import input_data - flags = tf.app.flags flags.DEFINE_string("data_dir", "/tmp/mnist-data", "Directory for storing mnist data") @@ -56,8 +53,7 @@ flags.DEFINE_integer("task_index", None, "Worker task index, should be >= 0. task_index=0 is " "the master worker task the performs the variable " "initialization ") -flags.DEFINE_integer("num_gpus", 1, - "Total number of gpus for each machine." +flags.DEFINE_integer("num_gpus", 1, "Total number of gpus for each machine." 
"If you don't use GPU, please set it to '0'") flags.DEFINE_integer("replicas_to_aggregate", None, "Number of replicas to aggregate before parameter update" @@ -69,24 +65,24 @@ flags.DEFINE_integer("train_steps", 200, "Number of (global) training steps to perform") flags.DEFINE_integer("batch_size", 100, "Training batch size") flags.DEFINE_float("learning_rate", 0.01, "Learning rate") -flags.DEFINE_boolean("sync_replicas", False, - "Use the sync_replicas (synchronized replicas) mode, " - "wherein the parameter updates from workers are aggregated " - "before applied to avoid stale gradients") +flags.DEFINE_boolean( + "sync_replicas", False, + "Use the sync_replicas (synchronized replicas) mode, " + "wherein the parameter updates from workers are aggregated " + "before applied to avoid stale gradients") flags.DEFINE_boolean( "existing_servers", False, "Whether servers already exists. If True, " "will use the worker hosts via their GRPC URLs (one client process " "per worker host). Otherwise, will create an in-process TensorFlow " "server.") -flags.DEFINE_string("ps_hosts","localhost:2222", +flags.DEFINE_string("ps_hosts", "localhost:2222", "Comma-separated list of hostname:port pairs") flags.DEFINE_string("worker_hosts", "localhost:2223,localhost:2224", "Comma-separated list of hostname:port pairs") -flags.DEFINE_string("job_name", None,"job name: worker or ps") +flags.DEFINE_string("job_name", None, "job name: worker or ps") FLAGS = flags.FLAGS - IMAGE_PIXELS = 28 @@ -97,7 +93,7 @@ def main(unused_argv): if FLAGS.job_name is None or FLAGS.job_name == "": raise ValueError("Must specify an explicit `job_name`") - if FLAGS.task_index is None or FLAGS.task_index =="": + if FLAGS.task_index is None or FLAGS.task_index == "": raise ValueError("Must specify an explicit `task_index`") print("job name = %s" % FLAGS.job_name) @@ -110,9 +106,7 @@ def main(unused_argv): # Get the number of workers. num_workers = len(worker_spec) - cluster = tf.train.ClusterSpec({ - "ps": ps_spec, - "worker": worker_spec}) + cluster = tf.train.ClusterSpec({"ps": ps_spec, "worker": worker_spec}) if not FLAGS.existing_servers: # Not using existing servers. Create an in-process server. @@ -217,7 +211,8 @@ def main(unused_argv): sess_config = tf.ConfigProto( allow_soft_placement=True, log_device_placement=False, - device_filters=["/job:ps", "/job:worker/task:%d" % FLAGS.task_index]) + device_filters=["/job:ps", + "/job:worker/task:%d" % FLAGS.task_index]) # The chief worker (task_index==0) session will prepare the session, # while the remaining workers will wait for the preparation to complete. 
@@ -231,8 +226,7 @@ def main(unused_argv): server_grpc_url = "grpc://" + worker_spec[FLAGS.task_index] print("Using existing server at: %s" % server_grpc_url) - sess = sv.prepare_or_wait_for_session(server_grpc_url, - config=sess_config) + sess = sv.prepare_or_wait_for_session(server_grpc_url, config=sess_config) else: sess = sv.prepare_or_wait_for_session(server.target, config=sess_config) diff --git a/tensorflow/tools/docker/jupyter_notebook_config.py b/tensorflow/tools/docker/jupyter_notebook_config.py index 0acbf6fcee5..05dcefb099a 100644 --- a/tensorflow/tools/docker/jupyter_notebook_config.py +++ b/tensorflow/tools/docker/jupyter_notebook_config.py @@ -15,6 +15,7 @@ import os from IPython.lib import passwd +c = c # pylint:disable=undefined-variable c.NotebookApp.ip = '*' c.NotebookApp.port = int(os.getenv('PORT', 8888)) c.NotebookApp.open_browser = False diff --git a/tensorflow/tools/docs/pretty_docs.py b/tensorflow/tools/docs/pretty_docs.py index b5df633800a..543b5fa6fef 100644 --- a/tensorflow/tools/docs/pretty_docs.py +++ b/tensorflow/tools/docs/pretty_docs.py @@ -162,7 +162,7 @@ def _build_class_page(page_info): parts.append(h3.format(**method_info.__dict__)) if method_info.signature is not None: - parts.append(_build_signature(method_info)) + parts.append(_build_signature(method_info, use_full_name=False)) parts.append(method_info.doc.docstring) parts.append(_build_function_details(method_info.doc.function_details)) @@ -259,14 +259,14 @@ def _build_module_page(page_info): return ''.join(parts) -def _build_signature(obj_info): +def _build_signature(obj_info, use_full_name=True): """Returns a md code block showing the function signature.""" # Special case tf.range, since it has an optional first argument if obj_info.full_name == 'tf.range': return ( '``` python\n' - "range(limit, delta=1, dtype=None, name='range')\n" - "range(start, limit, delta=1, dtype=None, name='range')\n" + "tf.range(limit, delta=1, dtype=None, name='range')\n" + "tf.range(start, limit, delta=1, dtype=None, name='range')\n" '```\n\n') parts = ['``` python'] @@ -281,7 +281,11 @@ def _build_signature(obj_info): sig = ',\n'.join(' %s' % sig_item for sig_item in obj_info.signature) sig = '\n'+sig+'\n' - parts.append(signature_template.format(name=obj_info.short_name, sig=sig)) + if use_full_name: + obj_name = obj_info.full_name + else: + obj_name = obj_info.short_name + parts.append(signature_template.format(name=obj_name, sig=sig)) parts.append('```\n\n') return '\n'.join(parts) diff --git a/tensorflow/tools/graph_transforms/quantize_nodes.cc b/tensorflow/tools/graph_transforms/quantize_nodes.cc index 5ccd88cfa1a..a022f579267 100644 --- a/tensorflow/tools/graph_transforms/quantize_nodes.cc +++ b/tensorflow/tools/graph_transforms/quantize_nodes.cc @@ -183,22 +183,6 @@ Status ExtractRangeFromParams(const TransformFuncContext& context, return Status::OK(); } -bool AreAttrsEqual(const NodeDef* current_node, const NodeDef* other_node) { - if (current_node->attr_size() != other_node->attr_size()) { - return false; - } - string current_serialized; - string other_serialized; - for (const auto& attr : other_node->attr()) { - auto iter = current_node->attr().find(attr.first); - if (iter == current_node->attr().end()) return false; - iter->second.SerializeToString(¤t_serialized); - attr.second.SerializeToString(&other_serialized); - if (current_serialized != other_serialized) return false; - } - return true; -} - } // namespace // Analyzes all the nodes in the graph to figure out which ones are duplicates diff --git 
a/tensorflow/tools/graph_transforms/sparsify_gather.cc b/tensorflow/tools/graph_transforms/sparsify_gather.cc index 96324d0deab..214ec721e2c 100644 --- a/tensorflow/tools/graph_transforms/sparsify_gather.cc +++ b/tensorflow/tools/graph_transforms/sparsify_gather.cc @@ -15,6 +15,7 @@ limitations under the License. #include #include +#include #include "tensorflow/c/checkpoint_reader.h" #include "tensorflow/core/framework/tensor.h" @@ -28,9 +29,10 @@ limitations under the License. #include "tensorflow/tools/graph_transforms/transform_utils.h" namespace tensorflow { -using strings::StrCat; using str_util::Join; using str_util::Split; +using str_util::StringReplace; +using strings::StrCat; namespace graph_transforms { @@ -84,48 +86,71 @@ void CreateConstNode(const Tensor& tensor, const string& name, SetNodeTensorAttr("value", tensor, node_def); } +string GetMonolithicTensorKey(const string& tensor_slice_name) { + std::vector names = Split(tensor_slice_name, "/"); + if (StringPiece(names[names.size() - 1]).starts_with("part_")) { + CHECK_GE(names.size(), 2); + names.pop_back(); + } + return Join(names, "/"); +} + Status ObtainTensorSlice(const GraphDef& input_graph_def, - const string& tensor_name, + const string& target_name, string* shape_slice_string) { string restore_node_name; for (const auto& node : input_graph_def.node()) { - std::vector node_name_parts = str_util::Split(node.name(), "/"); + std::vector node_name_parts = Split(node.name(), "/"); if (node_name_parts.size() == 2 && StringPiece(node_name_parts[0]).starts_with("save") && StringPiece(node_name_parts[1]).starts_with("Assign") && - node.input(0) == tensor_name) { + node.input(0) == target_name) { restore_node_name = node.input(1); break; } } + + std::vector restore_node_parts = Split(restore_node_name, ":"); + CHECK_LE(restore_node_parts.size(), 2); + string tensor_names_node; string shape_and_slices_node; for (const auto& node : input_graph_def.node()) { - if ((node.name() == restore_node_name) && (node.op() == "RestoreV2")) { + if ((node.name() == restore_node_parts[0]) && (node.op() == "RestoreV2")) { + tensor_names_node = node.input(1); shape_and_slices_node = node.input(2); break; } } + + int offset = -1; + for (const auto& node : input_graph_def.node()) { + if (node.name() == tensor_names_node) { + Tensor tensor_names_tensor; + TF_RETURN_IF_ERROR(GetNodeAttr(node, "value", &tensor_names_tensor)); + const auto& tensor_names_value = tensor_names_tensor.flat(); + for (int i = 0; i < tensor_names_value.size(); i++) { + if (tensor_names_value(i) == GetMonolithicTensorKey(target_name)) { + offset = i; + break; + } + } + } + } + if (offset == -1) { + return errors::Internal("Unable to find RestoreV2 entry for variable: ", + target_name); + } for (const auto& node : input_graph_def.node()) { if (node.name() == shape_and_slices_node) { Tensor shape_and_slices_tensor; TF_RETURN_IF_ERROR(GetNodeAttr(node, "value", &shape_and_slices_tensor)); const auto& shape_and_slices_value = shape_and_slices_tensor.flat(); - *shape_slice_string = shape_and_slices_value(0); + *shape_slice_string = shape_and_slices_value(offset); return Status::OK(); } } - return errors::Internal("Unable to find slice for variable: ", tensor_name); -} - -string GetMonolithicTensorKey(const string& tensor_slice_name) { - std::vector names = str_util::Split(tensor_slice_name, "/"); - CHECK_GE(names.size(), 2); - CHECK(StringPiece(names[names.size() - 1]).starts_with("part_")); - - // Remove the "part_x" suffix - names.pop_back(); - return str_util::Join(names, "/"); + 
return errors::Internal("Unable to find slice for variable: ", target_name); } Status ReadTensorFromCheckpoint( @@ -179,6 +204,14 @@ Status ObtainVariableInfo( return Status::OK(); } +Status RemoveInputAtIndex(NodeDef* n, int index) { + for (int i = index; i < n->input_size() - 1; i++) { + n->mutable_input()->SwapElements(i, i + 1); + } + n->mutable_input()->RemoveLast(); + return Status::OK(); +} + Status SparsifyGatherInternal( const GraphDef& input_graph_def, const std::unique_ptr >& @@ -193,6 +226,15 @@ Status SparsifyGatherInternal( GraphDef current_graph_def = input_graph_def; bool any_match_found = false; + // Populate references. + std::unordered_map refs; + for (const auto& node : current_graph_def.node()) { + for (const auto& input : node.input()) { + auto parsed_input = StringReplace(input, "^", "", true); + refs[parsed_input] += 1; + } + } + // The subgraphs may have overlapping components, therefore GraphMatcher // doesn't return all subgraphs in one round -- this has to be multi-round // update. @@ -200,15 +242,15 @@ Status SparsifyGatherInternal( any_match_found = false; GraphDef replaced_graph_def = current_graph_def; std::vector init_table_node_names; - std::vector removed_variable_names; + std::vector removed_node_names; TF_RETURN_IF_ERROR(ReplaceMatchingOpTypes( current_graph_def, pattern, [&ckpt_reader, &any_match_found, &init_table_node_names, - &shapes_and_slices, &removed_variable_names]( - const NodeMatch& match, const std::set& input_nodes, - const std::set& output_nodes, - std::vector* new_nodes) { + &shapes_and_slices, &removed_node_names, + &refs](const NodeMatch& match, const std::set& input_nodes, + const std::set& output_nodes, + std::vector* new_nodes) { any_match_found = true; // The captured subgraph should be of the following pattern: @@ -290,9 +332,13 @@ Status SparsifyGatherInternal( TF_RETURN_IF_ERROR(ReadTensorFromCheckpoint( weights_node.name(), ckpt_reader, (*shapes_and_slices)[weights_node.name()], &weight)); - // Add both both weight and identity node names. - removed_variable_names.push_back(weights_node.name()); - removed_variable_names.push_back(match.inputs[0].node.name()); + } + // Add both both weight and identity node names. 
+ removed_node_names.push_back(weights_node.name()); + removed_node_names.push_back(match.inputs[0].node.name()); + for (auto input_node : match.inputs[0].node.input()) { + auto parsed_input = StringReplace(input_node, "^", "", true); + refs[parsed_input]--; } Tensor indices_tensor; Tensor values_tensor; @@ -362,15 +408,23 @@ Status SparsifyGatherInternal( // Connect nodes AddNodeInput(hashtable_node.name(), &init_table_node); + refs[hashtable_node.name()]++; AddNodeInput(indices_node.name(), &init_table_node); + refs[indices_node.name()]++; AddNodeInput(values_node.name(), &init_table_node); + refs[values_node.name()]++; AddNodeInput(hashtable_node.name(), &lookup_node); + refs[hashtable_node.name()]++; AddNodeInput(gather_node.input(1), &lookup_node); + refs[gather_node.input(1)]++; AddNodeInput(default_value_node.name(), &lookup_node); + refs[default_value_node.name()]++; AddNodeInput(lookup_node.name(), &expand_dims_node); + refs[lookup_node.name()]++; AddNodeInput(dim_idx_node.name(), &expand_dims_node); + refs[dim_idx_node.name()]++; // Copy 'ids' input of original 'Gather' new_nodes->push_back(match.inputs[1].node); @@ -404,48 +458,91 @@ Status SparsifyGatherInternal( for (const string& name : init_table_node_names) { // Add control dependence from init_table_node to group_deps_node AddNodeInput(StrCat("^", name), init_op); + refs[name]++; } - // Remove all dependencies associated with removed variables. - while (!removed_variable_names.empty()) { - auto name = removed_variable_names.back(); - removed_variable_names.pop_back(); + // Erase inputs and outputs as they are not considered for deletion. + for (const auto& output : context.output_names) { + refs.erase(output); + } + + for (const auto& input : context.input_names) { + refs.erase(input); + } + + // Add nodes with a reference count of 0 for deletion. + for (auto entry : refs) { + if (entry.second == 0) { + removed_node_names.push_back(entry.first); + } + } + + while (!removed_node_names.empty()) { + auto name = removed_node_names.back(); + removed_node_names.pop_back(); + int i = 0; while (i < replaced_graph_def.node_size()) { - if (!replaced_graph_def.node(i).input_size()) { - if (replaced_graph_def.node(i).name() == name) { - replaced_graph_def.mutable_node()->SwapElements( - i, replaced_graph_def.node_size() - 1); - replaced_graph_def.mutable_node()->RemoveLast(); - continue; - } - i++; - continue; - } - int j = 0; - while (j < replaced_graph_def.node(i).input_size()) { - if (replaced_graph_def.node(i).input(j) == name || - replaced_graph_def.node(i).input(j) == ("^" + name)) { - replaced_graph_def.mutable_node(i)->mutable_input()->SwapElements( - j, replaced_graph_def.node(i).input_size() - 1); - replaced_graph_def.mutable_node(i)->mutable_input()->RemoveLast(); - continue; - } - j++; - } - if ((replaced_graph_def.node(i).input_size() == 0) || - (replaced_graph_def.node(i).op() == "Assign" && - replaced_graph_def.node(i).input_size() == 1)) { - removed_variable_names.push_back(replaced_graph_def.node(i).name()); - if (replaced_graph_def.node(i).input_size() == 1) { - removed_variable_names.push_back( - replaced_graph_def.node(i).input(0)); + // Revisit this to see if we can safely remove RestoreV2 nodes. 
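The `sparsify_gather.cc` rewrite above replaces the old ad-hoc pruning with reference counting: `refs` records how many nodes mention each node name as an input, counts are decremented as nodes or inputs are removed, and anything that drops to zero (and is not one of the transform's declared inputs or outputs) is queued for deletion in the loop that continues below. A toy sketch of that sweep on a dictionary-based graph, with hypothetical node names and without the `RestoreV2`/`ConcatV2` special cases or the `^` control-input handling:

```python
# Toy version of the zero-reference sweep: `graph` maps node name -> list of
# input node names. All node names here are hypothetical.
graph = {
    "weights": [],
    "identity": ["weights"],
    "old_gather": ["identity", "ids"],
    "ids": [],
    "output": ["ids"],
}

# Count how many times each node is referenced as an input of another node.
refs = {name: 0 for name in graph}
for inputs in graph.values():
  for inp in inputs:
    refs[inp] += 1

keep = {"ids", "output"}    # graph inputs/outputs are never deleted
to_remove = ["old_gather"]  # the node the transform just replaced

while to_remove:
  name = to_remove.pop()
  for inp in graph.pop(name, []):  # drop the node and release its inputs
    refs[inp] -= 1
    if refs[inp] == 0 and inp not in keep:
      to_remove.append(inp)        # cascade to inputs that became unused

print(sorted(graph))  # ['ids', 'output']
```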
+ if ((replaced_graph_def.node(i).name() == name) && + (replaced_graph_def.node(i).op() != "RestoreV2")) { + for (const auto& input : replaced_graph_def.node(i).input()) { + auto parsed_input = StringReplace(input, "^", "", true); + refs[parsed_input] -= 1; + if (refs[parsed_input] == 0) { + removed_node_names.push_back(parsed_input); + } } replaced_graph_def.mutable_node()->SwapElements( i, replaced_graph_def.node_size() - 1); replaced_graph_def.mutable_node()->RemoveLast(); continue; } + int j = 0; + bool deleted_inputs = false; + while (j < replaced_graph_def.node(i).input_size()) { + if (replaced_graph_def.node(i).input(j) == name || + replaced_graph_def.node(i).input(j) == ("^" + name)) { + TF_RETURN_IF_ERROR( + RemoveInputAtIndex(replaced_graph_def.mutable_node(i), j)); + deleted_inputs = true; + continue; + } + j++; + } + if (deleted_inputs) { + if (replaced_graph_def.node(i).op() == "ConcatV2") { + if (replaced_graph_def.node(i).input_size() > 2) { + SetNodeAttr("N", replaced_graph_def.node(i).input_size() - 1, + replaced_graph_def.mutable_node(i)); + } else if (replaced_graph_def.node(i).input_size() == 2) { + if (refs[replaced_graph_def.node(i).input(1)] != 1) { + return errors::Internal( + "Expect axis tensor of ConcatV2 node to only be referenced " + "once."); + } + refs[replaced_graph_def.node(i).input(1)] -= 1; + removed_node_names.push_back(replaced_graph_def.node(i).input(1)); + replaced_graph_def.mutable_node(i)->mutable_input()->RemoveLast(); + replaced_graph_def.mutable_node(i)->mutable_attr()->erase("N"); + replaced_graph_def.mutable_node(i)->set_op("Identity"); + } else { + return errors::Internal( + "ConcatV2 should have at least two elements"); + } + } + if ((replaced_graph_def.node(i).op() == "Assign" || + replaced_graph_def.node(i).op() == "Reshape" || + replaced_graph_def.node(i).op() == "Equal" || + replaced_graph_def.node(i).op() == "Mean" || + replaced_graph_def.node(i).op() == "ScalarSummary") && + replaced_graph_def.node(i).input_size() == 1) { + removed_node_names.push_back(replaced_graph_def.node(i).name()); + } + if (!replaced_graph_def.node(i).input_size()) { + removed_node_names.push_back(replaced_graph_def.node(i).name()); + } + } i++; } } @@ -485,17 +582,22 @@ Status SparsifyGather(const GraphDef& input_graph_def, }; // clang-format on + GraphDef cleaned_input_graph_def; + RemoveAttributes(input_graph_def, {"_output_shapes"}, + &cleaned_input_graph_def); + GraphDef temp_output; std::unique_ptr ckpt_reader; TF_RETURN_IF_ERROR(InitializeCheckpointReader(context, &ckpt_reader)); std::unique_ptr > shapes_and_slices; - TF_RETURN_IF_ERROR(ObtainVariableInfo(input_graph_def, &shapes_and_slices)); + TF_RETURN_IF_ERROR( + ObtainVariableInfo(cleaned_input_graph_def, &shapes_and_slices)); - TF_RETURN_IF_ERROR(SparsifyGatherInternal(input_graph_def, shapes_and_slices, - context, gather_pattern, - ckpt_reader, &temp_output)); + TF_RETURN_IF_ERROR(SparsifyGatherInternal( + cleaned_input_graph_def, shapes_and_slices, context, gather_pattern, + ckpt_reader, &temp_output)); TF_RETURN_IF_ERROR(SparsifyGatherInternal(temp_output, shapes_and_slices, context, gather_v2_pattern, diff --git a/tensorflow/tools/graph_transforms/sparsify_gather_test.cc b/tensorflow/tools/graph_transforms/sparsify_gather_test.cc index 000568a0cc9..d41321c9a6d 100644 --- a/tensorflow/tools/graph_transforms/sparsify_gather_test.cc +++ b/tensorflow/tools/graph_transforms/sparsify_gather_test.cc @@ -71,7 +71,7 @@ class SparsifyGatherTest : public ::testing::Test { } void TestSinglePartition(bool 
gather_v2, bool include_shared_init, - bool test_variable, + bool test_variable, bool test_kept_concat, const string& shared_init_name = "group_deps") { GraphDef graph_def; @@ -80,6 +80,8 @@ class SparsifyGatherTest : public ::testing::Test { // Build the graph. NodeDef* input_node = CreateNode("ids", "Const", {}, &graph_def); NodeDef* w_node; + NodeDef* zeros_const; + NodeDef* zeros_shape; NodeDef* zeros_node; NodeDef* assign_node; @@ -92,19 +94,27 @@ class SparsifyGatherTest : public ::testing::Test { } else { w_node = CreateNode("w/part_1", "VariableV2", {}, &graph_def); - zeros_node = - CreateNode("w/part_1/Initializer/zeros", "Const", {}, &graph_def); + zeros_shape = CreateNode("w/part_1/Initializer/zeros/shape_as_tensor", + "Const", {}, &graph_def); + zeros_const = CreateNode("w/part_1/Initializer/zeros/Const", "Const", {}, + &graph_def); + zeros_node = CreateNode("w/part_1/Initializer/zeros", "Fill", + {zeros_shape, zeros_const}, &graph_def); assign_node = CreateNode("w/part_1/Assign", "Assign", {w_node, zeros_node}, &graph_def); NodeDef* save_const_node = CreateNode("save/Const", "Const", {}, &graph_def); + Tensor tensor_names_values(DT_STRING, TensorShape({1})); + test::FillValues(&tensor_names_values, {"w"}); NodeDef* tensor_names_node = CreateNode("save/RestoreV2/tensor_names", "Const", {}, &graph_def); + SetNodeTensorAttr("value", tensor_names_values, + tensor_names_node); + NodeDef* tensor_shapes_slices_node = CreateNode( "save/RestoreV2/shape_and_slices", "Const", {}, &graph_def); - Tensor shapes_slices_val(DT_STRING, TensorShape({1})); shapes_slices_val.flat()(0) = "4 1 0,4:0,1"; SetNodeTensorAttr("value", shapes_slices_val, @@ -133,6 +143,26 @@ class SparsifyGatherTest : public ::testing::Test { } } + NodeDef* concat_axis_node = + CreateNode("linear/concat/axis", "Const", {}, &graph_def); + NodeDef* concat_input_node = + CreateNode("concat/input/node", "Const", {}, &graph_def); + NodeDef* concat_node = nullptr; + if (!test_kept_concat) { + concat_node = CreateNode( + "concat/node", "ConcatV2", + {identity_node, concat_input_node, concat_axis_node}, &graph_def); + SetNodeAttr("N", 2, concat_node); + } else { + NodeDef* concat_input_node_2 = + CreateNode("concat/input/node_2", "Const", {}, &graph_def); + concat_node = CreateNode("concat/node", "ConcatV2", + {identity_node, concat_input_node, + concat_input_node_2, concat_axis_node}, + &graph_def); + SetNodeAttr("N", 3, concat_node); + } + // Run the op. GraphDef result; TransformFuncContext context; @@ -151,12 +181,32 @@ class SparsifyGatherTest : public ::testing::Test { MapNamesToNodes(result, &node_lookup); // Check nodes. 
+ EXPECT_EQ(0, + node_lookup.count("w/part_1/Initializer/zeros/shape_as_tensor")); + EXPECT_EQ(0, node_lookup.count("w/part_1/Initializer/zeros/Const")); EXPECT_EQ(0, node_lookup.count("w/part_1/Initializer/zeros")); EXPECT_EQ(0, node_lookup.count("w/part_1/Assign")); EXPECT_EQ(1, node_lookup.count("ids")); EXPECT_EQ("Const", node_lookup.at("ids")->op()); + EXPECT_EQ(1, node_lookup.count("concat/node")); + + if (!test_kept_concat) { + EXPECT_EQ(0, node_lookup.count("linear/concat/axis")); + EXPECT_EQ("Identity", node_lookup.at("concat/node")->op()); + EXPECT_EQ(1, node_lookup.at("concat/node")->input_size()); + EXPECT_EQ("concat/input/node", node_lookup.at("concat/node")->input(0)); + } else { + EXPECT_EQ(1, node_lookup.count("linear/concat/axis")); + EXPECT_EQ("ConcatV2", node_lookup.at("concat/node")->op()); + EXPECT_EQ(3, node_lookup.at("concat/node")->input_size()); + EXPECT_EQ("concat/input/node", node_lookup.at("concat/node")->input(0)); + EXPECT_EQ("concat/input/node_2", node_lookup.at("concat/node")->input(1)); + EXPECT_EQ("linear/concat/axis", node_lookup.at("concat/node")->input(2)); + EXPECT_EQ(2, node_lookup.at("concat/node")->attr().at("N").i()); + } + EXPECT_EQ(1, node_lookup.count("w/part_1/indices")); EXPECT_EQ("Const", node_lookup.at("w/part_1/indices")->op()); Tensor expected_indices_tensor(DT_INT64, TensorShape({3})); @@ -247,7 +297,11 @@ class SparsifyGatherTest : public ::testing::Test { // Two partitions NodeDef* w_node1; NodeDef* w_node2; + NodeDef* zeros_const1; + NodeDef* zeros_shape1; NodeDef* zeros_node1; + NodeDef* zeros_const2; + NodeDef* zeros_shape2; NodeDef* zeros_node2; NodeDef* assign_node1; NodeDef* assign_node2; @@ -260,51 +314,53 @@ class SparsifyGatherTest : public ::testing::Test { SetNodeTensorAttr("value", weights, w_node1); SetNodeTensorAttr("value", weights, w_node2); } else { + NodeDef* save_const_node = + CreateNode("save/Const", "Const", {}, &graph_def); + + NodeDef* tensor_names_node = + CreateNode("save/RestoreV2/tensor_names", "Const", {}, &graph_def); + Tensor tensor_names_values(DT_STRING, TensorShape({2})); + test::FillValues(&tensor_names_values, {"w1", "w2"}); + SetNodeTensorAttr("value", tensor_names_values, + tensor_names_node); + + NodeDef* tensor_shapes_slices_node = CreateNode( + "save/RestoreV2/shape_and_slices", "Const", {}, &graph_def); + Tensor shapes_slices_val(DT_STRING, TensorShape({2})); + shapes_slices_val.flat()(0) = "4 1 0,4:0,1"; + shapes_slices_val.flat()(1) = "4 1 0,4:0,1"; + SetNodeTensorAttr("value", shapes_slices_val, + tensor_shapes_slices_node); + + NodeDef* restore_node = CreateNode( + "save/RestoreV2", "RestoreV2", + {save_const_node, tensor_names_node, tensor_shapes_slices_node}, + &graph_def); + w_node1 = CreateNode("w1/part_1", "VariableV2", {}, &graph_def); - zeros_node1 = - CreateNode("w1/part_1/Initializer/zeros", "Const", {}, &graph_def); + + zeros_shape1 = CreateNode("w1/part_1/Initializer/zeros/shape_as_tensor", + "Const", {}, &graph_def); + zeros_const1 = CreateNode("w1/part_1/Initializer/zeros/Const", "Const", + {}, &graph_def); + zeros_node1 = CreateNode("w1/part_1/Initializer/zeros", "Fill", + {zeros_shape1, zeros_const1}, &graph_def); assign_node1 = CreateNode("w1/part_1/Assign", "Assign", {w_node1, zeros_node1}, &graph_def); - NodeDef* save_const_node = - CreateNode("save/Const", "Const", {}, &graph_def); - NodeDef* tensor_names_node1 = - CreateNode("save/RestoreV2/tensor_names", "Const", {}, &graph_def); - NodeDef* tensor_shapes_slices_node1 = CreateNode( - "save/RestoreV2/shape_and_slices", 
"Const", {}, &graph_def); - - Tensor shapes_slices_val1(DT_STRING, TensorShape({1})); - shapes_slices_val1.flat()(0) = "4 1 0,4:0,1"; - SetNodeTensorAttr("value", shapes_slices_val1, - tensor_shapes_slices_node1); - - NodeDef* restore_node1 = CreateNode( - "save/RestoreV2", "RestoreV2", - {save_const_node, tensor_names_node1, tensor_shapes_slices_node1}, - &graph_def); - CreateNode("save/Assign", "Assign", {w_node1, restore_node1}, &graph_def); + CreateNode("save/Assign", "Assign", {w_node1, restore_node}, &graph_def); w_node2 = CreateNode("w2/part_1", "VariableV2", {}, &graph_def); - zeros_node2 = - CreateNode("w2/part_1/Initializer/zeros", "Const", {}, &graph_def); + zeros_shape2 = CreateNode("w2/part_1/Initializer/zeros/shape_as_tensor", + "Const", {}, &graph_def); + zeros_const2 = CreateNode("w2/part_1/Initializer/zeros/Const", "Const", + {}, &graph_def); + zeros_node2 = CreateNode("w2/part_1/Initializer/zeros", "Fill", + {zeros_shape2, zeros_const2}, &graph_def); assign_node2 = CreateNode("w2/part_1/Assign", "Assign", {w_node2, zeros_node2}, &graph_def); - NodeDef* tensor_names_node2 = - CreateNode("save/RestoreV2_1/tensor_names", "Const", {}, &graph_def); - NodeDef* tensor_shapes_slices_node2 = CreateNode( - "save/RestoreV2_1/shape_and_slices", "Const", {}, &graph_def); - - Tensor shapes_slices_val2(DT_STRING, TensorShape({1})); - shapes_slices_val2.flat()(0) = "4 1 0,4:0,1"; - SetNodeTensorAttr("value", shapes_slices_val2, - tensor_shapes_slices_node2); - - NodeDef* restore_node2 = CreateNode( - "save/RestoreV2_1", "RestoreV2", - {save_const_node, tensor_names_node2, tensor_shapes_slices_node2}, - &graph_def); - CreateNode("save/Assign_1", "Assign", {w_node2, restore_node2}, + CreateNode("save/Assign_1", "Assign", {w_node2, restore_node}, &graph_def); BundleWriter writer(Env::Default(), checkpoint_path); @@ -322,6 +378,13 @@ class SparsifyGatherTest : public ::testing::Test { MakeGather("gather1", gather_v2, identity_node1, input_node, &graph_def); MakeGather("gather2", gather_v2, identity_node2, input_node, &graph_def); + NodeDef* concat_axis_node = + CreateNode("linear/concat/axis", "Const", {}, &graph_def); + NodeDef* concat_node = CreateNode( + "concat/node", "ConcatV2", + {identity_node1, identity_node2, concat_axis_node}, &graph_def); + SetNodeAttr("N", 2, concat_node); + // Shared init node if (include_shared_init) { if (!test_variable) { @@ -350,8 +413,14 @@ class SparsifyGatherTest : public ::testing::Test { MapNamesToNodes(result, &node_lookup); // Check nodes. + EXPECT_EQ(0, + node_lookup.count("w1/part_1/Initializer/zeros/shape_as_tensor")); + EXPECT_EQ(0, node_lookup.count("w1/part_1/Initializer/zeros/Const")); EXPECT_EQ(0, node_lookup.count("w1/part_1/Initializer/zeros")); EXPECT_EQ(0, node_lookup.count("w1/part_1/Assign")); + EXPECT_EQ(0, + node_lookup.count("w2/part_1/Initializer/zeros/shape_as_tensor")); + EXPECT_EQ(0, node_lookup.count("w2/part_1/Initializer/zeros/Const")); EXPECT_EQ(0, node_lookup.count("w2/part_1/Initializer/zeros")); EXPECT_EQ(0, node_lookup.count("w2/part_1/Assign")); EXPECT_EQ(1, node_lookup.count("ids")); @@ -487,6 +556,9 @@ class SparsifyGatherTest : public ::testing::Test { node_lookup.at("gather2/LookupTableFind")->input(2)); EXPECT_EQ("gather2/LookupTableFind", node_lookup.at("gather2")->input(0)); + EXPECT_EQ(0, node_lookup.count("linear/concat/axis")); + EXPECT_EQ(0, node_lookup.count("concat/node")); + // Check control deps. 
EXPECT_EQ(2, node_lookup.at(shared_init_name)->input_size()); EXPECT_NE(std::find(node_lookup.at(shared_init_name)->input().begin(), @@ -522,18 +594,31 @@ class SparsifyGatherTest : public ::testing::Test { }; TEST_F(SparsifyGatherTest, TestSinglePartition) { - TestSinglePartition(false, false, false); - TestSinglePartition(false, true, false); - TestSinglePartition(true, false, false); - TestSinglePartition(true, true, false); - TestSinglePartition(false, false, true); - TestSinglePartition(false, true, true); - TestSinglePartition(true, false, true); - TestSinglePartition(true, true, true); - TestSinglePartition(false, true, false, "shared_inits"); - TestSinglePartition(true, true, false, "shared_inits"); - TestSinglePartition(false, true, true, "shared_inits"); - TestSinglePartition(true, true, true, "shared_inits"); + TestSinglePartition(false, false, false, false); + TestSinglePartition(false, true, false, false); + TestSinglePartition(true, false, false, false); + TestSinglePartition(true, true, false, false); + TestSinglePartition(false, false, true, false); + TestSinglePartition(false, true, true, false); + TestSinglePartition(true, false, true, false); + TestSinglePartition(true, true, true, false); + TestSinglePartition(false, true, false, false, "shared_inits"); + TestSinglePartition(true, true, false, false, "shared_inits"); + TestSinglePartition(false, true, true, false, "shared_inits"); + TestSinglePartition(true, true, true, false, "shared_inits"); + + TestSinglePartition(false, false, false, true); + TestSinglePartition(false, true, false, true); + TestSinglePartition(true, false, false, true); + TestSinglePartition(true, true, false, true); + TestSinglePartition(false, false, true, true); + TestSinglePartition(false, true, true, true); + TestSinglePartition(true, false, true, true); + TestSinglePartition(true, true, true, true); + TestSinglePartition(false, true, false, true, "shared_inits"); + TestSinglePartition(true, true, false, true, "shared_inits"); + TestSinglePartition(false, true, true, true, "shared_inits"); + TestSinglePartition(true, true, true, true, "shared_inits"); } TEST_F(SparsifyGatherTest, TestMultiPartition) { diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD index dbc81599de8..7717d8d7de2 100644 --- a/tensorflow/tools/lib_package/BUILD +++ b/tensorflow/tools/lib_package/BUILD @@ -99,6 +99,7 @@ genrule( "//third_party/hadoop:LICENSE.txt", "//third_party/eigen3:LICENSE", "//third_party/fft2d:LICENSE", + "@aws//:LICENSE", "@boringssl//:LICENSE", "@com_googlesource_code_re2//:LICENSE", "@cub_archive//:LICENSE.TXT", @@ -114,6 +115,7 @@ genrule( "@libxsmm_archive//:LICENSE", "@lmdb//:LICENSE", "@local_config_sycl//sycl:LICENSE.text", + "@nasm//:LICENSE", "@nsync//:LICENSE", "@png_archive//:LICENSE", "@protobuf_archive//:LICENSE", @@ -134,6 +136,7 @@ genrule( "//third_party/hadoop:LICENSE.txt", "//third_party/eigen3:LICENSE", "//third_party/fft2d:LICENSE", + "@aws//:LICENSE", "@boringssl//:LICENSE", "@com_googlesource_code_re2//:LICENSE", "@cub_archive//:LICENSE.TXT", @@ -149,6 +152,7 @@ genrule( "@libxsmm_archive//:LICENSE", "@lmdb//:LICENSE", "@local_config_sycl//sycl:LICENSE.text", + "@nasm//:LICENSE", "@nsync//:LICENSE", "@png_archive//:LICENSE", "@protobuf_archive//:LICENSE", diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index c87a3048fa5..bd2e3adb841 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -89,13 +89,20 @@ filegroup( 
"//third_party/eigen3:LICENSE", "//third_party/fft2d:LICENSE", "//third_party/hadoop:LICENSE.txt", + "@absl_py//absl/flags:LICENSE", + "@arm_neon_2_x86_sse//:LICENSE", + "@astor_archive//:LICENSE", + "@aws//:LICENSE", "@boringssl//:LICENSE", + "@com_google_absl//:LICENSE", "@com_googlesource_code_re2//:LICENSE", "@cub_archive//:LICENSE.TXT", "@curl//:COPYING", "@eigen_archive//:COPYING.MPL2", "@farmhash_archive//:COPYING", "@fft2d//:fft/readme.txt", + "@flatbuffers//:LICENSE.txt", + "@gast_archive//:PKG-INFO", "@gemmlowp//:LICENSE", "@gif_archive//:COPYING", "@grpc//:LICENSE", @@ -106,11 +113,15 @@ filegroup( "@lmdb//:LICENSE", "@local_config_sycl//sycl:LICENSE.text", "@grpc//third_party/nanopb:LICENSE.txt", + "@nasm//:LICENSE", "@nsync//:LICENSE", + "@pcre//:LICENCE", "@png_archive//:LICENSE", "@protobuf_archive//:LICENSE", "@six_archive//:LICENSE", "@snappy//:COPYING", + "@swig//:LICENSE", + "@termcolor_archive//:COPYING.txt", "@zlib_archive//:zlib.h", "@org_python_pypi_backports_weakref//:LICENSE", ] + if_mkl([ @@ -152,9 +163,10 @@ sh_binary( "//tensorflow/contrib/ndlstm:ndlstm", "//tensorflow/contrib/nn:nn_py", "//tensorflow/contrib/predictor:predictor_pip", - "//tensorflow/contrib/py2tf:py2tf_internal", + "//tensorflow/contrib/py2tf:py2tf", "//tensorflow/contrib/py2tf/converters:converters", "//tensorflow/contrib/py2tf/converters:test_lib", + "//tensorflow/contrib/py2tf/impl:impl", "//tensorflow/contrib/py2tf/pyct:pyct", "//tensorflow/contrib/py2tf/pyct/static_analysis:static_analysis", "//tensorflow/contrib/receptive_field:receptive_field_pip", diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py index 38a90073878..73d759eb130 100644 --- a/tensorflow/tools/pip_package/pip_smoke_test.py +++ b/tensorflow/tools/pip_package/pip_smoke_test.py @@ -65,7 +65,6 @@ BLACKLIST = [ "//tensorflow/contrib/framework:checkpoint_ops_testdata", "//tensorflow/contrib/bayesflow:reinforce_simple_example", "//tensorflow/contrib/bayesflow:examples/reinforce_simple/reinforce_simple_example.py", # pylint:disable=line-too-long - "//tensorflow/contrib/py2tf:py2tf_internal", "//tensorflow/contrib/timeseries/examples:predict", "//tensorflow/contrib/timeseries/examples:multivariate", "//tensorflow/contrib/timeseries/examples:known_anomaly", diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 62df6453fb5..d7fab2b93ac 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -29,16 +29,17 @@ from setuptools.dist import Distribution # This version string is semver compatible, but incompatible with pip. # For pip, we will remove all '-' characters from this string, and use the # result for pip. -_VERSION = '1.5.0-rc1' +_VERSION = '1.5.0' REQUIRED_PACKAGES = [ 'absl-py >= 0.1.6', 'astor >= 0.6.0', 'gast >= 0.2.0', + 'grpcio >= 1.8.6', 'numpy >= 1.12.1', 'six >= 1.10.0', 'protobuf >= 3.4.0', - 'tensorflow-tensorboard >= 0.4.0', + 'tensorflow-tensorboard >= 1.5.0, < 1.6.0', 'termcolor >= 1.1.0', ] @@ -79,13 +80,13 @@ CONSOLE_SCRIPTS = [ # is now declared by the tensorboard pip package. If we remove the # TensorBoard command, pip will inappropriately remove it during install, # even though the command is not removed, just moved to a different wheel. 
- 'tensorboard = tensorboard.main:main', + 'tensorboard = tensorboard.main:run_main', ] # pylint: enable=line-too-long # remove the tensorboard console script if building tf_nightly if 'tf_nightly' in project_name: - CONSOLE_SCRIPTS.remove('tensorboard = tensorboard.main:main') + CONSOLE_SCRIPTS.remove('tensorboard = tensorboard.main:run_main') TEST_PACKAGES = [ 'scipy >= 0.15.1', diff --git a/tensorflow/tools/test/file_name_test.py b/tensorflow/tools/test/file_name_test.py new file mode 100644 index 00000000000..16fb8a822d0 --- /dev/null +++ b/tensorflow/tools/test/file_name_test.py @@ -0,0 +1,48 @@ +#!/usr/bin/python +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Test that checks if we have any issues with case insensitive filesystems. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')) +ERROR_MESSAGE = """ +Files with same name but different case detected in directory: {} +""" + + +def main(): + # Make sure BASE_DIR ends with tensorflow. If it doesn't, we probably + # computed the wrong directory. 
+ if os.path.split(BASE_DIR)[-1] != 'tensorflow': + raise AssertionError( + "BASE_DIR = '%s' doesn't end with tensorflow" % BASE_DIR) + + for dirpath, dirnames, filenames in os.walk(BASE_DIR, followlinks=True): + lowercase_directories = [x.lower() for x in dirnames] + lowercase_files = [x.lower() for x in filenames] + + lowercase_dir_contents = lowercase_directories + lowercase_files + if len(lowercase_dir_contents) != len(set(lowercase_dir_contents)): + raise AssertionError(ERROR_MESSAGE.format(dirpath)) + + +if __name__ == '__main__': + main() diff --git a/tensorflow/tools/test/run_and_gather_logs_lib.py b/tensorflow/tools/test/run_and_gather_logs_lib.py index a953ed1b53d..3b4921bb983 100644 --- a/tensorflow/tools/test/run_and_gather_logs_lib.py +++ b/tensorflow/tools/test/run_and_gather_logs_lib.py @@ -136,7 +136,7 @@ def run_and_gather_logs(name, test_name, test_args, gpu_config = gpu_info_lib.gather_gpu_devices() if gpu_config: gpu_name = gpu_config[0].model - gpu_short_name_match = re.search(r"Tesla (K40|K80|P100)", gpu_name) + gpu_short_name_match = re.search(r"Tesla (K40|K80|P100|V100)", gpu_name) if gpu_short_name_match: gpu_short_name = gpu_short_name_match.group(0) test_adjusted_name = name + "|" + gpu_short_name.replace(" ", "_") diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index f7d9075032e..eca744a920c 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -114,16 +114,17 @@ def tf_workspace(path_prefix="", tf_repo_name=""): ], sha256 = "5996380e3e8b981f55d1c8d58e709c00dbb4806ba367be75d0925a68cc2f6478", strip_prefix = "abseil-cpp-720c017e30339fd1786ce4aac68bc8559736e53f", + build_file = str(Label("//third_party:com_google_absl.BUILD")), ) tf_http_archive( name = "eigen_archive", urls = [ - "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/14e1418fcf12.tar.gz", - "https://bitbucket.org/eigen/eigen/get/14e1418fcf12.tar.gz", + "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/2355b229ea4c.tar.gz", + "https://bitbucket.org/eigen/eigen/get/2355b229ea4c.tar.gz", ], - sha256 = "2b526c6888639025323fd4f2600533c0f982d304ea48e4f1663e8066bd9f6368", - strip_prefix = "eigen-eigen-14e1418fcf12", + sha256 = "0cadb31a35b514bf2dfd6b5d38205da94ef326ec6908fc3fd7c269948467214f", + strip_prefix = "eigen-eigen-2355b229ea4c", build_file = str(Label("//third_party:eigen.BUILD")), ) @@ -352,16 +353,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "protobuf_archive", urls = [ - "https://mirror.bazel.build/github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz", - "https://github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz", + "https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz", + "https://github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz", ], - sha256 = "e178a25c52efcb6b05988bdbeace4c0d3f2d2fe5b46696d1d9898875c3803d6a", - strip_prefix = "protobuf-b04e5cba356212e4e8c66c61bbe0c3a20537c5b9", - # TODO: remove patching when tensorflow stops linking same protos into - # multiple shared libraries loaded in runtime by python. 
- # This patch fixes a runtime crash when tensorflow is compiled - # with clang -O2 on Linux (see https://github.com/tensorflow/tensorflow/issues/8394) - patch_file = str(Label("//third_party/protobuf:add_noinlines.patch")), + sha256 = "846d907acf472ae233ec0882ef3a2d24edbbe834b80c305e867ac65a1f2c59e3", + strip_prefix = "protobuf-396336eb961b75f03b25824fe86cf6490fb75e3a", ) # We need to import the protobuf library under the names com_google_protobuf @@ -370,21 +366,21 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "com_google_protobuf", urls = [ - "https://mirror.bazel.build/github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz", - "https://github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz", + "https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz", + "https://github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz", ], - sha256 = "e178a25c52efcb6b05988bdbeace4c0d3f2d2fe5b46696d1d9898875c3803d6a", - strip_prefix = "protobuf-b04e5cba356212e4e8c66c61bbe0c3a20537c5b9", + sha256 = "846d907acf472ae233ec0882ef3a2d24edbbe834b80c305e867ac65a1f2c59e3", + strip_prefix = "protobuf-396336eb961b75f03b25824fe86cf6490fb75e3a", ) tf_http_archive( name = "com_google_protobuf_cc", urls = [ - "https://mirror.bazel.build/github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz", - "https://github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz", + "https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz", + "https://github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz", ], - sha256 = "e178a25c52efcb6b05988bdbeace4c0d3f2d2fe5b46696d1d9898875c3803d6a", - strip_prefix = "protobuf-b04e5cba356212e4e8c66c61bbe0c3a20537c5b9", + sha256 = "846d907acf472ae233ec0882ef3a2d24edbbe834b80c305e867ac65a1f2c59e3", + strip_prefix = "protobuf-396336eb961b75f03b25824fe86cf6490fb75e3a", ) tf_http_archive( @@ -477,11 +473,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "llvm", urls = [ - "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/11a2ca6eea8a7fe240a14c0c35fd2017341279be.tar.gz", - "https://github.com/llvm-mirror/llvm/archive/11a2ca6eea8a7fe240a14c0c35fd2017341279be.tar.gz", + "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/299f8c346e1ab483463da5f02536ffd00b7ad9c6.tar.gz", + "https://github.com/llvm-mirror/llvm/archive/299f8c346e1ab483463da5f02536ffd00b7ad9c6.tar.gz", ], - sha256 = "b5429ccf8d57273cb8489714f728c997cd720ec66fc2c0292422ab8f0e729ce0", - strip_prefix = "llvm-11a2ca6eea8a7fe240a14c0c35fd2017341279be", + sha256 = "0556bc6a85000c573d92fe00946b6418cbcd3844912696a81055e4768299dda4", + strip_prefix = "llvm-299f8c346e1ab483463da5f02536ffd00b7ad9c6", build_file = str(Label("//third_party/llvm:llvm.BUILD")), ) @@ -560,6 +556,18 @@ def tf_workspace(path_prefix="", tf_repo_name=""): build_file = str(Label("//third_party:nccl.BUILD")), ) + tf_http_archive( + name = "kafka", + urls = [ + "https://mirror.bazel.build/github.com/edenhill/librdkafka/archive/v0.11.1.tar.gz", + "https://github.com/edenhill/librdkafka/archive/v0.11.1.tar.gz", + ], + sha256 = "dd035d57c8f19b0b612dd6eefe6e5eebad76f506e302cccb7c2066f25a83585e", + strip_prefix = "librdkafka-0.11.1", + build_file = str(Label("//third_party:kafka/BUILD")), + patch_file = 
str(Label("//third_party/kafka:config.patch")), + ) + tf_http_archive( name = "aws", urls = [ diff --git a/third_party/com_google_absl.BUILD b/third_party/com_google_absl.BUILD new file mode 100644 index 00000000000..8fca145f751 --- /dev/null +++ b/third_party/com_google_absl.BUILD @@ -0,0 +1,5 @@ +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) # Apache + +exports_files(["LICENSE"]) diff --git a/third_party/flatbuffers/flatbuffers.BUILD b/third_party/flatbuffers/flatbuffers.BUILD index f6b8e6ddb05..824c97be60e 100644 --- a/third_party/flatbuffers/flatbuffers.BUILD +++ b/third_party/flatbuffers/flatbuffers.BUILD @@ -4,6 +4,8 @@ package( licenses(["notice"]) # Apache 2.0 +exports_files(["LICENSE.txt"]) + config_setting( name = "freebsd", values = {"cpu": "freebsd"}, diff --git a/third_party/gast.BUILD b/third_party/gast.BUILD index 06db528ada2..4866982e1fd 100644 --- a/third_party/gast.BUILD +++ b/third_party/gast.BUILD @@ -3,7 +3,7 @@ licenses(["notice"]) # BSD 3-clause -exports_files(["LICENSE"]) +exports_files(["PKG-INFO"]) py_library( name = "gast", diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index 47cc4a5e69b..e23a533716a 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -835,7 +835,7 @@ def symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name, if src_dir != None: src_dir = _norm_path(src_dir) dest_dir = _norm_path(dest_dir) - files = _read_dir(repository_ctx, src_dir) + files = '\n'.join(sorted(_read_dir(repository_ctx, src_dir).splitlines())) # Create a list with the src_dir stripped to use for outputs. dest_files = files.replace(src_dir, '').splitlines() src_files = files.splitlines() diff --git a/third_party/kafka/BUILD b/third_party/kafka/BUILD new file mode 100644 index 00000000000..a61a9e1f6c2 --- /dev/null +++ b/third_party/kafka/BUILD @@ -0,0 +1,147 @@ +# Description: +# Kafka C/C++ (librdkafka) client library + +licenses(["notice"]) # 2-clause BSD license + +exports_files(["LICENSE"]) + +cc_library( + name = "kafka", + srcs = [ + "config.h", + "src-cpp/ConfImpl.cpp", + "src-cpp/ConsumerImpl.cpp", + "src-cpp/HandleImpl.cpp", + "src-cpp/KafkaConsumerImpl.cpp", + "src-cpp/MessageImpl.cpp", + "src-cpp/MetadataImpl.cpp", + "src-cpp/QueueImpl.cpp", + "src-cpp/RdKafka.cpp", + "src-cpp/TopicImpl.cpp", + "src-cpp/TopicPartitionImpl.cpp", + "src/crc32c.c", + "src/crc32c.h", + "src/lz4.c", + "src/lz4.h", + "src/lz4frame.c", + "src/lz4frame.h", + "src/lz4frame_static.h", + "src/lz4hc.c", + "src/lz4hc.h", + "src/lz4opt.h", + "src/queue.h", + "src/rd.h", + "src/rdaddr.c", + "src/rdaddr.h", + "src/rdatomic.h", + "src/rdavg.h", + "src/rdavl.c", + "src/rdavl.h", + "src/rdbuf.c", + "src/rdbuf.h", + "src/rdcrc32.h", + "src/rddl.h", + "src/rdendian.h", + "src/rdgz.c", + "src/rdgz.h", + "src/rdinterval.h", + "src/rdkafka.c", + "src/rdkafka.h", + "src/rdkafka_assignor.c", + "src/rdkafka_assignor.h", + "src/rdkafka_broker.c", + "src/rdkafka_broker.h", + "src/rdkafka_buf.c", + "src/rdkafka_buf.h", + "src/rdkafka_cgrp.c", + "src/rdkafka_cgrp.h", + "src/rdkafka_conf.c", + "src/rdkafka_conf.h", + "src/rdkafka_event.h", + "src/rdkafka_feature.c", + "src/rdkafka_feature.h", + "src/rdkafka_int.h", + "src/rdkafka_interceptor.c", + "src/rdkafka_interceptor.h", + "src/rdkafka_lz4.c", + "src/rdkafka_lz4.h", + "src/rdkafka_metadata.c", + "src/rdkafka_metadata.h", + "src/rdkafka_metadata_cache.c", + "src/rdkafka_msg.c", + "src/rdkafka_msg.h", + "src/rdkafka_msgset.h", + 
"src/rdkafka_msgset_reader.c", + "src/rdkafka_msgset_writer.c", + "src/rdkafka_offset.c", + "src/rdkafka_offset.h", + "src/rdkafka_op.c", + "src/rdkafka_op.h", + "src/rdkafka_partition.c", + "src/rdkafka_partition.h", + "src/rdkafka_pattern.c", + "src/rdkafka_pattern.h", + "src/rdkafka_proto.h", + "src/rdkafka_queue.c", + "src/rdkafka_queue.h", + "src/rdkafka_range_assignor.c", + "src/rdkafka_request.c", + "src/rdkafka_request.h", + "src/rdkafka_roundrobin_assignor.c", + "src/rdkafka_sasl.c", + "src/rdkafka_sasl.h", + "src/rdkafka_sasl_int.h", + "src/rdkafka_sasl_plain.c", + "src/rdkafka_subscription.c", + "src/rdkafka_subscription.h", + "src/rdkafka_timer.c", + "src/rdkafka_timer.h", + "src/rdkafka_topic.c", + "src/rdkafka_topic.h", + "src/rdkafka_transport.c", + "src/rdkafka_transport.h", + "src/rdkafka_transport_int.h", + "src/rdlist.c", + "src/rdlist.h", + "src/rdlog.c", + "src/rdlog.h", + "src/rdports.c", + "src/rdports.h", + "src/rdposix.h", + "src/rdrand.c", + "src/rdrand.h", + "src/rdregex.c", + "src/rdregex.h", + "src/rdstring.c", + "src/rdstring.h", + "src/rdsysqueue.h", + "src/rdtime.h", + "src/rdtypes.h", + "src/rdunittest.c", + "src/rdunittest.h", + "src/rdvarint.c", + "src/rdvarint.h", + "src/snappy.c", + "src/snappy.h", + "src/tinycthread.c", + "src/tinycthread.h", + "src/xxhash.c", + "src/xxhash.h", + ], + hdrs = [ + "config.h", + ], + defines = [ + ], + includes = [ + "src", + "src-cpp", + ], + linkopts = [ + "-lpthread", + ], + visibility = ["//visibility:public"], + deps = [ + "@boringssl//:ssl", + ], +) diff --git a/third_party/kafka/config.patch b/third_party/kafka/config.patch new file mode 100644 index 00000000000..fa5c2d35b40 --- /dev/null +++ b/third_party/kafka/config.patch @@ -0,0 +1,44 @@ +diff -Naur a/config.h b/config.h +--- a/config.h 1970-01-01 00:00:00.000000000 +0000 ++++ b/config.h 2017-10-28 00:57:03.316957390 +0000 +@@ -0,0 +1,40 @@ ++#pragma once ++#define WITHOUT_OPTIMIZATION 0 ++#define ENABLE_DEVEL 0 ++#define ENABLE_REFCNT_DEBUG 0 ++#define ENABLE_SHAREDPTR_DEBUG 0 ++ ++#define HAVE_ATOMICS_32 1 ++#define HAVE_ATOMICS_32_SYNC 1 ++ ++#if (HAVE_ATOMICS_32) ++# if (HAVE_ATOMICS_32_SYNC) ++# define ATOMIC_OP32(OP1,OP2,PTR,VAL) __sync_ ## OP1 ## _and_ ## OP2(PTR, VAL) ++# else ++# define ATOMIC_OP32(OP1,OP2,PTR,VAL) __atomic_ ## OP1 ## _ ## OP2(PTR, VAL, __ATOMIC_SEQ_CST) ++# endif ++#endif ++ ++#define HAVE_ATOMICS_64 1 ++#define HAVE_ATOMICS_64_SYNC 1 ++ ++#if (HAVE_ATOMICS_64) ++# if (HAVE_ATOMICS_64_SYNC) ++# define ATOMIC_OP64(OP1,OP2,PTR,VAL) __sync_ ## OP1 ## _and_ ## OP2(PTR, VAL) ++# else ++# define ATOMIC_OP64(OP1,OP2,PTR,VAL) __atomic_ ## OP1 ## _ ## OP2(PTR, VAL, __ATOMIC_SEQ_CST) ++# endif ++#endif ++ ++ ++#define WITH_ZLIB 1 ++#define WITH_LIBDL 1 ++#define WITH_PLUGINS 0 ++#define WITH_SNAPPY 1 ++#define WITH_SOCKEM 1 ++#define WITH_SSL 1 ++#define WITH_SASL 0 ++#define WITH_SASL_SCRAM 0 ++#define WITH_SASL_CYRUS 0 ++#define HAVE_REGEX 1 ++#define HAVE_STRNDUP 1 diff --git a/third_party/llvm/llvm.BUILD b/third_party/llvm/llvm.BUILD index 5344525ba8b..a9e1341a03c 100644 --- a/third_party/llvm/llvm.BUILD +++ b/third_party/llvm/llvm.BUILD @@ -670,6 +670,28 @@ cc_library( ], ) +cc_library( + name = "aggressive_inst_combine", + srcs = glob([ + "lib/Transforms/AggressiveInstCombine/*.c", + "lib/Transforms/AggressiveInstCombine/*.cpp", + "lib/Transforms/AggressiveInstCombine/*.inc", + "lib/Transforms/AggressiveInstCombine/*.h", + ]), + hdrs = glob([ + "include/llvm/Transforms/AggressiveInstCombine/*.h", + 
"include/llvm/Transforms/AggressiveInstCombine/*.def", + "include/llvm/Transforms/AggressiveInstCombine/*.inc", + ]), + deps = [ + ":analysis", + ":config", + ":core", + ":support", + ":transform_utils", + ], +) + cc_library( name = "analysis", srcs = glob([ @@ -1405,6 +1427,7 @@ cc_library( "include/llvm/Transforms/IPO/*.inc", ]), deps = [ + ":aggressive_inst_combine", ":analysis", ":bit_reader", ":bit_writer", @@ -1931,6 +1954,7 @@ cc_library( "include/llvm/Transforms/IPO/SCCP.h", ]), deps = [ + ":aggressive_inst_combine", ":analysis", ":config", ":core", diff --git a/third_party/pcre.BUILD b/third_party/pcre.BUILD index e2cdec40295..3a8e7a10b43 100644 --- a/third_party/pcre.BUILD +++ b/third_party/pcre.BUILD @@ -1,6 +1,6 @@ licenses(["notice"]) # BSD -exports_files(["COPYING"]) +exports_files(["LICENCE"]) cc_library( name = "pcre", diff --git a/third_party/protobuf/add_noinlines.patch b/third_party/protobuf/add_noinlines.patch deleted file mode 100644 index af74798f067..00000000000 --- a/third_party/protobuf/add_noinlines.patch +++ /dev/null @@ -1,30 +0,0 @@ -diff -u -r a/src/google/protobuf/compiler/cpp/cpp_file.cc b/src/google/protobuf/compiler/cpp/cpp_file.cc ---- a/src/google/protobuf/compiler/cpp/cpp_file.cc 2017-02-10 23:55:34.000000000 +0100 -+++ b/src/google/protobuf/compiler/cpp/cpp_file.cc 2017-03-21 13:41:46.931979154 +0100 -@@ -557,7 +557,7 @@ - " $metadata$, $enum_descriptors$, $service_descriptors$);\n" - "}\n" - "\n" -- "void protobuf_AssignDescriptorsOnce() {\n" -+ "GOOGLE_ATTRIBUTE_NOINLINE void protobuf_AssignDescriptorsOnce() {\n" - " static GOOGLE_PROTOBUF_DECLARE_ONCE(once);\n" - " ::google::protobuf::GoogleOnceInit(&once, &protobuf_AssignDescriptors);\n" - "}\n" -@@ -656,7 +656,7 @@ - printer->Print( - "}\n" - "\n" -- "void InitDefaults() {\n" -+ "GOOGLE_ATTRIBUTE_NOINLINE void InitDefaults() {\n" - " static GOOGLE_PROTOBUF_DECLARE_ONCE(once);\n" - " ::google::protobuf::GoogleOnceInit(&once, &TableStruct::InitDefaultsImpl);\n" - "}\n"); -@@ -737,7 +737,7 @@ - printer->Print( - "}\n" - "\n" -- "void AddDescriptors() {\n" -+ "GOOGLE_ATTRIBUTE_NOINLINE void AddDescriptors() {\n" - " static GOOGLE_PROTOBUF_DECLARE_ONCE(once);\n" - " ::google::protobuf::GoogleOnceInit(&once, &AddDescriptorsImpl);\n" - "}\n"); diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl index c16eb3a12a8..954f21f5f8f 100644 --- a/third_party/py/python_configure.bzl +++ b/third_party/py/python_configure.bzl @@ -118,7 +118,7 @@ def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name, if src_dir != None: src_dir = _norm_path(src_dir) dest_dir = _norm_path(dest_dir) - files = _read_dir(repository_ctx, src_dir) + files = '\n'.join(sorted(_read_dir(repository_ctx, src_dir).splitlines())) # Create a list with the src_dir stripped to use for outputs. 
dest_files = files.replace(src_dir, '').splitlines() src_files = files.splitlines() diff --git a/third_party/repo.bzl b/third_party/repo.bzl index 11e9c842d2f..aa178fa8cab 100644 --- a/third_party/repo.bzl +++ b/third_party/repo.bzl @@ -27,7 +27,7 @@ def _wrap_bash_cmd(ctx, cmd): bazel_sh = _get_env_var(ctx, "BAZEL_SH") if not bazel_sh: fail("BAZEL_SH environment variable is not set") - cmd = [bazel_sh, "-c", " ".join(cmd)] + cmd = [bazel_sh, "-l", "-c", " ".join(cmd)] return cmd def _get_env_var(ctx, name): diff --git a/third_party/termcolor.BUILD b/third_party/termcolor.BUILD index 6000e3289de..655d7cb85e5 100644 --- a/third_party/termcolor.BUILD +++ b/third_party/termcolor.BUILD @@ -3,7 +3,7 @@ licenses(["notice"]) # MIT -exports_files(["LICENSE"]) +exports_files(["COPYING.txt"]) py_library( name = "termcolor",
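
The largest functional change in this patch is in `tensorflow/tools/graph_transforms/sparsify_gather.cc`: instead of repeatedly rescanning the graph for orphaned variables, `SparsifyGatherInternal` now keeps a per-node reference count and sweeps any node whose count drops to zero (skipping declared graph inputs/outputs and `RestoreV2` nodes). The sketch below restates that idea in plain Python on a toy graph; it is only an illustration of the technique, not the patch's C++, and every name in it is hypothetical.

```python
def sweep_dead_nodes(graph, removed, keep):
    """Delete the nodes in `removed`, plus anything whose reference count
    drops to zero as a result, never touching names listed in `keep`."""
    # Count how many times each node is referenced as an input
    # ("^name" marks a control dependency, as in a GraphDef).
    refs = {}
    for inputs in graph.values():
        for inp in inputs:
            refs[inp.lstrip("^")] = refs.get(inp.lstrip("^"), 0) + 1

    worklist = list(removed)
    while worklist:
        name = worklist.pop()
        if name in keep or name not in graph:
            continue
        # Removing this node releases one reference on each of its inputs;
        # inputs that reach zero references are swept as well.
        for inp in graph.pop(name):
            inp = inp.lstrip("^")
            refs[inp] = refs.get(inp, 0) - 1
            if refs[inp] == 0 and inp not in keep:
                worklist.append(inp)
    return graph


# Toy example: deleting the Assign also sweeps the now-unreferenced initializer.
g = {"w": [], "zeros": [], "assign": ["w", "zeros"], "out": ["w"]}
print(sweep_dead_nodes(g, removed=["assign"], keep={"out"}))
# -> {'w': [], 'out': ['w']}
```

The C++ in the patch additionally patches up nodes that lose inputs (for example rewriting a two-input `ConcatV2` to `Identity`), which the sketch above deliberately omits.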
| Version | CPU/GPU | Python Version | Compiler | Build Tools | cuDNN | CUDA |
|---------|---------|----------------|----------|-------------|-------|------|
| tensorflow-1.5.0-rc1 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A |
| tensorflow_gpu-1.5.0-rc1 | GPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | 7 | 9 |
| tensorflow-1.5.0 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A |
| tensorflow_gpu-1.5.0 | GPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | 7 | 9 |
| tensorflow-1.4.0 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A |
| tensorflow_gpu-1.4.0 | GPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | 6 | 8 |
| tensorflow-1.3.0 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A |
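
As a quick check against the table above, an installed package can report which configuration it corresponds to; this is only a minimal sketch using standard TensorFlow 1.x APIs:

```python
import tensorflow as tf

print(tf.VERSION)                    # e.g. '1.5.0'
print(tf.test.is_built_with_cuda())  # True only for the tensorflow_gpu builds above
```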