Merge pull request #2518 from vrv/branch_123332988

Upstream changes from internal
Vijay Vasudevan 2016-05-26 15:08:06 -07:00
commit 15e51e6113
602 changed files with 943 additions and 321 deletions

View File

@ -53,6 +53,7 @@ def optimize_loss(loss,
clip_gradients=None,
moving_average_decay=0.9,
learning_rate_decay_fn=None,
update_ops=None,
variables=None,
name=None):
"""Given loss and parameters for optimizer, returns a training op.
@ -81,6 +82,8 @@ def optimize_loss(loss,
Can be used to implement any learning rate decay
functions.
For example: tf.train.exponential_decay.
update_ops: list of update `Operation`s to execute at each step. If `None`,
uses elements of UPDATE_OPS collection.
variables: list of variables to optimize or
`None` to use all trainable variables.
name: The name for this operation is used to scope operations and summaries.
@ -92,6 +95,15 @@ def optimize_loss(loss,
ValueError: if optimizer is wrong type.
"""
with vs.variable_op_scope([loss, global_step], name, "OptimizeLoss"):
# Update ops take UPDATE_OPS collection if not provided.
update_ops = (set(update_ops or []) or
set(ops.get_collection(ops.GraphKeys.UPDATE_OPS)))
# Make sure update ops are run before computing loss.
if update_ops:
with ops.control_dependencies(update_ops):
barrier = control_flow_ops.no_op(name="update_barrier")
loss = control_flow_ops.with_dependencies([barrier], loss)
# Moving average of the loss with decay.
if moving_average_decay is not None:
# Generate moving averages of the loss.
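For orientation, a minimal sketch (not part of this commit; the variable names are hypothetical) of how the new `update_ops` argument is used — the ops listed there run before the loss is evaluated, exactly as the control dependency above enforces:

import tensorflow as tf

global_step = tf.Variable(0, trainable=False, name="global_step")
weight = tf.Variable(2.0, name="weight")
x = tf.placeholder(tf.float32, [])
loss = tf.square(weight * x - 10.0)
# A hypothetical bookkeeping op that must run on every step, e.g. a counter.
counter = tf.Variable(0.0, trainable=False, name="counter")
update_op = tf.assign_add(counter, 1.0)
train_op = tf.contrib.layers.optimize_loss(
    loss, global_step, learning_rate=0.1, optimizer="SGD",
    update_ops=[update_op])
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    sess.run(train_op, feed_dict={x: 5.0})  # update_op runs, then loss/gradients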

View File

@ -132,6 +132,25 @@ class OptimizersTest(tf.test.TestCase):
tf.contrib.layers.optimize_loss(
loss, global_step, learning_rate=0.1, optimizer="SGD")
def testUpdateOp(self):
optimizers = ["SGD", tf.train.GradientDescentOptimizer,
tf.train.GradientDescentOptimizer(learning_rate=0.1)]
for optimizer in optimizers:
with tf.Graph().as_default() as g:
with self.test_session(graph=g) as session:
x, var, loss, global_step = _setup_model()
update_op = tf.assign(var, 20)
train = tf.contrib.layers.optimize_loss(loss,
global_step,
learning_rate=0.1,
optimizer=optimizer,
update_ops=[update_op])
tf.initialize_all_variables().run()
session.run(train, feed_dict={x: 5})
var_value, global_step_value = session.run([var, global_step])
# 19.5, due to update of var to 20 before loss computation.
self.assertEqual(var_value, 19.5)
self.assertEqual(global_step_value, 1)
if __name__ == "__main__":
tf.test.main()

View File

@ -195,7 +195,10 @@ def train(graph,
raise ValueError('No "global_step" was provided or found in the graph.')
# TODO(ipolosukhin): Replace all functionality of Supervisor with Monitors.
if not monitors:
if not supervisor_is_chief:
# Monitors should run only in the supervisor.
monitors = []
elif not monitors:
monitors = monitors_lib.get_default_monitors(
loss_op=loss_op,
summary_op=logging_ops.get_summary_op(),

View File

@ -26,8 +26,9 @@ from tensorflow.python.training import input as input_ops
def read_batch_examples(file_pattern, batch_size, reader,
randomize_input=True, queue_capacity=10000,
num_threads=1, name='dequeue_examples'):
randomize_input=True, num_epochs=None,
queue_capacity=10000, num_threads=1,
name=None):
"""Adds operations to read, queue, batch `Example` protos.
Given file pattern (or list of files), will setup a queue for file names,
@ -46,6 +47,10 @@ def read_batch_examples(file_pattern, batch_size, reader,
reader: A function or class that returns an object with
`read` method, (filename tensor) -> (example tensor).
randomize_input: Whether the input should be randomized.
num_epochs: Integer specifying the number of times to read through the
dataset. If `None`, cycles through the dataset forever.
NOTE - If specified, creates a variable that must be initialized, so call
`tf.initialize_all_variables()` as shown in the tests.
queue_capacity: Capacity for input queue.
num_threads: The number of threads enqueuing examples.
name: Name of resulting op.
@ -82,39 +87,47 @@ def read_batch_examples(file_pattern, batch_size, reader,
(batch_size, queue_capacity))
if (not num_threads) or (num_threads <= 0):
raise ValueError('Invalid num_threads %s.' % num_threads)
if (num_epochs is not None) and (num_epochs <= 0):
raise ValueError('Invalid num_epochs %s.' % num_epochs)
with ops.name_scope(name) as scope:
with ops.op_scope([file_pattern], name, 'read_batch_examples') as scope:
# Setup filename queue with shuffling.
with ops.name_scope('file_name_queue') as file_name_queue_scope:
file_name_queue = input_ops.string_input_producer(
constant_op.constant(file_names, name='input'),
shuffle=randomize_input, name=file_name_queue_scope)
shuffle=randomize_input, num_epochs=num_epochs,
name=file_name_queue_scope)
# Create reader and set it to read from filename queue.
# Create readers, one per thread, and set them to read from the filename queue.
with ops.name_scope('read'):
_, example_proto = reader().read(file_name_queue)
example_list = []
for _ in range(num_threads):
_, example_proto = reader().read(file_name_queue)
example_list.append([example_proto])
# Setup batching queue.
# Setup batching queue given list of read example tensors.
if randomize_input:
if isinstance(batch_size, ops.Tensor):
min_after_dequeue = int(queue_capacity * 0.4)
else:
min_after_dequeue = max(queue_capacity - (3 * batch_size), batch_size)
examples = input_ops.shuffle_batch(
[example_proto], batch_size, capacity=queue_capacity,
num_threads=num_threads, min_after_dequeue=min_after_dequeue,
examples = input_ops.shuffle_batch_join(
example_list, batch_size, capacity=queue_capacity,
min_after_dequeue=min_after_dequeue,
name=scope)
else:
examples = input_ops.batch(
[example_proto], batch_size, capacity=queue_capacity,
num_threads=num_threads, name=scope)
examples = input_ops.batch_join(
example_list, batch_size, capacity=queue_capacity,
name=scope)
return examples
def read_batch_features(file_pattern, batch_size, features, reader,
randomize_input=True, queue_capacity=10000,
num_threads=1, name='dequeue_examples'):
randomize_input=True, num_epochs=None,
queue_capacity=10000, reader_num_threads=1,
parser_num_threads=1,
name=None):
"""Adds operations to read, queue, batch and parse `Example` protos.
Given file pattern (or list of files), will setup a queue for file names,
@ -136,8 +149,13 @@ def read_batch_features(file_pattern, batch_size, features, reader,
reader: A function or class that returns an object with
`read` method, (filename tensor) -> (example tensor).
randomize_input: Whether the input should be randomized.
num_epochs: Integer specifying the number of times to read through the
dataset. If None, cycles through the dataset forever. NOTE - If specified,
creates a variable that must be initialized, so call
tf.initialize_all_variables() as shown in the tests.
queue_capacity: Capacity for input queue.
num_threads: The number of threads enqueuing examples.
reader_num_threads: The number of threads to read examples.
parser_num_threads: The number of threads to parse examples.
name: Name of resulting op.
Returns:
@ -146,17 +164,29 @@ def read_batch_features(file_pattern, batch_size, features, reader,
Raises:
ValueError: for invalid inputs.
"""
examples = read_batch_examples(
file_pattern, batch_size, reader, randomize_input,
queue_capacity, num_threads, name=name)
with ops.op_scope([file_pattern], name, 'read_batch_features') as scope:
examples = read_batch_examples(
file_pattern, batch_size, reader, randomize_input=randomize_input,
num_epochs=num_epochs, queue_capacity=queue_capacity,
num_threads=reader_num_threads, name=scope)
# Parse features into tensors.
return parsing_ops.parse_example(examples, features)
# Parse features into tensors in many threads and put on the queue.
features_list = []
for _ in range(parser_num_threads):
features_list.append(parsing_ops.parse_example(examples, features))
return input_ops.batch_join(
features_list,
batch_size=batch_size,
capacity=queue_capacity,
enqueue_many=True,
name='parse_example_batch_join')
def read_batch_record_features(file_pattern, batch_size, features,
randomize_input=True, queue_capacity=10000,
num_threads=1, name='dequeue_record_examples'):
randomize_input=True, num_epochs=None,
queue_capacity=10000, reader_num_threads=1,
parser_num_threads=1,
name='dequeue_record_examples'):
"""Reads TFRecord, queues, batches and parses `Example` proto.
See more detailed description in `read_examples`.
@ -168,8 +198,13 @@ def read_batch_record_features(file_pattern, batch_size, features,
features: A `dict` mapping feature keys to `FixedLenFeature` or
`VarLenFeature` values.
randomize_input: Whether the input should be randomized.
num_epochs: Integer specifying the number of times to read through the
dataset. If None, cycles through the dataset forever. NOTE - If specified,
creates a variable that must be initialized, so call
tf.initialize_all_variables() as shown in the tests.
queue_capacity: Capacity for input queue.
num_threads: The number of threads enqueuing examples.
reader_num_threads: The number of threads to read examples.
parser_num_threads: The number of threads to parse examples.
name: Name of resulting op.
Returns:
@ -181,5 +216,6 @@ def read_batch_record_features(file_pattern, batch_size, features,
return read_batch_features(
file_pattern=file_pattern, batch_size=batch_size, features=features,
reader=io_ops.TFRecordReader,
randomize_input=randomize_input,
queue_capacity=queue_capacity, num_threads=num_threads, name=name)
randomize_input=randomize_input, num_epochs=num_epochs,
queue_capacity=queue_capacity, reader_num_threads=reader_num_threads,
parser_num_threads=parser_num_threads, name=name)
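A minimal usage sketch of the revised signature (the file pattern and feature spec below are hypothetical); note the `num_epochs` caveat above — the epoch counter is a variable, so `tf.initialize_all_variables()` must be run before starting the queue runners:

import tensorflow as tf

features = tf.contrib.learn.io.read_batch_record_features(
    file_pattern="/tmp/data/*.tfrecord",  # hypothetical path
    batch_size=32,
    features={"label": tf.FixedLenFeature(shape=[1], dtype=tf.int64)},
    randomize_input=True,
    num_epochs=2,
    queue_capacity=10000,
    reader_num_threads=2,
    parser_num_threads=2)
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    coord = tf.train.Coordinator()
    tf.train.start_queue_runners(sess, coord=coord)
    # ...consume batches until the input queue reports it is out of range,
    # then call coord.request_stop().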

View File

@ -17,10 +17,13 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import random
import tempfile
import tensorflow as tf
from tensorflow.python.framework import errors
from tensorflow.python.framework import test_util
from tensorflow.python.platform import gfile
@ -55,44 +58,83 @@ class GraphIOTest(tf.test.TestCase):
self.assertRaisesRegexp(
ValueError, "No files match",
tf.contrib.learn.io.read_batch_features,
_INVALID_FILE_PATTERN, default_batch_size, None, tf.TFRecordReader,
False, queue_capacity,
num_threads, name)
tf.contrib.learn.io.read_batch_examples,
_INVALID_FILE_PATTERN, default_batch_size, tf.TFRecordReader,
False, num_epochs=None, queue_capacity=queue_capacity,
num_threads=num_threads, name=name)
self.assertRaisesRegexp(
ValueError, "Invalid batch_size",
tf.contrib.learn.io.read_batch_features,
_VALID_FILE_PATTERN, None, None, tf.TFRecordReader,
False, queue_capacity, num_threads, name)
tf.contrib.learn.io.read_batch_examples,
_VALID_FILE_PATTERN, None, tf.TFRecordReader,
False, num_epochs=None, queue_capacity=queue_capacity,
num_threads=num_threads, name=name)
self.assertRaisesRegexp(
ValueError, "Invalid batch_size",
tf.contrib.learn.io.read_batch_features,
_VALID_FILE_PATTERN, -1, None, tf.TFRecordReader,
False, queue_capacity, num_threads, name)
tf.contrib.learn.io.read_batch_examples,
_VALID_FILE_PATTERN, -1, tf.TFRecordReader,
False, num_epochs=None, queue_capacity=queue_capacity,
num_threads=num_threads, name=name)
self.assertRaisesRegexp(
ValueError, "Invalid queue_capacity",
tf.contrib.learn.io.read_batch_features,
_VALID_FILE_PATTERN, default_batch_size, None, tf.TFRecordReader,
False, None, num_threads, name)
tf.contrib.learn.io.read_batch_examples,
_VALID_FILE_PATTERN, default_batch_size, tf.TFRecordReader,
False, num_epochs=None, queue_capacity=None,
num_threads=num_threads, name=name)
self.assertRaisesRegexp(
ValueError, "Invalid num_threads",
tf.contrib.learn.io.read_batch_features,
_VALID_FILE_PATTERN, default_batch_size, None, tf.TFRecordReader,
False, queue_capacity, None,
name)
tf.contrib.learn.io.read_batch_examples,
_VALID_FILE_PATTERN, default_batch_size, tf.TFRecordReader,
False, num_epochs=None, queue_capacity=queue_capacity,
num_threads=None, name=name)
self.assertRaisesRegexp(
ValueError, "Invalid num_threads",
tf.contrib.learn.io.read_batch_features,
_VALID_FILE_PATTERN, default_batch_size, None, tf.TFRecordReader,
False, queue_capacity, -1,
name)
tf.contrib.learn.io.read_batch_examples,
_VALID_FILE_PATTERN, default_batch_size, tf.TFRecordReader,
False, num_epochs=None, queue_capacity=queue_capacity,
num_threads=-1, name=name)
self.assertRaisesRegexp(
ValueError, "Invalid batch_size",
tf.contrib.learn.io.read_batch_features,
_VALID_FILE_PATTERN, queue_capacity + 1, None, tf.TFRecordReader,
False, queue_capacity, 1, name)
tf.contrib.learn.io.read_batch_examples,
_VALID_FILE_PATTERN, queue_capacity + 1, tf.TFRecordReader,
False, num_epochs=None, queue_capacity=queue_capacity,
num_threads=1, name=name)
self.assertRaisesRegexp(
ValueError, "Invalid num_epochs",
tf.contrib.learn.io.read_batch_examples,
_VALID_FILE_PATTERN, default_batch_size, tf.TFRecordReader,
False, num_epochs=-1, queue_capacity=queue_capacity, num_threads=1,
name=name)
def test_batch_tf_record(self):
def test_batch_record_features(self):
batch_size = 17
queue_capacity = 1234
name = "my_batch"
features = {"feature": tf.FixedLenFeature(shape=[0], dtype=tf.float32)}
with tf.Graph().as_default() as g, self.test_session(graph=g) as sess:
features = tf.contrib.learn.io.read_batch_record_features(
_VALID_FILE_PATTERN, batch_size, features, randomize_input=False,
queue_capacity=queue_capacity, reader_num_threads=2,
parser_num_threads=2, name=name)
self.assertEquals("%s/parse_example_batch_join:0" % name,
features["feature"].name)
file_name_queue_name = "%s/file_name_queue" % name
file_names_name = "%s/input" % file_name_queue_name
example_queue_name = "%s/fifo_queue" % name
parse_example_queue_name = "%s/parse_example_batch_join" % name
op_nodes = test_util.assert_ops_in_graph({
file_names_name: "Const",
file_name_queue_name: "FIFOQueue",
"%s/read/TFRecordReader" % name: "TFRecordReader",
example_queue_name: "FIFOQueue",
parse_example_queue_name: "QueueDequeueMany",
name: "QueueDequeueMany"
}, g)
self.assertAllEqual(_FILE_NAMES, sess.run(["%s:0" % file_names_name])[0])
self.assertEqual(
queue_capacity, op_nodes[example_queue_name].attr["capacity"].i)
def test_one_epoch(self):
batch_size = 17
queue_capacity = 1234
name = "my_batch"
@ -100,20 +142,25 @@ class GraphIOTest(tf.test.TestCase):
with tf.Graph().as_default() as g, self.test_session(graph=g) as sess:
inputs = tf.contrib.learn.io.read_batch_examples(
_VALID_FILE_PATTERN, batch_size,
reader=tf.TFRecordReader, randomize_input=False,
reader=tf.TFRecordReader, randomize_input=True,
num_epochs=1,
queue_capacity=queue_capacity, name=name)
self.assertEquals("%s:0" % name, inputs.name)
file_name_queue_name = "%s/file_name_queue" % name
file_name_queue_limit_name = (
"%s/limit_epochs/epochs" % file_name_queue_name)
file_names_name = "%s/input" % file_name_queue_name
example_queue_name = "%s/fifo_queue" % name
example_queue_name = "%s/random_shuffle_queue" % name
op_nodes = test_util.assert_ops_in_graph({
file_names_name: "Const",
file_name_queue_name: "FIFOQueue",
"%s/read/TFRecordReader" % name: "TFRecordReader",
example_queue_name: "FIFOQueue",
name: "QueueDequeueMany"
example_queue_name: "RandomShuffleQueue",
name: "QueueDequeueMany",
file_name_queue_limit_name: "Variable"
}, g)
self.assertAllEqual(_FILE_NAMES, sess.run(["%s:0" % file_names_name])[0])
self.assertEqual(
set(_FILE_NAMES), set(sess.run(["%s:0" % file_names_name])[0]))
self.assertEqual(
queue_capacity, op_nodes[example_queue_name].attr["capacity"].i)
@ -143,6 +190,34 @@ class GraphIOTest(tf.test.TestCase):
self.assertEqual(
queue_capacity, op_nodes[example_queue_name].attr["capacity"].i)
def test_read_csv(self):
gfile.Glob = self._orig_glob
tempdir = tempfile.mkdtemp()
filename = os.path.join(tempdir, "file.csv")
gfile.Open(filename, "w").write("ABC\nDEF\nGHK\n")
batch_size = 1
queue_capacity = 5
name = "my_batch"
with tf.Graph().as_default() as g, self.test_session(graph=g) as session:
inputs = tf.contrib.learn.io.read_batch_examples(
filename, batch_size,
reader=tf.TextLineReader, randomize_input=False,
num_epochs=1, queue_capacity=queue_capacity, name=name)
session.run(tf.initialize_all_variables())
coord = tf.train.Coordinator()
tf.train.start_queue_runners(session, coord=coord)
self.assertAllEqual(session.run(inputs), [b"ABC"])
self.assertAllEqual(session.run(inputs), [b"DEF"])
self.assertAllEqual(session.run(inputs), [b"GHK"])
with self.assertRaises(errors.OutOfRangeError):
session.run(inputs)
coord.request_stop()
if __name__ == "__main__":
tf.test.main()

View File

@ -26,15 +26,17 @@ namespace tensorflow {
class SquaredLossUpdater : public DualLossUpdater {
public:
// Closed form solution that decreases the dual squared loss.
// See page 23 of http://arxiv.org/pdf/1309.2375v2.pdf
// See page 23 of http://arxiv.org/pdf/1309.2375v2.pdf for the derivation of
// the update rule when the example weights are equal to 1.0.
// Note: There is a typo in the formula in the paper: the denominator should
// be 1 + ||x_i||^2/(\lambda n) (without the 2 multiplier).
double ComputeUpdatedDual(const double label, const double example_weight,
const double current_dual, const double wx,
const double weighted_example_norm,
const double primal_loss_unused,
const double dual_loss_unused) const final {
const double delta_numerator = (label - current_dual - wx) * example_weight;
const double delta_denominator =
1 + weighted_example_norm * example_weight * example_weight * 0.5;
const double delta_numerator = label - current_dual - wx;
const double delta_denominator = 1 + weighted_example_norm * example_weight;
return current_dual + delta_numerator / delta_denominator;
}
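Written out (a restatement of the code above, assuming `weighted_example_norm` corresponds to the paper's ||x_i||^2/(\lambda n) term), the closed-form update is:

  \Delta\alpha_i = (y_i - \alpha_i - w \cdot x_i) / (1 + s_i \cdot ||x_i||^2 / (\lambda n))
  \alpha_i <- \alpha_i + \Delta\alpha_i

where y_i is the label, \alpha_i the current dual, s_i the example weight, and w \cdot x_i the current prediction (wx).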

View File

@ -455,6 +455,7 @@ class SdcaWithLogisticLossTest(SdcaOptimizerTest):
# TODO(katsiaspis): add a test for the case when examples at the end of an
# epoch are repeated, since example id may be duplicated.
class SdcaWithLinearLossTest(SdcaOptimizerTest):
"""SDCA optimizer test class for linear (squared) loss."""
@ -488,9 +489,11 @@ class SdcaWithLinearLossTest(SdcaOptimizerTest):
self.assertAllClose([-20.0 / 3.0, 28.0 / 3.0],
predictions.eval(),
rtol=0.005)
self.assertAllClose(0.01,
# Approximate gap should be very close to 0.0. (In fact, because the gap
# is only approximate, it is likely that upon convergence the duality gap
# can have a tiny negative value).
self.assertAllClose(0.00,
lr.approximate_duality_gap().eval(),
rtol=1e-2,
atol=1e-2)
def testL2Regularization(self):
@ -580,7 +583,7 @@ class SdcaWithLinearLossTest(SdcaOptimizerTest):
{'age': [1],
'gender': [1]}, 14.0, 2.0),
]
example_weights = [1.0, 1.0]
example_weights = [5.0, 3.0]
with self._single_threaded_test_session():
examples = make_example_dict(example_protos, example_weights)
@ -597,20 +600,30 @@ class SdcaWithLinearLossTest(SdcaOptimizerTest):
for _ in xrange(_MAX_ITERATIONS):
train_op.run()
# Predictions should be 8/9 of label due to minimizing regularized loss:
# (label - 2 * 2 * weight)^2 / 2 + L2 * 2 * weight^2
self.assertAllClose([-10.0 * 8 / 9, 14.0 * 8 / 9],
# There are 4 (sparse) variable weights to be learned. 2 for age and 2 for
# gender. Let w_1, w_2 be age weights, w_3, w_4 be gender weights, y_1,
# y_2 be the labels for examples 1 and 2 respectively and s_1, s_2 the
# corresponding *example* weights. With the given feature values, the loss
# function is given by:
# s_1/2(y_1 + 2w_1 + 2w_3)^2 + s_2/2(y_2 - 2w_2 - 2w_4)^2
# + \lambda/2 (w_1^2 + w_2^2 + w_3^2 + w_4^2). Solving for the optimal, it
# can be verified that:
# w_1* = w_3* = -2.0 s_1 y_1/(\lambda + 8 s_1) and
# w_2* = w_4* = 2 \cdot s_2 y_2/(\lambda + 8 s_2). Equivalently, due to
# regularization and example weights, the predictions are within:
# 8 \cdot s_i /(\lambda + 8 \cdot s_i) of the labels.
self.assertAllClose([-10 * 40.0 / 41.0, 14.0 * 24 / 25.0],
predictions.eval(),
rtol=0.07)
atol=0.01)
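Plugging the values from this test (s_1 = 5, s_2 = 3, \lambda = 1) into the factor 8 \cdot s_i /(\lambda + 8 \cdot s_i) quoted in the comment gives

  8 \cdot 5 / 41 = 40/41   and   8 \cdot 3 / 25 = 24/25,

so the expected predictions are -10 \cdot 40/41 and 14 \cdot 24/25, which is exactly what the assertion checks.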
def testDenseFeatures(self):
def testDenseFeaturesWithDefaultWeights(self):
with self._single_threaded_test_session():
examples = make_dense_examples_dict(
dense_feature_values=[[-2.0, 0.0], [0.0, 2.0]],
dense_feature_values=[[1.0, 0.0], [0.0, 1.0]],
weights=[1.0, 1.0],
labels=[-10.0, 14.0])
labels=[10.0, -5.0])
variables = make_dense_variable_dict(2, 2)
options = dict(symmetric_l2_regularization=1,
options = dict(symmetric_l2_regularization=1.0,
symmetric_l1_regularization=0,
loss_type='squared_loss')
lr = SdcaModel(CONTAINER, examples, variables, options)
@ -621,14 +634,51 @@ class SdcaWithLinearLossTest(SdcaOptimizerTest):
for _ in xrange(_MAX_ITERATIONS):
train_op.run()
# Predictions should be 4/5 of label due to minimizing regularized loss:
# (label - 2 * weight)^2 / 2 + L2 * weight^2
self.assertAllClose([-10.0 * 4 / 5, 14.0 * 4 / 5],
# The loss function for these particular features is given by:
# 1/2(label_1-w_1)^2 + 1/2(label_2-w_2)^2 + \lambda/2 (w_1^2 + w_2^2). So,
# differentiating w.r.t. w_1, w_2 yields the following optimal values:
# w_1* = label_1/(\lambda + 1) = 10/2, w_2* = label_2/(\lambda + 1) = -5/2.
# In this case the (unnormalized regularized) loss will be:
# 1/2(10-5)^2 + 1/2(5-5/2)^2 + 1/2(5^2 + (5/2)^2) = 125.0/4. The actual
# loss should be further normalized by the sum of example weights.
self.assertAllClose([5.0, -2.5],
predictions.eval(),
rtol=0.01)
loss = lr.regularized_loss(examples)
self.assertAllClose(148.0 / 10.0, loss.eval(), atol=0.01)
self.assertAllClose(125.0 / 8.0, loss.eval(), atol=0.01)
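As a worked check of the comment above (with \lambda = 1, labels 10 and -5):

  w_1* = 10/(\lambda + 1) = 5,   w_2* = -5/(\lambda + 1) = -5/2
  unnormalized loss = 1/2 (10 - 5)^2 + 1/2 (-5 + 5/2)^2 + 1/2 (5^2 + (5/2)^2)
                    = 12.5 + 25/8 + 125/8 = 125/4

and dividing by the sum of example weights (1.0 + 1.0 = 2) gives the asserted 125/8.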
def testDenseFeaturesWithArbitraryWeights(self):
with self._single_threaded_test_session():
examples = make_dense_examples_dict(
dense_feature_values=[[1.0, 0.0], [0.0, 1.0]],
weights=[20.0, 10.0],
labels=[10.0, -5.0])
variables = make_dense_variable_dict(2, 2)
options = dict(symmetric_l2_regularization=5.0,
symmetric_l1_regularization=0,
loss_type='squared_loss')
lr = SdcaModel(CONTAINER, examples, variables, options)
tf.initialize_all_variables().run()
predictions = lr.predictions(examples)
train_op = lr.minimize()
for _ in xrange(_MAX_ITERATIONS):
train_op.run()
# The loss function for these particular features is given by:
# 1/2 s_1 (label_1-w_1)^2 + 1/2 s_2(label_2-w_2)^2 +
# \lambda/2 (w_1^2 + w_2^2) where s_1, s_2 are the *example* weights. It
# turns out that the optimal (variable) weights are given by:
# w_1* = label_1 \cdot s_1/(\lambda + s_1) = 8.0 and
# w_2* = label_2 \cdot s_2/(\lambda + s_2) = -10/3.
# In this case the (unnormalized regularized) loss will be:
# s_1/2(8-10)^2 + s_2/2(5-10/3)^2 + 5.0/2(8^2 + (10/3)^2) = 2175.0/9. The
# actual loss should be further normalized by the sum of example weights.
self.assertAllClose([8.0, -10.0/3],
predictions.eval(),
rtol=0.01)
loss = lr.regularized_loss(examples)
self.assertAllClose(2175.0 / 270.0, loss.eval(), atol=0.01)
class SdcaWithHingeLossTest(SdcaOptimizerTest):

View File

@ -19,7 +19,10 @@ from __future__ import division
from __future__ import print_function
from tensorflow.contrib.losses.python.losses.loss_ops import absolute_difference
from tensorflow.contrib.losses.python.losses.loss_ops import add_loss
from tensorflow.contrib.losses.python.losses.loss_ops import cosine_distance
from tensorflow.contrib.losses.python.losses.loss_ops import get_losses
from tensorflow.contrib.losses.python.losses.loss_ops import get_total_loss
from tensorflow.contrib.losses.python.losses.loss_ops import log
from tensorflow.contrib.losses.python.losses.loss_ops import sigmoid_cross_entropy
from tensorflow.contrib.losses.python.losses.loss_ops import softmax_cross_entropy

View File

@ -104,9 +104,11 @@ weighted average over the individual prediction errors:
weight = tf.div(weight, tf.size(weight))
loss = tf.contrib.losses.sum_of_squares(predictions, depths, weight)
@@absolute_difference
@@add_loss
@@cosine_distance
@@get_losses
@@get_total_loss
@@log
@@sigmoid_cross_entropy
@@softmax_cross_entropy
@ -252,6 +254,61 @@ def _num_present(losses, weight, per_batch=False):
return num_per_batch if per_batch else math_ops.reduce_sum(num_per_batch)
def add_loss(loss):
"""Adds a externally defined loss to collection of losses.
Args:
loss: A loss `Tensor`.
"""
ops.add_to_collection(ops.GraphKeys.LOSSES, loss)
def get_losses(scope=None):
"""Gets the list of loss variables.
Args:
scope: an optional scope for filtering the losses to return.
Returns:
a list of loss variables.
"""
return ops.get_collection(ops.GraphKeys.LOSSES, scope)
def get_regularization_losses(scope=None):
"""Gets the regularization losses.
Args:
scope: an optional scope for filtering the losses to return.
Returns:
A list of loss variables.
"""
return ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES, scope)
def get_total_loss(add_regularization_losses=True, name="total_loss"):
"""Returns a tensor whose value represents the total loss.
Notice that the function adds the given losses to the regularization losses.
Args:
add_regularization_losses: A boolean indicating whether or not to use the
regularization losses in the sum.
name: The name of the returned tensor.
Returns:
A `Tensor` whose value represents the total loss.
Raises:
ValueError: if `losses` is not iterable.
"""
losses = get_losses()
if add_regularization_losses:
losses += get_regularization_losses()
return math_ops.add_n(losses, name=name)
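A minimal sketch of how these collection helpers compose (the two constant losses are hypothetical stand-ins for real loss tensors):

import tensorflow as tf

# Register two externally computed losses in the LOSSES collection.
tf.contrib.losses.add_loss(tf.constant(0.25, name="aux_loss_1"))
tf.contrib.losses.add_loss(tf.constant(0.75, name="aux_loss_2"))
print(tf.contrib.losses.get_losses())  # the two tensors added above
total = tf.contrib.losses.get_total_loss()  # also folds in REGULARIZATION_LOSSES
with tf.Session() as sess:
    print(sess.run(total))  # 1.0 here, since no regularization losses were added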
def absolute_difference(predictions, targets, weight=1.0, scope=None):
"""Adds an Absolute Difference loss to the training procedure.

View File

@ -16,6 +16,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/simple_placer.h"
#include <memory>
#include <set>
#include <utility>
#include <vector>
@ -182,6 +183,7 @@ class ColocationGraph {
Status ColocateNodes(const Node& x, const Node& y) {
int x_root = FindRoot(x.id());
int y_root = FindRoot(y.id());
Status s;
if (x_root != y_root) {
// Merge the sets by swinging the parent pointer of the smaller
@ -229,6 +231,12 @@ class ColocationGraph {
s.error_message());
}
// Transfer ids in the old group to the new one.
members_[new_root].ids_in_group.insert(
members_[old_root].ids_in_group.begin(),
members_[old_root].ids_in_group.end());
members_[old_root].ids_in_group.clear();
// Ensure that the common root has at least one supported device
// type, by computing the intersection of
// members_[new_root].supported_device_types and
@ -267,6 +275,9 @@ class ColocationGraph {
return Status::OK();
}
// String containing additional debugging info on failures.
string debug_info;
// We have not yet computed the possible devices for the
// colocated node set containing 'node', so we do so now using the
// constraints on the root node.
@ -310,6 +321,8 @@ class ColocationGraph {
// Return an error when a physical device that matches an explicit
// device specification is not found. This ensures that we don't
// assign a node to GPU when the user wanted to force it on CPU.
AddDebugInfo(node_root, &debug_info);
DeviceNameUtils::ParsedName specified_device_name;
if (DeviceNameUtils::ParseFullName(node->def().device(),
&specified_device_name) &&
@ -334,16 +347,17 @@ class ColocationGraph {
node->def().device(),
"' because no devices matching that specification "
"are registered in this process; available devices: ",
str_util::Join(device_names, ", "));
str_util::Join(device_names, ", "), debug_info);
} else if (specified_device_name.has_type) {
return errors::InvalidArgument(
"Could not satisfy explicit device specification '",
node->def().device(), "' because no supported kernel for ",
specified_device_name.type, " devices is available");
specified_device_name.type, " devices is available.",
debug_info);
} else {
return errors::InvalidArgument(
"Could not satisfy explicit device specification '",
node->def().device());
node->def().device(), debug_info);
}
} else {
// The specified device may be a valid device but the
@ -355,7 +369,7 @@ class ColocationGraph {
"required incompatible device '",
DeviceNameUtils::ParsedNameToString(
members_[node_root].device_name),
"'");
"'", debug_info);
}
}
} else {
@ -368,10 +382,11 @@ class ColocationGraph {
device_set_->devices(), members_[node_root].supported_device_types);
if (devices.empty()) {
AddDebugInfo(node_root, &debug_info);
return errors::InvalidArgument(
"Node had no OpKernel registered to support this operation: ",
"Operation was ", node->type_string(), " and inputs were ",
DataTypeVectorString(node->input_types()));
DataTypeVectorString(node->input_types()), debug_info);
}
}
@ -390,6 +405,15 @@ class ColocationGraph {
// id if it is a root. parent <= 0 indicates that this member is invalid.
int parent = -1;
// The set of ids that are part of the disjoint node set forest.
//
// This is only fully specified in the root of a disjoint
// node set forest.
std::set<int> ids_in_group;
// The type of the op for this node.
string op_type;
// A proxy for the depth of the tree that is used to prefer
// connecting smaller trees to larger trees when merging disjoint
// sets.
@ -410,8 +434,41 @@ class ColocationGraph {
std::vector<Device*> possible_devices;
};
// Adds debugging info to 'output' for the node referred to by
// 'node_root'.
void AddDebugInfo(const int node_root, string* output) {
if (members_[node_root].ids_in_group.size() > 1) {
strings::StrAppend(output, "\nColocation Debug Info:\n");
// If this node is part of a colocation group, then we want to
// collect the mapping of ops to supported devices, so that
// the user can see why an unsatisfiable placement occurred.
strings::StrAppend(
output, "Colocation group had the following types and devices: ");
std::unordered_map<string, string> type_to_devices;
for (const int id : members_[node_root].ids_in_group) {
const string& op_type = members_[id].op_type;
string devices_registered;
for (const auto& device_type : members_[id].supported_device_types) {
strings::StrAppend(&devices_registered, DeviceTypeString(device_type),
" ");
}
type_to_devices[op_type] = devices_registered;
}
for (const auto& td : type_to_devices) {
strings::StrAppend(output, "\n", td.first, ": ", td.second);
}
}
}
Status InitializeMember(const Node& node, Member* member) {
const int id = node.id();
member->ids_in_group.insert(id);
member->op_type = node.type_string();
if (id < 0) {
return errors::InvalidArgument("Node id was not positive: ", id);
}

View File

@ -729,6 +729,12 @@ TEST_F(SimplePlacerTest, TestHeterogeneousDeviceSetFailure) {
EXPECT_TRUE(StringPiece(s.error_message())
.contains("colocated with a group of nodes that required "
"incompatible device"));
// The error message should contain information that indicates which
// op types have which registered device types.
EXPECT_TRUE(StringPiece(s.error_message()).contains("VariableGPU: GPU")) << s;
EXPECT_TRUE(StringPiece(s.error_message()).contains("TestAssign: GPU CPU"))
<< s;
}
// Test that placement fails when an unknown device is requested.

View File

@ -13,75 +13,68 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/framework/op.h"
#include "third_party/eigen3/Eigen/Core"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/kernels/linalg_ops_common.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/binary_linalg_ops_common.h"
namespace tensorflow {
template <typename T>
class CholeskyGrad : public OpKernel {
template <typename Scalar, bool SupportsBatchOperationT>
class CholeskyGrad
: public BinaryLinearAlgebraOp<Scalar, SupportsBatchOperationT> {
public:
explicit CholeskyGrad(OpKernelConstruction* context) : OpKernel(context) {}
explicit CholeskyGrad(OpKernelConstruction* context)
: BinaryLinearAlgebraOp<Scalar, SupportsBatchOperationT>(context) {}
~CholeskyGrad() override {}
using Matrix =
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
using ConstMatrixMap = Eigen::Map<const Matrix>;
using MatrixMap = Eigen::Map<Matrix>;
using ConstRef = Eigen::Ref<const Matrix>;
using Ref = Eigen::Ref<Matrix>;
void Compute(OpKernelContext* context) override {
const Tensor& input_tensor_l = context->input(0);
const Tensor& input_tensor_grad = context->input(1);
// Check that input tensors represent a matrix.
OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_tensor_l.shape()),
errors::InvalidArgument("In[0] is not a matrix"));
OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_tensor_grad.shape()),
errors::InvalidArgument("In[1] is not a matrix"));
// Check that input tensors are square.
OP_REQUIRES(context,
input_tensor_l.dim_size(0) == input_tensor_l.dim_size(1),
errors::InvalidArgument("Input matrix must be square."));
OP_REQUIRES(context,
input_tensor_grad.dim_size(0) == input_tensor_grad.dim_size(1),
errors::InvalidArgument("Input matrix must be square."));
TensorShape GetOutputMatrixShape(
const TensorShape& input_matrix_l_full_shape,
const TensorShape& input_matrix_grad_shape) override {
return input_matrix_l_full_shape;
}
// Check that input tensors are of same size.
OP_REQUIRES(context,
input_tensor_l.dim_size(0) == input_tensor_grad.dim_size(0),
errors::InvalidArgument("Input matrices must be same size."));
// Create an output tensor
Tensor* output_tensor = NULL;
OP_REQUIRES_OK(context, context->allocate_output(
0, input_tensor_grad.shape(), &output_tensor));
if (output_tensor->NumElements() == 0) {
// the output shape is a 0-element matrix, so there is nothing to do.
return;
int64 GetCostPerUnit(const TensorShape& input_matrix_shape,
const TensorShape& rhs_matrix_shape) override {
const int64 rows = input_matrix_shape.dim_size(0);
if (rows > (1LL << 20)) {
// A big number to cap the cost in case of overflow.
return kint64max;
} else {
return rows * rows * rows;
}
// The next lines are necessary to get Eigen matrix behaviour.
const ConstMatrixMap input_matrix_l_full(input_tensor_l.flat<T>().data(),
input_tensor_l.dim_size(0),
input_tensor_l.dim_size(1));
const ConstMatrixMap input_matrix_grad(input_tensor_grad.flat<T>().data(),
input_tensor_grad.dim_size(0),
input_tensor_grad.dim_size(1));
MatrixMap output_matrix(output_tensor->template flat<T>().data(),
input_tensor_l.dim_size(0),
input_tensor_l.dim_size(1));
}
// Algorithm only depends on lower triangular half of input_tensor_l.
void ComputeMatrix(OpKernelContext* context,
const ConstMatrixMap& input_matrix_l_full,
const ConstMatrixMap& input_matrix_grad,
MatrixMap* output_matrix) override {
OP_REQUIRES(context,
input_matrix_l_full.rows() == input_matrix_l_full.cols(),
errors::InvalidArgument("Input matrix must be square."));
OP_REQUIRES(
context, input_matrix_l_full.cols() == input_matrix_grad.cols(),
errors::InvalidArgument(
"Input matrix and gradient must have same number of cols."));
OP_REQUIRES(
context, input_matrix_l_full.rows() == input_matrix_grad.rows(),
errors::InvalidArgument(
"Input matrix and gradient must have same number of rows."));
// Algorithm only depends on lower triangular half of input_matrix_l.
const Matrix input_matrix_l =
input_matrix_l_full.template triangularView<Eigen::Lower>();
// Algorithm only depends on lower triangular half of input_matrix_grad.
output_matrix = input_matrix_grad.template triangularView<Eigen::Lower>();
*output_matrix = input_matrix_grad.template triangularView<Eigen::Lower>();
const int64 kMatrixSize = input_matrix_l.rows();
const int64 kMaxBlockSize = 32;
@ -104,20 +97,21 @@ class CholeskyGrad : public OpKernel {
auto B = input_matrix_l.block(block_end, 0, trailing_size, block_begin);
auto B_bar =
output_matrix.block(block_end, 0, trailing_size, block_begin);
output_matrix->block(block_end, 0, trailing_size, block_begin);
auto C = input_matrix_l.block(block_end, block_begin, trailing_size,
block_size);
auto C_bar = output_matrix.block(block_end, block_begin, trailing_size,
block_size);
auto C_bar = output_matrix->block(block_end, block_begin, trailing_size,
block_size);
auto D = input_matrix_l.block(block_begin, block_begin, block_size,
block_size);
auto D_bar =
output_matrix.block(block_begin, block_begin, block_size, block_size);
auto D_bar = output_matrix->block(block_begin, block_begin, block_size,
block_size);
auto R = input_matrix_l.block(block_begin, 0, block_size, block_begin);
auto R_bar = output_matrix.block(block_begin, 0, block_size, block_begin);
auto R_bar =
output_matrix->block(block_begin, 0, block_size, block_begin);
C_bar = D.adjoint().template triangularView<Eigen::Upper>()
.solve(C_bar.adjoint()).adjoint();
@ -127,9 +121,11 @@ class CholeskyGrad : public OpKernel {
CholeskyGradUnblocked(D, D_bar);
R_bar -= (D_bar + D_bar.adjoint()) * R;
}
output_matrix = (0.5 * (output_matrix + output_matrix.transpose())).eval();
*output_matrix =
(0.5 * (*output_matrix + output_matrix->transpose())).eval();
}
void CholeskyGradUnblocked(const ConstRef l_block, Ref grad_block) {
void CholeskyGradUnblocked(const ConstRef& l_block, Ref grad_block) {
const int64 kMatrixSize = l_block.rows();
for (int64 k = kMatrixSize - 1; k >= 0; k--) {
/* This shows the block structure.
@ -166,6 +162,11 @@ class CholeskyGrad : public OpKernel {
}
};
REGISTER_LINALG_OP("CholeskyGrad", (CholeskyGrad<float>), float);
REGISTER_LINALG_OP("CholeskyGrad", (CholeskyGrad<double>), double);
REGISTER_BINARY_LINALG_OP("CholeskyGrad", (CholeskyGrad<float, false>), float);
REGISTER_BINARY_LINALG_OP("CholeskyGrad", (CholeskyGrad<double, false>),
double);
REGISTER_BINARY_LINALG_OP("BatchCholeskyGrad", (CholeskyGrad<float, true>),
float);
REGISTER_BINARY_LINALG_OP("BatchCholeskyGrad", (CholeskyGrad<double, true>),
double);
} // namespace tensorflow

View File

@ -64,8 +64,7 @@ class CudnnScratchAllocator : public perftools::gputools::ScratchAllocator {
AllocatorAttributes(), allocation_attr));
if (!allocation_status.ok()) {
return perftools::gputools::port::StatusOr<
perftools::gputools::DeviceMemory<uint8>>(
AsDeviceMemory<uint8>(nullptr, 0));
perftools::gputools::DeviceMemory<uint8>>();
}
// Hold the reference of the allocated tensors until the end of the
// allocator.

View File

@ -305,7 +305,7 @@ void SpatialAvgPool(OpKernelContext* context, Tensor* output,
const int out_offset =
(b * params.out_height + ph) * params.out_width + pw;
out_mat.col(out_offset) += in_mat.col(in_offset);
out_count(out_offset)++;
out_count(out_offset) += T(1);
}
}
}

View File

@ -3175,6 +3175,31 @@ op {
}
}
}
op {
name: "BatchCholeskyGrad"
input_arg {
name: "l"
type_attr: "T"
}
input_arg {
name: "grad"
type_attr: "T"
}
output_arg {
name: "output"
type_attr: "T"
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
}
}
}
}
op {
name: "BatchFFT"
input_arg {

View File

@ -129,11 +129,34 @@ REGISTER_OP("CholeskyGrad")
.Doc(R"doc(
Calculates the reverse mode backpropagated gradient of the Cholesky algorithm.
For an explanation see "Differentiation of the Cholesky algorithm" by Iain Murray http://arxiv.org/abs/1602.07527.
For an explanation see "Differentiation of the Cholesky algorithm" by
Iain Murray http://arxiv.org/abs/1602.07527.
l: Output of Cholesky algorithm l = chol(A). Shape is `[M, M]`. Algorithm depends only on lower triangular part of this matrix.
grad: df/dl where f is some scalar function. Shape is `[M, M]'. Algorithm depends only on lower triangular part of this matrix.
output: Symmetrized version of df/dA . Shape is `[M, M]'
l: Output of Cholesky algorithm l = chol(A). Shape is `[M, M]`.
Algorithm depends only on lower triangular part of this matrix.
grad: df/dl where f is some scalar function. Shape is `[M, M]'.
Algorithm depends only on lower triangular part of this matrix.
output: Symmetrized version of df/dA . Shape is `[M, M]'.
)doc");
REGISTER_OP("BatchCholeskyGrad")
.Input("l: T")
.Input("grad: T")
.Output("output: T")
.Attr("T: {float, double}")
.Doc(R"doc(
Calculates the reverse mode backpropagated gradient of the Cholesky algorithm.
For an explanation see "Differentiation of the Cholesky algorithm" by
Iain Murray http://arxiv.org/abs/1602.07527.
l: Output of batch Cholesky algorithm l = batch_cholesky(A). Shape is `[..., M, M]`.
Algorithm depends only on lower triangular part of the innermost matrices of
this tensor.
grad: df/dl where f is some scalar function. Shape is `[..., M, M]'.
Algorithm depends only on lower triangular part of the innermost matrices of
this tensor.
output: Symmetrized version of df/dA . Shape is `[..., M, M]'
)doc");
REGISTER_OP("SelfAdjointEig")

View File

@ -1397,6 +1397,36 @@ op {
summary: "Calculates the Cholesky decomposition of a batch of square matrices."
description: "The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices, with the same constraints as the single matrix Cholesky\ndecomposition above. The output is a tensor of the same shape as the input\ncontaining the Cholesky decompositions for all input submatrices `[..., :, :]`."
}
op {
name: "BatchCholeskyGrad"
input_arg {
name: "l"
description: "Output of batch Cholesky algorithm l = batch_cholesky(A). Shape is `[..., M, M]`.\nAlgorithm depends only on lower triangular part of the innermost matrices of\nthis tensor."
type_attr: "T"
}
input_arg {
name: "grad"
description: "df/dl where f is some scalar function. Shape is `[..., M, M]\'.\nAlgorithm depends only on lower triangular part of the innermost matrices of\nthis tensor."
type_attr: "T"
}
output_arg {
name: "output"
description: "Symmetrized version of df/dA . Shape is `[..., M, M]\'"
type_attr: "T"
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
}
}
}
summary: "Calculates the reverse mode backpropagated gradient of the Cholesky algorithm."
description: "For an explanation see \"Differentiation of the Cholesky algorithm\" by\nIain Murray http://arxiv.org/abs/1602.07527."
}
op {
name: "BatchFFT"
input_arg {
@ -2482,17 +2512,17 @@ op {
name: "CholeskyGrad"
input_arg {
name: "l"
description: "Output of Cholesky algorithm l = chol(A). Shape is `[M, M]`. Algorithm depends only on lower triangular part of this matrix."
description: "Output of Cholesky algorithm l = chol(A). Shape is `[M, M]`.\nAlgorithm depends only on lower triangular part of this matrix."
type_attr: "T"
}
input_arg {
name: "grad"
description: "df/dl where f is some scalar function. Shape is `[M, M]\'. Algorithm depends only on lower triangular part of this matrix."
description: "df/dl where f is some scalar function. Shape is `[M, M]\'.\nAlgorithm depends only on lower triangular part of this matrix."
type_attr: "T"
}
output_arg {
name: "output"
description: "Symmetrized version of df/dA . Shape is `[M, M]\'"
description: "Symmetrized version of df/dA . Shape is `[M, M]\'."
type_attr: "T"
}
attr {
@ -2506,7 +2536,7 @@ op {
}
}
summary: "Calculates the reverse mode backpropagated gradient of the Cholesky algorithm."
description: "For an explanation see \"Differentiation of the Cholesky algorithm\" by Iain Murray http://arxiv.org/abs/1602.07527."
description: "For an explanation see \"Differentiation of the Cholesky algorithm\" by\nIain Murray http://arxiv.org/abs/1602.07527."
}
op {
name: "Complex"
@ -11482,7 +11512,7 @@ op {
}
}
summary: "Computes the sum of elements across dimensions of a SparseTensor."
description: "This Op takes a SparseTensor and is the sparse counterpart to\n`tf.reduce_sum()`. In particular, this Op also returns a dense `Tensor`\ninstead of a sparse one.\n\nReduces `sp_input` along the dimensions given in `reduction_axes`. Unless\n`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in\n`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained\nwith length 1.\n\nIf `reduction_axes` has no entries, all dimensions are reduced, and a tensor\nwith a single element is returned."
description: "This Op takes a SparseTensor and is the sparse counterpart to\n`tf.reduce_sum()`. In particular, this Op also returns a dense `Tensor`\ninstead of a sparse one.\n\nReduces `sp_input` along the dimensions given in `reduction_axes`. Unless\n`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in\n`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained\nwith length 1.\n\nIf `reduction_axes` has no entries, all dimensions are reduced, and a tensor\nwith a single element is returned. Additionally, the axes can be negative,\nwhich are interpreted according to the indexing rules in Python."
}
op {
name: "SparseReorder"

View File

@ -52,11 +52,11 @@ def train():
# Input placeholders
with tf.name_scope('input'):
x = tf.placeholder(tf.float32, [None, 784], name='x-input')
y_ = tf.placeholder(tf.float32, [None, 10], name='y-input')
with tf.name_scope('input_reshape'):
image_shaped_input = tf.reshape(x, [-1, 28, 28, 1])
tf.image_summary('input', image_shaped_input, 10)
y_ = tf.placeholder(tf.float32, [None, 10], name='y-input')
keep_prob = tf.placeholder(tf.float32)
tf.scalar_summary('dropout_keep_probability', keep_prob)
# We can't initialize these variables to 0 - the network will get stuck.
def weight_variable(shape):
@ -105,7 +105,12 @@ def train():
return activations
hidden1 = nn_layer(x, 784, 500, 'layer1')
dropped = tf.nn.dropout(hidden1, keep_prob)
with tf.name_scope('dropout'):
keep_prob = tf.placeholder(tf.float32)
tf.scalar_summary('dropout_keep_probability', keep_prob)
dropped = tf.nn.dropout(hidden1, keep_prob)
y = nn_layer(dropped, 500, 10, 'layer2', act=tf.nn.softmax)
with tf.name_scope('cross_entropy'):
@ -151,9 +156,20 @@ def train():
summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False))
test_writer.add_summary(summary, i)
print('Accuracy at step %s: %s' % (i, acc))
else: # Record train set summarieis, and train
summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True))
train_writer.add_summary(summary, i)
else: # Record train set summaries, and train
if i % 100 == 99: # Record execution stats
run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()
summary, _ = sess.run([merged, train_step],
feed_dict=feed_dict(True),
options=run_options,
run_metadata=run_metadata)
train_writer.add_run_metadata(run_metadata, 'step%d' % i)
train_writer.add_summary(summary, i)
print('Adding run metadata for', i)
else: # Record a summary
summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True))
train_writer.add_summary(summary, i)
def main(_):

View File

@ -1338,9 +1338,9 @@ Variance of each batch member.
- - -
### `class tf.contrib.distributions.Gaussian` {#Gaussian}
### `class tf.contrib.distributions.Normal` {#Normal}
The scalar Gaussian distribution with mean and stddev parameters mu, sigma.
The scalar Normal distribution with mean and stddev parameters mu, sigma.
#### Mathematical details
@ -1353,15 +1353,15 @@ The PDF of this distribution is:
Examples of initialization of one or a batch of distributions.
```python
# Define a single scalar Gaussian distribution.
dist = tf.contrib.distributions.Gaussian(mu=0, sigma=3)
# Define a single scalar Normal distribution.
dist = tf.contrib.distributions.Normal(mu=0, sigma=3)
# Evaluate the cdf at 1, returning a scalar.
dist.cdf(1)
# Define a batch of two scalar valued Gaussians.
# Define a batch of two scalar valued Normals.
# The first has mean 1 and standard deviation 11, the second 2 and 22.
dist = tf.contrib.distributions.Gaussian(mu=[1, 2.], sigma=[11, 22.])
dist = tf.contrib.distributions.Normal(mu=[1, 2.], sigma=[11, 22.])
# Evaluate the pdf of the first distribution on 0, and the second on 1.5,
# returning a length two tensor.
@ -1374,9 +1374,9 @@ dist.sample(3)
Arguments are broadcast when possible.
```python
# Define a batch of two scalar valued Gaussians.
# Define a batch of two scalar valued Normals.
# Both have mean 1, but different standard deviations.
dist = tf.contrib.distributions.Gaussian(mu=1, sigma=[11, 22.])
dist = tf.contrib.distributions.Normal(mu=1, sigma=[11, 22.])
# Evaluate the pdf of both distributions on the same point, 3.0,
# returning a length 2 tensor.
@ -1384,9 +1384,9 @@ dist.pdf(3.0)
```
- - -
#### `tf.contrib.distributions.Gaussian.__init__(mu, sigma, name=None)` {#Gaussian.__init__}
#### `tf.contrib.distributions.Normal.__init__(mu, sigma, name=None)` {#Normal.__init__}
Construct Gaussian distributions with mean and stddev `mu` and `sigma`.
Construct Normal distributions with mean and stddev `mu` and `sigma`.
The parameters `mu` and `sigma` must be shaped in a way that supports
broadcasting (e.g. `mu + sigma` is a valid operation).
@ -1407,9 +1407,9 @@ broadcasting (e.g. `mu + sigma` is a valid operation).
- - -
#### `tf.contrib.distributions.Gaussian.cdf(x, name=None)` {#Gaussian.cdf}
#### `tf.contrib.distributions.Normal.cdf(x, name=None)` {#Normal.cdf}
CDF of observations in `x` under these Gaussian distribution(s).
CDF of observations in `x` under these Normal distribution(s).
##### Args:
@ -1425,16 +1425,16 @@ CDF of observations in `x` under these Gaussian distribution(s).
- - -
#### `tf.contrib.distributions.Gaussian.dtype` {#Gaussian.dtype}
#### `tf.contrib.distributions.Normal.dtype` {#Normal.dtype}
- - -
#### `tf.contrib.distributions.Gaussian.entropy(name=None)` {#Gaussian.entropy}
#### `tf.contrib.distributions.Normal.entropy(name=None)` {#Normal.entropy}
The entropy of Gaussian distribution(s).
The entropy of Normal distribution(s).
##### Args:
@ -1449,16 +1449,16 @@ The entropy of Gaussian distribution(s).
- - -
#### `tf.contrib.distributions.Gaussian.is_reparameterized` {#Gaussian.is_reparameterized}
#### `tf.contrib.distributions.Normal.is_reparameterized` {#Normal.is_reparameterized}
- - -
#### `tf.contrib.distributions.Gaussian.log_cdf(x, name=None)` {#Gaussian.log_cdf}
#### `tf.contrib.distributions.Normal.log_cdf(x, name=None)` {#Normal.log_cdf}
Log CDF of observations `x` under these Gaussian distribution(s).
Log CDF of observations `x` under these Normal distribution(s).
##### Args:
@ -1474,9 +1474,9 @@ Log CDF of observations `x` under these Gaussian distribution(s).
- - -
#### `tf.contrib.distributions.Gaussian.log_pdf(x, name=None)` {#Gaussian.log_pdf}
#### `tf.contrib.distributions.Normal.log_pdf(x, name=None)` {#Normal.log_pdf}
Log pdf of observations in `x` under these Gaussian distribution(s).
Log pdf of observations in `x` under these Normal distribution(s).
##### Args:
@ -1492,23 +1492,23 @@ Log pdf of observations in `x` under these Gaussian distribution(s).
- - -
#### `tf.contrib.distributions.Gaussian.mean` {#Gaussian.mean}
#### `tf.contrib.distributions.Normal.mean` {#Normal.mean}
- - -
#### `tf.contrib.distributions.Gaussian.mu` {#Gaussian.mu}
#### `tf.contrib.distributions.Normal.mu` {#Normal.mu}
- - -
#### `tf.contrib.distributions.Gaussian.pdf(x, name=None)` {#Gaussian.pdf}
#### `tf.contrib.distributions.Normal.pdf(x, name=None)` {#Normal.pdf}
The PDF of observations in `x` under these Gaussian distribution(s).
The PDF of observations in `x` under these Normal distribution(s).
##### Args:
@ -1524,9 +1524,9 @@ The PDF of observations in `x` under these Gaussian distribution(s).
- - -
#### `tf.contrib.distributions.Gaussian.sample(n, seed=None, name=None)` {#Gaussian.sample}
#### `tf.contrib.distributions.Normal.sample(n, seed=None, name=None)` {#Normal.sample}
Sample `n` observations from the Gaussian Distributions.
Sample `n` observations from the Normal Distributions.
##### Args:
@ -1544,7 +1544,7 @@ Sample `n` observations from the Gaussian Distributions.
- - -
#### `tf.contrib.distributions.Gaussian.sigma` {#Gaussian.sigma}
#### `tf.contrib.distributions.Normal.sigma` {#Normal.sigma}
@ -2443,26 +2443,26 @@ probability includes a combinatorial coefficient.
Functions that transform conjugate prior/likelihood pairs to distributions
representing the posterior or posterior predictive.
### Gaussian likelihood with conjugate prior.
### Normal likelihood with conjugate prior.
- - -
### `tf.contrib.distributions.gaussian_conjugates_known_sigma_posterior(prior, sigma, s, n)` {#gaussian_conjugates_known_sigma_posterior}
### `tf.contrib.distributions.normal_conjugates_known_sigma_posterior(prior, sigma, s, n)` {#normal_conjugates_known_sigma_posterior}
Posterior Gaussian distribution with conjugate prior on the mean.
Posterior Normal distribution with conjugate prior on the mean.
This model assumes that `n` observations (with sum `s`) come from a
Gaussian with unknown mean `mu` (described by the Gaussian `prior`)
Normal with unknown mean `mu` (described by the Normal `prior`)
and known variance `sigma^2`. The "known sigma posterior" is
the distribution of the unknown `mu`.
Accepts a prior Gaussian distribution object, having parameters
Accepts a prior Normal distribution object, having parameters
`mu0` and `sigma0`, as well as known `sigma` values of the predictive
distribution(s) (also assumed Gaussian),
distribution(s) (also assumed Normal),
and statistical estimates `s` (the sum(s) of the observations) and
`n` (the number(s) of observations).
Returns a posterior (also Gaussian) distribution object, with parameters
Returns a posterior (also Normal) distribution object, with parameters
`(mu', sigma'^2)`, where:
```
@ -2477,7 +2477,7 @@ will broadcast in the case of multidimensional sets of parameters.
##### Args:
* <b>`prior`</b>: `Gaussian` object of type `dtype`:
* <b>`prior`</b>: `Normal` object of type `dtype`:
the prior distribution having parameters `(mu0, sigma0)`.
* <b>`sigma`</b>: tensor of type `dtype`, taking values `sigma > 0`.
The known stddev parameter(s).
@ -2486,35 +2486,35 @@ will broadcast in the case of multidimensional sets of parameters.
##### Returns:
A new Gaussian posterior distribution object for the unknown observation
A new Normal posterior distribution object for the unknown observation
mean `mu`.
##### Raises:
* <b>`TypeError`</b>: if dtype of `s` does not match `dtype`, or `prior` is not a
Gaussian object.
Normal object.
- - -
### `tf.contrib.distributions.gaussian_congugates_known_sigma_predictive(prior, sigma, s, n)` {#gaussian_congugates_known_sigma_predictive}
### `tf.contrib.distributions.normal_congugates_known_sigma_predictive(prior, sigma, s, n)` {#normal_congugates_known_sigma_predictive}
Posterior predictive Gaussian distribution w. conjugate prior on the mean.
Posterior predictive Normal distribution w. conjugate prior on the mean.
This model assumes that `n` observations (with sum `s`) come from a
Gaussian with unknown mean `mu` (described by the Gaussian `prior`)
Normal with unknown mean `mu` (described by the Normal `prior`)
and known variance `sigma^2`. The "known sigma predictive"
is the distribution of new observations, conditioned on the existing
observations and our prior.
Accepts a prior Gaussian distribution object, having parameters
Accepts a prior Normal distribution object, having parameters
`mu0` and `sigma0`, as well as known `sigma` values of the predictive
distribution(s) (also assumed Gaussian),
distribution(s) (also assumed Normal),
and statistical estimates `s` (the sum(s) of the observations) and
`n` (the number(s) of observations).
Calculates the Gaussian distribution(s) `p(x | sigma^2)`:
Calculates the Normal distribution(s) `p(x | sigma^2)`:
```
p(x | sigma^2) = int N(x | mu, sigma^2) N(mu | prior.mu, prior.sigma^2) dmu
@ -2536,7 +2536,7 @@ will broadcast in the case of multidimensional sets of parameters.
##### Args:
* <b>`prior`</b>: `Gaussian` object of type `dtype`:
* <b>`prior`</b>: `Normal` object of type `dtype`:
the prior distribution having parameters `(mu0, sigma0)`.
* <b>`sigma`</b>: tensor of type `dtype`, taking values `sigma > 0`.
The known stddev parameter(s).
@ -2545,12 +2545,12 @@ will broadcast in the case of multidimensional sets of parameters.
##### Returns:
A new Gaussian predictive distribution object.
A new Normal predictive distribution object.
##### Raises:
* <b>`TypeError`</b>: if dtype of `s` does not match `dtype`, or `prior` is not a
Gaussian object.
Normal object.

View File

@ -339,7 +339,7 @@ Optimize weights given a loss.
- - -
### `tf.contrib.layers.optimize_loss(loss, global_step, learning_rate, optimizer, gradient_noise_scale=None, gradient_multipliers=None, clip_gradients=None, moving_average_decay=0.9, learning_rate_decay_fn=None, variables=None, name=None)` {#optimize_loss}
### `tf.contrib.layers.optimize_loss(loss, global_step, learning_rate, optimizer, gradient_noise_scale=None, gradient_multipliers=None, clip_gradients=None, moving_average_decay=0.9, learning_rate_decay_fn=None, update_ops=None, variables=None, name=None)` {#optimize_loss}
Given loss and parameters for optimizer, returns a training op.
@ -369,6 +369,8 @@ Given loss and parameters for optimizer, returns a training op.
Can be used to implement any learning rate decay
functions.
For example: tf.train.exponential_decay.
* <b>`update_ops`</b>: list of update `Operation`s to execute at each step. If `None`,
uses elements of UPDATE_OPS collection.
* <b>`variables`</b>: list of variables to optimize or
`None` to use all trainable variables.
* <b>`name`</b>: The name for this operation is used to scope operations and summaries.
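A hedged sketch of passing `update_ops` explicitly. The toy model and the bookkeeping variable are illustrative only, and the `"SGD"` string shorthand for the optimizer is assumed to be accepted:

```
import tensorflow as tf

x = tf.placeholder(tf.float32, shape=[None, 10])
y = tf.placeholder(tf.float32, shape=[None, 1])

w = tf.Variable(tf.zeros([10, 1]))
loss = tf.reduce_mean(tf.square(tf.matmul(x, w) - y))

global_step = tf.Variable(0, trainable=False, name="global_step")
step_count = tf.Variable(0, trainable=False)     # illustrative counter
count_update = tf.assign_add(step_count, 1)      # must run before each step

train_op = tf.contrib.layers.optimize_loss(
    loss, global_step, learning_rate=0.1, optimizer="SGD",
    update_ops=[count_update])
```

Leaving `update_ops=None` instead falls back to whatever has been added to the `UPDATE_OPS` collection, which is convenient when other layers register their own update ops there.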

View File

@ -3396,7 +3396,7 @@ Extracts numpy matrix from pandas DataFrame.
- - -
### `tf.contrib.learn.read_batch_examples(file_pattern, batch_size, reader, randomize_input=True, queue_capacity=10000, num_threads=1, name='dequeue_examples')` {#read_batch_examples}
### `tf.contrib.learn.read_batch_examples(file_pattern, batch_size, reader, randomize_input=True, num_epochs=None, queue_capacity=10000, num_threads=1, name=None)` {#read_batch_examples}
Adds operations to read, queue, batch `Example` protos.
@ -3418,6 +3418,10 @@ All ops are added to the default graph.
* <b>`reader`</b>: A function or class that returns an object with
`read` method, (filename tensor) -> (example tensor).
* <b>`randomize_input`</b>: Whether the input should be randomized.
* <b>`num_epochs`</b>: Integer specifying the number of times to read through the
dataset. If `None`, cycles through the dataset forever.
NOTE - If specified, creates a variable that must be initialized, so call
`tf.initialize_all_variables()` as shown in the tests.
* <b>`queue_capacity`</b>: Capacity for input queue.
* <b>`num_threads`</b>: The number of threads enqueuing examples.
* <b>`name`</b>: Name of resulting op.
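A hedged end-to-end sketch of the `num_epochs` behaviour described above; the file pattern is a placeholder and `tf.TFRecordReader` is just one possible reader:

```
import tensorflow as tf

examples = tf.contrib.learn.read_batch_examples(
    file_pattern="/tmp/data/*.tfrecord",   # placeholder pattern
    batch_size=32,
    reader=tf.TFRecordReader,
    randomize_input=True,
    num_epochs=2)

with tf.Session() as sess:
  # num_epochs creates a counter variable, so it must be initialized.
  sess.run(tf.initialize_all_variables())
  coord = tf.train.Coordinator()
  threads = tf.train.start_queue_runners(sess=sess, coord=coord)
  try:
    while not coord.should_stop():
      batch = sess.run(examples)
  except tf.errors.OutOfRangeError:
    pass  # raised once two full passes over the input files are done
  finally:
    coord.request_stop()
    coord.join(threads)
```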
@ -3434,7 +3438,7 @@ All ops are added to the default graph.
- - -
### `tf.contrib.learn.read_batch_features(file_pattern, batch_size, features, reader, randomize_input=True, queue_capacity=10000, num_threads=1, name='dequeue_examples')` {#read_batch_features}
### `tf.contrib.learn.read_batch_features(file_pattern, batch_size, features, reader, randomize_input=True, num_epochs=None, queue_capacity=10000, reader_num_threads=1, parser_num_threads=1, name=None)` {#read_batch_features}
Adds operations to read, queue, batch and parse `Example` protos.
@ -3459,8 +3463,13 @@ All ops are added to the default graph.
* <b>`reader`</b>: A function or class that returns an object with
`read` method, (filename tensor) -> (example tensor).
* <b>`randomize_input`</b>: Whether the input should be randomized.
* <b>`num_epochs`</b>: Integer specifying the number of times to read through the
dataset. If None, cycles through the dataset forever. NOTE - If specified,
creates a variable that must be initialized, so call
tf.initialize_all_variables() as shown in the tests.
* <b>`queue_capacity`</b>: Capacity for input queue.
* <b>`num_threads`</b>: The number of threads enqueuing examples.
* <b>`reader_num_threads`</b>: The number of threads to read examples.
* <b>`parser_num_threads`</b>: The number of threads to parse examples.
* <b>`name`</b>: Name of resulting op.
##### Returns:
@ -3475,7 +3484,7 @@ All ops are added to the default graph.
- - -
### `tf.contrib.learn.read_batch_record_features(file_pattern, batch_size, features, randomize_input=True, queue_capacity=10000, num_threads=1, name='dequeue_record_examples')` {#read_batch_record_features}
### `tf.contrib.learn.read_batch_record_features(file_pattern, batch_size, features, randomize_input=True, num_epochs=None, queue_capacity=10000, reader_num_threads=1, parser_num_threads=1, name='dequeue_record_examples')` {#read_batch_record_features}
Reads TFRecord, queues, batches and parses `Example` proto.
@ -3490,8 +3499,13 @@ See more detailed description in `read_examples`.
* <b>`features`</b>: A `dict` mapping feature keys to `FixedLenFeature` or
`VarLenFeature` values.
* <b>`randomize_input`</b>: Whether the input should be randomized.
* <b>`num_epochs`</b>: Integer specifying the number of times to read through the
dataset. If None, cycles through the dataset forever. NOTE - If specified,
creates a variable that must be initialized, so call
tf.initialize_all_variables() as shown in the tests.
* <b>`queue_capacity`</b>: Capacity for input queue.
* <b>`num_threads`</b>: The number of threads enqueuing examples.
* <b>`reader_num_threads`</b>: The number of threads to read examples.
* <b>`parser_num_threads`</b>: The number of threads to parse examples.
* <b>`name`</b>: Name of resulting op.
##### Returns:

View File

@ -1,19 +1,19 @@
### `tf.contrib.distributions.gaussian_conjugates_known_sigma_posterior(prior, sigma, s, n)` {#gaussian_conjugates_known_sigma_posterior}
### `tf.contrib.distributions.normal_conjugates_known_sigma_posterior(prior, sigma, s, n)` {#normal_conjugates_known_sigma_posterior}
Posterior Gaussian distribution with conjugate prior on the mean.
Posterior Normal distribution with conjugate prior on the mean.
This model assumes that `n` observations (with sum `s`) come from a
Gaussian with unknown mean `mu` (described by the Gaussian `prior`)
Normal with unknown mean `mu` (described by the Normal `prior`)
and known variance `sigma^2`. The "known sigma posterior" is
the distribution of the unknown `mu`.
Accepts a prior Gaussian distribution object, having parameters
Accepts a prior Normal distribution object, having parameters
`mu0` and `sigma0`, as well as known `sigma` values of the predictive
distribution(s) (also assumed Gaussian),
distribution(s) (also assumed Normal),
and statistical estimates `s` (the sum(s) of the observations) and
`n` (the number(s) of observations).
Returns a posterior (also Gaussian) distribution object, with parameters
Returns a posterior (also Normal) distribution object, with parameters
`(mu', sigma'^2)`, where:
```
@ -28,7 +28,7 @@ will broadcast in the case of multidimensional sets of parameters.
##### Args:
* <b>`prior`</b>: `Gaussian` object of type `dtype`:
* <b>`prior`</b>: `Normal` object of type `dtype`:
the prior distribution having parameters `(mu0, sigma0)`.
* <b>`sigma`</b>: tensor of type `dtype`, taking values `sigma > 0`.
The known stddev parameter(s).
@ -37,12 +37,12 @@ will broadcast in the case of multidimensional sets of parameters.
##### Returns:
A new Gaussian posterior distribution object for the unknown observation
A new Normal posterior distribution object for the unknown observation
mean `mu`.
##### Raises:
* <b>`TypeError`</b>: if dtype of `s` does not match `dtype`, or `prior` is not a
Gaussian object.
Normal object.
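A short sketch of the posterior variant, with the standard conjugate-Normal update noted in the comments purely as a reading aid; the constructor keywords and properties are assumed as in the sketch further above:

```
import tensorflow as tf

distributions = tf.contrib.distributions

prior = distributions.Normal(mu=0., sigma=1.)   # prior on the unknown mean
sigma = tf.constant(2.)                          # known observation stddev
s = tf.constant(25.)                             # sum of observations
n = tf.constant(10.)                             # number of observations

posterior = distributions.normal_conjugates_known_sigma_posterior(
    prior, sigma, s, n)

with tf.Session() as sess:
  # Standard conjugate update, for reference:
  #   mu'      = (mu0/sigma0^2 + s/sigma^2) / (1/sigma0^2 + n/sigma^2)
  #   sigma'^2 = 1 / (1/sigma0^2 + n/sigma^2)
  print(sess.run([posterior.mu, posterior.sigma]))
```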

View File

@ -1,4 +1,4 @@
### `tf.contrib.learn.read_batch_record_features(file_pattern, batch_size, features, randomize_input=True, queue_capacity=10000, num_threads=1, name='dequeue_record_examples')` {#read_batch_record_features}
### `tf.contrib.learn.read_batch_record_features(file_pattern, batch_size, features, randomize_input=True, num_epochs=None, queue_capacity=10000, reader_num_threads=1, parser_num_threads=1, name='dequeue_record_examples')` {#read_batch_record_features}
Reads TFRecord, queues, batches and parses `Example` proto.
@ -13,8 +13,13 @@ See more detailed description in `read_examples`.
* <b>`features`</b>: A `dict` mapping feature keys to `FixedLenFeature` or
`VarLenFeature` values.
* <b>`randomize_input`</b>: Whether the input should be randomized.
* <b>`num_epochs`</b>: Integer specifying the number of times to read through the
dataset. If None, cycles through the dataset forever. NOTE - If specified,
creates a variable that must be initialized, so call
tf.initialize_all_variables() as shown in the tests.
* <b>`queue_capacity`</b>: Capacity for input queue.
* <b>`num_threads`</b>: The number of threads enqueuing examples.
* <b>`reader_num_threads`</b>: The number of threads to read examples.
* <b>`parser_num_threads`</b>: The number of threads to parse examples.
* <b>`name`</b>: Name of resulting op.
##### Returns:
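Finally, a usage sketch for `read_batch_record_features`; the parsing spec, key names, and file pattern below are illustrative only:

```
import tensorflow as tf

# Illustrative parsing spec; keys and shapes are not from the library.
features = {
    "age": tf.FixedLenFeature([1], dtype=tf.int64),
    "query": tf.VarLenFeature(dtype=tf.string),
}

feature_map = tf.contrib.learn.read_batch_record_features(
    file_pattern="/tmp/records-*",   # placeholder pattern
    batch_size=128,
    features=features,
    randomize_input=True,
    num_epochs=1,
    reader_num_threads=2,
    parser_num_threads=2)
```

The result should be a dict mapping each feature key to a batched `Tensor` (or `SparseTensor` for variable-length features); as with `read_batch_examples`, a non-`None` `num_epochs` means the created counter variable must be initialized before the queue runners are started.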

Some files were not shown because too many files have changed in this diff.