From b044a5d5e8296a487b48e65304eb3c601500a487 Mon Sep 17 00:00:00 2001
From: Jianwei Xie
Date: Mon, 12 Jun 2017 21:03:48 -0700
Subject: [PATCH] Adds label_vocab to all canned Classifiers.

PiperOrigin-RevId: 158804380
---
 tensorflow/python/estimator/canned/dnn.py     | 14 ++++++--
 .../estimator/canned/dnn_linear_combined.py   | 14 ++++++--
 .../canned/dnn_linear_combined_test.py        |  2 ++
 .../estimator/canned/dnn_testing_utils.py     | 32 ++++++++++++++++---
 tensorflow/python/estimator/canned/linear.py  |  8 +++--
 5 files changed, 59 insertions(+), 11 deletions(-)

diff --git a/tensorflow/python/estimator/canned/dnn.py b/tensorflow/python/estimator/canned/dnn.py
index 571d415741e..4cae5df8772 100644
--- a/tensorflow/python/estimator/canned/dnn.py
+++ b/tensorflow/python/estimator/canned/dnn.py
@@ -196,6 +196,7 @@ class DNNClassifier(estimator.Estimator):
                model_dir=None,
                n_classes=2,
                weight_feature_key=None,
+               label_vocabulary=None,
                optimizer='Adagrad',
                activation_fn=nn.relu,
                dropout=None,
@@ -218,6 +219,13 @@ class DNNClassifier(estimator.Estimator):
       weight_feature_key: A string defining feature column name representing
         weights. It is used to down weight or boost examples during training.
         It will be multiplied by the loss of the example.
+      label_vocabulary: A list of strings represents possible label values. If
+        given, labels must be string type and have any value in
+        `label_vocabulary`. If it is not given, that means labels are
+        already encoded as integer or float within [0, 1] for `n_classes=2` and
+        encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
+        Also there will be errors if vocabulary is not provided and labels are
+        string.
       optimizer: An instance of `tf.Optimizer` used to train the model. If
         `None`, will use an Adagrad optimizer.
       activation_fn: Activation function applied to each layer. If `None`, will
@@ -230,10 +238,12 @@ class DNNClassifier(estimator.Estimator):
     """
     if n_classes == 2:
       head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint: disable=protected-access
-          weight_column=weight_feature_key)
+          weight_column=weight_feature_key,
+          label_vocabulary=label_vocabulary)
     else:
       head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
-          n_classes, weight_column=weight_feature_key)
+          n_classes, weight_column=weight_feature_key,
+          label_vocabulary=label_vocabulary)
     def _model_fn(features, labels, mode, config):
       return _dnn_model_fn(
           features=features,
diff --git a/tensorflow/python/estimator/canned/dnn_linear_combined.py b/tensorflow/python/estimator/canned/dnn_linear_combined.py
index d2be52b2698..f16227311db 100644
--- a/tensorflow/python/estimator/canned/dnn_linear_combined.py
+++ b/tensorflow/python/estimator/canned/dnn_linear_combined.py
@@ -308,6 +308,7 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
                dnn_dropout=None,
                n_classes=2,
                weight_feature_key=None,
+               label_vocabulary=None,
                input_layer_partitioner=None,
                config=None):
     """Initializes a DNNLinearCombinedClassifier instance.
@@ -337,6 +338,13 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
       weight_feature_key: A string defining feature column name representing
         weights. It is used to down weight or boost examples during training.
         It will be multiplied by the loss of the example.
+      label_vocabulary: A list of strings represents possible label values. If
+        given, labels must be string type and have any value in
+        `label_vocabulary`. If it is not given, that means labels are
+        already encoded as integer or float within [0, 1] for `n_classes=2` and
+        encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
+        Also there will be errors if vocabulary is not provided and labels are
+        string.
       input_layer_partitioner: Partitioner for input layer. Defaults to
         `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
       config: RunConfig object to configure the runtime settings.
@@ -354,11 +362,13 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
           'must be defined.')
     if n_classes == 2:
       head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint: disable=protected-access
-          weight_column=weight_feature_key)
+          weight_column=weight_feature_key,
+          label_vocabulary=label_vocabulary)
     else:
       head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
           n_classes,
-          weight_column=weight_feature_key)
+          weight_column=weight_feature_key,
+          label_vocabulary=label_vocabulary)
     def _model_fn(features, labels, mode, config):
       return _dnn_linear_combined_model_fn(
           features=features,
diff --git a/tensorflow/python/estimator/canned/dnn_linear_combined_test.py b/tensorflow/python/estimator/canned/dnn_linear_combined_test.py
index b890881a511..3cba3aeaea5 100644
--- a/tensorflow/python/estimator/canned/dnn_linear_combined_test.py
+++ b/tensorflow/python/estimator/canned/dnn_linear_combined_test.py
@@ -325,6 +325,7 @@ def _dnn_classifier_fn(
     model_dir=None,
     n_classes=2,
     weight_feature_key=None,
+    label_vocabulary=None,
     optimizer='Adagrad',
     config=None,
     input_layer_partitioner=None):
@@ -335,6 +336,7 @@ def _dnn_classifier_fn(
       dnn_optimizer=optimizer,
       n_classes=n_classes,
       weight_feature_key=weight_feature_key,
+      label_vocabulary=label_vocabulary,
      input_layer_partitioner=input_layer_partitioner,
      config=config)
diff --git a/tensorflow/python/estimator/canned/dnn_testing_utils.py b/tensorflow/python/estimator/canned/dnn_testing_utils.py
index d90f06cdc5f..1951f57adb7 100644
--- a/tensorflow/python/estimator/canned/dnn_testing_utils.py
+++ b/tensorflow/python/estimator/canned/dnn_testing_utils.py
@@ -624,7 +624,7 @@ class BaseDNNClassifierPredictTest(object):
       writer_cache.FileWriterCache.clear()
       shutil.rmtree(self._model_dir)
 
-  def test_one_dim(self):
+  def _test_one_dim(self, label_vocabulary, label_output_fn):
     """Asserts predictions for one-dimensional input and logits."""
     create_checkpoint(
         (([[.6, .5]], [.1, -.1]), ([[1., .8], [-.8, -1.]], [.2, -.2]),
@@ -634,6 +634,7 @@ class BaseDNNClassifierPredictTest(object):
 
     dnn_classifier = self._dnn_classifier_fn(
         hidden_units=(2, 2),
+        label_vocabulary=label_vocabulary,
         feature_columns=(feature_column.numeric_column('x'),),
         model_dir=self._model_dir)
     input_fn = numpy_io.numpy_input_fn(
@@ -654,10 +655,20 @@ class BaseDNNClassifierPredictTest(object):
            0.11105597], predictions[prediction_keys.PredictionKeys.PROBABILITIES])
     self.assertAllClose([0],
                         predictions[prediction_keys.PredictionKeys.CLASS_IDS])
-    self.assertAllEqual([b'0'],
+    self.assertAllEqual([label_output_fn(0)],
                         predictions[prediction_keys.PredictionKeys.CLASSES])
 
-  def test_multi_dim(self):
+  def test_one_dim_without_label_vocabulary(self):
+    self._test_one_dim(label_vocabulary=None,
+                       label_output_fn=lambda x: ('%s' % x).encode())
+
+  def test_one_dim_with_label_vocabulary(self):
+    n_classes = 2
+    self._test_one_dim(
+        label_vocabulary=['class_vocab_{}'.format(i) for i in range(n_classes)],
+        label_output_fn=lambda x: ('class_vocab_%s' % x).encode())
+
+  def _test_multi_dim_with_3_classes(self, label_vocabulary, label_output_fn):
     """Asserts predictions for multi-dimensional input and logits."""
     create_checkpoint(
         (([[.6, .5], [-.6, -.5]], [.1, -.1]),
@@ -669,6 +680,7 @@ class BaseDNNClassifierPredictTest(object):
     dnn_classifier = self._dnn_classifier_fn(
         hidden_units=(2, 2),
         feature_columns=(feature_column.numeric_column('x', shape=(2,)),),
+        label_vocabulary=label_vocabulary,
         n_classes=3,
         model_dir=self._model_dir)
     input_fn = numpy_io.numpy_input_fn(
@@ -698,7 +710,19 @@ class BaseDNNClassifierPredictTest(object):
     self.assertAllEqual(
         [1], predictions[prediction_keys.PredictionKeys.CLASS_IDS])
     self.assertAllEqual(
-        [b'1'], predictions[prediction_keys.PredictionKeys.CLASSES])
+        [label_output_fn(1)],
+        predictions[prediction_keys.PredictionKeys.CLASSES])
+
+  def test_multi_dim_with_3_classes_but_no_label_vocab(self):
+    self._test_multi_dim_with_3_classes(
+        label_vocabulary=None,
+        label_output_fn=lambda x: ('%s' % x).encode())
+
+  def test_multi_dim_with_3_classes_and_label_vocab(self):
+    n_classes = 3
+    self._test_multi_dim_with_3_classes(
+        label_vocabulary=['class_vocab_{}'.format(i) for i in range(n_classes)],
+        label_output_fn=lambda x: ('class_vocab_%s' % x).encode())
 
 
 class BaseDNNRegressorPredictTest(object):
diff --git a/tensorflow/python/estimator/canned/linear.py b/tensorflow/python/estimator/canned/linear.py
index 827453a20aa..e046ecc439f 100644
--- a/tensorflow/python/estimator/canned/linear.py
+++ b/tensorflow/python/estimator/canned/linear.py
@@ -179,10 +179,12 @@ class LinearClassifier(estimator.Estimator):
         weights. It is used to down weight or boost examples during training.
         It will be multiplied by the loss of the example.
       label_vocabulary: A list of strings represents possible label values. If
-        it is not given, that means labels are already encoded within [0, 1].
         If given, labels must be string type and have any value in
-        `label_vocabulary`. Also there will be errors if vocabulary is not
-        provided and labels are string.
+        `label_vocabulary`. If it is not given, that means labels are
+        already encoded as integer or float within [0, 1] for `n_classes=2` and
+        encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
+        Also there will be errors if vocabulary is not provided and labels are
+        string.
       optimizer: The optimizer used to train the model. If specified, it should
         be either an instance of `tf.Optimizer` or the SDCAOptimizer. If
         `None`, the Ftrl optimizer will be used.
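
For context, a minimal sketch of how the new `label_vocabulary` argument is used through the
public `tf.estimator.DNNClassifier` API of later TF 1.x releases. The feature name `x`, the
vocabulary strings, and the toy data are assumptions made for the example, not values taken
from this patch.

    import numpy as np
    import tensorflow as tf

    # With a vocabulary, labels can be fed as strings; without one, binary labels
    # must already be encoded as 0/1 (or 0..n_classes-1 for n_classes > 2).
    classifier = tf.estimator.DNNClassifier(
        hidden_units=[16, 8],
        feature_columns=[tf.feature_column.numeric_column('x', shape=(2,))],
        n_classes=2,
        label_vocabulary=['negative', 'positive'])

    # Toy training input: two 2-dimensional examples with string labels drawn
    # from the vocabulary above.
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={'x': np.array([[0., 1.], [1., 0.]], dtype=np.float32)},
        y=np.array([['positive'], ['negative']]),
        batch_size=2, num_epochs=None, shuffle=True)

    classifier.train(input_fn=train_input_fn, steps=100)

    # As the updated tests assert, the 'classes' prediction now contains
    # vocabulary entries (e.g. b'positive') rather than b'0' / b'1'.

The patch itself only forwards the vocabulary to the `head_lib` heads, which map string labels
to class indices during training and emit vocabulary strings under the `classes` prediction key,
as the updated dnn_testing_utils tests verify.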