From f00ca9720c961adf507b179e05f1f94eafcf7596 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 19 Feb 2019 01:30:54 +0000 Subject: [PATCH 01/10] Add StringLower op to support lower() for string operations This fix tries to address the issue raised in 25857 where there is no lower() for string operations yet. This fix adds StringLower to the kernel to support this op. This fix fixes 25857. Signed-off-by: Yong Tang --- tensorflow/core/kernels/BUILD | 7 +++ tensorflow/core/kernels/string_lower_op.cc | 52 ++++++++++++++++++++++ tensorflow/core/ops/string_ops.cc | 5 +++ 3 files changed, 64 insertions(+) create mode 100644 tensorflow/core/kernels/string_lower_op.cc diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 7b9bb112be6..df2f5603b1f 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -4920,6 +4920,7 @@ cc_library( ":string_format_op", ":string_join_op", ":string_length_op", + ":string_lower_op", ":string_split_op", ":string_strip_op", ":string_to_hash_bucket_op", @@ -5058,6 +5059,12 @@ tf_kernel_library( deps = STRING_DEPS, ) +tf_kernel_library( + name = "string_lower_op", + prefix = "string_lower_op", + deps = STRING_DEPS, +) + tf_kernel_library( name = "substr_op", prefix = "substr_op", diff --git a/tensorflow/core/kernels/string_lower_op.cc b/tensorflow/core/kernels/string_lower_op.cc new file mode 100644 index 00000000000..8d33ff5f7c7 --- /dev/null +++ b/tensorflow/core/kernels/string_lower_op.cc @@ -0,0 +1,52 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/string_ops.cc. + +#include + +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/str_util.h" + +namespace tensorflow { + +class StringLowerOp : public OpKernel { + public: + explicit StringLowerOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor* input_tensor; + OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor)); + Tensor* output_tensor; + OP_REQUIRES_OK( + ctx, ctx->allocate_output(0, input_tensor->shape(), &output_tensor)); + + const auto input = input_tensor->flat(); + auto output = output_tensor->flat(); + + for (int64 i = 0; i < input.size(); ++i) { + StringPiece entry(input(i)); + output(i) = str_util::Lowercase(entry); + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("StringLower").Device(DEVICE_CPU), StringLowerOp); + +} // namespace tensorflow diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc index d012ce67fd0..94d8dd0e47b 100644 --- a/tensorflow/core/ops/string_ops.cc +++ b/tensorflow/core/ops/string_ops.cc @@ -206,6 +206,11 @@ REGISTER_OP("StringSplitV2") return Status::OK(); }); +REGISTER_OP("StringLower") + .Input("input: string") + .Output("output: string") + .SetShapeFn(shape_inference::UnchangedShape); + REGISTER_OP("StringStrip") .Input("input: string") .Output("output: string") From a19e8352ae62e61d7909d972fbc9211c8b749bdd Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 19 Feb 2019 01:32:39 +0000 Subject: [PATCH 02/10] Expose strings.lower api endpoint based on StringLower op Signed-off-by: Yong Tang --- .../core/api_def/python_api/api_def_StringLower.pbtxt | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 tensorflow/core/api_def/python_api/api_def_StringLower.pbtxt diff --git a/tensorflow/core/api_def/python_api/api_def_StringLower.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringLower.pbtxt new file mode 100644 index 00000000000..27d6783e3a4 --- /dev/null +++ b/tensorflow/core/api_def/python_api/api_def_StringLower.pbtxt @@ -0,0 +1,6 @@ +op { + graph_op_name: "StringLower" + endpoint { + name: "strings.lower" + } +} From e0a90f81fd5fe3ce7283949a8350a32fe2d782b5 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 19 Feb 2019 01:33:02 +0000 Subject: [PATCH 03/10] Add test case for strings.lower op Signed-off-by: Yong Tang --- tensorflow/python/kernel_tests/BUILD | 14 ++++++ .../kernel_tests/string_lower_op_test.py | 48 +++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 tensorflow/python/kernel_tests/string_lower_op_test.py diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index b061e415b31..facd22ed8c8 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -1096,6 +1096,20 @@ tf_py_test( ], ) +tf_py_test( + name = "string_lower_op_test", + size = "small", + srcs = ["string_lower_op_test.py"], + additional_deps = [ + "//third_party/py/numpy", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:errors", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:string_ops", + ], +) + tf_py_test( name = "substr_op_test", size = "small", diff --git a/tensorflow/python/kernel_tests/string_lower_op_test.py b/tensorflow/python/kernel_tests/string_lower_op_test.py new file mode 100644 index 00000000000..ff8c4e32492 --- /dev/null +++ b/tensorflow/python/kernel_tests/string_lower_op_test.py @@ -0,0 +1,48 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for string_lower_op.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.ops import string_ops +from tensorflow.python.platform import test + + +class StringLowerOpTest(test.TestCase): + """ Test cases for tf.strings.lower.""" + + def test_string_lower(self): + strings = ["Pigs on The Wing", "aNimals"] + + with self.cached_session() as sess: + output = string_ops.string_lower(strings) + output = self.evaluate(output) + self.assertAllEqual(output, [b"pigs on the wing", b"animals"]) + + def test_string_lower_2d(self): + strings = [["pigS on THE wIng", "aniMals"], + [" hello ", "\n\tWorld! \r \n"]] + + with self.cached_session() as sess: + output = string_ops.string_lower(strings) + output = self.evaluate(output) + self.assertAllEqual(output, [[b"pigs on the wing", b"animals"], + [b" hello ", b"\n\tworld! \r \n"]]) + + +if __name__ == "__main__": + test.main() From 57816257b4b3aa8ece941f1b03907d48cc85c5c0 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 19 Feb 2019 01:42:34 +0000 Subject: [PATCH 04/10] Add StringUpper op to support upper() for string operations Signed-off-by: Yong Tang --- .../python_api/api_def_StringUpper.pbtxt | 6 +++ tensorflow/core/kernels/BUILD | 7 +++ tensorflow/core/kernels/string_upper_op.cc | 52 +++++++++++++++++++ tensorflow/core/ops/string_ops.cc | 5 ++ 4 files changed, 70 insertions(+) create mode 100644 tensorflow/core/api_def/python_api/api_def_StringUpper.pbtxt create mode 100644 tensorflow/core/kernels/string_upper_op.cc diff --git a/tensorflow/core/api_def/python_api/api_def_StringUpper.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringUpper.pbtxt new file mode 100644 index 00000000000..390501827ab --- /dev/null +++ b/tensorflow/core/api_def/python_api/api_def_StringUpper.pbtxt @@ -0,0 +1,6 @@ +op { + graph_op_name: "StringUpper" + endpoint { + name: "strings.upper" + } +} diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index df2f5603b1f..9d09659bd33 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -4924,6 +4924,7 @@ cc_library( ":string_split_op", ":string_strip_op", ":string_to_hash_bucket_op", + ":string_upper_op", ":substr_op", ":unicode_ops", ":unicode_script_op", @@ -5065,6 +5066,12 @@ tf_kernel_library( deps = STRING_DEPS, ) +tf_kernel_library( + name = "string_upper_op", + prefix = "string_upper_op", + deps = STRING_DEPS, +) + tf_kernel_library( name = "substr_op", prefix = "substr_op", diff --git a/tensorflow/core/kernels/string_upper_op.cc b/tensorflow/core/kernels/string_upper_op.cc new file mode 100644 index 00000000000..fef9304916d --- /dev/null +++ b/tensorflow/core/kernels/string_upper_op.cc @@ -0,0 +1,52 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/string_ops.cc. + +#include + +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/str_util.h" + +namespace tensorflow { + +class StringUpperOp : public OpKernel { + public: + explicit StringUpperOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor* input_tensor; + OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor)); + Tensor* output_tensor; + OP_REQUIRES_OK( + ctx, ctx->allocate_output(0, input_tensor->shape(), &output_tensor)); + + const auto input = input_tensor->flat(); + auto output = output_tensor->flat(); + + for (int64 i = 0; i < input.size(); ++i) { + StringPiece entry(input(i)); + output(i) = str_util::Uppercase(entry); + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("StringUpper").Device(DEVICE_CPU), StringUpperOp); + +} // namespace tensorflow diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc index 94d8dd0e47b..23c5791713d 100644 --- a/tensorflow/core/ops/string_ops.cc +++ b/tensorflow/core/ops/string_ops.cc @@ -211,6 +211,11 @@ REGISTER_OP("StringLower") .Output("output: string") .SetShapeFn(shape_inference::UnchangedShape); +REGISTER_OP("StringUpper") + .Input("input: string") + .Output("output: string") + .SetShapeFn(shape_inference::UnchangedShape); + REGISTER_OP("StringStrip") .Input("input: string") .Output("output: string") From 7cf89bd34d5e213d3c02dbf8befefb423016319a Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 19 Feb 2019 01:43:38 +0000 Subject: [PATCH 05/10] Add test case for strings.upper Signed-off-by: Yong Tang --- tensorflow/python/kernel_tests/BUILD | 14 ++++++ .../kernel_tests/string_upper_op_test.py | 48 +++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 tensorflow/python/kernel_tests/string_upper_op_test.py diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index facd22ed8c8..b1556631b91 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -1110,6 +1110,20 @@ tf_py_test( ], ) +tf_py_test( + name = "string_upper_op_test", + size = "small", + srcs = ["string_upper_op_test.py"], + additional_deps = [ + "//third_party/py/numpy", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:errors", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:string_ops", + ], +) + tf_py_test( name = "substr_op_test", size = "small", diff --git a/tensorflow/python/kernel_tests/string_upper_op_test.py b/tensorflow/python/kernel_tests/string_upper_op_test.py new file mode 100644 index 00000000000..56abebfffb1 --- /dev/null +++ b/tensorflow/python/kernel_tests/string_upper_op_test.py @@ -0,0 +1,48 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for string_upper_op.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.ops import string_ops +from tensorflow.python.platform import test + + +class StringUpperOpTest(test.TestCase): + """ Test cases for tf.strings.upper.""" + + def test_string_upper(self): + strings = ["Pigs on The Wing", "aNimals"] + + with self.cached_session() as sess: + output = string_ops.string_upper(strings) + output = self.evaluate(output) + self.assertAllEqual(output, [b"PIGS ON THE WING", b"ANIMALS"]) + + def test_string_upper_2d(self): + strings = [["pigS on THE wIng", "aniMals"], + [" hello ", "\n\tWorld! \r \n"]] + + with self.cached_session() as sess: + output = string_ops.string_upper(strings) + output = self.evaluate(output) + self.assertAllEqual(output, [[b"PIGS ON THE WING", b"ANIMALS"], + [b" HELLO ", b"\n\tWORLD! \r \n"]]) + + +if __name__ == "__main__": + test.main() From e2da5fd365ee3be600a389abd50da953d8dd8e12 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 19 Feb 2019 02:12:45 +0000 Subject: [PATCH 06/10] Update API compatibility with bazel-bin/tensorflow/tools/api/tests/api_compatibility_test --update_goldens True Signed-off-by: Yong Tang --- tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt | 8 ++++++++ tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt | 8 ++++++++ tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt | 8 ++++++++ tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt | 8 ++++++++ 4 files changed, 32 insertions(+) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index e30d28cdfa6..b4c2589869b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -3788,6 +3788,10 @@ tf_module { name: "StringLength" argspec: "args=[\'input\', \'unit\', \'name\'], varargs=None, keywords=None, defaults=[\'BYTE\', \'None\'], " } + member_method { + name: "StringLower" + argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "StringSplit" argspec: "args=[\'input\', \'delimiter\', \'skip_empty\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " @@ -3816,6 +3820,10 @@ tf_module { name: "StringToNumber" argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " } + member_method { + name: "StringUpper" + argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "Sub" argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt index ada8be91454..4eba1dcb5ed 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt @@ -16,6 +16,10 @@ tf_module { name: "length" argspec: "args=[\'input\', \'name\', \'unit\'], varargs=None, keywords=None, defaults=[\'None\', \'BYTE\'], " } + member_method { + name: "lower" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "reduce_join" argspec: "args=[\'inputs\', \'axis\', \'keep_dims\', \'separator\', \'name\', \'reduction_indices\', \'keepdims\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\', \'None\', \'None\'], " @@ -84,4 +88,8 @@ tf_module { name: "unicode_transcode" argspec: "args=[\'input\', \'input_encoding\', \'output_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], " } + member_method { + name: "upper" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } } diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index e30d28cdfa6..b4c2589869b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -3788,6 +3788,10 @@ tf_module { name: "StringLength" argspec: "args=[\'input\', \'unit\', \'name\'], varargs=None, keywords=None, defaults=[\'BYTE\', \'None\'], " } + member_method { + name: "StringLower" + argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "StringSplit" argspec: "args=[\'input\', \'delimiter\', \'skip_empty\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " @@ -3816,6 +3820,10 @@ tf_module { name: "StringToNumber" argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " } + member_method { + name: "StringUpper" + argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "Sub" argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt index e2da65eee41..0dc388268f1 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt @@ -16,6 +16,10 @@ tf_module { name: "length" argspec: "args=[\'input\', \'unit\', \'name\'], varargs=None, keywords=None, defaults=[\'BYTE\', \'None\'], " } + member_method { + name: "lower" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "reduce_join" argspec: "args=[\'inputs\', \'axis\', \'keepdims\', \'separator\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\'], " @@ -84,4 +88,8 @@ tf_module { name: "unicode_transcode" argspec: "args=[\'input\', \'input_encoding\', \'output_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], " } + member_method { + name: "upper" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } } From 5ebe920e241ac1e831635a8305f79cb157198782 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 19 Feb 2019 02:40:27 +0000 Subject: [PATCH 07/10] Add API defs with tensorflow/core/api_def/update_api_def.sh Signed-off-by: Yong Tang --- tensorflow/core/api_def/base_api/api_def_StringLower.pbtxt | 3 +++ tensorflow/core/api_def/base_api/api_def_StringUpper.pbtxt | 3 +++ 2 files changed, 6 insertions(+) create mode 100644 tensorflow/core/api_def/base_api/api_def_StringLower.pbtxt create mode 100644 tensorflow/core/api_def/base_api/api_def_StringUpper.pbtxt diff --git a/tensorflow/core/api_def/base_api/api_def_StringLower.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringLower.pbtxt new file mode 100644 index 00000000000..118bb66fad8 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_StringLower.pbtxt @@ -0,0 +1,3 @@ +op { + graph_op_name: "StringLower" +} diff --git a/tensorflow/core/api_def/base_api/api_def_StringUpper.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringUpper.pbtxt new file mode 100644 index 00000000000..40cd7a5a77b --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_StringUpper.pbtxt @@ -0,0 +1,3 @@ +op { + graph_op_name: "StringUpper" +} From b102e328987ab80db1d55f4f52a1a179d306b30a Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 3 Apr 2019 21:07:00 +0000 Subject: [PATCH 08/10] Add unicode support of strings.lower and strings.upper with additional encoding='' support, utf-8 is supported at the moment Signed-off-by: Yong Tang --- tensorflow/core/kernels/string_lower_op.cc | 26 +++++++++++++++--- tensorflow/core/kernels/string_upper_op.cc | 27 +++++++++++++++---- tensorflow/core/ops/string_ops.cc | 2 ++ .../kernel_tests/string_lower_op_test.py | 8 ++++++ .../kernel_tests/string_upper_op_test.py | 8 ++++++ 5 files changed, 62 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/kernels/string_lower_op.cc b/tensorflow/core/kernels/string_lower_op.cc index 8d33ff5f7c7..5fb9e2bcb0e 100644 --- a/tensorflow/core/kernels/string_lower_op.cc +++ b/tensorflow/core/kernels/string_lower_op.cc @@ -17,6 +17,8 @@ limitations under the License. #include +#include "unicode/unistr.h" // TF:icu + #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" @@ -28,7 +30,12 @@ namespace tensorflow { class StringLowerOp : public OpKernel { public: - explicit StringLowerOp(OpKernelConstruction* context) : OpKernel(context) {} + explicit StringLowerOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("encoding", &encoding_)); + OP_REQUIRES( + context, encoding_ != "" || encoding_ != "utf-8", + errors::InvalidArgument("only utf-8 or '' (no encoding) is supported, received ", encoding_)); + } void Compute(OpKernelContext* ctx) override { const Tensor* input_tensor; @@ -40,11 +47,22 @@ class StringLowerOp : public OpKernel { const auto input = input_tensor->flat(); auto output = output_tensor->flat(); - for (int64 i = 0; i < input.size(); ++i) { - StringPiece entry(input(i)); - output(i) = str_util::Lowercase(entry); + if (encoding_ == "") { + for (int64 i = 0; i < input.size(); ++i) { + StringPiece entry(input(i)); + output(i) = str_util::Lowercase(entry); + } + } else { + // The validation of utf-8 has already been done in GetAttr above. + for (int64 i = 0; i < input.size(); ++i) { + icu::UnicodeString us(input(i).c_str(), "UTF-8"); + us.toLower(); + us.toUTF8String(output(i)); + } } } + private: + string encoding_; }; REGISTER_KERNEL_BUILDER(Name("StringLower").Device(DEVICE_CPU), StringLowerOp); diff --git a/tensorflow/core/kernels/string_upper_op.cc b/tensorflow/core/kernels/string_upper_op.cc index fef9304916d..918bd0b63ae 100644 --- a/tensorflow/core/kernels/string_upper_op.cc +++ b/tensorflow/core/kernels/string_upper_op.cc @@ -17,6 +17,8 @@ limitations under the License. #include +#include "unicode/unistr.h" // TF:icu + #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" @@ -28,7 +30,12 @@ namespace tensorflow { class StringUpperOp : public OpKernel { public: - explicit StringUpperOp(OpKernelConstruction* context) : OpKernel(context) {} + explicit StringUpperOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("encoding", &encoding_)); + OP_REQUIRES( + context, encoding_ != "" || encoding_ != "utf-8", + errors::InvalidArgument("only utf-8 or '' (no encoding) is supported, received ", encoding_)); + } void Compute(OpKernelContext* ctx) override { const Tensor* input_tensor; @@ -39,12 +46,22 @@ class StringUpperOp : public OpKernel { const auto input = input_tensor->flat(); auto output = output_tensor->flat(); - - for (int64 i = 0; i < input.size(); ++i) { - StringPiece entry(input(i)); - output(i) = str_util::Uppercase(entry); + if (encoding_ == "") { + for (int64 i = 0; i < input.size(); ++i) { + StringPiece entry(input(i)); + output(i) = str_util::Uppercase(entry); + } + } else { + // The validation of utf-8 has already been done in GetAttr above. + for (int64 i = 0; i < input.size(); ++i) { + icu::UnicodeString us(input(i).c_str(), "UTF-8"); + us.toUpper(); + us.toUTF8String(output(i)); + } } } + private: + string encoding_; }; REGISTER_KERNEL_BUILDER(Name("StringUpper").Device(DEVICE_CPU), StringUpperOp); diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc index 23c5791713d..894a0bad326 100644 --- a/tensorflow/core/ops/string_ops.cc +++ b/tensorflow/core/ops/string_ops.cc @@ -209,11 +209,13 @@ REGISTER_OP("StringSplitV2") REGISTER_OP("StringLower") .Input("input: string") .Output("output: string") + .Attr("encoding: string =''") .SetShapeFn(shape_inference::UnchangedShape); REGISTER_OP("StringUpper") .Input("input: string") .Output("output: string") + .Attr("encoding: string =''") .SetShapeFn(shape_inference::UnchangedShape); REGISTER_OP("StringStrip") diff --git a/tensorflow/python/kernel_tests/string_lower_op_test.py b/tensorflow/python/kernel_tests/string_lower_op_test.py index ff8c4e32492..8650dfb0c0e 100644 --- a/tensorflow/python/kernel_tests/string_lower_op_test.py +++ b/tensorflow/python/kernel_tests/string_lower_op_test.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -43,6 +44,13 @@ class StringLowerOpTest(test.TestCase): self.assertAllEqual(output, [[b"pigs on the wing", b"animals"], [b" hello ", b"\n\tworld! \r \n"]]) + def test_string_upper_unicode(self): + strings = [["ÓÓSSCHLOË"]] + with self.cached_session() as sess: + output = string_ops.string_lower(strings, encoding='utf-8') + output = self.evaluate(output) + self.assertAllEqual(output, [[b"óósschloë"]]), + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/kernel_tests/string_upper_op_test.py b/tensorflow/python/kernel_tests/string_upper_op_test.py index 56abebfffb1..1edc303ba60 100644 --- a/tensorflow/python/kernel_tests/string_upper_op_test.py +++ b/tensorflow/python/kernel_tests/string_upper_op_test.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -43,6 +44,13 @@ class StringUpperOpTest(test.TestCase): self.assertAllEqual(output, [[b"PIGS ON THE WING", b"ANIMALS"], [b" HELLO ", b"\n\tWORLD! \r \n"]]) + def test_string_upper_unicode(self): + strings = [["óósschloë"]] + with self.cached_session() as sess: + output = string_ops.string_upper(strings, encoding='utf-8') + output = self.evaluate(output) + self.assertAllEqual(output, [[b"ÓÓSSCHLOË"]]), + if __name__ == "__main__": test.main() From 92828cb4ce3a38fc576e34060d184499cdbae0c3 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 3 Apr 2019 21:07:56 +0000 Subject: [PATCH 09/10] Update api golden Signed-off-by: Yong Tang --- tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt | 4 ++-- tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt | 4 ++-- tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt | 4 ++-- tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index b4c2589869b..70cb38f7b65 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -3790,7 +3790,7 @@ tf_module { } member_method { name: "StringLower" - argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], " } member_method { name: "StringSplit" @@ -3822,7 +3822,7 @@ tf_module { } member_method { name: "StringUpper" - argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], " } member_method { name: "Sub" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt index 4eba1dcb5ed..45ba9c33e2f 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt @@ -18,7 +18,7 @@ tf_module { } member_method { name: "lower" - argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], " } member_method { name: "reduce_join" @@ -90,6 +90,6 @@ tf_module { } member_method { name: "upper" - argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], " } } diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index b4c2589869b..70cb38f7b65 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -3790,7 +3790,7 @@ tf_module { } member_method { name: "StringLower" - argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], " } member_method { name: "StringSplit" @@ -3822,7 +3822,7 @@ tf_module { } member_method { name: "StringUpper" - argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], " } member_method { name: "Sub" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt index 0dc388268f1..043f730cdb6 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt @@ -18,7 +18,7 @@ tf_module { } member_method { name: "lower" - argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], " } member_method { name: "reduce_join" @@ -90,6 +90,6 @@ tf_module { } member_method { name: "upper" - argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], " } } From fda502e0a1a01136605bdddacb559865d0d4ef61 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Fri, 26 Apr 2019 16:20:45 +0000 Subject: [PATCH 10/10] Fix python 3 test failures caused by unicode string. Signed-off-by: Yong Tang --- tensorflow/python/kernel_tests/string_lower_op_test.py | 3 ++- tensorflow/python/kernel_tests/string_upper_op_test.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/kernel_tests/string_lower_op_test.py b/tensorflow/python/kernel_tests/string_lower_op_test.py index 8650dfb0c0e..1ed32ee099e 100644 --- a/tensorflow/python/kernel_tests/string_lower_op_test.py +++ b/tensorflow/python/kernel_tests/string_lower_op_test.py @@ -49,7 +49,8 @@ class StringLowerOpTest(test.TestCase): with self.cached_session() as sess: output = string_ops.string_lower(strings, encoding='utf-8') output = self.evaluate(output) - self.assertAllEqual(output, [[b"óósschloë"]]), + # output: "óósschloë" + self.assertAllEqual(output, [[b"\xc3\xb3\xc3\xb3sschlo\xc3\xab"]]) if __name__ == "__main__": diff --git a/tensorflow/python/kernel_tests/string_upper_op_test.py b/tensorflow/python/kernel_tests/string_upper_op_test.py index 1edc303ba60..dc14cb9be73 100644 --- a/tensorflow/python/kernel_tests/string_upper_op_test.py +++ b/tensorflow/python/kernel_tests/string_upper_op_test.py @@ -49,7 +49,8 @@ class StringUpperOpTest(test.TestCase): with self.cached_session() as sess: output = string_ops.string_upper(strings, encoding='utf-8') output = self.evaluate(output) - self.assertAllEqual(output, [[b"ÓÓSSCHLOË"]]), + # output: "ÓÓSSCHLOË" + self.assertAllEqual(output, [[b"\xc3\x93\xc3\x93SSCHLO\xc3\x8b"]]) if __name__ == "__main__":