diff --git a/tensorflow/core/api_def/base_api/api_def_StringLower.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringLower.pbtxt
new file mode 100644
index 00000000000..118bb66fad8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StringLower.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StringLower"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StringUpper.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringUpper.pbtxt
new file mode 100644
index 00000000000..40cd7a5a77b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StringUpper.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StringUpper"
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StringLower.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringLower.pbtxt
new file mode 100644
index 00000000000..27d6783e3a4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StringLower.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringLower"
+  endpoint {
+    name: "strings.lower"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StringUpper.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringUpper.pbtxt
new file mode 100644
index 00000000000..390501827ab
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StringUpper.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringUpper"
+  endpoint {
+    name: "strings.upper"
+  }
+}
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index ad7356586fb..adb9b0aa8fd 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -5066,9 +5066,11 @@ cc_library(
         ":string_format_op",
         ":string_join_op",
         ":string_length_op",
+        ":string_lower_op",
         ":string_split_op",
         ":string_strip_op",
         ":string_to_hash_bucket_op",
+        ":string_upper_op",
         ":substr_op",
         ":unicode_ops",
         ":unicode_script_op",
@@ -5204,6 +5206,24 @@ tf_kernel_library(
     deps = STRING_DEPS,
 )
 
+tf_kernel_library(
+    name = "string_lower_op",
+    prefix = "string_lower_op",
+    deps = STRING_DEPS + [
+        "@com_google_absl//absl/strings",
+        "@icu//:common",
+    ],
+)
+
+tf_kernel_library(
+    name = "string_upper_op",
+    prefix = "string_upper_op",
+    deps = STRING_DEPS + [
+        "@com_google_absl//absl/strings",
+        "@icu//:common",
+    ],
+)
+
 tf_kernel_library(
     name = "substr_op",
     prefix = "substr_op",
@@ -6193,6 +6213,8 @@ filegroup(
             "batch_kernels.*",
             "regex_full_match_op.cc",
             "regex_replace_op.cc",
+            "string_lower_op.cc",  # Requires ICU for Unicode case mapping.
+            "string_upper_op.cc",  # Requires ICU for Unicode case mapping.
             "unicode_ops.cc",
             "unicode_script_op.cc",
             # Ops that are inherently incompatible with Android (e.g. tied to x86 platform).
diff --git a/tensorflow/core/kernels/string_lower_op.cc b/tensorflow/core/kernels/string_lower_op.cc
new file mode 100644
index 00000000000..e24eedcc3ae
--- /dev/null
+++ b/tensorflow/core/kernels/string_lower_op.cc
@@ -0,0 +1,72 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/string_ops.cc.
+
+#include <string>
+
+#include "absl/strings/ascii.h"
+#include "unicode/unistr.h"  // TF:icu
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace tensorflow {
+
+// Lowercases each input string: ASCII-only if encoding is "", ICU for "utf-8".
+class StringLowerOp : public OpKernel {
+ public:
+  explicit StringLowerOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("encoding", &encoding_));
+    OP_REQUIRES(context, encoding_.empty() || encoding_ == "utf-8",
+                errors::InvalidArgument(
+                    "only utf-8 or '' (no encoding) is supported, received ",
+                    encoding_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* input_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
+    Tensor* output_tensor;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_output(0, input_tensor->shape(), &output_tensor));
+
+    const auto input = input_tensor->flat<string>();
+    auto output = output_tensor->flat<string>();
+
+    if (encoding_.empty()) {
+      for (int64 i = 0; i < input.size(); ++i) {
+        output(i) = absl::AsciiStrToLower(input(i));
+      }
+    } else {
+      // encoding_ is "utf-8" here; fromUTF8() keeps embedded NUL bytes.
+      for (int64 i = 0; i < input.size(); ++i) {
+        icu::UnicodeString us = icu::UnicodeString::fromUTF8(input(i));
+        us.toLower();
+        us.toUTF8String(output(i));
+      }
+    }
+  }
+
+ private:
+  string encoding_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("StringLower").Device(DEVICE_CPU), StringLowerOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/string_upper_op.cc b/tensorflow/core/kernels/string_upper_op.cc
new file mode 100644
index 00000000000..f2a1d33e7a6
--- /dev/null
+++ b/tensorflow/core/kernels/string_upper_op.cc
@@ -0,0 +1,71 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/string_ops.cc.
+
+#include <string>
+
+#include "absl/strings/ascii.h"
+#include "unicode/unistr.h"  // TF:icu
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace tensorflow {
+
+// Uppercases each input string: ASCII-only if encoding is "", ICU for "utf-8".
+class StringUpperOp : public OpKernel {
+ public:
+  explicit StringUpperOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("encoding", &encoding_));
+    OP_REQUIRES(context, encoding_.empty() || encoding_ == "utf-8",
+                errors::InvalidArgument(
+                    "only utf-8 or '' (no encoding) is supported, received ",
+                    encoding_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* input_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
+    Tensor* output_tensor;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_output(0, input_tensor->shape(), &output_tensor));
+
+    const auto input = input_tensor->flat<string>();
+    auto output = output_tensor->flat<string>();
+    if (encoding_.empty()) {
+      for (int64 i = 0; i < input.size(); ++i) {
+        output(i) = absl::AsciiStrToUpper(input(i));
+      }
+    } else {
+      // encoding_ is "utf-8" here; fromUTF8() keeps embedded NUL bytes.
+      for (int64 i = 0; i < input.size(); ++i) {
+        icu::UnicodeString us = icu::UnicodeString::fromUTF8(input(i));
+        us.toUpper();
+        us.toUTF8String(output(i));
+      }
+    }
+  }
+
+ private:
+  string encoding_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("StringUpper").Device(DEVICE_CPU), StringUpperOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index dd530dac043..4aefaad90d0 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -206,6 +206,18 @@ REGISTER_OP("StringSplitV2")
       return Status::OK();
     });
 
+REGISTER_OP("StringLower")
+    .Input("input: string")
+    .Output("output: string")
+    .Attr("encoding: string =''")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
+REGISTER_OP("StringUpper")
+    .Input("input: string")
+    .Output("output: string")
+    .Attr("encoding: string =''")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
 REGISTER_OP("StringStrip")
     .Input("input: string")
     .Output("output: string")
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 3e78874e06a..285060af801 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1118,6 +1118,34 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "string_lower_op_test",
+    size = "small",
+    srcs = ["string_lower_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:string_ops",
+    ],
+)
+
+tf_py_test(
+    name = "string_upper_op_test",
+    size = "small",
+    srcs = ["string_upper_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:string_ops",
+    ],
+)
+
 tf_py_test(
     name = "substr_op_test",
     size = "small",
diff --git a/tensorflow/python/kernel_tests/string_lower_op_test.py b/tensorflow/python/kernel_tests/string_lower_op_test.py
new file mode 100644
index 00000000000..ec2f2ea78e8
--- /dev/null
+++ b/tensorflow/python/kernel_tests/string_lower_op_test.py
@@ -0,0 +1,59 @@
+# -*- coding: utf-8 -*-
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for string_lower_op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+class StringLowerOpTest(test.TestCase):
+  """Test cases for tf.strings.lower."""
+
+  def test_string_lower(self):
+    """Default encoding lowercases ASCII letters in a 1-D input."""
+    strings = ["Pigs on The Wing", "aNimals"]
+
+    with self.cached_session():
+      output = string_ops.string_lower(strings)
+      output = self.evaluate(output)
+      self.assertAllEqual(output, [b"pigs on the wing", b"animals"])
+
+  def test_string_lower_2d(self):
+    """Keeps the 2-D shape; whitespace and punctuation are unchanged."""
+    strings = [["pigS on THE wIng", "aniMals"], [" hello ", "\n\tWorld! \r \n"]]
+
+    with self.cached_session():
+      output = string_ops.string_lower(strings)
+      output = self.evaluate(output)
+      self.assertAllEqual(output, [[b"pigs on the wing", b"animals"],
+                                   [b" hello ", b"\n\tworld! \r \n"]])
+
+  def test_string_lower_unicode(self):
+    """With encoding="utf-8", accented letters are lowercased too."""
+    strings = [["ÓÓSSCHLOË"]]
+    with self.cached_session():
+      output = string_ops.string_lower(strings, encoding="utf-8")
+      output = self.evaluate(output)
+      # Expected bytes are the UTF-8 encoding of "óósschloë".
+      self.assertAllEqual(output, [[b"\xc3\xb3\xc3\xb3sschlo\xc3\xab"]])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/string_upper_op_test.py b/tensorflow/python/kernel_tests/string_upper_op_test.py
new file mode 100644
index 00000000000..fa685f57ecf
--- /dev/null
+++ b/tensorflow/python/kernel_tests/string_upper_op_test.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for string_upper_op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+class StringUpperOpTest(test.TestCase):
+  """Test cases for tf.strings.upper."""
+
+  def test_string_upper(self):
+    strings = ["Pigs on The Wing", "aNimals"]
+
+    with self.cached_session():
+      output = string_ops.string_upper(strings)
+      output = self.evaluate(output)
+      self.assertAllEqual(output, [b"PIGS ON THE WING", b"ANIMALS"])
+
+  def test_string_upper_2d(self):
+    strings = [["pigS on THE wIng", "aniMals"], [" hello ", "\n\tWorld! \r \n"]]
+
+    with self.cached_session():
+      output = string_ops.string_upper(strings)
+      output = self.evaluate(output)
+      self.assertAllEqual(output, [[b"PIGS ON THE WING", b"ANIMALS"],
+                                   [b" HELLO ", b"\n\tWORLD! \r \n"]])
+
+  def test_string_upper_unicode(self):
+    strings = [["óósschloë"]]
+    with self.cached_session():
+      output = string_ops.string_upper(strings, encoding="utf-8")
+      output = self.evaluate(output)
+      # Expected bytes are the UTF-8 encoding of "ÓÓSSCHLOË".
+      self.assertAllEqual(output, [[b"\xc3\x93\xc3\x93SSCHLO\xc3\x8b"]])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt
index 312ba5b12ca..d2e12dc7156 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt
@@ -3872,6 +3872,10 @@ tf_module {
     name: "StringLength"
     argspec: "args=[\'input\', \'unit\', \'name\'], varargs=None, keywords=None, defaults=[\'BYTE\', \'None\'], "
   }
+  member_method {
+    name: "StringLower"
+    argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
   member_method {
     name: "StringSplit"
     argspec: "args=[\'input\', \'delimiter\', \'skip_empty\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
@@ -3900,6 +3904,10 @@ tf_module {
     name: "StringToNumber"
     argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], "
   }
+  member_method {
+    name: "StringUpper"
+    argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
   member_method {
     name: "Sub"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
index 8a326ed5e41..0fa45e41ee9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
@@ -20,6 +20,10 @@ tf_module {
     name: "length"
     argspec: "args=[\'input\', \'name\', \'unit\'], varargs=None, keywords=None, defaults=[\'None\', \'BYTE\'], "
   }
+  member_method {
+    name: "lower"
+    argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
   member_method {
     name: "reduce_join"
     argspec: "args=[\'inputs\', \'axis\', \'keep_dims\', \'separator\', \'name\', \'reduction_indices\', \'keepdims\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\', \'None\', \'None\'], "
@@ -88,4 +92,8 @@ tf_module {
     name: "unicode_transcode"
     argspec: "args=[\'input\', \'input_encoding\', \'output_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "upper"
+    argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt
index 312ba5b12ca..d2e12dc7156 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt
@@ -3872,6 +3872,10 @@ tf_module {
     name: "StringLength"
     argspec: "args=[\'input\', \'unit\', \'name\'], varargs=None, keywords=None, defaults=[\'BYTE\', \'None\'], "
   }
+  member_method {
+    name: "StringLower"
+    argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
   member_method {
     name: "StringSplit"
     argspec: "args=[\'input\', \'delimiter\', \'skip_empty\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
@@ -3900,6 +3904,10 @@ tf_module {
     name: "StringToNumber"
     argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], "
   }
+  member_method {
+    name: "StringUpper"
+    argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
   member_method {
     name: "Sub"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
index b959c7c25d4..592da35d439 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
@@ -20,6 +20,10 @@ tf_module {
     name: "length"
     argspec: "args=[\'input\', \'unit\', \'name\'], varargs=None, keywords=None, defaults=[\'BYTE\', \'None\'], "
   }
+  member_method {
+    name: "lower"
+    argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
   member_method {
     name: "reduce_join"
     argspec: "args=[\'inputs\', \'axis\', \'keepdims\', \'separator\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\'], "
@@ -88,4 +92,8 @@ tf_module {
     name: "unicode_transcode"
     argspec: "args=[\'input\', \'input_encoding\', \'output_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "upper"
+    argspec: "args=[\'input\', \'encoding\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
+  }
 }