diff --git a/tensorflow/core/api_def/base_api/api_def_UnicodeScript.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnicodeScript.pbtxt new file mode 100644 index 00000000000..7898fe8d6bc --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_UnicodeScript.pbtxt @@ -0,0 +1,28 @@ +op { + graph_op_name: "UnicodeScript" + endpoint { + name: "UnicodeScript" + } + in_arg { + name: "input" + description: <input("input", &input_tensor)); + const auto& input_flat = input_tensor->flat<int32>(); + + Tensor* output_tensor = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output("output", input_tensor->shape(), + &output_tensor)); + auto output_flat = output_tensor->flat<int32>(); + + icu::ErrorCode status; + for (int i = 0; i < input_flat.size(); i++) { + UScriptCode script_code = uscript_getScript(input_flat(i), status); + if (status.isSuccess()) { + output_flat(i) = script_code; + } else { + output_flat(i) = -1; + status.reset(); + } + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("UnicodeScript").Device(DEVICE_CPU), + UnicodeScriptOp); + +} // namespace tensorflow diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc index da1d2a64320..b4fbde54d90 100644 --- a/tensorflow/core/ops/string_ops.cc +++ b/tensorflow/core/ops/string_ops.cc @@ -244,4 +244,9 @@ REGISTER_OP("Substr") return shape_inference::BroadcastBinaryOpShapeFn(c); }); +REGISTER_OP("UnicodeScript") + .Input("input: int32") + .Output("output: int32") + .SetShapeFn(shape_inference::UnchangedShape); + } // namespace tensorflow diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index 5183e4d30c2..c2e36e5e196 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -1097,6 +1097,18 @@ tf_py_test( ], ) +tf_py_test( + name = "unicode_script_op_test", + size = "small", + srcs = ["unicode_script_op_test.py"], + additional_deps = [ + "//tensorflow/python:client_testlib", + "//tensorflow/python:constant_op", +
"//tensorflow/python:dtypes", + "//tensorflow/python:string_ops", + ], +) + cuda_py_test( name = "topk_op_test", size = "small", diff --git a/tensorflow/python/kernel_tests/unicode_script_op_test.py b/tensorflow/python/kernel_tests/unicode_script_op_test.py new file mode 100644 index 00000000000..927e5459ed2 --- /dev/null +++ b/tensorflow/python/kernel_tests/unicode_script_op_test.py @@ -0,0 +1,57 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== +"""Functional tests for UnicodeScript op.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import string_ops +from tensorflow.python.platform import test + + +class UnicodeScriptOpTest(test.TestCase): + + def testValidScripts(self): + inputs = [ + ord("a"), + 0x0411, # CYRILLIC CAPITAL LETTER BE + 0x82b8, # CJK UNIFIED IDEOGRAPH-82B8 + ord(",") + ] + with self.cached_session(): + input_vector = constant_op.constant(inputs, dtypes.int32) + outputs = string_ops.unicode_script(input_vector).eval() + self.assertAllEqual( + outputs, + [ + 25, # USCRIPT_LATIN (LATN) + 8, # USCRIPT_CYRILLIC (CYRL) + 17, # USCRIPT_HAN (HANI) + 0 # USCRIPT_COMMON (ZYYY) + ]) + + def testInvalidScript(self): + inputs = [-100, 0xffffff] + with self.cached_session(): + input_vector = constant_op.constant(inputs, dtypes.int32) + outputs = string_ops.unicode_script(input_vector).eval() + self.assertAllEqual(outputs, [-1, -1]) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt index c52581dec15..312e94b41d5 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt @@ -48,4 +48,8 @@ tf_module { name: "to_number" argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " } + member_method { + name: "unicode_script" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } } diff --git a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt index c52581dec15..312e94b41d5 100644 
--- a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt @@ -48,4 +48,8 @@ tf_module { name: "to_number" argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " } + member_method { + name: "unicode_script" + argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } } diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD index b450bc42c54..095ac1f4cc7 100644 --- a/tensorflow/tools/lib_package/BUILD +++ b/tensorflow/tools/lib_package/BUILD @@ -125,6 +125,7 @@ genrule( "@gemmlowp//:LICENSE", "@gif_archive//:COPYING", "@highwayhash//:LICENSE", + "@icu//:icu4c/LICENSE", "@jpeg//:LICENSE.md", "@llvm//:LICENSE.TXT", "@lmdb//:LICENSE", @@ -192,6 +193,7 @@ genrule( "@gemmlowp//:LICENSE", "@gif_archive//:COPYING", "@highwayhash//:LICENSE", + "@icu//:icu4j/main/shared/licenses/LICENSE", "@jpeg//:LICENSE.md", "@llvm//:LICENSE.TXT", "@lmdb//:LICENSE", diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index 9d816f0672e..cce60ccea05 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -153,6 +153,7 @@ filegroup( "@gemmlowp//:LICENSE", "@gif_archive//:COPYING", "@highwayhash//:LICENSE", + "@icu//:icu4c/LICENSE", "@jpeg//:LICENSE.md", "@lmdb//:LICENSE", "@local_config_sycl//sycl:LICENSE.text", diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 4bf2ff3fb50..e5a0a0b2b7b 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -21,9 +21,11 @@ load( "def_file_filter_configure", ) load("//third_party/flatbuffers:workspace.bzl", flatbuffers = "repo") +load("//third_party/icu:workspace.bzl", icu = "repo") def initialize_third_party(): flatbuffers() + icu() # Sanitize a dependency so that it works correctly from code that includes # TensorFlow as a submodule. 
diff --git a/third_party/icu/BUILD b/third_party/icu/BUILD new file mode 100644 index 00000000000..82bab3ffd96 --- /dev/null +++ b/third_party/icu/BUILD @@ -0,0 +1 @@ +# This empty BUILD file is required to make Bazel treat this directory as a package. diff --git a/third_party/icu/BUILD.bazel b/third_party/icu/BUILD.bazel new file mode 100644 index 00000000000..36d6b9006b9 --- /dev/null +++ b/third_party/icu/BUILD.bazel @@ -0,0 +1,88 @@ +package( + default_visibility = ["//visibility:public"], +) + +licenses(["notice"]) # Unicode (ICU) license; see icu4c/LICENSE + +exports_files([ + "icu4c/LICENSE", + "icu4j/main/shared/licenses/LICENSE", +]) + +cc_library( + name = "headers", + hdrs = glob(["icu4c/source/common/unicode/*.h"]), + includes = [ + "icu4c/source/common", + ], + deps = [ + ], +) + +cc_library( + name = "common", + hdrs = glob(["icu4c/source/common/unicode/*.h"]), + includes = [ + "icu4c/source/common", + ], + deps = [ + ":icuuc", + ], +) + +cc_library( + name = "icuuc", + srcs = glob( + [ + "icu4c/source/common/*.c", + "icu4c/source/common/*.cpp", + "icu4c/source/stubdata/*.cpp", + ], + ), + hdrs = glob([ + "icu4c/source/common/*.h", + ]), + copts = [ + "-DU_COMMON_IMPLEMENTATION", + "-DU_HAVE_STD_ATOMICS", + ] + select({ + ":android": [ + "-fdata-sections", + "-DGOOGLE_VENDOR_SRC_BRANCH", + "-DU_HAVE_NL_LANGINFO_CODESET=0", + "-Wno-deprecated-declarations", + ], + ":apple": [ + "-DGOOGLE_VENDOR_SRC_BRANCH", + "-Wno-shorten-64-to-32", + "-Wno-unused-variable", + ], + ":windows": [ + "/utf-8", + "/DLOCALE_ALLOW_NEUTRAL_NAMES=0", + ], + "//conditions:default": [], + }), + tags = ["requires-rtti"], + visibility = [ + "//visibility:private", + ], + deps = [ + ":headers", + ], +) + +config_setting( + name = "android", + values = {"crosstool_top": "//external:android/crosstool"}, +) + +config_setting( + name = "apple", + values = {"cpu": "darwin"}, +) + +config_setting( + name = "windows", + values = {"cpu": "x64_windows"}, +) diff --git a/third_party/icu/workspace.bzl 
b/third_party/icu/workspace.bzl new file mode 100644 index 00000000000..bfebf4219b1 --- /dev/null +++ b/third_party/icu/workspace.bzl @@ -0,0 +1,15 @@ +"""Loads a lightweight subset of the ICU library for Unicode processing.""" + +load("//third_party:repo.bzl", "third_party_http_archive") + +def repo(): + third_party_http_archive( + name = "icu", + strip_prefix = "icu-release-62-1", + sha256 = "e15ffd84606323cbad5515bf9ecdf8061cc3bf80fb883b9e6aa162e485aa9761", + urls = [ + "https://mirror.bazel.build/github.com/unicode-org/icu/archive/release-62-1.tar.gz", + "https://github.com/unicode-org/icu/archive/release-62-1.tar.gz", + ], + build_file = "//third_party/icu:BUILD.bazel", + )