Add tf.strings.unicode_script, which detects the script of a unicode codepoint
based on standard ranges. PiperOrigin-RevId: 214796357
This commit is contained in:
parent
3002b10e29
commit
334244be68
28
tensorflow/core/api_def/base_api/api_def_UnicodeScript.pbtxt
Normal file
28
tensorflow/core/api_def/base_api/api_def_UnicodeScript.pbtxt
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
op {
|
||||||
|
graph_op_name: "UnicodeScript"
|
||||||
|
endpoint {
|
||||||
|
name: "UnicodeScript"
|
||||||
|
}
|
||||||
|
in_arg {
|
||||||
|
name: "input"
|
||||||
|
description: <<END
|
||||||
|
A Tensor of int32 Unicode code points.
|
||||||
|
END
|
||||||
|
}
|
||||||
|
out_arg {
|
||||||
|
name: "output"
|
||||||
|
description: <<END
|
||||||
|
A Tensor of int32 script codes corresponding to each input code point.
|
||||||
|
END
|
||||||
|
}
|
||||||
|
summary: <<END
|
||||||
|
Determine the script codes of a given tensor of Unicode integer code points.
|
||||||
|
END
|
||||||
|
description: <<END
|
||||||
|
This operation converts Unicode code points to script codes corresponding to
|
||||||
|
each code point. Script codes correspond to International Components for
|
||||||
|
Unicode (ICU) UScriptCode values. See http://icu-project.org/apiref/icu4c/uscript_8h.html.
|
||||||
|
Returns -1 (USCRIPT_INVALID_CODE) for invalid codepoints. Output shape will
|
||||||
|
match input shape.
|
||||||
|
END
|
||||||
|
}
|
@ -0,0 +1,6 @@
|
|||||||
|
op {
|
||||||
|
graph_op_name: "UnicodeScript"
|
||||||
|
endpoint {
|
||||||
|
name: "strings.unicode_script"
|
||||||
|
}
|
||||||
|
}
|
@ -4431,6 +4431,7 @@ cc_library(
|
|||||||
":string_strip_op",
|
":string_strip_op",
|
||||||
":string_to_hash_bucket_op",
|
":string_to_hash_bucket_op",
|
||||||
":substr_op",
|
":substr_op",
|
||||||
|
":unicode_script_op",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -5471,6 +5472,7 @@ filegroup(
|
|||||||
"batch_kernels.*",
|
"batch_kernels.*",
|
||||||
"regex_full_match_op.cc",
|
"regex_full_match_op.cc",
|
||||||
"regex_replace_op.cc",
|
"regex_replace_op.cc",
|
||||||
|
"unicode_script_op.cc",
|
||||||
# Ops that are inherently incompatible with Android (e.g. tied to x86 platform).
|
# Ops that are inherently incompatible with Android (e.g. tied to x86 platform).
|
||||||
"mkl_*",
|
"mkl_*",
|
||||||
"xsmm_*",
|
"xsmm_*",
|
||||||
@ -6565,6 +6567,16 @@ tf_kernel_library(
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
tf_kernel_library(
|
||||||
|
name = "unicode_script_op",
|
||||||
|
srcs = ["unicode_script_op.cc"],
|
||||||
|
deps = [
|
||||||
|
"//tensorflow/core:framework",
|
||||||
|
"//tensorflow/core:string_ops_op_lib",
|
||||||
|
"@icu//:common",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Google-internal targets. These must be at the end for syncrepo.
|
# Google-internal targets. These must be at the end for syncrepo.
|
||||||
|
|
||||||
|
53
tensorflow/core/kernels/unicode_script_op.cc
Normal file
53
tensorflow/core/kernels/unicode_script_op.cc
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#include "unicode/errorcode.h" // TF:icu
|
||||||
|
#include "unicode/uscript.h" // TF:icu
|
||||||
|
#include "tensorflow/core/framework/op_kernel.h"
|
||||||
|
|
||||||
|
namespace tensorflow {
|
||||||
|
|
||||||
|
class UnicodeScriptOp : public OpKernel {
|
||||||
|
public:
|
||||||
|
explicit UnicodeScriptOp(OpKernelConstruction* context) : OpKernel(context) {}
|
||||||
|
|
||||||
|
void Compute(OpKernelContext* context) override {
|
||||||
|
const Tensor* input_tensor;
|
||||||
|
OP_REQUIRES_OK(context, context->input("input", &input_tensor));
|
||||||
|
const auto& input_flat = input_tensor->flat<int32>();
|
||||||
|
|
||||||
|
Tensor* output_tensor = nullptr;
|
||||||
|
OP_REQUIRES_OK(context,
|
||||||
|
context->allocate_output("output", input_tensor->shape(),
|
||||||
|
&output_tensor));
|
||||||
|
auto output_flat = output_tensor->flat<int32>();
|
||||||
|
|
||||||
|
icu::ErrorCode status;
|
||||||
|
for (int i = 0; i < input_flat.size(); i++) {
|
||||||
|
UScriptCode script_code = uscript_getScript(input_flat(i), status);
|
||||||
|
if (status.isSuccess()) {
|
||||||
|
output_flat(i) = script_code;
|
||||||
|
} else {
|
||||||
|
output_flat(i) = -1;
|
||||||
|
status.reset();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
REGISTER_KERNEL_BUILDER(Name("UnicodeScript").Device(DEVICE_CPU),
|
||||||
|
UnicodeScriptOp);
|
||||||
|
|
||||||
|
} // namespace tensorflow
|
@ -244,4 +244,9 @@ REGISTER_OP("Substr")
|
|||||||
return shape_inference::BroadcastBinaryOpShapeFn(c);
|
return shape_inference::BroadcastBinaryOpShapeFn(c);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
REGISTER_OP("UnicodeScript")
|
||||||
|
.Input("input: int32")
|
||||||
|
.Output("output: int32")
|
||||||
|
.SetShapeFn(shape_inference::UnchangedShape);
|
||||||
|
|
||||||
} // namespace tensorflow
|
} // namespace tensorflow
|
||||||
|
@ -1097,6 +1097,18 @@ tf_py_test(
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
tf_py_test(
|
||||||
|
name = "unicode_script_op_test",
|
||||||
|
size = "small",
|
||||||
|
srcs = ["unicode_script_op_test.py"],
|
||||||
|
additional_deps = [
|
||||||
|
"//tensorflow/python:client_testlib",
|
||||||
|
"//tensorflow/python:constant_op",
|
||||||
|
"//tensorflow/python:dtypes",
|
||||||
|
"//tensorflow/python:string_ops",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
cuda_py_test(
|
cuda_py_test(
|
||||||
name = "topk_op_test",
|
name = "topk_op_test",
|
||||||
size = "small",
|
size = "small",
|
||||||
|
57
tensorflow/python/kernel_tests/unicode_script_op_test.py
Normal file
57
tensorflow/python/kernel_tests/unicode_script_op_test.py
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#===============================================================================
|
||||||
|
"""Functional tests for UnicodeScript op."""
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
from tensorflow.python.framework import constant_op
|
||||||
|
from tensorflow.python.framework import dtypes
|
||||||
|
from tensorflow.python.ops import string_ops
|
||||||
|
from tensorflow.python.platform import test
|
||||||
|
|
||||||
|
|
||||||
|
class UnicodeScriptOpTest(test.TestCase):
|
||||||
|
|
||||||
|
def testValidScripts(self):
|
||||||
|
inputs = [
|
||||||
|
ord("a"),
|
||||||
|
0x0411, # CYRILLIC CAPITAL LETTER BE
|
||||||
|
0x82b8, # CJK UNIFIED IDEOGRAPH-82B8
|
||||||
|
ord(",")
|
||||||
|
]
|
||||||
|
with self.cached_session():
|
||||||
|
input_vector = constant_op.constant(inputs, dtypes.int32)
|
||||||
|
outputs = string_ops.unicode_script(input_vector).eval()
|
||||||
|
self.assertAllEqual(
|
||||||
|
outputs,
|
||||||
|
[
|
||||||
|
25, # USCRIPT_LATIN (LATN)
|
||||||
|
8, # USCRIPT_CYRILLIC (CYRL)
|
||||||
|
17, # USCRIPT_HAN (HANI)
|
||||||
|
0 # USCRIPT_COMMON (ZYYY)
|
||||||
|
])
|
||||||
|
|
||||||
|
def testInvalidScript(self):
|
||||||
|
inputs = [-100, 0xffffff]
|
||||||
|
with self.cached_session():
|
||||||
|
input_vector = constant_op.constant(inputs, dtypes.int32)
|
||||||
|
outputs = string_ops.unicode_script(input_vector).eval()
|
||||||
|
self.assertAllEqual(outputs, [-1, -1])
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
test.main()
|
@ -48,4 +48,8 @@ tf_module {
|
|||||||
name: "to_number"
|
name: "to_number"
|
||||||
argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
|
argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
|
||||||
}
|
}
|
||||||
|
member_method {
|
||||||
|
name: "unicode_script"
|
||||||
|
argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -48,4 +48,8 @@ tf_module {
|
|||||||
name: "to_number"
|
name: "to_number"
|
||||||
argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
|
argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
|
||||||
}
|
}
|
||||||
|
member_method {
|
||||||
|
name: "unicode_script"
|
||||||
|
argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -125,6 +125,7 @@ genrule(
|
|||||||
"@gemmlowp//:LICENSE",
|
"@gemmlowp//:LICENSE",
|
||||||
"@gif_archive//:COPYING",
|
"@gif_archive//:COPYING",
|
||||||
"@highwayhash//:LICENSE",
|
"@highwayhash//:LICENSE",
|
||||||
|
"@icu//:icu4c/LICENSE",
|
||||||
"@jpeg//:LICENSE.md",
|
"@jpeg//:LICENSE.md",
|
||||||
"@llvm//:LICENSE.TXT",
|
"@llvm//:LICENSE.TXT",
|
||||||
"@lmdb//:LICENSE",
|
"@lmdb//:LICENSE",
|
||||||
@ -192,6 +193,7 @@ genrule(
|
|||||||
"@gemmlowp//:LICENSE",
|
"@gemmlowp//:LICENSE",
|
||||||
"@gif_archive//:COPYING",
|
"@gif_archive//:COPYING",
|
||||||
"@highwayhash//:LICENSE",
|
"@highwayhash//:LICENSE",
|
||||||
|
"@icu//:icu4j/main/shared/licenses/LICENSE",
|
||||||
"@jpeg//:LICENSE.md",
|
"@jpeg//:LICENSE.md",
|
||||||
"@llvm//:LICENSE.TXT",
|
"@llvm//:LICENSE.TXT",
|
||||||
"@lmdb//:LICENSE",
|
"@lmdb//:LICENSE",
|
||||||
|
@ -153,6 +153,7 @@ filegroup(
|
|||||||
"@gemmlowp//:LICENSE",
|
"@gemmlowp//:LICENSE",
|
||||||
"@gif_archive//:COPYING",
|
"@gif_archive//:COPYING",
|
||||||
"@highwayhash//:LICENSE",
|
"@highwayhash//:LICENSE",
|
||||||
|
"@icu//:icu4c/LICENSE",
|
||||||
"@jpeg//:LICENSE.md",
|
"@jpeg//:LICENSE.md",
|
||||||
"@lmdb//:LICENSE",
|
"@lmdb//:LICENSE",
|
||||||
"@local_config_sycl//sycl:LICENSE.text",
|
"@local_config_sycl//sycl:LICENSE.text",
|
||||||
|
@ -21,9 +21,11 @@ load(
|
|||||||
"def_file_filter_configure",
|
"def_file_filter_configure",
|
||||||
)
|
)
|
||||||
load("//third_party/flatbuffers:workspace.bzl", flatbuffers = "repo")
|
load("//third_party/flatbuffers:workspace.bzl", flatbuffers = "repo")
|
||||||
|
load("//third_party/icu:workspace.bzl", icu = "repo")
|
||||||
|
|
||||||
def initialize_third_party():
|
def initialize_third_party():
|
||||||
flatbuffers()
|
flatbuffers()
|
||||||
|
icu()
|
||||||
|
|
||||||
# Sanitize a dependency so that it works correctly from code that includes
|
# Sanitize a dependency so that it works correctly from code that includes
|
||||||
# TensorFlow as a submodule.
|
# TensorFlow as a submodule.
|
||||||
|
1
third_party/icu/BUILD
vendored
Normal file
1
third_party/icu/BUILD
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
# This empty BUILD file is required to make Bazel treat this directory as a package.
|
88
third_party/icu/BUILD.bazel
vendored
Normal file
88
third_party/icu/BUILD.bazel
vendored
Normal file
@ -0,0 +1,88 @@
|
|||||||
|
package(
|
||||||
|
default_visibility = ["//visibility:public"],
|
||||||
|
)
|
||||||
|
|
||||||
|
licenses(["notice"]) # Apache 2.0
|
||||||
|
|
||||||
|
exports_files([
|
||||||
|
"icu4c/LICENSE",
|
||||||
|
"icu4j/main/shared/licenses/LICENSE",
|
||||||
|
])
|
||||||
|
|
||||||
|
cc_library(
|
||||||
|
name = "headers",
|
||||||
|
hdrs = glob(["icu4c/source/common/unicode/*.h"]),
|
||||||
|
includes = [
|
||||||
|
"icu4c/source/common",
|
||||||
|
],
|
||||||
|
deps = [
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
cc_library(
|
||||||
|
name = "common",
|
||||||
|
hdrs = glob(["icu4c/source/common/unicode/*.h"]),
|
||||||
|
includes = [
|
||||||
|
"icu4c/source/common",
|
||||||
|
],
|
||||||
|
deps = [
|
||||||
|
":icuuc",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
cc_library(
|
||||||
|
name = "icuuc",
|
||||||
|
srcs = glob(
|
||||||
|
[
|
||||||
|
"icu4c/source/common/*.c",
|
||||||
|
"icu4c/source/common/*.cpp",
|
||||||
|
"icu4c/source/stubdata/*.cpp",
|
||||||
|
],
|
||||||
|
),
|
||||||
|
hdrs = glob([
|
||||||
|
"icu4c/source/common/*.h",
|
||||||
|
]),
|
||||||
|
copts = [
|
||||||
|
"-DU_COMMON_IMPLEMENTATION",
|
||||||
|
"-DU_HAVE_STD_ATOMICS",
|
||||||
|
] + select({
|
||||||
|
":android": [
|
||||||
|
"-fdata-sections",
|
||||||
|
"-DGOOGLE_VENDOR_SRC_BRANCH",
|
||||||
|
"-DU_HAVE_NL_LANGINFO_CODESET=0",
|
||||||
|
"-Wno-deprecated-declarations",
|
||||||
|
],
|
||||||
|
":apple": [
|
||||||
|
"-DGOOGLE_VENDOR_SRC_BRANCH",
|
||||||
|
"-Wno-shorten-64-to-32",
|
||||||
|
"-Wno-unused-variable",
|
||||||
|
],
|
||||||
|
":windows": [
|
||||||
|
"/utf-8",
|
||||||
|
"/DLOCALE_ALLOW_NEUTRAL_NAMES=0",
|
||||||
|
],
|
||||||
|
"//conditions:default": [],
|
||||||
|
}),
|
||||||
|
tags = ["requires-rtti"],
|
||||||
|
visibility = [
|
||||||
|
"//visibility:private",
|
||||||
|
],
|
||||||
|
deps = [
|
||||||
|
":headers",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
config_setting(
|
||||||
|
name = "android",
|
||||||
|
values = {"crosstool_top": "//external:android/crosstool"},
|
||||||
|
)
|
||||||
|
|
||||||
|
config_setting(
|
||||||
|
name = "apple",
|
||||||
|
values = {"cpu": "darwin"},
|
||||||
|
)
|
||||||
|
|
||||||
|
config_setting(
|
||||||
|
name = "windows",
|
||||||
|
values = {"cpu": "x64_windows"},
|
||||||
|
)
|
15
third_party/icu/workspace.bzl
vendored
Normal file
15
third_party/icu/workspace.bzl
vendored
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
"""Loads a lightweight subset of the ICU library for Unicode processing."""
|
||||||
|
|
||||||
|
load("//third_party:repo.bzl", "third_party_http_archive")
|
||||||
|
|
||||||
|
def repo():
|
||||||
|
third_party_http_archive(
|
||||||
|
name = "icu",
|
||||||
|
strip_prefix = "icu-release-62-1",
|
||||||
|
sha256 = "e15ffd84606323cbad5515bf9ecdf8061cc3bf80fb883b9e6aa162e485aa9761",
|
||||||
|
urls = [
|
||||||
|
"https://mirror.bazel.build/github.com/unicode-org/icu/archive/release-62-1.tar.gz",
|
||||||
|
"https://github.com/unicode-org/icu/archive/release-62-1.tar.gz",
|
||||||
|
],
|
||||||
|
build_file = "//third_party/icu:BUILD.bazel",
|
||||||
|
)
|
Loading…
Reference in New Issue
Block a user