Adding unicode / PY3 support for feature column vocab files.

PiperOrigin-RevId: 285862836
Change-Id: I2eec29c2300dfbc99f29b30b56e3e7dfea6d047e
This commit is contained in:
Rohan Jain 2019-12-16 15:23:28 -08:00 committed by TensorFlower Gardener
parent 3aa42f1cbb
commit 42afc3e5ac
4 changed files with 182 additions and 1 deletions

View File

@@ -99,6 +99,7 @@ filegroup(
"testdata/embedding.ckpt.data-00000-of-00001",
"testdata/embedding.ckpt.index",
"testdata/embedding.ckpt.meta",
"testdata/unicode_vocabulary",
"testdata/warriors_vocabulary.txt",
"testdata/wire_vocabulary.txt",
],

View File

@@ -1665,7 +1665,7 @@ def categorical_column_with_vocabulary_file_v2(key,
if not gfile.Exists(vocabulary_file):
raise ValueError('vocabulary_file in {} does not exist.'.format(key))
-with gfile.GFile(vocabulary_file) as f:
+with gfile.GFile(vocabulary_file, mode='rb') as f:
vocabulary_size = sum(1 for _ in f)
logging.info(
'vocabulary_size = %d in %s is inferred from the number of elements '

View File

@@ -3886,6 +3886,10 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
'python/feature_column/testdata/wire_vocabulary.txt')
self._wire_vocabulary_size = 3
# Contains unicode characters.
self._unicode_vocabulary_file_name = test.test_src_dir_path(
'python/feature_column/testdata/unicode_vocabulary')
@test_util.run_deprecated_v1
def test_defaults(self):
column = fc.categorical_column_with_vocabulary_file(
@@ -3898,6 +3902,17 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
}, column.parse_example_spec)
self.assertTrue(column._is_v2_column)
@test_util.run_deprecated_v1
def test_defaults_unicode(self):
  """Default properties hold for a column built from the unicode vocab file.

  No `vocabulary_size` is passed here, so `num_buckets` is whatever the
  column derives for the file; this test expects it to be 165.
  """
  vocab_path = self._unicode_vocabulary_file_name
  unicode_column = fc.categorical_column_with_vocabulary_file(
      vocabulary_file=vocab_path, key='aaa')

  # Both `name` and `key` are expected to echo the `key` argument.
  self.assertEqual('aaa', unicode_column.name)
  self.assertEqual('aaa', unicode_column.key)
  self.assertEqual(165, unicode_column.num_buckets)

  # The column should parse as a variable-length string feature under 'aaa'.
  expected_spec = {'aaa': parsing_ops.VarLenFeature(dtypes.string)}
  self.assertEqual(expected_spec, unicode_column.parse_example_spec)
  self.assertTrue(unicode_column._is_v2_column)
def test_key_should_be_string(self):
with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
fc.categorical_column_with_vocabulary_file(

View File

@@ -0,0 +1,165 @@
t
/
e
o
a
s
p
i
c
n
.
r
h
m
x
l
d
w
-
u
g
b
:
2
0
1
f
%
8
3
5
k
9
4
y
7
6
v
=
_
?
A
D
j
&
F
z
E
B
S
C
q
M
L
I
R
T
N
W
P
U
G
Z
O
V
Y
H
J
X
Q
K
+
#
,
;
~
)
@
!
|
'
(
$
*
]
[
{
}
\
^
`
"
¸
à
Ð
Ñ
Ã
>
<
²
°
±
§
Ø
ˆ
â
¾
¹
µ
³
Œ
ì
Ù
º
¡
·
©
 
¼
­
ª
<EFBFBD>
ë
å
¿
½
´
£
ê
é
è
ç
Û
»
¯
¦
¥
¢
ž
š
˜
<EFBFBD>
<EFBFBD>
ƒ
í
Ú
Å
®
¨
¤
œ