# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for text input preprocessing.
"""
# pylint: disable=invalid-name
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from keras_preprocessing import text

from tensorflow.python.keras.preprocessing.text_dataset import text_dataset_from_directory  # pylint: disable=unused-import
from tensorflow.python.util.tf_export import keras_export

hashing_trick = text.hashing_trick
Tokenizer = text.Tokenizer
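# Illustrative usage sketch (kept as comments so nothing runs at import time;
# not part of the original module). `Tokenizer` builds a vocabulary from raw
# strings and maps texts to integer index sequences, while `hashing_trick`
# skips the vocabulary and hashes words directly into a fixed index range:
#
#   tokenizer = Tokenizer(num_words=1000)
#   tokenizer.fit_on_texts(['the cat sat', 'the dog ran'])
#   tokenizer.texts_to_sequences(['the cat ran'])
#   # -> [[1, 2, 5]]  (exact indices depend on word-frequency ordering)
#
#   hashing_trick('the cat sat', n=1000)
#   # -> a list of three integer indices; distinct words may collide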

@keras_export('keras.preprocessing.text.text_to_word_sequence')
def text_to_word_sequence(input_text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True,
                          split=' '):
  """Converts a text to a sequence of words (or tokens).

  This function transforms a string of text into a list of words
  while ignoring `filters`, which includes punctuation by default.

  >>> sample_text = 'This is a sample sentence.'
  >>> tf.keras.preprocessing.text.text_to_word_sequence(sample_text)
  ['this', 'is', 'a', 'sample', 'sentence']

  Args:
      input_text: Input text (string).
      filters: list (or concatenation) of characters to filter out, such as
          punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n'``,
          includes basic punctuation, tabs, and newlines.
      lower: boolean. Whether to convert the input to lowercase.
      split: str. Separator for word splitting.

  Returns:
      A list of words (or tokens).
  """
  return text.text_to_word_sequence(
      input_text, filters=filters, lower=lower, split=split)
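# Illustrative sketch (comments only, not part of the original module):
# non-default `filters` and `split` arguments let the same helper tokenize
# delimiter-separated strings, e.g.
#   text_to_word_sequence('foo|bar|baz', filters='', split='|')
#   # -> ['foo', 'bar', 'baz']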

@keras_export('keras.preprocessing.text.one_hot')
def one_hot(input_text,
            n,
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
            lower=True,
            split=' '):
  r"""One-hot encodes a text into a list of word indexes of size `n`.

  This function receives as input a string of text and returns a
  list of encoded integers each corresponding to a word (or token)
  in the given input string.

  Args:
      input_text: Input text (string).
      n: int. Size of vocabulary.
      filters: list (or concatenation) of characters to filter out, such as
          punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n'``,
          includes basic punctuation, tabs, and newlines.
      lower: boolean. Whether to set the text to lowercase.
      split: str. Separator for word splitting.

  Returns:
      List of integers in `[1, n]`. Each integer encodes a word
      (unicity non-guaranteed).
  """
  return text.one_hot(input_text, n, filters=filters, lower=lower, split=split)
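# Illustrative sketch (comments only, not part of the original module):
# `one_hot` assigns indices by hashing rather than by building a vocabulary,
# so repeated words map to the same index within a call, but unrelated words
# may collide for small `n`:
#   one_hot('The cat sat on the mat', n=50)
#   # -> six integers in [1, 50]; both occurrences of 'the' get the same index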

# text.tokenizer_from_json is only available if keras_preprocessing >= 1.1.0
try:
  tokenizer_from_json = text.tokenizer_from_json
  keras_export('keras.preprocessing.text.tokenizer_from_json')(
      tokenizer_from_json)
except AttributeError:
  pass

keras_export('keras.preprocessing.text.hashing_trick')(hashing_trick)
keras_export('keras.preprocessing.text.Tokenizer')(Tokenizer)
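# Illustrative sketch (comments only, not part of the original module): when
# keras_preprocessing >= 1.1.0 is installed, a fitted Tokenizer can be
# serialized to JSON and restored with `tokenizer_from_json`, e.g.
#   tokenizer = Tokenizer()
#   tokenizer.fit_on_texts(['the cat sat on the mat'])
#   json_config = tokenizer.to_json()
#   restored = tokenizer_from_json(json_config)
#   restored.texts_to_sequences(['the cat sat'])  # matches the original tokenizer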