Merge pull request #37585 from jaketae:preprocessing-docs
PiperOrigin-RevId: 301820297 Change-Id: I2c6fc183aaf0e4820e9e07220a68aae664815a53
This commit is contained in commit 1dffd2d117.
@ -23,16 +23,66 @@ from keras_preprocessing import text
|
||||
|
||||
from tensorflow.python.util.tf_export import keras_export
|
||||
|
||||
# Re-export the reference implementations from the `keras_preprocessing`
# package so they are importable from this TensorFlow module.
text_to_word_sequence = text.text_to_word_sequence
one_hot = text.one_hot
hashing_trick = text.hashing_trick
Tokenizer = text.Tokenizer

# Register each alias under the public `tf.keras` namespace.  `keras_export`
# is invoked directly (decorator-call form) because these are plain
# assignments rather than `def`/`class` statements it could decorate.
keras_export(
    'keras.preprocessing.text.text_to_word_sequence')(text_to_word_sequence)
keras_export('keras.preprocessing.text.one_hot')(one_hot)
keras_export('keras.preprocessing.text.hashing_trick')(hashing_trick)
keras_export('keras.preprocessing.text.Tokenizer')(Tokenizer)
||||
@keras_export('keras.preprocessing.text.text_to_word_sequence')
def text_to_word_sequence(input_text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True,
                          split=' '):
  """Splits a string of text into a list of words (or tokens).

  Characters listed in `filters` (punctuation, tabs and newlines by
  default) are stripped before the text is split on `split`.

  >>> sample_text = 'This is a sample sentence.'
  >>> tf.keras.preprocessing.text.text_to_word_sequence(sample_text)
  ['this', 'is', 'a', 'sample', 'sentence']

  Arguments:
      input_text: Input text (string).
      filters: list (or concatenation) of characters to filter out, such as
          punctuation. Default: ``!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n``,
          includes basic punctuation, tabs, and newlines.
      lower: boolean. Whether to convert the input to lowercase.
      split: str. Separator for word splitting.

  Returns:
      A list of words (or tokens).
  """
  # Delegate to the keras_preprocessing implementation; this wrapper only
  # exists to attach the public API export and documentation.
  options = dict(filters=filters, lower=lower, split=split)
  return text.text_to_word_sequence(input_text, **options)
||||
@keras_export('keras.preprocessing.text.one_hot')
def one_hot(input_text,
            n,
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
            lower=True,
            split=' '):
  """One-hot encodes a text into a list of word indexes of size `n`.

  Given a string of text, returns a list of integers, one per word (or
  token) of the input, each drawn from the vocabulary range `[1, n]`.

  Arguments:
      input_text: Input text (string).
      n: int. Size of vocabulary.
      filters: list (or concatenation) of characters to filter out, such as
          punctuation. Default: ``!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n``,
          includes basic punctuation, tabs, and newlines.
      lower: boolean. Whether to set the text to lowercase.
      split: str. Separator for word splitting.

  Returns:
      List of integers in `[1, n]`. Each integer encodes a word
      (unicity non-guaranteed).
  """
  # Thin pass-through to the keras_preprocessing implementation.
  options = dict(filters=filters, lower=lower, split=split)
  return text.one_hot(input_text, n, **options)
||||
# text.tokenizer_from_json is only available if keras_preprocessing >= 1.1.0
|
||||
try:
|
||||
@ -41,3 +91,6 @@ try:
|
||||
tokenizer_from_json)
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
# Export the remaining keras_preprocessing aliases under the public
# `tf.keras` namespace (decorator-call form, since these names were bound
# by assignment rather than `def`/`class` statements).
keras_export('keras.preprocessing.text.hashing_trick')(hashing_trick)
keras_export('keras.preprocessing.text.Tokenizer')(Tokenizer)
||||
|
Loading…
Reference in New Issue
Block a user