From 8ec6ac80791ddb9e0c0146b346902710085d82f3 Mon Sep 17 00:00:00 2001 From: Robert Gale Date: Thu, 1 Aug 2019 11:19:21 -0700 Subject: [PATCH 1/3] Checking for empty transcripts during character encoding This way we can get a plain English exception early, rather than a matrix shape error during training. --- util/text.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/util/text.py b/util/text.py index 7ae6ef3e..368c812f 100644 --- a/util/text.py +++ b/util/text.py @@ -52,7 +52,10 @@ def text_to_char_array(original, alphabet): Given a Python string ``original``, remove unsupported characters, map characters to integers and return a numpy array representing the processed string. """ - return np.asarray([alphabet.label_from_string(c) for c in original]) + characters = np.asarray([alphabet.label_from_string(c) for c in original]) + if characters.shape[0] == 0: + raise Exception("Found an empty transcript! You must include a transcript for all training data.") + return characters # The following code is from: http://hetland.org/coding/python/levenshtein.py From a3e0e9f9bc078a34333575433ea4acc04c3b22e9 Mon Sep 17 00:00:00 2001 From: Robert Gale Date: Thu, 1 Aug 2019 12:14:13 -0700 Subject: [PATCH 2/3] Update text.py "characters" was a bad variable name now that I think about it --- util/text.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/util/text.py b/util/text.py index 368c812f..8a115ab4 100644 --- a/util/text.py +++ b/util/text.py @@ -52,10 +52,10 @@ def text_to_char_array(original, alphabet): Given a Python string ``original``, remove unsupported characters, map characters to integers and return a numpy array representing the processed string. """ - characters = np.asarray([alphabet.label_from_string(c) for c in original]) - if characters.shape[0] == 0: + integers = np.asarray([alphabet.label_from_string(c) for c in original]) + if integers.shape[0] == 0: raise Exception("Found an empty transcript! You must include a transcript for all training data.") - return characters + return integers # The following code is from: http://hetland.org/coding/python/levenshtein.py From 85e25fa2d7a3c3583790c94acaf4d5e600cbe064 Mon Sep 17 00:00:00 2001 From: Robert Gale Date: Wed, 7 Aug 2019 14:43:41 -0700 Subject: [PATCH 3/3] Applying text_to_char_array to each row in DataFrame so we can provide wav_filename context on exception --- util/feeding.py | 10 ++++++++-- util/text.py | 24 ++++++++++++++++-------- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/util/feeding.py b/util/feeding.py index 67c6cd93..35bb5bfc 100644 --- a/util/feeding.py +++ b/util/feeding.py @@ -13,6 +13,7 @@ import datetime from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio from util.config import Config +from util.logging import log_error from util.text import text_to_char_array @@ -68,8 +69,13 @@ def create_dataset(csvs, batch_size, cache_path=''): df = read_csvs(csvs) df.sort_values(by='wav_filesize', inplace=True) - # Convert to character index arrays - df['transcript'] = df['transcript'].apply(partial(text_to_char_array, alphabet=Config.alphabet)) + try: + # Convert to character index arrays + df = df.apply(partial(text_to_char_array, alphabet=Config.alphabet), result_type='broadcast', axis=1) + except ValueError as e: + error_message, series, *_ = e.args + log_error('While processing {}:\n {}'.format(series['wav_filename'], error_message)) + exit(1) def generate_values(): for _, row in df.iterrows(): diff --git a/util/text.py b/util/text.py index 8a115ab4..d04961ce 100644 --- a/util/text.py +++ b/util/text.py @@ -31,7 +31,9 @@ class Alphabet(object): return self._str_to_label[string] except KeyError as e: raise KeyError( - '''ERROR: Your transcripts contain characters which do not occur in data/alphabet.txt! Use util/check_characters.py to see what characters are in your {train,dev,test}.csv transcripts, and then add all these to data/alphabet.txt.''' + 'ERROR: Your transcripts contain characters (e.g. \'{}\') which do not occur in data/alphabet.txt! Use ' \ + 'util/check_characters.py to see what characters are in your [train,dev,test].csv transcripts, and ' \ + 'then add all these to data/alphabet.txt.'.format(string) ).with_traceback(e.__traceback__) def decode(self, labels): @@ -47,15 +49,21 @@ class Alphabet(object): return self._config_file -def text_to_char_array(original, alphabet): +def text_to_char_array(series, alphabet): r""" - Given a Python string ``original``, remove unsupported characters, map characters - to integers and return a numpy array representing the processed string. + Given a Pandas Series containing transcript string, map characters to + integers and return a numpy array representing the processed string. """ - integers = np.asarray([alphabet.label_from_string(c) for c in original]) - if integers.shape[0] == 0: - raise Exception("Found an empty transcript! You must include a transcript for all training data.") - return integers + try: + series['transcript'] = np.asarray([alphabet.label_from_string(c) for c in series['transcript']]) + except KeyError as e: + # Provide the row context (especially wav_filename) for alphabet errors + raise ValueError(str(e), series) + + if series['transcript'].shape[0] == 0: + raise ValueError("Found an empty transcript! You must include a transcript for all training data.", series) + + return series # The following code is from: http://hetland.org/coding/python/levenshtein.py