Apply text_to_char_array to each row of the DataFrame so that exceptions can report the offending row's wav_filename

This commit is contained in:
Robert Gale 2019-08-07 14:43:41 -07:00
parent a3e0e9f9bc
commit 85e25fa2d7
2 changed files with 24 additions and 10 deletions

View File

@ -13,6 +13,7 @@ import datetime
from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
from util.config import Config from util.config import Config
from util.logging import log_error
from util.text import text_to_char_array from util.text import text_to_char_array
@ -68,8 +69,13 @@ def create_dataset(csvs, batch_size, cache_path=''):
df = read_csvs(csvs) df = read_csvs(csvs)
df.sort_values(by='wav_filesize', inplace=True) df.sort_values(by='wav_filesize', inplace=True)
try:
# Convert to character index arrays # Convert to character index arrays
df['transcript'] = df['transcript'].apply(partial(text_to_char_array, alphabet=Config.alphabet)) df = df.apply(partial(text_to_char_array, alphabet=Config.alphabet), result_type='broadcast', axis=1)
except ValueError as e:
error_message, series, *_ = e.args
log_error('While processing {}:\n {}'.format(series['wav_filename'], error_message))
exit(1)
def generate_values(): def generate_values():
for _, row in df.iterrows(): for _, row in df.iterrows():

View File

@ -31,7 +31,9 @@ class Alphabet(object):
return self._str_to_label[string] return self._str_to_label[string]
except KeyError as e: except KeyError as e:
raise KeyError( raise KeyError(
'''ERROR: Your transcripts contain characters which do not occur in data/alphabet.txt! Use util/check_characters.py to see what characters are in your {train,dev,test}.csv transcripts, and then add all these to data/alphabet.txt.''' 'ERROR: Your transcripts contain characters (e.g. \'{}\') which do not occur in data/alphabet.txt! Use ' \
'util/check_characters.py to see what characters are in your [train,dev,test].csv transcripts, and ' \
'then add all these to data/alphabet.txt.'.format(string)
).with_traceback(e.__traceback__) ).with_traceback(e.__traceback__)
def decode(self, labels): def decode(self, labels):
@ -47,15 +49,21 @@ class Alphabet(object):
return self._config_file return self._config_file
def text_to_char_array(original, alphabet): def text_to_char_array(series, alphabet):
r""" r"""
Given a Python string ``original``, remove unsupported characters, map characters Given a Pandas Series containing transcript string, map characters to
to integers and return a numpy array representing the processed string. integers and return a numpy array representing the processed string.
""" """
integers = np.asarray([alphabet.label_from_string(c) for c in original]) try:
if integers.shape[0] == 0: series['transcript'] = np.asarray([alphabet.label_from_string(c) for c in series['transcript']])
raise Exception("Found an empty transcript! You must include a transcript for all training data.") except KeyError as e:
return integers # Provide the row context (especially wav_filename) for alphabet errors
raise ValueError(str(e), series)
if series['transcript'].shape[0] == 0:
raise ValueError("Found an empty transcript! You must include a transcript for all training data.", series)
return series
# The following code is from: http://hetland.org/coding/python/levenshtein.py # The following code is from: http://hetland.org/coding/python/levenshtein.py