From 8ec6ac80791ddb9e0c0146b346902710085d82f3 Mon Sep 17 00:00:00 2001
From: Robert Gale <galer@ohsu.edu>
Date: Thu, 1 Aug 2019 11:19:21 -0700
Subject: [PATCH 1/3] Checking for empty transcripts during character encoding

This way we can get a plain English exception early, rather than a matrix shape error during training.
---
 util/text.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/util/text.py b/util/text.py
index 7ae6ef3e..368c812f 100644
--- a/util/text.py
+++ b/util/text.py
@@ -52,7 +52,10 @@ def text_to_char_array(original, alphabet):
     Given a Python string ``original``, remove unsupported characters, map characters
     to integers and return a numpy array representing the processed string.
     """
-    return np.asarray([alphabet.label_from_string(c) for c in original])
+    characters = np.asarray([alphabet.label_from_string(c) for c in original])
+    if characters.shape[0] == 0:
+        raise Exception("Found an empty transcript! You must include a transcript for all training data.")
+    return characters
 
 
 # The following code is from: http://hetland.org/coding/python/levenshtein.py

From a3e0e9f9bc078a34333575433ea4acc04c3b22e9 Mon Sep 17 00:00:00 2001
From: Robert Gale <rcgale@gmail.com>
Date: Thu, 1 Aug 2019 12:14:13 -0700
Subject: [PATCH 2/3] Update text.py

"characters" was a bad variable name now that I think about it
---
 util/text.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/util/text.py b/util/text.py
index 368c812f..8a115ab4 100644
--- a/util/text.py
+++ b/util/text.py
@@ -52,10 +52,10 @@ def text_to_char_array(original, alphabet):
     Given a Python string ``original``, remove unsupported characters, map characters
     to integers and return a numpy array representing the processed string.
     """
-    characters = np.asarray([alphabet.label_from_string(c) for c in original])
-    if characters.shape[0] == 0:
+    integers = np.asarray([alphabet.label_from_string(c) for c in original])
+    if integers.shape[0] == 0:
         raise Exception("Found an empty transcript! You must include a transcript for all training data.")
-    return characters
+    return integers
 
 
 # The following code is from: http://hetland.org/coding/python/levenshtein.py

From 85e25fa2d7a3c3583790c94acaf4d5e600cbe064 Mon Sep 17 00:00:00 2001
From: Robert Gale <galer@ohsu.edu>
Date: Wed, 7 Aug 2019 14:43:41 -0700
Subject: [PATCH 3/3] Applying text_to_char_array to each row in DataFrame so
 we can provide wav_filename context on exception

---
 util/feeding.py | 10 ++++++++--
 util/text.py    | 24 ++++++++++++++++--------
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/util/feeding.py b/util/feeding.py
index 67c6cd93..35bb5bfc 100644
--- a/util/feeding.py
+++ b/util/feeding.py
@@ -13,6 +13,7 @@ import datetime
 from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
 
 from util.config import Config
+from util.logging import log_error
 from util.text import text_to_char_array
 
 
@@ -68,8 +69,13 @@ def create_dataset(csvs, batch_size, cache_path=''):
     df = read_csvs(csvs)
     df.sort_values(by='wav_filesize', inplace=True)
 
-    # Convert to character index arrays
-    df['transcript'] = df['transcript'].apply(partial(text_to_char_array, alphabet=Config.alphabet))
+    try:
+        # Convert to character index arrays
+        df = df.apply(partial(text_to_char_array, alphabet=Config.alphabet), result_type='broadcast', axis=1)
+    except ValueError as e:
+        error_message, series, *_ = e.args
+        log_error('While processing {}:\n  {}'.format(series['wav_filename'], error_message))
+        exit(1)
 
     def generate_values():
         for _, row in df.iterrows():
diff --git a/util/text.py b/util/text.py
index 8a115ab4..d04961ce 100644
--- a/util/text.py
+++ b/util/text.py
@@ -31,7 +31,9 @@ class Alphabet(object):
             return self._str_to_label[string]
         except KeyError as e:
             raise KeyError(
-                '''ERROR: Your transcripts contain characters which do not occur in data/alphabet.txt! Use util/check_characters.py to see what characters are in your {train,dev,test}.csv transcripts, and then add all these to data/alphabet.txt.'''
+                'ERROR: Your transcripts contain characters (e.g. \'{}\') which do not occur in data/alphabet.txt! Use ' \
+                'util/check_characters.py to see what characters are in your [train,dev,test].csv transcripts, and ' \
+                'then add all these to data/alphabet.txt.'.format(string)
             ).with_traceback(e.__traceback__)
 
     def decode(self, labels):
@@ -47,15 +49,21 @@ class Alphabet(object):
         return self._config_file
 
 
-def text_to_char_array(original, alphabet):
+def text_to_char_array(series, alphabet):
     r"""
-    Given a Python string ``original``, remove unsupported characters, map characters
-    to integers and return a numpy array representing the processed string.
+    Given a Pandas Series containing a transcript string, map its characters to
+    integers and return the Series with the transcript replaced by a numpy array.
     """
-    integers = np.asarray([alphabet.label_from_string(c) for c in original])
-    if integers.shape[0] == 0:
-        raise Exception("Found an empty transcript! You must include a transcript for all training data.")
-    return integers
+    try:
+        series['transcript'] = np.asarray([alphabet.label_from_string(c) for c in series['transcript']])
+    except KeyError as e:
+        # Provide the row context (especially wav_filename) for alphabet errors
+        raise ValueError(str(e), series)
+
+    if series['transcript'].shape[0] == 0:
+        raise ValueError("Found an empty transcript! You must include a transcript for all training data.", series)
+
+    return series
 
 
 # The following code is from: http://hetland.org/coding/python/levenshtein.py