diff --git a/native_client/ctcdecode/__init__.py b/native_client/ctcdecode/__init__.py
index 18f402a7..2dc2be56 100644
--- a/native_client/ctcdecode/__init__.py
+++ b/native_client/ctcdecode/__init__.py
@@ -48,15 +48,33 @@ class Alphabet(swigwrapper.Alphabet):
             raise ValueError('Alphabet initialization failed with error code 0x{:X}'.format(err))
 
     def CanEncodeSingle(self, input):
+        '''
+        Returns true if the single character/output class has a corresponding label
+        in the alphabet.
+        '''
         return super(Alphabet, self).CanEncodeSingle(input.encode('utf-8'))
 
     def CanEncode(self, input):
+        '''
+        Returns true if the entire string can be encoded into labels in this
+        alphabet.
+        '''
         return super(Alphabet, self).CanEncode(input.encode('utf-8'))
 
     def EncodeSingle(self, input):
+        '''
+        Encode a single character/output class into a label. Character must be in
+        the alphabet, this method will assert that. Use `CanEncodeSingle` to test.
+        '''
         return super(Alphabet, self).EncodeSingle(input.encode('utf-8'))
 
     def Encode(self, input):
+        '''
+        Encode a sequence of character/output classes into a sequence of labels.
+        Characters are assumed to always take a single Unicode codepoint.
+        Characters must be in the alphabet, this method will assert that. Use
+        `CanEncode` and `CanEncodeSingle` to test.
+        '''
         # Convert SWIG's UnsignedIntVec to a Python list
         res = super(Alphabet, self).Encode(input.encode('utf-8'))
         return [el for el in res]
@@ -66,6 +84,7 @@ class Alphabet(swigwrapper.Alphabet):
         return res.decode('utf-8')
 
     def Decode(self, input):
+        '''Decode a sequence of labels into a string.'''
         res = super(Alphabet, self).Decode(input)
         return res.decode('utf-8')
 
diff --git a/training/deepspeech_training/util/text.py b/training/deepspeech_training/util/text.py
index e1c2e981..198bd96e 100644
--- a/training/deepspeech_training/util/text.py
+++ b/training/deepspeech_training/util/text.py
@@ -9,16 +9,20 @@ def text_to_char_array(transcript, alphabet, context=''):
     integers and return a numpy array representing the processed string.
     Use a string in `context` for adding text to raised exceptions.
     """
-    try:
-        transcript = alphabet.Encode(transcript)
-        if len(transcript) == 0:
-            raise ValueError('While processing {}: Found an empty transcript! '
-                             'You must include a transcript for all training data.'
-                             .format(context))
-        return transcript
-    except KeyError as e:
+    if not alphabet.CanEncode(transcript):
         # Provide the row context (especially wav_filename) for alphabet errors
-        raise ValueError('While processing: {}\n{}'.format(context, e))
+        raise ValueError(
+            'Alphabet cannot encode transcript "{}" while processing sample "{}", '
+            'check that your alphabet contains all characters in the training corpus. '
+            'Missing characters are: {}.'
+            .format(transcript, context, [ch for ch in transcript if not alphabet.CanEncodeSingle(ch)]))
+
+    encoded = alphabet.Encode(transcript)
+    if len(encoded) == 0:
+        raise ValueError('While processing {}: Found an empty transcript! '
+                         'You must include a transcript for all training data.'
+                         .format(context))
+    return encoded
 
 
 # The following code is from: http://hetland.org/coding/python/levenshtein.py