Merge pull request #3176 from mozilla/alphabet-binding-docs

Document Alphabet methods in decoder binding as well
2020-07-23 16:06:54 +02:00 · 2020-07-23 16:06:54 +02:00 · 816c2d84ce
parent 38f6afdba8 2cdc228db4
commit 816c2d84ce
2 changed files with 32 additions and 9 deletions
--- a/native_client/ctcdecode/__init__.py
+++ b/native_client/ctcdecode/__init__.py
@@ -48,15 +48,33 @@ class Alphabet(swigwrapper.Alphabet):
            raise ValueError('Alphabet initialization failed with error code 0x{:X}'.format(err))
    def CanEncodeSingle(self, input):
        '''
        Check whether a single character/output class has a corresponding
        label in the alphabet. Returns true if it does.
        '''
        # The parent class method is called with the UTF-8 encoded bytes.
        utf8_input = input.encode('utf-8')
        return super(Alphabet, self).CanEncodeSingle(utf8_input)
    def CanEncode(self, input):
        '''
        Check whether the entire string can be encoded into labels in this
        alphabet. Returns true if it can.
        '''
        # The parent class method is called with the UTF-8 encoded bytes.
        utf8_input = input.encode('utf-8')
        return super(Alphabet, self).CanEncode(utf8_input)
    def EncodeSingle(self, input):
        '''
        Encode a single character/output class into a label.

        The character must be in the alphabet; this method will assert that.
        Use `CanEncodeSingle` to test beforehand.
        '''
        # The parent class method is called with the UTF-8 encoded bytes.
        utf8_input = input.encode('utf-8')
        return super(Alphabet, self).EncodeSingle(utf8_input)
    def Encode(self, input):
        '''
        Encode a sequence of character/output classes into a sequence of
        labels, returned as a Python list.

        Characters are assumed to always take a single Unicode codepoint.
        Characters must be in the alphabet; this method will assert that.
        Use `CanEncode` and `CanEncodeSingle` to test beforehand.
        '''
        utf8_input = input.encode('utf-8')
        labels = super(Alphabet, self).Encode(utf8_input)
        # Convert SWIG's UnsignedIntVec to a Python list
        return list(labels)
@@ -66,6 +84,7 @@ class Alphabet(swigwrapper.Alphabet):
        return res.decode('utf-8')
    def Decode(self, input):
        '''Decode a sequence of labels into a string.'''
        # The parent class result is decoded as UTF-8 before being returned.
        return super(Alphabet, self).Decode(input).decode('utf-8')
--- a/training/deepspeech_training/util/text.py
+++ b/training/deepspeech_training/util/text.py
@@ -9,16 +9,20 @@ def text_to_char_array(transcript, alphabet, context=''):
    integers and return a numpy array representing the processed string.
    Use a string in `context` for adding text to raised exceptions.
    """
-    try:
+    if not alphabet.CanEncode(transcript):
-        transcript = alphabet.Encode(transcript)
+        # Provide the row context (especially wav_filename) for alphabet errors
-        if len(transcript) == 0:
+        raise ValueError(
            'Alphabet cannot encode transcript "{}" while processing sample "{}", '
            'check that your alphabet contains all characters in the training corpus. '
            'Missing characters are: {}.'
            .format(transcript, context, list(ch for ch in transcript if not alphabet.CanEncodeSingle(ch))))
    encoded = alphabet.Encode(transcript)
    if len(encoded) == 0:
        raise ValueError('While processing {}: Found an empty transcript! '
                         'You must include a transcript for all training data.'
                         .format(context))
-        return transcript
+    return encoded
    except KeyError as e:
        # Provide the row context (especially wav_filename) for alphabet errors
        raise ValueError('While processing: {}\n{}'.format(context, e))
 # The following code is from: http://hetland.org/coding/python/levenshtein.py