Merge pull request #3176 from mozilla/alphabet-binding-docs

Document Alphabet methods in decoder binding as well
This commit is contained in:
Reuben Morais 2020-07-23 16:06:54 +02:00 committed by GitHub
commit 816c2d84ce
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 32 additions and 9 deletions

View File

@ -48,15 +48,33 @@ class Alphabet(swigwrapper.Alphabet):
raise ValueError('Alphabet initialization failed with error code 0x{:X}'.format(err)) raise ValueError('Alphabet initialization failed with error code 0x{:X}'.format(err))
def CanEncodeSingle(self, input):
    '''
    Report whether a single character/output class has a corresponding
    label in this alphabet.

    The text is UTF-8 encoded before being handed to the wrapped
    implementation, whose result is returned unchanged.
    '''
    utf8_bytes = input.encode('utf-8')
    return super(Alphabet, self).CanEncodeSingle(utf8_bytes)
def CanEncode(self, input):
    '''
    Report whether the entire string can be encoded into labels of this
    alphabet.

    The text is UTF-8 encoded before being handed to the wrapped
    implementation, whose result is returned unchanged.
    '''
    utf8_bytes = input.encode('utf-8')
    return super(Alphabet, self).CanEncode(utf8_bytes)
def EncodeSingle(self, input):
    '''
    Encode a single character/output class into its label.

    The character must be present in the alphabet; this method asserts
    that it is. Call `CanEncodeSingle` first to check.
    '''
    utf8_bytes = input.encode('utf-8')
    return super(Alphabet, self).EncodeSingle(utf8_bytes)
def Encode(self, input):
    '''
    Encode a sequence of characters/output classes into a list of labels.

    Each character is assumed to occupy exactly one Unicode codepoint.
    Every character must be present in the alphabet; this method asserts
    that they are. Call `CanEncode` or `CanEncodeSingle` first to check.
    '''
    utf8_bytes = input.encode('utf-8')
    # Materialize SWIG's UnsignedIntVec as a plain Python list
    return list(super(Alphabet, self).Encode(utf8_bytes))
@ -66,6 +84,7 @@ class Alphabet(swigwrapper.Alphabet):
return res.decode('utf-8') return res.decode('utf-8')
def Decode(self, input):
    '''
    Decode a sequence of labels into a string.

    The wrapped implementation's result is decoded as UTF-8 before being
    returned.
    '''
    return super(Alphabet, self).Decode(input).decode('utf-8')

View File

@ -9,16 +9,20 @@ def text_to_char_array(transcript, alphabet, context=''):
integers and return a numpy array representing the processed string. integers and return a numpy array representing the processed string.
Use a string in `context` for adding text to raised exceptions. Use a string in `context` for adding text to raised exceptions.
""" """
try: if not alphabet.CanEncode(transcript):
transcript = alphabet.Encode(transcript) # Provide the row context (especially wav_filename) for alphabet errors
if len(transcript) == 0: raise ValueError(
'Alphabet cannot encode transcript "{}" while processing sample "{}", '
'check that your alphabet contains all characters in the training corpus. '
'Missing characters are: {}.'
.format(transcript, context, list(ch for ch in transcript if not alphabet.CanEncodeSingle(ch))))
encoded = alphabet.Encode(transcript)
if len(encoded) == 0:
raise ValueError('While processing {}: Found an empty transcript! ' raise ValueError('While processing {}: Found an empty transcript! '
'You must include a transcript for all training data.' 'You must include a transcript for all training data.'
.format(context)) .format(context))
return transcript return encoded
except KeyError as e:
# Provide the row context (especially wav_filename) for alphabet errors
raise ValueError('While processing: {}\n{}'.format(context, e))
# The following code is from: http://hetland.org/coding/python/levenshtein.py # The following code is from: http://hetland.org/coding/python/levenshtein.py