Merge pull request #3176 from mozilla/alphabet-binding-docs
Document Alphabet methods in decoder binding as well
This commit is contained in:
commit
816c2d84ce
|
@ -48,15 +48,33 @@ class Alphabet(swigwrapper.Alphabet):
|
||||||
raise ValueError('Alphabet initialization failed with error code 0x{:X}'.format(err))
|
raise ValueError('Alphabet initialization failed with error code 0x{:X}'.format(err))
|
||||||
|
|
||||||
def CanEncodeSingle(self, input):
|
def CanEncodeSingle(self, input):
|
||||||
|
'''
|
||||||
|
Returns true if the single character/output class has a corresponding label
|
||||||
|
in the alphabet.
|
||||||
|
'''
|
||||||
return super(Alphabet, self).CanEncodeSingle(input.encode('utf-8'))
|
return super(Alphabet, self).CanEncodeSingle(input.encode('utf-8'))
|
||||||
|
|
||||||
def CanEncode(self, input):
|
def CanEncode(self, input):
|
||||||
|
'''
|
||||||
|
Returns true if the entire string can be encoded into labels in this
|
||||||
|
alphabet.
|
||||||
|
'''
|
||||||
return super(Alphabet, self).CanEncode(input.encode('utf-8'))
|
return super(Alphabet, self).CanEncode(input.encode('utf-8'))
|
||||||
|
|
||||||
def EncodeSingle(self, input):
|
def EncodeSingle(self, input):
|
||||||
|
'''
|
||||||
|
Encode a single character/output class into a label. Character must be in
|
||||||
|
the alphabet; this method will assert that. Use `CanEncodeSingle` to test.
|
||||||
|
'''
|
||||||
return super(Alphabet, self).EncodeSingle(input.encode('utf-8'))
|
return super(Alphabet, self).EncodeSingle(input.encode('utf-8'))
|
||||||
|
|
||||||
def Encode(self, input):
|
def Encode(self, input):
|
||||||
|
'''
|
||||||
|
Encode a sequence of character/output classes into a sequence of labels.
|
||||||
|
Characters are assumed to always take a single Unicode codepoint.
|
||||||
|
Characters must be in the alphabet; this method will assert that. Use
|
||||||
|
`CanEncode` and `CanEncodeSingle` to test.
|
||||||
|
'''
|
||||||
# Convert SWIG's UnsignedIntVec to a Python list
|
# Convert SWIG's UnsignedIntVec to a Python list
|
||||||
res = super(Alphabet, self).Encode(input.encode('utf-8'))
|
res = super(Alphabet, self).Encode(input.encode('utf-8'))
|
||||||
return [el for el in res]
|
return [el for el in res]
|
||||||
|
@ -66,6 +84,7 @@ class Alphabet(swigwrapper.Alphabet):
|
||||||
return res.decode('utf-8')
|
return res.decode('utf-8')
|
||||||
|
|
||||||
def Decode(self, input):
|
def Decode(self, input):
|
||||||
|
'''Decode a sequence of labels into a string.'''
|
||||||
res = super(Alphabet, self).Decode(input)
|
res = super(Alphabet, self).Decode(input)
|
||||||
return res.decode('utf-8')
|
return res.decode('utf-8')
|
||||||
|
|
||||||
|
|
|
@ -9,16 +9,20 @@ def text_to_char_array(transcript, alphabet, context=''):
|
||||||
integers and return a numpy array representing the processed string.
|
integers and return a numpy array representing the processed string.
|
||||||
Use a string in `context` for adding text to raised exceptions.
|
Use a string in `context` for adding text to raised exceptions.
|
||||||
"""
|
"""
|
||||||
try:
|
if not alphabet.CanEncode(transcript):
|
||||||
transcript = alphabet.Encode(transcript)
|
# Provide the row context (especially wav_filename) for alphabet errors
|
||||||
if len(transcript) == 0:
|
raise ValueError(
|
||||||
|
'Alphabet cannot encode transcript "{}" while processing sample "{}", '
|
||||||
|
'check that your alphabet contains all characters in the training corpus. '
|
||||||
|
'Missing characters are: {}.'
|
||||||
|
.format(transcript, context, list(ch for ch in transcript if not alphabet.CanEncodeSingle(ch))))
|
||||||
|
|
||||||
|
encoded = alphabet.Encode(transcript)
|
||||||
|
if len(encoded) == 0:
|
||||||
raise ValueError('While processing {}: Found an empty transcript! '
|
raise ValueError('While processing {}: Found an empty transcript! '
|
||||||
'You must include a transcript for all training data.'
|
'You must include a transcript for all training data.'
|
||||||
.format(context))
|
.format(context))
|
||||||
return transcript
|
return encoded
|
||||||
except KeyError as e:
|
|
||||||
# Provide the row context (especially wav_filename) for alphabet errors
|
|
||||||
raise ValueError('While processing: {}\n{}'.format(context, e))
|
|
||||||
|
|
||||||
|
|
||||||
# The following code is from: http://hetland.org/coding/python/levenshtein.py
|
# The following code is from: http://hetland.org/coding/python/levenshtein.py
|
||||||
|
|
Loading…
Reference in New Issue