From eb33fc171932c0779a4f7e06bec5a2a961546bf7 Mon Sep 17 00:00:00 2001
From: Reuben Morais <reuben.morais@gmail.com>
Date: Thu, 23 Jul 2020 13:00:10 +0200
Subject: [PATCH 1/2] Document Alphabet methods in Python binding as well

---
 native_client/ctcdecode/__init__.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/native_client/ctcdecode/__init__.py b/native_client/ctcdecode/__init__.py
index 18f402a7..2dc2be56 100644
--- a/native_client/ctcdecode/__init__.py
+++ b/native_client/ctcdecode/__init__.py
@@ -48,15 +48,33 @@ class Alphabet(swigwrapper.Alphabet):
             raise ValueError('Alphabet initialization failed with error code 0x{:X}'.format(err))
 
     def CanEncodeSingle(self, input):
+        '''
+        Returns True if the single character/output class has a corresponding label
+        in the alphabet.
+        '''
         return super(Alphabet, self).CanEncodeSingle(input.encode('utf-8'))
 
     def CanEncode(self, input):
+        '''
+        Returns True if the entire string can be encoded into labels in this
+        alphabet.
+        '''
         return super(Alphabet, self).CanEncode(input.encode('utf-8'))
 
     def EncodeSingle(self, input):
+        '''
+        Encode a single character/output class into a label. Character must be in
+        the alphabet; this method will assert that. Use `CanEncodeSingle` to test.
+        '''
         return super(Alphabet, self).EncodeSingle(input.encode('utf-8'))
 
     def Encode(self, input):
+        '''
+        Encode a sequence of character/output classes into a sequence of labels.
+        Characters are assumed to always take a single Unicode codepoint.
+        Characters must be in the alphabet; this method will assert that. Use
+        `CanEncode` and `CanEncodeSingle` to test.
+        '''
         # Convert SWIG's UnsignedIntVec to a Python list
         res = super(Alphabet, self).Encode(input.encode('utf-8'))
         return [el for el in res]
@@ -66,6 +84,7 @@ class Alphabet(swigwrapper.Alphabet):
         return res.decode('utf-8')
 
     def Decode(self, input):
+        '''Decode a sequence of labels into a string.'''
         res = super(Alphabet, self).Decode(input)
         return res.decode('utf-8')
 

From 2cdc228db48fe62330381214863d0a8e4e405d2f Mon Sep 17 00:00:00 2001
From: Reuben Morais <reuben.morais@gmail.com>
Date: Thu, 23 Jul 2020 13:16:12 +0200
Subject: [PATCH 2/2] Use Alphabet.CanEncode in text_to_char_array

---
 training/deepspeech_training/util/text.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/training/deepspeech_training/util/text.py b/training/deepspeech_training/util/text.py
index e1c2e981..198bd96e 100644
--- a/training/deepspeech_training/util/text.py
+++ b/training/deepspeech_training/util/text.py
@@ -9,16 +9,20 @@ def text_to_char_array(transcript, alphabet, context=''):
     integers and return a numpy array representing the processed string.
     Use a string in `context` for adding text to raised exceptions.
     """
-    try:
-        transcript = alphabet.Encode(transcript)
-        if len(transcript) == 0:
-            raise ValueError('While processing {}: Found an empty transcript! '
-                             'You must include a transcript for all training data.'
-                             .format(context))
-        return transcript
-    except KeyError as e:
+    if not alphabet.CanEncode(transcript):
         # Provide the row context (especially wav_filename) for alphabet errors
-        raise ValueError('While processing: {}\n{}'.format(context, e))
+        raise ValueError(
+            'Alphabet cannot encode transcript "{}" while processing sample "{}", '
+            'check that your alphabet contains all characters in the training corpus. '
+            'Missing characters are: {}.'
+            .format(transcript, context, list(ch for ch in transcript if not alphabet.CanEncodeSingle(ch))))
+
+    encoded = alphabet.Encode(transcript)
+    if len(encoded) == 0:
+        raise ValueError('While processing {}: Found an empty transcript! '
+                         'You must include a transcript for all training data.'
+                         .format(context))
+    return encoded
 
 
 # The following code is from: http://hetland.org/coding/python/levenshtein.py