Named tuple AudioFormat, parameter re-ordering in util.audio and NP to PCM conversion support

Tilman Kamp 2020-04-02 13:12:08 +02:00
parent f8acf5cba7
commit 927859728f
2 changed files with 61 additions and 49 deletions
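
Note: the change replaces anonymous (rate, channels, width) tuples with an
AudioFormat namedtuple, moves data parameters ahead of the now-defaulted
audio_format parameter, and adds np_to_pcm as the inverse of pcm_to_np. A
minimal sketch of the resulting call sites (the fmt and pcm variables below
are illustrative, not part of the diff):

    from util.audio import AudioFormat, DEFAULT_FORMAT, pcm_to_np

    fmt = AudioFormat(rate=16000, channels=1, width=2)
    assert fmt.rate == 16000        # named access replaces audio_format[0]
    assert fmt == DEFAULT_FORMAT    # namedtuples still compare like plain tuples

    pcm = b'\x00\x00' * 16000       # one second of 16-bit mono silence
    samples = pcm_to_np(pcm)        # audio_format now defaults to DEFAULT_FORMAT
    samples = pcm_to_np(pcm, audio_format=fmt)  # or is passed by keyword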

util/audio.py

@@ -6,11 +6,14 @@ import collections
 import numpy as np

 from .helpers import LimitingPool
+from collections import namedtuple
+
+AudioFormat = namedtuple('AudioFormat', 'rate channels width')

 DEFAULT_RATE = 16000
 DEFAULT_CHANNELS = 1
 DEFAULT_WIDTH = 2
-DEFAULT_FORMAT = (DEFAULT_RATE, DEFAULT_CHANNELS, DEFAULT_WIDTH)
+DEFAULT_FORMAT = AudioFormat(DEFAULT_RATE, DEFAULT_CHANNELS, DEFAULT_WIDTH)

 AUDIO_TYPE_NP = 'application/vnd.mozilla.np'
 AUDIO_TYPE_PCM = 'application/vnd.mozilla.pcm'
@@ -33,7 +36,7 @@ class Sample:
     ----------
     audio_type : str
         See `__init__`.
-    audio_format : tuple:(int, int, int)
+    audio_format : util.audio.AudioFormat
         See `__init__`.
     audio : binary
         Audio data represented as indicated by `audio_type`
@@ -55,8 +58,7 @@ class Sample:
         raw_data : binary
             Audio data in the form of the provided representation type (see audio_type).
             For types util.audio.AUDIO_TYPE_OPUS or util.audio.AUDIO_TYPE_WAV data can also be passed as a bytearray.
-        audio_format : tuple
-            Tuple of sample-rate, number of channels and sample-width.
+        audio_format : util.audio.AudioFormat
             Required in case of audio_type = util.audio.AUDIO_TYPE_PCM or util.audio.AUDIO_TYPE_NP,
             as this information cannot be derived from raw audio data.
         sample_id : str
@@ -87,7 +89,6 @@ class Sample:
         ----------
         new_audio_type : str
            New audio-type - see `__init__`.
-           Not supported: Converting from AUDIO_TYPE_NP into any other type.
        """
        if self.audio_type == new_audio_type:
            return
@@ -95,13 +96,15 @@ class Sample:
             self.audio_format, audio = read_audio(self.audio_type, self.audio)
             self.audio.close()
             self.audio = audio
+        elif new_audio_type == AUDIO_TYPE_PCM and self.audio_type == AUDIO_TYPE_NP:
+            self.audio = np_to_pcm(self.audio, self.audio_format)
         elif new_audio_type == AUDIO_TYPE_NP:
             self.change_audio_type(AUDIO_TYPE_PCM)
-            self.audio = pcm_to_np(self.audio_format, self.audio)
+            self.audio = pcm_to_np(self.audio, self.audio_format)
         elif new_audio_type in SERIALIZABLE_AUDIO_TYPES:
             self.change_audio_type(AUDIO_TYPE_PCM)
             audio_bytes = io.BytesIO()
-            write_audio(new_audio_type, audio_bytes, self.audio_format, self.audio)
+            write_audio(new_audio_type, audio_bytes, self.audio, audio_format=self.audio_format)
             audio_bytes.seek(0)
             self.audio = audio_bytes
         else:
@@ -122,29 +125,30 @@ def change_audio_types(samples, audio_type=AUDIO_TYPE_PCM, processes=None, process_ahead=None):


 def read_audio_format_from_wav_file(wav_file):
-    return wav_file.getframerate(), wav_file.getnchannels(), wav_file.getsampwidth()
+    return AudioFormat(wav_file.getframerate(), wav_file.getnchannels(), wav_file.getsampwidth())


 def get_num_samples(pcm_buffer_size, audio_format=DEFAULT_FORMAT):
-    _, channels, width = audio_format
-    return pcm_buffer_size // (channels * width)
+    return pcm_buffer_size // (audio_format.channels * audio_format.width)


 def get_pcm_duration(pcm_buffer_size, audio_format=DEFAULT_FORMAT):
     """Calculates duration in seconds of a binary PCM buffer (typically read from a WAV file)"""
-    return get_num_samples(pcm_buffer_size, audio_format) / audio_format[0]
+    return get_num_samples(pcm_buffer_size, audio_format) / audio_format.rate


 def get_np_duration(np_len, audio_format=DEFAULT_FORMAT):
     """Calculates duration in seconds of NumPy audio data"""
-    return np_len / audio_format[0]
+    return np_len / audio_format.rate


 def convert_audio(src_audio_path, dst_audio_path, file_type=None, audio_format=DEFAULT_FORMAT):
-    sample_rate, channels, width = audio_format
     import sox
     transformer = sox.Transformer()
-    transformer.set_output_format(file_type=file_type, rate=sample_rate, channels=channels, bits=width*8)
+    transformer.set_output_format(file_type=file_type,
+                                  rate=audio_format.rate,
+                                  channels=audio_format.channels,
+                                  bits=audio_format.width * 8)
     transformer.build(src_audio_path, dst_audio_path)
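
Usage note: convert_audio now reads the target format from the defaulted
audio_format keyword; a brief sketch (the file paths are hypothetical
placeholders, and the sox package plus the SoX binary are required):

    from util.audio import AudioFormat, convert_audio

    # Transcode an arbitrary input to 8 kHz mono 16-bit WAV
    convert_audio('input.mp3', 'output.wav', file_type='wav',
                  audio_format=AudioFormat(8000, 1, 2))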
@@ -181,7 +185,7 @@ class AudioFile:

 def read_frames(wav_file, frame_duration_ms=30, yield_remainder=False):
     audio_format = read_audio_format_from_wav_file(wav_file)
-    frame_size = int(audio_format[0] * (frame_duration_ms / 1000.0))
+    frame_size = int(audio_format.rate * (frame_duration_ms / 1000.0))
     while True:
         try:
             data = wav_file.readframes(frame_size)
@@ -203,13 +207,12 @@ def vad_split(audio_frames,
               num_padding_frames=10,
               threshold=0.5,
               aggressiveness=3):
-    from webrtcvad import Vad
-    sample_rate, channels, width = audio_format
-    if channels != 1:
+    from webrtcvad import Vad  # pylint: disable=import-outside-toplevel
+    if audio_format.channels != 1:
         raise ValueError('VAD-splitting requires mono samples')
-    if width != 2:
+    if audio_format.width != 2:
         raise ValueError('VAD-splitting requires 16 bit samples')
-    if sample_rate not in [8000, 16000, 32000, 48000]:
+    if audio_format.rate not in [8000, 16000, 32000, 48000]:
         raise ValueError('VAD-splitting only supported for sample rates 8000, 16000, 32000, or 48000')
     if aggressiveness not in [0, 1, 2, 3]:
         raise ValueError('VAD-splitting aggressiveness mode has to be one of 0, 1, 2, or 3')
@@ -223,7 +226,7 @@ def vad_split(audio_frames,
         frame_duration_ms = get_pcm_duration(len(frame), audio_format) * 1000
         if int(frame_duration_ms) not in [10, 20, 30]:
             raise ValueError('VAD-splitting only supported for frame durations 10, 20, or 30 ms')
-        is_speech = vad.is_speech(frame, sample_rate)
+        is_speech = vad.is_speech(frame, audio_format.rate)
         if not triggered:
             ring_buffer.append((frame, is_speech))
             num_voiced = len([f for f, speech in ring_buffer if speech])
@@ -261,16 +264,15 @@ def get_opus_frame_size(rate):
     return 60 * rate // 1000


-def write_opus(opus_file, audio_format, audio_data):
-    rate, channels, width = audio_format
-    frame_size = get_opus_frame_size(rate)
+def write_opus(opus_file, audio_data, audio_format=DEFAULT_FORMAT):
+    frame_size = get_opus_frame_size(audio_format.rate)
     import opuslib  # pylint: disable=import-outside-toplevel
-    encoder = opuslib.Encoder(rate, channels, 'audio')
-    chunk_size = frame_size * channels * width
+    encoder = opuslib.Encoder(audio_format.rate, audio_format.channels, 'audio')
+    chunk_size = frame_size * audio_format.channels * audio_format.width
     opus_file.write(pack_number(len(audio_data), OPUS_PCM_LEN_SIZE))
-    opus_file.write(pack_number(rate, OPUS_RATE_SIZE))
-    opus_file.write(pack_number(channels, OPUS_CHANNELS_SIZE))
-    opus_file.write(pack_number(width, OPUS_WIDTH_SIZE))
+    opus_file.write(pack_number(audio_format.rate, OPUS_RATE_SIZE))
+    opus_file.write(pack_number(audio_format.channels, OPUS_CHANNELS_SIZE))
+    opus_file.write(pack_number(audio_format.width, OPUS_WIDTH_SIZE))
     for i in range(0, len(audio_data), chunk_size):
         chunk = audio_data[i:i + chunk_size]
         # Preventing non-deterministic encoding results from uninitialized remainder of the encoder buffer
@@ -287,15 +289,14 @@ def read_opus_header(opus_file):
     rate = unpack_number(opus_file.read(OPUS_RATE_SIZE))
     channels = unpack_number(opus_file.read(OPUS_CHANNELS_SIZE))
     width = unpack_number(opus_file.read(OPUS_WIDTH_SIZE))
-    return pcm_buffer_size, (rate, channels, width)
+    return pcm_buffer_size, AudioFormat(rate, channels, width)


 def read_opus(opus_file):
     pcm_buffer_size, audio_format = read_opus_header(opus_file)
-    rate, channels, _ = audio_format
-    frame_size = get_opus_frame_size(rate)
+    frame_size = get_opus_frame_size(audio_format.rate)
     import opuslib  # pylint: disable=import-outside-toplevel
-    decoder = opuslib.Decoder(rate, channels)
+    decoder = opuslib.Decoder(audio_format.rate, audio_format.channels)
     audio_data = bytearray()
     while len(audio_data) < pcm_buffer_size:
         chunk_len = unpack_number(opus_file.read(OPUS_CHUNK_LEN_SIZE))
@@ -306,12 +307,11 @@ def read_opus(opus_file):
     return audio_format, audio_data


-def write_wav(wav_file, audio_format, pcm_data):
+def write_wav(wav_file, pcm_data, audio_format=DEFAULT_FORMAT):
     with wave.open(wav_file, 'wb') as wav_file_writer:
-        rate, channels, width = audio_format
-        wav_file_writer.setframerate(rate)
-        wav_file_writer.setnchannels(channels)
-        wav_file_writer.setsampwidth(width)
+        wav_file_writer.setframerate(audio_format.rate)
+        wav_file_writer.setnchannels(audio_format.channels)
+        wav_file_writer.setsampwidth(audio_format.width)
         wav_file_writer.writeframes(pcm_data)
@@ -331,11 +331,11 @@ def read_audio(audio_type, audio_file):
     raise ValueError('Unsupported audio type: {}'.format(audio_type))


-def write_audio(audio_type, audio_file, audio_format, pcm_data):
+def write_audio(audio_type, audio_file, pcm_data, audio_format=DEFAULT_FORMAT):
     if audio_type == AUDIO_TYPE_WAV:
-        return write_wav(audio_file, audio_format, pcm_data)
+        return write_wav(audio_file, pcm_data, audio_format=audio_format)
     if audio_type == AUDIO_TYPE_OPUS:
-        return write_opus(audio_file, audio_format, pcm_data)
+        return write_opus(audio_file, pcm_data, audio_format=audio_format)
     raise ValueError('Unsupported audio type: {}'.format(audio_type))
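
Usage note: with PCM data now preceding the defaulted format, serialization
calls can omit audio_format entirely; a short sketch (the buffer and silence
bytes are illustrative):

    import io
    from util.audio import AUDIO_TYPE_WAV, write_audio

    pcm = b'\x00\x00' * 16000   # one second of 16-bit mono silence
    buf = io.BytesIO()
    # audio_format is implicitly DEFAULT_FORMAT (16 kHz, mono, 16 bit)
    write_audio(AUDIO_TYPE_WAV, buf, pcm)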
@@ -358,12 +358,24 @@ def read_duration(audio_type, audio_file):
     raise ValueError('Unsupported audio type: {}'.format(audio_type))


-def pcm_to_np(audio_format, pcm_data):
-    _, channels, width = audio_format
-    if width not in [1, 2, 4]:
-        raise ValueError('Unsupported sample width: {}'.format(width))
-    dtype = [None, np.int8, np.int16, None, np.int32][width]
+def get_dtype(audio_format):
+    if audio_format.width not in [1, 2, 4]:
+        raise ValueError('Unsupported sample width: {}'.format(audio_format.width))
+    return [None, np.int8, np.int16, None, np.int32][audio_format.width]
+
+
+def pcm_to_np(pcm_data, audio_format=DEFAULT_FORMAT):
+    assert audio_format.channels == 1  # only mono supported for now
+    dtype = get_dtype(audio_format)
     samples = np.frombuffer(pcm_data, dtype=dtype)
-    assert channels == 1  # only mono supported for now
     samples = samples.astype(np.float32) / np.iinfo(dtype).max
     return np.expand_dims(samples, axis=1)
+
+
+def np_to_pcm(np_data, audio_format=DEFAULT_FORMAT):
+    assert audio_format.channels == 1  # only mono supported for now
+    dtype = get_dtype(audio_format)
+    np_data = np_data.squeeze()
+    np_data *= np.iinfo(dtype).max
+    np_data = np_data.astype(dtype)
+    return bytearray(np_data.tobytes())
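
Usage note: np_to_pcm inverts pcm_to_np up to integer quantization; a
round-trip sketch under the module defaults (16 kHz mono 16-bit, assuming a
little-endian host):

    from util.audio import np_to_pcm, pcm_to_np

    pcm = b'\x00\x00\xff\x7f\x01\x80'   # int16 samples 0, +32767, -32767
    samples = pcm_to_np(pcm)            # float32 in [-1.0, 1.0], shape (3, 1)
    restored = np_to_pcm(samples)       # scaled back and re-quantized to int16
    assert bytes(restored) == pcm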

util/feeding.py

@@ -122,7 +122,7 @@ def create_dataset(sources,
                                         process_ahead=2 * batch_size if process_ahead is None else process_ahead):
            transcript = text_to_char_array(sample.transcript, Config.alphabet, context=sample.sample_id)
            transcript = to_sparse_tuple(transcript)
-           yield sample.sample_id, sample.audio, sample.audio_format[0], transcript
+           yield sample.sample_id, sample.audio, sample.audio_format.rate, transcript

    # Batching a dataset of 2D SparseTensors creates 3D batches, which fail
    # when passed to tf.nn.ctc_loss, so we reshape them to remove the extra
@@ -167,7 +167,7 @@ def split_audio_file(audio_path,
            yield time_start, time_end, samples

    def to_mfccs(time_start, time_end, samples):
-        features, features_len = samples_to_mfccs(samples, audio_format[0])
+        features, features_len = samples_to_mfccs(samples, audio_format.rate)
         return time_start, time_end, features, features_len

     def create_batch_set(bs, criteria):