Named tuple AudioFormat, parameter re-ordering in util.audio and NP to PCM conversion support

Tilman Kamp 2020-04-02 13:12:08 +02:00
parent f8acf5cba7
commit 927859728f
2 changed files with 61 additions and 49 deletions
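
Note: the change replaces anonymous (rate, channels, width) tuples with an
AudioFormat namedtuple, moves data parameters ahead of the now-defaulted
audio_format parameter, and adds np_to_pcm as the inverse of pcm_to_np. A
minimal sketch of the resulting call sites (the fmt and pcm variables below
are illustrative, not part of the diff):

    from util.audio import AudioFormat, DEFAULT_FORMAT, pcm_to_np

    fmt = AudioFormat(rate=16000, channels=1, width=2)
    assert fmt.rate == 16000        # named access replaces audio_format[0]
    assert fmt == DEFAULT_FORMAT    # namedtuples still compare like plain tuples

    pcm = b'\x00\x00' * 16000       # one second of 16-bit mono silence
    samples = pcm_to_np(pcm)        # audio_format now defaults to DEFAULT_FORMAT
    samples = pcm_to_np(pcm, audio_format=fmt)  # or is passed by keyword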

util/audio.py

@@ -6,11 +6,14 @@ import collections
 import numpy as np

 from .helpers import LimitingPool
+from collections import namedtuple
+
+AudioFormat = namedtuple('AudioFormat', 'rate channels width')

 DEFAULT_RATE = 16000
 DEFAULT_CHANNELS = 1
 DEFAULT_WIDTH = 2
-DEFAULT_FORMAT = (DEFAULT_RATE, DEFAULT_CHANNELS, DEFAULT_WIDTH)
+DEFAULT_FORMAT = AudioFormat(DEFAULT_RATE, DEFAULT_CHANNELS, DEFAULT_WIDTH)

 AUDIO_TYPE_NP = 'application/vnd.mozilla.np'
 AUDIO_TYPE_PCM = 'application/vnd.mozilla.pcm'
@@ -33,7 +36,7 @@ class Sample:
     ----------
     audio_type : str
         See `__init__`.
-    audio_format : tuple:(int, int, int)
+    audio_format : util.audio.AudioFormat
         See `__init__`.
     audio : binary
         Audio data represented as indicated by `audio_type`
@@ -55,8 +58,7 @@ class Sample:
         raw_data : binary
             Audio data in the form of the provided representation type (see audio_type).
             For types util.audio.AUDIO_TYPE_OPUS or util.audio.AUDIO_TYPE_WAV data can also be passed as a bytearray.
-        audio_format : tuple
-            Tuple of sample-rate, number of channels and sample-width.
+        audio_format : util.audio.AudioFormat
             Required in case of audio_type = util.audio.AUDIO_TYPE_PCM or util.audio.AUDIO_TYPE_NP,
             as this information cannot be derived from raw audio data.
         sample_id : str
@@ -87,7 +89,6 @@ class Sample:
         ----------
         new_audio_type : str
            New audio-type - see `__init__`.
-           Not supported: Converting from AUDIO_TYPE_NP into any other type.
        """
        if self.audio_type == new_audio_type:
            return
@@ -95,13 +96,15 @@ class Sample:
             self.audio_format, audio = read_audio(self.audio_type, self.audio)
             self.audio.close()
             self.audio = audio
+        elif new_audio_type == AUDIO_TYPE_PCM and self.audio_type == AUDIO_TYPE_NP:
+            self.audio = np_to_pcm(self.audio, self.audio_format)
         elif new_audio_type == AUDIO_TYPE_NP:
             self.change_audio_type(AUDIO_TYPE_PCM)
-            self.audio = pcm_to_np(self.audio_format, self.audio)
+            self.audio = pcm_to_np(self.audio, self.audio_format)
         elif new_audio_type in SERIALIZABLE_AUDIO_TYPES:
             self.change_audio_type(AUDIO_TYPE_PCM)
             audio_bytes = io.BytesIO()
-            write_audio(new_audio_type, audio_bytes, self.audio_format, self.audio)
+            write_audio(new_audio_type, audio_bytes, self.audio, audio_format=self.audio_format)
             audio_bytes.seek(0)
             self.audio = audio_bytes
         else:
@@ -122,29 +125,30 @@ def change_audio_types(samples, audio_type=AUDIO_TYPE_PCM, processes=None, process_ahead=None):


 def read_audio_format_from_wav_file(wav_file):
-    return wav_file.getframerate(), wav_file.getnchannels(), wav_file.getsampwidth()
+    return AudioFormat(wav_file.getframerate(), wav_file.getnchannels(), wav_file.getsampwidth())


 def get_num_samples(pcm_buffer_size, audio_format=DEFAULT_FORMAT):
-    _, channels, width = audio_format
-    return pcm_buffer_size // (channels * width)
+    return pcm_buffer_size // (audio_format.channels * audio_format.width)


 def get_pcm_duration(pcm_buffer_size, audio_format=DEFAULT_FORMAT):
     """Calculates duration in seconds of a binary PCM buffer (typically read from a WAV file)"""
-    return get_num_samples(pcm_buffer_size, audio_format) / audio_format[0]
+    return get_num_samples(pcm_buffer_size, audio_format) / audio_format.rate


 def get_np_duration(np_len, audio_format=DEFAULT_FORMAT):
     """Calculates duration in seconds of NumPy audio data"""
-    return np_len / audio_format[0]
+    return np_len / audio_format.rate


 def convert_audio(src_audio_path, dst_audio_path, file_type=None, audio_format=DEFAULT_FORMAT):
-    sample_rate, channels, width = audio_format
     import sox
     transformer = sox.Transformer()
-    transformer.set_output_format(file_type=file_type, rate=sample_rate, channels=channels, bits=width*8)
+    transformer.set_output_format(file_type=file_type,
+                                  rate=audio_format.rate,
+                                  channels=audio_format.channels,
+                                  bits=audio_format.width * 8)
     transformer.build(src_audio_path, dst_audio_path)
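
Usage note: convert_audio now reads the target format from the defaulted
audio_format keyword; a brief sketch (the file paths are hypothetical
placeholders, and the sox package plus the SoX binary are required):

    from util.audio import AudioFormat, convert_audio

    # Transcode an arbitrary input to 8 kHz mono 16-bit WAV
    convert_audio('input.mp3', 'output.wav', file_type='wav',
                  audio_format=AudioFormat(8000, 1, 2))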
@@ -181,7 +185,7 @@ class AudioFile:

 def read_frames(wav_file, frame_duration_ms=30, yield_remainder=False):
     audio_format = read_audio_format_from_wav_file(wav_file)
-    frame_size = int(audio_format[0] * (frame_duration_ms / 1000.0))
+    frame_size = int(audio_format.rate * (frame_duration_ms / 1000.0))
     while True:
         try:
             data = wav_file.readframes(frame_size)
@@ -203,13 +207,12 @@ def vad_split(audio_frames,
               num_padding_frames=10,
               threshold=0.5,
               aggressiveness=3):
-    from webrtcvad import Vad
-    sample_rate, channels, width = audio_format
-    if channels != 1:
+    from webrtcvad import Vad  # pylint: disable=import-outside-toplevel
+    if audio_format.channels != 1:
         raise ValueError('VAD-splitting requires mono samples')
-    if width != 2:
+    if audio_format.width != 2:
         raise ValueError('VAD-splitting requires 16 bit samples')
-    if sample_rate not in [8000, 16000, 32000, 48000]:
+    if audio_format.rate not in [8000, 16000, 32000, 48000]:
         raise ValueError('VAD-splitting only supported for sample rates 8000, 16000, 32000, or 48000')
     if aggressiveness not in [0, 1, 2, 3]:
         raise ValueError('VAD-splitting aggressiveness mode has to be one of 0, 1, 2, or 3')
@@ -223,7 +226,7 @@ def vad_split(audio_frames,
         frame_duration_ms = get_pcm_duration(len(frame), audio_format) * 1000
         if int(frame_duration_ms) not in [10, 20, 30]:
             raise ValueError('VAD-splitting only supported for frame durations 10, 20, or 30 ms')
-        is_speech = vad.is_speech(frame, sample_rate)
+        is_speech = vad.is_speech(frame, audio_format.rate)
         if not triggered:
             ring_buffer.append((frame, is_speech))
             num_voiced = len([f for f, speech in ring_buffer if speech])
@@ -261,16 +264,15 @@ def get_opus_frame_size(rate):
     return 60 * rate // 1000


-def write_opus(opus_file, audio_format, audio_data):
-    rate, channels, width = audio_format
-    frame_size = get_opus_frame_size(rate)
+def write_opus(opus_file, audio_data, audio_format=DEFAULT_FORMAT):
+    frame_size = get_opus_frame_size(audio_format.rate)
     import opuslib  # pylint: disable=import-outside-toplevel
-    encoder = opuslib.Encoder(rate, channels, 'audio')
-    chunk_size = frame_size * channels * width
+    encoder = opuslib.Encoder(audio_format.rate, audio_format.channels, 'audio')
+    chunk_size = frame_size * audio_format.channels * audio_format.width
     opus_file.write(pack_number(len(audio_data), OPUS_PCM_LEN_SIZE))
-    opus_file.write(pack_number(rate, OPUS_RATE_SIZE))
-    opus_file.write(pack_number(channels, OPUS_CHANNELS_SIZE))
-    opus_file.write(pack_number(width, OPUS_WIDTH_SIZE))
+    opus_file.write(pack_number(audio_format.rate, OPUS_RATE_SIZE))
+    opus_file.write(pack_number(audio_format.channels, OPUS_CHANNELS_SIZE))
+    opus_file.write(pack_number(audio_format.width, OPUS_WIDTH_SIZE))
     for i in range(0, len(audio_data), chunk_size):
         chunk = audio_data[i:i + chunk_size]
         # Preventing non-deterministic encoding results from uninitialized remainder of the encoder buffer
@@ -287,15 +289,14 @@ def read_opus_header(opus_file):
     rate = unpack_number(opus_file.read(OPUS_RATE_SIZE))
     channels = unpack_number(opus_file.read(OPUS_CHANNELS_SIZE))
     width = unpack_number(opus_file.read(OPUS_WIDTH_SIZE))
-    return pcm_buffer_size, (rate, channels, width)
+    return pcm_buffer_size, AudioFormat(rate, channels, width)


 def read_opus(opus_file):
     pcm_buffer_size, audio_format = read_opus_header(opus_file)
-    rate, channels, _ = audio_format
-    frame_size = get_opus_frame_size(rate)
+    frame_size = get_opus_frame_size(audio_format.rate)
     import opuslib  # pylint: disable=import-outside-toplevel
-    decoder = opuslib.Decoder(rate, channels)
+    decoder = opuslib.Decoder(audio_format.rate, audio_format.channels)
     audio_data = bytearray()
     while len(audio_data) < pcm_buffer_size:
         chunk_len = unpack_number(opus_file.read(OPUS_CHUNK_LEN_SIZE))
@@ -306,12 +307,11 @@ def read_opus(opus_file):
     return audio_format, audio_data


-def write_wav(wav_file, audio_format, pcm_data):
+def write_wav(wav_file, pcm_data, audio_format=DEFAULT_FORMAT):
     with wave.open(wav_file, 'wb') as wav_file_writer:
-        rate, channels, width = audio_format
-        wav_file_writer.setframerate(rate)
-        wav_file_writer.setnchannels(channels)
-        wav_file_writer.setsampwidth(width)
+        wav_file_writer.setframerate(audio_format.rate)
+        wav_file_writer.setnchannels(audio_format.channels)
+        wav_file_writer.setsampwidth(audio_format.width)
         wav_file_writer.writeframes(pcm_data)
@@ -331,11 +331,11 @@ def read_audio(audio_type, audio_file):
     raise ValueError('Unsupported audio type: {}'.format(audio_type))


-def write_audio(audio_type, audio_file, audio_format, pcm_data):
+def write_audio(audio_type, audio_file, pcm_data, audio_format=DEFAULT_FORMAT):
     if audio_type == AUDIO_TYPE_WAV:
-        return write_wav(audio_file, audio_format, pcm_data)
+        return write_wav(audio_file, pcm_data, audio_format=audio_format)
     if audio_type == AUDIO_TYPE_OPUS:
-        return write_opus(audio_file, audio_format, pcm_data)
+        return write_opus(audio_file, pcm_data, audio_format=audio_format)
     raise ValueError('Unsupported audio type: {}'.format(audio_type))
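
Usage note: with PCM data now preceding the defaulted format, serialization
calls can omit audio_format entirely; a short sketch (the buffer and silence
bytes are illustrative):

    import io
    from util.audio import AUDIO_TYPE_WAV, write_audio

    pcm = b'\x00\x00' * 16000   # one second of 16-bit mono silence
    buf = io.BytesIO()
    # audio_format is implicitly DEFAULT_FORMAT (16 kHz, mono, 16 bit)
    write_audio(AUDIO_TYPE_WAV, buf, pcm)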
@@ -358,12 +358,24 @@ def read_duration(audio_type, audio_file):
     raise ValueError('Unsupported audio type: {}'.format(audio_type))


-def pcm_to_np(audio_format, pcm_data):
-    _, channels, width = audio_format
-    if width not in [1, 2, 4]:
-        raise ValueError('Unsupported sample width: {}'.format(width))
-    dtype = [None, np.int8, np.int16, None, np.int32][width]
+def get_dtype(audio_format):
+    if audio_format.width not in [1, 2, 4]:
+        raise ValueError('Unsupported sample width: {}'.format(audio_format.width))
+    return [None, np.int8, np.int16, None, np.int32][audio_format.width]
+
+
+def pcm_to_np(pcm_data, audio_format=DEFAULT_FORMAT):
+    assert audio_format.channels == 1  # only mono supported for now
+    dtype = get_dtype(audio_format)
     samples = np.frombuffer(pcm_data, dtype=dtype)
-    assert channels == 1  # only mono supported for now
     samples = samples.astype(np.float32) / np.iinfo(dtype).max
     return np.expand_dims(samples, axis=1)
+
+
+def np_to_pcm(np_data, audio_format=DEFAULT_FORMAT):
+    assert audio_format.channels == 1  # only mono supported for now
+    dtype = get_dtype(audio_format)
+    np_data = np_data.squeeze()
+    np_data *= np.iinfo(dtype).max
+    np_data = np_data.astype(dtype)
+    return bytearray(np_data.tobytes())
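
Usage note: np_to_pcm inverts pcm_to_np up to integer quantization; a
round-trip sketch under the module defaults (16 kHz mono 16-bit, assuming a
little-endian host):

    from util.audio import np_to_pcm, pcm_to_np

    pcm = b'\x00\x00\xff\x7f\x01\x80'   # int16 samples 0, +32767, -32767
    samples = pcm_to_np(pcm)            # float32 in [-1.0, 1.0], shape (3, 1)
    restored = np_to_pcm(samples)       # scaled back and re-quantized to int16
    assert bytes(restored) == pcm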

util/feeding.py

@@ -122,7 +122,7 @@ def create_dataset(sources,
                                         process_ahead=2 * batch_size if process_ahead is None else process_ahead):
            transcript = text_to_char_array(sample.transcript, Config.alphabet, context=sample.sample_id)
            transcript = to_sparse_tuple(transcript)
-           yield sample.sample_id, sample.audio, sample.audio_format[0], transcript
+           yield sample.sample_id, sample.audio, sample.audio_format.rate, transcript

    # Batching a dataset of 2D SparseTensors creates 3D batches, which fail
    # when passed to tf.nn.ctc_loss, so we reshape them to remove the extra
@@ -167,7 +167,7 @@ def split_audio_file(audio_path,
            yield time_start, time_end, samples

    def to_mfccs(time_start, time_end, samples):
-        features, features_len = samples_to_mfccs(samples, audio_format[0])
+        features, features_len = samples_to_mfccs(samples, audio_format.rate)
         return time_start, time_end, features, features_len

     def create_batch_set(bs, criteria):