Named tuple AudioFormat, parameter re-ordering in util.audio and NP to PCM conversion support
This commit is contained in:
parent
f8acf5cba7
commit
927859728f
@ -6,11 +6,14 @@ import collections
|
||||
import numpy as np
|
||||
|
||||
from .helpers import LimitingPool
|
||||
from collections import namedtuple
|
||||
|
||||
# Description of raw audio data: sample rate (samples per second),
# number of channels and sample width in bytes.
AudioFormat = namedtuple('AudioFormat', 'rate channels width')

DEFAULT_RATE = 16000
DEFAULT_CHANNELS = 1
DEFAULT_WIDTH = 2
# Single definition of the default format; it is an AudioFormat so that
# attribute access (.rate, .channels, .width) works on it.
DEFAULT_FORMAT = AudioFormat(DEFAULT_RATE, DEFAULT_CHANNELS, DEFAULT_WIDTH)
|
||||
|
||||
AUDIO_TYPE_NP = 'application/vnd.mozilla.np'
|
||||
AUDIO_TYPE_PCM = 'application/vnd.mozilla.pcm'
|
||||
@ -33,7 +36,7 @@ class Sample:
|
||||
----------
|
||||
audio_type : str
|
||||
See `__init__`.
|
||||
audio_format : tuple:(int, int, int)
|
||||
audio_format : util.audio.AudioFormat
|
||||
See `__init__`.
|
||||
audio : binary
|
||||
Audio data represented as indicated by `audio_type`
|
||||
@ -55,8 +58,7 @@ class Sample:
|
||||
raw_data : binary
|
||||
Audio data in the form of the provided representation type (see audio_type).
|
||||
For types util.audio.AUDIO_TYPE_OPUS or util.audio.AUDIO_TYPE_WAV data can also be passed as a bytearray.
|
||||
audio_format : tuple
|
||||
Tuple of sample-rate, number of channels and sample-width.
|
||||
audio_format : util.audio.AudioFormat
|
||||
Required in case of audio_type = util.audio.AUDIO_TYPE_PCM or util.audio.AUDIO_TYPE_NP,
|
||||
as this information cannot be derived from raw audio data.
|
||||
sample_id : str
|
||||
@ -87,7 +89,6 @@ class Sample:
|
||||
----------
|
||||
new_audio_type : str
|
||||
New audio-type - see `__init__`.
|
||||
Not supported: Converting from AUDIO_TYPE_NP into any other type.
|
||||
"""
|
||||
if self.audio_type == new_audio_type:
|
||||
return
|
||||
@ -95,13 +96,15 @@ class Sample:
|
||||
self.audio_format, audio = read_audio(self.audio_type, self.audio)
|
||||
self.audio.close()
|
||||
self.audio = audio
|
||||
elif new_audio_type == AUDIO_TYPE_PCM and self.audio_type == AUDIO_TYPE_NP:
|
||||
self.audio = np_to_pcm(self.audio, self.audio_format)
|
||||
elif new_audio_type == AUDIO_TYPE_NP:
|
||||
self.change_audio_type(AUDIO_TYPE_PCM)
|
||||
self.audio = pcm_to_np(self.audio_format, self.audio)
|
||||
self.audio = pcm_to_np(self.audio, self.audio_format)
|
||||
elif new_audio_type in SERIALIZABLE_AUDIO_TYPES:
|
||||
self.change_audio_type(AUDIO_TYPE_PCM)
|
||||
audio_bytes = io.BytesIO()
|
||||
write_audio(new_audio_type, audio_bytes, self.audio_format, self.audio)
|
||||
write_audio(new_audio_type, audio_bytes, self.audio, audio_format=self.audio_format)
|
||||
audio_bytes.seek(0)
|
||||
self.audio = audio_bytes
|
||||
else:
|
||||
@ -122,29 +125,30 @@ def change_audio_types(samples, audio_type=AUDIO_TYPE_PCM, processes=None, proce
|
||||
|
||||
|
||||
def read_audio_format_from_wav_file(wav_file):
    """Read the audio format of an opened WAV reader.

    Parameters
    ----------
    wav_file
        Object exposing `getframerate()`, `getnchannels()` and `getsampwidth()`
        (presumably a `wave.Wave_read` instance - confirm against callers).

    Returns
    -------
    util.audio.AudioFormat
        Named tuple of frame rate, number of channels and sample width as
        reported by `wav_file`. Being a tuple subclass, it can still be
        unpacked or indexed like a plain tuple.
    """
    return AudioFormat(rate=wav_file.getframerate(),
                       channels=wav_file.getnchannels(),
                       width=wav_file.getsampwidth())
|
||||
|
||||
|
||||
def get_num_samples(pcm_buffer_size, audio_format=DEFAULT_FORMAT):
    """Calculates how many sample frames fit into a binary PCM buffer.

    Parameters
    ----------
    pcm_buffer_size : int
        Size of the PCM buffer in bytes.
    audio_format : util.audio.AudioFormat
        Format of the buffer; only `channels` and `width` are used.

    Returns
    -------
    int
        `pcm_buffer_size` divided (rounding down) by the size of one frame,
        i.e. `channels * width` bytes.
    """
    return pcm_buffer_size // (audio_format.channels * audio_format.width)
|
||||
|
||||
|
||||
def get_pcm_duration(pcm_buffer_size, audio_format=DEFAULT_FORMAT):
    """Calculates duration in seconds of a binary PCM buffer (typically read from a WAV file)

    Parameters
    ----------
    pcm_buffer_size : int
        Size of the PCM buffer in bytes.
    audio_format : util.audio.AudioFormat
        Format of the buffer.

    Returns
    -------
    float
        Number of sample frames in the buffer divided by the sample rate.
    """
    return get_num_samples(pcm_buffer_size, audio_format) / audio_format.rate
|
||||
|
||||
|
||||
def get_np_duration(np_len, audio_format=DEFAULT_FORMAT):
    """Calculates duration in seconds of NumPy audio data

    Parameters
    ----------
    np_len : int
        Number of samples in the NumPy audio data.
    audio_format : util.audio.AudioFormat
        Format of the audio; only `rate` is used.

    Returns
    -------
    float
        `np_len` divided by the sample rate.
    """
    return np_len / audio_format.rate
|
||||
|
||||
|
||||
def convert_audio(src_audio_path, dst_audio_path, file_type=None, audio_format=DEFAULT_FORMAT):
    """Converts an audio file into the given audio format using SoX.

    Parameters
    ----------
    src_audio_path : str
        Path of the audio file to read.
    dst_audio_path : str
        Path of the audio file to write.
    file_type : str
        Passed through to `sox.Transformer.set_output_format` as `file_type`.
    audio_format : util.audio.AudioFormat
        Target rate, channel count and sample width of the output file.
    """
    # Imported lazily, like the other optional audio back-ends in this module.
    import sox  # pylint: disable=import-outside-toplevel
    transformer = sox.Transformer()
    # SoX expects the sample size in bits, AudioFormat stores it in bytes.
    transformer.set_output_format(file_type=file_type,
                                  rate=audio_format.rate,
                                  channels=audio_format.channels,
                                  bits=audio_format.width * 8)
    transformer.build(src_audio_path, dst_audio_path)
|
||||
|
||||
|
||||
@ -181,7 +185,7 @@ class AudioFile:
|
||||
|
||||
def read_frames(wav_file, frame_duration_ms=30, yield_remainder=False):
|
||||
audio_format = read_audio_format_from_wav_file(wav_file)
|
||||
frame_size = int(audio_format[0] * (frame_duration_ms / 1000.0))
|
||||
frame_size = int(audio_format.rate * (frame_duration_ms / 1000.0))
|
||||
while True:
|
||||
try:
|
||||
data = wav_file.readframes(frame_size)
|
||||
@ -203,13 +207,12 @@ def vad_split(audio_frames,
|
||||
num_padding_frames=10,
|
||||
threshold=0.5,
|
||||
aggressiveness=3):
|
||||
from webrtcvad import Vad
|
||||
sample_rate, channels, width = audio_format
|
||||
if channels != 1:
|
||||
from webrtcvad import Vad # pylint: disable=import-outside-toplevel
|
||||
if audio_format.channels != 1:
|
||||
raise ValueError('VAD-splitting requires mono samples')
|
||||
if width != 2:
|
||||
if audio_format.width != 2:
|
||||
raise ValueError('VAD-splitting requires 16 bit samples')
|
||||
if sample_rate not in [8000, 16000, 32000, 48000]:
|
||||
if audio_format.rate not in [8000, 16000, 32000, 48000]:
|
||||
raise ValueError('VAD-splitting only supported for sample rates 8000, 16000, 32000, or 48000')
|
||||
if aggressiveness not in [0, 1, 2, 3]:
|
||||
raise ValueError('VAD-splitting aggressiveness mode has to be one of 0, 1, 2, or 3')
|
||||
@ -223,7 +226,7 @@ def vad_split(audio_frames,
|
||||
frame_duration_ms = get_pcm_duration(len(frame), audio_format) * 1000
|
||||
if int(frame_duration_ms) not in [10, 20, 30]:
|
||||
raise ValueError('VAD-splitting only supported for frame durations 10, 20, or 30 ms')
|
||||
is_speech = vad.is_speech(frame, sample_rate)
|
||||
is_speech = vad.is_speech(frame, audio_format.rate)
|
||||
if not triggered:
|
||||
ring_buffer.append((frame, is_speech))
|
||||
num_voiced = len([f for f, speech in ring_buffer if speech])
|
||||
@ -261,16 +264,15 @@ def get_opus_frame_size(rate):
|
||||
return 60 * rate // 1000
|
||||
|
||||
|
||||
def write_opus(opus_file, audio_format, audio_data):
|
||||
rate, channels, width = audio_format
|
||||
frame_size = get_opus_frame_size(rate)
|
||||
def write_opus(opus_file, audio_data, audio_format=DEFAULT_FORMAT):
|
||||
frame_size = get_opus_frame_size(audio_format.rate)
|
||||
import opuslib # pylint: disable=import-outside-toplevel
|
||||
encoder = opuslib.Encoder(rate, channels, 'audio')
|
||||
chunk_size = frame_size * channels * width
|
||||
encoder = opuslib.Encoder(audio_format.rate, audio_format.channels, 'audio')
|
||||
chunk_size = frame_size * audio_format.channels * audio_format.width
|
||||
opus_file.write(pack_number(len(audio_data), OPUS_PCM_LEN_SIZE))
|
||||
opus_file.write(pack_number(rate, OPUS_RATE_SIZE))
|
||||
opus_file.write(pack_number(channels, OPUS_CHANNELS_SIZE))
|
||||
opus_file.write(pack_number(width, OPUS_WIDTH_SIZE))
|
||||
opus_file.write(pack_number(audio_format.rate, OPUS_RATE_SIZE))
|
||||
opus_file.write(pack_number(audio_format.channels, OPUS_CHANNELS_SIZE))
|
||||
opus_file.write(pack_number(audio_format.width, OPUS_WIDTH_SIZE))
|
||||
for i in range(0, len(audio_data), chunk_size):
|
||||
chunk = audio_data[i:i + chunk_size]
|
||||
# Preventing non-deterministic encoding results from uninitialized remainder of the encoder buffer
|
||||
@ -287,15 +289,14 @@ def read_opus_header(opus_file):
|
||||
rate = unpack_number(opus_file.read(OPUS_RATE_SIZE))
|
||||
channels = unpack_number(opus_file.read(OPUS_CHANNELS_SIZE))
|
||||
width = unpack_number(opus_file.read(OPUS_WIDTH_SIZE))
|
||||
return pcm_buffer_size, (rate, channels, width)
|
||||
return pcm_buffer_size, AudioFormat(rate, channels, width)
|
||||
|
||||
|
||||
def read_opus(opus_file):
|
||||
pcm_buffer_size, audio_format = read_opus_header(opus_file)
|
||||
rate, channels, _ = audio_format
|
||||
frame_size = get_opus_frame_size(rate)
|
||||
frame_size = get_opus_frame_size(audio_format.rate)
|
||||
import opuslib # pylint: disable=import-outside-toplevel
|
||||
decoder = opuslib.Decoder(rate, channels)
|
||||
decoder = opuslib.Decoder(audio_format.rate, audio_format.channels)
|
||||
audio_data = bytearray()
|
||||
while len(audio_data) < pcm_buffer_size:
|
||||
chunk_len = unpack_number(opus_file.read(OPUS_CHUNK_LEN_SIZE))
|
||||
@ -306,12 +307,11 @@ def read_opus(opus_file):
|
||||
return audio_format, audio_data
|
||||
|
||||
|
||||
def write_wav(wav_file, pcm_data, audio_format=DEFAULT_FORMAT):
    """Writes binary PCM data as a WAV file.

    Parameters
    ----------
    wav_file
        Target passed to `wave.open` in 'wb' mode.
    pcm_data : binary
        Raw PCM frames to write.
    audio_format : util.audio.AudioFormat
        Rate, channel count and sample width written into the WAV header.
    """
    with wave.open(wav_file, 'wb') as wav_file_writer:
        # Header parameters are set before any frame data is written.
        wav_file_writer.setframerate(audio_format.rate)
        wav_file_writer.setnchannels(audio_format.channels)
        wav_file_writer.setsampwidth(audio_format.width)
        wav_file_writer.writeframes(pcm_data)
|
||||
|
||||
|
||||
@ -331,11 +331,11 @@ def read_audio(audio_type, audio_file):
|
||||
raise ValueError('Unsupported audio type: {}'.format(audio_type))
|
||||
|
||||
|
||||
def write_audio(audio_type, audio_file, pcm_data, audio_format=DEFAULT_FORMAT):
    """Writes binary PCM data to `audio_file` in the requested serialized audio type.

    Parameters
    ----------
    audio_type : str
        Either AUDIO_TYPE_WAV or AUDIO_TYPE_OPUS.
    audio_file
        Target handed on to `write_wav` or `write_opus`.
    pcm_data : binary
        Raw PCM data to encode.
    audio_format : util.audio.AudioFormat
        Format of `pcm_data`.

    Returns
    -------
    Whatever the selected writer returns.

    Raises
    ------
    ValueError
        If `audio_type` is neither AUDIO_TYPE_WAV nor AUDIO_TYPE_OPUS.
    """
    # audio_format is passed by keyword so the call cannot silently swap
    # positions with pcm_data after the parameter re-ordering.
    if audio_type == AUDIO_TYPE_WAV:
        return write_wav(audio_file, pcm_data, audio_format=audio_format)
    if audio_type == AUDIO_TYPE_OPUS:
        return write_opus(audio_file, pcm_data, audio_format=audio_format)
    raise ValueError('Unsupported audio type: {}'.format(audio_type))
|
||||
|
||||
|
||||
@ -358,12 +358,24 @@ def read_duration(audio_type, audio_file):
|
||||
raise ValueError('Unsupported audio type: {}'.format(audio_type))
|
||||
|
||||
|
||||
def pcm_to_np(audio_format, pcm_data):
|
||||
_, channels, width = audio_format
|
||||
if width not in [1, 2, 4]:
|
||||
raise ValueError('Unsupported sample width: {}'.format(width))
|
||||
dtype = [None, np.int8, np.int16, None, np.int32][width]
|
||||
def get_dtype(audio_format):
    """Returns the NumPy integer type matching the sample width of an audio format.

    Parameters
    ----------
    audio_format : util.audio.AudioFormat
        Audio format; only `width` (in bytes) is used.

    Returns
    -------
    type
        `np.int8`, `np.int16` or `np.int32` for widths 1, 2 and 4.

    Raises
    ------
    ValueError
        If the sample width is not 1, 2 or 4.
    """
    dtypes_by_width = {1: np.int8, 2: np.int16, 4: np.int32}
    try:
        # One hashed lookup replaces the membership test plus list indexing,
        # so any width equal to 1, 2 or 4 resolves (e.g. 2.0).
        return dtypes_by_width[audio_format.width]
    except (KeyError, TypeError):
        # TypeError covers unhashable widths, which were reported as
        # unsupported as well.
        raise ValueError('Unsupported sample width: {}'.format(audio_format.width)) from None
|
||||
|
||||
|
||||
def pcm_to_np(pcm_data, audio_format=DEFAULT_FORMAT):
    """Converts binary PCM data into a normalized NumPy array.

    Parameters
    ----------
    pcm_data : binary
        Raw PCM buffer whose samples have the width given by `audio_format`.
    audio_format : util.audio.AudioFormat
        Format of `pcm_data`; only mono is supported.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (number of samples, 1), obtained by dividing
        each integer sample by the maximum value of its integer type.

    Raises
    ------
    ValueError
        If the sample width is not supported (see `get_dtype`).
    """
    assert audio_format.channels == 1  # only mono supported for now
    dtype = get_dtype(audio_format)
    samples = np.frombuffer(pcm_data, dtype=dtype)
    # astype() copies, so the (possibly read-only) input buffer is left untouched.
    samples = samples.astype(np.float32) / np.iinfo(dtype).max
    return np.expand_dims(samples, axis=1)
|
||||
|
||||
|
||||
def np_to_pcm(np_data, audio_format=DEFAULT_FORMAT):
    """Converts normalized NumPy audio data into binary PCM data.

    Parameters
    ----------
    np_data : numpy.ndarray
        Mono audio samples; presumably normalized floats as produced by
        `pcm_to_np` - TODO confirm against callers.
    audio_format : util.audio.AudioFormat
        Target format; only mono is supported.

    Returns
    -------
    bytearray
        Samples scaled by the maximum value of the target integer type,
        cast to that type and serialized to bytes.

    Raises
    ------
    ValueError
        If the sample width is not supported (see `get_dtype`).
    """
    assert audio_format.channels == 1  # only mono supported for now
    dtype = get_dtype(audio_format)
    # np.iinfo() provides the integer range; `dtype.max` on a NumPy scalar type
    # is a method, not a number. The multiplication is not done in place
    # because squeeze() returns a view onto the caller's array.
    scaled = np_data.squeeze() * np.iinfo(dtype).max
    return bytearray(scaled.astype(dtype).tobytes())
|
||||
|
@ -122,7 +122,7 @@ def create_dataset(sources,
|
||||
process_ahead=2 * batch_size if process_ahead is None else process_ahead):
|
||||
transcript = text_to_char_array(sample.transcript, Config.alphabet, context=sample.sample_id)
|
||||
transcript = to_sparse_tuple(transcript)
|
||||
yield sample.sample_id, sample.audio, sample.audio_format[0], transcript
|
||||
yield sample.sample_id, sample.audio, sample.audio_format.rate, transcript
|
||||
|
||||
# Batching a dataset of 2D SparseTensors creates 3D batches, which fail
|
||||
# when passed to tf.nn.ctc_loss, so we reshape them to remove the extra
|
||||
@ -167,7 +167,7 @@ def split_audio_file(audio_path,
|
||||
yield time_start, time_end, samples
|
||||
|
||||
def to_mfccs(time_start, time_end, samples):
|
||||
features, features_len = samples_to_mfccs(samples, audio_format[0])
|
||||
features, features_len = samples_to_mfccs(samples, audio_format.rate)
|
||||
return time_start, time_end, features, features_len
|
||||
|
||||
def create_batch_set(bs, criteria):
|
||||
|
Loading…
x
Reference in New Issue
Block a user