STT/training/deepspeech_training/util/audio.py
Catalin Voss 6640cf2341
Remote training I/O once more (#3437)
* Redo remote I/O changes once more; this time without messing with taskcluster

* Add bin changes

* Fix merge-induced issue?

* For the interleaved case with multiple collections, unpack audio on the fly

To reproduce the previous failure

rm data/smoke_test/ldc93s1.csv
rm data/smoke_test/ldc93s1.sdb
rm -rf /tmp/ldc93s1_cache_sdb_csv
rm -rf /tmp/ckpt_sdb_csv
rm -rf /tmp/train_sdb_csv

./bin/run-tc-ldc93s1_new_sdb_csv.sh 109 16000
python -u DeepSpeech.py --noshow_progressbar --noearly_stop --train_files ./data/smoke_test/ldc93s1.sdb,./data/smoke_test/ldc93s1.csv --train_batch_size 1 --feature_cache /tmp/ldc93s1_cache_sdb_csv --dev_files ./data/smoke_test/ldc93s1.sdb,./data/smoke_test/ldc93s1.csv --dev_batch_size 1 --test_files ./data/smoke_test/ldc93s1.sdb,./data/smoke_test/ldc93s1.csv --test_batch_size 1 --n_hidden 100 --epochs 109 --max_to_keep 1 --checkpoint_dir /tmp/ckpt_sdb_csv --learning_rate 0.001 --dropout_rate 0.05 --export_dir /tmp/train_sdb_csv --scorer_path data/smoke_test/pruned_lm.scorer --audio_sample_rate 16000

* Attempt to preserve length information with a wrapper around `map()`… this gets pretty python-y

* Call the right `__next__()`

* Properly implement the rest of the map wrappers here……

* Fix trailing whitespace situation and other linter complaints

* Remove data accidentally checked in

* Fix overlay augmentations

* Wavs must be open in rb mode if we're passing in an external file pointer -- this confused me

* Lint whitespace

* Revert "Fix trailing whitespace situation and other linter complaints"

This reverts commit c3c45397a2f98e9b00d00c18c4ced4fc52475032.

* Fix linter issue but without such an aggressive diff

* Move unpack_maybe into sample_collections

* Use unpack_maybe in place of duplicate lambda

* Fix confusing comment

* Add clarifying comment for on-the-fly unpacking
2020-12-07 13:07:34 +01:00

439 lines
18 KiB
Python

import os
import io
import wave
import math
import tempfile
import collections
import numpy as np
from .helpers import LimitingPool
from collections import namedtuple
from .io import open_remote, remove_remote, copy_remote, is_remote_path
AudioFormat = namedtuple('AudioFormat', 'rate channels width')
DEFAULT_RATE = 16000
DEFAULT_CHANNELS = 1
DEFAULT_WIDTH = 2
DEFAULT_FORMAT = AudioFormat(DEFAULT_RATE, DEFAULT_CHANNELS, DEFAULT_WIDTH)
AUDIO_TYPE_NP = 'application/vnd.mozilla.np'
AUDIO_TYPE_PCM = 'application/vnd.mozilla.pcm'
AUDIO_TYPE_WAV = 'audio/wav'
AUDIO_TYPE_OPUS = 'application/vnd.mozilla.opus'
SERIALIZABLE_AUDIO_TYPES = [AUDIO_TYPE_WAV, AUDIO_TYPE_OPUS]
LOADABLE_AUDIO_EXTENSIONS = {'.wav': AUDIO_TYPE_WAV}
OPUS_PCM_LEN_SIZE = 4
OPUS_RATE_SIZE = 4
OPUS_CHANNELS_SIZE = 1
OPUS_WIDTH_SIZE = 1
OPUS_CHUNK_LEN_SIZE = 2
class Sample:
"""
Represents in-memory audio data of a certain (convertible) representation.
Attributes
----------
audio_type : str
See `__init__`.
audio_format : util.audio.AudioFormat
See `__init__`.
audio : binary
Audio data represented as indicated by `audio_type`
duration : float
Audio duration of the sample in seconds
"""
def __init__(self, audio_type, raw_data, audio_format=None, sample_id=None):
"""
Parameters
----------
audio_type : str
Audio data representation type
Supported types:
- util.audio.AUDIO_TYPE_OPUS: Memory file representation (BytesIO) of Opus encoded audio
wrapped by a custom container format (used in SDBs)
- util.audio.AUDIO_TYPE_WAV: Memory file representation (BytesIO) of a Wave file
- util.audio.AUDIO_TYPE_PCM: Binary representation (bytearray) of PCM encoded audio data (Wave file without header)
- util.audio.AUDIO_TYPE_NP: NumPy representation of audio data (np.float32) - typically used for GPU feeding
raw_data : binary
Audio data in the form of the provided representation type (see audio_type).
For types util.audio.AUDIO_TYPE_OPUS or util.audio.AUDIO_TYPE_WAV data can also be passed as a bytearray.
audio_format : util.audio.AudioFormat
Required in case of audio_type = util.audio.AUDIO_TYPE_PCM or util.audio.AUDIO_TYPE_NP,
as this information cannot be derived from raw audio data.
sample_id : str
Tracking ID - should indicate sample's origin as precisely as possible
"""
self.audio_type = audio_type
self.audio_format = audio_format
self.sample_id = sample_id
if audio_type in SERIALIZABLE_AUDIO_TYPES:
self.audio = raw_data if isinstance(raw_data, io.BytesIO) else io.BytesIO(raw_data)
self.duration = read_duration(audio_type, self.audio)
else:
self.audio = raw_data
if self.audio_format is None:
raise ValueError('For audio type "{}" parameter "audio_format" is mandatory'.format(self.audio_type))
if audio_type == AUDIO_TYPE_PCM:
self.duration = get_pcm_duration(len(self.audio), self.audio_format)
elif audio_type == AUDIO_TYPE_NP:
self.duration = get_np_duration(len(self.audio), self.audio_format)
else:
raise ValueError('Unsupported audio type: {}'.format(self.audio_type))
def change_audio_type(self, new_audio_type, bitrate=None):
"""
In-place conversion of audio data into a different representation.
Parameters
----------
new_audio_type : str
New audio-type - see `__init__`.
bitrate : int
Bitrate to use in case of converting to a lossy audio-type.
"""
if self.audio_type == new_audio_type:
return
if new_audio_type == AUDIO_TYPE_PCM and self.audio_type in SERIALIZABLE_AUDIO_TYPES:
self.audio_format, audio = read_audio(self.audio_type, self.audio)
self.audio.close()
self.audio = audio
elif new_audio_type == AUDIO_TYPE_PCM and self.audio_type == AUDIO_TYPE_NP:
self.audio = np_to_pcm(self.audio, self.audio_format)
elif new_audio_type == AUDIO_TYPE_NP:
self.change_audio_type(AUDIO_TYPE_PCM)
self.audio = pcm_to_np(self.audio, self.audio_format)
elif new_audio_type in SERIALIZABLE_AUDIO_TYPES:
self.change_audio_type(AUDIO_TYPE_PCM)
audio_bytes = io.BytesIO()
write_audio(new_audio_type, audio_bytes, self.audio, audio_format=self.audio_format, bitrate=bitrate)
audio_bytes.seek(0)
self.audio = audio_bytes
else:
raise RuntimeError('Changing audio representation type from "{}" to "{}" not supported'
.format(self.audio_type, new_audio_type))
self.audio_type = new_audio_type
def _unpack_and_change_audio_type(sample_and_audio_type):
packed_sample, audio_type, bitrate = sample_and_audio_type
if hasattr(packed_sample, 'unpack'):
sample = packed_sample.unpack()
else:
sample = packed_sample
sample.change_audio_type(audio_type, bitrate=bitrate)
return sample
def change_audio_types(packed_samples, audio_type=AUDIO_TYPE_PCM, bitrate=None, processes=None, process_ahead=None):
with LimitingPool(processes=processes, process_ahead=process_ahead) as pool:
yield from pool.imap(_unpack_and_change_audio_type, map(lambda s: (s, audio_type, bitrate), packed_samples))
def get_audio_type_from_extension(ext):
if ext in LOADABLE_AUDIO_EXTENSIONS:
return LOADABLE_AUDIO_EXTENSIONS[ext]
return None
def read_audio_format_from_wav_file(wav_file):
return AudioFormat(wav_file.getframerate(), wav_file.getnchannels(), wav_file.getsampwidth())
def get_num_samples(pcm_buffer_size, audio_format=DEFAULT_FORMAT):
return pcm_buffer_size // (audio_format.channels * audio_format.width)
def get_pcm_duration(pcm_buffer_size, audio_format=DEFAULT_FORMAT):
"""Calculates duration in seconds of a binary PCM buffer (typically read from a WAV file)"""
return get_num_samples(pcm_buffer_size, audio_format) / audio_format.rate
def get_np_duration(np_len, audio_format=DEFAULT_FORMAT):
"""Calculates duration in seconds of NumPy audio data"""
return np_len / audio_format.rate
def convert_audio(src_audio_path, dst_audio_path, file_type=None, audio_format=DEFAULT_FORMAT):
import sox
transformer = sox.Transformer()
transformer.set_output_format(file_type=file_type,
rate=audio_format.rate,
channels=audio_format.channels,
bits=audio_format.width * 8)
transformer.build(src_audio_path, dst_audio_path)
class AudioFile:
def __init__(self, audio_path, as_path=False, audio_format=DEFAULT_FORMAT):
self.audio_path = audio_path
self.audio_format = audio_format
self.as_path = as_path
self.open_file = None
self.open_wav = None
self.tmp_file_path = None
self.tmp_src_file_path = None
def __enter__(self):
if self.audio_path.endswith('.wav'):
self.open_file = open_remote(self.audio_path, 'rb')
self.open_wav = wave.open(self.open_file)
if read_audio_format_from_wav_file(self.open_wav) == self.audio_format:
if self.as_path:
self.open_wav.close()
self.open_file.close()
return self.audio_path
return self.open_wav
self.open_wav.close()
self.open_file.close()
# If the format isn't right, copy the file to local tmp dir and do the conversion on disk
if is_remote_path(self.audio_path):
_, self.tmp_src_file_path = tempfile.mkstemp(suffix='.wav')
copy_remote(self.audio_path, self.tmp_src_file_path)
self.audio_path = self.tmp_file_path
_, self.tmp_file_path = tempfile.mkstemp(suffix='.wav')
convert_audio(self.audio_path, self.tmp_file_path, file_type='wav', audio_format=self.audio_format)
if self.as_path:
return self.tmp_file_path
self.open_wav = wave.open(self.tmp_file_path, 'rb')
return self.open_wav
def __exit__(self, *args):
if not self.as_path:
self.open_wav.close()
if self.open_file:
self.open_file.close()
if self.tmp_file_path is not None:
os.remove(self.tmp_file_path)
if self.tmp_src_file_path is not None:
os.remove(self.tmp_src_file_path)
def read_frames(wav_file, frame_duration_ms=30, yield_remainder=False):
audio_format = read_audio_format_from_wav_file(wav_file)
frame_size = int(audio_format.rate * (frame_duration_ms / 1000.0))
while True:
try:
data = wav_file.readframes(frame_size)
if not yield_remainder and get_pcm_duration(len(data), audio_format) * 1000 < frame_duration_ms:
break
yield data
except EOFError:
break
def read_frames_from_file(audio_path, audio_format=DEFAULT_FORMAT, frame_duration_ms=30, yield_remainder=False):
with AudioFile(audio_path, audio_format=audio_format) as wav_file:
for frame in read_frames(wav_file, frame_duration_ms=frame_duration_ms, yield_remainder=yield_remainder):
yield frame
def vad_split(audio_frames,
audio_format=DEFAULT_FORMAT,
num_padding_frames=10,
threshold=0.5,
aggressiveness=3):
from webrtcvad import Vad # pylint: disable=import-outside-toplevel
if audio_format.channels != 1:
raise ValueError('VAD-splitting requires mono samples')
if audio_format.width != 2:
raise ValueError('VAD-splitting requires 16 bit samples')
if audio_format.rate not in [8000, 16000, 32000, 48000]:
raise ValueError('VAD-splitting only supported for sample rates 8000, 16000, 32000, or 48000')
if aggressiveness not in [0, 1, 2, 3]:
raise ValueError('VAD-splitting aggressiveness mode has to be one of 0, 1, 2, or 3')
ring_buffer = collections.deque(maxlen=num_padding_frames)
triggered = False
vad = Vad(int(aggressiveness))
voiced_frames = []
frame_duration_ms = 0
frame_index = 0
for frame_index, frame in enumerate(audio_frames):
frame_duration_ms = get_pcm_duration(len(frame), audio_format) * 1000
if int(frame_duration_ms) not in [10, 20, 30]:
raise ValueError('VAD-splitting only supported for frame durations 10, 20, or 30 ms')
is_speech = vad.is_speech(frame, audio_format.rate)
if not triggered:
ring_buffer.append((frame, is_speech))
num_voiced = len([f for f, speech in ring_buffer if speech])
if num_voiced > threshold * ring_buffer.maxlen:
triggered = True
for f, s in ring_buffer:
voiced_frames.append(f)
ring_buffer.clear()
else:
voiced_frames.append(frame)
ring_buffer.append((frame, is_speech))
num_unvoiced = len([f for f, speech in ring_buffer if not speech])
if num_unvoiced > threshold * ring_buffer.maxlen:
triggered = False
yield b''.join(voiced_frames), \
frame_duration_ms * max(0, frame_index - len(voiced_frames)), \
frame_duration_ms * frame_index
ring_buffer.clear()
voiced_frames = []
if len(voiced_frames) > 0:
yield b''.join(voiced_frames), \
frame_duration_ms * (frame_index - len(voiced_frames)), \
frame_duration_ms * (frame_index + 1)
def pack_number(n, num_bytes):
return n.to_bytes(num_bytes, 'big', signed=False)
def unpack_number(data):
return int.from_bytes(data, 'big', signed=False)
def get_opus_frame_size(rate):
return 60 * rate // 1000
def write_opus(opus_file, audio_data, audio_format=DEFAULT_FORMAT, bitrate=None):
frame_size = get_opus_frame_size(audio_format.rate)
import opuslib # pylint: disable=import-outside-toplevel
encoder = opuslib.Encoder(audio_format.rate, audio_format.channels, 'audio')
if bitrate is not None:
encoder.bitrate = bitrate
chunk_size = frame_size * audio_format.channels * audio_format.width
opus_file.write(pack_number(len(audio_data), OPUS_PCM_LEN_SIZE))
opus_file.write(pack_number(audio_format.rate, OPUS_RATE_SIZE))
opus_file.write(pack_number(audio_format.channels, OPUS_CHANNELS_SIZE))
opus_file.write(pack_number(audio_format.width, OPUS_WIDTH_SIZE))
for i in range(0, len(audio_data), chunk_size):
chunk = audio_data[i:i + chunk_size]
# Preventing non-deterministic encoding results from uninitialized remainder of the encoder buffer
if len(chunk) < chunk_size:
chunk = chunk + b'\0' * (chunk_size - len(chunk))
encoded = encoder.encode(chunk, frame_size)
opus_file.write(pack_number(len(encoded), OPUS_CHUNK_LEN_SIZE))
opus_file.write(encoded)
def read_opus_header(opus_file):
opus_file.seek(0)
pcm_buffer_size = unpack_number(opus_file.read(OPUS_PCM_LEN_SIZE))
rate = unpack_number(opus_file.read(OPUS_RATE_SIZE))
channels = unpack_number(opus_file.read(OPUS_CHANNELS_SIZE))
width = unpack_number(opus_file.read(OPUS_WIDTH_SIZE))
return pcm_buffer_size, AudioFormat(rate, channels, width)
def read_opus(opus_file):
pcm_buffer_size, audio_format = read_opus_header(opus_file)
frame_size = get_opus_frame_size(audio_format.rate)
import opuslib # pylint: disable=import-outside-toplevel
decoder = opuslib.Decoder(audio_format.rate, audio_format.channels)
audio_data = bytearray()
while len(audio_data) < pcm_buffer_size:
chunk_len = unpack_number(opus_file.read(OPUS_CHUNK_LEN_SIZE))
chunk = opus_file.read(chunk_len)
decoded = decoder.decode(chunk, frame_size)
audio_data.extend(decoded)
audio_data = audio_data[:pcm_buffer_size]
return audio_format, bytes(audio_data)
def write_wav(wav_file, pcm_data, audio_format=DEFAULT_FORMAT):
# wav_file is already a file-pointer here
with wave.open(wav_file, 'wb') as wav_file_writer:
wav_file_writer.setframerate(audio_format.rate)
wav_file_writer.setnchannels(audio_format.channels)
wav_file_writer.setsampwidth(audio_format.width)
wav_file_writer.writeframes(pcm_data)
def read_wav(wav_file):
wav_file.seek(0)
with wave.open(wav_file, 'rb') as wav_file_reader:
audio_format = read_audio_format_from_wav_file(wav_file_reader)
pcm_data = wav_file_reader.readframes(wav_file_reader.getnframes())
return audio_format, pcm_data
def read_audio(audio_type, audio_file):
if audio_type == AUDIO_TYPE_WAV:
return read_wav(audio_file)
if audio_type == AUDIO_TYPE_OPUS:
return read_opus(audio_file)
raise ValueError('Unsupported audio type: {}'.format(audio_type))
def write_audio(audio_type, audio_file, pcm_data, audio_format=DEFAULT_FORMAT, bitrate=None):
if audio_type == AUDIO_TYPE_WAV:
return write_wav(audio_file, pcm_data, audio_format=audio_format)
if audio_type == AUDIO_TYPE_OPUS:
return write_opus(audio_file, pcm_data, audio_format=audio_format, bitrate=bitrate)
raise ValueError('Unsupported audio type: {}'.format(audio_type))
def read_wav_duration(wav_file):
wav_file.seek(0)
with wave.open(wav_file, 'rb') as wav_file_reader:
return wav_file_reader.getnframes() / wav_file_reader.getframerate()
def read_opus_duration(opus_file):
pcm_buffer_size, audio_format = read_opus_header(opus_file)
return get_pcm_duration(pcm_buffer_size, audio_format)
def read_duration(audio_type, audio_file):
if audio_type == AUDIO_TYPE_WAV:
return read_wav_duration(audio_file)
if audio_type == AUDIO_TYPE_OPUS:
return read_opus_duration(audio_file)
raise ValueError('Unsupported audio type: {}'.format(audio_type))
def get_dtype(audio_format):
if audio_format.width not in [1, 2, 4]:
raise ValueError('Unsupported sample width: {}'.format(audio_format.width))
return [None, np.int8, np.int16, None, np.int32][audio_format.width]
def pcm_to_np(pcm_data, audio_format=DEFAULT_FORMAT):
if audio_format.channels != 1:
raise ValueError('Mono-channel audio required')
dtype = get_dtype(audio_format)
samples = np.frombuffer(pcm_data, dtype=dtype)
samples = samples.astype(np.float32) / np.iinfo(dtype).max
return np.expand_dims(samples, axis=1)
def np_to_pcm(np_data, audio_format=DEFAULT_FORMAT):
if audio_format.channels != 1:
raise ValueError('Mono-channel audio required')
dtype = get_dtype(audio_format)
np_data = np_data.squeeze()
np_data = np_data * np.iinfo(dtype).max
np_data = np_data.astype(dtype)
return np_data.tobytes()
def rms_to_dbfs(rms):
return 20.0 * math.log10(max(1e-16, rms)) + 3.0103
def max_dbfs(sample_data):
# Peak dBFS based on the maximum energy sample. Will prevent overdrive if used for normalization.
return rms_to_dbfs(max(abs(np.min(sample_data)), abs(np.max(sample_data))))
def mean_dbfs(sample_data):
return rms_to_dbfs(math.sqrt(np.mean(np.square(sample_data, dtype=np.float64))))
def gain_db_to_ratio(gain_db):
return math.pow(10.0, gain_db / 20.0)
def normalize_audio(sample_data, dbfs=3.0103):
return np.maximum(np.minimum(sample_data * gain_db_to_ratio(dbfs - max_dbfs(sample_data)), 1.0), -1.0)