Merge pull request #3147 from tilmankamp/data_set_tool

Resolves #3146 - Let build_sdb.py also output CSV files and rename it accordingly
This commit is contained in:
Tilman Kamp 2020-07-21 18:35:50 +02:00 committed by GitHub
commit b18a3a4ef5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 198 additions and 98 deletions

View File

@ -1,92 +0,0 @@
#!/usr/bin/env python
"""
Tool for building Sample Databases (SDB files) from DeepSpeech CSV files and other SDB files
Use "python3 build_sdb.py -h" for help
"""
import argparse
import progressbar
from deepspeech_training.util.audio import (
AUDIO_TYPE_PCM,
AUDIO_TYPE_OPUS,
AUDIO_TYPE_WAV,
change_audio_types,
)
from deepspeech_training.util.downloader import SIMPLE_BAR
from deepspeech_training.util.sample_collections import (
DirectSDBWriter,
samples_from_sources,
)
from deepspeech_training.util.augmentations import (
parse_augmentations,
apply_sample_augmentations,
SampleAugmentation
)
AUDIO_TYPE_LOOKUP = {"wav": AUDIO_TYPE_WAV, "opus": AUDIO_TYPE_OPUS}
def build_sdb():
    """Build the target SDB file from the source sets configured via CLI_ARGS,
    optionally applying sample augmentations and re-encoding audio on the way.
    """
    target_audio_type = AUDIO_TYPE_LOOKUP[CLI_ARGS.audio_type]
    labeled = not CLI_ARGS.unlabeled
    augmentations = parse_augmentations(CLI_ARGS.augment)
    # Only sample-domain augmentations are supported by this tool.
    if not all(isinstance(a, SampleAugmentation) for a in augmentations):
        print("Warning: Some of the augmentations cannot be applied by this command.")
    with DirectSDBWriter(CLI_ARGS.target, audio_type=target_audio_type, labeled=labeled) as sdb_writer:
        samples = samples_from_sources(CLI_ARGS.sources, labeled=labeled)
        # Length is taken before augmentation wraps the collection in a generator.
        num_samples = len(samples)
        if augmentations:
            # Augmentations operate on raw PCM; change_audio_types below re-encodes.
            samples = apply_sample_augmentations(
                samples, audio_type=AUDIO_TYPE_PCM, augmentations=augmentations
            )
        progress = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
        converted = change_audio_types(
            samples,
            audio_type=target_audio_type,
            bitrate=CLI_ARGS.bitrate,
            processes=CLI_ARGS.workers,
        )
        for sample in progress(converted):
            sdb_writer.add(sample)
def handle_args():
    """Declare and parse this tool's command-line interface."""
    parser = argparse.ArgumentParser(
        description="Tool for building Sample Databases (SDB files) from DeepSpeech CSV files and other SDB files"
    )
    add = parser.add_argument
    # Positional arguments: one or more input sets, then the output SDB.
    add(
        "sources",
        nargs="+",
        help="Source CSV and/or SDB files - Note: For getting a correctly ordered target SDB, "
             "source SDBs have to have their samples already ordered from shortest to longest.",
    )
    add("target", help="SDB file to create")
    # Encoding options.
    add(
        "--audio-type",
        default="opus",
        choices=AUDIO_TYPE_LOOKUP.keys(),
        help="Audio representation inside target SDB",
    )
    add("--bitrate", type=int,
        help="Bitrate for lossy compressed SDB samples like in case of --audio-type opus")
    add("--workers", type=int, default=None, help="Number of encoding SDB workers")
    # Data-set options.
    add("--unlabeled", action="store_true",
        help="If to build an SDB with unlabeled (audio only) samples - "
             "typically used for building noise augmentation corpora")
    add("--augment", action="append", help="Add an augmentation operation")
    return parser.parse_args()
# Script entry point: populate the module-level CLI_ARGS used by build_sdb().
if __name__ == "__main__":
    CLI_ARGS = handle_args()
    build_sdb()

111
bin/data_set_tool.py Executable file
View File

@ -0,0 +1,111 @@
#!/usr/bin/env python
'''
Tool for building a combined SDB or CSV sample-set from other sets
Use 'python3 data_set_tool.py -h' for help
'''
import sys
import argparse
import progressbar
from pathlib import Path
from deepspeech_training.util.audio import (
AUDIO_TYPE_PCM,
AUDIO_TYPE_OPUS,
AUDIO_TYPE_WAV,
change_audio_types,
)
from deepspeech_training.util.downloader import SIMPLE_BAR
from deepspeech_training.util.sample_collections import (
CSVWriter,
DirectSDBWriter,
samples_from_sources,
)
from deepspeech_training.util.augmentations import (
parse_augmentations,
apply_sample_augmentations,
SampleAugmentation
)
AUDIO_TYPE_LOOKUP = {'wav': AUDIO_TYPE_WAV, 'opus': AUDIO_TYPE_OPUS}
def build_data_set():
    """Build the target sample-set from the source sets configured via CLI_ARGS.

    The target's file extension selects the writer: '.csv' writes a CSV data-set
    (plus a directory of WAV files), '.sdb' writes a Sample Database. Exits with
    status 1 on any other extension. Sample augmentations are optionally applied
    before the audio is re-encoded to the requested target audio type.
    """
    audio_type = AUDIO_TYPE_LOOKUP[CLI_ARGS.audio_type]
    augmentations = parse_augmentations(CLI_ARGS.augment)
    # Only sample-domain augmentations are supported by this tool.
    if any(not isinstance(a, SampleAugmentation) for a in augmentations):
        print('Warning: Some of the specified augmentations will not get applied, as this tool only supports '
              'overlay, codec, reverb, resample and volume.')
    extension = Path(CLI_ARGS.target).suffix.lower()
    labeled = not CLI_ARGS.unlabeled
    if extension == '.csv':
        writer = CSVWriter(CLI_ARGS.target, absolute_paths=CLI_ARGS.absolute_paths, labeled=labeled)
    elif extension == '.sdb':
        writer = DirectSDBWriter(CLI_ARGS.target, audio_type=audio_type, labeled=labeled)
    else:
        print('Unknown extension of target file - has to be either .csv or .sdb')
        sys.exit(1)
    with writer:
        # Fix: reuse the labeled flag computed above instead of recomputing
        # "not CLI_ARGS.unlabeled" inline - keeps writer and reader in sync.
        samples = samples_from_sources(CLI_ARGS.sources, labeled=labeled)
        # Length is taken before augmentation wraps the collection in a generator.
        num_samples = len(samples)
        if augmentations:
            # Augmentations operate on raw PCM; change_audio_types below re-encodes.
            samples = apply_sample_augmentations(samples, audio_type=AUDIO_TYPE_PCM, augmentations=augmentations)
        bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
        for sample in bar(change_audio_types(
                samples,
                audio_type=audio_type,
                bitrate=CLI_ARGS.bitrate,
                processes=CLI_ARGS.workers)):
            writer.add(sample)
def handle_args():
    '''Declare and parse this tool's command-line interface.'''
    parser = argparse.ArgumentParser(
        description='Tool for building a combined SDB or CSV sample-set from other sets'
    )
    add = parser.add_argument
    # Positional arguments: one or more input sets, then the output file.
    add(
        'sources',
        nargs='+',
        help='Source CSV and/or SDB files - Note: For getting a correctly ordered target set, '
             'source SDBs have to have their samples already ordered from shortest to longest.',
    )
    add('target', help='SDB or CSV file to create')
    # Encoding options.
    add(
        '--audio-type',
        default='opus',
        choices=AUDIO_TYPE_LOOKUP.keys(),
        help='Audio representation inside target SDB',
    )
    add('--bitrate', type=int,
        help='Bitrate for lossy compressed SDB samples like in case of --audio-type opus')
    add('--workers', type=int, default=None, help='Number of encoding SDB workers')
    # Data-set options.
    add('--unlabeled', action='store_true',
        help='If to build an SDB with unlabeled (audio only) samples - '
             'typically used for building noise augmentation corpora')
    add('--absolute-paths', action='store_true',
        help='If to reference samples by their absolute paths when writing CSV files')
    add('--augment', action='append', help='Add an augmentation operation')
    return parser.parse_args()
# Script entry point: populate the module-level CLI_ARGS used by build_data_set().
if __name__ == '__main__':
    CLI_ARGS = handle_args()
    build_data_set()

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
""" """
Tool for playing (and augmenting) single samples or samples from Sample Databases (SDB files) and DeepSpeech CSV files Tool for playing (and augmenting) single samples or samples from Sample Databases (SDB files) and DeepSpeech CSV files
Use "python3 build_sdb.py -h" for help Use "python3 play.py -h" for help
""" """
import os import os

View File

@ -13,7 +13,7 @@ fi;
if [ ! -f "${ldc93s1_dir}/ldc93s1.sdb" ]; then if [ ! -f "${ldc93s1_dir}/ldc93s1.sdb" ]; then
echo "Converting LDC93S1 example data, saving to ${ldc93s1_sdb}." echo "Converting LDC93S1 example data, saving to ${ldc93s1_sdb}."
python -u bin/build_sdb.py ${ldc93s1_csv} ${ldc93s1_sdb} python -u bin/data_set_tool.py ${ldc93s1_csv} ${ldc93s1_sdb}
fi; fi;
# Force only one visible device because we have a single-sample dataset # Force only one visible device because we have a single-sample dataset

View File

@ -16,7 +16,7 @@ fi;
if [ ! -f "${ldc93s1_dir}/ldc93s1.sdb" ]; then if [ ! -f "${ldc93s1_dir}/ldc93s1.sdb" ]; then
echo "Converting LDC93S1 example data, saving to ${ldc93s1_sdb}." echo "Converting LDC93S1 example data, saving to ${ldc93s1_sdb}."
python -u bin/build_sdb.py ${ldc93s1_csv} ${ldc93s1_sdb} python -u bin/data_set_tool.py ${ldc93s1_csv} ${ldc93s1_sdb}
fi; fi;
# Force only one visible device because we have a single-sample dataset # Force only one visible device because we have a single-sample dataset

View File

@ -16,7 +16,7 @@ fi;
if [ ! -f "${ldc93s1_dir}/ldc93s1.sdb" ]; then if [ ! -f "${ldc93s1_dir}/ldc93s1.sdb" ]; then
echo "Converting LDC93S1 example data, saving to ${ldc93s1_sdb}." echo "Converting LDC93S1 example data, saving to ${ldc93s1_sdb}."
python -u bin/build_sdb.py ${ldc93s1_csv} ${ldc93s1_sdb} python -u bin/data_set_tool.py ${ldc93s1_csv} ${ldc93s1_sdb}
fi; fi;
# Force only one visible device because we have a single-sample dataset # Force only one visible device because we have a single-sample dataset

View File

@ -496,7 +496,7 @@ Example training with all augmentations:
[...] [...]
The ``bin/play.py`` tool also supports ``--augment`` parameters (for sample domain augmentations) and can be used for experimenting with different configurations. The ``bin/play.py`` and ``bin/data_set_tool.py`` tools also support ``--augment`` parameters (for sample domain augmentations) and can be used for experimenting with different configurations or creating augmented data sets.
Example of playing all samples with reverberation and maximized volume: Example of playing all samples with reverberation and maximized volume:
@ -510,3 +510,12 @@ Example simulation of the codec augmentation of a wav-file first at the beginnin
bin/play.py --augment codec[p=0.1,bitrate=48000:16000] --clock 0.0 test.wav bin/play.py --augment codec[p=0.1,bitrate=48000:16000] --clock 0.0 test.wav
bin/play.py --augment codec[p=0.1,bitrate=48000:16000] --clock 1.0 test.wav bin/play.py --augment codec[p=0.1,bitrate=48000:16000] --clock 1.0 test.wav
Example of creating a pre-augmented test set:
.. code-block:: bash
bin/data_set_tool.py \
--augment overlay[source=noise.sdb,layers=1,snr=20~10] \
--augment resample[rate=12000:8000~4000] \
test.sdb test-augmented.sdb

View File

@ -7,7 +7,15 @@ from pathlib import Path
from functools import partial from functools import partial
from .helpers import MEGABYTE, GIGABYTE, Interleaved from .helpers import MEGABYTE, GIGABYTE, Interleaved
from .audio import Sample, DEFAULT_FORMAT, AUDIO_TYPE_OPUS, SERIALIZABLE_AUDIO_TYPES, get_audio_type_from_extension from .audio import (
Sample,
DEFAULT_FORMAT,
AUDIO_TYPE_PCM,
AUDIO_TYPE_OPUS,
SERIALIZABLE_AUDIO_TYPES,
get_audio_type_from_extension,
write_wav
)
BIG_ENDIAN = 'big' BIG_ENDIAN = 'big'
INT_SIZE = 4 INT_SIZE = 4
@ -297,6 +305,70 @@ class SDB: # pylint: disable=too-many-instance-attributes
self.close() self.close()
class CSVWriter:  # pylint: disable=too-many-instance-attributes
    """Sample collection writer for writing a CSV data-set and all its referenced WAV samples"""

    def __init__(self,
                 csv_filename,
                 absolute_paths=False,
                 labeled=True):
        """
        Parameters
        ----------
        csv_filename : str
            Path to the CSV file to write.
            Will create a directory (CSV-filename without extension) next to it and fail if it already exists.
        absolute_paths : bool
            If paths in CSV file should be absolute instead of relative to the CSV file's parent directory.
        labeled : bool or None
            If True: Writes labeled samples (util.sample_collections.LabeledSample) only.
            If False: Ignores transcripts (if available) and writes (unlabeled) util.audio.Sample instances.

        Raises
        ------
        RuntimeError
            If the sample directory derived from csv_filename already exists.
        """
        self.csv_filename = Path(csv_filename)
        self.csv_base_dir = self.csv_filename.parent.resolve().absolute()
        self.set_name = self.csv_filename.stem
        # Directory (named after the CSV file) that will hold the written WAV samples.
        self.csv_dir = self.csv_base_dir / self.set_name
        if self.csv_dir.exists():
            raise RuntimeError('"{}" already existing'.format(self.csv_dir))
        # pathlib idiom instead of os.mkdir(str(...)).
        self.csv_dir.mkdir()
        self.absolute_paths = absolute_paths
        fieldnames = ['wav_filename', 'wav_filesize']
        self.labeled = labeled
        if labeled:
            fieldnames.append('transcript')
        # newline='' is required so the csv module controls line endings itself.
        self.csv_file = open(csv_filename, 'w', encoding='utf-8', newline='')
        self.csv_writer = csv.DictWriter(self.csv_file, fieldnames=fieldnames)
        self.csv_writer.writeheader()
        # Number of samples written so far - also used to generate sample file names.
        self.counter = 0

    def __enter__(self):
        return self

    def add(self, sample):
        """Write one sample: store its audio as a WAV file inside the data-set
        directory and append a row referencing it to the CSV file.
        Returns the sample's new id (its WAV path relative to the CSV file's directory).
        """
        sample_filename = self.csv_dir / 'sample{0:08d}.wav'.format(self.counter)
        self.counter += 1
        # WAV payload is raw PCM, so convert (decode) the sample first if needed.
        sample.change_audio_type(AUDIO_TYPE_PCM)
        write_wav(str(sample_filename), sample.audio, audio_format=sample.audio_format)
        sample.sample_id = str(sample_filename.relative_to(self.csv_base_dir))
        row = {
            'wav_filename': str(sample_filename.absolute()) if self.absolute_paths else sample.sample_id,
            'wav_filesize': sample_filename.stat().st_size
        }
        if self.labeled:
            row['transcript'] = sample.transcript
        self.csv_writer.writerow(row)
        return sample.sample_id

    def close(self):
        """Close the underlying CSV file. Safe to call more than once."""
        if self.csv_file:
            self.csv_file.close()
            # Guard against double-close.
            self.csv_file = None

    def __len__(self):
        # Number of samples written so far.
        return self.counter

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
class SampleList: class SampleList:
"""Sample collection base class with samples loaded from a list of in-memory paths.""" """Sample collection base class with samples loaded from a list of in-memory paths."""
def __init__(self, samples, labeled=True): def __init__(self, samples, labeled=True):