From fcb9bf6d9fddd9ede5a5bba422d15277250218c4 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Wed, 11 Sep 2019 09:02:21 +0000 Subject: [PATCH] Also remove samples with noise X-DeepSpeech: NOBUILD --- bin/import_magicdata.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bin/import_magicdata.py b/bin/import_magicdata.py index ae34a6cd..1532fb84 100755 --- a/bin/import_magicdata.py +++ b/bin/import_magicdata.py @@ -65,6 +65,10 @@ def preprocess_data(folder_with_archives, target_dir): durations = (df['wav_filesize'] - 44) / 16000 / 2 df = df[durations <= 10.0] print('Trimming {} samples > 10 seconds'.format((durations > 10.0).sum())) + + with_noise = df['transcript'].str.contains(r'\[(FIL|SPK)\]') + df = df[~with_noise] + print('Trimming {} samples with noise ([FIL] or [SPK])'.format(sum(with_noise))) dest_csv = os.path.join(target_dir, 'magicdata_{}.csv'.format(subset)) print('Saving {} set into {}...'.format(subset, dest_csv))