Also remove samples with noise

X-DeepSpeech: NOBUILD
This commit is contained in:
Reuben Morais 2019-09-11 09:02:21 +00:00 committed by GitHub
parent 90c2acd810
commit fcb9bf6d9f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 4 additions and 0 deletions

View File

@@ -65,6 +65,10 @@ def preprocess_data(folder_with_archives, target_dir):
durations = (df['wav_filesize'] - 44) / 16000 / 2
df = df[durations <= 10.0]
print('Trimming {} samples > 10 seconds'.format((durations > 10.0).sum()))
with_noise = df['transcript'].str.contains(r'\[(FIL|SPK)\]')
df = df[~with_noise]
print('Trimming {} samples with noise ([FIL] or [SPK])'.format(sum(with_noise)))
dest_csv = os.path.join(target_dir, 'magicdata_{}.csv'.format(subset))
print('Saving {} set into {}...'.format(subset, dest_csv))