parent
90c2acd810
commit
fcb9bf6d9f
|
@ -66,6 +66,10 @@ def preprocess_data(folder_with_archives, target_dir):
|
||||||
df = df[durations <= 10.0]
|
df = df[durations <= 10.0]
|
||||||
print('Trimming {} samples > 10 seconds'.format((durations > 10.0).sum()))
|
print('Trimming {} samples > 10 seconds'.format((durations > 10.0).sum()))
|
||||||
|
|
||||||
|
with_noise = df['transcript'].str.contains(r'\[(FIL|SPK)\]')
|
||||||
|
df = df[~with_noise]
|
||||||
|
print('Trimming {} samples with noise ([FIL] or [SPK])'.format(sum(with_noise)))
|
||||||
|
|
||||||
dest_csv = os.path.join(target_dir, 'magicdata_{}.csv'.format(subset))
|
dest_csv = os.path.join(target_dir, 'magicdata_{}.csv'.format(subset))
|
||||||
print('Saving {} set into {}...'.format(subset, dest_csv))
|
print('Saving {} set into {}...'.format(subset, dest_csv))
|
||||||
df.to_csv(dest_csv, index=False)
|
df.to_csv(dest_csv, index=False)
|
||||||
|
|
Loading…
Reference in New Issue