Validate WAV header duration against file size
X-DeepSpeech: NOBUILD
This commit is contained in:
parent
fcb9bf6d9f
commit
150fb67a02
|
@ -11,6 +11,7 @@ import argparse
|
||||||
import glob
|
import glob
|
||||||
import pandas
|
import pandas
|
||||||
import tarfile
|
import tarfile
|
||||||
|
import wave
|
||||||
|
|
||||||
|
|
||||||
COLUMN_NAMES = ['wav_filename', 'wav_filesize', 'transcript']
|
COLUMN_NAMES = ['wav_filename', 'wav_filesize', 'transcript']
|
||||||
|
@ -22,6 +23,18 @@ def extract(archive_path, target_dir):
|
||||||
tar.extractall(target_dir)
|
tar.extractall(target_dir)
|
||||||
|
|
||||||
|
|
||||||
|
def is_file_truncated(wav_filename, wav_filesize, header_size=44):
    """Return True if a WAV file's header disagrees with its size on disk.

    The number of audio bytes announced by the WAV header (frame count times
    bytes per frame) is compared with the number of bytes that actually
    follow the header in the file. A mismatch means the file was cut short
    (or otherwise corrupted) and should not be used.

    Args:
        wav_filename: Path of the WAV file to inspect.
        wav_filesize: Size of that file in bytes, e.g. from os.path.getsize.
        header_size: Number of bytes preceding the audio data. Defaults to
            44, the value this check has always used.

    Returns:
        True if the header and the file size disagree, False otherwise.

    Raises:
        AssertionError: If the file is not 16000 Hz, 16-bit, mono.
    """
    with wave.open(wav_filename, mode='rb') as fin:
        framerate = fin.getframerate()
        sample_width = fin.getsampwidth()
        channels = fin.getnchannels()
        declared_frames = fin.getnframes()

    # Raised explicitly instead of using `assert` statements so that the
    # format checks are not stripped when Python runs with -O. The exception
    # type is kept as AssertionError for backward compatibility.
    if framerate != 16000:
        raise AssertionError(
            'Expected 16000 Hz audio in {}, got {}'.format(wav_filename, framerate))
    if sample_width != 2:
        raise AssertionError(
            'Expected 2-byte samples in {}, got {}'.format(wav_filename, sample_width))
    if channels != 1:
        raise AssertionError(
            'Expected mono audio in {}, got {} channels'.format(wav_filename, channels))

    # Compare exact byte counts rather than float durations: integers cannot
    # suffer rounding, and the frame size comes from the header itself
    # instead of being hard-coded a second time.
    declared_bytes = declared_frames * sample_width * channels
    actual_bytes = wav_filesize - header_size

    return declared_bytes != actual_bytes
|
||||||
|
|
||||||
|
|
||||||
def preprocess_data(folder_with_archives, target_dir):
|
def preprocess_data(folder_with_archives, target_dir):
|
||||||
# First extract subset archives
|
# First extract subset archives
|
||||||
for subset in ('train', 'dev', 'test'):
|
for subset in ('train', 'dev', 'test'):
|
||||||
|
@ -50,6 +63,15 @@ def preprocess_data(folder_with_archives, target_dir):
|
||||||
wav_filesize = os.path.getsize(wav)
|
wav_filesize = os.path.getsize(wav)
|
||||||
transcript_key = os.path.basename(wav)
|
transcript_key = os.path.basename(wav)
|
||||||
transcript = transcripts.loc[transcript_key, 'Transcription']
|
transcript = transcripts.loc[transcript_key, 'Transcription']
|
||||||
|
|
||||||
|
# Some files in this dataset are truncated, the header duration
|
||||||
|
# doesn't match the file size. This causes errors at training
|
||||||
|
# time, so check here if things are fine before including a file
|
||||||
|
if is_file_truncated(wav_filename, wav_filesize):
|
||||||
|
print('Warning: File {} is corrupted, header duration does '
|
||||||
|
'not match file size. Ignoring.'.format(wav_filename))
|
||||||
|
continue
|
||||||
|
|
||||||
set_files.append((wav_filename, wav_filesize, transcript))
|
set_files.append((wav_filename, wav_filesize, transcript))
|
||||||
except KeyError:
|
except KeyError:
|
||||||
print('Warning: Missing transcript for WAV file {}.'.format(wav))
|
print('Warning: Missing transcript for WAV file {}.'.format(wav))
|
||||||
|
|
Loading…
Reference in New Issue