From 150fb67a020f7bdc751feeabd4b0824f0ce257d5 Mon Sep 17 00:00:00 2001
From: Reuben Morais <reuben.morais@gmail.com>
Date: Thu, 12 Sep 2019 10:16:54 +0000
Subject: [PATCH] Validate WAV header duration against file size

X-DeepSpeech: NOBUILD
---
 bin/import_magicdata.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/bin/import_magicdata.py b/bin/import_magicdata.py
index 1532fb84..2ec01549 100755
--- a/bin/import_magicdata.py
+++ b/bin/import_magicdata.py
@@ -11,6 +11,7 @@ import argparse
 import glob
 import pandas
 import tarfile
+import wave
 
 
 COLUMN_NAMES = ['wav_filename', 'wav_filesize', 'transcript']
@@ -22,6 +23,18 @@ def extract(archive_path, target_dir):
         tar.extractall(target_dir)
 
 
+def is_file_truncated(wav_filename, wav_filesize):
+    with wave.open(wav_filename, mode='rb') as fin:
+        assert fin.getframerate() == 16000
+        assert fin.getsampwidth() == 2
+        assert fin.getnchannels() == 1
+
+        header_duration = fin.getnframes() / fin.getframerate()
+        filesize_duration = (wav_filesize - 44) / 16000 / 2
+
+    return header_duration != filesize_duration
+
+
 def preprocess_data(folder_with_archives, target_dir):
     # First extract subset archives
     for subset in ('train', 'dev', 'test'):
@@ -50,6 +63,15 @@ def preprocess_data(folder_with_archives, target_dir):
                 wav_filesize = os.path.getsize(wav)
                 transcript_key = os.path.basename(wav)
                 transcript = transcripts.loc[transcript_key, 'Transcription']
+
+                # Some files in this dataset are truncated, the header duration
+                # doesn't match the file size. This causes errors at training
+                # time, so check here if things are fine before including a file
+                if is_file_truncated(wav_filename, wav_filesize):
+                    print('Warning: File {} is corrupted, header duration does '
+                          'not match file size. Ignoring.'.format(wav_filename))
+                    continue
+
                 set_files.append((wav_filename, wav_filesize, transcript))
             except KeyError:
                 print('Warning: Missing transcript for WAV file {}.'.format(wav))