Merge pull request #1734 from mozilla/issue1726
Fixed #1726 (Imported 8khz training audio compromised by unfiltered upsampling)
This commit is contained in:
commit
ccd75f2a1b
@ -16,8 +16,8 @@ import os
|
|||||||
import pandas
|
import pandas
|
||||||
import subprocess
|
import subprocess
|
||||||
import unicodedata
|
import unicodedata
|
||||||
import wave
|
import librosa
|
||||||
import audioop
|
import resampy
|
||||||
|
|
||||||
from util.text import validate_label
|
from util.text import validate_label
|
||||||
|
|
||||||
@ -142,7 +142,7 @@ def _split_wav_and_sentences(data_dir, trans_data, original_data, converted_data
|
|||||||
|
|
||||||
print("splitting {} according to {}".format(wav_files, trans_file))
|
print("splitting {} according to {}".format(wav_files, trans_file))
|
||||||
|
|
||||||
origAudios = [wave.open(wav_file, "r") for wav_file in wav_files]
|
origAudios = [librosa.load(wav_file, sr=None, mono=False) for wav_file in wav_files]
|
||||||
|
|
||||||
# Loop over segments and split wav_file for each segment
|
# Loop over segments and split wav_file for each segment
|
||||||
for segment in segments:
|
for segment in segments:
|
||||||
@ -160,26 +160,20 @@ def _split_wav_and_sentences(data_dir, trans_data, original_data, converted_data
|
|||||||
if transcript != None:
|
if transcript != None:
|
||||||
files.append((os.path.abspath(new_wav_file), new_wav_filesize, transcript))
|
files.append((os.path.abspath(new_wav_file), new_wav_filesize, transcript))
|
||||||
|
|
||||||
# Close origAudios
|
|
||||||
for origAudio in origAudios:
|
|
||||||
origAudio.close()
|
|
||||||
|
|
||||||
return pandas.DataFrame(data=files, columns=["wav_filename", "wav_filesize", "transcript"])
|
return pandas.DataFrame(data=files, columns=["wav_filename", "wav_filesize", "transcript"])
|
||||||
|
|
||||||
|
def _split_audio(origAudio, start_time, stop_time):
|
||||||
|
audioData, frameRate = origAudio
|
||||||
|
nChannels = len(audioData.shape)
|
||||||
|
startIndex = int(start_time * frameRate)
|
||||||
|
stopIndex = int(stop_time * frameRate)
|
||||||
|
return audioData[startIndex: stopIndex] if 1 == nChannels else audioData[:, startIndex: stopIndex]
|
||||||
|
|
||||||
def _split_and_resample_wav(origAudio, start_time, stop_time, new_wav_file, target_rate=16000):
    """Cut one segment out of ``origAudio``, resample it and write it to disk.

    Args:
        origAudio: ``(audioData, frameRate)`` pair describing the source
            recording.
        start_time: Start of the segment, in units of ``1 / frameRate``.
        stop_time: End of the segment (exclusive), in the same units.
        new_wav_file: Path of the WAV file to write.
        target_rate: Sample rate of the written file. Defaults to 16000,
            the rate that was previously hard-coded.
    """
    frameRate = origAudio[1]
    chunkData = _split_audio(origAudio, start_time, stop_time)
    # Only run the resampling filter when the rate actually changes; audio
    # that is already at the target rate is written out untouched.
    if frameRate != target_rate:
        chunkData = resampy.resample(chunkData, frameRate, target_rate)
    # NOTE(review): the ``librosa.output`` module is reported as removed in
    # librosa 0.8 -- confirm the pinned librosa version still provides it.
    librosa.output.write_wav(new_wav_file, chunkData, target_rate)
|
|
||||||
|
|
||||||
def _split_sets(filelist):
|
def _split_sets(filelist):
|
||||||
# We initially split the entire set into 80% train and 20% test, then
|
# We initially split the entire set into 80% train and 20% test, then
|
||||||
|
@ -15,3 +15,5 @@ requests
|
|||||||
tables
|
tables
|
||||||
attrdict
|
attrdict
|
||||||
setuptools
|
setuptools
|
||||||
|
librosa
|
||||||
|
resampy
|
||||||
|
Loading…
x
Reference in New Issue
Block a user