Fix import_cv2.py binary file + permissions
This commit is contained in:
parent
3c3401ec60
commit
179ba1b533
20
bin/import_cv2.py
Normal file → Executable file
20
bin/import_cv2.py
Normal file → Executable file
@ -20,16 +20,16 @@ from util.downloader import SIMPLE_BAR
|
|||||||
|
|
||||||
'''
|
'''
|
||||||
Broadly speaking, this script takes the audio downloaded from Common Voice
|
Broadly speaking, this script takes the audio downloaded from Common Voice
|
||||||
for a certain language, in addition to the *.tsv files output by CorporaCeator,
|
for a certain language, in addition to the *.tsv files output by CorporaCreator,
|
||||||
and the script formats the data and transcripts to be in a state usable by
|
and the script formats the data and transcripts to be in a state usable by
|
||||||
DeepSpeech.py
|
DeepSpeech.py
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
$ python3 import_cv2.py /path/to/audio/data_dir /path/to/tsv_dir
|
$ python3 import_cv2.py /path/to/audio/data_dir /path/to/tsv_dir
|
||||||
|
|
||||||
Input:
|
Input:
|
||||||
(1) audio_dir (string) path to dir of audio downloaded from Common Voice
|
(1) audio_dir (string) path to dir of audio downloaded from Common Voice
|
||||||
(2) tsv_dir (string) path to dir containing {train,test,dev}.tsv files
|
(2) tsv_dir (string) path to dir containing {train,test,dev}.tsv files
|
||||||
which were generated by CorporaCreator
|
which were generated by CorporaCreator
|
||||||
|
|
||||||
Output:
|
Output:
|
||||||
@ -53,23 +53,25 @@ def _preprocess_data(audio_dir, tsv_dir):
|
|||||||
def _maybe_convert_set(audio_dir, input_tsv):
|
def _maybe_convert_set(audio_dir, input_tsv):
|
||||||
output_csv = path.join(audio_dir,os.path.split(input_tsv)[-1].replace('tsv', 'csv'))
|
output_csv = path.join(audio_dir,os.path.split(input_tsv)[-1].replace('tsv', 'csv'))
|
||||||
print("Saving new DeepSpeech-formatted CSV file to: ", output_csv)
|
print("Saving new DeepSpeech-formatted CSV file to: ", output_csv)
|
||||||
|
|
||||||
# Get audiofile path and transcript for each sentence in tsv
|
# Get audiofile path and transcript for each sentence in tsv
|
||||||
samples = []
|
samples = []
|
||||||
with open(input_tsv) as input_tsv_file:
|
with open(input_tsv) as input_tsv_file:
|
||||||
reader = csv.DictReader(input_tsv_file, delimiter='\t')
|
reader = csv.DictReader(input_tsv_file, delimiter='\t')
|
||||||
for row in reader:
|
for row in reader:
|
||||||
samples.append((row['path'], row['sentence']))
|
samples.append((row['path'], row['sentence']))
|
||||||
|
|
||||||
# Keep track of how many samples are good vs. problematic
|
# Keep track of how many samples are good vs. problematic
|
||||||
counter = { 'all': 0, 'too_short': 0, 'too_long': 0 }
|
counter = { 'all': 0, 'too_short': 0, 'too_long': 0 }
|
||||||
lock = RLock()
|
lock = RLock()
|
||||||
num_samples = len(samples)
|
num_samples = len(samples)
|
||||||
rows = []
|
rows = []
|
||||||
|
|
||||||
def one_sample(sample):
|
def one_sample(sample):
|
||||||
""" Take an audio file, and optionally convert it to 16kHz WAV """
|
""" Take an audio file, and optionally convert it to 16kHz WAV """
|
||||||
mp3_filename = path.join(audio_dir, sample[0])
|
mp3_filename = path.join(audio_dir, sample[0])
|
||||||
|
if not path.splitext(mp3_filename.lower())[1] == '.mp3':
|
||||||
|
mp3_filename += ".mp3"
|
||||||
# Storing wav files next to the mp3 ones - just with a different suffix
|
# Storing wav files next to the mp3 ones - just with a different suffix
|
||||||
wav_filename = path.splitext(mp3_filename)[0] + ".wav"
|
wav_filename = path.splitext(mp3_filename)[0] + ".wav"
|
||||||
_maybe_convert_wav(mp3_filename, wav_filename)
|
_maybe_convert_wav(mp3_filename, wav_filename)
|
||||||
@ -86,7 +88,7 @@ def _maybe_convert_set(audio_dir, input_tsv):
|
|||||||
# This one is good - keep it for the target CSV
|
# This one is good - keep it for the target CSV
|
||||||
rows.append((wav_filename, file_size, sample[1]))
|
rows.append((wav_filename, file_size, sample[1]))
|
||||||
counter['all'] += 1
|
counter['all'] += 1
|
||||||
|
|
||||||
print("Importing mp3 files...")
|
print("Importing mp3 files...")
|
||||||
pool = Pool(cpu_count())
|
pool = Pool(cpu_count())
|
||||||
bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
|
bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
|
||||||
@ -95,7 +97,7 @@ def _maybe_convert_set(audio_dir, input_tsv):
|
|||||||
bar.update(num_samples)
|
bar.update(num_samples)
|
||||||
pool.close()
|
pool.close()
|
||||||
pool.join()
|
pool.join()
|
||||||
|
|
||||||
with open(output_csv, 'w') as output_csv_file:
|
with open(output_csv, 'w') as output_csv_file:
|
||||||
print('Writing CSV file for DeepSpeech.py as: ', output_csv)
|
print('Writing CSV file for DeepSpeech.py as: ', output_csv)
|
||||||
writer = csv.DictWriter(output_csv_file, fieldnames=FIELDNAMES)
|
writer = csv.DictWriter(output_csv_file, fieldnames=FIELDNAMES)
|
||||||
@ -103,7 +105,7 @@ def _maybe_convert_set(audio_dir, input_tsv):
|
|||||||
bar = progressbar.ProgressBar(max_value=len(rows), widgets=SIMPLE_BAR)
|
bar = progressbar.ProgressBar(max_value=len(rows), widgets=SIMPLE_BAR)
|
||||||
for filename, file_size, transcript in bar(rows):
|
for filename, file_size, transcript in bar(rows):
|
||||||
writer.writerow({ 'wav_filename': filename, 'wav_filesize': file_size, 'transcript': transcript })
|
writer.writerow({ 'wav_filename': filename, 'wav_filesize': file_size, 'transcript': transcript })
|
||||||
|
|
||||||
print('Imported %d samples.' % (counter['all'] - counter['too_short'] - counter['too_long']))
|
print('Imported %d samples.' % (counter['all'] - counter['too_short'] - counter['too_long']))
|
||||||
if counter['too_short'] > 0:
|
if counter['too_short'] > 0:
|
||||||
print('Skipped %d samples that were too short to match the transcript.' % counter['too_short'])
|
print('Skipped %d samples that were too short to match the transcript.' % counter['too_short'])
|
||||||
|
Loading…
x
Reference in New Issue
Block a user