* Redo remote I/O changes once more; this time without messing with taskcluster * Add bin changes * Fix merge-induced issue? * For the interleaved case with multiple collections, unpack audio on the fly To reproduce the previous failure rm data/smoke_test/ldc93s1.csv rm data/smoke_test/ldc93s1.sdb rm -rf /tmp/ldc93s1_cache_sdb_csv rm -rf /tmp/ckpt_sdb_csv rm -rf /tmp/train_sdb_csv ./bin/run-tc-ldc93s1_new_sdb_csv.sh 109 16000 python -u DeepSpeech.py --noshow_progressbar --noearly_stop --train_files ./data/smoke_test/ldc93s1.sdb,./data/smoke_test/ldc93s1.csv --train_batch_size 1 --feature_cache /tmp/ldc93s1_cache_sdb_csv --dev_files ./data/smoke_test/ldc93s1.sdb,./data/smoke_test/ldc93s1.csv --dev_batch_size 1 --test_files ./data/smoke_test/ldc93s1.sdb,./data/smoke_test/ldc93s1.csv --test_batch_size 1 --n_hidden 100 --epochs 109 --max_to_keep 1 --checkpoint_dir /tmp/ckpt_sdb_csv --learning_rate 0.001 --dropout_rate 0.05 --export_dir /tmp/train_sdb_csv --scorer_path data/smoke_test/pruned_lm.scorer --audio_sample_rate 16000 * Attempt to preserve length information with a wrapper around `map()`… this gets pretty python-y * Call the right `__next__()` * Properly implement the rest of the map wrappers here…… * Fix trailing whitespace situation and other linter complaints * Remove data accidentally checked in * Fix overlay augmentations * Wavs must be open in rb mode if we're passing in an external file pointer -- this confused me * Lint whitespace * Revert "Fix trailing whitespace situation and other linter complaints" This reverts commit c3c45397a2f98e9b00d00c18c4ced4fc52475032. * Fix linter issue but without such an aggressive diff * Move unpack_maybe into sample_collections * Use unpack_maybe in place of duplicate lambda * Fix confusing comment * Add clarifying comment for on-the-fly unpacking
32 lines
1.2 KiB
Python
32 lines
1.2 KiB
Python
import requests
|
|
import progressbar
|
|
|
|
from os import path, makedirs
|
|
from .io import open_remote, path_exists_remote, is_remote_path
|
|
|
|
SIMPLE_BAR = ['Progress ', progressbar.Bar(), ' ', progressbar.Percentage(), ' completed']
|
|
|
|
def maybe_download(archive_name, target_dir, archive_url):
    """Download ``archive_url`` into ``target_dir/archive_name`` if not already present.

    Shows a progress bar while streaming the download in 1 MiB chunks.
    Supports remote (e.g. GCS/S3) targets via the ``open_remote`` /
    ``path_exists_remote`` / ``is_remote_path`` helpers.

    Args:
        archive_name: File name to store the archive under inside target_dir.
        target_dir: Local or remote directory to place the archive in.
        archive_url: HTTP(S) URL to fetch the archive from.

    Returns:
        The full path (local or remote) of the archive file.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    # If archive file does not exist, download it...
    archive_path = path.join(target_dir, archive_name)

    # Only local directories need explicit creation; remote backends create
    # "directories" implicitly on write.
    if not is_remote_path(target_dir) and not path.exists(target_dir):
        print('No path "%s" - creating ...' % target_dir)
        # exist_ok guards against a race with a concurrent job creating the dir
        makedirs(target_dir, exist_ok=True)

    if not path_exists_remote(archive_path):
        print('No archive "%s" - downloading...' % archive_path)
        req = requests.get(archive_url, stream=True)
        # Fail fast on HTTP errors instead of silently saving an error page
        # to disk as if it were the archive.
        req.raise_for_status()
        # content-length may be absent (chunked transfer); fall back to an
        # unknown-length progress bar in that case.
        total_size = int(req.headers.get('content-length', 0))
        done = 0
        with open_remote(archive_path, 'wb') as f:
            bar = progressbar.ProgressBar(max_value=total_size if total_size > 0 else progressbar.UnknownLength, widgets=SIMPLE_BAR)

            for data in req.iter_content(1024*1024):
                done += len(data)
                f.write(data)
                bar.update(done)
            # Terminate the bar cleanly so subsequent prints start on a new line.
            bar.finish()
    else:
        print('Found archive "%s" - not downloading.' % archive_path)
    return archive_path
|