Simplify Dockerfile and add notebook
parent d0f8eb96cd · commit a37ca2ec27
@@ -1,132 +1,13 @@
 # This is a Dockerfile useful for training models with Coqui STT in Jupyter notebooks
 
-FROM ubuntu:20.04 AS kenlm-build
-ENV DEBIAN_FRONTEND=noninteractive
+FROM ghcr.io/coqui-ai/stt-train:v0.10.0-alpha.9
 
-RUN apt-get update && \
-        apt-get install -y --no-install-recommends \
-        build-essential cmake libboost-system-dev \
-        libboost-thread-dev libboost-program-options-dev \
-        libboost-test-dev libeigen3-dev zlib1g-dev \
-        libbz2-dev liblzma-dev && \
-        rm -rf /var/lib/apt/lists/*
+RUN python3 -m pip install --no-cache-dir jupyter jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
 
-# Build KenLM to generate new scorers
-WORKDIR /code
-COPY kenlm /code/kenlm
-RUN cd /code/kenlm && \
-    mkdir -p build && \
-    cd build && \
-    cmake .. && \
-    make -j $(nproc) || \
-    ( echo "ERROR: Failed to build KenLM."; \
-      echo "ERROR: Make sure you update the kenlm submodule on host before building this Dockerfile."; \
-      echo "ERROR: $ cd STT; git submodule update --init kenlm"; \
-      exit 1; )
+RUN mv /code /home/STT
+WORKDIR /home
 
-FROM ubuntu:20.04 AS wget-binaries
-ENV DEBIAN_FRONTEND=noninteractive
+EXPOSE 8888
 
-RUN apt-get update && apt-get install -y --no-install-recommends wget unzip xz-utils
-
-# Tool to convert output graph for inference
-RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/convert_graphdef_memmapped_format.linux.amd64.zip -O temp.zip && \
-    unzip temp.zip && \
-    rm temp.zip
-
-RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/native_client.tf.Linux.tar.xz -O temp.tar.xz && \
-    tar -xf temp.tar.xz && \
-    rm temp.tar.xz
-
-FROM nvcr.io/nvidia/tensorflow:21.05-tf1-py3
-ENV DEBIAN_FRONTEND=noninteractive
-
-# We need to purge python3-xdg because
-# it's breaking STT install later with
-# errors about setuptools
-#
-RUN apt-get update && \
-        apt-get install -y --no-install-recommends \
-        git \
-        wget \
-        libopus0 \
-        libopusfile0 \
-        libsndfile1 \
-        sox \
-        libsox-fmt-mp3 && \
-        apt-get purge -y python3-xdg && \
-        rm -rf /var/lib/apt/lists/
-
-# Make sure pip and its dependencies are up-to-date
-RUN pip3 install --upgrade pip wheel setuptools
-
-WORKDIR /code
-
-COPY native_client /code/native_client
-COPY .git /code/.git
-COPY training/coqui_stt_training/VERSION /code/training/coqui_stt_training/VERSION
-COPY training/coqui_stt_training/GRAPH_VERSION /code/training/coqui_stt_training/GRAPH_VERSION
-
-# Build CTC decoder first, to avoid clashes on incompatible versions upgrades
-RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings
-RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl
-
-COPY setup.py /code/setup.py
-COPY VERSION /code/VERSION
-COPY training /code/training
-# Copy files from previous build stages
-RUN mkdir -p /code/kenlm/build/
-COPY --from=kenlm-build /code/kenlm/build/bin /code/kenlm/build/bin
-COPY --from=wget-binaries /convert_graphdef_memmapped_format /code/convert_graphdef_memmapped_format
-COPY --from=wget-binaries /generate_scorer_package /code/generate_scorer_package
-
-# Install STT
-# No need for the decoder since we did it earlier
-# TensorFlow GPU should already be installed on the base image,
-# and we don't want to break that
-RUN DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --upgrade -e .
-
-# Copy rest of the code and test training
-COPY . /code
-#RUN ./bin/run-ldc93s1.sh
-RUN rm -rf ~/.local/share/stt
-
-### START OVH THINGS
-##
-#
-
-RUN apt-get update && apt-get install -y \
-        man \
-        vim \
-        nano \
-        htop \
-        curl \
-        wget \
-        rsync \
-        ca-certificates \
-        git \
-        zip \
-        procps \
-        ssh \
-        supervisor \
-        gettext-base \
-        less \
-        nodejs \
-        npm \
-        && rm -rf /var/lib/apt/lists/*
-
-RUN pip3 install jupyterlab ipywidgets
-RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager
-RUN jupyter nbextension enable --py widgetsnbextension #enable ipywidgets
-RUN jupyter labextension install jupyterlab-plotly
-
-EXPOSE 8080
-
-ADD start.sh /
-
-WORKDIR /workspace
-RUN chown -R 42420:42420 /workspace
-
-ENTRYPOINT ["/start.sh"]
+CMD ["bash", "-c", "jupyter notebook --notebook-dir=/home --ip 0.0.0.0 --no-browser --allow-root"]
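The simplified Dockerfile drops the multi-stage KenLM/TensorFlow build and starts from the prebuilt stt-train image, so using it reduces to a plain build-and-run: "docker build -t stt-jupyter ." followed by "docker run -p 8888:8888 stt-jupyter". A minimal Python sketch of the same flow, assuming the docker-py SDK is installed and a local Docker daemon is running; the "stt-jupyter" tag is a hypothetical name, not something defined by this commit:

    import docker

    client = docker.from_env()

    # Build the image from the Dockerfile in the repository root.
    image, _ = client.images.build(path=".", tag="stt-jupyter")

    # Run it, publishing the notebook port declared by EXPOSE 8888.
    container = client.containers.run(
        "stt-jupyter", detach=True, ports={"8888/tcp": 8888}
    )

    # Stream the startup log; the notebook URL (including the access
    # token) that jupyter prints on startup appears here.
    for chunk in container.logs(stream=True, follow=True):
        print(chunk.decode(), end="")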
train-ldc.ipynb (new file, 46 lines)
@@ -0,0 +1,46 @@
+import os
+import sys
+
+import pandas
+from STT.training.coqui_stt_training.util.downloader import maybe_download
+#from STT.bin.import_ldc93s1 import _download_and_preprocess_data as download_data
+
+#download_data('/home/STT/data')
+
+def download_and_preprocess_data(data_dir):
+    # Conditionally download data
+    LDC93S1_BASE = "LDC93S1"
+    LDC93S1_BASE_URL = "https://catalog.ldc.upenn.edu/desc/addenda/"
+    local_file = maybe_download(
+        LDC93S1_BASE + ".wav", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".wav"
+    )
+    trans_file = maybe_download(
+        LDC93S1_BASE + ".txt", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".txt"
+    )
+    with open(trans_file, "r") as fin:
+        transcript = " ".join(fin.read().strip().lower().split(" ")[2:]).replace(
+            ".", ""
+        )
+
+    df = pandas.DataFrame(
+        data=[(os.path.abspath(local_file), os.path.getsize(local_file), transcript)],
+        columns=["wav_filename", "wav_filesize", "transcript"],
+    )
+    df.to_csv(os.path.join(data_dir, "ldc93s1.csv"), index=False)
+
+download_and_preprocess_data('/home/STT/data')
+
+
+from STT.training.coqui_stt_training.train import train, early_training_checks
+from STT.training.coqui_stt_training.util.config import initialize_globals
+
+#Config.train_files=['/home/STT/data/ldc.csv']
+#Config.dev_files=['/home/STT/data/ldc.csv']
+#Config.test_files=['/home/STT/data/ldc.csv']
+
+#Config.alphabet_config_path='/home/STT/data/alphabet.txt'
+initialize_globals()
+
+early_training_checks()
+
+train()
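The transcript cleanup inside download_and_preprocess_data is compact, so a worked illustration may help. The [2:] slice assumes a TIMIT-style transcript line "<begin_sample> <end_sample> <sentence>"; the sample line below follows that layout and is illustrative, not read from the real LDC93S1.txt:

    # Drop the two leading sample offsets, lowercase, and strip periods,
    # exactly as in the notebook.
    line = "0 97681 She had your dark suit in greasy wash water all year."
    transcript = " ".join(line.strip().lower().split(" ")[2:]).replace(".", "")
    print(transcript)
    # -> she had your dark suit in greasy wash water all year

The function then writes this transcript, together with the absolute wav path and file size, as the single row of ldc93s1.csv (columns wav_filename, wav_filesize, transcript), the CSV layout STT's data loader expects.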
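The Config overrides at the end of the notebook are left commented out, so initialize_globals() runs with defaults. A hedged sketch of enabling them, assuming Config is exported by the same util.config module as initialize_globals (as the commented attribute names imply); note the comments reference ldc.csv while the code actually writes ldc93s1.csv, and whether assignments made before initialize_globals() survive it is not verified here:

    from STT.training.coqui_stt_training.train import train, early_training_checks
    from STT.training.coqui_stt_training.util.config import Config, initialize_globals

    # Point train/dev/test at the single-sample CSV generated above.
    csv_path = "/home/STT/data/ldc93s1.csv"
    Config.train_files = [csv_path]
    Config.dev_files = [csv_path]
    Config.test_files = [csv_path]
    Config.alphabet_config_path = "/home/STT/data/alphabet.txt"

    initialize_globals()
    early_training_checks()
    train()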