From f062f75e172d13489d8d790e3f6a52a3365ba80d Mon Sep 17 00:00:00 2001 From: Josh Meyer Date: Fri, 16 Jul 2021 09:24:00 -0400 Subject: [PATCH 1/9] working on dockerfile with jupyter support --- Dockerfile.train.jupyter | 198 ++++++++++++++++++++++++++++++++ Dockerfile.train.jupyter.simple | 97 ++++++++++++++++ start.sh | 9 ++ tfenv.yml | 16 +++ 4 files changed, 320 insertions(+) create mode 100644 Dockerfile.train.jupyter create mode 100644 Dockerfile.train.jupyter.simple create mode 100644 start.sh create mode 100644 tfenv.yml diff --git a/Dockerfile.train.jupyter b/Dockerfile.train.jupyter new file mode 100644 index 00000000..09c6ed79 --- /dev/null +++ b/Dockerfile.train.jupyter @@ -0,0 +1,198 @@ +# This is a Dockerfile useful for training models with Coqui STT in Jupyter notebooks + +FROM ubuntu:20.04 AS kenlm-build +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential cmake libboost-system-dev \ + libboost-thread-dev libboost-program-options-dev \ + libboost-test-dev libeigen3-dev zlib1g-dev \ + libbz2-dev liblzma-dev && \ + rm -rf /var/lib/apt/lists/* + +# Build KenLM to generate new scorers +WORKDIR /code +COPY kenlm /code/kenlm +RUN cd /code/kenlm && \ + mkdir -p build && \ + cd build && \ + cmake .. 
&& \ + make -j $(nproc) || \ + ( echo "ERROR: Failed to build KenLM."; \ + echo "ERROR: Make sure you update the kenlm submodule on host before building this Dockerfile."; \ + echo "ERROR: $ cd STT; git submodule update --init kenlm"; \ + exit 1; ) + + +FROM ubuntu:20.04 AS wget-binaries +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y --no-install-recommends wget unzip xz-utils + +# Tool to convert output graph for inference +RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/convert_graphdef_memmapped_format.linux.amd64.zip -O temp.zip && \ + unzip temp.zip && \ + rm temp.zip + +RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/native_client.tf.Linux.tar.xz -O temp.tar.xz && \ + tar -xf temp.tar.xz && \ + rm temp.tar.xz + + +FROM nvcr.io/nvidia/tensorflow:21.05-tf1-py3 +ENV DEBIAN_FRONTEND=noninteractive + +# We need to purge python3-xdg because +# it's breaking STT install later with +# errors about setuptools +# +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + wget \ + libopus0 \ + libopusfile0 \ + libsndfile1 \ + sox \ + libsox-fmt-mp3 && \ + apt-get purge -y python3-xdg && \ + rm -rf /var/lib/apt/lists/ + +# Make sure pip and its dependencies are up-to-date +RUN pip3 install --upgrade pip wheel setuptools + +WORKDIR /code + +COPY native_client /code/native_client +COPY .git /code/.git +COPY training/coqui_stt_training/VERSION /code/training/coqui_stt_training/VERSION +COPY training/coqui_stt_training/GRAPH_VERSION /code/training/coqui_stt_training/GRAPH_VERSION + +# Build CTC decoder first, to avoid clashes on incompatible versions upgrades +RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings +RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl + +COPY setup.py /code/setup.py +COPY VERSION /code/VERSION +COPY training /code/training +# Copy files from previous build stages +RUN mkdir -p 
/code/kenlm/build/ +COPY --from=kenlm-build /code/kenlm/build/bin /code/kenlm/build/bin +COPY --from=wget-binaries /convert_graphdef_memmapped_format /code/convert_graphdef_memmapped_format +COPY --from=wget-binaries /generate_scorer_package /code/generate_scorer_package + +# Install STT +# No need for the decoder since we did it earlier +# TensorFlow GPU should already be installed on the base image, +# and we don't want to break that +RUN DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --upgrade -e . + +# Copy rest of the code and test training +COPY . /code +#RUN ./bin/run-ldc93s1.sh +RUN rm -rf ~/.local/share/stt + +### START OVH THINGS +## +# + +# FROM gcr.io/kaggle-gpu-images/python:v98 + +RUN chsh -s /bin/bash +ENV SHELL=/bin/bash +RUN rm /bin/sh && ln -s /bin/bash /bin/sh + +RUN apt-get update && apt-get install -y \ + man \ + vim \ + nano \ + htop \ + curl \ + wget \ + rsync \ + ca-certificates \ + git \ + zip \ + procps \ + ssh \ + supervisor \ + gettext-base \ + less \ + && rm -rf /var/lib/apt/lists/* + +# install nvm +# https://github.com/creationix/nvm#install-script +RUN curl --silent -o- https://raw.githubusercontent.com/creationix/nvm/v0.33.11/install.sh | bash + +#ENV NVM_DIR /root/.nvm +ENV NVM_DIR /usr/local/nvm +ENV NODE_VERSION v12.20.1 + +# install node and npm +RUN source $NVM_DIR/nvm.sh \ + && nvm install $NODE_VERSION \ + && nvm alias default $NODE_VERSION \ + && nvm use default + +# add node and npm to path so the commands are available +ENV NODE_PATH $NVM_DIR/versions/node/$NODE_VERSION/bin +ENV PATH $NODE_PATH:$PATH + +RUN pip install pip==20.3.4 +RUN pip install jupyterlab==2.2.9 ipywidgets==7.6.3 +RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager +RUN jupyter nbextension enable --py widgetsnbextension #enable ipywidgets +RUN jupyter labextension install jupyterlab-plotly@4.14.3 + +#install xlrd python module +RUN pip install pyradiomics +RUN pip install xlrd==1.2.0 +RUN pip install zarr +RUN pip install 
imbalanced-learn +RUN pip install openpyxl +RUN pip install efficientnet-pytorch +RUN pip install monai +RUN pip install prince +RUN pip install vit-pytorch +RUN pip install lifelines==0.25.11 +RUN pip install timm==0.3.2 +RUN pip install keras-retinanet==1.0.0 +RUN python -m pip install histomicstk --find-links https://girder.github.io/large_image_wheels +RUN pip install luminoth ipympl pysurvival missingpy pyinform pingouin pyAgrum missingno autoimpute networkx community yellowbrick factor_analyzer hdbscan pyitlib +RUN pip install eli5 dtreeviz gower batchgenerators mlinsights efficientnet-pytorch pretrainedmodels +# Add R to Jupyter Kernel +RUN conda install -y -c r r-irkernel + +#Install survival, sm, ggplot2, Hmisc, mixOmics (ce dernier est en repositoire Bioconductor) + +RUN conda install -y -c cran r-survival +RUN conda install -y -c cran r-sm +RUN conda install -y -c cran r-ggplot2 +RUN conda install -y -c cran r-hmisc +RUN conda install -y -c cran r-mixomics +RUN conda install -y -c cran r-caret +RUN conda install -y -c cran r-survminer +RUN conda install -y -c cran r-ggfortify +RUN conda install -y -c cran r-wordcloud +RUN conda install -y -c cran r-tm +RUN conda install -y -c cran r-prioritylasso +RUN conda install -y -c cran r-blockforest +RUN conda install -y -c cran r-mice + +#### tensorflow 1 +# Create the environment: +SHELL ["/bin/bash", "-c"] +COPY tfenv.yml . +RUN conda env create -f tfenv.yml +SHELL ["conda", "run", "-n", "tf1", "/bin/bash", "-c"] +RUN python -m ipykernel install --name=tensorflow1 + +EXPOSE 8080 + +ADD start.sh / + +WORKDIR /workspace +RUN chown -R 42420:42420 /workspace + +ENTRYPOINT ["/start.sh"] diff --git a/Dockerfile.train.jupyter.simple b/Dockerfile.train.jupyter.simple new file mode 100644 index 00000000..bf4f798f --- /dev/null +++ b/Dockerfile.train.jupyter.simple @@ -0,0 +1,97 @@ +# This is a Dockerfile useful for training models with Coqui STT. 
+# You can train "acoustic models" with audio + Tensorflow, and +# you can create "scorers" with text + KenLM. + +FROM ubuntu:20.04 AS kenlm-build +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential cmake libboost-system-dev \ + libboost-thread-dev libboost-program-options-dev \ + libboost-test-dev libeigen3-dev zlib1g-dev \ + libbz2-dev liblzma-dev && \ + rm -rf /var/lib/apt/lists/* + +# Build KenLM to generate new scorers +WORKDIR /code +COPY kenlm /code/kenlm +RUN cd /code/kenlm && \ + mkdir -p build && \ + cd build && \ + cmake .. && \ + make -j $(nproc) || \ + ( echo "ERROR: Failed to build KenLM."; \ + echo "ERROR: Make sure you update the kenlm submodule on host before building this Dockerfile."; \ + echo "ERROR: $ cd STT; git submodule update --init kenlm"; \ + exit 1; ) + + +FROM ubuntu:20.04 AS wget-binaries +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y --no-install-recommends wget unzip xz-utils + +# Tool to convert output graph for inference +RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/convert_graphdef_memmapped_format.linux.amd64.zip -O temp.zip && \ + unzip temp.zip && \ + rm temp.zip + +RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/native_client.tf.Linux.tar.xz -O temp.tar.xz && \ + tar -xf temp.tar.xz && \ + rm temp.tar.xz + + +FROM jupyter/tensorflow-notebook +ENV DEBIAN_FRONTEND=noninteractive +USER root + +# We need to purge python3-xdg because +# it's breaking STT install later with +# errors about setuptools +# +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + wget \ + libopus0 \ + libopusfile0 \ + libsndfile1 \ + sox \ + libsox-fmt-mp3 && \ + apt-get purge -y python3-xdg && \ + rm -rf /var/lib/apt/lists/ + +# Make sure pip and its dependencies are up-to-date +RUN pip3 install --upgrade pip wheel setuptools +RUN pip3 
uninstall -y tensorflow && pip3 install -y 'tensorflow-gpu==1.15.4' + +WORKDIR /code + +COPY native_client /code/native_client +COPY .git /code/.git +COPY training/coqui_stt_training/VERSION /code/training/coqui_stt_training/VERSION +COPY training/coqui_stt_training/GRAPH_VERSION /code/training/coqui_stt_training/GRAPH_VERSION + +# Build CTC decoder first, to avoid clashes on incompatible versions upgrades +RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings +RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl + +COPY setup.py /code/setup.py +COPY VERSION /code/VERSION +COPY training /code/training +# Copy files from previous build stages +RUN mkdir -p /code/kenlm/build/ +COPY --from=kenlm-build /code/kenlm/build/bin /code/kenlm/build/bin +COPY --from=wget-binaries /convert_graphdef_memmapped_format /code/convert_graphdef_memmapped_format +COPY --from=wget-binaries /generate_scorer_package /code/generate_scorer_package + +# Install STT +# No need for the decoder since we did it earlier +# TensorFlow GPU should already be installed on the base image, +# and we don't want to break that +RUN DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --upgrade -e . + +# Copy rest of the code and test training +COPY . 
/code +RUN ./bin/run-ldc93s1.sh && rm -rf ~/.local/share/stt diff --git a/start.sh b/start.sh new file mode 100644 index 00000000..4ec5999e --- /dev/null +++ b/start.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +set -eu + +jupyter lab --ip=0.0.0.0 --port=8080 --no-browser --allow-root \ + --LabApp.token='' \ + --LabApp.custom_display_url=${JOB_URL_SCHEME}${JOB_ID}.${JOB_HOST} \ + --LabApp.allow_remote_access=True \ + --LabApp.allow_origin='*' \ + --LabApp.disable_check_xsrf=True \ No newline at end of file diff --git a/tfenv.yml b/tfenv.yml new file mode 100644 index 00000000..d489c7f3 --- /dev/null +++ b/tfenv.yml @@ -0,0 +1,16 @@ +name: tf1 +channels: + - conda-forge +dependencies: + - python=3.7 + - tensorflow-gpu==1.15 + - ipykernel + - google-auth + - tensorflow-hub + - pydicom + - pandas + - seaborn + - matplotlib + - scikit-learn + - openslide + - keras From 649bc535369ac48be47bef64ff97f0dd51aaf246 Mon Sep 17 00:00:00 2001 From: Josh Meyer Date: Fri, 16 Jul 2021 11:14:00 -0400 Subject: [PATCH 2/9] Remove extra installs from Dockerfile --- Dockerfile.train.jupyter | 77 +++++--------------------- Dockerfile.train.jupyter.simple | 97 --------------------------------- start.sh | 0 3 files changed, 13 insertions(+), 161 deletions(-) delete mode 100644 Dockerfile.train.jupyter.simple mode change 100644 => 100755 start.sh diff --git a/Dockerfile.train.jupyter b/Dockerfile.train.jupyter index 09c6ed79..1e2a0ff7 100644 --- a/Dockerfile.train.jupyter +++ b/Dockerfile.train.jupyter @@ -40,6 +40,7 @@ RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/downloa rm temp.tar.xz +#FROM gcr.io/kaggle-gpu-images/python:v98 FROM nvcr.io/nvidia/tensorflow:21.05-tf1-py3 ENV DEBIAN_FRONTEND=noninteractive @@ -97,7 +98,7 @@ RUN rm -rf ~/.local/share/stt ## # -# FROM gcr.io/kaggle-gpu-images/python:v98 +#FROM gcr.io/kaggle-gpu-images/python:v98 RUN chsh -s /bin/bash ENV SHELL=/bin/bash @@ -119,74 +120,22 @@ RUN apt-get update && apt-get install -y \ supervisor \ 
gettext-base \ less \ + nodejs \ + npm \ && rm -rf /var/lib/apt/lists/* -# install nvm -# https://github.com/creationix/nvm#install-script -RUN curl --silent -o- https://raw.githubusercontent.com/creationix/nvm/v0.33.11/install.sh | bash - -#ENV NVM_DIR /root/.nvm -ENV NVM_DIR /usr/local/nvm -ENV NODE_VERSION v12.20.1 - -# install node and npm -RUN source $NVM_DIR/nvm.sh \ - && nvm install $NODE_VERSION \ - && nvm alias default $NODE_VERSION \ - && nvm use default - -# add node and npm to path so the commands are available -ENV NODE_PATH $NVM_DIR/versions/node/$NODE_VERSION/bin -ENV PATH $NODE_PATH:$PATH - -RUN pip install pip==20.3.4 -RUN pip install jupyterlab==2.2.9 ipywidgets==7.6.3 +RUN pip3 install jupyterlab ipywidgets RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager RUN jupyter nbextension enable --py widgetsnbextension #enable ipywidgets -RUN jupyter labextension install jupyterlab-plotly@4.14.3 +RUN jupyter labextension install jupyterlab-plotly -#install xlrd python module -RUN pip install pyradiomics -RUN pip install xlrd==1.2.0 -RUN pip install zarr -RUN pip install imbalanced-learn -RUN pip install openpyxl -RUN pip install efficientnet-pytorch -RUN pip install monai -RUN pip install prince -RUN pip install vit-pytorch -RUN pip install lifelines==0.25.11 -RUN pip install timm==0.3.2 -RUN pip install keras-retinanet==1.0.0 -RUN python -m pip install histomicstk --find-links https://girder.github.io/large_image_wheels -RUN pip install luminoth ipympl pysurvival missingpy pyinform pingouin pyAgrum missingno autoimpute networkx community yellowbrick factor_analyzer hdbscan pyitlib -RUN pip install eli5 dtreeviz gower batchgenerators mlinsights efficientnet-pytorch pretrainedmodels -# Add R to Jupyter Kernel -RUN conda install -y -c r r-irkernel - -#Install survival, sm, ggplot2, Hmisc, mixOmics (ce dernier est en repositoire Bioconductor) - -RUN conda install -y -c cran r-survival -RUN conda install -y -c cran r-sm -RUN conda install 
-y -c cran r-ggplot2 -RUN conda install -y -c cran r-hmisc -RUN conda install -y -c cran r-mixomics -RUN conda install -y -c cran r-caret -RUN conda install -y -c cran r-survminer -RUN conda install -y -c cran r-ggfortify -RUN conda install -y -c cran r-wordcloud -RUN conda install -y -c cran r-tm -RUN conda install -y -c cran r-prioritylasso -RUN conda install -y -c cran r-blockforest -RUN conda install -y -c cran r-mice - -#### tensorflow 1 -# Create the environment: -SHELL ["/bin/bash", "-c"] -COPY tfenv.yml . -RUN conda env create -f tfenv.yml -SHELL ["conda", "run", "-n", "tf1", "/bin/bash", "-c"] -RUN python -m ipykernel install --name=tensorflow1 +# #### tensorflow 1 +# # Create the environment: +# SHELL ["/bin/bash", "-c"] +# COPY tfenv.yml . +# RUN conda env create -f tfenv.yml +# SHELL ["conda", "run", "-n", "tf1", "/bin/bash", "-c"] +# RUN python -m ipykernel install --name=tensorflow1 EXPOSE 8080 diff --git a/Dockerfile.train.jupyter.simple b/Dockerfile.train.jupyter.simple deleted file mode 100644 index bf4f798f..00000000 --- a/Dockerfile.train.jupyter.simple +++ /dev/null @@ -1,97 +0,0 @@ -# This is a Dockerfile useful for training models with Coqui STT. -# You can train "acoustic models" with audio + Tensorflow, and -# you can create "scorers" with text + KenLM. - -FROM ubuntu:20.04 AS kenlm-build -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - build-essential cmake libboost-system-dev \ - libboost-thread-dev libboost-program-options-dev \ - libboost-test-dev libeigen3-dev zlib1g-dev \ - libbz2-dev liblzma-dev && \ - rm -rf /var/lib/apt/lists/* - -# Build KenLM to generate new scorers -WORKDIR /code -COPY kenlm /code/kenlm -RUN cd /code/kenlm && \ - mkdir -p build && \ - cd build && \ - cmake .. 
&& \ - make -j $(nproc) || \ - ( echo "ERROR: Failed to build KenLM."; \ - echo "ERROR: Make sure you update the kenlm submodule on host before building this Dockerfile."; \ - echo "ERROR: $ cd STT; git submodule update --init kenlm"; \ - exit 1; ) - - -FROM ubuntu:20.04 AS wget-binaries -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-get update && apt-get install -y --no-install-recommends wget unzip xz-utils - -# Tool to convert output graph for inference -RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/convert_graphdef_memmapped_format.linux.amd64.zip -O temp.zip && \ - unzip temp.zip && \ - rm temp.zip - -RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/native_client.tf.Linux.tar.xz -O temp.tar.xz && \ - tar -xf temp.tar.xz && \ - rm temp.tar.xz - - -FROM jupyter/tensorflow-notebook -ENV DEBIAN_FRONTEND=noninteractive -USER root - -# We need to purge python3-xdg because -# it's breaking STT install later with -# errors about setuptools -# -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - git \ - wget \ - libopus0 \ - libopusfile0 \ - libsndfile1 \ - sox \ - libsox-fmt-mp3 && \ - apt-get purge -y python3-xdg && \ - rm -rf /var/lib/apt/lists/ - -# Make sure pip and its dependencies are up-to-date -RUN pip3 install --upgrade pip wheel setuptools -RUN pip3 uninstall -y tensorflow && pip3 install -y 'tensorflow-gpu==1.15.4' - -WORKDIR /code - -COPY native_client /code/native_client -COPY .git /code/.git -COPY training/coqui_stt_training/VERSION /code/training/coqui_stt_training/VERSION -COPY training/coqui_stt_training/GRAPH_VERSION /code/training/coqui_stt_training/GRAPH_VERSION - -# Build CTC decoder first, to avoid clashes on incompatible versions upgrades -RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings -RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl - -COPY setup.py /code/setup.py -COPY VERSION /code/VERSION -COPY 
training /code/training -# Copy files from previous build stages -RUN mkdir -p /code/kenlm/build/ -COPY --from=kenlm-build /code/kenlm/build/bin /code/kenlm/build/bin -COPY --from=wget-binaries /convert_graphdef_memmapped_format /code/convert_graphdef_memmapped_format -COPY --from=wget-binaries /generate_scorer_package /code/generate_scorer_package - -# Install STT -# No need for the decoder since we did it earlier -# TensorFlow GPU should already be installed on the base image, -# and we don't want to break that -RUN DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --upgrade -e . - -# Copy rest of the code and test training -COPY . /code -RUN ./bin/run-ldc93s1.sh && rm -rf ~/.local/share/stt diff --git a/start.sh b/start.sh old mode 100644 new mode 100755 From d0f8eb96cd833e633761c5196c62867a6f3d8b21 Mon Sep 17 00:00:00 2001 From: Josh Meyer Date: Fri, 16 Jul 2021 11:52:51 -0400 Subject: [PATCH 3/9] Take out OVH run-time params --- Dockerfile.train.jupyter | 15 --------------- start.sh | 7 +------ 2 files changed, 1 insertion(+), 21 deletions(-) diff --git a/Dockerfile.train.jupyter b/Dockerfile.train.jupyter index 1e2a0ff7..d0a4872a 100644 --- a/Dockerfile.train.jupyter +++ b/Dockerfile.train.jupyter @@ -40,7 +40,6 @@ RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/downloa rm temp.tar.xz -#FROM gcr.io/kaggle-gpu-images/python:v98 FROM nvcr.io/nvidia/tensorflow:21.05-tf1-py3 ENV DEBIAN_FRONTEND=noninteractive @@ -98,12 +97,6 @@ RUN rm -rf ~/.local/share/stt ## # -#FROM gcr.io/kaggle-gpu-images/python:v98 - -RUN chsh -s /bin/bash -ENV SHELL=/bin/bash -RUN rm /bin/sh && ln -s /bin/bash /bin/sh - RUN apt-get update && apt-get install -y \ man \ vim \ @@ -129,14 +122,6 @@ RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager RUN jupyter nbextension enable --py widgetsnbextension #enable ipywidgets RUN jupyter labextension install jupyterlab-plotly -# #### tensorflow 1 -# # Create the environment: -# SHELL ["/bin/bash", "-c"] -# 
COPY tfenv.yml . -# RUN conda env create -f tfenv.yml -# SHELL ["conda", "run", "-n", "tf1", "/bin/bash", "-c"] -# RUN python -m ipykernel install --name=tensorflow1 - EXPOSE 8080 ADD start.sh / diff --git a/start.sh b/start.sh index 4ec5999e..05476bc9 100755 --- a/start.sh +++ b/start.sh @@ -1,9 +1,4 @@ #!/usr/bin/env bash set -eu -jupyter lab --ip=0.0.0.0 --port=8080 --no-browser --allow-root \ - --LabApp.token='' \ - --LabApp.custom_display_url=${JOB_URL_SCHEME}${JOB_ID}.${JOB_HOST} \ - --LabApp.allow_remote_access=True \ - --LabApp.allow_origin='*' \ - --LabApp.disable_check_xsrf=True \ No newline at end of file +jupyter lab --ip=0.0.0.0 --port=8080 --allow-root From a37ca2ec27d9e831c0901c9912d2bda45de52f08 Mon Sep 17 00:00:00 2001 From: Josh Meyer Date: Tue, 20 Jul 2021 04:20:57 -0400 Subject: [PATCH 4/9] Simplify dockerfile and add notebook --- Dockerfile.train.jupyter | 133 +++------------------------------------ train-ldc.ipynb | 46 ++++++++++++++ 2 files changed, 53 insertions(+), 126 deletions(-) create mode 100644 train-ldc.ipynb diff --git a/Dockerfile.train.jupyter b/Dockerfile.train.jupyter index d0a4872a..71f3e466 100644 --- a/Dockerfile.train.jupyter +++ b/Dockerfile.train.jupyter @@ -1,132 +1,13 @@ # This is a Dockerfile useful for training models with Coqui STT in Jupyter notebooks -FROM ubuntu:20.04 AS kenlm-build -ENV DEBIAN_FRONTEND=noninteractive +FROM ghcr.io/coqui-ai/stt-train:v0.10.0-alpha.9 -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - build-essential cmake libboost-system-dev \ - libboost-thread-dev libboost-program-options-dev \ - libboost-test-dev libeigen3-dev zlib1g-dev \ - libbz2-dev liblzma-dev && \ - rm -rf /var/lib/apt/lists/* +RUN python3 -m pip install --no-cache-dir jupyter jupyter_http_over_ws +RUN jupyter serverextension enable --py jupyter_http_over_ws -# Build KenLM to generate new scorers -WORKDIR /code -COPY kenlm /code/kenlm -RUN cd /code/kenlm && \ - mkdir -p build && \ - cd build && \ - 
cmake .. && \ - make -j $(nproc) || \ - ( echo "ERROR: Failed to build KenLM."; \ - echo "ERROR: Make sure you update the kenlm submodule on host before building this Dockerfile."; \ - echo "ERROR: $ cd STT; git submodule update --init kenlm"; \ - exit 1; ) +RUN mv /code /home/STT +WORKDIR /home +EXPOSE 8888 -FROM ubuntu:20.04 AS wget-binaries -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-get update && apt-get install -y --no-install-recommends wget unzip xz-utils - -# Tool to convert output graph for inference -RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/convert_graphdef_memmapped_format.linux.amd64.zip -O temp.zip && \ - unzip temp.zip && \ - rm temp.zip - -RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/native_client.tf.Linux.tar.xz -O temp.tar.xz && \ - tar -xf temp.tar.xz && \ - rm temp.tar.xz - - -FROM nvcr.io/nvidia/tensorflow:21.05-tf1-py3 -ENV DEBIAN_FRONTEND=noninteractive - -# We need to purge python3-xdg because -# it's breaking STT install later with -# errors about setuptools -# -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - git \ - wget \ - libopus0 \ - libopusfile0 \ - libsndfile1 \ - sox \ - libsox-fmt-mp3 && \ - apt-get purge -y python3-xdg && \ - rm -rf /var/lib/apt/lists/ - -# Make sure pip and its dependencies are up-to-date -RUN pip3 install --upgrade pip wheel setuptools - -WORKDIR /code - -COPY native_client /code/native_client -COPY .git /code/.git -COPY training/coqui_stt_training/VERSION /code/training/coqui_stt_training/VERSION -COPY training/coqui_stt_training/GRAPH_VERSION /code/training/coqui_stt_training/GRAPH_VERSION - -# Build CTC decoder first, to avoid clashes on incompatible versions upgrades -RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings -RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl - -COPY setup.py /code/setup.py -COPY VERSION /code/VERSION -COPY training /code/training -# 
Copy files from previous build stages -RUN mkdir -p /code/kenlm/build/ -COPY --from=kenlm-build /code/kenlm/build/bin /code/kenlm/build/bin -COPY --from=wget-binaries /convert_graphdef_memmapped_format /code/convert_graphdef_memmapped_format -COPY --from=wget-binaries /generate_scorer_package /code/generate_scorer_package - -# Install STT -# No need for the decoder since we did it earlier -# TensorFlow GPU should already be installed on the base image, -# and we don't want to break that -RUN DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --upgrade -e . - -# Copy rest of the code and test training -COPY . /code -#RUN ./bin/run-ldc93s1.sh -RUN rm -rf ~/.local/share/stt - -### START OVH THINGS -## -# - -RUN apt-get update && apt-get install -y \ - man \ - vim \ - nano \ - htop \ - curl \ - wget \ - rsync \ - ca-certificates \ - git \ - zip \ - procps \ - ssh \ - supervisor \ - gettext-base \ - less \ - nodejs \ - npm \ - && rm -rf /var/lib/apt/lists/* - -RUN pip3 install jupyterlab ipywidgets -RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager -RUN jupyter nbextension enable --py widgetsnbextension #enable ipywidgets -RUN jupyter labextension install jupyterlab-plotly - -EXPOSE 8080 - -ADD start.sh / - -WORKDIR /workspace -RUN chown -R 42420:42420 /workspace - -ENTRYPOINT ["/start.sh"] +CMD ["bash", "-c", "jupyter notebook --notebook-dir=/home --ip 0.0.0.0 --no-browser --allow-root"] diff --git a/train-ldc.ipynb b/train-ldc.ipynb new file mode 100644 index 00000000..78a94c2f --- /dev/null +++ b/train-ldc.ipynb @@ -0,0 +1,46 @@ +import os +import sys + +import pandas +from STT.training.coqui_stt_training.util.downloader import maybe_download +#from STT.bin.import_ldc93s1 import _download_and_preprocess_data as download_data + +#download_data('/home/STT/data') + +def download_and_preprocess_data(data_dir): + # Conditionally download data + LDC93S1_BASE = "LDC93S1" + LDC93S1_BASE_URL = "https://catalog.ldc.upenn.edu/desc/addenda/" + local_file = 
maybe_download( + LDC93S1_BASE + ".wav", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".wav" + ) + trans_file = maybe_download( + LDC93S1_BASE + ".txt", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".txt" + ) + with open(trans_file, "r") as fin: + transcript = " ".join(fin.read().strip().lower().split(" ")[2:]).replace( + ".", "" + ) + + df = pandas.DataFrame( + data=[(os.path.abspath(local_file), os.path.getsize(local_file), transcript)], + columns=["wav_filename", "wav_filesize", "transcript"], + ) + df.to_csv(os.path.join(data_dir, "ldc93s1.csv"), index=False) + +download_and_preprocess_data('/home/STT/data') + + +from STT.training.coqui_stt_training.train import train, early_training_checks +from STT.training.coqui_stt_training.util.config import initialize_globals + +#Config.train_files=['/home/STT/data/ldc.csv'] +#Config.dev_files=['/home/STT/data/ldc.csv'] +#Config.test_files=['/home/STT/data/ldc.csv'] + +#Config.alphabet_config_path='/home/STT/data/alphabet.txt' +initialize_globals() + +early_training_checks() + +train() From 59e32556a4633a3a941f4a5c1c752bcbb12b1a28 Mon Sep 17 00:00:00 2001 From: Josh Meyer Date: Tue, 20 Jul 2021 09:07:54 -0400 Subject: [PATCH 5/9] Currently working notebook --- train-ldc.ipynb | 191 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 183 insertions(+), 8 deletions(-) diff --git a/train-ldc.ipynb b/train-ldc.ipynb index 78a94c2f..5f7a1168 100644 --- a/train-ldc.ipynb +++ b/train-ldc.ipynb @@ -1,3 +1,5 @@ +# Download LDC data + import os import sys @@ -30,17 +32,190 @@ def download_and_preprocess_data(data_dir): download_and_preprocess_data('/home/STT/data') +# Train -from STT.training.coqui_stt_training.train import train, early_training_checks -from STT.training.coqui_stt_training.util.config import initialize_globals +from STT.training.coqui_stt_training.util.config import _SttConfig, _ConfigSingleton +from STT.training.coqui_stt_training.util.augmentations import parse_augmentations, NormalizeSampleRate +from 
STT.training.coqui_stt_training.util.helpers import parse_file_size +from STT.training.coqui_stt_training.util.gpu import get_available_gpus +from coqui_stt_ctcdecoder import Alphabet +from xdg import BaseDirectory as xdg +import tensorflow.compat.v1 as tfv1 -#Config.train_files=['/home/STT/data/ldc.csv'] -#Config.dev_files=['/home/STT/data/ldc.csv'] -#Config.test_files=['/home/STT/data/ldc.csv'] +def initialize_globals(c): -#Config.alphabet_config_path='/home/STT/data/alphabet.txt' -initialize_globals() + # Augmentations + c.augmentations = parse_augmentations(c.augment) + print(f"Parsed augmentations from flags: {c.augmentations}") + if c.augmentations and c.feature_cache and c.cache_for_epochs == 0: + print( + "Due to current feature-cache settings the exact same sample augmentations of the first " + "epoch will be repeated on all following epochs. This could lead to unintended over-fitting. " + "You could use --cache_for_epochs to invalidate the cache after a given number of epochs." + ) + if c.normalize_sample_rate: + c.augmentations = [NormalizeSampleRate(c.audio_sample_rate)] + c[ + "augmentations" + ] + + # Caching + if c.cache_for_epochs == 1: + print( + "--cache_for_epochs == 1 is (re-)creating the feature cache on every epoch but will never use it." 
+ ) + + # Read-buffer + c.read_buffer = parse_file_size(c.read_buffer) + + # Set default dropout rates + if c.dropout_rate2 < 0: + c.dropout_rate2 = c.dropout_rate + if c.dropout_rate3 < 0: + c.dropout_rate3 = c.dropout_rate + if c.dropout_rate6 < 0: + c.dropout_rate6 = c.dropout_rate + + # Set default checkpoint dir + if not c.checkpoint_dir: + c.checkpoint_dir = xdg.save_data_path(os.path.join("stt", "checkpoints")) + + if c.load_train not in ["last", "best", "init", "auto"]: + c.load_train = "auto" + + if c.load_evaluate not in ["last", "best", "auto"]: + c.load_evaluate = "auto" + + # Set default summary dir + if not c.summary_dir: + c.summary_dir = xdg.save_data_path(os.path.join("stt", "summaries")) + + # Standard session configuration that'll be used for all new sessions. + c.session_config = tfv1.ConfigProto( + allow_soft_placement=True, + log_device_placement=c.log_placement, + inter_op_parallelism_threads=c.inter_op_parallelism_threads, + intra_op_parallelism_threads=c.intra_op_parallelism_threads, + gpu_options=tfv1.GPUOptions(allow_growth=c.use_allow_growth), + ) + + # CPU device + c.cpu_device = "/cpu:0" + + # Available GPU devices + c.available_devices = get_available_gpus(c.session_config) + + # If there is no GPU available, we fall back to CPU based operation + if not c.available_devices: + c.available_devices = [c.cpu_device] + + c.alphabet_config_path="" + + if c.bytes_output_mode: + c.alphabet = UTF8Alphabet() + elif c.alphabet_config_path: + c.alphabet = Alphabet(os.path.abspath(c.alphabet_config_path)) + + # Geometric Constants + # =================== + + # For an explanation of the meaning of the geometric constants, please refer to + # doc/Geometry.md + + # Number of MFCC features + c.n_input = 26 # TODO: Determine this programmatically from the sample rate + + # The number of frames in the context + c.n_context = 9 # TODO: Determine the optimal value using a validation data set + + # Number of units in hidden layers + c.n_hidden = c.n_hidden 
+ + c.n_hidden_1 = c.n_hidden + + c.n_hidden_2 = c.n_hidden + + c.n_hidden_5 = c.n_hidden + + # LSTM cell state dimension + c.n_cell_dim = c.n_hidden + + # The number of units in the third layer, which feeds in to the LSTM + c.n_hidden_3 = c.n_cell_dim + + # Units in the sixth layer = number of characters in the target language plus one + try: + c.n_hidden_6 = c.alphabet.GetSize() + 1 # +1 for CTC blank label + except: + AttributeError + + # Size of audio window in samples + if (c.feature_win_len * c.audio_sample_rate) % 1000 != 0: + log_error( + "--feature_win_len value ({}) in milliseconds ({}) multiplied " + "by --audio_sample_rate value ({}) must be an integer value. Adjust " + "your --feature_win_len value or resample your audio accordingly." + "".format(c.feature_win_len, c.feature_win_len / 1000, c.audio_sample_rate) + ) + sys.exit(1) + + c.audio_window_samples = c.audio_sample_rate * (c.feature_win_len / 1000) + + # Stride for feature computations in samples + if (c.feature_win_step * c.audio_sample_rate) % 1000 != 0: + log_error( + "--feature_win_step value ({}) in milliseconds ({}) multiplied " + "by --audio_sample_rate value ({}) must be an integer value. Adjust " + "your --feature_win_step value or resample your audio accordingly." + "".format( + c.feature_win_step, c.feature_win_step / 1000, c.audio_sample_rate + ) + ) + sys.exit(1) + + c.audio_step_samples = c.audio_sample_rate * (c.feature_win_step / 1000) + + if c.one_shot_infer: + if not path_exists_remote(c.one_shot_infer): + log_error("Path specified in --one_shot_infer is not a valid file.") + sys.exit(1) + + if c.train_cudnn and c.load_cudnn: + log_error( + "Trying to use --train_cudnn, but --load_cudnn " + "was also specified. The --load_cudnn flag is only " + "needed when converting a CuDNN RNN checkpoint to " + "a CPU-capable graph. If your system is capable of " + "using CuDNN RNN, you can just specify the CuDNN RNN " + "checkpoint normally with --save_checkpoint_dir." 
+ ) + sys.exit(1) + + # If separate save and load flags were not specified, default to load and save + # from the same dir. + if not c.save_checkpoint_dir: + c.save_checkpoint_dir = c.checkpoint_dir + + if not c.load_checkpoint_dir: + c.load_checkpoint_dir = c.checkpoint_dir + + _ConfigSingleton._config = c # pylint: disable=protected-access + +from STT.training.coqui_stt_training.train import train, test, early_training_checks + +Config = _SttConfig() + +Config.alphabet = Alphabet('/home/STT/data/alphabet.txt') +Config.train_files=['/home/STT/data/ldc93s1.csv'] +Config.dev_files=['/home/STT/data/ldc93s1.csv'] +Config.test_files=['/home/STT/data/ldc93s1.csv'] +Config.n_hidden=100 +Config.epochs=200 + +initialize_globals(Config) + +#print(Config.to_json()) early_training_checks() - train() +tfv1.reset_default_graph() +test() From 9f7fda14cbce1f3848eda9d34d826deb91edc74c Mon Sep 17 00:00:00 2001 From: Josh Meyer Date: Fri, 23 Jul 2021 12:12:02 -0400 Subject: [PATCH 6/9] Add first Jupyter notebook --- Dockerfile.train.jupyter | 10 +- notebooks/train-ldc.ipynb | 253 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 258 insertions(+), 5 deletions(-) create mode 100644 notebooks/train-ldc.ipynb diff --git a/Dockerfile.train.jupyter b/Dockerfile.train.jupyter index 71f3e466..a11d28b2 100644 --- a/Dockerfile.train.jupyter +++ b/Dockerfile.train.jupyter @@ -1,13 +1,13 @@ # This is a Dockerfile useful for training models with Coqui STT in Jupyter notebooks -FROM ghcr.io/coqui-ai/stt-train:v0.10.0-alpha.9 +FROM ghcr.io/coqui-ai/stt-train:v0.10.0-alpha.10 + +COPY notebooks /code/notebooks +WORKDIR /code/notebooks RUN python3 -m pip install --no-cache-dir jupyter jupyter_http_over_ws RUN jupyter serverextension enable --py jupyter_http_over_ws -RUN mv /code /home/STT -WORKDIR /home - EXPOSE 8888 -CMD ["bash", "-c", "jupyter notebook --notebook-dir=/home --ip 0.0.0.0 --no-browser --allow-root"] +CMD ["bash", "-c", "jupyter notebook --notebook-dir=/code/notebooks --ip 
0.0.0.0 --no-browser --allow-root"] diff --git a/notebooks/train-ldc.ipynb b/notebooks/train-ldc.ipynb new file mode 100644 index 00000000..895785dd --- /dev/null +++ b/notebooks/train-ldc.ipynb @@ -0,0 +1,253 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f79d99ef", + "metadata": {}, + "source": [ + "# Train your first 🐸 STT model 💫\n", + "\n", + "👋 Hello and welcome to Coqui (🐸) STT \n", + "\n", + "The goal of this notebook is to show you a **typical workflow** for **training** and **testing** an STT model with 🐸.\n", + "\n", + "Let's train a very small model on a very small amount of data so we can iterate quickly.\n", + "\n", + "In this notebook, we will:\n", + "\n", + "1. Download data and format it for 🐸 STT.\n", + "2. Configure the training and testing runs.\n", + "3. Train a new model.\n", + "4. Test the model and display its performance.\n", + "\n", + "So, let's jump right in!\n", + "\n", + "*PS - If you just want a working, off-the-shelf model, check out the [🐸 Model Zoo](https://www.coqui.ai/models)*" + ] + }, + { + "cell_type": "markdown", + "id": "be5fe49c", + "metadata": {}, + "source": [ + "## ✅ Download & format sample data for English\n", + "\n", + "**First things first**: we need some data.\n", + "\n", + "We're training a Speech-to-Text model, so we need some _speech_ and we need some _text_. Specifically, we want _transcribed speech_. Let's download an English audio file and its transcript and then format them for 🐸 STT. \n", + "\n", + "🐸 STT expects to find information about your data in a CSV file, where each line contains:\n", + "\n", + "1. the **path** to an audio file\n", + "2. the **size** of that audio file\n", + "3. the **transcript** of that audio file.\n", + "\n", + "Formatting the audio and transcript isn't too difficult in this case. We define a custom data importer called `download_sample_data()` which does all the work.
If you have a custom dataset, you will probably want to write a custom data importer.\n", + "\n", + "**Second things second**: we want an alphabet. The output layer of a typical* 🐸 STT model represents letters in the alphabet, and you should specify this alphabet before training. Let's download an English alphabet from Coqui and use that.\n", + "\n", + "_*If you are working with languages with large character sets (e.g. Chinese), you can set `bytes_output_mode=True` instead of supplying an `alphabet.txt` file. In this case, the output layer of the STT model will correspond to individual UTF-8 bytes instead of individual characters._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53945462", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "### Download sample data\n", + "import os\n", + "import pandas\n", + "from coqui_stt_training.util.downloader import maybe_download\n", + "\n", + "def download_sample_data():\n", + " data_dir=\"english/\"\n", + " # Download data + alphabet\n", + " audio_file = maybe_download(\"LDC93S1.wav\", data_dir, \"https://catalog.ldc.upenn.edu/desc/addenda/LDC93S1.wav\")\n", + " transcript_file = maybe_download(\"LDC93S1.txt\", data_dir, \"https://catalog.ldc.upenn.edu/desc/addenda/LDC93S1.txt\")\n", + " alphabet = maybe_download(\"alphabet.txt\", data_dir, \"https://raw.githubusercontent.com/coqui-ai/STT/main/data/alphabet.txt\")\n", + " # Format data\n", + " with open(transcript_file, \"r\") as fin:\n", + " transcript = \" \".join(fin.read().strip().lower().split(\" \")[2:]).replace(\".\", \"\")\n", + " df = pandas.DataFrame(data=[(os.path.abspath(audio_file), os.path.getsize(audio_file), transcript)],\n", + " columns=[\"wav_filename\", \"wav_filesize\", \"transcript\"])\n", + " # Save formatted CSV \n", + " df.to_csv(os.path.join(data_dir, \"ldc93s1.csv\"), index=False)\n", + "\n", + "# Download and format data\n", + "download_sample_data()" + ] + }, + { + "cell_type": "markdown", + "id": 
"96e8b708", + "metadata": {}, + "source": [ + "### Take a look at the data (*Optional* )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa2aec77", + "metadata": {}, + "outputs": [], + "source": [ + "csv_file = open(\"english/ldc93s1.csv\", \"r\")\n", + "print(csv_file.read())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c046277", + "metadata": {}, + "outputs": [], + "source": [ + "alphabet_file = open(\"english/alphabet.txt\", \"r\")\n", + "print(alphabet_file.read())" + ] + }, + { + "cell_type": "markdown", + "id": "d9dfac21", + "metadata": {}, + "source": [ + "## ✅ Configure & set hyperparameters\n", + "\n", + "Coqui STT comes with a long list of hyperparameters you can tweak. We've set default values, but you will often want to set your own. You can use `initialize_globals_from_args()` to do this. \n", + "\n", + "You must **always** configure the paths to your data, and you must **always** configure your alphabet. Additionally, here we show how you can specify the size of hidden layers (`n_hidden`), the number of epochs to train for (`epochs`), and to initialize a new model from scratch (`load_train=\"init\"`)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d264fdec", + "metadata": {}, + "outputs": [], + "source": [ + "from coqui_stt_training.util.config import initialize_globals_from_args\n", + "\n", + "initialize_globals_from_args(\n", + " alphabet_config_path=\"english/alphabet.txt\",\n", + " train_files=[\"english/ldc93s1.csv\"],\n", + " dev_files=[\"english/ldc93s1.csv\"],\n", + " test_files=[\"english/ldc93s1.csv\"],\n", + " load_train=\"init\",\n", + " n_hidden=100,\n", + " epochs=200,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "799c1425", + "metadata": {}, + "source": [ + "### View all Config settings (*Optional*) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03b33d2b", + "metadata": {}, + "outputs": [], + "source": [ + "from coqui_stt_training.util.config import Config\n", + "\n", + "# Take a peek at the entire Config\n", + "print(Config.to_json())" + ] + }, + { + "cell_type": "markdown", + "id": "ae82fd75", + "metadata": {}, + "source": [ + "## ✅ Train a new model\n", + "\n", + "Let's kick off a training run 🚀🚀🚀 (using the configuration you set above).\n", + "\n", + "This notebook should work on either a GPU or a CPU. However, in case you're running this on _multiple_ GPUs we want to only use one, because the sample dataset (one audio file) is too small to split across multiple GPUs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "550a504e", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "from coqui_stt_training.train import train, early_training_checks\n", + "import tensorflow.compat.v1 as tfv1\n", + "\n", + "# use maximum one GPU\n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", + "\n", + "early_training_checks()\n", + "\n", + "tfv1.reset_default_graph()\n", + "train()" + ] + }, + { + "cell_type": "markdown", + "id": "9f6dc959", + "metadata": {}, + "source": [ + "## ✅ Test the model\n", + "\n", + "We made it!
🙌\n", + "\n", + "Let's kick off the testing run, which displays performance metrics.\n", + "\n", + "We're committing the cardinal sin of ML 😈 (aka - testing on our training data) so you don't want to deploy this model into production. In this notebook we're focusing on the workflow itself, so it's forgivable 😇\n", + "\n", + "You can see from the test output that our tiny model has overfit to the data, and basically memorized this one sentence.\n", + "\n", + "When you start training your own models, make sure your testing data doesn't include your training data 😅" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd42bc7a", + "metadata": {}, + "outputs": [], + "source": [ + "from coqui_stt_training.train import test\n", + "\n", + "tfv1.reset_default_graph()\n", + "test()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From ea82ab4cb822751a19b6fe493d47c5ac745064dd Mon Sep 17 00:00:00 2001 From: Josh Meyer Date: Fri, 23 Jul 2021 12:15:12 -0400 Subject: [PATCH 7/9] Remove old unneeded files --- start.sh | 4 - tfenv.yml | 16 ---- train-ldc.ipynb | 221 ------------------------------------------------ 3 files changed, 241 deletions(-) delete mode 100755 start.sh delete mode 100644 tfenv.yml delete mode 100644 train-ldc.ipynb diff --git a/start.sh b/start.sh deleted file mode 100755 index 05476bc9..00000000 --- a/start.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash -set -eu - -jupyter lab --ip=0.0.0.0 --port=8080 --allow-root diff --git a/tfenv.yml b/tfenv.yml deleted file mode 100644 index d489c7f3..00000000 --- a/tfenv.yml +++ /dev/null @@ -1,16 +0,0 @@ -name: tf1 
-channels: - - conda-forge -dependencies: - - python=3.7 - - tensorflow-gpu==1.15 - - ipykernel - - google-auth - - tensorflow-hub - - pydicom - - pandas - - seaborn - - matplotlib - - scikit-learn - - openslide - - keras diff --git a/train-ldc.ipynb b/train-ldc.ipynb deleted file mode 100644 index 5f7a1168..00000000 --- a/train-ldc.ipynb +++ /dev/null @@ -1,221 +0,0 @@ -# Download LDC data - -import os -import sys - -import pandas -from STT.training.coqui_stt_training.util.downloader import maybe_download -#from STT.bin.import_ldc93s1 import _download_and_preprocess_data as download_data - -#download_data('/home/STT/data') - -def download_and_preprocess_data(data_dir): - # Conditionally download data - LDC93S1_BASE = "LDC93S1" - LDC93S1_BASE_URL = "https://catalog.ldc.upenn.edu/desc/addenda/" - local_file = maybe_download( - LDC93S1_BASE + ".wav", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".wav" - ) - trans_file = maybe_download( - LDC93S1_BASE + ".txt", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".txt" - ) - with open(trans_file, "r") as fin: - transcript = " ".join(fin.read().strip().lower().split(" ")[2:]).replace( - ".", "" - ) - - df = pandas.DataFrame( - data=[(os.path.abspath(local_file), os.path.getsize(local_file), transcript)], - columns=["wav_filename", "wav_filesize", "transcript"], - ) - df.to_csv(os.path.join(data_dir, "ldc93s1.csv"), index=False) - -download_and_preprocess_data('/home/STT/data') - -# Train - -from STT.training.coqui_stt_training.util.config import _SttConfig, _ConfigSingleton -from STT.training.coqui_stt_training.util.augmentations import parse_augmentations, NormalizeSampleRate -from STT.training.coqui_stt_training.util.helpers import parse_file_size -from STT.training.coqui_stt_training.util.gpu import get_available_gpus -from coqui_stt_ctcdecoder import Alphabet -from xdg import BaseDirectory as xdg -import tensorflow.compat.v1 as tfv1 - -def initialize_globals(c): - - # Augmentations - c.augmentations = 
parse_augmentations(c.augment) - print(f"Parsed augmentations from flags: {c.augmentations}") - if c.augmentations and c.feature_cache and c.cache_for_epochs == 0: - print( - "Due to current feature-cache settings the exact same sample augmentations of the first " - "epoch will be repeated on all following epochs. This could lead to unintended over-fitting. " - "You could use --cache_for_epochs to invalidate the cache after a given number of epochs." - ) - - if c.normalize_sample_rate: - c.augmentations = [NormalizeSampleRate(c.audio_sample_rate)] + c[ - "augmentations" - ] - - # Caching - if c.cache_for_epochs == 1: - print( - "--cache_for_epochs == 1 is (re-)creating the feature cache on every epoch but will never use it." - ) - - # Read-buffer - c.read_buffer = parse_file_size(c.read_buffer) - - # Set default dropout rates - if c.dropout_rate2 < 0: - c.dropout_rate2 = c.dropout_rate - if c.dropout_rate3 < 0: - c.dropout_rate3 = c.dropout_rate - if c.dropout_rate6 < 0: - c.dropout_rate6 = c.dropout_rate - - # Set default checkpoint dir - if not c.checkpoint_dir: - c.checkpoint_dir = xdg.save_data_path(os.path.join("stt", "checkpoints")) - - if c.load_train not in ["last", "best", "init", "auto"]: - c.load_train = "auto" - - if c.load_evaluate not in ["last", "best", "auto"]: - c.load_evaluate = "auto" - - # Set default summary dir - if not c.summary_dir: - c.summary_dir = xdg.save_data_path(os.path.join("stt", "summaries")) - - # Standard session configuration that'll be used for all new sessions. 
- c.session_config = tfv1.ConfigProto( - allow_soft_placement=True, - log_device_placement=c.log_placement, - inter_op_parallelism_threads=c.inter_op_parallelism_threads, - intra_op_parallelism_threads=c.intra_op_parallelism_threads, - gpu_options=tfv1.GPUOptions(allow_growth=c.use_allow_growth), - ) - - # CPU device - c.cpu_device = "/cpu:0" - - # Available GPU devices - c.available_devices = get_available_gpus(c.session_config) - - # If there is no GPU available, we fall back to CPU based operation - if not c.available_devices: - c.available_devices = [c.cpu_device] - - c.alphabet_config_path="" - - if c.bytes_output_mode: - c.alphabet = UTF8Alphabet() - elif c.alphabet_config_path: - c.alphabet = Alphabet(os.path.abspath(c.alphabet_config_path)) - - # Geometric Constants - # =================== - - # For an explanation of the meaning of the geometric constants, please refer to - # doc/Geometry.md - - # Number of MFCC features - c.n_input = 26 # TODO: Determine this programmatically from the sample rate - - # The number of frames in the context - c.n_context = 9 # TODO: Determine the optimal value using a validation data set - - # Number of units in hidden layers - c.n_hidden = c.n_hidden - - c.n_hidden_1 = c.n_hidden - - c.n_hidden_2 = c.n_hidden - - c.n_hidden_5 = c.n_hidden - - # LSTM cell state dimension - c.n_cell_dim = c.n_hidden - - # The number of units in the third layer, which feeds in to the LSTM - c.n_hidden_3 = c.n_cell_dim - - # Units in the sixth layer = number of characters in the target language plus one - try: - c.n_hidden_6 = c.alphabet.GetSize() + 1 # +1 for CTC blank label - except: - AttributeError - - # Size of audio window in samples - if (c.feature_win_len * c.audio_sample_rate) % 1000 != 0: - log_error( - "--feature_win_len value ({}) in milliseconds ({}) multiplied " - "by --audio_sample_rate value ({}) must be an integer value. Adjust " - "your --feature_win_len value or resample your audio accordingly." 
- "".format(c.feature_win_len, c.feature_win_len / 1000, c.audio_sample_rate) - ) - sys.exit(1) - - c.audio_window_samples = c.audio_sample_rate * (c.feature_win_len / 1000) - - # Stride for feature computations in samples - if (c.feature_win_step * c.audio_sample_rate) % 1000 != 0: - log_error( - "--feature_win_step value ({}) in milliseconds ({}) multiplied " - "by --audio_sample_rate value ({}) must be an integer value. Adjust " - "your --feature_win_step value or resample your audio accordingly." - "".format( - c.feature_win_step, c.feature_win_step / 1000, c.audio_sample_rate - ) - ) - sys.exit(1) - - c.audio_step_samples = c.audio_sample_rate * (c.feature_win_step / 1000) - - if c.one_shot_infer: - if not path_exists_remote(c.one_shot_infer): - log_error("Path specified in --one_shot_infer is not a valid file.") - sys.exit(1) - - if c.train_cudnn and c.load_cudnn: - log_error( - "Trying to use --train_cudnn, but --load_cudnn " - "was also specified. The --load_cudnn flag is only " - "needed when converting a CuDNN RNN checkpoint to " - "a CPU-capable graph. If your system is capable of " - "using CuDNN RNN, you can just specify the CuDNN RNN " - "checkpoint normally with --save_checkpoint_dir." - ) - sys.exit(1) - - # If separate save and load flags were not specified, default to load and save - # from the same dir. 
- if not c.save_checkpoint_dir: - c.save_checkpoint_dir = c.checkpoint_dir - - if not c.load_checkpoint_dir: - c.load_checkpoint_dir = c.checkpoint_dir - - _ConfigSingleton._config = c # pylint: disable=protected-access - -from STT.training.coqui_stt_training.train import train, test, early_training_checks - -Config = _SttConfig() - -Config.alphabet = Alphabet('/home/STT/data/alphabet.txt') -Config.train_files=['/home/STT/data/ldc93s1.csv'] -Config.dev_files=['/home/STT/data/ldc93s1.csv'] -Config.test_files=['/home/STT/data/ldc93s1.csv'] -Config.n_hidden=100 -Config.epochs=200 - -initialize_globals(Config) - -#print(Config.to_json()) -early_training_checks() -train() -tfv1.reset_default_graph() -test() From 7d40d5d686e8c58fe37d8e65cd919fc94bd8c8c9 Mon Sep 17 00:00:00 2001 From: Josh Meyer Date: Fri, 23 Jul 2021 12:16:26 -0400 Subject: [PATCH 8/9] Specify latest for base Coqui STT docker image --- Dockerfile.train.jupyter | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.train.jupyter b/Dockerfile.train.jupyter index a11d28b2..a92b44ca 100644 --- a/Dockerfile.train.jupyter +++ b/Dockerfile.train.jupyter @@ -1,6 +1,6 @@ # This is a Dockerfile useful for training models with Coqui STT in Jupyter notebooks -FROM ghcr.io/coqui-ai/stt-train:v0.10.0-alpha.10 +FROM ghcr.io/coqui-ai/stt-train:latest COPY notebooks /code/notebooks WORKDIR /code/notebooks From 31199116574298a5ba5ba4afff4402785c6cf72d Mon Sep 17 00:00:00 2001 From: Josh Meyer Date: Fri, 23 Jul 2021 12:17:55 -0400 Subject: [PATCH 9/9] Next core Coqui STT docker image will have notebooks dir --- Dockerfile.train.jupyter | 1 - 1 file changed, 1 deletion(-) diff --git a/Dockerfile.train.jupyter b/Dockerfile.train.jupyter index a92b44ca..5fa680ec 100644 --- a/Dockerfile.train.jupyter +++ b/Dockerfile.train.jupyter @@ -2,7 +2,6 @@ FROM ghcr.io/coqui-ai/stt-train:latest -COPY notebooks /code/notebooks WORKDIR /code/notebooks RUN python3 -m pip install --no-cache-dir jupyter 
jupyter_http_over_ws