From f062f75e172d13489d8d790e3f6a52a3365ba80d Mon Sep 17 00:00:00 2001
From: Josh Meyer <joshua.richard.meyer@gmail.com>
Date: Fri, 16 Jul 2021 09:24:00 -0400
Subject: [PATCH 1/9] working on dockerfile with jupyter support

---
 Dockerfile.train.jupyter        | 198 ++++++++++++++++++++++++++++++++
 Dockerfile.train.jupyter.simple |  97 ++++++++++++++++
 start.sh                        |   9 ++
 tfenv.yml                       |  16 +++
 4 files changed, 320 insertions(+)
 create mode 100644 Dockerfile.train.jupyter
 create mode 100644 Dockerfile.train.jupyter.simple
 create mode 100644 start.sh
 create mode 100644 tfenv.yml

diff --git a/Dockerfile.train.jupyter b/Dockerfile.train.jupyter
new file mode 100644
index 00000000..09c6ed79
--- /dev/null
+++ b/Dockerfile.train.jupyter
@@ -0,0 +1,198 @@
+# This is a Dockerfile useful for training models with Coqui STT in Jupyter notebooks
+
+FROM ubuntu:20.04 AS kenlm-build
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    build-essential cmake libboost-system-dev \
+    libboost-thread-dev libboost-program-options-dev \
+    libboost-test-dev libeigen3-dev zlib1g-dev \
+    libbz2-dev liblzma-dev && \
+    rm -rf /var/lib/apt/lists/*
+
+# Build KenLM to generate new scorers
+WORKDIR /code
+COPY kenlm /code/kenlm
+RUN cd /code/kenlm && \
+    mkdir -p build && \
+    cd build && \
+    cmake .. && \
+    make -j $(nproc) || \
+    ( echo "ERROR: Failed to build KenLM."; \
+    echo "ERROR: Make sure you update the kenlm submodule on host before building this Dockerfile."; \
+    echo "ERROR: $ cd STT; git submodule update --init kenlm"; \
+    exit 1; )
+
+
+FROM ubuntu:20.04 AS wget-binaries
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && apt-get install -y --no-install-recommends wget unzip xz-utils
+
+# Tool to convert output graph for inference
+RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/convert_graphdef_memmapped_format.linux.amd64.zip -O temp.zip && \
+    unzip temp.zip && \
+    rm temp.zip
+
+RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/native_client.tf.Linux.tar.xz -O temp.tar.xz && \
+    tar -xf temp.tar.xz && \
+    rm temp.tar.xz
+
+
+FROM nvcr.io/nvidia/tensorflow:21.05-tf1-py3
+ENV DEBIAN_FRONTEND=noninteractive
+
+# We need to purge python3-xdg because
+# it's breaking STT install later with
+# errors about setuptools
+#
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        git \
+        wget \
+        libopus0 \
+        libopusfile0 \
+        libsndfile1 \
+        sox \
+        libsox-fmt-mp3 && \
+    apt-get purge -y python3-xdg && \
+    rm -rf /var/lib/apt/lists/
+
+# Make sure pip and its dependencies are up-to-date
+RUN pip3 install --upgrade pip wheel setuptools
+
+WORKDIR /code
+
+COPY native_client /code/native_client
+COPY .git /code/.git
+COPY training/coqui_stt_training/VERSION /code/training/coqui_stt_training/VERSION
+COPY training/coqui_stt_training/GRAPH_VERSION /code/training/coqui_stt_training/GRAPH_VERSION
+
+# Build CTC decoder first, to avoid clashes on incompatible versions upgrades
+RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings
+RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl
+
+COPY setup.py /code/setup.py
+COPY VERSION /code/VERSION
+COPY training /code/training
+# Copy files from previous build stages
+RUN mkdir -p /code/kenlm/build/
+COPY --from=kenlm-build /code/kenlm/build/bin /code/kenlm/build/bin
+COPY --from=wget-binaries /convert_graphdef_memmapped_format /code/convert_graphdef_memmapped_format
+COPY --from=wget-binaries /generate_scorer_package /code/generate_scorer_package
+
+# Install STT
+# No need for the decoder since we did it earlier
+# TensorFlow GPU should already be installed on the base image,
+# and we don't want to break that
+RUN DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --upgrade -e .
+
+# Copy rest of the code and test training
+COPY . /code
+#RUN ./bin/run-ldc93s1.sh
+RUN rm -rf ~/.local/share/stt
+
+### START OVH THINGS
+##
+#
+
+# FROM gcr.io/kaggle-gpu-images/python:v98
+
+RUN chsh -s /bin/bash
+ENV SHELL=/bin/bash
+RUN rm /bin/sh && ln -s /bin/bash /bin/sh
+
+RUN apt-get update && apt-get install -y \
+    man \
+    vim \
+    nano \
+    htop \
+    curl \
+    wget \
+    rsync \
+    ca-certificates \
+    git \
+    zip \
+    procps \
+    ssh \
+    supervisor \
+    gettext-base \
+    less \
+    && rm -rf /var/lib/apt/lists/*
+
+# install nvm
+# https://github.com/creationix/nvm#install-script
+RUN curl --silent -o- https://raw.githubusercontent.com/creationix/nvm/v0.33.11/install.sh | bash
+
+#ENV NVM_DIR /root/.nvm
+ENV NVM_DIR /usr/local/nvm
+ENV NODE_VERSION v12.20.1
+
+# install node and npm
+RUN source $NVM_DIR/nvm.sh \
+    && nvm install $NODE_VERSION \
+    && nvm alias default $NODE_VERSION \
+    && nvm use default
+
+# add node and npm to path so the commands are available
+ENV NODE_PATH $NVM_DIR/versions/node/$NODE_VERSION/bin
+ENV PATH $NODE_PATH:$PATH
+
+RUN pip install pip==20.3.4
+RUN pip install jupyterlab==2.2.9 ipywidgets==7.6.3
+RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager
+RUN jupyter nbextension enable --py widgetsnbextension #enable ipywidgets
+RUN jupyter labextension install jupyterlab-plotly@4.14.3
+
+#install xlrd python module
+RUN pip install pyradiomics
+RUN pip install xlrd==1.2.0
+RUN pip install zarr
+RUN pip install imbalanced-learn
+RUN pip install openpyxl 
+RUN pip install efficientnet-pytorch
+RUN pip install monai
+RUN pip install prince
+RUN pip install vit-pytorch
+RUN pip install lifelines==0.25.11
+RUN pip install timm==0.3.2
+RUN pip install keras-retinanet==1.0.0
+RUN python -m pip install histomicstk --find-links https://girder.github.io/large_image_wheels
+RUN pip install luminoth ipympl pysurvival missingpy pyinform pingouin pyAgrum missingno autoimpute networkx community yellowbrick factor_analyzer hdbscan pyitlib
+RUN pip install eli5 dtreeviz gower batchgenerators mlinsights efficientnet-pytorch pretrainedmodels
+# Add R to Jupyter Kernel
+RUN conda install -y -c r r-irkernel
+
+#Install survival, sm, ggplot2, Hmisc, mixOmics (ce dernier est en repositoire Bioconductor)
+
+RUN conda install -y -c cran r-survival 
+RUN conda install -y -c cran r-sm
+RUN conda install -y -c cran r-ggplot2 
+RUN conda install -y -c cran r-hmisc
+RUN conda install -y -c cran r-mixomics
+RUN conda install -y -c cran r-caret
+RUN conda install -y -c cran r-survminer
+RUN conda install -y -c cran r-ggfortify
+RUN conda install -y -c cran r-wordcloud
+RUN conda install -y -c cran r-tm
+RUN conda install -y -c cran r-prioritylasso
+RUN conda install -y -c cran r-blockforest
+RUN conda install -y -c cran r-mice
+
+#### tensorflow 1
+# Create the environment:
+SHELL ["/bin/bash", "-c"]
+COPY tfenv.yml .
+RUN conda env create -f tfenv.yml
+SHELL ["conda", "run", "-n", "tf1", "/bin/bash", "-c"]
+RUN python -m ipykernel install --name=tensorflow1
+
+EXPOSE 8080
+
+ADD start.sh /
+
+WORKDIR /workspace
+RUN chown -R 42420:42420 /workspace
+
+ENTRYPOINT ["/start.sh"]
diff --git a/Dockerfile.train.jupyter.simple b/Dockerfile.train.jupyter.simple
new file mode 100644
index 00000000..bf4f798f
--- /dev/null
+++ b/Dockerfile.train.jupyter.simple
@@ -0,0 +1,97 @@
+# This is a Dockerfile useful for training models with Coqui STT.
+# You can train "acoustic models" with audio + Tensorflow, and
+# you can create "scorers" with text + KenLM.
+
+FROM ubuntu:20.04 AS kenlm-build
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    build-essential cmake libboost-system-dev \
+    libboost-thread-dev libboost-program-options-dev \
+    libboost-test-dev libeigen3-dev zlib1g-dev \
+    libbz2-dev liblzma-dev && \
+    rm -rf /var/lib/apt/lists/*
+
+# Build KenLM to generate new scorers
+WORKDIR /code
+COPY kenlm /code/kenlm
+RUN cd /code/kenlm && \
+    mkdir -p build && \
+    cd build && \
+    cmake .. && \
+    make -j $(nproc) || \
+    ( echo "ERROR: Failed to build KenLM."; \
+    echo "ERROR: Make sure you update the kenlm submodule on host before building this Dockerfile."; \
+    echo "ERROR: $ cd STT; git submodule update --init kenlm"; \
+    exit 1; )
+
+
+FROM ubuntu:20.04 AS wget-binaries
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && apt-get install -y --no-install-recommends wget unzip xz-utils
+
+# Tool to convert output graph for inference
+RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/convert_graphdef_memmapped_format.linux.amd64.zip -O temp.zip && \
+    unzip temp.zip && \
+    rm temp.zip
+
+RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/native_client.tf.Linux.tar.xz -O temp.tar.xz && \
+    tar -xf temp.tar.xz && \
+    rm temp.tar.xz
+
+
+FROM jupyter/tensorflow-notebook
+ENV DEBIAN_FRONTEND=noninteractive
+USER root
+
+# We need to purge python3-xdg because
+# it's breaking STT install later with
+# errors about setuptools
+#
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        git \
+        wget \
+        libopus0 \
+        libopusfile0 \
+        libsndfile1 \
+        sox \
+        libsox-fmt-mp3 && \
+    apt-get purge -y python3-xdg && \
+    rm -rf /var/lib/apt/lists/
+
+# Make sure pip and its dependencies are up-to-date
+RUN pip3 install --upgrade pip wheel setuptools
+RUN pip3 uninstall -y tensorflow && pip3 install -y 'tensorflow-gpu==1.15.4'
+
+WORKDIR /code
+
+COPY native_client /code/native_client
+COPY .git /code/.git
+COPY training/coqui_stt_training/VERSION /code/training/coqui_stt_training/VERSION
+COPY training/coqui_stt_training/GRAPH_VERSION /code/training/coqui_stt_training/GRAPH_VERSION
+
+# Build CTC decoder first, to avoid clashes on incompatible versions upgrades
+RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings
+RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl
+
+COPY setup.py /code/setup.py
+COPY VERSION /code/VERSION
+COPY training /code/training
+# Copy files from previous build stages
+RUN mkdir -p /code/kenlm/build/
+COPY --from=kenlm-build /code/kenlm/build/bin /code/kenlm/build/bin
+COPY --from=wget-binaries /convert_graphdef_memmapped_format /code/convert_graphdef_memmapped_format
+COPY --from=wget-binaries /generate_scorer_package /code/generate_scorer_package
+
+# Install STT
+# No need for the decoder since we did it earlier
+# TensorFlow GPU should already be installed on the base image,
+# and we don't want to break that
+RUN DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --upgrade -e .
+
+# Copy rest of the code and test training
+COPY . /code
+RUN ./bin/run-ldc93s1.sh && rm -rf ~/.local/share/stt
diff --git a/start.sh b/start.sh
new file mode 100644
index 00000000..4ec5999e
--- /dev/null
+++ b/start.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+set -eu
+
+jupyter lab --ip=0.0.0.0 --port=8080 --no-browser --allow-root \
+  --LabApp.token='' \
+  --LabApp.custom_display_url=${JOB_URL_SCHEME}${JOB_ID}.${JOB_HOST} \
+  --LabApp.allow_remote_access=True \
+  --LabApp.allow_origin='*' \
+  --LabApp.disable_check_xsrf=True
\ No newline at end of file
diff --git a/tfenv.yml b/tfenv.yml
new file mode 100644
index 00000000..d489c7f3
--- /dev/null
+++ b/tfenv.yml
@@ -0,0 +1,16 @@
+name: tf1
+channels:
+  - conda-forge
+dependencies:
+  - python=3.7
+  - tensorflow-gpu==1.15
+  - ipykernel
+  - google-auth
+  - tensorflow-hub
+  - pydicom
+  - pandas
+  - seaborn 
+  - matplotlib 
+  - scikit-learn 
+  - openslide
+  - keras  

From 649bc535369ac48be47bef64ff97f0dd51aaf246 Mon Sep 17 00:00:00 2001
From: Josh Meyer <joshua.richard.meyer@gmail.com>
Date: Fri, 16 Jul 2021 11:14:00 -0400
Subject: [PATCH 2/9] Remove extra installs from Dockerfile

---
 Dockerfile.train.jupyter        | 77 +++++---------------------
 Dockerfile.train.jupyter.simple | 97 ---------------------------------
 start.sh                        |  0
 3 files changed, 13 insertions(+), 161 deletions(-)
 delete mode 100644 Dockerfile.train.jupyter.simple
 mode change 100644 => 100755 start.sh

diff --git a/Dockerfile.train.jupyter b/Dockerfile.train.jupyter
index 09c6ed79..1e2a0ff7 100644
--- a/Dockerfile.train.jupyter
+++ b/Dockerfile.train.jupyter
@@ -40,6 +40,7 @@ RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/downloa
     rm temp.tar.xz
 
 
+#FROM gcr.io/kaggle-gpu-images/python:v98
 FROM nvcr.io/nvidia/tensorflow:21.05-tf1-py3
 ENV DEBIAN_FRONTEND=noninteractive
 
@@ -97,7 +98,7 @@ RUN rm -rf ~/.local/share/stt
 ##
 #
 
-# FROM gcr.io/kaggle-gpu-images/python:v98
+#FROM gcr.io/kaggle-gpu-images/python:v98
 
 RUN chsh -s /bin/bash
 ENV SHELL=/bin/bash
@@ -119,74 +120,22 @@ RUN apt-get update && apt-get install -y \
     supervisor \
     gettext-base \
     less \
+    nodejs \
+    npm \
     && rm -rf /var/lib/apt/lists/*
 
-# install nvm
-# https://github.com/creationix/nvm#install-script
-RUN curl --silent -o- https://raw.githubusercontent.com/creationix/nvm/v0.33.11/install.sh | bash
-
-#ENV NVM_DIR /root/.nvm
-ENV NVM_DIR /usr/local/nvm
-ENV NODE_VERSION v12.20.1
-
-# install node and npm
-RUN source $NVM_DIR/nvm.sh \
-    && nvm install $NODE_VERSION \
-    && nvm alias default $NODE_VERSION \
-    && nvm use default
-
-# add node and npm to path so the commands are available
-ENV NODE_PATH $NVM_DIR/versions/node/$NODE_VERSION/bin
-ENV PATH $NODE_PATH:$PATH
-
-RUN pip install pip==20.3.4
-RUN pip install jupyterlab==2.2.9 ipywidgets==7.6.3
+RUN pip3 install jupyterlab ipywidgets
 RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager
 RUN jupyter nbextension enable --py widgetsnbextension #enable ipywidgets
-RUN jupyter labextension install jupyterlab-plotly@4.14.3
+RUN jupyter labextension install jupyterlab-plotly
 
-#install xlrd python module
-RUN pip install pyradiomics
-RUN pip install xlrd==1.2.0
-RUN pip install zarr
-RUN pip install imbalanced-learn
-RUN pip install openpyxl 
-RUN pip install efficientnet-pytorch
-RUN pip install monai
-RUN pip install prince
-RUN pip install vit-pytorch
-RUN pip install lifelines==0.25.11
-RUN pip install timm==0.3.2
-RUN pip install keras-retinanet==1.0.0
-RUN python -m pip install histomicstk --find-links https://girder.github.io/large_image_wheels
-RUN pip install luminoth ipympl pysurvival missingpy pyinform pingouin pyAgrum missingno autoimpute networkx community yellowbrick factor_analyzer hdbscan pyitlib
-RUN pip install eli5 dtreeviz gower batchgenerators mlinsights efficientnet-pytorch pretrainedmodels
-# Add R to Jupyter Kernel
-RUN conda install -y -c r r-irkernel
-
-#Install survival, sm, ggplot2, Hmisc, mixOmics (ce dernier est en repositoire Bioconductor)
-
-RUN conda install -y -c cran r-survival 
-RUN conda install -y -c cran r-sm
-RUN conda install -y -c cran r-ggplot2 
-RUN conda install -y -c cran r-hmisc
-RUN conda install -y -c cran r-mixomics
-RUN conda install -y -c cran r-caret
-RUN conda install -y -c cran r-survminer
-RUN conda install -y -c cran r-ggfortify
-RUN conda install -y -c cran r-wordcloud
-RUN conda install -y -c cran r-tm
-RUN conda install -y -c cran r-prioritylasso
-RUN conda install -y -c cran r-blockforest
-RUN conda install -y -c cran r-mice
-
-#### tensorflow 1
-# Create the environment:
-SHELL ["/bin/bash", "-c"]
-COPY tfenv.yml .
-RUN conda env create -f tfenv.yml
-SHELL ["conda", "run", "-n", "tf1", "/bin/bash", "-c"]
-RUN python -m ipykernel install --name=tensorflow1
+# #### tensorflow 1
+# # Create the environment:
+# SHELL ["/bin/bash", "-c"]
+# COPY tfenv.yml .
+# RUN conda env create -f tfenv.yml
+# SHELL ["conda", "run", "-n", "tf1", "/bin/bash", "-c"]
+# RUN python -m ipykernel install --name=tensorflow1
 
 EXPOSE 8080
 
diff --git a/Dockerfile.train.jupyter.simple b/Dockerfile.train.jupyter.simple
deleted file mode 100644
index bf4f798f..00000000
--- a/Dockerfile.train.jupyter.simple
+++ /dev/null
@@ -1,97 +0,0 @@
-# This is a Dockerfile useful for training models with Coqui STT.
-# You can train "acoustic models" with audio + Tensorflow, and
-# you can create "scorers" with text + KenLM.
-
-FROM ubuntu:20.04 AS kenlm-build
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-    build-essential cmake libboost-system-dev \
-    libboost-thread-dev libboost-program-options-dev \
-    libboost-test-dev libeigen3-dev zlib1g-dev \
-    libbz2-dev liblzma-dev && \
-    rm -rf /var/lib/apt/lists/*
-
-# Build KenLM to generate new scorers
-WORKDIR /code
-COPY kenlm /code/kenlm
-RUN cd /code/kenlm && \
-    mkdir -p build && \
-    cd build && \
-    cmake .. && \
-    make -j $(nproc) || \
-    ( echo "ERROR: Failed to build KenLM."; \
-    echo "ERROR: Make sure you update the kenlm submodule on host before building this Dockerfile."; \
-    echo "ERROR: $ cd STT; git submodule update --init kenlm"; \
-    exit 1; )
-
-
-FROM ubuntu:20.04 AS wget-binaries
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN apt-get update && apt-get install -y --no-install-recommends wget unzip xz-utils
-
-# Tool to convert output graph for inference
-RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/convert_graphdef_memmapped_format.linux.amd64.zip -O temp.zip && \
-    unzip temp.zip && \
-    rm temp.zip
-
-RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/native_client.tf.Linux.tar.xz -O temp.tar.xz && \
-    tar -xf temp.tar.xz && \
-    rm temp.tar.xz
-
-
-FROM jupyter/tensorflow-notebook
-ENV DEBIAN_FRONTEND=noninteractive
-USER root
-
-# We need to purge python3-xdg because
-# it's breaking STT install later with
-# errors about setuptools
-#
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-        git \
-        wget \
-        libopus0 \
-        libopusfile0 \
-        libsndfile1 \
-        sox \
-        libsox-fmt-mp3 && \
-    apt-get purge -y python3-xdg && \
-    rm -rf /var/lib/apt/lists/
-
-# Make sure pip and its dependencies are up-to-date
-RUN pip3 install --upgrade pip wheel setuptools
-RUN pip3 uninstall -y tensorflow && pip3 install -y 'tensorflow-gpu==1.15.4'
-
-WORKDIR /code
-
-COPY native_client /code/native_client
-COPY .git /code/.git
-COPY training/coqui_stt_training/VERSION /code/training/coqui_stt_training/VERSION
-COPY training/coqui_stt_training/GRAPH_VERSION /code/training/coqui_stt_training/GRAPH_VERSION
-
-# Build CTC decoder first, to avoid clashes on incompatible versions upgrades
-RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings
-RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl
-
-COPY setup.py /code/setup.py
-COPY VERSION /code/VERSION
-COPY training /code/training
-# Copy files from previous build stages
-RUN mkdir -p /code/kenlm/build/
-COPY --from=kenlm-build /code/kenlm/build/bin /code/kenlm/build/bin
-COPY --from=wget-binaries /convert_graphdef_memmapped_format /code/convert_graphdef_memmapped_format
-COPY --from=wget-binaries /generate_scorer_package /code/generate_scorer_package
-
-# Install STT
-# No need for the decoder since we did it earlier
-# TensorFlow GPU should already be installed on the base image,
-# and we don't want to break that
-RUN DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --upgrade -e .
-
-# Copy rest of the code and test training
-COPY . /code
-RUN ./bin/run-ldc93s1.sh && rm -rf ~/.local/share/stt
diff --git a/start.sh b/start.sh
old mode 100644
new mode 100755

From d0f8eb96cd833e633761c5196c62867a6f3d8b21 Mon Sep 17 00:00:00 2001
From: Josh Meyer <joshua.richard.meyer@gmail.com>
Date: Fri, 16 Jul 2021 11:52:51 -0400
Subject: [PATCH 3/9] Take out OVH run-time params

---
 Dockerfile.train.jupyter | 15 ---------------
 start.sh                 |  7 +------
 2 files changed, 1 insertion(+), 21 deletions(-)

diff --git a/Dockerfile.train.jupyter b/Dockerfile.train.jupyter
index 1e2a0ff7..d0a4872a 100644
--- a/Dockerfile.train.jupyter
+++ b/Dockerfile.train.jupyter
@@ -40,7 +40,6 @@ RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/downloa
     rm temp.tar.xz
 
 
-#FROM gcr.io/kaggle-gpu-images/python:v98
 FROM nvcr.io/nvidia/tensorflow:21.05-tf1-py3
 ENV DEBIAN_FRONTEND=noninteractive
 
@@ -98,12 +97,6 @@ RUN rm -rf ~/.local/share/stt
 ##
 #
 
-#FROM gcr.io/kaggle-gpu-images/python:v98
-
-RUN chsh -s /bin/bash
-ENV SHELL=/bin/bash
-RUN rm /bin/sh && ln -s /bin/bash /bin/sh
-
 RUN apt-get update && apt-get install -y \
     man \
     vim \
@@ -129,14 +122,6 @@ RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager
 RUN jupyter nbextension enable --py widgetsnbextension #enable ipywidgets
 RUN jupyter labextension install jupyterlab-plotly
 
-# #### tensorflow 1
-# # Create the environment:
-# SHELL ["/bin/bash", "-c"]
-# COPY tfenv.yml .
-# RUN conda env create -f tfenv.yml
-# SHELL ["conda", "run", "-n", "tf1", "/bin/bash", "-c"]
-# RUN python -m ipykernel install --name=tensorflow1
-
 EXPOSE 8080
 
 ADD start.sh /
diff --git a/start.sh b/start.sh
index 4ec5999e..05476bc9 100755
--- a/start.sh
+++ b/start.sh
@@ -1,9 +1,4 @@
 #!/usr/bin/env bash
 set -eu
 
-jupyter lab --ip=0.0.0.0 --port=8080 --no-browser --allow-root \
-  --LabApp.token='' \
-  --LabApp.custom_display_url=${JOB_URL_SCHEME}${JOB_ID}.${JOB_HOST} \
-  --LabApp.allow_remote_access=True \
-  --LabApp.allow_origin='*' \
-  --LabApp.disable_check_xsrf=True
\ No newline at end of file
+jupyter lab --ip=0.0.0.0 --port=8080 --allow-root

From a37ca2ec27d9e831c0901c9912d2bda45de52f08 Mon Sep 17 00:00:00 2001
From: Josh Meyer <joshua.richard.meyer@gmail.com>
Date: Tue, 20 Jul 2021 04:20:57 -0400
Subject: [PATCH 4/9] Simplify dockerfile and add notebook

---
 Dockerfile.train.jupyter | 133 +++------------------------------------
 train-ldc.ipynb          |  46 ++++++++++++++
 2 files changed, 53 insertions(+), 126 deletions(-)
 create mode 100644 train-ldc.ipynb

diff --git a/Dockerfile.train.jupyter b/Dockerfile.train.jupyter
index d0a4872a..71f3e466 100644
--- a/Dockerfile.train.jupyter
+++ b/Dockerfile.train.jupyter
@@ -1,132 +1,13 @@
 # This is a Dockerfile useful for training models with Coqui STT in Jupyter notebooks
 
-FROM ubuntu:20.04 AS kenlm-build
-ENV DEBIAN_FRONTEND=noninteractive
+FROM ghcr.io/coqui-ai/stt-train:v0.10.0-alpha.9
 
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-    build-essential cmake libboost-system-dev \
-    libboost-thread-dev libboost-program-options-dev \
-    libboost-test-dev libeigen3-dev zlib1g-dev \
-    libbz2-dev liblzma-dev && \
-    rm -rf /var/lib/apt/lists/*
+RUN python3 -m pip install --no-cache-dir jupyter jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
 
-# Build KenLM to generate new scorers
-WORKDIR /code
-COPY kenlm /code/kenlm
-RUN cd /code/kenlm && \
-    mkdir -p build && \
-    cd build && \
-    cmake .. && \
-    make -j $(nproc) || \
-    ( echo "ERROR: Failed to build KenLM."; \
-    echo "ERROR: Make sure you update the kenlm submodule on host before building this Dockerfile."; \
-    echo "ERROR: $ cd STT; git submodule update --init kenlm"; \
-    exit 1; )
+RUN mv /code /home/STT
+WORKDIR /home
 
+EXPOSE 8888
 
-FROM ubuntu:20.04 AS wget-binaries
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN apt-get update && apt-get install -y --no-install-recommends wget unzip xz-utils
-
-# Tool to convert output graph for inference
-RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/convert_graphdef_memmapped_format.linux.amd64.zip -O temp.zip && \
-    unzip temp.zip && \
-    rm temp.zip
-
-RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/native_client.tf.Linux.tar.xz -O temp.tar.xz && \
-    tar -xf temp.tar.xz && \
-    rm temp.tar.xz
-
-
-FROM nvcr.io/nvidia/tensorflow:21.05-tf1-py3
-ENV DEBIAN_FRONTEND=noninteractive
-
-# We need to purge python3-xdg because
-# it's breaking STT install later with
-# errors about setuptools
-#
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-        git \
-        wget \
-        libopus0 \
-        libopusfile0 \
-        libsndfile1 \
-        sox \
-        libsox-fmt-mp3 && \
-    apt-get purge -y python3-xdg && \
-    rm -rf /var/lib/apt/lists/
-
-# Make sure pip and its dependencies are up-to-date
-RUN pip3 install --upgrade pip wheel setuptools
-
-WORKDIR /code
-
-COPY native_client /code/native_client
-COPY .git /code/.git
-COPY training/coqui_stt_training/VERSION /code/training/coqui_stt_training/VERSION
-COPY training/coqui_stt_training/GRAPH_VERSION /code/training/coqui_stt_training/GRAPH_VERSION
-
-# Build CTC decoder first, to avoid clashes on incompatible versions upgrades
-RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings
-RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl
-
-COPY setup.py /code/setup.py
-COPY VERSION /code/VERSION
-COPY training /code/training
-# Copy files from previous build stages
-RUN mkdir -p /code/kenlm/build/
-COPY --from=kenlm-build /code/kenlm/build/bin /code/kenlm/build/bin
-COPY --from=wget-binaries /convert_graphdef_memmapped_format /code/convert_graphdef_memmapped_format
-COPY --from=wget-binaries /generate_scorer_package /code/generate_scorer_package
-
-# Install STT
-# No need for the decoder since we did it earlier
-# TensorFlow GPU should already be installed on the base image,
-# and we don't want to break that
-RUN DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --upgrade -e .
-
-# Copy rest of the code and test training
-COPY . /code
-#RUN ./bin/run-ldc93s1.sh
-RUN rm -rf ~/.local/share/stt
-
-### START OVH THINGS
-##
-#
-
-RUN apt-get update && apt-get install -y \
-    man \
-    vim \
-    nano \
-    htop \
-    curl \
-    wget \
-    rsync \
-    ca-certificates \
-    git \
-    zip \
-    procps \
-    ssh \
-    supervisor \
-    gettext-base \
-    less \
-    nodejs \
-    npm \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN pip3 install jupyterlab ipywidgets
-RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager
-RUN jupyter nbextension enable --py widgetsnbextension #enable ipywidgets
-RUN jupyter labextension install jupyterlab-plotly
-
-EXPOSE 8080
-
-ADD start.sh /
-
-WORKDIR /workspace
-RUN chown -R 42420:42420 /workspace
-
-ENTRYPOINT ["/start.sh"]
+CMD ["bash", "-c", "jupyter notebook --notebook-dir=/home --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/train-ldc.ipynb b/train-ldc.ipynb
new file mode 100644
index 00000000..78a94c2f
--- /dev/null
+++ b/train-ldc.ipynb
@@ -0,0 +1,46 @@
+import os
+import sys
+
+import pandas
+from STT.training.coqui_stt_training.util.downloader import maybe_download
+#from STT.bin.import_ldc93s1 import _download_and_preprocess_data as download_data
+
+#download_data('/home/STT/data')
+
+def download_and_preprocess_data(data_dir):
+    # Conditionally download data
+    LDC93S1_BASE = "LDC93S1"
+    LDC93S1_BASE_URL = "https://catalog.ldc.upenn.edu/desc/addenda/"
+    local_file = maybe_download(
+        LDC93S1_BASE + ".wav", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".wav"
+    )
+    trans_file = maybe_download(
+        LDC93S1_BASE + ".txt", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".txt"
+    )
+    with open(trans_file, "r") as fin:
+        transcript = " ".join(fin.read().strip().lower().split(" ")[2:]).replace(
+            ".", ""
+        )
+
+    df = pandas.DataFrame(
+        data=[(os.path.abspath(local_file), os.path.getsize(local_file), transcript)],
+        columns=["wav_filename", "wav_filesize", "transcript"],
+    )
+    df.to_csv(os.path.join(data_dir, "ldc93s1.csv"), index=False)
+
+download_and_preprocess_data('/home/STT/data')
+
+
+from STT.training.coqui_stt_training.train import train, early_training_checks
+from STT.training.coqui_stt_training.util.config import initialize_globals
+
+#Config.train_files=['/home/STT/data/ldc.csv']
+#Config.dev_files=['/home/STT/data/ldc.csv']
+#Config.test_files=['/home/STT/data/ldc.csv']
+
+#Config.alphabet_config_path='/home/STT/data/alphabet.txt'
+initialize_globals()
+
+early_training_checks()
+
+train()

From 59e32556a4633a3a941f4a5c1c752bcbb12b1a28 Mon Sep 17 00:00:00 2001
From: Josh Meyer <joshua.richard.meyer@gmail.com>
Date: Tue, 20 Jul 2021 09:07:54 -0400
Subject: [PATCH 5/9] Currently working notebook

---
 train-ldc.ipynb | 191 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 183 insertions(+), 8 deletions(-)

diff --git a/train-ldc.ipynb b/train-ldc.ipynb
index 78a94c2f..5f7a1168 100644
--- a/train-ldc.ipynb
+++ b/train-ldc.ipynb
@@ -1,3 +1,5 @@
+# Download LDC data
+
 import os
 import sys
 
@@ -30,17 +32,190 @@ def download_and_preprocess_data(data_dir):
 
 download_and_preprocess_data('/home/STT/data')
 
+# Train
 
-from STT.training.coqui_stt_training.train import train, early_training_checks
-from STT.training.coqui_stt_training.util.config import initialize_globals
+from STT.training.coqui_stt_training.util.config import _SttConfig, _ConfigSingleton
+from STT.training.coqui_stt_training.util.augmentations import parse_augmentations, NormalizeSampleRate
+from STT.training.coqui_stt_training.util.helpers import parse_file_size
+from STT.training.coqui_stt_training.util.gpu import get_available_gpus
+from coqui_stt_ctcdecoder import Alphabet
+from xdg import BaseDirectory as xdg
+import tensorflow.compat.v1 as tfv1
 
-#Config.train_files=['/home/STT/data/ldc.csv']
-#Config.dev_files=['/home/STT/data/ldc.csv']
-#Config.test_files=['/home/STT/data/ldc.csv']
+def initialize_globals(c):
 
-#Config.alphabet_config_path='/home/STT/data/alphabet.txt'
-initialize_globals()
+    # Augmentations
+    c.augmentations = parse_augmentations(c.augment)
+    print(f"Parsed augmentations from flags: {c.augmentations}")
+    if c.augmentations and c.feature_cache and c.cache_for_epochs == 0:
+        print(
+            "Due to current feature-cache settings the exact same sample augmentations of the first "
+            "epoch will be repeated on all following epochs. This could lead to unintended over-fitting. "
+            "You could use --cache_for_epochs <n_epochs> to invalidate the cache after a given number of epochs."
+        )
 
+    if c.normalize_sample_rate:
+        c.augmentations = [NormalizeSampleRate(c.audio_sample_rate)] + c[
+            "augmentations"
+        ]
+
+    # Caching
+    if c.cache_for_epochs == 1:
+        print(
+            "--cache_for_epochs == 1 is (re-)creating the feature cache on every epoch but will never use it."
+        )
+
+    # Read-buffer
+    c.read_buffer = parse_file_size(c.read_buffer)
+
+    # Set default dropout rates
+    if c.dropout_rate2 < 0:
+        c.dropout_rate2 = c.dropout_rate
+    if c.dropout_rate3 < 0:
+        c.dropout_rate3 = c.dropout_rate
+    if c.dropout_rate6 < 0:
+        c.dropout_rate6 = c.dropout_rate
+
+    # Set default checkpoint dir
+    if not c.checkpoint_dir:
+        c.checkpoint_dir = xdg.save_data_path(os.path.join("stt", "checkpoints"))
+
+    if c.load_train not in ["last", "best", "init", "auto"]:
+        c.load_train = "auto"
+
+    if c.load_evaluate not in ["last", "best", "auto"]:
+        c.load_evaluate = "auto"
+
+    # Set default summary dir
+    if not c.summary_dir:
+        c.summary_dir = xdg.save_data_path(os.path.join("stt", "summaries"))
+
+    # Standard session configuration that'll be used for all new sessions.
+    c.session_config = tfv1.ConfigProto(
+        allow_soft_placement=True,
+        log_device_placement=c.log_placement,
+        inter_op_parallelism_threads=c.inter_op_parallelism_threads,
+        intra_op_parallelism_threads=c.intra_op_parallelism_threads,
+        gpu_options=tfv1.GPUOptions(allow_growth=c.use_allow_growth),
+    )
+
+    # CPU device
+    c.cpu_device = "/cpu:0"
+
+    # Available GPU devices
+    c.available_devices = get_available_gpus(c.session_config)
+
+    # If there is no GPU available, we fall back to CPU based operation
+    if not c.available_devices:
+        c.available_devices = [c.cpu_device]
+
+    c.alphabet_config_path=""
+    
+    if c.bytes_output_mode:
+        c.alphabet = UTF8Alphabet()
+    elif c.alphabet_config_path:
+        c.alphabet = Alphabet(os.path.abspath(c.alphabet_config_path))
+    
+    # Geometric Constants
+    # ===================
+
+    # For an explanation of the meaning of the geometric constants, please refer to
+    # doc/Geometry.md
+
+    # Number of MFCC features
+    c.n_input = 26  # TODO: Determine this programmatically from the sample rate
+
+    # The number of frames in the context
+    c.n_context = 9  # TODO: Determine the optimal value using a validation data set
+
+    # Number of units in hidden layers
+    c.n_hidden = c.n_hidden
+
+    c.n_hidden_1 = c.n_hidden
+
+    c.n_hidden_2 = c.n_hidden
+
+    c.n_hidden_5 = c.n_hidden
+
+    # LSTM cell state dimension
+    c.n_cell_dim = c.n_hidden
+
+    # The number of units in the third layer, which feeds in to the LSTM
+    c.n_hidden_3 = c.n_cell_dim
+
+    # Units in the sixth layer = number of characters in the target language plus one
+    try:
+        c.n_hidden_6 = c.alphabet.GetSize() + 1  # +1 for CTC blank label
+    except:
+        AttributeError
+
+    # Size of audio window in samples
+    if (c.feature_win_len * c.audio_sample_rate) % 1000 != 0:
+        log_error(
+            "--feature_win_len value ({}) in milliseconds ({}) multiplied "
+            "by --audio_sample_rate value ({}) must be an integer value. Adjust "
+            "your --feature_win_len value or resample your audio accordingly."
+            "".format(c.feature_win_len, c.feature_win_len / 1000, c.audio_sample_rate)
+        )
+        sys.exit(1)
+
+    c.audio_window_samples = c.audio_sample_rate * (c.feature_win_len / 1000)
+
+    # Stride for feature computations in samples
+    if (c.feature_win_step * c.audio_sample_rate) % 1000 != 0:
+        log_error(
+            "--feature_win_step value ({}) in milliseconds ({}) multiplied "
+            "by --audio_sample_rate value ({}) must be an integer value. Adjust "
+            "your --feature_win_step value or resample your audio accordingly."
+            "".format(
+                c.feature_win_step, c.feature_win_step / 1000, c.audio_sample_rate
+            )
+        )
+        sys.exit(1)
+
+    c.audio_step_samples = c.audio_sample_rate * (c.feature_win_step / 1000)
+
+    if c.one_shot_infer:
+        if not path_exists_remote(c.one_shot_infer):
+            log_error("Path specified in --one_shot_infer is not a valid file.")
+            sys.exit(1)
+
+    if c.train_cudnn and c.load_cudnn:
+        log_error(
+            "Trying to use --train_cudnn, but --load_cudnn "
+            "was also specified. The --load_cudnn flag is only "
+            "needed when converting a CuDNN RNN checkpoint to "
+            "a CPU-capable graph. If your system is capable of "
+            "using CuDNN RNN, you can just specify the CuDNN RNN "
+            "checkpoint normally with --save_checkpoint_dir."
+        )
+        sys.exit(1)
+
+    # If separate save and load flags were not specified, default to load and save
+    # from the same dir.
+    if not c.save_checkpoint_dir:
+        c.save_checkpoint_dir = c.checkpoint_dir
+
+    if not c.load_checkpoint_dir:
+        c.load_checkpoint_dir = c.checkpoint_dir
+
+    _ConfigSingleton._config = c  # pylint: disable=protected-access
+    
+from STT.training.coqui_stt_training.train import train, test, early_training_checks
+
+Config = _SttConfig()
+
+Config.alphabet = Alphabet('/home/STT/data/alphabet.txt')
+Config.train_files=['/home/STT/data/ldc93s1.csv']
+Config.dev_files=['/home/STT/data/ldc93s1.csv']
+Config.test_files=['/home/STT/data/ldc93s1.csv']
+Config.n_hidden=100
+Config.epochs=200
+
+initialize_globals(Config)
+
+#print(Config.to_json())
 early_training_checks()
-
 train()
+tfv1.reset_default_graph()
+test()

From 9f7fda14cbce1f3848eda9d34d826deb91edc74c Mon Sep 17 00:00:00 2001
From: Josh Meyer <joshua.richard.meyer@gmail.com>
Date: Fri, 23 Jul 2021 12:12:02 -0400
Subject: [PATCH 6/9] Add first Jupyter notebook

---
 Dockerfile.train.jupyter  |  10 +-
 notebooks/train-ldc.ipynb | 253 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 258 insertions(+), 5 deletions(-)
 create mode 100644 notebooks/train-ldc.ipynb

diff --git a/Dockerfile.train.jupyter b/Dockerfile.train.jupyter
index 71f3e466..a11d28b2 100644
--- a/Dockerfile.train.jupyter
+++ b/Dockerfile.train.jupyter
@@ -1,13 +1,13 @@
 # This is a Dockerfile useful for training models with Coqui STT in Jupyter notebooks
 
-FROM ghcr.io/coqui-ai/stt-train:v0.10.0-alpha.9
+FROM ghcr.io/coqui-ai/stt-train:v0.10.0-alpha.10
+
+COPY notebooks /code/notebooks
+WORKDIR /code/notebooks
 
 RUN python3 -m pip install --no-cache-dir jupyter jupyter_http_over_ws
 RUN jupyter serverextension enable --py jupyter_http_over_ws
 
-RUN mv /code /home/STT
-WORKDIR /home
-
 EXPOSE 8888
 
-CMD ["bash", "-c", "jupyter notebook --notebook-dir=/home --ip 0.0.0.0 --no-browser --allow-root"]
+CMD ["bash", "-c", "jupyter notebook --notebook-dir=/code/notebooks --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/notebooks/train-ldc.ipynb b/notebooks/train-ldc.ipynb
new file mode 100644
index 00000000..895785dd
--- /dev/null
+++ b/notebooks/train-ldc.ipynb
@@ -0,0 +1,253 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "f79d99ef",
+   "metadata": {},
+   "source": [
+    "# Train your first 🐸 STT model 💫\n",
+    "\n",
+    "👋 Hello and welcome to Coqui (🐸) STT \n",
+    "\n",
+    "The goal of this notebook is to show you a **typical workflow** for **training** and **testing** an STT model with 🐸.\n",
+    "\n",
+    "Let's train a very small model on a very small amount of data so we can iterate quickly.\n",
+    "\n",
+    "In this notebook, we will:\n",
+    "\n",
+    "1. Download data and format it for 🐸 STT.\n",
+    "2. Configure the training and testing runs.\n",
+    "3. Train a new model.\n",
+    "4. Test the model and display its performance.\n",
+    "\n",
+    "So, let's jump right in!\n",
+    "\n",
+    "*PS - If you just want a working, off-the-shelf model, check out the [🐸 Model Zoo](https://www.coqui.ai/models)*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "be5fe49c",
+   "metadata": {},
+   "source": [
+    "## ✅ Download & format sample data for English\n",
+    "\n",
+    "**First things first**: we need some data.\n",
+    "\n",
+    "We're training a Speech-to-Text model, so we need some _speech_ and we need some _text_. Specifically, we want _transcribed speech_. Let's download an English audio file and its transcript and then format them for 🐸 STT. \n",
+    "\n",
+    "🐸 STT expects to find information about your data in a CSV file, where each line contains:\n",
+    "\n",
+    "1. the **path** to an audio file\n",
+    "2. the **size** of that audio file\n",
+    "3. the **transcript** of that audio file.\n",
+    "\n",
+    "Formatting the audio and transcript isn't too difficult in this case. We define a custom data importer called `download_sample_data()` which does all the work. If you have a custom dataset, you will probably want to write a custom data importer.\n",
+    "\n",
+    "**Second things second**: we want an alphabet. The output layer of a typical* 🐸 STT model represents letters in the alphabet, and you should specify this alphabet before training. Let's download an English alphabet from Coqui and use that.\n",
+    "\n",
+    "_*If you are working with languages with large character sets (e.g. Chinese), you can set `bytes_output_mode=True` instead of supplying an `alphabet.txt` file. In this case, the output layer of the STT model will correspond to individual UTF-8 bytes instead of individual characters._"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "53945462",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "### Download sample data\n",
+    "import os\n",
+    "import pandas\n",
+    "from coqui_stt_training.util.downloader import maybe_download\n",
+    "\n",
+    "def download_sample_data():\n",
+    "    data_dir=\"english/\"\n",
+    "    # Download data + alphabet\n",
+    "    audio_file = maybe_download(\"LDC93S1.wav\", data_dir, \"https://catalog.ldc.upenn.edu/desc/addenda/LDC93S1.wav\")\n",
+    "    transcript_file = maybe_download(\"LDC93S1.txt\", data_dir, \"https://catalog.ldc.upenn.edu/desc/addenda/LDC93S1.txt\")\n",
+    "    alphabet = maybe_download(\"alphabet.txt\", data_dir, \"https://raw.githubusercontent.com/coqui-ai/STT/main/data/alphabet.txt\")\n",
+    "    # Format data\n",
+    "    with open(transcript_file, \"r\") as fin:\n",
+    "        transcript = \" \".join(fin.read().strip().lower().split(\" \")[2:]).replace(\".\", \"\")\n",
+    "    df = pandas.DataFrame(data=[(os.path.abspath(audio_file), os.path.getsize(audio_file), transcript)],\n",
+    "                          columns=[\"wav_filename\", \"wav_filesize\", \"transcript\"])\n",
+    "    # Save formatted CSV \n",
+    "    df.to_csv(os.path.join(data_dir, \"ldc93s1.csv\"), index=False)\n",
+    "\n",
+    "# Download and format data\n",
+    "download_sample_data()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "96e8b708",
+   "metadata": {},
+   "source": [
+    "### Take a look at the data (*Optional* )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fa2aec77",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "csv_file = open(\"english/ldc93s1.csv\", \"r\")\n",
+    "print(csv_file.read())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6c046277",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "alphabet_file = open(\"english/alphabet.txt\", \"r\")\n",
+    "print(alphabet_file.read())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d9dfac21",
+   "metadata": {},
+   "source": [
+    "## ✅ Configure & set hyperparameters\n",
+    "\n",
+    "Coqui STT comes with a long list of hyperparameters you can tweak. We've set default values, but you will often want to set your own. You can use `initialize_globals_from_args()` to do this. \n",
+    "\n",
+    "You must **always** configure the paths to your data, and you must **always** configure your alphabet. Additionally, here we show how you can specify the size of hidden layers (`n_hidden`), the number of epochs to train for (`epochs`), and to initialize a new model from scratch (`load_train=\"init\"`)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d264fdec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from coqui_stt_training.util.config import initialize_globals_from_args\n",
+    "\n",
+    "initialize_globals_from_args(\n",
+    "    alphabet_config_path=\"english/alphabet.txt\",\n",
+    "    train_files=[\"english/ldc93s1.csv\"],\n",
+    "    dev_files=[\"english/ldc93s1.csv\"],\n",
+    "    test_files=[\"english/ldc93s1.csv\"],\n",
+    "    load_train=\"init\",\n",
+    "    n_hidden=100,\n",
+    "    epochs=200,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "799c1425",
+   "metadata": {},
+   "source": [
+    "### View all Config settings (*Optional*) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "03b33d2b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from coqui_stt_training.util.config import Config\n",
+    "\n",
+    "# Take a peek at the entire Config\n",
+    "print(Config.to_json())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ae82fd75",
+   "metadata": {},
+   "source": [
+    "## ✅ Train a new model\n",
+    "\n",
+    "Let's kick off a training run 🚀🚀🚀 (using the configuration you set above).\n",
+    "\n",
+    "This notebook should work on either a GPU or a CPU. However, in case you're running this on _multiple_ GPUs we want to only use one, because the sample dataset (one audio file) is too small to split across multiple GPUs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "550a504e",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "from coqui_stt_training.train import train, early_training_checks\n",
+    "import tensorflow.compat.v1 as tfv1\n",
+    "\n",
+    "# use maximum one GPU\n",
+    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
+    "\n",
+    "early_training_checks()\n",
+    "\n",
+    "tfv1.reset_default_graph()\n",
+    "train()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9f6dc959",
+   "metadata": {},
+   "source": [
+    "## ✅ Test the model\n",
+    "\n",
+    "We made it! 🙌\n",
+    "\n",
+    "Let's kick off the testing run, which displays performance metrics.\n",
+    "\n",
+    "We're committing the cardinal sin of ML 😈 (aka - testing on our training data) so you don't want to deploy this model into production. In this notebook we're focusing on the workflow itself, so it's forgivable 😇\n",
+    "\n",
+    "You can see from the test output that our tiny model has overfit to the data, and basically memorized this one sentence.\n",
+    "\n",
+    "When you start training your own models, make sure your testing data doesn't include your training data 😅"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dd42bc7a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from coqui_stt_training.train import test\n",
+    "\n",
+    "tfv1.reset_default_graph()\n",
+    "test()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From ea82ab4cb822751a19b6fe493d47c5ac745064dd Mon Sep 17 00:00:00 2001
From: Josh Meyer <joshua.richard.meyer@gmail.com>
Date: Fri, 23 Jul 2021 12:15:12 -0400
Subject: [PATCH 7/9] Remove old unneeded files

---
 start.sh        |   4 -
 tfenv.yml       |  16 ----
 train-ldc.ipynb | 221 ------------------------------------------------
 3 files changed, 241 deletions(-)
 delete mode 100755 start.sh
 delete mode 100644 tfenv.yml
 delete mode 100644 train-ldc.ipynb

diff --git a/start.sh b/start.sh
deleted file mode 100755
index 05476bc9..00000000
--- a/start.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/usr/bin/env bash
-set -eu
-
-jupyter lab --ip=0.0.0.0 --port=8080 --allow-root
diff --git a/tfenv.yml b/tfenv.yml
deleted file mode 100644
index d489c7f3..00000000
--- a/tfenv.yml
+++ /dev/null
@@ -1,16 +0,0 @@
-name: tf1
-channels:
-  - conda-forge
-dependencies:
-  - python=3.7
-  - tensorflow-gpu==1.15
-  - ipykernel
-  - google-auth
-  - tensorflow-hub
-  - pydicom
-  - pandas
-  - seaborn 
-  - matplotlib 
-  - scikit-learn 
-  - openslide
-  - keras  
diff --git a/train-ldc.ipynb b/train-ldc.ipynb
deleted file mode 100644
index 5f7a1168..00000000
--- a/train-ldc.ipynb
+++ /dev/null
@@ -1,221 +0,0 @@
-# Download LDC data
-
-import os
-import sys
-
-import pandas
-from STT.training.coqui_stt_training.util.downloader import maybe_download
-#from STT.bin.import_ldc93s1 import _download_and_preprocess_data as download_data
-
-#download_data('/home/STT/data')
-
-def download_and_preprocess_data(data_dir):
-    # Conditionally download data
-    LDC93S1_BASE = "LDC93S1"
-    LDC93S1_BASE_URL = "https://catalog.ldc.upenn.edu/desc/addenda/"
-    local_file = maybe_download(
-        LDC93S1_BASE + ".wav", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".wav"
-    )
-    trans_file = maybe_download(
-        LDC93S1_BASE + ".txt", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".txt"
-    )
-    with open(trans_file, "r") as fin:
-        transcript = " ".join(fin.read().strip().lower().split(" ")[2:]).replace(
-            ".", ""
-        )
-
-    df = pandas.DataFrame(
-        data=[(os.path.abspath(local_file), os.path.getsize(local_file), transcript)],
-        columns=["wav_filename", "wav_filesize", "transcript"],
-    )
-    df.to_csv(os.path.join(data_dir, "ldc93s1.csv"), index=False)
-
-download_and_preprocess_data('/home/STT/data')
-
-# Train
-
-from STT.training.coqui_stt_training.util.config import _SttConfig, _ConfigSingleton
-from STT.training.coqui_stt_training.util.augmentations import parse_augmentations, NormalizeSampleRate
-from STT.training.coqui_stt_training.util.helpers import parse_file_size
-from STT.training.coqui_stt_training.util.gpu import get_available_gpus
-from coqui_stt_ctcdecoder import Alphabet
-from xdg import BaseDirectory as xdg
-import tensorflow.compat.v1 as tfv1
-
-def initialize_globals(c):
-
-    # Augmentations
-    c.augmentations = parse_augmentations(c.augment)
-    print(f"Parsed augmentations from flags: {c.augmentations}")
-    if c.augmentations and c.feature_cache and c.cache_for_epochs == 0:
-        print(
-            "Due to current feature-cache settings the exact same sample augmentations of the first "
-            "epoch will be repeated on all following epochs. This could lead to unintended over-fitting. "
-            "You could use --cache_for_epochs <n_epochs> to invalidate the cache after a given number of epochs."
-        )
-
-    if c.normalize_sample_rate:
-        c.augmentations = [NormalizeSampleRate(c.audio_sample_rate)] + c[
-            "augmentations"
-        ]
-
-    # Caching
-    if c.cache_for_epochs == 1:
-        print(
-            "--cache_for_epochs == 1 is (re-)creating the feature cache on every epoch but will never use it."
-        )
-
-    # Read-buffer
-    c.read_buffer = parse_file_size(c.read_buffer)
-
-    # Set default dropout rates
-    if c.dropout_rate2 < 0:
-        c.dropout_rate2 = c.dropout_rate
-    if c.dropout_rate3 < 0:
-        c.dropout_rate3 = c.dropout_rate
-    if c.dropout_rate6 < 0:
-        c.dropout_rate6 = c.dropout_rate
-
-    # Set default checkpoint dir
-    if not c.checkpoint_dir:
-        c.checkpoint_dir = xdg.save_data_path(os.path.join("stt", "checkpoints"))
-
-    if c.load_train not in ["last", "best", "init", "auto"]:
-        c.load_train = "auto"
-
-    if c.load_evaluate not in ["last", "best", "auto"]:
-        c.load_evaluate = "auto"
-
-    # Set default summary dir
-    if not c.summary_dir:
-        c.summary_dir = xdg.save_data_path(os.path.join("stt", "summaries"))
-
-    # Standard session configuration that'll be used for all new sessions.
-    c.session_config = tfv1.ConfigProto(
-        allow_soft_placement=True,
-        log_device_placement=c.log_placement,
-        inter_op_parallelism_threads=c.inter_op_parallelism_threads,
-        intra_op_parallelism_threads=c.intra_op_parallelism_threads,
-        gpu_options=tfv1.GPUOptions(allow_growth=c.use_allow_growth),
-    )
-
-    # CPU device
-    c.cpu_device = "/cpu:0"
-
-    # Available GPU devices
-    c.available_devices = get_available_gpus(c.session_config)
-
-    # If there is no GPU available, we fall back to CPU based operation
-    if not c.available_devices:
-        c.available_devices = [c.cpu_device]
-
-    c.alphabet_config_path=""
-    
-    if c.bytes_output_mode:
-        c.alphabet = UTF8Alphabet()
-    elif c.alphabet_config_path:
-        c.alphabet = Alphabet(os.path.abspath(c.alphabet_config_path))
-    
-    # Geometric Constants
-    # ===================
-
-    # For an explanation of the meaning of the geometric constants, please refer to
-    # doc/Geometry.md
-
-    # Number of MFCC features
-    c.n_input = 26  # TODO: Determine this programmatically from the sample rate
-
-    # The number of frames in the context
-    c.n_context = 9  # TODO: Determine the optimal value using a validation data set
-
-    # Number of units in hidden layers
-    c.n_hidden = c.n_hidden
-
-    c.n_hidden_1 = c.n_hidden
-
-    c.n_hidden_2 = c.n_hidden
-
-    c.n_hidden_5 = c.n_hidden
-
-    # LSTM cell state dimension
-    c.n_cell_dim = c.n_hidden
-
-    # The number of units in the third layer, which feeds in to the LSTM
-    c.n_hidden_3 = c.n_cell_dim
-
-    # Units in the sixth layer = number of characters in the target language plus one
-    try:
-        c.n_hidden_6 = c.alphabet.GetSize() + 1  # +1 for CTC blank label
-    except:
-        AttributeError
-
-    # Size of audio window in samples
-    if (c.feature_win_len * c.audio_sample_rate) % 1000 != 0:
-        log_error(
-            "--feature_win_len value ({}) in milliseconds ({}) multiplied "
-            "by --audio_sample_rate value ({}) must be an integer value. Adjust "
-            "your --feature_win_len value or resample your audio accordingly."
-            "".format(c.feature_win_len, c.feature_win_len / 1000, c.audio_sample_rate)
-        )
-        sys.exit(1)
-
-    c.audio_window_samples = c.audio_sample_rate * (c.feature_win_len / 1000)
-
-    # Stride for feature computations in samples
-    if (c.feature_win_step * c.audio_sample_rate) % 1000 != 0:
-        log_error(
-            "--feature_win_step value ({}) in milliseconds ({}) multiplied "
-            "by --audio_sample_rate value ({}) must be an integer value. Adjust "
-            "your --feature_win_step value or resample your audio accordingly."
-            "".format(
-                c.feature_win_step, c.feature_win_step / 1000, c.audio_sample_rate
-            )
-        )
-        sys.exit(1)
-
-    c.audio_step_samples = c.audio_sample_rate * (c.feature_win_step / 1000)
-
-    if c.one_shot_infer:
-        if not path_exists_remote(c.one_shot_infer):
-            log_error("Path specified in --one_shot_infer is not a valid file.")
-            sys.exit(1)
-
-    if c.train_cudnn and c.load_cudnn:
-        log_error(
-            "Trying to use --train_cudnn, but --load_cudnn "
-            "was also specified. The --load_cudnn flag is only "
-            "needed when converting a CuDNN RNN checkpoint to "
-            "a CPU-capable graph. If your system is capable of "
-            "using CuDNN RNN, you can just specify the CuDNN RNN "
-            "checkpoint normally with --save_checkpoint_dir."
-        )
-        sys.exit(1)
-
-    # If separate save and load flags were not specified, default to load and save
-    # from the same dir.
-    if not c.save_checkpoint_dir:
-        c.save_checkpoint_dir = c.checkpoint_dir
-
-    if not c.load_checkpoint_dir:
-        c.load_checkpoint_dir = c.checkpoint_dir
-
-    _ConfigSingleton._config = c  # pylint: disable=protected-access
-    
-from STT.training.coqui_stt_training.train import train, test, early_training_checks
-
-Config = _SttConfig()
-
-Config.alphabet = Alphabet('/home/STT/data/alphabet.txt')
-Config.train_files=['/home/STT/data/ldc93s1.csv']
-Config.dev_files=['/home/STT/data/ldc93s1.csv']
-Config.test_files=['/home/STT/data/ldc93s1.csv']
-Config.n_hidden=100
-Config.epochs=200
-
-initialize_globals(Config)
-
-#print(Config.to_json())
-early_training_checks()
-train()
-tfv1.reset_default_graph()
-test()

From 7d40d5d686e8c58fe37d8e65cd919fc94bd8c8c9 Mon Sep 17 00:00:00 2001
From: Josh Meyer <joshua.richard.meyer@gmail.com>
Date: Fri, 23 Jul 2021 12:16:26 -0400
Subject: [PATCH 8/9] Specify latest for base Coqui STT docker image

---
 Dockerfile.train.jupyter | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.train.jupyter b/Dockerfile.train.jupyter
index a11d28b2..a92b44ca 100644
--- a/Dockerfile.train.jupyter
+++ b/Dockerfile.train.jupyter
@@ -1,6 +1,6 @@
 # This is a Dockerfile useful for training models with Coqui STT in Jupyter notebooks
 
-FROM ghcr.io/coqui-ai/stt-train:v0.10.0-alpha.10
+FROM ghcr.io/coqui-ai/stt-train:latest
 
 COPY notebooks /code/notebooks
 WORKDIR /code/notebooks

From 31199116574298a5ba5ba4afff4402785c6cf72d Mon Sep 17 00:00:00 2001
From: Josh Meyer <joshua.richard.meyer@gmail.com>
Date: Fri, 23 Jul 2021 12:17:55 -0400
Subject: [PATCH 9/9] Next core Coqui STT docker image will have notebooks dir

---
 Dockerfile.train.jupyter | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Dockerfile.train.jupyter b/Dockerfile.train.jupyter
index a92b44ca..5fa680ec 100644
--- a/Dockerfile.train.jupyter
+++ b/Dockerfile.train.jupyter
@@ -2,7 +2,6 @@
 
 FROM ghcr.io/coqui-ai/stt-train:latest
 
-COPY notebooks /code/notebooks
 WORKDIR /code/notebooks
 
 RUN python3 -m pip install --no-cache-dir jupyter jupyter_http_over_ws