Simplify Dockerfile and add notebook

This commit is contained in:
Josh Meyer 2021-07-20 04:20:57 -04:00
parent d0f8eb96cd
commit a37ca2ec27
2 changed files with 53 additions and 126 deletions

View File

@@ -1,132 +1,13 @@
# This is a Dockerfile useful for training models with Coqui STT in Jupyter notebooks
# NOTE(review): this text is diff residue that interleaves two Dockerfile versions —
# the multi-stage build (kenlm-build / wget-binaries / nvcr.io base) looks like the
# REMOVED version and the ghcr.io/coqui-ai/stt-train base the ADDED one. Confirm
# against the actual commit before treating this as a single buildable file.
# --- Old version: stage 1, builds KenLM from the git submodule ---
FROM ubuntu:20.04 AS kenlm-build
ENV DEBIAN_FRONTEND=noninteractive
# --- New version: single prebuilt Coqui STT training image as the base ---
FROM ghcr.io/coqui-ai/stt-train:v0.10.0-alpha.9
# KenLM build dependencies (boost, eigen, compression libs), cleaned in-layer.
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential cmake libboost-system-dev \
libboost-thread-dev libboost-program-options-dev \
libboost-test-dev libeigen3-dev zlib1g-dev \
libbz2-dev liblzma-dev && \
rm -rf /var/lib/apt/lists/*
# (New version) Jupyter with the http-over-ws extension for Colab-style access.
RUN python3 -m pip install --no-cache-dir jupyter jupyter_http_over_ws
RUN jupyter serverextension enable --py jupyter_http_over_ws
# Build KenLM to generate new scorers
WORKDIR /code
COPY kenlm /code/kenlm
# The `|| (echo …; exit 1)` branch gives an actionable message when the kenlm
# submodule was not checked out on the host before the build.
RUN cd /code/kenlm && \
mkdir -p build && \
cd build && \
cmake .. && \
make -j $(nproc) || \
( echo "ERROR: Failed to build KenLM."; \
echo "ERROR: Make sure you update the kenlm submodule on host before building this Dockerfile."; \
echo "ERROR: $ cd STT; git submodule update --init kenlm"; \
exit 1; )
# (New version) relocate the checkout so notebooks find it under /home/STT.
RUN mv /code /home/STT
WORKDIR /home
# Jupyter's default port; EXPOSE is documentation only, it does not publish.
EXPOSE 8888
# --- Old version: stage 2, downloads prebuilt release binaries ---
FROM ubuntu:20.04 AS wget-binaries
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends wget unzip xz-utils
# Tool to convert output graph for inference
RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/convert_graphdef_memmapped_format.linux.amd64.zip -O temp.zip && \
unzip temp.zip && \
rm temp.zip
# native_client tarball provides generate_scorer_package.
RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/native_client.tf.Linux.tar.xz -O temp.tar.xz && \
tar -xf temp.tar.xz && \
rm temp.tar.xz
# --- Old version: final stage on NVIDIA's TF1 GPU image ---
FROM nvcr.io/nvidia/tensorflow:21.05-tf1-py3
ENV DEBIAN_FRONTEND=noninteractive
# We need to purge python3-xdg because
# it's breaking STT install later with
# errors about setuptools
#
RUN apt-get update && \
apt-get install -y --no-install-recommends \
git \
wget \
libopus0 \
libopusfile0 \
libsndfile1 \
sox \
libsox-fmt-mp3 && \
apt-get purge -y python3-xdg && \
rm -rf /var/lib/apt/lists/
# Make sure pip and its dependencies are up-to-date
RUN pip3 install --upgrade pip wheel setuptools
WORKDIR /code
COPY native_client /code/native_client
COPY .git /code/.git
COPY training/coqui_stt_training/VERSION /code/training/coqui_stt_training/VERSION
COPY training/coqui_stt_training/GRAPH_VERSION /code/training/coqui_stt_training/GRAPH_VERSION
# Build CTC decoder first, to avoid clashes on incompatible versions upgrades
RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings
RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl
COPY setup.py /code/setup.py
COPY VERSION /code/VERSION
COPY training /code/training
# Copy files from previous build stages
RUN mkdir -p /code/kenlm/build/
COPY --from=kenlm-build /code/kenlm/build/bin /code/kenlm/build/bin
COPY --from=wget-binaries /convert_graphdef_memmapped_format /code/convert_graphdef_memmapped_format
COPY --from=wget-binaries /generate_scorer_package /code/generate_scorer_package
# Install STT
# No need for the decoder since we did it earlier
# TensorFlow GPU should already be installed on the base image,
# and we don't want to break that
RUN DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --upgrade -e .
# Copy rest of the code and test training
COPY . /code
#RUN ./bin/run-ldc93s1.sh
RUN rm -rf ~/.local/share/stt
### START OVH THINGS
##
#
# NOTE(review): interactive/debug tooling (vim, htop, ssh, supervisor, node) —
# presumably required by the OVH AI platform; verify before slimming this list.
RUN apt-get update && apt-get install -y \
man \
vim \
nano \
htop \
curl \
wget \
rsync \
ca-certificates \
git \
zip \
procps \
ssh \
supervisor \
gettext-base \
less \
nodejs \
npm \
&& rm -rf /var/lib/apt/lists/*
RUN pip3 install jupyterlab ipywidgets
RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager
RUN jupyter nbextension enable --py widgetsnbextension #enable ipywidgets
RUN jupyter labextension install jupyterlab-plotly
# OVH platform port; see EXPOSE 8888 above for the plain-Jupyter variant.
EXPOSE 8080
ADD start.sh /
WORKDIR /workspace
# 42420:42420 — presumably the fixed UID/GID the OVH platform runs containers as.
RUN chown -R 42420:42420 /workspace
ENTRYPOINT ["/start.sh"]
CMD ["bash", "-c", "jupyter notebook --notebook-dir=/home --ip 0.0.0.0 --no-browser --allow-root"]

46
train-ldc.ipynb Normal file
View File

@@ -0,0 +1,46 @@
import os
import sys
import pandas
from STT.training.coqui_stt_training.util.downloader import maybe_download
#from STT.bin.import_ldc93s1 import _download_and_preprocess_data as download_data
#download_data('/home/STT/data')
def download_and_preprocess_data(data_dir):
    """Fetch the single-sample LDC93S1 corpus and write an STT CSV index.

    Downloads ``LDC93S1.wav`` and ``LDC93S1.txt`` into ``data_dir`` (via
    ``maybe_download``, which skips files that already exist) and writes a
    one-row ``ldc93s1.csv`` in the Coqui STT training schema
    (``wav_filename``, ``wav_filesize``, ``transcript``).

    :param data_dir: directory to download into and to write the CSV in.
    """
    # Conditionally download data
    LDC93S1_BASE = "LDC93S1"
    LDC93S1_BASE_URL = "https://catalog.ldc.upenn.edu/desc/addenda/"
    local_file = maybe_download(
        LDC93S1_BASE + ".wav", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".wav"
    )
    trans_file = maybe_download(
        LDC93S1_BASE + ".txt", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".txt"
    )
    # FIX: use an explicit encoding — the default depends on the container's
    # locale (often POSIX/ASCII), which can mis-decode the transcript text.
    with open(trans_file, "r", encoding="utf-8") as fin:
        # Drop the first two whitespace-separated tokens of the transcript
        # file, lowercase the rest, and strip periods. (Presumably the two
        # leading tokens are sample ids — confirm against LDC93S1.txt.)
        transcript = " ".join(fin.read().strip().lower().split(" ")[2:]).replace(
            ".", ""
        )
    # Single-row training index pointing at the downloaded wav.
    df = pandas.DataFrame(
        data=[(os.path.abspath(local_file), os.path.getsize(local_file), transcript)],
        columns=["wav_filename", "wav_filesize", "transcript"],
    )
    df.to_csv(os.path.join(data_dir, "ldc93s1.csv"), index=False)
# Fetch the LDC93S1 sample and write /home/STT/data/ldc93s1.csv.
download_and_preprocess_data('/home/STT/data')
# Import lazily, after data prep, from the in-container STT checkout.
from STT.training.coqui_stt_training.train import train, early_training_checks
from STT.training.coqui_stt_training.util.config import initialize_globals
# NOTE(review): these overrides are commented out, so training runs with
# whatever defaults initialize_globals() provides — and they point at
# ldc.csv while the function above writes ldc93s1.csv; confirm intent.
#Config.train_files=['/home/STT/data/ldc.csv']
#Config.dev_files=['/home/STT/data/ldc.csv']
#Config.test_files=['/home/STT/data/ldc.csv']
#Config.alphabet_config_path='/home/STT/data/alphabet.txt'
# Standard Coqui STT flow: build global config, sanity-check, then train.
initialize_globals()
early_training_checks()
train()