diff --git a/Dockerfile.train.jupyter b/Dockerfile.train.jupyter index d0a4872a..71f3e466 100644 --- a/Dockerfile.train.jupyter +++ b/Dockerfile.train.jupyter @@ -1,132 +1,13 @@ # This is a Dockerfile useful for training models with Coqui STT in Jupyter notebooks -FROM ubuntu:20.04 AS kenlm-build -ENV DEBIAN_FRONTEND=noninteractive +FROM ghcr.io/coqui-ai/stt-train:v0.10.0-alpha.9 -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - build-essential cmake libboost-system-dev \ - libboost-thread-dev libboost-program-options-dev \ - libboost-test-dev libeigen3-dev zlib1g-dev \ - libbz2-dev liblzma-dev && \ - rm -rf /var/lib/apt/lists/* +RUN python3 -m pip install --no-cache-dir jupyter jupyter_http_over_ws +RUN jupyter serverextension enable --py jupyter_http_over_ws -# Build KenLM to generate new scorers -WORKDIR /code -COPY kenlm /code/kenlm -RUN cd /code/kenlm && \ - mkdir -p build && \ - cd build && \ - cmake .. && \ - make -j $(nproc) || \ - ( echo "ERROR: Failed to build KenLM."; \ - echo "ERROR: Make sure you update the kenlm submodule on host before building this Dockerfile."; \ - echo "ERROR: $ cd STT; git submodule update --init kenlm"; \ - exit 1; ) +RUN mv /code /home/STT +WORKDIR /home +EXPOSE 8888 -FROM ubuntu:20.04 AS wget-binaries -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-get update && apt-get install -y --no-install-recommends wget unzip xz-utils - -# Tool to convert output graph for inference -RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/convert_graphdef_memmapped_format.linux.amd64.zip -O temp.zip && \ - unzip temp.zip && \ - rm temp.zip - -RUN wget --no-check-certificate https://github.com/coqui-ai/STT/releases/download/v0.9.3/native_client.tf.Linux.tar.xz -O temp.tar.xz && \ - tar -xf temp.tar.xz && \ - rm temp.tar.xz - - -FROM nvcr.io/nvidia/tensorflow:21.05-tf1-py3 -ENV DEBIAN_FRONTEND=noninteractive - -# We need to purge python3-xdg because -# it's breaking STT install later 
with -# errors about setuptools -# -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - git \ - wget \ - libopus0 \ - libopusfile0 \ - libsndfile1 \ - sox \ - libsox-fmt-mp3 && \ - apt-get purge -y python3-xdg && \ - rm -rf /var/lib/apt/lists/ - -# Make sure pip and its dependencies are up-to-date -RUN pip3 install --upgrade pip wheel setuptools - -WORKDIR /code - -COPY native_client /code/native_client -COPY .git /code/.git -COPY training/coqui_stt_training/VERSION /code/training/coqui_stt_training/VERSION -COPY training/coqui_stt_training/GRAPH_VERSION /code/training/coqui_stt_training/GRAPH_VERSION - -# Build CTC decoder first, to avoid clashes on incompatible versions upgrades -RUN cd native_client/ctcdecode && make NUM_PROCESSES=$(nproc) bindings -RUN pip3 install --upgrade native_client/ctcdecode/dist/*.whl - -COPY setup.py /code/setup.py -COPY VERSION /code/VERSION -COPY training /code/training -# Copy files from previous build stages -RUN mkdir -p /code/kenlm/build/ -COPY --from=kenlm-build /code/kenlm/build/bin /code/kenlm/build/bin -COPY --from=wget-binaries /convert_graphdef_memmapped_format /code/convert_graphdef_memmapped_format -COPY --from=wget-binaries /generate_scorer_package /code/generate_scorer_package - -# Install STT -# No need for the decoder since we did it earlier -# TensorFlow GPU should already be installed on the base image, -# and we don't want to break that -RUN DS_NODECODER=y DS_NOTENSORFLOW=y pip3 install --upgrade -e . - -# Copy rest of the code and test training -COPY . 
/code -#RUN ./bin/run-ldc93s1.sh -RUN rm -rf ~/.local/share/stt - -### START OVH THINGS -## -# - -RUN apt-get update && apt-get install -y \ - man \ - vim \ - nano \ - htop \ - curl \ - wget \ - rsync \ - ca-certificates \ - git \ - zip \ - procps \ - ssh \ - supervisor \ - gettext-base \ - less \ - nodejs \ - npm \ - && rm -rf /var/lib/apt/lists/* - -RUN pip3 install jupyterlab ipywidgets -RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager -RUN jupyter nbextension enable --py widgetsnbextension #enable ipywidgets -RUN jupyter labextension install jupyterlab-plotly - -EXPOSE 8080 - -ADD start.sh / - -WORKDIR /workspace -RUN chown -R 42420:42420 /workspace - -ENTRYPOINT ["/start.sh"] +CMD ["bash", "-c", "jupyter notebook --notebook-dir=/home --ip 0.0.0.0 --no-browser --allow-root"] diff --git a/train-ldc.ipynb b/train-ldc.ipynb new file mode 100644 index 00000000..78a94c2f --- /dev/null +++ b/train-ldc.ipynb @@ -0,0 +1,46 @@ +import os +import sys + +import pandas +from STT.training.coqui_stt_training.util.downloader import maybe_download +#from STT.bin.import_ldc93s1 import _download_and_preprocess_data as download_data + +#download_data('/home/STT/data') + +def download_and_preprocess_data(data_dir): + # Conditionally download data + LDC93S1_BASE = "LDC93S1" + LDC93S1_BASE_URL = "https://catalog.ldc.upenn.edu/desc/addenda/" + local_file = maybe_download( + LDC93S1_BASE + ".wav", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".wav" + ) + trans_file = maybe_download( + LDC93S1_BASE + ".txt", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".txt" + ) + with open(trans_file, "r") as fin: + transcript = " ".join(fin.read().strip().lower().split(" ")[2:]).replace( + ".", "" + ) + + df = pandas.DataFrame( + data=[(os.path.abspath(local_file), os.path.getsize(local_file), transcript)], + columns=["wav_filename", "wav_filesize", "transcript"], + ) + df.to_csv(os.path.join(data_dir, "ldc93s1.csv"), index=False) + +download_and_preprocess_data('/home/STT/data') + 
+ +from STT.training.coqui_stt_training.train import train, early_training_checks +from STT.training.coqui_stt_training.util.config import initialize_globals + +#Config.train_files=['/home/STT/data/ldc93s1.csv'] +#Config.dev_files=['/home/STT/data/ldc93s1.csv'] +#Config.test_files=['/home/STT/data/ldc93s1.csv'] + +#Config.alphabet_config_path='/home/STT/data/alphabet.txt' +initialize_globals() + +early_training_checks() + +train()