diff --git a/.install b/.install index 2e9f0cdb..8ee8ca5d 100755 --- a/.install +++ b/.install @@ -3,7 +3,7 @@ virtualenv -p python3 ../tmp/venv source ../tmp/venv/bin/activate pip install -r <(grep -v tensorflow requirements.txt) -pip install tensorflow-gpu==1.12.0rc2 +pip install tensorflow-gpu==1.12.0 python3 util/taskcluster.py --arch gpu --target ../tmp/native_client diff --git a/DeepSpeech.py b/DeepSpeech.py index 72ed9173..c8afc030 100755 --- a/DeepSpeech.py +++ b/DeepSpeech.py @@ -890,6 +890,7 @@ def main(_): if len(FLAGS.worker_hosts) == 0: # Only one local task: this process (default case - no cluster) with tf.Graph().as_default(): + tf.set_random_seed(FLAGS.random_seed) train() # Now do a final test epoch if FLAGS.test: diff --git a/Dockerfile b/Dockerfile index 94282689..5d7d6f64 100644 --- a/Dockerfile +++ b/Dockerfile @@ -186,7 +186,7 @@ RUN cp /tensorflow/bazel-bin/native_client/generate_trie /DeepSpeech/native_clie # Install TensorFlow WORKDIR /DeepSpeech/ -RUN pip install tensorflow-gpu==1.12.0rc2 +RUN pip install tensorflow-gpu==1.12.0 # Make DeepSpeech and install Python bindings diff --git a/README.md b/README.md index b9403b7a..5310fd51 100644 --- a/README.md +++ b/README.md @@ -227,7 +227,7 @@ If you have a capable (Nvidia, at least 8GB of VRAM) GPU, it is highly recommend ```bash pip3 uninstall tensorflow -pip3 install 'tensorflow-gpu==1.12.0rc2' +pip3 install 'tensorflow-gpu==1.12.0' ``` ### Common Voice training data @@ -284,7 +284,7 @@ If you are brave enough, you can also include the `other` dataset, which contain The central (Python) script is `DeepSpeech.py` in the project's root directory. For its list of command line options, you can call: ```bash -./DeepSpeech.py --help +./DeepSpeech.py --helpfull ``` To get the output of this in a slightly better-formatted way, you can also look up the option definitions top of `DeepSpeech.py`. diff --git a/VERSION b/VERSION index 1b69c974..27177a94 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.4.0-alpha.0 +0.4.0-alpha.2 diff --git a/examples/ffmpeg_vad_streaming/README.MD b/examples/ffmpeg_vad_streaming/README.MD new file mode 100644 index 00000000..c7886662 --- /dev/null +++ b/examples/ffmpeg_vad_streaming/README.MD @@ -0,0 +1,29 @@ +# FFmpeg VAD Streaming + +Streaming inference from arbitrary source (FFmpeg input) to DeepSpeech, using VAD (voice activity detection). A fairly simple example demonstrating the DeepSpeech streaming API in Node.js. + +This example was successfully tested with a mobile phone streaming a live feed to a RTMP server (nginx-rtmp), which then could be used by this script for near real time speech recognition. 
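+
+For reference, the publishing side can be as simple as pointing any FFmpeg-capable source at the same RTMP application and stream key; this is only an illustration (the host, input file and stream key below are placeholders, and any nginx-rtmp "live" application will do):
+
+```bash
+# publish a local WAV file as a live RTMP stream that the usage examples below can consume
+ffmpeg -re -i sample.wav -c:a aac -f flv rtmp://localhost:1935/live/teststream
+```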
+ +## Installation + +```bash +npm install +``` + +Moreover FFmpeg must be installed: + +```bash +sudo apt-get install ffmpeg +``` + +## Usage + +Here is an example for a local audio file: +```bash +node ./index.js --audio --model $HOME/models/output_graph.pbmm --alphabet $HOME/models/alphabet.txt +``` + +Here is an example for a remote RTMP-Stream: +```bash +node ./index.js --audio rtmp://:1935/live/teststream --model $HOME/models/output_graph.pbmm --alphabet $HOME/models/alphabet.txt +``` diff --git a/examples/ffmpeg_vad_streaming/index.js b/examples/ffmpeg_vad_streaming/index.js new file mode 100644 index 00000000..37f6d871 --- /dev/null +++ b/examples/ffmpeg_vad_streaming/index.js @@ -0,0 +1,118 @@ +#!/usr/bin/env node + +const VAD = require("node-vad"); +const Ds = require('deepspeech'); +const argparse = require('argparse'); +const util = require('util'); + +// These constants control the beam search decoder + +// Beam width used in the CTC decoder when building candidate transcriptions +const BEAM_WIDTH = 1024; + +// The alpha hyperparameter of the CTC decoder. Language Model weight +const LM_WEIGHT = 1.50; + +// Valid word insertion weight. This is used to lessen the word insertion penalty +// when the inserted word is part of the vocabulary +const VALID_WORD_COUNT_WEIGHT = 2.25; + +// These constants are tied to the shape of the graph used (changing them changes +// the geometry of the first layer), so make sure you use the same constants that +// were used during training + +// Number of MFCC features to use +const N_FEATURES = 26; + +// Size of the context window used for producing timesteps in the input vector +const N_CONTEXT = 9; + +let VersionAction = function VersionAction(options) { + options = options || {}; + options.nargs = 0; + argparse.Action.call(this, options); +}; + +util.inherits(VersionAction, argparse.Action); + +VersionAction.prototype.call = function(parser) { + Ds.printVersions(); + process.exit(0); +}; + +let parser = new argparse.ArgumentParser({addHelp: true, description: 'Running DeepSpeech inference.'}); +parser.addArgument(['--model'], {required: true, help: 'Path to the model (protocol buffer binary file)'}); +parser.addArgument(['--alphabet'], {required: true, help: 'Path to the configuration file specifying the alphabet used by the network'}); +parser.addArgument(['--lm'], {help: 'Path to the language model binary file', nargs: '?'}); +parser.addArgument(['--trie'], {help: 'Path to the language model trie file created with native_client/generate_trie', nargs: '?'}); +parser.addArgument(['--audio'], {required: true, help: 'Path to the audio file to run (WAV format)'}); +parser.addArgument(['--version'], {action: VersionAction, help: 'Print version and exits'}); +let args = parser.parseArgs(); + +function totalTime(hrtimeValue) { + return (hrtimeValue[0] + hrtimeValue[1] / 1000000000).toPrecision(4); +} + +console.error('Loading model from file %s', args['model']); +const model_load_start = process.hrtime(); +let model = new Ds.Model(args['model'], N_FEATURES, N_CONTEXT, args['alphabet'], BEAM_WIDTH); +const model_load_end = process.hrtime(model_load_start); +console.error('Loaded model in %ds.', totalTime(model_load_end)); + +if (args['lm'] && args['trie']) { + console.error('Loading language model from files %s %s', args['lm'], args['trie']); + const lm_load_start = process.hrtime(); + model.enableDecoderWithLM(args['alphabet'], args['lm'], args['trie'], + LM_WEIGHT, VALID_WORD_COUNT_WEIGHT); + const lm_load_end = process.hrtime(lm_load_start); + 
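+    // process.hrtime(start) returns the elapsed time as a [seconds, nanoseconds] pair;
+    // totalTime() above folds it into fractional seconds for logging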
console.error('Loaded language model in %ds.', totalTime(lm_load_end)); +} + +const vad = new VAD(VAD.Mode.NORMAL); +const voice = {START: true, STOP: false}; +let sctx = model.setupStream(150, 16000); +let state = voice.STOP; + +function finishStream() { + const model_load_start = process.hrtime(); + console.error('Running inference.'); + console.log('Transcription: ', model.finishStream(sctx)); + const model_load_end = process.hrtime(model_load_start); + console.error('Inference took %ds.', totalTime(model_load_end)); +} + +let ffmpeg = require('child_process').spawn('ffmpeg', [ + '-hide_banner', + '-nostats', + '-loglevel', 'fatal', + '-i', args['audio'], + '-af', 'highpass=f=200,lowpass=f=3000', + '-vn', + '-acodec', 'pcm_s16le', + '-ac', 1, + '-ar', 16000, + '-f', 's16le', + 'pipe:' +]); + +ffmpeg.stdout.on('data', chunk => { + vad.processAudio(chunk, 16000).then(res => { + switch (res) { + case VAD.Event.SILENCE: + if (state === voice.START) { + state = voice.STOP; + finishStream(); + sctx = model.setupStream(150,16000); + } + break; + case VAD.Event.VOICE: + state = voice.START; + model.feedAudioContent(sctx, chunk.slice(0, chunk.length / 2)); + break; + } + }); +}); + +ffmpeg.stdout.on('close', code => { + finishStream(); +}); diff --git a/examples/ffmpeg_vad_streaming/package.json b/examples/ffmpeg_vad_streaming/package.json new file mode 100644 index 00000000..09b8bcce --- /dev/null +++ b/examples/ffmpeg_vad_streaming/package.json @@ -0,0 +1,16 @@ +{ + "name": "ffmpeg-vad-streaming", + "version": "1.0.0", + "description": "Streaming inference from arbitrary source with VAD and FFmpeg", + "main": "index.js", + "scripts": { + "start": "node ./index.js" + }, + "dependencies": { + "argparse": "^1.0.10", + "deepspeech": "^0.3.0", + "node-vad": "^1.1.1", + "util": "^0.11.1" + }, + "license" : "MIT" +} diff --git a/examples/mic_vad_streaming/README.md b/examples/mic_vad_streaming/README.md index 54c5c49b..211a9b61 100644 --- a/examples/mic_vad_streaming/README.md +++ b/examples/mic_vad_streaming/README.md @@ -14,6 +14,12 @@ Uses portaudio for microphone access, so on Linux, you may need to install its h sudo apt install portaudio19-dev ``` +Installation on MacOS may fail due to portaudio, use brew to install it: + +```bash +brew install portaudio +``` + ## Usage ``` diff --git a/native_client/Android.mk b/native_client/Android.mk new file mode 100644 index 00000000..d21551fd --- /dev/null +++ b/native_client/Android.mk @@ -0,0 +1,14 @@ +LOCAL_PATH := $(call my-dir) + +include $(CLEAR_VARS) +LOCAL_MODULE := deepspeech-prebuilt +LOCAL_SRC_FILES := $(TFDIR)/bazel-bin/native_client/libdeepspeech.so +include $(PREBUILT_SHARED_LIBRARY) + +include $(CLEAR_VARS) +LOCAL_CPP_EXTENSION := .cc .cxx .cpp +LOCAL_MODULE := deepspeech +LOCAL_SRC_FILES := client.cc +LOCAL_SHARED_LIBRARIES := deepspeech-prebuilt +LOCAL_LDFLAGS := -Wl,--no-as-needed +include $(BUILD_EXECUTABLE) diff --git a/native_client/README.md b/native_client/README.md index 5bef4ef1..6748cd40 100644 --- a/native_client/README.md +++ b/native_client/README.md @@ -90,6 +90,62 @@ cd ../DeepSpeech/native_client make deepspeech ``` +### Cross-building for RPi3 ARMv7 / LePotato ARM64 + +We do support cross-compilation ; please refer to our `mozilla/tensorflow` fork, where we define the following `--config` flags: + - `--config=rpi3` and `--config=rpi3_opt` for Raspbian / ARMv7 + - `--config=rpi3-armv8` and `--config=rpi3-armv8_opt` for ARMBian / ARM64 + +So your command line for RPi3 / ARMv7 should look like: +``` +bazel build 
--config=monolithic --config=rpi3 --config=rpi3_opt -c opt --copt=-O3 --copt=-fvisibility=hidden //native_client:libdeepspeech.so //native_client:generate_trie
+```
+
+And your command line for LePotato / ARM64 should look like:
+```
+bazel build --config=monolithic --config=rpi3-armv8 --config=rpi3-armv8_opt -c opt --copt=-O3 --copt=-fvisibility=hidden //native_client:libdeepspeech.so //native_client:generate_trie
+```
+
+While we test only on RPi3 Raspbian Stretch / LePotato ARMBian stretch, anything compatible with `armv7-a cortex-a53` / `armv8-a cortex-a53` should be fine.
+
+The `deepspeech` binary can also be cross-built, with `TARGET=rpi3` or `TARGET=rpi3-armv8`. This might require you to set up a system tree using the `multistrap` tool and the multistrap configuration files: `native_client/multistrap_armbian64_stretch.conf` and `native_client/multistrap_raspbian_stretch.conf`.
+The path of the system tree can be overridden from the default values defined in `definitions.mk` through the `RASPBIAN` make variable.
+
+```
+cd ../DeepSpeech/native_client
+make TARGET= deepspeech
+```
+
+### Android devices
+
+We have preliminary support for Android relying on TensorFlow Lite, with upcoming Java / JNI bindings. For more details on how to experiment with those, please refer to `native_client/java/README.md`.
+
+Please refer to the TensorFlow documentation on how to set up the environment to build for Android (SDK and NDK required).
+
+You can build `libdeepspeech.so` using (ARMv7):
+
+```
+bazel build --config=monolithic --config=android --config=android_arm --action_env ANDROID_NDK_API_LEVEL=21 --cxxopt=-std=c++11 --copt=-D_GLIBCXX_USE_C99 //native_client:libdeepspeech.so
+```
+
+Or (ARM64):
+```
+bazel build --config=monolithic --config=android --config=android_arm64 --action_env ANDROID_NDK_API_LEVEL=21 --cxxopt=-std=c++11 --copt=-D_GLIBCXX_USE_C99 //native_client:libdeepspeech.so
+```
+
+Building the `deepspeech` binary will happen through `ndk-build` (ARMv7):
+
+```
+cd ../DeepSpeech/native_client
+$ANDROID_NDK_HOME/ndk-build APP_PLATFORM=android-21 APP_BUILD_SCRIPT=$(pwd)/Android.mk NDK_PROJECT_PATH=$(pwd) APP_STL=c++_shared TFDIR=$(pwd)/../../tensorflow/ TARGET_ARCH_ABI=armeabi-v7a
+```
+
+And (ARM64):
+```
+cd ../DeepSpeech/native_client
+$ANDROID_NDK_HOME/ndk-build APP_PLATFORM=android-21 APP_BUILD_SCRIPT=$(pwd)/Android.mk NDK_PROJECT_PATH=$(pwd) APP_STL=c++_shared TFDIR=$(pwd)/../../tensorflow/ TARGET_ARCH_ABI=arm64-v8a
+```
+
 ## Installing
 
 After building, the library files and binary can optionally be installed to a system path for ease of development. This is also a required step for bindings generation.
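A minimal sketch of that install step (the prefix and the use of `sudo` are assumptions; check `native_client/definitions.mk` and the `Makefile` for the defaults your checkout actually uses):

```bash
cd ../DeepSpeech/native_client
# install libdeepspeech.so and the deepspeech binary under the chosen prefix
PREFIX=/usr/local sudo make install
```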
diff --git a/native_client/client.cc b/native_client/client.cc index 5826e875..ca8c9f94 100644 --- a/native_client/client.cc +++ b/native_client/client.cc @@ -6,7 +6,9 @@ #include #include #include +#ifndef __ANDROID__ #include +#endif // __ANDROID__ #include #include @@ -59,6 +61,7 @@ GetAudioBuffer(const char* path) { ds_audio_buffer res = {0}; +#ifndef __ANDROID__ sox_format_t* input = sox_open_read(path, NULL, NULL, NULL); assert(input); @@ -147,6 +150,51 @@ GetAudioBuffer(const char* path) // Close sox handles sox_close(output); sox_close(input); +#endif // __ANDROID__ + +#ifdef __ANDROID__ + // FIXME: Hack and support only 16kHz mono 16-bits PCM + FILE* wave = fopen(path, "r"); + + size_t rv; + + unsigned short audio_format; + fseek(wave, 20, SEEK_SET); rv = fread(&audio_format, 2, 1, wave); + assert(rv == 2); + + unsigned short num_channels; + fseek(wave, 22, SEEK_SET); rv = fread(&num_channels, 2, 1, wave); + assert(rv == 2); + + unsigned int sample_rate; + fseek(wave, 24, SEEK_SET); rv = fread(&sample_rate, 4, 1, wave); + assert(rv == 2); + + unsigned short bits_per_sample; + fseek(wave, 34, SEEK_SET); rv = fread(&bits_per_sample, 2, 1, wave); + assert(rv == 2); + + assert(audio_format == 1); // 1 is PCM + assert(num_channels == 1); // MONO + assert(sample_rate == 16000); // 16000 Hz + assert(bits_per_sample == 16); // 16 bits per sample + + fprintf(stderr, "audio_format=%d\n", audio_format); + fprintf(stderr, "num_channels=%d\n", num_channels); + fprintf(stderr, "sample_rate=%d\n", sample_rate); + fprintf(stderr, "bits_per_sample=%d\n", bits_per_sample); + + fseek(wave, 40, SEEK_SET); rv = fread(&res.buffer_size, 4, 1, wave); + assert(rv == 2); + fprintf(stderr, "res.buffer_size=%ld\n", res.buffer_size); + + fseek(wave, 44, SEEK_SET); + res.buffer = (char*)malloc(sizeof(char) * res.buffer_size); + rv = fread(res.buffer, sizeof(char), res.buffer_size, wave); + assert(rv == res.buffer_size); + + fclose(wave); +#endif // __ANDROID__ #ifdef __APPLE__ res.buffer_size = (size_t)(output->olength * 2); @@ -255,8 +303,10 @@ main(int argc, char **argv) break; } +#ifndef __ANDROID__ // Deinitialise and quit sox_quit(); +#endif // __ANDROID__ DS_DestroyModel(ctx); diff --git a/native_client/deepspeech.h b/native_client/deepspeech.h index b558cc2e..6a70fecf 100644 --- a/native_client/deepspeech.h +++ b/native_client/deepspeech.h @@ -1,6 +1,10 @@ #ifndef DEEPSPEECH_H #define DEEPSPEECH_H +#ifdef __ANDROID__ +#define USE_TFLITE +#endif + #ifndef SWIG #if defined _MSC_VER #define DEEPSPEECH_EXPORT extern "C" __declspec(dllexport) diff --git a/native_client/java/.gitignore b/native_client/java/.gitignore new file mode 100644 index 00000000..fd45b12f --- /dev/null +++ b/native_client/java/.gitignore @@ -0,0 +1,11 @@ +*.iml +.gradle +/local.properties +/.idea/caches/build_file_checksums.ser +/.idea/libraries +/.idea/modules.xml +/.idea/workspace.xml +.DS_Store +/build +/captures +.externalNativeBuild diff --git a/native_client/java/.idea/codeStyles/Project.xml b/native_client/java/.idea/codeStyles/Project.xml new file mode 100644 index 00000000..30aa626c --- /dev/null +++ b/native_client/java/.idea/codeStyles/Project.xml @@ -0,0 +1,29 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/native_client/java/.idea/gradle.xml b/native_client/java/.idea/gradle.xml new file mode 100644 index 00000000..7ac24c77 --- /dev/null +++ b/native_client/java/.idea/gradle.xml @@ -0,0 +1,18 @@ + + + + + + \ No newline at end of file diff --git a/native_client/java/.idea/misc.xml 
b/native_client/java/.idea/misc.xml new file mode 100644 index 00000000..b0c7b20c --- /dev/null +++ b/native_client/java/.idea/misc.xml @@ -0,0 +1,38 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/native_client/java/.idea/runConfigurations.xml b/native_client/java/.idea/runConfigurations.xml new file mode 100644 index 00000000..7f68460d --- /dev/null +++ b/native_client/java/.idea/runConfigurations.xml @@ -0,0 +1,12 @@ + + + + + + \ No newline at end of file diff --git a/native_client/java/Makefile b/native_client/java/Makefile new file mode 100644 index 00000000..e84895ab --- /dev/null +++ b/native_client/java/Makefile @@ -0,0 +1,19 @@ +.PHONY: clean apk-clean + +include ../definitions.mk + +LIBDEEPSPEECH_SO ?= ${TFDIR}/bazel-bin/native_client/libdeepspeech.so + +all: apk + +clean: apk-clean + rm -rf *.java jni/deepspeech_wrap.cpp + +apk-clean: + ./gradlew clean + +apk: apk-clean bindings + LIBDEEPSPEECH_SO=$(LIBDEEPSPEECH_SO) ./gradlew build + +bindings: clean + swig -c++ -java -package deepspeech.mozilla.org.deepspeech -outdir app/src/main/java/deepspeech/mozilla/org/deepspeech/ -o jni/deepspeech_wrap.cpp jni/deepspeech.i diff --git a/native_client/java/README.md b/native_client/java/README.md new file mode 100644 index 00000000..6b743f5b --- /dev/null +++ b/native_client/java/README.md @@ -0,0 +1,32 @@ +DeepSpeech Java / Android bindings +================================== + +This is still preliminary work. Please refer to `native_client/README.md` for +building `libdeepspeech.so` and `deepspeech` binary for Android on ARMv7 and +ARM64 arch. + +Running `deepspeech` via adb +============================ +You should use `adb push` to send data to device, please refer to Android +documentation on how to use that. + +Please push DeepSpeech data to `/sdcard/deepspeech/`, including: + - `output_graph.tflite` which is the TF Lite model + - `alphabet.txt` + - `lm.binary` and `trie` files, if you want to use the language model ; please + be aware that too big language model will make the device run out of memory + +Then, push binaries from `native_client.tar.xz` to `/data/local/tmp/ds`: + - `deepspeech` + - `libdeepspeech.so` + - `libc++_shared.so` + +You should then be able to run as usual, using a shell from `adb shell`: +``` +user@device$ cd /data/local/tmp/ds/ +user@device$ LD_LIBRARY_PATH=$(pwd)/ ./deepspeech [...] +``` + +Please note that Android linker does not support `rpath` so you have to set +`LD_LIBRARY_PATH`. Properly wrapped / packaged bindings does embed the library +at a place the linker knows where to search, so Android apps will be fine. diff --git a/native_client/java/app/.gitignore b/native_client/java/app/.gitignore new file mode 100644 index 00000000..796b96d1 --- /dev/null +++ b/native_client/java/app/.gitignore @@ -0,0 +1 @@ +/build diff --git a/native_client/java/app/CMakeLists.txt b/native_client/java/app/CMakeLists.txt new file mode 100644 index 00000000..e4a51f0e --- /dev/null +++ b/native_client/java/app/CMakeLists.txt @@ -0,0 +1,58 @@ +# For more information about using CMake with Android Studio, read the +# documentation: https://d.android.com/studio/projects/add-native-code.html + +# Sets the minimum version of CMake required to build the native library. + +cmake_minimum_required(VERSION 3.4.1) + +# Creates and names a library, sets it as either STATIC +# or SHARED, and provides the relative paths to its source code. +# You can define multiple libraries, and CMake builds them for you. 
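+# In this project, the JNI wrapper library (deepspeech-jni) is built from the
+# SWIG-generated ../jni/deepspeech_wrap.cpp and linked against the prebuilt
+# libdeepspeech.so imported further down.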
+# Gradle automatically packages shared libraries with your APK. + +add_library( # Sets the name of the library. + deepspeech-jni + + # Sets the library as a shared library. + SHARED + + # Provides a relative path to your source file(s). + ../jni/deepspeech_wrap.cpp ) + +add_library( deepspeech-lib + SHARED + IMPORTED ) + +set_target_properties( deepspeech-lib + PROPERTIES IMPORTED_LOCATION $ENV{LIBDEEPSPEECH_SO} ) + +add_custom_command( TARGET deepspeech-jni POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + $ENV{LIBDEEPSPEECH_SO} + ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libdeepspeech.so ) + +# Searches for a specified prebuilt library and stores the path as a +# variable. Because CMake includes system libraries in the search path by +# default, you only need to specify the name of the public NDK library +# you want to add. CMake verifies that the library exists before +# completing its build. + +find_library( # Sets the name of the path variable. + log-lib + + # Specifies the name of the NDK library that + # you want CMake to locate. + log ) + +# Specifies libraries CMake should link to your target library. You +# can link multiple libraries, such as libraries you define in this +# build script, prebuilt third-party libraries, or system libraries. + +target_link_libraries( # Specifies the target library. + deepspeech-jni + + deepspeech-lib + + # Links the target library to the log library + # included in the NDK. + ${log-lib} ) diff --git a/native_client/java/app/build.gradle b/native_client/java/app/build.gradle new file mode 100644 index 00000000..2e8fa66c --- /dev/null +++ b/native_client/java/app/build.gradle @@ -0,0 +1,41 @@ +apply plugin: 'com.android.application' + +android { + compileSdkVersion 27 + defaultConfig { + applicationId "deepspeech.mozilla.org.deepspeech" + minSdkVersion 21 + targetSdkVersion 27 + versionCode 1 + versionName "1.0" + testInstrumentationRunner "android.support.test.runner.AndroidJUnitRunner" + ndk { + abiFilters 'armeabi-v7a', 'arm64-v8a' + } + externalNativeBuild { + cmake { + cppFlags "" + } + } + } + buildTypes { + release { + minifyEnabled false + proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro' + } + } + externalNativeBuild { + cmake { + path "CMakeLists.txt" + } + } +} + +dependencies { + implementation fileTree(dir: 'libs', include: ['*.jar']) + implementation 'com.android.support:appcompat-v7:27.1.1' + implementation 'com.android.support.constraint:constraint-layout:1.1.3' + testImplementation 'junit:junit:4.12' + androidTestImplementation 'com.android.support.test:runner:1.0.2' + androidTestImplementation 'com.android.support.test.espresso:espresso-core:3.0.2' +} diff --git a/native_client/java/app/proguard-rules.pro b/native_client/java/app/proguard-rules.pro new file mode 100644 index 00000000..f1b42451 --- /dev/null +++ b/native_client/java/app/proguard-rules.pro @@ -0,0 +1,21 @@ +# Add project specific ProGuard rules here. +# You can control the set of applied configuration files using the +# proguardFiles setting in build.gradle. +# +# For more details, see +# http://developer.android.com/guide/developing/tools/proguard.html + +# If your project uses WebView with JS, uncomment the following +# and specify the fully qualified class name to the JavaScript interface +# class: +#-keepclassmembers class fqcn.of.javascript.interface.for.webview { +# public *; +#} + +# Uncomment this to preserve the line number information for +# debugging stack traces. 
+#-keepattributes SourceFile,LineNumberTable + +# If you keep the line number information, uncomment this to +# hide the original source file name. +#-renamesourcefileattribute SourceFile diff --git a/native_client/java/app/src/androidTest/java/deepspeech/mozilla/org/deepspeech/ExampleInstrumentedTest.java b/native_client/java/app/src/androidTest/java/deepspeech/mozilla/org/deepspeech/ExampleInstrumentedTest.java new file mode 100644 index 00000000..729e7b79 --- /dev/null +++ b/native_client/java/app/src/androidTest/java/deepspeech/mozilla/org/deepspeech/ExampleInstrumentedTest.java @@ -0,0 +1,26 @@ +package deepspeech.mozilla.org.deepspeech; + +import android.content.Context; +import android.support.test.InstrumentationRegistry; +import android.support.test.runner.AndroidJUnit4; + +import org.junit.Test; +import org.junit.runner.RunWith; + +import static org.junit.Assert.*; + +/** + * Instrumented test, which will execute on an Android device. + * + * @see Testing documentation + */ +@RunWith(AndroidJUnit4.class) +public class ExampleInstrumentedTest { + @Test + public void useAppContext() { + // Context of the app under test. + Context appContext = InstrumentationRegistry.getTargetContext(); + + assertEquals("deepspeech.mozilla.org.deepspeech", appContext.getPackageName()); + } +} diff --git a/native_client/java/app/src/main/AndroidManifest.xml b/native_client/java/app/src/main/AndroidManifest.xml new file mode 100644 index 00000000..b55fcb1e --- /dev/null +++ b/native_client/java/app/src/main/AndroidManifest.xml @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + diff --git a/native_client/java/app/src/main/cpp/native-lib.cpp b/native_client/java/app/src/main/cpp/native-lib.cpp new file mode 100644 index 00000000..6e815b4b --- /dev/null +++ b/native_client/java/app/src/main/cpp/native-lib.cpp @@ -0,0 +1,10 @@ +#include +#include + +extern "C" JNIEXPORT jstring JNICALL +Java_deepspeech_mozilla_org_deepspeech_DeepSpeechActivity_stringFromJNI( + JNIEnv* env, + jobject /* this */) { + std::string hello = "Hello from C++"; + return env->NewStringUTF(hello.c_str()); +} diff --git a/native_client/java/app/src/main/java/deepspeech/mozilla/org/deepspeech/DeepSpeechActivity.java b/native_client/java/app/src/main/java/deepspeech/mozilla/org/deepspeech/DeepSpeechActivity.java new file mode 100644 index 00000000..0635f604 --- /dev/null +++ b/native_client/java/app/src/main/java/deepspeech/mozilla/org/deepspeech/DeepSpeechActivity.java @@ -0,0 +1,177 @@ +package deepspeech.mozilla.org.deepspeech; + +import android.support.v7.app.AppCompatActivity; +import android.os.Bundle; + +import android.view.View; +import android.widget.TextView; +import android.widget.EditText; +import android.widget.Button; + +import android.media.MediaPlayer; + +import java.io.RandomAccessFile; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.ByteOrder; +import java.nio.ByteBuffer; + +public class DeepSpeechActivity extends AppCompatActivity { + + // Used to load the 'native-lib' library on application startup. 
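+    // (In this activity the libraries loaded are actually the SWIG JNI wrapper
+    //  "deepspeech-jni" and libdeepspeech itself, not the template's default "native-lib".)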
+ static { + System.loadLibrary("deepspeech-jni"); + System.loadLibrary("deepspeech"); + } + + Model _m = null; + + EditText _tfliteModel; + EditText _alphabet; + EditText _audioFile; + + TextView _decodedString; + TextView _tfliteStatus; + + Button _startInference; + + final int N_CEP = 26; + final int N_CONTEXT = 9; + final int BEAM_WIDTH = 50; + final float LM_WEIGHT = 1.50f; + final float VALID_WORD_COUNT_WEIGHT = 2.10f; + + private char readLEChar(RandomAccessFile f) throws IOException { + byte b1 = f.readByte(); + byte b2 = f.readByte(); + return (char)((b2 << 8) | b1); + } + + private int readLEInt(RandomAccessFile f) throws IOException { + byte b1 = f.readByte(); + byte b2 = f.readByte(); + byte b3 = f.readByte(); + byte b4 = f.readByte(); + return (int)((b1 & 0xFF) | (b2 & 0xFF) << 8 | (b3 & 0xFF) << 16 | (b4 & 0xFF) << 24); + } + + private void newModel(String tfliteModel, String alphabet) { + this._tfliteStatus.setText("Creating model"); + if (this._m == null) { + this._m = new Model(tfliteModel, N_CEP, N_CONTEXT, alphabet, BEAM_WIDTH); + } + } + + private void doInference(String audioFile) { + long inferenceExecTime = 0; + + this._startInference.setEnabled(false); + + this.newModel(this._tfliteModel.getText().toString(), this._alphabet.getText().toString()); + + this._tfliteStatus.setText("Extracting audio features ..."); + + try { + RandomAccessFile wave = new RandomAccessFile(audioFile, "r"); + + wave.seek(20); char audioFormat = this.readLEChar(wave); + assert (audioFormat == 1); // 1 is PCM + // tv_audioFormat.setText("audioFormat=" + (audioFormat == 1 ? "PCM" : "!PCM")); + + wave.seek(22); char numChannels = this.readLEChar(wave); + assert (numChannels == 1); // MONO + // tv_numChannels.setText("numChannels=" + (numChannels == 1 ? "MONO" : "!MONO")); + + wave.seek(24); int sampleRate = this.readLEInt(wave); + assert (sampleRate == 16000); // 16000 Hz + // tv_sampleRate.setText("sampleRate=" + (sampleRate == 16000 ? "16kHz" : "!16kHz")); + + wave.seek(34); char bitsPerSample = this.readLEChar(wave); + assert (bitsPerSample == 16); // 16 bits per sample + // tv_bitsPerSample.setText("bitsPerSample=" + (bitsPerSample == 16 ? "16-bits" : "!16-bits" )); + + wave.seek(40); int bufferSize = this.readLEInt(wave); + assert (bufferSize > 0); + // tv_bufferSize.setText("bufferSize=" + bufferSize); + + wave.seek(44); + byte[] bytes = new byte[bufferSize]; + wave.readFully(bytes); + + short[] shorts = new short[bytes.length/2]; + // to turn bytes to shorts as either big endian or little endian. + ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer().get(shorts); + + this._tfliteStatus.setText("Running inference ..."); + + long inferenceStartTime = System.currentTimeMillis(); + + String decoded = this._m.stt(shorts, shorts.length, sampleRate); + + inferenceExecTime = System.currentTimeMillis() - inferenceStartTime; + + this._decodedString.setText(decoded); + + } catch (FileNotFoundException ex) { + + } catch (IOException ex) { + + } finally { + + } + + this._tfliteStatus.setText("Finished! 
Took " + inferenceExecTime + "ms"); + + this._startInference.setEnabled(true); + } + + @Override + protected void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + setContentView(R.layout.activity_deep_speech); + + this._decodedString = (TextView) findViewById(R.id.decodedString); + this._tfliteStatus = (TextView) findViewById(R.id.tfliteStatus); + + this._tfliteModel = (EditText) findViewById(R.id.tfliteModel); + this._alphabet = (EditText) findViewById(R.id.alphabet); + this._audioFile = (EditText) findViewById(R.id.audioFile); + + this._tfliteModel.setText("/sdcard/deepspeech/output_graph.tflite"); + this._tfliteStatus.setText("Ready, waiting ..."); + + this._alphabet.setText("/sdcard/deepspeech/alphabet.txt"); + this._audioFile.setText("/sdcard/deepspeech/audio.wav"); + + this._startInference = (Button) findViewById(R.id.btnStartInference); + } + + public void onClick_inference_handler(View v) { + this.playAudioFile(); + this.doInference(this._audioFile.getText().toString()); + } + + public void playAudioFile() { + try { + MediaPlayer mediaPlayer = new MediaPlayer(); + mediaPlayer.setDataSource(this._audioFile.getText().toString()); + mediaPlayer.prepare(); + mediaPlayer.start(); + } catch (IOException ex) { + + } + } + + public void onClick_audio_handler(View v) { + this.playAudioFile(); + } + + @Override + protected void onDestroy() { + super.onDestroy(); + + if (this._m != null) { + this._m.destroyModel(); + } + } +} diff --git a/native_client/java/app/src/main/java/deepspeech/mozilla/org/deepspeech/Model.java b/native_client/java/app/src/main/java/deepspeech/mozilla/org/deepspeech/Model.java new file mode 100644 index 00000000..303f03d7 --- /dev/null +++ b/native_client/java/app/src/main/java/deepspeech/mozilla/org/deepspeech/Model.java @@ -0,0 +1,44 @@ +package deepspeech.mozilla.org.deepspeech; + +public class Model { + + // FIXME: We should have something better than those SWIGTYPE_* + SWIGTYPE_p_p_ModelState _mspp; + SWIGTYPE_p_ModelState _msp; + + public Model(String modelPath, int n_cep, int n_context, String alphabetPath, int beam_width) { + this._mspp = impl.new_modelstatep(); + impl.CreateModel(modelPath, n_cep, n_context, alphabetPath, beam_width, this._mspp); + this._msp = impl.modelstatep_value(this._mspp); + } + + public void destroyModel() { + impl.DestroyModel(this._msp); + } + + public void enableDecoderWihLM(String alphabet, String lm, String trie, float lm_weight, float valid_word_count_weight) { + impl.EnableDecoderWithLM(this._msp, alphabet, lm, trie, lm_weight, valid_word_count_weight); + } + + public String stt(short[] buffer, int buffer_size, int sample_rate) { + return impl.SpeechToText(this._msp, buffer, buffer_size, sample_rate); + } + + public SWIGTYPE_p_StreamingState setupStream(int prealloc_frames, int sample_rate) { + SWIGTYPE_p_p_StreamingState ssp = impl.new_streamingstatep(); + impl.SetupStream(this._msp, prealloc_frames, sample_rate, ssp); + return impl.streamingstatep_value(ssp); + } + + public void feedAudioContent(SWIGTYPE_p_StreamingState ctx, short[] buffer, int buffer_size) { + impl.FeedAudioContent(ctx, buffer, buffer_size); + } + + public String intermediateDecode(SWIGTYPE_p_StreamingState ctx) { + return impl.IntermediateDecode(ctx); + } + + public String finishStream(SWIGTYPE_p_StreamingState ctx) { + return impl.FinishStream(ctx); + } +} diff --git a/native_client/java/app/src/main/res/drawable-v24/ic_launcher_foreground.xml b/native_client/java/app/src/main/res/drawable-v24/ic_launcher_foreground.xml new 
file mode 100644 index 00000000..1f6bb290 --- /dev/null +++ b/native_client/java/app/src/main/res/drawable-v24/ic_launcher_foreground.xml @@ -0,0 +1,34 @@
+<!-- (launcher foreground vector drawable markup not preserved) -->
diff --git a/native_client/java/app/src/main/res/drawable/ic_launcher_background.xml b/native_client/java/app/src/main/res/drawable/ic_launcher_background.xml new file mode 100644 index 00000000..0d025f9b --- /dev/null +++ b/native_client/java/app/src/main/res/drawable/ic_launcher_background.xml @@ -0,0 +1,170 @@
+<!-- (launcher background vector drawable markup not preserved) -->
diff --git a/native_client/java/app/src/main/res/layout/activity_deep_speech.xml b/native_client/java/app/src/main/res/layout/activity_deep_speech.xml new file mode 100644 index 00000000..82fb4fe3 --- /dev/null +++ b/native_client/java/app/src/main/res/layout/activity_deep_speech.xml @@ -0,0 +1,192 @@
+<!-- (DeepSpeechActivity layout markup not preserved) -->