Merge pull request #2362 from lissyx/all-the-docs

All the docs
Commit 5196fa6e9b by lissyx, 2019-09-24 18:23:07 +02:00 (committed by GitHub).
28 changed files with 6033 additions and 63 deletions

.gitignore (3 changes)

@@ -20,3 +20,6 @@
/native_client/python/utils_wrap.cpp
/native_client/javascript/build
/native_client/javascript/deepspeech_wrap.cxx
/doc/.build/
/doc/xml-c/
/doc/xml-java/

.readthedocs.yml (new file, 17 lines)

@@ -0,0 +1,17 @@
# .readthedocs.yml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
# Required
version: 2
# Build documentation in the docs/ directory with Sphinx
sphinx:
builder: html
configuration: doc/conf.py
# Optionally set the version of Python and requirements required to build your docs
python:
version: 3.7
install:
- requirements: taskcluster/docs-requirements.txt

doc/C-API.rst (new file, 44 lines)

@@ -0,0 +1,44 @@
C
=

.. doxygenfunction:: DS_CreateModel
   :project: deepspeech-c

.. doxygenfunction:: DS_FreeModel
   :project: deepspeech-c

.. doxygenfunction:: DS_EnableDecoderWithLM
   :project: deepspeech-c

.. doxygenfunction:: DS_SpeechToText
   :project: deepspeech-c

.. doxygenfunction:: DS_SpeechToTextWithMetadata
   :project: deepspeech-c

.. doxygenfunction:: DS_CreateStream
   :project: deepspeech-c

.. doxygenfunction:: DS_FeedAudioContent
   :project: deepspeech-c

.. doxygenfunction:: DS_IntermediateDecode
   :project: deepspeech-c

.. doxygenfunction:: DS_FinishStream
   :project: deepspeech-c

.. doxygenfunction:: DS_FinishStreamWithMetadata
   :project: deepspeech-c

.. doxygenfunction:: DS_FreeStream
   :project: deepspeech-c

.. doxygenfunction:: DS_FreeMetadata
   :project: deepspeech-c

.. doxygenfunction:: DS_FreeString
   :project: deepspeech-c

.. doxygenfunction:: DS_PrintVersions
   :project: deepspeech-c

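Taken together, the functions above define the lifecycle a client has to follow: create a model, optionally attach a KenLM language model, run batch or streaming inference, then free everything the library handed out. A minimal sketch of that ordering, written against the Python binding added in this PR (it wraps these C entry points one-to-one); the file names, beam width and LM hyperparameters are placeholders, not values from this changeset:

```python
import wave

import numpy as np
from deepspeech import Model

# DS_CreateModel + DS_EnableDecoderWithLM (hypothetical paths and constants)
ds = Model('output_graph.pb', 'alphabet.txt', 500)
ds.enableDecoderWithLM('lm.binary', 'trie', 0.75, 1.85)

# A 16-bit, mono WAV at the model's sample rate is assumed
with wave.open('audio.wav', 'rb') as wav:
    audio = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
    # DS_SpeechToText; the binding releases the returned string itself,
    # so no explicit DS_FreeString call appears at this level
    print(ds.stt(audio, len(audio), wav.getframerate()))
```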
doc/Error-Codes.rst (new file, 5 lines)

@@ -0,0 +1,5 @@
Error codes
===========

.. doxygenenum:: DeepSpeech_Error_Codes
   :project: deepspeech-c

doc/Java-API.rst (new file, 23 lines)

@@ -0,0 +1,23 @@
Java
====

DeepSpeechModel
---------------

.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::DeepSpeechModel
   :project: deepspeech-java
   :members:

Metadata
--------

.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::Metadata
   :project: deepspeech-java
   :members: getItems, getNum_items, getConfidence, getItem

MetadataItem
------------

.. doxygenclass:: org::mozilla::deepspeech::libdeepspeech::MetadataItem
   :project: deepspeech-java
   :members: getCharacter, getTimestep, getStart_time

@@ -12,9 +12,27 @@ BUILDDIR = .build
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
.PHONY: help pip3 npm Makefile doxygen-c doxygen-java
doxygen-c:
	cd ../ && doxygen doc/doxygen-c.conf
doxygen-java:
	cd ../ && doxygen doc/doxygen-java.conf
pip3:
	pip3 install --user -r ../taskcluster/docs-requirements.txt
npm:
	npm install jsdoc@3.6.3
dist: html
	cd $(BUILDDIR)/html/ && zip -r9 ../../html.zip *
dist: html
	@cd $(BUILDDIR)/html/ && zip -r9 ../../html.zip *
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
%: Makefile pip3 npm doxygen-c doxygen-java
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

doc/NodeJS-API.rst (new file, 31 lines)

@@ -0,0 +1,31 @@
JavaScript (NodeJS / ElectronJS)
================================

Model
-----

.. js:autoclass:: Model
   :members:

Module exported methods
-----------------------

.. js:autofunction:: FreeModel

.. js:autofunction:: FreeStream

.. js:autofunction:: FreeMetadata

.. js:autofunction:: printVersions

Metadata
--------

.. js:autoclass:: Metadata
   :members:

MetadataItem
------------

.. js:autoclass:: MetadataItem
   :members:

doc/Python-API.rst (new file, 22 lines)

@@ -0,0 +1,22 @@
Python
======

.. automodule:: native_client.python

Model
-----

.. autoclass:: Model
   :members:

Metadata
--------

.. autoclass:: Metadata
   :members:

MetadataItem
------------

.. autoclass:: MetadataItem
   :members:

doc/Structs.rst (new file, 16 lines)

@@ -0,0 +1,16 @@
Data structures
===============

Metadata
--------

.. doxygenstruct:: Metadata
   :project: deepspeech-c
   :members:

MetadataItem
------------

.. doxygenstruct:: MetadataItem
   :project: deepspeech-c
   :members:

@@ -16,13 +16,44 @@
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# pylint: skip-file
import os
import sys
sys.path.insert(0, os.path.abspath('..'))
sys.path.insert(0, os.path.abspath('../'))
autodoc_mock_imports = ['deepspeech']
read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True'
if read_the_docs_build:
    import subprocess
    subprocess.call('cd ../ && doxygen doc/doxygen-c.conf', shell=True)
    subprocess.call('cd ../ && doxygen doc/doxygen-java.conf', shell=True)
# -- General configuration ------------------------------------------------
import semver
# -- Project information -----------------------------------------------------
project = u'DeepSpeech'
copyright = '2019, Mozilla Corporation'
author = 'Mozilla Corporation'
with open('../VERSION', 'r') as ver:
    v = ver.read().strip()
vv = semver.parse(v)
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
# The short X.Y version
version = '{}.{}'.format(vv['major'], vv['minor'])
# The full version, including alpha/beta/rc tags
release = v
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
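For reference, semver.parse() from the semver==2.8.1 pin in taskcluster/docs-requirements.txt returns a plain dict, which is what the major/minor lookup above relies on; the version string below is a stand-in for the real VERSION file contents:

```python
import semver

vv = semver.parse('0.6.0-alpha.4')  # hypothetical VERSION contents
# -> {'major': 0, 'minor': 6, 'patch': 0, 'prerelease': 'alpha.4', 'build': None}
short_version = '{}.{}'.format(vv['major'], vv['minor'])  # '0.6'
```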
@@ -30,10 +61,23 @@ sys.path.insert(0, os.path.abspath('..'))
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['sphinx.ext.autodoc',
              'sphinx.ext.intersphinx',
              'sphinx.ext.mathjax',
              'sphinx.ext.viewcode']
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.intersphinx',
    'sphinx.ext.mathjax',
    'sphinx.ext.viewcode',
    'sphinx_rtd_theme',
    'sphinx_js',
    'breathe'
]
breathe_projects = {
    "deepspeech-c": "xml-c/",
    "deepspeech-java": "xml-java/",
}
js_source_path = "../native_client/javascript"
# Add any paths that contain templates here, relative to this directory.
templates_path = ['.templates']
@@ -47,20 +91,6 @@ source_suffix = '.rst'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = u'DeepSpeech'
copyright = u'2017, Mozilla Research'
author = u'Mozilla Research'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = u''
# The full version, including alpha/beta/rc tags.
release = u''
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
@@ -79,13 +109,15 @@ pygments_style = 'sphinx'
# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False
add_module_names = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'classic'
html_theme = 'sphinx_rtd_theme'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the

doc/doxygen-c.conf (new file, 2494 lines)

File diff suppressed because it is too large.

doc/doxygen-java.conf (new file, 2494 lines)

File diff suppressed because it is too large.

@@ -8,42 +8,36 @@ Welcome to DeepSpeech's documentation!
.. toctree::
   :maxdepth: 2
   :caption: Contents:
   :caption: Introduction

   DeepSpeech

.. toctree::
   :maxdepth: 2
   :caption: DeepSpeech Model

   Geometry
   ParallelOptimization

.. automodule:: DeepSpeech
   :members:

.. toctree::
   :maxdepth: 2
   :caption: Enums and structs

.. automodule:: util.audio
   :members:

   Error-Codes

.. automodule:: util.text
   :members:

   Structs

.. automodule:: util.gpu
   :members:

.. toctree::
   :maxdepth: 2
   :caption: API Reference

.. automodule:: util.stm
   :members:

   C-API

.. automodule:: util.importers.ldc93s1
   :members:

   NodeJS-API

.. automodule:: util.importers.ted
   :members:

.. automodule:: util.importers.librivox
   :members:

.. automodule:: util.importers.LDC97S62
   :members:

.. automodule:: util.importers.fisher
   :members:

   Java-API
   Python-API

Indices and tables
==================

@@ -8,9 +8,42 @@ extern "C" {
#ifndef SWIG
#if defined _MSC_VER
#define DEEPSPEECH_EXPORT __declspec(dllexport)
#else /*End of _MSC_VER*/
#else
#define DEEPSPEECH_EXPORT __attribute__ ((visibility("default")))
#endif /*End of SWIG*/
#endif /*End of _MSC_VER*/
#else
#define DEEPSPEECH_EXPORT
#endif
@@ -19,20 +19,32 @@ typedef struct ModelState ModelState;
typedef struct StreamingState StreamingState;
// Stores each individual character, along with its timing information
/**
* @brief Stores each individual character, along with its timing information
*/
typedef struct MetadataItem {
/** The character generated for transcription */
char* character;
int timestep; // Position of the character in units of 20ms
float start_time; // Position of the character in seconds
/** Position of the character in units of 20ms */
int timestep;
/** Position of the character in seconds */
float start_time;
} MetadataItem;
// Stores the entire CTC output as an array of character metadata objects
/**
* @brief Stores the entire CTC output as an array of character metadata objects
*/
typedef struct Metadata {
/** List of items */
MetadataItem* items;
/** Size of the list of items */
int num_items;
// Approximated confidence value for this transcription. This is roughly the
// sum of the acoustic model logit values for each timestep/character that
// contributed to the creation of this transcription.
/** Approximated confidence value for this transcription. This is roughly the
* sum of the acoustic model logit values for each timestep/character that
* contributed to the creation of this transcription.
*/
double confidence;
} Metadata;
@@ -90,8 +102,6 @@ void DS_FreeModel(ModelState* ctx);
* @brief Enable decoding using beam scoring with a KenLM language model.
*
* @param aCtx The ModelState pointer for the model being changed.
* @param aAlphabetConfigPath The path to the configuration file specifying
* the alphabet used by the network. See alphabet.h.
* @param aLMPath The path to the language model binary file.
* @param aTriePath The path to the trie file built from the same vocabulary
*                  as the language model binary.

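These Metadata/MetadataItem structs are what DS_SpeechToTextWithMetadata and DS_FinishStreamWithMetadata hand back. A sketch of consuming them through the Python binding from this PR; attribute-style access to items, num_items and confidence is assumed here (the exact accessor shape is generated by SWIG), and the paths are placeholders:

```python
import wave

import numpy as np
from deepspeech import Model

ds = Model('output_graph.pb', 'alphabet.txt', 500)  # hypothetical paths

with wave.open('audio.wav', 'rb') as wav:
    audio = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
    metadata = ds.sttWithMetadata(audio, len(audio), wav.getframerate())

print('confidence: {:.2f}'.format(metadata.confidence))
for i in range(metadata.num_items):
    item = metadata.items[i]  # one MetadataItem per decoded character
    print('{!r} at {:.2f}s (timestep {})'.format(
        item.character, item.start_time, item.timestep))
```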
@@ -21,6 +21,13 @@
%array_functions(struct MetadataItem, metadataItem_array);
%extend struct Metadata {
/**
* Retrieve one MetadataItem element
*
* @param i Array index of the MetadataItem to get
*
* @return The MetadataItem requested or null
*/
MetadataItem getItem(int i) {
return metadataItem_array_getitem(self->items, i);
}

@@ -43,6 +43,11 @@ android {
timeOutInMs 15 * 60 * 1000 // 15 minutes
installOptions "-d","-t"
}
// Avoid scanning libdeepspeech_doc
sourceSets {
main.java.srcDirs = [ 'src/main/java/org/mozilla/deepspeech/libdeepspeech/' ]
}
}
dependencies {

@@ -1,5 +1,8 @@
package org.mozilla.deepspeech.libdeepspeech;
/**
* @brief Exposes a DeepSpeech model in Java
**/
public class DeepSpeechModel {
static {
@@ -11,46 +14,138 @@ public class DeepSpeechModel {
SWIGTYPE_p_p_ModelState _mspp;
SWIGTYPE_p_ModelState _msp;
/**
* @brief An object providing an interface to a trained DeepSpeech model.
*
* @constructor
*
* @param modelPath The path to the frozen model graph.
* @param alphabetPath The path to the configuration file specifying
* the alphabet used by the network. See alphabet.h.
* @param beam_width The beam width used by the decoder. A larger beam
* width generates better results at the cost of decoding
* time.
*/
public DeepSpeechModel(String modelPath, String alphabetPath, int beam_width) {
this._mspp = impl.new_modelstatep();
impl.CreateModel(modelPath, alphabetPath, beam_width, this._mspp);
this._msp = impl.modelstatep_value(this._mspp);
}
/**
* @brief Frees associated resources and destroys model object.
*/
public void freeModel() {
impl.FreeModel(this._msp);
}
/**
* @brief Enable decoding using beam scoring with a KenLM language model.
*
* @param lm The path to the language model binary file.
* @param trie The path to the trie file built from the same vocabulary as the language model binary.
* @param lm_alpha The alpha hyperparameter of the CTC decoder. Language Model weight.
* @param lm_beta The beta hyperparameter of the CTC decoder. Word insertion weight.
*
* @note The return code of the underlying C function is not surfaced by this binding.
*/
public void enableDecoderWithLM(String lm, String trie, float lm_alpha, float lm_beta) {
impl.EnableDecoderWithLM(this._msp, lm, trie, lm_alpha, lm_beta);
}
/**
* @brief Use the DeepSpeech model to perform Speech-To-Text.
*
* @param buffer A 16-bit, mono raw audio signal at the appropriate
* sample rate.
* @param buffer_size The number of samples in the audio signal.
* @param sample_rate The sample-rate of the audio signal.
*
* @return The STT result.
*/
public String stt(short[] buffer, int buffer_size, int sample_rate) {
return impl.SpeechToText(this._msp, buffer, buffer_size, sample_rate);
}
/**
* @brief Use the DeepSpeech model to perform Speech-To-Text and output metadata
* about the results.
*
* @param buffer A 16-bit, mono raw audio signal at the appropriate
* sample rate.
* @param buffer_size The number of samples in the audio signal.
* @param sample_rate The sample-rate of the audio signal.
*
* @return Outputs a Metadata object of individual letters along with their timing information.
*/
public Metadata sttWithMetadata(short[] buffer, int buffer_size, int sample_rate) {
return impl.SpeechToTextWithMetadata(this._msp, buffer, buffer_size, sample_rate);
}
/**
* @brief Create a new streaming inference state. The streaming state returned
* by this function can then be passed to feedAudioContent()
* and finishStream().
*
* @param sample_rate The sample-rate of the audio signal.
* @return An opaque object that represents the streaming state.
*/
public DeepSpeechStreamingState createStream(int sample_rate) {
SWIGTYPE_p_p_StreamingState ssp = impl.new_streamingstatep();
impl.CreateStream(this._msp, sample_rate, ssp);
return new DeepSpeechStreamingState(impl.streamingstatep_value(ssp));
}
/**
* @brief Feed audio samples to an ongoing streaming inference.
*
* @param ctx A streaming state pointer returned by createStream().
* @param buffer An array of 16-bit, mono raw audio samples at the
* appropriate sample rate.
* @param buffer_size The number of samples in @p buffer.
*/
public void feedAudioContent(DeepSpeechStreamingState ctx, short[] buffer, int buffer_size) {
impl.FeedAudioContent(ctx.get(), buffer, buffer_size);
}
/**
* @brief Compute the intermediate decoding of an ongoing streaming inference.
* This is an expensive process as the decoder implementation isn't
* currently capable of streaming, so it always starts from the beginning
* of the audio.
*
* @param ctx A streaming state pointer returned by createStream().
*
* @return The STT intermediate result.
*/
public String intermediateDecode(DeepSpeechStreamingState ctx) {
return impl.IntermediateDecode(ctx.get());
}
/**
* @brief Signal the end of an audio signal to an ongoing streaming
* inference, returns the STT result over the whole audio signal.
*
* @param ctx A streaming state pointer returned by createStream().
*
* @return The STT result.
*
* @note This method will free the state pointer (@p ctx).
*/
public String finishStream(DeepSpeechStreamingState ctx) {
return impl.FinishStream(ctx.get());
}
/**
* @brief Signal the end of an audio signal to an ongoing streaming
* inference, returns per-letter metadata.
*
* @param ctx A streaming state pointer returned by createStream().
*
* @return Outputs a Metadata object of individual letters along with their timing information.
*
* @note This method will free the state pointer (@p ctx).
*/
public Metadata finishStreamWithMetadata(DeepSpeechStreamingState ctx) {
return impl.FinishStreamWithMetadata(ctx.get());
}

@@ -0,0 +1,100 @@
/* ----------------------------------------------------------------------------
* This file was automatically generated by SWIG (http://www.swig.org).
* Version 4.0.2
*
* Do not make changes to this file unless you know what you are doing--modify
* the SWIG interface file instead.
* ----------------------------------------------------------------------------- */
package org.mozilla.deepspeech.libdeepspeech;
/**
* Stores the entire CTC output as an array of character metadata objects
*/
public class Metadata {
private transient long swigCPtr;
protected transient boolean swigCMemOwn;
protected Metadata(long cPtr, boolean cMemoryOwn) {
swigCMemOwn = cMemoryOwn;
swigCPtr = cPtr;
}
protected static long getCPtr(Metadata obj) {
return (obj == null) ? 0 : obj.swigCPtr;
}
@SuppressWarnings("deprecation")
protected void finalize() {
delete();
}
public synchronized void delete() {
if (swigCPtr != 0) {
if (swigCMemOwn) {
swigCMemOwn = false;
implJNI.delete_Metadata(swigCPtr);
}
swigCPtr = 0;
}
}
/**
* List of items
*/
public void setItems(MetadataItem value) {
implJNI.Metadata_items_set(swigCPtr, this, MetadataItem.getCPtr(value), value);
}
/**
* List of items
*/
public MetadataItem getItems() {
long cPtr = implJNI.Metadata_items_get(swigCPtr, this);
return (cPtr == 0) ? null : new MetadataItem(cPtr, false);
}
/**
* Size of the list of items
*/
public void setNum_items(int value) {
implJNI.Metadata_num_items_set(swigCPtr, this, value);
}
/**
* Size of the list of items
*/
public int getNum_items() {
return implJNI.Metadata_num_items_get(swigCPtr, this);
}
/**
* Approximated confidence value for this transcription. This is roughly the<br>
* sum of the acoustic model logit values for each timestep/character that<br>
* contributed to the creation of this transcription.
*/
public void setConfidence(double value) {
implJNI.Metadata_confidence_set(swigCPtr, this, value);
}
/**
* Approximated confidence value for this transcription. This is roughly the<br>
* sum of the acoustic model logit values for each timestep/character that<br>
* contributed to the creation of this transcription.
*/
public double getConfidence() {
return implJNI.Metadata_confidence_get(swigCPtr, this);
}
/**
* Retrieve one MetadataItem element<br>
* <br>
* @param i Array index of the MetadataItem to get<br>
* <br>
* @return The MetadataItem requested or null
*/
public MetadataItem getItem(int i) {
return new MetadataItem(implJNI.Metadata_getItem(swigCPtr, this, i), true);
}
}

@@ -0,0 +1,79 @@
/* ----------------------------------------------------------------------------
* This file was automatically generated by SWIG (http://www.swig.org).
* Version 4.0.2
*
* Do not make changes to this file unless you know what you are doing--modify
* the SWIG interface file instead.
* ----------------------------------------------------------------------------- */
package org.mozilla.deepspeech.libdeepspeech;
/**
* Stores each individual character, along with its timing information
*/
public class MetadataItem {
private transient long swigCPtr;
protected transient boolean swigCMemOwn;
protected MetadataItem(long cPtr, boolean cMemoryOwn) {
swigCMemOwn = cMemoryOwn;
swigCPtr = cPtr;
}
protected static long getCPtr(MetadataItem obj) {
return (obj == null) ? 0 : obj.swigCPtr;
}
public synchronized void delete() {
if (swigCPtr != 0) {
if (swigCMemOwn) {
swigCMemOwn = false;
throw new UnsupportedOperationException("C++ destructor does not have public access");
}
swigCPtr = 0;
}
}
/**
* The character generated for transcription
*/
public void setCharacter(String value) {
implJNI.MetadataItem_character_set(swigCPtr, this, value);
}
/**
* The character generated for transcription
*/
public String getCharacter() {
return implJNI.MetadataItem_character_get(swigCPtr, this);
}
/**
* Position of the character in units of 20ms
*/
public void setTimestep(int value) {
implJNI.MetadataItem_timestep_set(swigCPtr, this, value);
}
/**
* Position of the character in units of 20ms
*/
public int getTimestep() {
return implJNI.MetadataItem_timestep_get(swigCPtr, this);
}
/**
* Position of the character in seconds
*/
public void setStart_time(float value) {
implJNI.MetadataItem_start_time_set(swigCPtr, this, value);
}
/**
* Position of the character in seconds
*/
public float getStart_time() {
return implJNI.MetadataItem_start_time_get(swigCPtr, this);
}
}

@@ -0,0 +1,9 @@
Javadoc for Sphinx
==================
This code is kept only as a reference for documentation generation.
To update it, build SWIG (at least 4.0) and run the following from native_client/java:
```
swig -c++ -java -doxygen -package org.mozilla.deepspeech.libdeepspeech -outdir libdeepspeech/src/main/java/org/mozilla/deepspeech/libdeepspeech_doc -o jni/deepspeech_wrap.cpp jni/deepspeech.i
```

@@ -20,6 +20,18 @@ if (process.platform === 'win32') {
process.env['PATH'] = oldPath;
}
/**
* @class
* An object providing an interface to a trained DeepSpeech model.
*
* @param {string} aModelPath The path to the frozen model graph.
* @param {string} aAlphabetConfigPath The path to the configuration file specifying the alphabet used by the network. See alphabet.h.
* @param {number} aBeamWidth The beam width used by the decoder. A larger beam width generates better results at the cost of decoding time.
*
* @throws on error
*/
function Model() {
this._impl = null;
@@ -33,21 +45,59 @@ function Model() {
this._impl = impl;
}
/**
* Enable decoding using beam scoring with a KenLM language model.
*
* @param {string} aLMPath The path to the language model binary file.
* @param {string} aTriePath The path to the trie file built from the same vocabulary as the language model binary.
* @param {float} aLMAlpha The alpha hyperparameter of the CTC decoder. Language Model weight.
* @param {float} aLMBeta The beta hyperparameter of the CTC decoder. Word insertion weight.
*
* @return {number} Zero on success, non-zero on failure (invalid arguments).
*/
Model.prototype.enableDecoderWithLM = function() {
const args = [this._impl].concat(Array.prototype.slice.call(arguments));
return binding.EnableDecoderWithLM.apply(null, args);
}
/**
* Use the DeepSpeech model to perform Speech-To-Text.
*
* @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate.
* @param {number} aBufferSize The number of samples in the audio signal.
* @param {number} aSampleRate The sample-rate of the audio signal.
*
* @return {string} The STT result. Returns undefined on error.
*/
Model.prototype.stt = function() {
const args = [this._impl].concat(Array.prototype.slice.call(arguments));
return binding.SpeechToText.apply(null, args);
}
/**
* Use the DeepSpeech model to perform Speech-To-Text and output metadata
* about the results.
*
* @param {object} aBuffer A 16-bit, mono raw audio signal at the appropriate sample rate.
* @param {number} aBufferSize The number of samples in the audio signal.
* @param {number} aSampleRate The sample-rate of the audio signal.
*
* @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`. Returns undefined on error.
*/
Model.prototype.sttWithMetadata = function() {
const args = [this._impl].concat(Array.prototype.slice.call(arguments));
return binding.SpeechToTextWithMetadata.apply(null, args);
}
/**
* Create a new streaming inference state. The streaming state returned by this function can then be passed to :js:func:`Model.feedAudioContent` and :js:func:`Model.finishStream`.
*
* @param {number} aSampleRate The sample-rate of the audio signal.
* @return {object} An opaque object that represents the streaming state.
*
* @throws on error
*/
Model.prototype.createStream = function() {
const args = [this._impl].concat(Array.prototype.slice.call(arguments));
const rets = binding.CreateStream.apply(null, args);
@@ -59,30 +109,159 @@ Model.prototype.createStream = function() {
return ctx;
}
/**
* Feed audio samples to an ongoing streaming inference.
*
* @param {object} aSctx A streaming state returned by :js:func:`Model.createStream`.
* @param {buffer} aBuffer An array of 16-bit, mono raw audio samples at the
* appropriate sample rate.
* @param {number} aBufferSize The number of samples in aBuffer.
*/
Model.prototype.feedAudioContent = function() {
binding.FeedAudioContent.apply(null, arguments);
}
/**
* Compute the intermediate decoding of an ongoing streaming inference. This is an expensive process as the decoder implementation isn't currently capable of streaming, so it always starts from the beginning of the audio.
*
* @param {object} aSctx A streaming state returned by :js:func:`Model.createStream`.
*
* @return {string} The STT intermediate result.
*/
Model.prototype.intermediateDecode = function() {
return binding.IntermediateDecode.apply(null, arguments);
}
/**
* Signal the end of an audio signal to an ongoing streaming inference, returns the STT result over the whole audio signal.
*
* @param {object} aSctx A streaming state returned by :js:func:`Model.createStream`.
*
* @return {string} The STT result.
*
* This method will free the state (aSctx).
*/
Model.prototype.finishStream = function() {
return binding.FinishStream.apply(null, arguments);
}
/**
* Signal the end of an audio signal to an ongoing streaming inference, returns per-letter metadata.
*
* @param {object} aSctx A streaming state pointer returned by :js:func:`Model.createStream`.
*
* @return {object} Outputs a :js:func:`Metadata` struct of individual letters along with their timing information. The user is responsible for freeing Metadata by calling :js:func:`FreeMetadata`.
*
* This method will free the state pointer (aSctx).
*/
Model.prototype.finishStreamWithMetadata = function() {
return binding.FinishStreamWithMetadata.apply(null, arguments);
}
/**
* Frees associated resources and destroys model object.
*
* @param {object} model A model pointer returned by :js:func:`Model`
*
*/
function FreeModel(model) {
return binding.FreeModel(model._impl);
}
/**
* Free memory allocated for metadata information.
*
* @param {object} metadata Object containing metadata as returned by :js:func:`Model.sttWithMetadata` or :js:func:`Model.finishStreamWithMetadata`
*/
function FreeMetadata(metadata) {
return binding.FreeMetadata(metadata);
}
/**
* Destroy a streaming state without decoding the computed logits. This
* can be used if you no longer need the result of an ongoing streaming
* inference and don't want to perform a costly decode operation.
*
* @param {Object} stream A streaming state pointer returned by :js:func:`Model.createStream`.
*/
function FreeStream(stream) {
return binding.FreeStream(stream);
}
/**
* Print version of this library and of the linked TensorFlow library on standard output.
*/
function printVersions() {
return binding.PrintVersions();
}
//// Metadata and MetadataItem are here only for documentation purposes
/**
* @class
*
* Stores each individual character, along with its timing information
*/
function MetadataItem() {}
/**
* The character generated for transcription
*
* @return {string} The character generated
*/
MetadataItem.prototype.character = function() {}
/**
* Position of the character in units of 20ms
*
* @return {int} The position of the character
*/
MetadataItem.prototype.timestep = function() {};
/**
* Position of the character in seconds
*
* @return {float} The position of the character
*/
MetadataItem.prototype.start_time = function() {};
/**
* @class
*
* Stores the entire CTC output as an array of character metadata objects
*/
function Metadata () {}
/**
* List of items
*
* @return {array} List of :js:func:`MetadataItem`
*/
Metadata.prototype.items = function() {}
/**
* Size of the list of items
*
* @return {int} Number of items
*/
Metadata.prototype.num_items = function() {}
/**
* Approximated confidence value for this transcription. This is roughly the
* sum of the acoustic model logit values for each timestep/character that
* contributed to the creation of this transcription.
*
* @return {float} Confidence value
*/
Metadata.prototype.confidence = function() {}
module.exports = {
Model: Model,
printVersions: binding.PrintVersions,
Metadata: Metadata,
MetadataItem: MetadataItem,
printVersions: printVersions,
FreeModel: FreeModel,
FreeStream: binding.FreeStream,
FreeMetadata: binding.FreeMetadata
FreeStream: FreeStream,
FreeMetadata: FreeMetadata
};

@@ -18,7 +18,19 @@ from deepspeech.impl import PrintVersions as printVersions
from deepspeech.impl import FreeStream as freeStream
class Model(object):
    def __init__(self, *args, **kwargs):
    """
    Class holding a DeepSpeech model
    :param aModelPath: Path to model file to load
    :type aModelPath: str
    :param aAlphabetConfigPath: Path to alphabet file to load
    :type aAlphabetConfigPath: str
    :param aBeamWidth: Decoder beam width
    :type aBeamWidth: int
    """
    def __init__(self, *args, **kwargs):
        # make sure the attribute is there if CreateModel fails
        self._impl = None
@@ -33,29 +45,198 @@ class Model(object):
        self._impl = None

    def enableDecoderWithLM(self, *args, **kwargs):
        """
        Enable decoding using beam scoring with a KenLM language model.
        :param aLMPath: The path to the language model binary file.
        :type aLMPath: str
        :param aTriePath: The path to the trie file built from the same vocabulary as the language model binary.
        :type aTriePath: str
        :param aLMAlpha: The alpha hyperparameter of the CTC decoder. Language Model weight.
        :type aLMAlpha: float
        :param aLMBeta: The beta hyperparameter of the CTC decoder. Word insertion weight.
        :type aLMBeta: float
        :return: Zero on success, non-zero on failure (invalid arguments).
        :rtype: int
        """
        return deepspeech.impl.EnableDecoderWithLM(self._impl, *args, **kwargs)
    def stt(self, *args, **kwargs):
        """
        Use the DeepSpeech model to perform Speech-To-Text.
        :param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate.
        :type aBuffer: int array
        :param aBufferSize: The number of samples in the audio signal.
        :type aBufferSize: int
        :param aSampleRate: The sample-rate of the audio signal.
        :type aSampleRate: int
        :return: The STT result.
        :rtype: str
        """
        return deepspeech.impl.SpeechToText(self._impl, *args, **kwargs)
    def sttWithMetadata(self, *args, **kwargs):
        """
        Use the DeepSpeech model to perform Speech-To-Text and output metadata about the results.
        :param aBuffer: A 16-bit, mono raw audio signal at the appropriate sample rate.
        :type aBuffer: int array
        :param aBufferSize: The number of samples in the audio signal.
        :type aBufferSize: int
        :param aSampleRate: The sample-rate of the audio signal.
        :type aSampleRate: int
        :return: Outputs a struct of individual letters along with their timing information.
        :rtype: :func:`Metadata`
        """
        return deepspeech.impl.SpeechToTextWithMetadata(self._impl, *args, **kwargs)
    def createStream(self, sample_rate=16000):
        """
        Create a new streaming inference state. The streaming state returned
        by this function can then be passed to :func:`feedAudioContent()` and :func:`finishStream()`.
        :param sample_rate: The sample-rate of the audio signal.
        :type sample_rate: int
        :return: Object holding the stream
        :raises RuntimeError: on error
        """
        status, ctx = deepspeech.impl.CreateStream(self._impl,
                                                   aSampleRate=sample_rate)
        if status != 0:
            raise RuntimeError("CreateStream failed with error code {}".format(status))
        return ctx
    # pylint: disable=no-self-use
    def feedAudioContent(self, *args, **kwargs):
        """
        Feed audio samples to an ongoing streaming inference.
        :param aSctx: A streaming state pointer returned by :func:`createStream()`.
        :type aSctx: object
        :param aBuffer: An array of 16-bit, mono raw audio samples at the appropriate sample rate.
        :type aBuffer: int array
        :param aBufferSize: The number of samples in aBuffer.
        :type aBufferSize: int
        """
        deepspeech.impl.FeedAudioContent(*args, **kwargs)
    # pylint: disable=no-self-use
    def intermediateDecode(self, *args, **kwargs):
        """
        Compute the intermediate decoding of an ongoing streaming inference.
        This is an expensive process as the decoder implementation isn't
        currently capable of streaming, so it always starts from the beginning
        of the audio.
        :param aSctx: A streaming state pointer returned by :func:`createStream()`.
        :type aSctx: object
        :return: The STT intermediate result.
        :rtype: str
        """
        return deepspeech.impl.IntermediateDecode(*args, **kwargs)
    # pylint: disable=no-self-use
    def finishStream(self, *args, **kwargs):
        """
        Signal the end of an audio signal to an ongoing streaming
        inference, returns the STT result over the whole audio signal.
        :param aSctx: A streaming state pointer returned by :func:`createStream()`.
        :type aSctx: object
        :return: The STT result.
        :rtype: str
        """
        return deepspeech.impl.FinishStream(*args, **kwargs)
    # pylint: disable=no-self-use
    def finishStreamWithMetadata(self, *args, **kwargs):
        """
        Signal the end of an audio signal to an ongoing streaming
        inference, returns per-letter metadata.
        :param aSctx: A streaming state pointer returned by :func:`createStream()`.
        :type aSctx: object
        :return: Outputs a struct of individual letters along with their timing information.
        :rtype: :func:`Metadata`
        """
        return deepspeech.impl.FinishStreamWithMetadata(*args, **kwargs)
# This is only for documentation purposes
# Metadata and MetadataItem should be in sync with native_client/deepspeech.h
class MetadataItem(object):
    """
    Stores each individual character, along with its timing information
    """

    def character(self):
        """
        The character generated for transcription
        """
        # pylint: disable=unnecessary-pass
        pass

    def timestep(self):
        """
        Position of the character in units of 20ms
        """
        # pylint: disable=unnecessary-pass
        pass

    def start_time(self):
        """
        Position of the character in seconds
        """
        # pylint: disable=unnecessary-pass
        pass
class Metadata(object):
    """
    Stores the entire CTC output as an array of character metadata objects
    """

    def items(self):
        """
        List of items
        :return: A list of :func:`MetadataItem` elements
        :rtype: list
        """
        # pylint: disable=unnecessary-pass
        pass

    def num_items(self):
        """
        Size of the list of items
        :return: Size of the list of items
        :rtype: int
        """
        # pylint: disable=unnecessary-pass
        pass

    def confidence(self):
        """
        Approximated confidence value for this transcription. This is roughly the
        sum of the acoustic model logit values for each timestep/character that
        contributed to the creation of this transcription.
        """
        # pylint: disable=unnecessary-pass
        pass

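Putting the streaming half of the class above together: a sketch that feeds a WAV file in chunks and polls the intermediate decode, using the same hypothetical file names as the earlier examples. Note the docstring's warning that intermediateDecode() is expensive; a real client would call it far less often than once per chunk:

```python
import wave

import numpy as np
from deepspeech import Model

ds = Model('output_graph.pb', 'alphabet.txt', 500)  # hypothetical paths

stream = ds.createStream(sample_rate=16000)
with wave.open('audio.wav', 'rb') as wav:  # 16-bit mono at 16 kHz assumed
    while True:
        chunk = wav.readframes(1024)
        if not chunk:
            break
        data = np.frombuffer(chunk, np.int16)
        ds.feedAudioContent(stream, data, len(data))
        print('partial:', ds.intermediateDecode(stream))  # demo only: expensive

print('final:', ds.finishStream(stream))  # also frees the stream state
```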
@@ -6,6 +6,8 @@ python:
  brew:
    setup: 'install_local_homebrew "python-ds-test" && install_pkg_local_homebrew "sox" && install_pkg_local_homebrew "readline" && install_pkg_local_homebrew "openssl" && install_pkg_local_homebrew "pkg-config"'
    env: 'export EXTRA_ENV="PATH=$TASKCLUSTER_TASK_DIR/python-ds-test.brew/bin/:$PATH"'
  packages_docs_bionic:
    apt: 'python3 python3-pip zip doxygen'
electronjs:
  packages_xenial:
    apt: 'libatk1.0-0 libatk-bridge2.0-0 libcairo2 libcups2 libdbus-1-3 libgdk-pixbuf2.0-0 libgtk-3-0 libnspr4 libnss3 libpango-1.0-0 libpangocairo-1.0-0 libx11-xcb1 libxcomposite1 libxcursor1 libxdamage1 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 xvfb'
@@ -25,6 +27,10 @@ nodejs:
    prep_10: 'echo "deb http://deb.nodesource.com/node_10.x xenial main" > /etc/apt/sources.list.d/nodesource.list && wget -qO- https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add -'
    prep_11: 'echo "deb http://deb.nodesource.com/node_11.x xenial main" > /etc/apt/sources.list.d/nodesource.list && wget -qO- https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add -'
    prep_12: 'echo "deb http://deb.nodesource.com/node_12.x xenial main" > /etc/apt/sources.list.d/nodesource.list && wget -qO- https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add -'
  packages_docs_bionic:
    apt: 'nodejs'
    apt_pinning: '(echo "Package: nodejs" && echo "Pin: origin deb.nodesource.com" && echo "Pin-Priority: 999") > /etc/apt/preferences'
    prep_12: 'echo "deb http://deb.nodesource.com/node_12.x bionic main" > /etc/apt/sources.list.d/nodesource.list && wget -qO- https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add -'
  packages_buster:
    apt: 'nodejs sox'
    apt_pinning: '(echo "Package: nodejs" && echo "Pin: origin deb.nodesource.com" && echo "Pin-Priority: 999") > /etc/apt/preferences'

taskcluster/docs-build.sh (new file, 10 lines)

@@ -0,0 +1,10 @@
#!/bin/bash
set -xe
THIS=$(dirname "$0")
pushd ${THIS}/../
export PATH=$HOME/.local/bin:${THIS}/../doc/node_modules/.bin/:$PATH
make -C doc/ html dist
popd

@@ -0,0 +1,9 @@
#!/bin/bash
set -xe
source $(dirname "$0")/tc-tests-utils.sh
mkdir -p ${TASKCLUSTER_ARTIFACTS} || true
cp ${DS_DSDIR}/doc/html.zip ${TASKCLUSTER_ARTIFACTS}/doc-html.zip

@@ -0,0 +1,5 @@
breathe==4.13.1
semver==2.8.1
sphinx==2.2.0
sphinx-js==2.8
sphinx-rtd-theme==0.4.3

taskcluster/docs.tyml (new file, 64 lines)

@@ -0,0 +1,64 @@
$if: 'event.event in build.allowed'
then:
  taskId: ${taskcluster.taskId}
  provisionerId: ${taskcluster.docker.provisionerId}
  workerType: ${taskcluster.docker.workerType}
  taskGroupId: ${taskcluster.taskGroupId}
  schedulerId: ${taskcluster.schedulerId}
  dependencies:
    $map: { $eval: build.dependencies }
    each(b):
      $eval: as_slugid(b)
  created: { $fromNow: '0 sec' }
  deadline: { $fromNow: '1 day' }
  expires:
    $if: '(event.event == "push") || (event.event == "tag")'
    then: { $fromNow: '6 months' }
    else: { $fromNow: '7 days' }
  extra:
    nc_asset_name: { $eval: build.nc_asset_name }
    github:
      $if: '(event.event == "push") || (event.event == "tag")'
      then: { $eval: taskcluster.github_events.merge }
      else: { $eval: taskcluster.github_events.pull_request }
  routes:
    $if: '(event.event == "push") || (event.event == "tag")'
    then:
      { $eval: build.routes }
  payload:
    maxRunTime: { $eval: to_int(build.maxRunTime) }
    image: "ubuntu:18.04"
    command:
      - "/bin/bash"
      - "--login"
      - "-cxe"
      - $let:
          extraSystemSetup: { $eval: strip(str(build.system_setup)) }
          extraSystemConfig: { $eval: strip(str(build.system_config)) }
        in: >
          apt-get -qq update && apt-get -qq -y install git wget gnupg sudo && ${extraSystemSetup} &&
          adduser --system --home ${system.homedir.linux} ${system.username} &&
          cd ${system.homedir.linux}/ &&
          echo -e "#!/bin/bash\nset -xe\n env && id && git clone --quiet ${event.head.repo.url} ~/DeepSpeech/ds/ && cd ~/DeepSpeech/ds && git checkout --quiet ${event.head.sha}" > /tmp/clone.sh && chmod +x /tmp/clone.sh &&
          sudo -H -u ${system.username} /bin/bash /tmp/clone.sh && ${extraSystemConfig} &&
          sudo -H -u ${system.username} --preserve-env /bin/bash ${system.homedir.linux}/DeepSpeech/ds/${build.scripts.build} &&
          sudo -H -u ${system.username} /bin/bash ${system.homedir.linux}/DeepSpeech/ds/${build.scripts.package}
    artifacts:
      "public":
        type: "directory"
        path: "/tmp/artifacts/"
        expires:
          $if: '(event.event == "push") || (event.event == "tag")'
          then: { $fromNow: '6 months' }
          else: { $fromNow: '7 days' }
  metadata:
    name: ${build.metadata.name}
    description: ${build.metadata.description}
    owner: ${event.head.user.email}
    source: ${event.head.repo.url}

taskcluster/docs.yml (new file, 18 lines)

@@ -0,0 +1,18 @@
build:
  template_file: docs.tyml
  routes:
    - "index.project.deepspeech.deepspeech.native_client.${event.head.branchortag}.docs"
    - "index.project.deepspeech.deepspeech.native_client.${event.head.branchortag}.${event.head.sha}.docs"
    - "index.project.deepspeech.deepspeech.native_client.docs.${event.head.sha}"
    - "notify.irc-channel.${notifications.irc}.on-exception"
    - "notify.irc-channel.${notifications.irc}.on-failed"
  system_setup:
    >
      ${nodejs.packages_docs_bionic.prep_12} && ${nodejs.packages_docs_bionic.apt_pinning}
      && apt-get -qq update && apt-get -qq -y install ${nodejs.packages_docs_bionic.apt} ${python.packages_docs_bionic.apt}
  scripts:
    build: "taskcluster/docs-build.sh"
    package: "taskcluster/docs-package.sh"
  metadata:
    name: "DeepSpeech API Documentation"
    description: "Building DeepSpeech API Documentation"