Address review comments

This commit is contained in:
Reuben Morais 2020-07-20 11:16:57 +02:00
parent de7a249fcd
commit 5e5db17371
7 changed files with 0 additions and 469 deletions

View File

@ -1,357 +0,0 @@
#ifndef DEEPSPEECH_H
#define DEEPSPEECH_H
#ifdef __cplusplus
extern "C" {
#endif
/*
* DEEPSPEECH_EXPORT is placed in front of every public API declaration in this
* header. It expands to:
*   - __declspec(dllexport) when SWIG is not defined and _MSC_VER is defined;
*   - __attribute__ ((visibility("default"))) when SWIG is not defined and
*     _MSC_VER is not defined;
*   - nothing when SWIG is defined.
* The macro is #undef'd again at the end of this header.
*/
#ifndef SWIG
#if defined _MSC_VER
#define DEEPSPEECH_EXPORT __declspec(dllexport)
#else
#define DEEPSPEECH_EXPORT __attribute__ ((visibility("default")))
#endif /*End of _MSC_VER*/
#else
#define DEEPSPEECH_EXPORT
#endif
/*
* Handle types used throughout the API. Both structs are only forward-declared
* in this header (their members are not visible here), so callers always hold
* them by pointer.
*/
/* Model handle: obtained from DS_CreateModel(), destroyed by DS_FreeModel(). */
typedef struct ModelState ModelState;
/* Streaming handle: obtained from DS_CreateStream(); freed by DS_FinishStream(),
* DS_FinishStreamWithMetadata() or DS_FreeStream(). */
typedef struct StreamingState StreamingState;
/**
* @brief Stores text of an individual token, along with its timing information.
*
* Every member is const-qualified, so callers can read but not assign the
* fields. Instances are reached through CandidateTranscript::tokens.
*/
typedef struct TokenMetadata {
/** The text corresponding to this token (C string). */
const char* const text;
/** Position of the token, counted in units of 20ms. */
const unsigned int timestep;
/** Position of the token, expressed in seconds. */
const float start_time;
} TokenMetadata;
/**
* @brief A single transcript computed by the model, including a confidence
* value and the metadata for its constituent tokens.
*
* Every member is const-qualified, so callers can read but not assign the
* fields. Instances are reached through Metadata::transcripts.
*/
typedef struct CandidateTranscript {
/** Array of TokenMetadata objects; its length is given by num_tokens. */
const TokenMetadata* const tokens;
/** Number of elements in the tokens array. */
const unsigned int num_tokens;
/** Approximated confidence value for this transcript. This is roughly the
* sum of the acoustic model logit values for each timestep/character that
* contributed to the creation of this transcript.
*/
const double confidence;
} CandidateTranscript;
/**
* @brief An array of CandidateTranscript objects computed by the model.
*
* Returned by the *WithMetadata functions of this API
* (DS_SpeechToTextWithMetadata(), DS_IntermediateDecodeWithMetadata(),
* DS_FinishStreamWithMetadata()). The caller releases it with
* DS_FreeMetadata(). Every member is const-qualified, so callers can read but
* not assign the fields.
*/
typedef struct Metadata {
/** Array of CandidateTranscript objects; its length is given by num_transcripts. */
const CandidateTranscript* const transcripts;
/** Number of elements in the transcripts array. */
const unsigned int num_transcripts;
} Metadata;
/*
* Central list of the library's error codes.
*
* DS_FOR_EACH_ERROR(APPLY) expands to one APPLY(NAME, VALUE, DESC) invocation
* per entry, where NAME is the error identifier, VALUE its numeric code and
* DESC a human-readable description string. Pass a three-argument macro as
* APPLY to generate code from the list; the DeepSpeech_Error_Codes enum that
* follows is produced this way. Entries are not separated by commas or
* semicolons, so APPLY must supply any separator it needs.
*/
// sphinx-doc: error_code_listing_start
#define DS_FOR_EACH_ERROR(APPLY) \
APPLY(DS_ERR_OK, 0x0000, "No error.") \
APPLY(DS_ERR_NO_MODEL, 0x1000, "Missing model information.") \
APPLY(DS_ERR_INVALID_ALPHABET, 0x2000, "Invalid alphabet embedded in model. (Data corruption?)") \
APPLY(DS_ERR_INVALID_SHAPE, 0x2001, "Invalid model shape.") \
APPLY(DS_ERR_INVALID_SCORER, 0x2002, "Invalid scorer file.") \
APPLY(DS_ERR_MODEL_INCOMPATIBLE, 0x2003, "Incompatible model.") \
APPLY(DS_ERR_SCORER_NOT_ENABLED, 0x2004, "External scorer is not enabled.") \
APPLY(DS_ERR_SCORER_UNREADABLE, 0x2005, "Could not read scorer file.") \
APPLY(DS_ERR_SCORER_INVALID_LM, 0x2006, "Could not recognize language model header in scorer.") \
APPLY(DS_ERR_SCORER_NO_TRIE, 0x2007, "Reached end of scorer file before loading vocabulary trie.") \
APPLY(DS_ERR_SCORER_INVALID_TRIE, 0x2008, "Invalid magic in trie header.") \
APPLY(DS_ERR_SCORER_VERSION_MISMATCH, 0x2009, "Scorer file version does not match expected version.") \
APPLY(DS_ERR_FAIL_INIT_MMAP, 0x3000, "Failed to initialize memory mapped model.") \
APPLY(DS_ERR_FAIL_INIT_SESS, 0x3001, "Failed to initialize the session.") \
APPLY(DS_ERR_FAIL_INTERPRETER, 0x3002, "Interpreter failed.") \
APPLY(DS_ERR_FAIL_RUN_SESS, 0x3003, "Failed to run the session.") \
APPLY(DS_ERR_FAIL_CREATE_STREAM, 0x3004, "Error creating the stream.") \
APPLY(DS_ERR_FAIL_READ_PROTOBUF, 0x3005, "Error reading the proto buffer model file.") \
APPLY(DS_ERR_FAIL_CREATE_SESS, 0x3006, "Failed to create session.") \
APPLY(DS_ERR_FAIL_CREATE_MODEL, 0x3007, "Could not allocate model state.")
// sphinx-doc: error_code_listing_end
/**
* @brief Error codes used by the DeepSpeech API.
*
* The enumerators are generated from DS_FOR_EACH_ERROR: every
* (NAME, VALUE, DESC) entry of that list becomes the enumerator NAME = VALUE.
* The description string is not used here.
*/
enum DeepSpeech_Error_Codes {
/* Temporary helper: turns one list entry into one enumerator. */
#define DS_ERROR_AS_ENUMERATOR(NAME, VALUE, DESC) NAME = VALUE,
DS_FOR_EACH_ERROR(DS_ERROR_AS_ENUMERATOR)
#undef DS_ERROR_AS_ENUMERATOR
};
/**
* @brief Create an object providing an interface to a trained DeepSpeech model.
*
* @param aModelPath The path to the frozen model graph.
* @param[out] retval Receives the ModelState pointer for the created model.
*                    Destroy it with {@link DS_FreeModel()} when no longer
*                    needed.
*
* @return Zero on success, non-zero on failure.
*/
DEEPSPEECH_EXPORT
int DS_CreateModel(const char* aModelPath,
ModelState** retval);
/**
* @brief Get beam width value used by the model. If {@link DS_SetModelBeamWidth}
*        was not called before, will return the default value loaded from the
*        model file.
*
* @param aCtx A ModelState pointer created with {@link DS_CreateModel}. Taken
*             as pointer-to-const.
*
* @return Beam width value used by the model.
*/
DEEPSPEECH_EXPORT
unsigned int DS_GetModelBeamWidth(const ModelState* aCtx);
/**
* @brief Set beam width value used by the model. The value set here is what
*        {@link DS_GetModelBeamWidth} reports afterwards, replacing the default
*        loaded from the model file.
*
* @param aCtx A ModelState pointer created with {@link DS_CreateModel}.
* @param aBeamWidth The beam width used by the model. A larger beam width value
*                   generates better results at the cost of decoding time.
*
* @return Zero on success, non-zero on failure.
*/
DEEPSPEECH_EXPORT
int DS_SetModelBeamWidth(ModelState* aCtx,
unsigned int aBeamWidth);
/**
* @brief Return the sample rate expected by a model. Audio passed to the
*        speech-to-text and streaming functions of this API is documented as
*        needing to be at this rate.
*
* @param aCtx A ModelState pointer created with {@link DS_CreateModel}. Taken
*             as pointer-to-const.
*
* @return Sample rate expected by the model for its input.
*/
DEEPSPEECH_EXPORT
int DS_GetModelSampleRate(const ModelState* aCtx);
/**
* @brief Frees associated resources and destroys model object.
*
* @param ctx The ModelState pointer to destroy, as created with
*            {@link DS_CreateModel}. Since the model object is destroyed, the
*            pointer must not be used after this call.
*/
DEEPSPEECH_EXPORT
void DS_FreeModel(ModelState* ctx);
/**
* @brief Enable decoding using an external scorer. Use
*        {@link DS_DisableExternalScorer()} to turn it off again.
*
* @param aCtx The ModelState pointer for the model being changed.
* @param aScorerPath The path to the external scorer file.
*
* @return Zero on success, non-zero on failure (invalid arguments).
*/
DEEPSPEECH_EXPORT
int DS_EnableExternalScorer(ModelState* aCtx,
const char* aScorerPath);
/**
* @brief Disable decoding using an external scorer (the counterpart of
*        {@link DS_EnableExternalScorer()}).
*
* @param aCtx The ModelState pointer for the model being changed.
*
* @return Zero on success, non-zero on failure.
*/
DEEPSPEECH_EXPORT
int DS_DisableExternalScorer(ModelState* aCtx);
/**
* @brief Set hyperparameters alpha and beta of the external scorer.
*
* @param aCtx The ModelState pointer for the model being changed.
* @param aAlpha The alpha hyperparameter of the decoder. Language model weight.
* @param aBeta The beta hyperparameter of the decoder. Word insertion weight.
*
* @return Zero on success, non-zero on failure.
*/
DEEPSPEECH_EXPORT
int DS_SetScorerAlphaBeta(ModelState* aCtx,
float aAlpha,
float aBeta);
/**
* @brief Use the DeepSpeech model to convert speech to text.
*
* @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
*                sample rate (matching what the model was trained on; see
*                {@link DS_GetModelSampleRate()}).
* @param aBufferSize The number of samples (not bytes) in the audio signal.
*
* @return The STT result. The user is responsible for freeing the string using
*         {@link DS_FreeString()}. Returns NULL on error.
*/
DEEPSPEECH_EXPORT
char* DS_SpeechToText(ModelState* aCtx,
const short* aBuffer,
unsigned int aBufferSize);
/**
* @brief Use the DeepSpeech model to convert speech to text and output results
*        including metadata.
*
* @param aCtx The ModelState pointer for the model to use.
* @param aBuffer A 16-bit, mono raw audio signal at the appropriate
*                sample rate (matching what the model was trained on; see
*                {@link DS_GetModelSampleRate()}).
* @param aBufferSize The number of samples (not bytes) in the audio signal.
* @param aNumResults The maximum number of CandidateTranscript structs to
*                    return. Returned value might be smaller than this; the
*                    actual count is reported in Metadata::num_transcripts.
*
* @return Metadata struct containing multiple CandidateTranscript structs. Each
*         transcript has per-token metadata including timing information. The
*         user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
*         Returns NULL on error.
*/
DEEPSPEECH_EXPORT
Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx,
const short* aBuffer,
unsigned int aBufferSize,
unsigned int aNumResults);
/**
* @brief Create a new streaming inference state. The streaming state returned
*        by this function can then be passed to {@link DS_FeedAudioContent()}
*        and {@link DS_FinishStream()}.
*
* @param aCtx The ModelState pointer for the model to use.
* @param[out] retval an opaque pointer that represents the streaming state. Can
*                    be NULL if an error occurs.
*
* @return Zero for success, non-zero on failure.
*
* @note The state is freed by {@link DS_FinishStream()},
*       {@link DS_FinishStreamWithMetadata()} or {@link DS_FreeStream()}.
*/
DEEPSPEECH_EXPORT
int DS_CreateStream(ModelState* aCtx,
StreamingState** retval);
/**
* @brief Feed audio samples to an ongoing streaming inference. No status is
*        returned (the function is declared void).
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
* @param aBuffer An array of 16-bit, mono raw audio samples at the
*                appropriate sample rate (matching what the model was trained on).
* @param aBufferSize The number of samples (not bytes) in @p aBuffer.
*/
DEEPSPEECH_EXPORT
void DS_FeedAudioContent(StreamingState* aSctx,
const short* aBuffer,
unsigned int aBufferSize);
/**
* @brief Compute the intermediate decoding of an ongoing streaming inference.
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
*              Taken as pointer-to-const; contrast with
*              {@link DS_FinishStream()}, which is documented to free the state.
*
* @return The STT intermediate result. The user is responsible for freeing the
*         string using {@link DS_FreeString()}.
*/
DEEPSPEECH_EXPORT
char* DS_IntermediateDecode(const StreamingState* aSctx);
/**
* @brief Compute the intermediate decoding of an ongoing streaming inference,
*        return results including metadata.
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
*              Taken as pointer-to-const; contrast with
*              {@link DS_FinishStreamWithMetadata()}, which is documented to
*              free the state.
* @param aNumResults The number of candidate transcripts to return. The count
*                    actually returned is reported in
*                    Metadata::num_transcripts.
*
* @return Metadata struct containing multiple candidate transcripts. Each transcript
*         has per-token metadata including timing information. The user is
*         responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
*         Returns NULL on error.
*/
DEEPSPEECH_EXPORT
Metadata* DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx,
unsigned int aNumResults);
/**
* @brief Compute the final decoding of an ongoing streaming inference and return
*        the result. Signals the end of an ongoing streaming inference.
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
*
* @return The STT result. The user is responsible for freeing the string using
*         {@link DS_FreeString()}.
*
* @note This method will free the state pointer (@p aSctx), so it must not be
*       used again, nor passed to {@link DS_FreeStream()}, after this call.
*/
DEEPSPEECH_EXPORT
char* DS_FinishStream(StreamingState* aSctx);
/**
* @brief Compute the final decoding of an ongoing streaming inference and return
*        results including metadata. Signals the end of an ongoing streaming
*        inference.
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
* @param aNumResults The number of candidate transcripts to return. The count
*                    actually returned is reported in
*                    Metadata::num_transcripts.
*
* @return Metadata struct containing multiple candidate transcripts. Each transcript
*         has per-token metadata including timing information. The user is
*         responsible for freeing Metadata by calling {@link DS_FreeMetadata()}.
*         Returns NULL on error.
*
* @note This method will free the state pointer (@p aSctx), so it must not be
*       used again, nor passed to {@link DS_FreeStream()}, after this call.
*/
DEEPSPEECH_EXPORT
Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx,
unsigned int aNumResults);
/**
* @brief Destroy a streaming state without decoding the computed logits. This
*        can be used if you no longer need the result of an ongoing streaming
*        inference and don't want to perform a costly decode operation.
*
* @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}.
*
* @note This method will free the state pointer (@p aSctx). Use it instead of,
*       not in addition to, {@link DS_FinishStream()} or
*       {@link DS_FinishStreamWithMetadata()}, which also free the state.
*/
DEEPSPEECH_EXPORT
void DS_FreeStream(StreamingState* aSctx);
/**
* @brief Free memory allocated for metadata information.
*
* @param m A Metadata pointer returned by one of the *WithMetadata functions of
*          this API ({@link DS_SpeechToTextWithMetadata()},
*          {@link DS_IntermediateDecodeWithMetadata()},
*          {@link DS_FinishStreamWithMetadata()}).
*/
DEEPSPEECH_EXPORT
void DS_FreeMetadata(Metadata* m);
/**
* @brief Free a char* string returned by the DeepSpeech API.
*
* @param str A string returned by a function of this API whose documentation
*            says it must be freed with DS_FreeString() (for example
*            {@link DS_SpeechToText()}, {@link DS_FinishStream()} or
*            {@link DS_Version()}).
*/
DEEPSPEECH_EXPORT
void DS_FreeString(char* str);
/**
* @brief Returns the version of this library. The returned version is a semantic
*        version (SemVer 2.0.0).
*
* Takes no arguments. The parameter list is spelled (void) so that C compilers
* see a real prototype: before C23, an empty list "()" in C declares a function
* with unspecified parameters and disables argument checking.
*
* @return The version string. The string returned must be freed with
*         {@link DS_FreeString()}.
*/
DEEPSPEECH_EXPORT
char* DS_Version(void);
/**
* @brief Returns a textual description corresponding to an error code.
*        The string returned must be freed with {@link DS_FreeString()}.
*
* @param aErrorCode The error code to describe (presumably a
*                   DeepSpeech_Error_Codes value as returned by the functions
*                   of this API; confirm against the implementation).
*
* @return The error description.
*/
DEEPSPEECH_EXPORT
char* DS_ErrorCodeToErrorMessage(int aErrorCode);
#undef DEEPSPEECH_EXPORT
#ifdef __cplusplus
}
#endif
#endif /* DEEPSPEECH_H */

View File

@ -138,32 +138,6 @@ func render(audioContext: AudioContext?, stream: DeepSpeechStream) {
func test(model: DeepSpeechModel, audioPath: String, completion: @escaping () -> ()) { func test(model: DeepSpeechModel, audioPath: String, completion: @escaping () -> ()) {
let url = URL(fileURLWithPath: audioPath) let url = URL(fileURLWithPath: audioPath)
//var format = AudioStreamBasicDescription.init()
//format.mSampleRate = 16000;
//format.mFormatID = kAudioFormatLinearPCM;
//format.mFormatFlags = kAudioFormatFlagIsSignedInteger | kAudioFormatFlagsNativeEndian | kAudioFormatFlagIsPacked;
//format.mBitsPerChannel = 16;
//format.mChannelsPerFrame = 1;
//format.mBytesPerFrame = format.mChannelsPerFrame * format.mBitsPerChannel / 8;
//format.mFramesPerPacket = 1;
//format.mBytesPerPacket = format.mFramesPerPacket * format.mBytesPerFrame;
//
//var file = Optional<ExtAudioFileRef>.init(nilLiteral: ());
//let status = ExtAudioFileCreateWithURL(url as CFURL,
// kAudioFileWAVEType,
// &format,
// nil,
// 0,
// &file)
//print("status: \(status)")
//let status2 = ExtAudioFileSetProperty(file!,
// kExtAudioFileProperty_ClientDataFormat,
// UInt32(MemoryLayout<AudioStreamBasicDescription>.size),
// &format)
//print("status: \(status2)")
//
//ExtAudioFileRead(file, <#T##ioNumberFrames: UnsafeMutablePointer<UInt32>##UnsafeMutablePointer<UInt32>#>, <#T##ioData: UnsafeMutablePointer<AudioBufferList>##UnsafeMutablePointer<AudioBufferList>#>)
let stream = try! model.createStream() let stream = try! model.createStream()
print("\(audioPath)") print("\(audioPath)")
let start = CFAbsoluteTimeGetCurrent() let start = CFAbsoluteTimeGetCurrent()
@ -177,25 +151,6 @@ func test(model: DeepSpeechModel, audioPath: String, completion: @escaping () ->
print("\"\(audioPath)\": \(end - start) - \(result)") print("\"\(audioPath)\": \(end - start) - \(result)")
completion() completion()
}) })
//let file = try! AVAudioFile(forReading: url)
//print("file length \(file.length)")
//let format = AVAudioFormat(commonFormat: .pcmFormatInt16, sampleRate: 16000, channels: 1, interleaved: false)!
//let stream = createStream(modelState: modelState)
//while file.framePosition < file.length {
// let pcmBuf = AVAudioPCMBuffer.init(pcmFormat: format, frameCapacity: 8 * 1024)! // arbitrary frameCapacity
// try! file.read(into: pcmBuf)
// if pcmBuf.frameLength == 0 {
// break
// }
// print("read \(pcmBuf.frameLength) frames into buffer")
// let rawPtr = pcmBuf.audioBufferList.pointee.mBuffers.mData!
// let ptr = rawPtr.bindMemory(to: Int16.self, capacity: Int(pcmBuf.frameLength))
// print("first few samples: \(ptr[0]) \(ptr[1]) \(ptr[2]) \(ptr[3]) ")
// DS_FeedAudioContent(stream, ptr, UInt32(pcmBuf.frameLength))
//}
//let result = DS_FinishStream(stream)
//return String.init(cString: result!)
} }
@UIApplicationMain @UIApplicationMain

View File

@ -14,8 +14,6 @@ deepspeech:
tensorflow: tensorflow:
packages_xenial: packages_xenial:
apt: 'apt-get -qq update && apt-get -qq -y install realpath build-essential python-virtualenv python-dev python-pip libblas-dev liblapack-dev gfortran wget software-properties-common pixz zip zlib1g-dev unzip' apt: 'apt-get -qq update && apt-get -qq -y install realpath build-essential python-virtualenv python-dev python-pip libblas-dev liblapack-dev gfortran wget software-properties-common pixz zip zlib1g-dev unzip'
packages_macos:
brew: '$TASKCLUSTER_TASK_DIR/DeepSpeech/ds/taskcluster/tf_tc-brew.sh'
packages_win: packages_win:
pacman: 'pacman --noconfirm -S patch unzip tar' pacman: 'pacman --noconfirm -S patch unzip tar'
msys64: 'ln -s $USERPROFILE/msys64 $TASKCLUSTER_TASK_DIR/msys64' msys64: 'ln -s $USERPROFILE/msys64 $TASKCLUSTER_TASK_DIR/msys64'

View File

@ -5,9 +5,6 @@ build:
artifact_namespace: ${system.tensorflow.darwin_amd64.namespace} artifact_namespace: ${system.tensorflow.darwin_amd64.namespace}
generic: generic:
workerType: "ds-macos-heavy" workerType: "ds-macos-heavy"
system_config:
>
${tensorflow.packages_macos.brew}
scripts: scripts:
setup: "taskcluster/tf_tc-setup.sh" setup: "taskcluster/tf_tc-setup.sh"
build: "taskcluster/tf_tc-build.sh --cpu" build: "taskcluster/tf_tc-build.sh --cpu"

View File

@ -5,9 +5,6 @@ build:
artifact_namespace: ${system.tensorflow.ios_arm64.namespace} artifact_namespace: ${system.tensorflow.ios_arm64.namespace}
generic: generic:
workerType: "ds-macos-heavy" workerType: "ds-macos-heavy"
system_config:
>
${tensorflow.packages_macos.brew}
scripts: scripts:
setup: "taskcluster/tf_tc-setup.sh" setup: "taskcluster/tf_tc-setup.sh"
build: "taskcluster/tf_tc-build.sh --ios-arm64" build: "taskcluster/tf_tc-build.sh --ios-arm64"

View File

@ -5,9 +5,6 @@ build:
artifact_namespace: ${system.tensorflow.ios_x86_64.namespace} artifact_namespace: ${system.tensorflow.ios_x86_64.namespace}
generic: generic:
workerType: "ds-macos-heavy" workerType: "ds-macos-heavy"
system_config:
>
${tensorflow.packages_macos.brew}
scripts: scripts:
setup: "taskcluster/tf_tc-setup.sh" setup: "taskcluster/tf_tc-setup.sh"
build: "taskcluster/tf_tc-build.sh --ios-x86_64" build: "taskcluster/tf_tc-build.sh --ios-x86_64"

View File

@ -1,56 +0,0 @@
#!/bin/bash
# NOTE(review): every command after `set -ex` in the visible part of this
# script is commented out, so as shown it only enables errexit/xtrace and
# performs no Homebrew setup.
set -ex
# if [ -z "${TASKCLUSTER_TASK_DIR}" ]; then
# echo "No TASKCLUSTER_TASK_DIR, aborting."
# exit 1
# fi
# LOCAL_BREW="${TASKCLUSTER_TASK_DIR}/homebrew"
# export PATH=${LOCAL_BREW}/bin:$PATH
# export HOMEBREW_LOGS="${TASKCLUSTER_TASK_DIR}/homebrew.logs/"
# export HOMEBREW_CACHE="${TASKCLUSTER_TASK_DIR}/homebrew.cache/"
# export HOMEBREW_FORMULAS_COMMIT=93fe256e0168db3b1c70c26a01941be59ce76311
# export HOMEBREW_NO_AUTO_UPDATE=1
# # Never fail on pre-existing homebrew/ directory
# mkdir -p "${LOCAL_BREW}" || true
# mkdir -p "${HOMEBREW_CACHE}" || true
# # Make sure to verify there is a 'brew' binary there, otherwise install things.
# if [ ! -x "${LOCAL_BREW}/bin/brew" ]; then
# curl -L https://github.com/Homebrew/brew/tarball/2.2.17 | tar xz --strip 1 -C "${LOCAL_BREW}"
# fi;
# echo "local brew list (should be empty) ..."
# brew list
# echo "local brew prefix ..."
# local_prefix=$(brew --prefix)
# echo "${local_prefix}"
# if [ "${LOCAL_BREW}" != "${local_prefix}" ]; then
# echo "Weird state:"
# echo "LOCAL_BREW=${LOCAL_BREW}"
# echo "local_prefix=${local_prefix}"
# exit 1
# fi;
# # Then we force onto a specific well-known commit
# mkdir -p "$(brew --prefix)/Library/Taps/homebrew/homebrew-core"
# pushd "$(brew --prefix)/Library/Taps/homebrew/homebrew-core"
# git init
# git remote add origin https://github.com/Homebrew/homebrew-core.git
# git fetch origin
# git checkout ${HOMEBREW_FORMULAS_COMMIT}
# popd
# # coreutils, pyenv-virtualenv required for build of tensorflow
# all_pkgs="coreutils pyenv-virtualenv"
# for pkg in ${all_pkgs};
# do
# (brew list --versions ${pkg} && brew upgrade ${pkg}) || brew install ${pkg}
# done;