From 5e5db17371f167c57ad8255e8324629acf1dea7d Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Mon, 20 Jul 2020 11:16:57 +0200 Subject: [PATCH] Address review comments --- .../swift/deepspeech_ios/deepspeech.h | 357 ------------------ .../deepspeech_ios_test/AppDelegate.swift | 45 --- taskcluster/.shared.yml | 2 - taskcluster/tf_darwin-amd64-opt.yml | 3 - taskcluster/tf_ios-arm64-opt.yml | 3 - taskcluster/tf_ios-x86_64-opt.yml | 3 - taskcluster/tf_tc-brew.sh | 56 --- 7 files changed, 469 deletions(-) delete mode 100644 native_client/swift/deepspeech_ios/deepspeech.h delete mode 100755 taskcluster/tf_tc-brew.sh diff --git a/native_client/swift/deepspeech_ios/deepspeech.h b/native_client/swift/deepspeech_ios/deepspeech.h deleted file mode 100644 index 1df3cf2e..00000000 --- a/native_client/swift/deepspeech_ios/deepspeech.h +++ /dev/null @@ -1,357 +0,0 @@ -#ifndef DEEPSPEECH_H -#define DEEPSPEECH_H - -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef SWIG - #if defined _MSC_VER - #define DEEPSPEECH_EXPORT __declspec(dllexport) - #else - #define DEEPSPEECH_EXPORT __attribute__ ((visibility("default"))) - #endif /*End of _MSC_VER*/ -#else - #define DEEPSPEECH_EXPORT -#endif - -typedef struct ModelState ModelState; - -typedef struct StreamingState StreamingState; - -/** - * @brief Stores text of an individual token, along with its timing information - */ -typedef struct TokenMetadata { - /** The text corresponding to this token */ - const char* const text; - - /** Position of the token in units of 20ms */ - const unsigned int timestep; - - /** Position of the token in seconds */ - const float start_time; -} TokenMetadata; - -/** - * @brief A single transcript computed by the model, including a confidence - * value and the metadata for its constituent tokens. - */ -typedef struct CandidateTranscript { - /** Array of TokenMetadata objects */ - const TokenMetadata* const tokens; - /** Size of the tokens array */ - const unsigned int num_tokens; - /** Approximated confidence value for this transcript. This is roughly the - * sum of the acoustic model logit values for each timestep/character that - * contributed to the creation of this transcript. - */ - const double confidence; -} CandidateTranscript; - -/** - * @brief An array of CandidateTranscript objects computed by the model. - */ -typedef struct Metadata { - /** Array of CandidateTranscript objects */ - const CandidateTranscript* const transcripts; - /** Size of the transcripts array */ - const unsigned int num_transcripts; -} Metadata; - -// sphinx-doc: error_code_listing_start - -#define DS_FOR_EACH_ERROR(APPLY) \ - APPLY(DS_ERR_OK, 0x0000, "No error.") \ - APPLY(DS_ERR_NO_MODEL, 0x1000, "Missing model information.") \ - APPLY(DS_ERR_INVALID_ALPHABET, 0x2000, "Invalid alphabet embedded in model. (Data corruption?)") \ - APPLY(DS_ERR_INVALID_SHAPE, 0x2001, "Invalid model shape.") \ - APPLY(DS_ERR_INVALID_SCORER, 0x2002, "Invalid scorer file.") \ - APPLY(DS_ERR_MODEL_INCOMPATIBLE, 0x2003, "Incompatible model.") \ - APPLY(DS_ERR_SCORER_NOT_ENABLED, 0x2004, "External scorer is not enabled.") \ - APPLY(DS_ERR_SCORER_UNREADABLE, 0x2005, "Could not read scorer file.") \ - APPLY(DS_ERR_SCORER_INVALID_LM, 0x2006, "Could not recognize language model header in scorer.") \ - APPLY(DS_ERR_SCORER_NO_TRIE, 0x2007, "Reached end of scorer file before loading vocabulary trie.") \ - APPLY(DS_ERR_SCORER_INVALID_TRIE, 0x2008, "Invalid magic in trie header.") \ - APPLY(DS_ERR_SCORER_VERSION_MISMATCH, 0x2009, "Scorer file version does not match expected version.") \ - APPLY(DS_ERR_FAIL_INIT_MMAP, 0x3000, "Failed to initialize memory mapped model.") \ - APPLY(DS_ERR_FAIL_INIT_SESS, 0x3001, "Failed to initialize the session.") \ - APPLY(DS_ERR_FAIL_INTERPRETER, 0x3002, "Interpreter failed.") \ - APPLY(DS_ERR_FAIL_RUN_SESS, 0x3003, "Failed to run the session.") \ - APPLY(DS_ERR_FAIL_CREATE_STREAM, 0x3004, "Error creating the stream.") \ - APPLY(DS_ERR_FAIL_READ_PROTOBUF, 0x3005, "Error reading the proto buffer model file.") \ - APPLY(DS_ERR_FAIL_CREATE_SESS, 0x3006, "Failed to create session.") \ - APPLY(DS_ERR_FAIL_CREATE_MODEL, 0x3007, "Could not allocate model state.") - -// sphinx-doc: error_code_listing_end - -enum DeepSpeech_Error_Codes -{ -#define DEFINE(NAME, VALUE, DESC) NAME = VALUE, -DS_FOR_EACH_ERROR(DEFINE) -#undef DEFINE -}; - -/** - * @brief An object providing an interface to a trained DeepSpeech model. - * - * @param aModelPath The path to the frozen model graph. - * @param[out] retval a ModelState pointer - * - * @return Zero on success, non-zero on failure. - */ -DEEPSPEECH_EXPORT -int DS_CreateModel(const char* aModelPath, - ModelState** retval); - -/** - * @brief Get beam width value used by the model. If {@link DS_SetModelBeamWidth} - * was not called before, will return the default value loaded from the - * model file. - * - * @param aCtx A ModelState pointer created with {@link DS_CreateModel}. - * - * @return Beam width value used by the model. - */ -DEEPSPEECH_EXPORT -unsigned int DS_GetModelBeamWidth(const ModelState* aCtx); - -/** - * @brief Set beam width value used by the model. - * - * @param aCtx A ModelState pointer created with {@link DS_CreateModel}. - * @param aBeamWidth The beam width used by the model. A larger beam width value - * generates better results at the cost of decoding time. - * - * @return Zero on success, non-zero on failure. - */ -DEEPSPEECH_EXPORT -int DS_SetModelBeamWidth(ModelState* aCtx, - unsigned int aBeamWidth); - -/** - * @brief Return the sample rate expected by a model. - * - * @param aCtx A ModelState pointer created with {@link DS_CreateModel}. - * - * @return Sample rate expected by the model for its input. - */ -DEEPSPEECH_EXPORT -int DS_GetModelSampleRate(const ModelState* aCtx); - -/** - * @brief Frees associated resources and destroys model object. - */ -DEEPSPEECH_EXPORT -void DS_FreeModel(ModelState* ctx); - -/** - * @brief Enable decoding using an external scorer. - * - * @param aCtx The ModelState pointer for the model being changed. - * @param aScorerPath The path to the external scorer file. - * - * @return Zero on success, non-zero on failure (invalid arguments). - */ -DEEPSPEECH_EXPORT -int DS_EnableExternalScorer(ModelState* aCtx, - const char* aScorerPath); - -/** - * @brief Disable decoding using an external scorer. - * - * @param aCtx The ModelState pointer for the model being changed. - * - * @return Zero on success, non-zero on failure. - */ -DEEPSPEECH_EXPORT -int DS_DisableExternalScorer(ModelState* aCtx); - -/** - * @brief Set hyperparameters alpha and beta of the external scorer. - * - * @param aCtx The ModelState pointer for the model being changed. - * @param aAlpha The alpha hyperparameter of the decoder. Language model weight. - * @param aLMBeta The beta hyperparameter of the decoder. Word insertion weight. - * - * @return Zero on success, non-zero on failure. - */ -DEEPSPEECH_EXPORT -int DS_SetScorerAlphaBeta(ModelState* aCtx, - float aAlpha, - float aBeta); - -/** - * @brief Use the DeepSpeech model to convert speech to text. - * - * @param aCtx The ModelState pointer for the model to use. - * @param aBuffer A 16-bit, mono raw audio signal at the appropriate - * sample rate (matching what the model was trained on). - * @param aBufferSize The number of samples in the audio signal. - * - * @return The STT result. The user is responsible for freeing the string using - * {@link DS_FreeString()}. Returns NULL on error. - */ -DEEPSPEECH_EXPORT -char* DS_SpeechToText(ModelState* aCtx, - const short* aBuffer, - unsigned int aBufferSize); - -/** - * @brief Use the DeepSpeech model to convert speech to text and output results - * including metadata. - * - * @param aCtx The ModelState pointer for the model to use. - * @param aBuffer A 16-bit, mono raw audio signal at the appropriate - * sample rate (matching what the model was trained on). - * @param aBufferSize The number of samples in the audio signal. - * @param aNumResults The maximum number of CandidateTranscript structs to return. Returned value might be smaller than this. - * - * @return Metadata struct containing multiple CandidateTranscript structs. Each - * transcript has per-token metadata including timing information. The - * user is responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. - * Returns NULL on error. - */ -DEEPSPEECH_EXPORT -Metadata* DS_SpeechToTextWithMetadata(ModelState* aCtx, - const short* aBuffer, - unsigned int aBufferSize, - unsigned int aNumResults); - -/** - * @brief Create a new streaming inference state. The streaming state returned - * by this function can then be passed to {@link DS_FeedAudioContent()} - * and {@link DS_FinishStream()}. - * - * @param aCtx The ModelState pointer for the model to use. - * @param[out] retval an opaque pointer that represents the streaming state. Can - * be NULL if an error occurs. - * - * @return Zero for success, non-zero on failure. - */ -DEEPSPEECH_EXPORT -int DS_CreateStream(ModelState* aCtx, - StreamingState** retval); - -/** - * @brief Feed audio samples to an ongoing streaming inference. - * - * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. - * @param aBuffer An array of 16-bit, mono raw audio samples at the - * appropriate sample rate (matching what the model was trained on). - * @param aBufferSize The number of samples in @p aBuffer. - */ -DEEPSPEECH_EXPORT -void DS_FeedAudioContent(StreamingState* aSctx, - const short* aBuffer, - unsigned int aBufferSize); - -/** - * @brief Compute the intermediate decoding of an ongoing streaming inference. - * - * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. - * - * @return The STT intermediate result. The user is responsible for freeing the - * string using {@link DS_FreeString()}. - */ -DEEPSPEECH_EXPORT -char* DS_IntermediateDecode(const StreamingState* aSctx); - -/** - * @brief Compute the intermediate decoding of an ongoing streaming inference, - * return results including metadata. - * - * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. - * @param aNumResults The number of candidate transcripts to return. - * - * @return Metadata struct containing multiple candidate transcripts. Each transcript - * has per-token metadata including timing information. The user is - * responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. - * Returns NULL on error. - */ -DEEPSPEECH_EXPORT -Metadata* DS_IntermediateDecodeWithMetadata(const StreamingState* aSctx, - unsigned int aNumResults); - -/** - * @brief Compute the final decoding of an ongoing streaming inference and return - * the result. Signals the end of an ongoing streaming inference. - * - * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. - * - * @return The STT result. The user is responsible for freeing the string using - * {@link DS_FreeString()}. - * - * @note This method will free the state pointer (@p aSctx). - */ -DEEPSPEECH_EXPORT -char* DS_FinishStream(StreamingState* aSctx); - -/** - * @brief Compute the final decoding of an ongoing streaming inference and return - * results including metadata. Signals the end of an ongoing streaming - * inference. - * - * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. - * @param aNumResults The number of candidate transcripts to return. - * - * @return Metadata struct containing multiple candidate transcripts. Each transcript - * has per-token metadata including timing information. The user is - * responsible for freeing Metadata by calling {@link DS_FreeMetadata()}. - * Returns NULL on error. - * - * @note This method will free the state pointer (@p aSctx). - */ -DEEPSPEECH_EXPORT -Metadata* DS_FinishStreamWithMetadata(StreamingState* aSctx, - unsigned int aNumResults); - -/** - * @brief Destroy a streaming state without decoding the computed logits. This - * can be used if you no longer need the result of an ongoing streaming - * inference and don't want to perform a costly decode operation. - * - * @param aSctx A streaming state pointer returned by {@link DS_CreateStream()}. - * - * @note This method will free the state pointer (@p aSctx). - */ -DEEPSPEECH_EXPORT -void DS_FreeStream(StreamingState* aSctx); - -/** - * @brief Free memory allocated for metadata information. - */ -DEEPSPEECH_EXPORT -void DS_FreeMetadata(Metadata* m); - -/** - * @brief Free a char* string returned by the DeepSpeech API. - */ -DEEPSPEECH_EXPORT -void DS_FreeString(char* str); - -/** - * @brief Returns the version of this library. The returned version is a semantic - * version (SemVer 2.0.0). The string returned must be freed with {@link DS_FreeString()}. - * - * @return The version string. - */ -DEEPSPEECH_EXPORT -char* DS_Version(); - -/** - * @brief Returns a textual description corresponding to an error code. - * The string returned must be freed with @{link DS_FreeString()}. - * - * @return The error description. - */ -DEEPSPEECH_EXPORT -char* DS_ErrorCodeToErrorMessage(int aErrorCode); - -#undef DEEPSPEECH_EXPORT - -#ifdef __cplusplus -} -#endif - -#endif /* DEEPSPEECH_H */ diff --git a/native_client/swift/deepspeech_ios_test/deepspeech_ios_test/AppDelegate.swift b/native_client/swift/deepspeech_ios_test/deepspeech_ios_test/AppDelegate.swift index b589df39..a2dcb427 100644 --- a/native_client/swift/deepspeech_ios_test/deepspeech_ios_test/AppDelegate.swift +++ b/native_client/swift/deepspeech_ios_test/deepspeech_ios_test/AppDelegate.swift @@ -138,32 +138,6 @@ func render(audioContext: AudioContext?, stream: DeepSpeechStream) { func test(model: DeepSpeechModel, audioPath: String, completion: @escaping () -> ()) { let url = URL(fileURLWithPath: audioPath) - //var format = AudioStreamBasicDescription.init() - //format.mSampleRate = 16000; - //format.mFormatID = kAudioFormatLinearPCM; - //format.mFormatFlags = kAudioFormatFlagIsSignedInteger | kAudioFormatFlagsNativeEndian | kAudioFormatFlagIsPacked; - //format.mBitsPerChannel = 16; - //format.mChannelsPerFrame = 1; - //format.mBytesPerFrame = format.mChannelsPerFrame * format.mBitsPerChannel / 8; - //format.mFramesPerPacket = 1; - //format.mBytesPerPacket = format.mFramesPerPacket * format.mBytesPerFrame; - // - //var file = Optional.init(nilLiteral: ()); - //let status = ExtAudioFileCreateWithURL(url as CFURL, - // kAudioFileWAVEType, - // &format, - // nil, - // 0, - // &file) - //print("status: \(status)") - //let status2 = ExtAudioFileSetProperty(file!, - // kExtAudioFileProperty_ClientDataFormat, - // UInt32(MemoryLayout.size), - // &format) - //print("status: \(status2)") - // - //ExtAudioFileRead(file, <#T##ioNumberFrames: UnsafeMutablePointer##UnsafeMutablePointer#>, <#T##ioData: UnsafeMutablePointer##UnsafeMutablePointer#>) - let stream = try! model.createStream() print("\(audioPath)") let start = CFAbsoluteTimeGetCurrent() @@ -177,25 +151,6 @@ func test(model: DeepSpeechModel, audioPath: String, completion: @escaping () -> print("\"\(audioPath)\": \(end - start) - \(result)") completion() }) - - //let file = try! AVAudioFile(forReading: url) - //print("file length \(file.length)") - //let format = AVAudioFormat(commonFormat: .pcmFormatInt16, sampleRate: 16000, channels: 1, interleaved: false)! - //let stream = createStream(modelState: modelState) - //while file.framePosition < file.length { - // let pcmBuf = AVAudioPCMBuffer.init(pcmFormat: format, frameCapacity: 8 * 1024)! // arbitrary frameCapacity - // try! file.read(into: pcmBuf) - // if pcmBuf.frameLength == 0 { - // break - // } - // print("read \(pcmBuf.frameLength) frames into buffer") - // let rawPtr = pcmBuf.audioBufferList.pointee.mBuffers.mData! - // let ptr = rawPtr.bindMemory(to: Int16.self, capacity: Int(pcmBuf.frameLength)) - // print("first few samples: \(ptr[0]) \(ptr[1]) \(ptr[2]) \(ptr[3]) ") - // DS_FeedAudioContent(stream, ptr, UInt32(pcmBuf.frameLength)) - //} - //let result = DS_FinishStream(stream) - //return String.init(cString: result!) } @UIApplicationMain diff --git a/taskcluster/.shared.yml b/taskcluster/.shared.yml index d80ef2f7..03bdd3fd 100644 --- a/taskcluster/.shared.yml +++ b/taskcluster/.shared.yml @@ -14,8 +14,6 @@ deepspeech: tensorflow: packages_xenial: apt: 'apt-get -qq update && apt-get -qq -y install realpath build-essential python-virtualenv python-dev python-pip libblas-dev liblapack-dev gfortran wget software-properties-common pixz zip zlib1g-dev unzip' - packages_macos: - brew: '$TASKCLUSTER_TASK_DIR/DeepSpeech/ds/taskcluster/tf_tc-brew.sh' packages_win: pacman: 'pacman --noconfirm -S patch unzip tar' msys64: 'ln -s $USERPROFILE/msys64 $TASKCLUSTER_TASK_DIR/msys64' diff --git a/taskcluster/tf_darwin-amd64-opt.yml b/taskcluster/tf_darwin-amd64-opt.yml index 365e1700..64674b1f 100644 --- a/taskcluster/tf_darwin-amd64-opt.yml +++ b/taskcluster/tf_darwin-amd64-opt.yml @@ -5,9 +5,6 @@ build: artifact_namespace: ${system.tensorflow.darwin_amd64.namespace} generic: workerType: "ds-macos-heavy" - system_config: - > - ${tensorflow.packages_macos.brew} scripts: setup: "taskcluster/tf_tc-setup.sh" build: "taskcluster/tf_tc-build.sh --cpu" diff --git a/taskcluster/tf_ios-arm64-opt.yml b/taskcluster/tf_ios-arm64-opt.yml index 9f253b3f..edb3eb2b 100644 --- a/taskcluster/tf_ios-arm64-opt.yml +++ b/taskcluster/tf_ios-arm64-opt.yml @@ -5,9 +5,6 @@ build: artifact_namespace: ${system.tensorflow.ios_arm64.namespace} generic: workerType: "ds-macos-heavy" - system_config: - > - ${tensorflow.packages_macos.brew} scripts: setup: "taskcluster/tf_tc-setup.sh" build: "taskcluster/tf_tc-build.sh --ios-arm64" diff --git a/taskcluster/tf_ios-x86_64-opt.yml b/taskcluster/tf_ios-x86_64-opt.yml index c56ad3ca..8f82cb95 100644 --- a/taskcluster/tf_ios-x86_64-opt.yml +++ b/taskcluster/tf_ios-x86_64-opt.yml @@ -5,9 +5,6 @@ build: artifact_namespace: ${system.tensorflow.ios_x86_64.namespace} generic: workerType: "ds-macos-heavy" - system_config: - > - ${tensorflow.packages_macos.brew} scripts: setup: "taskcluster/tf_tc-setup.sh" build: "taskcluster/tf_tc-build.sh --ios-x86_64" diff --git a/taskcluster/tf_tc-brew.sh b/taskcluster/tf_tc-brew.sh deleted file mode 100755 index 8d4128a2..00000000 --- a/taskcluster/tf_tc-brew.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash - -set -ex - -# if [ -z "${TASKCLUSTER_TASK_DIR}" ]; then -# echo "No TASKCLUSTER_TASK_DIR, aborting." -# exit 1 -# fi - -# LOCAL_BREW="${TASKCLUSTER_TASK_DIR}/homebrew" -# export PATH=${LOCAL_BREW}/bin:$PATH -# export HOMEBREW_LOGS="${TASKCLUSTER_TASK_DIR}/homebrew.logs/" -# export HOMEBREW_CACHE="${TASKCLUSTER_TASK_DIR}/homebrew.cache/" -# export HOMEBREW_FORMULAS_COMMIT=93fe256e0168db3b1c70c26a01941be59ce76311 -# export HOMEBREW_NO_AUTO_UPDATE=1 - -# # Never fail on pre-existing homebrew/ directory -# mkdir -p "${LOCAL_BREW}" || true -# mkdir -p "${HOMEBREW_CACHE}" || true - -# # Make sure to verify there is a 'brew' binary there, otherwise install things. -# if [ ! -x "${LOCAL_BREW}/bin/brew" ]; then -# curl -L https://github.com/Homebrew/brew/tarball/2.2.17 | tar xz --strip 1 -C "${LOCAL_BREW}" -# fi; - -# echo "local brew list (should be empty) ..." -# brew list - -# echo "local brew prefix ..." -# local_prefix=$(brew --prefix) -# echo "${local_prefix}" - -# if [ "${LOCAL_BREW}" != "${local_prefix}" ]; then -# echo "Weird state:" -# echo "LOCAL_BREW=${LOCAL_BREW}" -# echo "local_prefix=${local_prefix}" -# exit 1 -# fi; - - -# # Then we force onto a specific well-known commit -# mkdir -p "$(brew --prefix)/Library/Taps/homebrew/homebrew-core" -# pushd "$(brew --prefix)/Library/Taps/homebrew/homebrew-core" -# git init -# git remote add origin https://github.com/Homebrew/homebrew-core.git -# git fetch origin -# git checkout ${HOMEBREW_FORMULAS_COMMIT} -# popd - -# # coreutils, pyenv-virtualenv required for build of tensorflow -# all_pkgs="coreutils pyenv-virtualenv" - -# for pkg in ${all_pkgs}; -# do -# (brew list --versions ${pkg} && brew upgrade ${pkg}) || brew install ${pkg} -# done;