From 267287861867d581f500cfc748c30ea3501a9608 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Mon, 20 Jul 2020 11:52:35 +0200 Subject: [PATCH] Add docs to Swift bindings and missing methods --- .../swift/deepspeech_ios/DeepSpeech.swift | 193 ++++++++++++++++-- 1 file changed, 173 insertions(+), 20 deletions(-) diff --git a/native_client/swift/deepspeech_ios/DeepSpeech.swift b/native_client/swift/deepspeech_ios/DeepSpeech.swift index b694995b..50c32553 100644 --- a/native_client/swift/deepspeech_ios/DeepSpeech.swift +++ b/native_client/swift/deepspeech_ios/DeepSpeech.swift @@ -30,7 +30,8 @@ public enum DeepSpeechError: Error { case failCreateSess(errorCode: Int32) case failCreateModel(errorCode: Int32) - // Additional case for invalid error codes, should never happen unless the user has mixed header and binary versions + // Additional case for invalid error codes, should never happen unless the + // user has mixed header and binary versions. case invalidErrorCode(errorCode: Int32) } @@ -115,9 +116,15 @@ private func evaluateErrorCode(errorCode: Int32) throws { } } +/// Stores text of an individual token, along with its timing information public struct DeepSpeechTokenMetadata { + /// The text corresponding to this token let text: String + + /// Position of the token in units of 20ms let timestep: Int + + /// Position of the token in seconds let startTime: Float internal init(fromInternal: TokenMetadata) { @@ -127,8 +134,17 @@ public struct DeepSpeechTokenMetadata { } } +/** A single transcript computed by the model, including a confidence value and + the metadata for its constituent tokens +*/ public struct DeepSpeechCandidateTranscript { + /// Array of DeepSpeechTokenMetadata objects private(set) var tokens: [DeepSpeechTokenMetadata] = [] + + /** Approximated confidence value for this transcript. This corresponds to + both acoustic model and language model scores that contributed to the + creation of this transcript. 
+ */ let confidence: Double internal init(fromInternal: CandidateTranscript) { @@ -140,12 +156,16 @@ public struct DeepSpeechCandidateTranscript { } } +/// An array of DeepSpeechCandidateTranscript objects computed by the model public struct DeepSpeechMetadata { + /// Array of DeepSpeechCandidateTranscript objects private(set) var transcripts: [DeepSpeechCandidateTranscript] = [] internal init(fromInternal: UnsafeMutablePointer) { let md = fromInternal.pointee - let transcriptsBuffer = UnsafeBufferPointer(start: md.transcripts, count: Int(md.num_transcripts)) + let transcriptsBuffer = UnsafeBufferPointer( + start: md.transcripts, + count: Int(md.num_transcripts)) for tr in transcriptsBuffer { transcripts.append(DeepSpeechCandidateTranscript(fromInternal: tr)) @@ -167,6 +187,13 @@ public class DeepSpeechStream { } } + /** Feed audio samples to an ongoing streaming inference. + + - Parameter buffer: A 16-bit, mono raw audio signal at the appropriate + sample rate (matching what the model was trained on). + + - Precondition: `finishStream()` has not been called on this stream. + */ public func feedAudioContent(buffer: Array) { precondition(streamCtx != nil, "calling method on invalidated Stream") @@ -175,12 +202,25 @@ public class DeepSpeechStream { } } + /** Feed audio samples to an ongoing streaming inference. + + - Parameter buffer: A 16-bit, mono raw audio signal at the appropriate + sample rate (matching what the model was trained on). + + - Precondition: `finishStream()` has not been called on this stream. + */ public func feedAudioContent(buffer: UnsafeBufferPointer) { precondition(streamCtx != nil, "calling method on invalidated Stream") DS_FeedAudioContent(streamCtx, buffer.baseAddress, UInt32(buffer.count)) } + /** Compute the intermediate decoding of an ongoing streaming inference. + + - Precondition: `finishStream()` has not been called on this stream. + + - Returns: The STT intermediate result. 
+ */ public func intermediateDecode() -> String { precondition(streamCtx != nil, "calling method on invalidated Stream") @@ -189,6 +229,16 @@ public class DeepSpeechStream { return String(cString: result!) } + /** Compute the intermediate decoding of an ongoing streaming inference, + return results including metadata. + + - Parameter numResults: The number of candidate transcripts to return. + + - Precondition: `finishStream()` has not been called on this stream. + + - Returns: Metadata struct containing multiple CandidateTranscript structs. + Each transcript has per-token metadata including timing information. + */ public func intermediateDecodeWithMetadata(numResults: Int) -> DeepSpeechMetadata { precondition(streamCtx != nil, "calling method on invalidated Stream") let result = DS_IntermediateDecodeWithMetadata(streamCtx, UInt32(numResults))! @@ -196,6 +246,15 @@ public class DeepSpeechStream { return DeepSpeechMetadata(fromInternal: result) } + /** Compute the final decoding of an ongoing streaming inference and return + the result. Signals the end of an ongoing streaming inference. + + - Precondition: `finishStream()` has not been called on this stream. + + - Returns: The STT result. + + - Postcondition: This method will invalidate this streaming context. + */ public func finishStream() -> String { precondition(streamCtx != nil, "calling method on invalidated Stream") @@ -206,11 +265,38 @@ public class DeepSpeechStream { } return String(cString: result!) } + + /** Compute the final decoding of an ongoing streaming inference and return + results including metadata. Signals the end of an ongoing streaming + inference. + + - Parameter numResults: The number of candidate transcripts to return. + + - Precondition: `finishStream()` has not been called on this stream. + + - Returns: Metadata struct containing multiple CandidateTranscript structs. + Each transcript has per-token metadata including timing information. 
+ + - Postcondition: This method will invalidate this streaming context. + */ + public func finishStreamWithMetadata(numResults: Int) -> DeepSpeechMetadata { + precondition(streamCtx != nil, "calling method on invalidated Stream") + + let result = DS_FinishStreamWithMetadata(streamCtx, UInt32(numResults))! + defer { DS_FreeMetadata(result) } + return DeepSpeechMetadata(fromInternal: result) + } } +/// An object providing an interface to a trained DeepSpeech model. public class DeepSpeechModel { private var modelCtx: OpaquePointer! + /** + - Parameter modelPath: The path to the model file. + + - Throws: `DeepSpeechError` on failure. + */ public init(modelPath: String) throws { let err = DS_CreateModel(modelPath, &modelCtx) try evaluateErrorCode(errorCode: err) @@ -221,77 +307,144 @@ public class DeepSpeechModel { modelCtx = nil } + /** Get beam width value used by the model. If `setBeamWidth(beamWidth:)` + was not called before, will return the default value loaded from the + model file. + + - Returns: Beam width value used by the model. + */ public func getBeamWidth() -> Int { return Int(DS_GetModelBeamWidth(modelCtx)) } + /** Set beam width value used by the model. + + - Parameter beamWidth: The beam width used by the model. A larger beam + width value generates better results at the cost + of decoding time. + + - Throws: `DeepSpeechError` on failure. + */ public func setBeamWidth(beamWidth: Int) throws { let err = DS_SetModelBeamWidth(modelCtx, UInt32(beamWidth)) try evaluateErrorCode(errorCode: err) } + /// The sample rate expected by the model. public var sampleRate: Int { get { return Int(DS_GetModelSampleRate(modelCtx)) } } + /** Enable decoding using an external scorer. + + - Parameter scorerPath: The path to the external scorer file. + + - Throws: `DeepSpeechError` on failure. 
+ */ public func enableExternalScorer(scorerPath: String) throws { let err = DS_EnableExternalScorer(modelCtx, scorerPath) try evaluateErrorCode(errorCode: err) } + /** Disable decoding using an external scorer. + + - Throws: `DeepSpeechError` on failure. + */ public func disableExternalScorer() throws { let err = DS_DisableExternalScorer(modelCtx) try evaluateErrorCode(errorCode: err) } + /** Set hyperparameters alpha and beta of the external scorer. + + - Parameter alpha: The alpha hyperparameter of the decoder. Language model weight. + - Parameter beta: The beta hyperparameter of the decoder. Word insertion weight. + + - Throws: `DeepSpeechError` on failure. + */ public func setScorerAlphaBeta(alpha: Float, beta: Float) throws { let err = DS_SetScorerAlphaBeta(modelCtx, alpha, beta) try evaluateErrorCode(errorCode: err) } + /** Use the DeepSpeech model to convert speech to text. + + - Parameter buffer: A 16-bit, mono raw audio signal at the appropriate + sample rate (matching what the model was trained on). + + - Returns: The STT result. + */ public func speechToText(buffer: Array) -> String { return buffer.withUnsafeBufferPointer { unsafeBufferPointer -> String in return speechToText(buffer: unsafeBufferPointer) } } + /** Use the DeepSpeech model to convert speech to text. + + - Parameter buffer: A 16-bit, mono raw audio signal at the appropriate + sample rate (matching what the model was trained on). + + - Returns: The STT result. + */ public func speechToText(buffer: UnsafeBufferPointer) -> String { let result = DS_SpeechToText(modelCtx, buffer.baseAddress, UInt32(buffer.count)) defer { DS_FreeString(result) } return String(cString: result!) } + /** Use the DeepSpeech model to convert speech to text and output results + including metadata. + + - Parameter buffer: A 16-bit, mono raw audio signal at the appropriate + sample rate (matching what the model was trained on). 
+ - Parameter numResults: The maximum number of DeepSpeechCandidateTranscript + structs to return. Returned value might be smaller than this. + + - Returns: Metadata struct containing multiple CandidateTranscript structs. + Each transcript has per-token metadata including timing information. + */ public func speechToTextWithMetadata(buffer: Array, numResults: Int) -> DeepSpeechMetadata { return buffer.withUnsafeBufferPointer { unsafeBufferPointer -> DeepSpeechMetadata in - let result = DS_SpeechToTextWithMetadata(modelCtx, unsafeBufferPointer.baseAddress, UInt32(buffer.count), UInt32(numResults))! - defer { DS_FreeMetadata(result) } - return DeepSpeechMetadata(fromInternal: result) + return speechToTextWithMetadata(buffer: unsafeBufferPointer, numResults: numResults) } } + /** Use the DeepSpeech model to convert speech to text and output results + including metadata. + + - Parameter buffer: A 16-bit, mono raw audio signal at the appropriate + sample rate (matching what the model was trained on). + - Parameter numResults: The maximum number of DeepSpeechCandidateTranscript + structs to return. Returned value might be smaller than this. + + - Returns: Metadata struct containing multiple CandidateTranscript structs. + Each transcript has per-token metadata including timing information. + */ + public func speechToTextWithMetadata(buffer: UnsafeBufferPointer, numResults: Int) -> DeepSpeechMetadata { + let result = DS_SpeechToTextWithMetadata( + modelCtx, + buffer.baseAddress, + UInt32(buffer.count), + UInt32(numResults))! + defer { DS_FreeMetadata(result) } + return DeepSpeechMetadata(fromInternal: result) + } + + /** Create a new streaming inference state. + + - Returns: DeepSpeechStream object representing the streaming state. + + - Throws: `DeepSpeechError` on failure. + */ public func createStream() throws -> DeepSpeechStream { var streamContext: OpaquePointer! 
let err = DS_CreateStream(modelCtx, &streamContext) try evaluateErrorCode(errorCode: err) return DeepSpeechStream(streamContext: streamContext) } - - public class func open(path: String, scorerPath: Optional = nil) -> OpaquePointer { - var fooOpaque: OpaquePointer! - DS_CreateModel(path, &fooOpaque) - if let scorerPath = scorerPath { - DS_EnableExternalScorer(fooOpaque, scorerPath) - } - return fooOpaque - } - - public class func createStream(modelState: OpaquePointer) -> OpaquePointer { - var fooOpaque: OpaquePointer! - DS_CreateStream(modelState, &fooOpaque) - return fooOpaque - } } public func DeepSpeechVersion() -> String {