Add docs to Swift bindings and missing methods

This commit is contained in:
Reuben Morais 2020-07-20 11:52:35 +02:00
parent d9dac13343
commit 2672878618
1 changed file with 173 additions and 20 deletions

View File

@ -30,7 +30,8 @@ public enum DeepSpeechError: Error {
case failCreateSess(errorCode: Int32) case failCreateSess(errorCode: Int32)
case failCreateModel(errorCode: Int32) case failCreateModel(errorCode: Int32)
// Additional case for invalid error codes, should never happen unless the user has mixed header and binary versions // Additional case for invalid error codes, should never happen unless the
// user has mixed header and binary versions.
case invalidErrorCode(errorCode: Int32) case invalidErrorCode(errorCode: Int32)
} }
@ -115,9 +116,15 @@ private func evaluateErrorCode(errorCode: Int32) throws {
} }
} }
/// Stores text of an individual token, along with its timing information
public struct DeepSpeechTokenMetadata { public struct DeepSpeechTokenMetadata {
/// The text corresponding to this token
let text: String let text: String
/// Position of the token in units of 20ms
let timestep: Int let timestep: Int
/// Position of the token in seconds
let startTime: Float let startTime: Float
internal init(fromInternal: TokenMetadata) { internal init(fromInternal: TokenMetadata) {
@ -127,8 +134,17 @@ public struct DeepSpeechTokenMetadata {
} }
} }
/** A single transcript computed by the model, including a confidence value and
the metadata for its constituent tokens
*/
public struct DeepSpeechCandidateTranscript { public struct DeepSpeechCandidateTranscript {
/// Array of DeepSpeechTokenMetadata objects
private(set) var tokens: [DeepSpeechTokenMetadata] = [] private(set) var tokens: [DeepSpeechTokenMetadata] = []
/** Approximated confidence value for this transcript. This corresponds to
both acoustic model and language model scores that contributed to the
creation of this transcript.
*/
let confidence: Double let confidence: Double
internal init(fromInternal: CandidateTranscript) { internal init(fromInternal: CandidateTranscript) {
@ -140,12 +156,16 @@ public struct DeepSpeechCandidateTranscript {
} }
} }
/// An array of DeepSpeechCandidateTranscript objects computed by the model
public struct DeepSpeechMetadata { public struct DeepSpeechMetadata {
/// Array of DeepSpeechCandidateTranscript objects
private(set) var transcripts: [DeepSpeechCandidateTranscript] = [] private(set) var transcripts: [DeepSpeechCandidateTranscript] = []
internal init(fromInternal: UnsafeMutablePointer<Metadata>) { internal init(fromInternal: UnsafeMutablePointer<Metadata>) {
let md = fromInternal.pointee let md = fromInternal.pointee
let transcriptsBuffer = UnsafeBufferPointer<CandidateTranscript>(start: md.transcripts, count: Int(md.num_transcripts)) let transcriptsBuffer = UnsafeBufferPointer<CandidateTranscript>(
start: md.transcripts,
count: Int(md.num_transcripts))
for tr in transcriptsBuffer { for tr in transcriptsBuffer {
transcripts.append(DeepSpeechCandidateTranscript(fromInternal: tr)) transcripts.append(DeepSpeechCandidateTranscript(fromInternal: tr))
@ -167,6 +187,13 @@ public class DeepSpeechStream {
} }
} }
/** Feed audio samples to an ongoing streaming inference.
- Parameter buffer: A 16-bit, mono raw audio signal at the appropriate
sample rate (matching what the model was trained on).
- Precondition: `finishStream()` has not been called on this stream.
*/
public func feedAudioContent(buffer: Array<Int16>) { public func feedAudioContent(buffer: Array<Int16>) {
precondition(streamCtx != nil, "calling method on invalidated Stream") precondition(streamCtx != nil, "calling method on invalidated Stream")
@ -175,12 +202,25 @@ public class DeepSpeechStream {
} }
} }
/** Feed audio samples to an ongoing streaming inference.
- Parameter buffer: A 16-bit, mono raw audio signal at the appropriate
sample rate (matching what the model was trained on).
- Precondition: `finishStream()` has not been called on this stream.
*/
public func feedAudioContent(buffer: UnsafeBufferPointer<Int16>) { public func feedAudioContent(buffer: UnsafeBufferPointer<Int16>) {
precondition(streamCtx != nil, "calling method on invalidated Stream") precondition(streamCtx != nil, "calling method on invalidated Stream")
DS_FeedAudioContent(streamCtx, buffer.baseAddress, UInt32(buffer.count)) DS_FeedAudioContent(streamCtx, buffer.baseAddress, UInt32(buffer.count))
} }
/** Compute the intermediate decoding of an ongoing streaming inference.
- Precondition: `finishStream()` has not been called on this stream.
- Returns: The STT intermediate result.
*/
public func intermediateDecode() -> String { public func intermediateDecode() -> String {
precondition(streamCtx != nil, "calling method on invalidated Stream") precondition(streamCtx != nil, "calling method on invalidated Stream")
@ -189,6 +229,16 @@ public class DeepSpeechStream {
return String(cString: result!) return String(cString: result!)
} }
/** Compute the intermediate decoding of an ongoing streaming inference,
return results including metadata.
- Parameter numResults: The number of candidate transcripts to return.
- Precondition: `finishStream()` has not been called on this stream.
- Returns: Metadata struct containing multiple CandidateTranscript structs.
Each transcript has per-token metadata including timing information.
*/
public func intermediateDecodeWithMetadata(numResults: Int) -> DeepSpeechMetadata { public func intermediateDecodeWithMetadata(numResults: Int) -> DeepSpeechMetadata {
precondition(streamCtx != nil, "calling method on invalidated Stream") precondition(streamCtx != nil, "calling method on invalidated Stream")
let result = DS_IntermediateDecodeWithMetadata(streamCtx, UInt32(numResults))! let result = DS_IntermediateDecodeWithMetadata(streamCtx, UInt32(numResults))!
@ -196,6 +246,15 @@ public class DeepSpeechStream {
return DeepSpeechMetadata(fromInternal: result) return DeepSpeechMetadata(fromInternal: result)
} }
/** Compute the final decoding of an ongoing streaming inference and return
the result. Signals the end of an ongoing streaming inference.
- Precondition: `finishStream()` has not been called on this stream.
- Returns: The STT result.
- Postcondition: This method will invalidate this streaming context.
*/
public func finishStream() -> String { public func finishStream() -> String {
precondition(streamCtx != nil, "calling method on invalidated Stream") precondition(streamCtx != nil, "calling method on invalidated Stream")
@ -206,11 +265,38 @@ public class DeepSpeechStream {
} }
return String(cString: result!) return String(cString: result!)
} }
/** Signal the end of this streaming inference and compute the final
    decoding, returning rich results with metadata.

    - Parameter numResults: The number of candidate transcripts to return.
    - Precondition: `finishStream()` has not been called on this stream.
    - Returns: Metadata struct containing multiple CandidateTranscript structs.
              Each transcript has per-token metadata including timing information.
    - Postcondition: This method will invalidate this streaming context.
*/
public func finishStreamWithMetadata(numResults: Int) -> DeepSpeechMetadata {
    precondition(streamCtx != nil, "calling method on invalidated Stream")
    // The C API hands back an owned Metadata pointer; copy it into the
    // Swift value type, then release the native allocation.
    let rawMetadata = DS_FinishStreamWithMetadata(streamCtx, UInt32(numResults))!
    defer { DS_FreeMetadata(rawMetadata) }
    return DeepSpeechMetadata(fromInternal: rawMetadata)
}
} }
/// An object providing an interface to a trained DeepSpeech model.
public class DeepSpeechModel { public class DeepSpeechModel {
private var modelCtx: OpaquePointer! private var modelCtx: OpaquePointer!
/**
- Parameter modelPath: The path to the model file.
- Throws: `DeepSpeechError` on failure.
*/
public init(modelPath: String) throws { public init(modelPath: String) throws {
let err = DS_CreateModel(modelPath, &modelCtx) let err = DS_CreateModel(modelPath, &modelCtx)
try evaluateErrorCode(errorCode: err) try evaluateErrorCode(errorCode: err)
@ -221,77 +307,144 @@ public class DeepSpeechModel {
modelCtx = nil modelCtx = nil
} }
/** Get beam width value used by the model. If {@link DS_SetModelBeamWidth}
was not called before, will return the default value loaded from the
model file.
- Returns: Beam width value used by the model.
*/
public func getBeamWidth() -> Int { public func getBeamWidth() -> Int {
return Int(DS_GetModelBeamWidth(modelCtx)) return Int(DS_GetModelBeamWidth(modelCtx))
} }
/** Set beam width value used by the model.
- Parameter beamWidth: The beam width used by the model. A larger beam
width value generates better results at the cost
of decoding time.
- Throws: `DeepSpeechError` on failure.
*/
public func setBeamWidth(beamWidth: Int) throws { public func setBeamWidth(beamWidth: Int) throws {
let err = DS_SetModelBeamWidth(modelCtx, UInt32(beamWidth)) let err = DS_SetModelBeamWidth(modelCtx, UInt32(beamWidth))
try evaluateErrorCode(errorCode: err) try evaluateErrorCode(errorCode: err)
} }
// The sample rate expected by the model.
public var sampleRate: Int { public var sampleRate: Int {
get { get {
return Int(DS_GetModelSampleRate(modelCtx)) return Int(DS_GetModelSampleRate(modelCtx))
} }
} }
/** Enable decoding using an external scorer.
- Parameter scorerPath: The path to the external scorer file.
- Throws: `DeepSpeechError` on failure.
*/
public func enableExternalScorer(scorerPath: String) throws { public func enableExternalScorer(scorerPath: String) throws {
let err = DS_EnableExternalScorer(modelCtx, scorerPath) let err = DS_EnableExternalScorer(modelCtx, scorerPath)
try evaluateErrorCode(errorCode: err) try evaluateErrorCode(errorCode: err)
} }
/** Disable decoding using an external scorer.
- Throws: `DeepSpeechError` on failure.
*/
public func disableExternalScorer() throws { public func disableExternalScorer() throws {
let err = DS_DisableExternalScorer(modelCtx) let err = DS_DisableExternalScorer(modelCtx)
try evaluateErrorCode(errorCode: err) try evaluateErrorCode(errorCode: err)
} }
/** Set hyperparameters alpha and beta of the external scorer.
- Parameter alpha: The alpha hyperparameter of the decoder. Language model weight.
- Parameter beta: The beta hyperparameter of the decoder. Word insertion weight.
- Throws: `DeepSpeechError` on failure.
*/
public func setScorerAlphaBeta(alpha: Float, beta: Float) throws { public func setScorerAlphaBeta(alpha: Float, beta: Float) throws {
let err = DS_SetScorerAlphaBeta(modelCtx, alpha, beta) let err = DS_SetScorerAlphaBeta(modelCtx, alpha, beta)
try evaluateErrorCode(errorCode: err) try evaluateErrorCode(errorCode: err)
} }
/** Use the DeepSpeech model to convert speech to text.
- Parameter buffer: A 16-bit, mono raw audio signal at the appropriate
sample rate (matching what the model was trained on).
- Returns: The STT result.
*/
public func speechToText(buffer: Array<Int16>) -> String { public func speechToText(buffer: Array<Int16>) -> String {
return buffer.withUnsafeBufferPointer { unsafeBufferPointer -> String in return buffer.withUnsafeBufferPointer { unsafeBufferPointer -> String in
return speechToText(buffer: unsafeBufferPointer) return speechToText(buffer: unsafeBufferPointer)
} }
} }
/** Use the DeepSpeech model to convert speech to text.
- Parameter buffer: A 16-bit, mono raw audio signal at the appropriate
sample rate (matching what the model was trained on).
- Returns: The STT result.
*/
public func speechToText(buffer: UnsafeBufferPointer<Int16>) -> String { public func speechToText(buffer: UnsafeBufferPointer<Int16>) -> String {
let result = DS_SpeechToText(modelCtx, buffer.baseAddress, UInt32(buffer.count)) let result = DS_SpeechToText(modelCtx, buffer.baseAddress, UInt32(buffer.count))
defer { DS_FreeString(result) } defer { DS_FreeString(result) }
return String(cString: result!) return String(cString: result!)
} }
/** Use the DeepSpeech model to convert speech to text and output results
including metadata.
- Parameter buffer: A 16-bit, mono raw audio signal at the appropriate
sample rate (matching what the model was trained on).
- Parameter numResults: The maximum number of DeepSpeechCandidateTranscript
structs to return. Returned value might be smaller than this.
- Returns: Metadata struct containing multiple CandidateTranscript structs.
Each transcript has per-token metadata including timing information.
*/
public func speechToTextWithMetadata(buffer: Array<Int16>, numResults: Int) -> DeepSpeechMetadata { public func speechToTextWithMetadata(buffer: Array<Int16>, numResults: Int) -> DeepSpeechMetadata {
return buffer.withUnsafeBufferPointer { unsafeBufferPointer -> DeepSpeechMetadata in return buffer.withUnsafeBufferPointer { unsafeBufferPointer -> DeepSpeechMetadata in
let result = DS_SpeechToTextWithMetadata(modelCtx, unsafeBufferPointer.baseAddress, UInt32(buffer.count), UInt32(numResults))! return speechToTextWithMetadata(buffer: unsafeBufferPointer, numResults: numResults)
defer { DS_FreeMetadata(result) }
return DeepSpeechMetadata(fromInternal: result)
} }
} }
/** Run speech-to-text over a complete audio buffer and return results
    including metadata.

    - Parameter buffer: A 16-bit, mono raw audio signal at the appropriate
                        sample rate (matching what the model was trained on).
    - Parameter numResults: The maximum number of DeepSpeechCandidateTranscript
                            structs to return. Returned value might be smaller than this.
    - Returns: Metadata struct containing multiple CandidateTranscript structs.
              Each transcript has per-token metadata including timing information.
*/
public func speechToTextWithMetadata(buffer: UnsafeBufferPointer<Int16>, numResults: Int) -> DeepSpeechMetadata {
    // The C API hands back an owned Metadata pointer; copy it into the
    // Swift value type, then release the native allocation.
    let rawMetadata = DS_SpeechToTextWithMetadata(modelCtx,
                                                  buffer.baseAddress,
                                                  UInt32(buffer.count),
                                                  UInt32(numResults))!
    defer { DS_FreeMetadata(rawMetadata) }
    return DeepSpeechMetadata(fromInternal: rawMetadata)
}
/** Create a new streaming inference state.
- Returns: DeepSpeechStream object representing the streaming state.
- Throws: `DeepSpeechError` on failure.
*/
public func createStream() throws -> DeepSpeechStream { public func createStream() throws -> DeepSpeechStream {
var streamContext: OpaquePointer! var streamContext: OpaquePointer!
let err = DS_CreateStream(modelCtx, &streamContext) let err = DS_CreateStream(modelCtx, &streamContext)
try evaluateErrorCode(errorCode: err) try evaluateErrorCode(errorCode: err)
return DeepSpeechStream(streamContext: streamContext) return DeepSpeechStream(streamContext: streamContext)
} }
/** Create a raw model context from a model file path, optionally enabling
    an external scorer.

    NOTE(review): the error codes returned by `DS_CreateModel` and
    `DS_EnableExternalScorer` are discarded here; on failure the
    implicitly-unwrapped `fooOpaque` stays nil and the return traps.
    Prefer the throwing `DeepSpeechModel(modelPath:)` initializer and
    `enableExternalScorer(scorerPath:)`, which surface these errors.

    - Parameter path: The path to the model file.
    - Parameter scorerPath: Optional path to an external scorer file.
    - Returns: An opaque pointer to the native model state.
*/
public class func open(path: String, scorerPath: Optional<String> = nil) -> OpaquePointer {
    var fooOpaque: OpaquePointer!
    DS_CreateModel(path, &fooOpaque)
    if let scorerPath = scorerPath {
        DS_EnableExternalScorer(fooOpaque, scorerPath)
    }
    return fooOpaque
}
/** Create a raw streaming inference state for the given model context.

    NOTE(review): the error code returned by `DS_CreateStream` is discarded;
    on failure the implicitly-unwrapped `fooOpaque` stays nil and the return
    traps. Prefer the throwing instance method `createStream()`, which
    surfaces the error.

    - Parameter modelState: An opaque pointer to the native model state.
    - Returns: An opaque pointer to the native streaming state.
*/
public class func createStream(modelState: OpaquePointer) -> OpaquePointer {
    var fooOpaque: OpaquePointer!
    DS_CreateStream(modelState, &fooOpaque)
    return fooOpaque
}
} }
public func DeepSpeechVersion() -> String { public func DeepSpeechVersion() -> String {