Add docs to Swift bindings and missing methods

This commit is contained in:
Reuben Morais 2020-07-20 11:52:35 +02:00
parent d9dac13343
commit 2672878618
1 changed file with 173 additions and 20 deletions

View File

@ -30,7 +30,8 @@ public enum DeepSpeechError: Error {
case failCreateSess(errorCode: Int32) case failCreateSess(errorCode: Int32)
case failCreateModel(errorCode: Int32) case failCreateModel(errorCode: Int32)
// Additional case for invalid error codes, should never happen unless the user has mixed header and binary versions // Additional case for invalid error codes, should never happen unless the
// user has mixed header and binary versions.
case invalidErrorCode(errorCode: Int32) case invalidErrorCode(errorCode: Int32)
} }
@ -115,9 +116,15 @@ private func evaluateErrorCode(errorCode: Int32) throws {
} }
} }
/// Stores text of an individual token, along with its timing information
public struct DeepSpeechTokenMetadata { public struct DeepSpeechTokenMetadata {
/// The text corresponding to this token
let text: String let text: String
/// Position of the token in units of 20ms
let timestep: Int let timestep: Int
/// Position of the token in seconds
let startTime: Float let startTime: Float
internal init(fromInternal: TokenMetadata) { internal init(fromInternal: TokenMetadata) {
@ -127,8 +134,17 @@ public struct DeepSpeechTokenMetadata {
} }
} }
/** A single transcript computed by the model, including a confidence value and
the metadata for its constituent tokens
*/
public struct DeepSpeechCandidateTranscript { public struct DeepSpeechCandidateTranscript {
/// Array of DeepSpeechTokenMetadata objects
private(set) var tokens: [DeepSpeechTokenMetadata] = [] private(set) var tokens: [DeepSpeechTokenMetadata] = []
/** Approximated confidence value for this transcript. This corresponds to
both acoustic model and language model scores that contributed to the
creation of this transcript.
*/
let confidence: Double let confidence: Double
internal init(fromInternal: CandidateTranscript) { internal init(fromInternal: CandidateTranscript) {
@ -140,12 +156,16 @@ public struct DeepSpeechCandidateTranscript {
} }
} }
/// An array of DeepSpeechCandidateTranscript objects computed by the model
public struct DeepSpeechMetadata { public struct DeepSpeechMetadata {
/// Array of DeepSpeechCandidateTranscript objects
private(set) var transcripts: [DeepSpeechCandidateTranscript] = [] private(set) var transcripts: [DeepSpeechCandidateTranscript] = []
internal init(fromInternal: UnsafeMutablePointer<Metadata>) { internal init(fromInternal: UnsafeMutablePointer<Metadata>) {
let md = fromInternal.pointee let md = fromInternal.pointee
let transcriptsBuffer = UnsafeBufferPointer<CandidateTranscript>(start: md.transcripts, count: Int(md.num_transcripts)) let transcriptsBuffer = UnsafeBufferPointer<CandidateTranscript>(
start: md.transcripts,
count: Int(md.num_transcripts))
for tr in transcriptsBuffer { for tr in transcriptsBuffer {
transcripts.append(DeepSpeechCandidateTranscript(fromInternal: tr)) transcripts.append(DeepSpeechCandidateTranscript(fromInternal: tr))
@ -167,6 +187,13 @@ public class DeepSpeechStream {
} }
} }
/** Feed audio samples to an ongoing streaming inference.
- Parameter buffer: A 16-bit, mono raw audio signal at the appropriate
sample rate (matching what the model was trained on).
- Precondition: `finishStream()` has not been called on this stream.
*/
public func feedAudioContent(buffer: Array<Int16>) { public func feedAudioContent(buffer: Array<Int16>) {
precondition(streamCtx != nil, "calling method on invalidated Stream") precondition(streamCtx != nil, "calling method on invalidated Stream")
@ -175,12 +202,25 @@ public class DeepSpeechStream {
} }
} }
/** Feed audio samples to an ongoing streaming inference.
- Parameter buffer: A 16-bit, mono raw audio signal at the appropriate
sample rate (matching what the model was trained on).
- Precondition: `finishStream()` has not been called on this stream.
*/
public func feedAudioContent(buffer: UnsafeBufferPointer<Int16>) { public func feedAudioContent(buffer: UnsafeBufferPointer<Int16>) {
precondition(streamCtx != nil, "calling method on invalidated Stream") precondition(streamCtx != nil, "calling method on invalidated Stream")
DS_FeedAudioContent(streamCtx, buffer.baseAddress, UInt32(buffer.count)) DS_FeedAudioContent(streamCtx, buffer.baseAddress, UInt32(buffer.count))
} }
/** Compute the intermediate decoding of an ongoing streaming inference.
- Precondition: `finishStream()` has not been called on this stream.
- Returns: The STT intermediate result.
*/
public func intermediateDecode() -> String { public func intermediateDecode() -> String {
precondition(streamCtx != nil, "calling method on invalidated Stream") precondition(streamCtx != nil, "calling method on invalidated Stream")
@ -189,6 +229,16 @@ public class DeepSpeechStream {
return String(cString: result!) return String(cString: result!)
} }
/** Compute the intermediate decoding of an ongoing streaming inference,
return results including metadata.
- Parameter numResults: The number of candidate transcripts to return.
- Precondition: `finishStream()` has not been called on this stream.
- Returns: Metadata struct containing multiple CandidateTranscript structs.
Each transcript has per-token metadata including timing information.
*/
public func intermediateDecodeWithMetadata(numResults: Int) -> DeepSpeechMetadata { public func intermediateDecodeWithMetadata(numResults: Int) -> DeepSpeechMetadata {
precondition(streamCtx != nil, "calling method on invalidated Stream") precondition(streamCtx != nil, "calling method on invalidated Stream")
let result = DS_IntermediateDecodeWithMetadata(streamCtx, UInt32(numResults))! let result = DS_IntermediateDecodeWithMetadata(streamCtx, UInt32(numResults))!
@ -196,6 +246,15 @@ public class DeepSpeechStream {
return DeepSpeechMetadata(fromInternal: result) return DeepSpeechMetadata(fromInternal: result)
} }
/** Compute the final decoding of an ongoing streaming inference and return
the result. Signals the end of an ongoing streaming inference.
- Precondition: `finishStream()` has not been called on this stream.
- Returns: The STT result.
- Postcondition: This method will invalidate this streaming context.
*/
public func finishStream() -> String { public func finishStream() -> String {
precondition(streamCtx != nil, "calling method on invalidated Stream") precondition(streamCtx != nil, "calling method on invalidated Stream")
@ -206,11 +265,38 @@ public class DeepSpeechStream {
} }
return String(cString: result!) return String(cString: result!)
} }
/** Signal the end of this streaming inference and compute the final
    decoding, returning rich results with metadata.

    - Parameter numResults: The number of candidate transcripts to return.
    - Precondition: `finishStream()` has not been called on this stream.
    - Returns: Metadata struct containing multiple CandidateTranscript structs.
              Each transcript has per-token metadata including timing information.
    - Postcondition: This method will invalidate this streaming context.
*/
public func finishStreamWithMetadata(numResults: Int) -> DeepSpeechMetadata {
    precondition(streamCtx != nil, "calling method on invalidated Stream")
    // The C API hands back an owned Metadata pointer; copy it into the
    // Swift value type, then release the native allocation.
    let rawMetadata = DS_FinishStreamWithMetadata(streamCtx, UInt32(numResults))!
    defer { DS_FreeMetadata(rawMetadata) }
    return DeepSpeechMetadata(fromInternal: rawMetadata)
}
} }
/// An object providing an interface to a trained DeepSpeech model.
public class DeepSpeechModel { public class DeepSpeechModel {
private var modelCtx: OpaquePointer! private var modelCtx: OpaquePointer!
/**
- Parameter modelPath: The path to the model file.
- Throws: `DeepSpeechError` on failure.
*/
public init(modelPath: String) throws { public init(modelPath: String) throws {
let err = DS_CreateModel(modelPath, &modelCtx) let err = DS_CreateModel(modelPath, &modelCtx)
try evaluateErrorCode(errorCode: err) try evaluateErrorCode(errorCode: err)
@ -221,77 +307,144 @@ public class DeepSpeechModel {
modelCtx = nil modelCtx = nil
} }
/** Get beam width value used by the model. If {@link DS_SetModelBeamWidth}
was not called before, will return the default value loaded from the
model file.
- Returns: Beam width value used by the model.
*/
public func getBeamWidth() -> Int { public func getBeamWidth() -> Int {
return Int(DS_GetModelBeamWidth(modelCtx)) return Int(DS_GetModelBeamWidth(modelCtx))
} }
/** Set beam width value used by the model.
- Parameter beamWidth: The beam width used by the model. A larger beam
width value generates better results at the cost
of decoding time.
- Throws: `DeepSpeechError` on failure.
*/
public func setBeamWidth(beamWidth: Int) throws { public func setBeamWidth(beamWidth: Int) throws {
let err = DS_SetModelBeamWidth(modelCtx, UInt32(beamWidth)) let err = DS_SetModelBeamWidth(modelCtx, UInt32(beamWidth))
try evaluateErrorCode(errorCode: err) try evaluateErrorCode(errorCode: err)
} }
// The sample rate expected by the model.
public var sampleRate: Int { public var sampleRate: Int {
get { get {
return Int(DS_GetModelSampleRate(modelCtx)) return Int(DS_GetModelSampleRate(modelCtx))
} }
} }
/** Enable decoding using an external scorer.
- Parameter scorerPath: The path to the external scorer file.
- Throws: `DeepSpeechError` on failure.
*/
public func enableExternalScorer(scorerPath: String) throws { public func enableExternalScorer(scorerPath: String) throws {
let err = DS_EnableExternalScorer(modelCtx, scorerPath) let err = DS_EnableExternalScorer(modelCtx, scorerPath)
try evaluateErrorCode(errorCode: err) try evaluateErrorCode(errorCode: err)
} }
/** Disable decoding using an external scorer.
- Throws: `DeepSpeechError` on failure.
*/
public func disableExternalScorer() throws { public func disableExternalScorer() throws {
let err = DS_DisableExternalScorer(modelCtx) let err = DS_DisableExternalScorer(modelCtx)
try evaluateErrorCode(errorCode: err) try evaluateErrorCode(errorCode: err)
} }
/** Set hyperparameters alpha and beta of the external scorer.
- Parameter alpha: The alpha hyperparameter of the decoder. Language model weight.
- Parameter beta: The beta hyperparameter of the decoder. Word insertion weight.
- Throws: `DeepSpeechError` on failure.
*/
public func setScorerAlphaBeta(alpha: Float, beta: Float) throws { public func setScorerAlphaBeta(alpha: Float, beta: Float) throws {
let err = DS_SetScorerAlphaBeta(modelCtx, alpha, beta) let err = DS_SetScorerAlphaBeta(modelCtx, alpha, beta)
try evaluateErrorCode(errorCode: err) try evaluateErrorCode(errorCode: err)
} }
/** Use the DeepSpeech model to convert speech to text.
- Parameter buffer: A 16-bit, mono raw audio signal at the appropriate
sample rate (matching what the model was trained on).
- Returns: The STT result.
*/
public func speechToText(buffer: Array<Int16>) -> String { public func speechToText(buffer: Array<Int16>) -> String {
return buffer.withUnsafeBufferPointer { unsafeBufferPointer -> String in return buffer.withUnsafeBufferPointer { unsafeBufferPointer -> String in
return speechToText(buffer: unsafeBufferPointer) return speechToText(buffer: unsafeBufferPointer)
} }
} }
/** Use the DeepSpeech model to convert speech to text.
- Parameter buffer: A 16-bit, mono raw audio signal at the appropriate
sample rate (matching what the model was trained on).
- Returns: The STT result.
*/
public func speechToText(buffer: UnsafeBufferPointer<Int16>) -> String { public func speechToText(buffer: UnsafeBufferPointer<Int16>) -> String {
let result = DS_SpeechToText(modelCtx, buffer.baseAddress, UInt32(buffer.count)) let result = DS_SpeechToText(modelCtx, buffer.baseAddress, UInt32(buffer.count))
defer { DS_FreeString(result) } defer { DS_FreeString(result) }
return String(cString: result!) return String(cString: result!)
} }
/** Use the DeepSpeech model to convert speech to text and output results
including metadata.
- Parameter buffer: A 16-bit, mono raw audio signal at the appropriate
sample rate (matching what the model was trained on).
- Parameter numResults: The maximum number of DeepSpeechCandidateTranscript
structs to return. Returned value might be smaller than this.
- Returns: Metadata struct containing multiple CandidateTranscript structs.
Each transcript has per-token metadata including timing information.
*/
public func speechToTextWithMetadata(buffer: Array<Int16>, numResults: Int) -> DeepSpeechMetadata { public func speechToTextWithMetadata(buffer: Array<Int16>, numResults: Int) -> DeepSpeechMetadata {
return buffer.withUnsafeBufferPointer { unsafeBufferPointer -> DeepSpeechMetadata in return buffer.withUnsafeBufferPointer { unsafeBufferPointer -> DeepSpeechMetadata in
let result = DS_SpeechToTextWithMetadata(modelCtx, unsafeBufferPointer.baseAddress, UInt32(buffer.count), UInt32(numResults))! return speechToTextWithMetadata(buffer: unsafeBufferPointer, numResults: numResults)
defer { DS_FreeMetadata(result) }
return DeepSpeechMetadata(fromInternal: result)
} }
} }
/** Run speech-to-text over a complete audio buffer and return results
    including metadata.

    - Parameter buffer: A 16-bit, mono raw audio signal at the appropriate
                        sample rate (matching what the model was trained on).
    - Parameter numResults: The maximum number of DeepSpeechCandidateTranscript
                            structs to return. Returned value might be smaller than this.
    - Returns: Metadata struct containing multiple CandidateTranscript structs.
              Each transcript has per-token metadata including timing information.
*/
public func speechToTextWithMetadata(buffer: UnsafeBufferPointer<Int16>, numResults: Int) -> DeepSpeechMetadata {
    // The C API hands back an owned Metadata pointer; copy it into the
    // Swift value type, then release the native allocation.
    let rawMetadata = DS_SpeechToTextWithMetadata(modelCtx,
                                                  buffer.baseAddress,
                                                  UInt32(buffer.count),
                                                  UInt32(numResults))!
    defer { DS_FreeMetadata(rawMetadata) }
    return DeepSpeechMetadata(fromInternal: rawMetadata)
}
/** Create a new streaming inference state.
- Returns: DeepSpeechStream object representing the streaming state.
- Throws: `DeepSpeechError` on failure.
*/
public func createStream() throws -> DeepSpeechStream { public func createStream() throws -> DeepSpeechStream {
var streamContext: OpaquePointer! var streamContext: OpaquePointer!
let err = DS_CreateStream(modelCtx, &streamContext) let err = DS_CreateStream(modelCtx, &streamContext)
try evaluateErrorCode(errorCode: err) try evaluateErrorCode(errorCode: err)
return DeepSpeechStream(streamContext: streamContext) return DeepSpeechStream(streamContext: streamContext)
} }
/** Create a raw model context from a model file path, optionally enabling
    an external scorer.

    NOTE(review): the error codes returned by `DS_CreateModel` and
    `DS_EnableExternalScorer` are discarded here; on failure the
    implicitly-unwrapped `fooOpaque` stays nil and the return traps.
    Prefer the throwing `DeepSpeechModel(modelPath:)` initializer and
    `enableExternalScorer(scorerPath:)`, which surface these errors.

    - Parameter path: The path to the model file.
    - Parameter scorerPath: Optional path to an external scorer file.
    - Returns: An opaque pointer to the native model state.
*/
public class func open(path: String, scorerPath: Optional<String> = nil) -> OpaquePointer {
    var fooOpaque: OpaquePointer!
    DS_CreateModel(path, &fooOpaque)
    if let scorerPath = scorerPath {
        DS_EnableExternalScorer(fooOpaque, scorerPath)
    }
    return fooOpaque
}
/** Create a raw streaming inference state for the given model context.

    NOTE(review): the error code returned by `DS_CreateStream` is discarded;
    on failure the implicitly-unwrapped `fooOpaque` stays nil and the return
    traps. Prefer the throwing instance method `createStream()`, which
    surfaces the error.

    - Parameter modelState: An opaque pointer to the native model state.
    - Returns: An opaque pointer to the native streaming state.
*/
public class func createStream(modelState: OpaquePointer) -> OpaquePointer {
    var fooOpaque: OpaquePointer!
    DS_CreateStream(modelState, &fooOpaque)
    return fooOpaque
}
} }
public func DeepSpeechVersion() -> String { public func DeepSpeechVersion() -> String {