From 35d2908db9b273cf8c727994ea337a55fb84c214 Mon Sep 17 00:00:00 2001
From: Erik Ziegler
Date: Mon, 27 Jul 2020 16:56:32 +0200
Subject: [PATCH] Add support for microphone streaming in swift native client test project

---
 .../deepspeech_ios_test/AppDelegate.swift     | 180 -----------
 .../deepspeech_ios_test/AudioContext.swift    |  68 ++++
 .../deepspeech_ios_test/ContentView.swift     |  32 +-
 .../deepspeech_ios_test/DeepSpeech.swift      | 299 ++++++++++++++++++
 .../swift/deepspeech_ios_test/Info.plist      |   2 +
 5 files changed, 400 insertions(+), 181 deletions(-)
 create mode 100644 native_client/swift/deepspeech_ios_test/AudioContext.swift
 create mode 100644 native_client/swift/deepspeech_ios_test/DeepSpeech.swift

diff --git a/native_client/swift/deepspeech_ios_test/AppDelegate.swift b/native_client/swift/deepspeech_ios_test/AppDelegate.swift
index a2dcb427..32753486 100644
--- a/native_client/swift/deepspeech_ios_test/AppDelegate.swift
+++ b/native_client/swift/deepspeech_ios_test/AppDelegate.swift
@@ -7,190 +7,10 @@
 //
 
 import UIKit
-import Foundation
-import AVFoundation
-import AudioToolbox
-import Accelerate
-
-import deepspeech_ios
-
-/// Holds audio information used for building waveforms
-final class AudioContext {
-
-    /// The audio asset URL used to load the context
-    public let audioURL: URL
-
-    /// Total number of samples in loaded asset
-    public let totalSamples: Int
-
-    /// Loaded asset
-    public let asset: AVAsset
-
-    // Loaded assetTrack
-    public let assetTrack: AVAssetTrack
-
-    private init(audioURL: URL, totalSamples: Int, asset: AVAsset, assetTrack: AVAssetTrack) {
-        self.audioURL = audioURL
-        self.totalSamples = totalSamples
-        self.asset = asset
-        self.assetTrack = assetTrack
-    }
-
-    public static func load(fromAudioURL audioURL: URL, completionHandler: @escaping (_ audioContext: AudioContext?) -> ()) {
-        let asset = AVURLAsset(url: audioURL, options: [AVURLAssetPreferPreciseDurationAndTimingKey: NSNumber(value: true as Bool)])
-
-        guard let assetTrack = asset.tracks(withMediaType: AVMediaType.audio).first else {
-            fatalError("Couldn't load AVAssetTrack")
-        }
-
-        asset.loadValuesAsynchronously(forKeys: ["duration"]) {
-            var error: NSError?
-            let status = asset.statusOfValue(forKey: "duration", error: &error)
-            switch status {
-            case .loaded:
-                guard
-                    let formatDescriptions = assetTrack.formatDescriptions as? [CMAudioFormatDescription],
-                    let audioFormatDesc = formatDescriptions.first,
-                    let asbd = CMAudioFormatDescriptionGetStreamBasicDescription(audioFormatDesc)
-                    else { break }
-
-                let totalSamples = Int((asbd.pointee.mSampleRate) * Float64(asset.duration.value) / Float64(asset.duration.timescale))
-                let audioContext = AudioContext(audioURL: audioURL, totalSamples: totalSamples, asset: asset, assetTrack: assetTrack)
-                completionHandler(audioContext)
-                return
-
-            case .failed, .cancelled, .loading, .unknown:
-                print("Couldn't load asset: \(error?.localizedDescription ?? "Unknown error")")
-            }
-
-            completionHandler(nil)
-        }
-    }
-}
-
-func render(audioContext: AudioContext?, stream: DeepSpeechStream) {
-    guard let audioContext = audioContext else {
-        fatalError("Couldn't create the audioContext")
-    }
-
-    let sampleRange: CountableRange<Int> = 0..<audioContext.totalSamples
-
-    guard let reader = try? AVAssetReader(asset: audioContext.asset) else {
-        fatalError("Couldn't initialize the AVAssetReader")
-    }
-
-    reader.timeRange = CMTimeRange(start: CMTime(value: Int64(sampleRange.lowerBound), timescale: audioContext.asset.duration.timescale),
-                                   duration: CMTime(value: Int64(sampleRange.count), timescale: audioContext.asset.duration.timescale))
-
-    let outputSettingsDict: [String: Any] = [
-        AVFormatIDKey: Int(kAudioFormatLinearPCM),
-        AVLinearPCMBitDepthKey: 16,
-        AVLinearPCMIsBigEndianKey: false,
-        AVLinearPCMIsFloatKey: false,
-        AVLinearPCMIsNonInterleaved: false
-    ]
-
-    let readerOutput = AVAssetReaderTrackOutput(track: audioContext.assetTrack, outputSettings: outputSettingsDict)
-    readerOutput.alwaysCopiesSampleData = false
-    reader.add(readerOutput)
-
-    var sampleBuffer = Data()
-
-    // 16-bit samples
-    reader.startReading()
-    defer { reader.cancelReading() }
-
-    while reader.status == .reading {
-        guard let readSampleBuffer = readerOutput.copyNextSampleBuffer(),
-              let readBuffer = CMSampleBufferGetDataBuffer(readSampleBuffer) else {
-            break
-        }
-
-        // Append audio sample buffer into our current sample buffer
-        var readBufferLength = 0
-        var readBufferPointer: UnsafeMutablePointer<Int8>?
-        CMBlockBufferGetDataPointer(readBuffer,
-                                    atOffset: 0,
-                                    lengthAtOffsetOut: &readBufferLength,
-                                    totalLengthOut: nil,
-                                    dataPointerOut: &readBufferPointer)
-        sampleBuffer.append(UnsafeBufferPointer(start: readBufferPointer, count: readBufferLength))
-        CMSampleBufferInvalidate(readSampleBuffer)
-
-        let totalSamples = sampleBuffer.count / MemoryLayout<Int16>.size
-        print("read \(totalSamples) samples")
-
-        sampleBuffer.withUnsafeBytes { (samples: UnsafeRawBufferPointer) in
-            let unsafeBufferPointer = samples.bindMemory(to: Int16.self)
-            stream.feedAudioContent(buffer: unsafeBufferPointer)
-        }
-
-        sampleBuffer.removeAll()
-    }
-
-    // if (reader.status == AVAssetReaderStatusFailed || reader.status == AVAssetReaderStatusUnknown)
-    guard reader.status == .completed else {
-        fatalError("Couldn't read the audio file")
-    }
-}
-
-func test(model: DeepSpeechModel, audioPath: String, completion: @escaping () -> ()) {
-    let url = URL(fileURLWithPath: audioPath)
-
-    let stream = try! model.createStream()
-    print("\(audioPath)")
-    let start = CFAbsoluteTimeGetCurrent()
-    AudioContext.load(fromAudioURL: url, completionHandler: { audioContext in
-        guard let audioContext = audioContext else {
-            fatalError("Couldn't create the audioContext")
-        }
-        render(audioContext: audioContext, stream: stream)
-        let result = stream.finishStream()
-        let end = CFAbsoluteTimeGetCurrent()
-        print("\"\(audioPath)\": \(end - start) - \(result)")
-        completion()
-    })
-}
 
 @UIApplicationMain
 class AppDelegate: UIResponder, UIApplicationDelegate {
 
     func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplication.LaunchOptionsKey: Any]?) -> Bool {
-        let model = try! DeepSpeechModel(modelPath: Bundle.main.path(forResource: "output_graph", ofType: "tflite")!)
-        try! model.enableExternalScorer(scorerPath: Bundle.main.path(forResource: "librispeech_en_utf8_nonpruned_o6", ofType: "scorer")!)
-
-        let files = [
-            "5639-40744-0008",
-            "1089-134686-0019",
-            "2094-142345-0053",
-            "8463-294825-0010",
-            "121-123852-0001",
-            "7021-79740-0008",
-            "6930-76324-0010",
-            "5105-28240-0001",
-            "1089-134691-0012",
-            "5142-33396-0027",
-            "260-123288-0004",
-            "6930-75918-0008",
-            "8463-294828-0005",
-            "61-70970-0002"
-        ]
-
-        let serialQueue = DispatchQueue(label: "serialQueue")
-        let group = DispatchGroup()
-        group.enter()
-        serialQueue.async {
-            test(model: model, audioPath: Bundle.main.path(forResource: "1284-134647-0003", ofType: "wav")!) {
-                group.leave()
-            }
-        }
-        for path in files {
-            group.wait()
-            group.enter()
-            test(model: model, audioPath: Bundle.main.path(forResource: path, ofType: "wav")!) {
-                group.leave()
-            }
-        }
         return true
     }
 
diff --git a/native_client/swift/deepspeech_ios_test/AudioContext.swift b/native_client/swift/deepspeech_ios_test/AudioContext.swift
new file mode 100644
index 00000000..60999bd3
--- /dev/null
+++ b/native_client/swift/deepspeech_ios_test/AudioContext.swift
@@ -0,0 +1,68 @@
+//
+//  AudioContext.swift
+//  deepspeech_ios_test
+//
+//  Created by Erik Ziegler on 27.07.20.
+//  Copyright © 2020 Mozilla. All rights reserved.
+//
+
+import Foundation
+import AVFoundation
+import AudioToolbox
+import Accelerate
+
+import deepspeech_ios
+
+/// Holds audio information used for building waveforms
+final class AudioContext {
+
+    /// The audio asset URL used to load the context
+    public let audioURL: URL
+
+    /// Total number of samples in loaded asset
+    public let totalSamples: Int
+
+    /// Loaded asset
+    public let asset: AVAsset
+
+    // Loaded assetTrack
+    public let assetTrack: AVAssetTrack
+
+    private init(audioURL: URL, totalSamples: Int, asset: AVAsset, assetTrack: AVAssetTrack) {
+        self.audioURL = audioURL
+        self.totalSamples = totalSamples
+        self.asset = asset
+        self.assetTrack = assetTrack
+    }
+
+    public static func load(fromAudioURL audioURL: URL, completionHandler: @escaping (_ audioContext: AudioContext?) -> ()) {
+        let asset = AVURLAsset(url: audioURL, options: [AVURLAssetPreferPreciseDurationAndTimingKey: NSNumber(value: true as Bool)])
+
+        guard let assetTrack = asset.tracks(withMediaType: AVMediaType.audio).first else {
+            fatalError("Couldn't load AVAssetTrack")
+        }
+
+        asset.loadValuesAsynchronously(forKeys: ["duration"]) {
+            var error: NSError?
+            let status = asset.statusOfValue(forKey: "duration", error: &error)
+            switch status {
+            case .loaded:
+                guard
+                    let formatDescriptions = assetTrack.formatDescriptions as? [CMAudioFormatDescription],
+                    let audioFormatDesc = formatDescriptions.first,
+                    let asbd = CMAudioFormatDescriptionGetStreamBasicDescription(audioFormatDesc)
+                    else { break }
+
+                let totalSamples = Int((asbd.pointee.mSampleRate) * Float64(asset.duration.value) / Float64(asset.duration.timescale))
+                let audioContext = AudioContext(audioURL: audioURL, totalSamples: totalSamples, asset: asset, assetTrack: assetTrack)
+                completionHandler(audioContext)
+                return
+
+            case .failed, .cancelled, .loading, .unknown:
+                print("Couldn't load asset: \(error?.localizedDescription ?? "Unknown error")")
+            }
+
+            completionHandler(nil)
+        }
+    }
+}
diff --git a/native_client/swift/deepspeech_ios_test/ContentView.swift b/native_client/swift/deepspeech_ios_test/ContentView.swift
index 5f7442f9..c33e6365 100644
--- a/native_client/swift/deepspeech_ios_test/ContentView.swift
+++ b/native_client/swift/deepspeech_ios_test/ContentView.swift
@@ -9,8 +9,38 @@
 import SwiftUI
 
 struct ContentView: View {
+    private var deepspeech = DeepSpeech()
+
+    @State var isRecognizingMicrophone = false
+
     var body: some View {
-        Text("Hello, World!")
+        VStack {
+            Text("DeepSpeech iOS Demo")
+                .font(.system(size: 30))
+            Button("Recognize files", action: recognizeFiles)
+                .padding(30)
+            Button(
+                isRecognizingMicrophone
+                    ? "Stop Microphone Recognition"
+                    : "Start Microphone Recognition",
+                action: isRecognizingMicrophone
+                    ? stopMicRecognition
+                    : startMicRecognition)
+                .padding(30)
+        }
+    }
+
+    func recognizeFiles() {
+        self.deepspeech.recognizeFiles()
+    }
+
+    func startMicRecognition() {
+        isRecognizingMicrophone = true
+        self.deepspeech.startMicrophoneRecognition()
+    }
+
+    func stopMicRecognition() {
+        isRecognizingMicrophone = false
+        self.deepspeech.stopMicrophoneRecognition()
     }
 }
diff --git a/native_client/swift/deepspeech_ios_test/DeepSpeech.swift b/native_client/swift/deepspeech_ios_test/DeepSpeech.swift
new file mode 100644
index 00000000..52124f17
--- /dev/null
+++ b/native_client/swift/deepspeech_ios_test/DeepSpeech.swift
@@ -0,0 +1,299 @@
+//
+//  DeepSpeech.swift
+//  deepspeech_ios_test
+//
+//  Created by Erik Ziegler on 27.07.20.
+//  Copyright © 2020 Mozilla. All rights reserved.
+//
+
+import Foundation
+import AVFoundation
+import AudioToolbox
+import Accelerate
+
+import deepspeech_ios
+
+struct FillComplexInputParm {
+    var source: UnsafeMutablePointer<Int8>
+    var sourceSize: UInt32
+};
+
+class DeepSpeech : NSObject, AVCaptureAudioDataOutputSampleBufferDelegate {
+    private var model: DeepSpeechModel
+    private var stream: DeepSpeechStream?
+
+    private var captureSession = AVCaptureSession()
+    private var audioData = Data()
+
+    override init() {
+        let modelPath = Bundle.main.path(forResource: "deepspeech-0.7.4-models", ofType: "tflite")!
+        let scorerPath = Bundle.main.path(forResource: "deepspeech-0.7.4-models", ofType: "scorer")!
+
+        model = try! DeepSpeechModel(modelPath: modelPath)
+        try! model.enableExternalScorer(scorerPath: scorerPath)
+
+        super.init()
+
+        // prepare audio capture
+        self.configureCaptureSession()
+    }
+
+    // MARK: Microphone recognition
+
+    private func configureCaptureSession() {
+        captureSession.beginConfiguration()
+
+        let audioDevice = AVCaptureDevice.default(.builtInMicrophone, for: .audio, position: .unspecified)
+
+        let audioDeviceInput = try! AVCaptureDeviceInput(device: audioDevice!)
+        guard captureSession.canAddInput(audioDeviceInput) else { return }
+        captureSession.addInput(audioDeviceInput)
+
+        let serialQueue = DispatchQueue(label: "serialQueue")
+        let audioOutput = AVCaptureAudioDataOutput()
+        audioOutput.setSampleBufferDelegate(self, queue: serialQueue)
+
+        guard captureSession.canAddOutput(audioOutput) else { return }
+        captureSession.sessionPreset = .inputPriority
+        captureSession.addOutput(audioOutput)
+        captureSession.commitConfiguration()
+    }
+
+    func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
+        // The capture session delivers audio in the device's native format;
+        // convert it to the 16 kHz 16-bit PCM that the DeepSpeech model expects.
+        var sourceFormat = (sampleBuffer.formatDescription?.audioFormatList[0].mASBD)!
+        var destinationFormat = sourceFormat
+        destinationFormat.mSampleRate = 16000.0
+
+        var audioConverterRef: AudioConverterRef?
+        let createConverterStatus = AudioConverterNew(&sourceFormat, &destinationFormat, &audioConverterRef)
+
+        if (createConverterStatus != noErr) {
+            print("Error creating converter")
+        }
+
+        var quality = kAudioConverterQuality_Max
+
+        AudioConverterSetProperty(audioConverterRef!, kAudioConverterSampleRateConverterQuality, UInt32(MemoryLayout<UInt32>.size), &quality)
+
+        let blockBuffer = CMSampleBufferGetDataBuffer(sampleBuffer)
+
+        var pcmLength: Int = 0
+        var pcmData: UnsafeMutablePointer<Int8>?
+        let status: OSStatus = CMBlockBufferGetDataPointer(blockBuffer!, atOffset: 0, lengthAtOffsetOut: nil, totalLengthOut: &pcmLength, dataPointerOut: &pcmData)
+
+        if status != noErr {
+            print("Error getting something")
+        } else {
+            var input = FillComplexInputParm(source: pcmData!, sourceSize: UInt32(pcmLength))
+
+            let outputBuffer = malloc(pcmLength)
+            memset(outputBuffer, 0, pcmLength);
+
+            var outputBufferList = AudioBufferList()
+            outputBufferList.mNumberBuffers = 1
+            outputBufferList.mBuffers.mData = outputBuffer
+            outputBufferList.mBuffers.mDataByteSize = UInt32(Double(pcmLength) * destinationFormat.mSampleRate / sourceFormat.mSampleRate)
+            outputBufferList.mBuffers.mNumberChannels = 1
+
+            func inputDataProc(
+                inAudioConverter: AudioConverterRef,
+                ioNumberDataPacket: UnsafeMutablePointer<UInt32>,
+                ioData: UnsafeMutablePointer<AudioBufferList>,
+                outDataPacketDescription: UnsafeMutablePointer<UnsafeMutablePointer<AudioStreamPacketDescription>?>?,
+                inUserData: UnsafeMutableRawPointer?
+            ) -> OSStatus {
+                var inputPtr = inUserData!.load(as: FillComplexInputParm.self)
+
+                if (inputPtr.sourceSize <= 0) {
+                    ioNumberDataPacket.pointee = 1
+                    return -1
+                }
+
+                let rawPtr = UnsafeMutableRawPointer(inputPtr.source)
+
+                ioData.pointee.mNumberBuffers = 1
+                ioData.pointee.mBuffers.mData = rawPtr
+                ioData.pointee.mBuffers.mDataByteSize = inputPtr.sourceSize
+                ioData.pointee.mBuffers.mNumberChannels = 1
+
+                // One packet per 16-bit sample, so half as many packets as bytes.
+                ioNumberDataPacket.pointee = (inputPtr.sourceSize / 2)
+                inputPtr.sourceSize = 0
+
+                return noErr
+            };
+
+            var packetSize: UInt32 = UInt32(pcmLength / 2)
+
+            let status: OSStatus = AudioConverterFillComplexBuffer(audioConverterRef!, inputDataProc, &input, &packetSize, &outputBufferList, nil)
+
+            if (status != noErr) {
+                print("Error: " + status.description)
+            } else {
+                let data = outputBufferList.mBuffers.mData!
+                let byteSize = outputBufferList.mBuffers.mDataByteSize
+
+                let shorts = UnsafeBufferPointer(start: data.assumingMemoryBound(to: Int16.self), count: Int(byteSize / 2))
+                stream!.feedAudioContent(buffer: shorts)
+                let intermediateResult = stream!.intermediateDecode()
+                print("Intermediate result: " + intermediateResult)
+
+                // save bytes to audio data for creating a pcm file later for the captured audio
+                let ptr = UnsafePointer(data.assumingMemoryBound(to: UInt8.self))
+                audioData.append(ptr, count: Int(byteSize))
+            }
+
+            free(outputBuffer)
+            AudioConverterDispose(audioConverterRef!)
+        }
+    }
+
+    public func startMicrophoneRecognition() {
+        audioData = Data()
+        stream = try! model.createStream()
+        captureSession.startRunning()
+        print("Started listening...")
+    }
+
+    private func writeAudioDataToPCMFile() {
+        let documents = NSSearchPathForDirectoriesInDomains(FileManager.SearchPathDirectory.documentDirectory, FileManager.SearchPathDomainMask.userDomainMask, true)[0]
+        let filePath = documents + "/recording.pcm"
+        let url = URL(fileURLWithPath: filePath)
+        try! audioData.write(to: url)
+        print("Saved audio to " + filePath)
+    }
+
+    public func stopMicrophoneRecognition() {
+        captureSession.stopRunning()
+
+        let result = stream?.finishStream()
+        print("Result: " + result!)
+
+        // optional, useful for checking the recorded audio
+        writeAudioDataToPCMFile()
+    }
+
+    // MARK: Audio file recognition
+
+    private func render(audioContext: AudioContext?, stream: DeepSpeechStream) {
+        guard let audioContext = audioContext else {
+            fatalError("Couldn't create the audioContext")
+        }
+
+        let sampleRange: CountableRange<Int> = 0..<audioContext.totalSamples
+
+        guard let reader = try? AVAssetReader(asset: audioContext.asset) else {
+            fatalError("Couldn't initialize the AVAssetReader")
+        }
+
+        reader.timeRange = CMTimeRange(start: CMTime(value: Int64(sampleRange.lowerBound), timescale: audioContext.asset.duration.timescale),
+                                       duration: CMTime(value: Int64(sampleRange.count), timescale: audioContext.asset.duration.timescale))
+
+        let outputSettingsDict: [String: Any] = [
+            AVFormatIDKey: Int(kAudioFormatLinearPCM),
+            AVLinearPCMBitDepthKey: 16,
+            AVLinearPCMIsBigEndianKey: false,
+            AVLinearPCMIsFloatKey: false,
+            AVLinearPCMIsNonInterleaved: false
+        ]
+
+        let readerOutput = AVAssetReaderTrackOutput(track: audioContext.assetTrack, outputSettings: outputSettingsDict)
+        readerOutput.alwaysCopiesSampleData = false
+        reader.add(readerOutput)
+
+        var sampleBuffer = Data()
+
+        // 16-bit samples
+        reader.startReading()
+        defer { reader.cancelReading() }
+
+        while reader.status == .reading {
+            guard let readSampleBuffer = readerOutput.copyNextSampleBuffer(),
+                  let readBuffer = CMSampleBufferGetDataBuffer(readSampleBuffer) else {
+                break
+            }
+
+            // Append audio sample buffer into our current sample buffer
+            var readBufferLength = 0
+            var readBufferPointer: UnsafeMutablePointer<Int8>?
+            CMBlockBufferGetDataPointer(readBuffer,
+                                        atOffset: 0,
+                                        lengthAtOffsetOut: &readBufferLength,
+                                        totalLengthOut: nil,
+                                        dataPointerOut: &readBufferPointer)
+            sampleBuffer.append(UnsafeBufferPointer(start: readBufferPointer, count: readBufferLength))
+            CMSampleBufferInvalidate(readSampleBuffer)
+
+            let totalSamples = sampleBuffer.count / MemoryLayout<Int16>.size
+            print("read \(totalSamples) samples")
+
+            sampleBuffer.withUnsafeBytes { (samples: UnsafeRawBufferPointer) in
+                let unsafeBufferPointer = samples.bindMemory(to: Int16.self)
+                stream.feedAudioContent(buffer: unsafeBufferPointer)
+            }
+
+            sampleBuffer.removeAll()
+        }
+
+        // if (reader.status == AVAssetReaderStatusFailed || reader.status == AVAssetReaderStatusUnknown)
+        guard reader.status == .completed else {
+            fatalError("Couldn't read the audio file")
+        }
+    }
+
+    private func recognizeFile(audioPath: String, completion: @escaping () -> ()) {
+        let url = URL(fileURLWithPath: audioPath)
+
+        let stream = try! model.createStream()
+        print("\(audioPath)")
+        let start = CFAbsoluteTimeGetCurrent()
+        AudioContext.load(fromAudioURL: url, completionHandler: { audioContext in
+            guard let audioContext = audioContext else {
+                fatalError("Couldn't create the audioContext")
+            }
+            self.render(audioContext: audioContext, stream: stream)
+            let result = stream.finishStream()
+            let end = CFAbsoluteTimeGetCurrent()
+            print("\"\(audioPath)\": \(end - start) - \(result)")
+            completion()
+        })
+    }
+
+    public func recognizeFiles() {
+        let files = [
+            "5639-40744-0008",
+            "1089-134686-0019",
+            "2094-142345-0053",
+            "8463-294825-0010",
+            "121-123852-0001",
+            "7021-79740-0008",
+            "6930-76324-0010",
+            "5105-28240-0001",
+            "1089-134691-0012",
+            "5142-33396-0027",
+            "260-123288-0004",
+            "6930-75918-0008",
+            "8463-294828-0005",
+            "61-70970-0002"
+        ]
+
+        let serialQueue = DispatchQueue(label: "serialQueue")
+        let group = DispatchGroup()
+        group.enter()
+
+        serialQueue.async {
+            self.recognizeFile(audioPath: Bundle.main.path(forResource: "1284-134647-0003", ofType: "wav")!) {
+                group.leave()
+            }
+        }
+
+        for path in files {
+            group.wait()
+            group.enter()
+            self.recognizeFile(audioPath: Bundle.main.path(forResource: path, ofType: "wav")!) {
+                group.leave()
+            }
+        }
+    }
+}
diff --git a/native_client/swift/deepspeech_ios_test/Info.plist b/native_client/swift/deepspeech_ios_test/Info.plist
index 9742bf0f..1682607b 100644
--- a/native_client/swift/deepspeech_ios_test/Info.plist
+++ b/native_client/swift/deepspeech_ios_test/Info.plist
@@ -4,6 +4,8 @@
 <dict>
 	<key>CFBundleDevelopmentRegion</key>
 	<string>$(DEVELOPMENT_LANGUAGE)</string>
+	<key>NSMicrophoneUsageDescription</key>
+	<string>Please grant access to the microphone.</string>
 	<key>CFBundleExecutable</key>
 	<string>$(EXECUTABLE_NAME)</string>
 	<key>CFBundleIdentifier</key>