diff --git a/native_client/swift/deepspeech_ios_test.xcodeproj/project.pbxproj b/native_client/swift/deepspeech_ios_test.xcodeproj/project.pbxproj
index eadc4fae..a57f983c 100644
--- a/native_client/swift/deepspeech_ios_test.xcodeproj/project.pbxproj
+++ b/native_client/swift/deepspeech_ios_test.xcodeproj/project.pbxproj
@@ -7,6 +7,9 @@
 	objects = {

 /* Begin PBXBuildFile section */
+		504EC34324CF4EFD0073C22E /* SpeechRecognitionImpl.swift in Sources */ = {isa = PBXBuildFile; fileRef = 504EC34124CF4EFD0073C22E /* SpeechRecognitionImpl.swift */; };
+		504EC34424CF4EFD0073C22E /* AudioContext.swift in Sources */ = {isa = PBXBuildFile; fileRef = 504EC34224CF4EFD0073C22E /* AudioContext.swift */; };
+		504EC34524CF4F4F0073C22E /* deepspeech_ios.framework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = 507CD3A024B61FE400409BBB /* deepspeech_ios.framework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; };
 		507CD3A124B61FE400409BBB /* deepspeech_ios.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 507CD3A024B61FE400409BBB /* deepspeech_ios.framework */; };
 		507CD3A324B61FEB00409BBB /* libdeepspeech.so in Frameworks */ = {isa = PBXBuildFile; fileRef = 507CD3A224B61FEA00409BBB /* libdeepspeech.so */; };
 		507CD3A424B61FFC00409BBB /* libdeepspeech.so in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = 507CD3A224B61FEA00409BBB /* libdeepspeech.so */; settings = {ATTRIBUTES = (CodeSignOnCopy, ); }; };
@@ -44,6 +47,7 @@
 			dstPath = "";
 			dstSubfolderSpec = 10;
 			files = (
+				504EC34524CF4F4F0073C22E /* deepspeech_ios.framework in Embed Frameworks */,
 				507CD3A424B61FFC00409BBB /* libdeepspeech.so in Embed Frameworks */,
 			);
 			name = "Embed Frameworks";
@@ -52,8 +56,10 @@
 /* End PBXCopyFilesBuildPhase section */

 /* Begin PBXFileReference section */
+		504EC34124CF4EFD0073C22E /* SpeechRecognitionImpl.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = SpeechRecognitionImpl.swift; sourceTree = "<group>"; };
+		504EC34224CF4EFD0073C22E /* AudioContext.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = AudioContext.swift; sourceTree = "<group>"; };
 		507CD3A024B61FE400409BBB /* deepspeech_ios.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; path = deepspeech_ios.framework; sourceTree = BUILT_PRODUCTS_DIR; };
-		507CD3A224B61FEA00409BBB /* libdeepspeech.so */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libdeepspeech.so; path = libdeepspeech.so; sourceTree = "<group>"; };
+		507CD3A224B61FEA00409BBB /* libdeepspeech.so */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; path = libdeepspeech.so; sourceTree = "<group>"; };
 		50F787EF2497683900D52237 /* deepspeech_ios_test.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = deepspeech_ios_test.app; sourceTree = BUILT_PRODUCTS_DIR; };
 		50F787F22497683900D52237 /* AppDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppDelegate.swift; sourceTree = "<group>"; };
 		50F787F42497683900D52237 /* SceneDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SceneDelegate.swift; sourceTree = "<group>"; };
@@ -130,6 +136,8 @@
 		50F787F12497683900D52237 /* deepspeech_ios_test */ = {
 			isa = PBXGroup;
 			children = (
+				504EC34224CF4EFD0073C22E /* AudioContext.swift */,
+				504EC34124CF4EFD0073C22E /* SpeechRecognitionImpl.swift */,
 				50F787F22497683900D52237 /* AppDelegate.swift */,
 				50F787F42497683900D52237 /* SceneDelegate.swift */,
 				50F787F62497683900D52237 /* ContentView.swift */,
@@ -299,7 +307,9 @@
 			isa = PBXSourcesBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
+				504EC34424CF4EFD0073C22E /* AudioContext.swift in Sources */,
 				50F787F32497683900D52237 /* AppDelegate.swift in Sources */,
+				504EC34324CF4EFD0073C22E /* SpeechRecognitionImpl.swift in Sources */,
 				50F787F52497683900D52237 /* SceneDelegate.swift in Sources */,
 				50F787F72497683900D52237 /* ContentView.swift in Sources */,
 			);
diff --git a/native_client/swift/deepspeech_ios_test/AppDelegate.swift b/native_client/swift/deepspeech_ios_test/AppDelegate.swift
index a2dcb427..32753486 100644
--- a/native_client/swift/deepspeech_ios_test/AppDelegate.swift
+++ b/native_client/swift/deepspeech_ios_test/AppDelegate.swift
@@ -7,190 +7,10 @@
 //

 import UIKit
-import Foundation
-import AVFoundation
-import AudioToolbox
-import Accelerate
-
-import deepspeech_ios
-
-/// Holds audio information used for building waveforms
-final class AudioContext {
-
-    /// The audio asset URL used to load the context
-    public let audioURL: URL
-
-    /// Total number of samples in loaded asset
-    public let totalSamples: Int
-
-    /// Loaded asset
-    public let asset: AVAsset
-
-    // Loaded assetTrack
-    public let assetTrack: AVAssetTrack
-
-    private init(audioURL: URL, totalSamples: Int, asset: AVAsset, assetTrack: AVAssetTrack) {
-        self.audioURL = audioURL
-        self.totalSamples = totalSamples
-        self.asset = asset
-        self.assetTrack = assetTrack
-    }
-
-    public static func load(fromAudioURL audioURL: URL, completionHandler: @escaping (_ audioContext: AudioContext?) -> ()) {
-        let asset = AVURLAsset(url: audioURL, options: [AVURLAssetPreferPreciseDurationAndTimingKey: NSNumber(value: true as Bool)])
-
-        guard let assetTrack = asset.tracks(withMediaType: AVMediaType.audio).first else {
-            fatalError("Couldn't load AVAssetTrack")
-        }
-
-        asset.loadValuesAsynchronously(forKeys: ["duration"]) {
-            var error: NSError?
-            let status = asset.statusOfValue(forKey: "duration", error: &error)
-            switch status {
-            case .loaded:
-                guard
-                    let formatDescriptions = assetTrack.formatDescriptions as? [CMAudioFormatDescription],
-                    let audioFormatDesc = formatDescriptions.first,
-                    let asbd = CMAudioFormatDescriptionGetStreamBasicDescription(audioFormatDesc)
-                    else { break }
-
-                let totalSamples = Int((asbd.pointee.mSampleRate) * Float64(asset.duration.value) / Float64(asset.duration.timescale))
-                let audioContext = AudioContext(audioURL: audioURL, totalSamples: totalSamples, asset: asset, assetTrack: assetTrack)
-                completionHandler(audioContext)
-                return
-
-            case .failed, .cancelled, .loading, .unknown:
-                print("Couldn't load asset: \(error?.localizedDescription ?? "Unknown error")")
-            }
-
-            completionHandler(nil)
-        }
-    }
-}
-
-func render(audioContext: AudioContext?, stream: DeepSpeechStream) {
-    guard let audioContext = audioContext else {
-        fatalError("Couldn't create the audioContext")
-    }
-
-    let sampleRange: CountableRange<Int> = 0..<audioContext.totalSamples
-
-    guard let reader = try? AVAssetReader(asset: audioContext.asset) else {
-        fatalError("Couldn't initialize the AVAssetReader")
-    }
-
-    reader.timeRange = CMTimeRange(start: CMTime(value: Int64(sampleRange.lowerBound), timescale: audioContext.asset.duration.timescale),
-                                   duration: CMTime(value: Int64(sampleRange.count), timescale: audioContext.asset.duration.timescale))
-
-    let outputSettingsDict: [String: Any] = [
-        AVFormatIDKey: Int(kAudioFormatLinearPCM),
-        AVLinearPCMBitDepthKey: 16,
-        AVLinearPCMIsBigEndianKey: false,
-        AVLinearPCMIsFloatKey: false,
-        AVLinearPCMIsNonInterleaved: false
-    ]
-
-    let readerOutput = AVAssetReaderTrackOutput(track: audioContext.assetTrack, outputSettings: outputSettingsDict)
-    readerOutput.alwaysCopiesSampleData = false
-    reader.add(readerOutput)
-
-    var sampleBuffer = Data()
-
-    // 16-bit samples
-    reader.startReading()
-    defer { reader.cancelReading() }
-
-    while reader.status == .reading {
-        guard let readSampleBuffer = readerOutput.copyNextSampleBuffer(),
-              let readBuffer = CMSampleBufferGetDataBuffer(readSampleBuffer) else {
-            break
-        }
-
-        // Append the decoded audio into our current sample buffer
-        var readBufferLength = 0
-        var readBufferPointer: UnsafeMutablePointer<Int8>?
-        CMBlockBufferGetDataPointer(readBuffer,
-                                    atOffset: 0,
-                                    lengthAtOffsetOut: &readBufferLength,
-                                    totalLengthOut: nil,
-                                    dataPointerOut: &readBufferPointer)
-        sampleBuffer.append(UnsafeBufferPointer(start: readBufferPointer, count: readBufferLength))
-        CMSampleBufferInvalidate(readSampleBuffer)
-
-        let totalSamples = sampleBuffer.count / MemoryLayout<Int16>.size
-        print("read \(totalSamples) samples")
-
-        sampleBuffer.withUnsafeBytes { (samples: UnsafeRawBufferPointer) in
-            let unsafeBufferPointer = samples.bindMemory(to: Int16.self)
-            stream.feedAudioContent(buffer: unsafeBufferPointer)
-        }
-
-        sampleBuffer.removeAll()
-    }
-
-    // if (reader.status == AVAssetReaderStatusFailed || reader.status == AVAssetReaderStatusUnknown)
-    guard reader.status == .completed else {
-        fatalError("Couldn't read the audio file")
-    }
-}
-
-func test(model: DeepSpeechModel, audioPath: String, completion: @escaping () -> ()) {
-    let url = URL(fileURLWithPath: audioPath)
-
-    let stream = try! model.createStream()
-    print("\(audioPath)")
-    let start = CFAbsoluteTimeGetCurrent()
-    AudioContext.load(fromAudioURL: url, completionHandler: { audioContext in
-        guard let audioContext = audioContext else {
-            fatalError("Couldn't create the audioContext")
-        }
-        render(audioContext: audioContext, stream: stream)
-        let result = stream.finishStream()
-        let end = CFAbsoluteTimeGetCurrent()
-        print("\"\(audioPath)\": \(end - start) - \(result)")
-        completion()
-    })
-}

 @UIApplicationMain
 class AppDelegate: UIResponder, UIApplicationDelegate {

     func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplication.LaunchOptionsKey: Any]?) -> Bool {
-        let model = try! DeepSpeechModel(modelPath: Bundle.main.path(forResource: "output_graph", ofType: "tflite")!)
-        try! model.enableExternalScorer(scorerPath: Bundle.main.path(forResource: "librispeech_en_utf8_nonpruned_o6", ofType: "scorer")!)
-
-        let files = [
-            "5639-40744-0008",
-            "1089-134686-0019",
-            "2094-142345-0053",
-            "8463-294825-0010",
-            "121-123852-0001",
-            "7021-79740-0008",
-            "6930-76324-0010",
-            "5105-28240-0001",
-            "1089-134691-0012",
-            "5142-33396-0027",
-            "260-123288-0004",
-            "6930-75918-0008",
-            "8463-294828-0005",
-            "61-70970-0002"
-        ]
-
-        let serialQueue = DispatchQueue(label: "serialQueue")
-        let group = DispatchGroup()
-        group.enter()
-        serialQueue.async {
-            test(model: model, audioPath: Bundle.main.path(forResource: "1284-134647-0003", ofType: "wav")!) {
-                group.leave()
-            }
-        }
-        for path in files {
-            group.wait()
-            group.enter()
-            test(model: model, audioPath: Bundle.main.path(forResource: path, ofType: "wav")!) {
-                group.leave()
-            }
-        }
         return true
     }
diff --git a/native_client/swift/deepspeech_ios_test/AudioContext.swift b/native_client/swift/deepspeech_ios_test/AudioContext.swift
new file mode 100644
index 00000000..60999bd3
--- /dev/null
+++ b/native_client/swift/deepspeech_ios_test/AudioContext.swift
@@ -0,0 +1,68 @@
+//
+//  AudioContext.swift
+//  deepspeech_ios_test
+//
+//  Created by Erik Ziegler on 27.07.20.
+//  Copyright © 2020 Mozilla. All rights reserved.
+//
+
+import Foundation
+import AVFoundation
+import AudioToolbox
+import Accelerate
+
+import deepspeech_ios
+
+/// Holds audio information used for building waveforms
+final class AudioContext {
+
+    /// The audio asset URL used to load the context
+    public let audioURL: URL
+
+    /// Total number of samples in loaded asset
+    public let totalSamples: Int
+
+    /// Loaded asset
+    public let asset: AVAsset
+
+    // Loaded assetTrack
+    public let assetTrack: AVAssetTrack
+
+    private init(audioURL: URL, totalSamples: Int, asset: AVAsset, assetTrack: AVAssetTrack) {
+        self.audioURL = audioURL
+        self.totalSamples = totalSamples
+        self.asset = asset
+        self.assetTrack = assetTrack
+    }
+
+    public static func load(fromAudioURL audioURL: URL, completionHandler: @escaping (_ audioContext: AudioContext?) -> ()) {
+        let asset = AVURLAsset(url: audioURL, options: [AVURLAssetPreferPreciseDurationAndTimingKey: NSNumber(value: true as Bool)])
+
+        guard let assetTrack = asset.tracks(withMediaType: AVMediaType.audio).first else {
+            fatalError("Couldn't load AVAssetTrack")
+        }
+
+        asset.loadValuesAsynchronously(forKeys: ["duration"]) {
+            var error: NSError?
+            let status = asset.statusOfValue(forKey: "duration", error: &error)
+            switch status {
+            case .loaded:
+                guard
+                    let formatDescriptions = assetTrack.formatDescriptions as? [CMAudioFormatDescription],
+                    let audioFormatDesc = formatDescriptions.first,
+                    let asbd = CMAudioFormatDescriptionGetStreamBasicDescription(audioFormatDesc)
+                    else { break }
+
+                let totalSamples = Int((asbd.pointee.mSampleRate) * Float64(asset.duration.value) / Float64(asset.duration.timescale))
+                let audioContext = AudioContext(audioURL: audioURL, totalSamples: totalSamples, asset: asset, assetTrack: assetTrack)
+                completionHandler(audioContext)
+                return
+
+            case .failed, .cancelled, .loading, .unknown:
+                print("Couldn't load asset: \(error?.localizedDescription ?? "Unknown error")")
+            }
+
+            completionHandler(nil)
+        }
+    }
+}
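Usage note (not part of the patch): AudioContext only gathers metadata about the asset; the decoding itself happens later in render(audioContext:stream:). A minimal usage sketch, with a placeholder resource name, could look like this:

```swift
import Foundation

// Hypothetical usage sketch: load an AudioContext for a bundled WAV file and
// print its total sample count. "sample" is a placeholder resource name.
func printSampleCount() {
    guard let path = Bundle.main.path(forResource: "sample", ofType: "wav") else {
        print("sample.wav not found in the app bundle")
        return
    }
    AudioContext.load(fromAudioURL: URL(fileURLWithPath: path)) { audioContext in
        guard let audioContext = audioContext else {
            print("Couldn't load the audio context")
            return
        }
        // totalSamples is derived from the track's sample rate and the asset duration.
        print("sample.wav: \(audioContext.totalSamples) samples")
    }
}
```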
diff --git a/native_client/swift/deepspeech_ios_test/ContentView.swift b/native_client/swift/deepspeech_ios_test/ContentView.swift
index 5f7442f9..0eb7c776 100644
--- a/native_client/swift/deepspeech_ios_test/ContentView.swift
+++ b/native_client/swift/deepspeech_ios_test/ContentView.swift
@@ -9,8 +9,38 @@
 import SwiftUI

 struct ContentView: View {
+    private var stt = SpeechRecognitionImpl()
+    @State var isRecognizingMicrophone = false
+
     var body: some View {
-        Text("Hello, World!")
+        VStack {
+            Text("DeepSpeech iOS Demo")
+                .font(.system(size: 30))
+            Button("Recognize files", action: recognizeFiles)
+                .padding(30)
+            Button(
+                isRecognizingMicrophone
+                    ? "Stop Microphone Recognition"
+                    : "Start Microphone Recognition",
+                action: isRecognizingMicrophone
+                    ? stopMicRecognition
+                    : startMicRecognition)
+                .padding(30)
+        }
+    }
+
+    func recognizeFiles() {
+        self.stt.recognizeFiles()
+    }
+
+    func startMicRecognition() {
+        isRecognizingMicrophone = true
+        self.stt.startMicrophoneRecognition()
+    }
+
+    func stopMicRecognition() {
+        isRecognizingMicrophone = false
+        self.stt.stopMicrophoneRecognition()
     }
 }
diff --git a/native_client/swift/deepspeech_ios_test/Info.plist b/native_client/swift/deepspeech_ios_test/Info.plist
index 9742bf0f..1682607b 100644
--- a/native_client/swift/deepspeech_ios_test/Info.plist
+++ b/native_client/swift/deepspeech_ios_test/Info.plist
@@ -4,6 +4,8 @@
 <dict>
 	<key>CFBundleDevelopmentRegion</key>
 	<string>$(DEVELOPMENT_LANGUAGE)</string>
+	<key>NSMicrophoneUsageDescription</key>
+	<string>Please grant access to the microphone.</string>
 	<key>CFBundleExecutable</key>
 	<string>$(EXECUTABLE_NAME)</string>
 	<key>CFBundleIdentifier</key>
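Usage note (not part of the patch): NSMicrophoneUsageDescription only supplies the text shown in the system permission prompt; the prompt itself appears the first time the capture session touches the microphone. If the app should ask for access explicitly before starting recognition, a sketch along these lines would work (the helper name and call site are assumptions, not something this patch adds):

```swift
import AVFoundation

// Hypothetical helper: request microphone access up front and only start
// recognition once the user has granted it.
func startRecognitionWithPermission(_ stt: SpeechRecognitionImpl) {
    switch AVCaptureDevice.authorizationStatus(for: .audio) {
    case .authorized:
        stt.startMicrophoneRecognition()
    case .notDetermined:
        AVCaptureDevice.requestAccess(for: .audio) { granted in
            guard granted else { return }
            DispatchQueue.main.async {
                stt.startMicrophoneRecognition()
            }
        }
    default:
        print("Microphone access denied or restricted")
    }
}
```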
diff --git a/native_client/swift/deepspeech_ios_test/SpeechRecognitionImpl.swift b/native_client/swift/deepspeech_ios_test/SpeechRecognitionImpl.swift
new file mode 100644
index 00000000..b3a4ac9b
--- /dev/null
+++ b/native_client/swift/deepspeech_ios_test/SpeechRecognitionImpl.swift
@@ -0,0 +1,286 @@
+//
+//  DeepSpeech.swift
+//  deepspeech_ios_test
+//
+//  Created by Erik Ziegler on 27.07.20.
+//  Copyright © 2020 Mozilla. All rights reserved.
+//
+
+import Foundation
+import AVFoundation
+import AudioToolbox
+import Accelerate
+
+import deepspeech_ios
+
+struct FillComplexInputParm {
+    var source: UnsafeMutablePointer<Int8>
+    var sourceSize: UInt32
+};
+
+class SpeechRecognitionImpl : NSObject, AVCaptureAudioDataOutputSampleBufferDelegate {
+    private var model: DeepSpeechModel
+    private var stream: DeepSpeechStream?
+
+    private var captureSession = AVCaptureSession()
+    private var audioData = Data()
+
+    override init() {
+        let modelPath = Bundle.main.path(forResource: "deepspeech-0.7.4-models", ofType: "tflite")!
+        let scorerPath = Bundle.main.path(forResource: "deepspeech-0.7.4-models", ofType: "scorer")!
+
+        model = try! DeepSpeechModel(modelPath: modelPath)
+        try! model.enableExternalScorer(scorerPath: scorerPath)
+
+        super.init()
+
+        // prepare audio capture
+        self.configureCaptureSession()
+    }
+
+    // MARK: Microphone recognition
+
+    private func configureCaptureSession() {
+        captureSession.beginConfiguration()
+
+        let audioDevice = AVCaptureDevice.default(.builtInMicrophone, for: .audio, position: .unspecified)
+
+        let audioDeviceInput = try! AVCaptureDeviceInput(device: audioDevice!)
+        guard captureSession.canAddInput(audioDeviceInput) else { return }
+        captureSession.addInput(audioDeviceInput)
+
+        let serialQueue = DispatchQueue(label: "serialQueue")
+        let audioOutput = AVCaptureAudioDataOutput()
+        audioOutput.setSampleBufferDelegate(self, queue: serialQueue)
+
+        guard captureSession.canAddOutput(audioOutput) else { return }
+        captureSession.sessionPreset = .inputPriority
+        captureSession.addOutput(audioOutput)
+        captureSession.commitConfiguration()
+    }
+
+    func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
+        var sourceFormat = (sampleBuffer.formatDescription?.audioFormatList[0].mASBD)!
+        var destinationFormat = sourceFormat
+        destinationFormat.mSampleRate = 16000.0
+
+        var audioConverterRef: AudioConverterRef?
+        let createConverterStatus = AudioConverterNew(&sourceFormat, &destinationFormat, &audioConverterRef)
+
+        if (createConverterStatus != noErr) {
+            print("Error creating converter")
+        }
+
+        var quality = kAudioConverterQuality_Max
+
+        AudioConverterSetProperty(audioConverterRef!, kAudioConverterSampleRateConverterQuality, UInt32(MemoryLayout<UInt32>.size), &quality)
+
+        let blockBuffer = CMSampleBufferGetDataBuffer(sampleBuffer)
+
+        var pcmLength: Int = 0
+        var pcmData: UnsafeMutablePointer<Int8>?
+        let status: OSStatus = CMBlockBufferGetDataPointer(blockBuffer!, atOffset: 0, lengthAtOffsetOut: nil, totalLengthOut: &pcmLength, dataPointerOut: &pcmData)
+
+        if status != noErr {
+            print("Error getting something")
+        } else {
+            var input = FillComplexInputParm(source: pcmData!, sourceSize: UInt32(pcmLength))
+
+            let outputBuffer = malloc(pcmLength)
+            memset(outputBuffer, 0, pcmLength);
+
+            var outputBufferList = AudioBufferList()
+            outputBufferList.mNumberBuffers = 1
+            outputBufferList.mBuffers.mData = outputBuffer
+            outputBufferList.mBuffers.mDataByteSize = UInt32(Double(pcmLength) * destinationFormat.mSampleRate / sourceFormat.mSampleRate)
+            outputBufferList.mBuffers.mNumberChannels = 1
+
+            func inputDataProc(
+                inAudioConverter: AudioConverterRef,
+                ioNumberDataPacket: UnsafeMutablePointer<UInt32>,
+                ioData: UnsafeMutablePointer<AudioBufferList>,
+                outDataPacketDescription: UnsafeMutablePointer<UnsafeMutablePointer<AudioStreamPacketDescription>?>?,
+                inUserData: UnsafeMutableRawPointer?
+            ) -> OSStatus {
+                var inputPtr = inUserData!.load(as: FillComplexInputParm.self)
+
+                if (inputPtr.sourceSize <= 0) {
+                    ioNumberDataPacket.pointee = 1
+                    return -1
+                }
+
+                let rawPtr = UnsafeMutableRawPointer(inputPtr.source)
+
+                ioData.pointee.mNumberBuffers = 1
+                ioData.pointee.mBuffers.mData = rawPtr
+                ioData.pointee.mBuffers.mDataByteSize = inputPtr.sourceSize
+                ioData.pointee.mBuffers.mNumberChannels = 1
+
+                ioNumberDataPacket.pointee = (inputPtr.sourceSize / 2)
+                inputPtr.sourceSize = 0
+
+                return noErr
+            };
+
+            var packetSize: UInt32 = UInt32(pcmLength / 2)
+
+            let status: OSStatus = AudioConverterFillComplexBuffer(audioConverterRef!, inputDataProc, &input, &packetSize, &outputBufferList, nil)
+
+            if (status != noErr) {
+                print("Error: " + status.description)
+            } else {
+                let data = outputBufferList.mBuffers.mData!
+                let byteSize = outputBufferList.mBuffers.mDataByteSize
+
+                let shorts = UnsafeBufferPointer(start: data.assumingMemoryBound(to: Int16.self), count: Int(byteSize / 2))
+                stream!.feedAudioContent(buffer: shorts)
+
+                // save bytes to audio data for creating a pcm file later for the captured audio
+                let ptr = UnsafePointer(data.assumingMemoryBound(to: UInt8.self))
+                audioData.append(ptr, count: Int(byteSize))
+            }
+
+            free(outputBuffer)
+            AudioConverterDispose(audioConverterRef!)
+        }
+    }
+
+
+    public func startMicrophoneRecognition() {
+        audioData = Data()
+        stream = try! model.createStream()
+        captureSession.startRunning()
+        print("Started listening...")
+    }
+
+    private func writeAudioDataToPCMFile() {
+        let documents = NSSearchPathForDirectoriesInDomains(FileManager.SearchPathDirectory.documentDirectory, FileManager.SearchPathDomainMask.userDomainMask, true)[0]
+        let filePath = documents + "/recording.pcm"
+        let url = URL(fileURLWithPath: filePath)
+        try! audioData.write(to: url)
+        print("Saved audio to " + filePath)
+    }
+
+    public func stopMicrophoneRecognition() {
+        captureSession.stopRunning()
+
+        let result = stream?.finishStream()
+        print("Result: " + result!)
+
+        // optional, useful for checking the recorded audio
+        writeAudioDataToPCMFile()
+    }
+
+    // MARK: Audio file recognition
+
+    private func render(audioContext: AudioContext?, stream: DeepSpeechStream) {
+        guard let audioContext = audioContext else {
+            fatalError("Couldn't create the audioContext")
+        }
+
+        let sampleRange: CountableRange<Int> = 0..<audioContext.totalSamples
+
+        guard let reader = try? AVAssetReader(asset: audioContext.asset) else {
+            fatalError("Couldn't initialize the AVAssetReader")
+        }
+
+        reader.timeRange = CMTimeRange(start: CMTime(value: Int64(sampleRange.lowerBound), timescale: audioContext.asset.duration.timescale),
+                                       duration: CMTime(value: Int64(sampleRange.count), timescale: audioContext.asset.duration.timescale))
+
+        let outputSettingsDict: [String: Any] = [
+            AVFormatIDKey: Int(kAudioFormatLinearPCM),
+            AVLinearPCMBitDepthKey: 16,
+            AVLinearPCMIsBigEndianKey: false,
+            AVLinearPCMIsFloatKey: false,
+            AVLinearPCMIsNonInterleaved: false
+        ]
+
+        let readerOutput = AVAssetReaderTrackOutput(track: audioContext.assetTrack, outputSettings: outputSettingsDict)
+        readerOutput.alwaysCopiesSampleData = false
+        reader.add(readerOutput)
+
+        var sampleBuffer = Data()
+
+        // 16-bit samples
+        reader.startReading()
+        defer { reader.cancelReading() }
+
+        while reader.status == .reading {
+            guard let readSampleBuffer = readerOutput.copyNextSampleBuffer(),
+                  let readBuffer = CMSampleBufferGetDataBuffer(readSampleBuffer) else {
+                break
+            }
+
+            // Append the decoded audio into our current sample buffer
+            var readBufferLength = 0
+            var readBufferPointer: UnsafeMutablePointer<Int8>?
+            CMBlockBufferGetDataPointer(readBuffer,
+                                        atOffset: 0,
+                                        lengthAtOffsetOut: &readBufferLength,
+                                        totalLengthOut: nil,
+                                        dataPointerOut: &readBufferPointer)
+            sampleBuffer.append(UnsafeBufferPointer(start: readBufferPointer, count: readBufferLength))
+            CMSampleBufferInvalidate(readSampleBuffer)
+
+            let totalSamples = sampleBuffer.count / MemoryLayout<Int16>.size
+            print("read \(totalSamples) samples")
+
+            sampleBuffer.withUnsafeBytes { (samples: UnsafeRawBufferPointer) in
+                let unsafeBufferPointer = samples.bindMemory(to: Int16.self)
+                stream.feedAudioContent(buffer: unsafeBufferPointer)
+            }
+
+            sampleBuffer.removeAll()
+        }
+
+        // if (reader.status == AVAssetReaderStatusFailed || reader.status == AVAssetReaderStatusUnknown)
+        guard reader.status == .completed else {
+            fatalError("Couldn't read the audio file")
+        }
+    }
+
+    private func recognizeFile(audioPath: String, completion: @escaping () -> ()) {
+        let url = URL(fileURLWithPath: audioPath)
+
+        let stream = try! model.createStream()
+        print("\(audioPath)")
+        let start = CFAbsoluteTimeGetCurrent()
+        AudioContext.load(fromAudioURL: url, completionHandler: { audioContext in
+            guard let audioContext = audioContext else {
+                fatalError("Couldn't create the audioContext")
+            }
+            self.render(audioContext: audioContext, stream: stream)
+            let result = stream.finishStream()
+            let end = CFAbsoluteTimeGetCurrent()
+            print("\"\(audioPath)\": \(end - start) - \(result)")
+            completion()
+        })
+    }
+
+    public func recognizeFiles() {
+        // Add file names (without extension) here if you want to test recognition from files.
+        // Remember to add them to the project under Copy Bundle Resources.
+        let files: [String] = []
+
+        let serialQueue = DispatchQueue(label: "serialQueue")
+        let group = DispatchGroup()
+        group.enter()
+
+        if let first = files.first {
+            serialQueue.async {
+                self.recognizeFile(audioPath: Bundle.main.path(forResource: first, ofType: "wav")!) {
+                    group.leave()
+                }
+            }
+        }
+
+        for path in files.dropFirst() {
+            group.wait()
+            group.enter()
+            self.recognizeFile(audioPath: Bundle.main.path(forResource: path, ofType: "wav")!) {
+                group.leave()
+            }
+        }
+    }
+}
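Usage note (not part of the patch): SpeechRecognitionImpl can also be exercised without the SwiftUI layer, for example from a view controller during manual testing. A minimal sketch, assuming it is called on the main queue and microphone access has already been granted:

```swift
import Foundation

// Hypothetical smoke test: capture the microphone for five seconds, then print
// the transcript and write the converted 16 kHz PCM to Documents/recording.pcm.
func runFiveSecondMicrophoneTest() {
    let stt = SpeechRecognitionImpl()
    stt.startMicrophoneRecognition()
    DispatchQueue.main.asyncAfter(deadline: .now() + 5.0) {
        stt.stopMicrophoneRecognition()
    }
}
```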