Merge pull request #3191 from reuben/swift-mic-streaming

iOS microphone streaming

Commit: 396504ea07
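The diff below wires live microphone capture into the existing DeepSpeech streaming API. As a point of reference, here is a minimal sketch of that streaming flow, using only calls that appear in the diff (DeepSpeechModel, enableExternalScorer, createStream, feedAudioContent, finishStream); the model/scorer paths and the sample source are placeholders, not part of the PR.

import deepspeech_ios

// Minimal sketch of the streaming flow used throughout this PR.
// Assumes 16 kHz, 16-bit mono PCM samples; paths are placeholders.
func transcribe(samples: [Int16], modelPath: String, scorerPath: String) throws -> String {
    let model = try DeepSpeechModel(modelPath: modelPath)
    try model.enableExternalScorer(scorerPath: scorerPath)

    let stream = try model.createStream()
    samples.withUnsafeBufferPointer { buffer in
        // Audio can be fed in chunks as it arrives; here it is fed all at once.
        stream.feedAudioContent(buffer: buffer)
    }
    // finishStream() drains the decoder and returns the final transcript.
    return stream.finishStream()
}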
deepspeech_ios_test.xcodeproj/project.pbxproj:

@@ -7,6 +7,9 @@
 	objects = {
 
 /* Begin PBXBuildFile section */
+	504EC34324CF4EFD0073C22E /* SpeechRecognitionImpl.swift in Sources */ = {isa = PBXBuildFile; fileRef = 504EC34124CF4EFD0073C22E /* SpeechRecognitionImpl.swift */; };
+	504EC34424CF4EFD0073C22E /* AudioContext.swift in Sources */ = {isa = PBXBuildFile; fileRef = 504EC34224CF4EFD0073C22E /* AudioContext.swift */; };
+	504EC34524CF4F4F0073C22E /* deepspeech_ios.framework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = 507CD3A024B61FE400409BBB /* deepspeech_ios.framework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; };
 	507CD3A124B61FE400409BBB /* deepspeech_ios.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 507CD3A024B61FE400409BBB /* deepspeech_ios.framework */; };
 	507CD3A324B61FEB00409BBB /* libdeepspeech.so in Frameworks */ = {isa = PBXBuildFile; fileRef = 507CD3A224B61FEA00409BBB /* libdeepspeech.so */; };
 	507CD3A424B61FFC00409BBB /* libdeepspeech.so in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = 507CD3A224B61FEA00409BBB /* libdeepspeech.so */; settings = {ATTRIBUTES = (CodeSignOnCopy, ); }; };
@@ -44,6 +47,7 @@
 	dstPath = "";
 	dstSubfolderSpec = 10;
 	files = (
+		504EC34524CF4F4F0073C22E /* deepspeech_ios.framework in Embed Frameworks */,
 		507CD3A424B61FFC00409BBB /* libdeepspeech.so in Embed Frameworks */,
 	);
 	name = "Embed Frameworks";
@@ -52,8 +56,10 @@
 /* End PBXCopyFilesBuildPhase section */
 
 /* Begin PBXFileReference section */
+	504EC34124CF4EFD0073C22E /* SpeechRecognitionImpl.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = SpeechRecognitionImpl.swift; sourceTree = "<group>"; };
+	504EC34224CF4EFD0073C22E /* AudioContext.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = AudioContext.swift; sourceTree = "<group>"; };
 	507CD3A024B61FE400409BBB /* deepspeech_ios.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; path = deepspeech_ios.framework; sourceTree = BUILT_PRODUCTS_DIR; };
-	507CD3A224B61FEA00409BBB /* libdeepspeech.so */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libdeepspeech.so; path = libdeepspeech.so; sourceTree = "<group>"; };
+	507CD3A224B61FEA00409BBB /* libdeepspeech.so */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; path = libdeepspeech.so; sourceTree = "<group>"; };
 	50F787EF2497683900D52237 /* deepspeech_ios_test.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = deepspeech_ios_test.app; sourceTree = BUILT_PRODUCTS_DIR; };
 	50F787F22497683900D52237 /* AppDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppDelegate.swift; sourceTree = "<group>"; };
 	50F787F42497683900D52237 /* SceneDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SceneDelegate.swift; sourceTree = "<group>"; };
@@ -130,6 +136,8 @@
 	50F787F12497683900D52237 /* deepspeech_ios_test */ = {
 		isa = PBXGroup;
 		children = (
+			504EC34224CF4EFD0073C22E /* AudioContext.swift */,
+			504EC34124CF4EFD0073C22E /* SpeechRecognitionImpl.swift */,
 			50F787F22497683900D52237 /* AppDelegate.swift */,
 			50F787F42497683900D52237 /* SceneDelegate.swift */,
 			50F787F62497683900D52237 /* ContentView.swift */,
@@ -299,7 +307,9 @@
 		isa = PBXSourcesBuildPhase;
 		buildActionMask = 2147483647;
 		files = (
+			504EC34424CF4EFD0073C22E /* AudioContext.swift in Sources */,
 			50F787F32497683900D52237 /* AppDelegate.swift in Sources */,
+			504EC34324CF4EFD0073C22E /* SpeechRecognitionImpl.swift in Sources */,
 			50F787F52497683900D52237 /* SceneDelegate.swift in Sources */,
 			50F787F72497683900D52237 /* ContentView.swift in Sources */,
 		);
AppDelegate.swift:

@@ -7,190 +7,10 @@
 //
 
 import UIKit
-import Foundation
-import AVFoundation
-import AudioToolbox
-import Accelerate
-
-import deepspeech_ios
-
-/// Holds audio information used for building waveforms
-final class AudioContext {
-
-    /// The audio asset URL used to load the context
-    public let audioURL: URL
-
-    /// Total number of samples in loaded asset
-    public let totalSamples: Int
-
-    /// Loaded asset
-    public let asset: AVAsset
-
-    // Loaded assetTrack
-    public let assetTrack: AVAssetTrack
-
-    private init(audioURL: URL, totalSamples: Int, asset: AVAsset, assetTrack: AVAssetTrack) {
-        self.audioURL = audioURL
-        self.totalSamples = totalSamples
-        self.asset = asset
-        self.assetTrack = assetTrack
-    }
-
-    public static func load(fromAudioURL audioURL: URL, completionHandler: @escaping (_ audioContext: AudioContext?) -> ()) {
-        let asset = AVURLAsset(url: audioURL, options: [AVURLAssetPreferPreciseDurationAndTimingKey: NSNumber(value: true as Bool)])
-
-        guard let assetTrack = asset.tracks(withMediaType: AVMediaType.audio).first else {
-            fatalError("Couldn't load AVAssetTrack")
-        }
-
-        asset.loadValuesAsynchronously(forKeys: ["duration"]) {
-            var error: NSError?
-            let status = asset.statusOfValue(forKey: "duration", error: &error)
-            switch status {
-            case .loaded:
-                guard
-                    let formatDescriptions = assetTrack.formatDescriptions as? [CMAudioFormatDescription],
-                    let audioFormatDesc = formatDescriptions.first,
-                    let asbd = CMAudioFormatDescriptionGetStreamBasicDescription(audioFormatDesc)
-                    else { break }
-
-                let totalSamples = Int((asbd.pointee.mSampleRate) * Float64(asset.duration.value) / Float64(asset.duration.timescale))
-                let audioContext = AudioContext(audioURL: audioURL, totalSamples: totalSamples, asset: asset, assetTrack: assetTrack)
-                completionHandler(audioContext)
-                return
-
-            case .failed, .cancelled, .loading, .unknown:
-                print("Couldn't load asset: \(error?.localizedDescription ?? "Unknown error")")
-            }
-
-            completionHandler(nil)
-        }
-    }
-}
-
-func render(audioContext: AudioContext?, stream: DeepSpeechStream) {
-    guard let audioContext = audioContext else {
-        fatalError("Couldn't create the audioContext")
-    }
-
-    let sampleRange: CountableRange<Int> = 0..<audioContext.totalSamples
-
-    guard let reader = try? AVAssetReader(asset: audioContext.asset)
-        else {
-            fatalError("Couldn't initialize the AVAssetReader")
-    }
-
-    reader.timeRange = CMTimeRange(start: CMTime(value: Int64(sampleRange.lowerBound), timescale: audioContext.asset.duration.timescale),
-                                   duration: CMTime(value: Int64(sampleRange.count), timescale: audioContext.asset.duration.timescale))
-
-    let outputSettingsDict: [String : Any] = [
-        AVFormatIDKey: Int(kAudioFormatLinearPCM),
-        AVLinearPCMBitDepthKey: 16,
-        AVLinearPCMIsBigEndianKey: false,
-        AVLinearPCMIsFloatKey: false,
-        AVLinearPCMIsNonInterleaved: false
-    ]
-
-    let readerOutput = AVAssetReaderTrackOutput(track: audioContext.assetTrack,
-                                                outputSettings: outputSettingsDict)
-    readerOutput.alwaysCopiesSampleData = false
-    reader.add(readerOutput)
-
-    var sampleBuffer = Data()
-
-    // 16-bit samples
-    reader.startReading()
-    defer { reader.cancelReading() }
-
-    while reader.status == .reading {
-        guard let readSampleBuffer = readerOutput.copyNextSampleBuffer(),
-            let readBuffer = CMSampleBufferGetDataBuffer(readSampleBuffer) else {
-                break
-        }
-        // Append audio sample buffer into our current sample buffer
-        var readBufferLength = 0
-        var readBufferPointer: UnsafeMutablePointer<Int8>?
-        CMBlockBufferGetDataPointer(readBuffer,
-                                    atOffset: 0,
-                                    lengthAtOffsetOut: &readBufferLength,
-                                    totalLengthOut: nil,
-                                    dataPointerOut: &readBufferPointer)
-        sampleBuffer.append(UnsafeBufferPointer(start: readBufferPointer, count: readBufferLength))
-        CMSampleBufferInvalidate(readSampleBuffer)
-
-        let totalSamples = sampleBuffer.count / MemoryLayout<Int16>.size
-        print("read \(totalSamples) samples")
-
-        sampleBuffer.withUnsafeBytes { (samples: UnsafeRawBufferPointer) in
-            let unsafeBufferPointer = samples.bindMemory(to: Int16.self)
-            stream.feedAudioContent(buffer: unsafeBufferPointer)
-        }
-
-        sampleBuffer.removeAll()
-    }
-
-    // if (reader.status == AVAssetReaderStatusFailed || reader.status == AVAssetReaderStatusUnknown)
-    guard reader.status == .completed else {
-        fatalError("Couldn't read the audio file")
-    }
-}
-
-func test(model: DeepSpeechModel, audioPath: String, completion: @escaping () -> ()) {
-    let url = URL(fileURLWithPath: audioPath)
-
-    let stream = try! model.createStream()
-    print("\(audioPath)")
-    let start = CFAbsoluteTimeGetCurrent()
-    AudioContext.load(fromAudioURL: url, completionHandler: { audioContext in
-        guard let audioContext = audioContext else {
-            fatalError("Couldn't create the audioContext")
-        }
-        render(audioContext: audioContext, stream: stream)
-        let result = stream.finishStream()
-        let end = CFAbsoluteTimeGetCurrent()
-        print("\"\(audioPath)\": \(end - start) - \(result)")
-        completion()
-    })
-}
-
 
 @UIApplicationMain
 class AppDelegate: UIResponder, UIApplicationDelegate {
     func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplication.LaunchOptionsKey: Any]?) -> Bool {
-        let model = try! DeepSpeechModel(modelPath: Bundle.main.path(forResource: "output_graph", ofType: "tflite")!)
-        try! model.enableExternalScorer(scorerPath: Bundle.main.path(forResource: "librispeech_en_utf8_nonpruned_o6", ofType: "scorer")!)
-
-        let files = [
-            "5639-40744-0008",
-            "1089-134686-0019",
-            "2094-142345-0053",
-            "8463-294825-0010",
-            "121-123852-0001",
-            "7021-79740-0008",
-            "6930-76324-0010",
-            "5105-28240-0001",
-            "1089-134691-0012",
-            "5142-33396-0027",
-            "260-123288-0004",
-            "6930-75918-0008",
-            "8463-294828-0005",
-            "61-70970-0002"
-        ]
-
-        let serialQueue = DispatchQueue(label: "serialQueue")
-        let group = DispatchGroup()
-        group.enter()
-        serialQueue.async {
-            test(model: model, audioPath: Bundle.main.path(forResource: "1284-134647-0003", ofType: "wav")!) {
-                group.leave()
-            }
-        }
-        for path in files {
-            group.wait()
-            group.enter()
-            test(model: model, audioPath: Bundle.main.path(forResource: path, ofType: "wav")!) {
-                group.leave()
-            }
-        }
         return true
     }
 
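The test harness removed above (and its replacement, SpeechRecognitionImpl.recognizeFiles further down) serializes file recognition with a DispatchQueue plus a DispatchGroup. A small self-contained sketch of that pattern, with a hypothetical doWork(_:completion:) standing in for test(model:audioPath:completion:):

import Foundation

// Hypothetical async unit of work standing in for test(model:audioPath:completion:).
func doWork(_ name: String, completion: @escaping () -> ()) {
    DispatchQueue.global().async {
        print("processing \(name)")
        completion()
    }
}

let serialQueue = DispatchQueue(label: "serialQueue")
let group = DispatchGroup()
let items = ["first", "second", "third"]

group.enter()
serialQueue.async {
    doWork(items[0]) { group.leave() }
}
// Each iteration blocks on wait() until the previous completion calls leave(),
// so the items are processed strictly one at a time.
for item in items.dropFirst() {
    group.wait()
    group.enter()
    doWork(item) { group.leave() }
}
group.wait()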
AudioContext.swift (new file):

@@ -0,0 +1,68 @@
+//
+//  AudioContext.swift
+//  deepspeech_ios_test
+//
+//  Created by Erik Ziegler on 27.07.20.
+//  Copyright © 2020 Mozilla. All rights reserved.
+//
+
+import Foundation
+import AVFoundation
+import AudioToolbox
+import Accelerate
+
+import deepspeech_ios
+
+/// Holds audio information used for building waveforms
+final class AudioContext {
+
+    /// The audio asset URL used to load the context
+    public let audioURL: URL
+
+    /// Total number of samples in loaded asset
+    public let totalSamples: Int
+
+    /// Loaded asset
+    public let asset: AVAsset
+
+    // Loaded assetTrack
+    public let assetTrack: AVAssetTrack
+
+    private init(audioURL: URL, totalSamples: Int, asset: AVAsset, assetTrack: AVAssetTrack) {
+        self.audioURL = audioURL
+        self.totalSamples = totalSamples
+        self.asset = asset
+        self.assetTrack = assetTrack
+    }
+
+    public static func load(fromAudioURL audioURL: URL, completionHandler: @escaping (_ audioContext: AudioContext?) -> ()) {
+        let asset = AVURLAsset(url: audioURL, options: [AVURLAssetPreferPreciseDurationAndTimingKey: NSNumber(value: true as Bool)])
+
+        guard let assetTrack = asset.tracks(withMediaType: AVMediaType.audio).first else {
+            fatalError("Couldn't load AVAssetTrack")
+        }
+
+        asset.loadValuesAsynchronously(forKeys: ["duration"]) {
+            var error: NSError?
+            let status = asset.statusOfValue(forKey: "duration", error: &error)
+            switch status {
+            case .loaded:
+                guard
+                    let formatDescriptions = assetTrack.formatDescriptions as? [CMAudioFormatDescription],
+                    let audioFormatDesc = formatDescriptions.first,
+                    let asbd = CMAudioFormatDescriptionGetStreamBasicDescription(audioFormatDesc)
+                    else { break }
+
+                let totalSamples = Int((asbd.pointee.mSampleRate) * Float64(asset.duration.value) / Float64(asset.duration.timescale))
+                let audioContext = AudioContext(audioURL: audioURL, totalSamples: totalSamples, asset: asset, assetTrack: assetTrack)
+                completionHandler(audioContext)
+                return
+
+            case .failed, .cancelled, .loading, .unknown:
+                print("Couldn't load asset: \(error?.localizedDescription ?? "Unknown error")")
+            }
+
+            completionHandler(nil)
+        }
+    }
+}
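For reference, a minimal, hypothetical call site for the AudioContext.load defined above; the WAV path is a placeholder and the completion handler only reports the derived sample count:

// Hypothetical usage of AudioContext; the path is a placeholder.
let url = URL(fileURLWithPath: "/path/to/audio.wav")
AudioContext.load(fromAudioURL: url) { audioContext in
    guard let audioContext = audioContext else {
        print("Failed to load audio context")
        return
    }
    // totalSamples is computed from the asset's sample rate and duration.
    print("Loaded \(audioContext.totalSamples) samples from \(audioContext.audioURL.lastPathComponent)")
}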
ContentView.swift:

@@ -9,8 +9,38 @@
 import SwiftUI
 
 struct ContentView: View {
+    private var stt = SpeechRecognitionImpl()
+    @State var isRecognizingMicrophone = false
+
     var body: some View {
-        Text("Hello, World!")
+        VStack {
+            Text("DeepSpeech iOS Demo")
+                .font(.system(size: 30))
+            Button("Recognize files", action: recognizeFiles)
+                .padding(30)
+            Button(
+                isRecognizingMicrophone
+                    ? "Stop Microphone Recognition"
+                    : "Start Microphone Recognition",
+                action: isRecognizingMicrophone
+                    ? stopMicRecognition
+                    : startMicRecognition)
+                .padding(30)
+        }
+    }
+
+    func recognizeFiles() {
+        self.stt.recognizeFiles()
+    }
+
+    func startMicRecognition() {
+        isRecognizingMicrophone = true
+        self.stt.startMicrophoneRecognition()
+    }
+
+    func stopMicRecognition() {
+        isRecognizingMicrophone = false
+        self.stt.stopMicrophoneRecognition()
     }
 }
 
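The view above only toggles recognition; results are printed to the console by SpeechRecognitionImpl. If you wanted to surface the transcript in SwiftUI instead, one hypothetical approach (not part of this PR) is an observable wrapper whose published property drives a Text view:

import SwiftUI
import Combine

// Hypothetical variant: publish the transcript so a view can display it.
// Not part of this diff; SpeechRecognitionImpl here prints to the console.
final class ObservableRecognizer: ObservableObject {
    @Published var transcript: String = ""
}

struct TranscriptView: View {
    @ObservedObject var recognizer: ObservableRecognizer

    var body: some View {
        // Re-renders whenever transcript changes.
        Text(recognizer.transcript)
    }
}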
Info.plist:

@@ -4,6 +4,8 @@
 <dict>
 	<key>CFBundleDevelopmentRegion</key>
 	<string>$(DEVELOPMENT_LANGUAGE)</string>
+	<key>NSMicrophoneUsageDescription</key>
+	<string>Please grant access to the microphone.</string>
 	<key>CFBundleExecutable</key>
 	<string>$(EXECUTABLE_NAME)</string>
 	<key>CFBundleIdentifier</key>
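NSMicrophoneUsageDescription is what allows the capture session to prompt for microphone access; the PR relies on AVCaptureSession triggering that system prompt. A hedged sketch of checking authorization explicitly before starting recognition (not part of this diff, but standard AVFoundation API):

import AVFoundation

// Hypothetical pre-flight check; the diff relies on the system prompt that
// AVCaptureSession triggers, backed by the usage description above.
func withMicrophoneAccess(_ start: @escaping () -> Void) {
    switch AVCaptureDevice.authorizationStatus(for: .audio) {
    case .authorized:
        start()
    case .notDetermined:
        AVCaptureDevice.requestAccess(for: .audio) { granted in
            if granted { DispatchQueue.main.async(execute: start) }
        }
    default:
        print("Microphone access denied or restricted")
    }
}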
SpeechRecognitionImpl.swift (new file):

@@ -0,0 +1,286 @@
+//
+//  DeepSpeech.swift
+//  deepspeech_ios_test
+//
+//  Created by Erik Ziegler on 27.07.20.
+//  Copyright © 2020 Mozilla. All rights reserved.
+//
+
+import Foundation
+import AVFoundation
+import AudioToolbox
+import Accelerate
+
+import deepspeech_ios
+
+struct FillComplexInputParm {
+    var source: UnsafeMutablePointer<Int8>
+    var sourceSize: UInt32
+};
+
+class SpeechRecognitionImpl : NSObject, AVCaptureAudioDataOutputSampleBufferDelegate {
+    private var model: DeepSpeechModel
+    private var stream: DeepSpeechStream?
+
+    private var captureSession = AVCaptureSession()
+    private var audioData = Data()
+
+    override init() {
+        let modelPath = Bundle.main.path(forResource: "deepspeech-0.7.4-models", ofType: "tflite")!
+        let scorerPath = Bundle.main.path(forResource: "deepspeech-0.7.4-models", ofType: "scorer")!
+
+        model = try! DeepSpeechModel(modelPath: modelPath)
+        try! model.enableExternalScorer(scorerPath: scorerPath)
+
+        super.init()
+
+        // prepare audio capture
+        self.configureCaptureSession()
+    }
+
+    // MARK: Microphone recognition
+
+    private func configureCaptureSession() {
+        captureSession.beginConfiguration()
+
+        let audioDevice = AVCaptureDevice.default(.builtInMicrophone, for: .audio, position: .unspecified)
+
+        let audioDeviceInput = try! AVCaptureDeviceInput(device: audioDevice!)
+        guard captureSession.canAddInput(audioDeviceInput) else { return }
+        captureSession.addInput(audioDeviceInput)
+
+        let serialQueue = DispatchQueue(label: "serialQueue")
+        let audioOutput = AVCaptureAudioDataOutput()
+        audioOutput.setSampleBufferDelegate(self, queue: serialQueue)
+
+        guard captureSession.canAddOutput(audioOutput) else { return }
+        captureSession.sessionPreset = .inputPriority
+        captureSession.addOutput(audioOutput)
+        captureSession.commitConfiguration()
+    }
+
+    func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
+        var sourceFormat = (sampleBuffer.formatDescription?.audioFormatList[0].mASBD)!
+        var destinationFormat = sourceFormat
+        destinationFormat.mSampleRate = 16000.0
+
+        var audioConverterRef: AudioConverterRef?
+        let createConverterStatus = AudioConverterNew(&sourceFormat, &destinationFormat, &audioConverterRef)
+
+        if (createConverterStatus != noErr) {
+            print("Error creating converter")
+        }
+
+        var quality = kAudioConverterQuality_Max
+
+        AudioConverterSetProperty(audioConverterRef!, kAudioConverterSampleRateConverterQuality, UInt32(MemoryLayout<UInt32>.size), &quality)
+
+        let blockBuffer = CMSampleBufferGetDataBuffer(sampleBuffer)
+
+        var pcmLength: Int = 0
+        var pcmData: UnsafeMutablePointer<Int8>?
+        let status: OSStatus = CMBlockBufferGetDataPointer(blockBuffer!, atOffset: 0, lengthAtOffsetOut: nil, totalLengthOut: &pcmLength, dataPointerOut: &pcmData)
+
+        if status != noErr {
+            print("Error getting something")
+        } else {
+            var input = FillComplexInputParm(source: pcmData!, sourceSize: UInt32(pcmLength))
+
+            let outputBuffer = malloc(pcmLength)
+            memset(outputBuffer, 0, pcmLength);
+
+            var outputBufferList = AudioBufferList()
+            outputBufferList.mNumberBuffers = 1
+            outputBufferList.mBuffers.mData = outputBuffer
+            outputBufferList.mBuffers.mDataByteSize = UInt32(Double(pcmLength) * destinationFormat.mSampleRate / sourceFormat.mSampleRate)
+            outputBufferList.mBuffers.mNumberChannels = 1
+
+            func inputDataProc(
+                inAudioConverter: AudioConverterRef,
+                ioNumberDataPacket: UnsafeMutablePointer<UInt32>,
+                ioData: UnsafeMutablePointer<AudioBufferList>,
+                outDataPacketDescription: UnsafeMutablePointer<UnsafeMutablePointer<AudioStreamPacketDescription>?>?,
+                inUserData: UnsafeMutableRawPointer?
+            ) -> OSStatus {
+                var inputPtr = inUserData!.load(as: FillComplexInputParm.self)
+
+                if (inputPtr.sourceSize <= 0) {
+                    ioNumberDataPacket.pointee = 1
+                    return -1
+                }
+
+                let rawPtr = UnsafeMutableRawPointer(inputPtr.source)
+
+                ioData.pointee.mNumberBuffers = 1
+                ioData.pointee.mBuffers.mData = rawPtr
+                ioData.pointee.mBuffers.mDataByteSize = inputPtr.sourceSize
+                ioData.pointee.mBuffers.mNumberChannels = 1
+
+                ioNumberDataPacket.pointee = (inputPtr.sourceSize / 2)
+                inputPtr.sourceSize = 0
+
+                return noErr
+            };
+
+            var packetSize: UInt32 = UInt32(pcmLength / 2)
+
+            let status: OSStatus = AudioConverterFillComplexBuffer(audioConverterRef!, inputDataProc, &input, &packetSize, &outputBufferList, nil)
+
+            if (status != noErr) {
+                print("Error: " + status.description)
+            } else {
+                let data = outputBufferList.mBuffers.mData!
+                let byteSize = outputBufferList.mBuffers.mDataByteSize
+
+                let shorts = UnsafeBufferPointer(start: data.assumingMemoryBound(to: Int16.self), count: Int(byteSize / 2))
+                stream!.feedAudioContent(buffer: shorts)
+
+                // save bytes to audio data for creating a pcm file later for the captured audio
+                let ptr = UnsafePointer(data.assumingMemoryBound(to: UInt8.self))
+                audioData.append(ptr, count: Int(byteSize))
+            }
+
+            free(outputBuffer)
+            AudioConverterDispose(audioConverterRef!)
+        }
+    }
+
+    public func startMicrophoneRecognition() {
+        audioData = Data()
+        stream = try! model.createStream()
+        captureSession.startRunning()
+        print("Started listening...")
+    }
+
+    private func writeAudioDataToPCMFile() {
+        let documents = NSSearchPathForDirectoriesInDomains(FileManager.SearchPathDirectory.documentDirectory, FileManager.SearchPathDomainMask.userDomainMask, true)[0]
+        let filePath = documents + "/recording.pcm"
+        let url = URL(fileURLWithPath: filePath)
+        try! audioData.write(to: url)
+        print("Saved audio to " + filePath)
+    }
+
+    public func stopMicrophoneRecognition() {
+        captureSession.stopRunning()
+
+        let result = stream?.finishStream()
+        print("Result: " + result!)
+
+        // optional, useful for checking the recorded audio
+        writeAudioDataToPCMFile()
+    }
+
+    // MARK: Audio file recognition
+
+    private func render(audioContext: AudioContext?, stream: DeepSpeechStream) {
+        guard let audioContext = audioContext else {
+            fatalError("Couldn't create the audioContext")
+        }
+
+        let sampleRange: CountableRange<Int> = 0..<audioContext.totalSamples
+
+        guard let reader = try? AVAssetReader(asset: audioContext.asset)
+            else {
+                fatalError("Couldn't initialize the AVAssetReader")
+        }
+
+        reader.timeRange = CMTimeRange(start: CMTime(value: Int64(sampleRange.lowerBound), timescale: audioContext.asset.duration.timescale),
+                                       duration: CMTime(value: Int64(sampleRange.count), timescale: audioContext.asset.duration.timescale))
+
+        let outputSettingsDict: [String : Any] = [
+            AVFormatIDKey: Int(kAudioFormatLinearPCM),
+            AVLinearPCMBitDepthKey: 16,
+            AVLinearPCMIsBigEndianKey: false,
+            AVLinearPCMIsFloatKey: false,
+            AVLinearPCMIsNonInterleaved: false
+        ]
+
+        let readerOutput = AVAssetReaderTrackOutput(track: audioContext.assetTrack,
+                                                    outputSettings: outputSettingsDict)
+        readerOutput.alwaysCopiesSampleData = false
+        reader.add(readerOutput)
+
+        var sampleBuffer = Data()
+
+        // 16-bit samples
+        reader.startReading()
+        defer { reader.cancelReading() }
+
+        while reader.status == .reading {
+            guard let readSampleBuffer = readerOutput.copyNextSampleBuffer(),
+                let readBuffer = CMSampleBufferGetDataBuffer(readSampleBuffer) else {
+                    break
+            }
+            // Append audio sample buffer into our current sample buffer
+            var readBufferLength = 0
+            var readBufferPointer: UnsafeMutablePointer<Int8>?
+            CMBlockBufferGetDataPointer(readBuffer,
+                                        atOffset: 0,
+                                        lengthAtOffsetOut: &readBufferLength,
+                                        totalLengthOut: nil,
+                                        dataPointerOut: &readBufferPointer)
+            sampleBuffer.append(UnsafeBufferPointer(start: readBufferPointer, count: readBufferLength))
+            CMSampleBufferInvalidate(readSampleBuffer)
+
+            let totalSamples = sampleBuffer.count / MemoryLayout<Int16>.size
+            print("read \(totalSamples) samples")
+
+            sampleBuffer.withUnsafeBytes { (samples: UnsafeRawBufferPointer) in
+                let unsafeBufferPointer = samples.bindMemory(to: Int16.self)
+                stream.feedAudioContent(buffer: unsafeBufferPointer)
+            }
+
+            sampleBuffer.removeAll()
+        }
+
+        // if (reader.status == AVAssetReaderStatusFailed || reader.status == AVAssetReaderStatusUnknown)
+        guard reader.status == .completed else {
+            fatalError("Couldn't read the audio file")
+        }
+    }
+
+    private func recognizeFile(audioPath: String, completion: @escaping () -> ()) {
+        let url = URL(fileURLWithPath: audioPath)
+
+        let stream = try! model.createStream()
+        print("\(audioPath)")
+        let start = CFAbsoluteTimeGetCurrent()
+        AudioContext.load(fromAudioURL: url, completionHandler: { audioContext in
+            guard let audioContext = audioContext else {
+                fatalError("Couldn't create the audioContext")
+            }
+            self.render(audioContext: audioContext, stream: stream)
+            let result = stream.finishStream()
+            let end = CFAbsoluteTimeGetCurrent()
+            print("\"\(audioPath)\": \(end - start) - \(result)")
+            completion()
+        })
+    }
+
+    public func recognizeFiles() {
+        // Add file names (without extension) here if you want to test recognition from files.
+        // Remember to add them to the project under Copy Bundle Resources.
+        let files: [String] = []
+
+        let serialQueue = DispatchQueue(label: "serialQueue")
+        let group = DispatchGroup()
+        group.enter()
+
+        if let first = files.first {
+            serialQueue.async {
+                self.recognizeFile(audioPath: Bundle.main.path(forResource: first, ofType: "wav")!) {
+                    group.leave()
+                }
+            }
+        }
+
+        for path in files.dropFirst() {
+            group.wait()
+            group.enter()
+            self.recognizeFile(audioPath: Bundle.main.path(forResource: path, ofType: "wav")!) {
+                group.leave()
+            }
+        }
+    }
+}
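A minimal sketch of driving this class from app code (ContentView.swift above wires these calls to buttons); it assumes the model and scorer files named in init() are bundled with the app, and the five-second delay is only for illustration:

// Hypothetical call sites in the same module as SpeechRecognitionImpl.
let recognizer = SpeechRecognitionImpl()

// Start streaming microphone audio into a new DeepSpeech stream...
recognizer.startMicrophoneRecognition()

// ...and later stop, which finishes the stream, prints the transcript,
// and writes the captured 16 kHz PCM to Documents/recording.pcm.
DispatchQueue.main.asyncAfter(deadline: .now() + 5) {
    recognizer.stopMicrophoneRecognition()
}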