Merge pull request #3191 from reuben/swift-mic-streaming

iOS microphone streaming
This commit is contained in:
Reuben Morais 2020-07-28 14:25:27 +02:00 committed by GitHub
commit 396504ea07
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 398 additions and 182 deletions

View File

@ -7,6 +7,9 @@
objects = { objects = {
/* Begin PBXBuildFile section */ /* Begin PBXBuildFile section */
504EC34324CF4EFD0073C22E /* SpeechRecognitionImpl.swift in Sources */ = {isa = PBXBuildFile; fileRef = 504EC34124CF4EFD0073C22E /* SpeechRecognitionImpl.swift */; };
504EC34424CF4EFD0073C22E /* AudioContext.swift in Sources */ = {isa = PBXBuildFile; fileRef = 504EC34224CF4EFD0073C22E /* AudioContext.swift */; };
504EC34524CF4F4F0073C22E /* deepspeech_ios.framework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = 507CD3A024B61FE400409BBB /* deepspeech_ios.framework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; };
507CD3A124B61FE400409BBB /* deepspeech_ios.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 507CD3A024B61FE400409BBB /* deepspeech_ios.framework */; }; 507CD3A124B61FE400409BBB /* deepspeech_ios.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 507CD3A024B61FE400409BBB /* deepspeech_ios.framework */; };
507CD3A324B61FEB00409BBB /* libdeepspeech.so in Frameworks */ = {isa = PBXBuildFile; fileRef = 507CD3A224B61FEA00409BBB /* libdeepspeech.so */; }; 507CD3A324B61FEB00409BBB /* libdeepspeech.so in Frameworks */ = {isa = PBXBuildFile; fileRef = 507CD3A224B61FEA00409BBB /* libdeepspeech.so */; };
507CD3A424B61FFC00409BBB /* libdeepspeech.so in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = 507CD3A224B61FEA00409BBB /* libdeepspeech.so */; settings = {ATTRIBUTES = (CodeSignOnCopy, ); }; }; 507CD3A424B61FFC00409BBB /* libdeepspeech.so in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = 507CD3A224B61FEA00409BBB /* libdeepspeech.so */; settings = {ATTRIBUTES = (CodeSignOnCopy, ); }; };
@ -44,6 +47,7 @@
dstPath = ""; dstPath = "";
dstSubfolderSpec = 10; dstSubfolderSpec = 10;
files = ( files = (
504EC34524CF4F4F0073C22E /* deepspeech_ios.framework in Embed Frameworks */,
507CD3A424B61FFC00409BBB /* libdeepspeech.so in Embed Frameworks */, 507CD3A424B61FFC00409BBB /* libdeepspeech.so in Embed Frameworks */,
); );
name = "Embed Frameworks"; name = "Embed Frameworks";
@ -52,8 +56,10 @@
/* End PBXCopyFilesBuildPhase section */ /* End PBXCopyFilesBuildPhase section */
/* Begin PBXFileReference section */ /* Begin PBXFileReference section */
504EC34124CF4EFD0073C22E /* SpeechRecognitionImpl.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = SpeechRecognitionImpl.swift; sourceTree = "<group>"; };
504EC34224CF4EFD0073C22E /* AudioContext.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = AudioContext.swift; sourceTree = "<group>"; };
507CD3A024B61FE400409BBB /* deepspeech_ios.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; path = deepspeech_ios.framework; sourceTree = BUILT_PRODUCTS_DIR; }; 507CD3A024B61FE400409BBB /* deepspeech_ios.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; path = deepspeech_ios.framework; sourceTree = BUILT_PRODUCTS_DIR; };
507CD3A224B61FEA00409BBB /* libdeepspeech.so */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libdeepspeech.so; path = libdeepspeech.so; sourceTree = "<group>"; }; 507CD3A224B61FEA00409BBB /* libdeepspeech.so */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; path = libdeepspeech.so; sourceTree = "<group>"; };
50F787EF2497683900D52237 /* deepspeech_ios_test.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = deepspeech_ios_test.app; sourceTree = BUILT_PRODUCTS_DIR; }; 50F787EF2497683900D52237 /* deepspeech_ios_test.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = deepspeech_ios_test.app; sourceTree = BUILT_PRODUCTS_DIR; };
50F787F22497683900D52237 /* AppDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppDelegate.swift; sourceTree = "<group>"; }; 50F787F22497683900D52237 /* AppDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppDelegate.swift; sourceTree = "<group>"; };
50F787F42497683900D52237 /* SceneDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SceneDelegate.swift; sourceTree = "<group>"; }; 50F787F42497683900D52237 /* SceneDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SceneDelegate.swift; sourceTree = "<group>"; };
@ -130,6 +136,8 @@
50F787F12497683900D52237 /* deepspeech_ios_test */ = { 50F787F12497683900D52237 /* deepspeech_ios_test */ = {
isa = PBXGroup; isa = PBXGroup;
children = ( children = (
504EC34224CF4EFD0073C22E /* AudioContext.swift */,
504EC34124CF4EFD0073C22E /* SpeechRecognitionImpl.swift */,
50F787F22497683900D52237 /* AppDelegate.swift */, 50F787F22497683900D52237 /* AppDelegate.swift */,
50F787F42497683900D52237 /* SceneDelegate.swift */, 50F787F42497683900D52237 /* SceneDelegate.swift */,
50F787F62497683900D52237 /* ContentView.swift */, 50F787F62497683900D52237 /* ContentView.swift */,
@ -299,7 +307,9 @@
isa = PBXSourcesBuildPhase; isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647; buildActionMask = 2147483647;
files = ( files = (
504EC34424CF4EFD0073C22E /* AudioContext.swift in Sources */,
50F787F32497683900D52237 /* AppDelegate.swift in Sources */, 50F787F32497683900D52237 /* AppDelegate.swift in Sources */,
504EC34324CF4EFD0073C22E /* SpeechRecognitionImpl.swift in Sources */,
50F787F52497683900D52237 /* SceneDelegate.swift in Sources */, 50F787F52497683900D52237 /* SceneDelegate.swift in Sources */,
50F787F72497683900D52237 /* ContentView.swift in Sources */, 50F787F72497683900D52237 /* ContentView.swift in Sources */,
); );

View File

@ -7,190 +7,10 @@
// //
import UIKit import UIKit
import Foundation
import AVFoundation
import AudioToolbox
import Accelerate
import deepspeech_ios
/// Holds audio information used for building waveforms
final class AudioContext {
/// The audio asset URL used to load the context
public let audioURL: URL
/// Total number of samples in loaded asset
public let totalSamples: Int
/// Loaded asset
public let asset: AVAsset
// Loaded assetTrack
public let assetTrack: AVAssetTrack
private init(audioURL: URL, totalSamples: Int, asset: AVAsset, assetTrack: AVAssetTrack) {
self.audioURL = audioURL
self.totalSamples = totalSamples
self.asset = asset
self.assetTrack = assetTrack
}
public static func load(fromAudioURL audioURL: URL, completionHandler: @escaping (_ audioContext: AudioContext?) -> ()) {
let asset = AVURLAsset(url: audioURL, options: [AVURLAssetPreferPreciseDurationAndTimingKey: NSNumber(value: true as Bool)])
guard let assetTrack = asset.tracks(withMediaType: AVMediaType.audio).first else {
fatalError("Couldn't load AVAssetTrack")
}
asset.loadValuesAsynchronously(forKeys: ["duration"]) {
var error: NSError?
let status = asset.statusOfValue(forKey: "duration", error: &error)
switch status {
case .loaded:
guard
let formatDescriptions = assetTrack.formatDescriptions as? [CMAudioFormatDescription],
let audioFormatDesc = formatDescriptions.first,
let asbd = CMAudioFormatDescriptionGetStreamBasicDescription(audioFormatDesc)
else { break }
let totalSamples = Int((asbd.pointee.mSampleRate) * Float64(asset.duration.value) / Float64(asset.duration.timescale))
let audioContext = AudioContext(audioURL: audioURL, totalSamples: totalSamples, asset: asset, assetTrack: assetTrack)
completionHandler(audioContext)
return
case .failed, .cancelled, .loading, .unknown:
print("Couldn't load asset: \(error?.localizedDescription ?? "Unknown error")")
}
completionHandler(nil)
}
}
}
func render(audioContext: AudioContext?, stream: DeepSpeechStream) {
guard let audioContext = audioContext else {
fatalError("Couldn't create the audioContext")
}
let sampleRange: CountableRange<Int> = 0..<audioContext.totalSamples
guard let reader = try? AVAssetReader(asset: audioContext.asset)
else {
fatalError("Couldn't initialize the AVAssetReader")
}
reader.timeRange = CMTimeRange(start: CMTime(value: Int64(sampleRange.lowerBound), timescale: audioContext.asset.duration.timescale),
duration: CMTime(value: Int64(sampleRange.count), timescale: audioContext.asset.duration.timescale))
let outputSettingsDict: [String : Any] = [
AVFormatIDKey: Int(kAudioFormatLinearPCM),
AVLinearPCMBitDepthKey: 16,
AVLinearPCMIsBigEndianKey: false,
AVLinearPCMIsFloatKey: false,
AVLinearPCMIsNonInterleaved: false
]
let readerOutput = AVAssetReaderTrackOutput(track: audioContext.assetTrack,
outputSettings: outputSettingsDict)
readerOutput.alwaysCopiesSampleData = false
reader.add(readerOutput)
var sampleBuffer = Data()
// 16-bit samples
reader.startReading()
defer { reader.cancelReading() }
while reader.status == .reading {
guard let readSampleBuffer = readerOutput.copyNextSampleBuffer(),
let readBuffer = CMSampleBufferGetDataBuffer(readSampleBuffer) else {
break
}
// Append audio sample buffer into our current sample buffer
var readBufferLength = 0
var readBufferPointer: UnsafeMutablePointer<Int8>?
CMBlockBufferGetDataPointer(readBuffer,
atOffset: 0,
lengthAtOffsetOut: &readBufferLength,
totalLengthOut: nil,
dataPointerOut: &readBufferPointer)
sampleBuffer.append(UnsafeBufferPointer(start: readBufferPointer, count: readBufferLength))
CMSampleBufferInvalidate(readSampleBuffer)
let totalSamples = sampleBuffer.count / MemoryLayout<Int16>.size
print("read \(totalSamples) samples")
sampleBuffer.withUnsafeBytes { (samples: UnsafeRawBufferPointer) in
let unsafeBufferPointer = samples.bindMemory(to: Int16.self)
stream.feedAudioContent(buffer: unsafeBufferPointer)
}
sampleBuffer.removeAll()
}
// if (reader.status == AVAssetReaderStatusFailed || reader.status == AVAssetReaderStatusUnknown)
guard reader.status == .completed else {
fatalError("Couldn't read the audio file")
}
}
func test(model: DeepSpeechModel, audioPath: String, completion: @escaping () -> ()) {
let url = URL(fileURLWithPath: audioPath)
let stream = try! model.createStream()
print("\(audioPath)")
let start = CFAbsoluteTimeGetCurrent()
AudioContext.load(fromAudioURL: url, completionHandler: { audioContext in
guard let audioContext = audioContext else {
fatalError("Couldn't create the audioContext")
}
render(audioContext: audioContext, stream: stream)
let result = stream.finishStream()
let end = CFAbsoluteTimeGetCurrent()
print("\"\(audioPath)\": \(end - start) - \(result)")
completion()
})
}
@UIApplicationMain @UIApplicationMain
class AppDelegate: UIResponder, UIApplicationDelegate { class AppDelegate: UIResponder, UIApplicationDelegate {
func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplication.LaunchOptionsKey: Any]?) -> Bool { func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplication.LaunchOptionsKey: Any]?) -> Bool {
let model = try! DeepSpeechModel(modelPath: Bundle.main.path(forResource: "output_graph", ofType: "tflite")!)
try! model.enableExternalScorer(scorerPath: Bundle.main.path(forResource: "librispeech_en_utf8_nonpruned_o6", ofType: "scorer")!)
let files = [
"5639-40744-0008",
"1089-134686-0019",
"2094-142345-0053",
"8463-294825-0010",
"121-123852-0001",
"7021-79740-0008",
"6930-76324-0010",
"5105-28240-0001",
"1089-134691-0012",
"5142-33396-0027",
"260-123288-0004",
"6930-75918-0008",
"8463-294828-0005",
"61-70970-0002"
]
let serialQueue = DispatchQueue(label: "serialQueue")
let group = DispatchGroup()
group.enter()
serialQueue.async {
test(model: model, audioPath: Bundle.main.path(forResource: "1284-134647-0003", ofType: "wav")!) {
group.leave()
}
}
for path in files {
group.wait()
group.enter()
test(model: model, audioPath: Bundle.main.path(forResource: path, ofType: "wav")!) {
group.leave()
}
}
return true return true
} }

View File

@ -0,0 +1,68 @@
//
// AudioContext.swift
// deepspeech_ios_test
//
// Created by Erik Ziegler on 27.07.20.
// Copyright © 2020 Mozilla. All rights reserved.
//
import Foundation
import AVFoundation
import AudioToolbox
import Accelerate
import deepspeech_ios
/// Holds audio information used for building waveforms
final class AudioContext {
/// The audio asset URL used to load the context
public let audioURL: URL
/// Total number of samples in loaded asset
public let totalSamples: Int
/// Loaded asset
public let asset: AVAsset
// Loaded assetTrack
public let assetTrack: AVAssetTrack
private init(audioURL: URL, totalSamples: Int, asset: AVAsset, assetTrack: AVAssetTrack) {
self.audioURL = audioURL
self.totalSamples = totalSamples
self.asset = asset
self.assetTrack = assetTrack
}
public static func load(fromAudioURL audioURL: URL, completionHandler: @escaping (_ audioContext: AudioContext?) -> ()) {
let asset = AVURLAsset(url: audioURL, options: [AVURLAssetPreferPreciseDurationAndTimingKey: NSNumber(value: true as Bool)])
guard let assetTrack = asset.tracks(withMediaType: AVMediaType.audio).first else {
fatalError("Couldn't load AVAssetTrack")
}
asset.loadValuesAsynchronously(forKeys: ["duration"]) {
var error: NSError?
let status = asset.statusOfValue(forKey: "duration", error: &error)
switch status {
case .loaded:
guard
let formatDescriptions = assetTrack.formatDescriptions as? [CMAudioFormatDescription],
let audioFormatDesc = formatDescriptions.first,
let asbd = CMAudioFormatDescriptionGetStreamBasicDescription(audioFormatDesc)
else { break }
let totalSamples = Int((asbd.pointee.mSampleRate) * Float64(asset.duration.value) / Float64(asset.duration.timescale))
let audioContext = AudioContext(audioURL: audioURL, totalSamples: totalSamples, asset: asset, assetTrack: assetTrack)
completionHandler(audioContext)
return
case .failed, .cancelled, .loading, .unknown:
print("Couldn't load asset: \(error?.localizedDescription ?? "Unknown error")")
}
completionHandler(nil)
}
}
}

View File

@ -9,8 +9,38 @@
import SwiftUI import SwiftUI
struct ContentView: View { struct ContentView: View {
private var stt = SpeechRecognitionImpl()
@State var isRecognizingMicrophone = false
var body: some View { var body: some View {
Text("Hello, World!") VStack {
Text("DeepSpeech iOS Demo")
.font(.system(size: 30))
Button("Recognize files", action: recognizeFiles)
.padding(30)
Button(
isRecognizingMicrophone
? "Stop Microphone Recognition"
: "Start Microphone Recognition",
action: isRecognizingMicrophone
? stopMicRecognition
: startMicRecognition)
.padding(30)
}
}
func recognizeFiles() {
self.stt.recognizeFiles()
}
func startMicRecognition() {
isRecognizingMicrophone = true
self.stt.startMicrophoneRecognition()
}
func stopMicRecognition() {
isRecognizingMicrophone = false
self.stt.stopMicrophoneRecognition()
} }
} }

View File

@ -4,6 +4,8 @@
<dict> <dict>
<key>CFBundleDevelopmentRegion</key> <key>CFBundleDevelopmentRegion</key>
<string>$(DEVELOPMENT_LANGUAGE)</string> <string>$(DEVELOPMENT_LANGUAGE)</string>
<key>NSMicrophoneUsageDescription</key>
<string>Please grant access to the microphone.</string>
<key>CFBundleExecutable</key> <key>CFBundleExecutable</key>
<string>$(EXECUTABLE_NAME)</string> <string>$(EXECUTABLE_NAME)</string>
<key>CFBundleIdentifier</key> <key>CFBundleIdentifier</key>

View File

@ -0,0 +1,286 @@
//
// DeepSpeech.swift
// deepspeech_ios_test
//
// Created by Erik Ziegler on 27.07.20.
// Copyright © 2020 Mozilla. All rights reserved.
//
import Foundation
import AVFoundation
import AudioToolbox
import Accelerate
import deepspeech_ios
struct FillComplexInputParm {
var source: UnsafeMutablePointer<Int8>
var sourceSize: UInt32
};
class SpeechRecognitionImpl : NSObject, AVCaptureAudioDataOutputSampleBufferDelegate {
private var model: DeepSpeechModel
private var stream: DeepSpeechStream?
private var captureSession = AVCaptureSession()
private var audioData = Data()
override init() {
let modelPath = Bundle.main.path(forResource: "deepspeech-0.7.4-models", ofType: "tflite")!
let scorerPath = Bundle.main.path(forResource: "deepspeech-0.7.4-models", ofType: "scorer")!
model = try! DeepSpeechModel(modelPath: modelPath)
try! model.enableExternalScorer(scorerPath: scorerPath)
super.init()
// prepare audio capture
self.configureCaptureSession()
}
// MARK: Microphone recognition
private func configureCaptureSession() {
captureSession.beginConfiguration()
let audioDevice = AVCaptureDevice.default(.builtInMicrophone, for: .audio, position: .unspecified)
let audioDeviceInput = try! AVCaptureDeviceInput(device: audioDevice!)
guard captureSession.canAddInput(audioDeviceInput) else { return }
captureSession.addInput(audioDeviceInput)
let serialQueue = DispatchQueue(label: "serialQueue")
let audioOutput = AVCaptureAudioDataOutput()
audioOutput.setSampleBufferDelegate(self, queue: serialQueue)
guard captureSession.canAddOutput(audioOutput) else { return }
captureSession.sessionPreset = .inputPriority
captureSession.addOutput(audioOutput)
captureSession.commitConfiguration()
}
func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
var sourceFormat = (sampleBuffer.formatDescription?.audioFormatList[0].mASBD)!
var destinationFormat = sourceFormat
destinationFormat.mSampleRate = 16000.0
var audioConverterRef: AudioConverterRef?
let createConverterStatus = AudioConverterNew(&sourceFormat, &destinationFormat, &audioConverterRef)
if (createConverterStatus != noErr) {
print("Error creating converter")
}
var quality = kAudioConverterQuality_Max
AudioConverterSetProperty(audioConverterRef!, kAudioConverterSampleRateConverterQuality, UInt32(MemoryLayout<UInt32>.size), &quality)
let blockBuffer = CMSampleBufferGetDataBuffer(sampleBuffer)
var pcmLength: Int = 0
var pcmData: UnsafeMutablePointer<Int8>?
let status: OSStatus = CMBlockBufferGetDataPointer(blockBuffer!, atOffset: 0, lengthAtOffsetOut: nil, totalLengthOut: &pcmLength, dataPointerOut: &pcmData)
if status != noErr {
print("Error getting something")
} else {
var input = FillComplexInputParm(source: pcmData!, sourceSize: UInt32(pcmLength))
let outputBuffer = malloc(pcmLength)
memset(outputBuffer, 0, pcmLength);
var outputBufferList = AudioBufferList()
outputBufferList.mNumberBuffers = 1
outputBufferList.mBuffers.mData = outputBuffer
outputBufferList.mBuffers.mDataByteSize = UInt32(Double(pcmLength) * destinationFormat.mSampleRate / sourceFormat.mSampleRate)
outputBufferList.mBuffers.mNumberChannels = 1
func inputDataProc(
inAudioConverter: AudioConverterRef,
ioNumberDataPacket: UnsafeMutablePointer<UInt32>,
ioData: UnsafeMutablePointer<AudioBufferList>,
outDataPacketDescription: UnsafeMutablePointer<UnsafeMutablePointer<AudioStreamPacketDescription>?>?,
inUserData: UnsafeMutableRawPointer?
) -> OSStatus {
var inputPtr = inUserData!.load(as: FillComplexInputParm.self)
if (inputPtr.sourceSize <= 0) {
ioNumberDataPacket.pointee = 1
return -1
}
let rawPtr = UnsafeMutableRawPointer(inputPtr.source)
ioData.pointee.mNumberBuffers = 1
ioData.pointee.mBuffers.mData = rawPtr
ioData.pointee.mBuffers.mDataByteSize = inputPtr.sourceSize
ioData.pointee.mBuffers.mNumberChannels = 1
ioNumberDataPacket.pointee = (inputPtr.sourceSize / 2)
inputPtr.sourceSize = 0
return noErr
};
var packetSize: UInt32 = UInt32(pcmLength / 2)
let status: OSStatus = AudioConverterFillComplexBuffer(audioConverterRef!, inputDataProc, &input, &packetSize, &outputBufferList, nil)
if (status != noErr) {
print("Error: " + status.description)
} else {
let data = outputBufferList.mBuffers.mData!
let byteSize = outputBufferList.mBuffers.mDataByteSize
let shorts = UnsafeBufferPointer(start: data.assumingMemoryBound(to: Int16.self), count: Int(byteSize / 2))
stream!.feedAudioContent(buffer: shorts)
// save bytes to audio data for creating a pcm file later for the captured audio
let ptr = UnsafePointer(data.assumingMemoryBound(to: UInt8.self))
audioData.append(ptr, count: Int(byteSize))
}
free(outputBuffer)
AudioConverterDispose(audioConverterRef!)
}
}
public func startMicrophoneRecognition() {
audioData = Data()
stream = try! model.createStream()
captureSession.startRunning()
print("Started listening...")
}
private func writeAudioDataToPCMFile() {
let documents = NSSearchPathForDirectoriesInDomains(FileManager.SearchPathDirectory.documentDirectory, FileManager.SearchPathDomainMask.userDomainMask, true)[0]
let filePath = documents + "/recording.pcm"
let url = URL(fileURLWithPath: filePath)
try! audioData.write(to: url)
print("Saved audio to " + filePath)
}
public func stopMicrophoneRecognition() {
captureSession.stopRunning()
let result = stream?.finishStream()
print("Result: " + result!)
// optional, useful for checking the recorded audio
writeAudioDataToPCMFile()
}
// MARK: Audio file recognition
private func render(audioContext: AudioContext?, stream: DeepSpeechStream) {
guard let audioContext = audioContext else {
fatalError("Couldn't create the audioContext")
}
let sampleRange: CountableRange<Int> = 0..<audioContext.totalSamples
guard let reader = try? AVAssetReader(asset: audioContext.asset)
else {
fatalError("Couldn't initialize the AVAssetReader")
}
reader.timeRange = CMTimeRange(start: CMTime(value: Int64(sampleRange.lowerBound), timescale: audioContext.asset.duration.timescale),
duration: CMTime(value: Int64(sampleRange.count), timescale: audioContext.asset.duration.timescale))
let outputSettingsDict: [String : Any] = [
AVFormatIDKey: Int(kAudioFormatLinearPCM),
AVLinearPCMBitDepthKey: 16,
AVLinearPCMIsBigEndianKey: false,
AVLinearPCMIsFloatKey: false,
AVLinearPCMIsNonInterleaved: false
]
let readerOutput = AVAssetReaderTrackOutput(track: audioContext.assetTrack,
outputSettings: outputSettingsDict)
readerOutput.alwaysCopiesSampleData = false
reader.add(readerOutput)
var sampleBuffer = Data()
// 16-bit samples
reader.startReading()
defer { reader.cancelReading() }
while reader.status == .reading {
guard let readSampleBuffer = readerOutput.copyNextSampleBuffer(),
let readBuffer = CMSampleBufferGetDataBuffer(readSampleBuffer) else {
break
}
// Append audio sample buffer into our current sample buffer
var readBufferLength = 0
var readBufferPointer: UnsafeMutablePointer<Int8>?
CMBlockBufferGetDataPointer(readBuffer,
atOffset: 0,
lengthAtOffsetOut: &readBufferLength,
totalLengthOut: nil,
dataPointerOut: &readBufferPointer)
sampleBuffer.append(UnsafeBufferPointer(start: readBufferPointer, count: readBufferLength))
CMSampleBufferInvalidate(readSampleBuffer)
let totalSamples = sampleBuffer.count / MemoryLayout<Int16>.size
print("read \(totalSamples) samples")
sampleBuffer.withUnsafeBytes { (samples: UnsafeRawBufferPointer) in
let unsafeBufferPointer = samples.bindMemory(to: Int16.self)
stream.feedAudioContent(buffer: unsafeBufferPointer)
}
sampleBuffer.removeAll()
}
// if (reader.status == AVAssetReaderStatusFailed || reader.status == AVAssetReaderStatusUnknown)
guard reader.status == .completed else {
fatalError("Couldn't read the audio file")
}
}
private func recognizeFile(audioPath: String, completion: @escaping () -> ()) {
let url = URL(fileURLWithPath: audioPath)
let stream = try! model.createStream()
print("\(audioPath)")
let start = CFAbsoluteTimeGetCurrent()
AudioContext.load(fromAudioURL: url, completionHandler: { audioContext in
guard let audioContext = audioContext else {
fatalError("Couldn't create the audioContext")
}
self.render(audioContext: audioContext, stream: stream)
let result = stream.finishStream()
let end = CFAbsoluteTimeGetCurrent()
print("\"\(audioPath)\": \(end - start) - \(result)")
completion()
})
}
public func recognizeFiles() {
// Add file names (without extension) here if you want to test recognition from files.
// Remember to add them to the project under Copy Bundle Resources.
let files: [String] = []
let serialQueue = DispatchQueue(label: "serialQueue")
let group = DispatchGroup()
group.enter()
if let first = files.first {
serialQueue.async {
self.recognizeFile(audioPath: Bundle.main.path(forResource: first, ofType: "wav")!) {
group.leave()
}
}
}
for path in files.dropFirst() {
group.wait()
group.enter()
self.recognizeFile(audioPath: Bundle.main.path(forResource: path, ofType: "wav")!) {
group.leave()
}
}
}
}