65 lines
2.3 KiB
Python
65 lines
2.3 KiB
Python
from __future__ import absolute_import, division, print_function
|
|
|
|
import struct
|
|
|
|
import numpy as np
|
|
|
|
|
|
def text_to_char_array(transcript, alphabet, context=""):
    r"""
    Given a transcript string, map characters to integers using `alphabet`
    and return the encoded result.

    :param transcript: the text to encode.
    :param alphabet: object providing `CanEncode(text)`, `CanEncodeSingle(ch)`
        and `Encode(text)`; these are the only members used here.
    :param context: string added to the text of raised exceptions so the
        offending sample can be identified (e.g. its wav filename).
    :return: whatever `alphabet.Encode(transcript)` returns.
        NOTE(review): earlier documentation described this as a numpy array;
        that depends on the alphabet implementation -- confirm there.
    :raises ValueError: if `alphabet.CanEncode(transcript)` is false, or if
        the encoded transcript has length zero.
    """
    if not alphabet.CanEncode(transcript):
        # Provide the row context (especially wav_filename) for alphabet errors,
        # and list every character the alphabet rejects so the user can fix it.
        missing = [ch for ch in transcript if not alphabet.CanEncodeSingle(ch)]
        raise ValueError(
            'Alphabet cannot encode transcript "{}" while processing sample "{}", '
            "check that your alphabet contains all characters in the training corpus. "
            "Missing characters are: {}.".format(transcript, context, missing)
        )

    encoded = alphabet.Encode(transcript)
    # Explicit length check rather than truthiness: if Encode returns an
    # array-like object, `not encoded` may be ambiguous or raise.
    if len(encoded) == 0:
        raise ValueError(
            "While processing {}: Found an empty transcript! "
            "You must include a transcript for all training data.".format(context)
        )
    return encoded
|
|
|
|
|
|
# The following code is from: http://hetland.org/coding/python/levenshtein.py

# This is a straightforward implementation of a well-known algorithm, and thus
# probably shouldn't be covered by copyright to begin with. But in case it is,
# the author (Magnus Lie Hetland) has, to the extent possible under law,
# dedicated all copyright and related and neighboring rights to this software
# to the public domain worldwide, by distributing it under the CC0 license,
# version 1.0. This software is distributed without any warranty. For more
# information, see <http://creativecommons.org/publicdomain/zero/1.0>
|
|
|
|
|
|
def levenshtein(a, b):
    """Calculates the Levenshtein distance between a and b.

    `a` and `b` may be any sequences supporting `len()` and integer indexing
    whose elements can be compared with `!=` (e.g. strings or lists of words).
    Returns the minimum number of single-element insertions, deletions and
    substitutions needed to turn one sequence into the other, as an int.

    Only two rows of the dynamic-programming table are kept at a time, each
    sized by the shorter input.
    """
    n, m = len(a), len(b)
    if n > m:
        # Make sure n <= m, to use O(min(n,m)) space
        a, b = b, a
        n, m = m, n

    # Row 0: distance from the empty prefix of b to each prefix of a.
    current = list(range(n + 1))
    for i in range(1, m + 1):
        # Slide the window down one row; column 0 is i (delete i elements of b).
        previous, current = current, [i] + [0] * n
        for j in range(1, n + 1):
            add = previous[j] + 1
            delete = current[j - 1] + 1
            # Substitution is free when the elements already match.
            change = previous[j - 1]
            if a[j - 1] != b[i - 1]:
                change += 1
            current[j] = min(add, delete, change)

    return current[n]
|