Merge pull request #542 from mozilla/ted-utf8-stm
Open STM files as UTF-8
This commit is contained in:
commit
958db24232
|
@@ -1,3 +1,4 @@
|
||||||
|
import codecs
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
|
||||||
class STMSegment(object):
|
class STMSegment(object):
|
||||||
|
@@ -55,7 +56,7 @@ def parse_stm_file(stm_file):
|
||||||
Parses an STM file at ``stm_file`` into a list of :class:`STMSegment`.
|
Parses an STM file at ``stm_file`` into a list of :class:`STMSegment`.
|
||||||
"""
|
"""
|
||||||
stm_segments = []
|
stm_segments = []
|
||||||
with open(stm_file) as stm_lines:
|
with codecs.open(stm_file, encoding="utf-8") as stm_lines:
|
||||||
for stm_line in stm_lines:
|
for stm_line in stm_lines:
|
||||||
stmSegment = STMSegment(stm_line)
|
stmSegment = STMSegment(stm_line)
|
||||||
if not "ignore_time_segment_in_scoring" == stmSegment.transcript:
|
if not "ignore_time_segment_in_scoring" == stmSegment.transcript:
|
||||||
|
|
Loading…
Reference in New Issue