Merge pull request #3131 from mozilla/alphabet-fallible
Add methods to check for label presence in Alphabet
This commit is contained in:
commit
c64e416f61
|
@ -137,6 +137,24 @@ Alphabet::Deserialize(const char* buffer, const int buffer_size)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
Alphabet::CanEncodeSingle(const std::string& input) const
|
||||||
|
{
|
||||||
|
auto it = str_to_label_.find(input);
|
||||||
|
return it != str_to_label_.end();
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
Alphabet::CanEncode(const std::string& input) const
|
||||||
|
{
|
||||||
|
for (auto cp : split_into_codepoints(input)) {
|
||||||
|
if (!CanEncodeSingle(cp)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
std::string
|
std::string
|
||||||
Alphabet::DecodeSingle(unsigned int label) const
|
Alphabet::DecodeSingle(unsigned int label) const
|
||||||
{
|
{
|
||||||
|
@ -191,6 +209,18 @@ Alphabet::Encode(const std::string& input) const
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
UTF8Alphabet::CanEncodeSingle(const std::string& input) const
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
UTF8Alphabet::CanEncode(const std::string& input) const
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<unsigned int>
|
std::vector<unsigned int>
|
||||||
UTF8Alphabet::Encode(const std::string& input) const
|
UTF8Alphabet::Encode(const std::string& input) const
|
||||||
{
|
{
|
||||||
|
|
|
@ -37,10 +37,19 @@ public:
|
||||||
return space_label_;
|
return space_label_;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns true if the single character/output class has a corresponding label
|
||||||
|
// in the alphabet.
|
||||||
|
virtual bool CanEncodeSingle(const std::string& string) const;
|
||||||
|
|
||||||
|
// Returns true if the entire string can be encoded into labels in this
|
||||||
|
// alphabet.
|
||||||
|
virtual bool CanEncode(const std::string& string) const;
|
||||||
|
|
||||||
// Decode a single label into a string.
|
// Decode a single label into a string.
|
||||||
std::string DecodeSingle(unsigned int label) const;
|
std::string DecodeSingle(unsigned int label) const;
|
||||||
|
|
||||||
// Encode a single character/output class into a label.
|
// Encode a single character/output class into a label. Character must be in
|
||||||
|
// the alphabet; this method will assert that. Use `CanEncodeSingle` to test.
|
||||||
unsigned int EncodeSingle(const std::string& string) const;
|
unsigned int EncodeSingle(const std::string& string) const;
|
||||||
|
|
||||||
// Decode a sequence of labels into a string.
|
// Decode a sequence of labels into a string.
|
||||||
|
@ -52,6 +61,8 @@ public:
|
||||||
|
|
||||||
// Encode a sequence of character/output classes into a sequence of labels.
|
// Encode a sequence of character/output classes into a sequence of labels.
|
||||||
// Characters are assumed to always take a single Unicode codepoint.
|
// Characters are assumed to always take a single Unicode codepoint.
|
||||||
|
// Characters must be in the alphabet; this method will assert that. Use
|
||||||
|
// `CanEncode` and `CanEncodeSingle` to test.
|
||||||
virtual std::vector<unsigned int> Encode(const std::string& input) const;
|
virtual std::vector<unsigned int> Encode(const std::string& input) const;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
|
@ -78,6 +89,8 @@ public:
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool CanEncodeSingle(const std::string& string) const override;
|
||||||
|
bool CanEncode(const std::string& string) const override;
|
||||||
std::vector<unsigned int> Encode(const std::string& input) const override;
|
std::vector<unsigned int> Encode(const std::string& input) const override;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -47,6 +47,12 @@ class Alphabet(swigwrapper.Alphabet):
|
||||||
if err != 0:
|
if err != 0:
|
||||||
raise ValueError('Alphabet initialization failed with error code 0x{:X}'.format(err))
|
raise ValueError('Alphabet initialization failed with error code 0x{:X}'.format(err))
|
||||||
|
|
||||||
|
def CanEncodeSingle(self, input):
|
||||||
|
return super(Alphabet, self).CanEncodeSingle(input.encode('utf-8'))
|
||||||
|
|
||||||
|
def CanEncode(self, input):
|
||||||
|
return super(Alphabet, self).CanEncode(input.encode('utf-8'))
|
||||||
|
|
||||||
def EncodeSingle(self, input):
|
def EncodeSingle(self, input):
|
||||||
return super(Alphabet, self).EncodeSingle(input.encode('utf-8'))
|
return super(Alphabet, self).EncodeSingle(input.encode('utf-8'))
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue