Refactor test and add default ops type

This commit is contained in:
Andrii Prymostka 2019-08-14 15:34:51 +03:00
parent 2aabaa4b27
commit 51cd587741
6 changed files with 108 additions and 420 deletions

View File

@ -27,8 +27,6 @@ limitations under the License.
namespace tensorflow {
// typedef Eigen::ThreadPoolDevice CPUDevice;
template<typename T>
class CTCLossOp : public OpKernel {
typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic,

View File

@ -34,7 +34,7 @@ REGISTER_OP("CTCLoss")
.Attr("ignore_longer_outputs_than_inputs: bool = false")
.Output("loss: T")
.Output("gradient: T")
.Attr("T: {float, double}")
.Attr("T: {float, double} = DT_FLOAT")
.SetShapeFn([](InferenceContext* c) {
ShapeHandle inputs;
ShapeHandle labels_indices;
@ -70,7 +70,7 @@ REGISTER_OP("CTCGreedyDecoder")
.Output("decoded_values: int64")
.Output("decoded_shape: int64")
.Output("log_probability: T")
.Attr("T: {float, double}")
.Attr("T: {float, double} = DT_FLOAT")
.SetShapeFn([](InferenceContext* c) {
ShapeHandle inputs;
ShapeHandle sequence_length;
@ -101,7 +101,7 @@ REGISTER_OP("CTCBeamSearchDecoder")
.Output("decoded_values: top_paths * int64")
.Output("decoded_shape: top_paths * int64")
.Output("log_probability: T")
.Attr("T: {float, double}")
.Attr("T: {float, double} = DT_FLOAT")
.SetShapeFn([](InferenceContext* c) {
ShapeHandle inputs;
ShapeHandle sequence_length;

View File

@ -59,8 +59,7 @@ tf_cc_tests(
name = "ctc_beam_search_test",
size = "small",
srcs = [
"ctc_beam_search_test_float.cc",
"ctc_beam_search_test_double.cc",
"ctc_beam_search_test.cc",
],
deps = [
":ctc_beam_search_lib",

View File

@ -67,7 +67,7 @@ struct BeamEntry {
inline bool Active() const { return newp.total != kLogZero<T>::val; }
// Return the child at the given index, or construct a new one in-place if
// none was found.
BeamEntry<T>& GetChild(int ind) {
BeamEntry<T, CTCBeamState>& GetChild(int ind) {
auto entry = children.emplace(ind, nullptr);
auto& child_entry = entry.first->second;
// If this is a new child, populate the BeamEntry<CTCBeamState>*.
@ -79,7 +79,7 @@ struct BeamEntry {
std::vector<int> LabelSeq(bool merge_repeated) const {
std::vector<int> labels;
int prev_label = -1;
const BeamEntry<T>* c = this;
const BeamEntry<T, CTCBeamState>* c = this;
while (c->parent != nullptr) { // Checking c->parent to skip root leaf.
if (!merge_repeated || c->label != prev_label) {
labels.push_back(c->label);

View File

@ -24,14 +24,18 @@ limitations under the License.
namespace {
typedef std::vector<std::vector<std::vector<double>>> TestData;
typedef tensorflow::ctc::CTCBeamSearchDecoder<double> CTCBeamSearchDecoder;
typedef tensorflow::ctc::CTCDecoder<double> CTCDecoder;
template <class T>
using TestData = std::vector<std::vector<std::vector<T>>>;
//using tensorflow::ctc::CTCBeamSearchDecoder;
//using tensorflow::ctc::CTCDecoder;
using namespace tensorflow::ctc;
// The HistoryBeamState is used to keep track of the current candidate and
// caches the expansion score (needed by the scorer).
template <class T>
struct HistoryBeamState {
double score;
T score;
std::vector<int> labels;
};
@ -40,32 +44,33 @@ struct HistoryBeamState {
// a prefix of a dictionary word it gets a low probability at each step.
//
// The dictionary itself is hard-coded a static const variable of the class.
template <class T, class BeamState>
class DictionaryBeamScorer
: public tensorflow::ctc::BaseBeamScorer<HistoryBeamState> {
: public tensorflow::ctc::BaseBeamScorer<T, BeamState > {
public:
void InitializeState(HistoryBeamState* root) const override {
void InitializeState(BeamState* root) const override {
root->score = 0;
}
void ExpandState(const HistoryBeamState& from_state, int from_label,
HistoryBeamState* to_state, int to_label) const override {
void ExpandState(const BeamState& from_state, int from_label,
BeamState* to_state, int to_label) const override {
// Keep track of the current complete candidate by storing the labels along
// the expansion path in the beam state.
to_state->labels.push_back(to_label);
SetStateScoreAccordingToDict(to_state);
}
void ExpandStateEnd(HistoryBeamState* state) const override {
void ExpandStateEnd(BeamState* state) const override {
SetStateScoreAccordingToDict(state);
}
double GetStateExpansionScore(const HistoryBeamState& state,
double previous_score) const override {
T GetStateExpansionScore(const BeamState& state,
T previous_score) const override {
return previous_score + state.score;
}
double GetStateEndExpansionScore(
const HistoryBeamState& state) const override {
T GetStateEndExpansionScore(
const BeamState& state) const override {
return state.score;
}
@ -74,14 +79,16 @@ class DictionaryBeamScorer
static const std::vector<std::vector<int>> dictionary_;
private:
void SetStateScoreAccordingToDict(HistoryBeamState* state) const;
void SetStateScoreAccordingToDict(BeamState* state) const;
};
const std::vector<std::vector<int>> DictionaryBeamScorer::dictionary_ = {
template<class T, class BeamState>
const std::vector<std::vector<int>> DictionaryBeamScorer<T, BeamState>::dictionary_ = {
{3}, {3, 1}};
void DictionaryBeamScorer::SetStateScoreAccordingToDict(
HistoryBeamState* state) const {
template <class T, class BeamState>
void DictionaryBeamScorer<T, BeamState>::SetStateScoreAccordingToDict(
BeamState* state) const {
// Check if the beam can still be a dictionary word (e.g. prefix of one).
const std::vector<int>& candidate = state->labels;
for (int w = 0; w < dictionary_.size(); ++w) {
@ -92,32 +99,33 @@ void DictionaryBeamScorer::SetStateScoreAccordingToDict(
}
if (std::equal(word.begin(), word.begin() + candidate.size(),
candidate.begin())) {
state->score = std::log(1.0);
state->score = std::log(T(1.0));
return;
}
}
// At this point, the candidate certainly can't be in the dictionary.
state->score = std::log(0.01);
state->score = std::log(T(0.01));
}
TEST(CtcBeamSearch, DecodingWithAndWithoutDictionary) {
template<class T>
void ctc_beam_search_decoding_with_and_without_dictionary() {
const int batch_size = 1;
const int timesteps = 5;
const int top_paths = 3;
const int num_classes = 6;
// Plain decoder using hibernating beam search algorithm.
CTCBeamSearchDecoder<>::DefaultBeamScorer default_scorer;
CTCBeamSearchDecoder<> decoder(num_classes, 10 * top_paths, &default_scorer);
typename CTCBeamSearchDecoder<T>::DefaultBeamScorer default_scorer;
CTCBeamSearchDecoder<T> decoder(num_classes, 10 * top_paths, &default_scorer);
// Dictionary decoder, allowing only two dictionary words : {3}, {3, 1}.
DictionaryBeamScorer dictionary_scorer;
CTCBeamSearchDecoder<HistoryBeamState> dictionary_decoder(
DictionaryBeamScorer<T, HistoryBeamState<T>> dictionary_scorer;
CTCBeamSearchDecoder<T, HistoryBeamState<T> > dictionary_decoder(
num_classes, top_paths, &dictionary_scorer);
// Raw data containers (arrays of floats64, ints, etc.).
int sequence_lengths[batch_size] = {timesteps};
double input_data_mat[timesteps][batch_size][num_classes] = {
T input_data_mat[timesteps][batch_size][num_classes] = {
{{0, 0.6, 0, 0.4, 0, 0}},
{{0, 0.5, 0, 0.5, 0, 0}},
{{0, 0.4, 0, 0.6, 0, 0}},
@ -134,14 +142,14 @@ TEST(CtcBeamSearch, DecodingWithAndWithoutDictionary) {
}
// Plain output, without any additional scoring.
std::vector<CTCDecoder::Output> expected_output = {
std::vector<typename CTCDecoder<T>::Output> expected_output = {
{{1, 3}, {1, 3, 1}, {3, 1, 3}},
};
// Dictionary outputs: preference for dictionary candidates. The
// second-candidate is there, despite it not being a dictionary word, due to
// stronger probability in the input to the decoder.
std::vector<CTCDecoder::Output> expected_dict_output = {
std::vector<typename CTCDecoder<T>::Output> expected_dict_output = {
{{3}, {1, 3}, {3, 1}},
};
@ -149,19 +157,19 @@ TEST(CtcBeamSearch, DecodingWithAndWithoutDictionary) {
// mapping the memory from the container to an Eigen::ArrayXi,::MatrixXf,
// using Eigen::Map.
Eigen::Map<const Eigen::ArrayXi> seq_len(&sequence_lengths[0], batch_size);
std::vector<Eigen::Map<const Eigen::MatrixXd>> inputs;
std::vector<Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>> inputs;
inputs.reserve(timesteps);
for (int t = 0; t < timesteps; ++t) {
inputs.emplace_back(&input_data_mat[t][0][0], batch_size, num_classes);
}
// Prepare containers for output and scores.
std::vector<CTCDecoder::Output> outputs(top_paths);
for (CTCDecoder::Output& output : outputs) {
std::vector<typename CTCDecoder<T>::Output> outputs(top_paths);
for (typename CTCDecoder<T>::Output& output : outputs) {
output.resize(batch_size);
}
double score[batch_size][top_paths] = {{0.0}};
Eigen::Map<Eigen::MatrixXd> scores(&score[0][0], batch_size, top_paths);
T score[batch_size][top_paths] = {{0.0}};
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> scores(&score[0][0], batch_size, top_paths);
EXPECT_TRUE(decoder.Decode(seq_len, inputs, &outputs, &scores).ok());
for (int path = 0; path < top_paths; ++path) {
@ -169,8 +177,8 @@ TEST(CtcBeamSearch, DecodingWithAndWithoutDictionary) {
}
// Prepare dictionary outputs.
std::vector<CTCDecoder::Output> dict_outputs(top_paths);
for (CTCDecoder::Output& output : dict_outputs) {
std::vector<typename CTCDecoder<T>::Output> dict_outputs(top_paths);
for (typename CTCDecoder<T>::Output& output : dict_outputs) {
output.resize(batch_size);
}
EXPECT_TRUE(
@ -180,38 +188,39 @@ TEST(CtcBeamSearch, DecodingWithAndWithoutDictionary) {
}
}
TEST(CtcBeamSearch, AllBeamElementsHaveFiniteScores) {
template<class T>
void ctc_beam_search_decoding_all_beam_elements_have_finite_scores() {
const int batch_size = 1;
const int timesteps = 1;
const int top_paths = 3;
const int num_classes = 6;
// Plain decoder using hibernating beam search algorithm.
CTCBeamSearchDecoder<>::DefaultBeamScorer default_scorer;
CTCBeamSearchDecoder<> decoder(num_classes, top_paths, &default_scorer);
typename CTCBeamSearchDecoder<T>::DefaultBeamScorer default_scorer;
CTCBeamSearchDecoder<T> decoder(num_classes, top_paths, &default_scorer);
// Raw data containers (arrays of floats64, ints, etc.).
int sequence_lengths[batch_size] = {timesteps};
double input_data_mat[timesteps][batch_size][num_classes] = {
T input_data_mat[timesteps][batch_size][num_classes] = {
{{0.4, 0.3, 0, 0, 0, 0.5}}};
// Convert data containers to the format accepted by the decoder, simply
// mapping the memory from the container to an Eigen::ArrayXi,::MatrixXf,
// using Eigen::Map.
Eigen::Map<const Eigen::ArrayXi> seq_len(&sequence_lengths[0], batch_size);
std::vector<Eigen::Map<const Eigen::MatrixXd>> inputs;
std::vector<Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>> inputs;
inputs.reserve(timesteps);
for (int t = 0; t < timesteps; ++t) {
inputs.emplace_back(&input_data_mat[t][0][0], batch_size, num_classes);
}
// Prepare containers for output and scores.
std::vector<CTCDecoder::Output> outputs(top_paths);
for (CTCDecoder::Output& output : outputs) {
std::vector<typename CTCDecoder<T>::Output> outputs(top_paths);
for (typename CTCDecoder<T>::Output& output : outputs) {
output.resize(batch_size);
}
double score[batch_size][top_paths] = {{0.0}};
Eigen::Map<Eigen::MatrixXd> scores(&score[0][0], batch_size, top_paths);
T score[batch_size][top_paths] = {{0.0}};
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> scores(&score[0][0], batch_size, top_paths);
EXPECT_TRUE(decoder.Decode(seq_len, inputs, &outputs, &scores).ok());
// Make sure all scores are finite.
@ -226,8 +235,9 @@ TEST(CtcBeamSearch, AllBeamElementsHaveFiniteScores) {
typedef int LabelState; // The state is simply the final label.
template <class T>
class RapidlyDroppingLabelScorer
: public tensorflow::ctc::BaseBeamScorer<LabelState> {
: public tensorflow::ctc::BaseBeamScorer<T, LabelState> {
public:
void InitializeState(LabelState* root) const override {}
@ -238,27 +248,28 @@ class RapidlyDroppingLabelScorer
void ExpandStateEnd(LabelState* state) const override {}
double GetStateExpansionScore(const LabelState& state,
double previous_score) const override {
T GetStateExpansionScore(const LabelState& state,
T previous_score) const override {
// Drop off rapidly for later labels.
const double kRapidly = 100;
const T kRapidly = 100;
return previous_score - kRapidly * state;
}
double GetStateEndExpansionScore(const LabelState& state) const override {
return 0;
T GetStateEndExpansionScore(const LabelState& state) const override {
return T(0);
}
};
TEST(CtcBeamSearch, LabelSelection) {
template<class T>
void ctc_beam_search_label_selection() {
const int batch_size = 1;
const int timesteps = 3;
const int top_paths = 5;
const int num_classes = 6;
// Decoder which drops off log-probabilities for labels 0 >> 1 >> 2 >> 3.
RapidlyDroppingLabelScorer scorer;
CTCBeamSearchDecoder<LabelState> decoder(num_classes, top_paths, &scorer);
RapidlyDroppingLabelScorer<T> scorer;
CTCBeamSearchDecoder<T, LabelState> decoder(num_classes, top_paths, &scorer);
// Raw data containers (arrays of floats64, ints, etc.).
int sequence_lengths[batch_size] = {timesteps};
@ -267,26 +278,26 @@ TEST(CtcBeamSearch, LabelSelection) {
// The last one is empty label, and for simplicity we give it an extremely
// high cost to ignore it. We also use the first label to break up the
// repeated label sequence.
double input_data_mat[timesteps][batch_size][num_classes] = {
T input_data_mat[timesteps][batch_size][num_classes] = {
{{-1e6, 1, 2, 3, 4, -1e6}},
{{1e6, 0, 0, 0, 0, -1e6}}, // force label 0 to break up repeated
{{-1e6, 1.1, 2.2, 3.3, 4.4, -1e6}},
};
// Expected output without label selection
std::vector<CTCDecoder::Output> expected_default_output = {
std::vector<typename CTCDecoder<T>::Output> expected_default_output = {
{{1, 0, 1}, {1, 0, 2}, {2, 0, 1}, {1, 0, 3}, {2, 0, 2}},
};
// Expected output with label selection limiting to 2 items
// this is suboptimal because only labels 3 and 4 were allowed to be seen.
std::vector<CTCDecoder::Output> expected_output_size2 = {
std::vector<typename CTCDecoder<T>::Output> expected_output_size2 = {
{{3, 0, 3}, {3, 0, 4}, {4, 0, 3}, {4, 0, 4}, {3}},
};
// Expected output with label width of 2.0. This would permit three labels at
// the first timestep, but only two at the last.
std::vector<CTCDecoder::Output> expected_output_width2 = {
std::vector<typename CTCDecoder<T>::Output> expected_output_width2 = {
{{2, 0, 3}, {2, 0, 4}, {3, 0, 3}, {3, 0, 4}, {4, 0, 3}},
};
@ -294,19 +305,19 @@ TEST(CtcBeamSearch, LabelSelection) {
// mapping the memory from the container to an Eigen::ArrayXi,::MatrixXf,
// using Eigen::Map.
Eigen::Map<const Eigen::ArrayXi> seq_len(&sequence_lengths[0], batch_size);
std::vector<Eigen::Map<const Eigen::MatrixXd>> inputs;
std::vector<Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>> inputs;
inputs.reserve(timesteps);
for (int t = 0; t < timesteps; ++t) {
inputs.emplace_back(&input_data_mat[t][0][0], batch_size, num_classes);
}
// Prepare containers for output and scores.
std::vector<CTCDecoder::Output> outputs(top_paths);
for (CTCDecoder::Output& output : outputs) {
std::vector<typename CTCDecoder<T>::Output> outputs(top_paths);
for (typename CTCDecoder<T>::Output& output : outputs) {
output.resize(batch_size);
}
double score[batch_size][top_paths] = {{0.0}};
Eigen::Map<Eigen::MatrixXd> scores(&score[0][0], batch_size, top_paths);
T score[batch_size][top_paths] = {{0.0}};
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> scores(&score[0][0], batch_size, top_paths);
EXPECT_TRUE(decoder.Decode(seq_len, inputs, &outputs, &scores).ok());
for (int path = 0; path < top_paths; ++path) {
@ -314,14 +325,14 @@ TEST(CtcBeamSearch, LabelSelection) {
}
// Try label selection size 2
decoder.SetLabelSelectionParameters(2, -1);
decoder.SetLabelSelectionParameters(2, T(-1));
EXPECT_TRUE(decoder.Decode(seq_len, inputs, &outputs, &scores).ok());
for (int path = 0; path < top_paths; ++path) {
EXPECT_EQ(outputs[path][0], expected_output_size2[0][path]);
}
// Try label selection width 2.0
decoder.SetLabelSelectionParameters(0, 2.0);
decoder.SetLabelSelectionParameters(0, T(2.0));
EXPECT_TRUE(decoder.Decode(seq_len, inputs, &outputs, &scores).ok());
for (int path = 0; path < top_paths; ++path) {
EXPECT_EQ(outputs[path][0], expected_output_width2[0][path]);
@ -329,18 +340,44 @@ TEST(CtcBeamSearch, LabelSelection) {
// Try both size 2 and width 2.0: the former is more constraining, so
// it's equivalent to that.
decoder.SetLabelSelectionParameters(2, 2.0);
decoder.SetLabelSelectionParameters(2, T(2.0));
EXPECT_TRUE(decoder.Decode(seq_len, inputs, &outputs, &scores).ok());
for (int path = 0; path < top_paths; ++path) {
EXPECT_EQ(outputs[path][0], expected_output_size2[0][path]);
}
// Size 4 and width > 3.3 are equivalent to no label selection
decoder.SetLabelSelectionParameters(4, 3.3001);
decoder.SetLabelSelectionParameters(4, T(3.3001));
EXPECT_TRUE(decoder.Decode(seq_len, inputs, &outputs, &scores).ok());
for (int path = 0; path < top_paths; ++path) {
EXPECT_EQ(outputs[path][0], expected_default_output[0][path]);
}
}
// Instantiate the templated test bodies for both supported precisions.
// (Fix: two of these TEST blocks ended with a stray `;` after the closing
// brace — an empty declaration, inconsistent with the rest of the file.)
TEST(CtcBeamSearch, FloatDecodingWithAndWithoutDictionary) {
  ctc_beam_search_decoding_with_and_without_dictionary<float>();
}

TEST(CtcBeamSearch, DoubleDecodingWithAndWithoutDictionary) {
  ctc_beam_search_decoding_with_and_without_dictionary<double>();
}

TEST(CtcBeamSearch, FloatAllBeamElementsHaveFiniteScores) {
  ctc_beam_search_decoding_all_beam_elements_have_finite_scores<float>();
}

TEST(CtcBeamSearch, DoubleAllBeamElementsHaveFiniteScores) {
  ctc_beam_search_decoding_all_beam_elements_have_finite_scores<double>();
}

TEST(CtcBeamSearch, FloatLabelSelection) {
  ctc_beam_search_label_selection<float>();
}

TEST(CtcBeamSearch, DoubleLabelSelection) {
  ctc_beam_search_label_selection<double>();
}
} // namespace

View File

@ -1,346 +0,0 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// This test illustrates how to make use of the CTCBeamSearchDecoder using a
// custom BeamScorer and BeamState based on a dictionary with a few artificial
// words.
#include "tensorflow/core/util/ctc/ctc_beam_search.h"
#include <cmath>
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/test.h"
namespace {
// File-local aliases; every fixture in this file uses float data.
typedef std::vector<std::vector<std::vector<float>>> TestData;
typedef tensorflow::ctc::CTCBeamSearchDecoder<float> CTCBeamSearchDecoder;
// Fix: the original line mixed using-declaration and typedef syntax
// ("using X<double> CTCDecoder;"), which is ill-formed, and named double
// in this float test file.
typedef tensorflow::ctc::CTCDecoder<float> CTCDecoder;
// The HistoryBeamState is used to keep track of the current candidate and
// caches the expansion score (needed by the scorer).
// Per-beam state: the label sequence expanded so far plus the dictionary
// score the scorer cached for that sequence.
struct HistoryBeamState {
float score;  // score assigned by SetStateScoreAccordingToDict for `labels`
std::vector<int> labels;  // labels accumulated along the expansion path
};
// DictionaryBeamScorer essentially favors candidates that can still become
// dictionary words. As soon as a beam candidate is not a dictionary word or
// a prefix of a dictionary word it gets a low probability at each step.
//
// The dictionary itself is hard-coded a static const variable of the class.
class DictionaryBeamScorer
: public tensorflow::ctc::BaseBeamScorer<HistoryBeamState> {
public:
// The root beam starts with a neutral score and an empty label history.
void InitializeState(HistoryBeamState* root) const override {
root->score = 0;
}
void ExpandState(const HistoryBeamState& from_state, int from_label,
HistoryBeamState* to_state, int to_label) const override {
// Keep track of the current complete candidate by storing the labels along
// the expansion path in the beam state.
to_state->labels.push_back(to_label);
SetStateScoreAccordingToDict(to_state);
}
// Re-score the candidate once the sequence ends.
void ExpandStateEnd(HistoryBeamState* state) const override {
SetStateScoreAccordingToDict(state);
}
// The cached dictionary score is added on top of the running beam score.
float GetStateExpansionScore(const HistoryBeamState& state,
float previous_score) const override {
return previous_score + state.score;
}
float GetStateEndExpansionScore(
const HistoryBeamState& state) const override {
return state.score;
}
// Simple dictionary used when scoring the beams to check if they are prefixes
// of dictionary words (see SetStateScoreAccordingToDict below).
static const std::vector<std::vector<int>> dictionary_;
private:
// Sets state->score depending on whether state->labels is still a prefix
// of some dictionary word; defined below, outside the class.
void SetStateScoreAccordingToDict(HistoryBeamState* state) const;
};
const std::vector<std::vector<int>> DictionaryBeamScorer::dictionary_ = {
    {3}, {3, 1}};

// Assigns state->score = log(1.0) when the accumulated label sequence is a
// prefix of (or equal to) some dictionary word, and log(0.01) otherwise.
void DictionaryBeamScorer::SetStateScoreAccordingToDict(
    HistoryBeamState* state) const {
  const std::vector<int>& candidate = state->labels;
  for (const std::vector<int>& word : dictionary_) {
    // A beam longer than the word cannot be one of its prefixes.
    if (candidate.size() > word.size()) continue;
    if (std::equal(word.begin(), word.begin() + candidate.size(),
                   candidate.begin())) {
      state->score = std::log(1.0);
      return;
    }
  }
  // Not a prefix of any dictionary word: assign the penalty score.
  state->score = std::log(0.01);
}
// Verifies top-path decoding with the default scorer and with the
// dictionary scorer; the dictionary variant must prefer {3} and {3, 1}.
TEST(CtcBeamSearch, DecodingWithAndWithoutDictionary) {
  const int batch_size = 1;
  const int timesteps = 5;
  const int top_paths = 3;
  const int num_classes = 6;

  // Plain decoder using hibernating beam search algorithm.
  CTCBeamSearchDecoder<>::DefaultBeamScorer default_scorer;
  CTCBeamSearchDecoder<> decoder(num_classes, 10 * top_paths, &default_scorer);

  // Dictionary decoder, allowing only two dictionary words : {3}, {3, 1}.
  DictionaryBeamScorer dictionary_scorer;
  CTCBeamSearchDecoder<HistoryBeamState> dictionary_decoder(
      num_classes, top_paths, &dictionary_scorer);

  // Raw data containers (arrays of floats, ints, etc.).
  int sequence_lengths[batch_size] = {timesteps};
  float input_data_mat[timesteps][batch_size][num_classes] = {
      {{0, 0.6, 0, 0.4, 0, 0}},
      {{0, 0.5, 0, 0.5, 0, 0}},
      {{0, 0.4, 0, 0.6, 0, 0}},
      {{0, 0.4, 0, 0.6, 0, 0}},
      {{0, 0.4, 0, 0.6, 0, 0}}};

  // The CTCDecoder works with log-probs.
  for (int t = 0; t < timesteps; ++t) {
    for (int b = 0; b < batch_size; ++b) {
      for (int c = 0; c < num_classes; ++c) {
        input_data_mat[t][b][c] = std::log(input_data_mat[t][b][c]);
      }
    }
  }

  // Plain output, without any additional scoring.
  std::vector<CTCDecoder::Output> expected_output = {
      {{1, 3}, {1, 3, 1}, {3, 1, 3}},
  };

  // Dictionary outputs: preference for dictionary candidates. The
  // second-candidate is there, despite it not being a dictionary word, due to
  // stronger probability in the input to the decoder.
  std::vector<CTCDecoder::Output> expected_dict_output = {
      {{3}, {1, 3}, {3, 1}},
  };

  // Convert data containers to the format accepted by the decoder, simply
  // mapping the memory from the container to an Eigen::ArrayXi,::MatrixXf,
  // using Eigen::Map.
  Eigen::Map<const Eigen::ArrayXi> seq_len(&sequence_lengths[0], batch_size);
  // Fix: the buffers are float, so the maps must be MatrixXf; the original
  // Eigen::MatrixXd maps over float storage do not type-check.
  std::vector<Eigen::Map<const Eigen::MatrixXf>> inputs;
  inputs.reserve(timesteps);
  for (int t = 0; t < timesteps; ++t) {
    inputs.emplace_back(&input_data_mat[t][0][0], batch_size, num_classes);
  }

  // Prepare containers for output and scores.
  std::vector<CTCDecoder::Output> outputs(top_paths);
  for (CTCDecoder::Output& output : outputs) {
    output.resize(batch_size);
  }
  float score[batch_size][top_paths] = {{0.0}};
  Eigen::Map<Eigen::MatrixXf> scores(&score[0][0], batch_size, top_paths);

  EXPECT_TRUE(decoder.Decode(seq_len, inputs, &outputs, &scores).ok());
  for (int path = 0; path < top_paths; ++path) {
    EXPECT_EQ(outputs[path][0], expected_output[0][path]);
  }

  // Prepare dictionary outputs.
  std::vector<CTCDecoder::Output> dict_outputs(top_paths);
  for (CTCDecoder::Output& output : dict_outputs) {
    output.resize(batch_size);
  }
  EXPECT_TRUE(
      dictionary_decoder.Decode(seq_len, inputs, &dict_outputs, &scores).ok());
  for (int path = 0; path < top_paths; ++path) {
    EXPECT_EQ(dict_outputs[path][0], expected_dict_output[0][path]);
  }
}
// Verifies that decoding a single timestep yields finite scores for every
// returned path (no -inf leaking out of the beam search).
TEST(CtcBeamSearch, AllBeamElementsHaveFiniteScores) {
  const int batch_size = 1;
  const int timesteps = 1;
  const int top_paths = 3;
  const int num_classes = 6;

  // Plain decoder using hibernating beam search algorithm.
  CTCBeamSearchDecoder<>::DefaultBeamScorer default_scorer;
  CTCBeamSearchDecoder<> decoder(num_classes, top_paths, &default_scorer);

  // Raw data containers (arrays of floats, ints, etc.).
  int sequence_lengths[batch_size] = {timesteps};
  float input_data_mat[timesteps][batch_size][num_classes] = {
      {{0.4, 0.3, 0, 0, 0, 0.5}}};

  // Convert data containers to the format accepted by the decoder, simply
  // mapping the memory from the container to an Eigen::ArrayXi,::MatrixXf,
  // using Eigen::Map.
  Eigen::Map<const Eigen::ArrayXi> seq_len(&sequence_lengths[0], batch_size);
  // Fix: the buffers are float, so the maps must be MatrixXf; the original
  // Eigen::MatrixXd maps over float storage do not type-check.
  std::vector<Eigen::Map<const Eigen::MatrixXf>> inputs;
  inputs.reserve(timesteps);
  for (int t = 0; t < timesteps; ++t) {
    inputs.emplace_back(&input_data_mat[t][0][0], batch_size, num_classes);
  }

  // Prepare containers for output and scores.
  std::vector<CTCDecoder::Output> outputs(top_paths);
  for (CTCDecoder::Output& output : outputs) {
    output.resize(batch_size);
  }
  float score[batch_size][top_paths] = {{0.0}};
  Eigen::Map<Eigen::MatrixXf> scores(&score[0][0], batch_size, top_paths);

  EXPECT_TRUE(decoder.Decode(seq_len, inputs, &outputs, &scores).ok());

  // Make sure all scores are finite.
  for (int path = 0; path < top_paths; ++path) {
    LOG(INFO) << "path " << path;
    EXPECT_FALSE(std::isinf(score[0][path]));
  }
}
// A beam decoder to test label selection. It simply models N labels with
// rapidly dropping off log-probability.
typedef int LabelState; // The state is simply the final label.
class RapidlyDroppingLabelScorer
: public tensorflow::ctc::BaseBeamScorer<LabelState> {
public:
// LabelState is a plain int; the root needs no initialization.
void InitializeState(LabelState* root) const override {}
// Record the label that produced this expansion.
void ExpandState(const LabelState& from_state, int from_label,
LabelState* to_state, int to_label) const override {
*to_state = to_label;
}
// No end-of-sequence adjustment is needed.
void ExpandStateEnd(LabelState* state) const override {}
float GetStateExpansionScore(const LabelState& state,
float previous_score) const override {
// Drop off rapidly for later labels.
const float kRapidly = 100;
return previous_score - kRapidly * state;
}
// End expansions contribute no extra score.
float GetStateEndExpansionScore(const LabelState& state) const override {
return 0;
}
};
// Exercises SetLabelSelectionParameters: size limit, width limit, their
// combination, and parameters loose enough to be equivalent to no selection.
TEST(CtcBeamSearch, LabelSelection) {
  const int batch_size = 1;
  const int timesteps = 3;
  const int top_paths = 5;
  const int num_classes = 6;

  // Decoder which drops off log-probabilities for labels 0 >> 1 >> 2 >> 3.
  RapidlyDroppingLabelScorer scorer;
  CTCBeamSearchDecoder<LabelState> decoder(num_classes, top_paths, &scorer);

  // Raw data containers (arrays of floats, ints, etc.).
  int sequence_lengths[batch_size] = {timesteps};
  // Log probabilities, slightly preferring later labels, this decision
  // should be overridden by the scorer which strongly prefers earlier labels.
  // The last one is empty label, and for simplicity we give it an extremely
  // high cost to ignore it. We also use the first label to break up the
  // repeated label sequence.
  float input_data_mat[timesteps][batch_size][num_classes] = {
      {{-1e6, 1, 2, 3, 4, -1e6}},
      {{1e6, 0, 0, 0, 0, -1e6}},  // force label 0 to break up repeated
      {{-1e6, 1.1, 2.2, 3.3, 4.4, -1e6}},
  };

  // Expected output without label selection
  std::vector<CTCDecoder::Output> expected_default_output = {
      {{1, 0, 1}, {1, 0, 2}, {2, 0, 1}, {1, 0, 3}, {2, 0, 2}},
  };

  // Expected output with label selection limiting to 2 items
  // this is suboptimal because only labels 3 and 4 were allowed to be seen.
  std::vector<CTCDecoder::Output> expected_output_size2 = {
      {{3, 0, 3}, {3, 0, 4}, {4, 0, 3}, {4, 0, 4}, {3}},
  };

  // Expected output with label width of 2.0. This would permit three labels at
  // the first timestep, but only two at the last.
  std::vector<CTCDecoder::Output> expected_output_width2 = {
      {{2, 0, 3}, {2, 0, 4}, {3, 0, 3}, {3, 0, 4}, {4, 0, 3}},
  };

  // Convert data containers to the format accepted by the decoder, simply
  // mapping the memory from the container to an Eigen::ArrayXi,::MatrixXf,
  // using Eigen::Map.
  Eigen::Map<const Eigen::ArrayXi> seq_len(&sequence_lengths[0], batch_size);
  // Fix: the buffers are float, so the maps must be MatrixXf; the original
  // Eigen::MatrixXd maps over float storage do not type-check.
  std::vector<Eigen::Map<const Eigen::MatrixXf>> inputs;
  inputs.reserve(timesteps);
  for (int t = 0; t < timesteps; ++t) {
    inputs.emplace_back(&input_data_mat[t][0][0], batch_size, num_classes);
  }

  // Prepare containers for output and scores.
  std::vector<CTCDecoder::Output> outputs(top_paths);
  for (CTCDecoder::Output& output : outputs) {
    output.resize(batch_size);
  }
  float score[batch_size][top_paths] = {{0.0}};
  Eigen::Map<Eigen::MatrixXf> scores(&score[0][0], batch_size, top_paths);

  EXPECT_TRUE(decoder.Decode(seq_len, inputs, &outputs, &scores).ok());
  for (int path = 0; path < top_paths; ++path) {
    EXPECT_EQ(outputs[path][0], expected_default_output[0][path]);
  }

  // Try label selection size 2
  decoder.SetLabelSelectionParameters(2, -1);
  EXPECT_TRUE(decoder.Decode(seq_len, inputs, &outputs, &scores).ok());
  for (int path = 0; path < top_paths; ++path) {
    EXPECT_EQ(outputs[path][0], expected_output_size2[0][path]);
  }

  // Try label selection width 2.0
  decoder.SetLabelSelectionParameters(0, 2.0);
  EXPECT_TRUE(decoder.Decode(seq_len, inputs, &outputs, &scores).ok());
  for (int path = 0; path < top_paths; ++path) {
    EXPECT_EQ(outputs[path][0], expected_output_width2[0][path]);
  }

  // Try both size 2 and width 2.0: the former is more constraining, so
  // it's equivalent to that.
  decoder.SetLabelSelectionParameters(2, 2.0);
  EXPECT_TRUE(decoder.Decode(seq_len, inputs, &outputs, &scores).ok());
  for (int path = 0; path < top_paths; ++path) {
    EXPECT_EQ(outputs[path][0], expected_output_size2[0][path]);
  }

  // Size 4 and width > 3.3 are equivalent to no label selection
  decoder.SetLabelSelectionParameters(4, 3.3001);
  EXPECT_TRUE(decoder.Decode(seq_len, inputs, &outputs, &scores).ok());
  for (int path = 0; path < top_paths; ++path) {
    EXPECT_EQ(outputs[path][0], expected_default_output[0][path]);
  }
}
} // namespace